diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,318533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.68600129617628, + "eval_steps": 500, + "global_step": 45500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 8.101101749837978e-05, + "grad_norm": 0.02385132946074009, + "learning_rate": 4.0502227622519245e-08, + "loss": 1.0388, + "step": 1 + }, + { + "epoch": 0.00016202203499675956, + "grad_norm": 0.022809118032455444, + "learning_rate": 8.100445524503849e-08, + "loss": 1.169, + "step": 2 + }, + { + "epoch": 0.00024303305249513934, + "grad_norm": 0.021291494369506836, + "learning_rate": 1.215066828675577e-07, + "loss": 1.1149, + "step": 3 + }, + { + "epoch": 0.0003240440699935191, + "grad_norm": 0.020570585504174232, + "learning_rate": 1.6200891049007698e-07, + "loss": 0.9499, + "step": 4 + }, + { + "epoch": 0.0004050550874918989, + "grad_norm": 0.023432889953255653, + "learning_rate": 2.025111381125962e-07, + "loss": 1.0296, + "step": 5 + }, + { + "epoch": 0.0004860661049902787, + "grad_norm": 0.02114630490541458, + "learning_rate": 2.430133657351154e-07, + "loss": 1.1114, + "step": 6 + }, + { + "epoch": 0.0005670771224886585, + "grad_norm": 0.02415238879621029, + "learning_rate": 2.8351559335763466e-07, + "loss": 1.001, + "step": 7 + }, + { + "epoch": 0.0006480881399870382, + "grad_norm": 0.021928859874606133, + "learning_rate": 3.2401782098015396e-07, + "loss": 0.9555, + "step": 8 + }, + { + "epoch": 0.000729099157485418, + "grad_norm": 0.022112445905804634, + "learning_rate": 3.6452004860267315e-07, + "loss": 1.0955, + "step": 9 + }, + { + "epoch": 0.0008101101749837978, + "grad_norm": 0.022664247080683708, + "learning_rate": 4.050222762251924e-07, + "loss": 1.1065, + "step": 10 + }, + { + "epoch": 0.0008911211924821776, + "grad_norm": 0.021883847191929817, + "learning_rate": 4.455245038477117e-07, + "loss": 1.1031, + "step": 11 + }, + { + "epoch": 0.0009721322099805574, + "grad_norm": 0.024452999234199524, + "learning_rate": 4.860267314702308e-07, + "loss": 1.1181, + "step": 12 + }, + { + "epoch": 0.001053143227478937, + "grad_norm": 0.023982515558600426, + "learning_rate": 5.265289590927501e-07, + "loss": 1.0201, + "step": 13 + }, + { + "epoch": 0.001134154244977317, + "grad_norm": 0.020815137773752213, + "learning_rate": 5.670311867152693e-07, + "loss": 1.0554, + "step": 14 + }, + { + "epoch": 0.0012151652624756966, + "grad_norm": 0.023049861192703247, + "learning_rate": 6.075334143377886e-07, + "loss": 1.0279, + "step": 15 + }, + { + "epoch": 0.0012961762799740765, + "grad_norm": 0.025142598897218704, + "learning_rate": 6.480356419603079e-07, + "loss": 1.0198, + "step": 16 + }, + { + "epoch": 0.0013771872974724562, + "grad_norm": 0.02580203115940094, + "learning_rate": 6.885378695828271e-07, + "loss": 1.0425, + "step": 17 + }, + { + "epoch": 0.001458198314970836, + "grad_norm": 0.02373248152434826, + "learning_rate": 7.290400972053463e-07, + "loss": 0.9958, + "step": 18 + }, + { + "epoch": 0.0015392093324692157, + "grad_norm": 0.025345822796225548, + "learning_rate": 7.695423248278656e-07, + "loss": 1.0466, + "step": 19 + }, + { + "epoch": 0.0016202203499675956, + "grad_norm": 0.024625582620501518, + "learning_rate": 8.100445524503848e-07, + "loss": 1.0204, + "step": 20 + }, + { + "epoch": 0.0017012313674659753, + "grad_norm": 0.023450065404176712, + "learning_rate": 8.50546780072904e-07, 
+ "loss": 1.1077, + "step": 21 + }, + { + "epoch": 0.0017822423849643552, + "grad_norm": 0.01932292804121971, + "learning_rate": 8.910490076954234e-07, + "loss": 1.0104, + "step": 22 + }, + { + "epoch": 0.0018632534024627348, + "grad_norm": 0.020872652530670166, + "learning_rate": 9.315512353179425e-07, + "loss": 1.1566, + "step": 23 + }, + { + "epoch": 0.0019442644199611147, + "grad_norm": 0.020221302285790443, + "learning_rate": 9.720534629404617e-07, + "loss": 1.0174, + "step": 24 + }, + { + "epoch": 0.0020252754374594944, + "grad_norm": 0.02044861391186714, + "learning_rate": 1.0125556905629811e-06, + "loss": 1.0797, + "step": 25 + }, + { + "epoch": 0.002106286454957874, + "grad_norm": 0.021288206800818443, + "learning_rate": 1.0530579181855002e-06, + "loss": 1.0198, + "step": 26 + }, + { + "epoch": 0.002187297472456254, + "grad_norm": 0.02114301174879074, + "learning_rate": 1.0935601458080196e-06, + "loss": 1.1145, + "step": 27 + }, + { + "epoch": 0.002268308489954634, + "grad_norm": 0.024435359984636307, + "learning_rate": 1.1340623734305387e-06, + "loss": 1.0083, + "step": 28 + }, + { + "epoch": 0.0023493195074530135, + "grad_norm": 0.023775780573487282, + "learning_rate": 1.174564601053058e-06, + "loss": 1.0904, + "step": 29 + }, + { + "epoch": 0.002430330524951393, + "grad_norm": 0.022898796945810318, + "learning_rate": 1.2150668286755771e-06, + "loss": 1.1089, + "step": 30 + }, + { + "epoch": 0.0025113415424497733, + "grad_norm": 0.023763054981827736, + "learning_rate": 1.2555690562980964e-06, + "loss": 1.0393, + "step": 31 + }, + { + "epoch": 0.002592352559948153, + "grad_norm": 0.023700254037976265, + "learning_rate": 1.2960712839206158e-06, + "loss": 0.9731, + "step": 32 + }, + { + "epoch": 0.0026733635774465326, + "grad_norm": 0.02282639406621456, + "learning_rate": 1.3365735115431349e-06, + "loss": 1.1776, + "step": 33 + }, + { + "epoch": 0.0027543745949449123, + "grad_norm": 0.021881787106394768, + "learning_rate": 1.3770757391656541e-06, + "loss": 1.0377, + "step": 34 + }, + { + "epoch": 0.0028353856124432924, + "grad_norm": 0.02300041727721691, + "learning_rate": 1.4175779667881734e-06, + "loss": 1.0286, + "step": 35 + }, + { + "epoch": 0.002916396629941672, + "grad_norm": 0.025359181687235832, + "learning_rate": 1.4580801944106926e-06, + "loss": 1.0717, + "step": 36 + }, + { + "epoch": 0.0029974076474400518, + "grad_norm": 0.020881187170743942, + "learning_rate": 1.4985824220332119e-06, + "loss": 0.9762, + "step": 37 + }, + { + "epoch": 0.0030784186649384314, + "grad_norm": 0.02200203947722912, + "learning_rate": 1.539084649655731e-06, + "loss": 1.0024, + "step": 38 + }, + { + "epoch": 0.0031594296824368115, + "grad_norm": 0.025778813287615776, + "learning_rate": 1.5795868772782503e-06, + "loss": 1.1089, + "step": 39 + }, + { + "epoch": 0.0032404406999351912, + "grad_norm": 0.022604526951909065, + "learning_rate": 1.6200891049007696e-06, + "loss": 1.0507, + "step": 40 + }, + { + "epoch": 0.003321451717433571, + "grad_norm": 0.028379792347550392, + "learning_rate": 1.660591332523289e-06, + "loss": 1.1776, + "step": 41 + }, + { + "epoch": 0.0034024627349319506, + "grad_norm": 0.02317628636956215, + "learning_rate": 1.701093560145808e-06, + "loss": 1.1075, + "step": 42 + }, + { + "epoch": 0.0034834737524303307, + "grad_norm": 0.02414836548268795, + "learning_rate": 1.7415957877683273e-06, + "loss": 1.1208, + "step": 43 + }, + { + "epoch": 0.0035644847699287103, + "grad_norm": 0.020418209955096245, + "learning_rate": 1.7820980153908468e-06, + "loss": 0.9636, + "step": 44 + 
}, + { + "epoch": 0.00364549578742709, + "grad_norm": 0.020706145092844963, + "learning_rate": 1.8226002430133656e-06, + "loss": 1.0209, + "step": 45 + }, + { + "epoch": 0.0037265068049254697, + "grad_norm": 0.022698137909173965, + "learning_rate": 1.863102470635885e-06, + "loss": 0.9973, + "step": 46 + }, + { + "epoch": 0.00380751782242385, + "grad_norm": 0.030558334663510323, + "learning_rate": 1.9036046982584043e-06, + "loss": 0.978, + "step": 47 + }, + { + "epoch": 0.0038885288399222295, + "grad_norm": 0.024986794218420982, + "learning_rate": 1.9441069258809233e-06, + "loss": 1.0603, + "step": 48 + }, + { + "epoch": 0.0039695398574206096, + "grad_norm": 0.024735961109399796, + "learning_rate": 1.9846091535034426e-06, + "loss": 1.0594, + "step": 49 + }, + { + "epoch": 0.004050550874918989, + "grad_norm": 0.02292553521692753, + "learning_rate": 2.0251113811259623e-06, + "loss": 1.0657, + "step": 50 + }, + { + "epoch": 0.004131561892417369, + "grad_norm": 0.027018576860427856, + "learning_rate": 2.0656136087484815e-06, + "loss": 1.0674, + "step": 51 + }, + { + "epoch": 0.004212572909915748, + "grad_norm": 0.0235233623534441, + "learning_rate": 2.1061158363710003e-06, + "loss": 1.0473, + "step": 52 + }, + { + "epoch": 0.004293583927414128, + "grad_norm": 0.022738710045814514, + "learning_rate": 2.1466180639935196e-06, + "loss": 1.0892, + "step": 53 + }, + { + "epoch": 0.004374594944912508, + "grad_norm": 0.022287271916866302, + "learning_rate": 2.1871202916160392e-06, + "loss": 0.9848, + "step": 54 + }, + { + "epoch": 0.004455605962410888, + "grad_norm": 0.025856684893369675, + "learning_rate": 2.227622519238558e-06, + "loss": 1.0407, + "step": 55 + }, + { + "epoch": 0.004536616979909268, + "grad_norm": 0.0249209925532341, + "learning_rate": 2.2681247468610773e-06, + "loss": 1.1791, + "step": 56 + }, + { + "epoch": 0.004617627997407648, + "grad_norm": 0.028566813096404076, + "learning_rate": 2.308626974483597e-06, + "loss": 1.1377, + "step": 57 + }, + { + "epoch": 0.004698639014906027, + "grad_norm": 0.0300540030002594, + "learning_rate": 2.349129202106116e-06, + "loss": 1.0835, + "step": 58 + }, + { + "epoch": 0.004779650032404407, + "grad_norm": 0.020815081894397736, + "learning_rate": 2.389631429728635e-06, + "loss": 1.1043, + "step": 59 + }, + { + "epoch": 0.004860661049902786, + "grad_norm": 0.023698071017861366, + "learning_rate": 2.4301336573511543e-06, + "loss": 1.0009, + "step": 60 + }, + { + "epoch": 0.0049416720674011665, + "grad_norm": 0.023796427994966507, + "learning_rate": 2.470635884973674e-06, + "loss": 1.0231, + "step": 61 + }, + { + "epoch": 0.005022683084899547, + "grad_norm": 0.02444332465529442, + "learning_rate": 2.5111381125961928e-06, + "loss": 1.0623, + "step": 62 + }, + { + "epoch": 0.005103694102397926, + "grad_norm": 0.024619588628411293, + "learning_rate": 2.551640340218712e-06, + "loss": 1.0212, + "step": 63 + }, + { + "epoch": 0.005184705119896306, + "grad_norm": 0.0225670225918293, + "learning_rate": 2.5921425678412317e-06, + "loss": 0.9564, + "step": 64 + }, + { + "epoch": 0.005265716137394686, + "grad_norm": 0.02799377217888832, + "learning_rate": 2.6326447954637505e-06, + "loss": 1.0529, + "step": 65 + }, + { + "epoch": 0.005346727154893065, + "grad_norm": 0.02838836796581745, + "learning_rate": 2.6731470230862698e-06, + "loss": 1.0972, + "step": 66 + }, + { + "epoch": 0.005427738172391445, + "grad_norm": 0.027125639840960503, + "learning_rate": 2.713649250708789e-06, + "loss": 0.9969, + "step": 67 + }, + { + "epoch": 0.005508749189889825, + "grad_norm": 
0.027286173775792122, + "learning_rate": 2.7541514783313082e-06, + "loss": 1.0246, + "step": 68 + }, + { + "epoch": 0.005589760207388205, + "grad_norm": 0.029833434149622917, + "learning_rate": 2.7946537059538275e-06, + "loss": 1.1968, + "step": 69 + }, + { + "epoch": 0.005670771224886585, + "grad_norm": 0.025817332789301872, + "learning_rate": 2.8351559335763467e-06, + "loss": 1.175, + "step": 70 + }, + { + "epoch": 0.005751782242384964, + "grad_norm": 0.03145721182227135, + "learning_rate": 2.8756581611988664e-06, + "loss": 1.0967, + "step": 71 + }, + { + "epoch": 0.005832793259883344, + "grad_norm": 0.03033231757581234, + "learning_rate": 2.9161603888213852e-06, + "loss": 1.0946, + "step": 72 + }, + { + "epoch": 0.005913804277381724, + "grad_norm": 0.02514854073524475, + "learning_rate": 2.9566626164439045e-06, + "loss": 1.1251, + "step": 73 + }, + { + "epoch": 0.0059948152948801035, + "grad_norm": 0.02949725091457367, + "learning_rate": 2.9971648440664237e-06, + "loss": 0.9468, + "step": 74 + }, + { + "epoch": 0.006075826312378484, + "grad_norm": 0.02892262302339077, + "learning_rate": 3.037667071688943e-06, + "loss": 1.0442, + "step": 75 + }, + { + "epoch": 0.006156837329876863, + "grad_norm": 0.026688704267144203, + "learning_rate": 3.078169299311462e-06, + "loss": 0.934, + "step": 76 + }, + { + "epoch": 0.006237848347375243, + "grad_norm": 0.024869117885828018, + "learning_rate": 3.1186715269339815e-06, + "loss": 0.9493, + "step": 77 + }, + { + "epoch": 0.006318859364873623, + "grad_norm": 0.026760468259453773, + "learning_rate": 3.1591737545565007e-06, + "loss": 1.0778, + "step": 78 + }, + { + "epoch": 0.006399870382372002, + "grad_norm": 0.02856779471039772, + "learning_rate": 3.19967598217902e-06, + "loss": 1.0456, + "step": 79 + }, + { + "epoch": 0.0064808813998703824, + "grad_norm": 0.023248232901096344, + "learning_rate": 3.240178209801539e-06, + "loss": 0.9863, + "step": 80 + }, + { + "epoch": 0.0065618924173687625, + "grad_norm": 0.029149865731596947, + "learning_rate": 3.280680437424059e-06, + "loss": 1.0313, + "step": 81 + }, + { + "epoch": 0.006642903434867142, + "grad_norm": 0.02813745103776455, + "learning_rate": 3.321182665046578e-06, + "loss": 1.0753, + "step": 82 + }, + { + "epoch": 0.006723914452365522, + "grad_norm": 0.027179231867194176, + "learning_rate": 3.3616848926690965e-06, + "loss": 1.0573, + "step": 83 + }, + { + "epoch": 0.006804925469863901, + "grad_norm": 0.027503957971930504, + "learning_rate": 3.402187120291616e-06, + "loss": 0.9619, + "step": 84 + }, + { + "epoch": 0.006885936487362281, + "grad_norm": 0.027507808059453964, + "learning_rate": 3.4426893479141354e-06, + "loss": 0.965, + "step": 85 + }, + { + "epoch": 0.006966947504860661, + "grad_norm": 0.03333299234509468, + "learning_rate": 3.4831915755366547e-06, + "loss": 0.9754, + "step": 86 + }, + { + "epoch": 0.007047958522359041, + "grad_norm": 0.02706284075975418, + "learning_rate": 3.523693803159174e-06, + "loss": 1.0152, + "step": 87 + }, + { + "epoch": 0.007128969539857421, + "grad_norm": 0.024948805570602417, + "learning_rate": 3.5641960307816936e-06, + "loss": 0.9234, + "step": 88 + }, + { + "epoch": 0.007209980557355801, + "grad_norm": 0.0278322733938694, + "learning_rate": 3.604698258404213e-06, + "loss": 1.2218, + "step": 89 + }, + { + "epoch": 0.00729099157485418, + "grad_norm": 0.02799849770963192, + "learning_rate": 3.6452004860267312e-06, + "loss": 1.1309, + "step": 90 + }, + { + "epoch": 0.00737200259235256, + "grad_norm": 0.030282633379101753, + "learning_rate": 
3.685702713649251e-06, + "loss": 1.0145, + "step": 91 + }, + { + "epoch": 0.007453013609850939, + "grad_norm": 0.02803598903119564, + "learning_rate": 3.72620494127177e-06, + "loss": 1.059, + "step": 92 + }, + { + "epoch": 0.0075340246273493195, + "grad_norm": 0.031542301177978516, + "learning_rate": 3.7667071688942894e-06, + "loss": 1.0055, + "step": 93 + }, + { + "epoch": 0.0076150356448477, + "grad_norm": 0.03627784922719002, + "learning_rate": 3.8072093965168086e-06, + "loss": 0.9798, + "step": 94 + }, + { + "epoch": 0.007696046662346079, + "grad_norm": 0.029904387891292572, + "learning_rate": 3.847711624139328e-06, + "loss": 1.1077, + "step": 95 + }, + { + "epoch": 0.007777057679844459, + "grad_norm": 0.03554466366767883, + "learning_rate": 3.888213851761847e-06, + "loss": 1.0548, + "step": 96 + }, + { + "epoch": 0.007858068697342839, + "grad_norm": 0.029148144647479057, + "learning_rate": 3.928716079384366e-06, + "loss": 0.9938, + "step": 97 + }, + { + "epoch": 0.007939079714841219, + "grad_norm": 0.03140278160572052, + "learning_rate": 3.969218307006885e-06, + "loss": 1.0229, + "step": 98 + }, + { + "epoch": 0.008020090732339598, + "grad_norm": 0.031796544790267944, + "learning_rate": 4.009720534629405e-06, + "loss": 1.0887, + "step": 99 + }, + { + "epoch": 0.008101101749837978, + "grad_norm": 0.030875643715262413, + "learning_rate": 4.0502227622519245e-06, + "loss": 1.1028, + "step": 100 + }, + { + "epoch": 0.008182112767336358, + "grad_norm": 0.03191255405545235, + "learning_rate": 4.090724989874443e-06, + "loss": 1.0444, + "step": 101 + }, + { + "epoch": 0.008263123784834738, + "grad_norm": 0.035354603081941605, + "learning_rate": 4.131227217496963e-06, + "loss": 1.053, + "step": 102 + }, + { + "epoch": 0.008344134802333118, + "grad_norm": 0.034191109240055084, + "learning_rate": 4.171729445119482e-06, + "loss": 1.0567, + "step": 103 + }, + { + "epoch": 0.008425145819831496, + "grad_norm": 0.028501426801085472, + "learning_rate": 4.212231672742001e-06, + "loss": 1.0091, + "step": 104 + }, + { + "epoch": 0.008506156837329876, + "grad_norm": 0.03529810532927513, + "learning_rate": 4.25273390036452e-06, + "loss": 1.117, + "step": 105 + }, + { + "epoch": 0.008587167854828257, + "grad_norm": 0.038472309708595276, + "learning_rate": 4.293236127987039e-06, + "loss": 1.0489, + "step": 106 + }, + { + "epoch": 0.008668178872326637, + "grad_norm": 0.034638624638319016, + "learning_rate": 4.333738355609559e-06, + "loss": 1.0371, + "step": 107 + }, + { + "epoch": 0.008749189889825017, + "grad_norm": 0.03382338955998421, + "learning_rate": 4.3742405832320785e-06, + "loss": 1.0435, + "step": 108 + }, + { + "epoch": 0.008830200907323397, + "grad_norm": 0.031995080411434174, + "learning_rate": 4.414742810854597e-06, + "loss": 0.9459, + "step": 109 + }, + { + "epoch": 0.008911211924821775, + "grad_norm": 0.03637147694826126, + "learning_rate": 4.455245038477116e-06, + "loss": 0.995, + "step": 110 + }, + { + "epoch": 0.008992222942320155, + "grad_norm": 0.03417530655860901, + "learning_rate": 4.495747266099636e-06, + "loss": 1.0156, + "step": 111 + }, + { + "epoch": 0.009073233959818535, + "grad_norm": 0.03398241102695465, + "learning_rate": 4.536249493722155e-06, + "loss": 1.1755, + "step": 112 + }, + { + "epoch": 0.009154244977316916, + "grad_norm": 0.039991866797208786, + "learning_rate": 4.576751721344674e-06, + "loss": 1.0777, + "step": 113 + }, + { + "epoch": 0.009235255994815296, + "grad_norm": 0.03602704033255577, + "learning_rate": 4.617253948967194e-06, + "loss": 0.9658, + "step": 114 + 
}, + { + "epoch": 0.009316267012313674, + "grad_norm": 0.040713947266340256, + "learning_rate": 4.657756176589713e-06, + "loss": 1.1282, + "step": 115 + }, + { + "epoch": 0.009397278029812054, + "grad_norm": 0.036449410021305084, + "learning_rate": 4.698258404212232e-06, + "loss": 1.0613, + "step": 116 + }, + { + "epoch": 0.009478289047310434, + "grad_norm": 0.03143557533621788, + "learning_rate": 4.738760631834751e-06, + "loss": 1.0661, + "step": 117 + }, + { + "epoch": 0.009559300064808814, + "grad_norm": 0.03461870178580284, + "learning_rate": 4.77926285945727e-06, + "loss": 1.0569, + "step": 118 + }, + { + "epoch": 0.009640311082307194, + "grad_norm": 0.03391311317682266, + "learning_rate": 4.81976508707979e-06, + "loss": 0.9362, + "step": 119 + }, + { + "epoch": 0.009721322099805573, + "grad_norm": 0.03787930682301521, + "learning_rate": 4.8602673147023086e-06, + "loss": 1.0893, + "step": 120 + }, + { + "epoch": 0.009802333117303953, + "grad_norm": 0.03874959051609039, + "learning_rate": 4.900769542324828e-06, + "loss": 0.9632, + "step": 121 + }, + { + "epoch": 0.009883344134802333, + "grad_norm": 0.03478735685348511, + "learning_rate": 4.941271769947348e-06, + "loss": 0.9535, + "step": 122 + }, + { + "epoch": 0.009964355152300713, + "grad_norm": 0.039458271116018295, + "learning_rate": 4.981773997569866e-06, + "loss": 1.1773, + "step": 123 + }, + { + "epoch": 0.010045366169799093, + "grad_norm": 0.03747010603547096, + "learning_rate": 5.0222762251923855e-06, + "loss": 1.0859, + "step": 124 + }, + { + "epoch": 0.010126377187297473, + "grad_norm": 0.047416865825653076, + "learning_rate": 5.062778452814905e-06, + "loss": 1.0857, + "step": 125 + }, + { + "epoch": 0.010207388204795852, + "grad_norm": 0.03998400643467903, + "learning_rate": 5.103280680437424e-06, + "loss": 0.9918, + "step": 126 + }, + { + "epoch": 0.010288399222294232, + "grad_norm": 0.03821217268705368, + "learning_rate": 5.143782908059944e-06, + "loss": 0.9687, + "step": 127 + }, + { + "epoch": 0.010369410239792612, + "grad_norm": 0.03778871148824692, + "learning_rate": 5.184285135682463e-06, + "loss": 0.9259, + "step": 128 + }, + { + "epoch": 0.010450421257290992, + "grad_norm": 0.043181974440813065, + "learning_rate": 5.224787363304982e-06, + "loss": 1.0013, + "step": 129 + }, + { + "epoch": 0.010531432274789372, + "grad_norm": 0.03853442519903183, + "learning_rate": 5.265289590927501e-06, + "loss": 1.0027, + "step": 130 + }, + { + "epoch": 0.01061244329228775, + "grad_norm": 0.03264236822724342, + "learning_rate": 5.305791818550021e-06, + "loss": 1.0482, + "step": 131 + }, + { + "epoch": 0.01069345430978613, + "grad_norm": 0.03932544216513634, + "learning_rate": 5.3462940461725395e-06, + "loss": 1.2041, + "step": 132 + }, + { + "epoch": 0.01077446532728451, + "grad_norm": 0.03828361630439758, + "learning_rate": 5.386796273795059e-06, + "loss": 0.9825, + "step": 133 + }, + { + "epoch": 0.01085547634478289, + "grad_norm": 0.04001957178115845, + "learning_rate": 5.427298501417578e-06, + "loss": 1.0606, + "step": 134 + }, + { + "epoch": 0.010936487362281271, + "grad_norm": 0.040740400552749634, + "learning_rate": 5.467800729040098e-06, + "loss": 1.0208, + "step": 135 + }, + { + "epoch": 0.01101749837977965, + "grad_norm": 0.03935074433684349, + "learning_rate": 5.5083029566626165e-06, + "loss": 1.0871, + "step": 136 + }, + { + "epoch": 0.01109850939727803, + "grad_norm": 0.0420953705906868, + "learning_rate": 5.548805184285136e-06, + "loss": 1.0619, + "step": 137 + }, + { + "epoch": 0.01117952041477641, + "grad_norm": 
0.033982839435338974, + "learning_rate": 5.589307411907655e-06, + "loss": 1.024, + "step": 138 + }, + { + "epoch": 0.01126053143227479, + "grad_norm": 0.03850257024168968, + "learning_rate": 5.629809639530175e-06, + "loss": 1.0603, + "step": 139 + }, + { + "epoch": 0.01134154244977317, + "grad_norm": 0.040236424654722214, + "learning_rate": 5.6703118671526935e-06, + "loss": 1.0393, + "step": 140 + }, + { + "epoch": 0.011422553467271548, + "grad_norm": 0.04402274265885353, + "learning_rate": 5.710814094775213e-06, + "loss": 1.0713, + "step": 141 + }, + { + "epoch": 0.011503564484769928, + "grad_norm": 0.04175622761249542, + "learning_rate": 5.751316322397733e-06, + "loss": 1.0241, + "step": 142 + }, + { + "epoch": 0.011584575502268308, + "grad_norm": 0.03973923996090889, + "learning_rate": 5.791818550020251e-06, + "loss": 1.0643, + "step": 143 + }, + { + "epoch": 0.011665586519766688, + "grad_norm": 0.04083230346441269, + "learning_rate": 5.8323207776427705e-06, + "loss": 0.963, + "step": 144 + }, + { + "epoch": 0.011746597537265068, + "grad_norm": 0.03818622976541519, + "learning_rate": 5.87282300526529e-06, + "loss": 1.0514, + "step": 145 + }, + { + "epoch": 0.011827608554763449, + "grad_norm": 0.03518862649798393, + "learning_rate": 5.913325232887809e-06, + "loss": 1.0711, + "step": 146 + }, + { + "epoch": 0.011908619572261827, + "grad_norm": 0.040990497916936874, + "learning_rate": 5.953827460510329e-06, + "loss": 0.9552, + "step": 147 + }, + { + "epoch": 0.011989630589760207, + "grad_norm": 0.037080589681863785, + "learning_rate": 5.9943296881328474e-06, + "loss": 1.0958, + "step": 148 + }, + { + "epoch": 0.012070641607258587, + "grad_norm": 0.043971672654151917, + "learning_rate": 6.034831915755366e-06, + "loss": 1.0015, + "step": 149 + }, + { + "epoch": 0.012151652624756967, + "grad_norm": 0.0402100533246994, + "learning_rate": 6.075334143377886e-06, + "loss": 0.9715, + "step": 150 + }, + { + "epoch": 0.012232663642255347, + "grad_norm": 0.03516768664121628, + "learning_rate": 6.115836371000406e-06, + "loss": 0.9676, + "step": 151 + }, + { + "epoch": 0.012313674659753726, + "grad_norm": 0.03923693671822548, + "learning_rate": 6.156338598622924e-06, + "loss": 1.0169, + "step": 152 + }, + { + "epoch": 0.012394685677252106, + "grad_norm": 0.044282715767621994, + "learning_rate": 6.196840826245444e-06, + "loss": 1.106, + "step": 153 + }, + { + "epoch": 0.012475696694750486, + "grad_norm": 0.03654364496469498, + "learning_rate": 6.237343053867963e-06, + "loss": 1.0134, + "step": 154 + }, + { + "epoch": 0.012556707712248866, + "grad_norm": 0.039169635623693466, + "learning_rate": 6.2778452814904826e-06, + "loss": 0.9492, + "step": 155 + }, + { + "epoch": 0.012637718729747246, + "grad_norm": 0.03776717185974121, + "learning_rate": 6.318347509113001e-06, + "loss": 1.0097, + "step": 156 + }, + { + "epoch": 0.012718729747245625, + "grad_norm": 0.03706394135951996, + "learning_rate": 6.358849736735521e-06, + "loss": 0.9956, + "step": 157 + }, + { + "epoch": 0.012799740764744005, + "grad_norm": 0.03906858712434769, + "learning_rate": 6.39935196435804e-06, + "loss": 1.1265, + "step": 158 + }, + { + "epoch": 0.012880751782242385, + "grad_norm": 0.0432046577334404, + "learning_rate": 6.439854191980559e-06, + "loss": 1.0441, + "step": 159 + }, + { + "epoch": 0.012961762799740765, + "grad_norm": 0.04111461341381073, + "learning_rate": 6.480356419603078e-06, + "loss": 1.1336, + "step": 160 + }, + { + "epoch": 0.013042773817239145, + "grad_norm": 0.04539615660905838, + "learning_rate": 
6.520858647225597e-06, + "loss": 1.0992, + "step": 161 + }, + { + "epoch": 0.013123784834737525, + "grad_norm": 0.03695986419916153, + "learning_rate": 6.561360874848118e-06, + "loss": 1.0151, + "step": 162 + }, + { + "epoch": 0.013204795852235903, + "grad_norm": 0.038577646017074585, + "learning_rate": 6.601863102470636e-06, + "loss": 0.9178, + "step": 163 + }, + { + "epoch": 0.013285806869734284, + "grad_norm": 0.039863359183073044, + "learning_rate": 6.642365330093156e-06, + "loss": 1.0743, + "step": 164 + }, + { + "epoch": 0.013366817887232664, + "grad_norm": 0.04418211802840233, + "learning_rate": 6.682867557715675e-06, + "loss": 1.0468, + "step": 165 + }, + { + "epoch": 0.013447828904731044, + "grad_norm": 0.040715545415878296, + "learning_rate": 6.723369785338193e-06, + "loss": 0.9407, + "step": 166 + }, + { + "epoch": 0.013528839922229424, + "grad_norm": 0.03955593705177307, + "learning_rate": 6.7638720129607135e-06, + "loss": 0.9896, + "step": 167 + }, + { + "epoch": 0.013609850939727802, + "grad_norm": 0.04149999842047691, + "learning_rate": 6.804374240583232e-06, + "loss": 1.1123, + "step": 168 + }, + { + "epoch": 0.013690861957226182, + "grad_norm": 0.03756433352828026, + "learning_rate": 6.844876468205752e-06, + "loss": 0.9447, + "step": 169 + }, + { + "epoch": 0.013771872974724562, + "grad_norm": 0.04297390207648277, + "learning_rate": 6.885378695828271e-06, + "loss": 0.9698, + "step": 170 + }, + { + "epoch": 0.013852883992222943, + "grad_norm": 0.044178616255521774, + "learning_rate": 6.9258809234507905e-06, + "loss": 1.0236, + "step": 171 + }, + { + "epoch": 0.013933895009721323, + "grad_norm": 0.04240819066762924, + "learning_rate": 6.966383151073309e-06, + "loss": 0.9572, + "step": 172 + }, + { + "epoch": 0.014014906027219701, + "grad_norm": 0.03693396970629692, + "learning_rate": 7.006885378695828e-06, + "loss": 0.9087, + "step": 173 + }, + { + "epoch": 0.014095917044718081, + "grad_norm": 0.03896639496088028, + "learning_rate": 7.047387606318348e-06, + "loss": 0.9924, + "step": 174 + }, + { + "epoch": 0.014176928062216461, + "grad_norm": 0.03346260264515877, + "learning_rate": 7.087889833940867e-06, + "loss": 0.9482, + "step": 175 + }, + { + "epoch": 0.014257939079714841, + "grad_norm": 0.0437021479010582, + "learning_rate": 7.128392061563387e-06, + "loss": 1.0355, + "step": 176 + }, + { + "epoch": 0.014338950097213221, + "grad_norm": 0.05387115105986595, + "learning_rate": 7.168894289185905e-06, + "loss": 1.0333, + "step": 177 + }, + { + "epoch": 0.014419961114711602, + "grad_norm": 0.043737273663282394, + "learning_rate": 7.209396516808426e-06, + "loss": 0.969, + "step": 178 + }, + { + "epoch": 0.01450097213220998, + "grad_norm": 0.043181467801332474, + "learning_rate": 7.2498987444309445e-06, + "loss": 0.9959, + "step": 179 + }, + { + "epoch": 0.01458198314970836, + "grad_norm": 0.03788682073354721, + "learning_rate": 7.2904009720534624e-06, + "loss": 1.0632, + "step": 180 + }, + { + "epoch": 0.01466299416720674, + "grad_norm": 0.049245551228523254, + "learning_rate": 7.330903199675983e-06, + "loss": 1.1571, + "step": 181 + }, + { + "epoch": 0.01474400518470512, + "grad_norm": 0.043730925768613815, + "learning_rate": 7.371405427298502e-06, + "loss": 1.0468, + "step": 182 + }, + { + "epoch": 0.0148250162022035, + "grad_norm": 0.041232235729694366, + "learning_rate": 7.4119076549210214e-06, + "loss": 1.0773, + "step": 183 + }, + { + "epoch": 0.014906027219701879, + "grad_norm": 0.03509649634361267, + "learning_rate": 7.45240988254354e-06, + "loss": 0.9525, + "step": 
184 + }, + { + "epoch": 0.014987038237200259, + "grad_norm": 0.033452972769737244, + "learning_rate": 7.49291211016606e-06, + "loss": 0.9441, + "step": 185 + }, + { + "epoch": 0.015068049254698639, + "grad_norm": 0.044284090399742126, + "learning_rate": 7.533414337788579e-06, + "loss": 0.9824, + "step": 186 + }, + { + "epoch": 0.015149060272197019, + "grad_norm": 0.0432291179895401, + "learning_rate": 7.5739165654110976e-06, + "loss": 1.0314, + "step": 187 + }, + { + "epoch": 0.0152300712896954, + "grad_norm": 0.03911181911826134, + "learning_rate": 7.614418793033617e-06, + "loss": 0.9796, + "step": 188 + }, + { + "epoch": 0.015311082307193778, + "grad_norm": 0.04108152911067009, + "learning_rate": 7.654921020656136e-06, + "loss": 0.961, + "step": 189 + }, + { + "epoch": 0.015392093324692158, + "grad_norm": 0.04519292339682579, + "learning_rate": 7.695423248278656e-06, + "loss": 1.1185, + "step": 190 + }, + { + "epoch": 0.015473104342190538, + "grad_norm": 0.04229548200964928, + "learning_rate": 7.735925475901175e-06, + "loss": 1.0708, + "step": 191 + }, + { + "epoch": 0.015554115359688918, + "grad_norm": 0.038502808660268784, + "learning_rate": 7.776427703523693e-06, + "loss": 0.9103, + "step": 192 + }, + { + "epoch": 0.015635126377187298, + "grad_norm": 0.03859207034111023, + "learning_rate": 7.816929931146213e-06, + "loss": 0.9856, + "step": 193 + }, + { + "epoch": 0.015716137394685678, + "grad_norm": 0.03727702423930168, + "learning_rate": 7.857432158768733e-06, + "loss": 0.8671, + "step": 194 + }, + { + "epoch": 0.015797148412184058, + "grad_norm": 0.03756839036941528, + "learning_rate": 7.897934386391252e-06, + "loss": 1.0028, + "step": 195 + }, + { + "epoch": 0.015878159429682438, + "grad_norm": 0.040489424020051956, + "learning_rate": 7.93843661401377e-06, + "loss": 1.0696, + "step": 196 + }, + { + "epoch": 0.015959170447180815, + "grad_norm": 0.031826749444007874, + "learning_rate": 7.978938841636292e-06, + "loss": 0.9044, + "step": 197 + }, + { + "epoch": 0.016040181464679195, + "grad_norm": 0.03558851778507233, + "learning_rate": 8.01944106925881e-06, + "loss": 0.9084, + "step": 198 + }, + { + "epoch": 0.016121192482177575, + "grad_norm": 0.036237046122550964, + "learning_rate": 8.059943296881328e-06, + "loss": 0.941, + "step": 199 + }, + { + "epoch": 0.016202203499675955, + "grad_norm": 0.046845950186252594, + "learning_rate": 8.100445524503849e-06, + "loss": 1.0429, + "step": 200 + }, + { + "epoch": 0.016283214517174335, + "grad_norm": 0.042282190173864365, + "learning_rate": 8.140947752126367e-06, + "loss": 1.0032, + "step": 201 + }, + { + "epoch": 0.016364225534672715, + "grad_norm": 0.039992641657590866, + "learning_rate": 8.181449979748887e-06, + "loss": 1.0286, + "step": 202 + }, + { + "epoch": 0.016445236552171096, + "grad_norm": 0.03947419673204422, + "learning_rate": 8.221952207371406e-06, + "loss": 1.0194, + "step": 203 + }, + { + "epoch": 0.016526247569669476, + "grad_norm": 0.03853017836809158, + "learning_rate": 8.262454434993926e-06, + "loss": 1.0542, + "step": 204 + }, + { + "epoch": 0.016607258587167856, + "grad_norm": 0.0387580543756485, + "learning_rate": 8.302956662616444e-06, + "loss": 0.9822, + "step": 205 + }, + { + "epoch": 0.016688269604666236, + "grad_norm": 0.0381411649286747, + "learning_rate": 8.343458890238964e-06, + "loss": 0.9213, + "step": 206 + }, + { + "epoch": 0.016769280622164616, + "grad_norm": 0.04278302192687988, + "learning_rate": 8.383961117861483e-06, + "loss": 1.0441, + "step": 207 + }, + { + "epoch": 0.016850291639662993, + 
"grad_norm": 0.03946967422962189, + "learning_rate": 8.424463345484001e-06, + "loss": 0.9348, + "step": 208 + }, + { + "epoch": 0.016931302657161373, + "grad_norm": 0.03685486316680908, + "learning_rate": 8.464965573106521e-06, + "loss": 1.0595, + "step": 209 + }, + { + "epoch": 0.017012313674659753, + "grad_norm": 0.04094025865197182, + "learning_rate": 8.50546780072904e-06, + "loss": 0.9907, + "step": 210 + }, + { + "epoch": 0.017093324692158133, + "grad_norm": 0.04092978313565254, + "learning_rate": 8.54597002835156e-06, + "loss": 0.9561, + "step": 211 + }, + { + "epoch": 0.017174335709656513, + "grad_norm": 0.0413689911365509, + "learning_rate": 8.586472255974078e-06, + "loss": 1.0234, + "step": 212 + }, + { + "epoch": 0.017255346727154893, + "grad_norm": 0.038415905088186264, + "learning_rate": 8.626974483596598e-06, + "loss": 0.9763, + "step": 213 + }, + { + "epoch": 0.017336357744653273, + "grad_norm": 0.04191512241959572, + "learning_rate": 8.667476711219118e-06, + "loss": 1.0653, + "step": 214 + }, + { + "epoch": 0.017417368762151653, + "grad_norm": 0.03985489904880524, + "learning_rate": 8.707978938841636e-06, + "loss": 1.0275, + "step": 215 + }, + { + "epoch": 0.017498379779650033, + "grad_norm": 0.040544137358665466, + "learning_rate": 8.748481166464157e-06, + "loss": 0.9499, + "step": 216 + }, + { + "epoch": 0.017579390797148414, + "grad_norm": 0.041784144937992096, + "learning_rate": 8.788983394086675e-06, + "loss": 1.0257, + "step": 217 + }, + { + "epoch": 0.017660401814646794, + "grad_norm": 0.0427846759557724, + "learning_rate": 8.829485621709195e-06, + "loss": 1.022, + "step": 218 + }, + { + "epoch": 0.01774141283214517, + "grad_norm": 0.03604327514767647, + "learning_rate": 8.869987849331714e-06, + "loss": 0.9539, + "step": 219 + }, + { + "epoch": 0.01782242384964355, + "grad_norm": 0.03896184265613556, + "learning_rate": 8.910490076954232e-06, + "loss": 1.0411, + "step": 220 + }, + { + "epoch": 0.01790343486714193, + "grad_norm": 0.04511598125100136, + "learning_rate": 8.950992304576752e-06, + "loss": 1.0978, + "step": 221 + }, + { + "epoch": 0.01798444588464031, + "grad_norm": 0.04048030450940132, + "learning_rate": 8.991494532199272e-06, + "loss": 0.9711, + "step": 222 + }, + { + "epoch": 0.01806545690213869, + "grad_norm": 0.04436124861240387, + "learning_rate": 9.031996759821791e-06, + "loss": 0.9871, + "step": 223 + }, + { + "epoch": 0.01814646791963707, + "grad_norm": 0.039108771830797195, + "learning_rate": 9.07249898744431e-06, + "loss": 0.9747, + "step": 224 + }, + { + "epoch": 0.01822747893713545, + "grad_norm": 0.04072122648358345, + "learning_rate": 9.11300121506683e-06, + "loss": 0.9828, + "step": 225 + }, + { + "epoch": 0.01830848995463383, + "grad_norm": 0.03978056088089943, + "learning_rate": 9.153503442689349e-06, + "loss": 0.9255, + "step": 226 + }, + { + "epoch": 0.01838950097213221, + "grad_norm": 0.04174983873963356, + "learning_rate": 9.194005670311867e-06, + "loss": 1.0139, + "step": 227 + }, + { + "epoch": 0.01847051198963059, + "grad_norm": 0.043365299701690674, + "learning_rate": 9.234507897934388e-06, + "loss": 1.0303, + "step": 228 + }, + { + "epoch": 0.018551523007128968, + "grad_norm": 0.03832549601793289, + "learning_rate": 9.275010125556906e-06, + "loss": 0.9125, + "step": 229 + }, + { + "epoch": 0.018632534024627348, + "grad_norm": 0.04062044620513916, + "learning_rate": 9.315512353179426e-06, + "loss": 1.0105, + "step": 230 + }, + { + "epoch": 0.018713545042125728, + "grad_norm": 0.04038373380899429, + "learning_rate": 
9.356014580801945e-06, + "loss": 1.0133, + "step": 231 + }, + { + "epoch": 0.018794556059624108, + "grad_norm": 0.04146847873926163, + "learning_rate": 9.396516808424463e-06, + "loss": 0.988, + "step": 232 + }, + { + "epoch": 0.01887556707712249, + "grad_norm": 0.040415383875370026, + "learning_rate": 9.437019036046983e-06, + "loss": 0.9543, + "step": 233 + }, + { + "epoch": 0.01895657809462087, + "grad_norm": 0.04330156370997429, + "learning_rate": 9.477521263669503e-06, + "loss": 0.9775, + "step": 234 + }, + { + "epoch": 0.01903758911211925, + "grad_norm": 0.043947651982307434, + "learning_rate": 9.518023491292022e-06, + "loss": 0.8902, + "step": 235 + }, + { + "epoch": 0.01911860012961763, + "grad_norm": 0.04477566108107567, + "learning_rate": 9.55852571891454e-06, + "loss": 1.0621, + "step": 236 + }, + { + "epoch": 0.01919961114711601, + "grad_norm": 0.039206307381391525, + "learning_rate": 9.59902794653706e-06, + "loss": 0.9572, + "step": 237 + }, + { + "epoch": 0.01928062216461439, + "grad_norm": 0.03924980387091637, + "learning_rate": 9.63953017415958e-06, + "loss": 0.9539, + "step": 238 + }, + { + "epoch": 0.01936163318211277, + "grad_norm": 0.040131330490112305, + "learning_rate": 9.680032401782097e-06, + "loss": 0.9643, + "step": 239 + }, + { + "epoch": 0.019442644199611146, + "grad_norm": 0.0443425178527832, + "learning_rate": 9.720534629404617e-06, + "loss": 0.9767, + "step": 240 + }, + { + "epoch": 0.019523655217109526, + "grad_norm": 0.04038490727543831, + "learning_rate": 9.761036857027137e-06, + "loss": 0.9597, + "step": 241 + }, + { + "epoch": 0.019604666234607906, + "grad_norm": 0.03880751505494118, + "learning_rate": 9.801539084649656e-06, + "loss": 0.9, + "step": 242 + }, + { + "epoch": 0.019685677252106286, + "grad_norm": 0.03484981507062912, + "learning_rate": 9.842041312272174e-06, + "loss": 0.861, + "step": 243 + }, + { + "epoch": 0.019766688269604666, + "grad_norm": 0.04652680829167366, + "learning_rate": 9.882543539894696e-06, + "loss": 1.046, + "step": 244 + }, + { + "epoch": 0.019847699287103046, + "grad_norm": 0.045620325952768326, + "learning_rate": 9.923045767517214e-06, + "loss": 1.0687, + "step": 245 + }, + { + "epoch": 0.019928710304601426, + "grad_norm": 0.0337100550532341, + "learning_rate": 9.963547995139732e-06, + "loss": 0.8199, + "step": 246 + }, + { + "epoch": 0.020009721322099806, + "grad_norm": 0.044301003217697144, + "learning_rate": 1.0004050222762253e-05, + "loss": 0.9616, + "step": 247 + }, + { + "epoch": 0.020090732339598186, + "grad_norm": 0.042871661484241486, + "learning_rate": 1.0044552450384771e-05, + "loss": 0.9569, + "step": 248 + }, + { + "epoch": 0.020171743357096567, + "grad_norm": 0.043442253023386, + "learning_rate": 1.008505467800729e-05, + "loss": 0.9999, + "step": 249 + }, + { + "epoch": 0.020252754374594947, + "grad_norm": 0.039713114500045776, + "learning_rate": 1.012555690562981e-05, + "loss": 0.8564, + "step": 250 + }, + { + "epoch": 0.020333765392093323, + "grad_norm": 0.047681573778390884, + "learning_rate": 1.016605913325233e-05, + "loss": 1.0182, + "step": 251 + }, + { + "epoch": 0.020414776409591703, + "grad_norm": 0.04213854670524597, + "learning_rate": 1.0206561360874848e-05, + "loss": 0.9129, + "step": 252 + }, + { + "epoch": 0.020495787427090083, + "grad_norm": 0.045345891267061234, + "learning_rate": 1.0247063588497368e-05, + "loss": 0.9895, + "step": 253 + }, + { + "epoch": 0.020576798444588464, + "grad_norm": 0.042612865567207336, + "learning_rate": 1.0287565816119887e-05, + "loss": 0.9485, + "step": 254 + }, + 
{ + "epoch": 0.020657809462086844, + "grad_norm": 0.041252050548791885, + "learning_rate": 1.0328068043742405e-05, + "loss": 0.8881, + "step": 255 + }, + { + "epoch": 0.020738820479585224, + "grad_norm": 0.044315893203020096, + "learning_rate": 1.0368570271364927e-05, + "loss": 0.93, + "step": 256 + }, + { + "epoch": 0.020819831497083604, + "grad_norm": 0.04035898670554161, + "learning_rate": 1.0409072498987445e-05, + "loss": 0.9263, + "step": 257 + }, + { + "epoch": 0.020900842514581984, + "grad_norm": 0.03786478936672211, + "learning_rate": 1.0449574726609964e-05, + "loss": 0.8736, + "step": 258 + }, + { + "epoch": 0.020981853532080364, + "grad_norm": 0.04340916872024536, + "learning_rate": 1.0490076954232484e-05, + "loss": 0.8948, + "step": 259 + }, + { + "epoch": 0.021062864549578744, + "grad_norm": 0.04343588650226593, + "learning_rate": 1.0530579181855002e-05, + "loss": 0.9546, + "step": 260 + }, + { + "epoch": 0.02114387556707712, + "grad_norm": 0.049006178975105286, + "learning_rate": 1.0571081409477522e-05, + "loss": 0.9688, + "step": 261 + }, + { + "epoch": 0.0212248865845755, + "grad_norm": 0.04548350349068642, + "learning_rate": 1.0611583637100041e-05, + "loss": 0.9232, + "step": 262 + }, + { + "epoch": 0.02130589760207388, + "grad_norm": 0.04764221981167793, + "learning_rate": 1.0652085864722561e-05, + "loss": 0.9413, + "step": 263 + }, + { + "epoch": 0.02138690861957226, + "grad_norm": 0.05102779343724251, + "learning_rate": 1.0692588092345079e-05, + "loss": 1.0695, + "step": 264 + }, + { + "epoch": 0.02146791963707064, + "grad_norm": 0.04845573008060455, + "learning_rate": 1.0733090319967599e-05, + "loss": 0.9992, + "step": 265 + }, + { + "epoch": 0.02154893065456902, + "grad_norm": 0.04806605353951454, + "learning_rate": 1.0773592547590118e-05, + "loss": 0.9287, + "step": 266 + }, + { + "epoch": 0.0216299416720674, + "grad_norm": 0.04713521897792816, + "learning_rate": 1.0814094775212636e-05, + "loss": 0.9274, + "step": 267 + }, + { + "epoch": 0.02171095268956578, + "grad_norm": 0.041815537959337234, + "learning_rate": 1.0854597002835156e-05, + "loss": 0.8218, + "step": 268 + }, + { + "epoch": 0.02179196370706416, + "grad_norm": 0.04000954329967499, + "learning_rate": 1.0895099230457676e-05, + "loss": 0.8657, + "step": 269 + }, + { + "epoch": 0.021872974724562542, + "grad_norm": 0.03801000490784645, + "learning_rate": 1.0935601458080195e-05, + "loss": 0.8065, + "step": 270 + }, + { + "epoch": 0.021953985742060922, + "grad_norm": 0.04805195331573486, + "learning_rate": 1.0976103685702713e-05, + "loss": 0.9453, + "step": 271 + }, + { + "epoch": 0.0220349967595593, + "grad_norm": 0.04761458933353424, + "learning_rate": 1.1016605913325233e-05, + "loss": 0.9711, + "step": 272 + }, + { + "epoch": 0.02211600777705768, + "grad_norm": 0.05013057217001915, + "learning_rate": 1.1057108140947753e-05, + "loss": 0.9825, + "step": 273 + }, + { + "epoch": 0.02219701879455606, + "grad_norm": 0.04019589349627495, + "learning_rate": 1.1097610368570272e-05, + "loss": 0.7704, + "step": 274 + }, + { + "epoch": 0.02227802981205444, + "grad_norm": 0.05667426809668541, + "learning_rate": 1.1138112596192792e-05, + "loss": 1.0435, + "step": 275 + }, + { + "epoch": 0.02235904082955282, + "grad_norm": 0.04651724174618721, + "learning_rate": 1.117861482381531e-05, + "loss": 0.8615, + "step": 276 + }, + { + "epoch": 0.0224400518470512, + "grad_norm": 0.04452743008732796, + "learning_rate": 1.121911705143783e-05, + "loss": 0.9388, + "step": 277 + }, + { + "epoch": 0.02252106286454958, + "grad_norm": 
0.04375988990068436, + "learning_rate": 1.125961927906035e-05, + "loss": 0.83, + "step": 278 + }, + { + "epoch": 0.02260207388204796, + "grad_norm": 0.04938081279397011, + "learning_rate": 1.1300121506682867e-05, + "loss": 0.8963, + "step": 279 + }, + { + "epoch": 0.02268308489954634, + "grad_norm": 0.039991993457078934, + "learning_rate": 1.1340623734305387e-05, + "loss": 0.7539, + "step": 280 + }, + { + "epoch": 0.02276409591704472, + "grad_norm": 0.044255875051021576, + "learning_rate": 1.1381125961927907e-05, + "loss": 0.8487, + "step": 281 + }, + { + "epoch": 0.022845106934543096, + "grad_norm": 0.05448516830801964, + "learning_rate": 1.1421628189550426e-05, + "loss": 0.9822, + "step": 282 + }, + { + "epoch": 0.022926117952041476, + "grad_norm": 0.04801035299897194, + "learning_rate": 1.1462130417172944e-05, + "loss": 0.8966, + "step": 283 + }, + { + "epoch": 0.023007128969539856, + "grad_norm": 0.052658502012491226, + "learning_rate": 1.1502632644795466e-05, + "loss": 0.8962, + "step": 284 + }, + { + "epoch": 0.023088139987038236, + "grad_norm": 0.05240850895643234, + "learning_rate": 1.1543134872417984e-05, + "loss": 0.8753, + "step": 285 + }, + { + "epoch": 0.023169151004536617, + "grad_norm": 0.05047140643000603, + "learning_rate": 1.1583637100040502e-05, + "loss": 0.9311, + "step": 286 + }, + { + "epoch": 0.023250162022034997, + "grad_norm": 0.04640672728419304, + "learning_rate": 1.1624139327663023e-05, + "loss": 0.8523, + "step": 287 + }, + { + "epoch": 0.023331173039533377, + "grad_norm": 0.04837445169687271, + "learning_rate": 1.1664641555285541e-05, + "loss": 0.9191, + "step": 288 + }, + { + "epoch": 0.023412184057031757, + "grad_norm": 0.04194364324212074, + "learning_rate": 1.170514378290806e-05, + "loss": 0.8187, + "step": 289 + }, + { + "epoch": 0.023493195074530137, + "grad_norm": 0.04633034020662308, + "learning_rate": 1.174564601053058e-05, + "loss": 0.8054, + "step": 290 + }, + { + "epoch": 0.023574206092028517, + "grad_norm": 0.04540485143661499, + "learning_rate": 1.17861482381531e-05, + "loss": 0.8684, + "step": 291 + }, + { + "epoch": 0.023655217109526897, + "grad_norm": 0.05126497149467468, + "learning_rate": 1.1826650465775618e-05, + "loss": 0.8609, + "step": 292 + }, + { + "epoch": 0.023736228127025274, + "grad_norm": 0.052500225603580475, + "learning_rate": 1.1867152693398138e-05, + "loss": 0.9471, + "step": 293 + }, + { + "epoch": 0.023817239144523654, + "grad_norm": 0.04483199119567871, + "learning_rate": 1.1907654921020657e-05, + "loss": 0.888, + "step": 294 + }, + { + "epoch": 0.023898250162022034, + "grad_norm": 0.04261082038283348, + "learning_rate": 1.1948157148643175e-05, + "loss": 0.7984, + "step": 295 + }, + { + "epoch": 0.023979261179520414, + "grad_norm": 0.04691658914089203, + "learning_rate": 1.1988659376265695e-05, + "loss": 0.9147, + "step": 296 + }, + { + "epoch": 0.024060272197018794, + "grad_norm": 0.039187829941511154, + "learning_rate": 1.2029161603888215e-05, + "loss": 0.787, + "step": 297 + }, + { + "epoch": 0.024141283214517174, + "grad_norm": 0.041846975684165955, + "learning_rate": 1.2069663831510733e-05, + "loss": 0.7899, + "step": 298 + }, + { + "epoch": 0.024222294232015554, + "grad_norm": 0.041736699640750885, + "learning_rate": 1.2110166059133254e-05, + "loss": 0.8537, + "step": 299 + }, + { + "epoch": 0.024303305249513935, + "grad_norm": 0.04296275973320007, + "learning_rate": 1.2150668286755772e-05, + "loss": 0.7563, + "step": 300 + }, + { + "epoch": 0.024384316267012315, + "grad_norm": 0.0476207509636879, + "learning_rate": 
1.2191170514378292e-05, + "loss": 0.8532, + "step": 301 + }, + { + "epoch": 0.024465327284510695, + "grad_norm": 0.0477546826004982, + "learning_rate": 1.2231672742000811e-05, + "loss": 0.8733, + "step": 302 + }, + { + "epoch": 0.024546338302009075, + "grad_norm": 0.05017669498920441, + "learning_rate": 1.2272174969623331e-05, + "loss": 0.9032, + "step": 303 + }, + { + "epoch": 0.02462734931950745, + "grad_norm": 0.048817798495292664, + "learning_rate": 1.2312677197245849e-05, + "loss": 0.8685, + "step": 304 + }, + { + "epoch": 0.02470836033700583, + "grad_norm": 0.046138327568769455, + "learning_rate": 1.2353179424868368e-05, + "loss": 0.884, + "step": 305 + }, + { + "epoch": 0.02478937135450421, + "grad_norm": 0.046168919652700424, + "learning_rate": 1.2393681652490888e-05, + "loss": 0.8913, + "step": 306 + }, + { + "epoch": 0.024870382372002592, + "grad_norm": 0.043708957731723785, + "learning_rate": 1.2434183880113406e-05, + "loss": 0.8526, + "step": 307 + }, + { + "epoch": 0.024951393389500972, + "grad_norm": 0.043650537729263306, + "learning_rate": 1.2474686107735926e-05, + "loss": 0.8083, + "step": 308 + }, + { + "epoch": 0.025032404406999352, + "grad_norm": 0.04286431893706322, + "learning_rate": 1.2515188335358447e-05, + "loss": 0.7903, + "step": 309 + }, + { + "epoch": 0.025113415424497732, + "grad_norm": 0.03427548334002495, + "learning_rate": 1.2555690562980965e-05, + "loss": 0.7333, + "step": 310 + }, + { + "epoch": 0.025194426441996112, + "grad_norm": 0.04491386190056801, + "learning_rate": 1.2596192790603483e-05, + "loss": 0.8457, + "step": 311 + }, + { + "epoch": 0.025275437459494492, + "grad_norm": 0.0491580106317997, + "learning_rate": 1.2636695018226003e-05, + "loss": 0.8856, + "step": 312 + }, + { + "epoch": 0.025356448476992872, + "grad_norm": 0.040008701384067535, + "learning_rate": 1.267719724584852e-05, + "loss": 0.8176, + "step": 313 + }, + { + "epoch": 0.02543745949449125, + "grad_norm": 0.05017147958278656, + "learning_rate": 1.2717699473471042e-05, + "loss": 0.9175, + "step": 314 + }, + { + "epoch": 0.02551847051198963, + "grad_norm": 0.04300964996218681, + "learning_rate": 1.2758201701093562e-05, + "loss": 0.8511, + "step": 315 + }, + { + "epoch": 0.02559948152948801, + "grad_norm": 0.03991328552365303, + "learning_rate": 1.279870392871608e-05, + "loss": 0.7562, + "step": 316 + }, + { + "epoch": 0.02568049254698639, + "grad_norm": 0.04305849224328995, + "learning_rate": 1.2839206156338598e-05, + "loss": 0.8165, + "step": 317 + }, + { + "epoch": 0.02576150356448477, + "grad_norm": 0.04361141473054886, + "learning_rate": 1.2879708383961117e-05, + "loss": 0.8114, + "step": 318 + }, + { + "epoch": 0.02584251458198315, + "grad_norm": 0.04171218350529671, + "learning_rate": 1.2920210611583639e-05, + "loss": 0.8169, + "step": 319 + }, + { + "epoch": 0.02592352559948153, + "grad_norm": 0.034734416753053665, + "learning_rate": 1.2960712839206157e-05, + "loss": 0.7547, + "step": 320 + }, + { + "epoch": 0.02600453661697991, + "grad_norm": 0.04020668566226959, + "learning_rate": 1.3001215066828676e-05, + "loss": 0.829, + "step": 321 + }, + { + "epoch": 0.02608554763447829, + "grad_norm": 0.03808221593499184, + "learning_rate": 1.3041717294451194e-05, + "loss": 0.7703, + "step": 322 + }, + { + "epoch": 0.02616655865197667, + "grad_norm": 0.04488734155893326, + "learning_rate": 1.3082219522073716e-05, + "loss": 0.8324, + "step": 323 + }, + { + "epoch": 0.02624756966947505, + "grad_norm": 0.04443804547190666, + "learning_rate": 1.3122721749696235e-05, + "loss": 0.8446, + 
"step": 324 + }, + { + "epoch": 0.026328580686973427, + "grad_norm": 0.04030577838420868, + "learning_rate": 1.3163223977318753e-05, + "loss": 0.7798, + "step": 325 + }, + { + "epoch": 0.026409591704471807, + "grad_norm": 0.04477348551154137, + "learning_rate": 1.3203726204941271e-05, + "loss": 0.8708, + "step": 326 + }, + { + "epoch": 0.026490602721970187, + "grad_norm": 0.040085140615701675, + "learning_rate": 1.3244228432563791e-05, + "loss": 0.8228, + "step": 327 + }, + { + "epoch": 0.026571613739468567, + "grad_norm": 0.03322353959083557, + "learning_rate": 1.3284730660186312e-05, + "loss": 0.6957, + "step": 328 + }, + { + "epoch": 0.026652624756966947, + "grad_norm": 0.04367142170667648, + "learning_rate": 1.332523288780883e-05, + "loss": 0.8452, + "step": 329 + }, + { + "epoch": 0.026733635774465327, + "grad_norm": 0.03859969228506088, + "learning_rate": 1.336573511543135e-05, + "loss": 0.7426, + "step": 330 + }, + { + "epoch": 0.026814646791963707, + "grad_norm": 0.036489035934209824, + "learning_rate": 1.3406237343053868e-05, + "loss": 0.7271, + "step": 331 + }, + { + "epoch": 0.026895657809462088, + "grad_norm": 0.04094213247299194, + "learning_rate": 1.3446739570676386e-05, + "loss": 0.8086, + "step": 332 + }, + { + "epoch": 0.026976668826960468, + "grad_norm": 0.043221861124038696, + "learning_rate": 1.3487241798298907e-05, + "loss": 0.8558, + "step": 333 + }, + { + "epoch": 0.027057679844458848, + "grad_norm": 0.044967714697122574, + "learning_rate": 1.3527744025921427e-05, + "loss": 0.848, + "step": 334 + }, + { + "epoch": 0.027138690861957228, + "grad_norm": 0.04468412697315216, + "learning_rate": 1.3568246253543945e-05, + "loss": 0.849, + "step": 335 + }, + { + "epoch": 0.027219701879455604, + "grad_norm": 0.04480603709816933, + "learning_rate": 1.3608748481166465e-05, + "loss": 0.9005, + "step": 336 + }, + { + "epoch": 0.027300712896953985, + "grad_norm": 0.03967758268117905, + "learning_rate": 1.3649250708788986e-05, + "loss": 0.8326, + "step": 337 + }, + { + "epoch": 0.027381723914452365, + "grad_norm": 0.04579153284430504, + "learning_rate": 1.3689752936411504e-05, + "loss": 0.8577, + "step": 338 + }, + { + "epoch": 0.027462734931950745, + "grad_norm": 0.03907260298728943, + "learning_rate": 1.3730255164034022e-05, + "loss": 0.811, + "step": 339 + }, + { + "epoch": 0.027543745949449125, + "grad_norm": 0.04092499241232872, + "learning_rate": 1.3770757391656542e-05, + "loss": 0.8279, + "step": 340 + }, + { + "epoch": 0.027624756966947505, + "grad_norm": 0.046532005071640015, + "learning_rate": 1.381125961927906e-05, + "loss": 0.8455, + "step": 341 + }, + { + "epoch": 0.027705767984445885, + "grad_norm": 0.04032495245337486, + "learning_rate": 1.3851761846901581e-05, + "loss": 0.8789, + "step": 342 + }, + { + "epoch": 0.027786779001944265, + "grad_norm": 0.04253386706113815, + "learning_rate": 1.38922640745241e-05, + "loss": 0.8235, + "step": 343 + }, + { + "epoch": 0.027867790019442645, + "grad_norm": 0.04008479788899422, + "learning_rate": 1.3932766302146619e-05, + "loss": 0.8336, + "step": 344 + }, + { + "epoch": 0.027948801036941025, + "grad_norm": 0.04024027660489082, + "learning_rate": 1.3973268529769137e-05, + "loss": 0.7889, + "step": 345 + }, + { + "epoch": 0.028029812054439402, + "grad_norm": 0.04010341316461563, + "learning_rate": 1.4013770757391656e-05, + "loss": 0.7946, + "step": 346 + }, + { + "epoch": 0.028110823071937782, + "grad_norm": 0.03629535436630249, + "learning_rate": 1.4054272985014178e-05, + "loss": 0.7274, + "step": 347 + }, + { + "epoch": 
0.028191834089436162, + "grad_norm": 0.04157916083931923, + "learning_rate": 1.4094775212636696e-05, + "loss": 0.7929, + "step": 348 + }, + { + "epoch": 0.028272845106934542, + "grad_norm": 0.04346117004752159, + "learning_rate": 1.4135277440259215e-05, + "loss": 0.7551, + "step": 349 + }, + { + "epoch": 0.028353856124432922, + "grad_norm": 0.03737987205386162, + "learning_rate": 1.4175779667881733e-05, + "loss": 0.7418, + "step": 350 + }, + { + "epoch": 0.028434867141931303, + "grad_norm": 0.040639728307724, + "learning_rate": 1.4216281895504251e-05, + "loss": 0.7781, + "step": 351 + }, + { + "epoch": 0.028515878159429683, + "grad_norm": 0.04043082147836685, + "learning_rate": 1.4256784123126774e-05, + "loss": 0.8111, + "step": 352 + }, + { + "epoch": 0.028596889176928063, + "grad_norm": 0.041902732104063034, + "learning_rate": 1.4297286350749292e-05, + "loss": 0.8684, + "step": 353 + }, + { + "epoch": 0.028677900194426443, + "grad_norm": 0.04072404280304909, + "learning_rate": 1.433778857837181e-05, + "loss": 0.8067, + "step": 354 + }, + { + "epoch": 0.028758911211924823, + "grad_norm": 0.04357956349849701, + "learning_rate": 1.437829080599433e-05, + "loss": 0.7638, + "step": 355 + }, + { + "epoch": 0.028839922229423203, + "grad_norm": 0.042490243911743164, + "learning_rate": 1.4418793033616851e-05, + "loss": 0.816, + "step": 356 + }, + { + "epoch": 0.02892093324692158, + "grad_norm": 0.04168788343667984, + "learning_rate": 1.445929526123937e-05, + "loss": 0.8139, + "step": 357 + }, + { + "epoch": 0.02900194426441996, + "grad_norm": 0.041793275624513626, + "learning_rate": 1.4499797488861889e-05, + "loss": 0.7767, + "step": 358 + }, + { + "epoch": 0.02908295528191834, + "grad_norm": 0.036548249423503876, + "learning_rate": 1.4540299716484407e-05, + "loss": 0.7338, + "step": 359 + }, + { + "epoch": 0.02916396629941672, + "grad_norm": 0.03943818062543869, + "learning_rate": 1.4580801944106925e-05, + "loss": 0.8113, + "step": 360 + }, + { + "epoch": 0.0292449773169151, + "grad_norm": 0.039916470646858215, + "learning_rate": 1.4621304171729446e-05, + "loss": 0.785, + "step": 361 + }, + { + "epoch": 0.02932598833441348, + "grad_norm": 0.04223480448126793, + "learning_rate": 1.4661806399351966e-05, + "loss": 0.7892, + "step": 362 + }, + { + "epoch": 0.02940699935191186, + "grad_norm": 0.03503810241818428, + "learning_rate": 1.4702308626974484e-05, + "loss": 0.6974, + "step": 363 + }, + { + "epoch": 0.02948801036941024, + "grad_norm": 0.04188969358801842, + "learning_rate": 1.4742810854597004e-05, + "loss": 0.8068, + "step": 364 + }, + { + "epoch": 0.02956902138690862, + "grad_norm": 0.039196666330099106, + "learning_rate": 1.4783313082219522e-05, + "loss": 0.7942, + "step": 365 + }, + { + "epoch": 0.029650032404407, + "grad_norm": 0.03726887330412865, + "learning_rate": 1.4823815309842043e-05, + "loss": 0.7629, + "step": 366 + }, + { + "epoch": 0.02973104342190538, + "grad_norm": 0.04150981456041336, + "learning_rate": 1.486431753746456e-05, + "loss": 0.7888, + "step": 367 + }, + { + "epoch": 0.029812054439403757, + "grad_norm": 0.041392356157302856, + "learning_rate": 1.490481976508708e-05, + "loss": 0.8023, + "step": 368 + }, + { + "epoch": 0.029893065456902138, + "grad_norm": 0.04221396520733833, + "learning_rate": 1.4945321992709598e-05, + "loss": 0.8325, + "step": 369 + }, + { + "epoch": 0.029974076474400518, + "grad_norm": 0.03871150687336922, + "learning_rate": 1.498582422033212e-05, + "loss": 0.7942, + "step": 370 + }, + { + "epoch": 0.030055087491898898, + "grad_norm": 
0.04151451215147972, + "learning_rate": 1.502632644795464e-05, + "loss": 0.8003, + "step": 371 + }, + { + "epoch": 0.030136098509397278, + "grad_norm": 0.03538280725479126, + "learning_rate": 1.5066828675577157e-05, + "loss": 0.7003, + "step": 372 + }, + { + "epoch": 0.030217109526895658, + "grad_norm": 0.03889065608382225, + "learning_rate": 1.5107330903199675e-05, + "loss": 0.7495, + "step": 373 + }, + { + "epoch": 0.030298120544394038, + "grad_norm": 0.030184775590896606, + "learning_rate": 1.5147833130822195e-05, + "loss": 0.6278, + "step": 374 + }, + { + "epoch": 0.030379131561892418, + "grad_norm": 0.04202747344970703, + "learning_rate": 1.5188335358444717e-05, + "loss": 0.8286, + "step": 375 + }, + { + "epoch": 0.0304601425793908, + "grad_norm": 0.04680528864264488, + "learning_rate": 1.5228837586067234e-05, + "loss": 0.867, + "step": 376 + }, + { + "epoch": 0.03054115359688918, + "grad_norm": 0.04027519375085831, + "learning_rate": 1.5269339813689752e-05, + "loss": 0.7637, + "step": 377 + }, + { + "epoch": 0.030622164614387555, + "grad_norm": 0.041619181632995605, + "learning_rate": 1.5309842041312272e-05, + "loss": 0.7716, + "step": 378 + }, + { + "epoch": 0.030703175631885935, + "grad_norm": 0.03543030843138695, + "learning_rate": 1.5350344268934792e-05, + "loss": 0.757, + "step": 379 + }, + { + "epoch": 0.030784186649384315, + "grad_norm": 0.03332729637622833, + "learning_rate": 1.539084649655731e-05, + "loss": 0.6851, + "step": 380 + }, + { + "epoch": 0.030865197666882695, + "grad_norm": 0.03877364099025726, + "learning_rate": 1.543134872417983e-05, + "loss": 0.7643, + "step": 381 + }, + { + "epoch": 0.030946208684381075, + "grad_norm": 0.034454673528671265, + "learning_rate": 1.547185095180235e-05, + "loss": 0.6828, + "step": 382 + }, + { + "epoch": 0.031027219701879456, + "grad_norm": 0.039795245975255966, + "learning_rate": 1.5512353179424867e-05, + "loss": 0.7524, + "step": 383 + }, + { + "epoch": 0.031108230719377836, + "grad_norm": 0.03474024310708046, + "learning_rate": 1.5552855407047387e-05, + "loss": 0.6882, + "step": 384 + }, + { + "epoch": 0.031189241736876216, + "grad_norm": 0.04128913953900337, + "learning_rate": 1.559335763466991e-05, + "loss": 0.7806, + "step": 385 + }, + { + "epoch": 0.031270252754374596, + "grad_norm": 0.03762802109122276, + "learning_rate": 1.5633859862292426e-05, + "loss": 0.7746, + "step": 386 + }, + { + "epoch": 0.03135126377187297, + "grad_norm": 0.03340327739715576, + "learning_rate": 1.5674362089914946e-05, + "loss": 0.7158, + "step": 387 + }, + { + "epoch": 0.031432274789371356, + "grad_norm": 0.036030713468790054, + "learning_rate": 1.5714864317537465e-05, + "loss": 0.6745, + "step": 388 + }, + { + "epoch": 0.03151328580686973, + "grad_norm": 0.04570871591567993, + "learning_rate": 1.5755366545159985e-05, + "loss": 0.8154, + "step": 389 + }, + { + "epoch": 0.031594296824368116, + "grad_norm": 0.04245030879974365, + "learning_rate": 1.5795868772782505e-05, + "loss": 0.7874, + "step": 390 + }, + { + "epoch": 0.03167530784186649, + "grad_norm": 0.0361122190952301, + "learning_rate": 1.5836371000405024e-05, + "loss": 0.7843, + "step": 391 + }, + { + "epoch": 0.031756318859364877, + "grad_norm": 0.04195176810026169, + "learning_rate": 1.587687322802754e-05, + "loss": 0.7749, + "step": 392 + }, + { + "epoch": 0.03183732987686325, + "grad_norm": 0.0431600846350193, + "learning_rate": 1.591737545565006e-05, + "loss": 0.8219, + "step": 393 + }, + { + "epoch": 0.03191834089436163, + "grad_norm": 0.03839649632573128, + "learning_rate": 
1.5957877683272583e-05, + "loss": 0.716, + "step": 394 + }, + { + "epoch": 0.03199935191186001, + "grad_norm": 0.039989449083805084, + "learning_rate": 1.59983799108951e-05, + "loss": 0.7706, + "step": 395 + }, + { + "epoch": 0.03208036292935839, + "grad_norm": 0.04222527891397476, + "learning_rate": 1.603888213851762e-05, + "loss": 0.7206, + "step": 396 + }, + { + "epoch": 0.032161373946856774, + "grad_norm": 0.04178968816995621, + "learning_rate": 1.607938436614014e-05, + "loss": 0.8058, + "step": 397 + }, + { + "epoch": 0.03224238496435515, + "grad_norm": 0.03837030008435249, + "learning_rate": 1.6119886593762655e-05, + "loss": 0.7037, + "step": 398 + }, + { + "epoch": 0.032323395981853534, + "grad_norm": 0.03963199257850647, + "learning_rate": 1.616038882138518e-05, + "loss": 0.7843, + "step": 399 + }, + { + "epoch": 0.03240440699935191, + "grad_norm": 0.03307361900806427, + "learning_rate": 1.6200891049007698e-05, + "loss": 0.6755, + "step": 400 + }, + { + "epoch": 0.032485418016850294, + "grad_norm": 0.03694215416908264, + "learning_rate": 1.6241393276630214e-05, + "loss": 0.7467, + "step": 401 + }, + { + "epoch": 0.03256642903434867, + "grad_norm": 0.041869282722473145, + "learning_rate": 1.6281895504252734e-05, + "loss": 0.7941, + "step": 402 + }, + { + "epoch": 0.032647440051847054, + "grad_norm": 0.043812256306409836, + "learning_rate": 1.6322397731875254e-05, + "loss": 0.7621, + "step": 403 + }, + { + "epoch": 0.03272845106934543, + "grad_norm": 0.0400121733546257, + "learning_rate": 1.6362899959497773e-05, + "loss": 0.718, + "step": 404 + }, + { + "epoch": 0.03280946208684381, + "grad_norm": 0.04324105381965637, + "learning_rate": 1.6403402187120293e-05, + "loss": 0.7901, + "step": 405 + }, + { + "epoch": 0.03289047310434219, + "grad_norm": 0.03996644914150238, + "learning_rate": 1.6443904414742813e-05, + "loss": 0.737, + "step": 406 + }, + { + "epoch": 0.03297148412184057, + "grad_norm": 0.04127480834722519, + "learning_rate": 1.648440664236533e-05, + "loss": 0.74, + "step": 407 + }, + { + "epoch": 0.03305249513933895, + "grad_norm": 0.04006365314126015, + "learning_rate": 1.6524908869987852e-05, + "loss": 0.7189, + "step": 408 + }, + { + "epoch": 0.03313350615683733, + "grad_norm": 0.036010291427373886, + "learning_rate": 1.6565411097610368e-05, + "loss": 0.6999, + "step": 409 + }, + { + "epoch": 0.03321451717433571, + "grad_norm": 0.04041115194559097, + "learning_rate": 1.6605913325232888e-05, + "loss": 0.7223, + "step": 410 + }, + { + "epoch": 0.03329552819183409, + "grad_norm": 0.03830122947692871, + "learning_rate": 1.6646415552855408e-05, + "loss": 0.6951, + "step": 411 + }, + { + "epoch": 0.03337653920933247, + "grad_norm": 0.03762373328208923, + "learning_rate": 1.6686917780477927e-05, + "loss": 0.7426, + "step": 412 + }, + { + "epoch": 0.03345755022683085, + "grad_norm": 0.04263002797961235, + "learning_rate": 1.6727420008100447e-05, + "loss": 0.7428, + "step": 413 + }, + { + "epoch": 0.03353856124432923, + "grad_norm": 0.03778361529111862, + "learning_rate": 1.6767922235722967e-05, + "loss": 0.722, + "step": 414 + }, + { + "epoch": 0.03361957226182761, + "grad_norm": 0.041179753839969635, + "learning_rate": 1.6808424463345483e-05, + "loss": 0.7686, + "step": 415 + }, + { + "epoch": 0.033700583279325985, + "grad_norm": 0.03923264890909195, + "learning_rate": 1.6848926690968003e-05, + "loss": 0.6927, + "step": 416 + }, + { + "epoch": 0.03378159429682437, + "grad_norm": 0.03404265269637108, + "learning_rate": 1.6889428918590526e-05, + "loss": 0.6726, + "step": 417 + }, 
+ { + "epoch": 0.033862605314322745, + "grad_norm": 0.039949920028448105, + "learning_rate": 1.6929931146213042e-05, + "loss": 0.7377, + "step": 418 + }, + { + "epoch": 0.03394361633182113, + "grad_norm": 0.038464561104774475, + "learning_rate": 1.697043337383556e-05, + "loss": 0.7441, + "step": 419 + }, + { + "epoch": 0.034024627349319506, + "grad_norm": 0.037392206490039825, + "learning_rate": 1.701093560145808e-05, + "loss": 0.6999, + "step": 420 + }, + { + "epoch": 0.03410563836681789, + "grad_norm": 0.04498305171728134, + "learning_rate": 1.70514378290806e-05, + "loss": 0.6328, + "step": 421 + }, + { + "epoch": 0.034186649384316266, + "grad_norm": 0.03619861975312233, + "learning_rate": 1.709194005670312e-05, + "loss": 0.6997, + "step": 422 + }, + { + "epoch": 0.03426766040181465, + "grad_norm": 0.0377713143825531, + "learning_rate": 1.713244228432564e-05, + "loss": 0.6791, + "step": 423 + }, + { + "epoch": 0.034348671419313026, + "grad_norm": 0.042271509766578674, + "learning_rate": 1.7172944511948157e-05, + "loss": 0.7267, + "step": 424 + }, + { + "epoch": 0.03442968243681141, + "grad_norm": 0.03935857117176056, + "learning_rate": 1.7213446739570676e-05, + "loss": 0.6378, + "step": 425 + }, + { + "epoch": 0.034510693454309786, + "grad_norm": 0.04291445389389992, + "learning_rate": 1.7253948967193196e-05, + "loss": 0.7408, + "step": 426 + }, + { + "epoch": 0.03459170447180816, + "grad_norm": 0.04915785416960716, + "learning_rate": 1.7294451194815716e-05, + "loss": 0.6971, + "step": 427 + }, + { + "epoch": 0.034672715489306546, + "grad_norm": 0.042455509305000305, + "learning_rate": 1.7334953422438235e-05, + "loss": 0.6855, + "step": 428 + }, + { + "epoch": 0.03475372650680492, + "grad_norm": 0.03784140944480896, + "learning_rate": 1.7375455650060755e-05, + "loss": 0.6268, + "step": 429 + }, + { + "epoch": 0.03483473752430331, + "grad_norm": 0.04316165670752525, + "learning_rate": 1.741595787768327e-05, + "loss": 0.647, + "step": 430 + }, + { + "epoch": 0.03491574854180168, + "grad_norm": 0.04066496342420578, + "learning_rate": 1.745646010530579e-05, + "loss": 0.6596, + "step": 431 + }, + { + "epoch": 0.03499675955930007, + "grad_norm": 0.0413605198264122, + "learning_rate": 1.7496962332928314e-05, + "loss": 0.6529, + "step": 432 + }, + { + "epoch": 0.035077770576798444, + "grad_norm": 0.0477905236184597, + "learning_rate": 1.753746456055083e-05, + "loss": 0.7452, + "step": 433 + }, + { + "epoch": 0.03515878159429683, + "grad_norm": 0.04964848607778549, + "learning_rate": 1.757796678817335e-05, + "loss": 0.707, + "step": 434 + }, + { + "epoch": 0.035239792611795204, + "grad_norm": 0.03816310688853264, + "learning_rate": 1.761846901579587e-05, + "loss": 0.6183, + "step": 435 + }, + { + "epoch": 0.03532080362929359, + "grad_norm": 0.0438305027782917, + "learning_rate": 1.765897124341839e-05, + "loss": 0.6684, + "step": 436 + }, + { + "epoch": 0.035401814646791964, + "grad_norm": 0.042699217796325684, + "learning_rate": 1.769947347104091e-05, + "loss": 0.6744, + "step": 437 + }, + { + "epoch": 0.03548282566429034, + "grad_norm": 0.04368191957473755, + "learning_rate": 1.773997569866343e-05, + "loss": 0.6804, + "step": 438 + }, + { + "epoch": 0.035563836681788724, + "grad_norm": 0.04724517837166786, + "learning_rate": 1.7780477926285945e-05, + "loss": 0.7299, + "step": 439 + }, + { + "epoch": 0.0356448476992871, + "grad_norm": 0.04736173152923584, + "learning_rate": 1.7820980153908464e-05, + "loss": 0.697, + "step": 440 + }, + { + "epoch": 0.035725858716785484, + "grad_norm": 
0.04788941144943237, + "learning_rate": 1.7861482381530988e-05, + "loss": 0.7005, + "step": 441 + }, + { + "epoch": 0.03580686973428386, + "grad_norm": 0.041798755526542664, + "learning_rate": 1.7901984609153504e-05, + "loss": 0.6847, + "step": 442 + }, + { + "epoch": 0.035887880751782245, + "grad_norm": 0.05260884389281273, + "learning_rate": 1.7942486836776023e-05, + "loss": 0.6841, + "step": 443 + }, + { + "epoch": 0.03596889176928062, + "grad_norm": 0.046023592352867126, + "learning_rate": 1.7982989064398543e-05, + "loss": 0.7487, + "step": 444 + }, + { + "epoch": 0.036049902786779005, + "grad_norm": 0.04284362122416496, + "learning_rate": 1.802349129202106e-05, + "loss": 0.6566, + "step": 445 + }, + { + "epoch": 0.03613091380427738, + "grad_norm": 0.04734259471297264, + "learning_rate": 1.8063993519643582e-05, + "loss": 0.7172, + "step": 446 + }, + { + "epoch": 0.03621192482177576, + "grad_norm": 0.03644348680973053, + "learning_rate": 1.8104495747266102e-05, + "loss": 0.6219, + "step": 447 + }, + { + "epoch": 0.03629293583927414, + "grad_norm": 0.04391570761799812, + "learning_rate": 1.814499797488862e-05, + "loss": 0.6723, + "step": 448 + }, + { + "epoch": 0.03637394685677252, + "grad_norm": 0.04355219379067421, + "learning_rate": 1.8185500202511138e-05, + "loss": 0.6287, + "step": 449 + }, + { + "epoch": 0.0364549578742709, + "grad_norm": 0.04065849632024765, + "learning_rate": 1.822600243013366e-05, + "loss": 0.6504, + "step": 450 + }, + { + "epoch": 0.03653596889176928, + "grad_norm": 0.04802323877811432, + "learning_rate": 1.8266504657756177e-05, + "loss": 0.6465, + "step": 451 + }, + { + "epoch": 0.03661697990926766, + "grad_norm": 0.04284946620464325, + "learning_rate": 1.8307006885378697e-05, + "loss": 0.6613, + "step": 452 + }, + { + "epoch": 0.03669799092676604, + "grad_norm": 0.039490021765232086, + "learning_rate": 1.8347509113001217e-05, + "loss": 0.6396, + "step": 453 + }, + { + "epoch": 0.03677900194426442, + "grad_norm": 0.03865380212664604, + "learning_rate": 1.8388011340623733e-05, + "loss": 0.6547, + "step": 454 + }, + { + "epoch": 0.0368600129617628, + "grad_norm": 0.048680275678634644, + "learning_rate": 1.8428513568246256e-05, + "loss": 0.6325, + "step": 455 + }, + { + "epoch": 0.03694102397926118, + "grad_norm": 0.047213200479745865, + "learning_rate": 1.8469015795868776e-05, + "loss": 0.7057, + "step": 456 + }, + { + "epoch": 0.03702203499675956, + "grad_norm": 0.04423484206199646, + "learning_rate": 1.8509518023491292e-05, + "loss": 0.639, + "step": 457 + }, + { + "epoch": 0.037103046014257936, + "grad_norm": 0.04309452697634697, + "learning_rate": 1.8550020251113812e-05, + "loss": 0.5955, + "step": 458 + }, + { + "epoch": 0.03718405703175632, + "grad_norm": 0.04167764261364937, + "learning_rate": 1.859052247873633e-05, + "loss": 0.6394, + "step": 459 + }, + { + "epoch": 0.037265068049254696, + "grad_norm": 0.04568406194448471, + "learning_rate": 1.863102470635885e-05, + "loss": 0.6784, + "step": 460 + }, + { + "epoch": 0.03734607906675308, + "grad_norm": 0.040312498807907104, + "learning_rate": 1.867152693398137e-05, + "loss": 0.6277, + "step": 461 + }, + { + "epoch": 0.037427090084251456, + "grad_norm": 0.04599596560001373, + "learning_rate": 1.871202916160389e-05, + "loss": 0.6869, + "step": 462 + }, + { + "epoch": 0.03750810110174984, + "grad_norm": 0.04221928119659424, + "learning_rate": 1.8752531389226407e-05, + "loss": 0.626, + "step": 463 + }, + { + "epoch": 0.037589112119248216, + "grad_norm": 0.042346443980932236, + "learning_rate": 
1.8793033616848926e-05, + "loss": 0.6522, + "step": 464 + }, + { + "epoch": 0.0376701231367466, + "grad_norm": 0.04198833554983139, + "learning_rate": 1.8833535844471446e-05, + "loss": 0.6207, + "step": 465 + }, + { + "epoch": 0.03775113415424498, + "grad_norm": 0.03679432347416878, + "learning_rate": 1.8874038072093966e-05, + "loss": 0.5898, + "step": 466 + }, + { + "epoch": 0.03783214517174336, + "grad_norm": 0.04612481966614723, + "learning_rate": 1.8914540299716485e-05, + "loss": 0.6411, + "step": 467 + }, + { + "epoch": 0.03791315618924174, + "grad_norm": 0.04428162798285484, + "learning_rate": 1.8955042527339005e-05, + "loss": 0.6973, + "step": 468 + }, + { + "epoch": 0.03799416720674011, + "grad_norm": 0.04129292443394661, + "learning_rate": 1.8995544754961525e-05, + "loss": 0.5986, + "step": 469 + }, + { + "epoch": 0.0380751782242385, + "grad_norm": 0.04522555693984032, + "learning_rate": 1.9036046982584044e-05, + "loss": 0.6681, + "step": 470 + }, + { + "epoch": 0.038156189241736874, + "grad_norm": 0.04063719883561134, + "learning_rate": 1.9076549210206564e-05, + "loss": 0.6732, + "step": 471 + }, + { + "epoch": 0.03823720025923526, + "grad_norm": 0.03796732425689697, + "learning_rate": 1.911705143782908e-05, + "loss": 0.5774, + "step": 472 + }, + { + "epoch": 0.038318211276733634, + "grad_norm": 0.047407280653715134, + "learning_rate": 1.91575536654516e-05, + "loss": 0.6766, + "step": 473 + }, + { + "epoch": 0.03839922229423202, + "grad_norm": 0.042836595326662064, + "learning_rate": 1.919805589307412e-05, + "loss": 0.6184, + "step": 474 + }, + { + "epoch": 0.038480233311730394, + "grad_norm": 0.03992742300033569, + "learning_rate": 1.923855812069664e-05, + "loss": 0.568, + "step": 475 + }, + { + "epoch": 0.03856124432922878, + "grad_norm": 0.046161893755197525, + "learning_rate": 1.927906034831916e-05, + "loss": 0.634, + "step": 476 + }, + { + "epoch": 0.038642255346727154, + "grad_norm": 0.042704734951257706, + "learning_rate": 1.931956257594168e-05, + "loss": 0.6076, + "step": 477 + }, + { + "epoch": 0.03872326636422554, + "grad_norm": 0.04184344783425331, + "learning_rate": 1.9360064803564195e-05, + "loss": 0.5935, + "step": 478 + }, + { + "epoch": 0.038804277381723914, + "grad_norm": 0.04119317978620529, + "learning_rate": 1.9400567031186718e-05, + "loss": 0.6022, + "step": 479 + }, + { + "epoch": 0.03888528839922229, + "grad_norm": 0.04500815272331238, + "learning_rate": 1.9441069258809234e-05, + "loss": 0.6759, + "step": 480 + }, + { + "epoch": 0.038966299416720675, + "grad_norm": 0.04669018089771271, + "learning_rate": 1.9481571486431754e-05, + "loss": 0.5904, + "step": 481 + }, + { + "epoch": 0.03904731043421905, + "grad_norm": 0.07213626056909561, + "learning_rate": 1.9522073714054274e-05, + "loss": 0.7376, + "step": 482 + }, + { + "epoch": 0.039128321451717435, + "grad_norm": 0.04742096737027168, + "learning_rate": 1.9562575941676793e-05, + "loss": 0.5984, + "step": 483 + }, + { + "epoch": 0.03920933246921581, + "grad_norm": 0.03930765762925148, + "learning_rate": 1.9603078169299313e-05, + "loss": 0.6301, + "step": 484 + }, + { + "epoch": 0.039290343486714195, + "grad_norm": 0.049007635563611984, + "learning_rate": 1.9643580396921833e-05, + "loss": 0.6962, + "step": 485 + }, + { + "epoch": 0.03937135450421257, + "grad_norm": 0.049863554537296295, + "learning_rate": 1.968408262454435e-05, + "loss": 0.6641, + "step": 486 + }, + { + "epoch": 0.039452365521710955, + "grad_norm": 0.05560353770852089, + "learning_rate": 1.972458485216687e-05, + "loss": 0.6403, + "step": 487 
+ }, + { + "epoch": 0.03953337653920933, + "grad_norm": 0.04843142628669739, + "learning_rate": 1.976508707978939e-05, + "loss": 0.6101, + "step": 488 + }, + { + "epoch": 0.039614387556707716, + "grad_norm": 0.04507589340209961, + "learning_rate": 1.9805589307411908e-05, + "loss": 0.6062, + "step": 489 + }, + { + "epoch": 0.03969539857420609, + "grad_norm": 0.05050964653491974, + "learning_rate": 1.9846091535034428e-05, + "loss": 0.6674, + "step": 490 + }, + { + "epoch": 0.03977640959170447, + "grad_norm": 0.0509161613881588, + "learning_rate": 1.9886593762656947e-05, + "loss": 0.5875, + "step": 491 + }, + { + "epoch": 0.03985742060920285, + "grad_norm": 0.05824290215969086, + "learning_rate": 1.9927095990279464e-05, + "loss": 0.6179, + "step": 492 + }, + { + "epoch": 0.03993843162670123, + "grad_norm": 0.05500331521034241, + "learning_rate": 1.9967598217901987e-05, + "loss": 0.6085, + "step": 493 + }, + { + "epoch": 0.04001944264419961, + "grad_norm": 0.06655135005712509, + "learning_rate": 2.0008100445524506e-05, + "loss": 0.6541, + "step": 494 + }, + { + "epoch": 0.04010045366169799, + "grad_norm": 0.05266350507736206, + "learning_rate": 2.0048602673147023e-05, + "loss": 0.601, + "step": 495 + }, + { + "epoch": 0.04018146467919637, + "grad_norm": 0.045727360993623734, + "learning_rate": 2.0089104900769542e-05, + "loss": 0.573, + "step": 496 + }, + { + "epoch": 0.04026247569669475, + "grad_norm": 0.0533706471323967, + "learning_rate": 2.0129607128392062e-05, + "loss": 0.5564, + "step": 497 + }, + { + "epoch": 0.04034348671419313, + "grad_norm": 0.06397935003042221, + "learning_rate": 2.017010935601458e-05, + "loss": 0.6389, + "step": 498 + }, + { + "epoch": 0.04042449773169151, + "grad_norm": 0.056940142065286636, + "learning_rate": 2.02106115836371e-05, + "loss": 0.6105, + "step": 499 + }, + { + "epoch": 0.04050550874918989, + "grad_norm": 0.05736251175403595, + "learning_rate": 2.025111381125962e-05, + "loss": 0.5387, + "step": 500 + }, + { + "epoch": 0.04058651976668827, + "grad_norm": 0.058200638741254807, + "learning_rate": 2.0291616038882137e-05, + "loss": 0.6367, + "step": 501 + }, + { + "epoch": 0.040667530784186647, + "grad_norm": 0.05528083071112633, + "learning_rate": 2.033211826650466e-05, + "loss": 0.5223, + "step": 502 + }, + { + "epoch": 0.04074854180168503, + "grad_norm": 0.05075810104608536, + "learning_rate": 2.037262049412718e-05, + "loss": 0.6069, + "step": 503 + }, + { + "epoch": 0.04082955281918341, + "grad_norm": 0.05383017659187317, + "learning_rate": 2.0413122721749696e-05, + "loss": 0.5804, + "step": 504 + }, + { + "epoch": 0.04091056383668179, + "grad_norm": 0.052171312272548676, + "learning_rate": 2.0453624949372216e-05, + "loss": 0.5599, + "step": 505 + }, + { + "epoch": 0.04099157485418017, + "grad_norm": 0.06255053728818893, + "learning_rate": 2.0494127176994735e-05, + "loss": 0.5705, + "step": 506 + }, + { + "epoch": 0.04107258587167855, + "grad_norm": 0.05635403096675873, + "learning_rate": 2.0534629404617255e-05, + "loss": 0.604, + "step": 507 + }, + { + "epoch": 0.04115359688917693, + "grad_norm": 0.05404305458068848, + "learning_rate": 2.0575131632239775e-05, + "loss": 0.62, + "step": 508 + }, + { + "epoch": 0.04123460790667531, + "grad_norm": 0.05946921557188034, + "learning_rate": 2.0615633859862295e-05, + "loss": 0.5949, + "step": 509 + }, + { + "epoch": 0.04131561892417369, + "grad_norm": 0.04885870963335037, + "learning_rate": 2.065613608748481e-05, + "loss": 0.5022, + "step": 510 + }, + { + "epoch": 0.041396629941672064, + "grad_norm": 
0.053143661469221115, + "learning_rate": 2.069663831510733e-05, + "loss": 0.5709, + "step": 511 + }, + { + "epoch": 0.04147764095917045, + "grad_norm": 0.0510605163872242, + "learning_rate": 2.0737140542729854e-05, + "loss": 0.5656, + "step": 512 + }, + { + "epoch": 0.041558651976668824, + "grad_norm": 0.053504448384046555, + "learning_rate": 2.077764277035237e-05, + "loss": 0.5462, + "step": 513 + }, + { + "epoch": 0.04163966299416721, + "grad_norm": 0.06571054458618164, + "learning_rate": 2.081814499797489e-05, + "loss": 0.527, + "step": 514 + }, + { + "epoch": 0.041720674011665584, + "grad_norm": 0.05549018830060959, + "learning_rate": 2.085864722559741e-05, + "loss": 0.5494, + "step": 515 + }, + { + "epoch": 0.04180168502916397, + "grad_norm": 0.05811379849910736, + "learning_rate": 2.089914945321993e-05, + "loss": 0.515, + "step": 516 + }, + { + "epoch": 0.041882696046662345, + "grad_norm": 0.06510355323553085, + "learning_rate": 2.093965168084245e-05, + "loss": 0.588, + "step": 517 + }, + { + "epoch": 0.04196370706416073, + "grad_norm": 0.055551595985889435, + "learning_rate": 2.0980153908464968e-05, + "loss": 0.5697, + "step": 518 + }, + { + "epoch": 0.042044718081659105, + "grad_norm": 0.05914895981550217, + "learning_rate": 2.1020656136087484e-05, + "loss": 0.5064, + "step": 519 + }, + { + "epoch": 0.04212572909915749, + "grad_norm": 0.07579497992992401, + "learning_rate": 2.1061158363710004e-05, + "loss": 0.535, + "step": 520 + }, + { + "epoch": 0.042206740116655865, + "grad_norm": 0.04847273975610733, + "learning_rate": 2.1101660591332527e-05, + "loss": 0.5059, + "step": 521 + }, + { + "epoch": 0.04228775113415424, + "grad_norm": 0.05634573474526405, + "learning_rate": 2.1142162818955043e-05, + "loss": 0.528, + "step": 522 + }, + { + "epoch": 0.042368762151652625, + "grad_norm": 0.0529668815433979, + "learning_rate": 2.1182665046577563e-05, + "loss": 0.5497, + "step": 523 + }, + { + "epoch": 0.042449773169151, + "grad_norm": 0.04280909150838852, + "learning_rate": 2.1223167274200083e-05, + "loss": 0.5208, + "step": 524 + }, + { + "epoch": 0.042530784186649385, + "grad_norm": 0.05029316246509552, + "learning_rate": 2.12636695018226e-05, + "loss": 0.5829, + "step": 525 + }, + { + "epoch": 0.04261179520414776, + "grad_norm": 0.052403755486011505, + "learning_rate": 2.1304171729445122e-05, + "loss": 0.5172, + "step": 526 + }, + { + "epoch": 0.042692806221646146, + "grad_norm": 0.055489446967840195, + "learning_rate": 2.1344673957067642e-05, + "loss": 0.5782, + "step": 527 + }, + { + "epoch": 0.04277381723914452, + "grad_norm": 0.05629388242959976, + "learning_rate": 2.1385176184690158e-05, + "loss": 0.5884, + "step": 528 + }, + { + "epoch": 0.042854828256642906, + "grad_norm": 0.047733061015605927, + "learning_rate": 2.1425678412312678e-05, + "loss": 0.515, + "step": 529 + }, + { + "epoch": 0.04293583927414128, + "grad_norm": 0.053567174822092056, + "learning_rate": 2.1466180639935197e-05, + "loss": 0.5624, + "step": 530 + }, + { + "epoch": 0.043016850291639666, + "grad_norm": 0.05233658850193024, + "learning_rate": 2.1506682867557717e-05, + "loss": 0.5617, + "step": 531 + }, + { + "epoch": 0.04309786130913804, + "grad_norm": 0.04258202761411667, + "learning_rate": 2.1547185095180237e-05, + "loss": 0.4907, + "step": 532 + }, + { + "epoch": 0.04317887232663642, + "grad_norm": 0.04429875314235687, + "learning_rate": 2.1587687322802756e-05, + "loss": 0.5367, + "step": 533 + }, + { + "epoch": 0.0432598833441348, + "grad_norm": 0.04000532627105713, + "learning_rate": 
2.1628189550425273e-05, + "loss": 0.5274, + "step": 534 + }, + { + "epoch": 0.04334089436163318, + "grad_norm": 0.04910857975482941, + "learning_rate": 2.1668691778047796e-05, + "loss": 0.5301, + "step": 535 + }, + { + "epoch": 0.04342190537913156, + "grad_norm": 0.04290639981627464, + "learning_rate": 2.1709194005670312e-05, + "loss": 0.5091, + "step": 536 + }, + { + "epoch": 0.04350291639662994, + "grad_norm": 0.047956433147192, + "learning_rate": 2.174969623329283e-05, + "loss": 0.4973, + "step": 537 + }, + { + "epoch": 0.04358392741412832, + "grad_norm": 0.04619165509939194, + "learning_rate": 2.179019846091535e-05, + "loss": 0.5907, + "step": 538 + }, + { + "epoch": 0.0436649384316267, + "grad_norm": 0.05486949533224106, + "learning_rate": 2.183070068853787e-05, + "loss": 0.5225, + "step": 539 + }, + { + "epoch": 0.043745949449125084, + "grad_norm": 0.04930966719985008, + "learning_rate": 2.187120291616039e-05, + "loss": 0.5284, + "step": 540 + }, + { + "epoch": 0.04382696046662346, + "grad_norm": 0.06125545874238014, + "learning_rate": 2.191170514378291e-05, + "loss": 0.5276, + "step": 541 + }, + { + "epoch": 0.043907971484121844, + "grad_norm": 0.048369333148002625, + "learning_rate": 2.1952207371405427e-05, + "loss": 0.5033, + "step": 542 + }, + { + "epoch": 0.04398898250162022, + "grad_norm": 0.04628164693713188, + "learning_rate": 2.1992709599027946e-05, + "loss": 0.4926, + "step": 543 + }, + { + "epoch": 0.0440699935191186, + "grad_norm": 0.06109035015106201, + "learning_rate": 2.2033211826650466e-05, + "loss": 0.5382, + "step": 544 + }, + { + "epoch": 0.04415100453661698, + "grad_norm": 0.04685697704553604, + "learning_rate": 2.2073714054272986e-05, + "loss": 0.5546, + "step": 545 + }, + { + "epoch": 0.04423201555411536, + "grad_norm": 0.06735414266586304, + "learning_rate": 2.2114216281895505e-05, + "loss": 0.5866, + "step": 546 + }, + { + "epoch": 0.04431302657161374, + "grad_norm": 0.050085365772247314, + "learning_rate": 2.2154718509518025e-05, + "loss": 0.5075, + "step": 547 + }, + { + "epoch": 0.04439403758911212, + "grad_norm": 0.04673994705080986, + "learning_rate": 2.2195220737140545e-05, + "loss": 0.5104, + "step": 548 + }, + { + "epoch": 0.0444750486066105, + "grad_norm": 0.05349628999829292, + "learning_rate": 2.2235722964763064e-05, + "loss": 0.4982, + "step": 549 + }, + { + "epoch": 0.04455605962410888, + "grad_norm": 0.05613791197538376, + "learning_rate": 2.2276225192385584e-05, + "loss": 0.5439, + "step": 550 + }, + { + "epoch": 0.04463707064160726, + "grad_norm": 0.04818318039178848, + "learning_rate": 2.23167274200081e-05, + "loss": 0.5558, + "step": 551 + }, + { + "epoch": 0.04471808165910564, + "grad_norm": 0.047487739473581314, + "learning_rate": 2.235722964763062e-05, + "loss": 0.5025, + "step": 552 + }, + { + "epoch": 0.04479909267660402, + "grad_norm": 0.0547555573284626, + "learning_rate": 2.239773187525314e-05, + "loss": 0.4781, + "step": 553 + }, + { + "epoch": 0.0448801036941024, + "grad_norm": 0.05158519372344017, + "learning_rate": 2.243823410287566e-05, + "loss": 0.5309, + "step": 554 + }, + { + "epoch": 0.044961114711600775, + "grad_norm": 0.04126646742224693, + "learning_rate": 2.247873633049818e-05, + "loss": 0.4664, + "step": 555 + }, + { + "epoch": 0.04504212572909916, + "grad_norm": 0.06769046187400818, + "learning_rate": 2.25192385581207e-05, + "loss": 0.5093, + "step": 556 + }, + { + "epoch": 0.045123136746597535, + "grad_norm": 0.050162386149168015, + "learning_rate": 2.2559740785743215e-05, + "loss": 0.4988, + "step": 557 + }, + { + 
"epoch": 0.04520414776409592, + "grad_norm": 0.041536845266819, + "learning_rate": 2.2600243013365735e-05, + "loss": 0.5398, + "step": 558 + }, + { + "epoch": 0.045285158781594295, + "grad_norm": 0.0459623746573925, + "learning_rate": 2.2640745240988258e-05, + "loss": 0.4807, + "step": 559 + }, + { + "epoch": 0.04536616979909268, + "grad_norm": 0.044929806143045425, + "learning_rate": 2.2681247468610774e-05, + "loss": 0.483, + "step": 560 + }, + { + "epoch": 0.045447180816591055, + "grad_norm": 0.04633745923638344, + "learning_rate": 2.2721749696233294e-05, + "loss": 0.4784, + "step": 561 + }, + { + "epoch": 0.04552819183408944, + "grad_norm": 0.05819401517510414, + "learning_rate": 2.2762251923855813e-05, + "loss": 0.5187, + "step": 562 + }, + { + "epoch": 0.045609202851587816, + "grad_norm": 0.04854605346918106, + "learning_rate": 2.280275415147833e-05, + "loss": 0.5054, + "step": 563 + }, + { + "epoch": 0.04569021386908619, + "grad_norm": 0.05877658352255821, + "learning_rate": 2.2843256379100853e-05, + "loss": 0.4914, + "step": 564 + }, + { + "epoch": 0.045771224886584576, + "grad_norm": 0.04236457124352455, + "learning_rate": 2.2883758606723372e-05, + "loss": 0.4927, + "step": 565 + }, + { + "epoch": 0.04585223590408295, + "grad_norm": 0.04844123125076294, + "learning_rate": 2.292426083434589e-05, + "loss": 0.5393, + "step": 566 + }, + { + "epoch": 0.045933246921581336, + "grad_norm": 0.0493597574532032, + "learning_rate": 2.2964763061968408e-05, + "loss": 0.543, + "step": 567 + }, + { + "epoch": 0.04601425793907971, + "grad_norm": 0.044169507920742035, + "learning_rate": 2.300526528959093e-05, + "loss": 0.5062, + "step": 568 + }, + { + "epoch": 0.046095268956578096, + "grad_norm": 0.05171835795044899, + "learning_rate": 2.3045767517213448e-05, + "loss": 0.5232, + "step": 569 + }, + { + "epoch": 0.04617627997407647, + "grad_norm": 0.04781988635659218, + "learning_rate": 2.3086269744835967e-05, + "loss": 0.4988, + "step": 570 + }, + { + "epoch": 0.046257290991574856, + "grad_norm": 0.05599427595734596, + "learning_rate": 2.3126771972458487e-05, + "loss": 0.5655, + "step": 571 + }, + { + "epoch": 0.04633830200907323, + "grad_norm": 0.04716041311621666, + "learning_rate": 2.3167274200081003e-05, + "loss": 0.4983, + "step": 572 + }, + { + "epoch": 0.04641931302657162, + "grad_norm": 0.04140276089310646, + "learning_rate": 2.3207776427703526e-05, + "loss": 0.4953, + "step": 573 + }, + { + "epoch": 0.04650032404406999, + "grad_norm": 0.04450197145342827, + "learning_rate": 2.3248278655326046e-05, + "loss": 0.4608, + "step": 574 + }, + { + "epoch": 0.04658133506156837, + "grad_norm": 0.047172173857688904, + "learning_rate": 2.3288780882948562e-05, + "loss": 0.538, + "step": 575 + }, + { + "epoch": 0.046662346079066754, + "grad_norm": 0.06201828271150589, + "learning_rate": 2.3329283110571082e-05, + "loss": 0.5433, + "step": 576 + }, + { + "epoch": 0.04674335709656513, + "grad_norm": 0.05522223934531212, + "learning_rate": 2.33697853381936e-05, + "loss": 0.5024, + "step": 577 + }, + { + "epoch": 0.046824368114063514, + "grad_norm": 0.08238446712493896, + "learning_rate": 2.341028756581612e-05, + "loss": 0.5263, + "step": 578 + }, + { + "epoch": 0.04690537913156189, + "grad_norm": 0.07169990241527557, + "learning_rate": 2.345078979343864e-05, + "loss": 0.4711, + "step": 579 + }, + { + "epoch": 0.046986390149060274, + "grad_norm": 0.04631124436855316, + "learning_rate": 2.349129202106116e-05, + "loss": 0.5079, + "step": 580 + }, + { + "epoch": 0.04706740116655865, + "grad_norm": 
0.05277573689818382, + "learning_rate": 2.3531794248683677e-05, + "loss": 0.5469, + "step": 581 + }, + { + "epoch": 0.047148412184057034, + "grad_norm": 0.0452033132314682, + "learning_rate": 2.35722964763062e-05, + "loss": 0.4745, + "step": 582 + }, + { + "epoch": 0.04722942320155541, + "grad_norm": 0.050169870257377625, + "learning_rate": 2.361279870392872e-05, + "loss": 0.4768, + "step": 583 + }, + { + "epoch": 0.047310434219053794, + "grad_norm": 0.04421820119023323, + "learning_rate": 2.3653300931551236e-05, + "loss": 0.4852, + "step": 584 + }, + { + "epoch": 0.04739144523655217, + "grad_norm": 0.04202679544687271, + "learning_rate": 2.3693803159173755e-05, + "loss": 0.4853, + "step": 585 + }, + { + "epoch": 0.04747245625405055, + "grad_norm": 0.07291880995035172, + "learning_rate": 2.3734305386796275e-05, + "loss": 0.5294, + "step": 586 + }, + { + "epoch": 0.04755346727154893, + "grad_norm": 0.05519851669669151, + "learning_rate": 2.3774807614418795e-05, + "loss": 0.505, + "step": 587 + }, + { + "epoch": 0.04763447828904731, + "grad_norm": 0.04110843688249588, + "learning_rate": 2.3815309842041314e-05, + "loss": 0.5181, + "step": 588 + }, + { + "epoch": 0.04771548930654569, + "grad_norm": 0.044017914682626724, + "learning_rate": 2.3855812069663834e-05, + "loss": 0.5272, + "step": 589 + }, + { + "epoch": 0.04779650032404407, + "grad_norm": 0.07405832409858704, + "learning_rate": 2.389631429728635e-05, + "loss": 0.4221, + "step": 590 + }, + { + "epoch": 0.04787751134154245, + "grad_norm": 0.05474500358104706, + "learning_rate": 2.393681652490887e-05, + "loss": 0.4942, + "step": 591 + }, + { + "epoch": 0.04795852235904083, + "grad_norm": 0.04653024300932884, + "learning_rate": 2.397731875253139e-05, + "loss": 0.4566, + "step": 592 + }, + { + "epoch": 0.04803953337653921, + "grad_norm": 0.052719101309776306, + "learning_rate": 2.401782098015391e-05, + "loss": 0.4965, + "step": 593 + }, + { + "epoch": 0.04812054439403759, + "grad_norm": 0.05099957436323166, + "learning_rate": 2.405832320777643e-05, + "loss": 0.4876, + "step": 594 + }, + { + "epoch": 0.04820155541153597, + "grad_norm": 0.061169691383838654, + "learning_rate": 2.409882543539895e-05, + "loss": 0.4586, + "step": 595 + }, + { + "epoch": 0.04828256642903435, + "grad_norm": 0.06282297521829605, + "learning_rate": 2.4139327663021465e-05, + "loss": 0.4801, + "step": 596 + }, + { + "epoch": 0.048363577446532725, + "grad_norm": 0.05508417636156082, + "learning_rate": 2.4179829890643988e-05, + "loss": 0.5072, + "step": 597 + }, + { + "epoch": 0.04844458846403111, + "grad_norm": 0.052037712186574936, + "learning_rate": 2.4220332118266508e-05, + "loss": 0.528, + "step": 598 + }, + { + "epoch": 0.048525599481529486, + "grad_norm": 0.06128312274813652, + "learning_rate": 2.4260834345889024e-05, + "loss": 0.4564, + "step": 599 + }, + { + "epoch": 0.04860661049902787, + "grad_norm": 0.04434271529316902, + "learning_rate": 2.4301336573511544e-05, + "loss": 0.4989, + "step": 600 + }, + { + "epoch": 0.048687621516526246, + "grad_norm": 0.05338647589087486, + "learning_rate": 2.4341838801134063e-05, + "loss": 0.5295, + "step": 601 + }, + { + "epoch": 0.04876863253402463, + "grad_norm": 0.051930706948041916, + "learning_rate": 2.4382341028756583e-05, + "loss": 0.4958, + "step": 602 + }, + { + "epoch": 0.048849643551523006, + "grad_norm": 0.06045117601752281, + "learning_rate": 2.4422843256379103e-05, + "loss": 0.4752, + "step": 603 + }, + { + "epoch": 0.04893065456902139, + "grad_norm": 0.050316449254751205, + "learning_rate": 
2.4463345484001622e-05, + "loss": 0.4702, + "step": 604 + }, + { + "epoch": 0.049011665586519766, + "grad_norm": 0.044821321964263916, + "learning_rate": 2.450384771162414e-05, + "loss": 0.452, + "step": 605 + }, + { + "epoch": 0.04909267660401815, + "grad_norm": 0.07491052150726318, + "learning_rate": 2.4544349939246662e-05, + "loss": 0.5094, + "step": 606 + }, + { + "epoch": 0.049173687621516526, + "grad_norm": 0.05308236926794052, + "learning_rate": 2.4584852166869178e-05, + "loss": 0.4962, + "step": 607 + }, + { + "epoch": 0.0492546986390149, + "grad_norm": 0.04302395507693291, + "learning_rate": 2.4625354394491698e-05, + "loss": 0.4967, + "step": 608 + }, + { + "epoch": 0.04933570965651329, + "grad_norm": 0.04931635782122612, + "learning_rate": 2.4665856622114217e-05, + "loss": 0.4631, + "step": 609 + }, + { + "epoch": 0.04941672067401166, + "grad_norm": 0.04895971342921257, + "learning_rate": 2.4706358849736737e-05, + "loss": 0.4652, + "step": 610 + }, + { + "epoch": 0.04949773169151005, + "grad_norm": 0.046912480145692825, + "learning_rate": 2.4746861077359257e-05, + "loss": 0.5166, + "step": 611 + }, + { + "epoch": 0.04957874270900842, + "grad_norm": 0.0499003529548645, + "learning_rate": 2.4787363304981776e-05, + "loss": 0.5191, + "step": 612 + }, + { + "epoch": 0.04965975372650681, + "grad_norm": 0.05203314870595932, + "learning_rate": 2.4827865532604293e-05, + "loss": 0.477, + "step": 613 + }, + { + "epoch": 0.049740764744005184, + "grad_norm": 0.07039269059896469, + "learning_rate": 2.4868367760226812e-05, + "loss": 0.5298, + "step": 614 + }, + { + "epoch": 0.04982177576150357, + "grad_norm": 0.05096956714987755, + "learning_rate": 2.4908869987849335e-05, + "loss": 0.4775, + "step": 615 + }, + { + "epoch": 0.049902786779001944, + "grad_norm": 0.05297008156776428, + "learning_rate": 2.494937221547185e-05, + "loss": 0.5042, + "step": 616 + }, + { + "epoch": 0.04998379779650033, + "grad_norm": 0.046072036027908325, + "learning_rate": 2.498987444309437e-05, + "loss": 0.4911, + "step": 617 + }, + { + "epoch": 0.050064808813998704, + "grad_norm": 0.044839046895504, + "learning_rate": 2.5030376670716894e-05, + "loss": 0.4661, + "step": 618 + }, + { + "epoch": 0.05014581983149708, + "grad_norm": 0.06291180104017258, + "learning_rate": 2.507087889833941e-05, + "loss": 0.497, + "step": 619 + }, + { + "epoch": 0.050226830848995464, + "grad_norm": 0.049912694841623306, + "learning_rate": 2.511138112596193e-05, + "loss": 0.4384, + "step": 620 + }, + { + "epoch": 0.05030784186649384, + "grad_norm": 0.0411076582968235, + "learning_rate": 2.515188335358445e-05, + "loss": 0.5415, + "step": 621 + }, + { + "epoch": 0.050388852883992225, + "grad_norm": 0.046500127762556076, + "learning_rate": 2.5192385581206966e-05, + "loss": 0.4896, + "step": 622 + }, + { + "epoch": 0.0504698639014906, + "grad_norm": 0.05132298171520233, + "learning_rate": 2.5232887808829486e-05, + "loss": 0.4807, + "step": 623 + }, + { + "epoch": 0.050550874918988985, + "grad_norm": 0.05340069532394409, + "learning_rate": 2.5273390036452006e-05, + "loss": 0.4442, + "step": 624 + }, + { + "epoch": 0.05063188593648736, + "grad_norm": 0.05164061486721039, + "learning_rate": 2.5313892264074522e-05, + "loss": 0.4896, + "step": 625 + }, + { + "epoch": 0.050712896953985745, + "grad_norm": 0.05592244863510132, + "learning_rate": 2.535439449169704e-05, + "loss": 0.5025, + "step": 626 + }, + { + "epoch": 0.05079390797148412, + "grad_norm": 0.050286903977394104, + "learning_rate": 2.5394896719319568e-05, + "loss": 0.4409, + "step": 627 + 
}, + { + "epoch": 0.0508749189889825, + "grad_norm": 0.06757903099060059, + "learning_rate": 2.5435398946942084e-05, + "loss": 0.4934, + "step": 628 + }, + { + "epoch": 0.05095593000648088, + "grad_norm": 0.05722956359386444, + "learning_rate": 2.5475901174564604e-05, + "loss": 0.501, + "step": 629 + }, + { + "epoch": 0.05103694102397926, + "grad_norm": 0.05500578135251999, + "learning_rate": 2.5516403402187124e-05, + "loss": 0.488, + "step": 630 + }, + { + "epoch": 0.05111795204147764, + "grad_norm": 0.06366024166345596, + "learning_rate": 2.555690562980964e-05, + "loss": 0.5071, + "step": 631 + }, + { + "epoch": 0.05119896305897602, + "grad_norm": 0.050802573561668396, + "learning_rate": 2.559740785743216e-05, + "loss": 0.5274, + "step": 632 + }, + { + "epoch": 0.0512799740764744, + "grad_norm": 0.070594422519207, + "learning_rate": 2.563791008505468e-05, + "loss": 0.5142, + "step": 633 + }, + { + "epoch": 0.05136098509397278, + "grad_norm": 0.05826390162110329, + "learning_rate": 2.5678412312677195e-05, + "loss": 0.5286, + "step": 634 + }, + { + "epoch": 0.05144199611147116, + "grad_norm": 0.046842195093631744, + "learning_rate": 2.5718914540299715e-05, + "loss": 0.48, + "step": 635 + }, + { + "epoch": 0.05152300712896954, + "grad_norm": 0.08180008828639984, + "learning_rate": 2.5759416767922235e-05, + "loss": 0.5028, + "step": 636 + }, + { + "epoch": 0.05160401814646792, + "grad_norm": 0.05418627709150314, + "learning_rate": 2.5799918995544758e-05, + "loss": 0.4879, + "step": 637 + }, + { + "epoch": 0.0516850291639663, + "grad_norm": 0.053439754992723465, + "learning_rate": 2.5840421223167278e-05, + "loss": 0.4461, + "step": 638 + }, + { + "epoch": 0.051766040181464676, + "grad_norm": 0.04994548112154007, + "learning_rate": 2.5880923450789797e-05, + "loss": 0.4158, + "step": 639 + }, + { + "epoch": 0.05184705119896306, + "grad_norm": 0.052046455442905426, + "learning_rate": 2.5921425678412313e-05, + "loss": 0.4741, + "step": 640 + }, + { + "epoch": 0.051928062216461436, + "grad_norm": 0.0547301284968853, + "learning_rate": 2.5961927906034833e-05, + "loss": 0.4777, + "step": 641 + }, + { + "epoch": 0.05200907323395982, + "grad_norm": 0.06931430846452713, + "learning_rate": 2.6002430133657353e-05, + "loss": 0.4783, + "step": 642 + }, + { + "epoch": 0.052090084251458196, + "grad_norm": 0.06401245296001434, + "learning_rate": 2.604293236127987e-05, + "loss": 0.5112, + "step": 643 + }, + { + "epoch": 0.05217109526895658, + "grad_norm": 0.054302409291267395, + "learning_rate": 2.608343458890239e-05, + "loss": 0.4491, + "step": 644 + }, + { + "epoch": 0.052252106286454957, + "grad_norm": 0.07285508513450623, + "learning_rate": 2.612393681652491e-05, + "loss": 0.4729, + "step": 645 + }, + { + "epoch": 0.05233311730395334, + "grad_norm": 0.050434838980436325, + "learning_rate": 2.616443904414743e-05, + "loss": 0.4898, + "step": 646 + }, + { + "epoch": 0.05241412832145172, + "grad_norm": 0.07490405440330505, + "learning_rate": 2.620494127176995e-05, + "loss": 0.4827, + "step": 647 + }, + { + "epoch": 0.0524951393389501, + "grad_norm": 0.0542641319334507, + "learning_rate": 2.624544349939247e-05, + "loss": 0.4467, + "step": 648 + }, + { + "epoch": 0.05257615035644848, + "grad_norm": 0.05059095472097397, + "learning_rate": 2.6285945727014987e-05, + "loss": 0.4108, + "step": 649 + }, + { + "epoch": 0.052657161373946854, + "grad_norm": 0.06154976785182953, + "learning_rate": 2.6326447954637507e-05, + "loss": 0.4863, + "step": 650 + }, + { + "epoch": 0.05273817239144524, + "grad_norm": 
0.05898062512278557, + "learning_rate": 2.6366950182260026e-05, + "loss": 0.44, + "step": 651 + }, + { + "epoch": 0.052819183408943614, + "grad_norm": 0.06105419248342514, + "learning_rate": 2.6407452409882543e-05, + "loss": 0.4522, + "step": 652 + }, + { + "epoch": 0.052900194426442, + "grad_norm": 0.07445076107978821, + "learning_rate": 2.6447954637505062e-05, + "loss": 0.4753, + "step": 653 + }, + { + "epoch": 0.052981205443940374, + "grad_norm": 0.08417477458715439, + "learning_rate": 2.6488456865127582e-05, + "loss": 0.4927, + "step": 654 + }, + { + "epoch": 0.05306221646143876, + "grad_norm": 0.04969826713204384, + "learning_rate": 2.6528959092750105e-05, + "loss": 0.4787, + "step": 655 + }, + { + "epoch": 0.053143227478937134, + "grad_norm": 0.059925880283117294, + "learning_rate": 2.6569461320372625e-05, + "loss": 0.4539, + "step": 656 + }, + { + "epoch": 0.05322423849643552, + "grad_norm": 0.0699731782078743, + "learning_rate": 2.660996354799514e-05, + "loss": 0.4452, + "step": 657 + }, + { + "epoch": 0.053305249513933894, + "grad_norm": 0.06968280673027039, + "learning_rate": 2.665046577561766e-05, + "loss": 0.4414, + "step": 658 + }, + { + "epoch": 0.05338626053143228, + "grad_norm": 0.052930060774087906, + "learning_rate": 2.669096800324018e-05, + "loss": 0.5047, + "step": 659 + }, + { + "epoch": 0.053467271548930655, + "grad_norm": 0.06393340229988098, + "learning_rate": 2.67314702308627e-05, + "loss": 0.5045, + "step": 660 + }, + { + "epoch": 0.05354828256642903, + "grad_norm": 0.08847188204526901, + "learning_rate": 2.6771972458485216e-05, + "loss": 0.4623, + "step": 661 + }, + { + "epoch": 0.053629293583927415, + "grad_norm": 0.037499427795410156, + "learning_rate": 2.6812474686107736e-05, + "loss": 0.3986, + "step": 662 + }, + { + "epoch": 0.05371030460142579, + "grad_norm": 0.10235746949911118, + "learning_rate": 2.6852976913730256e-05, + "loss": 0.5048, + "step": 663 + }, + { + "epoch": 0.053791315618924175, + "grad_norm": 0.07281772047281265, + "learning_rate": 2.6893479141352772e-05, + "loss": 0.4631, + "step": 664 + }, + { + "epoch": 0.05387232663642255, + "grad_norm": 0.06468793749809265, + "learning_rate": 2.69339813689753e-05, + "loss": 0.4814, + "step": 665 + }, + { + "epoch": 0.053953337653920935, + "grad_norm": 0.058206070214509964, + "learning_rate": 2.6974483596597815e-05, + "loss": 0.4306, + "step": 666 + }, + { + "epoch": 0.05403434867141931, + "grad_norm": 0.05109895393252373, + "learning_rate": 2.7014985824220334e-05, + "loss": 0.4726, + "step": 667 + }, + { + "epoch": 0.054115359688917695, + "grad_norm": 0.06766542047262192, + "learning_rate": 2.7055488051842854e-05, + "loss": 0.4929, + "step": 668 + }, + { + "epoch": 0.05419637070641607, + "grad_norm": 0.06401512026786804, + "learning_rate": 2.709599027946537e-05, + "loss": 0.4568, + "step": 669 + }, + { + "epoch": 0.054277381723914456, + "grad_norm": 0.09650994092226028, + "learning_rate": 2.713649250708789e-05, + "loss": 0.4668, + "step": 670 + }, + { + "epoch": 0.05435839274141283, + "grad_norm": 0.060593217611312866, + "learning_rate": 2.717699473471041e-05, + "loss": 0.4951, + "step": 671 + }, + { + "epoch": 0.05443940375891121, + "grad_norm": 0.0636834129691124, + "learning_rate": 2.721749696233293e-05, + "loss": 0.5109, + "step": 672 + }, + { + "epoch": 0.05452041477640959, + "grad_norm": 0.07222723215818405, + "learning_rate": 2.7257999189955446e-05, + "loss": 0.4741, + "step": 673 + }, + { + "epoch": 0.05460142579390797, + "grad_norm": 0.04111061245203018, + "learning_rate": 
2.7298501417577972e-05, + "loss": 0.4789, + "step": 674 + }, + { + "epoch": 0.05468243681140635, + "grad_norm": 0.07527043670415878, + "learning_rate": 2.733900364520049e-05, + "loss": 0.4688, + "step": 675 + }, + { + "epoch": 0.05476344782890473, + "grad_norm": 0.058275461196899414, + "learning_rate": 2.7379505872823008e-05, + "loss": 0.4358, + "step": 676 + }, + { + "epoch": 0.05484445884640311, + "grad_norm": 0.07854972034692764, + "learning_rate": 2.7420008100445528e-05, + "loss": 0.4898, + "step": 677 + }, + { + "epoch": 0.05492546986390149, + "grad_norm": 0.06151697412133217, + "learning_rate": 2.7460510328068044e-05, + "loss": 0.4815, + "step": 678 + }, + { + "epoch": 0.05500648088139987, + "grad_norm": 0.06732051074504852, + "learning_rate": 2.7501012555690564e-05, + "loss": 0.4921, + "step": 679 + }, + { + "epoch": 0.05508749189889825, + "grad_norm": 0.05637775734066963, + "learning_rate": 2.7541514783313083e-05, + "loss": 0.481, + "step": 680 + }, + { + "epoch": 0.05516850291639663, + "grad_norm": 0.050734519958496094, + "learning_rate": 2.7582017010935603e-05, + "loss": 0.4436, + "step": 681 + }, + { + "epoch": 0.05524951393389501, + "grad_norm": 0.05997519567608833, + "learning_rate": 2.762251923855812e-05, + "loss": 0.4468, + "step": 682 + }, + { + "epoch": 0.05533052495139339, + "grad_norm": 0.07706957310438156, + "learning_rate": 2.766302146618064e-05, + "loss": 0.4777, + "step": 683 + }, + { + "epoch": 0.05541153596889177, + "grad_norm": 0.10084830969572067, + "learning_rate": 2.7703523693803162e-05, + "loss": 0.4774, + "step": 684 + }, + { + "epoch": 0.05549254698639015, + "grad_norm": 0.08028780668973923, + "learning_rate": 2.774402592142568e-05, + "loss": 0.441, + "step": 685 + }, + { + "epoch": 0.05557355800388853, + "grad_norm": 0.09915050864219666, + "learning_rate": 2.77845281490482e-05, + "loss": 0.3905, + "step": 686 + }, + { + "epoch": 0.05565456902138691, + "grad_norm": 0.05721239000558853, + "learning_rate": 2.7825030376670718e-05, + "loss": 0.48, + "step": 687 + }, + { + "epoch": 0.05573558003888529, + "grad_norm": 0.07557759433984756, + "learning_rate": 2.7865532604293237e-05, + "loss": 0.4644, + "step": 688 + }, + { + "epoch": 0.05581659105638367, + "grad_norm": 0.05306036397814751, + "learning_rate": 2.7906034831915757e-05, + "loss": 0.4579, + "step": 689 + }, + { + "epoch": 0.05589760207388205, + "grad_norm": 0.04879166930913925, + "learning_rate": 2.7946537059538273e-05, + "loss": 0.4459, + "step": 690 + }, + { + "epoch": 0.05597861309138043, + "grad_norm": 0.06670980900526047, + "learning_rate": 2.7987039287160793e-05, + "loss": 0.4797, + "step": 691 + }, + { + "epoch": 0.056059624108878804, + "grad_norm": 0.06885267049074173, + "learning_rate": 2.8027541514783313e-05, + "loss": 0.4272, + "step": 692 + }, + { + "epoch": 0.05614063512637719, + "grad_norm": 0.06158503517508507, + "learning_rate": 2.8068043742405836e-05, + "loss": 0.4645, + "step": 693 + }, + { + "epoch": 0.056221646143875564, + "grad_norm": 0.05785226821899414, + "learning_rate": 2.8108545970028355e-05, + "loss": 0.4371, + "step": 694 + }, + { + "epoch": 0.05630265716137395, + "grad_norm": 0.06631193310022354, + "learning_rate": 2.8149048197650875e-05, + "loss": 0.4346, + "step": 695 + }, + { + "epoch": 0.056383668178872325, + "grad_norm": 0.04557311534881592, + "learning_rate": 2.818955042527339e-05, + "loss": 0.3764, + "step": 696 + }, + { + "epoch": 0.05646467919637071, + "grad_norm": 0.07891491055488586, + "learning_rate": 2.823005265289591e-05, + "loss": 0.5242, + "step": 697 + }, + { 
+ "epoch": 0.056545690213869085, + "grad_norm": 0.04715527594089508, + "learning_rate": 2.827055488051843e-05, + "loss": 0.4351, + "step": 698 + }, + { + "epoch": 0.05662670123136747, + "grad_norm": 0.07520975172519684, + "learning_rate": 2.8311057108140947e-05, + "loss": 0.4603, + "step": 699 + }, + { + "epoch": 0.056707712248865845, + "grad_norm": 0.0787554606795311, + "learning_rate": 2.8351559335763467e-05, + "loss": 0.495, + "step": 700 + }, + { + "epoch": 0.05678872326636423, + "grad_norm": 0.07464078068733215, + "learning_rate": 2.8392061563385986e-05, + "loss": 0.495, + "step": 701 + }, + { + "epoch": 0.056869734283862605, + "grad_norm": 0.062342628836631775, + "learning_rate": 2.8432563791008502e-05, + "loss": 0.4516, + "step": 702 + }, + { + "epoch": 0.05695074530136098, + "grad_norm": 0.06254198402166367, + "learning_rate": 2.847306601863103e-05, + "loss": 0.5159, + "step": 703 + }, + { + "epoch": 0.057031756318859365, + "grad_norm": 0.07052873075008392, + "learning_rate": 2.851356824625355e-05, + "loss": 0.4405, + "step": 704 + }, + { + "epoch": 0.05711276733635774, + "grad_norm": 0.058935195207595825, + "learning_rate": 2.8554070473876065e-05, + "loss": 0.5252, + "step": 705 + }, + { + "epoch": 0.057193778353856126, + "grad_norm": 0.06088797003030777, + "learning_rate": 2.8594572701498585e-05, + "loss": 0.4408, + "step": 706 + }, + { + "epoch": 0.0572747893713545, + "grad_norm": 0.05880602449178696, + "learning_rate": 2.8635074929121104e-05, + "loss": 0.4783, + "step": 707 + }, + { + "epoch": 0.057355800388852886, + "grad_norm": 0.05036713927984238, + "learning_rate": 2.867557715674362e-05, + "loss": 0.4618, + "step": 708 + }, + { + "epoch": 0.05743681140635126, + "grad_norm": 0.06838516145944595, + "learning_rate": 2.871607938436614e-05, + "loss": 0.4824, + "step": 709 + }, + { + "epoch": 0.057517822423849646, + "grad_norm": 0.07706855237483978, + "learning_rate": 2.875658161198866e-05, + "loss": 0.4507, + "step": 710 + }, + { + "epoch": 0.05759883344134802, + "grad_norm": 0.061234429478645325, + "learning_rate": 2.8797083839611176e-05, + "loss": 0.3989, + "step": 711 + }, + { + "epoch": 0.057679844458846406, + "grad_norm": 0.06529439985752106, + "learning_rate": 2.8837586067233703e-05, + "loss": 0.4465, + "step": 712 + }, + { + "epoch": 0.05776085547634478, + "grad_norm": 0.06635624170303345, + "learning_rate": 2.887808829485622e-05, + "loss": 0.519, + "step": 713 + }, + { + "epoch": 0.05784186649384316, + "grad_norm": 0.06093122810125351, + "learning_rate": 2.891859052247874e-05, + "loss": 0.4298, + "step": 714 + }, + { + "epoch": 0.05792287751134154, + "grad_norm": 0.06917279958724976, + "learning_rate": 2.8959092750101258e-05, + "loss": 0.4774, + "step": 715 + }, + { + "epoch": 0.05800388852883992, + "grad_norm": 0.059787567704916, + "learning_rate": 2.8999594977723778e-05, + "loss": 0.4112, + "step": 716 + }, + { + "epoch": 0.0580848995463383, + "grad_norm": 0.0676770806312561, + "learning_rate": 2.9040097205346294e-05, + "loss": 0.5195, + "step": 717 + }, + { + "epoch": 0.05816591056383668, + "grad_norm": 0.08954107761383057, + "learning_rate": 2.9080599432968814e-05, + "loss": 0.4654, + "step": 718 + }, + { + "epoch": 0.058246921581335064, + "grad_norm": 0.10148172080516815, + "learning_rate": 2.9121101660591333e-05, + "loss": 0.4899, + "step": 719 + }, + { + "epoch": 0.05832793259883344, + "grad_norm": 0.07054968923330307, + "learning_rate": 2.916160388821385e-05, + "loss": 0.4395, + "step": 720 + }, + { + "epoch": 0.058408943616331824, + "grad_norm": 
0.06562081724405289, + "learning_rate": 2.9202106115836376e-05, + "loss": 0.4206, + "step": 721 + }, + { + "epoch": 0.0584899546338302, + "grad_norm": 0.07018006592988968, + "learning_rate": 2.9242608343458892e-05, + "loss": 0.4295, + "step": 722 + }, + { + "epoch": 0.058570965651328584, + "grad_norm": 0.09009187668561935, + "learning_rate": 2.9283110571081412e-05, + "loss": 0.4158, + "step": 723 + }, + { + "epoch": 0.05865197666882696, + "grad_norm": 0.08497653156518936, + "learning_rate": 2.9323612798703932e-05, + "loss": 0.4775, + "step": 724 + }, + { + "epoch": 0.05873298768632534, + "grad_norm": 0.07111209630966187, + "learning_rate": 2.9364115026326448e-05, + "loss": 0.4699, + "step": 725 + }, + { + "epoch": 0.05881399870382372, + "grad_norm": 0.059641528874635696, + "learning_rate": 2.9404617253948968e-05, + "loss": 0.4656, + "step": 726 + }, + { + "epoch": 0.0588950097213221, + "grad_norm": 0.07775542885065079, + "learning_rate": 2.9445119481571487e-05, + "loss": 0.4342, + "step": 727 + }, + { + "epoch": 0.05897602073882048, + "grad_norm": 0.08325430750846863, + "learning_rate": 2.9485621709194007e-05, + "loss": 0.4743, + "step": 728 + }, + { + "epoch": 0.05905703175631886, + "grad_norm": 0.05315934866666794, + "learning_rate": 2.9526123936816523e-05, + "loss": 0.4698, + "step": 729 + }, + { + "epoch": 0.05913804277381724, + "grad_norm": 0.06893842667341232, + "learning_rate": 2.9566626164439043e-05, + "loss": 0.4809, + "step": 730 + }, + { + "epoch": 0.05921905379131562, + "grad_norm": 0.07211416959762573, + "learning_rate": 2.9607128392061566e-05, + "loss": 0.4471, + "step": 731 + }, + { + "epoch": 0.059300064808814, + "grad_norm": 0.07356251776218414, + "learning_rate": 2.9647630619684086e-05, + "loss": 0.4359, + "step": 732 + }, + { + "epoch": 0.05938107582631238, + "grad_norm": 0.05225740373134613, + "learning_rate": 2.9688132847306605e-05, + "loss": 0.4097, + "step": 733 + }, + { + "epoch": 0.05946208684381076, + "grad_norm": 0.05501040071249008, + "learning_rate": 2.972863507492912e-05, + "loss": 0.48, + "step": 734 + }, + { + "epoch": 0.05954309786130914, + "grad_norm": 0.06707523763179779, + "learning_rate": 2.976913730255164e-05, + "loss": 0.4669, + "step": 735 + }, + { + "epoch": 0.059624108878807515, + "grad_norm": 0.048934899270534515, + "learning_rate": 2.980963953017416e-05, + "loss": 0.429, + "step": 736 + }, + { + "epoch": 0.0597051198963059, + "grad_norm": 0.07759115099906921, + "learning_rate": 2.985014175779668e-05, + "loss": 0.5033, + "step": 737 + }, + { + "epoch": 0.059786130913804275, + "grad_norm": 0.0644378662109375, + "learning_rate": 2.9890643985419197e-05, + "loss": 0.4323, + "step": 738 + }, + { + "epoch": 0.05986714193130266, + "grad_norm": 0.06665252149105072, + "learning_rate": 2.9931146213041717e-05, + "loss": 0.526, + "step": 739 + }, + { + "epoch": 0.059948152948801035, + "grad_norm": 0.0821952298283577, + "learning_rate": 2.997164844066424e-05, + "loss": 0.4263, + "step": 740 + }, + { + "epoch": 0.06002916396629942, + "grad_norm": 0.09428700804710388, + "learning_rate": 3.001215066828676e-05, + "loss": 0.4644, + "step": 741 + }, + { + "epoch": 0.060110174983797796, + "grad_norm": 0.07214829325675964, + "learning_rate": 3.005265289590928e-05, + "loss": 0.4371, + "step": 742 + }, + { + "epoch": 0.06019118600129618, + "grad_norm": 0.0625072717666626, + "learning_rate": 3.0093155123531795e-05, + "loss": 0.4382, + "step": 743 + }, + { + "epoch": 0.060272197018794556, + "grad_norm": 0.07718163728713989, + "learning_rate": 3.0133657351154315e-05, + 
"loss": 0.4709, + "step": 744 + }, + { + "epoch": 0.06035320803629293, + "grad_norm": 0.08825402706861496, + "learning_rate": 3.0174159578776835e-05, + "loss": 0.4502, + "step": 745 + }, + { + "epoch": 0.060434219053791316, + "grad_norm": 0.08489922434091568, + "learning_rate": 3.021466180639935e-05, + "loss": 0.4801, + "step": 746 + }, + { + "epoch": 0.06051523007128969, + "grad_norm": 0.06677371263504028, + "learning_rate": 3.025516403402187e-05, + "loss": 0.4335, + "step": 747 + }, + { + "epoch": 0.060596241088788076, + "grad_norm": 0.06519728899002075, + "learning_rate": 3.029566626164439e-05, + "loss": 0.4419, + "step": 748 + }, + { + "epoch": 0.06067725210628645, + "grad_norm": 0.07708553224802017, + "learning_rate": 3.033616848926691e-05, + "loss": 0.489, + "step": 749 + }, + { + "epoch": 0.060758263123784836, + "grad_norm": 0.06495978683233261, + "learning_rate": 3.0376670716889433e-05, + "loss": 0.4339, + "step": 750 + }, + { + "epoch": 0.06083927414128321, + "grad_norm": 0.07273128628730774, + "learning_rate": 3.0417172944511953e-05, + "loss": 0.4651, + "step": 751 + }, + { + "epoch": 0.0609202851587816, + "grad_norm": 0.06824234127998352, + "learning_rate": 3.045767517213447e-05, + "loss": 0.4655, + "step": 752 + }, + { + "epoch": 0.06100129617627997, + "grad_norm": 0.08591991662979126, + "learning_rate": 3.049817739975699e-05, + "loss": 0.4789, + "step": 753 + }, + { + "epoch": 0.06108230719377836, + "grad_norm": 0.05332494154572487, + "learning_rate": 3.0538679627379505e-05, + "loss": 0.4561, + "step": 754 + }, + { + "epoch": 0.06116331821127673, + "grad_norm": 0.0581384003162384, + "learning_rate": 3.0579181855002025e-05, + "loss": 0.4882, + "step": 755 + }, + { + "epoch": 0.06124432922877511, + "grad_norm": 0.07281646877527237, + "learning_rate": 3.0619684082624544e-05, + "loss": 0.4728, + "step": 756 + }, + { + "epoch": 0.061325340246273494, + "grad_norm": 0.06750751286745071, + "learning_rate": 3.0660186310247064e-05, + "loss": 0.4936, + "step": 757 + }, + { + "epoch": 0.06140635126377187, + "grad_norm": 0.08174298703670502, + "learning_rate": 3.0700688537869584e-05, + "loss": 0.5091, + "step": 758 + }, + { + "epoch": 0.061487362281270254, + "grad_norm": 0.061219800263643265, + "learning_rate": 3.07411907654921e-05, + "loss": 0.4394, + "step": 759 + }, + { + "epoch": 0.06156837329876863, + "grad_norm": 0.06408350169658661, + "learning_rate": 3.078169299311462e-05, + "loss": 0.4342, + "step": 760 + }, + { + "epoch": 0.061649384316267014, + "grad_norm": 0.06699630618095398, + "learning_rate": 3.082219522073714e-05, + "loss": 0.4705, + "step": 761 + }, + { + "epoch": 0.06173039533376539, + "grad_norm": 0.05425221472978592, + "learning_rate": 3.086269744835966e-05, + "loss": 0.454, + "step": 762 + }, + { + "epoch": 0.061811406351263774, + "grad_norm": 0.07127571105957031, + "learning_rate": 3.090319967598218e-05, + "loss": 0.4268, + "step": 763 + }, + { + "epoch": 0.06189241736876215, + "grad_norm": 0.07275000214576721, + "learning_rate": 3.09437019036047e-05, + "loss": 0.5141, + "step": 764 + }, + { + "epoch": 0.061973428386260535, + "grad_norm": 0.054138775914907455, + "learning_rate": 3.098420413122722e-05, + "loss": 0.3759, + "step": 765 + }, + { + "epoch": 0.06205443940375891, + "grad_norm": 0.10909716784954071, + "learning_rate": 3.1024706358849734e-05, + "loss": 0.4651, + "step": 766 + }, + { + "epoch": 0.06213545042125729, + "grad_norm": 0.05660035461187363, + "learning_rate": 3.1065208586472254e-05, + "loss": 0.4307, + "step": 767 + }, + { + "epoch": 
0.06221646143875567, + "grad_norm": 0.06873264163732529, + "learning_rate": 3.1105710814094773e-05, + "loss": 0.3971, + "step": 768 + }, + { + "epoch": 0.06229747245625405, + "grad_norm": 0.06368529796600342, + "learning_rate": 3.11462130417173e-05, + "loss": 0.4432, + "step": 769 + }, + { + "epoch": 0.06237848347375243, + "grad_norm": 0.07408467680215836, + "learning_rate": 3.118671526933982e-05, + "loss": 0.454, + "step": 770 + }, + { + "epoch": 0.06245949449125081, + "grad_norm": 0.053322043269872665, + "learning_rate": 3.122721749696233e-05, + "loss": 0.4304, + "step": 771 + }, + { + "epoch": 0.06254050550874919, + "grad_norm": 0.07185545563697815, + "learning_rate": 3.126771972458485e-05, + "loss": 0.428, + "step": 772 + }, + { + "epoch": 0.06262151652624758, + "grad_norm": 0.05358489975333214, + "learning_rate": 3.130822195220737e-05, + "loss": 0.4019, + "step": 773 + }, + { + "epoch": 0.06270252754374595, + "grad_norm": 0.06032414361834526, + "learning_rate": 3.134872417982989e-05, + "loss": 0.4216, + "step": 774 + }, + { + "epoch": 0.06278353856124433, + "grad_norm": 0.08076735585927963, + "learning_rate": 3.138922640745241e-05, + "loss": 0.4212, + "step": 775 + }, + { + "epoch": 0.06286454957874271, + "grad_norm": 0.06268110126256943, + "learning_rate": 3.142972863507493e-05, + "loss": 0.4413, + "step": 776 + }, + { + "epoch": 0.06294556059624108, + "grad_norm": 0.06503452360630035, + "learning_rate": 3.147023086269745e-05, + "loss": 0.4647, + "step": 777 + }, + { + "epoch": 0.06302657161373947, + "grad_norm": 0.048685915768146515, + "learning_rate": 3.151073309031997e-05, + "loss": 0.4504, + "step": 778 + }, + { + "epoch": 0.06310758263123785, + "grad_norm": 0.074101522564888, + "learning_rate": 3.155123531794249e-05, + "loss": 0.4652, + "step": 779 + }, + { + "epoch": 0.06318859364873623, + "grad_norm": 0.05946307256817818, + "learning_rate": 3.159173754556501e-05, + "loss": 0.4551, + "step": 780 + }, + { + "epoch": 0.0632696046662346, + "grad_norm": 0.06036413088440895, + "learning_rate": 3.163223977318753e-05, + "loss": 0.44, + "step": 781 + }, + { + "epoch": 0.06335061568373299, + "grad_norm": 0.06359696388244629, + "learning_rate": 3.167274200081005e-05, + "loss": 0.4049, + "step": 782 + }, + { + "epoch": 0.06343162670123137, + "grad_norm": 0.09583299607038498, + "learning_rate": 3.171324422843256e-05, + "loss": 0.4147, + "step": 783 + }, + { + "epoch": 0.06351263771872975, + "grad_norm": 0.07221318781375885, + "learning_rate": 3.175374645605508e-05, + "loss": 0.5164, + "step": 784 + }, + { + "epoch": 0.06359364873622812, + "grad_norm": 0.07264591008424759, + "learning_rate": 3.17942486836776e-05, + "loss": 0.4692, + "step": 785 + }, + { + "epoch": 0.0636746597537265, + "grad_norm": 0.07795893400907516, + "learning_rate": 3.183475091130012e-05, + "loss": 0.4213, + "step": 786 + }, + { + "epoch": 0.06375567077122489, + "grad_norm": 0.07821632921695709, + "learning_rate": 3.187525313892265e-05, + "loss": 0.4439, + "step": 787 + }, + { + "epoch": 0.06383668178872326, + "grad_norm": 0.062484800815582275, + "learning_rate": 3.191575536654517e-05, + "loss": 0.4532, + "step": 788 + }, + { + "epoch": 0.06391769280622164, + "grad_norm": 0.07076051086187363, + "learning_rate": 3.195625759416768e-05, + "loss": 0.478, + "step": 789 + }, + { + "epoch": 0.06399870382372003, + "grad_norm": 0.06853286176919937, + "learning_rate": 3.19967598217902e-05, + "loss": 0.4268, + "step": 790 + }, + { + "epoch": 0.06407971484121841, + "grad_norm": 0.06096012890338898, + "learning_rate": 
3.203726204941272e-05, + "loss": 0.4349, + "step": 791 + }, + { + "epoch": 0.06416072585871678, + "grad_norm": 0.08760856091976166, + "learning_rate": 3.207776427703524e-05, + "loss": 0.4645, + "step": 792 + }, + { + "epoch": 0.06424173687621516, + "grad_norm": 0.10125716030597687, + "learning_rate": 3.211826650465776e-05, + "loss": 0.4728, + "step": 793 + }, + { + "epoch": 0.06432274789371355, + "grad_norm": 0.067176952958107, + "learning_rate": 3.215876873228028e-05, + "loss": 0.4516, + "step": 794 + }, + { + "epoch": 0.06440375891121193, + "grad_norm": 0.06922373175621033, + "learning_rate": 3.219927095990279e-05, + "loss": 0.4323, + "step": 795 + }, + { + "epoch": 0.0644847699287103, + "grad_norm": 0.04995464161038399, + "learning_rate": 3.223977318752531e-05, + "loss": 0.4426, + "step": 796 + }, + { + "epoch": 0.06456578094620868, + "grad_norm": 0.07050605118274689, + "learning_rate": 3.228027541514784e-05, + "loss": 0.4209, + "step": 797 + }, + { + "epoch": 0.06464679196370707, + "grad_norm": 0.05389539897441864, + "learning_rate": 3.232077764277036e-05, + "loss": 0.4226, + "step": 798 + }, + { + "epoch": 0.06472780298120544, + "grad_norm": 0.07226435095071793, + "learning_rate": 3.2361279870392876e-05, + "loss": 0.4906, + "step": 799 + }, + { + "epoch": 0.06480881399870382, + "grad_norm": 0.0973285585641861, + "learning_rate": 3.2401782098015396e-05, + "loss": 0.4134, + "step": 800 + }, + { + "epoch": 0.0648898250162022, + "grad_norm": 0.06966665387153625, + "learning_rate": 3.244228432563791e-05, + "loss": 0.4223, + "step": 801 + }, + { + "epoch": 0.06497083603370059, + "grad_norm": 0.08046171069145203, + "learning_rate": 3.248278655326043e-05, + "loss": 0.4279, + "step": 802 + }, + { + "epoch": 0.06505184705119896, + "grad_norm": 0.06887330114841461, + "learning_rate": 3.252328878088295e-05, + "loss": 0.4018, + "step": 803 + }, + { + "epoch": 0.06513285806869734, + "grad_norm": 0.0826861634850502, + "learning_rate": 3.256379100850547e-05, + "loss": 0.4329, + "step": 804 + }, + { + "epoch": 0.06521386908619572, + "grad_norm": 0.062017880380153656, + "learning_rate": 3.260429323612799e-05, + "loss": 0.3822, + "step": 805 + }, + { + "epoch": 0.06529488010369411, + "grad_norm": 0.08863137662410736, + "learning_rate": 3.264479546375051e-05, + "loss": 0.4338, + "step": 806 + }, + { + "epoch": 0.06537589112119248, + "grad_norm": 0.08286241441965103, + "learning_rate": 3.268529769137303e-05, + "loss": 0.4763, + "step": 807 + }, + { + "epoch": 0.06545690213869086, + "grad_norm": 0.07620840519666672, + "learning_rate": 3.272579991899555e-05, + "loss": 0.3837, + "step": 808 + }, + { + "epoch": 0.06553791315618925, + "grad_norm": 0.07819268852472305, + "learning_rate": 3.2766302146618066e-05, + "loss": 0.4757, + "step": 809 + }, + { + "epoch": 0.06561892417368761, + "grad_norm": 0.052711982280015945, + "learning_rate": 3.2806804374240586e-05, + "loss": 0.4293, + "step": 810 + }, + { + "epoch": 0.065699935191186, + "grad_norm": 0.0609319843351841, + "learning_rate": 3.2847306601863106e-05, + "loss": 0.4521, + "step": 811 + }, + { + "epoch": 0.06578094620868438, + "grad_norm": 0.07290962338447571, + "learning_rate": 3.2887808829485625e-05, + "loss": 0.4798, + "step": 812 + }, + { + "epoch": 0.06586195722618277, + "grad_norm": 0.07731480151414871, + "learning_rate": 3.292831105710814e-05, + "loss": 0.4529, + "step": 813 + }, + { + "epoch": 0.06594296824368114, + "grad_norm": 0.05770780146121979, + "learning_rate": 3.296881328473066e-05, + "loss": 0.4183, + "step": 814 + }, + { + "epoch": 
0.06602397926117952, + "grad_norm": 0.08070331811904907, + "learning_rate": 3.300931551235318e-05, + "loss": 0.4718, + "step": 815 + }, + { + "epoch": 0.0661049902786779, + "grad_norm": 0.09154818207025528, + "learning_rate": 3.3049817739975704e-05, + "loss": 0.4276, + "step": 816 + }, + { + "epoch": 0.06618600129617629, + "grad_norm": 0.05981754884123802, + "learning_rate": 3.3090319967598224e-05, + "loss": 0.4542, + "step": 817 + }, + { + "epoch": 0.06626701231367466, + "grad_norm": 0.06444111466407776, + "learning_rate": 3.3130822195220737e-05, + "loss": 0.4409, + "step": 818 + }, + { + "epoch": 0.06634802333117304, + "grad_norm": 0.0756894052028656, + "learning_rate": 3.3171324422843256e-05, + "loss": 0.458, + "step": 819 + }, + { + "epoch": 0.06642903434867142, + "grad_norm": 0.0637042224407196, + "learning_rate": 3.3211826650465776e-05, + "loss": 0.4301, + "step": 820 + }, + { + "epoch": 0.06651004536616979, + "grad_norm": 0.05820478871464729, + "learning_rate": 3.3252328878088296e-05, + "loss": 0.4502, + "step": 821 + }, + { + "epoch": 0.06659105638366818, + "grad_norm": 0.07865587621927261, + "learning_rate": 3.3292831105710815e-05, + "loss": 0.4105, + "step": 822 + }, + { + "epoch": 0.06667206740116656, + "grad_norm": 0.06652259081602097, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.4158, + "step": 823 + }, + { + "epoch": 0.06675307841866494, + "grad_norm": 0.06405427306890488, + "learning_rate": 3.3373835560955855e-05, + "loss": 0.4431, + "step": 824 + }, + { + "epoch": 0.06683408943616331, + "grad_norm": 0.04830295965075493, + "learning_rate": 3.3414337788578374e-05, + "loss": 0.4088, + "step": 825 + }, + { + "epoch": 0.0669151004536617, + "grad_norm": 0.06837856024503708, + "learning_rate": 3.3454840016200894e-05, + "loss": 0.4458, + "step": 826 + }, + { + "epoch": 0.06699611147116008, + "grad_norm": 0.07393626123666763, + "learning_rate": 3.3495342243823414e-05, + "loss": 0.4726, + "step": 827 + }, + { + "epoch": 0.06707712248865846, + "grad_norm": 0.11590786278247833, + "learning_rate": 3.353584447144593e-05, + "loss": 0.4804, + "step": 828 + }, + { + "epoch": 0.06715813350615683, + "grad_norm": 0.05595247447490692, + "learning_rate": 3.357634669906845e-05, + "loss": 0.4547, + "step": 829 + }, + { + "epoch": 0.06723914452365522, + "grad_norm": 0.07072945684194565, + "learning_rate": 3.3616848926690966e-05, + "loss": 0.421, + "step": 830 + }, + { + "epoch": 0.0673201555411536, + "grad_norm": 0.07533926516771317, + "learning_rate": 3.3657351154313486e-05, + "loss": 0.4573, + "step": 831 + }, + { + "epoch": 0.06740116655865197, + "grad_norm": 0.06634481251239777, + "learning_rate": 3.3697853381936005e-05, + "loss": 0.4423, + "step": 832 + }, + { + "epoch": 0.06748217757615035, + "grad_norm": 0.08560493588447571, + "learning_rate": 3.3738355609558525e-05, + "loss": 0.4801, + "step": 833 + }, + { + "epoch": 0.06756318859364874, + "grad_norm": 0.07027285546064377, + "learning_rate": 3.377885783718105e-05, + "loss": 0.4304, + "step": 834 + }, + { + "epoch": 0.06764419961114712, + "grad_norm": 0.09453713148832321, + "learning_rate": 3.381936006480357e-05, + "loss": 0.4349, + "step": 835 + }, + { + "epoch": 0.06772521062864549, + "grad_norm": 0.06721215695142746, + "learning_rate": 3.3859862292426084e-05, + "loss": 0.4018, + "step": 836 + }, + { + "epoch": 0.06780622164614387, + "grad_norm": 0.09632536768913269, + "learning_rate": 3.3900364520048604e-05, + "loss": 0.4856, + "step": 837 + }, + { + "epoch": 0.06788723266364226, + "grad_norm": 0.08073796331882477, + 
"learning_rate": 3.394086674767112e-05, + "loss": 0.4082, + "step": 838 + }, + { + "epoch": 0.06796824368114064, + "grad_norm": 0.061706796288490295, + "learning_rate": 3.398136897529364e-05, + "loss": 0.3996, + "step": 839 + }, + { + "epoch": 0.06804925469863901, + "grad_norm": 0.046711515635252, + "learning_rate": 3.402187120291616e-05, + "loss": 0.5058, + "step": 840 + }, + { + "epoch": 0.0681302657161374, + "grad_norm": 0.057807352393865585, + "learning_rate": 3.406237343053868e-05, + "loss": 0.4455, + "step": 841 + }, + { + "epoch": 0.06821127673363578, + "grad_norm": 0.08929713815450668, + "learning_rate": 3.41028756581612e-05, + "loss": 0.4552, + "step": 842 + }, + { + "epoch": 0.06829228775113415, + "grad_norm": 0.06823332607746124, + "learning_rate": 3.4143377885783715e-05, + "loss": 0.4267, + "step": 843 + }, + { + "epoch": 0.06837329876863253, + "grad_norm": 0.07023243606090546, + "learning_rate": 3.418388011340624e-05, + "loss": 0.4496, + "step": 844 + }, + { + "epoch": 0.06845430978613092, + "grad_norm": 0.08038611710071564, + "learning_rate": 3.422438234102876e-05, + "loss": 0.4268, + "step": 845 + }, + { + "epoch": 0.0685353208036293, + "grad_norm": 0.09449832141399384, + "learning_rate": 3.426488456865128e-05, + "loss": 0.398, + "step": 846 + }, + { + "epoch": 0.06861633182112767, + "grad_norm": 0.08607659488916397, + "learning_rate": 3.43053867962738e-05, + "loss": 0.4441, + "step": 847 + }, + { + "epoch": 0.06869734283862605, + "grad_norm": 0.06817245483398438, + "learning_rate": 3.434588902389631e-05, + "loss": 0.4163, + "step": 848 + }, + { + "epoch": 0.06877835385612444, + "grad_norm": 0.07797311991453171, + "learning_rate": 3.438639125151883e-05, + "loss": 0.4569, + "step": 849 + }, + { + "epoch": 0.06885936487362282, + "grad_norm": 0.08795508742332458, + "learning_rate": 3.442689347914135e-05, + "loss": 0.4423, + "step": 850 + }, + { + "epoch": 0.06894037589112119, + "grad_norm": 0.08801967650651932, + "learning_rate": 3.446739570676387e-05, + "loss": 0.4253, + "step": 851 + }, + { + "epoch": 0.06902138690861957, + "grad_norm": 0.07507802546024323, + "learning_rate": 3.450789793438639e-05, + "loss": 0.39, + "step": 852 + }, + { + "epoch": 0.06910239792611796, + "grad_norm": 0.09158620238304138, + "learning_rate": 3.454840016200892e-05, + "loss": 0.4405, + "step": 853 + }, + { + "epoch": 0.06918340894361633, + "grad_norm": 0.07361084222793579, + "learning_rate": 3.458890238963143e-05, + "loss": 0.4276, + "step": 854 + }, + { + "epoch": 0.06926441996111471, + "grad_norm": 0.07052503526210785, + "learning_rate": 3.462940461725395e-05, + "loss": 0.4412, + "step": 855 + }, + { + "epoch": 0.06934543097861309, + "grad_norm": 0.08575800061225891, + "learning_rate": 3.466990684487647e-05, + "loss": 0.4585, + "step": 856 + }, + { + "epoch": 0.06942644199611148, + "grad_norm": 0.10039302706718445, + "learning_rate": 3.471040907249899e-05, + "loss": 0.4771, + "step": 857 + }, + { + "epoch": 0.06950745301360985, + "grad_norm": 0.08235925436019897, + "learning_rate": 3.475091130012151e-05, + "loss": 0.4749, + "step": 858 + }, + { + "epoch": 0.06958846403110823, + "grad_norm": 0.06944045424461365, + "learning_rate": 3.479141352774403e-05, + "loss": 0.4516, + "step": 859 + }, + { + "epoch": 0.06966947504860661, + "grad_norm": 0.05629153922200203, + "learning_rate": 3.483191575536654e-05, + "loss": 0.4347, + "step": 860 + }, + { + "epoch": 0.069750486066105, + "grad_norm": 0.0687924176454544, + "learning_rate": 3.487241798298906e-05, + "loss": 0.4505, + "step": 861 + }, + { + 
"epoch": 0.06983149708360337, + "grad_norm": 0.07229902595281601, + "learning_rate": 3.491292021061158e-05, + "loss": 0.4455, + "step": 862 + }, + { + "epoch": 0.06991250810110175, + "grad_norm": 0.07076973468065262, + "learning_rate": 3.495342243823411e-05, + "loss": 0.3961, + "step": 863 + }, + { + "epoch": 0.06999351911860013, + "grad_norm": 0.07589247077703476, + "learning_rate": 3.499392466585663e-05, + "loss": 0.4375, + "step": 864 + }, + { + "epoch": 0.0700745301360985, + "grad_norm": 0.09876564145088196, + "learning_rate": 3.503442689347915e-05, + "loss": 0.4497, + "step": 865 + }, + { + "epoch": 0.07015554115359689, + "grad_norm": 0.09799569845199585, + "learning_rate": 3.507492912110166e-05, + "loss": 0.3982, + "step": 866 + }, + { + "epoch": 0.07023655217109527, + "grad_norm": 0.07569810003042221, + "learning_rate": 3.511543134872418e-05, + "loss": 0.4153, + "step": 867 + }, + { + "epoch": 0.07031756318859365, + "grad_norm": 0.07665053009986877, + "learning_rate": 3.51559335763467e-05, + "loss": 0.4038, + "step": 868 + }, + { + "epoch": 0.07039857420609202, + "grad_norm": 0.07011052966117859, + "learning_rate": 3.519643580396922e-05, + "loss": 0.4528, + "step": 869 + }, + { + "epoch": 0.07047958522359041, + "grad_norm": 0.10473107546567917, + "learning_rate": 3.523693803159174e-05, + "loss": 0.4633, + "step": 870 + }, + { + "epoch": 0.07056059624108879, + "grad_norm": 0.07926227152347565, + "learning_rate": 3.527744025921426e-05, + "loss": 0.4294, + "step": 871 + }, + { + "epoch": 0.07064160725858717, + "grad_norm": 0.09561335295438766, + "learning_rate": 3.531794248683678e-05, + "loss": 0.407, + "step": 872 + }, + { + "epoch": 0.07072261827608554, + "grad_norm": 0.06910093128681183, + "learning_rate": 3.53584447144593e-05, + "loss": 0.4598, + "step": 873 + }, + { + "epoch": 0.07080362929358393, + "grad_norm": 0.07909014821052551, + "learning_rate": 3.539894694208182e-05, + "loss": 0.4522, + "step": 874 + }, + { + "epoch": 0.07088464031108231, + "grad_norm": 0.0774121955037117, + "learning_rate": 3.543944916970434e-05, + "loss": 0.3741, + "step": 875 + }, + { + "epoch": 0.07096565132858068, + "grad_norm": 0.07421425729990005, + "learning_rate": 3.547995139732686e-05, + "loss": 0.4193, + "step": 876 + }, + { + "epoch": 0.07104666234607906, + "grad_norm": 0.0737396851181984, + "learning_rate": 3.552045362494938e-05, + "loss": 0.466, + "step": 877 + }, + { + "epoch": 0.07112767336357745, + "grad_norm": 0.0909152403473854, + "learning_rate": 3.556095585257189e-05, + "loss": 0.4667, + "step": 878 + }, + { + "epoch": 0.07120868438107583, + "grad_norm": 0.07740772515535355, + "learning_rate": 3.560145808019441e-05, + "loss": 0.4708, + "step": 879 + }, + { + "epoch": 0.0712896953985742, + "grad_norm": 0.09760919213294983, + "learning_rate": 3.564196030781693e-05, + "loss": 0.4607, + "step": 880 + }, + { + "epoch": 0.07137070641607259, + "grad_norm": 0.07436461001634598, + "learning_rate": 3.568246253543945e-05, + "loss": 0.385, + "step": 881 + }, + { + "epoch": 0.07145171743357097, + "grad_norm": 0.09642557799816132, + "learning_rate": 3.5722964763061975e-05, + "loss": 0.4432, + "step": 882 + }, + { + "epoch": 0.07153272845106935, + "grad_norm": 0.07746947556734085, + "learning_rate": 3.576346699068449e-05, + "loss": 0.3736, + "step": 883 + }, + { + "epoch": 0.07161373946856772, + "grad_norm": 0.07410507649183273, + "learning_rate": 3.580396921830701e-05, + "loss": 0.3902, + "step": 884 + }, + { + "epoch": 0.0716947504860661, + "grad_norm": 0.08017998188734055, + "learning_rate": 
3.584447144592953e-05, + "loss": 0.4489, + "step": 885 + }, + { + "epoch": 0.07177576150356449, + "grad_norm": 0.08439603447914124, + "learning_rate": 3.588497367355205e-05, + "loss": 0.4222, + "step": 886 + }, + { + "epoch": 0.07185677252106286, + "grad_norm": 0.06829867511987686, + "learning_rate": 3.592547590117457e-05, + "loss": 0.4425, + "step": 887 + }, + { + "epoch": 0.07193778353856124, + "grad_norm": 0.0824725404381752, + "learning_rate": 3.5965978128797086e-05, + "loss": 0.4839, + "step": 888 + }, + { + "epoch": 0.07201879455605963, + "grad_norm": 0.08847682923078537, + "learning_rate": 3.6006480356419606e-05, + "loss": 0.4496, + "step": 889 + }, + { + "epoch": 0.07209980557355801, + "grad_norm": 0.08362285047769547, + "learning_rate": 3.604698258404212e-05, + "loss": 0.3894, + "step": 890 + }, + { + "epoch": 0.07218081659105638, + "grad_norm": 0.13280020654201508, + "learning_rate": 3.6087484811664645e-05, + "loss": 0.4776, + "step": 891 + }, + { + "epoch": 0.07226182760855476, + "grad_norm": 0.0723172277212143, + "learning_rate": 3.6127987039287165e-05, + "loss": 0.4236, + "step": 892 + }, + { + "epoch": 0.07234283862605315, + "grad_norm": 0.08192502707242966, + "learning_rate": 3.6168489266909685e-05, + "loss": 0.484, + "step": 893 + }, + { + "epoch": 0.07242384964355152, + "grad_norm": 0.07791081815958023, + "learning_rate": 3.6208991494532204e-05, + "loss": 0.4526, + "step": 894 + }, + { + "epoch": 0.0725048606610499, + "grad_norm": 0.0736956000328064, + "learning_rate": 3.624949372215472e-05, + "loss": 0.4567, + "step": 895 + }, + { + "epoch": 0.07258587167854828, + "grad_norm": 0.08527030050754547, + "learning_rate": 3.628999594977724e-05, + "loss": 0.4406, + "step": 896 + }, + { + "epoch": 0.07266688269604667, + "grad_norm": 0.08626183122396469, + "learning_rate": 3.6330498177399757e-05, + "loss": 0.4275, + "step": 897 + }, + { + "epoch": 0.07274789371354504, + "grad_norm": 0.07522546499967575, + "learning_rate": 3.6371000405022276e-05, + "loss": 0.4164, + "step": 898 + }, + { + "epoch": 0.07282890473104342, + "grad_norm": 0.06778135895729065, + "learning_rate": 3.6411502632644796e-05, + "loss": 0.4287, + "step": 899 + }, + { + "epoch": 0.0729099157485418, + "grad_norm": 0.07022549957036972, + "learning_rate": 3.645200486026732e-05, + "loss": 0.41, + "step": 900 + }, + { + "epoch": 0.07299092676604019, + "grad_norm": 0.08264485746622086, + "learning_rate": 3.6492507087889835e-05, + "loss": 0.4176, + "step": 901 + }, + { + "epoch": 0.07307193778353856, + "grad_norm": 0.08415798097848892, + "learning_rate": 3.6533009315512355e-05, + "loss": 0.4389, + "step": 902 + }, + { + "epoch": 0.07315294880103694, + "grad_norm": 0.07540509849786758, + "learning_rate": 3.6573511543134875e-05, + "loss": 0.403, + "step": 903 + }, + { + "epoch": 0.07323395981853532, + "grad_norm": 0.07358894497156143, + "learning_rate": 3.6614013770757394e-05, + "loss": 0.4001, + "step": 904 + }, + { + "epoch": 0.0733149708360337, + "grad_norm": 0.06488876789808273, + "learning_rate": 3.6654515998379914e-05, + "loss": 0.4223, + "step": 905 + }, + { + "epoch": 0.07339598185353208, + "grad_norm": 0.07415130734443665, + "learning_rate": 3.6695018226002434e-05, + "loss": 0.435, + "step": 906 + }, + { + "epoch": 0.07347699287103046, + "grad_norm": 0.06136851757764816, + "learning_rate": 3.6735520453624946e-05, + "loss": 0.3851, + "step": 907 + }, + { + "epoch": 0.07355800388852884, + "grad_norm": 0.08178620040416718, + "learning_rate": 3.6776022681247466e-05, + "loss": 0.4772, + "step": 908 + }, + { + "epoch": 
0.07363901490602721, + "grad_norm": 0.07980084419250488, + "learning_rate": 3.6816524908869986e-05, + "loss": 0.4604, + "step": 909 + }, + { + "epoch": 0.0737200259235256, + "grad_norm": 0.08928181976079941, + "learning_rate": 3.685702713649251e-05, + "loss": 0.4252, + "step": 910 + }, + { + "epoch": 0.07380103694102398, + "grad_norm": 0.0732206255197525, + "learning_rate": 3.689752936411503e-05, + "loss": 0.4495, + "step": 911 + }, + { + "epoch": 0.07388204795852236, + "grad_norm": 0.08373378217220306, + "learning_rate": 3.693803159173755e-05, + "loss": 0.4705, + "step": 912 + }, + { + "epoch": 0.07396305897602073, + "grad_norm": 0.08140382915735245, + "learning_rate": 3.6978533819360064e-05, + "loss": 0.3726, + "step": 913 + }, + { + "epoch": 0.07404406999351912, + "grad_norm": 0.07321371883153915, + "learning_rate": 3.7019036046982584e-05, + "loss": 0.3908, + "step": 914 + }, + { + "epoch": 0.0741250810110175, + "grad_norm": 0.0650591105222702, + "learning_rate": 3.7059538274605104e-05, + "loss": 0.4489, + "step": 915 + }, + { + "epoch": 0.07420609202851587, + "grad_norm": 0.06934455037117004, + "learning_rate": 3.7100040502227623e-05, + "loss": 0.3858, + "step": 916 + }, + { + "epoch": 0.07428710304601426, + "grad_norm": 0.060079026967287064, + "learning_rate": 3.714054272985014e-05, + "loss": 0.4396, + "step": 917 + }, + { + "epoch": 0.07436811406351264, + "grad_norm": 0.0832790955901146, + "learning_rate": 3.718104495747266e-05, + "loss": 0.4441, + "step": 918 + }, + { + "epoch": 0.07444912508101102, + "grad_norm": 0.08325023204088211, + "learning_rate": 3.722154718509518e-05, + "loss": 0.4077, + "step": 919 + }, + { + "epoch": 0.07453013609850939, + "grad_norm": 0.0719456821680069, + "learning_rate": 3.72620494127177e-05, + "loss": 0.4646, + "step": 920 + }, + { + "epoch": 0.07461114711600778, + "grad_norm": 0.0605672225356102, + "learning_rate": 3.730255164034022e-05, + "loss": 0.4334, + "step": 921 + }, + { + "epoch": 0.07469215813350616, + "grad_norm": 0.06589698046445847, + "learning_rate": 3.734305386796274e-05, + "loss": 0.3967, + "step": 922 + }, + { + "epoch": 0.07477316915100454, + "grad_norm": 0.06026124954223633, + "learning_rate": 3.738355609558526e-05, + "loss": 0.3843, + "step": 923 + }, + { + "epoch": 0.07485418016850291, + "grad_norm": 0.07374893873929977, + "learning_rate": 3.742405832320778e-05, + "loss": 0.4454, + "step": 924 + }, + { + "epoch": 0.0749351911860013, + "grad_norm": 0.09211552143096924, + "learning_rate": 3.7464560550830294e-05, + "loss": 0.4193, + "step": 925 + }, + { + "epoch": 0.07501620220349968, + "grad_norm": 0.09048813581466675, + "learning_rate": 3.750506277845281e-05, + "loss": 0.4556, + "step": 926 + }, + { + "epoch": 0.07509721322099805, + "grad_norm": 0.06524454802274704, + "learning_rate": 3.754556500607533e-05, + "loss": 0.4074, + "step": 927 + }, + { + "epoch": 0.07517822423849643, + "grad_norm": 0.09637756645679474, + "learning_rate": 3.758606723369785e-05, + "loss": 0.5207, + "step": 928 + }, + { + "epoch": 0.07525923525599482, + "grad_norm": 0.07778630405664444, + "learning_rate": 3.762656946132038e-05, + "loss": 0.4447, + "step": 929 + }, + { + "epoch": 0.0753402462734932, + "grad_norm": 0.07583451271057129, + "learning_rate": 3.766707168894289e-05, + "loss": 0.4346, + "step": 930 + }, + { + "epoch": 0.07542125729099157, + "grad_norm": 0.06921878457069397, + "learning_rate": 3.770757391656541e-05, + "loss": 0.4793, + "step": 931 + }, + { + "epoch": 0.07550226830848995, + "grad_norm": 0.057142313569784164, + "learning_rate": 
3.774807614418793e-05, + "loss": 0.4345, + "step": 932 + }, + { + "epoch": 0.07558327932598834, + "grad_norm": 0.06628698110580444, + "learning_rate": 3.778857837181045e-05, + "loss": 0.393, + "step": 933 + }, + { + "epoch": 0.07566429034348672, + "grad_norm": 0.07588442414999008, + "learning_rate": 3.782908059943297e-05, + "loss": 0.4236, + "step": 934 + }, + { + "epoch": 0.07574530136098509, + "grad_norm": 0.0772482231259346, + "learning_rate": 3.786958282705549e-05, + "loss": 0.3927, + "step": 935 + }, + { + "epoch": 0.07582631237848347, + "grad_norm": 0.07052788883447647, + "learning_rate": 3.791008505467801e-05, + "loss": 0.4326, + "step": 936 + }, + { + "epoch": 0.07590732339598186, + "grad_norm": 0.06511484831571579, + "learning_rate": 3.795058728230052e-05, + "loss": 0.4243, + "step": 937 + }, + { + "epoch": 0.07598833441348023, + "grad_norm": 0.13957849144935608, + "learning_rate": 3.799108950992305e-05, + "loss": 0.403, + "step": 938 + }, + { + "epoch": 0.07606934543097861, + "grad_norm": 0.07951390743255615, + "learning_rate": 3.803159173754557e-05, + "loss": 0.4815, + "step": 939 + }, + { + "epoch": 0.076150356448477, + "grad_norm": 0.07832984626293182, + "learning_rate": 3.807209396516809e-05, + "loss": 0.4766, + "step": 940 + }, + { + "epoch": 0.07623136746597538, + "grad_norm": 0.09598967432975769, + "learning_rate": 3.811259619279061e-05, + "loss": 0.4121, + "step": 941 + }, + { + "epoch": 0.07631237848347375, + "grad_norm": 0.09158296138048172, + "learning_rate": 3.815309842041313e-05, + "loss": 0.3938, + "step": 942 + }, + { + "epoch": 0.07639338950097213, + "grad_norm": 0.09140002727508545, + "learning_rate": 3.819360064803564e-05, + "loss": 0.4612, + "step": 943 + }, + { + "epoch": 0.07647440051847051, + "grad_norm": 0.06446819752454758, + "learning_rate": 3.823410287565816e-05, + "loss": 0.3948, + "step": 944 + }, + { + "epoch": 0.0765554115359689, + "grad_norm": 0.09059543907642365, + "learning_rate": 3.827460510328068e-05, + "loss": 0.4104, + "step": 945 + }, + { + "epoch": 0.07663642255346727, + "grad_norm": 0.09735757112503052, + "learning_rate": 3.83151073309032e-05, + "loss": 0.4497, + "step": 946 + }, + { + "epoch": 0.07671743357096565, + "grad_norm": 0.05969754979014397, + "learning_rate": 3.835560955852572e-05, + "loss": 0.4121, + "step": 947 + }, + { + "epoch": 0.07679844458846403, + "grad_norm": 0.07921057939529419, + "learning_rate": 3.839611178614824e-05, + "loss": 0.4194, + "step": 948 + }, + { + "epoch": 0.0768794556059624, + "grad_norm": 0.08428069949150085, + "learning_rate": 3.843661401377076e-05, + "loss": 0.4155, + "step": 949 + }, + { + "epoch": 0.07696046662346079, + "grad_norm": 0.09198450297117233, + "learning_rate": 3.847711624139328e-05, + "loss": 0.4305, + "step": 950 + }, + { + "epoch": 0.07704147764095917, + "grad_norm": 0.05633941665291786, + "learning_rate": 3.85176184690158e-05, + "loss": 0.3704, + "step": 951 + }, + { + "epoch": 0.07712248865845756, + "grad_norm": 0.0748986154794693, + "learning_rate": 3.855812069663832e-05, + "loss": 0.4343, + "step": 952 + }, + { + "epoch": 0.07720349967595592, + "grad_norm": 0.09329600632190704, + "learning_rate": 3.859862292426084e-05, + "loss": 0.4409, + "step": 953 + }, + { + "epoch": 0.07728451069345431, + "grad_norm": 0.059523507952690125, + "learning_rate": 3.863912515188336e-05, + "loss": 0.4032, + "step": 954 + }, + { + "epoch": 0.07736552171095269, + "grad_norm": 0.09699185192584991, + "learning_rate": 3.867962737950587e-05, + "loss": 0.4062, + "step": 955 + }, + { + "epoch": 
0.07744653272845108, + "grad_norm": 0.08740203082561493, + "learning_rate": 3.872012960712839e-05, + "loss": 0.428, + "step": 956 + }, + { + "epoch": 0.07752754374594945, + "grad_norm": 0.08077336847782135, + "learning_rate": 3.8760631834750916e-05, + "loss": 0.4207, + "step": 957 + }, + { + "epoch": 0.07760855476344783, + "grad_norm": 0.08518598228693008, + "learning_rate": 3.8801134062373436e-05, + "loss": 0.4363, + "step": 958 + }, + { + "epoch": 0.07768956578094621, + "grad_norm": 0.09720475971698761, + "learning_rate": 3.8841636289995956e-05, + "loss": 0.4176, + "step": 959 + }, + { + "epoch": 0.07777057679844458, + "grad_norm": 0.05982989817857742, + "learning_rate": 3.888213851761847e-05, + "loss": 0.3733, + "step": 960 + }, + { + "epoch": 0.07785158781594297, + "grad_norm": 0.1074175164103508, + "learning_rate": 3.892264074524099e-05, + "loss": 0.4032, + "step": 961 + }, + { + "epoch": 0.07793259883344135, + "grad_norm": 0.0886072888970375, + "learning_rate": 3.896314297286351e-05, + "loss": 0.431, + "step": 962 + }, + { + "epoch": 0.07801360985093973, + "grad_norm": 0.08959907293319702, + "learning_rate": 3.900364520048603e-05, + "loss": 0.4562, + "step": 963 + }, + { + "epoch": 0.0780946208684381, + "grad_norm": 0.0632220134139061, + "learning_rate": 3.904414742810855e-05, + "loss": 0.4105, + "step": 964 + }, + { + "epoch": 0.07817563188593649, + "grad_norm": 0.0738753005862236, + "learning_rate": 3.908464965573107e-05, + "loss": 0.4092, + "step": 965 + }, + { + "epoch": 0.07825664290343487, + "grad_norm": 0.07675029337406158, + "learning_rate": 3.9125151883353587e-05, + "loss": 0.4765, + "step": 966 + }, + { + "epoch": 0.07833765392093325, + "grad_norm": 0.07345636188983917, + "learning_rate": 3.9165654110976106e-05, + "loss": 0.4274, + "step": 967 + }, + { + "epoch": 0.07841866493843162, + "grad_norm": 0.08332125842571259, + "learning_rate": 3.9206156338598626e-05, + "loss": 0.3674, + "step": 968 + }, + { + "epoch": 0.07849967595593, + "grad_norm": 0.09537743031978607, + "learning_rate": 3.9246658566221146e-05, + "loss": 0.4072, + "step": 969 + }, + { + "epoch": 0.07858068697342839, + "grad_norm": 0.06771516054868698, + "learning_rate": 3.9287160793843665e-05, + "loss": 0.42, + "step": 970 + }, + { + "epoch": 0.07866169799092676, + "grad_norm": 0.07258981466293335, + "learning_rate": 3.9327663021466185e-05, + "loss": 0.3905, + "step": 971 + }, + { + "epoch": 0.07874270900842514, + "grad_norm": 0.07407287508249283, + "learning_rate": 3.93681652490887e-05, + "loss": 0.4602, + "step": 972 + }, + { + "epoch": 0.07882372002592353, + "grad_norm": 0.07650595903396606, + "learning_rate": 3.940866747671122e-05, + "loss": 0.3946, + "step": 973 + }, + { + "epoch": 0.07890473104342191, + "grad_norm": 0.08994881808757782, + "learning_rate": 3.944916970433374e-05, + "loss": 0.4315, + "step": 974 + }, + { + "epoch": 0.07898574206092028, + "grad_norm": 0.0726025402545929, + "learning_rate": 3.948967193195626e-05, + "loss": 0.4127, + "step": 975 + }, + { + "epoch": 0.07906675307841866, + "grad_norm": 0.0902196392416954, + "learning_rate": 3.953017415957878e-05, + "loss": 0.3589, + "step": 976 + }, + { + "epoch": 0.07914776409591705, + "grad_norm": 0.07093965262174606, + "learning_rate": 3.95706763872013e-05, + "loss": 0.3824, + "step": 977 + }, + { + "epoch": 0.07922877511341543, + "grad_norm": 0.07622726261615753, + "learning_rate": 3.9611178614823816e-05, + "loss": 0.3884, + "step": 978 + }, + { + "epoch": 0.0793097861309138, + "grad_norm": 0.07749678194522858, + "learning_rate": 
3.9651680842446335e-05, + "loss": 0.4681, + "step": 979 + }, + { + "epoch": 0.07939079714841218, + "grad_norm": 0.0761314406991005, + "learning_rate": 3.9692183070068855e-05, + "loss": 0.4281, + "step": 980 + }, + { + "epoch": 0.07947180816591057, + "grad_norm": 0.07970507442951202, + "learning_rate": 3.9732685297691375e-05, + "loss": 0.4163, + "step": 981 + }, + { + "epoch": 0.07955281918340894, + "grad_norm": 0.07264453172683716, + "learning_rate": 3.9773187525313894e-05, + "loss": 0.4161, + "step": 982 + }, + { + "epoch": 0.07963383020090732, + "grad_norm": 0.09370779246091843, + "learning_rate": 3.9813689752936414e-05, + "loss": 0.458, + "step": 983 + }, + { + "epoch": 0.0797148412184057, + "grad_norm": 0.09485704451799393, + "learning_rate": 3.985419198055893e-05, + "loss": 0.4437, + "step": 984 + }, + { + "epoch": 0.07979585223590409, + "grad_norm": 0.06327967345714569, + "learning_rate": 3.9894694208181453e-05, + "loss": 0.4401, + "step": 985 + }, + { + "epoch": 0.07987686325340246, + "grad_norm": 0.05429242178797722, + "learning_rate": 3.993519643580397e-05, + "loss": 0.4138, + "step": 986 + }, + { + "epoch": 0.07995787427090084, + "grad_norm": 0.08034180104732513, + "learning_rate": 3.997569866342649e-05, + "loss": 0.4735, + "step": 987 + }, + { + "epoch": 0.08003888528839923, + "grad_norm": 0.12071143090724945, + "learning_rate": 4.001620089104901e-05, + "loss": 0.477, + "step": 988 + }, + { + "epoch": 0.08011989630589761, + "grad_norm": 0.08465772867202759, + "learning_rate": 4.005670311867153e-05, + "loss": 0.417, + "step": 989 + }, + { + "epoch": 0.08020090732339598, + "grad_norm": 0.05626750737428665, + "learning_rate": 4.0097205346294045e-05, + "loss": 0.3891, + "step": 990 + }, + { + "epoch": 0.08028191834089436, + "grad_norm": 0.09615819156169891, + "learning_rate": 4.0137707573916565e-05, + "loss": 0.4099, + "step": 991 + }, + { + "epoch": 0.08036292935839275, + "grad_norm": 0.09798921644687653, + "learning_rate": 4.0178209801539084e-05, + "loss": 0.4212, + "step": 992 + }, + { + "epoch": 0.08044394037589112, + "grad_norm": 0.07209984958171844, + "learning_rate": 4.0218712029161604e-05, + "loss": 0.3817, + "step": 993 + }, + { + "epoch": 0.0805249513933895, + "grad_norm": 0.07342953979969025, + "learning_rate": 4.0259214256784124e-05, + "loss": 0.4143, + "step": 994 + }, + { + "epoch": 0.08060596241088788, + "grad_norm": 0.07241872698068619, + "learning_rate": 4.0299716484406643e-05, + "loss": 0.4504, + "step": 995 + }, + { + "epoch": 0.08068697342838627, + "grad_norm": 0.08157894760370255, + "learning_rate": 4.034021871202916e-05, + "loss": 0.4215, + "step": 996 + }, + { + "epoch": 0.08076798444588464, + "grad_norm": 0.07313471287488937, + "learning_rate": 4.038072093965168e-05, + "loss": 0.4457, + "step": 997 + }, + { + "epoch": 0.08084899546338302, + "grad_norm": 0.09657979011535645, + "learning_rate": 4.04212231672742e-05, + "loss": 0.4337, + "step": 998 + }, + { + "epoch": 0.0809300064808814, + "grad_norm": 0.0723256841301918, + "learning_rate": 4.046172539489672e-05, + "loss": 0.4305, + "step": 999 + }, + { + "epoch": 0.08101101749837979, + "grad_norm": 0.10510041564702988, + "learning_rate": 4.050222762251924e-05, + "loss": 0.4376, + "step": 1000 + }, + { + "epoch": 0.08109202851587816, + "grad_norm": 0.07272931188344955, + "learning_rate": 4.054272985014176e-05, + "loss": 0.4248, + "step": 1001 + }, + { + "epoch": 0.08117303953337654, + "grad_norm": 0.08269768953323364, + "learning_rate": 4.0583232077764274e-05, + "loss": 0.4025, + "step": 1002 + }, + { + "epoch": 
0.08125405055087492, + "grad_norm": 0.054768819361925125, + "learning_rate": 4.0623734305386794e-05, + "loss": 0.4148, + "step": 1003 + }, + { + "epoch": 0.08133506156837329, + "grad_norm": 0.12584498524665833, + "learning_rate": 4.066423653300932e-05, + "loss": 0.4429, + "step": 1004 + }, + { + "epoch": 0.08141607258587168, + "grad_norm": 0.05965457484126091, + "learning_rate": 4.070473876063184e-05, + "loss": 0.3961, + "step": 1005 + }, + { + "epoch": 0.08149708360337006, + "grad_norm": 0.07491681724786758, + "learning_rate": 4.074524098825436e-05, + "loss": 0.432, + "step": 1006 + }, + { + "epoch": 0.08157809462086844, + "grad_norm": 0.06580981612205505, + "learning_rate": 4.078574321587687e-05, + "loss": 0.3893, + "step": 1007 + }, + { + "epoch": 0.08165910563836681, + "grad_norm": 0.07918661832809448, + "learning_rate": 4.082624544349939e-05, + "loss": 0.4332, + "step": 1008 + }, + { + "epoch": 0.0817401166558652, + "grad_norm": 0.07634010910987854, + "learning_rate": 4.086674767112191e-05, + "loss": 0.377, + "step": 1009 + }, + { + "epoch": 0.08182112767336358, + "grad_norm": 0.0872841626405716, + "learning_rate": 4.090724989874443e-05, + "loss": 0.4351, + "step": 1010 + }, + { + "epoch": 0.08190213869086195, + "grad_norm": 0.08204904943704605, + "learning_rate": 4.094775212636695e-05, + "loss": 0.4632, + "step": 1011 + }, + { + "epoch": 0.08198314970836033, + "grad_norm": 0.06768794357776642, + "learning_rate": 4.098825435398947e-05, + "loss": 0.4524, + "step": 1012 + }, + { + "epoch": 0.08206416072585872, + "grad_norm": 0.07469814270734787, + "learning_rate": 4.102875658161199e-05, + "loss": 0.3856, + "step": 1013 + }, + { + "epoch": 0.0821451717433571, + "grad_norm": 0.0659903958439827, + "learning_rate": 4.106925880923451e-05, + "loss": 0.4345, + "step": 1014 + }, + { + "epoch": 0.08222618276085547, + "grad_norm": 0.06834937632083893, + "learning_rate": 4.110976103685703e-05, + "loss": 0.4568, + "step": 1015 + }, + { + "epoch": 0.08230719377835385, + "grad_norm": 0.08034605532884598, + "learning_rate": 4.115026326447955e-05, + "loss": 0.3623, + "step": 1016 + }, + { + "epoch": 0.08238820479585224, + "grad_norm": 0.08727092295885086, + "learning_rate": 4.119076549210207e-05, + "loss": 0.425, + "step": 1017 + }, + { + "epoch": 0.08246921581335062, + "grad_norm": 0.06007024645805359, + "learning_rate": 4.123126771972459e-05, + "loss": 0.3768, + "step": 1018 + }, + { + "epoch": 0.08255022683084899, + "grad_norm": 0.09328009933233261, + "learning_rate": 4.127176994734711e-05, + "loss": 0.415, + "step": 1019 + }, + { + "epoch": 0.08263123784834737, + "grad_norm": 0.0731268972158432, + "learning_rate": 4.131227217496962e-05, + "loss": 0.4157, + "step": 1020 + }, + { + "epoch": 0.08271224886584576, + "grad_norm": 0.08149100095033646, + "learning_rate": 4.135277440259214e-05, + "loss": 0.39, + "step": 1021 + }, + { + "epoch": 0.08279325988334413, + "grad_norm": 0.07745902240276337, + "learning_rate": 4.139327663021466e-05, + "loss": 0.4899, + "step": 1022 + }, + { + "epoch": 0.08287427090084251, + "grad_norm": 0.08260829001665115, + "learning_rate": 4.143377885783719e-05, + "loss": 0.4592, + "step": 1023 + }, + { + "epoch": 0.0829552819183409, + "grad_norm": 0.08275768160820007, + "learning_rate": 4.147428108545971e-05, + "loss": 0.4523, + "step": 1024 + }, + { + "epoch": 0.08303629293583928, + "grad_norm": 0.08937709033489227, + "learning_rate": 4.151478331308222e-05, + "loss": 0.4551, + "step": 1025 + }, + { + "epoch": 0.08311730395333765, + "grad_norm": 0.0691104531288147, + 
"learning_rate": 4.155528554070474e-05, + "loss": 0.4338, + "step": 1026 + }, + { + "epoch": 0.08319831497083603, + "grad_norm": 0.061282236129045486, + "learning_rate": 4.159578776832726e-05, + "loss": 0.4415, + "step": 1027 + }, + { + "epoch": 0.08327932598833442, + "grad_norm": 0.08525071293115616, + "learning_rate": 4.163628999594978e-05, + "loss": 0.4987, + "step": 1028 + }, + { + "epoch": 0.0833603370058328, + "grad_norm": 0.07696589827537537, + "learning_rate": 4.16767922235723e-05, + "loss": 0.409, + "step": 1029 + }, + { + "epoch": 0.08344134802333117, + "grad_norm": 0.06403271853923798, + "learning_rate": 4.171729445119482e-05, + "loss": 0.3939, + "step": 1030 + }, + { + "epoch": 0.08352235904082955, + "grad_norm": 0.11858736723661423, + "learning_rate": 4.175779667881734e-05, + "loss": 0.4389, + "step": 1031 + }, + { + "epoch": 0.08360337005832794, + "grad_norm": 0.1112351268529892, + "learning_rate": 4.179829890643986e-05, + "loss": 0.4359, + "step": 1032 + }, + { + "epoch": 0.0836843810758263, + "grad_norm": 0.07795192301273346, + "learning_rate": 4.183880113406238e-05, + "loss": 0.3926, + "step": 1033 + }, + { + "epoch": 0.08376539209332469, + "grad_norm": 0.08661824464797974, + "learning_rate": 4.18793033616849e-05, + "loss": 0.452, + "step": 1034 + }, + { + "epoch": 0.08384640311082307, + "grad_norm": 0.07108978182077408, + "learning_rate": 4.1919805589307417e-05, + "loss": 0.4535, + "step": 1035 + }, + { + "epoch": 0.08392741412832146, + "grad_norm": 0.08247314393520355, + "learning_rate": 4.1960307816929936e-05, + "loss": 0.4366, + "step": 1036 + }, + { + "epoch": 0.08400842514581983, + "grad_norm": 0.07033973187208176, + "learning_rate": 4.200081004455245e-05, + "loss": 0.3718, + "step": 1037 + }, + { + "epoch": 0.08408943616331821, + "grad_norm": 0.10139517486095428, + "learning_rate": 4.204131227217497e-05, + "loss": 0.4682, + "step": 1038 + }, + { + "epoch": 0.0841704471808166, + "grad_norm": 0.06558583676815033, + "learning_rate": 4.208181449979749e-05, + "loss": 0.4123, + "step": 1039 + }, + { + "epoch": 0.08425145819831498, + "grad_norm": 0.06659422814846039, + "learning_rate": 4.212231672742001e-05, + "loss": 0.4624, + "step": 1040 + }, + { + "epoch": 0.08433246921581335, + "grad_norm": 0.09026381373405457, + "learning_rate": 4.216281895504253e-05, + "loss": 0.3827, + "step": 1041 + }, + { + "epoch": 0.08441348023331173, + "grad_norm": 0.08064718544483185, + "learning_rate": 4.2203321182665054e-05, + "loss": 0.4509, + "step": 1042 + }, + { + "epoch": 0.08449449125081011, + "grad_norm": 0.07609561085700989, + "learning_rate": 4.224382341028757e-05, + "loss": 0.3976, + "step": 1043 + }, + { + "epoch": 0.08457550226830848, + "grad_norm": 0.07682601362466812, + "learning_rate": 4.228432563791009e-05, + "loss": 0.418, + "step": 1044 + }, + { + "epoch": 0.08465651328580687, + "grad_norm": 0.07300432026386261, + "learning_rate": 4.2324827865532607e-05, + "loss": 0.4237, + "step": 1045 + }, + { + "epoch": 0.08473752430330525, + "grad_norm": 0.08021645992994308, + "learning_rate": 4.2365330093155126e-05, + "loss": 0.4395, + "step": 1046 + }, + { + "epoch": 0.08481853532080363, + "grad_norm": 0.07140376418828964, + "learning_rate": 4.2405832320777646e-05, + "loss": 0.4582, + "step": 1047 + }, + { + "epoch": 0.084899546338302, + "grad_norm": 0.08937831968069077, + "learning_rate": 4.2446334548400166e-05, + "loss": 0.4022, + "step": 1048 + }, + { + "epoch": 0.08498055735580039, + "grad_norm": 0.0884295105934143, + "learning_rate": 4.248683677602268e-05, + "loss": 0.4502, + 
"step": 1049 + }, + { + "epoch": 0.08506156837329877, + "grad_norm": 0.0966046079993248, + "learning_rate": 4.25273390036452e-05, + "loss": 0.3892, + "step": 1050 + }, + { + "epoch": 0.08514257939079715, + "grad_norm": 0.07256580144166946, + "learning_rate": 4.2567841231267725e-05, + "loss": 0.4142, + "step": 1051 + }, + { + "epoch": 0.08522359040829552, + "grad_norm": 0.08426011353731155, + "learning_rate": 4.2608343458890244e-05, + "loss": 0.4438, + "step": 1052 + }, + { + "epoch": 0.08530460142579391, + "grad_norm": 0.07426105439662933, + "learning_rate": 4.2648845686512764e-05, + "loss": 0.403, + "step": 1053 + }, + { + "epoch": 0.08538561244329229, + "grad_norm": 0.09439975768327713, + "learning_rate": 4.2689347914135284e-05, + "loss": 0.3901, + "step": 1054 + }, + { + "epoch": 0.08546662346079066, + "grad_norm": 0.07137623429298401, + "learning_rate": 4.2729850141757796e-05, + "loss": 0.4224, + "step": 1055 + }, + { + "epoch": 0.08554763447828904, + "grad_norm": 0.07066964358091354, + "learning_rate": 4.2770352369380316e-05, + "loss": 0.4394, + "step": 1056 + }, + { + "epoch": 0.08562864549578743, + "grad_norm": 0.06699734926223755, + "learning_rate": 4.2810854597002836e-05, + "loss": 0.4161, + "step": 1057 + }, + { + "epoch": 0.08570965651328581, + "grad_norm": 0.06020607799291611, + "learning_rate": 4.2851356824625355e-05, + "loss": 0.4373, + "step": 1058 + }, + { + "epoch": 0.08579066753078418, + "grad_norm": 0.06886182725429535, + "learning_rate": 4.2891859052247875e-05, + "loss": 0.433, + "step": 1059 + }, + { + "epoch": 0.08587167854828257, + "grad_norm": 0.07165521383285522, + "learning_rate": 4.2932361279870395e-05, + "loss": 0.382, + "step": 1060 + }, + { + "epoch": 0.08595268956578095, + "grad_norm": 0.05116075277328491, + "learning_rate": 4.2972863507492914e-05, + "loss": 0.4373, + "step": 1061 + }, + { + "epoch": 0.08603370058327933, + "grad_norm": 0.060705091804265976, + "learning_rate": 4.3013365735115434e-05, + "loss": 0.4095, + "step": 1062 + }, + { + "epoch": 0.0861147116007777, + "grad_norm": 0.0898757055401802, + "learning_rate": 4.3053867962737954e-05, + "loss": 0.4073, + "step": 1063 + }, + { + "epoch": 0.08619572261827609, + "grad_norm": 0.0634063258767128, + "learning_rate": 4.3094370190360473e-05, + "loss": 0.3874, + "step": 1064 + }, + { + "epoch": 0.08627673363577447, + "grad_norm": 0.07108429074287415, + "learning_rate": 4.313487241798299e-05, + "loss": 0.3907, + "step": 1065 + }, + { + "epoch": 0.08635774465327284, + "grad_norm": 0.06919017434120178, + "learning_rate": 4.317537464560551e-05, + "loss": 0.405, + "step": 1066 + }, + { + "epoch": 0.08643875567077122, + "grad_norm": 0.08269021660089493, + "learning_rate": 4.3215876873228026e-05, + "loss": 0.4392, + "step": 1067 + }, + { + "epoch": 0.0865197666882696, + "grad_norm": 0.07313472777605057, + "learning_rate": 4.3256379100850545e-05, + "loss": 0.449, + "step": 1068 + }, + { + "epoch": 0.08660077770576799, + "grad_norm": 0.08952949196100235, + "learning_rate": 4.3296881328473065e-05, + "loss": 0.3906, + "step": 1069 + }, + { + "epoch": 0.08668178872326636, + "grad_norm": 0.06147094443440437, + "learning_rate": 4.333738355609559e-05, + "loss": 0.3965, + "step": 1070 + }, + { + "epoch": 0.08676279974076474, + "grad_norm": 0.05797317251563072, + "learning_rate": 4.337788578371811e-05, + "loss": 0.4425, + "step": 1071 + }, + { + "epoch": 0.08684381075826313, + "grad_norm": 0.07006549835205078, + "learning_rate": 4.3418388011340624e-05, + "loss": 0.42, + "step": 1072 + }, + { + "epoch": 
0.08692482177576151, + "grad_norm": 0.0779939591884613, + "learning_rate": 4.3458890238963144e-05, + "loss": 0.4091, + "step": 1073 + }, + { + "epoch": 0.08700583279325988, + "grad_norm": 0.05775775760412216, + "learning_rate": 4.349939246658566e-05, + "loss": 0.4438, + "step": 1074 + }, + { + "epoch": 0.08708684381075826, + "grad_norm": 0.054523251950740814, + "learning_rate": 4.353989469420818e-05, + "loss": 0.4324, + "step": 1075 + }, + { + "epoch": 0.08716785482825665, + "grad_norm": 0.07762283086776733, + "learning_rate": 4.35803969218307e-05, + "loss": 0.4263, + "step": 1076 + }, + { + "epoch": 0.08724886584575502, + "grad_norm": 0.08139238506555557, + "learning_rate": 4.362089914945322e-05, + "loss": 0.3948, + "step": 1077 + }, + { + "epoch": 0.0873298768632534, + "grad_norm": 0.07066658139228821, + "learning_rate": 4.366140137707574e-05, + "loss": 0.4454, + "step": 1078 + }, + { + "epoch": 0.08741088788075178, + "grad_norm": 0.06936241686344147, + "learning_rate": 4.3701903604698255e-05, + "loss": 0.3634, + "step": 1079 + }, + { + "epoch": 0.08749189889825017, + "grad_norm": 0.08730461448431015, + "learning_rate": 4.374240583232078e-05, + "loss": 0.4485, + "step": 1080 + }, + { + "epoch": 0.08757290991574854, + "grad_norm": 0.05085871368646622, + "learning_rate": 4.37829080599433e-05, + "loss": 0.4426, + "step": 1081 + }, + { + "epoch": 0.08765392093324692, + "grad_norm": 0.07138410210609436, + "learning_rate": 4.382341028756582e-05, + "loss": 0.3827, + "step": 1082 + }, + { + "epoch": 0.0877349319507453, + "grad_norm": 0.06872845441102982, + "learning_rate": 4.386391251518834e-05, + "loss": 0.4263, + "step": 1083 + }, + { + "epoch": 0.08781594296824369, + "grad_norm": 0.09864786267280579, + "learning_rate": 4.390441474281085e-05, + "loss": 0.3815, + "step": 1084 + }, + { + "epoch": 0.08789695398574206, + "grad_norm": 0.1010093167424202, + "learning_rate": 4.394491697043337e-05, + "loss": 0.4514, + "step": 1085 + }, + { + "epoch": 0.08797796500324044, + "grad_norm": 0.07918153703212738, + "learning_rate": 4.398541919805589e-05, + "loss": 0.425, + "step": 1086 + }, + { + "epoch": 0.08805897602073882, + "grad_norm": 0.0776391550898552, + "learning_rate": 4.402592142567841e-05, + "loss": 0.406, + "step": 1087 + }, + { + "epoch": 0.0881399870382372, + "grad_norm": 0.07044905424118042, + "learning_rate": 4.406642365330093e-05, + "loss": 0.4271, + "step": 1088 + }, + { + "epoch": 0.08822099805573558, + "grad_norm": 0.07022466510534286, + "learning_rate": 4.410692588092346e-05, + "loss": 0.4309, + "step": 1089 + }, + { + "epoch": 0.08830200907323396, + "grad_norm": 0.08486749976873398, + "learning_rate": 4.414742810854597e-05, + "loss": 0.4455, + "step": 1090 + }, + { + "epoch": 0.08838302009073234, + "grad_norm": 0.057559676468372345, + "learning_rate": 4.418793033616849e-05, + "loss": 0.3676, + "step": 1091 + }, + { + "epoch": 0.08846403110823071, + "grad_norm": 0.07805667817592621, + "learning_rate": 4.422843256379101e-05, + "loss": 0.4804, + "step": 1092 + }, + { + "epoch": 0.0885450421257291, + "grad_norm": 0.12310101091861725, + "learning_rate": 4.426893479141353e-05, + "loss": 0.4195, + "step": 1093 + }, + { + "epoch": 0.08862605314322748, + "grad_norm": 0.07303401827812195, + "learning_rate": 4.430943701903605e-05, + "loss": 0.4732, + "step": 1094 + }, + { + "epoch": 0.08870706416072587, + "grad_norm": 0.08019717782735825, + "learning_rate": 4.434993924665857e-05, + "loss": 0.4216, + "step": 1095 + }, + { + "epoch": 0.08878807517822424, + "grad_norm": 0.08238019049167633, + 
"learning_rate": 4.439044147428109e-05, + "loss": 0.4106, + "step": 1096 + }, + { + "epoch": 0.08886908619572262, + "grad_norm": 0.06461286544799805, + "learning_rate": 4.44309437019036e-05, + "loss": 0.4242, + "step": 1097 + }, + { + "epoch": 0.088950097213221, + "grad_norm": 0.09898345917463303, + "learning_rate": 4.447144592952613e-05, + "loss": 0.4189, + "step": 1098 + }, + { + "epoch": 0.08903110823071937, + "grad_norm": 0.060210444033145905, + "learning_rate": 4.451194815714865e-05, + "loss": 0.3702, + "step": 1099 + }, + { + "epoch": 0.08911211924821776, + "grad_norm": 0.0533088743686676, + "learning_rate": 4.455245038477117e-05, + "loss": 0.3527, + "step": 1100 + }, + { + "epoch": 0.08919313026571614, + "grad_norm": 0.08914104104042053, + "learning_rate": 4.459295261239369e-05, + "loss": 0.4156, + "step": 1101 + }, + { + "epoch": 0.08927414128321452, + "grad_norm": 0.07332563400268555, + "learning_rate": 4.46334548400162e-05, + "loss": 0.377, + "step": 1102 + }, + { + "epoch": 0.08935515230071289, + "grad_norm": 0.07546962052583694, + "learning_rate": 4.467395706763872e-05, + "loss": 0.4239, + "step": 1103 + }, + { + "epoch": 0.08943616331821128, + "grad_norm": 0.06037290766835213, + "learning_rate": 4.471445929526124e-05, + "loss": 0.3883, + "step": 1104 + }, + { + "epoch": 0.08951717433570966, + "grad_norm": 0.059730976819992065, + "learning_rate": 4.475496152288376e-05, + "loss": 0.4102, + "step": 1105 + }, + { + "epoch": 0.08959818535320804, + "grad_norm": 0.07452671229839325, + "learning_rate": 4.479546375050628e-05, + "loss": 0.4174, + "step": 1106 + }, + { + "epoch": 0.08967919637070641, + "grad_norm": 0.06664783507585526, + "learning_rate": 4.48359659781288e-05, + "loss": 0.433, + "step": 1107 + }, + { + "epoch": 0.0897602073882048, + "grad_norm": 0.06502564251422882, + "learning_rate": 4.487646820575132e-05, + "loss": 0.3682, + "step": 1108 + }, + { + "epoch": 0.08984121840570318, + "grad_norm": 0.08823461830615997, + "learning_rate": 4.491697043337384e-05, + "loss": 0.4112, + "step": 1109 + }, + { + "epoch": 0.08992222942320155, + "grad_norm": 0.06355852633714676, + "learning_rate": 4.495747266099636e-05, + "loss": 0.3712, + "step": 1110 + }, + { + "epoch": 0.09000324044069993, + "grad_norm": 0.09234008938074112, + "learning_rate": 4.499797488861888e-05, + "loss": 0.3959, + "step": 1111 + }, + { + "epoch": 0.09008425145819832, + "grad_norm": 0.07149260491132736, + "learning_rate": 4.50384771162414e-05, + "loss": 0.4581, + "step": 1112 + }, + { + "epoch": 0.0901652624756967, + "grad_norm": 0.0520872138440609, + "learning_rate": 4.507897934386392e-05, + "loss": 0.3969, + "step": 1113 + }, + { + "epoch": 0.09024627349319507, + "grad_norm": 0.06375659257173538, + "learning_rate": 4.511948157148643e-05, + "loss": 0.4179, + "step": 1114 + }, + { + "epoch": 0.09032728451069345, + "grad_norm": 0.05911070480942726, + "learning_rate": 4.515998379910895e-05, + "loss": 0.3514, + "step": 1115 + }, + { + "epoch": 0.09040829552819184, + "grad_norm": 0.07041165977716446, + "learning_rate": 4.520048602673147e-05, + "loss": 0.3959, + "step": 1116 + }, + { + "epoch": 0.09048930654569022, + "grad_norm": 0.07625393569469452, + "learning_rate": 4.5240988254353996e-05, + "loss": 0.4026, + "step": 1117 + }, + { + "epoch": 0.09057031756318859, + "grad_norm": 0.06732834130525589, + "learning_rate": 4.5281490481976515e-05, + "loss": 0.4292, + "step": 1118 + }, + { + "epoch": 0.09065132858068697, + "grad_norm": 0.07233322411775589, + "learning_rate": 4.5321992709599035e-05, + "loss": 0.4127, + 
"step": 1119 + }, + { + "epoch": 0.09073233959818536, + "grad_norm": 0.09195321798324585, + "learning_rate": 4.536249493722155e-05, + "loss": 0.4276, + "step": 1120 + }, + { + "epoch": 0.09081335061568373, + "grad_norm": 0.06543777137994766, + "learning_rate": 4.540299716484407e-05, + "loss": 0.4317, + "step": 1121 + }, + { + "epoch": 0.09089436163318211, + "grad_norm": 0.08684305101633072, + "learning_rate": 4.544349939246659e-05, + "loss": 0.4126, + "step": 1122 + }, + { + "epoch": 0.0909753726506805, + "grad_norm": 0.07450434565544128, + "learning_rate": 4.548400162008911e-05, + "loss": 0.4115, + "step": 1123 + }, + { + "epoch": 0.09105638366817888, + "grad_norm": 0.06927470862865448, + "learning_rate": 4.5524503847711626e-05, + "loss": 0.411, + "step": 1124 + }, + { + "epoch": 0.09113739468567725, + "grad_norm": 0.08960287272930145, + "learning_rate": 4.5565006075334146e-05, + "loss": 0.4878, + "step": 1125 + }, + { + "epoch": 0.09121840570317563, + "grad_norm": 0.09957445412874222, + "learning_rate": 4.560550830295666e-05, + "loss": 0.3939, + "step": 1126 + }, + { + "epoch": 0.09129941672067401, + "grad_norm": 0.0676618292927742, + "learning_rate": 4.5646010530579185e-05, + "loss": 0.4266, + "step": 1127 + }, + { + "epoch": 0.09138042773817238, + "grad_norm": 0.07924690842628479, + "learning_rate": 4.5686512758201705e-05, + "loss": 0.4755, + "step": 1128 + }, + { + "epoch": 0.09146143875567077, + "grad_norm": 0.08089061081409454, + "learning_rate": 4.5727014985824225e-05, + "loss": 0.4054, + "step": 1129 + }, + { + "epoch": 0.09154244977316915, + "grad_norm": 0.05947039648890495, + "learning_rate": 4.5767517213446744e-05, + "loss": 0.3883, + "step": 1130 + }, + { + "epoch": 0.09162346079066754, + "grad_norm": 0.06786283105611801, + "learning_rate": 4.5808019441069264e-05, + "loss": 0.4051, + "step": 1131 + }, + { + "epoch": 0.0917044718081659, + "grad_norm": 0.07103536278009415, + "learning_rate": 4.584852166869178e-05, + "loss": 0.4103, + "step": 1132 + }, + { + "epoch": 0.09178548282566429, + "grad_norm": 0.09525204449892044, + "learning_rate": 4.58890238963143e-05, + "loss": 0.4124, + "step": 1133 + }, + { + "epoch": 0.09186649384316267, + "grad_norm": 0.08623608201742172, + "learning_rate": 4.5929526123936816e-05, + "loss": 0.4231, + "step": 1134 + }, + { + "epoch": 0.09194750486066106, + "grad_norm": 0.06909052282571793, + "learning_rate": 4.5970028351559336e-05, + "loss": 0.3996, + "step": 1135 + }, + { + "epoch": 0.09202851587815943, + "grad_norm": 0.07997187972068787, + "learning_rate": 4.601053057918186e-05, + "loss": 0.3815, + "step": 1136 + }, + { + "epoch": 0.09210952689565781, + "grad_norm": 0.07907916605472565, + "learning_rate": 4.6051032806804375e-05, + "loss": 0.3531, + "step": 1137 + }, + { + "epoch": 0.09219053791315619, + "grad_norm": 0.08305371552705765, + "learning_rate": 4.6091535034426895e-05, + "loss": 0.4223, + "step": 1138 + }, + { + "epoch": 0.09227154893065456, + "grad_norm": 0.05640506371855736, + "learning_rate": 4.6132037262049415e-05, + "loss": 0.417, + "step": 1139 + }, + { + "epoch": 0.09235255994815295, + "grad_norm": 0.07259729504585266, + "learning_rate": 4.6172539489671934e-05, + "loss": 0.4169, + "step": 1140 + }, + { + "epoch": 0.09243357096565133, + "grad_norm": 0.062133122235536575, + "learning_rate": 4.6213041717294454e-05, + "loss": 0.3546, + "step": 1141 + }, + { + "epoch": 0.09251458198314971, + "grad_norm": 0.05108145251870155, + "learning_rate": 4.6253543944916974e-05, + "loss": 0.3647, + "step": 1142 + }, + { + "epoch": 
0.09259559300064808, + "grad_norm": 0.07486524432897568, + "learning_rate": 4.629404617253949e-05, + "loss": 0.4012, + "step": 1143 + }, + { + "epoch": 0.09267660401814647, + "grad_norm": 0.08537513017654419, + "learning_rate": 4.6334548400162006e-05, + "loss": 0.4385, + "step": 1144 + }, + { + "epoch": 0.09275761503564485, + "grad_norm": 0.07322200387716293, + "learning_rate": 4.637505062778453e-05, + "loss": 0.398, + "step": 1145 + }, + { + "epoch": 0.09283862605314323, + "grad_norm": 0.06534241139888763, + "learning_rate": 4.641555285540705e-05, + "loss": 0.3882, + "step": 1146 + }, + { + "epoch": 0.0929196370706416, + "grad_norm": 0.10034750401973724, + "learning_rate": 4.645605508302957e-05, + "loss": 0.4191, + "step": 1147 + }, + { + "epoch": 0.09300064808813999, + "grad_norm": 0.0938630998134613, + "learning_rate": 4.649655731065209e-05, + "loss": 0.4167, + "step": 1148 + }, + { + "epoch": 0.09308165910563837, + "grad_norm": 0.06524375826120377, + "learning_rate": 4.6537059538274605e-05, + "loss": 0.3875, + "step": 1149 + }, + { + "epoch": 0.09316267012313674, + "grad_norm": 0.07733950018882751, + "learning_rate": 4.6577561765897124e-05, + "loss": 0.4724, + "step": 1150 + }, + { + "epoch": 0.09324368114063512, + "grad_norm": 0.08383625745773315, + "learning_rate": 4.6618063993519644e-05, + "loss": 0.4273, + "step": 1151 + }, + { + "epoch": 0.09332469215813351, + "grad_norm": 0.07422877848148346, + "learning_rate": 4.6658566221142164e-05, + "loss": 0.4515, + "step": 1152 + }, + { + "epoch": 0.09340570317563189, + "grad_norm": 0.06479012966156006, + "learning_rate": 4.669906844876468e-05, + "loss": 0.4212, + "step": 1153 + }, + { + "epoch": 0.09348671419313026, + "grad_norm": 0.055432505905628204, + "learning_rate": 4.67395706763872e-05, + "loss": 0.417, + "step": 1154 + }, + { + "epoch": 0.09356772521062864, + "grad_norm": 0.09282301366329193, + "learning_rate": 4.678007290400972e-05, + "loss": 0.4056, + "step": 1155 + }, + { + "epoch": 0.09364873622812703, + "grad_norm": 0.08054335415363312, + "learning_rate": 4.682057513163224e-05, + "loss": 0.3989, + "step": 1156 + }, + { + "epoch": 0.09372974724562541, + "grad_norm": 0.0536513514816761, + "learning_rate": 4.686107735925476e-05, + "loss": 0.4002, + "step": 1157 + }, + { + "epoch": 0.09381075826312378, + "grad_norm": 0.071620412170887, + "learning_rate": 4.690157958687728e-05, + "loss": 0.4133, + "step": 1158 + }, + { + "epoch": 0.09389176928062216, + "grad_norm": 0.06727361679077148, + "learning_rate": 4.69420818144998e-05, + "loss": 0.4043, + "step": 1159 + }, + { + "epoch": 0.09397278029812055, + "grad_norm": 0.07755836099386215, + "learning_rate": 4.698258404212232e-05, + "loss": 0.415, + "step": 1160 + }, + { + "epoch": 0.09405379131561892, + "grad_norm": 0.06299854069948196, + "learning_rate": 4.7023086269744834e-05, + "loss": 0.406, + "step": 1161 + }, + { + "epoch": 0.0941348023331173, + "grad_norm": 0.07159342616796494, + "learning_rate": 4.7063588497367354e-05, + "loss": 0.414, + "step": 1162 + }, + { + "epoch": 0.09421581335061568, + "grad_norm": 0.07070232182741165, + "learning_rate": 4.710409072498987e-05, + "loss": 0.3701, + "step": 1163 + }, + { + "epoch": 0.09429682436811407, + "grad_norm": 0.06699605286121368, + "learning_rate": 4.71445929526124e-05, + "loss": 0.4029, + "step": 1164 + }, + { + "epoch": 0.09437783538561244, + "grad_norm": 0.05861575901508331, + "learning_rate": 4.718509518023492e-05, + "loss": 0.4001, + "step": 1165 + }, + { + "epoch": 0.09445884640311082, + "grad_norm": 0.06890154629945755, + 
"learning_rate": 4.722559740785744e-05, + "loss": 0.4163, + "step": 1166 + }, + { + "epoch": 0.0945398574206092, + "grad_norm": 0.07200920581817627, + "learning_rate": 4.726609963547995e-05, + "loss": 0.416, + "step": 1167 + }, + { + "epoch": 0.09462086843810759, + "grad_norm": 0.08826032280921936, + "learning_rate": 4.730660186310247e-05, + "loss": 0.3949, + "step": 1168 + }, + { + "epoch": 0.09470187945560596, + "grad_norm": 0.08327112346887589, + "learning_rate": 4.734710409072499e-05, + "loss": 0.4567, + "step": 1169 + }, + { + "epoch": 0.09478289047310434, + "grad_norm": 0.06406053900718689, + "learning_rate": 4.738760631834751e-05, + "loss": 0.3839, + "step": 1170 + }, + { + "epoch": 0.09486390149060273, + "grad_norm": 0.09097456187009811, + "learning_rate": 4.742810854597003e-05, + "loss": 0.402, + "step": 1171 + }, + { + "epoch": 0.0949449125081011, + "grad_norm": 0.11253924667835236, + "learning_rate": 4.746861077359255e-05, + "loss": 0.4074, + "step": 1172 + }, + { + "epoch": 0.09502592352559948, + "grad_norm": 0.08323650807142258, + "learning_rate": 4.750911300121506e-05, + "loss": 0.4308, + "step": 1173 + }, + { + "epoch": 0.09510693454309786, + "grad_norm": 0.05829388275742531, + "learning_rate": 4.754961522883759e-05, + "loss": 0.393, + "step": 1174 + }, + { + "epoch": 0.09518794556059625, + "grad_norm": 0.08917537331581116, + "learning_rate": 4.759011745646011e-05, + "loss": 0.4074, + "step": 1175 + }, + { + "epoch": 0.09526895657809462, + "grad_norm": 0.0945352092385292, + "learning_rate": 4.763061968408263e-05, + "loss": 0.4483, + "step": 1176 + }, + { + "epoch": 0.095349967595593, + "grad_norm": 0.08760906010866165, + "learning_rate": 4.767112191170515e-05, + "loss": 0.4784, + "step": 1177 + }, + { + "epoch": 0.09543097861309138, + "grad_norm": 0.08544166386127472, + "learning_rate": 4.771162413932767e-05, + "loss": 0.4429, + "step": 1178 + }, + { + "epoch": 0.09551198963058977, + "grad_norm": 0.07088976353406906, + "learning_rate": 4.775212636695018e-05, + "loss": 0.4216, + "step": 1179 + }, + { + "epoch": 0.09559300064808814, + "grad_norm": 0.08196429163217545, + "learning_rate": 4.77926285945727e-05, + "loss": 0.4091, + "step": 1180 + }, + { + "epoch": 0.09567401166558652, + "grad_norm": 0.06955184787511826, + "learning_rate": 4.783313082219522e-05, + "loss": 0.4191, + "step": 1181 + }, + { + "epoch": 0.0957550226830849, + "grad_norm": 0.06988652795553207, + "learning_rate": 4.787363304981774e-05, + "loss": 0.4125, + "step": 1182 + }, + { + "epoch": 0.09583603370058327, + "grad_norm": 0.10521332174539566, + "learning_rate": 4.7914135277440267e-05, + "loss": 0.4901, + "step": 1183 + }, + { + "epoch": 0.09591704471808166, + "grad_norm": 0.05970345810055733, + "learning_rate": 4.795463750506278e-05, + "loss": 0.428, + "step": 1184 + }, + { + "epoch": 0.09599805573558004, + "grad_norm": 0.07127997279167175, + "learning_rate": 4.79951397326853e-05, + "loss": 0.3929, + "step": 1185 + }, + { + "epoch": 0.09607906675307842, + "grad_norm": 0.07283204048871994, + "learning_rate": 4.803564196030782e-05, + "loss": 0.4243, + "step": 1186 + }, + { + "epoch": 0.0961600777705768, + "grad_norm": 0.10053546726703644, + "learning_rate": 4.807614418793034e-05, + "loss": 0.4163, + "step": 1187 + }, + { + "epoch": 0.09624108878807518, + "grad_norm": 0.126708984375, + "learning_rate": 4.811664641555286e-05, + "loss": 0.4715, + "step": 1188 + }, + { + "epoch": 0.09632209980557356, + "grad_norm": 0.10233563184738159, + "learning_rate": 4.815714864317538e-05, + "loss": 0.4774, + "step": 1189 + 
}, + { + "epoch": 0.09640311082307194, + "grad_norm": 0.08582703024148941, + "learning_rate": 4.81976508707979e-05, + "loss": 0.4193, + "step": 1190 + }, + { + "epoch": 0.09648412184057031, + "grad_norm": 0.07314405590295792, + "learning_rate": 4.823815309842041e-05, + "loss": 0.3819, + "step": 1191 + }, + { + "epoch": 0.0965651328580687, + "grad_norm": 0.09516175836324692, + "learning_rate": 4.827865532604293e-05, + "loss": 0.4087, + "step": 1192 + }, + { + "epoch": 0.09664614387556708, + "grad_norm": 0.08461283892393112, + "learning_rate": 4.8319157553665456e-05, + "loss": 0.408, + "step": 1193 + }, + { + "epoch": 0.09672715489306545, + "grad_norm": 0.0769420638680458, + "learning_rate": 4.8359659781287976e-05, + "loss": 0.401, + "step": 1194 + }, + { + "epoch": 0.09680816591056383, + "grad_norm": 0.06576257199048996, + "learning_rate": 4.8400162008910496e-05, + "loss": 0.4162, + "step": 1195 + }, + { + "epoch": 0.09688917692806222, + "grad_norm": 0.06271419674158096, + "learning_rate": 4.8440664236533015e-05, + "loss": 0.3753, + "step": 1196 + }, + { + "epoch": 0.0969701879455606, + "grad_norm": 0.07287538051605225, + "learning_rate": 4.848116646415553e-05, + "loss": 0.3884, + "step": 1197 + }, + { + "epoch": 0.09705119896305897, + "grad_norm": 0.07314209640026093, + "learning_rate": 4.852166869177805e-05, + "loss": 0.4365, + "step": 1198 + }, + { + "epoch": 0.09713220998055735, + "grad_norm": 0.09039682894945145, + "learning_rate": 4.856217091940057e-05, + "loss": 0.4272, + "step": 1199 + }, + { + "epoch": 0.09721322099805574, + "grad_norm": 0.07149705290794373, + "learning_rate": 4.860267314702309e-05, + "loss": 0.4285, + "step": 1200 + }, + { + "epoch": 0.09729423201555412, + "grad_norm": 0.07689498364925385, + "learning_rate": 4.864317537464561e-05, + "loss": 0.3871, + "step": 1201 + }, + { + "epoch": 0.09737524303305249, + "grad_norm": 0.0991893783211708, + "learning_rate": 4.868367760226813e-05, + "loss": 0.406, + "step": 1202 + }, + { + "epoch": 0.09745625405055088, + "grad_norm": 0.07114807516336441, + "learning_rate": 4.8724179829890646e-05, + "loss": 0.4091, + "step": 1203 + }, + { + "epoch": 0.09753726506804926, + "grad_norm": 0.09959740936756134, + "learning_rate": 4.8764682057513166e-05, + "loss": 0.4136, + "step": 1204 + }, + { + "epoch": 0.09761827608554763, + "grad_norm": 0.07426830381155014, + "learning_rate": 4.8805184285135686e-05, + "loss": 0.3902, + "step": 1205 + }, + { + "epoch": 0.09769928710304601, + "grad_norm": 0.06713051348924637, + "learning_rate": 4.8845686512758205e-05, + "loss": 0.4007, + "step": 1206 + }, + { + "epoch": 0.0977802981205444, + "grad_norm": 0.06639891117811203, + "learning_rate": 4.8886188740380725e-05, + "loss": 0.4468, + "step": 1207 + }, + { + "epoch": 0.09786130913804278, + "grad_norm": 0.07029426842927933, + "learning_rate": 4.8926690968003245e-05, + "loss": 0.3641, + "step": 1208 + }, + { + "epoch": 0.09794232015554115, + "grad_norm": 0.07336651533842087, + "learning_rate": 4.896719319562576e-05, + "loss": 0.4148, + "step": 1209 + }, + { + "epoch": 0.09802333117303953, + "grad_norm": 0.07670346647500992, + "learning_rate": 4.900769542324828e-05, + "loss": 0.438, + "step": 1210 + }, + { + "epoch": 0.09810434219053792, + "grad_norm": 0.07110664993524551, + "learning_rate": 4.9048197650870804e-05, + "loss": 0.4076, + "step": 1211 + }, + { + "epoch": 0.0981853532080363, + "grad_norm": 0.06885383278131485, + "learning_rate": 4.9088699878493323e-05, + "loss": 0.3902, + "step": 1212 + }, + { + "epoch": 0.09826636422553467, + "grad_norm": 
0.07384192943572998, + "learning_rate": 4.912920210611584e-05, + "loss": 0.3999, + "step": 1213 + }, + { + "epoch": 0.09834737524303305, + "grad_norm": 0.06060566008090973, + "learning_rate": 4.9169704333738356e-05, + "loss": 0.4097, + "step": 1214 + }, + { + "epoch": 0.09842838626053144, + "grad_norm": 0.06468156725168228, + "learning_rate": 4.9210206561360876e-05, + "loss": 0.3903, + "step": 1215 + }, + { + "epoch": 0.0985093972780298, + "grad_norm": 0.0653204470872879, + "learning_rate": 4.9250708788983395e-05, + "loss": 0.413, + "step": 1216 + }, + { + "epoch": 0.09859040829552819, + "grad_norm": 0.07760292291641235, + "learning_rate": 4.9291211016605915e-05, + "loss": 0.3773, + "step": 1217 + }, + { + "epoch": 0.09867141931302657, + "grad_norm": 0.06723463535308838, + "learning_rate": 4.9331713244228435e-05, + "loss": 0.4027, + "step": 1218 + }, + { + "epoch": 0.09875243033052496, + "grad_norm": 0.07288457453250885, + "learning_rate": 4.9372215471850954e-05, + "loss": 0.4557, + "step": 1219 + }, + { + "epoch": 0.09883344134802333, + "grad_norm": 0.06699313223361969, + "learning_rate": 4.9412717699473474e-05, + "loss": 0.3996, + "step": 1220 + }, + { + "epoch": 0.09891445236552171, + "grad_norm": 0.07510928064584732, + "learning_rate": 4.9453219927095994e-05, + "loss": 0.4223, + "step": 1221 + }, + { + "epoch": 0.0989954633830201, + "grad_norm": 0.07308734208345413, + "learning_rate": 4.949372215471851e-05, + "loss": 0.4433, + "step": 1222 + }, + { + "epoch": 0.09907647440051848, + "grad_norm": 0.0717131718993187, + "learning_rate": 4.953422438234103e-05, + "loss": 0.4068, + "step": 1223 + }, + { + "epoch": 0.09915748541801685, + "grad_norm": 0.07816499471664429, + "learning_rate": 4.957472660996355e-05, + "loss": 0.4064, + "step": 1224 + }, + { + "epoch": 0.09923849643551523, + "grad_norm": 0.0869022086262703, + "learning_rate": 4.961522883758607e-05, + "loss": 0.3962, + "step": 1225 + }, + { + "epoch": 0.09931950745301361, + "grad_norm": 0.06032872572541237, + "learning_rate": 4.9655731065208585e-05, + "loss": 0.3723, + "step": 1226 + }, + { + "epoch": 0.09940051847051198, + "grad_norm": 0.09066958725452423, + "learning_rate": 4.9696233292831105e-05, + "loss": 0.4219, + "step": 1227 + }, + { + "epoch": 0.09948152948801037, + "grad_norm": 0.09620976448059082, + "learning_rate": 4.9736735520453625e-05, + "loss": 0.4514, + "step": 1228 + }, + { + "epoch": 0.09956254050550875, + "grad_norm": 0.07409602403640747, + "learning_rate": 4.9777237748076144e-05, + "loss": 0.4768, + "step": 1229 + }, + { + "epoch": 0.09964355152300713, + "grad_norm": 0.06351660937070847, + "learning_rate": 4.981773997569867e-05, + "loss": 0.3369, + "step": 1230 + }, + { + "epoch": 0.0997245625405055, + "grad_norm": 0.06918103992938995, + "learning_rate": 4.985824220332119e-05, + "loss": 0.4126, + "step": 1231 + }, + { + "epoch": 0.09980557355800389, + "grad_norm": 0.07381061464548111, + "learning_rate": 4.98987444309437e-05, + "loss": 0.4248, + "step": 1232 + }, + { + "epoch": 0.09988658457550227, + "grad_norm": 0.05555868148803711, + "learning_rate": 4.993924665856622e-05, + "loss": 0.3771, + "step": 1233 + }, + { + "epoch": 0.09996759559300065, + "grad_norm": 0.13505244255065918, + "learning_rate": 4.997974888618874e-05, + "loss": 0.4016, + "step": 1234 + }, + { + "epoch": 0.10004860661049902, + "grad_norm": 0.061966948211193085, + "learning_rate": 5.002025111381127e-05, + "loss": 0.4484, + "step": 1235 + }, + { + "epoch": 0.10012961762799741, + "grad_norm": 0.07690034806728363, + "learning_rate": 
5.006075334143379e-05, + "loss": 0.391, + "step": 1236 + }, + { + "epoch": 0.10021062864549579, + "grad_norm": 0.0610932819545269, + "learning_rate": 5.01012555690563e-05, + "loss": 0.4042, + "step": 1237 + }, + { + "epoch": 0.10029163966299416, + "grad_norm": 0.0714229866862297, + "learning_rate": 5.014175779667882e-05, + "loss": 0.3876, + "step": 1238 + }, + { + "epoch": 0.10037265068049255, + "grad_norm": 0.055647656321525574, + "learning_rate": 5.018226002430134e-05, + "loss": 0.4291, + "step": 1239 + }, + { + "epoch": 0.10045366169799093, + "grad_norm": 0.09826336801052094, + "learning_rate": 5.022276225192386e-05, + "loss": 0.4639, + "step": 1240 + }, + { + "epoch": 0.10053467271548931, + "grad_norm": 0.06438162177801132, + "learning_rate": 5.026326447954638e-05, + "loss": 0.4241, + "step": 1241 + }, + { + "epoch": 0.10061568373298768, + "grad_norm": 0.05960644781589508, + "learning_rate": 5.03037667071689e-05, + "loss": 0.4309, + "step": 1242 + }, + { + "epoch": 0.10069669475048607, + "grad_norm": 0.08444269001483917, + "learning_rate": 5.034426893479142e-05, + "loss": 0.4588, + "step": 1243 + }, + { + "epoch": 0.10077770576798445, + "grad_norm": 0.05898994579911232, + "learning_rate": 5.038477116241393e-05, + "loss": 0.4007, + "step": 1244 + }, + { + "epoch": 0.10085871678548283, + "grad_norm": 0.0722171887755394, + "learning_rate": 5.042527339003645e-05, + "loss": 0.4289, + "step": 1245 + }, + { + "epoch": 0.1009397278029812, + "grad_norm": 0.07884074747562408, + "learning_rate": 5.046577561765897e-05, + "loss": 0.4367, + "step": 1246 + }, + { + "epoch": 0.10102073882047959, + "grad_norm": 0.07166250795125961, + "learning_rate": 5.050627784528149e-05, + "loss": 0.4291, + "step": 1247 + }, + { + "epoch": 0.10110174983797797, + "grad_norm": 0.07508683204650879, + "learning_rate": 5.054678007290401e-05, + "loss": 0.3988, + "step": 1248 + }, + { + "epoch": 0.10118276085547634, + "grad_norm": 0.08329222351312637, + "learning_rate": 5.058728230052653e-05, + "loss": 0.3927, + "step": 1249 + }, + { + "epoch": 0.10126377187297472, + "grad_norm": 0.05996193736791611, + "learning_rate": 5.0627784528149044e-05, + "loss": 0.4289, + "step": 1250 + }, + { + "epoch": 0.1013447828904731, + "grad_norm": 0.07019893079996109, + "learning_rate": 5.066828675577156e-05, + "loss": 0.4024, + "step": 1251 + }, + { + "epoch": 0.10142579390797149, + "grad_norm": 0.07923491299152374, + "learning_rate": 5.070878898339408e-05, + "loss": 0.3712, + "step": 1252 + }, + { + "epoch": 0.10150680492546986, + "grad_norm": 0.06898915022611618, + "learning_rate": 5.07492912110166e-05, + "loss": 0.3843, + "step": 1253 + }, + { + "epoch": 0.10158781594296824, + "grad_norm": 0.05755610764026642, + "learning_rate": 5.0789793438639136e-05, + "loss": 0.4135, + "step": 1254 + }, + { + "epoch": 0.10166882696046663, + "grad_norm": 0.08014683425426483, + "learning_rate": 5.083029566626165e-05, + "loss": 0.4377, + "step": 1255 + }, + { + "epoch": 0.101749837977965, + "grad_norm": 0.07435602694749832, + "learning_rate": 5.087079789388417e-05, + "loss": 0.393, + "step": 1256 + }, + { + "epoch": 0.10183084899546338, + "grad_norm": 0.07429254055023193, + "learning_rate": 5.091130012150669e-05, + "loss": 0.4, + "step": 1257 + }, + { + "epoch": 0.10191186001296176, + "grad_norm": 0.06816524267196655, + "learning_rate": 5.095180234912921e-05, + "loss": 0.4017, + "step": 1258 + }, + { + "epoch": 0.10199287103046015, + "grad_norm": 0.07542555779218674, + "learning_rate": 5.099230457675173e-05, + "loss": 0.4366, + "step": 1259 + }, + { + 
"epoch": 0.10207388204795852, + "grad_norm": 0.055631767958402634, + "learning_rate": 5.103280680437425e-05, + "loss": 0.3886, + "step": 1260 + }, + { + "epoch": 0.1021548930654569, + "grad_norm": 0.07585558295249939, + "learning_rate": 5.107330903199676e-05, + "loss": 0.381, + "step": 1261 + }, + { + "epoch": 0.10223590408295528, + "grad_norm": 0.06207391247153282, + "learning_rate": 5.111381125961928e-05, + "loss": 0.4269, + "step": 1262 + }, + { + "epoch": 0.10231691510045367, + "grad_norm": 0.07599161565303802, + "learning_rate": 5.11543134872418e-05, + "loss": 0.4273, + "step": 1263 + }, + { + "epoch": 0.10239792611795204, + "grad_norm": 0.07198527455329895, + "learning_rate": 5.119481571486432e-05, + "loss": 0.4296, + "step": 1264 + }, + { + "epoch": 0.10247893713545042, + "grad_norm": 0.06384332478046417, + "learning_rate": 5.123531794248684e-05, + "loss": 0.3764, + "step": 1265 + }, + { + "epoch": 0.1025599481529488, + "grad_norm": 0.0781475305557251, + "learning_rate": 5.127582017010936e-05, + "loss": 0.4283, + "step": 1266 + }, + { + "epoch": 0.10264095917044717, + "grad_norm": 0.05831428989768028, + "learning_rate": 5.131632239773188e-05, + "loss": 0.3982, + "step": 1267 + }, + { + "epoch": 0.10272197018794556, + "grad_norm": 0.06163075193762779, + "learning_rate": 5.135682462535439e-05, + "loss": 0.4392, + "step": 1268 + }, + { + "epoch": 0.10280298120544394, + "grad_norm": 0.07373479008674622, + "learning_rate": 5.139732685297691e-05, + "loss": 0.4261, + "step": 1269 + }, + { + "epoch": 0.10288399222294232, + "grad_norm": 0.07836463302373886, + "learning_rate": 5.143782908059943e-05, + "loss": 0.4043, + "step": 1270 + }, + { + "epoch": 0.1029650032404407, + "grad_norm": 0.05384104326367378, + "learning_rate": 5.147833130822195e-05, + "loss": 0.4128, + "step": 1271 + }, + { + "epoch": 0.10304601425793908, + "grad_norm": 0.10287515819072723, + "learning_rate": 5.151883353584447e-05, + "loss": 0.3915, + "step": 1272 + }, + { + "epoch": 0.10312702527543746, + "grad_norm": 0.06811502575874329, + "learning_rate": 5.1559335763466996e-05, + "loss": 0.4398, + "step": 1273 + }, + { + "epoch": 0.10320803629293585, + "grad_norm": 0.06112295761704445, + "learning_rate": 5.1599837991089516e-05, + "loss": 0.4399, + "step": 1274 + }, + { + "epoch": 0.10328904731043421, + "grad_norm": 0.07518140226602554, + "learning_rate": 5.1640340218712035e-05, + "loss": 0.3912, + "step": 1275 + }, + { + "epoch": 0.1033700583279326, + "grad_norm": 0.07181891798973083, + "learning_rate": 5.1680842446334555e-05, + "loss": 0.3966, + "step": 1276 + }, + { + "epoch": 0.10345106934543098, + "grad_norm": 0.07075998187065125, + "learning_rate": 5.1721344673957075e-05, + "loss": 0.4301, + "step": 1277 + }, + { + "epoch": 0.10353208036292935, + "grad_norm": 0.059220463037490845, + "learning_rate": 5.1761846901579594e-05, + "loss": 0.3807, + "step": 1278 + }, + { + "epoch": 0.10361309138042774, + "grad_norm": 0.06967325508594513, + "learning_rate": 5.180234912920211e-05, + "loss": 0.4307, + "step": 1279 + }, + { + "epoch": 0.10369410239792612, + "grad_norm": 0.06643428653478622, + "learning_rate": 5.184285135682463e-05, + "loss": 0.4111, + "step": 1280 + }, + { + "epoch": 0.1037751134154245, + "grad_norm": 0.07665630429983139, + "learning_rate": 5.188335358444715e-05, + "loss": 0.4827, + "step": 1281 + }, + { + "epoch": 0.10385612443292287, + "grad_norm": 0.06388570368289948, + "learning_rate": 5.1923855812069666e-05, + "loss": 0.3923, + "step": 1282 + }, + { + "epoch": 0.10393713545042126, + "grad_norm": 
0.07032448053359985, + "learning_rate": 5.1964358039692186e-05, + "loss": 0.4156, + "step": 1283 + }, + { + "epoch": 0.10401814646791964, + "grad_norm": 0.06190132722258568, + "learning_rate": 5.2004860267314706e-05, + "loss": 0.3803, + "step": 1284 + }, + { + "epoch": 0.10409915748541802, + "grad_norm": 0.07370159029960632, + "learning_rate": 5.2045362494937225e-05, + "loss": 0.3936, + "step": 1285 + }, + { + "epoch": 0.10418016850291639, + "grad_norm": 0.054596032947301865, + "learning_rate": 5.208586472255974e-05, + "loss": 0.3749, + "step": 1286 + }, + { + "epoch": 0.10426117952041478, + "grad_norm": 0.07030202448368073, + "learning_rate": 5.212636695018226e-05, + "loss": 0.394, + "step": 1287 + }, + { + "epoch": 0.10434219053791316, + "grad_norm": 0.060638271272182465, + "learning_rate": 5.216686917780478e-05, + "loss": 0.413, + "step": 1288 + }, + { + "epoch": 0.10442320155541153, + "grad_norm": 0.0664840042591095, + "learning_rate": 5.22073714054273e-05, + "loss": 0.3936, + "step": 1289 + }, + { + "epoch": 0.10450421257290991, + "grad_norm": 0.060060229152441025, + "learning_rate": 5.224787363304982e-05, + "loss": 0.4421, + "step": 1290 + }, + { + "epoch": 0.1045852235904083, + "grad_norm": 0.08365281671285629, + "learning_rate": 5.2288375860672337e-05, + "loss": 0.3978, + "step": 1291 + }, + { + "epoch": 0.10466623460790668, + "grad_norm": 0.07517527043819427, + "learning_rate": 5.232887808829486e-05, + "loss": 0.4183, + "step": 1292 + }, + { + "epoch": 0.10474724562540505, + "grad_norm": 0.06062887981534004, + "learning_rate": 5.236938031591738e-05, + "loss": 0.4059, + "step": 1293 + }, + { + "epoch": 0.10482825664290343, + "grad_norm": 0.07648169249296188, + "learning_rate": 5.24098825435399e-05, + "loss": 0.4105, + "step": 1294 + }, + { + "epoch": 0.10490926766040182, + "grad_norm": 0.0700891986489296, + "learning_rate": 5.245038477116242e-05, + "loss": 0.3848, + "step": 1295 + }, + { + "epoch": 0.1049902786779002, + "grad_norm": 0.05411629378795624, + "learning_rate": 5.249088699878494e-05, + "loss": 0.3797, + "step": 1296 + }, + { + "epoch": 0.10507128969539857, + "grad_norm": 0.057578980922698975, + "learning_rate": 5.2531389226407455e-05, + "loss": 0.4323, + "step": 1297 + }, + { + "epoch": 0.10515230071289695, + "grad_norm": 0.08588584512472153, + "learning_rate": 5.2571891454029974e-05, + "loss": 0.4319, + "step": 1298 + }, + { + "epoch": 0.10523331173039534, + "grad_norm": 0.08884284645318985, + "learning_rate": 5.2612393681652494e-05, + "loss": 0.439, + "step": 1299 + }, + { + "epoch": 0.10531432274789371, + "grad_norm": 0.0666431188583374, + "learning_rate": 5.2652895909275014e-05, + "loss": 0.3792, + "step": 1300 + }, + { + "epoch": 0.10539533376539209, + "grad_norm": 0.06067335978150368, + "learning_rate": 5.269339813689753e-05, + "loss": 0.3713, + "step": 1301 + }, + { + "epoch": 0.10547634478289047, + "grad_norm": 0.06495458632707596, + "learning_rate": 5.273390036452005e-05, + "loss": 0.442, + "step": 1302 + }, + { + "epoch": 0.10555735580038886, + "grad_norm": 0.0760221853852272, + "learning_rate": 5.2774402592142566e-05, + "loss": 0.4034, + "step": 1303 + }, + { + "epoch": 0.10563836681788723, + "grad_norm": 0.07905784994363785, + "learning_rate": 5.2814904819765085e-05, + "loss": 0.4143, + "step": 1304 + }, + { + "epoch": 0.10571937783538561, + "grad_norm": 0.08168292790651321, + "learning_rate": 5.2855407047387605e-05, + "loss": 0.4194, + "step": 1305 + }, + { + "epoch": 0.105800388852884, + "grad_norm": 0.06710600107908249, + "learning_rate": 
5.2895909275010125e-05, + "loss": 0.3838, + "step": 1306 + }, + { + "epoch": 0.10588139987038238, + "grad_norm": 0.06214667111635208, + "learning_rate": 5.2936411502632644e-05, + "loss": 0.4564, + "step": 1307 + }, + { + "epoch": 0.10596241088788075, + "grad_norm": 0.11926735192537308, + "learning_rate": 5.2976913730255164e-05, + "loss": 0.4303, + "step": 1308 + }, + { + "epoch": 0.10604342190537913, + "grad_norm": 0.08118419349193573, + "learning_rate": 5.3017415957877684e-05, + "loss": 0.3886, + "step": 1309 + }, + { + "epoch": 0.10612443292287752, + "grad_norm": 0.06788288801908493, + "learning_rate": 5.305791818550021e-05, + "loss": 0.3587, + "step": 1310 + }, + { + "epoch": 0.10620544394037588, + "grad_norm": 0.09914708882570267, + "learning_rate": 5.309842041312273e-05, + "loss": 0.4239, + "step": 1311 + }, + { + "epoch": 0.10628645495787427, + "grad_norm": 0.08624789863824844, + "learning_rate": 5.313892264074525e-05, + "loss": 0.4397, + "step": 1312 + }, + { + "epoch": 0.10636746597537265, + "grad_norm": 0.09602291136980057, + "learning_rate": 5.317942486836777e-05, + "loss": 0.3927, + "step": 1313 + }, + { + "epoch": 0.10644847699287104, + "grad_norm": 0.06951847672462463, + "learning_rate": 5.321992709599028e-05, + "loss": 0.4475, + "step": 1314 + }, + { + "epoch": 0.1065294880103694, + "grad_norm": 0.09299685060977936, + "learning_rate": 5.32604293236128e-05, + "loss": 0.371, + "step": 1315 + }, + { + "epoch": 0.10661049902786779, + "grad_norm": 0.0918346717953682, + "learning_rate": 5.330093155123532e-05, + "loss": 0.4075, + "step": 1316 + }, + { + "epoch": 0.10669151004536617, + "grad_norm": 0.07095564901828766, + "learning_rate": 5.334143377885784e-05, + "loss": 0.4471, + "step": 1317 + }, + { + "epoch": 0.10677252106286456, + "grad_norm": 0.08020664006471634, + "learning_rate": 5.338193600648036e-05, + "loss": 0.3802, + "step": 1318 + }, + { + "epoch": 0.10685353208036293, + "grad_norm": 0.08093412220478058, + "learning_rate": 5.342243823410288e-05, + "loss": 0.3666, + "step": 1319 + }, + { + "epoch": 0.10693454309786131, + "grad_norm": 0.09224852919578552, + "learning_rate": 5.34629404617254e-05, + "loss": 0.4185, + "step": 1320 + }, + { + "epoch": 0.10701555411535969, + "grad_norm": 0.10123410820960999, + "learning_rate": 5.350344268934791e-05, + "loss": 0.4172, + "step": 1321 + }, + { + "epoch": 0.10709656513285806, + "grad_norm": 0.08183860778808594, + "learning_rate": 5.354394491697043e-05, + "loss": 0.3952, + "step": 1322 + }, + { + "epoch": 0.10717757615035645, + "grad_norm": 0.06311725080013275, + "learning_rate": 5.358444714459295e-05, + "loss": 0.3537, + "step": 1323 + }, + { + "epoch": 0.10725858716785483, + "grad_norm": 0.06070149317383766, + "learning_rate": 5.362494937221547e-05, + "loss": 0.4073, + "step": 1324 + }, + { + "epoch": 0.10733959818535321, + "grad_norm": 0.06825286895036697, + "learning_rate": 5.366545159983799e-05, + "loss": 0.4092, + "step": 1325 + }, + { + "epoch": 0.10742060920285158, + "grad_norm": 0.08379079401493073, + "learning_rate": 5.370595382746051e-05, + "loss": 0.4024, + "step": 1326 + }, + { + "epoch": 0.10750162022034997, + "grad_norm": 0.06500423699617386, + "learning_rate": 5.3746456055083024e-05, + "loss": 0.3803, + "step": 1327 + }, + { + "epoch": 0.10758263123784835, + "grad_norm": 0.07769917696714401, + "learning_rate": 5.3786958282705544e-05, + "loss": 0.3683, + "step": 1328 + }, + { + "epoch": 0.10766364225534673, + "grad_norm": 0.13604873418807983, + "learning_rate": 5.382746051032808e-05, + "loss": 0.3725, + "step": 1329 + 
}, + { + "epoch": 0.1077446532728451, + "grad_norm": 0.08064484596252441, + "learning_rate": 5.38679627379506e-05, + "loss": 0.4188, + "step": 1330 + }, + { + "epoch": 0.10782566429034349, + "grad_norm": 0.07017720490694046, + "learning_rate": 5.3908464965573117e-05, + "loss": 0.3579, + "step": 1331 + }, + { + "epoch": 0.10790667530784187, + "grad_norm": 0.07512283325195312, + "learning_rate": 5.394896719319563e-05, + "loss": 0.4195, + "step": 1332 + }, + { + "epoch": 0.10798768632534024, + "grad_norm": 0.07279182970523834, + "learning_rate": 5.398946942081815e-05, + "loss": 0.4237, + "step": 1333 + }, + { + "epoch": 0.10806869734283862, + "grad_norm": 0.07479194551706314, + "learning_rate": 5.402997164844067e-05, + "loss": 0.4228, + "step": 1334 + }, + { + "epoch": 0.10814970836033701, + "grad_norm": 0.06885584443807602, + "learning_rate": 5.407047387606319e-05, + "loss": 0.3907, + "step": 1335 + }, + { + "epoch": 0.10823071937783539, + "grad_norm": 0.08084303885698318, + "learning_rate": 5.411097610368571e-05, + "loss": 0.3847, + "step": 1336 + }, + { + "epoch": 0.10831173039533376, + "grad_norm": 0.0600086972117424, + "learning_rate": 5.415147833130823e-05, + "loss": 0.4101, + "step": 1337 + }, + { + "epoch": 0.10839274141283214, + "grad_norm": 0.07234185189008713, + "learning_rate": 5.419198055893074e-05, + "loss": 0.4056, + "step": 1338 + }, + { + "epoch": 0.10847375243033053, + "grad_norm": 0.07753019034862518, + "learning_rate": 5.423248278655326e-05, + "loss": 0.4525, + "step": 1339 + }, + { + "epoch": 0.10855476344782891, + "grad_norm": 0.08446196466684341, + "learning_rate": 5.427298501417578e-05, + "loss": 0.3609, + "step": 1340 + }, + { + "epoch": 0.10863577446532728, + "grad_norm": 0.05372902750968933, + "learning_rate": 5.43134872417983e-05, + "loss": 0.3903, + "step": 1341 + }, + { + "epoch": 0.10871678548282566, + "grad_norm": 0.06935431808233261, + "learning_rate": 5.435398946942082e-05, + "loss": 0.445, + "step": 1342 + }, + { + "epoch": 0.10879779650032405, + "grad_norm": 0.07019354403018951, + "learning_rate": 5.439449169704334e-05, + "loss": 0.4459, + "step": 1343 + }, + { + "epoch": 0.10887880751782242, + "grad_norm": 0.07519102841615677, + "learning_rate": 5.443499392466586e-05, + "loss": 0.4252, + "step": 1344 + }, + { + "epoch": 0.1089598185353208, + "grad_norm": 0.07183311134576797, + "learning_rate": 5.447549615228837e-05, + "loss": 0.4769, + "step": 1345 + }, + { + "epoch": 0.10904082955281919, + "grad_norm": 0.0855363979935646, + "learning_rate": 5.451599837991089e-05, + "loss": 0.4073, + "step": 1346 + }, + { + "epoch": 0.10912184057031757, + "grad_norm": 0.06138298660516739, + "learning_rate": 5.455650060753341e-05, + "loss": 0.4301, + "step": 1347 + }, + { + "epoch": 0.10920285158781594, + "grad_norm": 0.09503749012947083, + "learning_rate": 5.4597002835155944e-05, + "loss": 0.4495, + "step": 1348 + }, + { + "epoch": 0.10928386260531432, + "grad_norm": 0.07598304003477097, + "learning_rate": 5.463750506277846e-05, + "loss": 0.3855, + "step": 1349 + }, + { + "epoch": 0.1093648736228127, + "grad_norm": 0.05978058651089668, + "learning_rate": 5.467800729040098e-05, + "loss": 0.3343, + "step": 1350 + }, + { + "epoch": 0.10944588464031109, + "grad_norm": 0.0791708379983902, + "learning_rate": 5.4718509518023496e-05, + "loss": 0.3953, + "step": 1351 + }, + { + "epoch": 0.10952689565780946, + "grad_norm": 0.07386378198862076, + "learning_rate": 5.4759011745646016e-05, + "loss": 0.3758, + "step": 1352 + }, + { + "epoch": 0.10960790667530784, + "grad_norm": 
0.0669383704662323, + "learning_rate": 5.4799513973268536e-05, + "loss": 0.4606, + "step": 1353 + }, + { + "epoch": 0.10968891769280623, + "grad_norm": 0.05895500257611275, + "learning_rate": 5.4840016200891055e-05, + "loss": 0.4016, + "step": 1354 + }, + { + "epoch": 0.1097699287103046, + "grad_norm": 0.0767853856086731, + "learning_rate": 5.4880518428513575e-05, + "loss": 0.3933, + "step": 1355 + }, + { + "epoch": 0.10985093972780298, + "grad_norm": 0.055138807743787766, + "learning_rate": 5.492102065613609e-05, + "loss": 0.4144, + "step": 1356 + }, + { + "epoch": 0.10993195074530136, + "grad_norm": 0.06357793509960175, + "learning_rate": 5.496152288375861e-05, + "loss": 0.3813, + "step": 1357 + }, + { + "epoch": 0.11001296176279975, + "grad_norm": 0.062212321907281876, + "learning_rate": 5.500202511138113e-05, + "loss": 0.3552, + "step": 1358 + }, + { + "epoch": 0.11009397278029812, + "grad_norm": 0.07156316190958023, + "learning_rate": 5.504252733900365e-05, + "loss": 0.3622, + "step": 1359 + }, + { + "epoch": 0.1101749837977965, + "grad_norm": 0.06475205719470978, + "learning_rate": 5.5083029566626167e-05, + "loss": 0.3744, + "step": 1360 + }, + { + "epoch": 0.11025599481529488, + "grad_norm": 0.07238809019327164, + "learning_rate": 5.5123531794248686e-05, + "loss": 0.4212, + "step": 1361 + }, + { + "epoch": 0.11033700583279327, + "grad_norm": 0.0972784087061882, + "learning_rate": 5.5164034021871206e-05, + "loss": 0.3891, + "step": 1362 + }, + { + "epoch": 0.11041801685029164, + "grad_norm": 0.07933227717876434, + "learning_rate": 5.520453624949372e-05, + "loss": 0.4464, + "step": 1363 + }, + { + "epoch": 0.11049902786779002, + "grad_norm": 0.06054377183318138, + "learning_rate": 5.524503847711624e-05, + "loss": 0.3636, + "step": 1364 + }, + { + "epoch": 0.1105800388852884, + "grad_norm": 0.051436055451631546, + "learning_rate": 5.528554070473876e-05, + "loss": 0.3919, + "step": 1365 + }, + { + "epoch": 0.11066104990278677, + "grad_norm": 0.07529158145189285, + "learning_rate": 5.532604293236128e-05, + "loss": 0.3801, + "step": 1366 + }, + { + "epoch": 0.11074206092028516, + "grad_norm": 0.08469279110431671, + "learning_rate": 5.5366545159983804e-05, + "loss": 0.451, + "step": 1367 + }, + { + "epoch": 0.11082307193778354, + "grad_norm": 0.073479562997818, + "learning_rate": 5.5407047387606324e-05, + "loss": 0.3786, + "step": 1368 + }, + { + "epoch": 0.11090408295528192, + "grad_norm": 0.060560815036296844, + "learning_rate": 5.5447549615228844e-05, + "loss": 0.4208, + "step": 1369 + }, + { + "epoch": 0.1109850939727803, + "grad_norm": 0.10111406445503235, + "learning_rate": 5.548805184285136e-05, + "loss": 0.3602, + "step": 1370 + }, + { + "epoch": 0.11106610499027868, + "grad_norm": 0.05426943302154541, + "learning_rate": 5.552855407047388e-05, + "loss": 0.3702, + "step": 1371 + }, + { + "epoch": 0.11114711600777706, + "grad_norm": 0.04940052330493927, + "learning_rate": 5.55690562980964e-05, + "loss": 0.3639, + "step": 1372 + }, + { + "epoch": 0.11122812702527543, + "grad_norm": 0.07840988785028458, + "learning_rate": 5.5609558525718916e-05, + "loss": 0.4495, + "step": 1373 + }, + { + "epoch": 0.11130913804277381, + "grad_norm": 0.06023367494344711, + "learning_rate": 5.5650060753341435e-05, + "loss": 0.4355, + "step": 1374 + }, + { + "epoch": 0.1113901490602722, + "grad_norm": 0.06036898493766785, + "learning_rate": 5.5690562980963955e-05, + "loss": 0.4203, + "step": 1375 + }, + { + "epoch": 0.11147116007777058, + "grad_norm": 0.05843716487288475, + "learning_rate": 
5.5731065208586475e-05, + "loss": 0.3917, + "step": 1376 + }, + { + "epoch": 0.11155217109526895, + "grad_norm": 0.06300392746925354, + "learning_rate": 5.5771567436208994e-05, + "loss": 0.4065, + "step": 1377 + }, + { + "epoch": 0.11163318211276733, + "grad_norm": 0.06990660727024078, + "learning_rate": 5.5812069663831514e-05, + "loss": 0.4314, + "step": 1378 + }, + { + "epoch": 0.11171419313026572, + "grad_norm": 0.060971733182668686, + "learning_rate": 5.5852571891454034e-05, + "loss": 0.342, + "step": 1379 + }, + { + "epoch": 0.1117952041477641, + "grad_norm": 0.05048093572258949, + "learning_rate": 5.5893074119076546e-05, + "loss": 0.3855, + "step": 1380 + }, + { + "epoch": 0.11187621516526247, + "grad_norm": 0.06826608628034592, + "learning_rate": 5.5933576346699066e-05, + "loss": 0.3843, + "step": 1381 + }, + { + "epoch": 0.11195722618276086, + "grad_norm": 0.06443291902542114, + "learning_rate": 5.5974078574321586e-05, + "loss": 0.3698, + "step": 1382 + }, + { + "epoch": 0.11203823720025924, + "grad_norm": 0.06290264427661896, + "learning_rate": 5.6014580801944105e-05, + "loss": 0.4046, + "step": 1383 + }, + { + "epoch": 0.11211924821775761, + "grad_norm": 0.06185779720544815, + "learning_rate": 5.6055083029566625e-05, + "loss": 0.4099, + "step": 1384 + }, + { + "epoch": 0.11220025923525599, + "grad_norm": 0.07247399538755417, + "learning_rate": 5.6095585257189145e-05, + "loss": 0.4091, + "step": 1385 + }, + { + "epoch": 0.11228127025275438, + "grad_norm": 0.05858607590198517, + "learning_rate": 5.613608748481167e-05, + "loss": 0.42, + "step": 1386 + }, + { + "epoch": 0.11236228127025276, + "grad_norm": 0.05494888499379158, + "learning_rate": 5.617658971243419e-05, + "loss": 0.3484, + "step": 1387 + }, + { + "epoch": 0.11244329228775113, + "grad_norm": 0.06474845111370087, + "learning_rate": 5.621709194005671e-05, + "loss": 0.3846, + "step": 1388 + }, + { + "epoch": 0.11252430330524951, + "grad_norm": 0.07955454289913177, + "learning_rate": 5.625759416767923e-05, + "loss": 0.429, + "step": 1389 + }, + { + "epoch": 0.1126053143227479, + "grad_norm": 0.07667361944913864, + "learning_rate": 5.629809639530175e-05, + "loss": 0.4158, + "step": 1390 + }, + { + "epoch": 0.11268632534024628, + "grad_norm": 0.06198974698781967, + "learning_rate": 5.633859862292426e-05, + "loss": 0.4174, + "step": 1391 + }, + { + "epoch": 0.11276733635774465, + "grad_norm": 0.07477148622274399, + "learning_rate": 5.637910085054678e-05, + "loss": 0.4521, + "step": 1392 + }, + { + "epoch": 0.11284834737524303, + "grad_norm": 0.06670566648244858, + "learning_rate": 5.64196030781693e-05, + "loss": 0.3652, + "step": 1393 + }, + { + "epoch": 0.11292935839274142, + "grad_norm": 0.09626266360282898, + "learning_rate": 5.646010530579182e-05, + "loss": 0.4482, + "step": 1394 + }, + { + "epoch": 0.11301036941023979, + "grad_norm": 0.06237015873193741, + "learning_rate": 5.650060753341434e-05, + "loss": 0.4102, + "step": 1395 + }, + { + "epoch": 0.11309138042773817, + "grad_norm": 0.054151054471731186, + "learning_rate": 5.654110976103686e-05, + "loss": 0.4147, + "step": 1396 + }, + { + "epoch": 0.11317239144523655, + "grad_norm": 0.07566509395837784, + "learning_rate": 5.658161198865938e-05, + "loss": 0.4318, + "step": 1397 + }, + { + "epoch": 0.11325340246273494, + "grad_norm": 0.05651364475488663, + "learning_rate": 5.6622114216281894e-05, + "loss": 0.3687, + "step": 1398 + }, + { + "epoch": 0.1133344134802333, + "grad_norm": 0.06089472398161888, + "learning_rate": 5.666261644390441e-05, + "loss": 0.3987, + "step": 
1399 + }, + { + "epoch": 0.11341542449773169, + "grad_norm": 0.08430080115795135, + "learning_rate": 5.670311867152693e-05, + "loss": 0.4169, + "step": 1400 + }, + { + "epoch": 0.11349643551523007, + "grad_norm": 0.05499950423836708, + "learning_rate": 5.674362089914945e-05, + "loss": 0.4104, + "step": 1401 + }, + { + "epoch": 0.11357744653272846, + "grad_norm": 0.07133946567773819, + "learning_rate": 5.678412312677197e-05, + "loss": 0.3948, + "step": 1402 + }, + { + "epoch": 0.11365845755022683, + "grad_norm": 0.07891767472028732, + "learning_rate": 5.682462535439449e-05, + "loss": 0.3846, + "step": 1403 + }, + { + "epoch": 0.11373946856772521, + "grad_norm": 0.09289713948965073, + "learning_rate": 5.6865127582017005e-05, + "loss": 0.3524, + "step": 1404 + }, + { + "epoch": 0.1138204795852236, + "grad_norm": 0.0759500190615654, + "learning_rate": 5.690562980963954e-05, + "loss": 0.3903, + "step": 1405 + }, + { + "epoch": 0.11390149060272196, + "grad_norm": 0.05667630583047867, + "learning_rate": 5.694613203726206e-05, + "loss": 0.3905, + "step": 1406 + }, + { + "epoch": 0.11398250162022035, + "grad_norm": 0.07564699649810791, + "learning_rate": 5.698663426488458e-05, + "loss": 0.4242, + "step": 1407 + }, + { + "epoch": 0.11406351263771873, + "grad_norm": 0.05298449099063873, + "learning_rate": 5.70271364925071e-05, + "loss": 0.3956, + "step": 1408 + }, + { + "epoch": 0.11414452365521711, + "grad_norm": 0.08333901315927505, + "learning_rate": 5.706763872012961e-05, + "loss": 0.3865, + "step": 1409 + }, + { + "epoch": 0.11422553467271548, + "grad_norm": 0.05635182932019234, + "learning_rate": 5.710814094775213e-05, + "loss": 0.407, + "step": 1410 + }, + { + "epoch": 0.11430654569021387, + "grad_norm": 0.07674378901720047, + "learning_rate": 5.714864317537465e-05, + "loss": 0.4479, + "step": 1411 + }, + { + "epoch": 0.11438755670771225, + "grad_norm": 0.07845453172922134, + "learning_rate": 5.718914540299717e-05, + "loss": 0.3823, + "step": 1412 + }, + { + "epoch": 0.11446856772521063, + "grad_norm": 0.08581458032131195, + "learning_rate": 5.722964763061969e-05, + "loss": 0.3887, + "step": 1413 + }, + { + "epoch": 0.114549578742709, + "grad_norm": 0.08316148072481155, + "learning_rate": 5.727014985824221e-05, + "loss": 0.3622, + "step": 1414 + }, + { + "epoch": 0.11463058976020739, + "grad_norm": 0.11354885250329971, + "learning_rate": 5.731065208586472e-05, + "loss": 0.3941, + "step": 1415 + }, + { + "epoch": 0.11471160077770577, + "grad_norm": 0.07128278911113739, + "learning_rate": 5.735115431348724e-05, + "loss": 0.4047, + "step": 1416 + }, + { + "epoch": 0.11479261179520414, + "grad_norm": 0.07349122315645218, + "learning_rate": 5.739165654110976e-05, + "loss": 0.4173, + "step": 1417 + }, + { + "epoch": 0.11487362281270252, + "grad_norm": 0.06401881575584412, + "learning_rate": 5.743215876873228e-05, + "loss": 0.416, + "step": 1418 + }, + { + "epoch": 0.11495463383020091, + "grad_norm": 0.06177844852209091, + "learning_rate": 5.74726609963548e-05, + "loss": 0.4033, + "step": 1419 + }, + { + "epoch": 0.11503564484769929, + "grad_norm": 0.07423469424247742, + "learning_rate": 5.751316322397732e-05, + "loss": 0.3603, + "step": 1420 + }, + { + "epoch": 0.11511665586519766, + "grad_norm": 0.06056517735123634, + "learning_rate": 5.755366545159984e-05, + "loss": 0.3945, + "step": 1421 + }, + { + "epoch": 0.11519766688269605, + "grad_norm": 0.06560319662094116, + "learning_rate": 5.759416767922235e-05, + "loss": 0.4216, + "step": 1422 + }, + { + "epoch": 0.11527867790019443, + "grad_norm": 
0.05511828511953354, + "learning_rate": 5.7634669906844885e-05, + "loss": 0.3935, + "step": 1423 + }, + { + "epoch": 0.11535968891769281, + "grad_norm": 0.0775819942355156, + "learning_rate": 5.7675172134467405e-05, + "loss": 0.4025, + "step": 1424 + }, + { + "epoch": 0.11544069993519118, + "grad_norm": 0.05826788395643234, + "learning_rate": 5.7715674362089925e-05, + "loss": 0.3966, + "step": 1425 + }, + { + "epoch": 0.11552171095268957, + "grad_norm": 0.059229105710983276, + "learning_rate": 5.775617658971244e-05, + "loss": 0.3551, + "step": 1426 + }, + { + "epoch": 0.11560272197018795, + "grad_norm": 0.054622627794742584, + "learning_rate": 5.779667881733496e-05, + "loss": 0.3783, + "step": 1427 + }, + { + "epoch": 0.11568373298768632, + "grad_norm": 0.07204500585794449, + "learning_rate": 5.783718104495748e-05, + "loss": 0.3871, + "step": 1428 + }, + { + "epoch": 0.1157647440051847, + "grad_norm": 0.062203872948884964, + "learning_rate": 5.787768327258e-05, + "loss": 0.3686, + "step": 1429 + }, + { + "epoch": 0.11584575502268309, + "grad_norm": 0.07069511711597443, + "learning_rate": 5.7918185500202516e-05, + "loss": 0.4034, + "step": 1430 + }, + { + "epoch": 0.11592676604018147, + "grad_norm": 0.053734250366687775, + "learning_rate": 5.7958687727825036e-05, + "loss": 0.3774, + "step": 1431 + }, + { + "epoch": 0.11600777705767984, + "grad_norm": 0.06350980699062347, + "learning_rate": 5.7999189955447556e-05, + "loss": 0.357, + "step": 1432 + }, + { + "epoch": 0.11608878807517822, + "grad_norm": 0.06450559943914413, + "learning_rate": 5.803969218307007e-05, + "loss": 0.373, + "step": 1433 + }, + { + "epoch": 0.1161697990926766, + "grad_norm": 0.05774838477373123, + "learning_rate": 5.808019441069259e-05, + "loss": 0.3711, + "step": 1434 + }, + { + "epoch": 0.11625081011017499, + "grad_norm": 0.05953490734100342, + "learning_rate": 5.812069663831511e-05, + "loss": 0.375, + "step": 1435 + }, + { + "epoch": 0.11633182112767336, + "grad_norm": 0.06642217189073563, + "learning_rate": 5.816119886593763e-05, + "loss": 0.4207, + "step": 1436 + }, + { + "epoch": 0.11641283214517174, + "grad_norm": 0.06046362221240997, + "learning_rate": 5.820170109356015e-05, + "loss": 0.3919, + "step": 1437 + }, + { + "epoch": 0.11649384316267013, + "grad_norm": 0.07783038169145584, + "learning_rate": 5.824220332118267e-05, + "loss": 0.3518, + "step": 1438 + }, + { + "epoch": 0.1165748541801685, + "grad_norm": 0.0818445235490799, + "learning_rate": 5.8282705548805187e-05, + "loss": 0.4278, + "step": 1439 + }, + { + "epoch": 0.11665586519766688, + "grad_norm": 0.08483424782752991, + "learning_rate": 5.83232077764277e-05, + "loss": 0.4215, + "step": 1440 + }, + { + "epoch": 0.11673687621516526, + "grad_norm": 0.06911425292491913, + "learning_rate": 5.836371000405022e-05, + "loss": 0.4033, + "step": 1441 + }, + { + "epoch": 0.11681788723266365, + "grad_norm": 0.0859529972076416, + "learning_rate": 5.840421223167275e-05, + "loss": 0.4568, + "step": 1442 + }, + { + "epoch": 0.11689889825016202, + "grad_norm": 0.07996172457933426, + "learning_rate": 5.844471445929527e-05, + "loss": 0.4261, + "step": 1443 + }, + { + "epoch": 0.1169799092676604, + "grad_norm": 0.07805225998163223, + "learning_rate": 5.8485216686917785e-05, + "loss": 0.3527, + "step": 1444 + }, + { + "epoch": 0.11706092028515878, + "grad_norm": 0.053170718252658844, + "learning_rate": 5.8525718914540305e-05, + "loss": 0.4232, + "step": 1445 + }, + { + "epoch": 0.11714193130265717, + "grad_norm": 0.056831154972314835, + "learning_rate": 
5.8566221142162824e-05, + "loss": 0.3995, + "step": 1446 + }, + { + "epoch": 0.11722294232015554, + "grad_norm": 0.07470972090959549, + "learning_rate": 5.8606723369785344e-05, + "loss": 0.4433, + "step": 1447 + }, + { + "epoch": 0.11730395333765392, + "grad_norm": 0.08808553218841553, + "learning_rate": 5.8647225597407864e-05, + "loss": 0.4363, + "step": 1448 + }, + { + "epoch": 0.1173849643551523, + "grad_norm": 0.059553466737270355, + "learning_rate": 5.868772782503038e-05, + "loss": 0.3869, + "step": 1449 + }, + { + "epoch": 0.11746597537265067, + "grad_norm": 0.06513705104589462, + "learning_rate": 5.8728230052652896e-05, + "loss": 0.3305, + "step": 1450 + }, + { + "epoch": 0.11754698639014906, + "grad_norm": 0.07553792744874954, + "learning_rate": 5.8768732280275416e-05, + "loss": 0.4072, + "step": 1451 + }, + { + "epoch": 0.11762799740764744, + "grad_norm": 0.07356736063957214, + "learning_rate": 5.8809234507897935e-05, + "loss": 0.422, + "step": 1452 + }, + { + "epoch": 0.11770900842514583, + "grad_norm": 0.08110546320676804, + "learning_rate": 5.8849736735520455e-05, + "loss": 0.4178, + "step": 1453 + }, + { + "epoch": 0.1177900194426442, + "grad_norm": 0.07067373394966125, + "learning_rate": 5.8890238963142975e-05, + "loss": 0.3834, + "step": 1454 + }, + { + "epoch": 0.11787103046014258, + "grad_norm": 0.06569874286651611, + "learning_rate": 5.8930741190765494e-05, + "loss": 0.4695, + "step": 1455 + }, + { + "epoch": 0.11795204147764096, + "grad_norm": 0.06837479025125504, + "learning_rate": 5.8971243418388014e-05, + "loss": 0.4075, + "step": 1456 + }, + { + "epoch": 0.11803305249513935, + "grad_norm": 0.06939905881881714, + "learning_rate": 5.901174564601053e-05, + "loss": 0.4364, + "step": 1457 + }, + { + "epoch": 0.11811406351263772, + "grad_norm": 0.05386871099472046, + "learning_rate": 5.905224787363305e-05, + "loss": 0.3447, + "step": 1458 + }, + { + "epoch": 0.1181950745301361, + "grad_norm": 0.06182454526424408, + "learning_rate": 5.9092750101255566e-05, + "loss": 0.4063, + "step": 1459 + }, + { + "epoch": 0.11827608554763448, + "grad_norm": 0.08313605934381485, + "learning_rate": 5.9133252328878086e-05, + "loss": 0.4423, + "step": 1460 + }, + { + "epoch": 0.11835709656513285, + "grad_norm": 0.058940161019563675, + "learning_rate": 5.917375455650061e-05, + "loss": 0.3887, + "step": 1461 + }, + { + "epoch": 0.11843810758263124, + "grad_norm": 0.06423253566026688, + "learning_rate": 5.921425678412313e-05, + "loss": 0.4242, + "step": 1462 + }, + { + "epoch": 0.11851911860012962, + "grad_norm": 0.0520365871489048, + "learning_rate": 5.925475901174565e-05, + "loss": 0.3587, + "step": 1463 + }, + { + "epoch": 0.118600129617628, + "grad_norm": 0.06681492924690247, + "learning_rate": 5.929526123936817e-05, + "loss": 0.378, + "step": 1464 + }, + { + "epoch": 0.11868114063512637, + "grad_norm": 0.07512427121400833, + "learning_rate": 5.933576346699069e-05, + "loss": 0.4253, + "step": 1465 + }, + { + "epoch": 0.11876215165262476, + "grad_norm": 0.06821638345718384, + "learning_rate": 5.937626569461321e-05, + "loss": 0.3874, + "step": 1466 + }, + { + "epoch": 0.11884316267012314, + "grad_norm": 0.09401658922433853, + "learning_rate": 5.941676792223573e-05, + "loss": 0.4082, + "step": 1467 + }, + { + "epoch": 0.11892417368762152, + "grad_norm": 0.08017772436141968, + "learning_rate": 5.945727014985824e-05, + "loss": 0.4195, + "step": 1468 + }, + { + "epoch": 0.11900518470511989, + "grad_norm": 0.0690443366765976, + "learning_rate": 5.949777237748076e-05, + "loss": 0.4214, + "step": 
1469 + }, + { + "epoch": 0.11908619572261828, + "grad_norm": 0.07056768983602524, + "learning_rate": 5.953827460510328e-05, + "loss": 0.4009, + "step": 1470 + }, + { + "epoch": 0.11916720674011666, + "grad_norm": 0.07003843039274216, + "learning_rate": 5.95787768327258e-05, + "loss": 0.4205, + "step": 1471 + }, + { + "epoch": 0.11924821775761503, + "grad_norm": 0.1017618402838707, + "learning_rate": 5.961927906034832e-05, + "loss": 0.4004, + "step": 1472 + }, + { + "epoch": 0.11932922877511341, + "grad_norm": 0.07712042331695557, + "learning_rate": 5.965978128797084e-05, + "loss": 0.3657, + "step": 1473 + }, + { + "epoch": 0.1194102397926118, + "grad_norm": 0.06559691578149796, + "learning_rate": 5.970028351559336e-05, + "loss": 0.4154, + "step": 1474 + }, + { + "epoch": 0.11949125081011018, + "grad_norm": 0.06711196154356003, + "learning_rate": 5.9740785743215874e-05, + "loss": 0.4071, + "step": 1475 + }, + { + "epoch": 0.11957226182760855, + "grad_norm": 0.06897864490747452, + "learning_rate": 5.9781287970838394e-05, + "loss": 0.4112, + "step": 1476 + }, + { + "epoch": 0.11965327284510693, + "grad_norm": 0.053040411323308945, + "learning_rate": 5.9821790198460914e-05, + "loss": 0.363, + "step": 1477 + }, + { + "epoch": 0.11973428386260532, + "grad_norm": 0.06178470328450203, + "learning_rate": 5.986229242608343e-05, + "loss": 0.3897, + "step": 1478 + }, + { + "epoch": 0.1198152948801037, + "grad_norm": 0.0642366036772728, + "learning_rate": 5.990279465370595e-05, + "loss": 0.3915, + "step": 1479 + }, + { + "epoch": 0.11989630589760207, + "grad_norm": 0.061436790972948074, + "learning_rate": 5.994329688132848e-05, + "loss": 0.3906, + "step": 1480 + }, + { + "epoch": 0.11997731691510045, + "grad_norm": 0.064246267080307, + "learning_rate": 5.9983799108951e-05, + "loss": 0.3562, + "step": 1481 + }, + { + "epoch": 0.12005832793259884, + "grad_norm": 0.07403618097305298, + "learning_rate": 6.002430133657352e-05, + "loss": 0.4127, + "step": 1482 + }, + { + "epoch": 0.12013933895009721, + "grad_norm": 0.06679215282201767, + "learning_rate": 6.006480356419604e-05, + "loss": 0.4089, + "step": 1483 + }, + { + "epoch": 0.12022034996759559, + "grad_norm": 0.08910468965768814, + "learning_rate": 6.010530579181856e-05, + "loss": 0.4234, + "step": 1484 + }, + { + "epoch": 0.12030136098509397, + "grad_norm": 0.07021404802799225, + "learning_rate": 6.014580801944108e-05, + "loss": 0.3685, + "step": 1485 + }, + { + "epoch": 0.12038237200259236, + "grad_norm": 0.07262944430112839, + "learning_rate": 6.018631024706359e-05, + "loss": 0.4486, + "step": 1486 + }, + { + "epoch": 0.12046338302009073, + "grad_norm": 0.060715995728969574, + "learning_rate": 6.022681247468611e-05, + "loss": 0.4298, + "step": 1487 + }, + { + "epoch": 0.12054439403758911, + "grad_norm": 0.07092267274856567, + "learning_rate": 6.026731470230863e-05, + "loss": 0.3316, + "step": 1488 + }, + { + "epoch": 0.1206254050550875, + "grad_norm": 0.055634964257478714, + "learning_rate": 6.030781692993115e-05, + "loss": 0.3747, + "step": 1489 + }, + { + "epoch": 0.12070641607258586, + "grad_norm": 0.06697001308202744, + "learning_rate": 6.034831915755367e-05, + "loss": 0.3923, + "step": 1490 + }, + { + "epoch": 0.12078742709008425, + "grad_norm": 0.068814717233181, + "learning_rate": 6.038882138517619e-05, + "loss": 0.3987, + "step": 1491 + }, + { + "epoch": 0.12086843810758263, + "grad_norm": 0.06390635669231415, + "learning_rate": 6.04293236127987e-05, + "loss": 0.3724, + "step": 1492 + }, + { + "epoch": 0.12094944912508102, + "grad_norm": 
0.06354351341724396, + "learning_rate": 6.046982584042122e-05, + "loss": 0.3693, + "step": 1493 + }, + { + "epoch": 0.12103046014257939, + "grad_norm": 0.06260344386100769, + "learning_rate": 6.051032806804374e-05, + "loss": 0.406, + "step": 1494 + }, + { + "epoch": 0.12111147116007777, + "grad_norm": 0.06350363790988922, + "learning_rate": 6.055083029566626e-05, + "loss": 0.3977, + "step": 1495 + }, + { + "epoch": 0.12119248217757615, + "grad_norm": 0.06784169375896454, + "learning_rate": 6.059133252328878e-05, + "loss": 0.4385, + "step": 1496 + }, + { + "epoch": 0.12127349319507454, + "grad_norm": 0.06894693523645401, + "learning_rate": 6.06318347509113e-05, + "loss": 0.4927, + "step": 1497 + }, + { + "epoch": 0.1213545042125729, + "grad_norm": 0.059703197330236435, + "learning_rate": 6.067233697853382e-05, + "loss": 0.3762, + "step": 1498 + }, + { + "epoch": 0.12143551523007129, + "grad_norm": 0.0626472756266594, + "learning_rate": 6.0712839206156346e-05, + "loss": 0.4345, + "step": 1499 + }, + { + "epoch": 0.12151652624756967, + "grad_norm": 0.073844313621521, + "learning_rate": 6.0753341433778866e-05, + "loss": 0.418, + "step": 1500 + }, + { + "epoch": 0.12159753726506804, + "grad_norm": 0.07876653969287872, + "learning_rate": 6.0793843661401386e-05, + "loss": 0.3884, + "step": 1501 + }, + { + "epoch": 0.12167854828256643, + "grad_norm": 0.06772121787071228, + "learning_rate": 6.0834345889023905e-05, + "loss": 0.4145, + "step": 1502 + }, + { + "epoch": 0.12175955930006481, + "grad_norm": 0.0799356997013092, + "learning_rate": 6.087484811664642e-05, + "loss": 0.4178, + "step": 1503 + }, + { + "epoch": 0.1218405703175632, + "grad_norm": 0.06637130677700043, + "learning_rate": 6.091535034426894e-05, + "loss": 0.4, + "step": 1504 + }, + { + "epoch": 0.12192158133506156, + "grad_norm": 0.08216597139835358, + "learning_rate": 6.095585257189146e-05, + "loss": 0.4349, + "step": 1505 + }, + { + "epoch": 0.12200259235255995, + "grad_norm": 0.07377637922763824, + "learning_rate": 6.099635479951398e-05, + "loss": 0.4112, + "step": 1506 + }, + { + "epoch": 0.12208360337005833, + "grad_norm": 0.06952842324972153, + "learning_rate": 6.103685702713649e-05, + "loss": 0.3819, + "step": 1507 + }, + { + "epoch": 0.12216461438755671, + "grad_norm": 0.07549799978733063, + "learning_rate": 6.107735925475901e-05, + "loss": 0.4019, + "step": 1508 + }, + { + "epoch": 0.12224562540505508, + "grad_norm": 0.062323398888111115, + "learning_rate": 6.111786148238153e-05, + "loss": 0.382, + "step": 1509 + }, + { + "epoch": 0.12232663642255347, + "grad_norm": 0.07245488464832306, + "learning_rate": 6.115836371000405e-05, + "loss": 0.3724, + "step": 1510 + }, + { + "epoch": 0.12240764744005185, + "grad_norm": 0.07224228233098984, + "learning_rate": 6.119886593762657e-05, + "loss": 0.3675, + "step": 1511 + }, + { + "epoch": 0.12248865845755022, + "grad_norm": 0.06970011442899704, + "learning_rate": 6.123936816524909e-05, + "loss": 0.3884, + "step": 1512 + }, + { + "epoch": 0.1225696694750486, + "grad_norm": 0.06573334336280823, + "learning_rate": 6.127987039287161e-05, + "loss": 0.3984, + "step": 1513 + }, + { + "epoch": 0.12265068049254699, + "grad_norm": 0.07179760187864304, + "learning_rate": 6.132037262049413e-05, + "loss": 0.4381, + "step": 1514 + }, + { + "epoch": 0.12273169151004537, + "grad_norm": 0.06900203227996826, + "learning_rate": 6.136087484811665e-05, + "loss": 0.4239, + "step": 1515 + }, + { + "epoch": 0.12281270252754374, + "grad_norm": 0.08503764122724533, + "learning_rate": 6.140137707573917e-05, + 
"loss": 0.4344, + "step": 1516 + }, + { + "epoch": 0.12289371354504212, + "grad_norm": 0.06342464685440063, + "learning_rate": 6.144187930336169e-05, + "loss": 0.4055, + "step": 1517 + }, + { + "epoch": 0.12297472456254051, + "grad_norm": 0.07056683301925659, + "learning_rate": 6.14823815309842e-05, + "loss": 0.4432, + "step": 1518 + }, + { + "epoch": 0.12305573558003889, + "grad_norm": 0.05167962238192558, + "learning_rate": 6.152288375860673e-05, + "loss": 0.3766, + "step": 1519 + }, + { + "epoch": 0.12313674659753726, + "grad_norm": 0.06147640198469162, + "learning_rate": 6.156338598622925e-05, + "loss": 0.37, + "step": 1520 + }, + { + "epoch": 0.12321775761503564, + "grad_norm": 0.0592205636203289, + "learning_rate": 6.160388821385177e-05, + "loss": 0.3506, + "step": 1521 + }, + { + "epoch": 0.12329876863253403, + "grad_norm": 0.05324317514896393, + "learning_rate": 6.164439044147429e-05, + "loss": 0.3264, + "step": 1522 + }, + { + "epoch": 0.1233797796500324, + "grad_norm": 0.07853440195322037, + "learning_rate": 6.16848926690968e-05, + "loss": 0.3834, + "step": 1523 + }, + { + "epoch": 0.12346079066753078, + "grad_norm": 0.06663885712623596, + "learning_rate": 6.172539489671932e-05, + "loss": 0.4007, + "step": 1524 + }, + { + "epoch": 0.12354180168502917, + "grad_norm": 0.0634799674153328, + "learning_rate": 6.176589712434184e-05, + "loss": 0.4035, + "step": 1525 + }, + { + "epoch": 0.12362281270252755, + "grad_norm": 0.05222257971763611, + "learning_rate": 6.180639935196436e-05, + "loss": 0.3598, + "step": 1526 + }, + { + "epoch": 0.12370382372002592, + "grad_norm": 0.051549945026636124, + "learning_rate": 6.184690157958688e-05, + "loss": 0.4021, + "step": 1527 + }, + { + "epoch": 0.1237848347375243, + "grad_norm": 0.05802503600716591, + "learning_rate": 6.18874038072094e-05, + "loss": 0.417, + "step": 1528 + }, + { + "epoch": 0.12386584575502269, + "grad_norm": 0.08867768198251724, + "learning_rate": 6.192790603483192e-05, + "loss": 0.4079, + "step": 1529 + }, + { + "epoch": 0.12394685677252107, + "grad_norm": 0.06496446579694748, + "learning_rate": 6.196840826245444e-05, + "loss": 0.4484, + "step": 1530 + }, + { + "epoch": 0.12402786779001944, + "grad_norm": 0.057406000792980194, + "learning_rate": 6.200891049007695e-05, + "loss": 0.4442, + "step": 1531 + }, + { + "epoch": 0.12410887880751782, + "grad_norm": 0.054245512932538986, + "learning_rate": 6.204941271769947e-05, + "loss": 0.3731, + "step": 1532 + }, + { + "epoch": 0.1241898898250162, + "grad_norm": 0.0568988211452961, + "learning_rate": 6.208991494532199e-05, + "loss": 0.4182, + "step": 1533 + }, + { + "epoch": 0.12427090084251458, + "grad_norm": 0.05035136267542839, + "learning_rate": 6.213041717294451e-05, + "loss": 0.3707, + "step": 1534 + }, + { + "epoch": 0.12435191186001296, + "grad_norm": 0.059908732771873474, + "learning_rate": 6.217091940056703e-05, + "loss": 0.4289, + "step": 1535 + }, + { + "epoch": 0.12443292287751134, + "grad_norm": 0.05112859234213829, + "learning_rate": 6.221142162818955e-05, + "loss": 0.4052, + "step": 1536 + }, + { + "epoch": 0.12451393389500973, + "grad_norm": 0.05995730310678482, + "learning_rate": 6.225192385581208e-05, + "loss": 0.3902, + "step": 1537 + }, + { + "epoch": 0.1245949449125081, + "grad_norm": 0.06803309172391891, + "learning_rate": 6.22924260834346e-05, + "loss": 0.436, + "step": 1538 + }, + { + "epoch": 0.12467595593000648, + "grad_norm": 0.04792184755206108, + "learning_rate": 6.233292831105712e-05, + "loss": 0.3511, + "step": 1539 + }, + { + "epoch": 
0.12475696694750486, + "grad_norm": 0.07876551896333694, + "learning_rate": 6.237343053867964e-05, + "loss": 0.4269, + "step": 1540 + }, + { + "epoch": 0.12483797796500325, + "grad_norm": 0.054781924933195114, + "learning_rate": 6.241393276630216e-05, + "loss": 0.4205, + "step": 1541 + }, + { + "epoch": 0.12491898898250162, + "grad_norm": 0.08007676899433136, + "learning_rate": 6.245443499392466e-05, + "loss": 0.3985, + "step": 1542 + }, + { + "epoch": 0.125, + "grad_norm": 0.05014176294207573, + "learning_rate": 6.249493722154718e-05, + "loss": 0.3702, + "step": 1543 + }, + { + "epoch": 0.12508101101749838, + "grad_norm": 0.05353700742125511, + "learning_rate": 6.25354394491697e-05, + "loss": 0.3644, + "step": 1544 + }, + { + "epoch": 0.12516202203499677, + "grad_norm": 0.06442080438137054, + "learning_rate": 6.257594167679222e-05, + "loss": 0.3492, + "step": 1545 + }, + { + "epoch": 0.12524303305249515, + "grad_norm": 0.06315489113330841, + "learning_rate": 6.261644390441474e-05, + "loss": 0.4156, + "step": 1546 + }, + { + "epoch": 0.1253240440699935, + "grad_norm": 0.06708311289548874, + "learning_rate": 6.265694613203726e-05, + "loss": 0.3946, + "step": 1547 + }, + { + "epoch": 0.1254050550874919, + "grad_norm": 0.049992773681879044, + "learning_rate": 6.269744835965978e-05, + "loss": 0.3638, + "step": 1548 + }, + { + "epoch": 0.12548606610499027, + "grad_norm": 0.0535244345664978, + "learning_rate": 6.27379505872823e-05, + "loss": 0.4226, + "step": 1549 + }, + { + "epoch": 0.12556707712248866, + "grad_norm": 0.055212512612342834, + "learning_rate": 6.277845281490482e-05, + "loss": 0.4012, + "step": 1550 + }, + { + "epoch": 0.12564808813998704, + "grad_norm": 0.11156027764081955, + "learning_rate": 6.281895504252734e-05, + "loss": 0.3796, + "step": 1551 + }, + { + "epoch": 0.12572909915748542, + "grad_norm": 0.0516376718878746, + "learning_rate": 6.285945727014986e-05, + "loss": 0.3988, + "step": 1552 + }, + { + "epoch": 0.1258101101749838, + "grad_norm": 0.06597688049077988, + "learning_rate": 6.289995949777238e-05, + "loss": 0.436, + "step": 1553 + }, + { + "epoch": 0.12589112119248216, + "grad_norm": 0.05716634914278984, + "learning_rate": 6.29404617253949e-05, + "loss": 0.3742, + "step": 1554 + }, + { + "epoch": 0.12597213220998055, + "grad_norm": 0.06599877029657364, + "learning_rate": 6.298096395301742e-05, + "loss": 0.4098, + "step": 1555 + }, + { + "epoch": 0.12605314322747893, + "grad_norm": 0.07445321977138519, + "learning_rate": 6.302146618063994e-05, + "loss": 0.445, + "step": 1556 + }, + { + "epoch": 0.12613415424497731, + "grad_norm": 0.05665164813399315, + "learning_rate": 6.306196840826246e-05, + "loss": 0.4132, + "step": 1557 + }, + { + "epoch": 0.1262151652624757, + "grad_norm": 0.053392160683870316, + "learning_rate": 6.310247063588498e-05, + "loss": 0.3849, + "step": 1558 + }, + { + "epoch": 0.12629617627997408, + "grad_norm": 0.07199081778526306, + "learning_rate": 6.31429728635075e-05, + "loss": 0.4211, + "step": 1559 + }, + { + "epoch": 0.12637718729747247, + "grad_norm": 0.07637181133031845, + "learning_rate": 6.318347509113002e-05, + "loss": 0.4031, + "step": 1560 + }, + { + "epoch": 0.12645819831497085, + "grad_norm": 0.057522937655448914, + "learning_rate": 6.322397731875254e-05, + "loss": 0.3851, + "step": 1561 + }, + { + "epoch": 0.1265392093324692, + "grad_norm": 0.06370987743139267, + "learning_rate": 6.326447954637506e-05, + "loss": 0.4303, + "step": 1562 + }, + { + "epoch": 0.1266202203499676, + "grad_norm": 0.05618150904774666, + "learning_rate": 
6.330498177399758e-05, + "loss": 0.4337, + "step": 1563 + }, + { + "epoch": 0.12670123136746597, + "grad_norm": 0.0655088871717453, + "learning_rate": 6.33454840016201e-05, + "loss": 0.3722, + "step": 1564 + }, + { + "epoch": 0.12678224238496436, + "grad_norm": 0.05627769976854324, + "learning_rate": 6.338598622924262e-05, + "loss": 0.3779, + "step": 1565 + }, + { + "epoch": 0.12686325340246274, + "grad_norm": 0.06236083433032036, + "learning_rate": 6.342648845686512e-05, + "loss": 0.369, + "step": 1566 + }, + { + "epoch": 0.12694426441996112, + "grad_norm": 0.057616692036390305, + "learning_rate": 6.346699068448764e-05, + "loss": 0.3754, + "step": 1567 + }, + { + "epoch": 0.1270252754374595, + "grad_norm": 0.06232890859246254, + "learning_rate": 6.350749291211016e-05, + "loss": 0.3878, + "step": 1568 + }, + { + "epoch": 0.12710628645495786, + "grad_norm": 0.06295885890722275, + "learning_rate": 6.354799513973268e-05, + "loss": 0.4025, + "step": 1569 + }, + { + "epoch": 0.12718729747245625, + "grad_norm": 0.061489179730415344, + "learning_rate": 6.35884973673552e-05, + "loss": 0.3467, + "step": 1570 + }, + { + "epoch": 0.12726830848995463, + "grad_norm": 0.06776545196771622, + "learning_rate": 6.362899959497772e-05, + "loss": 0.4573, + "step": 1571 + }, + { + "epoch": 0.127349319507453, + "grad_norm": 0.06524661928415298, + "learning_rate": 6.366950182260024e-05, + "loss": 0.4187, + "step": 1572 + }, + { + "epoch": 0.1274303305249514, + "grad_norm": 0.061771683394908905, + "learning_rate": 6.371000405022276e-05, + "loss": 0.4127, + "step": 1573 + }, + { + "epoch": 0.12751134154244978, + "grad_norm": 0.057876091450452805, + "learning_rate": 6.37505062778453e-05, + "loss": 0.3942, + "step": 1574 + }, + { + "epoch": 0.12759235255994816, + "grad_norm": 0.054396193474531174, + "learning_rate": 6.379100850546781e-05, + "loss": 0.4281, + "step": 1575 + }, + { + "epoch": 0.12767336357744652, + "grad_norm": 0.07029586285352707, + "learning_rate": 6.383151073309033e-05, + "loss": 0.3947, + "step": 1576 + }, + { + "epoch": 0.1277543745949449, + "grad_norm": 0.07588616758584976, + "learning_rate": 6.387201296071284e-05, + "loss": 0.4575, + "step": 1577 + }, + { + "epoch": 0.1278353856124433, + "grad_norm": 0.06628143042325974, + "learning_rate": 6.391251518833536e-05, + "loss": 0.3918, + "step": 1578 + }, + { + "epoch": 0.12791639662994167, + "grad_norm": 0.06898647546768188, + "learning_rate": 6.395301741595788e-05, + "loss": 0.3896, + "step": 1579 + }, + { + "epoch": 0.12799740764744005, + "grad_norm": 0.07129407674074173, + "learning_rate": 6.39935196435804e-05, + "loss": 0.3751, + "step": 1580 + }, + { + "epoch": 0.12807841866493844, + "grad_norm": 0.06237871199846268, + "learning_rate": 6.403402187120292e-05, + "loss": 0.4224, + "step": 1581 + }, + { + "epoch": 0.12815942968243682, + "grad_norm": 0.08748257905244827, + "learning_rate": 6.407452409882544e-05, + "loss": 0.4231, + "step": 1582 + }, + { + "epoch": 0.1282404406999352, + "grad_norm": 0.062178194522857666, + "learning_rate": 6.411502632644796e-05, + "loss": 0.4333, + "step": 1583 + }, + { + "epoch": 0.12832145171743356, + "grad_norm": 0.07100991159677505, + "learning_rate": 6.415552855407048e-05, + "loss": 0.4207, + "step": 1584 + }, + { + "epoch": 0.12840246273493194, + "grad_norm": 0.0815022736787796, + "learning_rate": 6.4196030781693e-05, + "loss": 0.4191, + "step": 1585 + }, + { + "epoch": 0.12848347375243033, + "grad_norm": 0.07659836113452911, + "learning_rate": 6.423653300931552e-05, + "loss": 0.4201, + "step": 1586 + }, + { + 
"epoch": 0.1285644847699287, + "grad_norm": 0.07788506895303726, + "learning_rate": 6.427703523693804e-05, + "loss": 0.4051, + "step": 1587 + }, + { + "epoch": 0.1286454957874271, + "grad_norm": 0.09391462057828903, + "learning_rate": 6.431753746456056e-05, + "loss": 0.4634, + "step": 1588 + }, + { + "epoch": 0.12872650680492548, + "grad_norm": 0.054652221500873566, + "learning_rate": 6.435803969218308e-05, + "loss": 0.3983, + "step": 1589 + }, + { + "epoch": 0.12880751782242386, + "grad_norm": 0.06090129539370537, + "learning_rate": 6.439854191980558e-05, + "loss": 0.4262, + "step": 1590 + }, + { + "epoch": 0.12888852883992222, + "grad_norm": 0.05298708379268646, + "learning_rate": 6.44390441474281e-05, + "loss": 0.3402, + "step": 1591 + }, + { + "epoch": 0.1289695398574206, + "grad_norm": 0.06067481264472008, + "learning_rate": 6.447954637505062e-05, + "loss": 0.3825, + "step": 1592 + }, + { + "epoch": 0.12905055087491898, + "grad_norm": 0.08918090164661407, + "learning_rate": 6.452004860267315e-05, + "loss": 0.4773, + "step": 1593 + }, + { + "epoch": 0.12913156189241737, + "grad_norm": 0.06664161384105682, + "learning_rate": 6.456055083029567e-05, + "loss": 0.383, + "step": 1594 + }, + { + "epoch": 0.12921257290991575, + "grad_norm": 0.07685840129852295, + "learning_rate": 6.46010530579182e-05, + "loss": 0.3905, + "step": 1595 + }, + { + "epoch": 0.12929358392741414, + "grad_norm": 0.07122089713811874, + "learning_rate": 6.464155528554071e-05, + "loss": 0.3543, + "step": 1596 + }, + { + "epoch": 0.12937459494491252, + "grad_norm": 0.056531671434640884, + "learning_rate": 6.468205751316323e-05, + "loss": 0.4529, + "step": 1597 + }, + { + "epoch": 0.12945560596241087, + "grad_norm": 0.06631804257631302, + "learning_rate": 6.472255974078575e-05, + "loss": 0.39, + "step": 1598 + }, + { + "epoch": 0.12953661697990926, + "grad_norm": 0.08155537396669388, + "learning_rate": 6.476306196840827e-05, + "loss": 0.4113, + "step": 1599 + }, + { + "epoch": 0.12961762799740764, + "grad_norm": 0.0701952800154686, + "learning_rate": 6.480356419603079e-05, + "loss": 0.3384, + "step": 1600 + }, + { + "epoch": 0.12969863901490603, + "grad_norm": 0.07322200387716293, + "learning_rate": 6.48440664236533e-05, + "loss": 0.376, + "step": 1601 + }, + { + "epoch": 0.1297796500324044, + "grad_norm": 0.06151268631219864, + "learning_rate": 6.488456865127582e-05, + "loss": 0.398, + "step": 1602 + }, + { + "epoch": 0.1298606610499028, + "grad_norm": 0.06803981959819794, + "learning_rate": 6.492507087889834e-05, + "loss": 0.439, + "step": 1603 + }, + { + "epoch": 0.12994167206740118, + "grad_norm": 0.05600098520517349, + "learning_rate": 6.496557310652086e-05, + "loss": 0.3622, + "step": 1604 + }, + { + "epoch": 0.13002268308489956, + "grad_norm": 0.05879819765686989, + "learning_rate": 6.500607533414338e-05, + "loss": 0.4124, + "step": 1605 + }, + { + "epoch": 0.13010369410239792, + "grad_norm": 0.05631718039512634, + "learning_rate": 6.50465775617659e-05, + "loss": 0.3549, + "step": 1606 + }, + { + "epoch": 0.1301847051198963, + "grad_norm": 0.05812705308198929, + "learning_rate": 6.508707978938842e-05, + "loss": 0.368, + "step": 1607 + }, + { + "epoch": 0.13026571613739468, + "grad_norm": 0.061224520206451416, + "learning_rate": 6.512758201701094e-05, + "loss": 0.3571, + "step": 1608 + }, + { + "epoch": 0.13034672715489307, + "grad_norm": 0.06485003232955933, + "learning_rate": 6.516808424463346e-05, + "loss": 0.3681, + "step": 1609 + }, + { + "epoch": 0.13042773817239145, + "grad_norm": 0.062425050884485245, + 
"learning_rate": 6.520858647225598e-05, + "loss": 0.4119, + "step": 1610 + }, + { + "epoch": 0.13050874918988983, + "grad_norm": 0.073312908411026, + "learning_rate": 6.52490886998785e-05, + "loss": 0.392, + "step": 1611 + }, + { + "epoch": 0.13058976020738822, + "grad_norm": 0.07332398742437363, + "learning_rate": 6.528959092750101e-05, + "loss": 0.377, + "step": 1612 + }, + { + "epoch": 0.13067077122488657, + "grad_norm": 0.06517259031534195, + "learning_rate": 6.533009315512353e-05, + "loss": 0.4136, + "step": 1613 + }, + { + "epoch": 0.13075178224238496, + "grad_norm": 0.05453259125351906, + "learning_rate": 6.537059538274605e-05, + "loss": 0.3647, + "step": 1614 + }, + { + "epoch": 0.13083279325988334, + "grad_norm": 0.06073718145489693, + "learning_rate": 6.541109761036857e-05, + "loss": 0.4809, + "step": 1615 + }, + { + "epoch": 0.13091380427738172, + "grad_norm": 0.0551639199256897, + "learning_rate": 6.54515998379911e-05, + "loss": 0.401, + "step": 1616 + }, + { + "epoch": 0.1309948152948801, + "grad_norm": 0.04859033226966858, + "learning_rate": 6.549210206561361e-05, + "loss": 0.3624, + "step": 1617 + }, + { + "epoch": 0.1310758263123785, + "grad_norm": 0.0692361369729042, + "learning_rate": 6.553260429323613e-05, + "loss": 0.3955, + "step": 1618 + }, + { + "epoch": 0.13115683732987687, + "grad_norm": 0.07203768193721771, + "learning_rate": 6.557310652085865e-05, + "loss": 0.3639, + "step": 1619 + }, + { + "epoch": 0.13123784834737523, + "grad_norm": 0.06581790000200272, + "learning_rate": 6.561360874848117e-05, + "loss": 0.3937, + "step": 1620 + }, + { + "epoch": 0.1313188593648736, + "grad_norm": 0.06830241531133652, + "learning_rate": 6.565411097610369e-05, + "loss": 0.4286, + "step": 1621 + }, + { + "epoch": 0.131399870382372, + "grad_norm": 0.05630087852478027, + "learning_rate": 6.569461320372621e-05, + "loss": 0.3946, + "step": 1622 + }, + { + "epoch": 0.13148088139987038, + "grad_norm": 0.07321779429912567, + "learning_rate": 6.573511543134873e-05, + "loss": 0.4141, + "step": 1623 + }, + { + "epoch": 0.13156189241736876, + "grad_norm": 0.055487010627985, + "learning_rate": 6.577561765897125e-05, + "loss": 0.4224, + "step": 1624 + }, + { + "epoch": 0.13164290343486715, + "grad_norm": 0.06711199134588242, + "learning_rate": 6.581611988659376e-05, + "loss": 0.3823, + "step": 1625 + }, + { + "epoch": 0.13172391445236553, + "grad_norm": 0.06719058007001877, + "learning_rate": 6.585662211421628e-05, + "loss": 0.3954, + "step": 1626 + }, + { + "epoch": 0.13180492546986391, + "grad_norm": 0.06506709009408951, + "learning_rate": 6.58971243418388e-05, + "loss": 0.4039, + "step": 1627 + }, + { + "epoch": 0.13188593648736227, + "grad_norm": 0.07303988933563232, + "learning_rate": 6.593762656946132e-05, + "loss": 0.3961, + "step": 1628 + }, + { + "epoch": 0.13196694750486065, + "grad_norm": 0.05539443716406822, + "learning_rate": 6.597812879708384e-05, + "loss": 0.3972, + "step": 1629 + }, + { + "epoch": 0.13204795852235904, + "grad_norm": 0.07812444865703583, + "learning_rate": 6.601863102470636e-05, + "loss": 0.4308, + "step": 1630 + }, + { + "epoch": 0.13212896953985742, + "grad_norm": 0.049702536314725876, + "learning_rate": 6.605913325232889e-05, + "loss": 0.397, + "step": 1631 + }, + { + "epoch": 0.1322099805573558, + "grad_norm": 0.06067189574241638, + "learning_rate": 6.609963547995141e-05, + "loss": 0.3935, + "step": 1632 + }, + { + "epoch": 0.1322909915748542, + "grad_norm": 0.04642423987388611, + "learning_rate": 6.614013770757393e-05, + "loss": 0.3677, + "step": 1633 + }, 
+ { + "epoch": 0.13237200259235257, + "grad_norm": 0.049967169761657715, + "learning_rate": 6.618063993519645e-05, + "loss": 0.333, + "step": 1634 + }, + { + "epoch": 0.13245301360985093, + "grad_norm": 0.07911203056573868, + "learning_rate": 6.622114216281897e-05, + "loss": 0.3841, + "step": 1635 + }, + { + "epoch": 0.1325340246273493, + "grad_norm": 0.05701819062232971, + "learning_rate": 6.626164439044147e-05, + "loss": 0.3792, + "step": 1636 + }, + { + "epoch": 0.1326150356448477, + "grad_norm": 0.071646548807621, + "learning_rate": 6.630214661806399e-05, + "loss": 0.4173, + "step": 1637 + }, + { + "epoch": 0.13269604666234608, + "grad_norm": 0.06087390333414078, + "learning_rate": 6.634264884568651e-05, + "loss": 0.3695, + "step": 1638 + }, + { + "epoch": 0.13277705767984446, + "grad_norm": 0.06627701967954636, + "learning_rate": 6.638315107330903e-05, + "loss": 0.3858, + "step": 1639 + }, + { + "epoch": 0.13285806869734285, + "grad_norm": 0.061554260551929474, + "learning_rate": 6.642365330093155e-05, + "loss": 0.4465, + "step": 1640 + }, + { + "epoch": 0.13293907971484123, + "grad_norm": 0.06847324967384338, + "learning_rate": 6.646415552855407e-05, + "loss": 0.3974, + "step": 1641 + }, + { + "epoch": 0.13302009073233959, + "grad_norm": 0.07292938232421875, + "learning_rate": 6.650465775617659e-05, + "loss": 0.4101, + "step": 1642 + }, + { + "epoch": 0.13310110174983797, + "grad_norm": 0.05951497703790665, + "learning_rate": 6.654515998379911e-05, + "loss": 0.375, + "step": 1643 + }, + { + "epoch": 0.13318211276733635, + "grad_norm": 0.0688377097249031, + "learning_rate": 6.658566221142163e-05, + "loss": 0.4024, + "step": 1644 + }, + { + "epoch": 0.13326312378483474, + "grad_norm": 0.06519949436187744, + "learning_rate": 6.662616443904415e-05, + "loss": 0.4162, + "step": 1645 + }, + { + "epoch": 0.13334413480233312, + "grad_norm": 0.061955176293849945, + "learning_rate": 6.666666666666667e-05, + "loss": 0.4252, + "step": 1646 + }, + { + "epoch": 0.1334251458198315, + "grad_norm": 0.07237394899129868, + "learning_rate": 6.670716889428919e-05, + "loss": 0.4233, + "step": 1647 + }, + { + "epoch": 0.1335061568373299, + "grad_norm": 0.05980387330055237, + "learning_rate": 6.674767112191171e-05, + "loss": 0.4072, + "step": 1648 + }, + { + "epoch": 0.13358716785482827, + "grad_norm": 0.06416431814432144, + "learning_rate": 6.678817334953423e-05, + "loss": 0.434, + "step": 1649 + }, + { + "epoch": 0.13366817887232663, + "grad_norm": 0.06477854400873184, + "learning_rate": 6.682867557715675e-05, + "loss": 0.3973, + "step": 1650 + }, + { + "epoch": 0.133749189889825, + "grad_norm": 0.07490331679582596, + "learning_rate": 6.686917780477927e-05, + "loss": 0.4374, + "step": 1651 + }, + { + "epoch": 0.1338302009073234, + "grad_norm": 0.06125623732805252, + "learning_rate": 6.690968003240179e-05, + "loss": 0.3869, + "step": 1652 + }, + { + "epoch": 0.13391121192482178, + "grad_norm": 0.06214214488863945, + "learning_rate": 6.695018226002431e-05, + "loss": 0.3977, + "step": 1653 + }, + { + "epoch": 0.13399222294232016, + "grad_norm": 0.046827007085084915, + "learning_rate": 6.699068448764683e-05, + "loss": 0.3905, + "step": 1654 + }, + { + "epoch": 0.13407323395981854, + "grad_norm": 0.05726904422044754, + "learning_rate": 6.703118671526935e-05, + "loss": 0.4151, + "step": 1655 + }, + { + "epoch": 0.13415424497731693, + "grad_norm": 0.07240765541791916, + "learning_rate": 6.707168894289187e-05, + "loss": 0.3689, + "step": 1656 + }, + { + "epoch": 0.13423525599481528, + "grad_norm": 
0.049259696155786514, + "learning_rate": 6.711219117051439e-05, + "loss": 0.3673, + "step": 1657 + }, + { + "epoch": 0.13431626701231367, + "grad_norm": 0.05210607126355171, + "learning_rate": 6.71526933981369e-05, + "loss": 0.3938, + "step": 1658 + }, + { + "epoch": 0.13439727802981205, + "grad_norm": 0.05046253278851509, + "learning_rate": 6.719319562575943e-05, + "loss": 0.3753, + "step": 1659 + }, + { + "epoch": 0.13447828904731043, + "grad_norm": 0.052167586982250214, + "learning_rate": 6.723369785338193e-05, + "loss": 0.3874, + "step": 1660 + }, + { + "epoch": 0.13455930006480882, + "grad_norm": 0.06064862012863159, + "learning_rate": 6.727420008100445e-05, + "loss": 0.3736, + "step": 1661 + }, + { + "epoch": 0.1346403110823072, + "grad_norm": 0.07566836476325989, + "learning_rate": 6.731470230862697e-05, + "loss": 0.4438, + "step": 1662 + }, + { + "epoch": 0.13472132209980558, + "grad_norm": 0.058606453239917755, + "learning_rate": 6.735520453624949e-05, + "loss": 0.4132, + "step": 1663 + }, + { + "epoch": 0.13480233311730394, + "grad_norm": 0.059201929718256, + "learning_rate": 6.739570676387201e-05, + "loss": 0.4312, + "step": 1664 + }, + { + "epoch": 0.13488334413480232, + "grad_norm": 0.05796176567673683, + "learning_rate": 6.743620899149453e-05, + "loss": 0.3889, + "step": 1665 + }, + { + "epoch": 0.1349643551523007, + "grad_norm": 0.058323007076978683, + "learning_rate": 6.747671121911705e-05, + "loss": 0.3573, + "step": 1666 + }, + { + "epoch": 0.1350453661697991, + "grad_norm": 0.06881019473075867, + "learning_rate": 6.751721344673957e-05, + "loss": 0.3813, + "step": 1667 + }, + { + "epoch": 0.13512637718729748, + "grad_norm": 0.06973112374544144, + "learning_rate": 6.75577156743621e-05, + "loss": 0.3706, + "step": 1668 + }, + { + "epoch": 0.13520738820479586, + "grad_norm": 0.06532883644104004, + "learning_rate": 6.759821790198462e-05, + "loss": 0.3911, + "step": 1669 + }, + { + "epoch": 0.13528839922229424, + "grad_norm": 0.06697031110525131, + "learning_rate": 6.763872012960714e-05, + "loss": 0.3457, + "step": 1670 + }, + { + "epoch": 0.1353694102397926, + "grad_norm": 0.05248725041747093, + "learning_rate": 6.767922235722965e-05, + "loss": 0.3672, + "step": 1671 + }, + { + "epoch": 0.13545042125729098, + "grad_norm": 0.06255452334880829, + "learning_rate": 6.771972458485217e-05, + "loss": 0.384, + "step": 1672 + }, + { + "epoch": 0.13553143227478937, + "grad_norm": 0.04622405394911766, + "learning_rate": 6.776022681247469e-05, + "loss": 0.3759, + "step": 1673 + }, + { + "epoch": 0.13561244329228775, + "grad_norm": 0.07297157496213913, + "learning_rate": 6.780072904009721e-05, + "loss": 0.3776, + "step": 1674 + }, + { + "epoch": 0.13569345430978613, + "grad_norm": 0.06406359374523163, + "learning_rate": 6.784123126771973e-05, + "loss": 0.3921, + "step": 1675 + }, + { + "epoch": 0.13577446532728452, + "grad_norm": 0.053780291229486465, + "learning_rate": 6.788173349534225e-05, + "loss": 0.3156, + "step": 1676 + }, + { + "epoch": 0.1358554763447829, + "grad_norm": 0.06514348834753036, + "learning_rate": 6.792223572296477e-05, + "loss": 0.3968, + "step": 1677 + }, + { + "epoch": 0.13593648736228128, + "grad_norm": 0.06725168228149414, + "learning_rate": 6.796273795058729e-05, + "loss": 0.4387, + "step": 1678 + }, + { + "epoch": 0.13601749837977964, + "grad_norm": 0.0678589791059494, + "learning_rate": 6.80032401782098e-05, + "loss": 0.4239, + "step": 1679 + }, + { + "epoch": 0.13609850939727802, + "grad_norm": 0.06807240843772888, + "learning_rate": 6.804374240583233e-05, + 
"loss": 0.4178, + "step": 1680 + }, + { + "epoch": 0.1361795204147764, + "grad_norm": 0.06269272416830063, + "learning_rate": 6.808424463345484e-05, + "loss": 0.4557, + "step": 1681 + }, + { + "epoch": 0.1362605314322748, + "grad_norm": 0.05374554917216301, + "learning_rate": 6.812474686107736e-05, + "loss": 0.4185, + "step": 1682 + }, + { + "epoch": 0.13634154244977317, + "grad_norm": 0.06106915697455406, + "learning_rate": 6.816524908869988e-05, + "loss": 0.3768, + "step": 1683 + }, + { + "epoch": 0.13642255346727156, + "grad_norm": 0.08892907202243805, + "learning_rate": 6.82057513163224e-05, + "loss": 0.4065, + "step": 1684 + }, + { + "epoch": 0.13650356448476994, + "grad_norm": 0.04702616110444069, + "learning_rate": 6.824625354394491e-05, + "loss": 0.3306, + "step": 1685 + }, + { + "epoch": 0.1365845755022683, + "grad_norm": 0.051397860050201416, + "learning_rate": 6.828675577156743e-05, + "loss": 0.3856, + "step": 1686 + }, + { + "epoch": 0.13666558651976668, + "grad_norm": 0.07031011581420898, + "learning_rate": 6.832725799918996e-05, + "loss": 0.3844, + "step": 1687 + }, + { + "epoch": 0.13674659753726506, + "grad_norm": 0.057391006499528885, + "learning_rate": 6.836776022681248e-05, + "loss": 0.3764, + "step": 1688 + }, + { + "epoch": 0.13682760855476345, + "grad_norm": 0.05652105435729027, + "learning_rate": 6.8408262454435e-05, + "loss": 0.4324, + "step": 1689 + }, + { + "epoch": 0.13690861957226183, + "grad_norm": 0.06434738636016846, + "learning_rate": 6.844876468205752e-05, + "loss": 0.4152, + "step": 1690 + }, + { + "epoch": 0.13698963058976021, + "grad_norm": 0.07498002797365189, + "learning_rate": 6.848926690968004e-05, + "loss": 0.3906, + "step": 1691 + }, + { + "epoch": 0.1370706416072586, + "grad_norm": 0.074516162276268, + "learning_rate": 6.852976913730256e-05, + "loss": 0.3704, + "step": 1692 + }, + { + "epoch": 0.13715165262475695, + "grad_norm": 0.05279865488409996, + "learning_rate": 6.857027136492508e-05, + "loss": 0.3765, + "step": 1693 + }, + { + "epoch": 0.13723266364225534, + "grad_norm": 0.051047589629888535, + "learning_rate": 6.86107735925476e-05, + "loss": 0.3428, + "step": 1694 + }, + { + "epoch": 0.13731367465975372, + "grad_norm": 0.057483524084091187, + "learning_rate": 6.865127582017012e-05, + "loss": 0.4172, + "step": 1695 + }, + { + "epoch": 0.1373946856772521, + "grad_norm": 0.0812988355755806, + "learning_rate": 6.869177804779263e-05, + "loss": 0.4141, + "step": 1696 + }, + { + "epoch": 0.1374756966947505, + "grad_norm": 0.08455239981412888, + "learning_rate": 6.873228027541515e-05, + "loss": 0.3761, + "step": 1697 + }, + { + "epoch": 0.13755670771224887, + "grad_norm": 0.06918775290250778, + "learning_rate": 6.877278250303767e-05, + "loss": 0.4699, + "step": 1698 + }, + { + "epoch": 0.13763771872974725, + "grad_norm": 0.06675770878791809, + "learning_rate": 6.881328473066019e-05, + "loss": 0.4157, + "step": 1699 + }, + { + "epoch": 0.13771872974724564, + "grad_norm": 0.05350131541490555, + "learning_rate": 6.88537869582827e-05, + "loss": 0.378, + "step": 1700 + }, + { + "epoch": 0.137799740764744, + "grad_norm": 0.07368238270282745, + "learning_rate": 6.889428918590522e-05, + "loss": 0.4242, + "step": 1701 + }, + { + "epoch": 0.13788075178224238, + "grad_norm": 0.051566384732723236, + "learning_rate": 6.893479141352774e-05, + "loss": 0.4016, + "step": 1702 + }, + { + "epoch": 0.13796176279974076, + "grad_norm": 0.05574240908026695, + "learning_rate": 6.897529364115026e-05, + "loss": 0.4242, + "step": 1703 + }, + { + "epoch": 
0.13804277381723914, + "grad_norm": 0.06958450376987457, + "learning_rate": 6.901579586877278e-05, + "loss": 0.4175, + "step": 1704 + }, + { + "epoch": 0.13812378483473753, + "grad_norm": 0.0514814667403698, + "learning_rate": 6.90562980963953e-05, + "loss": 0.3715, + "step": 1705 + }, + { + "epoch": 0.1382047958522359, + "grad_norm": 0.05750512704253197, + "learning_rate": 6.909680032401784e-05, + "loss": 0.4226, + "step": 1706 + }, + { + "epoch": 0.1382858068697343, + "grad_norm": 0.08278968930244446, + "learning_rate": 6.913730255164034e-05, + "loss": 0.4605, + "step": 1707 + }, + { + "epoch": 0.13836681788723265, + "grad_norm": 0.050866756588220596, + "learning_rate": 6.917780477926286e-05, + "loss": 0.4151, + "step": 1708 + }, + { + "epoch": 0.13844782890473104, + "grad_norm": 0.07240499556064606, + "learning_rate": 6.921830700688538e-05, + "loss": 0.3758, + "step": 1709 + }, + { + "epoch": 0.13852883992222942, + "grad_norm": 0.061994630843400955, + "learning_rate": 6.92588092345079e-05, + "loss": 0.3862, + "step": 1710 + }, + { + "epoch": 0.1386098509397278, + "grad_norm": 0.06153041124343872, + "learning_rate": 6.929931146213042e-05, + "loss": 0.4123, + "step": 1711 + }, + { + "epoch": 0.13869086195722619, + "grad_norm": 0.06387482583522797, + "learning_rate": 6.933981368975294e-05, + "loss": 0.4057, + "step": 1712 + }, + { + "epoch": 0.13877187297472457, + "grad_norm": 0.06365308165550232, + "learning_rate": 6.938031591737546e-05, + "loss": 0.3397, + "step": 1713 + }, + { + "epoch": 0.13885288399222295, + "grad_norm": 0.05819310247898102, + "learning_rate": 6.942081814499798e-05, + "loss": 0.3572, + "step": 1714 + }, + { + "epoch": 0.1389338950097213, + "grad_norm": 0.06904326379299164, + "learning_rate": 6.94613203726205e-05, + "loss": 0.4026, + "step": 1715 + }, + { + "epoch": 0.1390149060272197, + "grad_norm": 0.10969147086143494, + "learning_rate": 6.950182260024302e-05, + "loss": 0.4163, + "step": 1716 + }, + { + "epoch": 0.13909591704471808, + "grad_norm": 0.07195261120796204, + "learning_rate": 6.954232482786554e-05, + "loss": 0.4684, + "step": 1717 + }, + { + "epoch": 0.13917692806221646, + "grad_norm": 0.06505865603685379, + "learning_rate": 6.958282705548806e-05, + "loss": 0.384, + "step": 1718 + }, + { + "epoch": 0.13925793907971484, + "grad_norm": 0.05413194000720978, + "learning_rate": 6.962332928311058e-05, + "loss": 0.3935, + "step": 1719 + }, + { + "epoch": 0.13933895009721323, + "grad_norm": 0.05856451764702797, + "learning_rate": 6.966383151073308e-05, + "loss": 0.4345, + "step": 1720 + }, + { + "epoch": 0.1394199611147116, + "grad_norm": 0.05400045961141586, + "learning_rate": 6.97043337383556e-05, + "loss": 0.3794, + "step": 1721 + }, + { + "epoch": 0.13950097213221, + "grad_norm": 0.0684712827205658, + "learning_rate": 6.974483596597812e-05, + "loss": 0.3592, + "step": 1722 + }, + { + "epoch": 0.13958198314970835, + "grad_norm": 0.05927879735827446, + "learning_rate": 6.978533819360064e-05, + "loss": 0.4145, + "step": 1723 + }, + { + "epoch": 0.13966299416720673, + "grad_norm": 0.06611039489507675, + "learning_rate": 6.982584042122316e-05, + "loss": 0.4255, + "step": 1724 + }, + { + "epoch": 0.13974400518470512, + "grad_norm": 0.05736701190471649, + "learning_rate": 6.98663426488457e-05, + "loss": 0.4155, + "step": 1725 + }, + { + "epoch": 0.1398250162022035, + "grad_norm": 0.054496392607688904, + "learning_rate": 6.990684487646822e-05, + "loss": 0.3923, + "step": 1726 + }, + { + "epoch": 0.13990602721970188, + "grad_norm": 0.07339916378259659, + 
"learning_rate": 6.994734710409074e-05, + "loss": 0.386, + "step": 1727 + }, + { + "epoch": 0.13998703823720027, + "grad_norm": 0.08990602940320969, + "learning_rate": 6.998784933171326e-05, + "loss": 0.3809, + "step": 1728 + }, + { + "epoch": 0.14006804925469865, + "grad_norm": 0.05522826686501503, + "learning_rate": 7.002835155933578e-05, + "loss": 0.3997, + "step": 1729 + }, + { + "epoch": 0.140149060272197, + "grad_norm": 0.06750888377428055, + "learning_rate": 7.00688537869583e-05, + "loss": 0.4567, + "step": 1730 + }, + { + "epoch": 0.1402300712896954, + "grad_norm": 0.07188330590724945, + "learning_rate": 7.01093560145808e-05, + "loss": 0.3631, + "step": 1731 + }, + { + "epoch": 0.14031108230719377, + "grad_norm": 0.06761690974235535, + "learning_rate": 7.014985824220332e-05, + "loss": 0.3815, + "step": 1732 + }, + { + "epoch": 0.14039209332469216, + "grad_norm": 0.07259739190340042, + "learning_rate": 7.019036046982584e-05, + "loss": 0.3993, + "step": 1733 + }, + { + "epoch": 0.14047310434219054, + "grad_norm": 0.05059479549527168, + "learning_rate": 7.023086269744836e-05, + "loss": 0.4039, + "step": 1734 + }, + { + "epoch": 0.14055411535968892, + "grad_norm": 0.06526746600866318, + "learning_rate": 7.027136492507088e-05, + "loss": 0.4439, + "step": 1735 + }, + { + "epoch": 0.1406351263771873, + "grad_norm": 0.06921422481536865, + "learning_rate": 7.03118671526934e-05, + "loss": 0.4067, + "step": 1736 + }, + { + "epoch": 0.14071613739468566, + "grad_norm": 0.0597330667078495, + "learning_rate": 7.035236938031592e-05, + "loss": 0.4079, + "step": 1737 + }, + { + "epoch": 0.14079714841218405, + "grad_norm": 0.06214408203959465, + "learning_rate": 7.039287160793844e-05, + "loss": 0.394, + "step": 1738 + }, + { + "epoch": 0.14087815942968243, + "grad_norm": 0.05894998461008072, + "learning_rate": 7.043337383556096e-05, + "loss": 0.3823, + "step": 1739 + }, + { + "epoch": 0.14095917044718081, + "grad_norm": 0.07503427565097809, + "learning_rate": 7.047387606318348e-05, + "loss": 0.4185, + "step": 1740 + }, + { + "epoch": 0.1410401814646792, + "grad_norm": 0.05956784635782242, + "learning_rate": 7.0514378290806e-05, + "loss": 0.4074, + "step": 1741 + }, + { + "epoch": 0.14112119248217758, + "grad_norm": 0.07910850644111633, + "learning_rate": 7.055488051842852e-05, + "loss": 0.4129, + "step": 1742 + }, + { + "epoch": 0.14120220349967597, + "grad_norm": 0.056844715029001236, + "learning_rate": 7.059538274605104e-05, + "loss": 0.3862, + "step": 1743 + }, + { + "epoch": 0.14128321451717435, + "grad_norm": 0.08414852619171143, + "learning_rate": 7.063588497367356e-05, + "loss": 0.4196, + "step": 1744 + }, + { + "epoch": 0.1413642255346727, + "grad_norm": 0.06702537089586258, + "learning_rate": 7.067638720129608e-05, + "loss": 0.4098, + "step": 1745 + }, + { + "epoch": 0.1414452365521711, + "grad_norm": 0.06511709839105606, + "learning_rate": 7.07168894289186e-05, + "loss": 0.418, + "step": 1746 + }, + { + "epoch": 0.14152624756966947, + "grad_norm": 0.054429568350315094, + "learning_rate": 7.075739165654112e-05, + "loss": 0.3404, + "step": 1747 + }, + { + "epoch": 0.14160725858716786, + "grad_norm": 0.047777604311704636, + "learning_rate": 7.079789388416364e-05, + "loss": 0.3381, + "step": 1748 + }, + { + "epoch": 0.14168826960466624, + "grad_norm": 0.05889793112874031, + "learning_rate": 7.083839611178616e-05, + "loss": 0.4082, + "step": 1749 + }, + { + "epoch": 0.14176928062216462, + "grad_norm": 0.05489526316523552, + "learning_rate": 7.087889833940867e-05, + "loss": 0.4125, + "step": 1750 
+ }, + { + "epoch": 0.141850291639663, + "grad_norm": 0.06594657152891159, + "learning_rate": 7.09194005670312e-05, + "loss": 0.3895, + "step": 1751 + }, + { + "epoch": 0.14193130265716136, + "grad_norm": 0.0674719288945198, + "learning_rate": 7.095990279465371e-05, + "loss": 0.4384, + "step": 1752 + }, + { + "epoch": 0.14201231367465975, + "grad_norm": 0.05686030164361, + "learning_rate": 7.100040502227623e-05, + "loss": 0.3342, + "step": 1753 + }, + { + "epoch": 0.14209332469215813, + "grad_norm": 0.05702396109700203, + "learning_rate": 7.104090724989875e-05, + "loss": 0.4137, + "step": 1754 + }, + { + "epoch": 0.1421743357096565, + "grad_norm": 0.07312499731779099, + "learning_rate": 7.108140947752126e-05, + "loss": 0.3709, + "step": 1755 + }, + { + "epoch": 0.1422553467271549, + "grad_norm": 0.04934091120958328, + "learning_rate": 7.112191170514378e-05, + "loss": 0.3619, + "step": 1756 + }, + { + "epoch": 0.14233635774465328, + "grad_norm": 0.05569101497530937, + "learning_rate": 7.11624139327663e-05, + "loss": 0.4096, + "step": 1757 + }, + { + "epoch": 0.14241736876215166, + "grad_norm": 0.05229457840323448, + "learning_rate": 7.120291616038882e-05, + "loss": 0.3803, + "step": 1758 + }, + { + "epoch": 0.14249837977965002, + "grad_norm": 0.05280294641852379, + "learning_rate": 7.124341838801134e-05, + "loss": 0.4167, + "step": 1759 + }, + { + "epoch": 0.1425793907971484, + "grad_norm": 0.05248962715268135, + "learning_rate": 7.128392061563386e-05, + "loss": 0.4203, + "step": 1760 + }, + { + "epoch": 0.1426604018146468, + "grad_norm": 0.05805622413754463, + "learning_rate": 7.132442284325638e-05, + "loss": 0.3952, + "step": 1761 + }, + { + "epoch": 0.14274141283214517, + "grad_norm": 0.06868473440408707, + "learning_rate": 7.13649250708789e-05, + "loss": 0.3603, + "step": 1762 + }, + { + "epoch": 0.14282242384964355, + "grad_norm": 0.05611317232251167, + "learning_rate": 7.140542729850143e-05, + "loss": 0.3756, + "step": 1763 + }, + { + "epoch": 0.14290343486714194, + "grad_norm": 0.05748287960886955, + "learning_rate": 7.144592952612395e-05, + "loss": 0.428, + "step": 1764 + }, + { + "epoch": 0.14298444588464032, + "grad_norm": 0.0716121718287468, + "learning_rate": 7.148643175374647e-05, + "loss": 0.4295, + "step": 1765 + }, + { + "epoch": 0.1430654569021387, + "grad_norm": 0.0687982439994812, + "learning_rate": 7.152693398136898e-05, + "loss": 0.4295, + "step": 1766 + }, + { + "epoch": 0.14314646791963706, + "grad_norm": 0.06397130340337753, + "learning_rate": 7.15674362089915e-05, + "loss": 0.442, + "step": 1767 + }, + { + "epoch": 0.14322747893713544, + "grad_norm": 0.06169519200921059, + "learning_rate": 7.160793843661402e-05, + "loss": 0.4189, + "step": 1768 + }, + { + "epoch": 0.14330848995463383, + "grad_norm": 0.04765634983778, + "learning_rate": 7.164844066423653e-05, + "loss": 0.381, + "step": 1769 + }, + { + "epoch": 0.1433895009721322, + "grad_norm": 0.04898335412144661, + "learning_rate": 7.168894289185905e-05, + "loss": 0.4064, + "step": 1770 + }, + { + "epoch": 0.1434705119896306, + "grad_norm": 0.0570511594414711, + "learning_rate": 7.172944511948157e-05, + "loss": 0.3865, + "step": 1771 + }, + { + "epoch": 0.14355152300712898, + "grad_norm": 0.0582033172249794, + "learning_rate": 7.17699473471041e-05, + "loss": 0.3971, + "step": 1772 + }, + { + "epoch": 0.14363253402462736, + "grad_norm": 0.056540779769420624, + "learning_rate": 7.181044957472661e-05, + "loss": 0.4527, + "step": 1773 + }, + { + "epoch": 0.14371354504212572, + "grad_norm": 0.056025274097919464, + 
"learning_rate": 7.185095180234913e-05, + "loss": 0.3989, + "step": 1774 + }, + { + "epoch": 0.1437945560596241, + "grad_norm": 0.048998527228832245, + "learning_rate": 7.189145402997165e-05, + "loss": 0.3978, + "step": 1775 + }, + { + "epoch": 0.14387556707712248, + "grad_norm": 0.06863243877887726, + "learning_rate": 7.193195625759417e-05, + "loss": 0.4469, + "step": 1776 + }, + { + "epoch": 0.14395657809462087, + "grad_norm": 0.06901993602514267, + "learning_rate": 7.197245848521669e-05, + "loss": 0.4687, + "step": 1777 + }, + { + "epoch": 0.14403758911211925, + "grad_norm": 0.06378109008073807, + "learning_rate": 7.201296071283921e-05, + "loss": 0.3789, + "step": 1778 + }, + { + "epoch": 0.14411860012961764, + "grad_norm": 0.053304869681596756, + "learning_rate": 7.205346294046172e-05, + "loss": 0.4167, + "step": 1779 + }, + { + "epoch": 0.14419961114711602, + "grad_norm": 0.04796311631798744, + "learning_rate": 7.209396516808424e-05, + "loss": 0.4195, + "step": 1780 + }, + { + "epoch": 0.14428062216461437, + "grad_norm": 0.061010587960481644, + "learning_rate": 7.213446739570676e-05, + "loss": 0.425, + "step": 1781 + }, + { + "epoch": 0.14436163318211276, + "grad_norm": 0.05330296978354454, + "learning_rate": 7.217496962332929e-05, + "loss": 0.3903, + "step": 1782 + }, + { + "epoch": 0.14444264419961114, + "grad_norm": 0.06527654826641083, + "learning_rate": 7.221547185095181e-05, + "loss": 0.3622, + "step": 1783 + }, + { + "epoch": 0.14452365521710953, + "grad_norm": 0.05822371691465378, + "learning_rate": 7.225597407857433e-05, + "loss": 0.3866, + "step": 1784 + }, + { + "epoch": 0.1446046662346079, + "grad_norm": 0.05615841597318649, + "learning_rate": 7.229647630619685e-05, + "loss": 0.3653, + "step": 1785 + }, + { + "epoch": 0.1446856772521063, + "grad_norm": 0.06061498448252678, + "learning_rate": 7.233697853381937e-05, + "loss": 0.4049, + "step": 1786 + }, + { + "epoch": 0.14476668826960468, + "grad_norm": 0.06993502378463745, + "learning_rate": 7.237748076144189e-05, + "loss": 0.3763, + "step": 1787 + }, + { + "epoch": 0.14484769928710303, + "grad_norm": 0.07220494002103806, + "learning_rate": 7.241798298906441e-05, + "loss": 0.4207, + "step": 1788 + }, + { + "epoch": 0.14492871030460142, + "grad_norm": 0.06628101319074631, + "learning_rate": 7.245848521668693e-05, + "loss": 0.4357, + "step": 1789 + }, + { + "epoch": 0.1450097213220998, + "grad_norm": 0.06409826874732971, + "learning_rate": 7.249898744430943e-05, + "loss": 0.423, + "step": 1790 + }, + { + "epoch": 0.14509073233959818, + "grad_norm": 0.07170279324054718, + "learning_rate": 7.253948967193195e-05, + "loss": 0.4523, + "step": 1791 + }, + { + "epoch": 0.14517174335709657, + "grad_norm": 0.04600429907441139, + "learning_rate": 7.257999189955447e-05, + "loss": 0.3896, + "step": 1792 + }, + { + "epoch": 0.14525275437459495, + "grad_norm": 0.06032882630825043, + "learning_rate": 7.2620494127177e-05, + "loss": 0.3646, + "step": 1793 + }, + { + "epoch": 0.14533376539209333, + "grad_norm": 0.07741162925958633, + "learning_rate": 7.266099635479951e-05, + "loss": 0.3783, + "step": 1794 + }, + { + "epoch": 0.14541477640959172, + "grad_norm": 0.08997105062007904, + "learning_rate": 7.270149858242203e-05, + "loss": 0.4193, + "step": 1795 + }, + { + "epoch": 0.14549578742709007, + "grad_norm": 0.05801474675536156, + "learning_rate": 7.274200081004455e-05, + "loss": 0.3841, + "step": 1796 + }, + { + "epoch": 0.14557679844458846, + "grad_norm": 0.07002715021371841, + "learning_rate": 7.278250303766707e-05, + "loss": 0.4065, + 
"step": 1797 + }, + { + "epoch": 0.14565780946208684, + "grad_norm": 0.05423908308148384, + "learning_rate": 7.282300526528959e-05, + "loss": 0.4072, + "step": 1798 + }, + { + "epoch": 0.14573882047958522, + "grad_norm": 0.050748929381370544, + "learning_rate": 7.286350749291211e-05, + "loss": 0.3967, + "step": 1799 + }, + { + "epoch": 0.1458198314970836, + "grad_norm": 0.05518374964594841, + "learning_rate": 7.290400972053464e-05, + "loss": 0.3719, + "step": 1800 + }, + { + "epoch": 0.145900842514582, + "grad_norm": 0.06590072810649872, + "learning_rate": 7.294451194815715e-05, + "loss": 0.4037, + "step": 1801 + }, + { + "epoch": 0.14598185353208037, + "grad_norm": 0.06755327433347702, + "learning_rate": 7.298501417577967e-05, + "loss": 0.4461, + "step": 1802 + }, + { + "epoch": 0.14606286454957873, + "grad_norm": 0.058261655271053314, + "learning_rate": 7.302551640340219e-05, + "loss": 0.3815, + "step": 1803 + }, + { + "epoch": 0.14614387556707711, + "grad_norm": 0.05535726994276047, + "learning_rate": 7.306601863102471e-05, + "loss": 0.4019, + "step": 1804 + }, + { + "epoch": 0.1462248865845755, + "grad_norm": 0.05837560072541237, + "learning_rate": 7.310652085864723e-05, + "loss": 0.4141, + "step": 1805 + }, + { + "epoch": 0.14630589760207388, + "grad_norm": 0.059157032519578934, + "learning_rate": 7.314702308626975e-05, + "loss": 0.3868, + "step": 1806 + }, + { + "epoch": 0.14638690861957226, + "grad_norm": 0.06888743489980698, + "learning_rate": 7.318752531389227e-05, + "loss": 0.3937, + "step": 1807 + }, + { + "epoch": 0.14646791963707065, + "grad_norm": 0.06422577798366547, + "learning_rate": 7.322802754151479e-05, + "loss": 0.4016, + "step": 1808 + }, + { + "epoch": 0.14654893065456903, + "grad_norm": 0.05708504468202591, + "learning_rate": 7.326852976913731e-05, + "loss": 0.375, + "step": 1809 + }, + { + "epoch": 0.1466299416720674, + "grad_norm": 0.06336402148008347, + "learning_rate": 7.330903199675983e-05, + "loss": 0.4152, + "step": 1810 + }, + { + "epoch": 0.14671095268956577, + "grad_norm": 0.06681369990110397, + "learning_rate": 7.334953422438235e-05, + "loss": 0.3972, + "step": 1811 + }, + { + "epoch": 0.14679196370706415, + "grad_norm": 0.06833192706108093, + "learning_rate": 7.339003645200487e-05, + "loss": 0.4259, + "step": 1812 + }, + { + "epoch": 0.14687297472456254, + "grad_norm": 0.06117531284689903, + "learning_rate": 7.343053867962739e-05, + "loss": 0.4279, + "step": 1813 + }, + { + "epoch": 0.14695398574206092, + "grad_norm": 0.061122827231884, + "learning_rate": 7.347104090724989e-05, + "loss": 0.4019, + "step": 1814 + }, + { + "epoch": 0.1470349967595593, + "grad_norm": 0.06589484959840775, + "learning_rate": 7.351154313487241e-05, + "loss": 0.372, + "step": 1815 + }, + { + "epoch": 0.1471160077770577, + "grad_norm": 0.0601775124669075, + "learning_rate": 7.355204536249493e-05, + "loss": 0.4704, + "step": 1816 + }, + { + "epoch": 0.14719701879455607, + "grad_norm": 0.05938103049993515, + "learning_rate": 7.359254759011745e-05, + "loss": 0.4103, + "step": 1817 + }, + { + "epoch": 0.14727802981205443, + "grad_norm": 0.052586477249860764, + "learning_rate": 7.363304981773997e-05, + "loss": 0.4259, + "step": 1818 + }, + { + "epoch": 0.1473590408295528, + "grad_norm": 0.056762926280498505, + "learning_rate": 7.36735520453625e-05, + "loss": 0.4044, + "step": 1819 + }, + { + "epoch": 0.1474400518470512, + "grad_norm": 0.04729248583316803, + "learning_rate": 7.371405427298502e-05, + "loss": 0.4058, + "step": 1820 + }, + { + "epoch": 0.14752106286454958, + "grad_norm": 
0.05358875170350075, + "learning_rate": 7.375455650060754e-05, + "loss": 0.3731, + "step": 1821 + }, + { + "epoch": 0.14760207388204796, + "grad_norm": 0.05654335767030716, + "learning_rate": 7.379505872823006e-05, + "loss": 0.3702, + "step": 1822 + }, + { + "epoch": 0.14768308489954635, + "grad_norm": 0.055869363248348236, + "learning_rate": 7.383556095585258e-05, + "loss": 0.3869, + "step": 1823 + }, + { + "epoch": 0.14776409591704473, + "grad_norm": 0.05027586966753006, + "learning_rate": 7.38760631834751e-05, + "loss": 0.4121, + "step": 1824 + }, + { + "epoch": 0.14784510693454309, + "grad_norm": 0.056683965027332306, + "learning_rate": 7.391656541109761e-05, + "loss": 0.3917, + "step": 1825 + }, + { + "epoch": 0.14792611795204147, + "grad_norm": 0.05755713954567909, + "learning_rate": 7.395706763872013e-05, + "loss": 0.4402, + "step": 1826 + }, + { + "epoch": 0.14800712896953985, + "grad_norm": 0.060089223086833954, + "learning_rate": 7.399756986634265e-05, + "loss": 0.3635, + "step": 1827 + }, + { + "epoch": 0.14808813998703824, + "grad_norm": 0.060978520661592484, + "learning_rate": 7.403807209396517e-05, + "loss": 0.4458, + "step": 1828 + }, + { + "epoch": 0.14816915100453662, + "grad_norm": 0.06925841420888901, + "learning_rate": 7.407857432158769e-05, + "loss": 0.4283, + "step": 1829 + }, + { + "epoch": 0.148250162022035, + "grad_norm": 0.06608008593320847, + "learning_rate": 7.411907654921021e-05, + "loss": 0.3925, + "step": 1830 + }, + { + "epoch": 0.1483311730395334, + "grad_norm": 0.06768857687711716, + "learning_rate": 7.415957877683273e-05, + "loss": 0.3769, + "step": 1831 + }, + { + "epoch": 0.14841218405703174, + "grad_norm": 0.05477285385131836, + "learning_rate": 7.420008100445525e-05, + "loss": 0.3388, + "step": 1832 + }, + { + "epoch": 0.14849319507453013, + "grad_norm": 0.05085244029760361, + "learning_rate": 7.424058323207777e-05, + "loss": 0.4125, + "step": 1833 + }, + { + "epoch": 0.1485742060920285, + "grad_norm": 0.05243555083870888, + "learning_rate": 7.428108545970029e-05, + "loss": 0.3376, + "step": 1834 + }, + { + "epoch": 0.1486552171095269, + "grad_norm": 0.05747552961111069, + "learning_rate": 7.43215876873228e-05, + "loss": 0.4099, + "step": 1835 + }, + { + "epoch": 0.14873622812702528, + "grad_norm": 0.05261996015906334, + "learning_rate": 7.436208991494533e-05, + "loss": 0.4171, + "step": 1836 + }, + { + "epoch": 0.14881723914452366, + "grad_norm": 0.07032431662082672, + "learning_rate": 7.440259214256785e-05, + "loss": 0.4228, + "step": 1837 + }, + { + "epoch": 0.14889825016202204, + "grad_norm": 0.0585196353495121, + "learning_rate": 7.444309437019036e-05, + "loss": 0.3887, + "step": 1838 + }, + { + "epoch": 0.14897926117952043, + "grad_norm": 0.04458057880401611, + "learning_rate": 7.448359659781288e-05, + "loss": 0.3939, + "step": 1839 + }, + { + "epoch": 0.14906027219701878, + "grad_norm": 0.04972951114177704, + "learning_rate": 7.45240988254354e-05, + "loss": 0.3625, + "step": 1840 + }, + { + "epoch": 0.14914128321451717, + "grad_norm": 0.05577665939927101, + "learning_rate": 7.456460105305792e-05, + "loss": 0.407, + "step": 1841 + }, + { + "epoch": 0.14922229423201555, + "grad_norm": 0.05278479680418968, + "learning_rate": 7.460510328068044e-05, + "loss": 0.4324, + "step": 1842 + }, + { + "epoch": 0.14930330524951393, + "grad_norm": 0.05005854740738869, + "learning_rate": 7.464560550830296e-05, + "loss": 0.4089, + "step": 1843 + }, + { + "epoch": 0.14938431626701232, + "grad_norm": 0.0569162480533123, + "learning_rate": 7.468610773592548e-05, + 
"loss": 0.3852, + "step": 1844 + }, + { + "epoch": 0.1494653272845107, + "grad_norm": 0.04794647544622421, + "learning_rate": 7.4726609963548e-05, + "loss": 0.3865, + "step": 1845 + }, + { + "epoch": 0.14954633830200909, + "grad_norm": 0.04714139923453331, + "learning_rate": 7.476711219117052e-05, + "loss": 0.3557, + "step": 1846 + }, + { + "epoch": 0.14962734931950744, + "grad_norm": 0.0505010262131691, + "learning_rate": 7.480761441879304e-05, + "loss": 0.3681, + "step": 1847 + }, + { + "epoch": 0.14970836033700582, + "grad_norm": 0.04720376804471016, + "learning_rate": 7.484811664641556e-05, + "loss": 0.3652, + "step": 1848 + }, + { + "epoch": 0.1497893713545042, + "grad_norm": 0.06479644775390625, + "learning_rate": 7.488861887403808e-05, + "loss": 0.4114, + "step": 1849 + }, + { + "epoch": 0.1498703823720026, + "grad_norm": 0.056275542825460434, + "learning_rate": 7.492912110166059e-05, + "loss": 0.3694, + "step": 1850 + }, + { + "epoch": 0.14995139338950098, + "grad_norm": 0.06178649142384529, + "learning_rate": 7.496962332928311e-05, + "loss": 0.4472, + "step": 1851 + }, + { + "epoch": 0.15003240440699936, + "grad_norm": 0.06014366075396538, + "learning_rate": 7.501012555690563e-05, + "loss": 0.38, + "step": 1852 + }, + { + "epoch": 0.15011341542449774, + "grad_norm": 0.06340664625167847, + "learning_rate": 7.505062778452815e-05, + "loss": 0.3629, + "step": 1853 + }, + { + "epoch": 0.1501944264419961, + "grad_norm": 0.05327383428812027, + "learning_rate": 7.509113001215067e-05, + "loss": 0.376, + "step": 1854 + }, + { + "epoch": 0.15027543745949448, + "grad_norm": 0.058881551027297974, + "learning_rate": 7.513163223977319e-05, + "loss": 0.3972, + "step": 1855 + }, + { + "epoch": 0.15035644847699287, + "grad_norm": 0.06142482906579971, + "learning_rate": 7.51721344673957e-05, + "loss": 0.4119, + "step": 1856 + }, + { + "epoch": 0.15043745949449125, + "grad_norm": 0.06492027640342712, + "learning_rate": 7.521263669501824e-05, + "loss": 0.4008, + "step": 1857 + }, + { + "epoch": 0.15051847051198963, + "grad_norm": 0.0463237501680851, + "learning_rate": 7.525313892264076e-05, + "loss": 0.3546, + "step": 1858 + }, + { + "epoch": 0.15059948152948802, + "grad_norm": 0.054810382425785065, + "learning_rate": 7.529364115026328e-05, + "loss": 0.4539, + "step": 1859 + }, + { + "epoch": 0.1506804925469864, + "grad_norm": 0.05126664787530899, + "learning_rate": 7.533414337788578e-05, + "loss": 0.3981, + "step": 1860 + }, + { + "epoch": 0.15076150356448478, + "grad_norm": 0.12834858894348145, + "learning_rate": 7.53746456055083e-05, + "loss": 0.4163, + "step": 1861 + }, + { + "epoch": 0.15084251458198314, + "grad_norm": 0.06354403495788574, + "learning_rate": 7.541514783313082e-05, + "loss": 0.3773, + "step": 1862 + }, + { + "epoch": 0.15092352559948152, + "grad_norm": 0.053905289620161057, + "learning_rate": 7.545565006075334e-05, + "loss": 0.4318, + "step": 1863 + }, + { + "epoch": 0.1510045366169799, + "grad_norm": 0.05705507472157478, + "learning_rate": 7.549615228837586e-05, + "loss": 0.3674, + "step": 1864 + }, + { + "epoch": 0.1510855476344783, + "grad_norm": 0.05759746581315994, + "learning_rate": 7.553665451599838e-05, + "loss": 0.3508, + "step": 1865 + }, + { + "epoch": 0.15116655865197667, + "grad_norm": 0.04640937224030495, + "learning_rate": 7.55771567436209e-05, + "loss": 0.4149, + "step": 1866 + }, + { + "epoch": 0.15124756966947506, + "grad_norm": 0.06412570178508759, + "learning_rate": 7.561765897124342e-05, + "loss": 0.4117, + "step": 1867 + }, + { + "epoch": 0.15132858068697344, 
+ "grad_norm": 0.06056508794426918, + "learning_rate": 7.565816119886594e-05, + "loss": 0.3966, + "step": 1868 + }, + { + "epoch": 0.1514095917044718, + "grad_norm": 0.06665674597024918, + "learning_rate": 7.569866342648846e-05, + "loss": 0.4389, + "step": 1869 + }, + { + "epoch": 0.15149060272197018, + "grad_norm": 0.060271285474300385, + "learning_rate": 7.573916565411098e-05, + "loss": 0.4256, + "step": 1870 + }, + { + "epoch": 0.15157161373946856, + "grad_norm": 0.062425002455711365, + "learning_rate": 7.57796678817335e-05, + "loss": 0.3871, + "step": 1871 + }, + { + "epoch": 0.15165262475696695, + "grad_norm": 0.07723111659288406, + "learning_rate": 7.582017010935602e-05, + "loss": 0.4202, + "step": 1872 + }, + { + "epoch": 0.15173363577446533, + "grad_norm": 0.06984084099531174, + "learning_rate": 7.586067233697854e-05, + "loss": 0.4244, + "step": 1873 + }, + { + "epoch": 0.15181464679196371, + "grad_norm": 0.05606261268258095, + "learning_rate": 7.590117456460105e-05, + "loss": 0.3444, + "step": 1874 + }, + { + "epoch": 0.1518956578094621, + "grad_norm": 0.05358389392495155, + "learning_rate": 7.594167679222357e-05, + "loss": 0.3726, + "step": 1875 + }, + { + "epoch": 0.15197666882696045, + "grad_norm": 0.052349768579006195, + "learning_rate": 7.59821790198461e-05, + "loss": 0.394, + "step": 1876 + }, + { + "epoch": 0.15205767984445884, + "grad_norm": 0.05732255429029465, + "learning_rate": 7.602268124746862e-05, + "loss": 0.3717, + "step": 1877 + }, + { + "epoch": 0.15213869086195722, + "grad_norm": 0.05306144058704376, + "learning_rate": 7.606318347509114e-05, + "loss": 0.4004, + "step": 1878 + }, + { + "epoch": 0.1522197018794556, + "grad_norm": 0.05637253820896149, + "learning_rate": 7.610368570271366e-05, + "loss": 0.3899, + "step": 1879 + }, + { + "epoch": 0.152300712896954, + "grad_norm": 0.07494375109672546, + "learning_rate": 7.614418793033618e-05, + "loss": 0.3769, + "step": 1880 + }, + { + "epoch": 0.15238172391445237, + "grad_norm": 0.06573886424303055, + "learning_rate": 7.61846901579587e-05, + "loss": 0.3921, + "step": 1881 + }, + { + "epoch": 0.15246273493195076, + "grad_norm": 0.056376952677965164, + "learning_rate": 7.622519238558122e-05, + "loss": 0.4119, + "step": 1882 + }, + { + "epoch": 0.15254374594944914, + "grad_norm": 0.05358589440584183, + "learning_rate": 7.626569461320374e-05, + "loss": 0.3883, + "step": 1883 + }, + { + "epoch": 0.1526247569669475, + "grad_norm": 0.06232666224241257, + "learning_rate": 7.630619684082626e-05, + "loss": 0.395, + "step": 1884 + }, + { + "epoch": 0.15270576798444588, + "grad_norm": 0.07566624134778976, + "learning_rate": 7.634669906844876e-05, + "loss": 0.3768, + "step": 1885 + }, + { + "epoch": 0.15278677900194426, + "grad_norm": 0.07797633856534958, + "learning_rate": 7.638720129607128e-05, + "loss": 0.357, + "step": 1886 + }, + { + "epoch": 0.15286779001944265, + "grad_norm": 0.04703294858336449, + "learning_rate": 7.64277035236938e-05, + "loss": 0.3907, + "step": 1887 + }, + { + "epoch": 0.15294880103694103, + "grad_norm": 0.07149975746870041, + "learning_rate": 7.646820575131632e-05, + "loss": 0.4132, + "step": 1888 + }, + { + "epoch": 0.1530298120544394, + "grad_norm": 0.05562804639339447, + "learning_rate": 7.650870797893884e-05, + "loss": 0.3588, + "step": 1889 + }, + { + "epoch": 0.1531108230719378, + "grad_norm": 0.0702008605003357, + "learning_rate": 7.654921020656136e-05, + "loss": 0.41, + "step": 1890 + }, + { + "epoch": 0.15319183408943615, + "grad_norm": 0.0702863484621048, + "learning_rate": 
7.658971243418388e-05, + "loss": 0.4284, + "step": 1891 + }, + { + "epoch": 0.15327284510693454, + "grad_norm": 0.11156198382377625, + "learning_rate": 7.66302146618064e-05, + "loss": 0.4401, + "step": 1892 + }, + { + "epoch": 0.15335385612443292, + "grad_norm": 0.06580785661935806, + "learning_rate": 7.667071688942892e-05, + "loss": 0.3719, + "step": 1893 + }, + { + "epoch": 0.1534348671419313, + "grad_norm": 0.058176323771476746, + "learning_rate": 7.671121911705144e-05, + "loss": 0.3352, + "step": 1894 + }, + { + "epoch": 0.1535158781594297, + "grad_norm": 0.05421845242381096, + "learning_rate": 7.675172134467397e-05, + "loss": 0.4187, + "step": 1895 + }, + { + "epoch": 0.15359688917692807, + "grad_norm": 0.04730819910764694, + "learning_rate": 7.679222357229648e-05, + "loss": 0.3854, + "step": 1896 + }, + { + "epoch": 0.15367790019442645, + "grad_norm": 0.05933893471956253, + "learning_rate": 7.6832725799919e-05, + "loss": 0.3833, + "step": 1897 + }, + { + "epoch": 0.1537589112119248, + "grad_norm": 0.058625269681215286, + "learning_rate": 7.687322802754152e-05, + "loss": 0.3672, + "step": 1898 + }, + { + "epoch": 0.1538399222294232, + "grad_norm": 0.05623107776045799, + "learning_rate": 7.691373025516404e-05, + "loss": 0.3706, + "step": 1899 + }, + { + "epoch": 0.15392093324692158, + "grad_norm": 0.06870344281196594, + "learning_rate": 7.695423248278656e-05, + "loss": 0.4005, + "step": 1900 + }, + { + "epoch": 0.15400194426441996, + "grad_norm": 0.06046906113624573, + "learning_rate": 7.699473471040908e-05, + "loss": 0.349, + "step": 1901 + }, + { + "epoch": 0.15408295528191834, + "grad_norm": 0.060264572501182556, + "learning_rate": 7.70352369380316e-05, + "loss": 0.3927, + "step": 1902 + }, + { + "epoch": 0.15416396629941673, + "grad_norm": 0.06525126844644547, + "learning_rate": 7.707573916565412e-05, + "loss": 0.374, + "step": 1903 + }, + { + "epoch": 0.1542449773169151, + "grad_norm": 0.05228757858276367, + "learning_rate": 7.711624139327664e-05, + "loss": 0.4138, + "step": 1904 + }, + { + "epoch": 0.15432598833441347, + "grad_norm": 0.05303167179226875, + "learning_rate": 7.715674362089916e-05, + "loss": 0.399, + "step": 1905 + }, + { + "epoch": 0.15440699935191185, + "grad_norm": 0.06746599823236465, + "learning_rate": 7.719724584852168e-05, + "loss": 0.4202, + "step": 1906 + }, + { + "epoch": 0.15448801036941023, + "grad_norm": 0.0527813620865345, + "learning_rate": 7.72377480761442e-05, + "loss": 0.3833, + "step": 1907 + }, + { + "epoch": 0.15456902138690862, + "grad_norm": 0.07892647385597229, + "learning_rate": 7.727825030376671e-05, + "loss": 0.4008, + "step": 1908 + }, + { + "epoch": 0.154650032404407, + "grad_norm": 0.05395408719778061, + "learning_rate": 7.731875253138922e-05, + "loss": 0.3503, + "step": 1909 + }, + { + "epoch": 0.15473104342190538, + "grad_norm": 0.060144975781440735, + "learning_rate": 7.735925475901174e-05, + "loss": 0.3815, + "step": 1910 + }, + { + "epoch": 0.15481205443940377, + "grad_norm": 0.07419056445360184, + "learning_rate": 7.739975698663426e-05, + "loss": 0.3393, + "step": 1911 + }, + { + "epoch": 0.15489306545690215, + "grad_norm": 0.07367844879627228, + "learning_rate": 7.744025921425678e-05, + "loss": 0.4157, + "step": 1912 + }, + { + "epoch": 0.1549740764744005, + "grad_norm": 0.0585818886756897, + "learning_rate": 7.74807614418793e-05, + "loss": 0.3414, + "step": 1913 + }, + { + "epoch": 0.1550550874918989, + "grad_norm": 0.05288233235478401, + "learning_rate": 7.752126366950183e-05, + "loss": 0.4007, + "step": 1914 + }, + { + 
"epoch": 0.15513609850939727, + "grad_norm": 0.054131995886564255, + "learning_rate": 7.756176589712435e-05, + "loss": 0.4007, + "step": 1915 + }, + { + "epoch": 0.15521710952689566, + "grad_norm": 0.05087273567914963, + "learning_rate": 7.760226812474687e-05, + "loss": 0.4121, + "step": 1916 + }, + { + "epoch": 0.15529812054439404, + "grad_norm": 0.0774221420288086, + "learning_rate": 7.764277035236939e-05, + "loss": 0.4083, + "step": 1917 + }, + { + "epoch": 0.15537913156189243, + "grad_norm": 0.08393000066280365, + "learning_rate": 7.768327257999191e-05, + "loss": 0.4433, + "step": 1918 + }, + { + "epoch": 0.1554601425793908, + "grad_norm": 0.04874301701784134, + "learning_rate": 7.772377480761443e-05, + "loss": 0.4229, + "step": 1919 + }, + { + "epoch": 0.15554115359688916, + "grad_norm": 0.05467551574110985, + "learning_rate": 7.776427703523694e-05, + "loss": 0.4047, + "step": 1920 + }, + { + "epoch": 0.15562216461438755, + "grad_norm": 0.063243567943573, + "learning_rate": 7.780477926285946e-05, + "loss": 0.4126, + "step": 1921 + }, + { + "epoch": 0.15570317563188593, + "grad_norm": 0.05471186712384224, + "learning_rate": 7.784528149048198e-05, + "loss": 0.395, + "step": 1922 + }, + { + "epoch": 0.15578418664938432, + "grad_norm": 0.06775137037038803, + "learning_rate": 7.78857837181045e-05, + "loss": 0.4176, + "step": 1923 + }, + { + "epoch": 0.1558651976668827, + "grad_norm": 0.06087944284081459, + "learning_rate": 7.792628594572702e-05, + "loss": 0.4275, + "step": 1924 + }, + { + "epoch": 0.15594620868438108, + "grad_norm": 0.0486009307205677, + "learning_rate": 7.796678817334954e-05, + "loss": 0.3679, + "step": 1925 + }, + { + "epoch": 0.15602721970187947, + "grad_norm": 0.05688975751399994, + "learning_rate": 7.800729040097206e-05, + "loss": 0.3961, + "step": 1926 + }, + { + "epoch": 0.15610823071937782, + "grad_norm": 0.05588873103260994, + "learning_rate": 7.804779262859457e-05, + "loss": 0.4191, + "step": 1927 + }, + { + "epoch": 0.1561892417368762, + "grad_norm": 0.04377869516611099, + "learning_rate": 7.80882948562171e-05, + "loss": 0.37, + "step": 1928 + }, + { + "epoch": 0.1562702527543746, + "grad_norm": 0.05242636427283287, + "learning_rate": 7.812879708383961e-05, + "loss": 0.4042, + "step": 1929 + }, + { + "epoch": 0.15635126377187297, + "grad_norm": 0.05123385041952133, + "learning_rate": 7.816929931146213e-05, + "loss": 0.4155, + "step": 1930 + }, + { + "epoch": 0.15643227478937136, + "grad_norm": 0.05295446887612343, + "learning_rate": 7.820980153908465e-05, + "loss": 0.3773, + "step": 1931 + }, + { + "epoch": 0.15651328580686974, + "grad_norm": 0.04457692801952362, + "learning_rate": 7.825030376670717e-05, + "loss": 0.4005, + "step": 1932 + }, + { + "epoch": 0.15659429682436812, + "grad_norm": 0.04625699669122696, + "learning_rate": 7.829080599432969e-05, + "loss": 0.34, + "step": 1933 + }, + { + "epoch": 0.1566753078418665, + "grad_norm": 0.049518827348947525, + "learning_rate": 7.833130822195221e-05, + "loss": 0.3716, + "step": 1934 + }, + { + "epoch": 0.15675631885936486, + "grad_norm": 0.06226338446140289, + "learning_rate": 7.837181044957473e-05, + "loss": 0.4163, + "step": 1935 + }, + { + "epoch": 0.15683732987686325, + "grad_norm": 0.05170482024550438, + "learning_rate": 7.841231267719725e-05, + "loss": 0.3994, + "step": 1936 + }, + { + "epoch": 0.15691834089436163, + "grad_norm": 0.06363697350025177, + "learning_rate": 7.845281490481977e-05, + "loss": 0.4003, + "step": 1937 + }, + { + "epoch": 0.15699935191186, + "grad_norm": 0.05502576008439064, + 
"learning_rate": 7.849331713244229e-05, + "loss": 0.4276, + "step": 1938 + }, + { + "epoch": 0.1570803629293584, + "grad_norm": 0.05680760741233826, + "learning_rate": 7.853381936006481e-05, + "loss": 0.4005, + "step": 1939 + }, + { + "epoch": 0.15716137394685678, + "grad_norm": 0.05502947419881821, + "learning_rate": 7.857432158768733e-05, + "loss": 0.378, + "step": 1940 + }, + { + "epoch": 0.15724238496435516, + "grad_norm": 0.04791278392076492, + "learning_rate": 7.861482381530985e-05, + "loss": 0.4095, + "step": 1941 + }, + { + "epoch": 0.15732339598185352, + "grad_norm": 0.05494358763098717, + "learning_rate": 7.865532604293237e-05, + "loss": 0.4119, + "step": 1942 + }, + { + "epoch": 0.1574044069993519, + "grad_norm": 0.061576370149850845, + "learning_rate": 7.869582827055489e-05, + "loss": 0.4125, + "step": 1943 + }, + { + "epoch": 0.1574854180168503, + "grad_norm": 0.05027461051940918, + "learning_rate": 7.87363304981774e-05, + "loss": 0.3575, + "step": 1944 + }, + { + "epoch": 0.15756642903434867, + "grad_norm": 0.07733383774757385, + "learning_rate": 7.877683272579992e-05, + "loss": 0.4461, + "step": 1945 + }, + { + "epoch": 0.15764744005184705, + "grad_norm": 0.06613076478242874, + "learning_rate": 7.881733495342243e-05, + "loss": 0.4225, + "step": 1946 + }, + { + "epoch": 0.15772845106934544, + "grad_norm": 0.05832867696881294, + "learning_rate": 7.885783718104495e-05, + "loss": 0.3247, + "step": 1947 + }, + { + "epoch": 0.15780946208684382, + "grad_norm": 0.04764074459671974, + "learning_rate": 7.889833940866747e-05, + "loss": 0.37, + "step": 1948 + }, + { + "epoch": 0.15789047310434218, + "grad_norm": 0.05749349296092987, + "learning_rate": 7.893884163629e-05, + "loss": 0.3437, + "step": 1949 + }, + { + "epoch": 0.15797148412184056, + "grad_norm": 0.05542978271842003, + "learning_rate": 7.897934386391251e-05, + "loss": 0.4011, + "step": 1950 + }, + { + "epoch": 0.15805249513933894, + "grad_norm": 0.043319717049598694, + "learning_rate": 7.901984609153505e-05, + "loss": 0.403, + "step": 1951 + }, + { + "epoch": 0.15813350615683733, + "grad_norm": 0.06057552993297577, + "learning_rate": 7.906034831915757e-05, + "loss": 0.4068, + "step": 1952 + }, + { + "epoch": 0.1582145171743357, + "grad_norm": 0.06298187375068665, + "learning_rate": 7.910085054678009e-05, + "loss": 0.4251, + "step": 1953 + }, + { + "epoch": 0.1582955281918341, + "grad_norm": 0.06181003153324127, + "learning_rate": 7.91413527744026e-05, + "loss": 0.3637, + "step": 1954 + }, + { + "epoch": 0.15837653920933248, + "grad_norm": 0.05909266695380211, + "learning_rate": 7.918185500202511e-05, + "loss": 0.4113, + "step": 1955 + }, + { + "epoch": 0.15845755022683086, + "grad_norm": 0.055890344083309174, + "learning_rate": 7.922235722964763e-05, + "loss": 0.3859, + "step": 1956 + }, + { + "epoch": 0.15853856124432922, + "grad_norm": 0.055468108505010605, + "learning_rate": 7.926285945727015e-05, + "loss": 0.3716, + "step": 1957 + }, + { + "epoch": 0.1586195722618276, + "grad_norm": 0.06345760822296143, + "learning_rate": 7.930336168489267e-05, + "loss": 0.376, + "step": 1958 + }, + { + "epoch": 0.15870058327932599, + "grad_norm": 0.08457563072443008, + "learning_rate": 7.934386391251519e-05, + "loss": 0.385, + "step": 1959 + }, + { + "epoch": 0.15878159429682437, + "grad_norm": 0.06403377652168274, + "learning_rate": 7.938436614013771e-05, + "loss": 0.3838, + "step": 1960 + }, + { + "epoch": 0.15886260531432275, + "grad_norm": 0.06183291971683502, + "learning_rate": 7.942486836776023e-05, + "loss": 0.3982, + "step": 
1961 + }, + { + "epoch": 0.15894361633182114, + "grad_norm": 0.0550561286509037, + "learning_rate": 7.946537059538275e-05, + "loss": 0.3767, + "step": 1962 + }, + { + "epoch": 0.15902462734931952, + "grad_norm": 0.058353181928396225, + "learning_rate": 7.950587282300527e-05, + "loss": 0.3592, + "step": 1963 + }, + { + "epoch": 0.15910563836681788, + "grad_norm": 0.06605188548564911, + "learning_rate": 7.954637505062779e-05, + "loss": 0.4754, + "step": 1964 + }, + { + "epoch": 0.15918664938431626, + "grad_norm": 0.05959831550717354, + "learning_rate": 7.958687727825031e-05, + "loss": 0.4033, + "step": 1965 + }, + { + "epoch": 0.15926766040181464, + "grad_norm": 0.0478963702917099, + "learning_rate": 7.962737950587283e-05, + "loss": 0.3678, + "step": 1966 + }, + { + "epoch": 0.15934867141931303, + "grad_norm": 0.058596860617399216, + "learning_rate": 7.966788173349535e-05, + "loss": 0.3687, + "step": 1967 + }, + { + "epoch": 0.1594296824368114, + "grad_norm": 0.0579354427754879, + "learning_rate": 7.970838396111785e-05, + "loss": 0.39, + "step": 1968 + }, + { + "epoch": 0.1595106934543098, + "grad_norm": 0.06575972586870193, + "learning_rate": 7.974888618874037e-05, + "loss": 0.4387, + "step": 1969 + }, + { + "epoch": 0.15959170447180818, + "grad_norm": 0.06601779907941818, + "learning_rate": 7.978938841636291e-05, + "loss": 0.3899, + "step": 1970 + }, + { + "epoch": 0.15967271548930653, + "grad_norm": 0.08127705752849579, + "learning_rate": 7.982989064398543e-05, + "loss": 0.4376, + "step": 1971 + }, + { + "epoch": 0.15975372650680492, + "grad_norm": 0.05637722462415695, + "learning_rate": 7.987039287160795e-05, + "loss": 0.3444, + "step": 1972 + }, + { + "epoch": 0.1598347375243033, + "grad_norm": 0.04732414335012436, + "learning_rate": 7.991089509923047e-05, + "loss": 0.3893, + "step": 1973 + }, + { + "epoch": 0.15991574854180168, + "grad_norm": 0.056384116411209106, + "learning_rate": 7.995139732685299e-05, + "loss": 0.3915, + "step": 1974 + }, + { + "epoch": 0.15999675955930007, + "grad_norm": 0.059659332036972046, + "learning_rate": 7.99918995544755e-05, + "loss": 0.4129, + "step": 1975 + }, + { + "epoch": 0.16007777057679845, + "grad_norm": 0.0602417029440403, + "learning_rate": 8.003240178209802e-05, + "loss": 0.3395, + "step": 1976 + }, + { + "epoch": 0.16015878159429683, + "grad_norm": 0.04651299864053726, + "learning_rate": 8.007290400972054e-05, + "loss": 0.367, + "step": 1977 + }, + { + "epoch": 0.16023979261179522, + "grad_norm": 0.05572040379047394, + "learning_rate": 8.011340623734306e-05, + "loss": 0.4314, + "step": 1978 + }, + { + "epoch": 0.16032080362929357, + "grad_norm": 0.06522223353385925, + "learning_rate": 8.015390846496557e-05, + "loss": 0.3829, + "step": 1979 + }, + { + "epoch": 0.16040181464679196, + "grad_norm": 0.07616066187620163, + "learning_rate": 8.019441069258809e-05, + "loss": 0.4696, + "step": 1980 + }, + { + "epoch": 0.16048282566429034, + "grad_norm": 0.04531807824969292, + "learning_rate": 8.023491292021061e-05, + "loss": 0.3622, + "step": 1981 + }, + { + "epoch": 0.16056383668178872, + "grad_norm": 0.05485687404870987, + "learning_rate": 8.027541514783313e-05, + "loss": 0.4081, + "step": 1982 + }, + { + "epoch": 0.1606448476992871, + "grad_norm": 0.05889255553483963, + "learning_rate": 8.031591737545565e-05, + "loss": 0.3924, + "step": 1983 + }, + { + "epoch": 0.1607258587167855, + "grad_norm": 0.0493583045899868, + "learning_rate": 8.035641960307817e-05, + "loss": 0.37, + "step": 1984 + }, + { + "epoch": 0.16080686973428387, + "grad_norm": 
0.04861397668719292, + "learning_rate": 8.039692183070069e-05, + "loss": 0.3602, + "step": 1985 + }, + { + "epoch": 0.16088788075178223, + "grad_norm": 0.04801037162542343, + "learning_rate": 8.043742405832321e-05, + "loss": 0.3762, + "step": 1986 + }, + { + "epoch": 0.16096889176928061, + "grad_norm": 0.055991508066654205, + "learning_rate": 8.047792628594573e-05, + "loss": 0.4076, + "step": 1987 + }, + { + "epoch": 0.161049902786779, + "grad_norm": 0.06045025214552879, + "learning_rate": 8.051842851356825e-05, + "loss": 0.4172, + "step": 1988 + }, + { + "epoch": 0.16113091380427738, + "grad_norm": 0.04982104152441025, + "learning_rate": 8.055893074119078e-05, + "loss": 0.3778, + "step": 1989 + }, + { + "epoch": 0.16121192482177576, + "grad_norm": 0.0629178062081337, + "learning_rate": 8.059943296881329e-05, + "loss": 0.4051, + "step": 1990 + }, + { + "epoch": 0.16129293583927415, + "grad_norm": 0.05481969192624092, + "learning_rate": 8.06399351964358e-05, + "loss": 0.3788, + "step": 1991 + }, + { + "epoch": 0.16137394685677253, + "grad_norm": 0.07303039729595184, + "learning_rate": 8.068043742405833e-05, + "loss": 0.3684, + "step": 1992 + }, + { + "epoch": 0.1614549578742709, + "grad_norm": 0.0633680671453476, + "learning_rate": 8.072093965168085e-05, + "loss": 0.356, + "step": 1993 + }, + { + "epoch": 0.16153596889176927, + "grad_norm": 0.04789692908525467, + "learning_rate": 8.076144187930337e-05, + "loss": 0.4002, + "step": 1994 + }, + { + "epoch": 0.16161697990926766, + "grad_norm": 0.05857124179601669, + "learning_rate": 8.080194410692589e-05, + "loss": 0.3854, + "step": 1995 + }, + { + "epoch": 0.16169799092676604, + "grad_norm": 0.04986008629202843, + "learning_rate": 8.08424463345484e-05, + "loss": 0.3633, + "step": 1996 + }, + { + "epoch": 0.16177900194426442, + "grad_norm": 0.055833835154771805, + "learning_rate": 8.088294856217092e-05, + "loss": 0.3494, + "step": 1997 + }, + { + "epoch": 0.1618600129617628, + "grad_norm": 0.057907700538635254, + "learning_rate": 8.092345078979344e-05, + "loss": 0.4351, + "step": 1998 + }, + { + "epoch": 0.1619410239792612, + "grad_norm": 0.06546240299940109, + "learning_rate": 8.096395301741596e-05, + "loss": 0.4461, + "step": 1999 + }, + { + "epoch": 0.16202203499675957, + "grad_norm": 0.052776411175727844, + "learning_rate": 8.100445524503848e-05, + "loss": 0.3789, + "step": 2000 + }, + { + "epoch": 0.16210304601425793, + "grad_norm": 0.05560920014977455, + "learning_rate": 8.1044957472661e-05, + "loss": 0.4568, + "step": 2001 + }, + { + "epoch": 0.1621840570317563, + "grad_norm": 0.06929999589920044, + "learning_rate": 8.108545970028352e-05, + "loss": 0.3983, + "step": 2002 + }, + { + "epoch": 0.1622650680492547, + "grad_norm": 0.051275137811899185, + "learning_rate": 8.112596192790603e-05, + "loss": 0.4339, + "step": 2003 + }, + { + "epoch": 0.16234607906675308, + "grad_norm": 0.061996664851903915, + "learning_rate": 8.116646415552855e-05, + "loss": 0.3624, + "step": 2004 + }, + { + "epoch": 0.16242709008425146, + "grad_norm": 0.06525516510009766, + "learning_rate": 8.120696638315107e-05, + "loss": 0.3954, + "step": 2005 + }, + { + "epoch": 0.16250810110174985, + "grad_norm": 0.04934125021100044, + "learning_rate": 8.124746861077359e-05, + "loss": 0.3958, + "step": 2006 + }, + { + "epoch": 0.16258911211924823, + "grad_norm": 0.05546298250555992, + "learning_rate": 8.128797083839611e-05, + "loss": 0.3855, + "step": 2007 + }, + { + "epoch": 0.16267012313674659, + "grad_norm": 0.057532209903001785, + "learning_rate": 8.132847306601864e-05, + 
"loss": 0.3938, + "step": 2008 + }, + { + "epoch": 0.16275113415424497, + "grad_norm": 0.06330697983503342, + "learning_rate": 8.136897529364116e-05, + "loss": 0.3873, + "step": 2009 + }, + { + "epoch": 0.16283214517174335, + "grad_norm": 0.06267868727445602, + "learning_rate": 8.140947752126368e-05, + "loss": 0.4103, + "step": 2010 + }, + { + "epoch": 0.16291315618924174, + "grad_norm": 0.06294619292020798, + "learning_rate": 8.14499797488862e-05, + "loss": 0.4097, + "step": 2011 + }, + { + "epoch": 0.16299416720674012, + "grad_norm": 0.046896860003471375, + "learning_rate": 8.149048197650872e-05, + "loss": 0.3275, + "step": 2012 + }, + { + "epoch": 0.1630751782242385, + "grad_norm": 0.05417289212346077, + "learning_rate": 8.153098420413124e-05, + "loss": 0.3654, + "step": 2013 + }, + { + "epoch": 0.1631561892417369, + "grad_norm": 0.06072888895869255, + "learning_rate": 8.157148643175375e-05, + "loss": 0.3878, + "step": 2014 + }, + { + "epoch": 0.16323720025923524, + "grad_norm": 0.08595926314592361, + "learning_rate": 8.161198865937626e-05, + "loss": 0.4025, + "step": 2015 + }, + { + "epoch": 0.16331821127673363, + "grad_norm": 0.061273351311683655, + "learning_rate": 8.165249088699878e-05, + "loss": 0.386, + "step": 2016 + }, + { + "epoch": 0.163399222294232, + "grad_norm": 0.05186104774475098, + "learning_rate": 8.16929931146213e-05, + "loss": 0.3795, + "step": 2017 + }, + { + "epoch": 0.1634802333117304, + "grad_norm": 0.053132180124521255, + "learning_rate": 8.173349534224382e-05, + "loss": 0.387, + "step": 2018 + }, + { + "epoch": 0.16356124432922878, + "grad_norm": 0.07236981391906738, + "learning_rate": 8.177399756986634e-05, + "loss": 0.3965, + "step": 2019 + }, + { + "epoch": 0.16364225534672716, + "grad_norm": 0.0509367398917675, + "learning_rate": 8.181449979748886e-05, + "loss": 0.3846, + "step": 2020 + }, + { + "epoch": 0.16372326636422554, + "grad_norm": 0.07419263571500778, + "learning_rate": 8.185500202511138e-05, + "loss": 0.4159, + "step": 2021 + }, + { + "epoch": 0.1638042773817239, + "grad_norm": 0.05094772204756737, + "learning_rate": 8.18955042527339e-05, + "loss": 0.316, + "step": 2022 + }, + { + "epoch": 0.16388528839922228, + "grad_norm": 0.054001349955797195, + "learning_rate": 8.193600648035642e-05, + "loss": 0.3751, + "step": 2023 + }, + { + "epoch": 0.16396629941672067, + "grad_norm": 0.047819335013628006, + "learning_rate": 8.197650870797894e-05, + "loss": 0.3919, + "step": 2024 + }, + { + "epoch": 0.16404731043421905, + "grad_norm": 0.056271765381097794, + "learning_rate": 8.201701093560146e-05, + "loss": 0.4111, + "step": 2025 + }, + { + "epoch": 0.16412832145171743, + "grad_norm": 0.04949527233839035, + "learning_rate": 8.205751316322398e-05, + "loss": 0.4071, + "step": 2026 + }, + { + "epoch": 0.16420933246921582, + "grad_norm": 0.05989866331219673, + "learning_rate": 8.20980153908465e-05, + "loss": 0.3947, + "step": 2027 + }, + { + "epoch": 0.1642903434867142, + "grad_norm": 0.0578424446284771, + "learning_rate": 8.213851761846902e-05, + "loss": 0.4411, + "step": 2028 + }, + { + "epoch": 0.16437135450421259, + "grad_norm": 0.05793230980634689, + "learning_rate": 8.217901984609154e-05, + "loss": 0.3714, + "step": 2029 + }, + { + "epoch": 0.16445236552171094, + "grad_norm": 0.05304427817463875, + "learning_rate": 8.221952207371406e-05, + "loss": 0.3843, + "step": 2030 + }, + { + "epoch": 0.16453337653920933, + "grad_norm": 0.050228215754032135, + "learning_rate": 8.226002430133658e-05, + "loss": 0.4134, + "step": 2031 + }, + { + "epoch": 
0.1646143875567077, + "grad_norm": 0.053763143718242645, + "learning_rate": 8.23005265289591e-05, + "loss": 0.3539, + "step": 2032 + }, + { + "epoch": 0.1646953985742061, + "grad_norm": 0.06726308166980743, + "learning_rate": 8.234102875658162e-05, + "loss": 0.4218, + "step": 2033 + }, + { + "epoch": 0.16477640959170448, + "grad_norm": 0.06924933940172195, + "learning_rate": 8.238153098420414e-05, + "loss": 0.3621, + "step": 2034 + }, + { + "epoch": 0.16485742060920286, + "grad_norm": 0.057229988276958466, + "learning_rate": 8.242203321182666e-05, + "loss": 0.3866, + "step": 2035 + }, + { + "epoch": 0.16493843162670124, + "grad_norm": 0.09982109814882278, + "learning_rate": 8.246253543944918e-05, + "loss": 0.4047, + "step": 2036 + }, + { + "epoch": 0.1650194426441996, + "grad_norm": 0.07248137146234512, + "learning_rate": 8.25030376670717e-05, + "loss": 0.458, + "step": 2037 + }, + { + "epoch": 0.16510045366169798, + "grad_norm": 0.05413713678717613, + "learning_rate": 8.254353989469422e-05, + "loss": 0.3775, + "step": 2038 + }, + { + "epoch": 0.16518146467919637, + "grad_norm": 0.058229487389326096, + "learning_rate": 8.258404212231672e-05, + "loss": 0.371, + "step": 2039 + }, + { + "epoch": 0.16526247569669475, + "grad_norm": 0.06540478765964508, + "learning_rate": 8.262454434993924e-05, + "loss": 0.4214, + "step": 2040 + }, + { + "epoch": 0.16534348671419313, + "grad_norm": 0.050467513501644135, + "learning_rate": 8.266504657756176e-05, + "loss": 0.3416, + "step": 2041 + }, + { + "epoch": 0.16542449773169152, + "grad_norm": 0.11017223447561264, + "learning_rate": 8.270554880518428e-05, + "loss": 0.3417, + "step": 2042 + }, + { + "epoch": 0.1655055087491899, + "grad_norm": 0.061050888150930405, + "learning_rate": 8.27460510328068e-05, + "loss": 0.3726, + "step": 2043 + }, + { + "epoch": 0.16558651976668826, + "grad_norm": 0.058097269386053085, + "learning_rate": 8.278655326042932e-05, + "loss": 0.4059, + "step": 2044 + }, + { + "epoch": 0.16566753078418664, + "grad_norm": 0.0401441790163517, + "learning_rate": 8.282705548805186e-05, + "loss": 0.3885, + "step": 2045 + }, + { + "epoch": 0.16574854180168502, + "grad_norm": 0.05544157698750496, + "learning_rate": 8.286755771567437e-05, + "loss": 0.3818, + "step": 2046 + }, + { + "epoch": 0.1658295528191834, + "grad_norm": 0.06059475243091583, + "learning_rate": 8.29080599432969e-05, + "loss": 0.3719, + "step": 2047 + }, + { + "epoch": 0.1659105638366818, + "grad_norm": 0.05821231007575989, + "learning_rate": 8.294856217091941e-05, + "loss": 0.3742, + "step": 2048 + }, + { + "epoch": 0.16599157485418017, + "grad_norm": 0.0714845061302185, + "learning_rate": 8.298906439854193e-05, + "loss": 0.467, + "step": 2049 + }, + { + "epoch": 0.16607258587167856, + "grad_norm": 0.06715501844882965, + "learning_rate": 8.302956662616444e-05, + "loss": 0.4076, + "step": 2050 + }, + { + "epoch": 0.16615359688917694, + "grad_norm": 0.07473434507846832, + "learning_rate": 8.307006885378696e-05, + "loss": 0.4136, + "step": 2051 + }, + { + "epoch": 0.1662346079066753, + "grad_norm": 0.0510735958814621, + "learning_rate": 8.311057108140948e-05, + "loss": 0.3626, + "step": 2052 + }, + { + "epoch": 0.16631561892417368, + "grad_norm": 0.046676717698574066, + "learning_rate": 8.3151073309032e-05, + "loss": 0.4257, + "step": 2053 + }, + { + "epoch": 0.16639662994167206, + "grad_norm": 0.04868389293551445, + "learning_rate": 8.319157553665452e-05, + "loss": 0.3741, + "step": 2054 + }, + { + "epoch": 0.16647764095917045, + "grad_norm": 0.051401641219854355, + 
"learning_rate": 8.323207776427704e-05, + "loss": 0.3741, + "step": 2055 + }, + { + "epoch": 0.16655865197666883, + "grad_norm": 0.04849712923169136, + "learning_rate": 8.327257999189956e-05, + "loss": 0.4219, + "step": 2056 + }, + { + "epoch": 0.16663966299416721, + "grad_norm": 0.04904567822813988, + "learning_rate": 8.331308221952208e-05, + "loss": 0.3797, + "step": 2057 + }, + { + "epoch": 0.1667206740116656, + "grad_norm": 0.06897692382335663, + "learning_rate": 8.33535844471446e-05, + "loss": 0.3907, + "step": 2058 + }, + { + "epoch": 0.16680168502916395, + "grad_norm": 0.057387739419937134, + "learning_rate": 8.339408667476712e-05, + "loss": 0.364, + "step": 2059 + }, + { + "epoch": 0.16688269604666234, + "grad_norm": 0.056419409811496735, + "learning_rate": 8.343458890238964e-05, + "loss": 0.3966, + "step": 2060 + }, + { + "epoch": 0.16696370706416072, + "grad_norm": 0.05423840880393982, + "learning_rate": 8.347509113001216e-05, + "loss": 0.378, + "step": 2061 + }, + { + "epoch": 0.1670447180816591, + "grad_norm": 0.05476713925600052, + "learning_rate": 8.351559335763468e-05, + "loss": 0.3787, + "step": 2062 + }, + { + "epoch": 0.1671257290991575, + "grad_norm": 0.0587175115942955, + "learning_rate": 8.355609558525718e-05, + "loss": 0.3939, + "step": 2063 + }, + { + "epoch": 0.16720674011665587, + "grad_norm": 0.059848107397556305, + "learning_rate": 8.359659781287972e-05, + "loss": 0.386, + "step": 2064 + }, + { + "epoch": 0.16728775113415426, + "grad_norm": 0.05978243052959442, + "learning_rate": 8.363710004050223e-05, + "loss": 0.3865, + "step": 2065 + }, + { + "epoch": 0.1673687621516526, + "grad_norm": 0.052778005599975586, + "learning_rate": 8.367760226812475e-05, + "loss": 0.379, + "step": 2066 + }, + { + "epoch": 0.167449773169151, + "grad_norm": 0.05640120059251785, + "learning_rate": 8.371810449574727e-05, + "loss": 0.4059, + "step": 2067 + }, + { + "epoch": 0.16753078418664938, + "grad_norm": 0.06634902209043503, + "learning_rate": 8.37586067233698e-05, + "loss": 0.3989, + "step": 2068 + }, + { + "epoch": 0.16761179520414776, + "grad_norm": 0.06636673957109451, + "learning_rate": 8.379910895099231e-05, + "loss": 0.4047, + "step": 2069 + }, + { + "epoch": 0.16769280622164615, + "grad_norm": 0.060244470834732056, + "learning_rate": 8.383961117861483e-05, + "loss": 0.3811, + "step": 2070 + }, + { + "epoch": 0.16777381723914453, + "grad_norm": 0.05000202730298042, + "learning_rate": 8.388011340623735e-05, + "loss": 0.4119, + "step": 2071 + }, + { + "epoch": 0.1678548282566429, + "grad_norm": 0.05411381646990776, + "learning_rate": 8.392061563385987e-05, + "loss": 0.3634, + "step": 2072 + }, + { + "epoch": 0.1679358392741413, + "grad_norm": 0.046290017664432526, + "learning_rate": 8.396111786148239e-05, + "loss": 0.4702, + "step": 2073 + }, + { + "epoch": 0.16801685029163965, + "grad_norm": 0.05964218080043793, + "learning_rate": 8.40016200891049e-05, + "loss": 0.4264, + "step": 2074 + }, + { + "epoch": 0.16809786130913804, + "grad_norm": 0.06505851447582245, + "learning_rate": 8.404212231672742e-05, + "loss": 0.388, + "step": 2075 + }, + { + "epoch": 0.16817887232663642, + "grad_norm": 0.07671922445297241, + "learning_rate": 8.408262454434994e-05, + "loss": 0.4455, + "step": 2076 + }, + { + "epoch": 0.1682598833441348, + "grad_norm": 0.0595112070441246, + "learning_rate": 8.412312677197246e-05, + "loss": 0.3761, + "step": 2077 + }, + { + "epoch": 0.1683408943616332, + "grad_norm": 0.055115941911935806, + "learning_rate": 8.416362899959498e-05, + "loss": 0.3811, + "step": 
2078 + }, + { + "epoch": 0.16842190537913157, + "grad_norm": 0.04717046394944191, + "learning_rate": 8.42041312272175e-05, + "loss": 0.353, + "step": 2079 + }, + { + "epoch": 0.16850291639662995, + "grad_norm": 0.0552898645401001, + "learning_rate": 8.424463345484002e-05, + "loss": 0.3661, + "step": 2080 + }, + { + "epoch": 0.1685839274141283, + "grad_norm": 0.06371273100376129, + "learning_rate": 8.428513568246254e-05, + "loss": 0.4038, + "step": 2081 + }, + { + "epoch": 0.1686649384316267, + "grad_norm": 0.06050824373960495, + "learning_rate": 8.432563791008506e-05, + "loss": 0.3478, + "step": 2082 + }, + { + "epoch": 0.16874594944912508, + "grad_norm": 0.05126861855387688, + "learning_rate": 8.436614013770759e-05, + "loss": 0.4055, + "step": 2083 + }, + { + "epoch": 0.16882696046662346, + "grad_norm": 0.05001140385866165, + "learning_rate": 8.440664236533011e-05, + "loss": 0.4168, + "step": 2084 + }, + { + "epoch": 0.16890797148412184, + "grad_norm": 0.07154504209756851, + "learning_rate": 8.444714459295261e-05, + "loss": 0.3987, + "step": 2085 + }, + { + "epoch": 0.16898898250162023, + "grad_norm": 0.06982825696468353, + "learning_rate": 8.448764682057513e-05, + "loss": 0.4167, + "step": 2086 + }, + { + "epoch": 0.1690699935191186, + "grad_norm": 0.051283951848745346, + "learning_rate": 8.452814904819765e-05, + "loss": 0.3803, + "step": 2087 + }, + { + "epoch": 0.16915100453661697, + "grad_norm": 0.054312046617269516, + "learning_rate": 8.456865127582017e-05, + "loss": 0.408, + "step": 2088 + }, + { + "epoch": 0.16923201555411535, + "grad_norm": 0.061378683894872665, + "learning_rate": 8.46091535034427e-05, + "loss": 0.4033, + "step": 2089 + }, + { + "epoch": 0.16931302657161373, + "grad_norm": 0.0683789774775505, + "learning_rate": 8.464965573106521e-05, + "loss": 0.4117, + "step": 2090 + }, + { + "epoch": 0.16939403758911212, + "grad_norm": 0.06293641030788422, + "learning_rate": 8.469015795868773e-05, + "loss": 0.3856, + "step": 2091 + }, + { + "epoch": 0.1694750486066105, + "grad_norm": 0.060117315500974655, + "learning_rate": 8.473066018631025e-05, + "loss": 0.453, + "step": 2092 + }, + { + "epoch": 0.16955605962410888, + "grad_norm": 0.05961222946643829, + "learning_rate": 8.477116241393277e-05, + "loss": 0.4213, + "step": 2093 + }, + { + "epoch": 0.16963707064160727, + "grad_norm": 0.05129764601588249, + "learning_rate": 8.481166464155529e-05, + "loss": 0.3548, + "step": 2094 + }, + { + "epoch": 0.16971808165910565, + "grad_norm": 0.06676856428384781, + "learning_rate": 8.485216686917781e-05, + "loss": 0.425, + "step": 2095 + }, + { + "epoch": 0.169799092676604, + "grad_norm": 0.05692289024591446, + "learning_rate": 8.489266909680033e-05, + "loss": 0.3769, + "step": 2096 + }, + { + "epoch": 0.1698801036941024, + "grad_norm": 0.056856393814086914, + "learning_rate": 8.493317132442285e-05, + "loss": 0.3897, + "step": 2097 + }, + { + "epoch": 0.16996111471160077, + "grad_norm": 0.05395682156085968, + "learning_rate": 8.497367355204536e-05, + "loss": 0.3646, + "step": 2098 + }, + { + "epoch": 0.17004212572909916, + "grad_norm": 0.051694948226213455, + "learning_rate": 8.501417577966788e-05, + "loss": 0.3813, + "step": 2099 + }, + { + "epoch": 0.17012313674659754, + "grad_norm": 0.05395714193582535, + "learning_rate": 8.50546780072904e-05, + "loss": 0.38, + "step": 2100 + }, + { + "epoch": 0.17020414776409593, + "grad_norm": 0.061914216727018356, + "learning_rate": 8.509518023491292e-05, + "loss": 0.3776, + "step": 2101 + }, + { + "epoch": 0.1702851587815943, + "grad_norm": 
0.05741438642144203, + "learning_rate": 8.513568246253545e-05, + "loss": 0.3861, + "step": 2102 + }, + { + "epoch": 0.17036616979909266, + "grad_norm": 0.0539533868432045, + "learning_rate": 8.517618469015797e-05, + "loss": 0.3972, + "step": 2103 + }, + { + "epoch": 0.17044718081659105, + "grad_norm": 0.06513522565364838, + "learning_rate": 8.521668691778049e-05, + "loss": 0.3707, + "step": 2104 + }, + { + "epoch": 0.17052819183408943, + "grad_norm": 0.04477247595787048, + "learning_rate": 8.525718914540301e-05, + "loss": 0.3624, + "step": 2105 + }, + { + "epoch": 0.17060920285158782, + "grad_norm": 0.058546360582113266, + "learning_rate": 8.529769137302553e-05, + "loss": 0.3551, + "step": 2106 + }, + { + "epoch": 0.1706902138690862, + "grad_norm": 0.08391832560300827, + "learning_rate": 8.533819360064805e-05, + "loss": 0.4142, + "step": 2107 + }, + { + "epoch": 0.17077122488658458, + "grad_norm": 0.0546516515314579, + "learning_rate": 8.537869582827057e-05, + "loss": 0.3747, + "step": 2108 + }, + { + "epoch": 0.17085223590408297, + "grad_norm": 0.06025971099734306, + "learning_rate": 8.541919805589307e-05, + "loss": 0.4133, + "step": 2109 + }, + { + "epoch": 0.17093324692158132, + "grad_norm": 0.05930865556001663, + "learning_rate": 8.545970028351559e-05, + "loss": 0.3902, + "step": 2110 + }, + { + "epoch": 0.1710142579390797, + "grad_norm": 0.047234680503606796, + "learning_rate": 8.550020251113811e-05, + "loss": 0.3234, + "step": 2111 + }, + { + "epoch": 0.1710952689565781, + "grad_norm": 0.05951959267258644, + "learning_rate": 8.554070473876063e-05, + "loss": 0.3898, + "step": 2112 + }, + { + "epoch": 0.17117627997407647, + "grad_norm": 0.04036698862910271, + "learning_rate": 8.558120696638315e-05, + "loss": 0.3518, + "step": 2113 + }, + { + "epoch": 0.17125729099157486, + "grad_norm": 0.05052487924695015, + "learning_rate": 8.562170919400567e-05, + "loss": 0.4164, + "step": 2114 + }, + { + "epoch": 0.17133830200907324, + "grad_norm": 0.06837450712919235, + "learning_rate": 8.566221142162819e-05, + "loss": 0.4064, + "step": 2115 + }, + { + "epoch": 0.17141931302657162, + "grad_norm": 0.05384335294365883, + "learning_rate": 8.570271364925071e-05, + "loss": 0.3549, + "step": 2116 + }, + { + "epoch": 0.17150032404407, + "grad_norm": 0.06057494506239891, + "learning_rate": 8.574321587687323e-05, + "loss": 0.4043, + "step": 2117 + }, + { + "epoch": 0.17158133506156836, + "grad_norm": 0.06762439012527466, + "learning_rate": 8.578371810449575e-05, + "loss": 0.3724, + "step": 2118 + }, + { + "epoch": 0.17166234607906675, + "grad_norm": 0.05397611856460571, + "learning_rate": 8.582422033211827e-05, + "loss": 0.4143, + "step": 2119 + }, + { + "epoch": 0.17174335709656513, + "grad_norm": 0.051856379956007004, + "learning_rate": 8.586472255974079e-05, + "loss": 0.4403, + "step": 2120 + }, + { + "epoch": 0.1718243681140635, + "grad_norm": 0.052403032779693604, + "learning_rate": 8.590522478736331e-05, + "loss": 0.3789, + "step": 2121 + }, + { + "epoch": 0.1719053791315619, + "grad_norm": 0.07184526324272156, + "learning_rate": 8.594572701498583e-05, + "loss": 0.4007, + "step": 2122 + }, + { + "epoch": 0.17198639014906028, + "grad_norm": 0.05395899713039398, + "learning_rate": 8.598622924260835e-05, + "loss": 0.4117, + "step": 2123 + }, + { + "epoch": 0.17206740116655866, + "grad_norm": 0.05061892420053482, + "learning_rate": 8.602673147023087e-05, + "loss": 0.3984, + "step": 2124 + }, + { + "epoch": 0.17214841218405702, + "grad_norm": 0.05283121392130852, + "learning_rate": 8.606723369785339e-05, + 
"loss": 0.4138, + "step": 2125 + }, + { + "epoch": 0.1722294232015554, + "grad_norm": 0.07489524781703949, + "learning_rate": 8.610773592547591e-05, + "loss": 0.4387, + "step": 2126 + }, + { + "epoch": 0.1723104342190538, + "grad_norm": 0.05126599967479706, + "learning_rate": 8.614823815309843e-05, + "loss": 0.363, + "step": 2127 + }, + { + "epoch": 0.17239144523655217, + "grad_norm": 0.04971858859062195, + "learning_rate": 8.618874038072095e-05, + "loss": 0.4093, + "step": 2128 + }, + { + "epoch": 0.17247245625405055, + "grad_norm": 0.04760098457336426, + "learning_rate": 8.622924260834347e-05, + "loss": 0.4135, + "step": 2129 + }, + { + "epoch": 0.17255346727154894, + "grad_norm": 0.05287618190050125, + "learning_rate": 8.626974483596599e-05, + "loss": 0.4061, + "step": 2130 + }, + { + "epoch": 0.17263447828904732, + "grad_norm": 0.06464332342147827, + "learning_rate": 8.63102470635885e-05, + "loss": 0.3367, + "step": 2131 + }, + { + "epoch": 0.17271548930654568, + "grad_norm": 0.070146843791008, + "learning_rate": 8.635074929121103e-05, + "loss": 0.3794, + "step": 2132 + }, + { + "epoch": 0.17279650032404406, + "grad_norm": 0.06114206090569496, + "learning_rate": 8.639125151883353e-05, + "loss": 0.415, + "step": 2133 + }, + { + "epoch": 0.17287751134154244, + "grad_norm": 0.061411190778017044, + "learning_rate": 8.643175374645605e-05, + "loss": 0.3569, + "step": 2134 + }, + { + "epoch": 0.17295852235904083, + "grad_norm": 0.062324997037649155, + "learning_rate": 8.647225597407857e-05, + "loss": 0.4484, + "step": 2135 + }, + { + "epoch": 0.1730395333765392, + "grad_norm": 0.04799463227391243, + "learning_rate": 8.651275820170109e-05, + "loss": 0.3216, + "step": 2136 + }, + { + "epoch": 0.1731205443940376, + "grad_norm": 0.05411509796977043, + "learning_rate": 8.655326042932361e-05, + "loss": 0.3415, + "step": 2137 + }, + { + "epoch": 0.17320155541153598, + "grad_norm": 0.05922120809555054, + "learning_rate": 8.659376265694613e-05, + "loss": 0.4232, + "step": 2138 + }, + { + "epoch": 0.17328256642903433, + "grad_norm": 0.052514318376779556, + "learning_rate": 8.663426488456865e-05, + "loss": 0.406, + "step": 2139 + }, + { + "epoch": 0.17336357744653272, + "grad_norm": 0.06161622330546379, + "learning_rate": 8.667476711219118e-05, + "loss": 0.3945, + "step": 2140 + }, + { + "epoch": 0.1734445884640311, + "grad_norm": 0.05787722021341324, + "learning_rate": 8.67152693398137e-05, + "loss": 0.4028, + "step": 2141 + }, + { + "epoch": 0.17352559948152949, + "grad_norm": 0.05399668589234352, + "learning_rate": 8.675577156743622e-05, + "loss": 0.3698, + "step": 2142 + }, + { + "epoch": 0.17360661049902787, + "grad_norm": 0.0524609200656414, + "learning_rate": 8.679627379505874e-05, + "loss": 0.328, + "step": 2143 + }, + { + "epoch": 0.17368762151652625, + "grad_norm": 0.04540975019335747, + "learning_rate": 8.683677602268125e-05, + "loss": 0.3855, + "step": 2144 + }, + { + "epoch": 0.17376863253402464, + "grad_norm": 0.049933046102523804, + "learning_rate": 8.687727825030377e-05, + "loss": 0.3817, + "step": 2145 + }, + { + "epoch": 0.17384964355152302, + "grad_norm": 0.06545059382915497, + "learning_rate": 8.691778047792629e-05, + "loss": 0.383, + "step": 2146 + }, + { + "epoch": 0.17393065456902138, + "grad_norm": 0.05309139937162399, + "learning_rate": 8.695828270554881e-05, + "loss": 0.351, + "step": 2147 + }, + { + "epoch": 0.17401166558651976, + "grad_norm": 0.06722250580787659, + "learning_rate": 8.699878493317133e-05, + "loss": 0.4386, + "step": 2148 + }, + { + "epoch": 
0.17409267660401814, + "grad_norm": 0.06491897255182266, + "learning_rate": 8.703928716079385e-05, + "loss": 0.417, + "step": 2149 + }, + { + "epoch": 0.17417368762151653, + "grad_norm": 0.04083773121237755, + "learning_rate": 8.707978938841637e-05, + "loss": 0.3465, + "step": 2150 + }, + { + "epoch": 0.1742546986390149, + "grad_norm": 0.06629964709281921, + "learning_rate": 8.712029161603889e-05, + "loss": 0.3801, + "step": 2151 + }, + { + "epoch": 0.1743357096565133, + "grad_norm": 0.045432791113853455, + "learning_rate": 8.71607938436614e-05, + "loss": 0.4369, + "step": 2152 + }, + { + "epoch": 0.17441672067401168, + "grad_norm": 0.045189496129751205, + "learning_rate": 8.720129607128393e-05, + "loss": 0.3773, + "step": 2153 + }, + { + "epoch": 0.17449773169151003, + "grad_norm": 0.04340081661939621, + "learning_rate": 8.724179829890644e-05, + "loss": 0.3656, + "step": 2154 + }, + { + "epoch": 0.17457874270900842, + "grad_norm": 0.05498838797211647, + "learning_rate": 8.728230052652896e-05, + "loss": 0.3915, + "step": 2155 + }, + { + "epoch": 0.1746597537265068, + "grad_norm": 0.0468340627849102, + "learning_rate": 8.732280275415148e-05, + "loss": 0.3468, + "step": 2156 + }, + { + "epoch": 0.17474076474400518, + "grad_norm": 0.05144199728965759, + "learning_rate": 8.736330498177399e-05, + "loss": 0.3562, + "step": 2157 + }, + { + "epoch": 0.17482177576150357, + "grad_norm": 0.05355091020464897, + "learning_rate": 8.740380720939651e-05, + "loss": 0.4315, + "step": 2158 + }, + { + "epoch": 0.17490278677900195, + "grad_norm": 0.04960298538208008, + "learning_rate": 8.744430943701904e-05, + "loss": 0.3555, + "step": 2159 + }, + { + "epoch": 0.17498379779650033, + "grad_norm": 0.05432707816362381, + "learning_rate": 8.748481166464156e-05, + "loss": 0.3638, + "step": 2160 + }, + { + "epoch": 0.1750648088139987, + "grad_norm": 0.047047730535268784, + "learning_rate": 8.752531389226408e-05, + "loss": 0.3595, + "step": 2161 + }, + { + "epoch": 0.17514581983149707, + "grad_norm": 0.04662622511386871, + "learning_rate": 8.75658161198866e-05, + "loss": 0.3562, + "step": 2162 + }, + { + "epoch": 0.17522683084899546, + "grad_norm": 0.05988886579871178, + "learning_rate": 8.760631834750912e-05, + "loss": 0.3803, + "step": 2163 + }, + { + "epoch": 0.17530784186649384, + "grad_norm": 0.06446512788534164, + "learning_rate": 8.764682057513164e-05, + "loss": 0.3907, + "step": 2164 + }, + { + "epoch": 0.17538885288399222, + "grad_norm": 0.05944246053695679, + "learning_rate": 8.768732280275416e-05, + "loss": 0.3786, + "step": 2165 + }, + { + "epoch": 0.1754698639014906, + "grad_norm": 0.06325705349445343, + "learning_rate": 8.772782503037668e-05, + "loss": 0.3958, + "step": 2166 + }, + { + "epoch": 0.175550874918989, + "grad_norm": 0.05642695724964142, + "learning_rate": 8.77683272579992e-05, + "loss": 0.3966, + "step": 2167 + }, + { + "epoch": 0.17563188593648738, + "grad_norm": 0.06587480753660202, + "learning_rate": 8.78088294856217e-05, + "loss": 0.3944, + "step": 2168 + }, + { + "epoch": 0.17571289695398573, + "grad_norm": 0.07901154458522797, + "learning_rate": 8.784933171324423e-05, + "loss": 0.3428, + "step": 2169 + }, + { + "epoch": 0.17579390797148411, + "grad_norm": 0.05240624397993088, + "learning_rate": 8.788983394086675e-05, + "loss": 0.3707, + "step": 2170 + }, + { + "epoch": 0.1758749189889825, + "grad_norm": 0.05577824264764786, + "learning_rate": 8.793033616848927e-05, + "loss": 0.4076, + "step": 2171 + }, + { + "epoch": 0.17595593000648088, + "grad_norm": 0.05159473046660423, + 
"learning_rate": 8.797083839611179e-05, + "loss": 0.3711, + "step": 2172 + }, + { + "epoch": 0.17603694102397927, + "grad_norm": 0.06700492650270462, + "learning_rate": 8.80113406237343e-05, + "loss": 0.4164, + "step": 2173 + }, + { + "epoch": 0.17611795204147765, + "grad_norm": 0.05094028636813164, + "learning_rate": 8.805184285135682e-05, + "loss": 0.4056, + "step": 2174 + }, + { + "epoch": 0.17619896305897603, + "grad_norm": 0.04142050817608833, + "learning_rate": 8.809234507897934e-05, + "loss": 0.3825, + "step": 2175 + }, + { + "epoch": 0.1762799740764744, + "grad_norm": 0.05937556177377701, + "learning_rate": 8.813284730660186e-05, + "loss": 0.3708, + "step": 2176 + }, + { + "epoch": 0.17636098509397277, + "grad_norm": 0.06152129918336868, + "learning_rate": 8.81733495342244e-05, + "loss": 0.4548, + "step": 2177 + }, + { + "epoch": 0.17644199611147116, + "grad_norm": 0.057044580578804016, + "learning_rate": 8.821385176184692e-05, + "loss": 0.424, + "step": 2178 + }, + { + "epoch": 0.17652300712896954, + "grad_norm": 0.04587271809577942, + "learning_rate": 8.825435398946942e-05, + "loss": 0.4239, + "step": 2179 + }, + { + "epoch": 0.17660401814646792, + "grad_norm": 0.05933641269803047, + "learning_rate": 8.829485621709194e-05, + "loss": 0.4028, + "step": 2180 + }, + { + "epoch": 0.1766850291639663, + "grad_norm": 0.051318321377038956, + "learning_rate": 8.833535844471446e-05, + "loss": 0.3685, + "step": 2181 + }, + { + "epoch": 0.1767660401814647, + "grad_norm": 0.06208323687314987, + "learning_rate": 8.837586067233698e-05, + "loss": 0.3273, + "step": 2182 + }, + { + "epoch": 0.17684705119896305, + "grad_norm": 0.06081528961658478, + "learning_rate": 8.84163628999595e-05, + "loss": 0.4095, + "step": 2183 + }, + { + "epoch": 0.17692806221646143, + "grad_norm": 0.04740947484970093, + "learning_rate": 8.845686512758202e-05, + "loss": 0.3657, + "step": 2184 + }, + { + "epoch": 0.1770090732339598, + "grad_norm": 0.04662710428237915, + "learning_rate": 8.849736735520454e-05, + "loss": 0.3934, + "step": 2185 + }, + { + "epoch": 0.1770900842514582, + "grad_norm": 0.05747944489121437, + "learning_rate": 8.853786958282706e-05, + "loss": 0.4094, + "step": 2186 + }, + { + "epoch": 0.17717109526895658, + "grad_norm": 0.04955825209617615, + "learning_rate": 8.857837181044958e-05, + "loss": 0.3559, + "step": 2187 + }, + { + "epoch": 0.17725210628645496, + "grad_norm": 0.05781328305602074, + "learning_rate": 8.86188740380721e-05, + "loss": 0.3837, + "step": 2188 + }, + { + "epoch": 0.17733311730395335, + "grad_norm": 0.0525483638048172, + "learning_rate": 8.865937626569462e-05, + "loss": 0.3811, + "step": 2189 + }, + { + "epoch": 0.17741412832145173, + "grad_norm": 0.05579186975955963, + "learning_rate": 8.869987849331714e-05, + "loss": 0.3574, + "step": 2190 + }, + { + "epoch": 0.1774951393389501, + "grad_norm": 0.04814240708947182, + "learning_rate": 8.874038072093966e-05, + "loss": 0.4373, + "step": 2191 + }, + { + "epoch": 0.17757615035644847, + "grad_norm": 0.04550255835056305, + "learning_rate": 8.878088294856218e-05, + "loss": 0.4166, + "step": 2192 + }, + { + "epoch": 0.17765716137394685, + "grad_norm": 0.056230008602142334, + "learning_rate": 8.882138517618468e-05, + "loss": 0.4454, + "step": 2193 + }, + { + "epoch": 0.17773817239144524, + "grad_norm": 0.04420791566371918, + "learning_rate": 8.88618874038072e-05, + "loss": 0.3212, + "step": 2194 + }, + { + "epoch": 0.17781918340894362, + "grad_norm": 0.06241984665393829, + "learning_rate": 8.890238963142972e-05, + "loss": 0.3696, + "step": 
2195 + }, + { + "epoch": 0.177900194426442, + "grad_norm": 0.050221629440784454, + "learning_rate": 8.894289185905226e-05, + "loss": 0.4101, + "step": 2196 + }, + { + "epoch": 0.1779812054439404, + "grad_norm": 0.050246626138687134, + "learning_rate": 8.898339408667478e-05, + "loss": 0.3766, + "step": 2197 + }, + { + "epoch": 0.17806221646143874, + "grad_norm": 0.06250311434268951, + "learning_rate": 8.90238963142973e-05, + "loss": 0.3499, + "step": 2198 + }, + { + "epoch": 0.17814322747893713, + "grad_norm": 0.06322769820690155, + "learning_rate": 8.906439854191982e-05, + "loss": 0.3916, + "step": 2199 + }, + { + "epoch": 0.1782242384964355, + "grad_norm": 0.06526292115449905, + "learning_rate": 8.910490076954234e-05, + "loss": 0.3825, + "step": 2200 + }, + { + "epoch": 0.1783052495139339, + "grad_norm": 0.06252694875001907, + "learning_rate": 8.914540299716486e-05, + "loss": 0.3992, + "step": 2201 + }, + { + "epoch": 0.17838626053143228, + "grad_norm": 0.055992960929870605, + "learning_rate": 8.918590522478738e-05, + "loss": 0.3745, + "step": 2202 + }, + { + "epoch": 0.17846727154893066, + "grad_norm": 0.04313978552818298, + "learning_rate": 8.922640745240988e-05, + "loss": 0.3145, + "step": 2203 + }, + { + "epoch": 0.17854828256642905, + "grad_norm": 0.05175900086760521, + "learning_rate": 8.92669096800324e-05, + "loss": 0.4054, + "step": 2204 + }, + { + "epoch": 0.1786292935839274, + "grad_norm": 0.05246730521321297, + "learning_rate": 8.930741190765492e-05, + "loss": 0.3126, + "step": 2205 + }, + { + "epoch": 0.17871030460142578, + "grad_norm": 0.06548149883747101, + "learning_rate": 8.934791413527744e-05, + "loss": 0.4067, + "step": 2206 + }, + { + "epoch": 0.17879131561892417, + "grad_norm": 0.060931019484996796, + "learning_rate": 8.938841636289996e-05, + "loss": 0.4066, + "step": 2207 + }, + { + "epoch": 0.17887232663642255, + "grad_norm": 0.05262281745672226, + "learning_rate": 8.942891859052248e-05, + "loss": 0.4086, + "step": 2208 + }, + { + "epoch": 0.17895333765392094, + "grad_norm": 0.050549279898405075, + "learning_rate": 8.9469420818145e-05, + "loss": 0.3758, + "step": 2209 + }, + { + "epoch": 0.17903434867141932, + "grad_norm": 0.05054941028356552, + "learning_rate": 8.950992304576752e-05, + "loss": 0.3482, + "step": 2210 + }, + { + "epoch": 0.1791153596889177, + "grad_norm": 0.05035468190908432, + "learning_rate": 8.955042527339004e-05, + "loss": 0.4008, + "step": 2211 + }, + { + "epoch": 0.17919637070641609, + "grad_norm": 0.05484228581190109, + "learning_rate": 8.959092750101256e-05, + "loss": 0.3867, + "step": 2212 + }, + { + "epoch": 0.17927738172391444, + "grad_norm": 0.049987874925136566, + "learning_rate": 8.963142972863508e-05, + "loss": 0.4075, + "step": 2213 + }, + { + "epoch": 0.17935839274141283, + "grad_norm": 0.05101454257965088, + "learning_rate": 8.96719319562576e-05, + "loss": 0.367, + "step": 2214 + }, + { + "epoch": 0.1794394037589112, + "grad_norm": 0.05252314731478691, + "learning_rate": 8.971243418388012e-05, + "loss": 0.3951, + "step": 2215 + }, + { + "epoch": 0.1795204147764096, + "grad_norm": 0.05595763400197029, + "learning_rate": 8.975293641150264e-05, + "loss": 0.4107, + "step": 2216 + }, + { + "epoch": 0.17960142579390798, + "grad_norm": 0.051620274782180786, + "learning_rate": 8.979343863912516e-05, + "loss": 0.3786, + "step": 2217 + }, + { + "epoch": 0.17968243681140636, + "grad_norm": 0.04558439925312996, + "learning_rate": 8.983394086674768e-05, + "loss": 0.3614, + "step": 2218 + }, + { + "epoch": 0.17976344782890474, + "grad_norm": 
0.05854353681206703, + "learning_rate": 8.98744430943702e-05, + "loss": 0.3866, + "step": 2219 + }, + { + "epoch": 0.1798444588464031, + "grad_norm": 0.06507213413715363, + "learning_rate": 8.991494532199272e-05, + "loss": 0.337, + "step": 2220 + }, + { + "epoch": 0.17992546986390148, + "grad_norm": 0.054253242909908295, + "learning_rate": 8.995544754961524e-05, + "loss": 0.378, + "step": 2221 + }, + { + "epoch": 0.18000648088139987, + "grad_norm": 0.05274822190403938, + "learning_rate": 8.999594977723776e-05, + "loss": 0.4095, + "step": 2222 + }, + { + "epoch": 0.18008749189889825, + "grad_norm": 0.04596314951777458, + "learning_rate": 9.003645200486027e-05, + "loss": 0.3689, + "step": 2223 + }, + { + "epoch": 0.18016850291639663, + "grad_norm": 0.04373635724186897, + "learning_rate": 9.00769542324828e-05, + "loss": 0.3374, + "step": 2224 + }, + { + "epoch": 0.18024951393389502, + "grad_norm": 0.0654408186674118, + "learning_rate": 9.011745646010531e-05, + "loss": 0.3679, + "step": 2225 + }, + { + "epoch": 0.1803305249513934, + "grad_norm": 0.04936238005757332, + "learning_rate": 9.015795868772783e-05, + "loss": 0.4143, + "step": 2226 + }, + { + "epoch": 0.18041153596889176, + "grad_norm": 0.05110304057598114, + "learning_rate": 9.019846091535035e-05, + "loss": 0.3529, + "step": 2227 + }, + { + "epoch": 0.18049254698639014, + "grad_norm": 0.05016132444143295, + "learning_rate": 9.023896314297286e-05, + "loss": 0.3847, + "step": 2228 + }, + { + "epoch": 0.18057355800388852, + "grad_norm": 0.04777160659432411, + "learning_rate": 9.027946537059538e-05, + "loss": 0.3529, + "step": 2229 + }, + { + "epoch": 0.1806545690213869, + "grad_norm": 0.06519763171672821, + "learning_rate": 9.03199675982179e-05, + "loss": 0.3865, + "step": 2230 + }, + { + "epoch": 0.1807355800388853, + "grad_norm": 0.06276557594537735, + "learning_rate": 9.036046982584042e-05, + "loss": 0.4392, + "step": 2231 + }, + { + "epoch": 0.18081659105638367, + "grad_norm": 0.05355183780193329, + "learning_rate": 9.040097205346294e-05, + "loss": 0.3734, + "step": 2232 + }, + { + "epoch": 0.18089760207388206, + "grad_norm": 0.041655007749795914, + "learning_rate": 9.044147428108546e-05, + "loss": 0.3352, + "step": 2233 + }, + { + "epoch": 0.18097861309138044, + "grad_norm": 0.054301269352436066, + "learning_rate": 9.048197650870799e-05, + "loss": 0.3925, + "step": 2234 + }, + { + "epoch": 0.1810596241088788, + "grad_norm": 0.049007266759872437, + "learning_rate": 9.052247873633051e-05, + "loss": 0.4126, + "step": 2235 + }, + { + "epoch": 0.18114063512637718, + "grad_norm": 0.04472389444708824, + "learning_rate": 9.056298096395303e-05, + "loss": 0.3172, + "step": 2236 + }, + { + "epoch": 0.18122164614387556, + "grad_norm": 0.04902403801679611, + "learning_rate": 9.060348319157555e-05, + "loss": 0.3517, + "step": 2237 + }, + { + "epoch": 0.18130265716137395, + "grad_norm": 0.05876627936959267, + "learning_rate": 9.064398541919807e-05, + "loss": 0.3968, + "step": 2238 + }, + { + "epoch": 0.18138366817887233, + "grad_norm": 0.0561637207865715, + "learning_rate": 9.068448764682058e-05, + "loss": 0.415, + "step": 2239 + }, + { + "epoch": 0.18146467919637072, + "grad_norm": 0.04728609696030617, + "learning_rate": 9.07249898744431e-05, + "loss": 0.3665, + "step": 2240 + }, + { + "epoch": 0.1815456902138691, + "grad_norm": 0.056595578789711, + "learning_rate": 9.076549210206562e-05, + "loss": 0.4514, + "step": 2241 + }, + { + "epoch": 0.18162670123136745, + "grad_norm": 0.06155933067202568, + "learning_rate": 9.080599432968813e-05, + 
"loss": 0.3642, + "step": 2242 + }, + { + "epoch": 0.18170771224886584, + "grad_norm": 0.045497674494981766, + "learning_rate": 9.084649655731065e-05, + "loss": 0.367, + "step": 2243 + }, + { + "epoch": 0.18178872326636422, + "grad_norm": 0.06236834451556206, + "learning_rate": 9.088699878493317e-05, + "loss": 0.3618, + "step": 2244 + }, + { + "epoch": 0.1818697342838626, + "grad_norm": 0.0569576732814312, + "learning_rate": 9.09275010125557e-05, + "loss": 0.4281, + "step": 2245 + }, + { + "epoch": 0.181950745301361, + "grad_norm": 0.04748576879501343, + "learning_rate": 9.096800324017821e-05, + "loss": 0.4056, + "step": 2246 + }, + { + "epoch": 0.18203175631885937, + "grad_norm": 0.04922928288578987, + "learning_rate": 9.100850546780073e-05, + "loss": 0.3859, + "step": 2247 + }, + { + "epoch": 0.18211276733635776, + "grad_norm": 0.05819029361009598, + "learning_rate": 9.104900769542325e-05, + "loss": 0.4301, + "step": 2248 + }, + { + "epoch": 0.1821937783538561, + "grad_norm": 0.05144455283880234, + "learning_rate": 9.108950992304577e-05, + "loss": 0.355, + "step": 2249 + }, + { + "epoch": 0.1822747893713545, + "grad_norm": 0.056657761335372925, + "learning_rate": 9.113001215066829e-05, + "loss": 0.3997, + "step": 2250 + }, + { + "epoch": 0.18235580038885288, + "grad_norm": 0.05661802738904953, + "learning_rate": 9.117051437829081e-05, + "loss": 0.3802, + "step": 2251 + }, + { + "epoch": 0.18243681140635126, + "grad_norm": 0.05875290557742119, + "learning_rate": 9.121101660591332e-05, + "loss": 0.4077, + "step": 2252 + }, + { + "epoch": 0.18251782242384965, + "grad_norm": 0.05324438959360123, + "learning_rate": 9.125151883353585e-05, + "loss": 0.3975, + "step": 2253 + }, + { + "epoch": 0.18259883344134803, + "grad_norm": 0.06660490483045578, + "learning_rate": 9.129202106115837e-05, + "loss": 0.3673, + "step": 2254 + }, + { + "epoch": 0.1826798444588464, + "grad_norm": 0.05302351340651512, + "learning_rate": 9.133252328878089e-05, + "loss": 0.3425, + "step": 2255 + }, + { + "epoch": 0.18276085547634477, + "grad_norm": 0.042320359498262405, + "learning_rate": 9.137302551640341e-05, + "loss": 0.3621, + "step": 2256 + }, + { + "epoch": 0.18284186649384315, + "grad_norm": 0.0677284523844719, + "learning_rate": 9.141352774402593e-05, + "loss": 0.3821, + "step": 2257 + }, + { + "epoch": 0.18292287751134154, + "grad_norm": 0.05299816280603409, + "learning_rate": 9.145402997164845e-05, + "loss": 0.4133, + "step": 2258 + }, + { + "epoch": 0.18300388852883992, + "grad_norm": 0.057131148874759674, + "learning_rate": 9.149453219927097e-05, + "loss": 0.3976, + "step": 2259 + }, + { + "epoch": 0.1830848995463383, + "grad_norm": 0.04714860022068024, + "learning_rate": 9.153503442689349e-05, + "loss": 0.3569, + "step": 2260 + }, + { + "epoch": 0.1831659105638367, + "grad_norm": 0.04154291749000549, + "learning_rate": 9.157553665451601e-05, + "loss": 0.3975, + "step": 2261 + }, + { + "epoch": 0.18324692158133507, + "grad_norm": 0.06255395710468292, + "learning_rate": 9.161603888213853e-05, + "loss": 0.4208, + "step": 2262 + }, + { + "epoch": 0.18332793259883345, + "grad_norm": 0.04732973128557205, + "learning_rate": 9.165654110976103e-05, + "loss": 0.429, + "step": 2263 + }, + { + "epoch": 0.1834089436163318, + "grad_norm": 0.0500243715941906, + "learning_rate": 9.169704333738355e-05, + "loss": 0.3555, + "step": 2264 + }, + { + "epoch": 0.1834899546338302, + "grad_norm": 0.047191690653562546, + "learning_rate": 9.173754556500607e-05, + "loss": 0.3791, + "step": 2265 + }, + { + "epoch": 
0.18357096565132858, + "grad_norm": 0.04734690859913826, + "learning_rate": 9.17780477926286e-05, + "loss": 0.3663, + "step": 2266 + }, + { + "epoch": 0.18365197666882696, + "grad_norm": 0.0568060502409935, + "learning_rate": 9.181855002025111e-05, + "loss": 0.3479, + "step": 2267 + }, + { + "epoch": 0.18373298768632534, + "grad_norm": 0.061953410506248474, + "learning_rate": 9.185905224787363e-05, + "loss": 0.3681, + "step": 2268 + }, + { + "epoch": 0.18381399870382373, + "grad_norm": 0.04263650253415108, + "learning_rate": 9.189955447549615e-05, + "loss": 0.2988, + "step": 2269 + }, + { + "epoch": 0.1838950097213221, + "grad_norm": 0.04538256675004959, + "learning_rate": 9.194005670311867e-05, + "loss": 0.3543, + "step": 2270 + }, + { + "epoch": 0.18397602073882047, + "grad_norm": 0.04849075525999069, + "learning_rate": 9.198055893074119e-05, + "loss": 0.3667, + "step": 2271 + }, + { + "epoch": 0.18405703175631885, + "grad_norm": 0.04478795453906059, + "learning_rate": 9.202106115836372e-05, + "loss": 0.4318, + "step": 2272 + }, + { + "epoch": 0.18413804277381723, + "grad_norm": 0.04313032701611519, + "learning_rate": 9.206156338598624e-05, + "loss": 0.3399, + "step": 2273 + }, + { + "epoch": 0.18421905379131562, + "grad_norm": 0.047500211745500565, + "learning_rate": 9.210206561360875e-05, + "loss": 0.3399, + "step": 2274 + }, + { + "epoch": 0.184300064808814, + "grad_norm": 0.05286448076367378, + "learning_rate": 9.214256784123127e-05, + "loss": 0.3759, + "step": 2275 + }, + { + "epoch": 0.18438107582631239, + "grad_norm": 0.05197947472333908, + "learning_rate": 9.218307006885379e-05, + "loss": 0.346, + "step": 2276 + }, + { + "epoch": 0.18446208684381077, + "grad_norm": 0.06517504900693893, + "learning_rate": 9.222357229647631e-05, + "loss": 0.4529, + "step": 2277 + }, + { + "epoch": 0.18454309786130912, + "grad_norm": 0.04489710554480553, + "learning_rate": 9.226407452409883e-05, + "loss": 0.4124, + "step": 2278 + }, + { + "epoch": 0.1846241088788075, + "grad_norm": 0.04965034872293472, + "learning_rate": 9.230457675172135e-05, + "loss": 0.3523, + "step": 2279 + }, + { + "epoch": 0.1847051198963059, + "grad_norm": 0.053209736943244934, + "learning_rate": 9.234507897934387e-05, + "loss": 0.3404, + "step": 2280 + }, + { + "epoch": 0.18478613091380428, + "grad_norm": 0.06462380290031433, + "learning_rate": 9.238558120696639e-05, + "loss": 0.436, + "step": 2281 + }, + { + "epoch": 0.18486714193130266, + "grad_norm": 0.04681272432208061, + "learning_rate": 9.242608343458891e-05, + "loss": 0.3605, + "step": 2282 + }, + { + "epoch": 0.18494815294880104, + "grad_norm": 0.047515030950307846, + "learning_rate": 9.246658566221143e-05, + "loss": 0.3561, + "step": 2283 + }, + { + "epoch": 0.18502916396629943, + "grad_norm": 0.04604284465312958, + "learning_rate": 9.250708788983395e-05, + "loss": 0.3581, + "step": 2284 + }, + { + "epoch": 0.1851101749837978, + "grad_norm": 0.05426971614360809, + "learning_rate": 9.254759011745647e-05, + "loss": 0.3875, + "step": 2285 + }, + { + "epoch": 0.18519118600129617, + "grad_norm": 0.048005685210227966, + "learning_rate": 9.258809234507899e-05, + "loss": 0.3615, + "step": 2286 + }, + { + "epoch": 0.18527219701879455, + "grad_norm": 0.05091318115592003, + "learning_rate": 9.262859457270149e-05, + "loss": 0.4587, + "step": 2287 + }, + { + "epoch": 0.18535320803629293, + "grad_norm": 0.0451776497066021, + "learning_rate": 9.266909680032401e-05, + "loss": 0.3651, + "step": 2288 + }, + { + "epoch": 0.18543421905379132, + "grad_norm": 0.051855940371751785, + 
"learning_rate": 9.270959902794653e-05, + "loss": 0.3906, + "step": 2289 + }, + { + "epoch": 0.1855152300712897, + "grad_norm": 0.04242299869656563, + "learning_rate": 9.275010125556907e-05, + "loss": 0.351, + "step": 2290 + }, + { + "epoch": 0.18559624108878808, + "grad_norm": 0.04861443117260933, + "learning_rate": 9.279060348319159e-05, + "loss": 0.3527, + "step": 2291 + }, + { + "epoch": 0.18567725210628647, + "grad_norm": 0.04748028144240379, + "learning_rate": 9.28311057108141e-05, + "loss": 0.4083, + "step": 2292 + }, + { + "epoch": 0.18575826312378482, + "grad_norm": 0.0490681454539299, + "learning_rate": 9.287160793843662e-05, + "loss": 0.4009, + "step": 2293 + }, + { + "epoch": 0.1858392741412832, + "grad_norm": 0.05255401134490967, + "learning_rate": 9.291211016605914e-05, + "loss": 0.4057, + "step": 2294 + }, + { + "epoch": 0.1859202851587816, + "grad_norm": 0.05210600048303604, + "learning_rate": 9.295261239368166e-05, + "loss": 0.3714, + "step": 2295 + }, + { + "epoch": 0.18600129617627997, + "grad_norm": 0.0676577240228653, + "learning_rate": 9.299311462130418e-05, + "loss": 0.3882, + "step": 2296 + }, + { + "epoch": 0.18608230719377836, + "grad_norm": 0.06080527976155281, + "learning_rate": 9.30336168489267e-05, + "loss": 0.3907, + "step": 2297 + }, + { + "epoch": 0.18616331821127674, + "grad_norm": 0.04721786454319954, + "learning_rate": 9.307411907654921e-05, + "loss": 0.3615, + "step": 2298 + }, + { + "epoch": 0.18624432922877512, + "grad_norm": 0.04322146624326706, + "learning_rate": 9.311462130417173e-05, + "loss": 0.3894, + "step": 2299 + }, + { + "epoch": 0.18632534024627348, + "grad_norm": 0.04705704003572464, + "learning_rate": 9.315512353179425e-05, + "loss": 0.3722, + "step": 2300 + }, + { + "epoch": 0.18640635126377186, + "grad_norm": 0.05449768900871277, + "learning_rate": 9.319562575941677e-05, + "loss": 0.3898, + "step": 2301 + }, + { + "epoch": 0.18648736228127025, + "grad_norm": 0.05563833937048912, + "learning_rate": 9.323612798703929e-05, + "loss": 0.4292, + "step": 2302 + }, + { + "epoch": 0.18656837329876863, + "grad_norm": 0.05745968222618103, + "learning_rate": 9.327663021466181e-05, + "loss": 0.4307, + "step": 2303 + }, + { + "epoch": 0.18664938431626701, + "grad_norm": 0.06325607001781464, + "learning_rate": 9.331713244228433e-05, + "loss": 0.3989, + "step": 2304 + }, + { + "epoch": 0.1867303953337654, + "grad_norm": 0.04417753964662552, + "learning_rate": 9.335763466990685e-05, + "loss": 0.4082, + "step": 2305 + }, + { + "epoch": 0.18681140635126378, + "grad_norm": 0.05200282856822014, + "learning_rate": 9.339813689752937e-05, + "loss": 0.3822, + "step": 2306 + }, + { + "epoch": 0.18689241736876216, + "grad_norm": 0.036817897111177444, + "learning_rate": 9.343863912515189e-05, + "loss": 0.3125, + "step": 2307 + }, + { + "epoch": 0.18697342838626052, + "grad_norm": 0.053293824195861816, + "learning_rate": 9.34791413527744e-05, + "loss": 0.3794, + "step": 2308 + }, + { + "epoch": 0.1870544394037589, + "grad_norm": 0.0653097853064537, + "learning_rate": 9.351964358039693e-05, + "loss": 0.461, + "step": 2309 + }, + { + "epoch": 0.1871354504212573, + "grad_norm": 0.050535351037979126, + "learning_rate": 9.356014580801945e-05, + "loss": 0.3984, + "step": 2310 + }, + { + "epoch": 0.18721646143875567, + "grad_norm": 0.04054562374949455, + "learning_rate": 9.360064803564196e-05, + "loss": 0.3642, + "step": 2311 + }, + { + "epoch": 0.18729747245625405, + "grad_norm": 0.04133705049753189, + "learning_rate": 9.364115026326448e-05, + "loss": 0.3811, + "step": 
2312 + }, + { + "epoch": 0.18737848347375244, + "grad_norm": 0.046158090233802795, + "learning_rate": 9.3681652490887e-05, + "loss": 0.3797, + "step": 2313 + }, + { + "epoch": 0.18745949449125082, + "grad_norm": 0.0477442741394043, + "learning_rate": 9.372215471850952e-05, + "loss": 0.3778, + "step": 2314 + }, + { + "epoch": 0.18754050550874918, + "grad_norm": 0.051181282848119736, + "learning_rate": 9.376265694613204e-05, + "loss": 0.4524, + "step": 2315 + }, + { + "epoch": 0.18762151652624756, + "grad_norm": 0.047320883721113205, + "learning_rate": 9.380315917375456e-05, + "loss": 0.3927, + "step": 2316 + }, + { + "epoch": 0.18770252754374595, + "grad_norm": 0.06091245263814926, + "learning_rate": 9.384366140137708e-05, + "loss": 0.3817, + "step": 2317 + }, + { + "epoch": 0.18778353856124433, + "grad_norm": 0.043418072164058685, + "learning_rate": 9.38841636289996e-05, + "loss": 0.3928, + "step": 2318 + }, + { + "epoch": 0.1878645495787427, + "grad_norm": 0.05807121470570564, + "learning_rate": 9.392466585662212e-05, + "loss": 0.3893, + "step": 2319 + }, + { + "epoch": 0.1879455605962411, + "grad_norm": 0.04889382794499397, + "learning_rate": 9.396516808424464e-05, + "loss": 0.3647, + "step": 2320 + }, + { + "epoch": 0.18802657161373948, + "grad_norm": 0.05064339190721512, + "learning_rate": 9.400567031186716e-05, + "loss": 0.4005, + "step": 2321 + }, + { + "epoch": 0.18810758263123784, + "grad_norm": 0.049906641244888306, + "learning_rate": 9.404617253948967e-05, + "loss": 0.3851, + "step": 2322 + }, + { + "epoch": 0.18818859364873622, + "grad_norm": 0.061403222382068634, + "learning_rate": 9.408667476711219e-05, + "loss": 0.4124, + "step": 2323 + }, + { + "epoch": 0.1882696046662346, + "grad_norm": 0.049313317984342575, + "learning_rate": 9.412717699473471e-05, + "loss": 0.3853, + "step": 2324 + }, + { + "epoch": 0.18835061568373299, + "grad_norm": 0.04512112960219383, + "learning_rate": 9.416767922235723e-05, + "loss": 0.3849, + "step": 2325 + }, + { + "epoch": 0.18843162670123137, + "grad_norm": 0.06029899790883064, + "learning_rate": 9.420818144997975e-05, + "loss": 0.4307, + "step": 2326 + }, + { + "epoch": 0.18851263771872975, + "grad_norm": 0.05402587726712227, + "learning_rate": 9.424868367760227e-05, + "loss": 0.4108, + "step": 2327 + }, + { + "epoch": 0.18859364873622814, + "grad_norm": 0.05312091484665871, + "learning_rate": 9.42891859052248e-05, + "loss": 0.3826, + "step": 2328 + }, + { + "epoch": 0.18867465975372652, + "grad_norm": 0.04519422724843025, + "learning_rate": 9.432968813284732e-05, + "loss": 0.4146, + "step": 2329 + }, + { + "epoch": 0.18875567077122488, + "grad_norm": 0.04941098019480705, + "learning_rate": 9.437019036046984e-05, + "loss": 0.3499, + "step": 2330 + }, + { + "epoch": 0.18883668178872326, + "grad_norm": 0.06194014847278595, + "learning_rate": 9.441069258809236e-05, + "loss": 0.4409, + "step": 2331 + }, + { + "epoch": 0.18891769280622164, + "grad_norm": 0.054493311792612076, + "learning_rate": 9.445119481571488e-05, + "loss": 0.3915, + "step": 2332 + }, + { + "epoch": 0.18899870382372003, + "grad_norm": 0.05225975066423416, + "learning_rate": 9.449169704333738e-05, + "loss": 0.3641, + "step": 2333 + }, + { + "epoch": 0.1890797148412184, + "grad_norm": 0.05683285742998123, + "learning_rate": 9.45321992709599e-05, + "loss": 0.3665, + "step": 2334 + }, + { + "epoch": 0.1891607258587168, + "grad_norm": 0.047711968421936035, + "learning_rate": 9.457270149858242e-05, + "loss": 0.4671, + "step": 2335 + }, + { + "epoch": 0.18924173687621518, + "grad_norm": 
0.04842852056026459, + "learning_rate": 9.461320372620494e-05, + "loss": 0.3622, + "step": 2336 + }, + { + "epoch": 0.18932274789371353, + "grad_norm": 0.04394889995455742, + "learning_rate": 9.465370595382746e-05, + "loss": 0.3606, + "step": 2337 + }, + { + "epoch": 0.18940375891121192, + "grad_norm": 0.06385563313961029, + "learning_rate": 9.469420818144998e-05, + "loss": 0.4298, + "step": 2338 + }, + { + "epoch": 0.1894847699287103, + "grad_norm": 0.05741212144494057, + "learning_rate": 9.47347104090725e-05, + "loss": 0.3639, + "step": 2339 + }, + { + "epoch": 0.18956578094620868, + "grad_norm": 0.044967882335186005, + "learning_rate": 9.477521263669502e-05, + "loss": 0.3775, + "step": 2340 + }, + { + "epoch": 0.18964679196370707, + "grad_norm": 0.05367691069841385, + "learning_rate": 9.481571486431754e-05, + "loss": 0.4057, + "step": 2341 + }, + { + "epoch": 0.18972780298120545, + "grad_norm": 0.05288249999284744, + "learning_rate": 9.485621709194006e-05, + "loss": 0.3901, + "step": 2342 + }, + { + "epoch": 0.18980881399870383, + "grad_norm": 0.04440377652645111, + "learning_rate": 9.489671931956258e-05, + "loss": 0.3894, + "step": 2343 + }, + { + "epoch": 0.1898898250162022, + "grad_norm": 0.05041750520467758, + "learning_rate": 9.49372215471851e-05, + "loss": 0.4323, + "step": 2344 + }, + { + "epoch": 0.18997083603370057, + "grad_norm": 0.05050059035420418, + "learning_rate": 9.497772377480762e-05, + "loss": 0.393, + "step": 2345 + }, + { + "epoch": 0.19005184705119896, + "grad_norm": 0.04902779310941696, + "learning_rate": 9.501822600243013e-05, + "loss": 0.3747, + "step": 2346 + }, + { + "epoch": 0.19013285806869734, + "grad_norm": 0.06620379537343979, + "learning_rate": 9.505872823005266e-05, + "loss": 0.4253, + "step": 2347 + }, + { + "epoch": 0.19021386908619572, + "grad_norm": 0.05406967177987099, + "learning_rate": 9.509923045767518e-05, + "loss": 0.3722, + "step": 2348 + }, + { + "epoch": 0.1902948801036941, + "grad_norm": 0.04907216131687164, + "learning_rate": 9.51397326852977e-05, + "loss": 0.4089, + "step": 2349 + }, + { + "epoch": 0.1903758911211925, + "grad_norm": 0.05281197652220726, + "learning_rate": 9.518023491292022e-05, + "loss": 0.3484, + "step": 2350 + }, + { + "epoch": 0.19045690213869088, + "grad_norm": 0.06806964427232742, + "learning_rate": 9.522073714054274e-05, + "loss": 0.4212, + "step": 2351 + }, + { + "epoch": 0.19053791315618923, + "grad_norm": 0.05582256242632866, + "learning_rate": 9.526123936816526e-05, + "loss": 0.4001, + "step": 2352 + }, + { + "epoch": 0.19061892417368761, + "grad_norm": 0.053403329104185104, + "learning_rate": 9.530174159578778e-05, + "loss": 0.4009, + "step": 2353 + }, + { + "epoch": 0.190699935191186, + "grad_norm": 0.05009522661566734, + "learning_rate": 9.53422438234103e-05, + "loss": 0.3614, + "step": 2354 + }, + { + "epoch": 0.19078094620868438, + "grad_norm": 0.05327250063419342, + "learning_rate": 9.538274605103282e-05, + "loss": 0.4264, + "step": 2355 + }, + { + "epoch": 0.19086195722618277, + "grad_norm": 0.054642241448163986, + "learning_rate": 9.542324827865534e-05, + "loss": 0.3857, + "step": 2356 + }, + { + "epoch": 0.19094296824368115, + "grad_norm": 0.050950486212968826, + "learning_rate": 9.546375050627784e-05, + "loss": 0.3573, + "step": 2357 + }, + { + "epoch": 0.19102397926117953, + "grad_norm": 0.05052249878644943, + "learning_rate": 9.550425273390036e-05, + "loss": 0.3971, + "step": 2358 + }, + { + "epoch": 0.1911049902786779, + "grad_norm": 0.04672643914818764, + "learning_rate": 9.554475496152288e-05, + 
"loss": 0.3903, + "step": 2359 + }, + { + "epoch": 0.19118600129617627, + "grad_norm": 0.052174076437950134, + "learning_rate": 9.55852571891454e-05, + "loss": 0.3908, + "step": 2360 + }, + { + "epoch": 0.19126701231367466, + "grad_norm": 0.058124348521232605, + "learning_rate": 9.562575941676792e-05, + "loss": 0.4022, + "step": 2361 + }, + { + "epoch": 0.19134802333117304, + "grad_norm": 0.05619959160685539, + "learning_rate": 9.566626164439044e-05, + "loss": 0.3924, + "step": 2362 + }, + { + "epoch": 0.19142903434867142, + "grad_norm": 0.04171370714902878, + "learning_rate": 9.570676387201296e-05, + "loss": 0.3773, + "step": 2363 + }, + { + "epoch": 0.1915100453661698, + "grad_norm": 0.0523751825094223, + "learning_rate": 9.574726609963548e-05, + "loss": 0.3713, + "step": 2364 + }, + { + "epoch": 0.1915910563836682, + "grad_norm": 0.06082122027873993, + "learning_rate": 9.5787768327258e-05, + "loss": 0.3853, + "step": 2365 + }, + { + "epoch": 0.19167206740116655, + "grad_norm": 0.05510568991303444, + "learning_rate": 9.582827055488053e-05, + "loss": 0.3692, + "step": 2366 + }, + { + "epoch": 0.19175307841866493, + "grad_norm": 0.04889333248138428, + "learning_rate": 9.586877278250305e-05, + "loss": 0.3786, + "step": 2367 + }, + { + "epoch": 0.1918340894361633, + "grad_norm": 0.049090322107076645, + "learning_rate": 9.590927501012556e-05, + "loss": 0.3905, + "step": 2368 + }, + { + "epoch": 0.1919151004536617, + "grad_norm": 0.053586363792419434, + "learning_rate": 9.594977723774808e-05, + "loss": 0.3731, + "step": 2369 + }, + { + "epoch": 0.19199611147116008, + "grad_norm": 0.04378211498260498, + "learning_rate": 9.59902794653706e-05, + "loss": 0.3731, + "step": 2370 + }, + { + "epoch": 0.19207712248865846, + "grad_norm": 0.046975620090961456, + "learning_rate": 9.603078169299312e-05, + "loss": 0.3849, + "step": 2371 + }, + { + "epoch": 0.19215813350615685, + "grad_norm": 0.05281860753893852, + "learning_rate": 9.607128392061564e-05, + "loss": 0.3375, + "step": 2372 + }, + { + "epoch": 0.19223914452365523, + "grad_norm": 0.038259562104940414, + "learning_rate": 9.611178614823816e-05, + "loss": 0.3518, + "step": 2373 + }, + { + "epoch": 0.1923201555411536, + "grad_norm": 0.05231066793203354, + "learning_rate": 9.615228837586068e-05, + "loss": 0.4345, + "step": 2374 + }, + { + "epoch": 0.19240116655865197, + "grad_norm": 0.054934676736593246, + "learning_rate": 9.61927906034832e-05, + "loss": 0.3924, + "step": 2375 + }, + { + "epoch": 0.19248217757615035, + "grad_norm": 0.06908360868692398, + "learning_rate": 9.623329283110572e-05, + "loss": 0.3992, + "step": 2376 + }, + { + "epoch": 0.19256318859364874, + "grad_norm": 0.04846476763486862, + "learning_rate": 9.627379505872824e-05, + "loss": 0.3915, + "step": 2377 + }, + { + "epoch": 0.19264419961114712, + "grad_norm": 0.04592437669634819, + "learning_rate": 9.631429728635076e-05, + "loss": 0.3585, + "step": 2378 + }, + { + "epoch": 0.1927252106286455, + "grad_norm": 0.054542385041713715, + "learning_rate": 9.635479951397328e-05, + "loss": 0.4254, + "step": 2379 + }, + { + "epoch": 0.1928062216461439, + "grad_norm": 0.0487421490252018, + "learning_rate": 9.63953017415958e-05, + "loss": 0.3718, + "step": 2380 + }, + { + "epoch": 0.19288723266364224, + "grad_norm": 0.04887447506189346, + "learning_rate": 9.643580396921831e-05, + "loss": 0.4017, + "step": 2381 + }, + { + "epoch": 0.19296824368114063, + "grad_norm": 0.0589744932949543, + "learning_rate": 9.647630619684082e-05, + "loss": 0.374, + "step": 2382 + }, + { + "epoch": 
0.193049254698639, + "grad_norm": 0.05274634808301926, + "learning_rate": 9.651680842446334e-05, + "loss": 0.3185, + "step": 2383 + }, + { + "epoch": 0.1931302657161374, + "grad_norm": 0.0514618456363678, + "learning_rate": 9.655731065208586e-05, + "loss": 0.3948, + "step": 2384 + }, + { + "epoch": 0.19321127673363578, + "grad_norm": 0.05362550541758537, + "learning_rate": 9.65978128797084e-05, + "loss": 0.3651, + "step": 2385 + }, + { + "epoch": 0.19329228775113416, + "grad_norm": 0.05524744093418121, + "learning_rate": 9.663831510733091e-05, + "loss": 0.4032, + "step": 2386 + }, + { + "epoch": 0.19337329876863255, + "grad_norm": 0.046348828822374344, + "learning_rate": 9.667881733495343e-05, + "loss": 0.3832, + "step": 2387 + }, + { + "epoch": 0.1934543097861309, + "grad_norm": 0.05013475939631462, + "learning_rate": 9.671931956257595e-05, + "loss": 0.3749, + "step": 2388 + }, + { + "epoch": 0.19353532080362928, + "grad_norm": 0.05070723965764046, + "learning_rate": 9.675982179019847e-05, + "loss": 0.4044, + "step": 2389 + }, + { + "epoch": 0.19361633182112767, + "grad_norm": 0.044254641979932785, + "learning_rate": 9.680032401782099e-05, + "loss": 0.3564, + "step": 2390 + }, + { + "epoch": 0.19369734283862605, + "grad_norm": 0.05322520062327385, + "learning_rate": 9.684082624544351e-05, + "loss": 0.3379, + "step": 2391 + }, + { + "epoch": 0.19377835385612444, + "grad_norm": 0.0782526433467865, + "learning_rate": 9.688132847306603e-05, + "loss": 0.3541, + "step": 2392 + }, + { + "epoch": 0.19385936487362282, + "grad_norm": 0.06899959594011307, + "learning_rate": 9.692183070068854e-05, + "loss": 0.3742, + "step": 2393 + }, + { + "epoch": 0.1939403758911212, + "grad_norm": 0.043857067823410034, + "learning_rate": 9.696233292831106e-05, + "loss": 0.3907, + "step": 2394 + }, + { + "epoch": 0.19402138690861956, + "grad_norm": 0.06296835094690323, + "learning_rate": 9.700283515593358e-05, + "loss": 0.353, + "step": 2395 + }, + { + "epoch": 0.19410239792611794, + "grad_norm": 0.060926903039216995, + "learning_rate": 9.70433373835561e-05, + "loss": 0.3564, + "step": 2396 + }, + { + "epoch": 0.19418340894361633, + "grad_norm": 0.052972421050071716, + "learning_rate": 9.708383961117862e-05, + "loss": 0.4146, + "step": 2397 + }, + { + "epoch": 0.1942644199611147, + "grad_norm": 0.0610865093767643, + "learning_rate": 9.712434183880114e-05, + "loss": 0.3754, + "step": 2398 + }, + { + "epoch": 0.1943454309786131, + "grad_norm": 0.045094795525074005, + "learning_rate": 9.716484406642366e-05, + "loss": 0.369, + "step": 2399 + }, + { + "epoch": 0.19442644199611148, + "grad_norm": 0.059093985706567764, + "learning_rate": 9.720534629404617e-05, + "loss": 0.3982, + "step": 2400 + }, + { + "epoch": 0.19450745301360986, + "grad_norm": 0.04496023431420326, + "learning_rate": 9.72458485216687e-05, + "loss": 0.3582, + "step": 2401 + }, + { + "epoch": 0.19458846403110824, + "grad_norm": 0.04400349035859108, + "learning_rate": 9.728635074929121e-05, + "loss": 0.38, + "step": 2402 + }, + { + "epoch": 0.1946694750486066, + "grad_norm": 0.05442773923277855, + "learning_rate": 9.732685297691373e-05, + "loss": 0.3755, + "step": 2403 + }, + { + "epoch": 0.19475048606610498, + "grad_norm": 0.05021575838327408, + "learning_rate": 9.736735520453625e-05, + "loss": 0.4224, + "step": 2404 + }, + { + "epoch": 0.19483149708360337, + "grad_norm": 0.04044508561491966, + "learning_rate": 9.740785743215877e-05, + "loss": 0.3635, + "step": 2405 + }, + { + "epoch": 0.19491250810110175, + "grad_norm": 0.054432496428489685, + 
"learning_rate": 9.744835965978129e-05, + "loss": 0.4186, + "step": 2406 + }, + { + "epoch": 0.19499351911860013, + "grad_norm": 0.044115956872701645, + "learning_rate": 9.748886188740381e-05, + "loss": 0.3876, + "step": 2407 + }, + { + "epoch": 0.19507453013609852, + "grad_norm": 0.04487552493810654, + "learning_rate": 9.752936411502633e-05, + "loss": 0.4089, + "step": 2408 + }, + { + "epoch": 0.1951555411535969, + "grad_norm": 0.04776868224143982, + "learning_rate": 9.756986634264885e-05, + "loss": 0.3757, + "step": 2409 + }, + { + "epoch": 0.19523655217109526, + "grad_norm": 0.056760191917419434, + "learning_rate": 9.761036857027137e-05, + "loss": 0.3678, + "step": 2410 + }, + { + "epoch": 0.19531756318859364, + "grad_norm": 0.06094193831086159, + "learning_rate": 9.765087079789389e-05, + "loss": 0.3973, + "step": 2411 + }, + { + "epoch": 0.19539857420609202, + "grad_norm": 0.05493546649813652, + "learning_rate": 9.769137302551641e-05, + "loss": 0.3608, + "step": 2412 + }, + { + "epoch": 0.1954795852235904, + "grad_norm": 0.04134117811918259, + "learning_rate": 9.773187525313893e-05, + "loss": 0.3641, + "step": 2413 + }, + { + "epoch": 0.1955605962410888, + "grad_norm": 0.05286389961838722, + "learning_rate": 9.777237748076145e-05, + "loss": 0.3864, + "step": 2414 + }, + { + "epoch": 0.19564160725858717, + "grad_norm": 0.055073704570531845, + "learning_rate": 9.781287970838397e-05, + "loss": 0.4151, + "step": 2415 + }, + { + "epoch": 0.19572261827608556, + "grad_norm": 0.07006943225860596, + "learning_rate": 9.785338193600649e-05, + "loss": 0.3754, + "step": 2416 + }, + { + "epoch": 0.19580362929358391, + "grad_norm": 0.05164717510342598, + "learning_rate": 9.7893884163629e-05, + "loss": 0.4127, + "step": 2417 + }, + { + "epoch": 0.1958846403110823, + "grad_norm": 0.051379360258579254, + "learning_rate": 9.793438639125152e-05, + "loss": 0.405, + "step": 2418 + }, + { + "epoch": 0.19596565132858068, + "grad_norm": 0.056827612221241, + "learning_rate": 9.797488861887403e-05, + "loss": 0.3474, + "step": 2419 + }, + { + "epoch": 0.19604666234607906, + "grad_norm": 0.0616968534886837, + "learning_rate": 9.801539084649655e-05, + "loss": 0.3876, + "step": 2420 + }, + { + "epoch": 0.19612767336357745, + "grad_norm": 0.0510423481464386, + "learning_rate": 9.805589307411907e-05, + "loss": 0.3849, + "step": 2421 + }, + { + "epoch": 0.19620868438107583, + "grad_norm": 0.05379074066877365, + "learning_rate": 9.809639530174161e-05, + "loss": 0.416, + "step": 2422 + }, + { + "epoch": 0.19628969539857422, + "grad_norm": 0.04857899248600006, + "learning_rate": 9.813689752936413e-05, + "loss": 0.4007, + "step": 2423 + }, + { + "epoch": 0.1963707064160726, + "grad_norm": 0.04664013907313347, + "learning_rate": 9.817739975698665e-05, + "loss": 0.3865, + "step": 2424 + }, + { + "epoch": 0.19645171743357095, + "grad_norm": 0.0516962967813015, + "learning_rate": 9.821790198460917e-05, + "loss": 0.4013, + "step": 2425 + }, + { + "epoch": 0.19653272845106934, + "grad_norm": 0.059693820774555206, + "learning_rate": 9.825840421223169e-05, + "loss": 0.3976, + "step": 2426 + }, + { + "epoch": 0.19661373946856772, + "grad_norm": 0.05525093898177147, + "learning_rate": 9.82989064398542e-05, + "loss": 0.3586, + "step": 2427 + }, + { + "epoch": 0.1966947504860661, + "grad_norm": 0.07122207432985306, + "learning_rate": 9.833940866747671e-05, + "loss": 0.374, + "step": 2428 + }, + { + "epoch": 0.1967757615035645, + "grad_norm": 0.03935431316494942, + "learning_rate": 9.837991089509923e-05, + "loss": 0.3822, + "step": 
2429 + }, + { + "epoch": 0.19685677252106287, + "grad_norm": 0.05369625613093376, + "learning_rate": 9.842041312272175e-05, + "loss": 0.4008, + "step": 2430 + }, + { + "epoch": 0.19693778353856126, + "grad_norm": 0.05163230001926422, + "learning_rate": 9.846091535034427e-05, + "loss": 0.4191, + "step": 2431 + }, + { + "epoch": 0.1970187945560596, + "grad_norm": 0.0478132963180542, + "learning_rate": 9.850141757796679e-05, + "loss": 0.397, + "step": 2432 + }, + { + "epoch": 0.197099805573558, + "grad_norm": 0.04648119583725929, + "learning_rate": 9.854191980558931e-05, + "loss": 0.364, + "step": 2433 + }, + { + "epoch": 0.19718081659105638, + "grad_norm": 0.04318595305085182, + "learning_rate": 9.858242203321183e-05, + "loss": 0.3196, + "step": 2434 + }, + { + "epoch": 0.19726182760855476, + "grad_norm": 0.049725234508514404, + "learning_rate": 9.862292426083435e-05, + "loss": 0.4059, + "step": 2435 + }, + { + "epoch": 0.19734283862605315, + "grad_norm": 0.059837911278009415, + "learning_rate": 9.866342648845687e-05, + "loss": 0.3796, + "step": 2436 + }, + { + "epoch": 0.19742384964355153, + "grad_norm": 0.05953565239906311, + "learning_rate": 9.870392871607939e-05, + "loss": 0.4661, + "step": 2437 + }, + { + "epoch": 0.1975048606610499, + "grad_norm": 0.06108205392956734, + "learning_rate": 9.874443094370191e-05, + "loss": 0.3892, + "step": 2438 + }, + { + "epoch": 0.19758587167854827, + "grad_norm": 0.04766416177153587, + "learning_rate": 9.878493317132443e-05, + "loss": 0.3453, + "step": 2439 + }, + { + "epoch": 0.19766688269604665, + "grad_norm": 0.04841725900769234, + "learning_rate": 9.882543539894695e-05, + "loss": 0.4404, + "step": 2440 + }, + { + "epoch": 0.19774789371354504, + "grad_norm": 0.05051546171307564, + "learning_rate": 9.886593762656947e-05, + "loss": 0.3734, + "step": 2441 + }, + { + "epoch": 0.19782890473104342, + "grad_norm": 0.06493677943944931, + "learning_rate": 9.890643985419199e-05, + "loss": 0.4166, + "step": 2442 + }, + { + "epoch": 0.1979099157485418, + "grad_norm": 0.044552478939294815, + "learning_rate": 9.894694208181451e-05, + "loss": 0.3468, + "step": 2443 + }, + { + "epoch": 0.1979909267660402, + "grad_norm": 0.04488362744450569, + "learning_rate": 9.898744430943703e-05, + "loss": 0.3645, + "step": 2444 + }, + { + "epoch": 0.19807193778353857, + "grad_norm": 0.04239024221897125, + "learning_rate": 9.902794653705955e-05, + "loss": 0.3579, + "step": 2445 + }, + { + "epoch": 0.19815294880103695, + "grad_norm": 0.05800511687994003, + "learning_rate": 9.906844876468207e-05, + "loss": 0.3979, + "step": 2446 + }, + { + "epoch": 0.1982339598185353, + "grad_norm": 0.05241608992218971, + "learning_rate": 9.910895099230459e-05, + "loss": 0.4038, + "step": 2447 + }, + { + "epoch": 0.1983149708360337, + "grad_norm": 0.04767435044050217, + "learning_rate": 9.91494532199271e-05, + "loss": 0.3741, + "step": 2448 + }, + { + "epoch": 0.19839598185353208, + "grad_norm": 0.05435354635119438, + "learning_rate": 9.918995544754962e-05, + "loss": 0.4271, + "step": 2449 + }, + { + "epoch": 0.19847699287103046, + "grad_norm": 0.04715365171432495, + "learning_rate": 9.923045767517214e-05, + "loss": 0.3608, + "step": 2450 + }, + { + "epoch": 0.19855800388852884, + "grad_norm": 0.05234677344560623, + "learning_rate": 9.927095990279466e-05, + "loss": 0.369, + "step": 2451 + }, + { + "epoch": 0.19863901490602723, + "grad_norm": 0.050974685698747635, + "learning_rate": 9.931146213041717e-05, + "loss": 0.3251, + "step": 2452 + }, + { + "epoch": 0.1987200259235256, + "grad_norm": 
0.059354811906814575, + "learning_rate": 9.935196435803969e-05, + "loss": 0.4558, + "step": 2453 + }, + { + "epoch": 0.19880103694102397, + "grad_norm": 0.057084355503320694, + "learning_rate": 9.939246658566221e-05, + "loss": 0.4074, + "step": 2454 + }, + { + "epoch": 0.19888204795852235, + "grad_norm": 0.05420767143368721, + "learning_rate": 9.943296881328473e-05, + "loss": 0.3669, + "step": 2455 + }, + { + "epoch": 0.19896305897602073, + "grad_norm": 0.04995109885931015, + "learning_rate": 9.947347104090725e-05, + "loss": 0.3623, + "step": 2456 + }, + { + "epoch": 0.19904406999351912, + "grad_norm": 0.07834792882204056, + "learning_rate": 9.951397326852977e-05, + "loss": 0.3814, + "step": 2457 + }, + { + "epoch": 0.1991250810110175, + "grad_norm": 0.04483048617839813, + "learning_rate": 9.955447549615229e-05, + "loss": 0.3305, + "step": 2458 + }, + { + "epoch": 0.19920609202851589, + "grad_norm": 0.05417317524552345, + "learning_rate": 9.959497772377481e-05, + "loss": 0.3865, + "step": 2459 + }, + { + "epoch": 0.19928710304601427, + "grad_norm": 0.041050031781196594, + "learning_rate": 9.963547995139734e-05, + "loss": 0.3489, + "step": 2460 + }, + { + "epoch": 0.19936811406351262, + "grad_norm": 0.03944450989365578, + "learning_rate": 9.967598217901986e-05, + "loss": 0.3544, + "step": 2461 + }, + { + "epoch": 0.199449125081011, + "grad_norm": 0.05919930338859558, + "learning_rate": 9.971648440664238e-05, + "loss": 0.3941, + "step": 2462 + }, + { + "epoch": 0.1995301360985094, + "grad_norm": 0.04712875559926033, + "learning_rate": 9.975698663426489e-05, + "loss": 0.4015, + "step": 2463 + }, + { + "epoch": 0.19961114711600778, + "grad_norm": 0.04720218479633331, + "learning_rate": 9.97974888618874e-05, + "loss": 0.3927, + "step": 2464 + }, + { + "epoch": 0.19969215813350616, + "grad_norm": 0.0454418770968914, + "learning_rate": 9.983799108950993e-05, + "loss": 0.3821, + "step": 2465 + }, + { + "epoch": 0.19977316915100454, + "grad_norm": 0.03888606280088425, + "learning_rate": 9.987849331713245e-05, + "loss": 0.3211, + "step": 2466 + }, + { + "epoch": 0.19985418016850293, + "grad_norm": 0.05296216905117035, + "learning_rate": 9.991899554475497e-05, + "loss": 0.4288, + "step": 2467 + }, + { + "epoch": 0.1999351911860013, + "grad_norm": 0.04445220157504082, + "learning_rate": 9.995949777237749e-05, + "loss": 0.3433, + "step": 2468 + }, + { + "epoch": 0.20001620220349967, + "grad_norm": 0.04101482406258583, + "learning_rate": 0.0001, + "loss": 0.3738, + "step": 2469 + }, + { + "epoch": 0.20009721322099805, + "grad_norm": 0.05536132678389549, + "learning_rate": 0.00010004050222762254, + "loss": 0.416, + "step": 2470 + }, + { + "epoch": 0.20017822423849643, + "grad_norm": 0.05537095293402672, + "learning_rate": 0.00010008100445524504, + "loss": 0.4103, + "step": 2471 + }, + { + "epoch": 0.20025923525599482, + "grad_norm": 0.04526954144239426, + "learning_rate": 0.00010012150668286758, + "loss": 0.3716, + "step": 2472 + }, + { + "epoch": 0.2003402462734932, + "grad_norm": 0.04359053820371628, + "learning_rate": 0.00010016200891049008, + "loss": 0.3563, + "step": 2473 + }, + { + "epoch": 0.20042125729099158, + "grad_norm": 0.04995141550898552, + "learning_rate": 0.0001002025111381126, + "loss": 0.3854, + "step": 2474 + }, + { + "epoch": 0.20050226830848997, + "grad_norm": 0.04640784114599228, + "learning_rate": 0.00010024301336573512, + "loss": 0.4231, + "step": 2475 + }, + { + "epoch": 0.20058327932598832, + "grad_norm": 0.045250989496707916, + "learning_rate": 0.00010028351559335764, + "loss": 
0.4278, + "step": 2476 + }, + { + "epoch": 0.2006642903434867, + "grad_norm": 0.053471554070711136, + "learning_rate": 0.00010032401782098015, + "loss": 0.3881, + "step": 2477 + }, + { + "epoch": 0.2007453013609851, + "grad_norm": 0.054227881133556366, + "learning_rate": 0.00010036452004860268, + "loss": 0.4368, + "step": 2478 + }, + { + "epoch": 0.20082631237848347, + "grad_norm": 0.05022569000720978, + "learning_rate": 0.00010040502227622519, + "loss": 0.372, + "step": 2479 + }, + { + "epoch": 0.20090732339598186, + "grad_norm": 0.06383142620325089, + "learning_rate": 0.00010044552450384772, + "loss": 0.4035, + "step": 2480 + }, + { + "epoch": 0.20098833441348024, + "grad_norm": 0.0433533675968647, + "learning_rate": 0.00010048602673147023, + "loss": 0.3546, + "step": 2481 + }, + { + "epoch": 0.20106934543097862, + "grad_norm": 0.05223929136991501, + "learning_rate": 0.00010052652895909276, + "loss": 0.4119, + "step": 2482 + }, + { + "epoch": 0.20115035644847698, + "grad_norm": 0.04103383049368858, + "learning_rate": 0.00010056703118671527, + "loss": 0.3788, + "step": 2483 + }, + { + "epoch": 0.20123136746597536, + "grad_norm": 0.05163447558879852, + "learning_rate": 0.0001006075334143378, + "loss": 0.4652, + "step": 2484 + }, + { + "epoch": 0.20131237848347375, + "grad_norm": 0.05344129726290703, + "learning_rate": 0.0001006480356419603, + "loss": 0.4269, + "step": 2485 + }, + { + "epoch": 0.20139338950097213, + "grad_norm": 0.056302350014448166, + "learning_rate": 0.00010068853786958284, + "loss": 0.3922, + "step": 2486 + }, + { + "epoch": 0.20147440051847051, + "grad_norm": 0.05290120095014572, + "learning_rate": 0.00010072904009720535, + "loss": 0.3809, + "step": 2487 + }, + { + "epoch": 0.2015554115359689, + "grad_norm": 0.038659267127513885, + "learning_rate": 0.00010076954232482786, + "loss": 0.4318, + "step": 2488 + }, + { + "epoch": 0.20163642255346728, + "grad_norm": 0.06425322592258453, + "learning_rate": 0.0001008100445524504, + "loss": 0.3526, + "step": 2489 + }, + { + "epoch": 0.20171743357096567, + "grad_norm": 0.04753356799483299, + "learning_rate": 0.0001008505467800729, + "loss": 0.3346, + "step": 2490 + }, + { + "epoch": 0.20179844458846402, + "grad_norm": 0.043934766203165054, + "learning_rate": 0.00010089104900769544, + "loss": 0.371, + "step": 2491 + }, + { + "epoch": 0.2018794556059624, + "grad_norm": 0.049425188452005386, + "learning_rate": 0.00010093155123531794, + "loss": 0.3821, + "step": 2492 + }, + { + "epoch": 0.2019604666234608, + "grad_norm": 0.059526655822992325, + "learning_rate": 0.00010097205346294048, + "loss": 0.3777, + "step": 2493 + }, + { + "epoch": 0.20204147764095917, + "grad_norm": 0.054595060646533966, + "learning_rate": 0.00010101255569056298, + "loss": 0.45, + "step": 2494 + }, + { + "epoch": 0.20212248865845756, + "grad_norm": 0.04631621390581131, + "learning_rate": 0.00010105305791818552, + "loss": 0.3896, + "step": 2495 + }, + { + "epoch": 0.20220349967595594, + "grad_norm": 0.051338374614715576, + "learning_rate": 0.00010109356014580802, + "loss": 0.3814, + "step": 2496 + }, + { + "epoch": 0.20228451069345432, + "grad_norm": 0.046590592712163925, + "learning_rate": 0.00010113406237343056, + "loss": 0.4024, + "step": 2497 + }, + { + "epoch": 0.20236552171095268, + "grad_norm": 0.04861883446574211, + "learning_rate": 0.00010117456460105306, + "loss": 0.3794, + "step": 2498 + }, + { + "epoch": 0.20244653272845106, + "grad_norm": 0.045799244195222855, + "learning_rate": 0.00010121506682867558, + "loss": 0.3595, + "step": 2499 + }, + { + 
"epoch": 0.20252754374594945, + "grad_norm": 0.0462491437792778, + "learning_rate": 0.00010125556905629809, + "loss": 0.4529, + "step": 2500 + }, + { + "epoch": 0.20260855476344783, + "grad_norm": 0.05045001208782196, + "learning_rate": 0.00010129607128392062, + "loss": 0.378, + "step": 2501 + }, + { + "epoch": 0.2026895657809462, + "grad_norm": 0.04670971632003784, + "learning_rate": 0.00010133657351154313, + "loss": 0.3551, + "step": 2502 + }, + { + "epoch": 0.2027705767984446, + "grad_norm": 0.04216444492340088, + "learning_rate": 0.00010137707573916566, + "loss": 0.4095, + "step": 2503 + }, + { + "epoch": 0.20285158781594298, + "grad_norm": 0.0527469776570797, + "learning_rate": 0.00010141757796678817, + "loss": 0.3529, + "step": 2504 + }, + { + "epoch": 0.20293259883344134, + "grad_norm": 0.04359910637140274, + "learning_rate": 0.0001014580801944107, + "loss": 0.3884, + "step": 2505 + }, + { + "epoch": 0.20301360985093972, + "grad_norm": 0.05967698618769646, + "learning_rate": 0.0001014985824220332, + "loss": 0.4167, + "step": 2506 + }, + { + "epoch": 0.2030946208684381, + "grad_norm": 0.048827651888132095, + "learning_rate": 0.00010153908464965574, + "loss": 0.4089, + "step": 2507 + }, + { + "epoch": 0.2031756318859365, + "grad_norm": 0.04760623350739479, + "learning_rate": 0.00010157958687727827, + "loss": 0.3515, + "step": 2508 + }, + { + "epoch": 0.20325664290343487, + "grad_norm": 0.038198892027139664, + "learning_rate": 0.00010162008910490078, + "loss": 0.3312, + "step": 2509 + }, + { + "epoch": 0.20333765392093325, + "grad_norm": 0.04905112832784653, + "learning_rate": 0.0001016605913325233, + "loss": 0.3888, + "step": 2510 + }, + { + "epoch": 0.20341866493843164, + "grad_norm": 0.04764381796121597, + "learning_rate": 0.0001017010935601458, + "loss": 0.3204, + "step": 2511 + }, + { + "epoch": 0.20349967595593, + "grad_norm": 0.045229025185108185, + "learning_rate": 0.00010174159578776834, + "loss": 0.382, + "step": 2512 + }, + { + "epoch": 0.20358068697342838, + "grad_norm": 0.053234439343214035, + "learning_rate": 0.00010178209801539084, + "loss": 0.3822, + "step": 2513 + }, + { + "epoch": 0.20366169799092676, + "grad_norm": 0.053509242832660675, + "learning_rate": 0.00010182260024301338, + "loss": 0.4125, + "step": 2514 + }, + { + "epoch": 0.20374270900842514, + "grad_norm": 0.04826676845550537, + "learning_rate": 0.00010186310247063588, + "loss": 0.3603, + "step": 2515 + }, + { + "epoch": 0.20382372002592353, + "grad_norm": 0.04815275967121124, + "learning_rate": 0.00010190360469825842, + "loss": 0.3774, + "step": 2516 + }, + { + "epoch": 0.2039047310434219, + "grad_norm": 0.04670335352420807, + "learning_rate": 0.00010194410692588092, + "loss": 0.4279, + "step": 2517 + }, + { + "epoch": 0.2039857420609203, + "grad_norm": 0.050155483186244965, + "learning_rate": 0.00010198460915350346, + "loss": 0.3811, + "step": 2518 + }, + { + "epoch": 0.20406675307841868, + "grad_norm": 0.03997861593961716, + "learning_rate": 0.00010202511138112596, + "loss": 0.3826, + "step": 2519 + }, + { + "epoch": 0.20414776409591703, + "grad_norm": 0.04606309160590172, + "learning_rate": 0.0001020656136087485, + "loss": 0.4141, + "step": 2520 + }, + { + "epoch": 0.20422877511341542, + "grad_norm": 0.048883430659770966, + "learning_rate": 0.000102106115836371, + "loss": 0.3719, + "step": 2521 + }, + { + "epoch": 0.2043097861309138, + "grad_norm": 0.04640788212418556, + "learning_rate": 0.00010214661806399352, + "loss": 0.4113, + "step": 2522 + }, + { + "epoch": 0.20439079714841218, + "grad_norm": 
0.05309394747018814, + "learning_rate": 0.00010218712029161604, + "loss": 0.4066, + "step": 2523 + }, + { + "epoch": 0.20447180816591057, + "grad_norm": 0.04602646827697754, + "learning_rate": 0.00010222762251923856, + "loss": 0.3612, + "step": 2524 + }, + { + "epoch": 0.20455281918340895, + "grad_norm": 0.04883800819516182, + "learning_rate": 0.00010226812474686107, + "loss": 0.4093, + "step": 2525 + }, + { + "epoch": 0.20463383020090734, + "grad_norm": 0.044395849108695984, + "learning_rate": 0.0001023086269744836, + "loss": 0.3983, + "step": 2526 + }, + { + "epoch": 0.2047148412184057, + "grad_norm": 0.050485894083976746, + "learning_rate": 0.00010234912920210613, + "loss": 0.4242, + "step": 2527 + }, + { + "epoch": 0.20479585223590407, + "grad_norm": 0.0483909510076046, + "learning_rate": 0.00010238963142972864, + "loss": 0.3771, + "step": 2528 + }, + { + "epoch": 0.20487686325340246, + "grad_norm": 0.04684009775519371, + "learning_rate": 0.00010243013365735117, + "loss": 0.3917, + "step": 2529 + }, + { + "epoch": 0.20495787427090084, + "grad_norm": 0.0491405613720417, + "learning_rate": 0.00010247063588497368, + "loss": 0.3377, + "step": 2530 + }, + { + "epoch": 0.20503888528839923, + "grad_norm": 0.04853895306587219, + "learning_rate": 0.00010251113811259621, + "loss": 0.4153, + "step": 2531 + }, + { + "epoch": 0.2051198963058976, + "grad_norm": 0.05164919048547745, + "learning_rate": 0.00010255164034021872, + "loss": 0.3867, + "step": 2532 + }, + { + "epoch": 0.205200907323396, + "grad_norm": 0.04687311127781868, + "learning_rate": 0.00010259214256784124, + "loss": 0.3784, + "step": 2533 + }, + { + "epoch": 0.20528191834089435, + "grad_norm": 0.034922413527965546, + "learning_rate": 0.00010263264479546376, + "loss": 0.3459, + "step": 2534 + }, + { + "epoch": 0.20536292935839273, + "grad_norm": 0.047675810754299164, + "learning_rate": 0.00010267314702308628, + "loss": 0.3675, + "step": 2535 + }, + { + "epoch": 0.20544394037589112, + "grad_norm": 0.05016015097498894, + "learning_rate": 0.00010271364925070878, + "loss": 0.3978, + "step": 2536 + }, + { + "epoch": 0.2055249513933895, + "grad_norm": 0.03888197988271713, + "learning_rate": 0.00010275415147833132, + "loss": 0.3539, + "step": 2537 + }, + { + "epoch": 0.20560596241088788, + "grad_norm": 0.06236858665943146, + "learning_rate": 0.00010279465370595382, + "loss": 0.4093, + "step": 2538 + }, + { + "epoch": 0.20568697342838627, + "grad_norm": 0.038991302251815796, + "learning_rate": 0.00010283515593357635, + "loss": 0.3799, + "step": 2539 + }, + { + "epoch": 0.20576798444588465, + "grad_norm": 0.049997709691524506, + "learning_rate": 0.00010287565816119886, + "loss": 0.3985, + "step": 2540 + }, + { + "epoch": 0.20584899546338303, + "grad_norm": 0.05341407284140587, + "learning_rate": 0.0001029161603888214, + "loss": 0.3524, + "step": 2541 + }, + { + "epoch": 0.2059300064808814, + "grad_norm": 0.03703078255057335, + "learning_rate": 0.0001029566626164439, + "loss": 0.4079, + "step": 2542 + }, + { + "epoch": 0.20601101749837977, + "grad_norm": 0.06276031583547592, + "learning_rate": 0.00010299716484406643, + "loss": 0.3747, + "step": 2543 + }, + { + "epoch": 0.20609202851587816, + "grad_norm": 0.04427820071578026, + "learning_rate": 0.00010303766707168894, + "loss": 0.3774, + "step": 2544 + }, + { + "epoch": 0.20617303953337654, + "grad_norm": 0.051968708634376526, + "learning_rate": 0.00010307816929931147, + "loss": 0.3683, + "step": 2545 + }, + { + "epoch": 0.20625405055087492, + "grad_norm": 0.0557091124355793, + "learning_rate": 
0.00010311867152693399, + "loss": 0.3791, + "step": 2546 + }, + { + "epoch": 0.2063350615683733, + "grad_norm": 0.046910978853702545, + "learning_rate": 0.0001031591737545565, + "loss": 0.4094, + "step": 2547 + }, + { + "epoch": 0.2064160725858717, + "grad_norm": 0.05115199834108353, + "learning_rate": 0.00010319967598217903, + "loss": 0.4132, + "step": 2548 + }, + { + "epoch": 0.20649708360337005, + "grad_norm": 0.049847543239593506, + "learning_rate": 0.00010324017820980154, + "loss": 0.3635, + "step": 2549 + }, + { + "epoch": 0.20657809462086843, + "grad_norm": 0.055366579443216324, + "learning_rate": 0.00010328068043742407, + "loss": 0.4279, + "step": 2550 + }, + { + "epoch": 0.2066591056383668, + "grad_norm": 0.05920335277915001, + "learning_rate": 0.00010332118266504658, + "loss": 0.4094, + "step": 2551 + }, + { + "epoch": 0.2067401166558652, + "grad_norm": 0.04443211480975151, + "learning_rate": 0.00010336168489266911, + "loss": 0.3963, + "step": 2552 + }, + { + "epoch": 0.20682112767336358, + "grad_norm": 0.046224404126405716, + "learning_rate": 0.00010340218712029162, + "loss": 0.4038, + "step": 2553 + }, + { + "epoch": 0.20690213869086196, + "grad_norm": 0.05442473664879799, + "learning_rate": 0.00010344268934791415, + "loss": 0.4074, + "step": 2554 + }, + { + "epoch": 0.20698314970836035, + "grad_norm": 0.04899989441037178, + "learning_rate": 0.00010348319157553666, + "loss": 0.3474, + "step": 2555 + }, + { + "epoch": 0.2070641607258587, + "grad_norm": 0.054443079978227615, + "learning_rate": 0.00010352369380315919, + "loss": 0.4292, + "step": 2556 + }, + { + "epoch": 0.2071451717433571, + "grad_norm": 0.054214950650930405, + "learning_rate": 0.0001035641960307817, + "loss": 0.3893, + "step": 2557 + }, + { + "epoch": 0.20722618276085547, + "grad_norm": 0.048906613141298294, + "learning_rate": 0.00010360469825840421, + "loss": 0.4234, + "step": 2558 + }, + { + "epoch": 0.20730719377835385, + "grad_norm": 0.05881531909108162, + "learning_rate": 0.00010364520048602673, + "loss": 0.4157, + "step": 2559 + }, + { + "epoch": 0.20738820479585224, + "grad_norm": 0.05360455438494682, + "learning_rate": 0.00010368570271364925, + "loss": 0.4134, + "step": 2560 + }, + { + "epoch": 0.20746921581335062, + "grad_norm": 0.046132009476423264, + "learning_rate": 0.00010372620494127176, + "loss": 0.3317, + "step": 2561 + }, + { + "epoch": 0.207550226830849, + "grad_norm": 0.05172186344861984, + "learning_rate": 0.0001037667071688943, + "loss": 0.4097, + "step": 2562 + }, + { + "epoch": 0.2076312378483474, + "grad_norm": 0.06279409676790237, + "learning_rate": 0.0001038072093965168, + "loss": 0.4337, + "step": 2563 + }, + { + "epoch": 0.20771224886584574, + "grad_norm": 0.05410788580775261, + "learning_rate": 0.00010384771162413933, + "loss": 0.351, + "step": 2564 + }, + { + "epoch": 0.20779325988334413, + "grad_norm": 0.04601385444402695, + "learning_rate": 0.00010388821385176187, + "loss": 0.428, + "step": 2565 + }, + { + "epoch": 0.2078742709008425, + "grad_norm": 0.04627078399062157, + "learning_rate": 0.00010392871607938437, + "loss": 0.3752, + "step": 2566 + }, + { + "epoch": 0.2079552819183409, + "grad_norm": 0.06033918634057045, + "learning_rate": 0.0001039692183070069, + "loss": 0.3628, + "step": 2567 + }, + { + "epoch": 0.20803629293583928, + "grad_norm": 0.046143606305122375, + "learning_rate": 0.00010400972053462941, + "loss": 0.3764, + "step": 2568 + }, + { + "epoch": 0.20811730395333766, + "grad_norm": 0.04848916456103325, + "learning_rate": 0.00010405022276225193, + "loss": 0.3636, + 
"step": 2569 + }, + { + "epoch": 0.20819831497083605, + "grad_norm": 0.04305524379014969, + "learning_rate": 0.00010409072498987445, + "loss": 0.3783, + "step": 2570 + }, + { + "epoch": 0.2082793259883344, + "grad_norm": 0.05727194994688034, + "learning_rate": 0.00010413122721749697, + "loss": 0.4555, + "step": 2571 + }, + { + "epoch": 0.20836033700583279, + "grad_norm": 0.05420166626572609, + "learning_rate": 0.00010417172944511948, + "loss": 0.3874, + "step": 2572 + }, + { + "epoch": 0.20844134802333117, + "grad_norm": 0.05022074654698372, + "learning_rate": 0.00010421223167274201, + "loss": 0.352, + "step": 2573 + }, + { + "epoch": 0.20852235904082955, + "grad_norm": 0.0534265972673893, + "learning_rate": 0.00010425273390036452, + "loss": 0.4138, + "step": 2574 + }, + { + "epoch": 0.20860337005832794, + "grad_norm": 0.03776722401380539, + "learning_rate": 0.00010429323612798705, + "loss": 0.3459, + "step": 2575 + }, + { + "epoch": 0.20868438107582632, + "grad_norm": 0.04339131712913513, + "learning_rate": 0.00010433373835560956, + "loss": 0.4046, + "step": 2576 + }, + { + "epoch": 0.2087653920933247, + "grad_norm": 0.050997912883758545, + "learning_rate": 0.00010437424058323209, + "loss": 0.3805, + "step": 2577 + }, + { + "epoch": 0.20884640311082306, + "grad_norm": 0.04996298626065254, + "learning_rate": 0.0001044147428108546, + "loss": 0.4127, + "step": 2578 + }, + { + "epoch": 0.20892741412832144, + "grad_norm": 0.06990036368370056, + "learning_rate": 0.00010445524503847713, + "loss": 0.426, + "step": 2579 + }, + { + "epoch": 0.20900842514581983, + "grad_norm": 0.053124912083148956, + "learning_rate": 0.00010449574726609963, + "loss": 0.4036, + "step": 2580 + }, + { + "epoch": 0.2090894361633182, + "grad_norm": 0.047472499310970306, + "learning_rate": 0.00010453624949372217, + "loss": 0.4067, + "step": 2581 + }, + { + "epoch": 0.2091704471808166, + "grad_norm": 0.05586513504385948, + "learning_rate": 0.00010457675172134467, + "loss": 0.3958, + "step": 2582 + }, + { + "epoch": 0.20925145819831498, + "grad_norm": 0.048053573817014694, + "learning_rate": 0.00010461725394896719, + "loss": 0.3987, + "step": 2583 + }, + { + "epoch": 0.20933246921581336, + "grad_norm": 0.043412040919065475, + "learning_rate": 0.00010465775617658973, + "loss": 0.3462, + "step": 2584 + }, + { + "epoch": 0.20941348023331174, + "grad_norm": 0.05099153518676758, + "learning_rate": 0.00010469825840421223, + "loss": 0.4066, + "step": 2585 + }, + { + "epoch": 0.2094944912508101, + "grad_norm": 0.04131011292338371, + "learning_rate": 0.00010473876063183477, + "loss": 0.4256, + "step": 2586 + }, + { + "epoch": 0.20957550226830848, + "grad_norm": 0.04058512672781944, + "learning_rate": 0.00010477926285945727, + "loss": 0.3686, + "step": 2587 + }, + { + "epoch": 0.20965651328580687, + "grad_norm": 0.04154278337955475, + "learning_rate": 0.0001048197650870798, + "loss": 0.3233, + "step": 2588 + }, + { + "epoch": 0.20973752430330525, + "grad_norm": 0.07527300715446472, + "learning_rate": 0.00010486026731470231, + "loss": 0.3186, + "step": 2589 + }, + { + "epoch": 0.20981853532080363, + "grad_norm": 0.042909786105155945, + "learning_rate": 0.00010490076954232484, + "loss": 0.4065, + "step": 2590 + }, + { + "epoch": 0.20989954633830202, + "grad_norm": 0.04452323168516159, + "learning_rate": 0.00010494127176994735, + "loss": 0.4012, + "step": 2591 + }, + { + "epoch": 0.2099805573558004, + "grad_norm": 0.06063823029398918, + "learning_rate": 0.00010498177399756988, + "loss": 0.4393, + "step": 2592 + }, + { + "epoch": 
0.21006156837329876, + "grad_norm": 0.044901590794324875, + "learning_rate": 0.00010502227622519239, + "loss": 0.3851, + "step": 2593 + }, + { + "epoch": 0.21014257939079714, + "grad_norm": 0.04395297169685364, + "learning_rate": 0.00010506277845281491, + "loss": 0.392, + "step": 2594 + }, + { + "epoch": 0.21022359040829552, + "grad_norm": 0.04686028137803078, + "learning_rate": 0.00010510328068043742, + "loss": 0.351, + "step": 2595 + }, + { + "epoch": 0.2103046014257939, + "grad_norm": 0.05416541174054146, + "learning_rate": 0.00010514378290805995, + "loss": 0.3642, + "step": 2596 + }, + { + "epoch": 0.2103856124432923, + "grad_norm": 0.053576841950416565, + "learning_rate": 0.00010518428513568245, + "loss": 0.412, + "step": 2597 + }, + { + "epoch": 0.21046662346079067, + "grad_norm": 0.05176137387752533, + "learning_rate": 0.00010522478736330499, + "loss": 0.3755, + "step": 2598 + }, + { + "epoch": 0.21054763447828906, + "grad_norm": 0.04937148466706276, + "learning_rate": 0.0001052652895909275, + "loss": 0.3539, + "step": 2599 + }, + { + "epoch": 0.21062864549578741, + "grad_norm": 0.044205427169799805, + "learning_rate": 0.00010530579181855003, + "loss": 0.3502, + "step": 2600 + }, + { + "epoch": 0.2107096565132858, + "grad_norm": 0.04475211724638939, + "learning_rate": 0.00010534629404617256, + "loss": 0.3376, + "step": 2601 + }, + { + "epoch": 0.21079066753078418, + "grad_norm": 0.05387239158153534, + "learning_rate": 0.00010538679627379507, + "loss": 0.3748, + "step": 2602 + }, + { + "epoch": 0.21087167854828257, + "grad_norm": 0.04502122476696968, + "learning_rate": 0.00010542729850141759, + "loss": 0.3698, + "step": 2603 + }, + { + "epoch": 0.21095268956578095, + "grad_norm": 0.04435496777296066, + "learning_rate": 0.0001054678007290401, + "loss": 0.3881, + "step": 2604 + }, + { + "epoch": 0.21103370058327933, + "grad_norm": 0.054403528571128845, + "learning_rate": 0.00010550830295666263, + "loss": 0.3588, + "step": 2605 + }, + { + "epoch": 0.21111471160077772, + "grad_norm": 0.047774478793144226, + "learning_rate": 0.00010554880518428513, + "loss": 0.3836, + "step": 2606 + }, + { + "epoch": 0.2111957226182761, + "grad_norm": 0.04820839688181877, + "learning_rate": 0.00010558930741190766, + "loss": 0.3701, + "step": 2607 + }, + { + "epoch": 0.21127673363577446, + "grad_norm": 0.05031566321849823, + "learning_rate": 0.00010562980963953017, + "loss": 0.3585, + "step": 2608 + }, + { + "epoch": 0.21135774465327284, + "grad_norm": 0.06141781806945801, + "learning_rate": 0.0001056703118671527, + "loss": 0.4669, + "step": 2609 + }, + { + "epoch": 0.21143875567077122, + "grad_norm": 0.056281477212905884, + "learning_rate": 0.00010571081409477521, + "loss": 0.4542, + "step": 2610 + }, + { + "epoch": 0.2115197666882696, + "grad_norm": 0.04627525433897972, + "learning_rate": 0.00010575131632239774, + "loss": 0.4101, + "step": 2611 + }, + { + "epoch": 0.211600777705768, + "grad_norm": 0.05587709695100784, + "learning_rate": 0.00010579181855002025, + "loss": 0.4177, + "step": 2612 + }, + { + "epoch": 0.21168178872326637, + "grad_norm": 0.04824918136000633, + "learning_rate": 0.00010583232077764278, + "loss": 0.3946, + "step": 2613 + }, + { + "epoch": 0.21176279974076476, + "grad_norm": 0.05462539941072464, + "learning_rate": 0.00010587282300526529, + "loss": 0.3867, + "step": 2614 + }, + { + "epoch": 0.2118438107582631, + "grad_norm": 0.049656134098768234, + "learning_rate": 0.00010591332523288782, + "loss": 0.3657, + "step": 2615 + }, + { + "epoch": 0.2119248217757615, + "grad_norm": 
0.048310454934835434, + "learning_rate": 0.00010595382746051033, + "loss": 0.4064, + "step": 2616 + }, + { + "epoch": 0.21200583279325988, + "grad_norm": 0.0601760558784008, + "learning_rate": 0.00010599432968813285, + "loss": 0.4552, + "step": 2617 + }, + { + "epoch": 0.21208684381075826, + "grad_norm": 0.042523082345724106, + "learning_rate": 0.00010603483191575537, + "loss": 0.3503, + "step": 2618 + }, + { + "epoch": 0.21216785482825665, + "grad_norm": 0.04929207265377045, + "learning_rate": 0.00010607533414337789, + "loss": 0.4063, + "step": 2619 + }, + { + "epoch": 0.21224886584575503, + "grad_norm": 0.04285252466797829, + "learning_rate": 0.00010611583637100042, + "loss": 0.3896, + "step": 2620 + }, + { + "epoch": 0.21232987686325341, + "grad_norm": 0.05070970952510834, + "learning_rate": 0.00010615633859862293, + "loss": 0.4165, + "step": 2621 + }, + { + "epoch": 0.21241088788075177, + "grad_norm": 0.05116109549999237, + "learning_rate": 0.00010619684082624546, + "loss": 0.3549, + "step": 2622 + }, + { + "epoch": 0.21249189889825015, + "grad_norm": 0.06008598208427429, + "learning_rate": 0.00010623734305386797, + "loss": 0.3841, + "step": 2623 + }, + { + "epoch": 0.21257290991574854, + "grad_norm": 0.04959864169359207, + "learning_rate": 0.0001062778452814905, + "loss": 0.4246, + "step": 2624 + }, + { + "epoch": 0.21265392093324692, + "grad_norm": 0.04739030823111534, + "learning_rate": 0.000106318347509113, + "loss": 0.3482, + "step": 2625 + }, + { + "epoch": 0.2127349319507453, + "grad_norm": 0.04815612733364105, + "learning_rate": 0.00010635884973673554, + "loss": 0.372, + "step": 2626 + }, + { + "epoch": 0.2128159429682437, + "grad_norm": 0.05172804743051529, + "learning_rate": 0.00010639935196435804, + "loss": 0.3737, + "step": 2627 + }, + { + "epoch": 0.21289695398574207, + "grad_norm": 0.05179368704557419, + "learning_rate": 0.00010643985419198056, + "loss": 0.4017, + "step": 2628 + }, + { + "epoch": 0.21297796500324043, + "grad_norm": 0.05360709875822067, + "learning_rate": 0.00010648035641960308, + "loss": 0.3869, + "step": 2629 + }, + { + "epoch": 0.2130589760207388, + "grad_norm": 0.04751502722501755, + "learning_rate": 0.0001065208586472256, + "loss": 0.4248, + "step": 2630 + }, + { + "epoch": 0.2131399870382372, + "grad_norm": 0.046408139169216156, + "learning_rate": 0.00010656136087484811, + "loss": 0.3668, + "step": 2631 + }, + { + "epoch": 0.21322099805573558, + "grad_norm": 0.05516528710722923, + "learning_rate": 0.00010660186310247064, + "loss": 0.4057, + "step": 2632 + }, + { + "epoch": 0.21330200907323396, + "grad_norm": 0.06695632636547089, + "learning_rate": 0.00010664236533009315, + "loss": 0.4126, + "step": 2633 + }, + { + "epoch": 0.21338302009073234, + "grad_norm": 0.04524846002459526, + "learning_rate": 0.00010668286755771568, + "loss": 0.4305, + "step": 2634 + }, + { + "epoch": 0.21346403110823073, + "grad_norm": 0.043575726449489594, + "learning_rate": 0.00010672336978533819, + "loss": 0.3572, + "step": 2635 + }, + { + "epoch": 0.2135450421257291, + "grad_norm": 0.04118606820702553, + "learning_rate": 0.00010676387201296072, + "loss": 0.337, + "step": 2636 + }, + { + "epoch": 0.21362605314322747, + "grad_norm": 0.03964829444885254, + "learning_rate": 0.00010680437424058323, + "loss": 0.3562, + "step": 2637 + }, + { + "epoch": 0.21370706416072585, + "grad_norm": 0.04576268047094345, + "learning_rate": 0.00010684487646820576, + "loss": 0.4024, + "step": 2638 + }, + { + "epoch": 0.21378807517822424, + "grad_norm": 0.04380327835679054, + "learning_rate": 
0.00010688537869582828, + "loss": 0.4258, + "step": 2639 + }, + { + "epoch": 0.21386908619572262, + "grad_norm": 0.04862361401319504, + "learning_rate": 0.0001069258809234508, + "loss": 0.3582, + "step": 2640 + }, + { + "epoch": 0.213950097213221, + "grad_norm": 0.04417659342288971, + "learning_rate": 0.00010696638315107332, + "loss": 0.4156, + "step": 2641 + }, + { + "epoch": 0.21403110823071939, + "grad_norm": 0.048604417592287064, + "learning_rate": 0.00010700688537869583, + "loss": 0.3986, + "step": 2642 + }, + { + "epoch": 0.21411211924821777, + "grad_norm": 0.0473223477602005, + "learning_rate": 0.00010704738760631836, + "loss": 0.3804, + "step": 2643 + }, + { + "epoch": 0.21419313026571613, + "grad_norm": 0.049184344708919525, + "learning_rate": 0.00010708788983394087, + "loss": 0.3635, + "step": 2644 + }, + { + "epoch": 0.2142741412832145, + "grad_norm": 0.03883194550871849, + "learning_rate": 0.0001071283920615634, + "loss": 0.3426, + "step": 2645 + }, + { + "epoch": 0.2143551523007129, + "grad_norm": 0.04144468158483505, + "learning_rate": 0.0001071688942891859, + "loss": 0.3702, + "step": 2646 + }, + { + "epoch": 0.21443616331821128, + "grad_norm": 0.04413948208093643, + "learning_rate": 0.00010720939651680844, + "loss": 0.3699, + "step": 2647 + }, + { + "epoch": 0.21451717433570966, + "grad_norm": 0.05277412384748459, + "learning_rate": 0.00010724989874443094, + "loss": 0.4364, + "step": 2648 + }, + { + "epoch": 0.21459818535320804, + "grad_norm": 0.05613725632429123, + "learning_rate": 0.00010729040097205348, + "loss": 0.4129, + "step": 2649 + }, + { + "epoch": 0.21467919637070643, + "grad_norm": 0.04522144794464111, + "learning_rate": 0.00010733090319967598, + "loss": 0.4004, + "step": 2650 + }, + { + "epoch": 0.21476020738820478, + "grad_norm": 0.053900957107543945, + "learning_rate": 0.00010737140542729852, + "loss": 0.4037, + "step": 2651 + }, + { + "epoch": 0.21484121840570317, + "grad_norm": 0.04681387543678284, + "learning_rate": 0.00010741190765492102, + "loss": 0.3747, + "step": 2652 + }, + { + "epoch": 0.21492222942320155, + "grad_norm": 0.05777775123715401, + "learning_rate": 0.00010745240988254354, + "loss": 0.3672, + "step": 2653 + }, + { + "epoch": 0.21500324044069993, + "grad_norm": 0.058414071798324585, + "learning_rate": 0.00010749291211016605, + "loss": 0.4225, + "step": 2654 + }, + { + "epoch": 0.21508425145819832, + "grad_norm": 0.03851136565208435, + "learning_rate": 0.00010753341433778858, + "loss": 0.3504, + "step": 2655 + }, + { + "epoch": 0.2151652624756967, + "grad_norm": 0.03778860345482826, + "learning_rate": 0.00010757391656541109, + "loss": 0.3751, + "step": 2656 + }, + { + "epoch": 0.21524627349319508, + "grad_norm": 0.04218238964676857, + "learning_rate": 0.00010761441879303362, + "loss": 0.3385, + "step": 2657 + }, + { + "epoch": 0.21532728451069347, + "grad_norm": 0.04938570782542229, + "learning_rate": 0.00010765492102065615, + "loss": 0.3746, + "step": 2658 + }, + { + "epoch": 0.21540829552819182, + "grad_norm": 0.05085742846131325, + "learning_rate": 0.00010769542324827866, + "loss": 0.3804, + "step": 2659 + }, + { + "epoch": 0.2154893065456902, + "grad_norm": 0.05732310190796852, + "learning_rate": 0.0001077359254759012, + "loss": 0.4115, + "step": 2660 + }, + { + "epoch": 0.2155703175631886, + "grad_norm": 0.055434245616197586, + "learning_rate": 0.0001077764277035237, + "loss": 0.3893, + "step": 2661 + }, + { + "epoch": 0.21565132858068697, + "grad_norm": 0.042372528463602066, + "learning_rate": 0.00010781692993114623, + "loss": 0.3624, 
+ "step": 2662 + }, + { + "epoch": 0.21573233959818536, + "grad_norm": 0.04685783386230469, + "learning_rate": 0.00010785743215876874, + "loss": 0.4375, + "step": 2663 + }, + { + "epoch": 0.21581335061568374, + "grad_norm": 0.04215598478913307, + "learning_rate": 0.00010789793438639126, + "loss": 0.3682, + "step": 2664 + }, + { + "epoch": 0.21589436163318212, + "grad_norm": 0.049776118248701096, + "learning_rate": 0.00010793843661401376, + "loss": 0.3488, + "step": 2665 + }, + { + "epoch": 0.21597537265068048, + "grad_norm": 0.04941713809967041, + "learning_rate": 0.0001079789388416363, + "loss": 0.3578, + "step": 2666 + }, + { + "epoch": 0.21605638366817886, + "grad_norm": 0.04695756733417511, + "learning_rate": 0.0001080194410692588, + "loss": 0.3845, + "step": 2667 + }, + { + "epoch": 0.21613739468567725, + "grad_norm": 0.0699956864118576, + "learning_rate": 0.00010805994329688134, + "loss": 0.4633, + "step": 2668 + }, + { + "epoch": 0.21621840570317563, + "grad_norm": 0.05027562007308006, + "learning_rate": 0.00010810044552450384, + "loss": 0.4073, + "step": 2669 + }, + { + "epoch": 0.21629941672067401, + "grad_norm": 0.043147969990968704, + "learning_rate": 0.00010814094775212638, + "loss": 0.3883, + "step": 2670 + }, + { + "epoch": 0.2163804277381724, + "grad_norm": 0.06733971834182739, + "learning_rate": 0.00010818144997974888, + "loss": 0.3542, + "step": 2671 + }, + { + "epoch": 0.21646143875567078, + "grad_norm": 0.04482163116335869, + "learning_rate": 0.00010822195220737142, + "loss": 0.3922, + "step": 2672 + }, + { + "epoch": 0.21654244977316914, + "grad_norm": 0.04736680909991264, + "learning_rate": 0.00010826245443499392, + "loss": 0.3624, + "step": 2673 + }, + { + "epoch": 0.21662346079066752, + "grad_norm": 0.04092049226164818, + "learning_rate": 0.00010830295666261646, + "loss": 0.3695, + "step": 2674 + }, + { + "epoch": 0.2167044718081659, + "grad_norm": 0.048764027655124664, + "learning_rate": 0.00010834345889023896, + "loss": 0.3841, + "step": 2675 + }, + { + "epoch": 0.2167854828256643, + "grad_norm": 0.04987451061606407, + "learning_rate": 0.00010838396111786148, + "loss": 0.4004, + "step": 2676 + }, + { + "epoch": 0.21686649384316267, + "grad_norm": 0.044140033423900604, + "learning_rate": 0.00010842446334548401, + "loss": 0.3409, + "step": 2677 + }, + { + "epoch": 0.21694750486066106, + "grad_norm": 0.05619576945900917, + "learning_rate": 0.00010846496557310652, + "loss": 0.4316, + "step": 2678 + }, + { + "epoch": 0.21702851587815944, + "grad_norm": 0.05569642409682274, + "learning_rate": 0.00010850546780072905, + "loss": 0.343, + "step": 2679 + }, + { + "epoch": 0.21710952689565782, + "grad_norm": 0.04952612519264221, + "learning_rate": 0.00010854597002835156, + "loss": 0.3723, + "step": 2680 + }, + { + "epoch": 0.21719053791315618, + "grad_norm": 0.050588030368089676, + "learning_rate": 0.0001085864722559741, + "loss": 0.401, + "step": 2681 + }, + { + "epoch": 0.21727154893065456, + "grad_norm": 0.036035146564245224, + "learning_rate": 0.0001086269744835966, + "loss": 0.3753, + "step": 2682 + }, + { + "epoch": 0.21735255994815295, + "grad_norm": 0.04765566438436508, + "learning_rate": 0.00010866747671121913, + "loss": 0.4035, + "step": 2683 + }, + { + "epoch": 0.21743357096565133, + "grad_norm": 0.04165409505367279, + "learning_rate": 0.00010870797893884164, + "loss": 0.459, + "step": 2684 + }, + { + "epoch": 0.2175145819831497, + "grad_norm": 0.05602968856692314, + "learning_rate": 0.00010874848116646417, + "loss": 0.3788, + "step": 2685 + }, + { + "epoch": 
0.2175955930006481, + "grad_norm": 0.04455839842557907, + "learning_rate": 0.00010878898339408668, + "loss": 0.3706, + "step": 2686 + }, + { + "epoch": 0.21767660401814648, + "grad_norm": 0.048621680587530136, + "learning_rate": 0.0001088294856217092, + "loss": 0.4115, + "step": 2687 + }, + { + "epoch": 0.21775761503564484, + "grad_norm": 0.04829535260796547, + "learning_rate": 0.00010886998784933172, + "loss": 0.4108, + "step": 2688 + }, + { + "epoch": 0.21783862605314322, + "grad_norm": 0.04750366136431694, + "learning_rate": 0.00010891049007695424, + "loss": 0.3978, + "step": 2689 + }, + { + "epoch": 0.2179196370706416, + "grad_norm": 0.050557080656290054, + "learning_rate": 0.00010895099230457674, + "loss": 0.3803, + "step": 2690 + }, + { + "epoch": 0.21800064808814, + "grad_norm": 0.04632573202252388, + "learning_rate": 0.00010899149453219928, + "loss": 0.378, + "step": 2691 + }, + { + "epoch": 0.21808165910563837, + "grad_norm": 0.054422810673713684, + "learning_rate": 0.00010903199675982178, + "loss": 0.3912, + "step": 2692 + }, + { + "epoch": 0.21816267012313675, + "grad_norm": 0.044331423938274384, + "learning_rate": 0.00010907249898744432, + "loss": 0.3682, + "step": 2693 + }, + { + "epoch": 0.21824368114063514, + "grad_norm": 0.04680316522717476, + "learning_rate": 0.00010911300121506682, + "loss": 0.4347, + "step": 2694 + }, + { + "epoch": 0.2183246921581335, + "grad_norm": 0.0582558698952198, + "learning_rate": 0.00010915350344268936, + "loss": 0.3916, + "step": 2695 + }, + { + "epoch": 0.21840570317563188, + "grad_norm": 0.043212030082941055, + "learning_rate": 0.00010919400567031189, + "loss": 0.3624, + "step": 2696 + }, + { + "epoch": 0.21848671419313026, + "grad_norm": 0.04562610760331154, + "learning_rate": 0.0001092345078979344, + "loss": 0.3665, + "step": 2697 + }, + { + "epoch": 0.21856772521062864, + "grad_norm": 0.03662271425127983, + "learning_rate": 0.00010927501012555691, + "loss": 0.354, + "step": 2698 + }, + { + "epoch": 0.21864873622812703, + "grad_norm": 0.043365396559238434, + "learning_rate": 0.00010931551235317943, + "loss": 0.4155, + "step": 2699 + }, + { + "epoch": 0.2187297472456254, + "grad_norm": 0.04387480020523071, + "learning_rate": 0.00010935601458080195, + "loss": 0.3916, + "step": 2700 + }, + { + "epoch": 0.2188107582631238, + "grad_norm": 0.051516756415367126, + "learning_rate": 0.00010939651680842446, + "loss": 0.3896, + "step": 2701 + }, + { + "epoch": 0.21889176928062218, + "grad_norm": 0.04242309555411339, + "learning_rate": 0.00010943701903604699, + "loss": 0.3803, + "step": 2702 + }, + { + "epoch": 0.21897278029812053, + "grad_norm": 0.04661116003990173, + "learning_rate": 0.0001094775212636695, + "loss": 0.3575, + "step": 2703 + }, + { + "epoch": 0.21905379131561892, + "grad_norm": 0.04585146903991699, + "learning_rate": 0.00010951802349129203, + "loss": 0.398, + "step": 2704 + }, + { + "epoch": 0.2191348023331173, + "grad_norm": 0.04462905600667, + "learning_rate": 0.00010955852571891454, + "loss": 0.3762, + "step": 2705 + }, + { + "epoch": 0.21921581335061568, + "grad_norm": 0.05082102492451668, + "learning_rate": 0.00010959902794653707, + "loss": 0.3943, + "step": 2706 + }, + { + "epoch": 0.21929682436811407, + "grad_norm": 0.05209215730428696, + "learning_rate": 0.00010963953017415958, + "loss": 0.4001, + "step": 2707 + }, + { + "epoch": 0.21937783538561245, + "grad_norm": 0.040937576442956924, + "learning_rate": 0.00010968003240178211, + "loss": 0.3479, + "step": 2708 + }, + { + "epoch": 0.21945884640311084, + "grad_norm": 
0.054162561893463135, + "learning_rate": 0.00010972053462940462, + "loss": 0.3737, + "step": 2709 + }, + { + "epoch": 0.2195398574206092, + "grad_norm": 0.04428819939494133, + "learning_rate": 0.00010976103685702715, + "loss": 0.3754, + "step": 2710 + }, + { + "epoch": 0.21962086843810757, + "grad_norm": 0.04762909933924675, + "learning_rate": 0.00010980153908464966, + "loss": 0.359, + "step": 2711 + }, + { + "epoch": 0.21970187945560596, + "grad_norm": 0.0502607524394989, + "learning_rate": 0.00010984204131227218, + "loss": 0.3484, + "step": 2712 + }, + { + "epoch": 0.21978289047310434, + "grad_norm": 0.04703029245138168, + "learning_rate": 0.0001098825435398947, + "loss": 0.3873, + "step": 2713 + }, + { + "epoch": 0.21986390149060273, + "grad_norm": 0.036031436175107956, + "learning_rate": 0.00010992304576751722, + "loss": 0.3486, + "step": 2714 + }, + { + "epoch": 0.2199449125081011, + "grad_norm": 0.050215840339660645, + "learning_rate": 0.00010996354799513975, + "loss": 0.3577, + "step": 2715 + }, + { + "epoch": 0.2200259235255995, + "grad_norm": 0.04181578382849693, + "learning_rate": 0.00011000405022276225, + "loss": 0.3569, + "step": 2716 + }, + { + "epoch": 0.22010693454309785, + "grad_norm": 0.048212967813014984, + "learning_rate": 0.00011004455245038479, + "loss": 0.412, + "step": 2717 + }, + { + "epoch": 0.22018794556059623, + "grad_norm": 0.040041908621788025, + "learning_rate": 0.0001100850546780073, + "loss": 0.3738, + "step": 2718 + }, + { + "epoch": 0.22026895657809462, + "grad_norm": 0.03832246735692024, + "learning_rate": 0.00011012555690562983, + "loss": 0.3477, + "step": 2719 + }, + { + "epoch": 0.220349967595593, + "grad_norm": 0.04586288332939148, + "learning_rate": 0.00011016605913325233, + "loss": 0.3495, + "step": 2720 + }, + { + "epoch": 0.22043097861309138, + "grad_norm": 0.04519755020737648, + "learning_rate": 0.00011020656136087487, + "loss": 0.3569, + "step": 2721 + }, + { + "epoch": 0.22051198963058977, + "grad_norm": 0.04331675544381142, + "learning_rate": 0.00011024706358849737, + "loss": 0.3464, + "step": 2722 + }, + { + "epoch": 0.22059300064808815, + "grad_norm": 0.049206074327230453, + "learning_rate": 0.00011028756581611989, + "loss": 0.3763, + "step": 2723 + }, + { + "epoch": 0.22067401166558653, + "grad_norm": 0.05203338712453842, + "learning_rate": 0.00011032806804374241, + "loss": 0.3811, + "step": 2724 + }, + { + "epoch": 0.2207550226830849, + "grad_norm": 0.05231618136167526, + "learning_rate": 0.00011036857027136493, + "loss": 0.3838, + "step": 2725 + }, + { + "epoch": 0.22083603370058327, + "grad_norm": 0.04322638362646103, + "learning_rate": 0.00011040907249898744, + "loss": 0.3613, + "step": 2726 + }, + { + "epoch": 0.22091704471808166, + "grad_norm": 0.048019472509622574, + "learning_rate": 0.00011044957472660997, + "loss": 0.4003, + "step": 2727 + }, + { + "epoch": 0.22099805573558004, + "grad_norm": 0.05193858593702316, + "learning_rate": 0.00011049007695423248, + "loss": 0.4298, + "step": 2728 + }, + { + "epoch": 0.22107906675307842, + "grad_norm": 0.04562699794769287, + "learning_rate": 0.00011053057918185501, + "loss": 0.4184, + "step": 2729 + }, + { + "epoch": 0.2211600777705768, + "grad_norm": 0.043055713176727295, + "learning_rate": 0.00011057108140947752, + "loss": 0.4453, + "step": 2730 + }, + { + "epoch": 0.2212410887880752, + "grad_norm": 0.042347393929958344, + "learning_rate": 0.00011061158363710005, + "loss": 0.3841, + "step": 2731 + }, + { + "epoch": 0.22132209980557355, + "grad_norm": 0.048777904361486435, + 
"learning_rate": 0.00011065208586472256, + "loss": 0.3981, + "step": 2732 + }, + { + "epoch": 0.22140311082307193, + "grad_norm": 0.048668425530195236, + "learning_rate": 0.00011069258809234509, + "loss": 0.4039, + "step": 2733 + }, + { + "epoch": 0.2214841218405703, + "grad_norm": 0.052155207842588425, + "learning_rate": 0.00011073309031996761, + "loss": 0.3668, + "step": 2734 + }, + { + "epoch": 0.2215651328580687, + "grad_norm": 0.04615851119160652, + "learning_rate": 0.00011077359254759013, + "loss": 0.3642, + "step": 2735 + }, + { + "epoch": 0.22164614387556708, + "grad_norm": 0.04468770697712898, + "learning_rate": 0.00011081409477521265, + "loss": 0.4079, + "step": 2736 + }, + { + "epoch": 0.22172715489306546, + "grad_norm": 0.05454346165060997, + "learning_rate": 0.00011085459700283515, + "loss": 0.3461, + "step": 2737 + }, + { + "epoch": 0.22180816591056385, + "grad_norm": 0.049995217472314835, + "learning_rate": 0.00011089509923045769, + "loss": 0.3601, + "step": 2738 + }, + { + "epoch": 0.2218891769280622, + "grad_norm": 0.06284768134355545, + "learning_rate": 0.0001109356014580802, + "loss": 0.4296, + "step": 2739 + }, + { + "epoch": 0.2219701879455606, + "grad_norm": 0.04179792478680611, + "learning_rate": 0.00011097610368570273, + "loss": 0.3439, + "step": 2740 + }, + { + "epoch": 0.22205119896305897, + "grad_norm": 0.056236620992422104, + "learning_rate": 0.00011101660591332523, + "loss": 0.3928, + "step": 2741 + }, + { + "epoch": 0.22213220998055735, + "grad_norm": 0.05494439974427223, + "learning_rate": 0.00011105710814094777, + "loss": 0.3415, + "step": 2742 + }, + { + "epoch": 0.22221322099805574, + "grad_norm": 0.04557449743151665, + "learning_rate": 0.00011109761036857027, + "loss": 0.3638, + "step": 2743 + }, + { + "epoch": 0.22229423201555412, + "grad_norm": 0.039909373968839645, + "learning_rate": 0.0001111381125961928, + "loss": 0.3681, + "step": 2744 + }, + { + "epoch": 0.2223752430330525, + "grad_norm": 0.04111361876130104, + "learning_rate": 0.00011117861482381531, + "loss": 0.3306, + "step": 2745 + }, + { + "epoch": 0.22245625405055086, + "grad_norm": 0.05619952082633972, + "learning_rate": 0.00011121911705143783, + "loss": 0.4013, + "step": 2746 + }, + { + "epoch": 0.22253726506804924, + "grad_norm": 0.0419950969517231, + "learning_rate": 0.00011125961927906035, + "loss": 0.3853, + "step": 2747 + }, + { + "epoch": 0.22261827608554763, + "grad_norm": 0.046850599348545074, + "learning_rate": 0.00011130012150668287, + "loss": 0.3279, + "step": 2748 + }, + { + "epoch": 0.222699287103046, + "grad_norm": 0.042228203266859055, + "learning_rate": 0.00011134062373430538, + "loss": 0.3912, + "step": 2749 + }, + { + "epoch": 0.2227802981205444, + "grad_norm": 0.0419267974793911, + "learning_rate": 0.00011138112596192791, + "loss": 0.3359, + "step": 2750 + }, + { + "epoch": 0.22286130913804278, + "grad_norm": 0.05071806535124779, + "learning_rate": 0.00011142162818955042, + "loss": 0.4229, + "step": 2751 + }, + { + "epoch": 0.22294232015554116, + "grad_norm": 0.04096124693751335, + "learning_rate": 0.00011146213041717295, + "loss": 0.3433, + "step": 2752 + }, + { + "epoch": 0.22302333117303955, + "grad_norm": 0.043501559644937515, + "learning_rate": 0.00011150263264479548, + "loss": 0.3647, + "step": 2753 + }, + { + "epoch": 0.2231043421905379, + "grad_norm": 0.044449783861637115, + "learning_rate": 0.00011154313487241799, + "loss": 0.4118, + "step": 2754 + }, + { + "epoch": 0.22318535320803629, + "grad_norm": 0.05392434075474739, + "learning_rate": 
0.00011158363710004052, + "loss": 0.4326, + "step": 2755 + }, + { + "epoch": 0.22326636422553467, + "grad_norm": 0.051186319440603256, + "learning_rate": 0.00011162413932766303, + "loss": 0.4005, + "step": 2756 + }, + { + "epoch": 0.22334737524303305, + "grad_norm": 0.04222441464662552, + "learning_rate": 0.00011166464155528555, + "loss": 0.3626, + "step": 2757 + }, + { + "epoch": 0.22342838626053144, + "grad_norm": 0.045014429837465286, + "learning_rate": 0.00011170514378290807, + "loss": 0.3478, + "step": 2758 + }, + { + "epoch": 0.22350939727802982, + "grad_norm": 0.05044560879468918, + "learning_rate": 0.00011174564601053059, + "loss": 0.3998, + "step": 2759 + }, + { + "epoch": 0.2235904082955282, + "grad_norm": 0.04669976606965065, + "learning_rate": 0.00011178614823815309, + "loss": 0.3786, + "step": 2760 + }, + { + "epoch": 0.22367141931302656, + "grad_norm": 0.06014898791909218, + "learning_rate": 0.00011182665046577563, + "loss": 0.385, + "step": 2761 + }, + { + "epoch": 0.22375243033052494, + "grad_norm": 0.04031899571418762, + "learning_rate": 0.00011186715269339813, + "loss": 0.4309, + "step": 2762 + }, + { + "epoch": 0.22383344134802333, + "grad_norm": 0.052987974137067795, + "learning_rate": 0.00011190765492102067, + "loss": 0.3789, + "step": 2763 + }, + { + "epoch": 0.2239144523655217, + "grad_norm": 0.043287429958581924, + "learning_rate": 0.00011194815714864317, + "loss": 0.3664, + "step": 2764 + }, + { + "epoch": 0.2239954633830201, + "grad_norm": 0.044800058007240295, + "learning_rate": 0.0001119886593762657, + "loss": 0.368, + "step": 2765 + }, + { + "epoch": 0.22407647440051848, + "grad_norm": 0.04879983142018318, + "learning_rate": 0.00011202916160388821, + "loss": 0.3831, + "step": 2766 + }, + { + "epoch": 0.22415748541801686, + "grad_norm": 0.04009270668029785, + "learning_rate": 0.00011206966383151074, + "loss": 0.3842, + "step": 2767 + }, + { + "epoch": 0.22423849643551522, + "grad_norm": 0.047250065952539444, + "learning_rate": 0.00011211016605913325, + "loss": 0.4083, + "step": 2768 + }, + { + "epoch": 0.2243195074530136, + "grad_norm": 0.03695997968316078, + "learning_rate": 0.00011215066828675578, + "loss": 0.3762, + "step": 2769 + }, + { + "epoch": 0.22440051847051198, + "grad_norm": 0.04160401225090027, + "learning_rate": 0.00011219117051437829, + "loss": 0.3743, + "step": 2770 + }, + { + "epoch": 0.22448152948801037, + "grad_norm": 0.041437141597270966, + "learning_rate": 0.00011223167274200081, + "loss": 0.3421, + "step": 2771 + }, + { + "epoch": 0.22456254050550875, + "grad_norm": 0.0401858426630497, + "learning_rate": 0.00011227217496962334, + "loss": 0.3826, + "step": 2772 + }, + { + "epoch": 0.22464355152300713, + "grad_norm": 0.06117672845721245, + "learning_rate": 0.00011231267719724585, + "loss": 0.4798, + "step": 2773 + }, + { + "epoch": 0.22472456254050552, + "grad_norm": 0.045669130980968475, + "learning_rate": 0.00011235317942486838, + "loss": 0.403, + "step": 2774 + }, + { + "epoch": 0.2248055735580039, + "grad_norm": 0.05554332211613655, + "learning_rate": 0.00011239368165249089, + "loss": 0.3553, + "step": 2775 + }, + { + "epoch": 0.22488658457550226, + "grad_norm": 0.048306893557310104, + "learning_rate": 0.00011243418388011342, + "loss": 0.3933, + "step": 2776 + }, + { + "epoch": 0.22496759559300064, + "grad_norm": 0.046369053423404694, + "learning_rate": 0.00011247468610773593, + "loss": 0.383, + "step": 2777 + }, + { + "epoch": 0.22504860661049902, + "grad_norm": 0.0498780757188797, + "learning_rate": 0.00011251518833535846, + "loss": 
0.4161, + "step": 2778 + }, + { + "epoch": 0.2251296176279974, + "grad_norm": 0.04474148526787758, + "learning_rate": 0.00011255569056298097, + "loss": 0.3793, + "step": 2779 + }, + { + "epoch": 0.2252106286454958, + "grad_norm": 0.04136667028069496, + "learning_rate": 0.0001125961927906035, + "loss": 0.3628, + "step": 2780 + }, + { + "epoch": 0.22529163966299418, + "grad_norm": 0.040542151778936386, + "learning_rate": 0.000112636695018226, + "loss": 0.3953, + "step": 2781 + }, + { + "epoch": 0.22537265068049256, + "grad_norm": 0.041604239493608475, + "learning_rate": 0.00011267719724584853, + "loss": 0.3645, + "step": 2782 + }, + { + "epoch": 0.22545366169799091, + "grad_norm": 0.039977479726076126, + "learning_rate": 0.00011271769947347105, + "loss": 0.3775, + "step": 2783 + }, + { + "epoch": 0.2255346727154893, + "grad_norm": 0.04203964024782181, + "learning_rate": 0.00011275820170109356, + "loss": 0.3811, + "step": 2784 + }, + { + "epoch": 0.22561568373298768, + "grad_norm": 0.042460497468709946, + "learning_rate": 0.00011279870392871607, + "loss": 0.3768, + "step": 2785 + }, + { + "epoch": 0.22569669475048607, + "grad_norm": 0.04054595157504082, + "learning_rate": 0.0001128392061563386, + "loss": 0.4022, + "step": 2786 + }, + { + "epoch": 0.22577770576798445, + "grad_norm": 0.05155777558684349, + "learning_rate": 0.00011287970838396111, + "loss": 0.4044, + "step": 2787 + }, + { + "epoch": 0.22585871678548283, + "grad_norm": 0.037050679326057434, + "learning_rate": 0.00011292021061158364, + "loss": 0.3806, + "step": 2788 + }, + { + "epoch": 0.22593972780298122, + "grad_norm": 0.050555843859910965, + "learning_rate": 0.00011296071283920615, + "loss": 0.468, + "step": 2789 + }, + { + "epoch": 0.22602073882047957, + "grad_norm": 0.04230424389243126, + "learning_rate": 0.00011300121506682868, + "loss": 0.3509, + "step": 2790 + }, + { + "epoch": 0.22610174983797796, + "grad_norm": 0.05011545121669769, + "learning_rate": 0.00011304171729445122, + "loss": 0.3929, + "step": 2791 + }, + { + "epoch": 0.22618276085547634, + "grad_norm": 0.04369264468550682, + "learning_rate": 0.00011308221952207372, + "loss": 0.4351, + "step": 2792 + }, + { + "epoch": 0.22626377187297472, + "grad_norm": 0.0465107187628746, + "learning_rate": 0.00011312272174969624, + "loss": 0.3824, + "step": 2793 + }, + { + "epoch": 0.2263447828904731, + "grad_norm": 0.043736957013607025, + "learning_rate": 0.00011316322397731876, + "loss": 0.3363, + "step": 2794 + }, + { + "epoch": 0.2264257939079715, + "grad_norm": 0.056683193892240524, + "learning_rate": 0.00011320372620494128, + "loss": 0.3998, + "step": 2795 + }, + { + "epoch": 0.22650680492546987, + "grad_norm": 0.05034268647432327, + "learning_rate": 0.00011324422843256379, + "loss": 0.4284, + "step": 2796 + }, + { + "epoch": 0.22658781594296826, + "grad_norm": 0.03823321685194969, + "learning_rate": 0.00011328473066018632, + "loss": 0.3417, + "step": 2797 + }, + { + "epoch": 0.2266688269604666, + "grad_norm": 0.040836721658706665, + "learning_rate": 0.00011332523288780883, + "loss": 0.3479, + "step": 2798 + }, + { + "epoch": 0.226749837977965, + "grad_norm": 0.044317472726106644, + "learning_rate": 0.00011336573511543136, + "loss": 0.3861, + "step": 2799 + }, + { + "epoch": 0.22683084899546338, + "grad_norm": 0.05670579895377159, + "learning_rate": 0.00011340623734305387, + "loss": 0.4012, + "step": 2800 + }, + { + "epoch": 0.22691186001296176, + "grad_norm": 0.038431454449892044, + "learning_rate": 0.0001134467395706764, + "loss": 0.3231, + "step": 2801 + }, + { + 
"epoch": 0.22699287103046015, + "grad_norm": 0.04589037969708443, + "learning_rate": 0.0001134872417982989, + "loss": 0.3879, + "step": 2802 + }, + { + "epoch": 0.22707388204795853, + "grad_norm": 0.04616350680589676, + "learning_rate": 0.00011352774402592144, + "loss": 0.3994, + "step": 2803 + }, + { + "epoch": 0.22715489306545691, + "grad_norm": 0.037241414189338684, + "learning_rate": 0.00011356824625354394, + "loss": 0.3305, + "step": 2804 + }, + { + "epoch": 0.22723590408295527, + "grad_norm": 0.037454795092344284, + "learning_rate": 0.00011360874848116648, + "loss": 0.3414, + "step": 2805 + }, + { + "epoch": 0.22731691510045365, + "grad_norm": 0.04402243718504906, + "learning_rate": 0.00011364925070878898, + "loss": 0.3993, + "step": 2806 + }, + { + "epoch": 0.22739792611795204, + "grad_norm": 0.04333695396780968, + "learning_rate": 0.0001136897529364115, + "loss": 0.4096, + "step": 2807 + }, + { + "epoch": 0.22747893713545042, + "grad_norm": 0.04681192710995674, + "learning_rate": 0.00011373025516403401, + "loss": 0.4094, + "step": 2808 + }, + { + "epoch": 0.2275599481529488, + "grad_norm": 0.055429939180612564, + "learning_rate": 0.00011377075739165654, + "loss": 0.4254, + "step": 2809 + }, + { + "epoch": 0.2276409591704472, + "grad_norm": 0.06147387996315956, + "learning_rate": 0.00011381125961927908, + "loss": 0.4017, + "step": 2810 + }, + { + "epoch": 0.22772197018794557, + "grad_norm": 0.04066922888159752, + "learning_rate": 0.00011385176184690158, + "loss": 0.3356, + "step": 2811 + }, + { + "epoch": 0.22780298120544393, + "grad_norm": 0.05167434737086296, + "learning_rate": 0.00011389226407452412, + "loss": 0.3007, + "step": 2812 + }, + { + "epoch": 0.2278839922229423, + "grad_norm": 0.05443723127245903, + "learning_rate": 0.00011393276630214662, + "loss": 0.4414, + "step": 2813 + }, + { + "epoch": 0.2279650032404407, + "grad_norm": 0.044885192066431046, + "learning_rate": 0.00011397326852976915, + "loss": 0.3484, + "step": 2814 + }, + { + "epoch": 0.22804601425793908, + "grad_norm": 0.040323249995708466, + "learning_rate": 0.00011401377075739166, + "loss": 0.3723, + "step": 2815 + }, + { + "epoch": 0.22812702527543746, + "grad_norm": 0.038147564977407455, + "learning_rate": 0.0001140542729850142, + "loss": 0.406, + "step": 2816 + }, + { + "epoch": 0.22820803629293585, + "grad_norm": 0.05138280987739563, + "learning_rate": 0.0001140947752126367, + "loss": 0.3594, + "step": 2817 + }, + { + "epoch": 0.22828904731043423, + "grad_norm": 0.0651860162615776, + "learning_rate": 0.00011413527744025922, + "loss": 0.4064, + "step": 2818 + }, + { + "epoch": 0.2283700583279326, + "grad_norm": 0.042356159538030624, + "learning_rate": 0.00011417577966788173, + "loss": 0.3587, + "step": 2819 + }, + { + "epoch": 0.22845106934543097, + "grad_norm": 0.051314469426870346, + "learning_rate": 0.00011421628189550426, + "loss": 0.4061, + "step": 2820 + }, + { + "epoch": 0.22853208036292935, + "grad_norm": 0.05304783210158348, + "learning_rate": 0.00011425678412312677, + "loss": 0.4175, + "step": 2821 + }, + { + "epoch": 0.22861309138042774, + "grad_norm": 0.04307013750076294, + "learning_rate": 0.0001142972863507493, + "loss": 0.405, + "step": 2822 + }, + { + "epoch": 0.22869410239792612, + "grad_norm": 0.04428693279623985, + "learning_rate": 0.0001143377885783718, + "loss": 0.3473, + "step": 2823 + }, + { + "epoch": 0.2287751134154245, + "grad_norm": 0.04833827540278435, + "learning_rate": 0.00011437829080599434, + "loss": 0.3711, + "step": 2824 + }, + { + "epoch": 0.22885612443292289, + 
"grad_norm": 0.04875127598643303, + "learning_rate": 0.00011441879303361684, + "loss": 0.3986, + "step": 2825 + }, + { + "epoch": 0.22893713545042127, + "grad_norm": 0.04338555783033371, + "learning_rate": 0.00011445929526123938, + "loss": 0.3913, + "step": 2826 + }, + { + "epoch": 0.22901814646791963, + "grad_norm": 0.03709283098578453, + "learning_rate": 0.00011449979748886188, + "loss": 0.3625, + "step": 2827 + }, + { + "epoch": 0.229099157485418, + "grad_norm": 0.040035828948020935, + "learning_rate": 0.00011454029971648442, + "loss": 0.3766, + "step": 2828 + }, + { + "epoch": 0.2291801685029164, + "grad_norm": 0.045361414551734924, + "learning_rate": 0.00011458080194410694, + "loss": 0.3784, + "step": 2829 + }, + { + "epoch": 0.22926117952041478, + "grad_norm": 0.05100679025053978, + "learning_rate": 0.00011462130417172944, + "loss": 0.3821, + "step": 2830 + }, + { + "epoch": 0.22934219053791316, + "grad_norm": 0.04482673481106758, + "learning_rate": 0.00011466180639935198, + "loss": 0.3936, + "step": 2831 + }, + { + "epoch": 0.22942320155541154, + "grad_norm": 0.04993167892098427, + "learning_rate": 0.00011470230862697448, + "loss": 0.4575, + "step": 2832 + }, + { + "epoch": 0.22950421257290993, + "grad_norm": 0.04634513333439827, + "learning_rate": 0.00011474281085459702, + "loss": 0.3786, + "step": 2833 + }, + { + "epoch": 0.22958522359040828, + "grad_norm": 0.044133465737104416, + "learning_rate": 0.00011478331308221952, + "loss": 0.4102, + "step": 2834 + }, + { + "epoch": 0.22966623460790667, + "grad_norm": 0.056882306933403015, + "learning_rate": 0.00011482381530984205, + "loss": 0.3638, + "step": 2835 + }, + { + "epoch": 0.22974724562540505, + "grad_norm": 0.05236594006419182, + "learning_rate": 0.00011486431753746456, + "loss": 0.361, + "step": 2836 + }, + { + "epoch": 0.22982825664290343, + "grad_norm": 0.049694813787937164, + "learning_rate": 0.0001149048197650871, + "loss": 0.4428, + "step": 2837 + }, + { + "epoch": 0.22990926766040182, + "grad_norm": 0.04660965874791145, + "learning_rate": 0.0001149453219927096, + "loss": 0.4022, + "step": 2838 + }, + { + "epoch": 0.2299902786779002, + "grad_norm": 0.05099980905652046, + "learning_rate": 0.00011498582422033213, + "loss": 0.4409, + "step": 2839 + }, + { + "epoch": 0.23007128969539858, + "grad_norm": 0.051555100828409195, + "learning_rate": 0.00011502632644795464, + "loss": 0.3945, + "step": 2840 + }, + { + "epoch": 0.23015230071289697, + "grad_norm": 0.037023525685071945, + "learning_rate": 0.00011506682867557716, + "loss": 0.3314, + "step": 2841 + }, + { + "epoch": 0.23023331173039532, + "grad_norm": 0.0383535772562027, + "learning_rate": 0.00011510733090319968, + "loss": 0.3609, + "step": 2842 + }, + { + "epoch": 0.2303143227478937, + "grad_norm": 0.03986530005931854, + "learning_rate": 0.0001151478331308222, + "loss": 0.381, + "step": 2843 + }, + { + "epoch": 0.2303953337653921, + "grad_norm": 0.045942630618810654, + "learning_rate": 0.0001151883353584447, + "loss": 0.4098, + "step": 2844 + }, + { + "epoch": 0.23047634478289047, + "grad_norm": 0.04153173789381981, + "learning_rate": 0.00011522883758606724, + "loss": 0.3453, + "step": 2845 + }, + { + "epoch": 0.23055735580038886, + "grad_norm": 0.04518669471144676, + "learning_rate": 0.00011526933981368977, + "loss": 0.3895, + "step": 2846 + }, + { + "epoch": 0.23063836681788724, + "grad_norm": 0.0531323216855526, + "learning_rate": 0.00011530984204131228, + "loss": 0.3566, + "step": 2847 + }, + { + "epoch": 0.23071937783538563, + "grad_norm": 0.04103211313486099, + 
"learning_rate": 0.00011535034426893481, + "loss": 0.3482, + "step": 2848 + }, + { + "epoch": 0.23080038885288398, + "grad_norm": 0.037041306495666504, + "learning_rate": 0.00011539084649655732, + "loss": 0.3532, + "step": 2849 + }, + { + "epoch": 0.23088139987038236, + "grad_norm": 0.04363298416137695, + "learning_rate": 0.00011543134872417985, + "loss": 0.3321, + "step": 2850 + }, + { + "epoch": 0.23096241088788075, + "grad_norm": 0.04319039359688759, + "learning_rate": 0.00011547185095180236, + "loss": 0.3919, + "step": 2851 + }, + { + "epoch": 0.23104342190537913, + "grad_norm": 0.044030141085386276, + "learning_rate": 0.00011551235317942488, + "loss": 0.3925, + "step": 2852 + }, + { + "epoch": 0.23112443292287752, + "grad_norm": 0.04883784055709839, + "learning_rate": 0.0001155528554070474, + "loss": 0.3755, + "step": 2853 + }, + { + "epoch": 0.2312054439403759, + "grad_norm": 0.042822521179914474, + "learning_rate": 0.00011559335763466991, + "loss": 0.4036, + "step": 2854 + }, + { + "epoch": 0.23128645495787428, + "grad_norm": 0.039836104959249496, + "learning_rate": 0.00011563385986229242, + "loss": 0.4299, + "step": 2855 + }, + { + "epoch": 0.23136746597537264, + "grad_norm": 0.05859369412064552, + "learning_rate": 0.00011567436208991495, + "loss": 0.3835, + "step": 2856 + }, + { + "epoch": 0.23144847699287102, + "grad_norm": 0.039807382971048355, + "learning_rate": 0.00011571486431753746, + "loss": 0.3918, + "step": 2857 + }, + { + "epoch": 0.2315294880103694, + "grad_norm": 0.04248964414000511, + "learning_rate": 0.00011575536654516, + "loss": 0.3577, + "step": 2858 + }, + { + "epoch": 0.2316104990278678, + "grad_norm": 0.04504331946372986, + "learning_rate": 0.0001157958687727825, + "loss": 0.3758, + "step": 2859 + }, + { + "epoch": 0.23169151004536617, + "grad_norm": 0.04224937781691551, + "learning_rate": 0.00011583637100040503, + "loss": 0.3899, + "step": 2860 + }, + { + "epoch": 0.23177252106286456, + "grad_norm": 0.04513216391205788, + "learning_rate": 0.00011587687322802754, + "loss": 0.4145, + "step": 2861 + }, + { + "epoch": 0.23185353208036294, + "grad_norm": 0.05136607587337494, + "learning_rate": 0.00011591737545565007, + "loss": 0.4014, + "step": 2862 + }, + { + "epoch": 0.2319345430978613, + "grad_norm": 0.042943354696035385, + "learning_rate": 0.00011595787768327258, + "loss": 0.353, + "step": 2863 + }, + { + "epoch": 0.23201555411535968, + "grad_norm": 0.04254595562815666, + "learning_rate": 0.00011599837991089511, + "loss": 0.3895, + "step": 2864 + }, + { + "epoch": 0.23209656513285806, + "grad_norm": 0.039488524198532104, + "learning_rate": 0.00011603888213851763, + "loss": 0.3684, + "step": 2865 + }, + { + "epoch": 0.23217757615035645, + "grad_norm": 0.03824709355831146, + "learning_rate": 0.00011607938436614014, + "loss": 0.3316, + "step": 2866 + }, + { + "epoch": 0.23225858716785483, + "grad_norm": 0.04050895944237709, + "learning_rate": 0.00011611988659376267, + "loss": 0.3971, + "step": 2867 + }, + { + "epoch": 0.2323395981853532, + "grad_norm": 0.04249674081802368, + "learning_rate": 0.00011616038882138518, + "loss": 0.371, + "step": 2868 + }, + { + "epoch": 0.2324206092028516, + "grad_norm": 0.04510214179754257, + "learning_rate": 0.00011620089104900771, + "loss": 0.3866, + "step": 2869 + }, + { + "epoch": 0.23250162022034998, + "grad_norm": 0.048988454043865204, + "learning_rate": 0.00011624139327663022, + "loss": 0.3157, + "step": 2870 + }, + { + "epoch": 0.23258263123784834, + "grad_norm": 0.03871070221066475, + "learning_rate": 0.00011628189550425275, 
+ "loss": 0.3896, + "step": 2871 + }, + { + "epoch": 0.23266364225534672, + "grad_norm": 0.04526352509856224, + "learning_rate": 0.00011632239773187526, + "loss": 0.3846, + "step": 2872 + }, + { + "epoch": 0.2327446532728451, + "grad_norm": 0.049733031541109085, + "learning_rate": 0.00011636289995949779, + "loss": 0.3702, + "step": 2873 + }, + { + "epoch": 0.2328256642903435, + "grad_norm": 0.059630136936903, + "learning_rate": 0.0001164034021871203, + "loss": 0.3536, + "step": 2874 + }, + { + "epoch": 0.23290667530784187, + "grad_norm": 0.05297769978642464, + "learning_rate": 0.00011644390441474283, + "loss": 0.3572, + "step": 2875 + }, + { + "epoch": 0.23298768632534025, + "grad_norm": 0.0500696562230587, + "learning_rate": 0.00011648440664236533, + "loss": 0.3934, + "step": 2876 + }, + { + "epoch": 0.23306869734283864, + "grad_norm": 0.045815713703632355, + "learning_rate": 0.00011652490886998785, + "loss": 0.4009, + "step": 2877 + }, + { + "epoch": 0.233149708360337, + "grad_norm": 0.044900525361299515, + "learning_rate": 0.00011656541109761037, + "loss": 0.3715, + "step": 2878 + }, + { + "epoch": 0.23323071937783538, + "grad_norm": 0.05272326618432999, + "learning_rate": 0.00011660591332523289, + "loss": 0.3457, + "step": 2879 + }, + { + "epoch": 0.23331173039533376, + "grad_norm": 0.05682436376810074, + "learning_rate": 0.0001166464155528554, + "loss": 0.3678, + "step": 2880 + }, + { + "epoch": 0.23339274141283214, + "grad_norm": 0.05235380306839943, + "learning_rate": 0.00011668691778047793, + "loss": 0.3432, + "step": 2881 + }, + { + "epoch": 0.23347375243033053, + "grad_norm": 0.04751413315534592, + "learning_rate": 0.00011672742000810044, + "loss": 0.3745, + "step": 2882 + }, + { + "epoch": 0.2335547634478289, + "grad_norm": 0.06046335771679878, + "learning_rate": 0.00011676792223572297, + "loss": 0.4513, + "step": 2883 + }, + { + "epoch": 0.2336357744653273, + "grad_norm": 0.04304825887084007, + "learning_rate": 0.0001168084244633455, + "loss": 0.3731, + "step": 2884 + }, + { + "epoch": 0.23371678548282565, + "grad_norm": 0.04588375613093376, + "learning_rate": 0.00011684892669096801, + "loss": 0.4134, + "step": 2885 + }, + { + "epoch": 0.23379779650032403, + "grad_norm": 0.046558551490306854, + "learning_rate": 0.00011688942891859054, + "loss": 0.4054, + "step": 2886 + }, + { + "epoch": 0.23387880751782242, + "grad_norm": 0.05705951899290085, + "learning_rate": 0.00011692993114621305, + "loss": 0.369, + "step": 2887 + }, + { + "epoch": 0.2339598185353208, + "grad_norm": 0.04509139806032181, + "learning_rate": 0.00011697043337383557, + "loss": 0.3661, + "step": 2888 + }, + { + "epoch": 0.23404082955281919, + "grad_norm": 0.052528828382492065, + "learning_rate": 0.00011701093560145808, + "loss": 0.3595, + "step": 2889 + }, + { + "epoch": 0.23412184057031757, + "grad_norm": 0.04576665163040161, + "learning_rate": 0.00011705143782908061, + "loss": 0.3703, + "step": 2890 + }, + { + "epoch": 0.23420285158781595, + "grad_norm": 0.04580291360616684, + "learning_rate": 0.00011709194005670312, + "loss": 0.3458, + "step": 2891 + }, + { + "epoch": 0.23428386260531434, + "grad_norm": 0.06401868909597397, + "learning_rate": 0.00011713244228432565, + "loss": 0.3658, + "step": 2892 + }, + { + "epoch": 0.2343648736228127, + "grad_norm": 0.03928394988179207, + "learning_rate": 0.00011717294451194815, + "loss": 0.3226, + "step": 2893 + }, + { + "epoch": 0.23444588464031108, + "grad_norm": 0.0432765930891037, + "learning_rate": 0.00011721344673957069, + "loss": 0.3945, + "step": 2894 + }, + { + 
"epoch": 0.23452689565780946, + "grad_norm": 0.040293481200933456, + "learning_rate": 0.0001172539489671932, + "loss": 0.3825, + "step": 2895 + }, + { + "epoch": 0.23460790667530784, + "grad_norm": 0.0494161918759346, + "learning_rate": 0.00011729445119481573, + "loss": 0.3802, + "step": 2896 + }, + { + "epoch": 0.23468891769280623, + "grad_norm": 0.03994528949260712, + "learning_rate": 0.00011733495342243823, + "loss": 0.3772, + "step": 2897 + }, + { + "epoch": 0.2347699287103046, + "grad_norm": 0.05831955000758171, + "learning_rate": 0.00011737545565006077, + "loss": 0.3916, + "step": 2898 + }, + { + "epoch": 0.234850939727803, + "grad_norm": 0.04238951951265335, + "learning_rate": 0.00011741595787768327, + "loss": 0.359, + "step": 2899 + }, + { + "epoch": 0.23493195074530135, + "grad_norm": 0.05210850015282631, + "learning_rate": 0.00011745646010530579, + "loss": 0.381, + "step": 2900 + }, + { + "epoch": 0.23501296176279973, + "grad_norm": 0.0391576923429966, + "learning_rate": 0.00011749696233292831, + "loss": 0.3605, + "step": 2901 + }, + { + "epoch": 0.23509397278029812, + "grad_norm": 0.05010760948061943, + "learning_rate": 0.00011753746456055083, + "loss": 0.3643, + "step": 2902 + }, + { + "epoch": 0.2351749837977965, + "grad_norm": 0.05513544753193855, + "learning_rate": 0.00011757796678817336, + "loss": 0.4352, + "step": 2903 + }, + { + "epoch": 0.23525599481529488, + "grad_norm": 0.04464849829673767, + "learning_rate": 0.00011761846901579587, + "loss": 0.3811, + "step": 2904 + }, + { + "epoch": 0.23533700583279327, + "grad_norm": 0.04738510400056839, + "learning_rate": 0.0001176589712434184, + "loss": 0.4235, + "step": 2905 + }, + { + "epoch": 0.23541801685029165, + "grad_norm": 0.04617168754339218, + "learning_rate": 0.00011769947347104091, + "loss": 0.395, + "step": 2906 + }, + { + "epoch": 0.23549902786779, + "grad_norm": 0.05911537632346153, + "learning_rate": 0.00011773997569866344, + "loss": 0.3439, + "step": 2907 + }, + { + "epoch": 0.2355800388852884, + "grad_norm": 0.0571928471326828, + "learning_rate": 0.00011778047792628595, + "loss": 0.3754, + "step": 2908 + }, + { + "epoch": 0.23566104990278677, + "grad_norm": 0.04202379658818245, + "learning_rate": 0.00011782098015390848, + "loss": 0.3798, + "step": 2909 + }, + { + "epoch": 0.23574206092028516, + "grad_norm": 0.04209866374731064, + "learning_rate": 0.00011786148238153099, + "loss": 0.376, + "step": 2910 + }, + { + "epoch": 0.23582307193778354, + "grad_norm": 0.045575957745313644, + "learning_rate": 0.00011790198460915351, + "loss": 0.4286, + "step": 2911 + }, + { + "epoch": 0.23590408295528192, + "grad_norm": 0.03941948339343071, + "learning_rate": 0.00011794248683677603, + "loss": 0.3605, + "step": 2912 + }, + { + "epoch": 0.2359850939727803, + "grad_norm": 0.04388366639614105, + "learning_rate": 0.00011798298906439855, + "loss": 0.3692, + "step": 2913 + }, + { + "epoch": 0.2360661049902787, + "grad_norm": 0.0419548898935318, + "learning_rate": 0.00011802349129202105, + "loss": 0.3752, + "step": 2914 + }, + { + "epoch": 0.23614711600777705, + "grad_norm": 0.048258326947689056, + "learning_rate": 0.00011806399351964359, + "loss": 0.3246, + "step": 2915 + }, + { + "epoch": 0.23622812702527543, + "grad_norm": 0.043853510171175, + "learning_rate": 0.0001181044957472661, + "loss": 0.4175, + "step": 2916 + }, + { + "epoch": 0.23630913804277381, + "grad_norm": 0.034705597907304764, + "learning_rate": 0.00011814499797488863, + "loss": 0.4249, + "step": 2917 + }, + { + "epoch": 0.2363901490602722, + "grad_norm": 
0.04022218659520149, + "learning_rate": 0.00011818550020251113, + "loss": 0.3845, + "step": 2918 + }, + { + "epoch": 0.23647116007777058, + "grad_norm": 0.04778154939413071, + "learning_rate": 0.00011822600243013367, + "loss": 0.3716, + "step": 2919 + }, + { + "epoch": 0.23655217109526896, + "grad_norm": 0.04581722244620323, + "learning_rate": 0.00011826650465775617, + "loss": 0.3863, + "step": 2920 + }, + { + "epoch": 0.23663318211276735, + "grad_norm": 0.045739613473415375, + "learning_rate": 0.0001183070068853787, + "loss": 0.4155, + "step": 2921 + }, + { + "epoch": 0.2367141931302657, + "grad_norm": 0.0473037026822567, + "learning_rate": 0.00011834750911300122, + "loss": 0.3698, + "step": 2922 + }, + { + "epoch": 0.2367952041477641, + "grad_norm": 0.0326833575963974, + "learning_rate": 0.00011838801134062374, + "loss": 0.3233, + "step": 2923 + }, + { + "epoch": 0.23687621516526247, + "grad_norm": 0.0429866723716259, + "learning_rate": 0.00011842851356824626, + "loss": 0.4209, + "step": 2924 + }, + { + "epoch": 0.23695722618276086, + "grad_norm": 0.03465301916003227, + "learning_rate": 0.00011846901579586877, + "loss": 0.3538, + "step": 2925 + }, + { + "epoch": 0.23703823720025924, + "grad_norm": 0.03621894493699074, + "learning_rate": 0.0001185095180234913, + "loss": 0.3601, + "step": 2926 + }, + { + "epoch": 0.23711924821775762, + "grad_norm": 0.04385453090071678, + "learning_rate": 0.00011855002025111381, + "loss": 0.3191, + "step": 2927 + }, + { + "epoch": 0.237200259235256, + "grad_norm": 0.041872210800647736, + "learning_rate": 0.00011859052247873634, + "loss": 0.3737, + "step": 2928 + }, + { + "epoch": 0.23728127025275436, + "grad_norm": 0.043836288154125214, + "learning_rate": 0.00011863102470635885, + "loss": 0.3315, + "step": 2929 + }, + { + "epoch": 0.23736228127025275, + "grad_norm": 0.05111682042479515, + "learning_rate": 0.00011867152693398138, + "loss": 0.3472, + "step": 2930 + }, + { + "epoch": 0.23744329228775113, + "grad_norm": 0.045502860099077225, + "learning_rate": 0.00011871202916160389, + "loss": 0.3504, + "step": 2931 + }, + { + "epoch": 0.2375243033052495, + "grad_norm": 0.050450120121240616, + "learning_rate": 0.00011875253138922642, + "loss": 0.4085, + "step": 2932 + }, + { + "epoch": 0.2376053143227479, + "grad_norm": 0.0447453111410141, + "learning_rate": 0.00011879303361684893, + "loss": 0.3852, + "step": 2933 + }, + { + "epoch": 0.23768632534024628, + "grad_norm": 0.03878478333353996, + "learning_rate": 0.00011883353584447146, + "loss": 0.2904, + "step": 2934 + }, + { + "epoch": 0.23776733635774466, + "grad_norm": 0.04184164106845856, + "learning_rate": 0.00011887403807209397, + "loss": 0.3481, + "step": 2935 + }, + { + "epoch": 0.23784834737524305, + "grad_norm": 0.04044514149427414, + "learning_rate": 0.00011891454029971649, + "loss": 0.4507, + "step": 2936 + }, + { + "epoch": 0.2379293583927414, + "grad_norm": 0.044943299144506454, + "learning_rate": 0.000118955042527339, + "loss": 0.3847, + "step": 2937 + }, + { + "epoch": 0.23801036941023979, + "grad_norm": 0.04254603385925293, + "learning_rate": 0.00011899554475496153, + "loss": 0.3852, + "step": 2938 + }, + { + "epoch": 0.23809138042773817, + "grad_norm": 0.042455434799194336, + "learning_rate": 0.00011903604698258403, + "loss": 0.3802, + "step": 2939 + }, + { + "epoch": 0.23817239144523655, + "grad_norm": 0.04601847007870674, + "learning_rate": 0.00011907654921020657, + "loss": 0.3877, + "step": 2940 + }, + { + "epoch": 0.23825340246273494, + "grad_norm": 0.05261930823326111, + "learning_rate": 
0.0001191170514378291, + "loss": 0.3801, + "step": 2941 + }, + { + "epoch": 0.23833441348023332, + "grad_norm": 0.06650017946958542, + "learning_rate": 0.0001191575536654516, + "loss": 0.4182, + "step": 2942 + }, + { + "epoch": 0.2384154244977317, + "grad_norm": 0.04565853625535965, + "learning_rate": 0.00011919805589307414, + "loss": 0.3418, + "step": 2943 + }, + { + "epoch": 0.23849643551523006, + "grad_norm": 0.04410654678940773, + "learning_rate": 0.00011923855812069664, + "loss": 0.4302, + "step": 2944 + }, + { + "epoch": 0.23857744653272844, + "grad_norm": 0.04739765822887421, + "learning_rate": 0.00011927906034831918, + "loss": 0.4081, + "step": 2945 + }, + { + "epoch": 0.23865845755022683, + "grad_norm": 0.04107481986284256, + "learning_rate": 0.00011931956257594168, + "loss": 0.4017, + "step": 2946 + }, + { + "epoch": 0.2387394685677252, + "grad_norm": 0.04466596618294716, + "learning_rate": 0.0001193600648035642, + "loss": 0.3583, + "step": 2947 + }, + { + "epoch": 0.2388204795852236, + "grad_norm": 0.04515571519732475, + "learning_rate": 0.00011940056703118672, + "loss": 0.3896, + "step": 2948 + }, + { + "epoch": 0.23890149060272198, + "grad_norm": 0.045190826058387756, + "learning_rate": 0.00011944106925880924, + "loss": 0.3659, + "step": 2949 + }, + { + "epoch": 0.23898250162022036, + "grad_norm": 0.041392821818590164, + "learning_rate": 0.00011948157148643175, + "loss": 0.4046, + "step": 2950 + }, + { + "epoch": 0.23906351263771872, + "grad_norm": 0.04082554206252098, + "learning_rate": 0.00011952207371405428, + "loss": 0.4052, + "step": 2951 + }, + { + "epoch": 0.2391445236552171, + "grad_norm": 0.048217855393886566, + "learning_rate": 0.00011956257594167679, + "loss": 0.3632, + "step": 2952 + }, + { + "epoch": 0.23922553467271548, + "grad_norm": 0.04387287795543671, + "learning_rate": 0.00011960307816929932, + "loss": 0.3655, + "step": 2953 + }, + { + "epoch": 0.23930654569021387, + "grad_norm": 0.04062773659825325, + "learning_rate": 0.00011964358039692183, + "loss": 0.3817, + "step": 2954 + }, + { + "epoch": 0.23938755670771225, + "grad_norm": 0.04514176398515701, + "learning_rate": 0.00011968408262454436, + "loss": 0.3887, + "step": 2955 + }, + { + "epoch": 0.23946856772521063, + "grad_norm": 0.04475953057408333, + "learning_rate": 0.00011972458485216687, + "loss": 0.4148, + "step": 2956 + }, + { + "epoch": 0.23954957874270902, + "grad_norm": 0.042864445596933365, + "learning_rate": 0.0001197650870797894, + "loss": 0.3809, + "step": 2957 + }, + { + "epoch": 0.2396305897602074, + "grad_norm": 0.04250786826014519, + "learning_rate": 0.0001198055893074119, + "loss": 0.398, + "step": 2958 + }, + { + "epoch": 0.23971160077770576, + "grad_norm": 0.04754723981022835, + "learning_rate": 0.00011984609153503444, + "loss": 0.4056, + "step": 2959 + }, + { + "epoch": 0.23979261179520414, + "grad_norm": 0.04678526893258095, + "learning_rate": 0.00011988659376265696, + "loss": 0.3673, + "step": 2960 + }, + { + "epoch": 0.23987362281270252, + "grad_norm": 0.045734815299510956, + "learning_rate": 0.00011992709599027946, + "loss": 0.3905, + "step": 2961 + }, + { + "epoch": 0.2399546338302009, + "grad_norm": 0.038129664957523346, + "learning_rate": 0.000119967598217902, + "loss": 0.3594, + "step": 2962 + }, + { + "epoch": 0.2400356448476993, + "grad_norm": 0.04983527213335037, + "learning_rate": 0.0001200081004455245, + "loss": 0.3623, + "step": 2963 + }, + { + "epoch": 0.24011665586519768, + "grad_norm": 0.04684574529528618, + "learning_rate": 0.00012004860267314704, + "loss": 0.4494, + 
"step": 2964 + }, + { + "epoch": 0.24019766688269606, + "grad_norm": 0.04936652630567551, + "learning_rate": 0.00012008910490076954, + "loss": 0.4044, + "step": 2965 + }, + { + "epoch": 0.24027867790019442, + "grad_norm": 0.04301603138446808, + "learning_rate": 0.00012012960712839208, + "loss": 0.3792, + "step": 2966 + }, + { + "epoch": 0.2403596889176928, + "grad_norm": 0.044206999242305756, + "learning_rate": 0.00012017010935601458, + "loss": 0.4301, + "step": 2967 + }, + { + "epoch": 0.24044069993519118, + "grad_norm": 0.04447254166007042, + "learning_rate": 0.00012021061158363712, + "loss": 0.3812, + "step": 2968 + }, + { + "epoch": 0.24052171095268957, + "grad_norm": 0.038800474256277084, + "learning_rate": 0.00012025111381125962, + "loss": 0.4229, + "step": 2969 + }, + { + "epoch": 0.24060272197018795, + "grad_norm": 0.04933485761284828, + "learning_rate": 0.00012029161603888216, + "loss": 0.3904, + "step": 2970 + }, + { + "epoch": 0.24068373298768633, + "grad_norm": 0.04177100211381912, + "learning_rate": 0.00012033211826650466, + "loss": 0.3652, + "step": 2971 + }, + { + "epoch": 0.24076474400518472, + "grad_norm": 0.04116995632648468, + "learning_rate": 0.00012037262049412718, + "loss": 0.3841, + "step": 2972 + }, + { + "epoch": 0.24084575502268307, + "grad_norm": 0.04211720451712608, + "learning_rate": 0.00012041312272174969, + "loss": 0.3557, + "step": 2973 + }, + { + "epoch": 0.24092676604018146, + "grad_norm": 0.04911576583981514, + "learning_rate": 0.00012045362494937222, + "loss": 0.373, + "step": 2974 + }, + { + "epoch": 0.24100777705767984, + "grad_norm": 0.04702078551054001, + "learning_rate": 0.00012049412717699473, + "loss": 0.4155, + "step": 2975 + }, + { + "epoch": 0.24108878807517822, + "grad_norm": 0.03862113133072853, + "learning_rate": 0.00012053462940461726, + "loss": 0.3473, + "step": 2976 + }, + { + "epoch": 0.2411697990926766, + "grad_norm": 0.037792373448610306, + "learning_rate": 0.00012057513163223977, + "loss": 0.3565, + "step": 2977 + }, + { + "epoch": 0.241250810110175, + "grad_norm": 0.0489872545003891, + "learning_rate": 0.0001206156338598623, + "loss": 0.4076, + "step": 2978 + }, + { + "epoch": 0.24133182112767337, + "grad_norm": 0.05115858465433121, + "learning_rate": 0.00012065613608748483, + "loss": 0.3641, + "step": 2979 + }, + { + "epoch": 0.24141283214517173, + "grad_norm": 0.04305893927812576, + "learning_rate": 0.00012069663831510734, + "loss": 0.4047, + "step": 2980 + }, + { + "epoch": 0.2414938431626701, + "grad_norm": 0.05037764459848404, + "learning_rate": 0.00012073714054272987, + "loss": 0.3834, + "step": 2981 + }, + { + "epoch": 0.2415748541801685, + "grad_norm": 0.03909236937761307, + "learning_rate": 0.00012077764277035238, + "loss": 0.3717, + "step": 2982 + }, + { + "epoch": 0.24165586519766688, + "grad_norm": 0.041752856224775314, + "learning_rate": 0.0001208181449979749, + "loss": 0.3594, + "step": 2983 + }, + { + "epoch": 0.24173687621516526, + "grad_norm": 0.050167832523584366, + "learning_rate": 0.0001208586472255974, + "loss": 0.3732, + "step": 2984 + }, + { + "epoch": 0.24181788723266365, + "grad_norm": 0.03796180710196495, + "learning_rate": 0.00012089914945321994, + "loss": 0.3425, + "step": 2985 + }, + { + "epoch": 0.24189889825016203, + "grad_norm": 0.04057871922850609, + "learning_rate": 0.00012093965168084244, + "loss": 0.3774, + "step": 2986 + }, + { + "epoch": 0.24197990926766041, + "grad_norm": 0.04229074716567993, + "learning_rate": 0.00012098015390846498, + "loss": 0.3542, + "step": 2987 + }, + { + "epoch": 
0.24206092028515877, + "grad_norm": 0.03807763010263443, + "learning_rate": 0.00012102065613608748, + "loss": 0.3493, + "step": 2988 + }, + { + "epoch": 0.24214193130265715, + "grad_norm": 0.0359298512339592, + "learning_rate": 0.00012106115836371002, + "loss": 0.3518, + "step": 2989 + }, + { + "epoch": 0.24222294232015554, + "grad_norm": 0.04807816445827484, + "learning_rate": 0.00012110166059133252, + "loss": 0.3222, + "step": 2990 + }, + { + "epoch": 0.24230395333765392, + "grad_norm": 0.04524649307131767, + "learning_rate": 0.00012114216281895505, + "loss": 0.3751, + "step": 2991 + }, + { + "epoch": 0.2423849643551523, + "grad_norm": 0.0531078465282917, + "learning_rate": 0.00012118266504657756, + "loss": 0.3387, + "step": 2992 + }, + { + "epoch": 0.2424659753726507, + "grad_norm": 0.044817257672548294, + "learning_rate": 0.0001212231672742001, + "loss": 0.3881, + "step": 2993 + }, + { + "epoch": 0.24254698639014907, + "grad_norm": 0.046434782445430756, + "learning_rate": 0.0001212636695018226, + "loss": 0.4195, + "step": 2994 + }, + { + "epoch": 0.24262799740764743, + "grad_norm": 0.047300081700086594, + "learning_rate": 0.00012130417172944512, + "loss": 0.3725, + "step": 2995 + }, + { + "epoch": 0.2427090084251458, + "grad_norm": 0.03659592196345329, + "learning_rate": 0.00012134467395706764, + "loss": 0.3658, + "step": 2996 + }, + { + "epoch": 0.2427900194426442, + "grad_norm": 0.04757637158036232, + "learning_rate": 0.00012138517618469016, + "loss": 0.3528, + "step": 2997 + }, + { + "epoch": 0.24287103046014258, + "grad_norm": 0.035950496792793274, + "learning_rate": 0.00012142567841231269, + "loss": 0.3559, + "step": 2998 + }, + { + "epoch": 0.24295204147764096, + "grad_norm": 0.050411324948072433, + "learning_rate": 0.0001214661806399352, + "loss": 0.4086, + "step": 2999 + }, + { + "epoch": 0.24303305249513935, + "grad_norm": 0.05020357668399811, + "learning_rate": 0.00012150668286755773, + "loss": 0.3628, + "step": 3000 + }, + { + "epoch": 0.24311406351263773, + "grad_norm": 0.04406118765473366, + "learning_rate": 0.00012154718509518024, + "loss": 0.3785, + "step": 3001 + }, + { + "epoch": 0.24319507453013609, + "grad_norm": 0.04498102515935898, + "learning_rate": 0.00012158768732280277, + "loss": 0.355, + "step": 3002 + }, + { + "epoch": 0.24327608554763447, + "grad_norm": 0.043865758925676346, + "learning_rate": 0.00012162818955042528, + "loss": 0.3933, + "step": 3003 + }, + { + "epoch": 0.24335709656513285, + "grad_norm": 0.03579156845808029, + "learning_rate": 0.00012166869177804781, + "loss": 0.3243, + "step": 3004 + }, + { + "epoch": 0.24343810758263124, + "grad_norm": 0.046793438494205475, + "learning_rate": 0.00012170919400567032, + "loss": 0.4052, + "step": 3005 + }, + { + "epoch": 0.24351911860012962, + "grad_norm": 0.04385807737708092, + "learning_rate": 0.00012174969623329284, + "loss": 0.3982, + "step": 3006 + }, + { + "epoch": 0.243600129617628, + "grad_norm": 0.044661667197942734, + "learning_rate": 0.00012179019846091536, + "loss": 0.3769, + "step": 3007 + }, + { + "epoch": 0.2436811406351264, + "grad_norm": 0.04510461539030075, + "learning_rate": 0.00012183070068853788, + "loss": 0.3756, + "step": 3008 + }, + { + "epoch": 0.24376215165262477, + "grad_norm": 0.03899111598730087, + "learning_rate": 0.00012187120291616038, + "loss": 0.3859, + "step": 3009 + }, + { + "epoch": 0.24384316267012313, + "grad_norm": 0.04161735624074936, + "learning_rate": 0.00012191170514378292, + "loss": 0.3634, + "step": 3010 + }, + { + "epoch": 0.2439241736876215, + "grad_norm": 
0.046930018812417984, + "learning_rate": 0.00012195220737140542, + "loss": 0.3986, + "step": 3011 + }, + { + "epoch": 0.2440051847051199, + "grad_norm": 0.04540453106164932, + "learning_rate": 0.00012199270959902795, + "loss": 0.4051, + "step": 3012 + }, + { + "epoch": 0.24408619572261828, + "grad_norm": 0.04622745141386986, + "learning_rate": 0.00012203321182665046, + "loss": 0.3568, + "step": 3013 + }, + { + "epoch": 0.24416720674011666, + "grad_norm": 0.042839668691158295, + "learning_rate": 0.00012207371405427298, + "loss": 0.3686, + "step": 3014 + }, + { + "epoch": 0.24424821775761504, + "grad_norm": 0.05041300132870674, + "learning_rate": 0.00012211421628189549, + "loss": 0.4411, + "step": 3015 + }, + { + "epoch": 0.24432922877511343, + "grad_norm": 0.046316828578710556, + "learning_rate": 0.00012215471850951802, + "loss": 0.3958, + "step": 3016 + }, + { + "epoch": 0.24441023979261178, + "grad_norm": 0.044460829347372055, + "learning_rate": 0.00012219522073714055, + "loss": 0.3939, + "step": 3017 + }, + { + "epoch": 0.24449125081011017, + "grad_norm": 0.04973135143518448, + "learning_rate": 0.00012223572296476306, + "loss": 0.376, + "step": 3018 + }, + { + "epoch": 0.24457226182760855, + "grad_norm": 0.039541829377412796, + "learning_rate": 0.0001222762251923856, + "loss": 0.3998, + "step": 3019 + }, + { + "epoch": 0.24465327284510693, + "grad_norm": 0.03787733614444733, + "learning_rate": 0.0001223167274200081, + "loss": 0.3652, + "step": 3020 + }, + { + "epoch": 0.24473428386260532, + "grad_norm": 0.0433904230594635, + "learning_rate": 0.00012235722964763063, + "loss": 0.3448, + "step": 3021 + }, + { + "epoch": 0.2448152948801037, + "grad_norm": 0.036354780197143555, + "learning_rate": 0.00012239773187525314, + "loss": 0.3932, + "step": 3022 + }, + { + "epoch": 0.24489630589760208, + "grad_norm": 0.03718483820557594, + "learning_rate": 0.00012243823410287567, + "loss": 0.3671, + "step": 3023 + }, + { + "epoch": 0.24497731691510044, + "grad_norm": 0.04530775547027588, + "learning_rate": 0.00012247873633049818, + "loss": 0.3855, + "step": 3024 + }, + { + "epoch": 0.24505832793259882, + "grad_norm": 0.03824467584490776, + "learning_rate": 0.0001225192385581207, + "loss": 0.3913, + "step": 3025 + }, + { + "epoch": 0.2451393389500972, + "grad_norm": 0.040557969361543655, + "learning_rate": 0.00012255974078574322, + "loss": 0.391, + "step": 3026 + }, + { + "epoch": 0.2452203499675956, + "grad_norm": 0.04104351997375488, + "learning_rate": 0.00012260024301336575, + "loss": 0.3543, + "step": 3027 + }, + { + "epoch": 0.24530136098509397, + "grad_norm": 0.043880265206098557, + "learning_rate": 0.00012264074524098826, + "loss": 0.3496, + "step": 3028 + }, + { + "epoch": 0.24538237200259236, + "grad_norm": 0.04171985760331154, + "learning_rate": 0.0001226812474686108, + "loss": 0.3563, + "step": 3029 + }, + { + "epoch": 0.24546338302009074, + "grad_norm": 0.03931222856044769, + "learning_rate": 0.0001227217496962333, + "loss": 0.4155, + "step": 3030 + }, + { + "epoch": 0.24554439403758913, + "grad_norm": 0.039511002600193024, + "learning_rate": 0.00012276225192385583, + "loss": 0.3587, + "step": 3031 + }, + { + "epoch": 0.24562540505508748, + "grad_norm": 0.05257326364517212, + "learning_rate": 0.00012280275415147833, + "loss": 0.3488, + "step": 3032 + }, + { + "epoch": 0.24570641607258586, + "grad_norm": 0.04402840510010719, + "learning_rate": 0.00012284325637910087, + "loss": 0.3926, + "step": 3033 + }, + { + "epoch": 0.24578742709008425, + "grad_norm": 0.04026148095726967, + 
"learning_rate": 0.00012288375860672337, + "loss": 0.3613, + "step": 3034 + }, + { + "epoch": 0.24586843810758263, + "grad_norm": 0.05136657878756523, + "learning_rate": 0.0001229242608343459, + "loss": 0.3644, + "step": 3035 + }, + { + "epoch": 0.24594944912508102, + "grad_norm": 0.045099638402462006, + "learning_rate": 0.0001229647630619684, + "loss": 0.3997, + "step": 3036 + }, + { + "epoch": 0.2460304601425794, + "grad_norm": 0.04942428320646286, + "learning_rate": 0.00012300526528959092, + "loss": 0.3895, + "step": 3037 + }, + { + "epoch": 0.24611147116007778, + "grad_norm": 0.04704645648598671, + "learning_rate": 0.00012304576751721345, + "loss": 0.3687, + "step": 3038 + }, + { + "epoch": 0.24619248217757614, + "grad_norm": 0.044770658016204834, + "learning_rate": 0.00012308626974483596, + "loss": 0.413, + "step": 3039 + }, + { + "epoch": 0.24627349319507452, + "grad_norm": 0.053014349192380905, + "learning_rate": 0.0001231267719724585, + "loss": 0.4557, + "step": 3040 + }, + { + "epoch": 0.2463545042125729, + "grad_norm": 0.0513080395758152, + "learning_rate": 0.000123167274200081, + "loss": 0.4115, + "step": 3041 + }, + { + "epoch": 0.2464355152300713, + "grad_norm": 0.06259527057409286, + "learning_rate": 0.00012320777642770353, + "loss": 0.4064, + "step": 3042 + }, + { + "epoch": 0.24651652624756967, + "grad_norm": 0.0488407164812088, + "learning_rate": 0.00012324827865532604, + "loss": 0.3817, + "step": 3043 + }, + { + "epoch": 0.24659753726506806, + "grad_norm": 0.03926355019211769, + "learning_rate": 0.00012328878088294857, + "loss": 0.4017, + "step": 3044 + }, + { + "epoch": 0.24667854828256644, + "grad_norm": 0.03465135395526886, + "learning_rate": 0.00012332928311057108, + "loss": 0.3564, + "step": 3045 + }, + { + "epoch": 0.2467595593000648, + "grad_norm": 0.04638232663273811, + "learning_rate": 0.0001233697853381936, + "loss": 0.3726, + "step": 3046 + }, + { + "epoch": 0.24684057031756318, + "grad_norm": 0.06023021414875984, + "learning_rate": 0.00012341028756581612, + "loss": 0.3579, + "step": 3047 + }, + { + "epoch": 0.24692158133506156, + "grad_norm": 0.03787575662136078, + "learning_rate": 0.00012345078979343865, + "loss": 0.3638, + "step": 3048 + }, + { + "epoch": 0.24700259235255995, + "grad_norm": 0.04475943744182587, + "learning_rate": 0.00012349129202106116, + "loss": 0.3742, + "step": 3049 + }, + { + "epoch": 0.24708360337005833, + "grad_norm": 0.038167525082826614, + "learning_rate": 0.0001235317942486837, + "loss": 0.3695, + "step": 3050 + }, + { + "epoch": 0.2471646143875567, + "grad_norm": 0.04315280169248581, + "learning_rate": 0.0001235722964763062, + "loss": 0.4386, + "step": 3051 + }, + { + "epoch": 0.2472456254050551, + "grad_norm": 0.0408618189394474, + "learning_rate": 0.00012361279870392873, + "loss": 0.3583, + "step": 3052 + }, + { + "epoch": 0.24732663642255348, + "grad_norm": 0.0395033024251461, + "learning_rate": 0.00012365330093155123, + "loss": 0.3228, + "step": 3053 + }, + { + "epoch": 0.24740764744005184, + "grad_norm": 0.042332377284765244, + "learning_rate": 0.00012369380315917377, + "loss": 0.3667, + "step": 3054 + }, + { + "epoch": 0.24748865845755022, + "grad_norm": 0.04684600234031677, + "learning_rate": 0.0001237343053867963, + "loss": 0.4264, + "step": 3055 + }, + { + "epoch": 0.2475696694750486, + "grad_norm": 0.039548370987176895, + "learning_rate": 0.0001237748076144188, + "loss": 0.3568, + "step": 3056 + }, + { + "epoch": 0.247650680492547, + "grad_norm": 0.053158797323703766, + "learning_rate": 0.00012381530984204134, + "loss": 
0.4191, + "step": 3057 + }, + { + "epoch": 0.24773169151004537, + "grad_norm": 0.05239748954772949, + "learning_rate": 0.00012385581206966385, + "loss": 0.3636, + "step": 3058 + }, + { + "epoch": 0.24781270252754375, + "grad_norm": 0.03982961177825928, + "learning_rate": 0.00012389631429728635, + "loss": 0.3994, + "step": 3059 + }, + { + "epoch": 0.24789371354504214, + "grad_norm": 0.041404590010643005, + "learning_rate": 0.00012393681652490889, + "loss": 0.3498, + "step": 3060 + }, + { + "epoch": 0.2479747245625405, + "grad_norm": 0.045367904007434845, + "learning_rate": 0.0001239773187525314, + "loss": 0.3847, + "step": 3061 + }, + { + "epoch": 0.24805573558003888, + "grad_norm": 0.041434478014707565, + "learning_rate": 0.0001240178209801539, + "loss": 0.3941, + "step": 3062 + }, + { + "epoch": 0.24813674659753726, + "grad_norm": 0.040743254125118256, + "learning_rate": 0.00012405832320777643, + "loss": 0.4037, + "step": 3063 + }, + { + "epoch": 0.24821775761503564, + "grad_norm": 0.04002142325043678, + "learning_rate": 0.00012409882543539894, + "loss": 0.4078, + "step": 3064 + }, + { + "epoch": 0.24829876863253403, + "grad_norm": 0.057170569896698, + "learning_rate": 0.00012413932766302147, + "loss": 0.3649, + "step": 3065 + }, + { + "epoch": 0.2483797796500324, + "grad_norm": 0.054762836545705795, + "learning_rate": 0.00012417982989064398, + "loss": 0.3933, + "step": 3066 + }, + { + "epoch": 0.2484607906675308, + "grad_norm": 0.03908546268939972, + "learning_rate": 0.0001242203321182665, + "loss": 0.3379, + "step": 3067 + }, + { + "epoch": 0.24854180168502915, + "grad_norm": 0.043949905782938004, + "learning_rate": 0.00012426083434588902, + "loss": 0.425, + "step": 3068 + }, + { + "epoch": 0.24862281270252753, + "grad_norm": 0.0349920354783535, + "learning_rate": 0.00012430133657351155, + "loss": 0.3924, + "step": 3069 + }, + { + "epoch": 0.24870382372002592, + "grad_norm": 0.04423988610506058, + "learning_rate": 0.00012434183880113405, + "loss": 0.3753, + "step": 3070 + }, + { + "epoch": 0.2487848347375243, + "grad_norm": 0.0439189188182354, + "learning_rate": 0.0001243823410287566, + "loss": 0.3584, + "step": 3071 + }, + { + "epoch": 0.24886584575502269, + "grad_norm": 0.05119139328598976, + "learning_rate": 0.0001244228432563791, + "loss": 0.4317, + "step": 3072 + }, + { + "epoch": 0.24894685677252107, + "grad_norm": 0.04475146904587746, + "learning_rate": 0.00012446334548400163, + "loss": 0.3936, + "step": 3073 + }, + { + "epoch": 0.24902786779001945, + "grad_norm": 0.03774057701230049, + "learning_rate": 0.00012450384771162416, + "loss": 0.3627, + "step": 3074 + }, + { + "epoch": 0.24910887880751784, + "grad_norm": 0.04917656630277634, + "learning_rate": 0.00012454434993924667, + "loss": 0.4298, + "step": 3075 + }, + { + "epoch": 0.2491898898250162, + "grad_norm": 0.04121607169508934, + "learning_rate": 0.0001245848521668692, + "loss": 0.3785, + "step": 3076 + }, + { + "epoch": 0.24927090084251458, + "grad_norm": 0.03795452415943146, + "learning_rate": 0.0001246253543944917, + "loss": 0.3869, + "step": 3077 + }, + { + "epoch": 0.24935191186001296, + "grad_norm": 0.03608179837465286, + "learning_rate": 0.00012466585662211424, + "loss": 0.3885, + "step": 3078 + }, + { + "epoch": 0.24943292287751134, + "grad_norm": 0.04807370528578758, + "learning_rate": 0.00012470635884973675, + "loss": 0.403, + "step": 3079 + }, + { + "epoch": 0.24951393389500973, + "grad_norm": 0.05438527837395668, + "learning_rate": 0.00012474686107735928, + "loss": 0.3725, + "step": 3080 + }, + { + "epoch": 
0.2495949449125081, + "grad_norm": 0.039910171180963516, + "learning_rate": 0.00012478736330498178, + "loss": 0.3373, + "step": 3081 + }, + { + "epoch": 0.2496759559300065, + "grad_norm": 0.037223588675260544, + "learning_rate": 0.00012482786553260432, + "loss": 0.3578, + "step": 3082 + }, + { + "epoch": 0.24975696694750485, + "grad_norm": 0.04596159979701042, + "learning_rate": 0.00012486836776022682, + "loss": 0.3981, + "step": 3083 + }, + { + "epoch": 0.24983797796500323, + "grad_norm": 0.04447013884782791, + "learning_rate": 0.00012490886998784933, + "loss": 0.4163, + "step": 3084 + }, + { + "epoch": 0.24991898898250162, + "grad_norm": 0.04586843401193619, + "learning_rate": 0.00012494937221547184, + "loss": 0.3653, + "step": 3085 + }, + { + "epoch": 0.25, + "grad_norm": 0.04713024944067001, + "learning_rate": 0.00012498987444309437, + "loss": 0.435, + "step": 3086 + }, + { + "epoch": 0.25008101101749836, + "grad_norm": 0.04586999490857124, + "learning_rate": 0.00012503037667071688, + "loss": 0.4236, + "step": 3087 + }, + { + "epoch": 0.25016202203499677, + "grad_norm": 0.04212983697652817, + "learning_rate": 0.0001250708788983394, + "loss": 0.3092, + "step": 3088 + }, + { + "epoch": 0.2502430330524951, + "grad_norm": 0.05363105237483978, + "learning_rate": 0.00012511138112596191, + "loss": 0.4238, + "step": 3089 + }, + { + "epoch": 0.25032404406999353, + "grad_norm": 0.042278800159692764, + "learning_rate": 0.00012515188335358445, + "loss": 0.4017, + "step": 3090 + }, + { + "epoch": 0.2504050550874919, + "grad_norm": 0.03694623336195946, + "learning_rate": 0.00012519238558120698, + "loss": 0.3607, + "step": 3091 + }, + { + "epoch": 0.2504860661049903, + "grad_norm": 0.0348326712846756, + "learning_rate": 0.0001252328878088295, + "loss": 0.392, + "step": 3092 + }, + { + "epoch": 0.25056707712248866, + "grad_norm": 0.05313507094979286, + "learning_rate": 0.00012527339003645202, + "loss": 0.386, + "step": 3093 + }, + { + "epoch": 0.250648088139987, + "grad_norm": 0.043994393199682236, + "learning_rate": 0.00012531389226407453, + "loss": 0.424, + "step": 3094 + }, + { + "epoch": 0.2507290991574854, + "grad_norm": 0.045557476580142975, + "learning_rate": 0.00012535439449169706, + "loss": 0.4057, + "step": 3095 + }, + { + "epoch": 0.2508101101749838, + "grad_norm": 0.0556633435189724, + "learning_rate": 0.00012539489671931957, + "loss": 0.3665, + "step": 3096 + }, + { + "epoch": 0.2508911211924822, + "grad_norm": 0.03474360331892967, + "learning_rate": 0.0001254353989469421, + "loss": 0.3409, + "step": 3097 + }, + { + "epoch": 0.25097213220998055, + "grad_norm": 0.03593473881483078, + "learning_rate": 0.0001254759011745646, + "loss": 0.4017, + "step": 3098 + }, + { + "epoch": 0.25105314322747896, + "grad_norm": 0.04045382887125015, + "learning_rate": 0.00012551640340218714, + "loss": 0.3781, + "step": 3099 + }, + { + "epoch": 0.2511341542449773, + "grad_norm": 0.05407486483454704, + "learning_rate": 0.00012555690562980964, + "loss": 0.4013, + "step": 3100 + }, + { + "epoch": 0.25121516526247567, + "grad_norm": 0.048889655619859695, + "learning_rate": 0.00012559740785743218, + "loss": 0.3784, + "step": 3101 + }, + { + "epoch": 0.2512961762799741, + "grad_norm": 0.04563205689191818, + "learning_rate": 0.00012563791008505468, + "loss": 0.3982, + "step": 3102 + }, + { + "epoch": 0.25137718729747244, + "grad_norm": 0.04439222067594528, + "learning_rate": 0.00012567841231267722, + "loss": 0.3797, + "step": 3103 + }, + { + "epoch": 0.25145819831497085, + "grad_norm": 0.048667896538972855, + 
"learning_rate": 0.00012571891454029972, + "loss": 0.4266, + "step": 3104 + }, + { + "epoch": 0.2515392093324692, + "grad_norm": 0.057649172842502594, + "learning_rate": 0.00012575941676792226, + "loss": 0.3974, + "step": 3105 + }, + { + "epoch": 0.2516202203499676, + "grad_norm": 0.04143094643950462, + "learning_rate": 0.00012579991899554476, + "loss": 0.3832, + "step": 3106 + }, + { + "epoch": 0.25170123136746597, + "grad_norm": 0.044324059039354324, + "learning_rate": 0.00012584042122316727, + "loss": 0.4323, + "step": 3107 + }, + { + "epoch": 0.2517822423849643, + "grad_norm": 0.03827289119362831, + "learning_rate": 0.0001258809234507898, + "loss": 0.3605, + "step": 3108 + }, + { + "epoch": 0.25186325340246274, + "grad_norm": 0.039986081421375275, + "learning_rate": 0.0001259214256784123, + "loss": 0.3701, + "step": 3109 + }, + { + "epoch": 0.2519442644199611, + "grad_norm": 0.04025157168507576, + "learning_rate": 0.00012596192790603484, + "loss": 0.3573, + "step": 3110 + }, + { + "epoch": 0.2520252754374595, + "grad_norm": 0.042288873344659805, + "learning_rate": 0.00012600243013365735, + "loss": 0.3594, + "step": 3111 + }, + { + "epoch": 0.25210628645495786, + "grad_norm": 0.0396200492978096, + "learning_rate": 0.00012604293236127988, + "loss": 0.3739, + "step": 3112 + }, + { + "epoch": 0.2521872974724563, + "grad_norm": 0.03899937868118286, + "learning_rate": 0.0001260834345889024, + "loss": 0.3855, + "step": 3113 + }, + { + "epoch": 0.25226830848995463, + "grad_norm": 0.04981239512562752, + "learning_rate": 0.00012612393681652492, + "loss": 0.3627, + "step": 3114 + }, + { + "epoch": 0.25234931950745304, + "grad_norm": 0.04627585783600807, + "learning_rate": 0.00012616443904414743, + "loss": 0.345, + "step": 3115 + }, + { + "epoch": 0.2524303305249514, + "grad_norm": 0.03675699606537819, + "learning_rate": 0.00012620494127176996, + "loss": 0.318, + "step": 3116 + }, + { + "epoch": 0.25251134154244975, + "grad_norm": 0.04463730379939079, + "learning_rate": 0.00012624544349939247, + "loss": 0.3599, + "step": 3117 + }, + { + "epoch": 0.25259235255994816, + "grad_norm": 0.04861759394407272, + "learning_rate": 0.000126285945727015, + "loss": 0.3645, + "step": 3118 + }, + { + "epoch": 0.2526733635774465, + "grad_norm": 0.039907146245241165, + "learning_rate": 0.0001263264479546375, + "loss": 0.3723, + "step": 3119 + }, + { + "epoch": 0.25275437459494493, + "grad_norm": 0.051097266376018524, + "learning_rate": 0.00012636695018226004, + "loss": 0.394, + "step": 3120 + }, + { + "epoch": 0.2528353856124433, + "grad_norm": 0.04374748468399048, + "learning_rate": 0.00012640745240988254, + "loss": 0.4085, + "step": 3121 + }, + { + "epoch": 0.2529163966299417, + "grad_norm": 0.04951639473438263, + "learning_rate": 0.00012644795463750508, + "loss": 0.3515, + "step": 3122 + }, + { + "epoch": 0.25299740764744005, + "grad_norm": 0.04277408868074417, + "learning_rate": 0.00012648845686512758, + "loss": 0.3368, + "step": 3123 + }, + { + "epoch": 0.2530784186649384, + "grad_norm": 0.04922161623835564, + "learning_rate": 0.00012652895909275012, + "loss": 0.3734, + "step": 3124 + }, + { + "epoch": 0.2531594296824368, + "grad_norm": 0.05067862197756767, + "learning_rate": 0.00012656946132037262, + "loss": 0.381, + "step": 3125 + }, + { + "epoch": 0.2532404406999352, + "grad_norm": 0.03622680902481079, + "learning_rate": 0.00012660996354799516, + "loss": 0.3465, + "step": 3126 + }, + { + "epoch": 0.2533214517174336, + "grad_norm": 0.04435792192816734, + "learning_rate": 0.00012665046577561766, + "loss": 
0.3514, + "step": 3127 + }, + { + "epoch": 0.25340246273493194, + "grad_norm": 0.047936275601387024, + "learning_rate": 0.0001266909680032402, + "loss": 0.3669, + "step": 3128 + }, + { + "epoch": 0.25348347375243035, + "grad_norm": 0.050081074237823486, + "learning_rate": 0.0001267314702308627, + "loss": 0.3585, + "step": 3129 + }, + { + "epoch": 0.2535644847699287, + "grad_norm": 0.0505104586482048, + "learning_rate": 0.00012677197245848523, + "loss": 0.3788, + "step": 3130 + }, + { + "epoch": 0.25364549578742707, + "grad_norm": 0.03991609066724777, + "learning_rate": 0.00012681247468610774, + "loss": 0.3467, + "step": 3131 + }, + { + "epoch": 0.2537265068049255, + "grad_norm": 0.04814030975103378, + "learning_rate": 0.00012685297691373025, + "loss": 0.3656, + "step": 3132 + }, + { + "epoch": 0.25380751782242383, + "grad_norm": 0.04745684564113617, + "learning_rate": 0.00012689347914135278, + "loss": 0.3873, + "step": 3133 + }, + { + "epoch": 0.25388852883992225, + "grad_norm": 0.047067102044820786, + "learning_rate": 0.00012693398136897529, + "loss": 0.3876, + "step": 3134 + }, + { + "epoch": 0.2539695398574206, + "grad_norm": 0.041653912514448166, + "learning_rate": 0.00012697448359659782, + "loss": 0.3832, + "step": 3135 + }, + { + "epoch": 0.254050550874919, + "grad_norm": 0.04571259766817093, + "learning_rate": 0.00012701498582422033, + "loss": 0.3589, + "step": 3136 + }, + { + "epoch": 0.25413156189241737, + "grad_norm": 0.038450032472610474, + "learning_rate": 0.00012705548805184286, + "loss": 0.3516, + "step": 3137 + }, + { + "epoch": 0.2542125729099157, + "grad_norm": 0.04408708214759827, + "learning_rate": 0.00012709599027946536, + "loss": 0.3606, + "step": 3138 + }, + { + "epoch": 0.25429358392741414, + "grad_norm": 0.05931435525417328, + "learning_rate": 0.0001271364925070879, + "loss": 0.3971, + "step": 3139 + }, + { + "epoch": 0.2543745949449125, + "grad_norm": 0.04115188121795654, + "learning_rate": 0.0001271769947347104, + "loss": 0.352, + "step": 3140 + }, + { + "epoch": 0.2544556059624109, + "grad_norm": 0.052127905189991, + "learning_rate": 0.00012721749696233294, + "loss": 0.3807, + "step": 3141 + }, + { + "epoch": 0.25453661697990926, + "grad_norm": 0.04414311423897743, + "learning_rate": 0.00012725799918995544, + "loss": 0.4141, + "step": 3142 + }, + { + "epoch": 0.25461762799740767, + "grad_norm": 0.049567751586437225, + "learning_rate": 0.00012729850141757798, + "loss": 0.3739, + "step": 3143 + }, + { + "epoch": 0.254698639014906, + "grad_norm": 0.056248169392347336, + "learning_rate": 0.00012733900364520048, + "loss": 0.359, + "step": 3144 + }, + { + "epoch": 0.2547796500324044, + "grad_norm": 0.04483194649219513, + "learning_rate": 0.00012737950587282302, + "loss": 0.3429, + "step": 3145 + }, + { + "epoch": 0.2548606610499028, + "grad_norm": 0.04335392266511917, + "learning_rate": 0.00012742000810044552, + "loss": 0.3925, + "step": 3146 + }, + { + "epoch": 0.25494167206740115, + "grad_norm": 0.03910553455352783, + "learning_rate": 0.00012746051032806806, + "loss": 0.3952, + "step": 3147 + }, + { + "epoch": 0.25502268308489956, + "grad_norm": 0.05573083087801933, + "learning_rate": 0.0001275010125556906, + "loss": 0.3531, + "step": 3148 + }, + { + "epoch": 0.2551036941023979, + "grad_norm": 0.05946796387434006, + "learning_rate": 0.0001275415147833131, + "loss": 0.373, + "step": 3149 + }, + { + "epoch": 0.2551847051198963, + "grad_norm": 0.04649536684155464, + "learning_rate": 0.00012758201701093563, + "loss": 0.415, + "step": 3150 + }, + { + "epoch": 
0.2552657161373947, + "grad_norm": 0.04201361909508705, + "learning_rate": 0.00012762251923855813, + "loss": 0.3631, + "step": 3151 + }, + { + "epoch": 0.25534672715489304, + "grad_norm": 0.04001443833112717, + "learning_rate": 0.00012766302146618067, + "loss": 0.3667, + "step": 3152 + }, + { + "epoch": 0.25542773817239145, + "grad_norm": 0.051107991486787796, + "learning_rate": 0.00012770352369380317, + "loss": 0.4328, + "step": 3153 + }, + { + "epoch": 0.2555087491898898, + "grad_norm": 0.04720538482069969, + "learning_rate": 0.00012774402592142568, + "loss": 0.3522, + "step": 3154 + }, + { + "epoch": 0.2555897602073882, + "grad_norm": 0.0412609800696373, + "learning_rate": 0.0001277845281490482, + "loss": 0.4278, + "step": 3155 + }, + { + "epoch": 0.2556707712248866, + "grad_norm": 0.03820064663887024, + "learning_rate": 0.00012782503037667072, + "loss": 0.3855, + "step": 3156 + }, + { + "epoch": 0.255751782242385, + "grad_norm": 0.04344452545046806, + "learning_rate": 0.00012786553260429323, + "loss": 0.4136, + "step": 3157 + }, + { + "epoch": 0.25583279325988334, + "grad_norm": 0.04224742576479912, + "learning_rate": 0.00012790603483191576, + "loss": 0.3729, + "step": 3158 + }, + { + "epoch": 0.25591380427738175, + "grad_norm": 0.039598360657691956, + "learning_rate": 0.00012794653705953826, + "loss": 0.3878, + "step": 3159 + }, + { + "epoch": 0.2559948152948801, + "grad_norm": 0.035558078438043594, + "learning_rate": 0.0001279870392871608, + "loss": 0.3357, + "step": 3160 + }, + { + "epoch": 0.25607582631237846, + "grad_norm": 0.04091186448931694, + "learning_rate": 0.0001280275415147833, + "loss": 0.3201, + "step": 3161 + }, + { + "epoch": 0.2561568373298769, + "grad_norm": 0.056138601154088974, + "learning_rate": 0.00012806804374240584, + "loss": 0.4365, + "step": 3162 + }, + { + "epoch": 0.25623784834737523, + "grad_norm": 0.041428547352552414, + "learning_rate": 0.00012810854597002834, + "loss": 0.3792, + "step": 3163 + }, + { + "epoch": 0.25631885936487364, + "grad_norm": 0.04135297238826752, + "learning_rate": 0.00012814904819765088, + "loss": 0.3543, + "step": 3164 + }, + { + "epoch": 0.256399870382372, + "grad_norm": 0.041687510907649994, + "learning_rate": 0.00012818955042527338, + "loss": 0.3958, + "step": 3165 + }, + { + "epoch": 0.2564808813998704, + "grad_norm": 0.0362611822783947, + "learning_rate": 0.00012823005265289592, + "loss": 0.366, + "step": 3166 + }, + { + "epoch": 0.25656189241736876, + "grad_norm": 0.04926367476582527, + "learning_rate": 0.00012827055488051845, + "loss": 0.436, + "step": 3167 + }, + { + "epoch": 0.2566429034348671, + "grad_norm": 0.04708395525813103, + "learning_rate": 0.00012831105710814096, + "loss": 0.4092, + "step": 3168 + }, + { + "epoch": 0.25672391445236553, + "grad_norm": 0.03967064991593361, + "learning_rate": 0.0001283515593357635, + "loss": 0.3539, + "step": 3169 + }, + { + "epoch": 0.2568049254698639, + "grad_norm": 0.04242156818509102, + "learning_rate": 0.000128392061563386, + "loss": 0.3838, + "step": 3170 + }, + { + "epoch": 0.2568859364873623, + "grad_norm": 0.03780914098024368, + "learning_rate": 0.00012843256379100853, + "loss": 0.3519, + "step": 3171 + }, + { + "epoch": 0.25696694750486065, + "grad_norm": 0.04463619738817215, + "learning_rate": 0.00012847306601863103, + "loss": 0.3899, + "step": 3172 + }, + { + "epoch": 0.25704795852235907, + "grad_norm": 0.040854331105947495, + "learning_rate": 0.00012851356824625357, + "loss": 0.3554, + "step": 3173 + }, + { + "epoch": 0.2571289695398574, + "grad_norm": 
0.04449096694588661, + "learning_rate": 0.00012855407047387607, + "loss": 0.309, + "step": 3174 + }, + { + "epoch": 0.2572099805573558, + "grad_norm": 0.04881928488612175, + "learning_rate": 0.0001285945727014986, + "loss": 0.4015, + "step": 3175 + }, + { + "epoch": 0.2572909915748542, + "grad_norm": 0.041136372834444046, + "learning_rate": 0.0001286350749291211, + "loss": 0.3598, + "step": 3176 + }, + { + "epoch": 0.25737200259235254, + "grad_norm": 0.04056499898433685, + "learning_rate": 0.00012867557715674362, + "loss": 0.3847, + "step": 3177 + }, + { + "epoch": 0.25745301360985096, + "grad_norm": 0.047216638922691345, + "learning_rate": 0.00012871607938436615, + "loss": 0.3789, + "step": 3178 + }, + { + "epoch": 0.2575340246273493, + "grad_norm": 0.0441555492579937, + "learning_rate": 0.00012875658161198866, + "loss": 0.3974, + "step": 3179 + }, + { + "epoch": 0.2576150356448477, + "grad_norm": 0.03761187940835953, + "learning_rate": 0.00012879708383961116, + "loss": 0.42, + "step": 3180 + }, + { + "epoch": 0.2576960466623461, + "grad_norm": 0.043480463325977325, + "learning_rate": 0.0001288375860672337, + "loss": 0.3676, + "step": 3181 + }, + { + "epoch": 0.25777705767984443, + "grad_norm": 0.0372801311314106, + "learning_rate": 0.0001288780882948562, + "loss": 0.3773, + "step": 3182 + }, + { + "epoch": 0.25785806869734285, + "grad_norm": 0.04209472984075546, + "learning_rate": 0.00012891859052247874, + "loss": 0.3822, + "step": 3183 + }, + { + "epoch": 0.2579390797148412, + "grad_norm": 0.04223468527197838, + "learning_rate": 0.00012895909275010124, + "loss": 0.3481, + "step": 3184 + }, + { + "epoch": 0.2580200907323396, + "grad_norm": 0.051124799996614456, + "learning_rate": 0.00012899959497772378, + "loss": 0.4014, + "step": 3185 + }, + { + "epoch": 0.25810110174983797, + "grad_norm": 0.040176596492528915, + "learning_rate": 0.0001290400972053463, + "loss": 0.352, + "step": 3186 + }, + { + "epoch": 0.2581821127673364, + "grad_norm": 0.04070594534277916, + "learning_rate": 0.00012908059943296882, + "loss": 0.3603, + "step": 3187 + }, + { + "epoch": 0.25826312378483474, + "grad_norm": 0.03919408470392227, + "learning_rate": 0.00012912110166059135, + "loss": 0.3743, + "step": 3188 + }, + { + "epoch": 0.2583441348023331, + "grad_norm": 0.043664198368787766, + "learning_rate": 0.00012916160388821385, + "loss": 0.4041, + "step": 3189 + }, + { + "epoch": 0.2584251458198315, + "grad_norm": 0.04055207595229149, + "learning_rate": 0.0001292021061158364, + "loss": 0.3676, + "step": 3190 + }, + { + "epoch": 0.25850615683732986, + "grad_norm": 0.041981372982263565, + "learning_rate": 0.0001292426083434589, + "loss": 0.3562, + "step": 3191 + }, + { + "epoch": 0.25858716785482827, + "grad_norm": 0.04531414806842804, + "learning_rate": 0.00012928311057108143, + "loss": 0.3404, + "step": 3192 + }, + { + "epoch": 0.2586681788723266, + "grad_norm": 0.043773435056209564, + "learning_rate": 0.00012932361279870393, + "loss": 0.3807, + "step": 3193 + }, + { + "epoch": 0.25874918988982504, + "grad_norm": 0.04385381191968918, + "learning_rate": 0.00012936411502632647, + "loss": 0.3815, + "step": 3194 + }, + { + "epoch": 0.2588302009073234, + "grad_norm": 0.03920425474643707, + "learning_rate": 0.00012940461725394897, + "loss": 0.3725, + "step": 3195 + }, + { + "epoch": 0.25891121192482175, + "grad_norm": 0.04236259311437607, + "learning_rate": 0.0001294451194815715, + "loss": 0.3781, + "step": 3196 + }, + { + "epoch": 0.25899222294232016, + "grad_norm": 0.05604299530386925, + "learning_rate": 
0.000129485621709194, + "loss": 0.3814, + "step": 3197 + }, + { + "epoch": 0.2590732339598185, + "grad_norm": 0.04819274693727493, + "learning_rate": 0.00012952612393681655, + "loss": 0.3885, + "step": 3198 + }, + { + "epoch": 0.2591542449773169, + "grad_norm": 0.04288613796234131, + "learning_rate": 0.00012956662616443905, + "loss": 0.4185, + "step": 3199 + }, + { + "epoch": 0.2592352559948153, + "grad_norm": 0.040658704936504364, + "learning_rate": 0.00012960712839206158, + "loss": 0.3725, + "step": 3200 + }, + { + "epoch": 0.2593162670123137, + "grad_norm": 0.03994179889559746, + "learning_rate": 0.0001296476306196841, + "loss": 0.3481, + "step": 3201 + }, + { + "epoch": 0.25939727802981205, + "grad_norm": 0.04214403033256531, + "learning_rate": 0.0001296881328473066, + "loss": 0.3794, + "step": 3202 + }, + { + "epoch": 0.25947828904731046, + "grad_norm": 0.0443929024040699, + "learning_rate": 0.00012972863507492913, + "loss": 0.377, + "step": 3203 + }, + { + "epoch": 0.2595593000648088, + "grad_norm": 0.04427499696612358, + "learning_rate": 0.00012976913730255164, + "loss": 0.4113, + "step": 3204 + }, + { + "epoch": 0.2596403110823072, + "grad_norm": 0.03917337581515312, + "learning_rate": 0.00012980963953017417, + "loss": 0.3133, + "step": 3205 + }, + { + "epoch": 0.2597213220998056, + "grad_norm": 0.040803954005241394, + "learning_rate": 0.00012985014175779668, + "loss": 0.3581, + "step": 3206 + }, + { + "epoch": 0.25980233311730394, + "grad_norm": 0.048127301037311554, + "learning_rate": 0.0001298906439854192, + "loss": 0.3959, + "step": 3207 + }, + { + "epoch": 0.25988334413480235, + "grad_norm": 0.051993049681186676, + "learning_rate": 0.00012993114621304171, + "loss": 0.4089, + "step": 3208 + }, + { + "epoch": 0.2599643551523007, + "grad_norm": 0.03847181797027588, + "learning_rate": 0.00012997164844066425, + "loss": 0.3845, + "step": 3209 + }, + { + "epoch": 0.2600453661697991, + "grad_norm": 0.043846599757671356, + "learning_rate": 0.00013001215066828675, + "loss": 0.4003, + "step": 3210 + }, + { + "epoch": 0.2601263771872975, + "grad_norm": 0.04759734496474266, + "learning_rate": 0.0001300526528959093, + "loss": 0.3772, + "step": 3211 + }, + { + "epoch": 0.26020738820479583, + "grad_norm": 0.03929856792092323, + "learning_rate": 0.0001300931551235318, + "loss": 0.403, + "step": 3212 + }, + { + "epoch": 0.26028839922229424, + "grad_norm": 0.036084044724702835, + "learning_rate": 0.00013013365735115433, + "loss": 0.3519, + "step": 3213 + }, + { + "epoch": 0.2603694102397926, + "grad_norm": 0.040780842304229736, + "learning_rate": 0.00013017415957877683, + "loss": 0.333, + "step": 3214 + }, + { + "epoch": 0.260450421257291, + "grad_norm": 0.03689270839095116, + "learning_rate": 0.00013021466180639937, + "loss": 0.3211, + "step": 3215 + }, + { + "epoch": 0.26053143227478937, + "grad_norm": 0.04343465343117714, + "learning_rate": 0.00013025516403402187, + "loss": 0.4017, + "step": 3216 + }, + { + "epoch": 0.2606124432922878, + "grad_norm": 0.04075007885694504, + "learning_rate": 0.0001302956662616444, + "loss": 0.3772, + "step": 3217 + }, + { + "epoch": 0.26069345430978613, + "grad_norm": 0.04069703072309494, + "learning_rate": 0.0001303361684892669, + "loss": 0.3754, + "step": 3218 + }, + { + "epoch": 0.2607744653272845, + "grad_norm": 0.05063739791512489, + "learning_rate": 0.00013037667071688944, + "loss": 0.3919, + "step": 3219 + }, + { + "epoch": 0.2608554763447829, + "grad_norm": 0.04554930701851845, + "learning_rate": 0.00013041717294451195, + "loss": 0.376, + "step": 3220 + 
}, + { + "epoch": 0.26093648736228126, + "grad_norm": 0.03898908942937851, + "learning_rate": 0.00013045767517213448, + "loss": 0.3209, + "step": 3221 + }, + { + "epoch": 0.26101749837977967, + "grad_norm": 0.04082167148590088, + "learning_rate": 0.000130498177399757, + "loss": 0.3895, + "step": 3222 + }, + { + "epoch": 0.261098509397278, + "grad_norm": 0.04319079592823982, + "learning_rate": 0.00013053867962737952, + "loss": 0.3846, + "step": 3223 + }, + { + "epoch": 0.26117952041477643, + "grad_norm": 0.037946876138448715, + "learning_rate": 0.00013057918185500203, + "loss": 0.3475, + "step": 3224 + }, + { + "epoch": 0.2612605314322748, + "grad_norm": 0.047004539519548416, + "learning_rate": 0.00013061968408262456, + "loss": 0.3756, + "step": 3225 + }, + { + "epoch": 0.26134154244977315, + "grad_norm": 0.046914055943489075, + "learning_rate": 0.00013066018631024707, + "loss": 0.3843, + "step": 3226 + }, + { + "epoch": 0.26142255346727156, + "grad_norm": 0.03881968930363655, + "learning_rate": 0.00013070068853786957, + "loss": 0.3565, + "step": 3227 + }, + { + "epoch": 0.2615035644847699, + "grad_norm": 0.04270708188414574, + "learning_rate": 0.0001307411907654921, + "loss": 0.4095, + "step": 3228 + }, + { + "epoch": 0.2615845755022683, + "grad_norm": 0.044253475964069366, + "learning_rate": 0.00013078169299311461, + "loss": 0.3677, + "step": 3229 + }, + { + "epoch": 0.2616655865197667, + "grad_norm": 0.04407831281423569, + "learning_rate": 0.00013082219522073715, + "loss": 0.3748, + "step": 3230 + }, + { + "epoch": 0.2617465975372651, + "grad_norm": 0.04300897940993309, + "learning_rate": 0.00013086269744835965, + "loss": 0.4075, + "step": 3231 + }, + { + "epoch": 0.26182760855476345, + "grad_norm": 0.03860275447368622, + "learning_rate": 0.0001309031996759822, + "loss": 0.3697, + "step": 3232 + }, + { + "epoch": 0.2619086195722618, + "grad_norm": 0.04201575368642807, + "learning_rate": 0.0001309437019036047, + "loss": 0.3104, + "step": 3233 + }, + { + "epoch": 0.2619896305897602, + "grad_norm": 0.042958687990903854, + "learning_rate": 0.00013098420413122723, + "loss": 0.3594, + "step": 3234 + }, + { + "epoch": 0.26207064160725857, + "grad_norm": 0.05025588348507881, + "learning_rate": 0.00013102470635884973, + "loss": 0.4544, + "step": 3235 + }, + { + "epoch": 0.262151652624757, + "grad_norm": 0.04615491256117821, + "learning_rate": 0.00013106520858647227, + "loss": 0.3719, + "step": 3236 + }, + { + "epoch": 0.26223266364225534, + "grad_norm": 0.04639549180865288, + "learning_rate": 0.00013110571081409477, + "loss": 0.3913, + "step": 3237 + }, + { + "epoch": 0.26231367465975375, + "grad_norm": 0.04262283071875572, + "learning_rate": 0.0001311462130417173, + "loss": 0.3961, + "step": 3238 + }, + { + "epoch": 0.2623946856772521, + "grad_norm": 0.04736074432730675, + "learning_rate": 0.0001311867152693398, + "loss": 0.3343, + "step": 3239 + }, + { + "epoch": 0.26247569669475046, + "grad_norm": 0.046176109462976456, + "learning_rate": 0.00013122721749696234, + "loss": 0.3841, + "step": 3240 + }, + { + "epoch": 0.26255670771224887, + "grad_norm": 0.038287967443466187, + "learning_rate": 0.00013126771972458485, + "loss": 0.4, + "step": 3241 + }, + { + "epoch": 0.2626377187297472, + "grad_norm": 0.04353439062833786, + "learning_rate": 0.00013130822195220738, + "loss": 0.3573, + "step": 3242 + }, + { + "epoch": 0.26271872974724564, + "grad_norm": 0.042398590594530106, + "learning_rate": 0.00013134872417982992, + "loss": 0.3832, + "step": 3243 + }, + { + "epoch": 0.262799740764744, + 
"grad_norm": 0.03516731783747673, + "learning_rate": 0.00013138922640745242, + "loss": 0.3693, + "step": 3244 + }, + { + "epoch": 0.2628807517822424, + "grad_norm": 0.04480927065014839, + "learning_rate": 0.00013142972863507496, + "loss": 0.3878, + "step": 3245 + }, + { + "epoch": 0.26296176279974076, + "grad_norm": 0.03937854990363121, + "learning_rate": 0.00013147023086269746, + "loss": 0.3739, + "step": 3246 + }, + { + "epoch": 0.2630427738172391, + "grad_norm": 0.0367438830435276, + "learning_rate": 0.00013151073309032, + "loss": 0.4108, + "step": 3247 + }, + { + "epoch": 0.26312378483473753, + "grad_norm": 0.04266749322414398, + "learning_rate": 0.0001315512353179425, + "loss": 0.3796, + "step": 3248 + }, + { + "epoch": 0.2632047958522359, + "grad_norm": 0.040220484137535095, + "learning_rate": 0.000131591737545565, + "loss": 0.341, + "step": 3249 + }, + { + "epoch": 0.2632858068697343, + "grad_norm": 0.0418410487473011, + "learning_rate": 0.00013163223977318751, + "loss": 0.3756, + "step": 3250 + }, + { + "epoch": 0.26336681788723265, + "grad_norm": 0.04273224249482155, + "learning_rate": 0.00013167274200081005, + "loss": 0.3772, + "step": 3251 + }, + { + "epoch": 0.26344782890473106, + "grad_norm": 0.04679445922374725, + "learning_rate": 0.00013171324422843255, + "loss": 0.4407, + "step": 3252 + }, + { + "epoch": 0.2635288399222294, + "grad_norm": 0.03873904049396515, + "learning_rate": 0.00013175374645605509, + "loss": 0.354, + "step": 3253 + }, + { + "epoch": 0.26360985093972783, + "grad_norm": 0.04092535004019737, + "learning_rate": 0.0001317942486836776, + "loss": 0.3934, + "step": 3254 + }, + { + "epoch": 0.2636908619572262, + "grad_norm": 0.04127156734466553, + "learning_rate": 0.00013183475091130013, + "loss": 0.379, + "step": 3255 + }, + { + "epoch": 0.26377187297472454, + "grad_norm": 0.03976596146821976, + "learning_rate": 0.00013187525313892263, + "loss": 0.3379, + "step": 3256 + }, + { + "epoch": 0.26385288399222295, + "grad_norm": 0.04563805088400841, + "learning_rate": 0.00013191575536654516, + "loss": 0.3389, + "step": 3257 + }, + { + "epoch": 0.2639338950097213, + "grad_norm": 0.03627234324812889, + "learning_rate": 0.00013195625759416767, + "loss": 0.3842, + "step": 3258 + }, + { + "epoch": 0.2640149060272197, + "grad_norm": 0.03891756385564804, + "learning_rate": 0.0001319967598217902, + "loss": 0.3434, + "step": 3259 + }, + { + "epoch": 0.2640959170447181, + "grad_norm": 0.0457911491394043, + "learning_rate": 0.0001320372620494127, + "loss": 0.3799, + "step": 3260 + }, + { + "epoch": 0.2641769280622165, + "grad_norm": 0.04247802123427391, + "learning_rate": 0.00013207776427703524, + "loss": 0.3573, + "step": 3261 + }, + { + "epoch": 0.26425793907971484, + "grad_norm": 0.03541119396686554, + "learning_rate": 0.00013211826650465778, + "loss": 0.3333, + "step": 3262 + }, + { + "epoch": 0.2643389500972132, + "grad_norm": 0.039114829152822495, + "learning_rate": 0.00013215876873228028, + "loss": 0.3675, + "step": 3263 + }, + { + "epoch": 0.2644199611147116, + "grad_norm": 0.04546349123120308, + "learning_rate": 0.00013219927095990282, + "loss": 0.3912, + "step": 3264 + }, + { + "epoch": 0.26450097213220997, + "grad_norm": 0.042006999254226685, + "learning_rate": 0.00013223977318752532, + "loss": 0.3663, + "step": 3265 + }, + { + "epoch": 0.2645819831497084, + "grad_norm": 0.0530223585665226, + "learning_rate": 0.00013228027541514786, + "loss": 0.3815, + "step": 3266 + }, + { + "epoch": 0.26466299416720673, + "grad_norm": 0.046276140958070755, + "learning_rate": 
0.00013232077764277036, + "loss": 0.4073, + "step": 3267 + }, + { + "epoch": 0.26474400518470514, + "grad_norm": 0.03913923352956772, + "learning_rate": 0.0001323612798703929, + "loss": 0.3715, + "step": 3268 + }, + { + "epoch": 0.2648250162022035, + "grad_norm": 0.03895312175154686, + "learning_rate": 0.0001324017820980154, + "loss": 0.3929, + "step": 3269 + }, + { + "epoch": 0.26490602721970186, + "grad_norm": 0.03699490427970886, + "learning_rate": 0.00013244228432563793, + "loss": 0.3966, + "step": 3270 + }, + { + "epoch": 0.26498703823720027, + "grad_norm": 0.042337387800216675, + "learning_rate": 0.00013248278655326044, + "loss": 0.3573, + "step": 3271 + }, + { + "epoch": 0.2650680492546986, + "grad_norm": 0.03894384205341339, + "learning_rate": 0.00013252328878088295, + "loss": 0.3855, + "step": 3272 + }, + { + "epoch": 0.26514906027219703, + "grad_norm": 0.05220978334546089, + "learning_rate": 0.00013256379100850548, + "loss": 0.3908, + "step": 3273 + }, + { + "epoch": 0.2652300712896954, + "grad_norm": 0.046401623636484146, + "learning_rate": 0.00013260429323612799, + "loss": 0.43, + "step": 3274 + }, + { + "epoch": 0.2653110823071938, + "grad_norm": 0.04030894115567207, + "learning_rate": 0.0001326447954637505, + "loss": 0.4457, + "step": 3275 + }, + { + "epoch": 0.26539209332469216, + "grad_norm": 0.03204037621617317, + "learning_rate": 0.00013268529769137303, + "loss": 0.3711, + "step": 3276 + }, + { + "epoch": 0.2654731043421905, + "grad_norm": 0.03618483617901802, + "learning_rate": 0.00013272579991899553, + "loss": 0.3423, + "step": 3277 + }, + { + "epoch": 0.2655541153596889, + "grad_norm": 0.040721192955970764, + "learning_rate": 0.00013276630214661806, + "loss": 0.3368, + "step": 3278 + }, + { + "epoch": 0.2656351263771873, + "grad_norm": 0.04458589851856232, + "learning_rate": 0.00013280680437424057, + "loss": 0.4423, + "step": 3279 + }, + { + "epoch": 0.2657161373946857, + "grad_norm": 0.048048872500658035, + "learning_rate": 0.0001328473066018631, + "loss": 0.3879, + "step": 3280 + }, + { + "epoch": 0.26579714841218405, + "grad_norm": 0.04069700837135315, + "learning_rate": 0.00013288780882948564, + "loss": 0.3699, + "step": 3281 + }, + { + "epoch": 0.26587815942968246, + "grad_norm": 0.04361570626497269, + "learning_rate": 0.00013292831105710814, + "loss": 0.4226, + "step": 3282 + }, + { + "epoch": 0.2659591704471808, + "grad_norm": 0.03607351705431938, + "learning_rate": 0.00013296881328473068, + "loss": 0.3625, + "step": 3283 + }, + { + "epoch": 0.26604018146467917, + "grad_norm": 0.0401526503264904, + "learning_rate": 0.00013300931551235318, + "loss": 0.379, + "step": 3284 + }, + { + "epoch": 0.2661211924821776, + "grad_norm": 0.04371299222111702, + "learning_rate": 0.00013304981773997572, + "loss": 0.3464, + "step": 3285 + }, + { + "epoch": 0.26620220349967594, + "grad_norm": 0.04439733177423477, + "learning_rate": 0.00013309031996759822, + "loss": 0.3851, + "step": 3286 + }, + { + "epoch": 0.26628321451717435, + "grad_norm": 0.04283667728304863, + "learning_rate": 0.00013313082219522075, + "loss": 0.3645, + "step": 3287 + }, + { + "epoch": 0.2663642255346727, + "grad_norm": 0.04181892052292824, + "learning_rate": 0.00013317132442284326, + "loss": 0.3553, + "step": 3288 + }, + { + "epoch": 0.2664452365521711, + "grad_norm": 0.045632556080818176, + "learning_rate": 0.0001332118266504658, + "loss": 0.3707, + "step": 3289 + }, + { + "epoch": 0.26652624756966947, + "grad_norm": 0.041404347866773605, + "learning_rate": 0.0001332523288780883, + "loss": 0.3905, + "step": 
3290 + }, + { + "epoch": 0.26660725858716783, + "grad_norm": 0.04776446893811226, + "learning_rate": 0.00013329283110571083, + "loss": 0.4064, + "step": 3291 + }, + { + "epoch": 0.26668826960466624, + "grad_norm": 0.05265938863158226, + "learning_rate": 0.00013333333333333334, + "loss": 0.3756, + "step": 3292 + }, + { + "epoch": 0.2667692806221646, + "grad_norm": 0.04272965341806412, + "learning_rate": 0.00013337383556095587, + "loss": 0.4049, + "step": 3293 + }, + { + "epoch": 0.266850291639663, + "grad_norm": 0.04434579610824585, + "learning_rate": 0.00013341433778857838, + "loss": 0.3837, + "step": 3294 + }, + { + "epoch": 0.26693130265716136, + "grad_norm": 0.04281012713909149, + "learning_rate": 0.0001334548400162009, + "loss": 0.405, + "step": 3295 + }, + { + "epoch": 0.2670123136746598, + "grad_norm": 0.04999249801039696, + "learning_rate": 0.00013349534224382342, + "loss": 0.3927, + "step": 3296 + }, + { + "epoch": 0.26709332469215813, + "grad_norm": 0.03942561522126198, + "learning_rate": 0.00013353584447144592, + "loss": 0.3599, + "step": 3297 + }, + { + "epoch": 0.26717433570965654, + "grad_norm": 0.04372606426477432, + "learning_rate": 0.00013357634669906846, + "loss": 0.3304, + "step": 3298 + }, + { + "epoch": 0.2672553467271549, + "grad_norm": 0.03771752864122391, + "learning_rate": 0.00013361684892669096, + "loss": 0.3816, + "step": 3299 + }, + { + "epoch": 0.26733635774465325, + "grad_norm": 0.047414857894182205, + "learning_rate": 0.0001336573511543135, + "loss": 0.3672, + "step": 3300 + }, + { + "epoch": 0.26741736876215166, + "grad_norm": 0.04105841740965843, + "learning_rate": 0.000133697853381936, + "loss": 0.3384, + "step": 3301 + }, + { + "epoch": 0.26749837977965, + "grad_norm": 0.03685571625828743, + "learning_rate": 0.00013373835560955854, + "loss": 0.3882, + "step": 3302 + }, + { + "epoch": 0.26757939079714843, + "grad_norm": 0.05935465544462204, + "learning_rate": 0.00013377885783718104, + "loss": 0.4173, + "step": 3303 + }, + { + "epoch": 0.2676604018146468, + "grad_norm": 0.03994208946824074, + "learning_rate": 0.00013381936006480358, + "loss": 0.3625, + "step": 3304 + }, + { + "epoch": 0.2677414128321452, + "grad_norm": 0.04886750131845474, + "learning_rate": 0.00013385986229242608, + "loss": 0.4518, + "step": 3305 + }, + { + "epoch": 0.26782242384964355, + "grad_norm": 0.03798381984233856, + "learning_rate": 0.00013390036452004862, + "loss": 0.367, + "step": 3306 + }, + { + "epoch": 0.2679034348671419, + "grad_norm": 0.04732591658830643, + "learning_rate": 0.00013394086674767112, + "loss": 0.3887, + "step": 3307 + }, + { + "epoch": 0.2679844458846403, + "grad_norm": 0.034767456352710724, + "learning_rate": 0.00013398136897529365, + "loss": 0.361, + "step": 3308 + }, + { + "epoch": 0.2680654569021387, + "grad_norm": 0.036919280886650085, + "learning_rate": 0.00013402187120291616, + "loss": 0.3588, + "step": 3309 + }, + { + "epoch": 0.2681464679196371, + "grad_norm": 0.04431672394275665, + "learning_rate": 0.0001340623734305387, + "loss": 0.3589, + "step": 3310 + }, + { + "epoch": 0.26822747893713544, + "grad_norm": 0.04340912401676178, + "learning_rate": 0.0001341028756581612, + "loss": 0.343, + "step": 3311 + }, + { + "epoch": 0.26830848995463386, + "grad_norm": 0.03599447011947632, + "learning_rate": 0.00013414337788578373, + "loss": 0.3526, + "step": 3312 + }, + { + "epoch": 0.2683895009721322, + "grad_norm": 0.04156935214996338, + "learning_rate": 0.00013418388011340624, + "loss": 0.3797, + "step": 3313 + }, + { + "epoch": 0.26847051198963057, + 
"grad_norm": 0.036646392196416855, + "learning_rate": 0.00013422438234102877, + "loss": 0.3596, + "step": 3314 + }, + { + "epoch": 0.268551523007129, + "grad_norm": 0.043701015412807465, + "learning_rate": 0.00013426488456865128, + "loss": 0.4103, + "step": 3315 + }, + { + "epoch": 0.26863253402462733, + "grad_norm": 0.04093540832400322, + "learning_rate": 0.0001343053867962738, + "loss": 0.3778, + "step": 3316 + }, + { + "epoch": 0.26871354504212575, + "grad_norm": 0.043862100690603256, + "learning_rate": 0.00013434588902389632, + "loss": 0.3737, + "step": 3317 + }, + { + "epoch": 0.2687945560596241, + "grad_norm": 0.04786309227347374, + "learning_rate": 0.00013438639125151885, + "loss": 0.3606, + "step": 3318 + }, + { + "epoch": 0.2688755670771225, + "grad_norm": 0.03975916653871536, + "learning_rate": 0.00013442689347914136, + "loss": 0.3923, + "step": 3319 + }, + { + "epoch": 0.26895657809462087, + "grad_norm": 0.040104031562805176, + "learning_rate": 0.00013446739570676386, + "loss": 0.3744, + "step": 3320 + }, + { + "epoch": 0.2690375891121192, + "grad_norm": 0.045201487839221954, + "learning_rate": 0.0001345078979343864, + "loss": 0.3496, + "step": 3321 + }, + { + "epoch": 0.26911860012961764, + "grad_norm": 0.042033299803733826, + "learning_rate": 0.0001345484001620089, + "loss": 0.3831, + "step": 3322 + }, + { + "epoch": 0.269199611147116, + "grad_norm": 0.03977862000465393, + "learning_rate": 0.00013458890238963144, + "loss": 0.3882, + "step": 3323 + }, + { + "epoch": 0.2692806221646144, + "grad_norm": 0.03551046550273895, + "learning_rate": 0.00013462940461725394, + "loss": 0.323, + "step": 3324 + }, + { + "epoch": 0.26936163318211276, + "grad_norm": 0.03731973096728325, + "learning_rate": 0.00013466990684487648, + "loss": 0.3815, + "step": 3325 + }, + { + "epoch": 0.26944264419961117, + "grad_norm": 0.04608595743775368, + "learning_rate": 0.00013471040907249898, + "loss": 0.4272, + "step": 3326 + }, + { + "epoch": 0.2695236552171095, + "grad_norm": 0.04043601453304291, + "learning_rate": 0.00013475091130012151, + "loss": 0.4027, + "step": 3327 + }, + { + "epoch": 0.2696046662346079, + "grad_norm": 0.03765019401907921, + "learning_rate": 0.00013479141352774402, + "loss": 0.4161, + "step": 3328 + }, + { + "epoch": 0.2696856772521063, + "grad_norm": 0.04062202200293541, + "learning_rate": 0.00013483191575536655, + "loss": 0.3838, + "step": 3329 + }, + { + "epoch": 0.26976668826960465, + "grad_norm": 0.04692335054278374, + "learning_rate": 0.00013487241798298906, + "loss": 0.3961, + "step": 3330 + }, + { + "epoch": 0.26984769928710306, + "grad_norm": 0.03722019121050835, + "learning_rate": 0.0001349129202106116, + "loss": 0.3208, + "step": 3331 + }, + { + "epoch": 0.2699287103046014, + "grad_norm": 0.03446126729249954, + "learning_rate": 0.0001349534224382341, + "loss": 0.3415, + "step": 3332 + }, + { + "epoch": 0.2700097213220998, + "grad_norm": 0.039892058819532394, + "learning_rate": 0.00013499392466585663, + "loss": 0.3001, + "step": 3333 + }, + { + "epoch": 0.2700907323395982, + "grad_norm": 0.04407581314444542, + "learning_rate": 0.00013503442689347914, + "loss": 0.3266, + "step": 3334 + }, + { + "epoch": 0.27017174335709654, + "grad_norm": 0.042377736419439316, + "learning_rate": 0.00013507492912110167, + "loss": 0.3708, + "step": 3335 + }, + { + "epoch": 0.27025275437459495, + "grad_norm": 0.036600060760974884, + "learning_rate": 0.0001351154313487242, + "loss": 0.3425, + "step": 3336 + }, + { + "epoch": 0.2703337653920933, + "grad_norm": 0.04431897774338722, + 
"learning_rate": 0.0001351559335763467, + "loss": 0.3732, + "step": 3337 + }, + { + "epoch": 0.2704147764095917, + "grad_norm": 0.04788152873516083, + "learning_rate": 0.00013519643580396924, + "loss": 0.382, + "step": 3338 + }, + { + "epoch": 0.2704957874270901, + "grad_norm": 0.04281443729996681, + "learning_rate": 0.00013523693803159175, + "loss": 0.3487, + "step": 3339 + }, + { + "epoch": 0.2705767984445885, + "grad_norm": 0.04157177358865738, + "learning_rate": 0.00013527744025921428, + "loss": 0.3783, + "step": 3340 + }, + { + "epoch": 0.27065780946208684, + "grad_norm": 0.046408966183662415, + "learning_rate": 0.0001353179424868368, + "loss": 0.3998, + "step": 3341 + }, + { + "epoch": 0.2707388204795852, + "grad_norm": 0.04052488133311272, + "learning_rate": 0.0001353584447144593, + "loss": 0.4103, + "step": 3342 + }, + { + "epoch": 0.2708198314970836, + "grad_norm": 0.041967444121837616, + "learning_rate": 0.00013539894694208183, + "loss": 0.3815, + "step": 3343 + }, + { + "epoch": 0.27090084251458196, + "grad_norm": 0.041668497025966644, + "learning_rate": 0.00013543944916970434, + "loss": 0.3921, + "step": 3344 + }, + { + "epoch": 0.2709818535320804, + "grad_norm": 0.042255863547325134, + "learning_rate": 0.00013547995139732684, + "loss": 0.3733, + "step": 3345 + }, + { + "epoch": 0.27106286454957873, + "grad_norm": 0.052985310554504395, + "learning_rate": 0.00013552045362494937, + "loss": 0.3898, + "step": 3346 + }, + { + "epoch": 0.27114387556707714, + "grad_norm": 0.03639228269457817, + "learning_rate": 0.00013556095585257188, + "loss": 0.3232, + "step": 3347 + }, + { + "epoch": 0.2712248865845755, + "grad_norm": 0.041970837861299515, + "learning_rate": 0.00013560145808019441, + "loss": 0.3487, + "step": 3348 + }, + { + "epoch": 0.2713058976020739, + "grad_norm": 0.041541244834661484, + "learning_rate": 0.00013564196030781692, + "loss": 0.3511, + "step": 3349 + }, + { + "epoch": 0.27138690861957226, + "grad_norm": 0.03761943429708481, + "learning_rate": 0.00013568246253543945, + "loss": 0.3305, + "step": 3350 + }, + { + "epoch": 0.2714679196370706, + "grad_norm": 0.03963864594697952, + "learning_rate": 0.00013572296476306196, + "loss": 0.3451, + "step": 3351 + }, + { + "epoch": 0.27154893065456903, + "grad_norm": 0.040368348360061646, + "learning_rate": 0.0001357634669906845, + "loss": 0.371, + "step": 3352 + }, + { + "epoch": 0.2716299416720674, + "grad_norm": 0.038501329720020294, + "learning_rate": 0.000135803969218307, + "loss": 0.4093, + "step": 3353 + }, + { + "epoch": 0.2717109526895658, + "grad_norm": 0.039939314126968384, + "learning_rate": 0.00013584447144592953, + "loss": 0.3919, + "step": 3354 + }, + { + "epoch": 0.27179196370706415, + "grad_norm": 0.04205890744924545, + "learning_rate": 0.00013588497367355207, + "loss": 0.3552, + "step": 3355 + }, + { + "epoch": 0.27187297472456257, + "grad_norm": 0.04411284998059273, + "learning_rate": 0.00013592547590117457, + "loss": 0.3334, + "step": 3356 + }, + { + "epoch": 0.2719539857420609, + "grad_norm": 0.05139146000146866, + "learning_rate": 0.0001359659781287971, + "loss": 0.324, + "step": 3357 + }, + { + "epoch": 0.2720349967595593, + "grad_norm": 0.04077056795358658, + "learning_rate": 0.0001360064803564196, + "loss": 0.3724, + "step": 3358 + }, + { + "epoch": 0.2721160077770577, + "grad_norm": 0.04681459814310074, + "learning_rate": 0.00013604698258404214, + "loss": 0.372, + "step": 3359 + }, + { + "epoch": 0.27219701879455604, + "grad_norm": 0.0463474839925766, + "learning_rate": 0.00013608748481166465, + "loss": 
0.3591, + "step": 3360 + }, + { + "epoch": 0.27227802981205446, + "grad_norm": 0.03969384729862213, + "learning_rate": 0.00013612798703928718, + "loss": 0.3452, + "step": 3361 + }, + { + "epoch": 0.2723590408295528, + "grad_norm": 0.03769201785326004, + "learning_rate": 0.0001361684892669097, + "loss": 0.4277, + "step": 3362 + }, + { + "epoch": 0.2724400518470512, + "grad_norm": 0.039927735924720764, + "learning_rate": 0.00013620899149453222, + "loss": 0.3531, + "step": 3363 + }, + { + "epoch": 0.2725210628645496, + "grad_norm": 0.05112633854150772, + "learning_rate": 0.00013624949372215473, + "loss": 0.3901, + "step": 3364 + }, + { + "epoch": 0.27260207388204793, + "grad_norm": 0.049192748963832855, + "learning_rate": 0.00013628999594977726, + "loss": 0.4118, + "step": 3365 + }, + { + "epoch": 0.27268308489954635, + "grad_norm": 0.04921245574951172, + "learning_rate": 0.00013633049817739977, + "loss": 0.4416, + "step": 3366 + }, + { + "epoch": 0.2727640959170447, + "grad_norm": 0.034451164305210114, + "learning_rate": 0.00013637100040502227, + "loss": 0.4156, + "step": 3367 + }, + { + "epoch": 0.2728451069345431, + "grad_norm": 0.03713851794600487, + "learning_rate": 0.0001364115026326448, + "loss": 0.3796, + "step": 3368 + }, + { + "epoch": 0.27292611795204147, + "grad_norm": 0.04367047920823097, + "learning_rate": 0.0001364520048602673, + "loss": 0.3143, + "step": 3369 + }, + { + "epoch": 0.2730071289695399, + "grad_norm": 0.039611510932445526, + "learning_rate": 0.00013649250708788982, + "loss": 0.3549, + "step": 3370 + }, + { + "epoch": 0.27308813998703824, + "grad_norm": 0.043952010571956635, + "learning_rate": 0.00013653300931551235, + "loss": 0.4153, + "step": 3371 + }, + { + "epoch": 0.2731691510045366, + "grad_norm": 0.04876040667295456, + "learning_rate": 0.00013657351154313486, + "loss": 0.3853, + "step": 3372 + }, + { + "epoch": 0.273250162022035, + "grad_norm": 0.04519908130168915, + "learning_rate": 0.0001366140137707574, + "loss": 0.3651, + "step": 3373 + }, + { + "epoch": 0.27333117303953336, + "grad_norm": 0.050599344074726105, + "learning_rate": 0.00013665451599837993, + "loss": 0.3816, + "step": 3374 + }, + { + "epoch": 0.27341218405703177, + "grad_norm": 0.04185096547007561, + "learning_rate": 0.00013669501822600243, + "loss": 0.3821, + "step": 3375 + }, + { + "epoch": 0.2734931950745301, + "grad_norm": 0.0364995002746582, + "learning_rate": 0.00013673552045362496, + "loss": 0.3356, + "step": 3376 + }, + { + "epoch": 0.27357420609202854, + "grad_norm": 0.044956009835004807, + "learning_rate": 0.00013677602268124747, + "loss": 0.3437, + "step": 3377 + }, + { + "epoch": 0.2736552171095269, + "grad_norm": 0.04424003139138222, + "learning_rate": 0.00013681652490887, + "loss": 0.3639, + "step": 3378 + }, + { + "epoch": 0.27373622812702525, + "grad_norm": 0.042500704526901245, + "learning_rate": 0.0001368570271364925, + "loss": 0.3316, + "step": 3379 + }, + { + "epoch": 0.27381723914452366, + "grad_norm": 0.03931257873773575, + "learning_rate": 0.00013689752936411504, + "loss": 0.3817, + "step": 3380 + }, + { + "epoch": 0.273898250162022, + "grad_norm": 0.0466313473880291, + "learning_rate": 0.00013693803159173755, + "loss": 0.3709, + "step": 3381 + }, + { + "epoch": 0.27397926117952043, + "grad_norm": 0.04171553999185562, + "learning_rate": 0.00013697853381936008, + "loss": 0.3918, + "step": 3382 + }, + { + "epoch": 0.2740602721970188, + "grad_norm": 0.04538581892848015, + "learning_rate": 0.0001370190360469826, + "loss": 0.3781, + "step": 3383 + }, + { + "epoch": 
0.2741412832145172, + "grad_norm": 0.03303420916199684, + "learning_rate": 0.00013705953827460512, + "loss": 0.3072, + "step": 3384 + }, + { + "epoch": 0.27422229423201555, + "grad_norm": 0.03682176396250725, + "learning_rate": 0.00013710004050222763, + "loss": 0.4019, + "step": 3385 + }, + { + "epoch": 0.2743033052495139, + "grad_norm": 0.040339164435863495, + "learning_rate": 0.00013714054272985016, + "loss": 0.4232, + "step": 3386 + }, + { + "epoch": 0.2743843162670123, + "grad_norm": 0.053271178156137466, + "learning_rate": 0.00013718104495747267, + "loss": 0.3692, + "step": 3387 + }, + { + "epoch": 0.2744653272845107, + "grad_norm": 0.045034825801849365, + "learning_rate": 0.0001372215471850952, + "loss": 0.4105, + "step": 3388 + }, + { + "epoch": 0.2745463383020091, + "grad_norm": 0.04269041493535042, + "learning_rate": 0.0001372620494127177, + "loss": 0.38, + "step": 3389 + }, + { + "epoch": 0.27462734931950744, + "grad_norm": 0.0487322062253952, + "learning_rate": 0.00013730255164034024, + "loss": 0.4426, + "step": 3390 + }, + { + "epoch": 0.27470836033700585, + "grad_norm": 0.041726551949977875, + "learning_rate": 0.00013734305386796275, + "loss": 0.3755, + "step": 3391 + }, + { + "epoch": 0.2747893713545042, + "grad_norm": 0.04072955250740051, + "learning_rate": 0.00013738355609558525, + "loss": 0.3748, + "step": 3392 + }, + { + "epoch": 0.2748703823720026, + "grad_norm": 0.04266681149601936, + "learning_rate": 0.00013742405832320779, + "loss": 0.349, + "step": 3393 + }, + { + "epoch": 0.274951393389501, + "grad_norm": 0.05282905325293541, + "learning_rate": 0.0001374645605508303, + "loss": 0.3642, + "step": 3394 + }, + { + "epoch": 0.27503240440699933, + "grad_norm": 0.04154253751039505, + "learning_rate": 0.00013750506277845282, + "loss": 0.4091, + "step": 3395 + }, + { + "epoch": 0.27511341542449774, + "grad_norm": 0.03817356377840042, + "learning_rate": 0.00013754556500607533, + "loss": 0.3445, + "step": 3396 + }, + { + "epoch": 0.2751944264419961, + "grad_norm": 0.04182487726211548, + "learning_rate": 0.00013758606723369786, + "loss": 0.3667, + "step": 3397 + }, + { + "epoch": 0.2752754374594945, + "grad_norm": 0.042500101029872894, + "learning_rate": 0.00013762656946132037, + "loss": 0.3824, + "step": 3398 + }, + { + "epoch": 0.27535644847699287, + "grad_norm": 0.03695574402809143, + "learning_rate": 0.0001376670716889429, + "loss": 0.3493, + "step": 3399 + }, + { + "epoch": 0.2754374594944913, + "grad_norm": 0.049295760691165924, + "learning_rate": 0.0001377075739165654, + "loss": 0.3522, + "step": 3400 + }, + { + "epoch": 0.27551847051198963, + "grad_norm": 0.04231920465826988, + "learning_rate": 0.00013774807614418794, + "loss": 0.3436, + "step": 3401 + }, + { + "epoch": 0.275599481529488, + "grad_norm": 0.040712591260671616, + "learning_rate": 0.00013778857837181045, + "loss": 0.395, + "step": 3402 + }, + { + "epoch": 0.2756804925469864, + "grad_norm": 0.034782666712999344, + "learning_rate": 0.00013782908059943298, + "loss": 0.3343, + "step": 3403 + }, + { + "epoch": 0.27576150356448476, + "grad_norm": 0.03873579949140549, + "learning_rate": 0.0001378695828270555, + "loss": 0.3334, + "step": 3404 + }, + { + "epoch": 0.27584251458198317, + "grad_norm": 0.03855769336223602, + "learning_rate": 0.00013791008505467802, + "loss": 0.3417, + "step": 3405 + }, + { + "epoch": 0.2759235255994815, + "grad_norm": 0.042153116315603256, + "learning_rate": 0.00013795058728230053, + "loss": 0.4225, + "step": 3406 + }, + { + "epoch": 0.27600453661697993, + "grad_norm": 
0.04109833016991615, + "learning_rate": 0.00013799108950992306, + "loss": 0.3462, + "step": 3407 + }, + { + "epoch": 0.2760855476344783, + "grad_norm": 0.03828966245055199, + "learning_rate": 0.00013803159173754557, + "loss": 0.367, + "step": 3408 + }, + { + "epoch": 0.27616655865197665, + "grad_norm": 0.05456683784723282, + "learning_rate": 0.0001380720939651681, + "loss": 0.3989, + "step": 3409 + }, + { + "epoch": 0.27624756966947506, + "grad_norm": 0.04691299423575401, + "learning_rate": 0.0001381125961927906, + "loss": 0.3572, + "step": 3410 + }, + { + "epoch": 0.2763285806869734, + "grad_norm": 0.046030715107917786, + "learning_rate": 0.00013815309842041314, + "loss": 0.3414, + "step": 3411 + }, + { + "epoch": 0.2764095917044718, + "grad_norm": 0.03669194132089615, + "learning_rate": 0.00013819360064803567, + "loss": 0.3463, + "step": 3412 + }, + { + "epoch": 0.2764906027219702, + "grad_norm": 0.03543637692928314, + "learning_rate": 0.00013823410287565818, + "loss": 0.3356, + "step": 3413 + }, + { + "epoch": 0.2765716137394686, + "grad_norm": 0.051583707332611084, + "learning_rate": 0.00013827460510328069, + "loss": 0.3421, + "step": 3414 + }, + { + "epoch": 0.27665262475696695, + "grad_norm": 0.039491068571805954, + "learning_rate": 0.0001383151073309032, + "loss": 0.3587, + "step": 3415 + }, + { + "epoch": 0.2767336357744653, + "grad_norm": 0.0397016741335392, + "learning_rate": 0.00013835560955852572, + "loss": 0.3307, + "step": 3416 + }, + { + "epoch": 0.2768146467919637, + "grad_norm": 0.04070290923118591, + "learning_rate": 0.00013839611178614823, + "loss": 0.3835, + "step": 3417 + }, + { + "epoch": 0.27689565780946207, + "grad_norm": 0.04390476271510124, + "learning_rate": 0.00013843661401377076, + "loss": 0.3459, + "step": 3418 + }, + { + "epoch": 0.2769766688269605, + "grad_norm": 0.04806474596261978, + "learning_rate": 0.00013847711624139327, + "loss": 0.3261, + "step": 3419 + }, + { + "epoch": 0.27705767984445884, + "grad_norm": 0.03897551819682121, + "learning_rate": 0.0001385176184690158, + "loss": 0.3872, + "step": 3420 + }, + { + "epoch": 0.27713869086195725, + "grad_norm": 0.035828590393066406, + "learning_rate": 0.0001385581206966383, + "loss": 0.4061, + "step": 3421 + }, + { + "epoch": 0.2772197018794556, + "grad_norm": 0.03442465886473656, + "learning_rate": 0.00013859862292426084, + "loss": 0.3272, + "step": 3422 + }, + { + "epoch": 0.27730071289695396, + "grad_norm": 0.04492630809545517, + "learning_rate": 0.00013863912515188335, + "loss": 0.3583, + "step": 3423 + }, + { + "epoch": 0.27738172391445237, + "grad_norm": 0.04906081035733223, + "learning_rate": 0.00013867962737950588, + "loss": 0.4207, + "step": 3424 + }, + { + "epoch": 0.2774627349319507, + "grad_norm": 0.04144575819373131, + "learning_rate": 0.0001387201296071284, + "loss": 0.4113, + "step": 3425 + }, + { + "epoch": 0.27754374594944914, + "grad_norm": 0.03677285090088844, + "learning_rate": 0.00013876063183475092, + "loss": 0.3383, + "step": 3426 + }, + { + "epoch": 0.2776247569669475, + "grad_norm": 0.0371638722717762, + "learning_rate": 0.00013880113406237343, + "loss": 0.3519, + "step": 3427 + }, + { + "epoch": 0.2777057679844459, + "grad_norm": 0.04500902071595192, + "learning_rate": 0.00013884163628999596, + "loss": 0.3751, + "step": 3428 + }, + { + "epoch": 0.27778677900194426, + "grad_norm": 0.03653738275170326, + "learning_rate": 0.00013888213851761847, + "loss": 0.3316, + "step": 3429 + }, + { + "epoch": 0.2778677900194426, + "grad_norm": 0.047400642186403275, + "learning_rate": 
0.000138922640745241, + "loss": 0.4025, + "step": 3430 + }, + { + "epoch": 0.27794880103694103, + "grad_norm": 0.043980009853839874, + "learning_rate": 0.00013896314297286353, + "loss": 0.3988, + "step": 3431 + }, + { + "epoch": 0.2780298120544394, + "grad_norm": 0.03693951293826103, + "learning_rate": 0.00013900364520048604, + "loss": 0.3071, + "step": 3432 + }, + { + "epoch": 0.2781108230719378, + "grad_norm": 0.045300330966711044, + "learning_rate": 0.00013904414742810857, + "loss": 0.4204, + "step": 3433 + }, + { + "epoch": 0.27819183408943615, + "grad_norm": 0.04068627208471298, + "learning_rate": 0.00013908464965573108, + "loss": 0.3378, + "step": 3434 + }, + { + "epoch": 0.27827284510693456, + "grad_norm": 0.036370884627103806, + "learning_rate": 0.0001391251518833536, + "loss": 0.3359, + "step": 3435 + }, + { + "epoch": 0.2783538561244329, + "grad_norm": 0.039670251309871674, + "learning_rate": 0.00013916565411097612, + "loss": 0.3862, + "step": 3436 + }, + { + "epoch": 0.27843486714193133, + "grad_norm": 0.04585297778248787, + "learning_rate": 0.00013920615633859862, + "loss": 0.4285, + "step": 3437 + }, + { + "epoch": 0.2785158781594297, + "grad_norm": 0.048730622977018356, + "learning_rate": 0.00013924665856622116, + "loss": 0.4015, + "step": 3438 + }, + { + "epoch": 0.27859688917692804, + "grad_norm": 0.05057798698544502, + "learning_rate": 0.00013928716079384366, + "loss": 0.3831, + "step": 3439 + }, + { + "epoch": 0.27867790019442645, + "grad_norm": 0.03900301083922386, + "learning_rate": 0.00013932766302146617, + "loss": 0.3171, + "step": 3440 + }, + { + "epoch": 0.2787589112119248, + "grad_norm": 0.03645486384630203, + "learning_rate": 0.0001393681652490887, + "loss": 0.3732, + "step": 3441 + }, + { + "epoch": 0.2788399222294232, + "grad_norm": 0.038289736956357956, + "learning_rate": 0.0001394086674767112, + "loss": 0.3929, + "step": 3442 + }, + { + "epoch": 0.2789209332469216, + "grad_norm": 0.05204898864030838, + "learning_rate": 0.00013944916970433374, + "loss": 0.4195, + "step": 3443 + }, + { + "epoch": 0.27900194426442, + "grad_norm": 0.03858143091201782, + "learning_rate": 0.00013948967193195625, + "loss": 0.3292, + "step": 3444 + }, + { + "epoch": 0.27908295528191834, + "grad_norm": 0.04965558275580406, + "learning_rate": 0.00013953017415957878, + "loss": 0.3984, + "step": 3445 + }, + { + "epoch": 0.2791639662994167, + "grad_norm": 0.03543282300233841, + "learning_rate": 0.0001395706763872013, + "loss": 0.4242, + "step": 3446 + }, + { + "epoch": 0.2792449773169151, + "grad_norm": 0.038583435118198395, + "learning_rate": 0.00013961117861482382, + "loss": 0.3807, + "step": 3447 + }, + { + "epoch": 0.27932598833441347, + "grad_norm": 0.034903813153505325, + "learning_rate": 0.00013965168084244633, + "loss": 0.3902, + "step": 3448 + }, + { + "epoch": 0.2794069993519119, + "grad_norm": 0.047700364142656326, + "learning_rate": 0.00013969218307006886, + "loss": 0.373, + "step": 3449 + }, + { + "epoch": 0.27948801036941023, + "grad_norm": 0.04197010025382042, + "learning_rate": 0.0001397326852976914, + "loss": 0.3786, + "step": 3450 + }, + { + "epoch": 0.27956902138690864, + "grad_norm": 0.041849054396152496, + "learning_rate": 0.0001397731875253139, + "loss": 0.4107, + "step": 3451 + }, + { + "epoch": 0.279650032404407, + "grad_norm": 0.04604052007198334, + "learning_rate": 0.00013981368975293643, + "loss": 0.3422, + "step": 3452 + }, + { + "epoch": 0.27973104342190536, + "grad_norm": 0.04134983569383621, + "learning_rate": 0.00013985419198055894, + "loss": 0.3852, + 
"step": 3453 + }, + { + "epoch": 0.27981205443940377, + "grad_norm": 0.043265946209430695, + "learning_rate": 0.00013989469420818147, + "loss": 0.3609, + "step": 3454 + }, + { + "epoch": 0.2798930654569021, + "grad_norm": 0.037968236953020096, + "learning_rate": 0.00013993519643580398, + "loss": 0.3342, + "step": 3455 + }, + { + "epoch": 0.27997407647440054, + "grad_norm": 0.04364189878106117, + "learning_rate": 0.0001399756986634265, + "loss": 0.3908, + "step": 3456 + }, + { + "epoch": 0.2800550874918989, + "grad_norm": 0.04440903291106224, + "learning_rate": 0.00014001620089104902, + "loss": 0.4043, + "step": 3457 + }, + { + "epoch": 0.2801360985093973, + "grad_norm": 0.04480652138590813, + "learning_rate": 0.00014005670311867155, + "loss": 0.388, + "step": 3458 + }, + { + "epoch": 0.28021710952689566, + "grad_norm": 0.03758074343204498, + "learning_rate": 0.00014009720534629406, + "loss": 0.3902, + "step": 3459 + }, + { + "epoch": 0.280298120544394, + "grad_norm": 0.0437944270670414, + "learning_rate": 0.0001401377075739166, + "loss": 0.3714, + "step": 3460 + }, + { + "epoch": 0.2803791315618924, + "grad_norm": 0.039681438356637955, + "learning_rate": 0.0001401782098015391, + "loss": 0.3775, + "step": 3461 + }, + { + "epoch": 0.2804601425793908, + "grad_norm": 0.04409261420369148, + "learning_rate": 0.0001402187120291616, + "loss": 0.3569, + "step": 3462 + }, + { + "epoch": 0.2805411535968892, + "grad_norm": 0.03767713904380798, + "learning_rate": 0.0001402592142567841, + "loss": 0.3602, + "step": 3463 + }, + { + "epoch": 0.28062216461438755, + "grad_norm": 0.04003945365548134, + "learning_rate": 0.00014029971648440664, + "loss": 0.3751, + "step": 3464 + }, + { + "epoch": 0.28070317563188596, + "grad_norm": 0.0402999185025692, + "learning_rate": 0.00014034021871202915, + "loss": 0.422, + "step": 3465 + }, + { + "epoch": 0.2807841866493843, + "grad_norm": 0.04214511439204216, + "learning_rate": 0.00014038072093965168, + "loss": 0.3954, + "step": 3466 + }, + { + "epoch": 0.28086519766688267, + "grad_norm": 0.03766317665576935, + "learning_rate": 0.0001404212231672742, + "loss": 0.3376, + "step": 3467 + }, + { + "epoch": 0.2809462086843811, + "grad_norm": 0.044028155505657196, + "learning_rate": 0.00014046172539489672, + "loss": 0.3901, + "step": 3468 + }, + { + "epoch": 0.28102721970187944, + "grad_norm": 0.04517464339733124, + "learning_rate": 0.00014050222762251925, + "loss": 0.3351, + "step": 3469 + }, + { + "epoch": 0.28110823071937785, + "grad_norm": 0.044333942234516144, + "learning_rate": 0.00014054272985014176, + "loss": 0.3802, + "step": 3470 + }, + { + "epoch": 0.2811892417368762, + "grad_norm": 0.04904649406671524, + "learning_rate": 0.0001405832320777643, + "loss": 0.3873, + "step": 3471 + }, + { + "epoch": 0.2812702527543746, + "grad_norm": 0.03742432966828346, + "learning_rate": 0.0001406237343053868, + "loss": 0.4129, + "step": 3472 + }, + { + "epoch": 0.281351263771873, + "grad_norm": 0.037451375275850296, + "learning_rate": 0.00014066423653300933, + "loss": 0.3418, + "step": 3473 + }, + { + "epoch": 0.28143227478937133, + "grad_norm": 0.036673497408628464, + "learning_rate": 0.00014070473876063184, + "loss": 0.3392, + "step": 3474 + }, + { + "epoch": 0.28151328580686974, + "grad_norm": 0.035846445709466934, + "learning_rate": 0.00014074524098825437, + "loss": 0.3608, + "step": 3475 + }, + { + "epoch": 0.2815942968243681, + "grad_norm": 0.04174364358186722, + "learning_rate": 0.00014078574321587688, + "loss": 0.344, + "step": 3476 + }, + { + "epoch": 0.2816753078418665, + 
"grad_norm": 0.03926561772823334, + "learning_rate": 0.0001408262454434994, + "loss": 0.3958, + "step": 3477 + }, + { + "epoch": 0.28175631885936486, + "grad_norm": 0.044232893735170364, + "learning_rate": 0.00014086674767112192, + "loss": 0.3692, + "step": 3478 + }, + { + "epoch": 0.2818373298768633, + "grad_norm": 0.03457103297114372, + "learning_rate": 0.00014090724989874445, + "loss": 0.3502, + "step": 3479 + }, + { + "epoch": 0.28191834089436163, + "grad_norm": 0.03710518032312393, + "learning_rate": 0.00014094775212636696, + "loss": 0.4032, + "step": 3480 + }, + { + "epoch": 0.28199935191186, + "grad_norm": 0.03896922618150711, + "learning_rate": 0.0001409882543539895, + "loss": 0.3837, + "step": 3481 + }, + { + "epoch": 0.2820803629293584, + "grad_norm": 0.03973960876464844, + "learning_rate": 0.000141028756581612, + "loss": 0.378, + "step": 3482 + }, + { + "epoch": 0.28216137394685675, + "grad_norm": 0.03989710286259651, + "learning_rate": 0.00014106925880923453, + "loss": 0.3804, + "step": 3483 + }, + { + "epoch": 0.28224238496435516, + "grad_norm": 0.035204801708459854, + "learning_rate": 0.00014110976103685703, + "loss": 0.378, + "step": 3484 + }, + { + "epoch": 0.2823233959818535, + "grad_norm": 0.052764181047677994, + "learning_rate": 0.00014115026326447954, + "loss": 0.4182, + "step": 3485 + }, + { + "epoch": 0.28240440699935193, + "grad_norm": 0.03714631870388985, + "learning_rate": 0.00014119076549210207, + "loss": 0.3522, + "step": 3486 + }, + { + "epoch": 0.2824854180168503, + "grad_norm": 0.03593859076499939, + "learning_rate": 0.00014123126771972458, + "loss": 0.3286, + "step": 3487 + }, + { + "epoch": 0.2825664290343487, + "grad_norm": 0.040704239159822464, + "learning_rate": 0.0001412717699473471, + "loss": 0.3842, + "step": 3488 + }, + { + "epoch": 0.28264744005184705, + "grad_norm": 0.03508979082107544, + "learning_rate": 0.00014131227217496962, + "loss": 0.3497, + "step": 3489 + }, + { + "epoch": 0.2827284510693454, + "grad_norm": 0.03818688914179802, + "learning_rate": 0.00014135277440259215, + "loss": 0.4074, + "step": 3490 + }, + { + "epoch": 0.2828094620868438, + "grad_norm": 0.048654261976480484, + "learning_rate": 0.00014139327663021466, + "loss": 0.3559, + "step": 3491 + }, + { + "epoch": 0.2828904731043422, + "grad_norm": 0.03858116269111633, + "learning_rate": 0.0001414337788578372, + "loss": 0.3417, + "step": 3492 + }, + { + "epoch": 0.2829714841218406, + "grad_norm": 0.04757966473698616, + "learning_rate": 0.0001414742810854597, + "loss": 0.391, + "step": 3493 + }, + { + "epoch": 0.28305249513933894, + "grad_norm": 0.04076710715889931, + "learning_rate": 0.00014151478331308223, + "loss": 0.3532, + "step": 3494 + }, + { + "epoch": 0.28313350615683736, + "grad_norm": 0.046661194413900375, + "learning_rate": 0.00014155528554070474, + "loss": 0.3996, + "step": 3495 + }, + { + "epoch": 0.2832145171743357, + "grad_norm": 0.04634448140859604, + "learning_rate": 0.00014159578776832727, + "loss": 0.403, + "step": 3496 + }, + { + "epoch": 0.28329552819183407, + "grad_norm": 0.044653963297605515, + "learning_rate": 0.00014163628999594978, + "loss": 0.3725, + "step": 3497 + }, + { + "epoch": 0.2833765392093325, + "grad_norm": 0.037522487342357635, + "learning_rate": 0.0001416767922235723, + "loss": 0.3723, + "step": 3498 + }, + { + "epoch": 0.28345755022683083, + "grad_norm": 0.04664743319153786, + "learning_rate": 0.00014171729445119482, + "loss": 0.4247, + "step": 3499 + }, + { + "epoch": 0.28353856124432925, + "grad_norm": 0.04447102174162865, + "learning_rate": 
0.00014175779667881735, + "loss": 0.3465, + "step": 3500 + }, + { + "epoch": 0.2836195722618276, + "grad_norm": 0.04960550740361214, + "learning_rate": 0.00014179829890643986, + "loss": 0.3827, + "step": 3501 + }, + { + "epoch": 0.283700583279326, + "grad_norm": 0.046506986021995544, + "learning_rate": 0.0001418388011340624, + "loss": 0.4013, + "step": 3502 + }, + { + "epoch": 0.28378159429682437, + "grad_norm": 0.04039178788661957, + "learning_rate": 0.0001418793033616849, + "loss": 0.4076, + "step": 3503 + }, + { + "epoch": 0.2838626053143227, + "grad_norm": 0.03963049128651619, + "learning_rate": 0.00014191980558930743, + "loss": 0.3529, + "step": 3504 + }, + { + "epoch": 0.28394361633182114, + "grad_norm": 0.04509516805410385, + "learning_rate": 0.00014196030781692993, + "loss": 0.3771, + "step": 3505 + }, + { + "epoch": 0.2840246273493195, + "grad_norm": 0.04358714073896408, + "learning_rate": 0.00014200081004455247, + "loss": 0.3809, + "step": 3506 + }, + { + "epoch": 0.2841056383668179, + "grad_norm": 0.036663755774497986, + "learning_rate": 0.00014204131227217497, + "loss": 0.3424, + "step": 3507 + }, + { + "epoch": 0.28418664938431626, + "grad_norm": 0.03686859831213951, + "learning_rate": 0.0001420818144997975, + "loss": 0.3423, + "step": 3508 + }, + { + "epoch": 0.28426766040181467, + "grad_norm": 0.03790473937988281, + "learning_rate": 0.00014212231672742, + "loss": 0.3514, + "step": 3509 + }, + { + "epoch": 0.284348671419313, + "grad_norm": 0.043648287653923035, + "learning_rate": 0.00014216281895504252, + "loss": 0.3904, + "step": 3510 + }, + { + "epoch": 0.2844296824368114, + "grad_norm": 0.03174104541540146, + "learning_rate": 0.00014220332118266505, + "loss": 0.3637, + "step": 3511 + }, + { + "epoch": 0.2845106934543098, + "grad_norm": 0.03682335466146469, + "learning_rate": 0.00014224382341028756, + "loss": 0.375, + "step": 3512 + }, + { + "epoch": 0.28459170447180815, + "grad_norm": 0.0405367873609066, + "learning_rate": 0.0001422843256379101, + "loss": 0.3642, + "step": 3513 + }, + { + "epoch": 0.28467271548930656, + "grad_norm": 0.03693538159132004, + "learning_rate": 0.0001423248278655326, + "loss": 0.3858, + "step": 3514 + }, + { + "epoch": 0.2847537265068049, + "grad_norm": 0.04037932679057121, + "learning_rate": 0.00014236533009315513, + "loss": 0.3701, + "step": 3515 + }, + { + "epoch": 0.2848347375243033, + "grad_norm": 0.038718000054359436, + "learning_rate": 0.00014240583232077764, + "loss": 0.3506, + "step": 3516 + }, + { + "epoch": 0.2849157485418017, + "grad_norm": 0.04396659508347511, + "learning_rate": 0.00014244633454840017, + "loss": 0.4199, + "step": 3517 + }, + { + "epoch": 0.28499675955930004, + "grad_norm": 0.035912420600652695, + "learning_rate": 0.00014248683677602268, + "loss": 0.3609, + "step": 3518 + }, + { + "epoch": 0.28507777057679845, + "grad_norm": 0.052159637212753296, + "learning_rate": 0.0001425273390036452, + "loss": 0.3734, + "step": 3519 + }, + { + "epoch": 0.2851587815942968, + "grad_norm": 0.04180484637618065, + "learning_rate": 0.00014256784123126772, + "loss": 0.3505, + "step": 3520 + }, + { + "epoch": 0.2852397926117952, + "grad_norm": 0.03229363262653351, + "learning_rate": 0.00014260834345889025, + "loss": 0.3691, + "step": 3521 + }, + { + "epoch": 0.2853208036292936, + "grad_norm": 0.04348330572247505, + "learning_rate": 0.00014264884568651276, + "loss": 0.3871, + "step": 3522 + }, + { + "epoch": 0.285401814646792, + "grad_norm": 0.04351454973220825, + "learning_rate": 0.0001426893479141353, + "loss": 0.3836, + "step": 3523 + 
}, + { + "epoch": 0.28548282566429034, + "grad_norm": 0.04140999913215637, + "learning_rate": 0.0001427298501417578, + "loss": 0.4107, + "step": 3524 + }, + { + "epoch": 0.2855638366817887, + "grad_norm": 0.04897570237517357, + "learning_rate": 0.00014277035236938033, + "loss": 0.4028, + "step": 3525 + }, + { + "epoch": 0.2856448476992871, + "grad_norm": 0.03690803796052933, + "learning_rate": 0.00014281085459700286, + "loss": 0.3223, + "step": 3526 + }, + { + "epoch": 0.28572585871678546, + "grad_norm": 0.05218454450368881, + "learning_rate": 0.00014285135682462537, + "loss": 0.398, + "step": 3527 + }, + { + "epoch": 0.2858068697342839, + "grad_norm": 0.04127073287963867, + "learning_rate": 0.0001428918590522479, + "loss": 0.4329, + "step": 3528 + }, + { + "epoch": 0.28588788075178223, + "grad_norm": 0.03908568620681763, + "learning_rate": 0.0001429323612798704, + "loss": 0.3761, + "step": 3529 + }, + { + "epoch": 0.28596889176928064, + "grad_norm": 0.03365986794233322, + "learning_rate": 0.00014297286350749294, + "loss": 0.3516, + "step": 3530 + }, + { + "epoch": 0.286049902786779, + "grad_norm": 0.03684350103139877, + "learning_rate": 0.00014301336573511545, + "loss": 0.3762, + "step": 3531 + }, + { + "epoch": 0.2861309138042774, + "grad_norm": 0.03770121559500694, + "learning_rate": 0.00014305386796273795, + "loss": 0.3836, + "step": 3532 + }, + { + "epoch": 0.28621192482177576, + "grad_norm": 0.03829558193683624, + "learning_rate": 0.00014309437019036048, + "loss": 0.3485, + "step": 3533 + }, + { + "epoch": 0.2862929358392741, + "grad_norm": 0.052036408334970474, + "learning_rate": 0.000143134872417983, + "loss": 0.3988, + "step": 3534 + }, + { + "epoch": 0.28637394685677253, + "grad_norm": 0.048078227788209915, + "learning_rate": 0.0001431753746456055, + "loss": 0.3805, + "step": 3535 + }, + { + "epoch": 0.2864549578742709, + "grad_norm": 0.043900419026613235, + "learning_rate": 0.00014321587687322803, + "loss": 0.4089, + "step": 3536 + }, + { + "epoch": 0.2865359688917693, + "grad_norm": 0.03973248600959778, + "learning_rate": 0.00014325637910085054, + "loss": 0.3965, + "step": 3537 + }, + { + "epoch": 0.28661697990926766, + "grad_norm": 0.04060791805386543, + "learning_rate": 0.00014329688132847307, + "loss": 0.3785, + "step": 3538 + }, + { + "epoch": 0.28669799092676607, + "grad_norm": 0.035248950123786926, + "learning_rate": 0.00014333738355609558, + "loss": 0.3761, + "step": 3539 + }, + { + "epoch": 0.2867790019442644, + "grad_norm": 0.03733890876173973, + "learning_rate": 0.0001433778857837181, + "loss": 0.3458, + "step": 3540 + }, + { + "epoch": 0.2868600129617628, + "grad_norm": 0.04270927235484123, + "learning_rate": 0.00014341838801134062, + "loss": 0.3731, + "step": 3541 + }, + { + "epoch": 0.2869410239792612, + "grad_norm": 0.0395025797188282, + "learning_rate": 0.00014345889023896315, + "loss": 0.3688, + "step": 3542 + }, + { + "epoch": 0.28702203499675955, + "grad_norm": 0.03573182225227356, + "learning_rate": 0.00014349939246658565, + "loss": 0.3082, + "step": 3543 + }, + { + "epoch": 0.28710304601425796, + "grad_norm": 0.03845391795039177, + "learning_rate": 0.0001435398946942082, + "loss": 0.3705, + "step": 3544 + }, + { + "epoch": 0.2871840570317563, + "grad_norm": 0.03716522082686424, + "learning_rate": 0.00014358039692183072, + "loss": 0.3207, + "step": 3545 + }, + { + "epoch": 0.2872650680492547, + "grad_norm": 0.046315036714076996, + "learning_rate": 0.00014362089914945323, + "loss": 0.3454, + "step": 3546 + }, + { + "epoch": 0.2873460790667531, + "grad_norm": 
0.033960383385419846, + "learning_rate": 0.00014366140137707576, + "loss": 0.3123, + "step": 3547 + }, + { + "epoch": 0.28742709008425144, + "grad_norm": 0.04001070186495781, + "learning_rate": 0.00014370190360469827, + "loss": 0.4092, + "step": 3548 + }, + { + "epoch": 0.28750810110174985, + "grad_norm": 0.04841066151857376, + "learning_rate": 0.0001437424058323208, + "loss": 0.3717, + "step": 3549 + }, + { + "epoch": 0.2875891121192482, + "grad_norm": 0.04550325497984886, + "learning_rate": 0.0001437829080599433, + "loss": 0.3799, + "step": 3550 + }, + { + "epoch": 0.2876701231367466, + "grad_norm": 0.04212433844804764, + "learning_rate": 0.00014382341028756584, + "loss": 0.4029, + "step": 3551 + }, + { + "epoch": 0.28775113415424497, + "grad_norm": 0.046040620654821396, + "learning_rate": 0.00014386391251518835, + "loss": 0.4278, + "step": 3552 + }, + { + "epoch": 0.2878321451717434, + "grad_norm": 0.0427493192255497, + "learning_rate": 0.00014390441474281088, + "loss": 0.3561, + "step": 3553 + }, + { + "epoch": 0.28791315618924174, + "grad_norm": 0.036486729979515076, + "learning_rate": 0.00014394491697043338, + "loss": 0.3879, + "step": 3554 + }, + { + "epoch": 0.2879941672067401, + "grad_norm": 0.03568176552653313, + "learning_rate": 0.00014398541919805592, + "loss": 0.4286, + "step": 3555 + }, + { + "epoch": 0.2880751782242385, + "grad_norm": 0.04736393690109253, + "learning_rate": 0.00014402592142567842, + "loss": 0.4214, + "step": 3556 + }, + { + "epoch": 0.28815618924173686, + "grad_norm": 0.04134480282664299, + "learning_rate": 0.00014406642365330093, + "loss": 0.3721, + "step": 3557 + }, + { + "epoch": 0.28823720025923527, + "grad_norm": 0.03446532040834427, + "learning_rate": 0.00014410692588092344, + "loss": 0.3385, + "step": 3558 + }, + { + "epoch": 0.2883182112767336, + "grad_norm": 0.03532950207591057, + "learning_rate": 0.00014414742810854597, + "loss": 0.371, + "step": 3559 + }, + { + "epoch": 0.28839922229423204, + "grad_norm": 0.04255358502268791, + "learning_rate": 0.00014418793033616848, + "loss": 0.424, + "step": 3560 + }, + { + "epoch": 0.2884802333117304, + "grad_norm": 0.035087406635284424, + "learning_rate": 0.000144228432563791, + "loss": 0.3308, + "step": 3561 + }, + { + "epoch": 0.28856124432922875, + "grad_norm": 0.049905095249414444, + "learning_rate": 0.00014426893479141351, + "loss": 0.3809, + "step": 3562 + }, + { + "epoch": 0.28864225534672716, + "grad_norm": 0.040026336908340454, + "learning_rate": 0.00014430943701903605, + "loss": 0.3896, + "step": 3563 + }, + { + "epoch": 0.2887232663642255, + "grad_norm": 0.03831109777092934, + "learning_rate": 0.00014434993924665858, + "loss": 0.4104, + "step": 3564 + }, + { + "epoch": 0.28880427738172393, + "grad_norm": 0.04056220501661301, + "learning_rate": 0.0001443904414742811, + "loss": 0.3361, + "step": 3565 + }, + { + "epoch": 0.2888852883992223, + "grad_norm": 0.038314368575811386, + "learning_rate": 0.00014443094370190362, + "loss": 0.3596, + "step": 3566 + }, + { + "epoch": 0.2889662994167207, + "grad_norm": 0.04543676599860191, + "learning_rate": 0.00014447144592952613, + "loss": 0.4056, + "step": 3567 + }, + { + "epoch": 0.28904731043421905, + "grad_norm": 0.04003027454018593, + "learning_rate": 0.00014451194815714866, + "loss": 0.3556, + "step": 3568 + }, + { + "epoch": 0.2891283214517174, + "grad_norm": 0.04414728283882141, + "learning_rate": 0.00014455245038477117, + "loss": 0.4268, + "step": 3569 + }, + { + "epoch": 0.2892093324692158, + "grad_norm": 0.0389229953289032, + "learning_rate": 
0.0001445929526123937, + "loss": 0.3166, + "step": 3570 + }, + { + "epoch": 0.2892903434867142, + "grad_norm": 0.0376911461353302, + "learning_rate": 0.0001446334548400162, + "loss": 0.3885, + "step": 3571 + }, + { + "epoch": 0.2893713545042126, + "grad_norm": 0.04160469397902489, + "learning_rate": 0.00014467395706763874, + "loss": 0.3855, + "step": 3572 + }, + { + "epoch": 0.28945236552171094, + "grad_norm": 0.041869085282087326, + "learning_rate": 0.00014471445929526124, + "loss": 0.3559, + "step": 3573 + }, + { + "epoch": 0.28953337653920935, + "grad_norm": 0.04308614507317543, + "learning_rate": 0.00014475496152288378, + "loss": 0.3946, + "step": 3574 + }, + { + "epoch": 0.2896143875567077, + "grad_norm": 0.036581553518772125, + "learning_rate": 0.00014479546375050628, + "loss": 0.3732, + "step": 3575 + }, + { + "epoch": 0.28969539857420606, + "grad_norm": 0.050230905413627625, + "learning_rate": 0.00014483596597812882, + "loss": 0.3364, + "step": 3576 + }, + { + "epoch": 0.2897764095917045, + "grad_norm": 0.04288823902606964, + "learning_rate": 0.00014487646820575132, + "loss": 0.3683, + "step": 3577 + }, + { + "epoch": 0.28985742060920283, + "grad_norm": 0.03672194108366966, + "learning_rate": 0.00014491697043337386, + "loss": 0.3776, + "step": 3578 + }, + { + "epoch": 0.28993843162670124, + "grad_norm": 0.04550372064113617, + "learning_rate": 0.00014495747266099636, + "loss": 0.4514, + "step": 3579 + }, + { + "epoch": 0.2900194426441996, + "grad_norm": 0.04061241075396538, + "learning_rate": 0.00014499797488861887, + "loss": 0.3659, + "step": 3580 + }, + { + "epoch": 0.290100453661698, + "grad_norm": 0.039126452058553696, + "learning_rate": 0.0001450384771162414, + "loss": 0.4203, + "step": 3581 + }, + { + "epoch": 0.29018146467919637, + "grad_norm": 0.0516708642244339, + "learning_rate": 0.0001450789793438639, + "loss": 0.384, + "step": 3582 + }, + { + "epoch": 0.2902624756966948, + "grad_norm": 0.045026857405900955, + "learning_rate": 0.00014511948157148644, + "loss": 0.3631, + "step": 3583 + }, + { + "epoch": 0.29034348671419313, + "grad_norm": 0.036989931017160416, + "learning_rate": 0.00014515998379910895, + "loss": 0.3612, + "step": 3584 + }, + { + "epoch": 0.2904244977316915, + "grad_norm": 0.03131851181387901, + "learning_rate": 0.00014520048602673148, + "loss": 0.3642, + "step": 3585 + }, + { + "epoch": 0.2905055087491899, + "grad_norm": 0.03495854511857033, + "learning_rate": 0.000145240988254354, + "loss": 0.3532, + "step": 3586 + }, + { + "epoch": 0.29058651976668826, + "grad_norm": 0.03899826854467392, + "learning_rate": 0.00014528149048197652, + "loss": 0.3829, + "step": 3587 + }, + { + "epoch": 0.29066753078418667, + "grad_norm": 0.04118049889802933, + "learning_rate": 0.00014532199270959903, + "loss": 0.3684, + "step": 3588 + }, + { + "epoch": 0.290748541801685, + "grad_norm": 0.04020068049430847, + "learning_rate": 0.00014536249493722156, + "loss": 0.3875, + "step": 3589 + }, + { + "epoch": 0.29082955281918343, + "grad_norm": 0.04532919079065323, + "learning_rate": 0.00014540299716484407, + "loss": 0.403, + "step": 3590 + }, + { + "epoch": 0.2909105638366818, + "grad_norm": 0.03378492221236229, + "learning_rate": 0.0001454434993924666, + "loss": 0.4048, + "step": 3591 + }, + { + "epoch": 0.29099157485418015, + "grad_norm": 0.04663817957043648, + "learning_rate": 0.0001454840016200891, + "loss": 0.375, + "step": 3592 + }, + { + "epoch": 0.29107258587167856, + "grad_norm": 0.051759760826826096, + "learning_rate": 0.00014552450384771164, + "loss": 0.3474, + "step": 
3593 + }, + { + "epoch": 0.2911535968891769, + "grad_norm": 0.043322253972291946, + "learning_rate": 0.00014556500607533414, + "loss": 0.4434, + "step": 3594 + }, + { + "epoch": 0.2912346079066753, + "grad_norm": 0.04209831729531288, + "learning_rate": 0.00014560550830295668, + "loss": 0.3423, + "step": 3595 + }, + { + "epoch": 0.2913156189241737, + "grad_norm": 0.04809553548693657, + "learning_rate": 0.00014564601053057918, + "loss": 0.4065, + "step": 3596 + }, + { + "epoch": 0.2913966299416721, + "grad_norm": 0.04254837706685066, + "learning_rate": 0.00014568651275820172, + "loss": 0.391, + "step": 3597 + }, + { + "epoch": 0.29147764095917045, + "grad_norm": 0.050005342811346054, + "learning_rate": 0.00014572701498582422, + "loss": 0.3765, + "step": 3598 + }, + { + "epoch": 0.2915586519766688, + "grad_norm": 0.04318535327911377, + "learning_rate": 0.00014576751721344676, + "loss": 0.3803, + "step": 3599 + }, + { + "epoch": 0.2916396629941672, + "grad_norm": 0.042721282690763474, + "learning_rate": 0.0001458080194410693, + "loss": 0.3554, + "step": 3600 + }, + { + "epoch": 0.29172067401166557, + "grad_norm": 0.03852958604693413, + "learning_rate": 0.0001458485216686918, + "loss": 0.4072, + "step": 3601 + }, + { + "epoch": 0.291801685029164, + "grad_norm": 0.048130251467227936, + "learning_rate": 0.0001458890238963143, + "loss": 0.3778, + "step": 3602 + }, + { + "epoch": 0.29188269604666234, + "grad_norm": 0.03864138573408127, + "learning_rate": 0.00014592952612393683, + "loss": 0.3391, + "step": 3603 + }, + { + "epoch": 0.29196370706416075, + "grad_norm": 0.044405851513147354, + "learning_rate": 0.00014597002835155934, + "loss": 0.376, + "step": 3604 + }, + { + "epoch": 0.2920447180816591, + "grad_norm": 0.0400206632912159, + "learning_rate": 0.00014601053057918185, + "loss": 0.3613, + "step": 3605 + }, + { + "epoch": 0.29212572909915746, + "grad_norm": 0.05284254252910614, + "learning_rate": 0.00014605103280680438, + "loss": 0.4416, + "step": 3606 + }, + { + "epoch": 0.29220674011665587, + "grad_norm": 0.044001657515764236, + "learning_rate": 0.00014609153503442689, + "loss": 0.3975, + "step": 3607 + }, + { + "epoch": 0.29228775113415423, + "grad_norm": 0.04004145413637161, + "learning_rate": 0.00014613203726204942, + "loss": 0.3787, + "step": 3608 + }, + { + "epoch": 0.29236876215165264, + "grad_norm": 0.04128222540020943, + "learning_rate": 0.00014617253948967193, + "loss": 0.3683, + "step": 3609 + }, + { + "epoch": 0.292449773169151, + "grad_norm": 0.03241799399256706, + "learning_rate": 0.00014621304171729446, + "loss": 0.3784, + "step": 3610 + }, + { + "epoch": 0.2925307841866494, + "grad_norm": 0.046559903770685196, + "learning_rate": 0.00014625354394491696, + "loss": 0.3505, + "step": 3611 + }, + { + "epoch": 0.29261179520414776, + "grad_norm": 0.04115467518568039, + "learning_rate": 0.0001462940461725395, + "loss": 0.3923, + "step": 3612 + }, + { + "epoch": 0.2926928062216461, + "grad_norm": 0.04463135451078415, + "learning_rate": 0.000146334548400162, + "loss": 0.3951, + "step": 3613 + }, + { + "epoch": 0.29277381723914453, + "grad_norm": 0.04419780522584915, + "learning_rate": 0.00014637505062778454, + "loss": 0.3886, + "step": 3614 + }, + { + "epoch": 0.2928548282566429, + "grad_norm": 0.040862735360860825, + "learning_rate": 0.00014641555285540704, + "loss": 0.4113, + "step": 3615 + }, + { + "epoch": 0.2929358392741413, + "grad_norm": 0.03600708395242691, + "learning_rate": 0.00014645605508302958, + "loss": 0.3534, + "step": 3616 + }, + { + "epoch": 0.29301685029163965, + 
"grad_norm": 0.03524504974484444, + "learning_rate": 0.00014649655731065208, + "loss": 0.3842, + "step": 3617 + }, + { + "epoch": 0.29309786130913806, + "grad_norm": 0.039181362837553024, + "learning_rate": 0.00014653705953827462, + "loss": 0.3386, + "step": 3618 + }, + { + "epoch": 0.2931788723266364, + "grad_norm": 0.04047534987330437, + "learning_rate": 0.00014657756176589715, + "loss": 0.405, + "step": 3619 + }, + { + "epoch": 0.2932598833441348, + "grad_norm": 0.04122704267501831, + "learning_rate": 0.00014661806399351966, + "loss": 0.338, + "step": 3620 + }, + { + "epoch": 0.2933408943616332, + "grad_norm": 0.03426510840654373, + "learning_rate": 0.0001466585662211422, + "loss": 0.3272, + "step": 3621 + }, + { + "epoch": 0.29342190537913154, + "grad_norm": 0.049452658742666245, + "learning_rate": 0.0001466990684487647, + "loss": 0.3257, + "step": 3622 + }, + { + "epoch": 0.29350291639662995, + "grad_norm": 0.04260660335421562, + "learning_rate": 0.00014673957067638723, + "loss": 0.3875, + "step": 3623 + }, + { + "epoch": 0.2935839274141283, + "grad_norm": 0.039224427193403244, + "learning_rate": 0.00014678007290400973, + "loss": 0.3489, + "step": 3624 + }, + { + "epoch": 0.2936649384316267, + "grad_norm": 0.04074981436133385, + "learning_rate": 0.00014682057513163227, + "loss": 0.3824, + "step": 3625 + }, + { + "epoch": 0.2937459494491251, + "grad_norm": 0.04070234298706055, + "learning_rate": 0.00014686107735925477, + "loss": 0.3139, + "step": 3626 + }, + { + "epoch": 0.2938269604666235, + "grad_norm": 0.03818705305457115, + "learning_rate": 0.00014690157958687728, + "loss": 0.3318, + "step": 3627 + }, + { + "epoch": 0.29390797148412184, + "grad_norm": 0.03919083997607231, + "learning_rate": 0.00014694208181449979, + "loss": 0.3666, + "step": 3628 + }, + { + "epoch": 0.2939889825016202, + "grad_norm": 0.052625805139541626, + "learning_rate": 0.00014698258404212232, + "loss": 0.3837, + "step": 3629 + }, + { + "epoch": 0.2940699935191186, + "grad_norm": 0.048445843160152435, + "learning_rate": 0.00014702308626974483, + "loss": 0.3527, + "step": 3630 + }, + { + "epoch": 0.29415100453661697, + "grad_norm": 0.04346831887960434, + "learning_rate": 0.00014706358849736736, + "loss": 0.393, + "step": 3631 + }, + { + "epoch": 0.2942320155541154, + "grad_norm": 0.06405261904001236, + "learning_rate": 0.00014710409072498986, + "loss": 0.3996, + "step": 3632 + }, + { + "epoch": 0.29431302657161373, + "grad_norm": 0.03609946370124817, + "learning_rate": 0.0001471445929526124, + "loss": 0.3749, + "step": 3633 + }, + { + "epoch": 0.29439403758911215, + "grad_norm": 0.0333322137594223, + "learning_rate": 0.0001471850951802349, + "loss": 0.3139, + "step": 3634 + }, + { + "epoch": 0.2944750486066105, + "grad_norm": 0.036384835839271545, + "learning_rate": 0.00014722559740785744, + "loss": 0.3566, + "step": 3635 + }, + { + "epoch": 0.29455605962410886, + "grad_norm": 0.04768802970647812, + "learning_rate": 0.00014726609963547994, + "loss": 0.3948, + "step": 3636 + }, + { + "epoch": 0.29463707064160727, + "grad_norm": 0.03785340115427971, + "learning_rate": 0.00014730660186310248, + "loss": 0.3791, + "step": 3637 + }, + { + "epoch": 0.2947180816591056, + "grad_norm": 0.03505001962184906, + "learning_rate": 0.000147347104090725, + "loss": 0.3427, + "step": 3638 + }, + { + "epoch": 0.29479909267660404, + "grad_norm": 0.03778628259897232, + "learning_rate": 0.00014738760631834752, + "loss": 0.4328, + "step": 3639 + }, + { + "epoch": 0.2948801036941024, + "grad_norm": 0.0557037778198719, + "learning_rate": 
0.00014742810854597005, + "loss": 0.393, + "step": 3640 + }, + { + "epoch": 0.2949611147116008, + "grad_norm": 0.036547672003507614, + "learning_rate": 0.00014746861077359256, + "loss": 0.372, + "step": 3641 + }, + { + "epoch": 0.29504212572909916, + "grad_norm": 0.03863165155053139, + "learning_rate": 0.0001475091130012151, + "loss": 0.4083, + "step": 3642 + }, + { + "epoch": 0.2951231367465975, + "grad_norm": 0.030831417068839073, + "learning_rate": 0.0001475496152288376, + "loss": 0.3538, + "step": 3643 + }, + { + "epoch": 0.2952041477640959, + "grad_norm": 0.0388491228222847, + "learning_rate": 0.00014759011745646013, + "loss": 0.3774, + "step": 3644 + }, + { + "epoch": 0.2952851587815943, + "grad_norm": 0.04144338145852089, + "learning_rate": 0.00014763061968408263, + "loss": 0.3155, + "step": 3645 + }, + { + "epoch": 0.2953661697990927, + "grad_norm": 0.03875422850251198, + "learning_rate": 0.00014767112191170517, + "loss": 0.3804, + "step": 3646 + }, + { + "epoch": 0.29544718081659105, + "grad_norm": 0.03554854914546013, + "learning_rate": 0.00014771162413932767, + "loss": 0.334, + "step": 3647 + }, + { + "epoch": 0.29552819183408946, + "grad_norm": 0.03913655877113342, + "learning_rate": 0.0001477521263669502, + "loss": 0.3686, + "step": 3648 + }, + { + "epoch": 0.2956092028515878, + "grad_norm": 0.033935628831386566, + "learning_rate": 0.0001477926285945727, + "loss": 0.3666, + "step": 3649 + }, + { + "epoch": 0.29569021386908617, + "grad_norm": 0.040643028914928436, + "learning_rate": 0.00014783313082219522, + "loss": 0.3867, + "step": 3650 + }, + { + "epoch": 0.2957712248865846, + "grad_norm": 0.040583305060863495, + "learning_rate": 0.00014787363304981775, + "loss": 0.3971, + "step": 3651 + }, + { + "epoch": 0.29585223590408294, + "grad_norm": 0.043748460710048676, + "learning_rate": 0.00014791413527744026, + "loss": 0.3837, + "step": 3652 + }, + { + "epoch": 0.29593324692158135, + "grad_norm": 0.032871171832084656, + "learning_rate": 0.00014795463750506276, + "loss": 0.3959, + "step": 3653 + }, + { + "epoch": 0.2960142579390797, + "grad_norm": 0.03821488469839096, + "learning_rate": 0.0001479951397326853, + "loss": 0.3946, + "step": 3654 + }, + { + "epoch": 0.2960952689565781, + "grad_norm": 0.03509106859564781, + "learning_rate": 0.0001480356419603078, + "loss": 0.3799, + "step": 3655 + }, + { + "epoch": 0.2961762799740765, + "grad_norm": 0.03787700831890106, + "learning_rate": 0.00014807614418793034, + "loss": 0.3385, + "step": 3656 + }, + { + "epoch": 0.29625729099157483, + "grad_norm": 0.03202836588025093, + "learning_rate": 0.00014811664641555287, + "loss": 0.3546, + "step": 3657 + }, + { + "epoch": 0.29633830200907324, + "grad_norm": 0.03317293897271156, + "learning_rate": 0.00014815714864317538, + "loss": 0.3755, + "step": 3658 + }, + { + "epoch": 0.2964193130265716, + "grad_norm": 0.03745601698756218, + "learning_rate": 0.0001481976508707979, + "loss": 0.3894, + "step": 3659 + }, + { + "epoch": 0.29650032404407, + "grad_norm": 0.04062475264072418, + "learning_rate": 0.00014823815309842042, + "loss": 0.377, + "step": 3660 + }, + { + "epoch": 0.29658133506156836, + "grad_norm": 0.043611712753772736, + "learning_rate": 0.00014827865532604295, + "loss": 0.3956, + "step": 3661 + }, + { + "epoch": 0.2966623460790668, + "grad_norm": 0.04180651530623436, + "learning_rate": 0.00014831915755366545, + "loss": 0.4128, + "step": 3662 + }, + { + "epoch": 0.29674335709656513, + "grad_norm": 0.03399563580751419, + "learning_rate": 0.000148359659781288, + "loss": 0.3598, + "step": 3663 
+ }, + { + "epoch": 0.2968243681140635, + "grad_norm": 0.03948217257857323, + "learning_rate": 0.0001484001620089105, + "loss": 0.3433, + "step": 3664 + }, + { + "epoch": 0.2969053791315619, + "grad_norm": 0.040393419563770294, + "learning_rate": 0.00014844066423653303, + "loss": 0.3689, + "step": 3665 + }, + { + "epoch": 0.29698639014906025, + "grad_norm": 0.037186894565820694, + "learning_rate": 0.00014848116646415553, + "loss": 0.3551, + "step": 3666 + }, + { + "epoch": 0.29706740116655866, + "grad_norm": 0.04067966714501381, + "learning_rate": 0.00014852166869177807, + "loss": 0.3772, + "step": 3667 + }, + { + "epoch": 0.297148412184057, + "grad_norm": 0.03943729028105736, + "learning_rate": 0.00014856217091940057, + "loss": 0.3924, + "step": 3668 + }, + { + "epoch": 0.29722942320155543, + "grad_norm": 0.045668501406908035, + "learning_rate": 0.0001486026731470231, + "loss": 0.3949, + "step": 3669 + }, + { + "epoch": 0.2973104342190538, + "grad_norm": 0.03306059166789055, + "learning_rate": 0.0001486431753746456, + "loss": 0.3779, + "step": 3670 + }, + { + "epoch": 0.2973914452365522, + "grad_norm": 0.045515839010477066, + "learning_rate": 0.00014868367760226815, + "loss": 0.3821, + "step": 3671 + }, + { + "epoch": 0.29747245625405055, + "grad_norm": 0.03392757847905159, + "learning_rate": 0.00014872417982989065, + "loss": 0.3475, + "step": 3672 + }, + { + "epoch": 0.2975534672715489, + "grad_norm": 0.04137163981795311, + "learning_rate": 0.00014876468205751318, + "loss": 0.4049, + "step": 3673 + }, + { + "epoch": 0.2976344782890473, + "grad_norm": 0.040442898869514465, + "learning_rate": 0.0001488051842851357, + "loss": 0.3604, + "step": 3674 + }, + { + "epoch": 0.2977154893065457, + "grad_norm": 0.042486123740673065, + "learning_rate": 0.0001488456865127582, + "loss": 0.3766, + "step": 3675 + }, + { + "epoch": 0.2977965003240441, + "grad_norm": 0.04691873863339424, + "learning_rate": 0.00014888618874038073, + "loss": 0.3714, + "step": 3676 + }, + { + "epoch": 0.29787751134154244, + "grad_norm": 0.04484069347381592, + "learning_rate": 0.00014892669096800324, + "loss": 0.3795, + "step": 3677 + }, + { + "epoch": 0.29795852235904086, + "grad_norm": 0.04399680346250534, + "learning_rate": 0.00014896719319562577, + "loss": 0.3808, + "step": 3678 + }, + { + "epoch": 0.2980395333765392, + "grad_norm": 0.041292715817689896, + "learning_rate": 0.00014900769542324828, + "loss": 0.375, + "step": 3679 + }, + { + "epoch": 0.29812054439403757, + "grad_norm": 0.042164355516433716, + "learning_rate": 0.0001490481976508708, + "loss": 0.379, + "step": 3680 + }, + { + "epoch": 0.298201555411536, + "grad_norm": 0.04407854005694389, + "learning_rate": 0.00014908869987849331, + "loss": 0.4312, + "step": 3681 + }, + { + "epoch": 0.29828256642903433, + "grad_norm": 0.03977717086672783, + "learning_rate": 0.00014912920210611585, + "loss": 0.433, + "step": 3682 + }, + { + "epoch": 0.29836357744653275, + "grad_norm": 0.044105205684900284, + "learning_rate": 0.00014916970433373835, + "loss": 0.4145, + "step": 3683 + }, + { + "epoch": 0.2984445884640311, + "grad_norm": 0.03839416801929474, + "learning_rate": 0.0001492102065613609, + "loss": 0.3969, + "step": 3684 + }, + { + "epoch": 0.2985255994815295, + "grad_norm": 0.040000759065151215, + "learning_rate": 0.0001492507087889834, + "loss": 0.431, + "step": 3685 + }, + { + "epoch": 0.29860661049902787, + "grad_norm": 0.04719092324376106, + "learning_rate": 0.00014929121101660593, + "loss": 0.3822, + "step": 3686 + }, + { + "epoch": 0.2986876215165262, + 
"grad_norm": 0.03640539199113846, + "learning_rate": 0.00014933171324422843, + "loss": 0.4155, + "step": 3687 + }, + { + "epoch": 0.29876863253402464, + "grad_norm": 0.0411471463739872, + "learning_rate": 0.00014937221547185097, + "loss": 0.3914, + "step": 3688 + }, + { + "epoch": 0.298849643551523, + "grad_norm": 0.041285451501607895, + "learning_rate": 0.00014941271769947347, + "loss": 0.3989, + "step": 3689 + }, + { + "epoch": 0.2989306545690214, + "grad_norm": 0.034479282796382904, + "learning_rate": 0.000149453219927096, + "loss": 0.3202, + "step": 3690 + }, + { + "epoch": 0.29901166558651976, + "grad_norm": 0.03484316170215607, + "learning_rate": 0.0001494937221547185, + "loss": 0.3759, + "step": 3691 + }, + { + "epoch": 0.29909267660401817, + "grad_norm": 0.04065540432929993, + "learning_rate": 0.00014953422438234104, + "loss": 0.407, + "step": 3692 + }, + { + "epoch": 0.2991736876215165, + "grad_norm": 0.03859071433544159, + "learning_rate": 0.00014957472660996355, + "loss": 0.3799, + "step": 3693 + }, + { + "epoch": 0.2992546986390149, + "grad_norm": 0.038158584386110306, + "learning_rate": 0.00014961522883758608, + "loss": 0.3492, + "step": 3694 + }, + { + "epoch": 0.2993357096565133, + "grad_norm": 0.03658697009086609, + "learning_rate": 0.00014965573106520862, + "loss": 0.3138, + "step": 3695 + }, + { + "epoch": 0.29941672067401165, + "grad_norm": 0.03887635096907616, + "learning_rate": 0.00014969623329283112, + "loss": 0.3794, + "step": 3696 + }, + { + "epoch": 0.29949773169151006, + "grad_norm": 0.036319054663181305, + "learning_rate": 0.00014973673552045363, + "loss": 0.3726, + "step": 3697 + }, + { + "epoch": 0.2995787427090084, + "grad_norm": 0.03450790420174599, + "learning_rate": 0.00014977723774807616, + "loss": 0.3809, + "step": 3698 + }, + { + "epoch": 0.29965975372650683, + "grad_norm": 0.04092950001358986, + "learning_rate": 0.00014981773997569867, + "loss": 0.3833, + "step": 3699 + }, + { + "epoch": 0.2997407647440052, + "grad_norm": 0.03737466782331467, + "learning_rate": 0.00014985824220332117, + "loss": 0.3669, + "step": 3700 + }, + { + "epoch": 0.29982177576150354, + "grad_norm": 0.03429148718714714, + "learning_rate": 0.0001498987444309437, + "loss": 0.3875, + "step": 3701 + }, + { + "epoch": 0.29990278677900195, + "grad_norm": 0.038960050791502, + "learning_rate": 0.00014993924665856621, + "loss": 0.3762, + "step": 3702 + }, + { + "epoch": 0.2999837977965003, + "grad_norm": 0.03236428275704384, + "learning_rate": 0.00014997974888618875, + "loss": 0.3574, + "step": 3703 + }, + { + "epoch": 0.3000648088139987, + "grad_norm": 0.035337019711732864, + "learning_rate": 0.00015002025111381125, + "loss": 0.3855, + "step": 3704 + }, + { + "epoch": 0.3001458198314971, + "grad_norm": 0.043556008487939835, + "learning_rate": 0.0001500607533414338, + "loss": 0.3833, + "step": 3705 + }, + { + "epoch": 0.3002268308489955, + "grad_norm": 0.03529440239071846, + "learning_rate": 0.0001501012555690563, + "loss": 0.3947, + "step": 3706 + }, + { + "epoch": 0.30030784186649384, + "grad_norm": 0.03917783871293068, + "learning_rate": 0.00015014175779667883, + "loss": 0.3524, + "step": 3707 + }, + { + "epoch": 0.3003888528839922, + "grad_norm": 0.03778640553355217, + "learning_rate": 0.00015018226002430133, + "loss": 0.395, + "step": 3708 + }, + { + "epoch": 0.3004698639014906, + "grad_norm": 0.041695136576890945, + "learning_rate": 0.00015022276225192387, + "loss": 0.4077, + "step": 3709 + }, + { + "epoch": 0.30055087491898896, + "grad_norm": 0.03826890140771866, + "learning_rate": 
0.00015026326447954637, + "loss": 0.3684, + "step": 3710 + }, + { + "epoch": 0.3006318859364874, + "grad_norm": 0.040991995483636856, + "learning_rate": 0.0001503037667071689, + "loss": 0.4126, + "step": 3711 + }, + { + "epoch": 0.30071289695398573, + "grad_norm": 0.034812260419130325, + "learning_rate": 0.0001503442689347914, + "loss": 0.3823, + "step": 3712 + }, + { + "epoch": 0.30079390797148414, + "grad_norm": 0.034525495022535324, + "learning_rate": 0.00015038477116241394, + "loss": 0.3554, + "step": 3713 + }, + { + "epoch": 0.3008749189889825, + "grad_norm": 0.03014807216823101, + "learning_rate": 0.00015042527339003648, + "loss": 0.3261, + "step": 3714 + }, + { + "epoch": 0.30095593000648085, + "grad_norm": 0.03565245866775513, + "learning_rate": 0.00015046577561765898, + "loss": 0.324, + "step": 3715 + }, + { + "epoch": 0.30103694102397927, + "grad_norm": 0.04224449023604393, + "learning_rate": 0.00015050627784528152, + "loss": 0.3788, + "step": 3716 + }, + { + "epoch": 0.3011179520414776, + "grad_norm": 0.035440728068351746, + "learning_rate": 0.00015054678007290402, + "loss": 0.344, + "step": 3717 + }, + { + "epoch": 0.30119896305897603, + "grad_norm": 0.03690610080957413, + "learning_rate": 0.00015058728230052656, + "loss": 0.3534, + "step": 3718 + }, + { + "epoch": 0.3012799740764744, + "grad_norm": 0.04340109974145889, + "learning_rate": 0.00015062778452814906, + "loss": 0.4237, + "step": 3719 + }, + { + "epoch": 0.3013609850939728, + "grad_norm": 0.03343253582715988, + "learning_rate": 0.00015066828675577157, + "loss": 0.3615, + "step": 3720 + }, + { + "epoch": 0.30144199611147116, + "grad_norm": 0.03364843130111694, + "learning_rate": 0.0001507087889833941, + "loss": 0.3531, + "step": 3721 + }, + { + "epoch": 0.30152300712896957, + "grad_norm": 0.033783357590436935, + "learning_rate": 0.0001507492912110166, + "loss": 0.3652, + "step": 3722 + }, + { + "epoch": 0.3016040181464679, + "grad_norm": 0.04219111055135727, + "learning_rate": 0.00015078979343863911, + "loss": 0.3457, + "step": 3723 + }, + { + "epoch": 0.3016850291639663, + "grad_norm": 0.03568140044808388, + "learning_rate": 0.00015083029566626165, + "loss": 0.3636, + "step": 3724 + }, + { + "epoch": 0.3017660401814647, + "grad_norm": 0.037624381482601166, + "learning_rate": 0.00015087079789388415, + "loss": 0.4275, + "step": 3725 + }, + { + "epoch": 0.30184705119896305, + "grad_norm": 0.036071497946977615, + "learning_rate": 0.00015091130012150669, + "loss": 0.3749, + "step": 3726 + }, + { + "epoch": 0.30192806221646146, + "grad_norm": 0.04090581461787224, + "learning_rate": 0.0001509518023491292, + "loss": 0.3617, + "step": 3727 + }, + { + "epoch": 0.3020090732339598, + "grad_norm": 0.04076239466667175, + "learning_rate": 0.00015099230457675173, + "loss": 0.3988, + "step": 3728 + }, + { + "epoch": 0.3020900842514582, + "grad_norm": 0.04235999286174774, + "learning_rate": 0.00015103280680437423, + "loss": 0.3757, + "step": 3729 + }, + { + "epoch": 0.3021710952689566, + "grad_norm": 0.0365556962788105, + "learning_rate": 0.00015107330903199676, + "loss": 0.4298, + "step": 3730 + }, + { + "epoch": 0.30225210628645494, + "grad_norm": 0.043695688247680664, + "learning_rate": 0.00015111381125961927, + "loss": 0.3601, + "step": 3731 + }, + { + "epoch": 0.30233311730395335, + "grad_norm": 0.04236026108264923, + "learning_rate": 0.0001511543134872418, + "loss": 0.4221, + "step": 3732 + }, + { + "epoch": 0.3024141283214517, + "grad_norm": 0.040135957300662994, + "learning_rate": 0.00015119481571486434, + "loss": 0.4264, + 
"step": 3733 + }, + { + "epoch": 0.3024951393389501, + "grad_norm": 0.036353833973407745, + "learning_rate": 0.00015123531794248684, + "loss": 0.3314, + "step": 3734 + }, + { + "epoch": 0.30257615035644847, + "grad_norm": 0.03830006346106529, + "learning_rate": 0.00015127582017010938, + "loss": 0.4432, + "step": 3735 + }, + { + "epoch": 0.3026571613739469, + "grad_norm": 0.03320148587226868, + "learning_rate": 0.00015131632239773188, + "loss": 0.3652, + "step": 3736 + }, + { + "epoch": 0.30273817239144524, + "grad_norm": 0.0449746735394001, + "learning_rate": 0.00015135682462535442, + "loss": 0.4141, + "step": 3737 + }, + { + "epoch": 0.3028191834089436, + "grad_norm": 0.04558303579688072, + "learning_rate": 0.00015139732685297692, + "loss": 0.3729, + "step": 3738 + }, + { + "epoch": 0.302900194426442, + "grad_norm": 0.037199411541223526, + "learning_rate": 0.00015143782908059946, + "loss": 0.4127, + "step": 3739 + }, + { + "epoch": 0.30298120544394036, + "grad_norm": 0.036756481975317, + "learning_rate": 0.00015147833130822196, + "loss": 0.3719, + "step": 3740 + }, + { + "epoch": 0.30306221646143877, + "grad_norm": 0.037477146834135056, + "learning_rate": 0.0001515188335358445, + "loss": 0.3545, + "step": 3741 + }, + { + "epoch": 0.3031432274789371, + "grad_norm": 0.03502131998538971, + "learning_rate": 0.000151559335763467, + "loss": 0.3462, + "step": 3742 + }, + { + "epoch": 0.30322423849643554, + "grad_norm": 0.04525633901357651, + "learning_rate": 0.00015159983799108953, + "loss": 0.3617, + "step": 3743 + }, + { + "epoch": 0.3033052495139339, + "grad_norm": 0.03657715395092964, + "learning_rate": 0.00015164034021871204, + "loss": 0.3848, + "step": 3744 + }, + { + "epoch": 0.30338626053143225, + "grad_norm": 0.030081717297434807, + "learning_rate": 0.00015168084244633455, + "loss": 0.3351, + "step": 3745 + }, + { + "epoch": 0.30346727154893066, + "grad_norm": 0.04054329916834831, + "learning_rate": 0.00015172134467395708, + "loss": 0.4144, + "step": 3746 + }, + { + "epoch": 0.303548282566429, + "grad_norm": 0.03237282857298851, + "learning_rate": 0.00015176184690157959, + "loss": 0.3311, + "step": 3747 + }, + { + "epoch": 0.30362929358392743, + "grad_norm": 0.04375077411532402, + "learning_rate": 0.0001518023491292021, + "loss": 0.3587, + "step": 3748 + }, + { + "epoch": 0.3037103046014258, + "grad_norm": 0.03343536704778671, + "learning_rate": 0.00015184285135682463, + "loss": 0.337, + "step": 3749 + }, + { + "epoch": 0.3037913156189242, + "grad_norm": 0.03777144104242325, + "learning_rate": 0.00015188335358444713, + "loss": 0.3863, + "step": 3750 + }, + { + "epoch": 0.30387232663642255, + "grad_norm": 0.030147891491651535, + "learning_rate": 0.00015192385581206966, + "loss": 0.3014, + "step": 3751 + }, + { + "epoch": 0.3039533376539209, + "grad_norm": 0.048893216997385025, + "learning_rate": 0.0001519643580396922, + "loss": 0.4196, + "step": 3752 + }, + { + "epoch": 0.3040343486714193, + "grad_norm": 0.03945691138505936, + "learning_rate": 0.0001520048602673147, + "loss": 0.386, + "step": 3753 + }, + { + "epoch": 0.3041153596889177, + "grad_norm": 0.03786306828260422, + "learning_rate": 0.00015204536249493724, + "loss": 0.3315, + "step": 3754 + }, + { + "epoch": 0.3041963707064161, + "grad_norm": 0.03841707110404968, + "learning_rate": 0.00015208586472255974, + "loss": 0.3924, + "step": 3755 + }, + { + "epoch": 0.30427738172391444, + "grad_norm": 0.035001907497644424, + "learning_rate": 0.00015212636695018228, + "loss": 0.4017, + "step": 3756 + }, + { + "epoch": 0.30435839274141285, 
+ "grad_norm": 0.03870251774787903, + "learning_rate": 0.00015216686917780478, + "loss": 0.4184, + "step": 3757 + }, + { + "epoch": 0.3044394037589112, + "grad_norm": 0.03757096081972122, + "learning_rate": 0.00015220737140542732, + "loss": 0.3805, + "step": 3758 + }, + { + "epoch": 0.30452041477640956, + "grad_norm": 0.044028136879205704, + "learning_rate": 0.00015224787363304982, + "loss": 0.3585, + "step": 3759 + }, + { + "epoch": 0.304601425793908, + "grad_norm": 0.041459329426288605, + "learning_rate": 0.00015228837586067235, + "loss": 0.3789, + "step": 3760 + }, + { + "epoch": 0.30468243681140633, + "grad_norm": 0.045845143496990204, + "learning_rate": 0.00015232887808829486, + "loss": 0.3638, + "step": 3761 + }, + { + "epoch": 0.30476344782890474, + "grad_norm": 0.0462605319917202, + "learning_rate": 0.0001523693803159174, + "loss": 0.3344, + "step": 3762 + }, + { + "epoch": 0.3048444588464031, + "grad_norm": 0.03516705334186554, + "learning_rate": 0.0001524098825435399, + "loss": 0.3918, + "step": 3763 + }, + { + "epoch": 0.3049254698639015, + "grad_norm": 0.05478263646364212, + "learning_rate": 0.00015245038477116243, + "loss": 0.3916, + "step": 3764 + }, + { + "epoch": 0.30500648088139987, + "grad_norm": 0.0416891872882843, + "learning_rate": 0.00015249088699878494, + "loss": 0.339, + "step": 3765 + }, + { + "epoch": 0.3050874918988983, + "grad_norm": 0.03643078729510307, + "learning_rate": 0.00015253138922640747, + "loss": 0.3638, + "step": 3766 + }, + { + "epoch": 0.30516850291639663, + "grad_norm": 0.040709856897592545, + "learning_rate": 0.00015257189145402998, + "loss": 0.3849, + "step": 3767 + }, + { + "epoch": 0.305249513933895, + "grad_norm": 0.041243139654397964, + "learning_rate": 0.0001526123936816525, + "loss": 0.358, + "step": 3768 + }, + { + "epoch": 0.3053305249513934, + "grad_norm": 0.03741239011287689, + "learning_rate": 0.00015265289590927502, + "loss": 0.395, + "step": 3769 + }, + { + "epoch": 0.30541153596889176, + "grad_norm": 0.033558666706085205, + "learning_rate": 0.00015269339813689752, + "loss": 0.3565, + "step": 3770 + }, + { + "epoch": 0.30549254698639017, + "grad_norm": 0.03725985065102577, + "learning_rate": 0.00015273390036452006, + "loss": 0.3383, + "step": 3771 + }, + { + "epoch": 0.3055735580038885, + "grad_norm": 0.03618418425321579, + "learning_rate": 0.00015277440259214256, + "loss": 0.3852, + "step": 3772 + }, + { + "epoch": 0.30565456902138693, + "grad_norm": 0.03546826168894768, + "learning_rate": 0.0001528149048197651, + "loss": 0.3507, + "step": 3773 + }, + { + "epoch": 0.3057355800388853, + "grad_norm": 0.040979351848363876, + "learning_rate": 0.0001528554070473876, + "loss": 0.4016, + "step": 3774 + }, + { + "epoch": 0.30581659105638365, + "grad_norm": 0.04222219064831734, + "learning_rate": 0.00015289590927501014, + "loss": 0.396, + "step": 3775 + }, + { + "epoch": 0.30589760207388206, + "grad_norm": 0.03376726806163788, + "learning_rate": 0.00015293641150263264, + "loss": 0.3172, + "step": 3776 + }, + { + "epoch": 0.3059786130913804, + "grad_norm": 0.04466283321380615, + "learning_rate": 0.00015297691373025518, + "loss": 0.3875, + "step": 3777 + }, + { + "epoch": 0.3060596241088788, + "grad_norm": 0.039065901190042496, + "learning_rate": 0.00015301741595787768, + "loss": 0.3869, + "step": 3778 + }, + { + "epoch": 0.3061406351263772, + "grad_norm": 0.036004744470119476, + "learning_rate": 0.00015305791818550022, + "loss": 0.3518, + "step": 3779 + }, + { + "epoch": 0.3062216461438756, + "grad_norm": 0.05747569352388382, + 
"learning_rate": 0.00015309842041312272, + "loss": 0.3464, + "step": 3780 + }, + { + "epoch": 0.30630265716137395, + "grad_norm": 0.03801568225026131, + "learning_rate": 0.00015313892264074525, + "loss": 0.3534, + "step": 3781 + }, + { + "epoch": 0.3063836681788723, + "grad_norm": 0.0325610414147377, + "learning_rate": 0.00015317942486836776, + "loss": 0.3749, + "step": 3782 + }, + { + "epoch": 0.3064646791963707, + "grad_norm": 0.049797795712947845, + "learning_rate": 0.0001532199270959903, + "loss": 0.3833, + "step": 3783 + }, + { + "epoch": 0.30654569021386907, + "grad_norm": 0.03561552241444588, + "learning_rate": 0.0001532604293236128, + "loss": 0.385, + "step": 3784 + }, + { + "epoch": 0.3066267012313675, + "grad_norm": 0.03801201283931732, + "learning_rate": 0.00015330093155123533, + "loss": 0.3636, + "step": 3785 + }, + { + "epoch": 0.30670771224886584, + "grad_norm": 0.03518911823630333, + "learning_rate": 0.00015334143377885784, + "loss": 0.3582, + "step": 3786 + }, + { + "epoch": 0.30678872326636425, + "grad_norm": 0.03534886613488197, + "learning_rate": 0.00015338193600648037, + "loss": 0.3912, + "step": 3787 + }, + { + "epoch": 0.3068697342838626, + "grad_norm": 0.032708004117012024, + "learning_rate": 0.00015342243823410288, + "loss": 0.3523, + "step": 3788 + }, + { + "epoch": 0.30695074530136096, + "grad_norm": 0.04191884770989418, + "learning_rate": 0.0001534629404617254, + "loss": 0.3536, + "step": 3789 + }, + { + "epoch": 0.3070317563188594, + "grad_norm": 0.04125582426786423, + "learning_rate": 0.00015350344268934794, + "loss": 0.3842, + "step": 3790 + }, + { + "epoch": 0.30711276733635773, + "grad_norm": 0.035285379737615585, + "learning_rate": 0.00015354394491697045, + "loss": 0.3603, + "step": 3791 + }, + { + "epoch": 0.30719377835385614, + "grad_norm": 0.04442056640982628, + "learning_rate": 0.00015358444714459296, + "loss": 0.386, + "step": 3792 + }, + { + "epoch": 0.3072747893713545, + "grad_norm": 0.03986095264554024, + "learning_rate": 0.00015362494937221546, + "loss": 0.3747, + "step": 3793 + }, + { + "epoch": 0.3073558003888529, + "grad_norm": 0.03432997688651085, + "learning_rate": 0.000153665451599838, + "loss": 0.3494, + "step": 3794 + }, + { + "epoch": 0.30743681140635126, + "grad_norm": 0.047092802822589874, + "learning_rate": 0.0001537059538274605, + "loss": 0.3552, + "step": 3795 + }, + { + "epoch": 0.3075178224238496, + "grad_norm": 0.039662785828113556, + "learning_rate": 0.00015374645605508304, + "loss": 0.3976, + "step": 3796 + }, + { + "epoch": 0.30759883344134803, + "grad_norm": 0.04702027142047882, + "learning_rate": 0.00015378695828270554, + "loss": 0.4191, + "step": 3797 + }, + { + "epoch": 0.3076798444588464, + "grad_norm": 0.03785824775695801, + "learning_rate": 0.00015382746051032808, + "loss": 0.345, + "step": 3798 + }, + { + "epoch": 0.3077608554763448, + "grad_norm": 0.042051903903484344, + "learning_rate": 0.00015386796273795058, + "loss": 0.4369, + "step": 3799 + }, + { + "epoch": 0.30784186649384315, + "grad_norm": 0.04059034585952759, + "learning_rate": 0.00015390846496557311, + "loss": 0.3913, + "step": 3800 + }, + { + "epoch": 0.30792287751134156, + "grad_norm": 0.04255729168653488, + "learning_rate": 0.00015394896719319562, + "loss": 0.3709, + "step": 3801 + }, + { + "epoch": 0.3080038885288399, + "grad_norm": 0.03430137783288956, + "learning_rate": 0.00015398946942081815, + "loss": 0.3481, + "step": 3802 + }, + { + "epoch": 0.3080848995463383, + "grad_norm": 0.037832751870155334, + "learning_rate": 0.00015402997164844066, + "loss": 
0.3803, + "step": 3803 + }, + { + "epoch": 0.3081659105638367, + "grad_norm": 0.03891238570213318, + "learning_rate": 0.0001540704738760632, + "loss": 0.352, + "step": 3804 + }, + { + "epoch": 0.30824692158133504, + "grad_norm": 0.03535735234618187, + "learning_rate": 0.0001541109761036857, + "loss": 0.3866, + "step": 3805 + }, + { + "epoch": 0.30832793259883345, + "grad_norm": 0.034961897879838943, + "learning_rate": 0.00015415147833130823, + "loss": 0.3778, + "step": 3806 + }, + { + "epoch": 0.3084089436163318, + "grad_norm": 0.042050305753946304, + "learning_rate": 0.00015419198055893074, + "loss": 0.3674, + "step": 3807 + }, + { + "epoch": 0.3084899546338302, + "grad_norm": 0.0314970538020134, + "learning_rate": 0.00015423248278655327, + "loss": 0.3533, + "step": 3808 + }, + { + "epoch": 0.3085709656513286, + "grad_norm": 0.03608867898583412, + "learning_rate": 0.0001542729850141758, + "loss": 0.3695, + "step": 3809 + }, + { + "epoch": 0.30865197666882693, + "grad_norm": 0.04926412180066109, + "learning_rate": 0.0001543134872417983, + "loss": 0.4015, + "step": 3810 + }, + { + "epoch": 0.30873298768632534, + "grad_norm": 0.03634179010987282, + "learning_rate": 0.00015435398946942084, + "loss": 0.3532, + "step": 3811 + }, + { + "epoch": 0.3088139987038237, + "grad_norm": 0.037701528519392014, + "learning_rate": 0.00015439449169704335, + "loss": 0.3515, + "step": 3812 + }, + { + "epoch": 0.3088950097213221, + "grad_norm": 0.040494389832019806, + "learning_rate": 0.00015443499392466588, + "loss": 0.3561, + "step": 3813 + }, + { + "epoch": 0.30897602073882047, + "grad_norm": 0.03875404968857765, + "learning_rate": 0.0001544754961522884, + "loss": 0.354, + "step": 3814 + }, + { + "epoch": 0.3090570317563189, + "grad_norm": 0.040258169174194336, + "learning_rate": 0.0001545159983799109, + "loss": 0.3379, + "step": 3815 + }, + { + "epoch": 0.30913804277381723, + "grad_norm": 0.03131493180990219, + "learning_rate": 0.00015455650060753343, + "loss": 0.3281, + "step": 3816 + }, + { + "epoch": 0.30921905379131565, + "grad_norm": 0.038832154124975204, + "learning_rate": 0.00015459700283515594, + "loss": 0.3928, + "step": 3817 + }, + { + "epoch": 0.309300064808814, + "grad_norm": 0.06491102278232574, + "learning_rate": 0.00015463750506277844, + "loss": 0.3753, + "step": 3818 + }, + { + "epoch": 0.30938107582631236, + "grad_norm": 0.037673432379961014, + "learning_rate": 0.00015467800729040097, + "loss": 0.4006, + "step": 3819 + }, + { + "epoch": 0.30946208684381077, + "grad_norm": 0.0383102111518383, + "learning_rate": 0.00015471850951802348, + "loss": 0.3528, + "step": 3820 + }, + { + "epoch": 0.3095430978613091, + "grad_norm": 0.03696411848068237, + "learning_rate": 0.00015475901174564601, + "loss": 0.3814, + "step": 3821 + }, + { + "epoch": 0.30962410887880754, + "grad_norm": 0.03863132745027542, + "learning_rate": 0.00015479951397326852, + "loss": 0.4174, + "step": 3822 + }, + { + "epoch": 0.3097051198963059, + "grad_norm": 0.034970369189977646, + "learning_rate": 0.00015484001620089105, + "loss": 0.3798, + "step": 3823 + }, + { + "epoch": 0.3097861309138043, + "grad_norm": 0.051043085753917694, + "learning_rate": 0.00015488051842851356, + "loss": 0.4075, + "step": 3824 + }, + { + "epoch": 0.30986714193130266, + "grad_norm": 0.04437202587723732, + "learning_rate": 0.0001549210206561361, + "loss": 0.3273, + "step": 3825 + }, + { + "epoch": 0.309948152948801, + "grad_norm": 0.03810075297951698, + "learning_rate": 0.0001549615228837586, + "loss": 0.375, + "step": 3826 + }, + { + "epoch": 
0.3100291639662994, + "grad_norm": 0.03678411617875099, + "learning_rate": 0.00015500202511138113, + "loss": 0.3642, + "step": 3827 + }, + { + "epoch": 0.3101101749837978, + "grad_norm": 0.03821595013141632, + "learning_rate": 0.00015504252733900367, + "loss": 0.3331, + "step": 3828 + }, + { + "epoch": 0.3101911860012962, + "grad_norm": 0.038664739578962326, + "learning_rate": 0.00015508302956662617, + "loss": 0.3971, + "step": 3829 + }, + { + "epoch": 0.31027219701879455, + "grad_norm": 0.03899373114109039, + "learning_rate": 0.0001551235317942487, + "loss": 0.3777, + "step": 3830 + }, + { + "epoch": 0.31035320803629296, + "grad_norm": 0.039425078779459, + "learning_rate": 0.0001551640340218712, + "loss": 0.3583, + "step": 3831 + }, + { + "epoch": 0.3104342190537913, + "grad_norm": 0.03491853550076485, + "learning_rate": 0.00015520453624949374, + "loss": 0.3701, + "step": 3832 + }, + { + "epoch": 0.31051523007128967, + "grad_norm": 0.03618711233139038, + "learning_rate": 0.00015524503847711625, + "loss": 0.4148, + "step": 3833 + }, + { + "epoch": 0.3105962410887881, + "grad_norm": 0.03505758196115494, + "learning_rate": 0.00015528554070473878, + "loss": 0.3773, + "step": 3834 + }, + { + "epoch": 0.31067725210628644, + "grad_norm": 0.038697127252817154, + "learning_rate": 0.0001553260429323613, + "loss": 0.3366, + "step": 3835 + }, + { + "epoch": 0.31075826312378485, + "grad_norm": 0.03813067823648453, + "learning_rate": 0.00015536654515998382, + "loss": 0.3935, + "step": 3836 + }, + { + "epoch": 0.3108392741412832, + "grad_norm": 0.032286062836647034, + "learning_rate": 0.00015540704738760633, + "loss": 0.3588, + "step": 3837 + }, + { + "epoch": 0.3109202851587816, + "grad_norm": 0.03338897228240967, + "learning_rate": 0.00015544754961522886, + "loss": 0.3776, + "step": 3838 + }, + { + "epoch": 0.31100129617628, + "grad_norm": 0.03878074884414673, + "learning_rate": 0.00015548805184285137, + "loss": 0.3242, + "step": 3839 + }, + { + "epoch": 0.31108230719377833, + "grad_norm": 0.03827903792262077, + "learning_rate": 0.00015552855407047387, + "loss": 0.4061, + "step": 3840 + }, + { + "epoch": 0.31116331821127674, + "grad_norm": 0.04468392953276634, + "learning_rate": 0.0001555690562980964, + "loss": 0.3881, + "step": 3841 + }, + { + "epoch": 0.3112443292287751, + "grad_norm": 0.033204734325408936, + "learning_rate": 0.0001556095585257189, + "loss": 0.3998, + "step": 3842 + }, + { + "epoch": 0.3113253402462735, + "grad_norm": 0.04107440635561943, + "learning_rate": 0.00015565006075334142, + "loss": 0.3947, + "step": 3843 + }, + { + "epoch": 0.31140635126377186, + "grad_norm": 0.04109537973999977, + "learning_rate": 0.00015569056298096395, + "loss": 0.367, + "step": 3844 + }, + { + "epoch": 0.3114873622812703, + "grad_norm": 0.04015589505434036, + "learning_rate": 0.00015573106520858649, + "loss": 0.359, + "step": 3845 + }, + { + "epoch": 0.31156837329876863, + "grad_norm": 0.03611072152853012, + "learning_rate": 0.000155771567436209, + "loss": 0.3701, + "step": 3846 + }, + { + "epoch": 0.311649384316267, + "grad_norm": 0.03249693661928177, + "learning_rate": 0.00015581206966383153, + "loss": 0.3294, + "step": 3847 + }, + { + "epoch": 0.3117303953337654, + "grad_norm": 0.03942349553108215, + "learning_rate": 0.00015585257189145403, + "loss": 0.4098, + "step": 3848 + }, + { + "epoch": 0.31181140635126375, + "grad_norm": 0.034818992018699646, + "learning_rate": 0.00015589307411907656, + "loss": 0.3704, + "step": 3849 + }, + { + "epoch": 0.31189241736876216, + "grad_norm": 0.03547549620270729, 
+ "learning_rate": 0.00015593357634669907, + "loss": 0.3751, + "step": 3850 + }, + { + "epoch": 0.3119734283862605, + "grad_norm": 0.04353281483054161, + "learning_rate": 0.0001559740785743216, + "loss": 0.3593, + "step": 3851 + }, + { + "epoch": 0.31205443940375893, + "grad_norm": 0.0328286774456501, + "learning_rate": 0.0001560145808019441, + "loss": 0.3672, + "step": 3852 + }, + { + "epoch": 0.3121354504212573, + "grad_norm": 0.0367732048034668, + "learning_rate": 0.00015605508302956664, + "loss": 0.3434, + "step": 3853 + }, + { + "epoch": 0.31221646143875564, + "grad_norm": 0.032217882573604584, + "learning_rate": 0.00015609558525718915, + "loss": 0.3289, + "step": 3854 + }, + { + "epoch": 0.31229747245625405, + "grad_norm": 0.04093978926539421, + "learning_rate": 0.00015613608748481168, + "loss": 0.4299, + "step": 3855 + }, + { + "epoch": 0.3123784834737524, + "grad_norm": 0.042412422597408295, + "learning_rate": 0.0001561765897124342, + "loss": 0.3585, + "step": 3856 + }, + { + "epoch": 0.3124594944912508, + "grad_norm": 0.03818630427122116, + "learning_rate": 0.00015621709194005672, + "loss": 0.3846, + "step": 3857 + }, + { + "epoch": 0.3125405055087492, + "grad_norm": 0.04645620286464691, + "learning_rate": 0.00015625759416767923, + "loss": 0.4463, + "step": 3858 + }, + { + "epoch": 0.3126215165262476, + "grad_norm": 0.04581103101372719, + "learning_rate": 0.00015629809639530176, + "loss": 0.4104, + "step": 3859 + }, + { + "epoch": 0.31270252754374595, + "grad_norm": 0.042016733437776566, + "learning_rate": 0.00015633859862292427, + "loss": 0.374, + "step": 3860 + }, + { + "epoch": 0.31278353856124436, + "grad_norm": 0.0401945486664772, + "learning_rate": 0.0001563791008505468, + "loss": 0.3817, + "step": 3861 + }, + { + "epoch": 0.3128645495787427, + "grad_norm": 0.030451439321041107, + "learning_rate": 0.0001564196030781693, + "loss": 0.3348, + "step": 3862 + }, + { + "epoch": 0.31294556059624107, + "grad_norm": 0.035469312220811844, + "learning_rate": 0.0001564601053057918, + "loss": 0.371, + "step": 3863 + }, + { + "epoch": 0.3130265716137395, + "grad_norm": 0.03374820947647095, + "learning_rate": 0.00015650060753341435, + "loss": 0.3808, + "step": 3864 + }, + { + "epoch": 0.31310758263123784, + "grad_norm": 0.03543124347925186, + "learning_rate": 0.00015654110976103685, + "loss": 0.374, + "step": 3865 + }, + { + "epoch": 0.31318859364873625, + "grad_norm": 0.04282199963927269, + "learning_rate": 0.00015658161198865939, + "loss": 0.3263, + "step": 3866 + }, + { + "epoch": 0.3132696046662346, + "grad_norm": 0.03741493076086044, + "learning_rate": 0.0001566221142162819, + "loss": 0.3565, + "step": 3867 + }, + { + "epoch": 0.313350615683733, + "grad_norm": 0.03533174842596054, + "learning_rate": 0.00015666261644390442, + "loss": 0.3678, + "step": 3868 + }, + { + "epoch": 0.31343162670123137, + "grad_norm": 0.03229209780693054, + "learning_rate": 0.00015670311867152693, + "loss": 0.3772, + "step": 3869 + }, + { + "epoch": 0.3135126377187297, + "grad_norm": 0.03186071664094925, + "learning_rate": 0.00015674362089914946, + "loss": 0.386, + "step": 3870 + }, + { + "epoch": 0.31359364873622814, + "grad_norm": 0.04570740461349487, + "learning_rate": 0.00015678412312677197, + "loss": 0.4336, + "step": 3871 + }, + { + "epoch": 0.3136746597537265, + "grad_norm": 0.030013004317879677, + "learning_rate": 0.0001568246253543945, + "loss": 0.3536, + "step": 3872 + }, + { + "epoch": 0.3137556707712249, + "grad_norm": 0.03668045252561569, + "learning_rate": 0.000156865127582017, + "loss": 0.3605, 
+ "step": 3873 + }, + { + "epoch": 0.31383668178872326, + "grad_norm": 0.04096108675003052, + "learning_rate": 0.00015690562980963954, + "loss": 0.3905, + "step": 3874 + }, + { + "epoch": 0.31391769280622167, + "grad_norm": 0.03962705284357071, + "learning_rate": 0.00015694613203726205, + "loss": 0.4093, + "step": 3875 + }, + { + "epoch": 0.31399870382372, + "grad_norm": 0.03714505955576897, + "learning_rate": 0.00015698663426488458, + "loss": 0.4513, + "step": 3876 + }, + { + "epoch": 0.3140797148412184, + "grad_norm": 0.03829257935285568, + "learning_rate": 0.0001570271364925071, + "loss": 0.3441, + "step": 3877 + }, + { + "epoch": 0.3141607258587168, + "grad_norm": 0.03213474899530411, + "learning_rate": 0.00015706763872012962, + "loss": 0.3869, + "step": 3878 + }, + { + "epoch": 0.31424173687621515, + "grad_norm": 0.047674331814050674, + "learning_rate": 0.00015710814094775213, + "loss": 0.4068, + "step": 3879 + }, + { + "epoch": 0.31432274789371356, + "grad_norm": 0.049570657312870026, + "learning_rate": 0.00015714864317537466, + "loss": 0.394, + "step": 3880 + }, + { + "epoch": 0.3144037589112119, + "grad_norm": 0.03454206883907318, + "learning_rate": 0.00015718914540299717, + "loss": 0.3608, + "step": 3881 + }, + { + "epoch": 0.31448476992871033, + "grad_norm": 0.03465542942285538, + "learning_rate": 0.0001572296476306197, + "loss": 0.3472, + "step": 3882 + }, + { + "epoch": 0.3145657809462087, + "grad_norm": 0.03724677115678787, + "learning_rate": 0.00015727014985824223, + "loss": 0.4236, + "step": 3883 + }, + { + "epoch": 0.31464679196370704, + "grad_norm": 0.036393288522958755, + "learning_rate": 0.00015731065208586474, + "loss": 0.3341, + "step": 3884 + }, + { + "epoch": 0.31472780298120545, + "grad_norm": 0.033561281859874725, + "learning_rate": 0.00015735115431348725, + "loss": 0.354, + "step": 3885 + }, + { + "epoch": 0.3148088139987038, + "grad_norm": 0.041037797927856445, + "learning_rate": 0.00015739165654110978, + "loss": 0.3653, + "step": 3886 + }, + { + "epoch": 0.3148898250162022, + "grad_norm": 0.03577622398734093, + "learning_rate": 0.00015743215876873229, + "loss": 0.4372, + "step": 3887 + }, + { + "epoch": 0.3149708360337006, + "grad_norm": 0.03454362228512764, + "learning_rate": 0.0001574726609963548, + "loss": 0.3424, + "step": 3888 + }, + { + "epoch": 0.315051847051199, + "grad_norm": 0.04236502945423126, + "learning_rate": 0.00015751316322397732, + "loss": 0.3823, + "step": 3889 + }, + { + "epoch": 0.31513285806869734, + "grad_norm": 0.0397837869822979, + "learning_rate": 0.00015755366545159983, + "loss": 0.358, + "step": 3890 + }, + { + "epoch": 0.3152138690861957, + "grad_norm": 0.0396885946393013, + "learning_rate": 0.00015759416767922236, + "loss": 0.403, + "step": 3891 + }, + { + "epoch": 0.3152948801036941, + "grad_norm": 0.0386836864054203, + "learning_rate": 0.00015763466990684487, + "loss": 0.3255, + "step": 3892 + }, + { + "epoch": 0.31537589112119246, + "grad_norm": 0.047269124537706375, + "learning_rate": 0.0001576751721344674, + "loss": 0.4151, + "step": 3893 + }, + { + "epoch": 0.3154569021386909, + "grad_norm": 0.04643293470144272, + "learning_rate": 0.0001577156743620899, + "loss": 0.427, + "step": 3894 + }, + { + "epoch": 0.31553791315618923, + "grad_norm": 0.0444369800388813, + "learning_rate": 0.00015775617658971244, + "loss": 0.4181, + "step": 3895 + }, + { + "epoch": 0.31561892417368764, + "grad_norm": 0.03646974265575409, + "learning_rate": 0.00015779667881733495, + "loss": 0.4078, + "step": 3896 + }, + { + "epoch": 0.315699935191186, + 
"grad_norm": 0.04198101535439491, + "learning_rate": 0.00015783718104495748, + "loss": 0.361, + "step": 3897 + }, + { + "epoch": 0.31578094620868435, + "grad_norm": 0.040488485246896744, + "learning_rate": 0.00015787768327258, + "loss": 0.3837, + "step": 3898 + }, + { + "epoch": 0.31586195722618277, + "grad_norm": 0.03984922170639038, + "learning_rate": 0.00015791818550020252, + "loss": 0.4125, + "step": 3899 + }, + { + "epoch": 0.3159429682436811, + "grad_norm": 0.03467942401766777, + "learning_rate": 0.00015795868772782503, + "loss": 0.3673, + "step": 3900 + }, + { + "epoch": 0.31602397926117953, + "grad_norm": 0.03277461603283882, + "learning_rate": 0.00015799918995544756, + "loss": 0.3796, + "step": 3901 + }, + { + "epoch": 0.3161049902786779, + "grad_norm": 0.03881349042057991, + "learning_rate": 0.0001580396921830701, + "loss": 0.3427, + "step": 3902 + }, + { + "epoch": 0.3161860012961763, + "grad_norm": 0.03951621428132057, + "learning_rate": 0.0001580801944106926, + "loss": 0.4245, + "step": 3903 + }, + { + "epoch": 0.31626701231367466, + "grad_norm": 0.037890203297138214, + "learning_rate": 0.00015812069663831513, + "loss": 0.3822, + "step": 3904 + }, + { + "epoch": 0.31634802333117307, + "grad_norm": 0.038001008331775665, + "learning_rate": 0.00015816119886593764, + "loss": 0.3829, + "step": 3905 + }, + { + "epoch": 0.3164290343486714, + "grad_norm": 0.04266396909952164, + "learning_rate": 0.00015820170109356017, + "loss": 0.3751, + "step": 3906 + }, + { + "epoch": 0.3165100453661698, + "grad_norm": 0.03496110811829567, + "learning_rate": 0.00015824220332118268, + "loss": 0.3971, + "step": 3907 + }, + { + "epoch": 0.3165910563836682, + "grad_norm": 0.032627735286951065, + "learning_rate": 0.0001582827055488052, + "loss": 0.3925, + "step": 3908 + }, + { + "epoch": 0.31667206740116655, + "grad_norm": 0.040596555918455124, + "learning_rate": 0.00015832320777642772, + "loss": 0.3649, + "step": 3909 + }, + { + "epoch": 0.31675307841866496, + "grad_norm": 0.04176980257034302, + "learning_rate": 0.00015836371000405022, + "loss": 0.4309, + "step": 3910 + }, + { + "epoch": 0.3168340894361633, + "grad_norm": 0.03294534981250763, + "learning_rate": 0.00015840421223167276, + "loss": 0.3871, + "step": 3911 + }, + { + "epoch": 0.3169151004536617, + "grad_norm": 0.032855693250894547, + "learning_rate": 0.00015844471445929526, + "loss": 0.3522, + "step": 3912 + }, + { + "epoch": 0.3169961114711601, + "grad_norm": 0.04674731567502022, + "learning_rate": 0.00015848521668691777, + "loss": 0.3731, + "step": 3913 + }, + { + "epoch": 0.31707712248865844, + "grad_norm": 0.04186466336250305, + "learning_rate": 0.0001585257189145403, + "loss": 0.3498, + "step": 3914 + }, + { + "epoch": 0.31715813350615685, + "grad_norm": 0.03885754197835922, + "learning_rate": 0.0001585662211421628, + "loss": 0.3419, + "step": 3915 + }, + { + "epoch": 0.3172391445236552, + "grad_norm": 0.04161924123764038, + "learning_rate": 0.00015860672336978534, + "loss": 0.3811, + "step": 3916 + }, + { + "epoch": 0.3173201555411536, + "grad_norm": 0.034731023013591766, + "learning_rate": 0.00015864722559740785, + "loss": 0.3516, + "step": 3917 + }, + { + "epoch": 0.31740116655865197, + "grad_norm": 0.041832007467746735, + "learning_rate": 0.00015868772782503038, + "loss": 0.3688, + "step": 3918 + }, + { + "epoch": 0.3174821775761504, + "grad_norm": 0.03571493178606033, + "learning_rate": 0.0001587282300526529, + "loss": 0.3345, + "step": 3919 + }, + { + "epoch": 0.31756318859364874, + "grad_norm": 0.04684751108288765, + 
"learning_rate": 0.00015876873228027542, + "loss": 0.4023, + "step": 3920 + }, + { + "epoch": 0.3176441996111471, + "grad_norm": 0.032821930944919586, + "learning_rate": 0.00015880923450789795, + "loss": 0.3578, + "step": 3921 + }, + { + "epoch": 0.3177252106286455, + "grad_norm": 0.035076357424259186, + "learning_rate": 0.00015884973673552046, + "loss": 0.3392, + "step": 3922 + }, + { + "epoch": 0.31780622164614386, + "grad_norm": 0.050397682934999466, + "learning_rate": 0.000158890238963143, + "loss": 0.3661, + "step": 3923 + }, + { + "epoch": 0.31788723266364227, + "grad_norm": 0.038197655230760574, + "learning_rate": 0.0001589307411907655, + "loss": 0.3239, + "step": 3924 + }, + { + "epoch": 0.3179682436811406, + "grad_norm": 0.03825785592198372, + "learning_rate": 0.00015897124341838803, + "loss": 0.3271, + "step": 3925 + }, + { + "epoch": 0.31804925469863904, + "grad_norm": 0.03636878728866577, + "learning_rate": 0.00015901174564601054, + "loss": 0.3732, + "step": 3926 + }, + { + "epoch": 0.3181302657161374, + "grad_norm": 0.03680877014994621, + "learning_rate": 0.00015905224787363307, + "loss": 0.3994, + "step": 3927 + }, + { + "epoch": 0.31821127673363575, + "grad_norm": 0.03805559128522873, + "learning_rate": 0.00015909275010125558, + "loss": 0.3787, + "step": 3928 + }, + { + "epoch": 0.31829228775113416, + "grad_norm": 0.03796125948429108, + "learning_rate": 0.0001591332523288781, + "loss": 0.3882, + "step": 3929 + }, + { + "epoch": 0.3183732987686325, + "grad_norm": 0.0325532890856266, + "learning_rate": 0.00015917375455650062, + "loss": 0.3429, + "step": 3930 + }, + { + "epoch": 0.31845430978613093, + "grad_norm": 0.034440673887729645, + "learning_rate": 0.00015921425678412315, + "loss": 0.3795, + "step": 3931 + }, + { + "epoch": 0.3185353208036293, + "grad_norm": 0.0457146018743515, + "learning_rate": 0.00015925475901174566, + "loss": 0.3864, + "step": 3932 + }, + { + "epoch": 0.3186163318211277, + "grad_norm": 0.030668552964925766, + "learning_rate": 0.0001592952612393682, + "loss": 0.3374, + "step": 3933 + }, + { + "epoch": 0.31869734283862605, + "grad_norm": 0.04022758826613426, + "learning_rate": 0.0001593357634669907, + "loss": 0.4046, + "step": 3934 + }, + { + "epoch": 0.3187783538561244, + "grad_norm": 0.0356454961001873, + "learning_rate": 0.0001593762656946132, + "loss": 0.3657, + "step": 3935 + }, + { + "epoch": 0.3188593648736228, + "grad_norm": 0.055281862616539, + "learning_rate": 0.0001594167679222357, + "loss": 0.3932, + "step": 3936 + }, + { + "epoch": 0.3189403758911212, + "grad_norm": 0.03439437970519066, + "learning_rate": 0.00015945727014985824, + "loss": 0.3297, + "step": 3937 + }, + { + "epoch": 0.3190213869086196, + "grad_norm": 0.035919319838285446, + "learning_rate": 0.00015949777237748075, + "loss": 0.325, + "step": 3938 + }, + { + "epoch": 0.31910239792611794, + "grad_norm": 0.043670397251844406, + "learning_rate": 0.00015953827460510328, + "loss": 0.4025, + "step": 3939 + }, + { + "epoch": 0.31918340894361635, + "grad_norm": 0.04012720659375191, + "learning_rate": 0.00015957877683272581, + "loss": 0.3902, + "step": 3940 + }, + { + "epoch": 0.3192644199611147, + "grad_norm": 0.042554520070552826, + "learning_rate": 0.00015961927906034832, + "loss": 0.403, + "step": 3941 + }, + { + "epoch": 0.31934543097861307, + "grad_norm": 0.04239801689982414, + "learning_rate": 0.00015965978128797085, + "loss": 0.389, + "step": 3942 + }, + { + "epoch": 0.3194264419961115, + "grad_norm": 0.03829963505268097, + "learning_rate": 0.00015970028351559336, + "loss": 
0.3512, + "step": 3943 + }, + { + "epoch": 0.31950745301360983, + "grad_norm": 0.03850733861327171, + "learning_rate": 0.0001597407857432159, + "loss": 0.3895, + "step": 3944 + }, + { + "epoch": 0.31958846403110824, + "grad_norm": 0.04022325202822685, + "learning_rate": 0.0001597812879708384, + "loss": 0.3582, + "step": 3945 + }, + { + "epoch": 0.3196694750486066, + "grad_norm": 0.03183432295918465, + "learning_rate": 0.00015982179019846093, + "loss": 0.3445, + "step": 3946 + }, + { + "epoch": 0.319750486066105, + "grad_norm": 0.04546622559428215, + "learning_rate": 0.00015986229242608344, + "loss": 0.3393, + "step": 3947 + }, + { + "epoch": 0.31983149708360337, + "grad_norm": 0.031236806884407997, + "learning_rate": 0.00015990279465370597, + "loss": 0.3326, + "step": 3948 + }, + { + "epoch": 0.3199125081011017, + "grad_norm": 0.033743731677532196, + "learning_rate": 0.00015994329688132848, + "loss": 0.3756, + "step": 3949 + }, + { + "epoch": 0.31999351911860013, + "grad_norm": 0.044623322784900665, + "learning_rate": 0.000159983799108951, + "loss": 0.3892, + "step": 3950 + }, + { + "epoch": 0.3200745301360985, + "grad_norm": 0.03815013915300369, + "learning_rate": 0.00016002430133657352, + "loss": 0.3768, + "step": 3951 + }, + { + "epoch": 0.3201555411535969, + "grad_norm": 0.031806398183107376, + "learning_rate": 0.00016006480356419605, + "loss": 0.3566, + "step": 3952 + }, + { + "epoch": 0.32023655217109526, + "grad_norm": 0.04002038761973381, + "learning_rate": 0.00016010530579181856, + "loss": 0.3515, + "step": 3953 + }, + { + "epoch": 0.32031756318859367, + "grad_norm": 0.03920595347881317, + "learning_rate": 0.0001601458080194411, + "loss": 0.411, + "step": 3954 + }, + { + "epoch": 0.320398574206092, + "grad_norm": 0.038479164242744446, + "learning_rate": 0.0001601863102470636, + "loss": 0.4116, + "step": 3955 + }, + { + "epoch": 0.32047958522359044, + "grad_norm": 0.02915555238723755, + "learning_rate": 0.00016022681247468613, + "loss": 0.3619, + "step": 3956 + }, + { + "epoch": 0.3205605962410888, + "grad_norm": 0.032505497336387634, + "learning_rate": 0.00016026731470230863, + "loss": 0.3693, + "step": 3957 + }, + { + "epoch": 0.32064160725858715, + "grad_norm": 0.039211973547935486, + "learning_rate": 0.00016030781692993114, + "loss": 0.3704, + "step": 3958 + }, + { + "epoch": 0.32072261827608556, + "grad_norm": 0.03348521515727043, + "learning_rate": 0.00016034831915755367, + "loss": 0.3629, + "step": 3959 + }, + { + "epoch": 0.3208036292935839, + "grad_norm": 0.04027702286839485, + "learning_rate": 0.00016038882138517618, + "loss": 0.3444, + "step": 3960 + }, + { + "epoch": 0.3208846403110823, + "grad_norm": 0.04102419316768646, + "learning_rate": 0.0001604293236127987, + "loss": 0.3598, + "step": 3961 + }, + { + "epoch": 0.3209656513285807, + "grad_norm": 0.032008182257413864, + "learning_rate": 0.00016046982584042122, + "loss": 0.3424, + "step": 3962 + }, + { + "epoch": 0.3210466623460791, + "grad_norm": 0.03532759100198746, + "learning_rate": 0.00016051032806804375, + "loss": 0.3722, + "step": 3963 + }, + { + "epoch": 0.32112767336357745, + "grad_norm": 0.039972130209207535, + "learning_rate": 0.00016055083029566626, + "loss": 0.3641, + "step": 3964 + }, + { + "epoch": 0.3212086843810758, + "grad_norm": 0.03748741000890732, + "learning_rate": 0.0001605913325232888, + "loss": 0.3447, + "step": 3965 + }, + { + "epoch": 0.3212896953985742, + "grad_norm": 0.03745513781905174, + "learning_rate": 0.0001606318347509113, + "loss": 0.3469, + "step": 3966 + }, + { + "epoch": 
0.32137070641607257, + "grad_norm": 0.03754321113228798, + "learning_rate": 0.00016067233697853383, + "loss": 0.3861, + "step": 3967 + }, + { + "epoch": 0.321451717433571, + "grad_norm": 0.0451197624206543, + "learning_rate": 0.00016071283920615634, + "loss": 0.4456, + "step": 3968 + }, + { + "epoch": 0.32153272845106934, + "grad_norm": 0.040234439074993134, + "learning_rate": 0.00016075334143377887, + "loss": 0.4076, + "step": 3969 + }, + { + "epoch": 0.32161373946856775, + "grad_norm": 0.03272630274295807, + "learning_rate": 0.00016079384366140138, + "loss": 0.3299, + "step": 3970 + }, + { + "epoch": 0.3216947504860661, + "grad_norm": 0.03154432773590088, + "learning_rate": 0.0001608343458890239, + "loss": 0.3517, + "step": 3971 + }, + { + "epoch": 0.32177576150356446, + "grad_norm": 0.042832449078559875, + "learning_rate": 0.00016087484811664642, + "loss": 0.4232, + "step": 3972 + }, + { + "epoch": 0.3218567725210629, + "grad_norm": 0.042939815670251846, + "learning_rate": 0.00016091535034426895, + "loss": 0.3594, + "step": 3973 + }, + { + "epoch": 0.32193778353856123, + "grad_norm": 0.034177325665950775, + "learning_rate": 0.00016095585257189146, + "loss": 0.3972, + "step": 3974 + }, + { + "epoch": 0.32201879455605964, + "grad_norm": 0.03674381598830223, + "learning_rate": 0.000160996354799514, + "loss": 0.3461, + "step": 3975 + }, + { + "epoch": 0.322099805573558, + "grad_norm": 0.0347549170255661, + "learning_rate": 0.0001610368570271365, + "loss": 0.3703, + "step": 3976 + }, + { + "epoch": 0.3221808165910564, + "grad_norm": 0.03207762539386749, + "learning_rate": 0.00016107735925475903, + "loss": 0.3475, + "step": 3977 + }, + { + "epoch": 0.32226182760855476, + "grad_norm": 0.03411857783794403, + "learning_rate": 0.00016111786148238156, + "loss": 0.3955, + "step": 3978 + }, + { + "epoch": 0.3223428386260531, + "grad_norm": 0.04447000101208687, + "learning_rate": 0.00016115836371000407, + "loss": 0.4138, + "step": 3979 + }, + { + "epoch": 0.32242384964355153, + "grad_norm": 0.038355935364961624, + "learning_rate": 0.00016119886593762657, + "loss": 0.3929, + "step": 3980 + }, + { + "epoch": 0.3225048606610499, + "grad_norm": 0.03359310328960419, + "learning_rate": 0.0001612393681652491, + "loss": 0.356, + "step": 3981 + }, + { + "epoch": 0.3225858716785483, + "grad_norm": 0.03186175972223282, + "learning_rate": 0.0001612798703928716, + "loss": 0.3308, + "step": 3982 + }, + { + "epoch": 0.32266688269604665, + "grad_norm": 0.03117132931947708, + "learning_rate": 0.00016132037262049412, + "loss": 0.3371, + "step": 3983 + }, + { + "epoch": 0.32274789371354506, + "grad_norm": 0.039014607667922974, + "learning_rate": 0.00016136087484811665, + "loss": 0.4021, + "step": 3984 + }, + { + "epoch": 0.3228289047310434, + "grad_norm": 0.03741605579853058, + "learning_rate": 0.00016140137707573916, + "loss": 0.3816, + "step": 3985 + }, + { + "epoch": 0.3229099157485418, + "grad_norm": 0.03936758264899254, + "learning_rate": 0.0001614418793033617, + "loss": 0.4038, + "step": 3986 + }, + { + "epoch": 0.3229909267660402, + "grad_norm": 0.03849122300744057, + "learning_rate": 0.0001614823815309842, + "loss": 0.3726, + "step": 3987 + }, + { + "epoch": 0.32307193778353854, + "grad_norm": 0.034238673746585846, + "learning_rate": 0.00016152288375860673, + "loss": 0.3438, + "step": 3988 + }, + { + "epoch": 0.32315294880103695, + "grad_norm": 0.034184083342552185, + "learning_rate": 0.00016156338598622924, + "loss": 0.3556, + "step": 3989 + }, + { + "epoch": 0.3232339598185353, + "grad_norm": 
0.039354074746370316, + "learning_rate": 0.00016160388821385177, + "loss": 0.4, + "step": 3990 + }, + { + "epoch": 0.3233149708360337, + "grad_norm": 0.03062828630208969, + "learning_rate": 0.00016164439044147428, + "loss": 0.3365, + "step": 3991 + }, + { + "epoch": 0.3233959818535321, + "grad_norm": 0.0330023393034935, + "learning_rate": 0.0001616848926690968, + "loss": 0.3612, + "step": 3992 + }, + { + "epoch": 0.32347699287103043, + "grad_norm": 0.0405740812420845, + "learning_rate": 0.00016172539489671932, + "loss": 0.3907, + "step": 3993 + }, + { + "epoch": 0.32355800388852884, + "grad_norm": 0.03357372432947159, + "learning_rate": 0.00016176589712434185, + "loss": 0.3836, + "step": 3994 + }, + { + "epoch": 0.3236390149060272, + "grad_norm": 0.03729019686579704, + "learning_rate": 0.00016180639935196436, + "loss": 0.372, + "step": 3995 + }, + { + "epoch": 0.3237200259235256, + "grad_norm": 0.04074510186910629, + "learning_rate": 0.0001618469015795869, + "loss": 0.399, + "step": 3996 + }, + { + "epoch": 0.32380103694102397, + "grad_norm": 0.03839709237217903, + "learning_rate": 0.00016188740380720942, + "loss": 0.3793, + "step": 3997 + }, + { + "epoch": 0.3238820479585224, + "grad_norm": 0.03876428306102753, + "learning_rate": 0.00016192790603483193, + "loss": 0.366, + "step": 3998 + }, + { + "epoch": 0.32396305897602073, + "grad_norm": 0.03858571499586105, + "learning_rate": 0.00016196840826245446, + "loss": 0.3614, + "step": 3999 + }, + { + "epoch": 0.32404406999351915, + "grad_norm": 0.037236109375953674, + "learning_rate": 0.00016200891049007697, + "loss": 0.361, + "step": 4000 + }, + { + "epoch": 0.3241250810110175, + "grad_norm": 0.0351792648434639, + "learning_rate": 0.0001620494127176995, + "loss": 0.4234, + "step": 4001 + }, + { + "epoch": 0.32420609202851586, + "grad_norm": 0.03616362810134888, + "learning_rate": 0.000162089914945322, + "loss": 0.4013, + "step": 4002 + }, + { + "epoch": 0.32428710304601427, + "grad_norm": 0.0375266894698143, + "learning_rate": 0.00016213041717294454, + "loss": 0.4115, + "step": 4003 + }, + { + "epoch": 0.3243681140635126, + "grad_norm": 0.03920578584074974, + "learning_rate": 0.00016217091940056705, + "loss": 0.3409, + "step": 4004 + }, + { + "epoch": 0.32444912508101104, + "grad_norm": 0.036732036620378494, + "learning_rate": 0.00016221142162818955, + "loss": 0.3824, + "step": 4005 + }, + { + "epoch": 0.3245301360985094, + "grad_norm": 0.03974968567490578, + "learning_rate": 0.00016225192385581206, + "loss": 0.3146, + "step": 4006 + }, + { + "epoch": 0.3246111471160078, + "grad_norm": 0.042769160121679306, + "learning_rate": 0.0001622924260834346, + "loss": 0.388, + "step": 4007 + }, + { + "epoch": 0.32469215813350616, + "grad_norm": 0.03568337485194206, + "learning_rate": 0.0001623329283110571, + "loss": 0.3803, + "step": 4008 + }, + { + "epoch": 0.3247731691510045, + "grad_norm": 0.03258265182375908, + "learning_rate": 0.00016237343053867963, + "loss": 0.3972, + "step": 4009 + }, + { + "epoch": 0.3248541801685029, + "grad_norm": 0.036132700741291046, + "learning_rate": 0.00016241393276630214, + "loss": 0.3571, + "step": 4010 + }, + { + "epoch": 0.3249351911860013, + "grad_norm": 0.03624130040407181, + "learning_rate": 0.00016245443499392467, + "loss": 0.3414, + "step": 4011 + }, + { + "epoch": 0.3250162022034997, + "grad_norm": 0.04750201851129532, + "learning_rate": 0.00016249493722154718, + "loss": 0.3793, + "step": 4012 + }, + { + "epoch": 0.32509721322099805, + "grad_norm": 0.037769172340631485, + "learning_rate": 
0.0001625354394491697, + "loss": 0.3086, + "step": 4013 + }, + { + "epoch": 0.32517822423849646, + "grad_norm": 0.03758542984724045, + "learning_rate": 0.00016257594167679222, + "loss": 0.3603, + "step": 4014 + }, + { + "epoch": 0.3252592352559948, + "grad_norm": 0.03945527970790863, + "learning_rate": 0.00016261644390441475, + "loss": 0.3936, + "step": 4015 + }, + { + "epoch": 0.32534024627349317, + "grad_norm": 0.03996698558330536, + "learning_rate": 0.00016265694613203728, + "loss": 0.4186, + "step": 4016 + }, + { + "epoch": 0.3254212572909916, + "grad_norm": 0.03950809687376022, + "learning_rate": 0.0001626974483596598, + "loss": 0.393, + "step": 4017 + }, + { + "epoch": 0.32550226830848994, + "grad_norm": 0.04291021078824997, + "learning_rate": 0.00016273795058728232, + "loss": 0.492, + "step": 4018 + }, + { + "epoch": 0.32558327932598835, + "grad_norm": 0.035584136843681335, + "learning_rate": 0.00016277845281490483, + "loss": 0.3314, + "step": 4019 + }, + { + "epoch": 0.3256642903434867, + "grad_norm": 0.04221056029200554, + "learning_rate": 0.00016281895504252736, + "loss": 0.4079, + "step": 4020 + }, + { + "epoch": 0.3257453013609851, + "grad_norm": 0.03662978485226631, + "learning_rate": 0.00016285945727014987, + "loss": 0.3372, + "step": 4021 + }, + { + "epoch": 0.3258263123784835, + "grad_norm": 0.03927822783589363, + "learning_rate": 0.0001628999594977724, + "loss": 0.3701, + "step": 4022 + }, + { + "epoch": 0.32590732339598183, + "grad_norm": 0.03549426048994064, + "learning_rate": 0.0001629404617253949, + "loss": 0.3976, + "step": 4023 + }, + { + "epoch": 0.32598833441348024, + "grad_norm": 0.035459041595458984, + "learning_rate": 0.00016298096395301744, + "loss": 0.37, + "step": 4024 + }, + { + "epoch": 0.3260693454309786, + "grad_norm": 0.03579239174723625, + "learning_rate": 0.00016302146618063995, + "loss": 0.3874, + "step": 4025 + }, + { + "epoch": 0.326150356448477, + "grad_norm": 0.03891357406973839, + "learning_rate": 0.00016306196840826248, + "loss": 0.3551, + "step": 4026 + }, + { + "epoch": 0.32623136746597536, + "grad_norm": 0.040021754801273346, + "learning_rate": 0.00016310247063588498, + "loss": 0.3913, + "step": 4027 + }, + { + "epoch": 0.3263123784834738, + "grad_norm": 0.03605050593614578, + "learning_rate": 0.0001631429728635075, + "loss": 0.3339, + "step": 4028 + }, + { + "epoch": 0.32639338950097213, + "grad_norm": 0.03774379566311836, + "learning_rate": 0.00016318347509113002, + "loss": 0.3455, + "step": 4029 + }, + { + "epoch": 0.3264744005184705, + "grad_norm": 0.037987157702445984, + "learning_rate": 0.00016322397731875253, + "loss": 0.3608, + "step": 4030 + }, + { + "epoch": 0.3265554115359689, + "grad_norm": 0.03648436442017555, + "learning_rate": 0.00016326447954637504, + "loss": 0.323, + "step": 4031 + }, + { + "epoch": 0.32663642255346725, + "grad_norm": 0.03928240388631821, + "learning_rate": 0.00016330498177399757, + "loss": 0.3906, + "step": 4032 + }, + { + "epoch": 0.32671743357096567, + "grad_norm": 0.0331883542239666, + "learning_rate": 0.00016334548400162008, + "loss": 0.3736, + "step": 4033 + }, + { + "epoch": 0.326798444588464, + "grad_norm": 0.03794945031404495, + "learning_rate": 0.0001633859862292426, + "loss": 0.3839, + "step": 4034 + }, + { + "epoch": 0.32687945560596243, + "grad_norm": 0.03638019412755966, + "learning_rate": 0.00016342648845686514, + "loss": 0.38, + "step": 4035 + }, + { + "epoch": 0.3269604666234608, + "grad_norm": 0.035750579088926315, + "learning_rate": 0.00016346699068448765, + "loss": 0.3653, + "step": 4036 + 
}, + { + "epoch": 0.32704147764095914, + "grad_norm": 0.03391675651073456, + "learning_rate": 0.00016350749291211018, + "loss": 0.3206, + "step": 4037 + }, + { + "epoch": 0.32712248865845756, + "grad_norm": 0.03558258339762688, + "learning_rate": 0.0001635479951397327, + "loss": 0.3528, + "step": 4038 + }, + { + "epoch": 0.3272034996759559, + "grad_norm": 0.03626623749732971, + "learning_rate": 0.00016358849736735522, + "loss": 0.3612, + "step": 4039 + }, + { + "epoch": 0.3272845106934543, + "grad_norm": 0.04127888381481171, + "learning_rate": 0.00016362899959497773, + "loss": 0.4088, + "step": 4040 + }, + { + "epoch": 0.3273655217109527, + "grad_norm": 0.03882889822125435, + "learning_rate": 0.00016366950182260026, + "loss": 0.37, + "step": 4041 + }, + { + "epoch": 0.3274465327284511, + "grad_norm": 0.03618205338716507, + "learning_rate": 0.00016371000405022277, + "loss": 0.3634, + "step": 4042 + }, + { + "epoch": 0.32752754374594945, + "grad_norm": 0.0353274866938591, + "learning_rate": 0.0001637505062778453, + "loss": 0.3593, + "step": 4043 + }, + { + "epoch": 0.3276085547634478, + "grad_norm": 0.040717270225286484, + "learning_rate": 0.0001637910085054678, + "loss": 0.3824, + "step": 4044 + }, + { + "epoch": 0.3276895657809462, + "grad_norm": 0.03363925591111183, + "learning_rate": 0.00016383151073309034, + "loss": 0.3533, + "step": 4045 + }, + { + "epoch": 0.32777057679844457, + "grad_norm": 0.033195581287145615, + "learning_rate": 0.00016387201296071284, + "loss": 0.3516, + "step": 4046 + }, + { + "epoch": 0.327851587815943, + "grad_norm": 0.045429445803165436, + "learning_rate": 0.00016391251518833538, + "loss": 0.3696, + "step": 4047 + }, + { + "epoch": 0.32793259883344134, + "grad_norm": 0.032288793474435806, + "learning_rate": 0.00016395301741595788, + "loss": 0.3922, + "step": 4048 + }, + { + "epoch": 0.32801360985093975, + "grad_norm": 0.03578419238328934, + "learning_rate": 0.00016399351964358042, + "loss": 0.4296, + "step": 4049 + }, + { + "epoch": 0.3280946208684381, + "grad_norm": 0.037038251757621765, + "learning_rate": 0.00016403402187120292, + "loss": 0.37, + "step": 4050 + }, + { + "epoch": 0.3281756318859365, + "grad_norm": 0.03831426426768303, + "learning_rate": 0.00016407452409882546, + "loss": 0.4149, + "step": 4051 + }, + { + "epoch": 0.32825664290343487, + "grad_norm": 0.03701657056808472, + "learning_rate": 0.00016411502632644796, + "loss": 0.3257, + "step": 4052 + }, + { + "epoch": 0.3283376539209332, + "grad_norm": 0.03409591317176819, + "learning_rate": 0.00016415552855407047, + "loss": 0.3546, + "step": 4053 + }, + { + "epoch": 0.32841866493843164, + "grad_norm": 0.03310379013419151, + "learning_rate": 0.000164196030781693, + "loss": 0.3481, + "step": 4054 + }, + { + "epoch": 0.32849967595593, + "grad_norm": 0.03756807744503021, + "learning_rate": 0.0001642365330093155, + "loss": 0.4197, + "step": 4055 + }, + { + "epoch": 0.3285806869734284, + "grad_norm": 0.03222097083926201, + "learning_rate": 0.00016427703523693804, + "loss": 0.3418, + "step": 4056 + }, + { + "epoch": 0.32866169799092676, + "grad_norm": 0.039320822805166245, + "learning_rate": 0.00016431753746456055, + "loss": 0.3667, + "step": 4057 + }, + { + "epoch": 0.32874270900842517, + "grad_norm": 0.03398220241069794, + "learning_rate": 0.00016435803969218308, + "loss": 0.347, + "step": 4058 + }, + { + "epoch": 0.3288237200259235, + "grad_norm": 0.03665045648813248, + "learning_rate": 0.0001643985419198056, + "loss": 0.3903, + "step": 4059 + }, + { + "epoch": 0.3289047310434219, + "grad_norm": 
0.03421606495976448, + "learning_rate": 0.00016443904414742812, + "loss": 0.3649, + "step": 4060 + }, + { + "epoch": 0.3289857420609203, + "grad_norm": 0.03205835446715355, + "learning_rate": 0.00016447954637505063, + "loss": 0.3511, + "step": 4061 + }, + { + "epoch": 0.32906675307841865, + "grad_norm": 0.03892917558550835, + "learning_rate": 0.00016452004860267316, + "loss": 0.4036, + "step": 4062 + }, + { + "epoch": 0.32914776409591706, + "grad_norm": 0.05253162235021591, + "learning_rate": 0.00016456055083029567, + "loss": 0.3281, + "step": 4063 + }, + { + "epoch": 0.3292287751134154, + "grad_norm": 0.03342265635728836, + "learning_rate": 0.0001646010530579182, + "loss": 0.3494, + "step": 4064 + }, + { + "epoch": 0.32930978613091383, + "grad_norm": 0.04515873268246651, + "learning_rate": 0.0001646415552855407, + "loss": 0.3868, + "step": 4065 + }, + { + "epoch": 0.3293907971484122, + "grad_norm": 0.038879964500665665, + "learning_rate": 0.00016468205751316324, + "loss": 0.37, + "step": 4066 + }, + { + "epoch": 0.32947180816591054, + "grad_norm": 0.034797415137290955, + "learning_rate": 0.00016472255974078574, + "loss": 0.379, + "step": 4067 + }, + { + "epoch": 0.32955281918340895, + "grad_norm": 0.039061516523361206, + "learning_rate": 0.00016476306196840828, + "loss": 0.3781, + "step": 4068 + }, + { + "epoch": 0.3296338302009073, + "grad_norm": 0.03718171268701553, + "learning_rate": 0.00016480356419603078, + "loss": 0.3805, + "step": 4069 + }, + { + "epoch": 0.3297148412184057, + "grad_norm": 0.03436566889286041, + "learning_rate": 0.00016484406642365332, + "loss": 0.4011, + "step": 4070 + }, + { + "epoch": 0.3297958522359041, + "grad_norm": 0.03371388092637062, + "learning_rate": 0.00016488456865127582, + "loss": 0.3144, + "step": 4071 + }, + { + "epoch": 0.3298768632534025, + "grad_norm": 0.03688603639602661, + "learning_rate": 0.00016492507087889836, + "loss": 0.309, + "step": 4072 + }, + { + "epoch": 0.32995787427090084, + "grad_norm": 0.03432590141892433, + "learning_rate": 0.0001649655731065209, + "loss": 0.349, + "step": 4073 + }, + { + "epoch": 0.3300388852883992, + "grad_norm": 0.03731279820203781, + "learning_rate": 0.0001650060753341434, + "loss": 0.3977, + "step": 4074 + }, + { + "epoch": 0.3301198963058976, + "grad_norm": 0.040071602910757065, + "learning_rate": 0.0001650465775617659, + "loss": 0.3577, + "step": 4075 + }, + { + "epoch": 0.33020090732339596, + "grad_norm": 0.04492799565196037, + "learning_rate": 0.00016508707978938843, + "loss": 0.3521, + "step": 4076 + }, + { + "epoch": 0.3302819183408944, + "grad_norm": 0.04404788836836815, + "learning_rate": 0.00016512758201701094, + "loss": 0.4375, + "step": 4077 + }, + { + "epoch": 0.33036292935839273, + "grad_norm": 0.03406739979982376, + "learning_rate": 0.00016516808424463345, + "loss": 0.3423, + "step": 4078 + }, + { + "epoch": 0.33044394037589114, + "grad_norm": 0.04167330637574196, + "learning_rate": 0.00016520858647225598, + "loss": 0.3892, + "step": 4079 + }, + { + "epoch": 0.3305249513933895, + "grad_norm": 0.03561322018504143, + "learning_rate": 0.00016524908869987849, + "loss": 0.3382, + "step": 4080 + }, + { + "epoch": 0.33060596241088785, + "grad_norm": 0.03386366367340088, + "learning_rate": 0.00016528959092750102, + "loss": 0.3811, + "step": 4081 + }, + { + "epoch": 0.33068697342838627, + "grad_norm": 0.03760869801044464, + "learning_rate": 0.00016533009315512353, + "loss": 0.3442, + "step": 4082 + }, + { + "epoch": 0.3307679844458846, + "grad_norm": 0.036848295480012894, + "learning_rate": 
0.00016537059538274606, + "loss": 0.406, + "step": 4083 + }, + { + "epoch": 0.33084899546338303, + "grad_norm": 0.039902713149785995, + "learning_rate": 0.00016541109761036856, + "loss": 0.4254, + "step": 4084 + }, + { + "epoch": 0.3309300064808814, + "grad_norm": 0.04576299339532852, + "learning_rate": 0.0001654515998379911, + "loss": 0.3806, + "step": 4085 + }, + { + "epoch": 0.3310110174983798, + "grad_norm": 0.03668006509542465, + "learning_rate": 0.0001654921020656136, + "loss": 0.365, + "step": 4086 + }, + { + "epoch": 0.33109202851587816, + "grad_norm": 0.038494616746902466, + "learning_rate": 0.00016553260429323614, + "loss": 0.4095, + "step": 4087 + }, + { + "epoch": 0.3311730395333765, + "grad_norm": 0.042895376682281494, + "learning_rate": 0.00016557310652085864, + "loss": 0.359, + "step": 4088 + }, + { + "epoch": 0.3312540505508749, + "grad_norm": 0.038043245673179626, + "learning_rate": 0.00016561360874848118, + "loss": 0.3424, + "step": 4089 + }, + { + "epoch": 0.3313350615683733, + "grad_norm": 0.040794432163238525, + "learning_rate": 0.0001656541109761037, + "loss": 0.3786, + "step": 4090 + }, + { + "epoch": 0.3314160725858717, + "grad_norm": 0.04780365899205208, + "learning_rate": 0.00016569461320372622, + "loss": 0.3424, + "step": 4091 + }, + { + "epoch": 0.33149708360337005, + "grad_norm": 0.039480097591876984, + "learning_rate": 0.00016573511543134875, + "loss": 0.3805, + "step": 4092 + }, + { + "epoch": 0.33157809462086846, + "grad_norm": 0.03293336182832718, + "learning_rate": 0.00016577561765897126, + "loss": 0.3511, + "step": 4093 + }, + { + "epoch": 0.3316591056383668, + "grad_norm": 0.03346782550215721, + "learning_rate": 0.0001658161198865938, + "loss": 0.3501, + "step": 4094 + }, + { + "epoch": 0.3317401166558652, + "grad_norm": 0.03275993466377258, + "learning_rate": 0.0001658566221142163, + "loss": 0.3694, + "step": 4095 + }, + { + "epoch": 0.3318211276733636, + "grad_norm": 0.03415694832801819, + "learning_rate": 0.00016589712434183883, + "loss": 0.3533, + "step": 4096 + }, + { + "epoch": 0.33190213869086194, + "grad_norm": 0.03276718035340309, + "learning_rate": 0.00016593762656946133, + "loss": 0.3577, + "step": 4097 + }, + { + "epoch": 0.33198314970836035, + "grad_norm": 0.03801753744482994, + "learning_rate": 0.00016597812879708387, + "loss": 0.4198, + "step": 4098 + }, + { + "epoch": 0.3320641607258587, + "grad_norm": 0.03857783228158951, + "learning_rate": 0.00016601863102470637, + "loss": 0.4047, + "step": 4099 + }, + { + "epoch": 0.3321451717433571, + "grad_norm": 0.03730057552456856, + "learning_rate": 0.00016605913325232888, + "loss": 0.3675, + "step": 4100 + }, + { + "epoch": 0.33222618276085547, + "grad_norm": 0.03350397199392319, + "learning_rate": 0.00016609963547995139, + "loss": 0.3975, + "step": 4101 + }, + { + "epoch": 0.3323071937783539, + "grad_norm": 0.03293445333838463, + "learning_rate": 0.00016614013770757392, + "loss": 0.3638, + "step": 4102 + }, + { + "epoch": 0.33238820479585224, + "grad_norm": 0.030500883236527443, + "learning_rate": 0.00016618063993519643, + "loss": 0.3162, + "step": 4103 + }, + { + "epoch": 0.3324692158133506, + "grad_norm": 0.030487844720482826, + "learning_rate": 0.00016622114216281896, + "loss": 0.3563, + "step": 4104 + }, + { + "epoch": 0.332550226830849, + "grad_norm": 0.03776392340660095, + "learning_rate": 0.00016626164439044146, + "loss": 0.3087, + "step": 4105 + }, + { + "epoch": 0.33263123784834736, + "grad_norm": 0.04220743849873543, + "learning_rate": 0.000166302146618064, + "loss": 0.4032, + "step": 
4106 + }, + { + "epoch": 0.33271224886584577, + "grad_norm": 0.03415783867239952, + "learning_rate": 0.0001663426488456865, + "loss": 0.3666, + "step": 4107 + }, + { + "epoch": 0.33279325988334413, + "grad_norm": 0.04167284071445465, + "learning_rate": 0.00016638315107330904, + "loss": 0.3416, + "step": 4108 + }, + { + "epoch": 0.33287427090084254, + "grad_norm": 0.03049684315919876, + "learning_rate": 0.00016642365330093157, + "loss": 0.3758, + "step": 4109 + }, + { + "epoch": 0.3329552819183409, + "grad_norm": 0.03721962496638298, + "learning_rate": 0.00016646415552855408, + "loss": 0.356, + "step": 4110 + }, + { + "epoch": 0.33303629293583925, + "grad_norm": 0.03811046853661537, + "learning_rate": 0.0001665046577561766, + "loss": 0.3641, + "step": 4111 + }, + { + "epoch": 0.33311730395333766, + "grad_norm": 0.033855751156806946, + "learning_rate": 0.00016654515998379912, + "loss": 0.4055, + "step": 4112 + }, + { + "epoch": 0.333198314970836, + "grad_norm": 0.04019276052713394, + "learning_rate": 0.00016658566221142165, + "loss": 0.3109, + "step": 4113 + }, + { + "epoch": 0.33327932598833443, + "grad_norm": 0.030369669198989868, + "learning_rate": 0.00016662616443904415, + "loss": 0.3483, + "step": 4114 + }, + { + "epoch": 0.3333603370058328, + "grad_norm": 0.036614250391721725, + "learning_rate": 0.0001666666666666667, + "loss": 0.3964, + "step": 4115 + }, + { + "epoch": 0.3334413480233312, + "grad_norm": 0.039595827460289, + "learning_rate": 0.0001667071688942892, + "loss": 0.3832, + "step": 4116 + }, + { + "epoch": 0.33352235904082955, + "grad_norm": 0.03889375925064087, + "learning_rate": 0.00016674767112191173, + "loss": 0.3758, + "step": 4117 + }, + { + "epoch": 0.3336033700583279, + "grad_norm": 0.03840089589357376, + "learning_rate": 0.00016678817334953423, + "loss": 0.3581, + "step": 4118 + }, + { + "epoch": 0.3336843810758263, + "grad_norm": 0.030346790328621864, + "learning_rate": 0.00016682867557715677, + "loss": 0.3402, + "step": 4119 + }, + { + "epoch": 0.3337653920933247, + "grad_norm": 0.03762712702155113, + "learning_rate": 0.00016686917780477927, + "loss": 0.4095, + "step": 4120 + }, + { + "epoch": 0.3338464031108231, + "grad_norm": 0.0357481949031353, + "learning_rate": 0.0001669096800324018, + "loss": 0.3672, + "step": 4121 + }, + { + "epoch": 0.33392741412832144, + "grad_norm": 0.03390078991651535, + "learning_rate": 0.0001669501822600243, + "loss": 0.3526, + "step": 4122 + }, + { + "epoch": 0.33400842514581985, + "grad_norm": 0.04207165166735649, + "learning_rate": 0.00016699068448764682, + "loss": 0.4223, + "step": 4123 + }, + { + "epoch": 0.3340894361633182, + "grad_norm": 0.042306751012802124, + "learning_rate": 0.00016703118671526935, + "loss": 0.384, + "step": 4124 + }, + { + "epoch": 0.33417044718081657, + "grad_norm": 0.03514132276177406, + "learning_rate": 0.00016707168894289186, + "loss": 0.3652, + "step": 4125 + }, + { + "epoch": 0.334251458198315, + "grad_norm": 0.03561227768659592, + "learning_rate": 0.00016711219117051436, + "loss": 0.4069, + "step": 4126 + }, + { + "epoch": 0.33433246921581333, + "grad_norm": 0.027793284505605698, + "learning_rate": 0.0001671526933981369, + "loss": 0.2562, + "step": 4127 + }, + { + "epoch": 0.33441348023331174, + "grad_norm": 0.03717358410358429, + "learning_rate": 0.00016719319562575943, + "loss": 0.4096, + "step": 4128 + }, + { + "epoch": 0.3344944912508101, + "grad_norm": 0.036481596529483795, + "learning_rate": 0.00016723369785338194, + "loss": 0.3614, + "step": 4129 + }, + { + "epoch": 0.3345755022683085, + 
"grad_norm": 0.044352419674396515, + "learning_rate": 0.00016727420008100447, + "loss": 0.3769, + "step": 4130 + }, + { + "epoch": 0.33465651328580687, + "grad_norm": 0.037814266979694366, + "learning_rate": 0.00016731470230862698, + "loss": 0.3389, + "step": 4131 + }, + { + "epoch": 0.3347375243033052, + "grad_norm": 0.03270898014307022, + "learning_rate": 0.0001673552045362495, + "loss": 0.3526, + "step": 4132 + }, + { + "epoch": 0.33481853532080363, + "grad_norm": 0.03235417231917381, + "learning_rate": 0.00016739570676387202, + "loss": 0.3298, + "step": 4133 + }, + { + "epoch": 0.334899546338302, + "grad_norm": 0.038817260414361954, + "learning_rate": 0.00016743620899149455, + "loss": 0.4, + "step": 4134 + }, + { + "epoch": 0.3349805573558004, + "grad_norm": 0.04076307639479637, + "learning_rate": 0.00016747671121911705, + "loss": 0.3628, + "step": 4135 + }, + { + "epoch": 0.33506156837329876, + "grad_norm": 0.03437405079603195, + "learning_rate": 0.0001675172134467396, + "loss": 0.4204, + "step": 4136 + }, + { + "epoch": 0.33514257939079717, + "grad_norm": 0.03410829231142998, + "learning_rate": 0.0001675577156743621, + "loss": 0.3609, + "step": 4137 + }, + { + "epoch": 0.3352235904082955, + "grad_norm": 0.04926472529768944, + "learning_rate": 0.00016759821790198463, + "loss": 0.3613, + "step": 4138 + }, + { + "epoch": 0.33530460142579394, + "grad_norm": 0.03411533683538437, + "learning_rate": 0.00016763872012960713, + "loss": 0.3933, + "step": 4139 + }, + { + "epoch": 0.3353856124432923, + "grad_norm": 0.03284559026360512, + "learning_rate": 0.00016767922235722967, + "loss": 0.3586, + "step": 4140 + }, + { + "epoch": 0.33546662346079065, + "grad_norm": 0.040767405182123184, + "learning_rate": 0.00016771972458485217, + "loss": 0.3632, + "step": 4141 + }, + { + "epoch": 0.33554763447828906, + "grad_norm": 0.034310050308704376, + "learning_rate": 0.0001677602268124747, + "loss": 0.3488, + "step": 4142 + }, + { + "epoch": 0.3356286454957874, + "grad_norm": 0.03712863475084305, + "learning_rate": 0.0001678007290400972, + "loss": 0.4369, + "step": 4143 + }, + { + "epoch": 0.3357096565132858, + "grad_norm": 0.03386729210615158, + "learning_rate": 0.00016784123126771975, + "loss": 0.3479, + "step": 4144 + }, + { + "epoch": 0.3357906675307842, + "grad_norm": 0.039073631167411804, + "learning_rate": 0.00016788173349534225, + "loss": 0.3928, + "step": 4145 + }, + { + "epoch": 0.3358716785482826, + "grad_norm": 0.039460305124521255, + "learning_rate": 0.00016792223572296478, + "loss": 0.3976, + "step": 4146 + }, + { + "epoch": 0.33595268956578095, + "grad_norm": 0.03876733034849167, + "learning_rate": 0.0001679627379505873, + "loss": 0.3755, + "step": 4147 + }, + { + "epoch": 0.3360337005832793, + "grad_norm": 0.031978458166122437, + "learning_rate": 0.0001680032401782098, + "loss": 0.3225, + "step": 4148 + }, + { + "epoch": 0.3361147116007777, + "grad_norm": 0.0404481403529644, + "learning_rate": 0.00016804374240583233, + "loss": 0.3893, + "step": 4149 + }, + { + "epoch": 0.33619572261827607, + "grad_norm": 0.034730345010757446, + "learning_rate": 0.00016808424463345484, + "loss": 0.3538, + "step": 4150 + }, + { + "epoch": 0.3362767336357745, + "grad_norm": 0.03582317754626274, + "learning_rate": 0.00016812474686107737, + "loss": 0.3475, + "step": 4151 + }, + { + "epoch": 0.33635774465327284, + "grad_norm": 0.035376086831092834, + "learning_rate": 0.00016816524908869988, + "loss": 0.3683, + "step": 4152 + }, + { + "epoch": 0.33643875567077125, + "grad_norm": 0.033756762742996216, + 
"learning_rate": 0.0001682057513163224, + "loss": 0.3522, + "step": 4153 + }, + { + "epoch": 0.3365197666882696, + "grad_norm": 0.03765640780329704, + "learning_rate": 0.00016824625354394491, + "loss": 0.3725, + "step": 4154 + }, + { + "epoch": 0.33660077770576796, + "grad_norm": 0.03699144721031189, + "learning_rate": 0.00016828675577156745, + "loss": 0.3676, + "step": 4155 + }, + { + "epoch": 0.3366817887232664, + "grad_norm": 0.033394955098629, + "learning_rate": 0.00016832725799918995, + "loss": 0.3527, + "step": 4156 + }, + { + "epoch": 0.33676279974076473, + "grad_norm": 0.045186761766672134, + "learning_rate": 0.0001683677602268125, + "loss": 0.422, + "step": 4157 + }, + { + "epoch": 0.33684381075826314, + "grad_norm": 0.040306515991687775, + "learning_rate": 0.000168408262454435, + "loss": 0.3672, + "step": 4158 + }, + { + "epoch": 0.3369248217757615, + "grad_norm": 0.02902974747121334, + "learning_rate": 0.00016844876468205753, + "loss": 0.3542, + "step": 4159 + }, + { + "epoch": 0.3370058327932599, + "grad_norm": 0.037141066044569016, + "learning_rate": 0.00016848926690968003, + "loss": 0.3852, + "step": 4160 + }, + { + "epoch": 0.33708684381075826, + "grad_norm": 0.050520069897174835, + "learning_rate": 0.00016852976913730257, + "loss": 0.373, + "step": 4161 + }, + { + "epoch": 0.3371678548282566, + "grad_norm": 0.041232600808143616, + "learning_rate": 0.00016857027136492507, + "loss": 0.3563, + "step": 4162 + }, + { + "epoch": 0.33724886584575503, + "grad_norm": 0.034661248326301575, + "learning_rate": 0.0001686107735925476, + "loss": 0.3394, + "step": 4163 + }, + { + "epoch": 0.3373298768632534, + "grad_norm": 0.04212620481848717, + "learning_rate": 0.0001686512758201701, + "loss": 0.4088, + "step": 4164 + }, + { + "epoch": 0.3374108878807518, + "grad_norm": 0.03127884119749069, + "learning_rate": 0.00016869177804779264, + "loss": 0.331, + "step": 4165 + }, + { + "epoch": 0.33749189889825015, + "grad_norm": 0.03282209485769272, + "learning_rate": 0.00016873228027541518, + "loss": 0.3882, + "step": 4166 + }, + { + "epoch": 0.33757290991574856, + "grad_norm": 0.03402530401945114, + "learning_rate": 0.00016877278250303768, + "loss": 0.4194, + "step": 4167 + }, + { + "epoch": 0.3376539209332469, + "grad_norm": 0.03680253401398659, + "learning_rate": 0.00016881328473066022, + "loss": 0.393, + "step": 4168 + }, + { + "epoch": 0.3377349319507453, + "grad_norm": 0.03531409054994583, + "learning_rate": 0.00016885378695828272, + "loss": 0.3438, + "step": 4169 + }, + { + "epoch": 0.3378159429682437, + "grad_norm": 0.039557069540023804, + "learning_rate": 0.00016889428918590523, + "loss": 0.372, + "step": 4170 + }, + { + "epoch": 0.33789695398574204, + "grad_norm": 0.041093405336141586, + "learning_rate": 0.00016893479141352774, + "loss": 0.3918, + "step": 4171 + }, + { + "epoch": 0.33797796500324045, + "grad_norm": 0.035452280193567276, + "learning_rate": 0.00016897529364115027, + "loss": 0.365, + "step": 4172 + }, + { + "epoch": 0.3380589760207388, + "grad_norm": 0.03799006715416908, + "learning_rate": 0.00016901579586877277, + "loss": 0.3629, + "step": 4173 + }, + { + "epoch": 0.3381399870382372, + "grad_norm": 0.03691485896706581, + "learning_rate": 0.0001690562980963953, + "loss": 0.3645, + "step": 4174 + }, + { + "epoch": 0.3382209980557356, + "grad_norm": 0.036314696073532104, + "learning_rate": 0.00016909680032401781, + "loss": 0.3775, + "step": 4175 + }, + { + "epoch": 0.33830200907323393, + "grad_norm": 0.04029175639152527, + "learning_rate": 0.00016913730255164035, + "loss": 
0.4537, + "step": 4176 + }, + { + "epoch": 0.33838302009073234, + "grad_norm": 0.030173292383551598, + "learning_rate": 0.00016917780477926285, + "loss": 0.337, + "step": 4177 + }, + { + "epoch": 0.3384640311082307, + "grad_norm": 0.04181046038866043, + "learning_rate": 0.0001692183070068854, + "loss": 0.3995, + "step": 4178 + }, + { + "epoch": 0.3385450421257291, + "grad_norm": 0.03693336248397827, + "learning_rate": 0.0001692588092345079, + "loss": 0.3401, + "step": 4179 + }, + { + "epoch": 0.33862605314322747, + "grad_norm": 0.04609256982803345, + "learning_rate": 0.00016929931146213043, + "loss": 0.385, + "step": 4180 + }, + { + "epoch": 0.3387070641607259, + "grad_norm": 0.036621615290641785, + "learning_rate": 0.00016933981368975293, + "loss": 0.3583, + "step": 4181 + }, + { + "epoch": 0.33878807517822424, + "grad_norm": 0.03672129288315773, + "learning_rate": 0.00016938031591737547, + "loss": 0.4042, + "step": 4182 + }, + { + "epoch": 0.3388690861957226, + "grad_norm": 0.038371678441762924, + "learning_rate": 0.00016942081814499797, + "loss": 0.3672, + "step": 4183 + }, + { + "epoch": 0.338950097213221, + "grad_norm": 0.0406721830368042, + "learning_rate": 0.0001694613203726205, + "loss": 0.383, + "step": 4184 + }, + { + "epoch": 0.33903110823071936, + "grad_norm": 0.03535905107855797, + "learning_rate": 0.00016950182260024304, + "loss": 0.3518, + "step": 4185 + }, + { + "epoch": 0.33911211924821777, + "grad_norm": 0.03706865385174751, + "learning_rate": 0.00016954232482786554, + "loss": 0.3521, + "step": 4186 + }, + { + "epoch": 0.3391931302657161, + "grad_norm": 0.04128368943929672, + "learning_rate": 0.00016958282705548808, + "loss": 0.397, + "step": 4187 + }, + { + "epoch": 0.33927414128321454, + "grad_norm": 0.03312550112605095, + "learning_rate": 0.00016962332928311058, + "loss": 0.3685, + "step": 4188 + }, + { + "epoch": 0.3393551523007129, + "grad_norm": 0.034005653113126755, + "learning_rate": 0.00016966383151073312, + "loss": 0.3365, + "step": 4189 + }, + { + "epoch": 0.3394361633182113, + "grad_norm": 0.03641623631119728, + "learning_rate": 0.00016970433373835562, + "loss": 0.3931, + "step": 4190 + }, + { + "epoch": 0.33951717433570966, + "grad_norm": 0.03957383707165718, + "learning_rate": 0.00016974483596597816, + "loss": 0.4141, + "step": 4191 + }, + { + "epoch": 0.339598185353208, + "grad_norm": 0.04036073386669159, + "learning_rate": 0.00016978533819360066, + "loss": 0.3924, + "step": 4192 + }, + { + "epoch": 0.3396791963707064, + "grad_norm": 0.033038701862096786, + "learning_rate": 0.00016982584042122317, + "loss": 0.296, + "step": 4193 + }, + { + "epoch": 0.3397602073882048, + "grad_norm": 0.027858994901180267, + "learning_rate": 0.0001698663426488457, + "loss": 0.3356, + "step": 4194 + }, + { + "epoch": 0.3398412184057032, + "grad_norm": 0.04070800542831421, + "learning_rate": 0.0001699068448764682, + "loss": 0.3525, + "step": 4195 + }, + { + "epoch": 0.33992222942320155, + "grad_norm": 0.03459759056568146, + "learning_rate": 0.00016994734710409071, + "loss": 0.3693, + "step": 4196 + }, + { + "epoch": 0.34000324044069996, + "grad_norm": 0.03300487622618675, + "learning_rate": 0.00016998784933171325, + "loss": 0.3489, + "step": 4197 + }, + { + "epoch": 0.3400842514581983, + "grad_norm": 0.03482555225491524, + "learning_rate": 0.00017002835155933575, + "loss": 0.306, + "step": 4198 + }, + { + "epoch": 0.3401652624756967, + "grad_norm": 0.039338547736406326, + "learning_rate": 0.00017006885378695829, + "loss": 0.3555, + "step": 4199 + }, + { + "epoch": 
0.3402462734931951, + "grad_norm": 0.041121240705251694, + "learning_rate": 0.0001701093560145808, + "loss": 0.3355, + "step": 4200 + }, + { + "epoch": 0.34032728451069344, + "grad_norm": 0.03414642810821533, + "learning_rate": 0.00017014985824220333, + "loss": 0.3546, + "step": 4201 + }, + { + "epoch": 0.34040829552819185, + "grad_norm": 0.039132002741098404, + "learning_rate": 0.00017019036046982583, + "loss": 0.3546, + "step": 4202 + }, + { + "epoch": 0.3404893065456902, + "grad_norm": 0.036015842109918594, + "learning_rate": 0.00017023086269744836, + "loss": 0.3876, + "step": 4203 + }, + { + "epoch": 0.3405703175631886, + "grad_norm": 0.04213041439652443, + "learning_rate": 0.0001702713649250709, + "loss": 0.379, + "step": 4204 + }, + { + "epoch": 0.340651328580687, + "grad_norm": 0.04193120822310448, + "learning_rate": 0.0001703118671526934, + "loss": 0.3266, + "step": 4205 + }, + { + "epoch": 0.34073233959818533, + "grad_norm": 0.035873111337423325, + "learning_rate": 0.00017035236938031594, + "loss": 0.3738, + "step": 4206 + }, + { + "epoch": 0.34081335061568374, + "grad_norm": 0.03832458332180977, + "learning_rate": 0.00017039287160793844, + "loss": 0.3866, + "step": 4207 + }, + { + "epoch": 0.3408943616331821, + "grad_norm": 0.032247111201286316, + "learning_rate": 0.00017043337383556098, + "loss": 0.3546, + "step": 4208 + }, + { + "epoch": 0.3409753726506805, + "grad_norm": 0.03261726349592209, + "learning_rate": 0.00017047387606318348, + "loss": 0.3558, + "step": 4209 + }, + { + "epoch": 0.34105638366817886, + "grad_norm": 0.030771298334002495, + "learning_rate": 0.00017051437829080602, + "loss": 0.3082, + "step": 4210 + }, + { + "epoch": 0.3411373946856773, + "grad_norm": 0.03943264111876488, + "learning_rate": 0.00017055488051842852, + "loss": 0.3681, + "step": 4211 + }, + { + "epoch": 0.34121840570317563, + "grad_norm": 0.03512781485915184, + "learning_rate": 0.00017059538274605106, + "loss": 0.342, + "step": 4212 + }, + { + "epoch": 0.341299416720674, + "grad_norm": 0.03673945739865303, + "learning_rate": 0.00017063588497367356, + "loss": 0.3391, + "step": 4213 + }, + { + "epoch": 0.3413804277381724, + "grad_norm": 0.03736485540866852, + "learning_rate": 0.0001706763872012961, + "loss": 0.3542, + "step": 4214 + }, + { + "epoch": 0.34146143875567075, + "grad_norm": 0.0473208986222744, + "learning_rate": 0.0001707168894289186, + "loss": 0.3504, + "step": 4215 + }, + { + "epoch": 0.34154244977316917, + "grad_norm": 0.033738020807504654, + "learning_rate": 0.00017075739165654113, + "loss": 0.3571, + "step": 4216 + }, + { + "epoch": 0.3416234607906675, + "grad_norm": 0.033597659319639206, + "learning_rate": 0.00017079789388416364, + "loss": 0.3311, + "step": 4217 + }, + { + "epoch": 0.34170447180816593, + "grad_norm": 0.038350872695446014, + "learning_rate": 0.00017083839611178615, + "loss": 0.3237, + "step": 4218 + }, + { + "epoch": 0.3417854828256643, + "grad_norm": 0.033569056540727615, + "learning_rate": 0.00017087889833940868, + "loss": 0.3653, + "step": 4219 + }, + { + "epoch": 0.34186649384316264, + "grad_norm": 0.039351075887680054, + "learning_rate": 0.00017091940056703119, + "loss": 0.3981, + "step": 4220 + }, + { + "epoch": 0.34194750486066106, + "grad_norm": 0.03401686251163483, + "learning_rate": 0.0001709599027946537, + "loss": 0.3414, + "step": 4221 + }, + { + "epoch": 0.3420285158781594, + "grad_norm": 0.0369318425655365, + "learning_rate": 0.00017100040502227622, + "loss": 0.4034, + "step": 4222 + }, + { + "epoch": 0.3421095268956578, + "grad_norm": 
0.03724020719528198, + "learning_rate": 0.00017104090724989876, + "loss": 0.3716, + "step": 4223 + }, + { + "epoch": 0.3421905379131562, + "grad_norm": 0.03206992894411087, + "learning_rate": 0.00017108140947752126, + "loss": 0.3711, + "step": 4224 + }, + { + "epoch": 0.3422715489306546, + "grad_norm": 0.032597772777080536, + "learning_rate": 0.0001711219117051438, + "loss": 0.3333, + "step": 4225 + }, + { + "epoch": 0.34235255994815295, + "grad_norm": 0.0400988794863224, + "learning_rate": 0.0001711624139327663, + "loss": 0.4322, + "step": 4226 + }, + { + "epoch": 0.3424335709656513, + "grad_norm": 0.03737347200512886, + "learning_rate": 0.00017120291616038884, + "loss": 0.3767, + "step": 4227 + }, + { + "epoch": 0.3425145819831497, + "grad_norm": 0.06823983043432236, + "learning_rate": 0.00017124341838801134, + "loss": 0.3987, + "step": 4228 + }, + { + "epoch": 0.34259559300064807, + "grad_norm": 0.03661317750811577, + "learning_rate": 0.00017128392061563388, + "loss": 0.3714, + "step": 4229 + }, + { + "epoch": 0.3426766040181465, + "grad_norm": 0.04608812928199768, + "learning_rate": 0.00017132442284325638, + "loss": 0.3924, + "step": 4230 + }, + { + "epoch": 0.34275761503564484, + "grad_norm": 0.03493238613009453, + "learning_rate": 0.00017136492507087892, + "loss": 0.3719, + "step": 4231 + }, + { + "epoch": 0.34283862605314325, + "grad_norm": 0.03279491513967514, + "learning_rate": 0.00017140542729850142, + "loss": 0.3655, + "step": 4232 + }, + { + "epoch": 0.3429196370706416, + "grad_norm": 0.038346197456121445, + "learning_rate": 0.00017144592952612395, + "loss": 0.3979, + "step": 4233 + }, + { + "epoch": 0.34300064808814, + "grad_norm": 0.03517588600516319, + "learning_rate": 0.00017148643175374646, + "loss": 0.3248, + "step": 4234 + }, + { + "epoch": 0.34308165910563837, + "grad_norm": 0.03253260254859924, + "learning_rate": 0.000171526933981369, + "loss": 0.3423, + "step": 4235 + }, + { + "epoch": 0.3431626701231367, + "grad_norm": 0.03733436390757561, + "learning_rate": 0.0001715674362089915, + "loss": 0.3809, + "step": 4236 + }, + { + "epoch": 0.34324368114063514, + "grad_norm": 0.03323622792959213, + "learning_rate": 0.00017160793843661403, + "loss": 0.3964, + "step": 4237 + }, + { + "epoch": 0.3433246921581335, + "grad_norm": 0.03212364390492439, + "learning_rate": 0.00017164844066423654, + "loss": 0.361, + "step": 4238 + }, + { + "epoch": 0.3434057031756319, + "grad_norm": 0.03472090885043144, + "learning_rate": 0.00017168894289185907, + "loss": 0.3455, + "step": 4239 + }, + { + "epoch": 0.34348671419313026, + "grad_norm": 0.036414679139852524, + "learning_rate": 0.00017172944511948158, + "loss": 0.3286, + "step": 4240 + }, + { + "epoch": 0.34356772521062867, + "grad_norm": 0.034054234623909, + "learning_rate": 0.0001717699473471041, + "loss": 0.3388, + "step": 4241 + }, + { + "epoch": 0.343648736228127, + "grad_norm": 0.036633871495723724, + "learning_rate": 0.00017181044957472662, + "loss": 0.3605, + "step": 4242 + }, + { + "epoch": 0.3437297472456254, + "grad_norm": 0.03781089931726456, + "learning_rate": 0.00017185095180234912, + "loss": 0.395, + "step": 4243 + }, + { + "epoch": 0.3438107582631238, + "grad_norm": 0.03504957631230354, + "learning_rate": 0.00017189145402997166, + "loss": 0.3678, + "step": 4244 + }, + { + "epoch": 0.34389176928062215, + "grad_norm": 0.03233807906508446, + "learning_rate": 0.00017193195625759416, + "loss": 0.3344, + "step": 4245 + }, + { + "epoch": 0.34397278029812056, + "grad_norm": 0.03948920965194702, + "learning_rate": 
0.0001719724584852167, + "loss": 0.45, + "step": 4246 + }, + { + "epoch": 0.3440537913156189, + "grad_norm": 0.03502481058239937, + "learning_rate": 0.0001720129607128392, + "loss": 0.359, + "step": 4247 + }, + { + "epoch": 0.34413480233311733, + "grad_norm": 0.03225164860486984, + "learning_rate": 0.00017205346294046174, + "loss": 0.332, + "step": 4248 + }, + { + "epoch": 0.3442158133506157, + "grad_norm": 0.038138873875141144, + "learning_rate": 0.00017209396516808424, + "loss": 0.3711, + "step": 4249 + }, + { + "epoch": 0.34429682436811404, + "grad_norm": 0.0346660241484642, + "learning_rate": 0.00017213446739570678, + "loss": 0.3786, + "step": 4250 + }, + { + "epoch": 0.34437783538561245, + "grad_norm": 0.032906532287597656, + "learning_rate": 0.00017217496962332928, + "loss": 0.3661, + "step": 4251 + }, + { + "epoch": 0.3444588464031108, + "grad_norm": 0.03140093386173248, + "learning_rate": 0.00017221547185095182, + "loss": 0.3831, + "step": 4252 + }, + { + "epoch": 0.3445398574206092, + "grad_norm": 0.037232931703329086, + "learning_rate": 0.00017225597407857432, + "loss": 0.3661, + "step": 4253 + }, + { + "epoch": 0.3446208684381076, + "grad_norm": 0.03418324142694473, + "learning_rate": 0.00017229647630619685, + "loss": 0.3599, + "step": 4254 + }, + { + "epoch": 0.344701879455606, + "grad_norm": 0.03270847722887993, + "learning_rate": 0.00017233697853381936, + "loss": 0.4305, + "step": 4255 + }, + { + "epoch": 0.34478289047310434, + "grad_norm": 0.04176706075668335, + "learning_rate": 0.0001723774807614419, + "loss": 0.403, + "step": 4256 + }, + { + "epoch": 0.3448639014906027, + "grad_norm": 0.030672620981931686, + "learning_rate": 0.0001724179829890644, + "loss": 0.3659, + "step": 4257 + }, + { + "epoch": 0.3449449125081011, + "grad_norm": 0.031542904675006866, + "learning_rate": 0.00017245848521668693, + "loss": 0.3779, + "step": 4258 + }, + { + "epoch": 0.34502592352559946, + "grad_norm": 0.03651117905974388, + "learning_rate": 0.00017249898744430944, + "loss": 0.3581, + "step": 4259 + }, + { + "epoch": 0.3451069345430979, + "grad_norm": 0.03164476901292801, + "learning_rate": 0.00017253948967193197, + "loss": 0.3565, + "step": 4260 + }, + { + "epoch": 0.34518794556059623, + "grad_norm": 0.03185632824897766, + "learning_rate": 0.0001725799918995545, + "loss": 0.3787, + "step": 4261 + }, + { + "epoch": 0.34526895657809464, + "grad_norm": 0.03777198866009712, + "learning_rate": 0.000172620494127177, + "loss": 0.4305, + "step": 4262 + }, + { + "epoch": 0.345349967595593, + "grad_norm": 0.034228693693876266, + "learning_rate": 0.00017266099635479952, + "loss": 0.3546, + "step": 4263 + }, + { + "epoch": 0.34543097861309136, + "grad_norm": 0.035769667476415634, + "learning_rate": 0.00017270149858242205, + "loss": 0.3961, + "step": 4264 + }, + { + "epoch": 0.34551198963058977, + "grad_norm": 0.034536611288785934, + "learning_rate": 0.00017274200081004456, + "loss": 0.3816, + "step": 4265 + }, + { + "epoch": 0.3455930006480881, + "grad_norm": 0.04174252972006798, + "learning_rate": 0.00017278250303766706, + "loss": 0.3716, + "step": 4266 + }, + { + "epoch": 0.34567401166558653, + "grad_norm": 0.04082999750971794, + "learning_rate": 0.0001728230052652896, + "loss": 0.3954, + "step": 4267 + }, + { + "epoch": 0.3457550226830849, + "grad_norm": 0.05414394289255142, + "learning_rate": 0.0001728635074929121, + "loss": 0.404, + "step": 4268 + }, + { + "epoch": 0.3458360337005833, + "grad_norm": 0.037421610206365585, + "learning_rate": 0.00017290400972053464, + "loss": 0.3687, + "step": 4269 
+ }, + { + "epoch": 0.34591704471808166, + "grad_norm": 0.03721112385392189, + "learning_rate": 0.00017294451194815714, + "loss": 0.3903, + "step": 4270 + }, + { + "epoch": 0.34599805573558, + "grad_norm": 0.040057696402072906, + "learning_rate": 0.00017298501417577968, + "loss": 0.4476, + "step": 4271 + }, + { + "epoch": 0.3460790667530784, + "grad_norm": 0.03950329124927521, + "learning_rate": 0.00017302551640340218, + "loss": 0.3874, + "step": 4272 + }, + { + "epoch": 0.3461600777705768, + "grad_norm": 0.04660689830780029, + "learning_rate": 0.00017306601863102471, + "loss": 0.4252, + "step": 4273 + }, + { + "epoch": 0.3462410887880752, + "grad_norm": 0.0310058556497097, + "learning_rate": 0.00017310652085864722, + "loss": 0.3315, + "step": 4274 + }, + { + "epoch": 0.34632209980557355, + "grad_norm": 0.035095661878585815, + "learning_rate": 0.00017314702308626975, + "loss": 0.3643, + "step": 4275 + }, + { + "epoch": 0.34640311082307196, + "grad_norm": 0.04145737737417221, + "learning_rate": 0.00017318752531389226, + "loss": 0.3888, + "step": 4276 + }, + { + "epoch": 0.3464841218405703, + "grad_norm": 0.03940679132938385, + "learning_rate": 0.0001732280275415148, + "loss": 0.3694, + "step": 4277 + }, + { + "epoch": 0.34656513285806867, + "grad_norm": 0.03774067014455795, + "learning_rate": 0.0001732685297691373, + "loss": 0.3623, + "step": 4278 + }, + { + "epoch": 0.3466461438755671, + "grad_norm": 0.039981309324502945, + "learning_rate": 0.00017330903199675983, + "loss": 0.4002, + "step": 4279 + }, + { + "epoch": 0.34672715489306544, + "grad_norm": 0.036382727324962616, + "learning_rate": 0.00017334953422438237, + "loss": 0.4025, + "step": 4280 + }, + { + "epoch": 0.34680816591056385, + "grad_norm": 0.03269738331437111, + "learning_rate": 0.00017339003645200487, + "loss": 0.3438, + "step": 4281 + }, + { + "epoch": 0.3468891769280622, + "grad_norm": 0.03966363146901131, + "learning_rate": 0.0001734305386796274, + "loss": 0.3557, + "step": 4282 + }, + { + "epoch": 0.3469701879455606, + "grad_norm": 0.02861810475587845, + "learning_rate": 0.0001734710409072499, + "loss": 0.3333, + "step": 4283 + }, + { + "epoch": 0.34705119896305897, + "grad_norm": 0.04468153789639473, + "learning_rate": 0.00017351154313487244, + "loss": 0.3887, + "step": 4284 + }, + { + "epoch": 0.3471322099805574, + "grad_norm": 0.036037784069776535, + "learning_rate": 0.00017355204536249495, + "loss": 0.356, + "step": 4285 + }, + { + "epoch": 0.34721322099805574, + "grad_norm": 0.034804608672857285, + "learning_rate": 0.00017359254759011748, + "loss": 0.3769, + "step": 4286 + }, + { + "epoch": 0.3472942320155541, + "grad_norm": 0.04255205765366554, + "learning_rate": 0.00017363304981774, + "loss": 0.3766, + "step": 4287 + }, + { + "epoch": 0.3473752430330525, + "grad_norm": 0.030785469338297844, + "learning_rate": 0.0001736735520453625, + "loss": 0.3586, + "step": 4288 + }, + { + "epoch": 0.34745625405055086, + "grad_norm": 0.030791115015745163, + "learning_rate": 0.00017371405427298503, + "loss": 0.3536, + "step": 4289 + }, + { + "epoch": 0.3475372650680493, + "grad_norm": 0.03815629705786705, + "learning_rate": 0.00017375455650060754, + "loss": 0.4058, + "step": 4290 + }, + { + "epoch": 0.34761827608554763, + "grad_norm": 0.04502563923597336, + "learning_rate": 0.00017379505872823004, + "loss": 0.4145, + "step": 4291 + }, + { + "epoch": 0.34769928710304604, + "grad_norm": 0.03694465756416321, + "learning_rate": 0.00017383556095585257, + "loss": 0.3686, + "step": 4292 + }, + { + "epoch": 0.3477802981205444, + 
"grad_norm": 0.03498321771621704, + "learning_rate": 0.00017387606318347508, + "loss": 0.402, + "step": 4293 + }, + { + "epoch": 0.34786130913804275, + "grad_norm": 0.033133365213871, + "learning_rate": 0.00017391656541109761, + "loss": 0.4, + "step": 4294 + }, + { + "epoch": 0.34794232015554116, + "grad_norm": 0.036578428000211716, + "learning_rate": 0.00017395706763872012, + "loss": 0.3368, + "step": 4295 + }, + { + "epoch": 0.3480233311730395, + "grad_norm": 0.041449353098869324, + "learning_rate": 0.00017399756986634265, + "loss": 0.3596, + "step": 4296 + }, + { + "epoch": 0.34810434219053793, + "grad_norm": 0.031675804406404495, + "learning_rate": 0.00017403807209396516, + "loss": 0.3637, + "step": 4297 + }, + { + "epoch": 0.3481853532080363, + "grad_norm": 0.03366389870643616, + "learning_rate": 0.0001740785743215877, + "loss": 0.3615, + "step": 4298 + }, + { + "epoch": 0.3482663642255347, + "grad_norm": 0.04206422343850136, + "learning_rate": 0.00017411907654921023, + "loss": 0.4181, + "step": 4299 + }, + { + "epoch": 0.34834737524303305, + "grad_norm": 0.037293173372745514, + "learning_rate": 0.00017415957877683273, + "loss": 0.3176, + "step": 4300 + }, + { + "epoch": 0.3484283862605314, + "grad_norm": 0.04146702215075493, + "learning_rate": 0.00017420008100445527, + "loss": 0.3659, + "step": 4301 + }, + { + "epoch": 0.3485093972780298, + "grad_norm": 0.04191764444112778, + "learning_rate": 0.00017424058323207777, + "loss": 0.3824, + "step": 4302 + }, + { + "epoch": 0.3485904082955282, + "grad_norm": 0.03564842417836189, + "learning_rate": 0.0001742810854597003, + "loss": 0.3736, + "step": 4303 + }, + { + "epoch": 0.3486714193130266, + "grad_norm": 0.028532709926366806, + "learning_rate": 0.0001743215876873228, + "loss": 0.3697, + "step": 4304 + }, + { + "epoch": 0.34875243033052494, + "grad_norm": 0.033898431807756424, + "learning_rate": 0.00017436208991494534, + "loss": 0.3598, + "step": 4305 + }, + { + "epoch": 0.34883344134802335, + "grad_norm": 0.03289264813065529, + "learning_rate": 0.00017440259214256785, + "loss": 0.377, + "step": 4306 + }, + { + "epoch": 0.3489144523655217, + "grad_norm": 0.02952691726386547, + "learning_rate": 0.00017444309437019038, + "loss": 0.3408, + "step": 4307 + }, + { + "epoch": 0.34899546338302007, + "grad_norm": 0.035252220928668976, + "learning_rate": 0.0001744835965978129, + "loss": 0.3984, + "step": 4308 + }, + { + "epoch": 0.3490764744005185, + "grad_norm": 0.033801451325416565, + "learning_rate": 0.00017452409882543542, + "loss": 0.3344, + "step": 4309 + }, + { + "epoch": 0.34915748541801683, + "grad_norm": 0.04563166946172714, + "learning_rate": 0.00017456460105305793, + "loss": 0.3895, + "step": 4310 + }, + { + "epoch": 0.34923849643551524, + "grad_norm": 0.03591470420360565, + "learning_rate": 0.00017460510328068046, + "loss": 0.3608, + "step": 4311 + }, + { + "epoch": 0.3493195074530136, + "grad_norm": 0.035211559385061264, + "learning_rate": 0.00017464560550830297, + "loss": 0.3281, + "step": 4312 + }, + { + "epoch": 0.349400518470512, + "grad_norm": 0.03434130921959877, + "learning_rate": 0.00017468610773592547, + "loss": 0.3557, + "step": 4313 + }, + { + "epoch": 0.34948152948801037, + "grad_norm": 0.039995063096284866, + "learning_rate": 0.00017472660996354798, + "loss": 0.4288, + "step": 4314 + }, + { + "epoch": 0.3495625405055087, + "grad_norm": 0.03357269987463951, + "learning_rate": 0.0001747671121911705, + "loss": 0.3522, + "step": 4315 + }, + { + "epoch": 0.34964355152300713, + "grad_norm": 0.03886905312538147, + 
"learning_rate": 0.00017480761441879302, + "loss": 0.3746, + "step": 4316 + }, + { + "epoch": 0.3497245625405055, + "grad_norm": 0.033615898340940475, + "learning_rate": 0.00017484811664641555, + "loss": 0.3593, + "step": 4317 + }, + { + "epoch": 0.3498055735580039, + "grad_norm": 0.037499893456697464, + "learning_rate": 0.00017488861887403809, + "loss": 0.379, + "step": 4318 + }, + { + "epoch": 0.34988658457550226, + "grad_norm": 0.03338012099266052, + "learning_rate": 0.0001749291211016606, + "loss": 0.3761, + "step": 4319 + }, + { + "epoch": 0.34996759559300067, + "grad_norm": 0.03335127979516983, + "learning_rate": 0.00017496962332928313, + "loss": 0.3602, + "step": 4320 + }, + { + "epoch": 0.350048606610499, + "grad_norm": 0.03429269418120384, + "learning_rate": 0.00017501012555690563, + "loss": 0.3741, + "step": 4321 + }, + { + "epoch": 0.3501296176279974, + "grad_norm": 0.037369150668382645, + "learning_rate": 0.00017505062778452816, + "loss": 0.3561, + "step": 4322 + }, + { + "epoch": 0.3502106286454958, + "grad_norm": 0.031715746968984604, + "learning_rate": 0.00017509113001215067, + "loss": 0.3575, + "step": 4323 + }, + { + "epoch": 0.35029163966299415, + "grad_norm": 0.03233131021261215, + "learning_rate": 0.0001751316322397732, + "loss": 0.3526, + "step": 4324 + }, + { + "epoch": 0.35037265068049256, + "grad_norm": 0.03732204809784889, + "learning_rate": 0.0001751721344673957, + "loss": 0.3946, + "step": 4325 + }, + { + "epoch": 0.3504536616979909, + "grad_norm": 0.0344877764582634, + "learning_rate": 0.00017521263669501824, + "loss": 0.374, + "step": 4326 + }, + { + "epoch": 0.3505346727154893, + "grad_norm": 0.03300429508090019, + "learning_rate": 0.00017525313892264075, + "loss": 0.3765, + "step": 4327 + }, + { + "epoch": 0.3506156837329877, + "grad_norm": 0.04378129914402962, + "learning_rate": 0.00017529364115026328, + "loss": 0.3132, + "step": 4328 + }, + { + "epoch": 0.3506966947504861, + "grad_norm": 0.037889618426561356, + "learning_rate": 0.0001753341433778858, + "loss": 0.4394, + "step": 4329 + }, + { + "epoch": 0.35077770576798445, + "grad_norm": 0.05033688247203827, + "learning_rate": 0.00017537464560550832, + "loss": 0.4018, + "step": 4330 + }, + { + "epoch": 0.3508587167854828, + "grad_norm": 0.033001694828271866, + "learning_rate": 0.00017541514783313083, + "loss": 0.3496, + "step": 4331 + }, + { + "epoch": 0.3509397278029812, + "grad_norm": 0.030153915286064148, + "learning_rate": 0.00017545565006075336, + "loss": 0.3689, + "step": 4332 + }, + { + "epoch": 0.35102073882047957, + "grad_norm": 0.03007018193602562, + "learning_rate": 0.00017549615228837587, + "loss": 0.3636, + "step": 4333 + }, + { + "epoch": 0.351101749837978, + "grad_norm": 0.036428261548280716, + "learning_rate": 0.0001755366545159984, + "loss": 0.3656, + "step": 4334 + }, + { + "epoch": 0.35118276085547634, + "grad_norm": 0.0325998030602932, + "learning_rate": 0.0001755771567436209, + "loss": 0.3421, + "step": 4335 + }, + { + "epoch": 0.35126377187297475, + "grad_norm": 0.03554887697100639, + "learning_rate": 0.0001756176589712434, + "loss": 0.3741, + "step": 4336 + }, + { + "epoch": 0.3513447828904731, + "grad_norm": 0.03348753973841667, + "learning_rate": 0.00017565816119886595, + "loss": 0.3319, + "step": 4337 + }, + { + "epoch": 0.35142579390797146, + "grad_norm": 0.03808234632015228, + "learning_rate": 0.00017569866342648845, + "loss": 0.3794, + "step": 4338 + }, + { + "epoch": 0.3515068049254699, + "grad_norm": 0.037807319313287735, + "learning_rate": 0.00017573916565411099, + "loss": 
0.4035, + "step": 4339 + }, + { + "epoch": 0.35158781594296823, + "grad_norm": 0.030004501342773438, + "learning_rate": 0.0001757796678817335, + "loss": 0.3414, + "step": 4340 + }, + { + "epoch": 0.35166882696046664, + "grad_norm": 0.0362929105758667, + "learning_rate": 0.00017582017010935602, + "loss": 0.398, + "step": 4341 + }, + { + "epoch": 0.351749837977965, + "grad_norm": 0.03204100951552391, + "learning_rate": 0.00017586067233697853, + "loss": 0.3171, + "step": 4342 + }, + { + "epoch": 0.3518308489954634, + "grad_norm": 0.032586175948381424, + "learning_rate": 0.00017590117456460106, + "loss": 0.3528, + "step": 4343 + }, + { + "epoch": 0.35191186001296176, + "grad_norm": 0.035550639033317566, + "learning_rate": 0.00017594167679222357, + "loss": 0.3487, + "step": 4344 + }, + { + "epoch": 0.3519928710304601, + "grad_norm": 0.03612900897860527, + "learning_rate": 0.0001759821790198461, + "loss": 0.3612, + "step": 4345 + }, + { + "epoch": 0.35207388204795853, + "grad_norm": 0.03829009085893631, + "learning_rate": 0.0001760226812474686, + "loss": 0.3444, + "step": 4346 + }, + { + "epoch": 0.3521548930654569, + "grad_norm": 0.03981778025627136, + "learning_rate": 0.00017606318347509114, + "loss": 0.3354, + "step": 4347 + }, + { + "epoch": 0.3522359040829553, + "grad_norm": 0.03323979303240776, + "learning_rate": 0.00017610368570271365, + "loss": 0.3847, + "step": 4348 + }, + { + "epoch": 0.35231691510045365, + "grad_norm": 0.037503600120544434, + "learning_rate": 0.00017614418793033618, + "loss": 0.3923, + "step": 4349 + }, + { + "epoch": 0.35239792611795207, + "grad_norm": 0.03373998403549194, + "learning_rate": 0.0001761846901579587, + "loss": 0.4081, + "step": 4350 + }, + { + "epoch": 0.3524789371354504, + "grad_norm": 0.035595640540122986, + "learning_rate": 0.00017622519238558122, + "loss": 0.3839, + "step": 4351 + }, + { + "epoch": 0.3525599481529488, + "grad_norm": 0.03282087668776512, + "learning_rate": 0.00017626569461320373, + "loss": 0.3699, + "step": 4352 + }, + { + "epoch": 0.3526409591704472, + "grad_norm": 0.038674503564834595, + "learning_rate": 0.00017630619684082626, + "loss": 0.4181, + "step": 4353 + }, + { + "epoch": 0.35272197018794554, + "grad_norm": 0.04119935631752014, + "learning_rate": 0.0001763466990684488, + "loss": 0.4143, + "step": 4354 + }, + { + "epoch": 0.35280298120544396, + "grad_norm": 0.03823656216263771, + "learning_rate": 0.0001763872012960713, + "loss": 0.3594, + "step": 4355 + }, + { + "epoch": 0.3528839922229423, + "grad_norm": 0.03402746096253395, + "learning_rate": 0.00017642770352369383, + "loss": 0.3849, + "step": 4356 + }, + { + "epoch": 0.3529650032404407, + "grad_norm": 0.032122448086738586, + "learning_rate": 0.00017646820575131634, + "loss": 0.3509, + "step": 4357 + }, + { + "epoch": 0.3530460142579391, + "grad_norm": 0.04331757873296738, + "learning_rate": 0.00017650870797893885, + "loss": 0.4486, + "step": 4358 + }, + { + "epoch": 0.35312702527543743, + "grad_norm": 0.03687101975083351, + "learning_rate": 0.00017654921020656138, + "loss": 0.402, + "step": 4359 + }, + { + "epoch": 0.35320803629293585, + "grad_norm": 0.03533836454153061, + "learning_rate": 0.00017658971243418389, + "loss": 0.3699, + "step": 4360 + }, + { + "epoch": 0.3532890473104342, + "grad_norm": 0.03400861471891403, + "learning_rate": 0.0001766302146618064, + "loss": 0.3589, + "step": 4361 + }, + { + "epoch": 0.3533700583279326, + "grad_norm": 0.038683127611875534, + "learning_rate": 0.00017667071688942892, + "loss": 0.361, + "step": 4362 + }, + { + "epoch": 
0.35345106934543097, + "grad_norm": 0.03580750152468681, + "learning_rate": 0.00017671121911705143, + "loss": 0.3218, + "step": 4363 + }, + { + "epoch": 0.3535320803629294, + "grad_norm": 0.042962536215782166, + "learning_rate": 0.00017675172134467396, + "loss": 0.3851, + "step": 4364 + }, + { + "epoch": 0.35361309138042774, + "grad_norm": 0.03469430282711983, + "learning_rate": 0.00017679222357229647, + "loss": 0.329, + "step": 4365 + }, + { + "epoch": 0.3536941023979261, + "grad_norm": 0.04712754860520363, + "learning_rate": 0.000176832725799919, + "loss": 0.3624, + "step": 4366 + }, + { + "epoch": 0.3537751134154245, + "grad_norm": 0.042234018445014954, + "learning_rate": 0.0001768732280275415, + "loss": 0.3715, + "step": 4367 + }, + { + "epoch": 0.35385612443292286, + "grad_norm": 0.034240856766700745, + "learning_rate": 0.00017691373025516404, + "loss": 0.3689, + "step": 4368 + }, + { + "epoch": 0.35393713545042127, + "grad_norm": 0.03210359439253807, + "learning_rate": 0.00017695423248278655, + "loss": 0.3568, + "step": 4369 + }, + { + "epoch": 0.3540181464679196, + "grad_norm": 0.036989495158195496, + "learning_rate": 0.00017699473471040908, + "loss": 0.3943, + "step": 4370 + }, + { + "epoch": 0.35409915748541804, + "grad_norm": 0.041003771126270294, + "learning_rate": 0.0001770352369380316, + "loss": 0.4186, + "step": 4371 + }, + { + "epoch": 0.3541801685029164, + "grad_norm": 0.03730255737900734, + "learning_rate": 0.00017707573916565412, + "loss": 0.3636, + "step": 4372 + }, + { + "epoch": 0.3542611795204148, + "grad_norm": 0.036754168570041656, + "learning_rate": 0.00017711624139327665, + "loss": 0.3896, + "step": 4373 + }, + { + "epoch": 0.35434219053791316, + "grad_norm": 0.037548258900642395, + "learning_rate": 0.00017715674362089916, + "loss": 0.3794, + "step": 4374 + }, + { + "epoch": 0.3544232015554115, + "grad_norm": 0.049613919109106064, + "learning_rate": 0.0001771972458485217, + "loss": 0.3303, + "step": 4375 + }, + { + "epoch": 0.3545042125729099, + "grad_norm": 0.03377959504723549, + "learning_rate": 0.0001772377480761442, + "loss": 0.3296, + "step": 4376 + }, + { + "epoch": 0.3545852235904083, + "grad_norm": 0.04323616996407509, + "learning_rate": 0.00017727825030376673, + "loss": 0.3667, + "step": 4377 + }, + { + "epoch": 0.3546662346079067, + "grad_norm": 0.04176010936498642, + "learning_rate": 0.00017731875253138924, + "loss": 0.407, + "step": 4378 + }, + { + "epoch": 0.35474724562540505, + "grad_norm": 0.042668696492910385, + "learning_rate": 0.00017735925475901177, + "loss": 0.3787, + "step": 4379 + }, + { + "epoch": 0.35482825664290346, + "grad_norm": 0.035692185163497925, + "learning_rate": 0.00017739975698663428, + "loss": 0.3701, + "step": 4380 + }, + { + "epoch": 0.3549092676604018, + "grad_norm": 0.034370869398117065, + "learning_rate": 0.0001774402592142568, + "loss": 0.3917, + "step": 4381 + }, + { + "epoch": 0.3549902786779002, + "grad_norm": 0.043179381638765335, + "learning_rate": 0.00017748076144187932, + "loss": 0.3479, + "step": 4382 + }, + { + "epoch": 0.3550712896953986, + "grad_norm": 0.032561879605054855, + "learning_rate": 0.00017752126366950182, + "loss": 0.3171, + "step": 4383 + }, + { + "epoch": 0.35515230071289694, + "grad_norm": 0.035728905349969864, + "learning_rate": 0.00017756176589712436, + "loss": 0.3248, + "step": 4384 + }, + { + "epoch": 0.35523331173039535, + "grad_norm": 0.029100000858306885, + "learning_rate": 0.00017760226812474686, + "loss": 0.3176, + "step": 4385 + }, + { + "epoch": 0.3553143227478937, + "grad_norm": 
0.03463733196258545, + "learning_rate": 0.00017764277035236937, + "loss": 0.3507, + "step": 4386 + }, + { + "epoch": 0.3553953337653921, + "grad_norm": 0.03186383098363876, + "learning_rate": 0.0001776832725799919, + "loss": 0.3626, + "step": 4387 + }, + { + "epoch": 0.3554763447828905, + "grad_norm": 0.03267962113022804, + "learning_rate": 0.0001777237748076144, + "loss": 0.3582, + "step": 4388 + }, + { + "epoch": 0.35555735580038883, + "grad_norm": 0.03787730634212494, + "learning_rate": 0.00017776427703523694, + "loss": 0.3739, + "step": 4389 + }, + { + "epoch": 0.35563836681788724, + "grad_norm": 0.042798951268196106, + "learning_rate": 0.00017780477926285945, + "loss": 0.4089, + "step": 4390 + }, + { + "epoch": 0.3557193778353856, + "grad_norm": 0.03359090909361839, + "learning_rate": 0.00017784528149048198, + "loss": 0.3766, + "step": 4391 + }, + { + "epoch": 0.355800388852884, + "grad_norm": 0.03374600037932396, + "learning_rate": 0.00017788578371810451, + "loss": 0.384, + "step": 4392 + }, + { + "epoch": 0.35588139987038236, + "grad_norm": 0.037167515605688095, + "learning_rate": 0.00017792628594572702, + "loss": 0.3613, + "step": 4393 + }, + { + "epoch": 0.3559624108878808, + "grad_norm": 0.03484749794006348, + "learning_rate": 0.00017796678817334955, + "loss": 0.373, + "step": 4394 + }, + { + "epoch": 0.35604342190537913, + "grad_norm": 0.040365755558013916, + "learning_rate": 0.00017800729040097206, + "loss": 0.4126, + "step": 4395 + }, + { + "epoch": 0.3561244329228775, + "grad_norm": 0.033233847469091415, + "learning_rate": 0.0001780477926285946, + "loss": 0.4013, + "step": 4396 + }, + { + "epoch": 0.3562054439403759, + "grad_norm": 0.03474245220422745, + "learning_rate": 0.0001780882948562171, + "loss": 0.4215, + "step": 4397 + }, + { + "epoch": 0.35628645495787425, + "grad_norm": 0.03786751255393028, + "learning_rate": 0.00017812879708383963, + "loss": 0.3619, + "step": 4398 + }, + { + "epoch": 0.35636746597537267, + "grad_norm": 0.034605786204338074, + "learning_rate": 0.00017816929931146214, + "loss": 0.3443, + "step": 4399 + }, + { + "epoch": 0.356448476992871, + "grad_norm": 0.03127652406692505, + "learning_rate": 0.00017820980153908467, + "loss": 0.3953, + "step": 4400 + }, + { + "epoch": 0.35652948801036943, + "grad_norm": 0.04228046536445618, + "learning_rate": 0.00017825030376670718, + "loss": 0.3826, + "step": 4401 + }, + { + "epoch": 0.3566104990278678, + "grad_norm": 0.036720506846904755, + "learning_rate": 0.0001782908059943297, + "loss": 0.3821, + "step": 4402 + }, + { + "epoch": 0.35669151004536614, + "grad_norm": 0.03216755390167236, + "learning_rate": 0.00017833130822195222, + "loss": 0.4051, + "step": 4403 + }, + { + "epoch": 0.35677252106286456, + "grad_norm": 0.033246368169784546, + "learning_rate": 0.00017837181044957475, + "loss": 0.3691, + "step": 4404 + }, + { + "epoch": 0.3568535320803629, + "grad_norm": 0.031144283711910248, + "learning_rate": 0.00017841231267719726, + "loss": 0.3615, + "step": 4405 + }, + { + "epoch": 0.3569345430978613, + "grad_norm": 0.03426123782992363, + "learning_rate": 0.00017845281490481976, + "loss": 0.3484, + "step": 4406 + }, + { + "epoch": 0.3570155541153597, + "grad_norm": 0.031132718548178673, + "learning_rate": 0.0001784933171324423, + "loss": 0.3619, + "step": 4407 + }, + { + "epoch": 0.3570965651328581, + "grad_norm": 0.04052342474460602, + "learning_rate": 0.0001785338193600648, + "loss": 0.3621, + "step": 4408 + }, + { + "epoch": 0.35717757615035645, + "grad_norm": 0.055709172040224075, + "learning_rate": 
0.0001785743215876873, + "loss": 0.3982, + "step": 4409 + }, + { + "epoch": 0.3572585871678548, + "grad_norm": 0.031802013516426086, + "learning_rate": 0.00017861482381530984, + "loss": 0.3618, + "step": 4410 + }, + { + "epoch": 0.3573395981853532, + "grad_norm": 0.03412245586514473, + "learning_rate": 0.00017865532604293237, + "loss": 0.3946, + "step": 4411 + }, + { + "epoch": 0.35742060920285157, + "grad_norm": 0.03505575656890869, + "learning_rate": 0.00017869582827055488, + "loss": 0.3698, + "step": 4412 + }, + { + "epoch": 0.35750162022035, + "grad_norm": 0.03713294491171837, + "learning_rate": 0.00017873633049817741, + "loss": 0.3686, + "step": 4413 + }, + { + "epoch": 0.35758263123784834, + "grad_norm": 0.03343082591891289, + "learning_rate": 0.00017877683272579992, + "loss": 0.3212, + "step": 4414 + }, + { + "epoch": 0.35766364225534675, + "grad_norm": 0.03919597715139389, + "learning_rate": 0.00017881733495342245, + "loss": 0.355, + "step": 4415 + }, + { + "epoch": 0.3577446532728451, + "grad_norm": 0.03567646071314812, + "learning_rate": 0.00017885783718104496, + "loss": 0.3724, + "step": 4416 + }, + { + "epoch": 0.35782566429034346, + "grad_norm": 0.03884551301598549, + "learning_rate": 0.0001788983394086675, + "loss": 0.3947, + "step": 4417 + }, + { + "epoch": 0.35790667530784187, + "grad_norm": 0.03751397132873535, + "learning_rate": 0.00017893884163629, + "loss": 0.3752, + "step": 4418 + }, + { + "epoch": 0.3579876863253402, + "grad_norm": 0.03351527824997902, + "learning_rate": 0.00017897934386391253, + "loss": 0.3422, + "step": 4419 + }, + { + "epoch": 0.35806869734283864, + "grad_norm": 0.03406880795955658, + "learning_rate": 0.00017901984609153504, + "loss": 0.3083, + "step": 4420 + }, + { + "epoch": 0.358149708360337, + "grad_norm": 0.0321863517165184, + "learning_rate": 0.00017906034831915757, + "loss": 0.3498, + "step": 4421 + }, + { + "epoch": 0.3582307193778354, + "grad_norm": 0.032903872430324554, + "learning_rate": 0.00017910085054678008, + "loss": 0.3447, + "step": 4422 + }, + { + "epoch": 0.35831173039533376, + "grad_norm": 0.032026126980781555, + "learning_rate": 0.0001791413527744026, + "loss": 0.3368, + "step": 4423 + }, + { + "epoch": 0.35839274141283217, + "grad_norm": 0.031434398144483566, + "learning_rate": 0.00017918185500202512, + "loss": 0.3541, + "step": 4424 + }, + { + "epoch": 0.35847375243033053, + "grad_norm": 0.03229876980185509, + "learning_rate": 0.00017922235722964765, + "loss": 0.3212, + "step": 4425 + }, + { + "epoch": 0.3585547634478289, + "grad_norm": 0.03334301337599754, + "learning_rate": 0.00017926285945727016, + "loss": 0.3369, + "step": 4426 + }, + { + "epoch": 0.3586357744653273, + "grad_norm": 0.033178362995386124, + "learning_rate": 0.0001793033616848927, + "loss": 0.3791, + "step": 4427 + }, + { + "epoch": 0.35871678548282565, + "grad_norm": 0.03164026886224747, + "learning_rate": 0.0001793438639125152, + "loss": 0.3518, + "step": 4428 + }, + { + "epoch": 0.35879779650032406, + "grad_norm": 0.03897491469979286, + "learning_rate": 0.00017938436614013773, + "loss": 0.3846, + "step": 4429 + }, + { + "epoch": 0.3588788075178224, + "grad_norm": 0.034208353608846664, + "learning_rate": 0.00017942486836776023, + "loss": 0.2991, + "step": 4430 + }, + { + "epoch": 0.35895981853532083, + "grad_norm": 0.03465808555483818, + "learning_rate": 0.00017946537059538274, + "loss": 0.3682, + "step": 4431 + }, + { + "epoch": 0.3590408295528192, + "grad_norm": 0.039521731436252594, + "learning_rate": 0.00017950587282300527, + "loss": 0.3649, + "step": 
4432 + }, + { + "epoch": 0.35912184057031754, + "grad_norm": 0.03066393733024597, + "learning_rate": 0.00017954637505062778, + "loss": 0.3466, + "step": 4433 + }, + { + "epoch": 0.35920285158781595, + "grad_norm": 0.03277844935655594, + "learning_rate": 0.0001795868772782503, + "loss": 0.3776, + "step": 4434 + }, + { + "epoch": 0.3592838626053143, + "grad_norm": 0.031138377264142036, + "learning_rate": 0.00017962737950587282, + "loss": 0.381, + "step": 4435 + }, + { + "epoch": 0.3593648736228127, + "grad_norm": 0.03991644084453583, + "learning_rate": 0.00017966788173349535, + "loss": 0.4098, + "step": 4436 + }, + { + "epoch": 0.3594458846403111, + "grad_norm": 0.037129972130060196, + "learning_rate": 0.00017970838396111786, + "loss": 0.3501, + "step": 4437 + }, + { + "epoch": 0.3595268956578095, + "grad_norm": 0.03298862650990486, + "learning_rate": 0.0001797488861887404, + "loss": 0.3319, + "step": 4438 + }, + { + "epoch": 0.35960790667530784, + "grad_norm": 0.03268022462725639, + "learning_rate": 0.0001797893884163629, + "loss": 0.3242, + "step": 4439 + }, + { + "epoch": 0.3596889176928062, + "grad_norm": 0.03326528146862984, + "learning_rate": 0.00017982989064398543, + "loss": 0.3843, + "step": 4440 + }, + { + "epoch": 0.3597699287103046, + "grad_norm": 0.03307091072201729, + "learning_rate": 0.00017987039287160794, + "loss": 0.3218, + "step": 4441 + }, + { + "epoch": 0.35985093972780297, + "grad_norm": 0.0321352519094944, + "learning_rate": 0.00017991089509923047, + "loss": 0.3213, + "step": 4442 + }, + { + "epoch": 0.3599319507453014, + "grad_norm": 0.03353230282664299, + "learning_rate": 0.00017995139732685298, + "loss": 0.3459, + "step": 4443 + }, + { + "epoch": 0.36001296176279973, + "grad_norm": 0.036084793508052826, + "learning_rate": 0.0001799918995544755, + "loss": 0.3738, + "step": 4444 + }, + { + "epoch": 0.36009397278029814, + "grad_norm": 0.03896293416619301, + "learning_rate": 0.00018003240178209802, + "loss": 0.3596, + "step": 4445 + }, + { + "epoch": 0.3601749837977965, + "grad_norm": 0.03121447004377842, + "learning_rate": 0.00018007290400972055, + "loss": 0.3664, + "step": 4446 + }, + { + "epoch": 0.36025599481529486, + "grad_norm": 0.036111436784267426, + "learning_rate": 0.00018011340623734306, + "loss": 0.4038, + "step": 4447 + }, + { + "epoch": 0.36033700583279327, + "grad_norm": 0.030647702515125275, + "learning_rate": 0.0001801539084649656, + "loss": 0.3517, + "step": 4448 + }, + { + "epoch": 0.3604180168502916, + "grad_norm": 0.03785083442926407, + "learning_rate": 0.00018019441069258812, + "loss": 0.3894, + "step": 4449 + }, + { + "epoch": 0.36049902786779003, + "grad_norm": 0.037119291722774506, + "learning_rate": 0.00018023491292021063, + "loss": 0.3726, + "step": 4450 + }, + { + "epoch": 0.3605800388852884, + "grad_norm": 0.0395854189991951, + "learning_rate": 0.00018027541514783316, + "loss": 0.4297, + "step": 4451 + }, + { + "epoch": 0.3606610499027868, + "grad_norm": 0.03590099513530731, + "learning_rate": 0.00018031591737545567, + "loss": 0.3236, + "step": 4452 + }, + { + "epoch": 0.36074206092028516, + "grad_norm": 0.04296981170773506, + "learning_rate": 0.00018035641960307817, + "loss": 0.3651, + "step": 4453 + }, + { + "epoch": 0.3608230719377835, + "grad_norm": 0.039156656712293625, + "learning_rate": 0.0001803969218307007, + "loss": 0.3636, + "step": 4454 + }, + { + "epoch": 0.3609040829552819, + "grad_norm": 0.040239349007606506, + "learning_rate": 0.0001804374240583232, + "loss": 0.3535, + "step": 4455 + }, + { + "epoch": 0.3609850939727803, + 
"grad_norm": 0.03353238105773926, + "learning_rate": 0.00018047792628594572, + "loss": 0.382, + "step": 4456 + }, + { + "epoch": 0.3610661049902787, + "grad_norm": 0.03441842272877693, + "learning_rate": 0.00018051842851356825, + "loss": 0.3489, + "step": 4457 + }, + { + "epoch": 0.36114711600777705, + "grad_norm": 0.030106976628303528, + "learning_rate": 0.00018055893074119076, + "loss": 0.3159, + "step": 4458 + }, + { + "epoch": 0.36122812702527546, + "grad_norm": 0.03584039956331253, + "learning_rate": 0.0001805994329688133, + "loss": 0.3966, + "step": 4459 + }, + { + "epoch": 0.3613091380427738, + "grad_norm": 0.034742627292871475, + "learning_rate": 0.0001806399351964358, + "loss": 0.3637, + "step": 4460 + }, + { + "epoch": 0.36139014906027217, + "grad_norm": 0.03912995010614395, + "learning_rate": 0.00018068043742405833, + "loss": 0.3234, + "step": 4461 + }, + { + "epoch": 0.3614711600777706, + "grad_norm": 0.03890937939286232, + "learning_rate": 0.00018072093965168084, + "loss": 0.3684, + "step": 4462 + }, + { + "epoch": 0.36155217109526894, + "grad_norm": 0.0334155298769474, + "learning_rate": 0.00018076144187930337, + "loss": 0.3663, + "step": 4463 + }, + { + "epoch": 0.36163318211276735, + "grad_norm": 0.039641525596380234, + "learning_rate": 0.00018080194410692588, + "loss": 0.3978, + "step": 4464 + }, + { + "epoch": 0.3617141931302657, + "grad_norm": 0.036285150796175, + "learning_rate": 0.0001808424463345484, + "loss": 0.4287, + "step": 4465 + }, + { + "epoch": 0.3617952041477641, + "grad_norm": 0.034932248294353485, + "learning_rate": 0.00018088294856217092, + "loss": 0.3866, + "step": 4466 + }, + { + "epoch": 0.36187621516526247, + "grad_norm": 0.03337856009602547, + "learning_rate": 0.00018092345078979345, + "loss": 0.3974, + "step": 4467 + }, + { + "epoch": 0.3619572261827609, + "grad_norm": 0.031295645982027054, + "learning_rate": 0.00018096395301741598, + "loss": 0.3226, + "step": 4468 + }, + { + "epoch": 0.36203823720025924, + "grad_norm": 0.03899591416120529, + "learning_rate": 0.0001810044552450385, + "loss": 0.3586, + "step": 4469 + }, + { + "epoch": 0.3621192482177576, + "grad_norm": 0.03353521227836609, + "learning_rate": 0.00018104495747266102, + "loss": 0.3386, + "step": 4470 + }, + { + "epoch": 0.362200259235256, + "grad_norm": 0.04231454059481621, + "learning_rate": 0.00018108545970028353, + "loss": 0.3845, + "step": 4471 + }, + { + "epoch": 0.36228127025275436, + "grad_norm": 0.030955109745264053, + "learning_rate": 0.00018112596192790606, + "loss": 0.3506, + "step": 4472 + }, + { + "epoch": 0.3623622812702528, + "grad_norm": 0.036558013409376144, + "learning_rate": 0.00018116646415552857, + "loss": 0.4057, + "step": 4473 + }, + { + "epoch": 0.36244329228775113, + "grad_norm": 0.031068559736013412, + "learning_rate": 0.0001812069663831511, + "loss": 0.3392, + "step": 4474 + }, + { + "epoch": 0.36252430330524954, + "grad_norm": 0.03186468034982681, + "learning_rate": 0.0001812474686107736, + "loss": 0.3352, + "step": 4475 + }, + { + "epoch": 0.3626053143227479, + "grad_norm": 0.03339767083525658, + "learning_rate": 0.00018128797083839614, + "loss": 0.3896, + "step": 4476 + }, + { + "epoch": 0.36268632534024625, + "grad_norm": 0.03032582812011242, + "learning_rate": 0.00018132847306601865, + "loss": 0.347, + "step": 4477 + }, + { + "epoch": 0.36276733635774466, + "grad_norm": 0.03296176716685295, + "learning_rate": 0.00018136897529364115, + "loss": 0.3254, + "step": 4478 + }, + { + "epoch": 0.362848347375243, + "grad_norm": 0.0313357375562191, + "learning_rate": 
0.00018140947752126366, + "loss": 0.4099, + "step": 4479 + }, + { + "epoch": 0.36292935839274143, + "grad_norm": 0.03426244109869003, + "learning_rate": 0.0001814499797488862, + "loss": 0.3294, + "step": 4480 + }, + { + "epoch": 0.3630103694102398, + "grad_norm": 0.03199993073940277, + "learning_rate": 0.0001814904819765087, + "loss": 0.3469, + "step": 4481 + }, + { + "epoch": 0.3630913804277382, + "grad_norm": 0.0293571837246418, + "learning_rate": 0.00018153098420413123, + "loss": 0.3434, + "step": 4482 + }, + { + "epoch": 0.36317239144523655, + "grad_norm": 0.03131209686398506, + "learning_rate": 0.00018157148643175374, + "loss": 0.3375, + "step": 4483 + }, + { + "epoch": 0.3632534024627349, + "grad_norm": 0.03087998926639557, + "learning_rate": 0.00018161198865937627, + "loss": 0.355, + "step": 4484 + }, + { + "epoch": 0.3633344134802333, + "grad_norm": 0.030083077028393745, + "learning_rate": 0.00018165249088699878, + "loss": 0.3811, + "step": 4485 + }, + { + "epoch": 0.3634154244977317, + "grad_norm": 0.03256846219301224, + "learning_rate": 0.0001816929931146213, + "loss": 0.3512, + "step": 4486 + }, + { + "epoch": 0.3634964355152301, + "grad_norm": 0.038921091705560684, + "learning_rate": 0.00018173349534224384, + "loss": 0.4197, + "step": 4487 + }, + { + "epoch": 0.36357744653272844, + "grad_norm": 0.03630004823207855, + "learning_rate": 0.00018177399756986635, + "loss": 0.3657, + "step": 4488 + }, + { + "epoch": 0.36365845755022685, + "grad_norm": 0.03309519216418266, + "learning_rate": 0.00018181449979748888, + "loss": 0.3578, + "step": 4489 + }, + { + "epoch": 0.3637394685677252, + "grad_norm": 0.03494489565491676, + "learning_rate": 0.0001818550020251114, + "loss": 0.3547, + "step": 4490 + }, + { + "epoch": 0.36382047958522357, + "grad_norm": 0.0350724458694458, + "learning_rate": 0.00018189550425273392, + "loss": 0.3861, + "step": 4491 + }, + { + "epoch": 0.363901490602722, + "grad_norm": 0.03464026749134064, + "learning_rate": 0.00018193600648035643, + "loss": 0.3652, + "step": 4492 + }, + { + "epoch": 0.36398250162022033, + "grad_norm": 0.030455349013209343, + "learning_rate": 0.00018197650870797896, + "loss": 0.3767, + "step": 4493 + }, + { + "epoch": 0.36406351263771874, + "grad_norm": 0.03616175055503845, + "learning_rate": 0.00018201701093560147, + "loss": 0.3474, + "step": 4494 + }, + { + "epoch": 0.3641445236552171, + "grad_norm": 0.037097640335559845, + "learning_rate": 0.000182057513163224, + "loss": 0.4007, + "step": 4495 + }, + { + "epoch": 0.3642255346727155, + "grad_norm": 0.04117016866803169, + "learning_rate": 0.0001820980153908465, + "loss": 0.3695, + "step": 4496 + }, + { + "epoch": 0.36430654569021387, + "grad_norm": 0.034477025270462036, + "learning_rate": 0.00018213851761846904, + "loss": 0.3537, + "step": 4497 + }, + { + "epoch": 0.3643875567077122, + "grad_norm": 0.03152913972735405, + "learning_rate": 0.00018217901984609155, + "loss": 0.3469, + "step": 4498 + }, + { + "epoch": 0.36446856772521063, + "grad_norm": 0.03316012769937515, + "learning_rate": 0.00018221952207371408, + "loss": 0.3431, + "step": 4499 + }, + { + "epoch": 0.364549578742709, + "grad_norm": 0.03767377510666847, + "learning_rate": 0.00018226002430133658, + "loss": 0.4001, + "step": 4500 + }, + { + "epoch": 0.3646305897602074, + "grad_norm": 0.03182438388466835, + "learning_rate": 0.0001823005265289591, + "loss": 0.3346, + "step": 4501 + }, + { + "epoch": 0.36471160077770576, + "grad_norm": 0.03710455447435379, + "learning_rate": 0.00018234102875658162, + "loss": 0.4222, + "step": 4502 
+ }, + { + "epoch": 0.36479261179520417, + "grad_norm": 0.03305591270327568, + "learning_rate": 0.00018238153098420413, + "loss": 0.3964, + "step": 4503 + }, + { + "epoch": 0.3648736228127025, + "grad_norm": 0.0305950790643692, + "learning_rate": 0.00018242203321182664, + "loss": 0.3369, + "step": 4504 + }, + { + "epoch": 0.3649546338302009, + "grad_norm": 0.03193674981594086, + "learning_rate": 0.00018246253543944917, + "loss": 0.3641, + "step": 4505 + }, + { + "epoch": 0.3650356448476993, + "grad_norm": 0.03355922922492027, + "learning_rate": 0.0001825030376670717, + "loss": 0.4064, + "step": 4506 + }, + { + "epoch": 0.36511665586519765, + "grad_norm": 0.028800426051020622, + "learning_rate": 0.0001825435398946942, + "loss": 0.3606, + "step": 4507 + }, + { + "epoch": 0.36519766688269606, + "grad_norm": 0.039104342460632324, + "learning_rate": 0.00018258404212231674, + "loss": 0.3716, + "step": 4508 + }, + { + "epoch": 0.3652786779001944, + "grad_norm": 0.03672458603978157, + "learning_rate": 0.00018262454434993925, + "loss": 0.3535, + "step": 4509 + }, + { + "epoch": 0.3653596889176928, + "grad_norm": 0.03032134473323822, + "learning_rate": 0.00018266504657756178, + "loss": 0.3762, + "step": 4510 + }, + { + "epoch": 0.3654406999351912, + "grad_norm": 0.03428445756435394, + "learning_rate": 0.0001827055488051843, + "loss": 0.3917, + "step": 4511 + }, + { + "epoch": 0.36552171095268954, + "grad_norm": 0.03314060717821121, + "learning_rate": 0.00018274605103280682, + "loss": 0.3586, + "step": 4512 + }, + { + "epoch": 0.36560272197018795, + "grad_norm": 0.03430025652050972, + "learning_rate": 0.00018278655326042933, + "loss": 0.3634, + "step": 4513 + }, + { + "epoch": 0.3656837329876863, + "grad_norm": 0.030363822355866432, + "learning_rate": 0.00018282705548805186, + "loss": 0.363, + "step": 4514 + }, + { + "epoch": 0.3657647440051847, + "grad_norm": 0.038539864122867584, + "learning_rate": 0.00018286755771567437, + "loss": 0.4033, + "step": 4515 + }, + { + "epoch": 0.3658457550226831, + "grad_norm": 0.03524131700396538, + "learning_rate": 0.0001829080599432969, + "loss": 0.3802, + "step": 4516 + }, + { + "epoch": 0.3659267660401815, + "grad_norm": 0.03943591192364693, + "learning_rate": 0.0001829485621709194, + "loss": 0.4098, + "step": 4517 + }, + { + "epoch": 0.36600777705767984, + "grad_norm": 0.03730255737900734, + "learning_rate": 0.00018298906439854194, + "loss": 0.346, + "step": 4518 + }, + { + "epoch": 0.36608878807517825, + "grad_norm": 0.033342789858579636, + "learning_rate": 0.00018302956662616444, + "loss": 0.3728, + "step": 4519 + }, + { + "epoch": 0.3661697990926766, + "grad_norm": 0.038131117820739746, + "learning_rate": 0.00018307006885378698, + "loss": 0.3615, + "step": 4520 + }, + { + "epoch": 0.36625081011017496, + "grad_norm": 0.04024023562669754, + "learning_rate": 0.00018311057108140948, + "loss": 0.369, + "step": 4521 + }, + { + "epoch": 0.3663318211276734, + "grad_norm": 0.03590629994869232, + "learning_rate": 0.00018315107330903202, + "loss": 0.3693, + "step": 4522 + }, + { + "epoch": 0.36641283214517173, + "grad_norm": 0.03583737835288048, + "learning_rate": 0.00018319157553665452, + "loss": 0.3642, + "step": 4523 + }, + { + "epoch": 0.36649384316267014, + "grad_norm": 0.03601490706205368, + "learning_rate": 0.00018323207776427706, + "loss": 0.3483, + "step": 4524 + }, + { + "epoch": 0.3665748541801685, + "grad_norm": 0.0373217836022377, + "learning_rate": 0.00018327257999189956, + "loss": 0.4122, + "step": 4525 + }, + { + "epoch": 0.3666558651976669, + 
"grad_norm": 0.035433217883110046, + "learning_rate": 0.00018331308221952207, + "loss": 0.3168, + "step": 4526 + }, + { + "epoch": 0.36673687621516526, + "grad_norm": 0.03406085819005966, + "learning_rate": 0.0001833535844471446, + "loss": 0.3609, + "step": 4527 + }, + { + "epoch": 0.3668178872326636, + "grad_norm": 0.03262537717819214, + "learning_rate": 0.0001833940866747671, + "loss": 0.3474, + "step": 4528 + }, + { + "epoch": 0.36689889825016203, + "grad_norm": 0.038104861974716187, + "learning_rate": 0.00018343458890238964, + "loss": 0.376, + "step": 4529 + }, + { + "epoch": 0.3669799092676604, + "grad_norm": 0.037595901638269424, + "learning_rate": 0.00018347509113001215, + "loss": 0.4158, + "step": 4530 + }, + { + "epoch": 0.3670609202851588, + "grad_norm": 0.03141307085752487, + "learning_rate": 0.00018351559335763468, + "loss": 0.3483, + "step": 4531 + }, + { + "epoch": 0.36714193130265715, + "grad_norm": 0.037604913115501404, + "learning_rate": 0.0001835560955852572, + "loss": 0.3513, + "step": 4532 + }, + { + "epoch": 0.36722294232015557, + "grad_norm": 0.03441972658038139, + "learning_rate": 0.00018359659781287972, + "loss": 0.364, + "step": 4533 + }, + { + "epoch": 0.3673039533376539, + "grad_norm": 0.032113075256347656, + "learning_rate": 0.00018363710004050223, + "loss": 0.3318, + "step": 4534 + }, + { + "epoch": 0.3673849643551523, + "grad_norm": 0.03355928510427475, + "learning_rate": 0.00018367760226812476, + "loss": 0.3813, + "step": 4535 + }, + { + "epoch": 0.3674659753726507, + "grad_norm": 0.03720551356673241, + "learning_rate": 0.00018371810449574727, + "loss": 0.3893, + "step": 4536 + }, + { + "epoch": 0.36754698639014904, + "grad_norm": 0.04136877879500389, + "learning_rate": 0.0001837586067233698, + "loss": 0.3667, + "step": 4537 + }, + { + "epoch": 0.36762799740764746, + "grad_norm": 0.03302746266126633, + "learning_rate": 0.0001837991089509923, + "loss": 0.3675, + "step": 4538 + }, + { + "epoch": 0.3677090084251458, + "grad_norm": 0.034325793385505676, + "learning_rate": 0.00018383961117861484, + "loss": 0.3442, + "step": 4539 + }, + { + "epoch": 0.3677900194426442, + "grad_norm": 0.03347557410597801, + "learning_rate": 0.00018388011340623734, + "loss": 0.352, + "step": 4540 + }, + { + "epoch": 0.3678710304601426, + "grad_norm": 0.03631537780165672, + "learning_rate": 0.00018392061563385988, + "loss": 0.3562, + "step": 4541 + }, + { + "epoch": 0.36795204147764093, + "grad_norm": 0.031340762972831726, + "learning_rate": 0.00018396111786148238, + "loss": 0.3775, + "step": 4542 + }, + { + "epoch": 0.36803305249513935, + "grad_norm": 0.029904751107096672, + "learning_rate": 0.00018400162008910492, + "loss": 0.338, + "step": 4543 + }, + { + "epoch": 0.3681140635126377, + "grad_norm": 0.03467988595366478, + "learning_rate": 0.00018404212231672745, + "loss": 0.3809, + "step": 4544 + }, + { + "epoch": 0.3681950745301361, + "grad_norm": 0.03151930868625641, + "learning_rate": 0.00018408262454434996, + "loss": 0.3551, + "step": 4545 + }, + { + "epoch": 0.36827608554763447, + "grad_norm": 0.036787740886211395, + "learning_rate": 0.0001841231267719725, + "loss": 0.3439, + "step": 4546 + }, + { + "epoch": 0.3683570965651329, + "grad_norm": 0.03223686292767525, + "learning_rate": 0.000184163628999595, + "loss": 0.3669, + "step": 4547 + }, + { + "epoch": 0.36843810758263124, + "grad_norm": 0.03125021234154701, + "learning_rate": 0.0001842041312272175, + "loss": 0.3613, + "step": 4548 + }, + { + "epoch": 0.3685191186001296, + "grad_norm": 0.03878352791070938, + "learning_rate": 
0.00018424463345484, + "loss": 0.4096, + "step": 4549 + }, + { + "epoch": 0.368600129617628, + "grad_norm": 0.03440506383776665, + "learning_rate": 0.00018428513568246254, + "loss": 0.3913, + "step": 4550 + }, + { + "epoch": 0.36868114063512636, + "grad_norm": 0.03337797522544861, + "learning_rate": 0.00018432563791008505, + "loss": 0.3498, + "step": 4551 + }, + { + "epoch": 0.36876215165262477, + "grad_norm": 0.035269659012556076, + "learning_rate": 0.00018436614013770758, + "loss": 0.3618, + "step": 4552 + }, + { + "epoch": 0.3688431626701231, + "grad_norm": 0.036833252757787704, + "learning_rate": 0.00018440664236533009, + "loss": 0.4147, + "step": 4553 + }, + { + "epoch": 0.36892417368762154, + "grad_norm": 0.03536658361554146, + "learning_rate": 0.00018444714459295262, + "loss": 0.353, + "step": 4554 + }, + { + "epoch": 0.3690051847051199, + "grad_norm": 0.028857829049229622, + "learning_rate": 0.00018448764682057513, + "loss": 0.3742, + "step": 4555 + }, + { + "epoch": 0.36908619572261825, + "grad_norm": 0.035467978566884995, + "learning_rate": 0.00018452814904819766, + "loss": 0.341, + "step": 4556 + }, + { + "epoch": 0.36916720674011666, + "grad_norm": 0.031414639204740524, + "learning_rate": 0.00018456865127582016, + "loss": 0.3664, + "step": 4557 + }, + { + "epoch": 0.369248217757615, + "grad_norm": 0.03707354515790939, + "learning_rate": 0.0001846091535034427, + "loss": 0.398, + "step": 4558 + }, + { + "epoch": 0.3693292287751134, + "grad_norm": 0.03519902005791664, + "learning_rate": 0.0001846496557310652, + "loss": 0.3708, + "step": 4559 + }, + { + "epoch": 0.3694102397926118, + "grad_norm": 0.03694036602973938, + "learning_rate": 0.00018469015795868774, + "loss": 0.3665, + "step": 4560 + }, + { + "epoch": 0.3694912508101102, + "grad_norm": 0.029678206890821457, + "learning_rate": 0.00018473066018631024, + "loss": 0.3253, + "step": 4561 + }, + { + "epoch": 0.36957226182760855, + "grad_norm": 0.030835380777716637, + "learning_rate": 0.00018477116241393278, + "loss": 0.3339, + "step": 4562 + }, + { + "epoch": 0.36965327284510696, + "grad_norm": 0.03632909059524536, + "learning_rate": 0.0001848116646415553, + "loss": 0.4066, + "step": 4563 + }, + { + "epoch": 0.3697342838626053, + "grad_norm": 0.031888432800769806, + "learning_rate": 0.00018485216686917782, + "loss": 0.3753, + "step": 4564 + }, + { + "epoch": 0.3698152948801037, + "grad_norm": 0.03251507505774498, + "learning_rate": 0.00018489266909680035, + "loss": 0.3445, + "step": 4565 + }, + { + "epoch": 0.3698963058976021, + "grad_norm": 0.03139140456914902, + "learning_rate": 0.00018493317132442286, + "loss": 0.3425, + "step": 4566 + }, + { + "epoch": 0.36997731691510044, + "grad_norm": 0.031516510993242264, + "learning_rate": 0.0001849736735520454, + "loss": 0.3523, + "step": 4567 + }, + { + "epoch": 0.37005832793259885, + "grad_norm": 0.03723737969994545, + "learning_rate": 0.0001850141757796679, + "loss": 0.4167, + "step": 4568 + }, + { + "epoch": 0.3701393389500972, + "grad_norm": 0.03188415989279747, + "learning_rate": 0.00018505467800729043, + "loss": 0.3622, + "step": 4569 + }, + { + "epoch": 0.3702203499675956, + "grad_norm": 0.030478963628411293, + "learning_rate": 0.00018509518023491293, + "loss": 0.3232, + "step": 4570 + }, + { + "epoch": 0.370301360985094, + "grad_norm": 0.0350758358836174, + "learning_rate": 0.00018513568246253544, + "loss": 0.3963, + "step": 4571 + }, + { + "epoch": 0.37038237200259233, + "grad_norm": 0.03737777844071388, + "learning_rate": 0.00018517618469015797, + "loss": 0.38, + "step": 
4572 + }, + { + "epoch": 0.37046338302009074, + "grad_norm": 0.03170812129974365, + "learning_rate": 0.00018521668691778048, + "loss": 0.339, + "step": 4573 + }, + { + "epoch": 0.3705443940375891, + "grad_norm": 0.03591955453157425, + "learning_rate": 0.00018525718914540299, + "loss": 0.4212, + "step": 4574 + }, + { + "epoch": 0.3706254050550875, + "grad_norm": 0.03319928050041199, + "learning_rate": 0.00018529769137302552, + "loss": 0.3712, + "step": 4575 + }, + { + "epoch": 0.37070641607258586, + "grad_norm": 0.030675271525979042, + "learning_rate": 0.00018533819360064803, + "loss": 0.3147, + "step": 4576 + }, + { + "epoch": 0.3707874270900843, + "grad_norm": 0.029982060194015503, + "learning_rate": 0.00018537869582827056, + "loss": 0.3431, + "step": 4577 + }, + { + "epoch": 0.37086843810758263, + "grad_norm": 0.03718116879463196, + "learning_rate": 0.00018541919805589306, + "loss": 0.3588, + "step": 4578 + }, + { + "epoch": 0.370949449125081, + "grad_norm": 0.032471753656864166, + "learning_rate": 0.0001854597002835156, + "loss": 0.3329, + "step": 4579 + }, + { + "epoch": 0.3710304601425794, + "grad_norm": 0.03818058222532272, + "learning_rate": 0.00018550020251113813, + "loss": 0.4274, + "step": 4580 + }, + { + "epoch": 0.37111147116007775, + "grad_norm": 0.03520652651786804, + "learning_rate": 0.00018554070473876064, + "loss": 0.4192, + "step": 4581 + }, + { + "epoch": 0.37119248217757617, + "grad_norm": 0.03470182791352272, + "learning_rate": 0.00018558120696638317, + "loss": 0.3832, + "step": 4582 + }, + { + "epoch": 0.3712734931950745, + "grad_norm": 0.033301129937171936, + "learning_rate": 0.00018562170919400568, + "loss": 0.344, + "step": 4583 + }, + { + "epoch": 0.37135450421257293, + "grad_norm": 0.033769186586141586, + "learning_rate": 0.0001856622114216282, + "loss": 0.3479, + "step": 4584 + }, + { + "epoch": 0.3714355152300713, + "grad_norm": 0.03508080914616585, + "learning_rate": 0.00018570271364925072, + "loss": 0.3664, + "step": 4585 + }, + { + "epoch": 0.37151652624756965, + "grad_norm": 0.033902063965797424, + "learning_rate": 0.00018574321587687325, + "loss": 0.4061, + "step": 4586 + }, + { + "epoch": 0.37159753726506806, + "grad_norm": 0.036879245191812515, + "learning_rate": 0.00018578371810449575, + "loss": 0.3981, + "step": 4587 + }, + { + "epoch": 0.3716785482825664, + "grad_norm": 0.03260815516114235, + "learning_rate": 0.0001858242203321183, + "loss": 0.3279, + "step": 4588 + }, + { + "epoch": 0.3717595593000648, + "grad_norm": 0.029037103056907654, + "learning_rate": 0.0001858647225597408, + "loss": 0.3237, + "step": 4589 + }, + { + "epoch": 0.3718405703175632, + "grad_norm": 0.03038596175611019, + "learning_rate": 0.00018590522478736333, + "loss": 0.3839, + "step": 4590 + }, + { + "epoch": 0.3719215813350616, + "grad_norm": 0.031168343499302864, + "learning_rate": 0.00018594572701498583, + "loss": 0.3289, + "step": 4591 + }, + { + "epoch": 0.37200259235255995, + "grad_norm": 0.03584503382444382, + "learning_rate": 0.00018598622924260837, + "loss": 0.3892, + "step": 4592 + }, + { + "epoch": 0.3720836033700583, + "grad_norm": 0.034035272896289825, + "learning_rate": 0.00018602673147023087, + "loss": 0.3566, + "step": 4593 + }, + { + "epoch": 0.3721646143875567, + "grad_norm": 0.033572208136320114, + "learning_rate": 0.0001860672336978534, + "loss": 0.4105, + "step": 4594 + }, + { + "epoch": 0.37224562540505507, + "grad_norm": 0.03304578363895416, + "learning_rate": 0.0001861077359254759, + "loss": 0.3542, + "step": 4595 + }, + { + "epoch": 0.3723266364225535, + 
"grad_norm": 0.03381304442882538, + "learning_rate": 0.00018614823815309842, + "loss": 0.3696, + "step": 4596 + }, + { + "epoch": 0.37240764744005184, + "grad_norm": 0.03615789860486984, + "learning_rate": 0.00018618874038072095, + "loss": 0.3894, + "step": 4597 + }, + { + "epoch": 0.37248865845755025, + "grad_norm": 0.029093435034155846, + "learning_rate": 0.00018622924260834346, + "loss": 0.3533, + "step": 4598 + }, + { + "epoch": 0.3725696694750486, + "grad_norm": 0.0371689610183239, + "learning_rate": 0.000186269744835966, + "loss": 0.4334, + "step": 4599 + }, + { + "epoch": 0.37265068049254696, + "grad_norm": 0.0313064381480217, + "learning_rate": 0.0001863102470635885, + "loss": 0.3863, + "step": 4600 + }, + { + "epoch": 0.37273169151004537, + "grad_norm": 0.03435458242893219, + "learning_rate": 0.00018635074929121103, + "loss": 0.3883, + "step": 4601 + }, + { + "epoch": 0.3728127025275437, + "grad_norm": 0.04193079471588135, + "learning_rate": 0.00018639125151883354, + "loss": 0.408, + "step": 4602 + }, + { + "epoch": 0.37289371354504214, + "grad_norm": 0.03607318922877312, + "learning_rate": 0.00018643175374645607, + "loss": 0.3325, + "step": 4603 + }, + { + "epoch": 0.3729747245625405, + "grad_norm": 0.03750342130661011, + "learning_rate": 0.00018647225597407858, + "loss": 0.3663, + "step": 4604 + }, + { + "epoch": 0.3730557355800389, + "grad_norm": 0.034173641353845596, + "learning_rate": 0.0001865127582017011, + "loss": 0.3383, + "step": 4605 + }, + { + "epoch": 0.37313674659753726, + "grad_norm": 0.028621068224310875, + "learning_rate": 0.00018655326042932362, + "loss": 0.3288, + "step": 4606 + }, + { + "epoch": 0.3732177576150357, + "grad_norm": 0.038024332374334335, + "learning_rate": 0.00018659376265694615, + "loss": 0.3744, + "step": 4607 + }, + { + "epoch": 0.37329876863253403, + "grad_norm": 0.03312882408499718, + "learning_rate": 0.00018663426488456865, + "loss": 0.4204, + "step": 4608 + }, + { + "epoch": 0.3733797796500324, + "grad_norm": 0.035135842859745026, + "learning_rate": 0.0001866747671121912, + "loss": 0.3515, + "step": 4609 + }, + { + "epoch": 0.3734607906675308, + "grad_norm": 0.037961844354867935, + "learning_rate": 0.0001867152693398137, + "loss": 0.3565, + "step": 4610 + }, + { + "epoch": 0.37354180168502915, + "grad_norm": 0.034696970134973526, + "learning_rate": 0.00018675577156743623, + "loss": 0.3688, + "step": 4611 + }, + { + "epoch": 0.37362281270252756, + "grad_norm": 0.03243953734636307, + "learning_rate": 0.00018679627379505873, + "loss": 0.3631, + "step": 4612 + }, + { + "epoch": 0.3737038237200259, + "grad_norm": 0.032312992960214615, + "learning_rate": 0.00018683677602268127, + "loss": 0.3574, + "step": 4613 + }, + { + "epoch": 0.37378483473752433, + "grad_norm": 0.03381352126598358, + "learning_rate": 0.00018687727825030377, + "loss": 0.3624, + "step": 4614 + }, + { + "epoch": 0.3738658457550227, + "grad_norm": 0.02955443412065506, + "learning_rate": 0.0001869177804779263, + "loss": 0.3256, + "step": 4615 + }, + { + "epoch": 0.37394685677252104, + "grad_norm": 0.03173473849892616, + "learning_rate": 0.0001869582827055488, + "loss": 0.3171, + "step": 4616 + }, + { + "epoch": 0.37402786779001945, + "grad_norm": 0.03345518559217453, + "learning_rate": 0.00018699878493317135, + "loss": 0.4112, + "step": 4617 + }, + { + "epoch": 0.3741088788075178, + "grad_norm": 0.03448694944381714, + "learning_rate": 0.00018703928716079385, + "loss": 0.4033, + "step": 4618 + }, + { + "epoch": 0.3741898898250162, + "grad_norm": 0.03236488997936249, + 
"learning_rate": 0.00018707978938841638, + "loss": 0.3619, + "step": 4619 + }, + { + "epoch": 0.3742709008425146, + "grad_norm": 0.031700026243925095, + "learning_rate": 0.0001871202916160389, + "loss": 0.3548, + "step": 4620 + }, + { + "epoch": 0.374351911860013, + "grad_norm": 0.03522234782576561, + "learning_rate": 0.0001871607938436614, + "loss": 0.3816, + "step": 4621 + }, + { + "epoch": 0.37443292287751134, + "grad_norm": 0.042682599276304245, + "learning_rate": 0.00018720129607128393, + "loss": 0.4125, + "step": 4622 + }, + { + "epoch": 0.3745139338950097, + "grad_norm": 0.031941335648298264, + "learning_rate": 0.00018724179829890644, + "loss": 0.3205, + "step": 4623 + }, + { + "epoch": 0.3745949449125081, + "grad_norm": 0.0366583988070488, + "learning_rate": 0.00018728230052652897, + "loss": 0.3329, + "step": 4624 + }, + { + "epoch": 0.37467595593000647, + "grad_norm": 0.03480471670627594, + "learning_rate": 0.00018732280275415148, + "loss": 0.3354, + "step": 4625 + }, + { + "epoch": 0.3747569669475049, + "grad_norm": 0.038374267518520355, + "learning_rate": 0.000187363304981774, + "loss": 0.3454, + "step": 4626 + }, + { + "epoch": 0.37483797796500323, + "grad_norm": 0.04138772934675217, + "learning_rate": 0.00018740380720939651, + "loss": 0.3712, + "step": 4627 + }, + { + "epoch": 0.37491898898250164, + "grad_norm": 0.03101898916065693, + "learning_rate": 0.00018744430943701905, + "loss": 0.3845, + "step": 4628 + }, + { + "epoch": 0.375, + "grad_norm": 0.0356474332511425, + "learning_rate": 0.00018748481166464155, + "loss": 0.3657, + "step": 4629 + }, + { + "epoch": 0.37508101101749836, + "grad_norm": 0.03429270163178444, + "learning_rate": 0.0001875253138922641, + "loss": 0.3168, + "step": 4630 + }, + { + "epoch": 0.37516202203499677, + "grad_norm": 0.028021546080708504, + "learning_rate": 0.0001875658161198866, + "loss": 0.3497, + "step": 4631 + }, + { + "epoch": 0.3752430330524951, + "grad_norm": 0.03524024412035942, + "learning_rate": 0.00018760631834750913, + "loss": 0.4036, + "step": 4632 + }, + { + "epoch": 0.37532404406999353, + "grad_norm": 0.038602109998464584, + "learning_rate": 0.00018764682057513163, + "loss": 0.3592, + "step": 4633 + }, + { + "epoch": 0.3754050550874919, + "grad_norm": 0.027118699625134468, + "learning_rate": 0.00018768732280275417, + "loss": 0.3417, + "step": 4634 + }, + { + "epoch": 0.3754860661049903, + "grad_norm": 0.031041573733091354, + "learning_rate": 0.00018772782503037667, + "loss": 0.3874, + "step": 4635 + }, + { + "epoch": 0.37556707712248866, + "grad_norm": 0.03967761993408203, + "learning_rate": 0.0001877683272579992, + "loss": 0.3585, + "step": 4636 + }, + { + "epoch": 0.375648088139987, + "grad_norm": 0.036579083651304245, + "learning_rate": 0.00018780882948562174, + "loss": 0.3883, + "step": 4637 + }, + { + "epoch": 0.3757290991574854, + "grad_norm": 0.031344201415777206, + "learning_rate": 0.00018784933171324424, + "loss": 0.36, + "step": 4638 + }, + { + "epoch": 0.3758101101749838, + "grad_norm": 0.033904679119586945, + "learning_rate": 0.00018788983394086678, + "loss": 0.3798, + "step": 4639 + }, + { + "epoch": 0.3758911211924822, + "grad_norm": 0.031038541346788406, + "learning_rate": 0.00018793033616848928, + "loss": 0.3629, + "step": 4640 + }, + { + "epoch": 0.37597213220998055, + "grad_norm": 0.035065874457359314, + "learning_rate": 0.00018797083839611182, + "loss": 0.3545, + "step": 4641 + }, + { + "epoch": 0.37605314322747896, + "grad_norm": 0.03886274993419647, + "learning_rate": 0.00018801134062373432, + "loss": 0.4592, + 
"step": 4642 + }, + { + "epoch": 0.3761341542449773, + "grad_norm": 0.04013144597411156, + "learning_rate": 0.00018805184285135683, + "loss": 0.4282, + "step": 4643 + }, + { + "epoch": 0.37621516526247567, + "grad_norm": 0.044672705233097076, + "learning_rate": 0.00018809234507897934, + "loss": 0.4349, + "step": 4644 + }, + { + "epoch": 0.3762961762799741, + "grad_norm": 0.032288748770952225, + "learning_rate": 0.00018813284730660187, + "loss": 0.3905, + "step": 4645 + }, + { + "epoch": 0.37637718729747244, + "grad_norm": 0.039334528148174286, + "learning_rate": 0.00018817334953422437, + "loss": 0.3949, + "step": 4646 + }, + { + "epoch": 0.37645819831497085, + "grad_norm": 0.04137732461094856, + "learning_rate": 0.0001882138517618469, + "loss": 0.3766, + "step": 4647 + }, + { + "epoch": 0.3765392093324692, + "grad_norm": 0.04196125641465187, + "learning_rate": 0.00018825435398946941, + "loss": 0.4214, + "step": 4648 + }, + { + "epoch": 0.3766202203499676, + "grad_norm": 0.04401590675115585, + "learning_rate": 0.00018829485621709195, + "loss": 0.3992, + "step": 4649 + }, + { + "epoch": 0.37670123136746597, + "grad_norm": 0.03296400606632233, + "learning_rate": 0.00018833535844471445, + "loss": 0.3302, + "step": 4650 + }, + { + "epoch": 0.3767822423849643, + "grad_norm": 0.041027676314115524, + "learning_rate": 0.000188375860672337, + "loss": 0.4114, + "step": 4651 + }, + { + "epoch": 0.37686325340246274, + "grad_norm": 0.030237749218940735, + "learning_rate": 0.0001884163628999595, + "loss": 0.3457, + "step": 4652 + }, + { + "epoch": 0.3769442644199611, + "grad_norm": 0.03659079596400261, + "learning_rate": 0.00018845686512758203, + "loss": 0.3633, + "step": 4653 + }, + { + "epoch": 0.3770252754374595, + "grad_norm": 0.03513143211603165, + "learning_rate": 0.00018849736735520453, + "loss": 0.4143, + "step": 4654 + }, + { + "epoch": 0.37710628645495786, + "grad_norm": 0.033116623759269714, + "learning_rate": 0.00018853786958282707, + "loss": 0.3731, + "step": 4655 + }, + { + "epoch": 0.3771872974724563, + "grad_norm": 0.03905783221125603, + "learning_rate": 0.0001885783718104496, + "loss": 0.373, + "step": 4656 + }, + { + "epoch": 0.37726830848995463, + "grad_norm": 0.02860383875668049, + "learning_rate": 0.0001886188740380721, + "loss": 0.3334, + "step": 4657 + }, + { + "epoch": 0.37734931950745304, + "grad_norm": 0.039546336978673935, + "learning_rate": 0.00018865937626569464, + "loss": 0.3569, + "step": 4658 + }, + { + "epoch": 0.3774303305249514, + "grad_norm": 0.03063320182263851, + "learning_rate": 0.00018869987849331714, + "loss": 0.3908, + "step": 4659 + }, + { + "epoch": 0.37751134154244975, + "grad_norm": 0.03542062267661095, + "learning_rate": 0.00018874038072093968, + "loss": 0.4156, + "step": 4660 + }, + { + "epoch": 0.37759235255994816, + "grad_norm": 0.03663628175854683, + "learning_rate": 0.00018878088294856218, + "loss": 0.3556, + "step": 4661 + }, + { + "epoch": 0.3776733635774465, + "grad_norm": 0.039453811943531036, + "learning_rate": 0.00018882138517618472, + "loss": 0.4064, + "step": 4662 + }, + { + "epoch": 0.37775437459494493, + "grad_norm": 0.03887701779603958, + "learning_rate": 0.00018886188740380722, + "loss": 0.3667, + "step": 4663 + }, + { + "epoch": 0.3778353856124433, + "grad_norm": 0.03565892577171326, + "learning_rate": 0.00018890238963142976, + "loss": 0.3649, + "step": 4664 + }, + { + "epoch": 0.3779163966299417, + "grad_norm": 0.03139740228652954, + "learning_rate": 0.00018894289185905226, + "loss": 0.3656, + "step": 4665 + }, + { + "epoch": 
0.37799740764744005, + "grad_norm": 0.04097292944788933, + "learning_rate": 0.00018898339408667477, + "loss": 0.3594, + "step": 4666 + }, + { + "epoch": 0.3780784186649384, + "grad_norm": 0.04518847167491913, + "learning_rate": 0.0001890238963142973, + "loss": 0.3792, + "step": 4667 + }, + { + "epoch": 0.3781594296824368, + "grad_norm": 0.036011647433042526, + "learning_rate": 0.0001890643985419198, + "loss": 0.3838, + "step": 4668 + }, + { + "epoch": 0.3782404406999352, + "grad_norm": 0.040306150913238525, + "learning_rate": 0.00018910490076954231, + "loss": 0.4105, + "step": 4669 + }, + { + "epoch": 0.3783214517174336, + "grad_norm": 0.03284032270312309, + "learning_rate": 0.00018914540299716485, + "loss": 0.4012, + "step": 4670 + }, + { + "epoch": 0.37840246273493194, + "grad_norm": 0.03616539016366005, + "learning_rate": 0.00018918590522478735, + "loss": 0.33, + "step": 4671 + }, + { + "epoch": 0.37848347375243035, + "grad_norm": 0.04062451422214508, + "learning_rate": 0.00018922640745240989, + "loss": 0.4352, + "step": 4672 + }, + { + "epoch": 0.3785644847699287, + "grad_norm": 0.03321415185928345, + "learning_rate": 0.0001892669096800324, + "loss": 0.4466, + "step": 4673 + }, + { + "epoch": 0.37864549578742707, + "grad_norm": 0.03490709885954857, + "learning_rate": 0.00018930741190765493, + "loss": 0.3705, + "step": 4674 + }, + { + "epoch": 0.3787265068049255, + "grad_norm": 0.03743315488100052, + "learning_rate": 0.00018934791413527746, + "loss": 0.3305, + "step": 4675 + }, + { + "epoch": 0.37880751782242383, + "grad_norm": 0.038661107420921326, + "learning_rate": 0.00018938841636289996, + "loss": 0.3704, + "step": 4676 + }, + { + "epoch": 0.37888852883992225, + "grad_norm": 0.033372364938259125, + "learning_rate": 0.0001894289185905225, + "loss": 0.3775, + "step": 4677 + }, + { + "epoch": 0.3789695398574206, + "grad_norm": 0.031249843537807465, + "learning_rate": 0.000189469420818145, + "loss": 0.3183, + "step": 4678 + }, + { + "epoch": 0.379050550874919, + "grad_norm": 0.034112848341464996, + "learning_rate": 0.00018950992304576754, + "loss": 0.3702, + "step": 4679 + }, + { + "epoch": 0.37913156189241737, + "grad_norm": 0.02745191939175129, + "learning_rate": 0.00018955042527339004, + "loss": 0.3374, + "step": 4680 + }, + { + "epoch": 0.3792125729099157, + "grad_norm": 0.03565063700079918, + "learning_rate": 0.00018959092750101258, + "loss": 0.3862, + "step": 4681 + }, + { + "epoch": 0.37929358392741414, + "grad_norm": 0.030555015429854393, + "learning_rate": 0.00018963142972863508, + "loss": 0.3539, + "step": 4682 + }, + { + "epoch": 0.3793745949449125, + "grad_norm": 0.032852426171302795, + "learning_rate": 0.00018967193195625762, + "loss": 0.3389, + "step": 4683 + }, + { + "epoch": 0.3794556059624109, + "grad_norm": 0.036056555807590485, + "learning_rate": 0.00018971243418388012, + "loss": 0.3736, + "step": 4684 + }, + { + "epoch": 0.37953661697990926, + "grad_norm": 0.03699024021625519, + "learning_rate": 0.00018975293641150266, + "loss": 0.3956, + "step": 4685 + }, + { + "epoch": 0.37961762799740767, + "grad_norm": 0.03291408717632294, + "learning_rate": 0.00018979343863912516, + "loss": 0.3524, + "step": 4686 + }, + { + "epoch": 0.379698639014906, + "grad_norm": 0.03387143090367317, + "learning_rate": 0.0001898339408667477, + "loss": 0.3661, + "step": 4687 + }, + { + "epoch": 0.3797796500324044, + "grad_norm": 0.03629806637763977, + "learning_rate": 0.0001898744430943702, + "loss": 0.3653, + "step": 4688 + }, + { + "epoch": 0.3798606610499028, + "grad_norm": 
0.036638397723436356, + "learning_rate": 0.00018991494532199273, + "loss": 0.3592, + "step": 4689 + }, + { + "epoch": 0.37994167206740115, + "grad_norm": 0.03318271040916443, + "learning_rate": 0.00018995544754961524, + "loss": 0.3674, + "step": 4690 + }, + { + "epoch": 0.38002268308489956, + "grad_norm": 0.03417496010661125, + "learning_rate": 0.00018999594977723775, + "loss": 0.345, + "step": 4691 + }, + { + "epoch": 0.3801036941023979, + "grad_norm": 0.029663294553756714, + "learning_rate": 0.00019003645200486025, + "loss": 0.3648, + "step": 4692 + }, + { + "epoch": 0.3801847051198963, + "grad_norm": 0.03385043144226074, + "learning_rate": 0.00019007695423248279, + "loss": 0.3936, + "step": 4693 + }, + { + "epoch": 0.3802657161373947, + "grad_norm": 0.03586571291089058, + "learning_rate": 0.00019011745646010532, + "loss": 0.4184, + "step": 4694 + }, + { + "epoch": 0.38034672715489304, + "grad_norm": 0.04350770264863968, + "learning_rate": 0.00019015795868772782, + "loss": 0.4485, + "step": 4695 + }, + { + "epoch": 0.38042773817239145, + "grad_norm": 0.03400348499417305, + "learning_rate": 0.00019019846091535036, + "loss": 0.3484, + "step": 4696 + }, + { + "epoch": 0.3805087491898898, + "grad_norm": 0.03798680007457733, + "learning_rate": 0.00019023896314297286, + "loss": 0.3386, + "step": 4697 + }, + { + "epoch": 0.3805897602073882, + "grad_norm": 0.031218959018588066, + "learning_rate": 0.0001902794653705954, + "loss": 0.3039, + "step": 4698 + }, + { + "epoch": 0.3806707712248866, + "grad_norm": 0.031623970717191696, + "learning_rate": 0.0001903199675982179, + "loss": 0.3518, + "step": 4699 + }, + { + "epoch": 0.380751782242385, + "grad_norm": 0.034070685505867004, + "learning_rate": 0.00019036046982584044, + "loss": 0.3822, + "step": 4700 + }, + { + "epoch": 0.38083279325988334, + "grad_norm": 0.03226622939109802, + "learning_rate": 0.00019040097205346294, + "loss": 0.4141, + "step": 4701 + }, + { + "epoch": 0.38091380427738175, + "grad_norm": 0.04006161540746689, + "learning_rate": 0.00019044147428108548, + "loss": 0.3833, + "step": 4702 + }, + { + "epoch": 0.3809948152948801, + "grad_norm": 0.03199142962694168, + "learning_rate": 0.00019048197650870798, + "loss": 0.3583, + "step": 4703 + }, + { + "epoch": 0.38107582631237846, + "grad_norm": 0.033569540828466415, + "learning_rate": 0.00019052247873633052, + "loss": 0.3401, + "step": 4704 + }, + { + "epoch": 0.3811568373298769, + "grad_norm": 0.03711489588022232, + "learning_rate": 0.00019056298096395302, + "loss": 0.4158, + "step": 4705 + }, + { + "epoch": 0.38123784834737523, + "grad_norm": 0.035843729972839355, + "learning_rate": 0.00019060348319157555, + "loss": 0.4146, + "step": 4706 + }, + { + "epoch": 0.38131885936487364, + "grad_norm": 0.03303459659218788, + "learning_rate": 0.00019064398541919806, + "loss": 0.311, + "step": 4707 + }, + { + "epoch": 0.381399870382372, + "grad_norm": 0.03796025365591049, + "learning_rate": 0.0001906844876468206, + "loss": 0.3822, + "step": 4708 + }, + { + "epoch": 0.3814808813998704, + "grad_norm": 0.0355183407664299, + "learning_rate": 0.0001907249898744431, + "loss": 0.4254, + "step": 4709 + }, + { + "epoch": 0.38156189241736876, + "grad_norm": 0.034124333411455154, + "learning_rate": 0.00019076549210206563, + "loss": 0.3445, + "step": 4710 + }, + { + "epoch": 0.3816429034348671, + "grad_norm": 0.03657494857907295, + "learning_rate": 0.00019080599432968814, + "loss": 0.3568, + "step": 4711 + }, + { + "epoch": 0.38172391445236553, + "grad_norm": 0.03229145705699921, + "learning_rate": 
0.00019084649655731067, + "loss": 0.338, + "step": 4712 + }, + { + "epoch": 0.3818049254698639, + "grad_norm": 0.03623109310865402, + "learning_rate": 0.00019088699878493318, + "loss": 0.3769, + "step": 4713 + }, + { + "epoch": 0.3818859364873623, + "grad_norm": 0.03175680711865425, + "learning_rate": 0.00019092750101255569, + "loss": 0.3516, + "step": 4714 + }, + { + "epoch": 0.38196694750486065, + "grad_norm": 0.033988941460847855, + "learning_rate": 0.00019096800324017822, + "loss": 0.35, + "step": 4715 + }, + { + "epoch": 0.38204795852235907, + "grad_norm": 0.03612534701824188, + "learning_rate": 0.00019100850546780072, + "loss": 0.3905, + "step": 4716 + }, + { + "epoch": 0.3821289695398574, + "grad_norm": 0.03572618588805199, + "learning_rate": 0.00019104900769542326, + "loss": 0.365, + "step": 4717 + }, + { + "epoch": 0.3822099805573558, + "grad_norm": 0.02998768351972103, + "learning_rate": 0.00019108950992304576, + "loss": 0.3414, + "step": 4718 + }, + { + "epoch": 0.3822909915748542, + "grad_norm": 0.0337430015206337, + "learning_rate": 0.0001911300121506683, + "loss": 0.3381, + "step": 4719 + }, + { + "epoch": 0.38237200259235254, + "grad_norm": 0.034730587154626846, + "learning_rate": 0.0001911705143782908, + "loss": 0.4033, + "step": 4720 + }, + { + "epoch": 0.38245301360985096, + "grad_norm": 0.03520103171467781, + "learning_rate": 0.00019121101660591334, + "loss": 0.3705, + "step": 4721 + }, + { + "epoch": 0.3825340246273493, + "grad_norm": 0.03200915828347206, + "learning_rate": 0.00019125151883353584, + "loss": 0.3908, + "step": 4722 + }, + { + "epoch": 0.3826150356448477, + "grad_norm": 0.04115212708711624, + "learning_rate": 0.00019129202106115838, + "loss": 0.3934, + "step": 4723 + }, + { + "epoch": 0.3826960466623461, + "grad_norm": 0.0342140719294548, + "learning_rate": 0.00019133252328878088, + "loss": 0.3553, + "step": 4724 + }, + { + "epoch": 0.38277705767984443, + "grad_norm": 0.04393256828188896, + "learning_rate": 0.00019137302551640342, + "loss": 0.3451, + "step": 4725 + }, + { + "epoch": 0.38285806869734285, + "grad_norm": 0.032705143094062805, + "learning_rate": 0.00019141352774402592, + "loss": 0.3812, + "step": 4726 + }, + { + "epoch": 0.3829390797148412, + "grad_norm": 0.028380325064063072, + "learning_rate": 0.00019145402997164845, + "loss": 0.3414, + "step": 4727 + }, + { + "epoch": 0.3830200907323396, + "grad_norm": 0.03669360280036926, + "learning_rate": 0.00019149453219927096, + "loss": 0.3819, + "step": 4728 + }, + { + "epoch": 0.38310110174983797, + "grad_norm": 0.030903344973921776, + "learning_rate": 0.0001915350344268935, + "loss": 0.3984, + "step": 4729 + }, + { + "epoch": 0.3831821127673364, + "grad_norm": 0.035275667905807495, + "learning_rate": 0.000191575536654516, + "loss": 0.3246, + "step": 4730 + }, + { + "epoch": 0.38326312378483474, + "grad_norm": 0.03556999936699867, + "learning_rate": 0.00019161603888213853, + "loss": 0.379, + "step": 4731 + }, + { + "epoch": 0.3833441348023331, + "grad_norm": 0.0267938245087862, + "learning_rate": 0.00019165654110976107, + "loss": 0.3089, + "step": 4732 + }, + { + "epoch": 0.3834251458198315, + "grad_norm": 0.03239806741476059, + "learning_rate": 0.00019169704333738357, + "loss": 0.336, + "step": 4733 + }, + { + "epoch": 0.38350615683732986, + "grad_norm": 0.027912812307476997, + "learning_rate": 0.0001917375455650061, + "loss": 0.3173, + "step": 4734 + }, + { + "epoch": 0.38358716785482827, + "grad_norm": 0.034143056720495224, + "learning_rate": 0.0001917780477926286, + "loss": 0.3243, + "step": 4735 
+ }, + { + "epoch": 0.3836681788723266, + "grad_norm": 0.03252992406487465, + "learning_rate": 0.00019181855002025112, + "loss": 0.3815, + "step": 4736 + }, + { + "epoch": 0.38374918988982504, + "grad_norm": 0.03119749017059803, + "learning_rate": 0.00019185905224787365, + "loss": 0.3537, + "step": 4737 + }, + { + "epoch": 0.3838302009073234, + "grad_norm": 0.033663660287857056, + "learning_rate": 0.00019189955447549616, + "loss": 0.3263, + "step": 4738 + }, + { + "epoch": 0.38391121192482175, + "grad_norm": 0.03443240374326706, + "learning_rate": 0.00019194005670311866, + "loss": 0.4121, + "step": 4739 + }, + { + "epoch": 0.38399222294232016, + "grad_norm": 0.031480759382247925, + "learning_rate": 0.0001919805589307412, + "loss": 0.3269, + "step": 4740 + }, + { + "epoch": 0.3840732339598185, + "grad_norm": 0.03234820440411568, + "learning_rate": 0.0001920210611583637, + "loss": 0.3758, + "step": 4741 + }, + { + "epoch": 0.3841542449773169, + "grad_norm": 0.03727741912007332, + "learning_rate": 0.00019206156338598624, + "loss": 0.3401, + "step": 4742 + }, + { + "epoch": 0.3842352559948153, + "grad_norm": 0.03509011119604111, + "learning_rate": 0.00019210206561360874, + "loss": 0.3712, + "step": 4743 + }, + { + "epoch": 0.3843162670123137, + "grad_norm": 0.03658130764961243, + "learning_rate": 0.00019214256784123128, + "loss": 0.3807, + "step": 4744 + }, + { + "epoch": 0.38439727802981205, + "grad_norm": 0.03258894011378288, + "learning_rate": 0.00019218307006885378, + "loss": 0.3832, + "step": 4745 + }, + { + "epoch": 0.38447828904731046, + "grad_norm": 0.03701847419142723, + "learning_rate": 0.00019222357229647631, + "loss": 0.3734, + "step": 4746 + }, + { + "epoch": 0.3845593000648088, + "grad_norm": 0.028350885957479477, + "learning_rate": 0.00019226407452409882, + "loss": 0.3574, + "step": 4747 + }, + { + "epoch": 0.3846403110823072, + "grad_norm": 0.03285471722483635, + "learning_rate": 0.00019230457675172135, + "loss": 0.3566, + "step": 4748 + }, + { + "epoch": 0.3847213220998056, + "grad_norm": 0.03530566021800041, + "learning_rate": 0.00019234507897934386, + "loss": 0.4205, + "step": 4749 + }, + { + "epoch": 0.38480233311730394, + "grad_norm": 0.037856727838516235, + "learning_rate": 0.0001923855812069664, + "loss": 0.3725, + "step": 4750 + }, + { + "epoch": 0.38488334413480235, + "grad_norm": 0.033076632767915726, + "learning_rate": 0.00019242608343458893, + "loss": 0.3962, + "step": 4751 + }, + { + "epoch": 0.3849643551523007, + "grad_norm": 0.027378113940358162, + "learning_rate": 0.00019246658566221143, + "loss": 0.3608, + "step": 4752 + }, + { + "epoch": 0.3850453661697991, + "grad_norm": 0.03405110165476799, + "learning_rate": 0.00019250708788983397, + "loss": 0.3823, + "step": 4753 + }, + { + "epoch": 0.3851263771872975, + "grad_norm": 0.032293081283569336, + "learning_rate": 0.00019254759011745647, + "loss": 0.3116, + "step": 4754 + }, + { + "epoch": 0.38520738820479583, + "grad_norm": 0.02823660336434841, + "learning_rate": 0.000192588092345079, + "loss": 0.3263, + "step": 4755 + }, + { + "epoch": 0.38528839922229424, + "grad_norm": 0.03242744132876396, + "learning_rate": 0.0001926285945727015, + "loss": 0.3583, + "step": 4756 + }, + { + "epoch": 0.3853694102397926, + "grad_norm": 0.03275972232222557, + "learning_rate": 0.00019266909680032404, + "loss": 0.386, + "step": 4757 + }, + { + "epoch": 0.385450421257291, + "grad_norm": 0.03530082106590271, + "learning_rate": 0.00019270959902794655, + "loss": 0.3903, + "step": 4758 + }, + { + "epoch": 0.38553143227478937, + 
"grad_norm": 0.03208938613533974, + "learning_rate": 0.00019275010125556908, + "loss": 0.4056, + "step": 4759 + }, + { + "epoch": 0.3856124432922878, + "grad_norm": 0.024658476933836937, + "learning_rate": 0.0001927906034831916, + "loss": 0.3006, + "step": 4760 + }, + { + "epoch": 0.38569345430978613, + "grad_norm": 0.028631461784243584, + "learning_rate": 0.0001928311057108141, + "loss": 0.3247, + "step": 4761 + }, + { + "epoch": 0.3857744653272845, + "grad_norm": 0.03463675081729889, + "learning_rate": 0.00019287160793843663, + "loss": 0.361, + "step": 4762 + }, + { + "epoch": 0.3858554763447829, + "grad_norm": 0.03447446972131729, + "learning_rate": 0.00019291211016605914, + "loss": 0.3156, + "step": 4763 + }, + { + "epoch": 0.38593648736228126, + "grad_norm": 0.03354298695921898, + "learning_rate": 0.00019295261239368164, + "loss": 0.3552, + "step": 4764 + }, + { + "epoch": 0.38601749837977967, + "grad_norm": 0.033569224178791046, + "learning_rate": 0.00019299311462130417, + "loss": 0.3036, + "step": 4765 + }, + { + "epoch": 0.386098509397278, + "grad_norm": 0.0391860231757164, + "learning_rate": 0.00019303361684892668, + "loss": 0.3781, + "step": 4766 + }, + { + "epoch": 0.38617952041477643, + "grad_norm": 0.03534352034330368, + "learning_rate": 0.00019307411907654921, + "loss": 0.3552, + "step": 4767 + }, + { + "epoch": 0.3862605314322748, + "grad_norm": 0.03565150871872902, + "learning_rate": 0.00019311462130417172, + "loss": 0.3377, + "step": 4768 + }, + { + "epoch": 0.38634154244977315, + "grad_norm": 0.037752967327833176, + "learning_rate": 0.00019315512353179425, + "loss": 0.3448, + "step": 4769 + }, + { + "epoch": 0.38642255346727156, + "grad_norm": 0.035720087587833405, + "learning_rate": 0.0001931956257594168, + "loss": 0.3834, + "step": 4770 + }, + { + "epoch": 0.3865035644847699, + "grad_norm": 0.03224213048815727, + "learning_rate": 0.0001932361279870393, + "loss": 0.3471, + "step": 4771 + }, + { + "epoch": 0.3865845755022683, + "grad_norm": 0.03377991542220116, + "learning_rate": 0.00019327663021466183, + "loss": 0.3652, + "step": 4772 + }, + { + "epoch": 0.3866655865197667, + "grad_norm": 0.0403682179749012, + "learning_rate": 0.00019331713244228433, + "loss": 0.3504, + "step": 4773 + }, + { + "epoch": 0.3867465975372651, + "grad_norm": 0.030859939754009247, + "learning_rate": 0.00019335763466990687, + "loss": 0.3318, + "step": 4774 + }, + { + "epoch": 0.38682760855476345, + "grad_norm": 0.032264601439237595, + "learning_rate": 0.00019339813689752937, + "loss": 0.3543, + "step": 4775 + }, + { + "epoch": 0.3869086195722618, + "grad_norm": 0.035585030913352966, + "learning_rate": 0.0001934386391251519, + "loss": 0.3883, + "step": 4776 + }, + { + "epoch": 0.3869896305897602, + "grad_norm": 0.03770442306995392, + "learning_rate": 0.0001934791413527744, + "loss": 0.3422, + "step": 4777 + }, + { + "epoch": 0.38707064160725857, + "grad_norm": 0.033499933779239655, + "learning_rate": 0.00019351964358039694, + "loss": 0.3829, + "step": 4778 + }, + { + "epoch": 0.387151652624757, + "grad_norm": 0.03047235682606697, + "learning_rate": 0.00019356014580801945, + "loss": 0.3219, + "step": 4779 + }, + { + "epoch": 0.38723266364225534, + "grad_norm": 0.032883550971746445, + "learning_rate": 0.00019360064803564198, + "loss": 0.339, + "step": 4780 + }, + { + "epoch": 0.38731367465975375, + "grad_norm": 0.03074515610933304, + "learning_rate": 0.0001936411502632645, + "loss": 0.3819, + "step": 4781 + }, + { + "epoch": 0.3873946856772521, + "grad_norm": 0.03032887913286686, + 
"learning_rate": 0.00019368165249088702, + "loss": 0.4051, + "step": 4782 + }, + { + "epoch": 0.38747569669475046, + "grad_norm": 0.03172651305794716, + "learning_rate": 0.00019372215471850953, + "loss": 0.3666, + "step": 4783 + }, + { + "epoch": 0.38755670771224887, + "grad_norm": 0.036670222878456116, + "learning_rate": 0.00019376265694613206, + "loss": 0.4072, + "step": 4784 + }, + { + "epoch": 0.3876377187297472, + "grad_norm": 0.03976541385054588, + "learning_rate": 0.00019380315917375457, + "loss": 0.4105, + "step": 4785 + }, + { + "epoch": 0.38771872974724564, + "grad_norm": 0.03309793770313263, + "learning_rate": 0.00019384366140137707, + "loss": 0.3866, + "step": 4786 + }, + { + "epoch": 0.387799740764744, + "grad_norm": 0.044504258781671524, + "learning_rate": 0.00019388416362899958, + "loss": 0.3373, + "step": 4787 + }, + { + "epoch": 0.3878807517822424, + "grad_norm": 0.02997519262135029, + "learning_rate": 0.0001939246658566221, + "loss": 0.3295, + "step": 4788 + }, + { + "epoch": 0.38796176279974076, + "grad_norm": 0.03902506083250046, + "learning_rate": 0.00019396516808424465, + "loss": 0.3407, + "step": 4789 + }, + { + "epoch": 0.3880427738172391, + "grad_norm": 0.02696751430630684, + "learning_rate": 0.00019400567031186715, + "loss": 0.3298, + "step": 4790 + }, + { + "epoch": 0.38812378483473753, + "grad_norm": 0.03264418616890907, + "learning_rate": 0.00019404617253948969, + "loss": 0.3433, + "step": 4791 + }, + { + "epoch": 0.3882047958522359, + "grad_norm": 0.029885264113545418, + "learning_rate": 0.0001940866747671122, + "loss": 0.3605, + "step": 4792 + }, + { + "epoch": 0.3882858068697343, + "grad_norm": 0.03691829741001129, + "learning_rate": 0.00019412717699473473, + "loss": 0.3941, + "step": 4793 + }, + { + "epoch": 0.38836681788723265, + "grad_norm": 0.02847517654299736, + "learning_rate": 0.00019416767922235723, + "loss": 0.3088, + "step": 4794 + }, + { + "epoch": 0.38844782890473106, + "grad_norm": 0.03323289379477501, + "learning_rate": 0.00019420818144997976, + "loss": 0.3716, + "step": 4795 + }, + { + "epoch": 0.3885288399222294, + "grad_norm": 0.04099169000983238, + "learning_rate": 0.00019424868367760227, + "loss": 0.3842, + "step": 4796 + }, + { + "epoch": 0.38860985093972783, + "grad_norm": 0.03556416183710098, + "learning_rate": 0.0001942891859052248, + "loss": 0.3681, + "step": 4797 + }, + { + "epoch": 0.3886908619572262, + "grad_norm": 0.029316971078515053, + "learning_rate": 0.0001943296881328473, + "loss": 0.3185, + "step": 4798 + }, + { + "epoch": 0.38877187297472454, + "grad_norm": 0.034768927842378616, + "learning_rate": 0.00019437019036046984, + "loss": 0.3992, + "step": 4799 + }, + { + "epoch": 0.38885288399222295, + "grad_norm": 0.03491413965821266, + "learning_rate": 0.00019441069258809235, + "loss": 0.3631, + "step": 4800 + }, + { + "epoch": 0.3889338950097213, + "grad_norm": 0.0341835580766201, + "learning_rate": 0.00019445119481571488, + "loss": 0.3734, + "step": 4801 + }, + { + "epoch": 0.3890149060272197, + "grad_norm": 0.0318155474960804, + "learning_rate": 0.0001944916970433374, + "loss": 0.3113, + "step": 4802 + }, + { + "epoch": 0.3890959170447181, + "grad_norm": 0.030088962987065315, + "learning_rate": 0.00019453219927095992, + "loss": 0.3346, + "step": 4803 + }, + { + "epoch": 0.3891769280622165, + "grad_norm": 0.03774889186024666, + "learning_rate": 0.00019457270149858243, + "loss": 0.3834, + "step": 4804 + }, + { + "epoch": 0.38925793907971484, + "grad_norm": 0.036289505660533905, + "learning_rate": 0.00019461320372620496, + 
"loss": 0.3579, + "step": 4805 + }, + { + "epoch": 0.3893389500972132, + "grad_norm": 0.03278936818242073, + "learning_rate": 0.00019465370595382747, + "loss": 0.3482, + "step": 4806 + }, + { + "epoch": 0.3894199611147116, + "grad_norm": 0.032946351915597916, + "learning_rate": 0.00019469420818145, + "loss": 0.3933, + "step": 4807 + }, + { + "epoch": 0.38950097213220997, + "grad_norm": 0.031581562012434006, + "learning_rate": 0.0001947347104090725, + "loss": 0.3418, + "step": 4808 + }, + { + "epoch": 0.3895819831497084, + "grad_norm": 0.03233380988240242, + "learning_rate": 0.000194775212636695, + "loss": 0.3888, + "step": 4809 + }, + { + "epoch": 0.38966299416720673, + "grad_norm": 0.031648050993680954, + "learning_rate": 0.00019481571486431755, + "loss": 0.3344, + "step": 4810 + }, + { + "epoch": 0.38974400518470514, + "grad_norm": 0.029684975743293762, + "learning_rate": 0.00019485621709194005, + "loss": 0.3455, + "step": 4811 + }, + { + "epoch": 0.3898250162022035, + "grad_norm": 0.03547152504324913, + "learning_rate": 0.00019489671931956259, + "loss": 0.3635, + "step": 4812 + }, + { + "epoch": 0.38990602721970186, + "grad_norm": 0.031869180500507355, + "learning_rate": 0.0001949372215471851, + "loss": 0.3015, + "step": 4813 + }, + { + "epoch": 0.38998703823720027, + "grad_norm": 0.031538963317871094, + "learning_rate": 0.00019497772377480762, + "loss": 0.3453, + "step": 4814 + }, + { + "epoch": 0.3900680492546986, + "grad_norm": 0.02930673398077488, + "learning_rate": 0.00019501822600243013, + "loss": 0.3513, + "step": 4815 + }, + { + "epoch": 0.39014906027219703, + "grad_norm": 0.028427084907889366, + "learning_rate": 0.00019505872823005266, + "loss": 0.3621, + "step": 4816 + }, + { + "epoch": 0.3902300712896954, + "grad_norm": 0.028040811419487, + "learning_rate": 0.00019509923045767517, + "loss": 0.3447, + "step": 4817 + }, + { + "epoch": 0.3903110823071938, + "grad_norm": 0.033528126776218414, + "learning_rate": 0.0001951397326852977, + "loss": 0.3731, + "step": 4818 + }, + { + "epoch": 0.39039209332469216, + "grad_norm": 0.03092779777944088, + "learning_rate": 0.0001951802349129202, + "loss": 0.3436, + "step": 4819 + }, + { + "epoch": 0.3904731043421905, + "grad_norm": 0.032316166907548904, + "learning_rate": 0.00019522073714054274, + "loss": 0.3815, + "step": 4820 + }, + { + "epoch": 0.3905541153596889, + "grad_norm": 0.03477739542722702, + "learning_rate": 0.00019526123936816525, + "loss": 0.372, + "step": 4821 + }, + { + "epoch": 0.3906351263771873, + "grad_norm": 0.03641634061932564, + "learning_rate": 0.00019530174159578778, + "loss": 0.3085, + "step": 4822 + }, + { + "epoch": 0.3907161373946857, + "grad_norm": 0.03308749943971634, + "learning_rate": 0.0001953422438234103, + "loss": 0.3569, + "step": 4823 + }, + { + "epoch": 0.39079714841218405, + "grad_norm": 0.03331739082932472, + "learning_rate": 0.00019538274605103282, + "loss": 0.4018, + "step": 4824 + }, + { + "epoch": 0.39087815942968246, + "grad_norm": 0.030000533908605576, + "learning_rate": 0.00019542324827865535, + "loss": 0.375, + "step": 4825 + }, + { + "epoch": 0.3909591704471808, + "grad_norm": 0.03186730295419693, + "learning_rate": 0.00019546375050627786, + "loss": 0.3682, + "step": 4826 + }, + { + "epoch": 0.39104018146467917, + "grad_norm": 0.030771762132644653, + "learning_rate": 0.0001955042527339004, + "loss": 0.4015, + "step": 4827 + }, + { + "epoch": 0.3911211924821776, + "grad_norm": 0.031149568036198616, + "learning_rate": 0.0001955447549615229, + "loss": 0.38, + "step": 4828 + }, + { + "epoch": 
0.39120220349967594, + "grad_norm": 0.030003225430846214, + "learning_rate": 0.00019558525718914543, + "loss": 0.3638, + "step": 4829 + }, + { + "epoch": 0.39128321451717435, + "grad_norm": 0.03363025560975075, + "learning_rate": 0.00019562575941676794, + "loss": 0.3597, + "step": 4830 + }, + { + "epoch": 0.3913642255346727, + "grad_norm": 0.03597572073340416, + "learning_rate": 0.00019566626164439045, + "loss": 0.375, + "step": 4831 + }, + { + "epoch": 0.3914452365521711, + "grad_norm": 0.03270312398672104, + "learning_rate": 0.00019570676387201298, + "loss": 0.3947, + "step": 4832 + }, + { + "epoch": 0.39152624756966947, + "grad_norm": 0.03423533961176872, + "learning_rate": 0.00019574726609963549, + "loss": 0.3833, + "step": 4833 + }, + { + "epoch": 0.39160725858716783, + "grad_norm": 0.03765803575515747, + "learning_rate": 0.000195787768327258, + "loss": 0.3912, + "step": 4834 + }, + { + "epoch": 0.39168826960466624, + "grad_norm": 0.03126109763979912, + "learning_rate": 0.00019582827055488052, + "loss": 0.3802, + "step": 4835 + }, + { + "epoch": 0.3917692806221646, + "grad_norm": 0.029166357591748238, + "learning_rate": 0.00019586877278250303, + "loss": 0.3504, + "step": 4836 + }, + { + "epoch": 0.391850291639663, + "grad_norm": 0.042333848774433136, + "learning_rate": 0.00019590927501012556, + "loss": 0.3482, + "step": 4837 + }, + { + "epoch": 0.39193130265716136, + "grad_norm": 0.03157378360629082, + "learning_rate": 0.00019594977723774807, + "loss": 0.3392, + "step": 4838 + }, + { + "epoch": 0.3920123136746598, + "grad_norm": 0.03886628523468971, + "learning_rate": 0.0001959902794653706, + "loss": 0.407, + "step": 4839 + }, + { + "epoch": 0.39209332469215813, + "grad_norm": 0.0372137650847435, + "learning_rate": 0.0001960307816929931, + "loss": 0.3892, + "step": 4840 + }, + { + "epoch": 0.39217433570965654, + "grad_norm": 0.03133906424045563, + "learning_rate": 0.00019607128392061564, + "loss": 0.3363, + "step": 4841 + }, + { + "epoch": 0.3922553467271549, + "grad_norm": 0.031031839549541473, + "learning_rate": 0.00019611178614823815, + "loss": 0.3423, + "step": 4842 + }, + { + "epoch": 0.39233635774465325, + "grad_norm": 0.031718261539936066, + "learning_rate": 0.00019615228837586068, + "loss": 0.3545, + "step": 4843 + }, + { + "epoch": 0.39241736876215166, + "grad_norm": 0.03188610449433327, + "learning_rate": 0.00019619279060348321, + "loss": 0.3426, + "step": 4844 + }, + { + "epoch": 0.39249837977965, + "grad_norm": 0.039292700588703156, + "learning_rate": 0.00019623329283110572, + "loss": 0.3579, + "step": 4845 + }, + { + "epoch": 0.39257939079714843, + "grad_norm": 0.03364202752709389, + "learning_rate": 0.00019627379505872825, + "loss": 0.4005, + "step": 4846 + }, + { + "epoch": 0.3926604018146468, + "grad_norm": 0.03354499489068985, + "learning_rate": 0.00019631429728635076, + "loss": 0.3804, + "step": 4847 + }, + { + "epoch": 0.3927414128321452, + "grad_norm": 0.032901547849178314, + "learning_rate": 0.0001963547995139733, + "loss": 0.3656, + "step": 4848 + }, + { + "epoch": 0.39282242384964355, + "grad_norm": 0.042210184037685394, + "learning_rate": 0.0001963953017415958, + "loss": 0.3377, + "step": 4849 + }, + { + "epoch": 0.3929034348671419, + "grad_norm": 0.034993741661310196, + "learning_rate": 0.00019643580396921833, + "loss": 0.3423, + "step": 4850 + }, + { + "epoch": 0.3929844458846403, + "grad_norm": 0.03247562050819397, + "learning_rate": 0.00019647630619684084, + "loss": 0.3361, + "step": 4851 + }, + { + "epoch": 0.3930654569021387, + "grad_norm": 
0.0300375297665596, + "learning_rate": 0.00019651680842446337, + "loss": 0.3598, + "step": 4852 + }, + { + "epoch": 0.3931464679196371, + "grad_norm": 0.037401050329208374, + "learning_rate": 0.00019655731065208588, + "loss": 0.432, + "step": 4853 + }, + { + "epoch": 0.39322747893713544, + "grad_norm": 0.02842889353632927, + "learning_rate": 0.0001965978128797084, + "loss": 0.388, + "step": 4854 + }, + { + "epoch": 0.39330848995463386, + "grad_norm": 0.0326065756380558, + "learning_rate": 0.00019663831510733092, + "loss": 0.3725, + "step": 4855 + }, + { + "epoch": 0.3933895009721322, + "grad_norm": 0.02978591062128544, + "learning_rate": 0.00019667881733495342, + "loss": 0.3362, + "step": 4856 + }, + { + "epoch": 0.39347051198963057, + "grad_norm": 0.03818797692656517, + "learning_rate": 0.00019671931956257593, + "loss": 0.3322, + "step": 4857 + }, + { + "epoch": 0.393551523007129, + "grad_norm": 0.03465355560183525, + "learning_rate": 0.00019675982179019846, + "loss": 0.3494, + "step": 4858 + }, + { + "epoch": 0.39363253402462733, + "grad_norm": 0.03535063937306404, + "learning_rate": 0.00019680032401782097, + "loss": 0.3486, + "step": 4859 + }, + { + "epoch": 0.39371354504212575, + "grad_norm": 0.03481747955083847, + "learning_rate": 0.0001968408262454435, + "loss": 0.3884, + "step": 4860 + }, + { + "epoch": 0.3937945560596241, + "grad_norm": 0.03376191481947899, + "learning_rate": 0.000196881328473066, + "loss": 0.3584, + "step": 4861 + }, + { + "epoch": 0.3938755670771225, + "grad_norm": 0.03966887295246124, + "learning_rate": 0.00019692183070068854, + "loss": 0.3825, + "step": 4862 + }, + { + "epoch": 0.39395657809462087, + "grad_norm": 0.03059571236371994, + "learning_rate": 0.00019696233292831108, + "loss": 0.3963, + "step": 4863 + }, + { + "epoch": 0.3940375891121192, + "grad_norm": 0.03555373474955559, + "learning_rate": 0.00019700283515593358, + "loss": 0.3657, + "step": 4864 + }, + { + "epoch": 0.39411860012961764, + "grad_norm": 0.034829266369342804, + "learning_rate": 0.00019704333738355611, + "loss": 0.3778, + "step": 4865 + }, + { + "epoch": 0.394199611147116, + "grad_norm": 0.04059548303484917, + "learning_rate": 0.00019708383961117862, + "loss": 0.3893, + "step": 4866 + }, + { + "epoch": 0.3942806221646144, + "grad_norm": 0.03126801922917366, + "learning_rate": 0.00019712434183880115, + "loss": 0.3812, + "step": 4867 + }, + { + "epoch": 0.39436163318211276, + "grad_norm": 0.04134489223361015, + "learning_rate": 0.00019716484406642366, + "loss": 0.4021, + "step": 4868 + }, + { + "epoch": 0.39444264419961117, + "grad_norm": 0.03955182060599327, + "learning_rate": 0.0001972053462940462, + "loss": 0.3436, + "step": 4869 + }, + { + "epoch": 0.3945236552171095, + "grad_norm": 0.04063299670815468, + "learning_rate": 0.0001972458485216687, + "loss": 0.4133, + "step": 4870 + }, + { + "epoch": 0.3946046662346079, + "grad_norm": 0.03148097172379494, + "learning_rate": 0.00019728635074929123, + "loss": 0.3633, + "step": 4871 + }, + { + "epoch": 0.3946856772521063, + "grad_norm": 0.03044656105339527, + "learning_rate": 0.00019732685297691374, + "loss": 0.3714, + "step": 4872 + }, + { + "epoch": 0.39476668826960465, + "grad_norm": 0.032794658094644547, + "learning_rate": 0.00019736735520453627, + "loss": 0.3534, + "step": 4873 + }, + { + "epoch": 0.39484769928710306, + "grad_norm": 0.033350620418787, + "learning_rate": 0.00019740785743215878, + "loss": 0.401, + "step": 4874 + }, + { + "epoch": 0.3949287103046014, + "grad_norm": 0.030544232577085495, + "learning_rate": 
0.0001974483596597813, + "loss": 0.3758, + "step": 4875 + }, + { + "epoch": 0.3950097213220998, + "grad_norm": 0.03313911333680153, + "learning_rate": 0.00019748886188740382, + "loss": 0.3789, + "step": 4876 + }, + { + "epoch": 0.3950907323395982, + "grad_norm": 0.02834252268075943, + "learning_rate": 0.00019752936411502635, + "loss": 0.3096, + "step": 4877 + }, + { + "epoch": 0.39517174335709654, + "grad_norm": 0.03375416249036789, + "learning_rate": 0.00019756986634264886, + "loss": 0.3144, + "step": 4878 + }, + { + "epoch": 0.39525275437459495, + "grad_norm": 0.033389367163181305, + "learning_rate": 0.00019761036857027136, + "loss": 0.3339, + "step": 4879 + }, + { + "epoch": 0.3953337653920933, + "grad_norm": 0.03308767452836037, + "learning_rate": 0.0001976508707978939, + "loss": 0.3632, + "step": 4880 + }, + { + "epoch": 0.3954147764095917, + "grad_norm": 0.03741266205906868, + "learning_rate": 0.0001976913730255164, + "loss": 0.375, + "step": 4881 + }, + { + "epoch": 0.3954957874270901, + "grad_norm": 0.03020811639726162, + "learning_rate": 0.00019773187525313894, + "loss": 0.357, + "step": 4882 + }, + { + "epoch": 0.3955767984445885, + "grad_norm": 0.042745232582092285, + "learning_rate": 0.00019777237748076144, + "loss": 0.3791, + "step": 4883 + }, + { + "epoch": 0.39565780946208684, + "grad_norm": 0.03163204714655876, + "learning_rate": 0.00019781287970838397, + "loss": 0.3984, + "step": 4884 + }, + { + "epoch": 0.3957388204795852, + "grad_norm": 0.037180181592702866, + "learning_rate": 0.00019785338193600648, + "loss": 0.3619, + "step": 4885 + }, + { + "epoch": 0.3958198314970836, + "grad_norm": 0.033253274857997894, + "learning_rate": 0.00019789388416362901, + "loss": 0.3699, + "step": 4886 + }, + { + "epoch": 0.39590084251458196, + "grad_norm": 0.03679104149341583, + "learning_rate": 0.00019793438639125152, + "loss": 0.3738, + "step": 4887 + }, + { + "epoch": 0.3959818535320804, + "grad_norm": 0.033668775111436844, + "learning_rate": 0.00019797488861887405, + "loss": 0.3771, + "step": 4888 + }, + { + "epoch": 0.39606286454957873, + "grad_norm": 0.03718390315771103, + "learning_rate": 0.00019801539084649656, + "loss": 0.3656, + "step": 4889 + }, + { + "epoch": 0.39614387556707714, + "grad_norm": 0.03134991228580475, + "learning_rate": 0.0001980558930741191, + "loss": 0.369, + "step": 4890 + }, + { + "epoch": 0.3962248865845755, + "grad_norm": 0.029683059081435204, + "learning_rate": 0.0001980963953017416, + "loss": 0.3571, + "step": 4891 + }, + { + "epoch": 0.3963058976020739, + "grad_norm": 0.03865545615553856, + "learning_rate": 0.00019813689752936413, + "loss": 0.3787, + "step": 4892 + }, + { + "epoch": 0.39638690861957226, + "grad_norm": 0.031773313879966736, + "learning_rate": 0.00019817739975698664, + "loss": 0.3539, + "step": 4893 + }, + { + "epoch": 0.3964679196370706, + "grad_norm": 0.03601973503828049, + "learning_rate": 0.00019821790198460917, + "loss": 0.3613, + "step": 4894 + }, + { + "epoch": 0.39654893065456903, + "grad_norm": 0.034716833382844925, + "learning_rate": 0.00019825840421223168, + "loss": 0.3715, + "step": 4895 + }, + { + "epoch": 0.3966299416720674, + "grad_norm": 0.036405500024557114, + "learning_rate": 0.0001982989064398542, + "loss": 0.3478, + "step": 4896 + }, + { + "epoch": 0.3967109526895658, + "grad_norm": 0.0369153767824173, + "learning_rate": 0.00019833940866747672, + "loss": 0.402, + "step": 4897 + }, + { + "epoch": 0.39679196370706415, + "grad_norm": 0.03298967704176903, + "learning_rate": 0.00019837991089509925, + "loss": 0.3493, + "step": 
4898 + }, + { + "epoch": 0.39687297472456257, + "grad_norm": 0.0389755517244339, + "learning_rate": 0.00019842041312272176, + "loss": 0.3571, + "step": 4899 + }, + { + "epoch": 0.3969539857420609, + "grad_norm": 0.030207008123397827, + "learning_rate": 0.0001984609153503443, + "loss": 0.3154, + "step": 4900 + }, + { + "epoch": 0.3970349967595593, + "grad_norm": 0.032016366720199585, + "learning_rate": 0.0001985014175779668, + "loss": 0.386, + "step": 4901 + }, + { + "epoch": 0.3971160077770577, + "grad_norm": 0.03134904429316521, + "learning_rate": 0.00019854191980558933, + "loss": 0.3251, + "step": 4902 + }, + { + "epoch": 0.39719701879455604, + "grad_norm": 0.048683155328035355, + "learning_rate": 0.00019858242203321183, + "loss": 0.3897, + "step": 4903 + }, + { + "epoch": 0.39727802981205446, + "grad_norm": 0.0321296788752079, + "learning_rate": 0.00019862292426083434, + "loss": 0.3607, + "step": 4904 + }, + { + "epoch": 0.3973590408295528, + "grad_norm": 0.03436422720551491, + "learning_rate": 0.00019866342648845687, + "loss": 0.344, + "step": 4905 + }, + { + "epoch": 0.3974400518470512, + "grad_norm": 0.03357069194316864, + "learning_rate": 0.00019870392871607938, + "loss": 0.3771, + "step": 4906 + }, + { + "epoch": 0.3975210628645496, + "grad_norm": 0.035833247005939484, + "learning_rate": 0.0001987444309437019, + "loss": 0.3856, + "step": 4907 + }, + { + "epoch": 0.39760207388204793, + "grad_norm": 0.032385412603616714, + "learning_rate": 0.00019878493317132442, + "loss": 0.3829, + "step": 4908 + }, + { + "epoch": 0.39768308489954635, + "grad_norm": 0.05283757671713829, + "learning_rate": 0.00019882543539894695, + "loss": 0.4404, + "step": 4909 + }, + { + "epoch": 0.3977640959170447, + "grad_norm": 0.03171170875430107, + "learning_rate": 0.00019886593762656946, + "loss": 0.314, + "step": 4910 + }, + { + "epoch": 0.3978451069345431, + "grad_norm": 0.04072034731507301, + "learning_rate": 0.000198906439854192, + "loss": 0.378, + "step": 4911 + }, + { + "epoch": 0.39792611795204147, + "grad_norm": 0.03641786426305771, + "learning_rate": 0.0001989469420818145, + "loss": 0.3439, + "step": 4912 + }, + { + "epoch": 0.3980071289695399, + "grad_norm": 0.03854703903198242, + "learning_rate": 0.00019898744430943703, + "loss": 0.3843, + "step": 4913 + }, + { + "epoch": 0.39808813998703824, + "grad_norm": 0.031505096703767776, + "learning_rate": 0.00019902794653705954, + "loss": 0.3507, + "step": 4914 + }, + { + "epoch": 0.3981691510045366, + "grad_norm": 0.03177189454436302, + "learning_rate": 0.00019906844876468207, + "loss": 0.3834, + "step": 4915 + }, + { + "epoch": 0.398250162022035, + "grad_norm": 0.03535529226064682, + "learning_rate": 0.00019910895099230458, + "loss": 0.3542, + "step": 4916 + }, + { + "epoch": 0.39833117303953336, + "grad_norm": 0.029137831181287766, + "learning_rate": 0.0001991494532199271, + "loss": 0.3283, + "step": 4917 + }, + { + "epoch": 0.39841218405703177, + "grad_norm": 0.032600197941064835, + "learning_rate": 0.00019918995544754962, + "loss": 0.4159, + "step": 4918 + }, + { + "epoch": 0.3984931950745301, + "grad_norm": 0.03417445346713066, + "learning_rate": 0.00019923045767517215, + "loss": 0.3524, + "step": 4919 + }, + { + "epoch": 0.39857420609202854, + "grad_norm": 0.037261370569467545, + "learning_rate": 0.00019927095990279468, + "loss": 0.3115, + "step": 4920 + }, + { + "epoch": 0.3986552171095269, + "grad_norm": 0.036581240594387054, + "learning_rate": 0.0001993114621304172, + "loss": 0.3512, + "step": 4921 + }, + { + "epoch": 0.39873622812702525, + 
"grad_norm": 0.03378806263208389, + "learning_rate": 0.00019935196435803972, + "loss": 0.3433, + "step": 4922 + }, + { + "epoch": 0.39881723914452366, + "grad_norm": 0.0315818227827549, + "learning_rate": 0.00019939246658566223, + "loss": 0.348, + "step": 4923 + }, + { + "epoch": 0.398898250162022, + "grad_norm": 0.03489783778786659, + "learning_rate": 0.00019943296881328476, + "loss": 0.3581, + "step": 4924 + }, + { + "epoch": 0.39897926117952043, + "grad_norm": 0.030317939817905426, + "learning_rate": 0.00019947347104090727, + "loss": 0.3354, + "step": 4925 + }, + { + "epoch": 0.3990602721970188, + "grad_norm": 0.03686142712831497, + "learning_rate": 0.00019951397326852977, + "loss": 0.3672, + "step": 4926 + }, + { + "epoch": 0.3991412832145172, + "grad_norm": 0.028810808435082436, + "learning_rate": 0.0001995544754961523, + "loss": 0.3068, + "step": 4927 + }, + { + "epoch": 0.39922229423201555, + "grad_norm": 0.043715715408325195, + "learning_rate": 0.0001995949777237748, + "loss": 0.394, + "step": 4928 + }, + { + "epoch": 0.3993033052495139, + "grad_norm": 0.034423165023326874, + "learning_rate": 0.00019963547995139732, + "loss": 0.3658, + "step": 4929 + }, + { + "epoch": 0.3993843162670123, + "grad_norm": 0.03472477197647095, + "learning_rate": 0.00019967598217901985, + "loss": 0.3751, + "step": 4930 + }, + { + "epoch": 0.3994653272845107, + "grad_norm": 0.03477395325899124, + "learning_rate": 0.00019971648440664236, + "loss": 0.3521, + "step": 4931 + }, + { + "epoch": 0.3995463383020091, + "grad_norm": 0.03423608839511871, + "learning_rate": 0.0001997569866342649, + "loss": 0.366, + "step": 4932 + }, + { + "epoch": 0.39962734931950744, + "grad_norm": 0.030516432598233223, + "learning_rate": 0.0001997974888618874, + "loss": 0.3782, + "step": 4933 + }, + { + "epoch": 0.39970836033700585, + "grad_norm": 0.041002050042152405, + "learning_rate": 0.00019983799108950993, + "loss": 0.3048, + "step": 4934 + }, + { + "epoch": 0.3997893713545042, + "grad_norm": 0.03749099746346474, + "learning_rate": 0.00019987849331713244, + "loss": 0.3872, + "step": 4935 + }, + { + "epoch": 0.3998703823720026, + "grad_norm": 0.031427595764398575, + "learning_rate": 0.00019991899554475497, + "loss": 0.3599, + "step": 4936 + }, + { + "epoch": 0.399951393389501, + "grad_norm": 0.027828801423311234, + "learning_rate": 0.00019995949777237748, + "loss": 0.3048, + "step": 4937 + }, + { + "epoch": 0.40003240440699933, + "grad_norm": 0.03560652583837509, + "learning_rate": 0.0002, + "loss": 0.3507, + "step": 4938 + }, + { + "epoch": 0.40011341542449774, + "grad_norm": 0.029362550005316734, + "learning_rate": 0.00019999549934740537, + "loss": 0.3621, + "step": 4939 + }, + { + "epoch": 0.4001944264419961, + "grad_norm": 0.03609123453497887, + "learning_rate": 0.00019999099869481076, + "loss": 0.3837, + "step": 4940 + }, + { + "epoch": 0.4002754374594945, + "grad_norm": 0.03292112052440643, + "learning_rate": 0.00019998649804221614, + "loss": 0.3704, + "step": 4941 + }, + { + "epoch": 0.40035644847699287, + "grad_norm": 0.03446466848254204, + "learning_rate": 0.0001999819973896215, + "loss": 0.3422, + "step": 4942 + }, + { + "epoch": 0.4004374594944913, + "grad_norm": 0.0325990729033947, + "learning_rate": 0.0001999774967370269, + "loss": 0.3621, + "step": 4943 + }, + { + "epoch": 0.40051847051198963, + "grad_norm": 0.0336809903383255, + "learning_rate": 0.00019997299608443225, + "loss": 0.3424, + "step": 4944 + }, + { + "epoch": 0.400599481529488, + "grad_norm": 0.034309420734643936, + "learning_rate": 
0.0001999684954318376, + "loss": 0.3742, + "step": 4945 + }, + { + "epoch": 0.4006804925469864, + "grad_norm": 0.034966353327035904, + "learning_rate": 0.000199963994779243, + "loss": 0.331, + "step": 4946 + }, + { + "epoch": 0.40076150356448476, + "grad_norm": 0.03615720197558403, + "learning_rate": 0.00019995949412664839, + "loss": 0.4266, + "step": 4947 + }, + { + "epoch": 0.40084251458198317, + "grad_norm": 0.03168513625860214, + "learning_rate": 0.00019995499347405375, + "loss": 0.3954, + "step": 4948 + }, + { + "epoch": 0.4009235255994815, + "grad_norm": 0.031109081581234932, + "learning_rate": 0.00019995049282145913, + "loss": 0.3868, + "step": 4949 + }, + { + "epoch": 0.40100453661697993, + "grad_norm": 0.03687392175197601, + "learning_rate": 0.0001999459921688645, + "loss": 0.3249, + "step": 4950 + }, + { + "epoch": 0.4010855476344783, + "grad_norm": 0.033434923738241196, + "learning_rate": 0.00019994149151626985, + "loss": 0.339, + "step": 4951 + }, + { + "epoch": 0.40116655865197665, + "grad_norm": 0.03710939735174179, + "learning_rate": 0.00019993699086367524, + "loss": 0.3879, + "step": 4952 + }, + { + "epoch": 0.40124756966947506, + "grad_norm": 0.03508399426937103, + "learning_rate": 0.00019993249021108063, + "loss": 0.3097, + "step": 4953 + }, + { + "epoch": 0.4013285806869734, + "grad_norm": 0.035740409046411514, + "learning_rate": 0.000199927989558486, + "loss": 0.3372, + "step": 4954 + }, + { + "epoch": 0.4014095917044718, + "grad_norm": 0.03336473926901817, + "learning_rate": 0.00019992348890589137, + "loss": 0.4151, + "step": 4955 + }, + { + "epoch": 0.4014906027219702, + "grad_norm": 0.03121289797127247, + "learning_rate": 0.00019991898825329673, + "loss": 0.3833, + "step": 4956 + }, + { + "epoch": 0.4015716137394686, + "grad_norm": 0.028473064303398132, + "learning_rate": 0.0001999144876007021, + "loss": 0.3705, + "step": 4957 + }, + { + "epoch": 0.40165262475696695, + "grad_norm": 0.03372488543391228, + "learning_rate": 0.00019990998694810748, + "loss": 0.3342, + "step": 4958 + }, + { + "epoch": 0.4017336357744653, + "grad_norm": 0.049180999398231506, + "learning_rate": 0.00019990548629551287, + "loss": 0.4041, + "step": 4959 + }, + { + "epoch": 0.4018146467919637, + "grad_norm": 0.03879746422171593, + "learning_rate": 0.00019990098564291823, + "loss": 0.3849, + "step": 4960 + }, + { + "epoch": 0.40189565780946207, + "grad_norm": 0.037787068635225296, + "learning_rate": 0.00019989648499032362, + "loss": 0.389, + "step": 4961 + }, + { + "epoch": 0.4019766688269605, + "grad_norm": 0.03139461576938629, + "learning_rate": 0.00019989198433772898, + "loss": 0.3873, + "step": 4962 + }, + { + "epoch": 0.40205767984445884, + "grad_norm": 0.02805621363222599, + "learning_rate": 0.00019988748368513434, + "loss": 0.3795, + "step": 4963 + }, + { + "epoch": 0.40213869086195725, + "grad_norm": 0.03285028785467148, + "learning_rate": 0.00019988298303253972, + "loss": 0.39, + "step": 4964 + }, + { + "epoch": 0.4022197018794556, + "grad_norm": 0.03046387806534767, + "learning_rate": 0.0001998784823799451, + "loss": 0.3732, + "step": 4965 + }, + { + "epoch": 0.40230071289695396, + "grad_norm": 0.0408925786614418, + "learning_rate": 0.00019987398172735047, + "loss": 0.3773, + "step": 4966 + }, + { + "epoch": 0.40238172391445237, + "grad_norm": 0.03856538608670235, + "learning_rate": 0.00019986948107475586, + "loss": 0.3739, + "step": 4967 + }, + { + "epoch": 0.4024627349319507, + "grad_norm": 0.032258667051792145, + "learning_rate": 0.00019986498042216122, + "loss": 0.3399, + "step": 
4968 + }, + { + "epoch": 0.40254374594944914, + "grad_norm": 0.03434364125132561, + "learning_rate": 0.00019986047976956658, + "loss": 0.3555, + "step": 4969 + }, + { + "epoch": 0.4026247569669475, + "grad_norm": 0.030233683064579964, + "learning_rate": 0.000199855979116972, + "loss": 0.4375, + "step": 4970 + }, + { + "epoch": 0.4027057679844459, + "grad_norm": 0.03484868258237839, + "learning_rate": 0.00019985147846437735, + "loss": 0.3555, + "step": 4971 + }, + { + "epoch": 0.40278677900194426, + "grad_norm": 0.03328513354063034, + "learning_rate": 0.0001998469778117827, + "loss": 0.3359, + "step": 4972 + }, + { + "epoch": 0.4028677900194426, + "grad_norm": 0.03617887571454048, + "learning_rate": 0.0001998424771591881, + "loss": 0.3908, + "step": 4973 + }, + { + "epoch": 0.40294880103694103, + "grad_norm": 0.03129265829920769, + "learning_rate": 0.00019983797650659346, + "loss": 0.3506, + "step": 4974 + }, + { + "epoch": 0.4030298120544394, + "grad_norm": 0.030116554349660873, + "learning_rate": 0.00019983347585399882, + "loss": 0.3632, + "step": 4975 + }, + { + "epoch": 0.4031108230719378, + "grad_norm": 0.03023291938006878, + "learning_rate": 0.00019982897520140423, + "loss": 0.3086, + "step": 4976 + }, + { + "epoch": 0.40319183408943615, + "grad_norm": 0.026817413046956062, + "learning_rate": 0.0001998244745488096, + "loss": 0.3218, + "step": 4977 + }, + { + "epoch": 0.40327284510693456, + "grad_norm": 0.03046896867454052, + "learning_rate": 0.00019981997389621495, + "loss": 0.3745, + "step": 4978 + }, + { + "epoch": 0.4033538561244329, + "grad_norm": 0.02875666506588459, + "learning_rate": 0.00019981547324362034, + "loss": 0.3823, + "step": 4979 + }, + { + "epoch": 0.40343486714193133, + "grad_norm": 0.03567210212349892, + "learning_rate": 0.0001998109725910257, + "loss": 0.3305, + "step": 4980 + }, + { + "epoch": 0.4035158781594297, + "grad_norm": 0.039444200694561005, + "learning_rate": 0.00019980647193843106, + "loss": 0.3671, + "step": 4981 + }, + { + "epoch": 0.40359688917692804, + "grad_norm": 0.03511333838105202, + "learning_rate": 0.00019980197128583648, + "loss": 0.339, + "step": 4982 + }, + { + "epoch": 0.40367790019442645, + "grad_norm": 0.03776915743947029, + "learning_rate": 0.00019979747063324184, + "loss": 0.4184, + "step": 4983 + }, + { + "epoch": 0.4037589112119248, + "grad_norm": 0.03581022098660469, + "learning_rate": 0.0001997929699806472, + "loss": 0.3786, + "step": 4984 + }, + { + "epoch": 0.4038399222294232, + "grad_norm": 0.03828386217355728, + "learning_rate": 0.00019978846932805258, + "loss": 0.3468, + "step": 4985 + }, + { + "epoch": 0.4039209332469216, + "grad_norm": 0.02726336009800434, + "learning_rate": 0.00019978396867545794, + "loss": 0.368, + "step": 4986 + }, + { + "epoch": 0.40400194426442, + "grad_norm": 0.02940969169139862, + "learning_rate": 0.0001997794680228633, + "loss": 0.3332, + "step": 4987 + }, + { + "epoch": 0.40408295528191834, + "grad_norm": 0.0344647541642189, + "learning_rate": 0.00019977496737026872, + "loss": 0.3311, + "step": 4988 + }, + { + "epoch": 0.4041639662994167, + "grad_norm": 0.037162743508815765, + "learning_rate": 0.00019977046671767408, + "loss": 0.3748, + "step": 4989 + }, + { + "epoch": 0.4042449773169151, + "grad_norm": 0.03121703863143921, + "learning_rate": 0.00019976596606507944, + "loss": 0.3355, + "step": 4990 + }, + { + "epoch": 0.40432598833441347, + "grad_norm": 0.033019792288541794, + "learning_rate": 0.00019976146541248482, + "loss": 0.3221, + "step": 4991 + }, + { + "epoch": 0.4044069993519119, + 
"grad_norm": 0.03607960045337677, + "learning_rate": 0.00019975696475989018, + "loss": 0.3599, + "step": 4992 + }, + { + "epoch": 0.40448801036941023, + "grad_norm": 0.03203876316547394, + "learning_rate": 0.00019975246410729557, + "loss": 0.4015, + "step": 4993 + }, + { + "epoch": 0.40456902138690864, + "grad_norm": 0.031239103525877, + "learning_rate": 0.00019974796345470096, + "loss": 0.3448, + "step": 4994 + }, + { + "epoch": 0.404650032404407, + "grad_norm": 0.04195361211895943, + "learning_rate": 0.00019974346280210632, + "loss": 0.4139, + "step": 4995 + }, + { + "epoch": 0.40473104342190536, + "grad_norm": 0.042518921196460724, + "learning_rate": 0.00019973896214951168, + "loss": 0.4325, + "step": 4996 + }, + { + "epoch": 0.40481205443940377, + "grad_norm": 0.027694914489984512, + "learning_rate": 0.00019973446149691707, + "loss": 0.3278, + "step": 4997 + }, + { + "epoch": 0.4048930654569021, + "grad_norm": 0.027236877009272575, + "learning_rate": 0.00019972996084432243, + "loss": 0.3331, + "step": 4998 + }, + { + "epoch": 0.40497407647440054, + "grad_norm": 0.030211325734853745, + "learning_rate": 0.0001997254601917278, + "loss": 0.3351, + "step": 4999 + }, + { + "epoch": 0.4050550874918989, + "grad_norm": 0.03629770874977112, + "learning_rate": 0.0001997209595391332, + "loss": 0.3644, + "step": 5000 + }, + { + "epoch": 0.4051360985093973, + "grad_norm": 0.033565860241651535, + "learning_rate": 0.00019971645888653856, + "loss": 0.3223, + "step": 5001 + }, + { + "epoch": 0.40521710952689566, + "grad_norm": 0.030630042776465416, + "learning_rate": 0.00019971195823394392, + "loss": 0.3548, + "step": 5002 + }, + { + "epoch": 0.405298120544394, + "grad_norm": 0.028197648003697395, + "learning_rate": 0.0001997074575813493, + "loss": 0.3707, + "step": 5003 + }, + { + "epoch": 0.4053791315618924, + "grad_norm": 0.03611261025071144, + "learning_rate": 0.00019970295692875467, + "loss": 0.3407, + "step": 5004 + }, + { + "epoch": 0.4054601425793908, + "grad_norm": 0.031974319368600845, + "learning_rate": 0.00019969845627616005, + "loss": 0.3836, + "step": 5005 + }, + { + "epoch": 0.4055411535968892, + "grad_norm": 0.030820896849036217, + "learning_rate": 0.00019969395562356544, + "loss": 0.3428, + "step": 5006 + }, + { + "epoch": 0.40562216461438755, + "grad_norm": 0.030217768624424934, + "learning_rate": 0.0001996894549709708, + "loss": 0.3382, + "step": 5007 + }, + { + "epoch": 0.40570317563188596, + "grad_norm": 0.031406279653310776, + "learning_rate": 0.00019968495431837616, + "loss": 0.3348, + "step": 5008 + }, + { + "epoch": 0.4057841866493843, + "grad_norm": 0.039374373853206635, + "learning_rate": 0.00019968045366578155, + "loss": 0.3772, + "step": 5009 + }, + { + "epoch": 0.40586519766688267, + "grad_norm": 0.030828993767499924, + "learning_rate": 0.0001996759530131869, + "loss": 0.3263, + "step": 5010 + }, + { + "epoch": 0.4059462086843811, + "grad_norm": 0.035990871489048004, + "learning_rate": 0.0001996714523605923, + "loss": 0.3575, + "step": 5011 + }, + { + "epoch": 0.40602721970187944, + "grad_norm": 0.034174595028162, + "learning_rate": 0.00019966695170799768, + "loss": 0.33, + "step": 5012 + }, + { + "epoch": 0.40610823071937785, + "grad_norm": 0.029588427394628525, + "learning_rate": 0.00019966245105540304, + "loss": 0.3219, + "step": 5013 + }, + { + "epoch": 0.4061892417368762, + "grad_norm": 0.030147623270750046, + "learning_rate": 0.0001996579504028084, + "loss": 0.2707, + "step": 5014 + }, + { + "epoch": 0.4062702527543746, + "grad_norm": 0.035346563905477524, + 
"learning_rate": 0.0001996534497502138, + "loss": 0.3642, + "step": 5015 + }, + { + "epoch": 0.406351263771873, + "grad_norm": 0.03422393649816513, + "learning_rate": 0.00019964894909761915, + "loss": 0.3599, + "step": 5016 + }, + { + "epoch": 0.40643227478937133, + "grad_norm": 0.03663076460361481, + "learning_rate": 0.00019964444844502454, + "loss": 0.3705, + "step": 5017 + }, + { + "epoch": 0.40651328580686974, + "grad_norm": 0.03565813973546028, + "learning_rate": 0.00019963994779242992, + "loss": 0.4063, + "step": 5018 + }, + { + "epoch": 0.4065942968243681, + "grad_norm": 0.030144767835736275, + "learning_rate": 0.00019963544713983529, + "loss": 0.2991, + "step": 5019 + }, + { + "epoch": 0.4066753078418665, + "grad_norm": 0.030031368136405945, + "learning_rate": 0.00019963094648724065, + "loss": 0.3289, + "step": 5020 + }, + { + "epoch": 0.40675631885936486, + "grad_norm": 0.02851344645023346, + "learning_rate": 0.00019962644583464603, + "loss": 0.3241, + "step": 5021 + }, + { + "epoch": 0.4068373298768633, + "grad_norm": 0.03419654816389084, + "learning_rate": 0.00019962194518205142, + "loss": 0.3518, + "step": 5022 + }, + { + "epoch": 0.40691834089436163, + "grad_norm": 0.02915254980325699, + "learning_rate": 0.00019961744452945678, + "loss": 0.3757, + "step": 5023 + }, + { + "epoch": 0.40699935191186, + "grad_norm": 0.030751507729291916, + "learning_rate": 0.00019961294387686217, + "loss": 0.314, + "step": 5024 + }, + { + "epoch": 0.4070803629293584, + "grad_norm": 0.032417912036180496, + "learning_rate": 0.00019960844322426753, + "loss": 0.3757, + "step": 5025 + }, + { + "epoch": 0.40716137394685675, + "grad_norm": 0.03586619719862938, + "learning_rate": 0.0001996039425716729, + "loss": 0.3446, + "step": 5026 + }, + { + "epoch": 0.40724238496435516, + "grad_norm": 0.028922390192747116, + "learning_rate": 0.00019959944191907827, + "loss": 0.3286, + "step": 5027 + }, + { + "epoch": 0.4073233959818535, + "grad_norm": 0.029639948159456253, + "learning_rate": 0.00019959494126648366, + "loss": 0.3598, + "step": 5028 + }, + { + "epoch": 0.40740440699935193, + "grad_norm": 0.03211815655231476, + "learning_rate": 0.00019959044061388902, + "loss": 0.3573, + "step": 5029 + }, + { + "epoch": 0.4074854180168503, + "grad_norm": 0.03042125515639782, + "learning_rate": 0.0001995859399612944, + "loss": 0.3311, + "step": 5030 + }, + { + "epoch": 0.4075664290343487, + "grad_norm": 0.03370456397533417, + "learning_rate": 0.00019958143930869977, + "loss": 0.3613, + "step": 5031 + }, + { + "epoch": 0.40764744005184705, + "grad_norm": 0.03500419110059738, + "learning_rate": 0.00019957693865610513, + "loss": 0.3668, + "step": 5032 + }, + { + "epoch": 0.4077284510693454, + "grad_norm": 0.031201496720314026, + "learning_rate": 0.00019957243800351052, + "loss": 0.3271, + "step": 5033 + }, + { + "epoch": 0.4078094620868438, + "grad_norm": 0.03223037347197533, + "learning_rate": 0.0001995679373509159, + "loss": 0.3222, + "step": 5034 + }, + { + "epoch": 0.4078904731043422, + "grad_norm": 0.032849233597517014, + "learning_rate": 0.00019956343669832126, + "loss": 0.3859, + "step": 5035 + }, + { + "epoch": 0.4079714841218406, + "grad_norm": 0.031805459409952164, + "learning_rate": 0.00019955893604572665, + "loss": 0.3616, + "step": 5036 + }, + { + "epoch": 0.40805249513933894, + "grad_norm": 0.030916647985577583, + "learning_rate": 0.000199554435393132, + "loss": 0.3589, + "step": 5037 + }, + { + "epoch": 0.40813350615683736, + "grad_norm": 0.03671705722808838, + "learning_rate": 0.00019954993474053737, + 
"loss": 0.3622, + "step": 5038 + }, + { + "epoch": 0.4082145171743357, + "grad_norm": 0.032508958131074905, + "learning_rate": 0.00019954543408794276, + "loss": 0.3363, + "step": 5039 + }, + { + "epoch": 0.40829552819183407, + "grad_norm": 0.03839171677827835, + "learning_rate": 0.00019954093343534814, + "loss": 0.4027, + "step": 5040 + }, + { + "epoch": 0.4083765392093325, + "grad_norm": 0.03205114230513573, + "learning_rate": 0.0001995364327827535, + "loss": 0.3773, + "step": 5041 + }, + { + "epoch": 0.40845755022683083, + "grad_norm": 0.03220202773809433, + "learning_rate": 0.0001995319321301589, + "loss": 0.3162, + "step": 5042 + }, + { + "epoch": 0.40853856124432925, + "grad_norm": 0.035653892904520035, + "learning_rate": 0.00019952743147756425, + "loss": 0.3868, + "step": 5043 + }, + { + "epoch": 0.4086195722618276, + "grad_norm": 0.03405566141009331, + "learning_rate": 0.0001995229308249696, + "loss": 0.393, + "step": 5044 + }, + { + "epoch": 0.408700583279326, + "grad_norm": 0.03670091927051544, + "learning_rate": 0.00019951843017237503, + "loss": 0.393, + "step": 5045 + }, + { + "epoch": 0.40878159429682437, + "grad_norm": 0.026954319328069687, + "learning_rate": 0.00019951392951978039, + "loss": 0.3394, + "step": 5046 + }, + { + "epoch": 0.4088626053143227, + "grad_norm": 0.029613513499498367, + "learning_rate": 0.00019950942886718575, + "loss": 0.3559, + "step": 5047 + }, + { + "epoch": 0.40894361633182114, + "grad_norm": 0.03285623714327812, + "learning_rate": 0.00019950492821459113, + "loss": 0.3565, + "step": 5048 + }, + { + "epoch": 0.4090246273493195, + "grad_norm": 0.046118028461933136, + "learning_rate": 0.0001995004275619965, + "loss": 0.3902, + "step": 5049 + }, + { + "epoch": 0.4091056383668179, + "grad_norm": 0.0518961064517498, + "learning_rate": 0.00019949592690940185, + "loss": 0.3158, + "step": 5050 + }, + { + "epoch": 0.40918664938431626, + "grad_norm": 0.04242045804858208, + "learning_rate": 0.00019949142625680727, + "loss": 0.3933, + "step": 5051 + }, + { + "epoch": 0.40926766040181467, + "grad_norm": 0.032364003360271454, + "learning_rate": 0.00019948692560421263, + "loss": 0.3962, + "step": 5052 + }, + { + "epoch": 0.409348671419313, + "grad_norm": 0.0313120037317276, + "learning_rate": 0.000199482424951618, + "loss": 0.3484, + "step": 5053 + }, + { + "epoch": 0.4094296824368114, + "grad_norm": 0.026692088693380356, + "learning_rate": 0.00019947792429902337, + "loss": 0.3498, + "step": 5054 + }, + { + "epoch": 0.4095106934543098, + "grad_norm": 0.03651599958539009, + "learning_rate": 0.00019947342364642873, + "loss": 0.3543, + "step": 5055 + }, + { + "epoch": 0.40959170447180815, + "grad_norm": 0.036102522164583206, + "learning_rate": 0.0001994689229938341, + "loss": 0.3928, + "step": 5056 + }, + { + "epoch": 0.40967271548930656, + "grad_norm": 0.03446866199374199, + "learning_rate": 0.0001994644223412395, + "loss": 0.3483, + "step": 5057 + }, + { + "epoch": 0.4097537265068049, + "grad_norm": 0.029560396447777748, + "learning_rate": 0.00019945992168864487, + "loss": 0.3484, + "step": 5058 + }, + { + "epoch": 0.4098347375243033, + "grad_norm": 0.045518580824136734, + "learning_rate": 0.00019945542103605023, + "loss": 0.3436, + "step": 5059 + }, + { + "epoch": 0.4099157485418017, + "grad_norm": 0.03489755094051361, + "learning_rate": 0.00019945092038345562, + "loss": 0.4013, + "step": 5060 + }, + { + "epoch": 0.40999675955930004, + "grad_norm": 0.029958384111523628, + "learning_rate": 0.00019944641973086098, + "loss": 0.306, + "step": 5061 + }, + { + "epoch": 
0.41007777057679845, + "grad_norm": 0.03341478854417801, + "learning_rate": 0.00019944191907826634, + "loss": 0.3578, + "step": 5062 + }, + { + "epoch": 0.4101587815942968, + "grad_norm": 0.03505893051624298, + "learning_rate": 0.00019943741842567175, + "loss": 0.3299, + "step": 5063 + }, + { + "epoch": 0.4102397926117952, + "grad_norm": 0.03374209254980087, + "learning_rate": 0.0001994329177730771, + "loss": 0.3581, + "step": 5064 + }, + { + "epoch": 0.4103208036292936, + "grad_norm": 0.029370833188295364, + "learning_rate": 0.00019942841712048247, + "loss": 0.3853, + "step": 5065 + }, + { + "epoch": 0.410401814646792, + "grad_norm": 0.037649236619472504, + "learning_rate": 0.00019942391646788786, + "loss": 0.3855, + "step": 5066 + }, + { + "epoch": 0.41048282566429034, + "grad_norm": 0.0331161729991436, + "learning_rate": 0.00019941941581529322, + "loss": 0.3959, + "step": 5067 + }, + { + "epoch": 0.4105638366817887, + "grad_norm": 0.028420720249414444, + "learning_rate": 0.00019941491516269858, + "loss": 0.3214, + "step": 5068 + }, + { + "epoch": 0.4106448476992871, + "grad_norm": 0.028488215059041977, + "learning_rate": 0.000199410414510104, + "loss": 0.3575, + "step": 5069 + }, + { + "epoch": 0.41072585871678546, + "grad_norm": 0.03867751732468605, + "learning_rate": 0.00019940591385750935, + "loss": 0.3763, + "step": 5070 + }, + { + "epoch": 0.4108068697342839, + "grad_norm": 0.03423899784684181, + "learning_rate": 0.0001994014132049147, + "loss": 0.3847, + "step": 5071 + }, + { + "epoch": 0.41088788075178223, + "grad_norm": 0.02935202606022358, + "learning_rate": 0.0001993969125523201, + "loss": 0.3605, + "step": 5072 + }, + { + "epoch": 0.41096889176928064, + "grad_norm": 0.0323471873998642, + "learning_rate": 0.00019939241189972546, + "loss": 0.3426, + "step": 5073 + }, + { + "epoch": 0.411049902786779, + "grad_norm": 0.03729783371090889, + "learning_rate": 0.00019938791124713085, + "loss": 0.353, + "step": 5074 + }, + { + "epoch": 0.4111309138042774, + "grad_norm": 0.04616546258330345, + "learning_rate": 0.00019938341059453623, + "loss": 0.3813, + "step": 5075 + }, + { + "epoch": 0.41121192482177576, + "grad_norm": 0.03555573895573616, + "learning_rate": 0.0001993789099419416, + "loss": 0.3102, + "step": 5076 + }, + { + "epoch": 0.4112929358392741, + "grad_norm": 0.03330931067466736, + "learning_rate": 0.00019937440928934695, + "loss": 0.3645, + "step": 5077 + }, + { + "epoch": 0.41137394685677253, + "grad_norm": 0.03230508789420128, + "learning_rate": 0.00019936990863675234, + "loss": 0.3492, + "step": 5078 + }, + { + "epoch": 0.4114549578742709, + "grad_norm": 0.03547259420156479, + "learning_rate": 0.0001993654079841577, + "loss": 0.3662, + "step": 5079 + }, + { + "epoch": 0.4115359688917693, + "grad_norm": 0.03292231634259224, + "learning_rate": 0.0001993609073315631, + "loss": 0.3901, + "step": 5080 + }, + { + "epoch": 0.41161697990926766, + "grad_norm": 0.029186615720391273, + "learning_rate": 0.00019935640667896848, + "loss": 0.3055, + "step": 5081 + }, + { + "epoch": 0.41169799092676607, + "grad_norm": 0.033885449171066284, + "learning_rate": 0.00019935190602637384, + "loss": 0.3886, + "step": 5082 + }, + { + "epoch": 0.4117790019442644, + "grad_norm": 0.03691324219107628, + "learning_rate": 0.0001993474053737792, + "loss": 0.3715, + "step": 5083 + }, + { + "epoch": 0.4118600129617628, + "grad_norm": 0.03496171906590462, + "learning_rate": 0.00019934290472118458, + "loss": 0.3795, + "step": 5084 + }, + { + "epoch": 0.4119410239792612, + "grad_norm": 0.03149078041315079, + 
"learning_rate": 0.00019933840406858994, + "loss": 0.3449, + "step": 5085 + }, + { + "epoch": 0.41202203499675955, + "grad_norm": 0.02938619628548622, + "learning_rate": 0.00019933390341599533, + "loss": 0.3181, + "step": 5086 + }, + { + "epoch": 0.41210304601425796, + "grad_norm": 0.03057067282497883, + "learning_rate": 0.00019932940276340072, + "loss": 0.3412, + "step": 5087 + }, + { + "epoch": 0.4121840570317563, + "grad_norm": 0.03213992714881897, + "learning_rate": 0.00019932490211080608, + "loss": 0.3871, + "step": 5088 + }, + { + "epoch": 0.4122650680492547, + "grad_norm": 0.029805082827806473, + "learning_rate": 0.00019932040145821144, + "loss": 0.3548, + "step": 5089 + }, + { + "epoch": 0.4123460790667531, + "grad_norm": 0.028804264962673187, + "learning_rate": 0.00019931590080561682, + "loss": 0.3005, + "step": 5090 + }, + { + "epoch": 0.41242709008425144, + "grad_norm": 0.030629368498921394, + "learning_rate": 0.00019931140015302218, + "loss": 0.3645, + "step": 5091 + }, + { + "epoch": 0.41250810110174985, + "grad_norm": 0.03798553720116615, + "learning_rate": 0.00019930689950042757, + "loss": 0.3771, + "step": 5092 + }, + { + "epoch": 0.4125891121192482, + "grad_norm": 0.031806670129299164, + "learning_rate": 0.00019930239884783296, + "loss": 0.4109, + "step": 5093 + }, + { + "epoch": 0.4126701231367466, + "grad_norm": 0.037300970405340195, + "learning_rate": 0.00019929789819523832, + "loss": 0.398, + "step": 5094 + }, + { + "epoch": 0.41275113415424497, + "grad_norm": 0.033359628170728683, + "learning_rate": 0.00019929339754264368, + "loss": 0.3486, + "step": 5095 + }, + { + "epoch": 0.4128321451717434, + "grad_norm": 0.031194040551781654, + "learning_rate": 0.00019928889689004907, + "loss": 0.3496, + "step": 5096 + }, + { + "epoch": 0.41291315618924174, + "grad_norm": 0.029555628076195717, + "learning_rate": 0.00019928439623745445, + "loss": 0.3629, + "step": 5097 + }, + { + "epoch": 0.4129941672067401, + "grad_norm": 0.03267825022339821, + "learning_rate": 0.0001992798955848598, + "loss": 0.361, + "step": 5098 + }, + { + "epoch": 0.4130751782242385, + "grad_norm": 0.037240467965602875, + "learning_rate": 0.0001992753949322652, + "loss": 0.3751, + "step": 5099 + }, + { + "epoch": 0.41315618924173686, + "grad_norm": 0.03187297657132149, + "learning_rate": 0.00019927089427967056, + "loss": 0.3345, + "step": 5100 + }, + { + "epoch": 0.41323720025923527, + "grad_norm": 0.02669236622750759, + "learning_rate": 0.00019926639362707592, + "loss": 0.3153, + "step": 5101 + }, + { + "epoch": 0.4133182112767336, + "grad_norm": 0.03205592930316925, + "learning_rate": 0.0001992618929744813, + "loss": 0.2747, + "step": 5102 + }, + { + "epoch": 0.41339922229423204, + "grad_norm": 0.030281927436590195, + "learning_rate": 0.0001992573923218867, + "loss": 0.3505, + "step": 5103 + }, + { + "epoch": 0.4134802333117304, + "grad_norm": 0.03139625862240791, + "learning_rate": 0.00019925289166929205, + "loss": 0.3535, + "step": 5104 + }, + { + "epoch": 0.41356124432922875, + "grad_norm": 0.03957032784819603, + "learning_rate": 0.00019924839101669744, + "loss": 0.3793, + "step": 5105 + }, + { + "epoch": 0.41364225534672716, + "grad_norm": 0.032334841787815094, + "learning_rate": 0.0001992438903641028, + "loss": 0.3739, + "step": 5106 + }, + { + "epoch": 0.4137232663642255, + "grad_norm": 0.032936934381723404, + "learning_rate": 0.00019923938971150816, + "loss": 0.342, + "step": 5107 + }, + { + "epoch": 0.41380427738172393, + "grad_norm": 0.03231712430715561, + "learning_rate": 0.00019923488905891355, + 
"loss": 0.3294, + "step": 5108 + }, + { + "epoch": 0.4138852883992223, + "grad_norm": 0.033003080636262894, + "learning_rate": 0.00019923038840631894, + "loss": 0.3452, + "step": 5109 + }, + { + "epoch": 0.4139662994167207, + "grad_norm": 0.04072129353880882, + "learning_rate": 0.0001992258877537243, + "loss": 0.3528, + "step": 5110 + }, + { + "epoch": 0.41404731043421905, + "grad_norm": 0.03586127609014511, + "learning_rate": 0.00019922138710112968, + "loss": 0.3862, + "step": 5111 + }, + { + "epoch": 0.4141283214517174, + "grad_norm": 0.03363611176609993, + "learning_rate": 0.00019921688644853504, + "loss": 0.3756, + "step": 5112 + }, + { + "epoch": 0.4142093324692158, + "grad_norm": 0.03447722643613815, + "learning_rate": 0.0001992123857959404, + "loss": 0.3653, + "step": 5113 + }, + { + "epoch": 0.4142903434867142, + "grad_norm": 0.034334730356931686, + "learning_rate": 0.0001992078851433458, + "loss": 0.3608, + "step": 5114 + }, + { + "epoch": 0.4143713545042126, + "grad_norm": 0.03418297320604324, + "learning_rate": 0.00019920338449075118, + "loss": 0.3868, + "step": 5115 + }, + { + "epoch": 0.41445236552171094, + "grad_norm": 0.03272546827793121, + "learning_rate": 0.00019919888383815654, + "loss": 0.3903, + "step": 5116 + }, + { + "epoch": 0.41453337653920935, + "grad_norm": 0.03231920301914215, + "learning_rate": 0.00019919438318556193, + "loss": 0.335, + "step": 5117 + }, + { + "epoch": 0.4146143875567077, + "grad_norm": 0.039603229612112045, + "learning_rate": 0.00019918988253296729, + "loss": 0.3478, + "step": 5118 + }, + { + "epoch": 0.41469539857420606, + "grad_norm": 0.03109239600598812, + "learning_rate": 0.00019918538188037265, + "loss": 0.3458, + "step": 5119 + }, + { + "epoch": 0.4147764095917045, + "grad_norm": 0.029578372836112976, + "learning_rate": 0.00019918088122777803, + "loss": 0.3372, + "step": 5120 + }, + { + "epoch": 0.41485742060920283, + "grad_norm": 0.03594496101140976, + "learning_rate": 0.00019917638057518342, + "loss": 0.4146, + "step": 5121 + }, + { + "epoch": 0.41493843162670124, + "grad_norm": 0.030784275382757187, + "learning_rate": 0.00019917187992258878, + "loss": 0.4104, + "step": 5122 + }, + { + "epoch": 0.4150194426441996, + "grad_norm": 0.03049563802778721, + "learning_rate": 0.00019916737926999417, + "loss": 0.355, + "step": 5123 + }, + { + "epoch": 0.415100453661698, + "grad_norm": 0.032201237976551056, + "learning_rate": 0.00019916287861739953, + "loss": 0.3582, + "step": 5124 + }, + { + "epoch": 0.41518146467919637, + "grad_norm": 0.038234151899814606, + "learning_rate": 0.0001991583779648049, + "loss": 0.3957, + "step": 5125 + }, + { + "epoch": 0.4152624756966948, + "grad_norm": 0.03416343033313751, + "learning_rate": 0.0001991538773122103, + "loss": 0.3751, + "step": 5126 + }, + { + "epoch": 0.41534348671419313, + "grad_norm": 0.036890894174575806, + "learning_rate": 0.00019914937665961566, + "loss": 0.4049, + "step": 5127 + }, + { + "epoch": 0.4154244977316915, + "grad_norm": 0.03059619665145874, + "learning_rate": 0.00019914487600702102, + "loss": 0.3943, + "step": 5128 + }, + { + "epoch": 0.4155055087491899, + "grad_norm": 0.03785393759608269, + "learning_rate": 0.0001991403753544264, + "loss": 0.4098, + "step": 5129 + }, + { + "epoch": 0.41558651976668826, + "grad_norm": 0.03127528354525566, + "learning_rate": 0.00019913587470183177, + "loss": 0.3484, + "step": 5130 + }, + { + "epoch": 0.41566753078418667, + "grad_norm": 0.030445709824562073, + "learning_rate": 0.00019913137404923713, + "loss": 0.3255, + "step": 5131 + }, + { + 
"epoch": 0.415748541801685, + "grad_norm": 0.03678746148943901, + "learning_rate": 0.00019912687339664254, + "loss": 0.3528, + "step": 5132 + }, + { + "epoch": 0.41582955281918343, + "grad_norm": 0.03211822733283043, + "learning_rate": 0.0001991223727440479, + "loss": 0.3421, + "step": 5133 + }, + { + "epoch": 0.4159105638366818, + "grad_norm": 0.034807238727808, + "learning_rate": 0.00019911787209145326, + "loss": 0.3781, + "step": 5134 + }, + { + "epoch": 0.41599157485418015, + "grad_norm": 0.03515045717358589, + "learning_rate": 0.00019911337143885865, + "loss": 0.3803, + "step": 5135 + }, + { + "epoch": 0.41607258587167856, + "grad_norm": 0.03006439283490181, + "learning_rate": 0.000199108870786264, + "loss": 0.316, + "step": 5136 + }, + { + "epoch": 0.4161535968891769, + "grad_norm": 0.03301999717950821, + "learning_rate": 0.00019910437013366937, + "loss": 0.3341, + "step": 5137 + }, + { + "epoch": 0.4162346079066753, + "grad_norm": 0.045103173702955246, + "learning_rate": 0.00019909986948107478, + "loss": 0.3515, + "step": 5138 + }, + { + "epoch": 0.4163156189241737, + "grad_norm": 0.03581353649497032, + "learning_rate": 0.00019909536882848014, + "loss": 0.3232, + "step": 5139 + }, + { + "epoch": 0.4163966299416721, + "grad_norm": 0.03379884734749794, + "learning_rate": 0.0001990908681758855, + "loss": 0.3655, + "step": 5140 + }, + { + "epoch": 0.41647764095917045, + "grad_norm": 0.03726637735962868, + "learning_rate": 0.0001990863675232909, + "loss": 0.4216, + "step": 5141 + }, + { + "epoch": 0.4165586519766688, + "grad_norm": 0.0365687757730484, + "learning_rate": 0.00019908186687069625, + "loss": 0.3847, + "step": 5142 + }, + { + "epoch": 0.4166396629941672, + "grad_norm": 0.032937657088041306, + "learning_rate": 0.0001990773662181016, + "loss": 0.4113, + "step": 5143 + }, + { + "epoch": 0.41672067401166557, + "grad_norm": 0.05359840393066406, + "learning_rate": 0.00019907286556550703, + "loss": 0.3892, + "step": 5144 + }, + { + "epoch": 0.416801685029164, + "grad_norm": 0.03107147105038166, + "learning_rate": 0.00019906836491291239, + "loss": 0.3961, + "step": 5145 + }, + { + "epoch": 0.41688269604666234, + "grad_norm": 0.0421084500849247, + "learning_rate": 0.00019906386426031775, + "loss": 0.3866, + "step": 5146 + }, + { + "epoch": 0.41696370706416075, + "grad_norm": 0.028859462589025497, + "learning_rate": 0.00019905936360772313, + "loss": 0.3486, + "step": 5147 + }, + { + "epoch": 0.4170447180816591, + "grad_norm": 0.032266031950712204, + "learning_rate": 0.0001990548629551285, + "loss": 0.3932, + "step": 5148 + }, + { + "epoch": 0.41712572909915746, + "grad_norm": 0.04529944807291031, + "learning_rate": 0.00019905036230253385, + "loss": 0.3467, + "step": 5149 + }, + { + "epoch": 0.41720674011665587, + "grad_norm": 0.02860589511692524, + "learning_rate": 0.00019904586164993927, + "loss": 0.3618, + "step": 5150 + }, + { + "epoch": 0.41728775113415423, + "grad_norm": 0.02880844473838806, + "learning_rate": 0.00019904136099734463, + "loss": 0.3826, + "step": 5151 + }, + { + "epoch": 0.41736876215165264, + "grad_norm": 0.03666391596198082, + "learning_rate": 0.00019903686034475, + "loss": 0.3662, + "step": 5152 + }, + { + "epoch": 0.417449773169151, + "grad_norm": 0.02866402640938759, + "learning_rate": 0.00019903235969215537, + "loss": 0.3376, + "step": 5153 + }, + { + "epoch": 0.4175307841866494, + "grad_norm": 0.03218850865960121, + "learning_rate": 0.00019902785903956074, + "loss": 0.3204, + "step": 5154 + }, + { + "epoch": 0.41761179520414776, + "grad_norm": 
0.035164568573236465, + "learning_rate": 0.00019902335838696612, + "loss": 0.3467, + "step": 5155 + }, + { + "epoch": 0.4176928062216461, + "grad_norm": 0.03248968720436096, + "learning_rate": 0.0001990188577343715, + "loss": 0.3291, + "step": 5156 + }, + { + "epoch": 0.41777381723914453, + "grad_norm": 0.03146893158555031, + "learning_rate": 0.00019901435708177687, + "loss": 0.3784, + "step": 5157 + }, + { + "epoch": 0.4178548282566429, + "grad_norm": 0.03342106193304062, + "learning_rate": 0.00019900985642918223, + "loss": 0.3677, + "step": 5158 + }, + { + "epoch": 0.4179358392741413, + "grad_norm": 0.03008284978568554, + "learning_rate": 0.00019900535577658762, + "loss": 0.355, + "step": 5159 + }, + { + "epoch": 0.41801685029163965, + "grad_norm": 0.031455010175704956, + "learning_rate": 0.00019900085512399298, + "loss": 0.3984, + "step": 5160 + }, + { + "epoch": 0.41809786130913806, + "grad_norm": 0.03575517609715462, + "learning_rate": 0.00019899635447139836, + "loss": 0.376, + "step": 5161 + }, + { + "epoch": 0.4181788723266364, + "grad_norm": 0.03251372277736664, + "learning_rate": 0.00019899185381880375, + "loss": 0.3479, + "step": 5162 + }, + { + "epoch": 0.4182598833441348, + "grad_norm": 0.03510560840368271, + "learning_rate": 0.0001989873531662091, + "loss": 0.3822, + "step": 5163 + }, + { + "epoch": 0.4183408943616332, + "grad_norm": 0.03543466702103615, + "learning_rate": 0.00019898285251361447, + "loss": 0.3961, + "step": 5164 + }, + { + "epoch": 0.41842190537913154, + "grad_norm": 0.033629365265369415, + "learning_rate": 0.00019897835186101986, + "loss": 0.3436, + "step": 5165 + }, + { + "epoch": 0.41850291639662995, + "grad_norm": 0.03533930331468582, + "learning_rate": 0.00019897385120842522, + "loss": 0.3597, + "step": 5166 + }, + { + "epoch": 0.4185839274141283, + "grad_norm": 0.034922514110803604, + "learning_rate": 0.0001989693505558306, + "loss": 0.362, + "step": 5167 + }, + { + "epoch": 0.4186649384316267, + "grad_norm": 0.03290367126464844, + "learning_rate": 0.000198964849903236, + "loss": 0.377, + "step": 5168 + }, + { + "epoch": 0.4187459494491251, + "grad_norm": 0.03566797077655792, + "learning_rate": 0.00019896034925064135, + "loss": 0.4076, + "step": 5169 + }, + { + "epoch": 0.4188269604666235, + "grad_norm": 0.03208748996257782, + "learning_rate": 0.0001989558485980467, + "loss": 0.314, + "step": 5170 + }, + { + "epoch": 0.41890797148412184, + "grad_norm": 0.029782714322209358, + "learning_rate": 0.0001989513479454521, + "loss": 0.3556, + "step": 5171 + }, + { + "epoch": 0.4189889825016202, + "grad_norm": 0.033525895327329636, + "learning_rate": 0.00019894684729285746, + "loss": 0.4088, + "step": 5172 + }, + { + "epoch": 0.4190699935191186, + "grad_norm": 0.029556160792708397, + "learning_rate": 0.00019894234664026285, + "loss": 0.3591, + "step": 5173 + }, + { + "epoch": 0.41915100453661697, + "grad_norm": 0.03858495503664017, + "learning_rate": 0.00019893784598766823, + "loss": 0.3699, + "step": 5174 + }, + { + "epoch": 0.4192320155541154, + "grad_norm": 0.03810839727520943, + "learning_rate": 0.0001989333453350736, + "loss": 0.3474, + "step": 5175 + }, + { + "epoch": 0.41931302657161373, + "grad_norm": 0.030418671667575836, + "learning_rate": 0.00019892884468247895, + "loss": 0.3507, + "step": 5176 + }, + { + "epoch": 0.41939403758911215, + "grad_norm": 0.031051823869347572, + "learning_rate": 0.00019892434402988434, + "loss": 0.3568, + "step": 5177 + }, + { + "epoch": 0.4194750486066105, + "grad_norm": 0.03632812947034836, + "learning_rate": 
0.00019891984337728973, + "loss": 0.3884, + "step": 5178 + }, + { + "epoch": 0.41955605962410886, + "grad_norm": 0.034061893820762634, + "learning_rate": 0.0001989153427246951, + "loss": 0.3541, + "step": 5179 + }, + { + "epoch": 0.41963707064160727, + "grad_norm": 0.03530232980847359, + "learning_rate": 0.00019891084207210048, + "loss": 0.387, + "step": 5180 + }, + { + "epoch": 0.4197180816591056, + "grad_norm": 0.030564704909920692, + "learning_rate": 0.00019890634141950584, + "loss": 0.3974, + "step": 5181 + }, + { + "epoch": 0.41979909267660404, + "grad_norm": 0.029824761673808098, + "learning_rate": 0.0001989018407669112, + "loss": 0.2902, + "step": 5182 + }, + { + "epoch": 0.4198801036941024, + "grad_norm": 0.0386199913918972, + "learning_rate": 0.00019889734011431658, + "loss": 0.3861, + "step": 5183 + }, + { + "epoch": 0.4199611147116008, + "grad_norm": 0.030515290796756744, + "learning_rate": 0.00019889283946172197, + "loss": 0.3376, + "step": 5184 + }, + { + "epoch": 0.42004212572909916, + "grad_norm": 0.03223884105682373, + "learning_rate": 0.00019888833880912733, + "loss": 0.4042, + "step": 5185 + }, + { + "epoch": 0.4201231367465975, + "grad_norm": 0.03462700545787811, + "learning_rate": 0.00019888383815653272, + "loss": 0.3469, + "step": 5186 + }, + { + "epoch": 0.4202041477640959, + "grad_norm": 0.030687794089317322, + "learning_rate": 0.00019887933750393808, + "loss": 0.3238, + "step": 5187 + }, + { + "epoch": 0.4202851587815943, + "grad_norm": 0.03233848139643669, + "learning_rate": 0.00019887483685134344, + "loss": 0.3399, + "step": 5188 + }, + { + "epoch": 0.4203661697990927, + "grad_norm": 0.033750370144844055, + "learning_rate": 0.00019887033619874882, + "loss": 0.4309, + "step": 5189 + }, + { + "epoch": 0.42044718081659105, + "grad_norm": 0.027981825172901154, + "learning_rate": 0.0001988658355461542, + "loss": 0.3105, + "step": 5190 + }, + { + "epoch": 0.42052819183408946, + "grad_norm": 0.03219735622406006, + "learning_rate": 0.00019886133489355957, + "loss": 0.3925, + "step": 5191 + }, + { + "epoch": 0.4206092028515878, + "grad_norm": 0.02849307656288147, + "learning_rate": 0.00019885683424096496, + "loss": 0.3735, + "step": 5192 + }, + { + "epoch": 0.42069021386908617, + "grad_norm": 0.031731121242046356, + "learning_rate": 0.00019885233358837032, + "loss": 0.3774, + "step": 5193 + }, + { + "epoch": 0.4207712248865846, + "grad_norm": 0.03386862203478813, + "learning_rate": 0.00019884783293577568, + "loss": 0.3527, + "step": 5194 + }, + { + "epoch": 0.42085223590408294, + "grad_norm": 0.03693525865674019, + "learning_rate": 0.00019884333228318107, + "loss": 0.3667, + "step": 5195 + }, + { + "epoch": 0.42093324692158135, + "grad_norm": 0.03634597361087799, + "learning_rate": 0.00019883883163058645, + "loss": 0.4062, + "step": 5196 + }, + { + "epoch": 0.4210142579390797, + "grad_norm": 0.03302176296710968, + "learning_rate": 0.0001988343309779918, + "loss": 0.3638, + "step": 5197 + }, + { + "epoch": 0.4210952689565781, + "grad_norm": 0.03496898338198662, + "learning_rate": 0.0001988298303253972, + "loss": 0.3592, + "step": 5198 + }, + { + "epoch": 0.4211762799740765, + "grad_norm": 0.032557763159275055, + "learning_rate": 0.00019882532967280256, + "loss": 0.3665, + "step": 5199 + }, + { + "epoch": 0.42125729099157483, + "grad_norm": 0.030008163303136826, + "learning_rate": 0.00019882082902020792, + "loss": 0.3572, + "step": 5200 + }, + { + "epoch": 0.42133830200907324, + "grad_norm": 0.03188272565603256, + "learning_rate": 0.0001988163283676133, + "loss": 0.3361, + 
"step": 5201 + }, + { + "epoch": 0.4214193130265716, + "grad_norm": 0.03313596546649933, + "learning_rate": 0.0001988118277150187, + "loss": 0.3759, + "step": 5202 + }, + { + "epoch": 0.42150032404407, + "grad_norm": 0.032859593629837036, + "learning_rate": 0.00019880732706242406, + "loss": 0.379, + "step": 5203 + }, + { + "epoch": 0.42158133506156836, + "grad_norm": 0.029185116291046143, + "learning_rate": 0.00019880282640982944, + "loss": 0.3275, + "step": 5204 + }, + { + "epoch": 0.4216623460790668, + "grad_norm": 0.029771940782666206, + "learning_rate": 0.0001987983257572348, + "loss": 0.3778, + "step": 5205 + }, + { + "epoch": 0.42174335709656513, + "grad_norm": 0.029124047607183456, + "learning_rate": 0.00019879382510464016, + "loss": 0.3467, + "step": 5206 + }, + { + "epoch": 0.4218243681140635, + "grad_norm": 0.03311445564031601, + "learning_rate": 0.00019878932445204558, + "loss": 0.3557, + "step": 5207 + }, + { + "epoch": 0.4219053791315619, + "grad_norm": 0.029799094423651695, + "learning_rate": 0.00019878482379945094, + "loss": 0.3477, + "step": 5208 + }, + { + "epoch": 0.42198639014906025, + "grad_norm": 0.03274417296051979, + "learning_rate": 0.0001987803231468563, + "loss": 0.3755, + "step": 5209 + }, + { + "epoch": 0.42206740116655866, + "grad_norm": 0.03296559676527977, + "learning_rate": 0.00019877582249426168, + "loss": 0.3356, + "step": 5210 + }, + { + "epoch": 0.422148412184057, + "grad_norm": 0.03630368784070015, + "learning_rate": 0.00019877132184166704, + "loss": 0.4121, + "step": 5211 + }, + { + "epoch": 0.42222942320155543, + "grad_norm": 0.03400547802448273, + "learning_rate": 0.0001987668211890724, + "loss": 0.3499, + "step": 5212 + }, + { + "epoch": 0.4223104342190538, + "grad_norm": 0.03906463086605072, + "learning_rate": 0.00019876232053647782, + "loss": 0.3674, + "step": 5213 + }, + { + "epoch": 0.4223914452365522, + "grad_norm": 0.031435608863830566, + "learning_rate": 0.00019875781988388318, + "loss": 0.3705, + "step": 5214 + }, + { + "epoch": 0.42247245625405055, + "grad_norm": 0.03501134365797043, + "learning_rate": 0.00019875331923128854, + "loss": 0.3594, + "step": 5215 + }, + { + "epoch": 0.4225534672715489, + "grad_norm": 0.029576752334833145, + "learning_rate": 0.00019874881857869393, + "loss": 0.3737, + "step": 5216 + }, + { + "epoch": 0.4226344782890473, + "grad_norm": 0.03144533187150955, + "learning_rate": 0.00019874431792609929, + "loss": 0.3741, + "step": 5217 + }, + { + "epoch": 0.4227154893065457, + "grad_norm": 0.03286955505609512, + "learning_rate": 0.00019873981727350465, + "loss": 0.3791, + "step": 5218 + }, + { + "epoch": 0.4227965003240441, + "grad_norm": 0.02476130612194538, + "learning_rate": 0.00019873531662091006, + "loss": 0.3316, + "step": 5219 + }, + { + "epoch": 0.42287751134154244, + "grad_norm": 0.03551604598760605, + "learning_rate": 0.00019873081596831542, + "loss": 0.3734, + "step": 5220 + }, + { + "epoch": 0.42295852235904086, + "grad_norm": 0.03408364579081535, + "learning_rate": 0.00019872631531572078, + "loss": 0.3709, + "step": 5221 + }, + { + "epoch": 0.4230395333765392, + "grad_norm": 0.0335887186229229, + "learning_rate": 0.00019872181466312617, + "loss": 0.3626, + "step": 5222 + }, + { + "epoch": 0.42312054439403757, + "grad_norm": 0.03350028395652771, + "learning_rate": 0.00019871731401053153, + "loss": 0.364, + "step": 5223 + }, + { + "epoch": 0.423201555411536, + "grad_norm": 0.03685237467288971, + "learning_rate": 0.0001987128133579369, + "loss": 0.3232, + "step": 5224 + }, + { + "epoch": 0.42328256642903433, 
+ "grad_norm": 0.03553393483161926, + "learning_rate": 0.0001987083127053423, + "loss": 0.358, + "step": 5225 + }, + { + "epoch": 0.42336357744653275, + "grad_norm": 0.034406695514917374, + "learning_rate": 0.00019870381205274766, + "loss": 0.3339, + "step": 5226 + }, + { + "epoch": 0.4234445884640311, + "grad_norm": 0.03492816910147667, + "learning_rate": 0.00019869931140015302, + "loss": 0.381, + "step": 5227 + }, + { + "epoch": 0.4235255994815295, + "grad_norm": 0.028665270656347275, + "learning_rate": 0.0001986948107475584, + "loss": 0.3527, + "step": 5228 + }, + { + "epoch": 0.42360661049902787, + "grad_norm": 0.04024519771337509, + "learning_rate": 0.00019869031009496377, + "loss": 0.3728, + "step": 5229 + }, + { + "epoch": 0.4236876215165262, + "grad_norm": 0.032281264662742615, + "learning_rate": 0.00019868580944236916, + "loss": 0.3872, + "step": 5230 + }, + { + "epoch": 0.42376863253402464, + "grad_norm": 0.03774891793727875, + "learning_rate": 0.00019868130878977454, + "loss": 0.4118, + "step": 5231 + }, + { + "epoch": 0.423849643551523, + "grad_norm": 0.032088082283735275, + "learning_rate": 0.0001986768081371799, + "loss": 0.3465, + "step": 5232 + }, + { + "epoch": 0.4239306545690214, + "grad_norm": 0.030594127252697945, + "learning_rate": 0.00019867230748458526, + "loss": 0.3673, + "step": 5233 + }, + { + "epoch": 0.42401166558651976, + "grad_norm": 0.03421124815940857, + "learning_rate": 0.00019866780683199065, + "loss": 0.3336, + "step": 5234 + }, + { + "epoch": 0.42409267660401817, + "grad_norm": 0.03351978585124016, + "learning_rate": 0.000198663306179396, + "loss": 0.3349, + "step": 5235 + }, + { + "epoch": 0.4241736876215165, + "grad_norm": 0.03628509119153023, + "learning_rate": 0.0001986588055268014, + "loss": 0.3036, + "step": 5236 + }, + { + "epoch": 0.4242546986390149, + "grad_norm": 0.03526661545038223, + "learning_rate": 0.00019865430487420678, + "loss": 0.3514, + "step": 5237 + }, + { + "epoch": 0.4243357096565133, + "grad_norm": 0.029476439580321312, + "learning_rate": 0.00019864980422161214, + "loss": 0.3813, + "step": 5238 + }, + { + "epoch": 0.42441672067401165, + "grad_norm": 0.03853764757514, + "learning_rate": 0.0001986453035690175, + "loss": 0.3639, + "step": 5239 + }, + { + "epoch": 0.42449773169151006, + "grad_norm": 0.033753901720047, + "learning_rate": 0.0001986408029164229, + "loss": 0.3765, + "step": 5240 + }, + { + "epoch": 0.4245787427090084, + "grad_norm": 0.035833194851875305, + "learning_rate": 0.00019863630226382825, + "loss": 0.3427, + "step": 5241 + }, + { + "epoch": 0.42465975372650683, + "grad_norm": 0.03638903796672821, + "learning_rate": 0.00019863180161123364, + "loss": 0.3804, + "step": 5242 + }, + { + "epoch": 0.4247407647440052, + "grad_norm": 0.029470784589648247, + "learning_rate": 0.00019862730095863903, + "loss": 0.3455, + "step": 5243 + }, + { + "epoch": 0.42482177576150354, + "grad_norm": 0.03390977531671524, + "learning_rate": 0.00019862280030604439, + "loss": 0.3588, + "step": 5244 + }, + { + "epoch": 0.42490278677900195, + "grad_norm": 0.03637409582734108, + "learning_rate": 0.00019861829965344975, + "loss": 0.4302, + "step": 5245 + }, + { + "epoch": 0.4249837977965003, + "grad_norm": 0.0375310555100441, + "learning_rate": 0.00019861379900085513, + "loss": 0.3476, + "step": 5246 + }, + { + "epoch": 0.4250648088139987, + "grad_norm": 0.027415787801146507, + "learning_rate": 0.0001986092983482605, + "loss": 0.3223, + "step": 5247 + }, + { + "epoch": 0.4251458198314971, + "grad_norm": 0.03121158853173256, + "learning_rate": 
0.00019860479769566588, + "loss": 0.376, + "step": 5248 + }, + { + "epoch": 0.4252268308489955, + "grad_norm": 0.03210705518722534, + "learning_rate": 0.00019860029704307127, + "loss": 0.4036, + "step": 5249 + }, + { + "epoch": 0.42530784186649384, + "grad_norm": 0.03374101221561432, + "learning_rate": 0.00019859579639047663, + "loss": 0.4263, + "step": 5250 + }, + { + "epoch": 0.4253888528839922, + "grad_norm": 0.029598388820886612, + "learning_rate": 0.000198591295737882, + "loss": 0.362, + "step": 5251 + }, + { + "epoch": 0.4254698639014906, + "grad_norm": 0.032661404460668564, + "learning_rate": 0.00019858679508528738, + "loss": 0.3719, + "step": 5252 + }, + { + "epoch": 0.42555087491898896, + "grad_norm": 0.028480835258960724, + "learning_rate": 0.00019858229443269274, + "loss": 0.3359, + "step": 5253 + }, + { + "epoch": 0.4256318859364874, + "grad_norm": 0.03154754266142845, + "learning_rate": 0.00019857779378009812, + "loss": 0.3264, + "step": 5254 + }, + { + "epoch": 0.42571289695398573, + "grad_norm": 0.032808415591716766, + "learning_rate": 0.0001985732931275035, + "loss": 0.3761, + "step": 5255 + }, + { + "epoch": 0.42579390797148414, + "grad_norm": 0.03287159278988838, + "learning_rate": 0.00019856879247490887, + "loss": 0.348, + "step": 5256 + }, + { + "epoch": 0.4258749189889825, + "grad_norm": 0.02578105963766575, + "learning_rate": 0.00019856429182231423, + "loss": 0.3518, + "step": 5257 + }, + { + "epoch": 0.42595593000648085, + "grad_norm": 0.02951761521399021, + "learning_rate": 0.00019855979116971962, + "loss": 0.3867, + "step": 5258 + }, + { + "epoch": 0.42603694102397927, + "grad_norm": 0.03331972658634186, + "learning_rate": 0.000198555290517125, + "loss": 0.3304, + "step": 5259 + }, + { + "epoch": 0.4261179520414776, + "grad_norm": 0.028827041387557983, + "learning_rate": 0.00019855078986453036, + "loss": 0.3001, + "step": 5260 + }, + { + "epoch": 0.42619896305897603, + "grad_norm": 0.031208734959363937, + "learning_rate": 0.00019854628921193575, + "loss": 0.3473, + "step": 5261 + }, + { + "epoch": 0.4262799740764744, + "grad_norm": 0.028891805559396744, + "learning_rate": 0.0001985417885593411, + "loss": 0.3395, + "step": 5262 + }, + { + "epoch": 0.4263609850939728, + "grad_norm": 0.031027546152472496, + "learning_rate": 0.00019853728790674647, + "loss": 0.3426, + "step": 5263 + }, + { + "epoch": 0.42644199611147116, + "grad_norm": 0.04123328626155853, + "learning_rate": 0.00019853278725415186, + "loss": 0.4216, + "step": 5264 + }, + { + "epoch": 0.42652300712896957, + "grad_norm": 0.032555144280195236, + "learning_rate": 0.00019852828660155725, + "loss": 0.3763, + "step": 5265 + }, + { + "epoch": 0.4266040181464679, + "grad_norm": 0.032303549349308014, + "learning_rate": 0.0001985237859489626, + "loss": 0.3624, + "step": 5266 + }, + { + "epoch": 0.4266850291639663, + "grad_norm": 0.03178047016263008, + "learning_rate": 0.000198519285296368, + "loss": 0.3765, + "step": 5267 + }, + { + "epoch": 0.4267660401814647, + "grad_norm": 0.02855534851551056, + "learning_rate": 0.00019851478464377335, + "loss": 0.3523, + "step": 5268 + }, + { + "epoch": 0.42684705119896305, + "grad_norm": 0.04090893268585205, + "learning_rate": 0.0001985102839911787, + "loss": 0.3015, + "step": 5269 + }, + { + "epoch": 0.42692806221646146, + "grad_norm": 0.033113036304712296, + "learning_rate": 0.0001985057833385841, + "loss": 0.3791, + "step": 5270 + }, + { + "epoch": 0.4270090732339598, + "grad_norm": 0.029150884598493576, + "learning_rate": 0.0001985012826859895, + "loss": 0.3366, + 
"step": 5271 + }, + { + "epoch": 0.4270900842514582, + "grad_norm": 0.029976367950439453, + "learning_rate": 0.00019849678203339485, + "loss": 0.3095, + "step": 5272 + }, + { + "epoch": 0.4271710952689566, + "grad_norm": 0.029154542833566666, + "learning_rate": 0.00019849228138080023, + "loss": 0.3309, + "step": 5273 + }, + { + "epoch": 0.42725210628645494, + "grad_norm": 0.030596930533647537, + "learning_rate": 0.0001984877807282056, + "loss": 0.3795, + "step": 5274 + }, + { + "epoch": 0.42733311730395335, + "grad_norm": 0.03367812559008598, + "learning_rate": 0.00019848328007561095, + "loss": 0.3595, + "step": 5275 + }, + { + "epoch": 0.4274141283214517, + "grad_norm": 0.029898041859269142, + "learning_rate": 0.00019847877942301634, + "loss": 0.364, + "step": 5276 + }, + { + "epoch": 0.4274951393389501, + "grad_norm": 0.02948545664548874, + "learning_rate": 0.00019847427877042173, + "loss": 0.3455, + "step": 5277 + }, + { + "epoch": 0.42757615035644847, + "grad_norm": 0.03094439208507538, + "learning_rate": 0.0001984697781178271, + "loss": 0.3427, + "step": 5278 + }, + { + "epoch": 0.4276571613739469, + "grad_norm": 0.03338692709803581, + "learning_rate": 0.00019846527746523248, + "loss": 0.3592, + "step": 5279 + }, + { + "epoch": 0.42773817239144524, + "grad_norm": 0.02944916859269142, + "learning_rate": 0.00019846077681263784, + "loss": 0.3383, + "step": 5280 + }, + { + "epoch": 0.4278191834089436, + "grad_norm": 0.033850450068712234, + "learning_rate": 0.0001984562761600432, + "loss": 0.3489, + "step": 5281 + }, + { + "epoch": 0.427900194426442, + "grad_norm": 0.03082362376153469, + "learning_rate": 0.00019845177550744858, + "loss": 0.3363, + "step": 5282 + }, + { + "epoch": 0.42798120544394036, + "grad_norm": 0.03181496635079384, + "learning_rate": 0.00019844727485485397, + "loss": 0.3723, + "step": 5283 + }, + { + "epoch": 0.42806221646143877, + "grad_norm": 0.03152613341808319, + "learning_rate": 0.00019844277420225933, + "loss": 0.322, + "step": 5284 + }, + { + "epoch": 0.4281432274789371, + "grad_norm": 0.03608822450041771, + "learning_rate": 0.00019843827354966472, + "loss": 0.3746, + "step": 5285 + }, + { + "epoch": 0.42822423849643554, + "grad_norm": 0.033797916024923325, + "learning_rate": 0.00019843377289707008, + "loss": 0.3856, + "step": 5286 + }, + { + "epoch": 0.4283052495139339, + "grad_norm": 0.029529495164752007, + "learning_rate": 0.00019842927224447544, + "loss": 0.3359, + "step": 5287 + }, + { + "epoch": 0.42838626053143225, + "grad_norm": 0.03147159516811371, + "learning_rate": 0.00019842477159188085, + "loss": 0.3501, + "step": 5288 + }, + { + "epoch": 0.42846727154893066, + "grad_norm": 0.03382951766252518, + "learning_rate": 0.0001984202709392862, + "loss": 0.3169, + "step": 5289 + }, + { + "epoch": 0.428548282566429, + "grad_norm": 0.030625872313976288, + "learning_rate": 0.00019841577028669157, + "loss": 0.3695, + "step": 5290 + }, + { + "epoch": 0.42862929358392743, + "grad_norm": 0.03295085206627846, + "learning_rate": 0.00019841126963409696, + "loss": 0.385, + "step": 5291 + }, + { + "epoch": 0.4287103046014258, + "grad_norm": 0.03444168344140053, + "learning_rate": 0.00019840676898150232, + "loss": 0.3397, + "step": 5292 + }, + { + "epoch": 0.4287913156189242, + "grad_norm": 0.03426263481378555, + "learning_rate": 0.00019840226832890768, + "loss": 0.3454, + "step": 5293 + }, + { + "epoch": 0.42887232663642255, + "grad_norm": 0.03757169842720032, + "learning_rate": 0.0001983977676763131, + "loss": 0.3777, + "step": 5294 + }, + { + "epoch": 
0.4289533376539209, + "grad_norm": 0.03087097406387329, + "learning_rate": 0.00019839326702371845, + "loss": 0.3677, + "step": 5295 + }, + { + "epoch": 0.4290343486714193, + "grad_norm": 0.03303956612944603, + "learning_rate": 0.00019838876637112381, + "loss": 0.3769, + "step": 5296 + }, + { + "epoch": 0.4291153596889177, + "grad_norm": 0.03492473065853119, + "learning_rate": 0.0001983842657185292, + "loss": 0.386, + "step": 5297 + }, + { + "epoch": 0.4291963707064161, + "grad_norm": 0.03316411003470421, + "learning_rate": 0.00019837976506593456, + "loss": 0.3447, + "step": 5298 + }, + { + "epoch": 0.42927738172391444, + "grad_norm": 0.03256673365831375, + "learning_rate": 0.00019837526441333992, + "loss": 0.3063, + "step": 5299 + }, + { + "epoch": 0.42935839274141285, + "grad_norm": 0.03335904702544212, + "learning_rate": 0.00019837076376074534, + "loss": 0.3417, + "step": 5300 + }, + { + "epoch": 0.4294394037589112, + "grad_norm": 0.030818233266472816, + "learning_rate": 0.0001983662631081507, + "loss": 0.3476, + "step": 5301 + }, + { + "epoch": 0.42952041477640956, + "grad_norm": 0.03463114798069, + "learning_rate": 0.00019836176245555606, + "loss": 0.3799, + "step": 5302 + }, + { + "epoch": 0.429601425793908, + "grad_norm": 0.03285285457968712, + "learning_rate": 0.00019835726180296144, + "loss": 0.387, + "step": 5303 + }, + { + "epoch": 0.42968243681140633, + "grad_norm": 0.03394316881895065, + "learning_rate": 0.0001983527611503668, + "loss": 0.3592, + "step": 5304 + }, + { + "epoch": 0.42976344782890474, + "grad_norm": 0.02860003523528576, + "learning_rate": 0.00019834826049777216, + "loss": 0.3425, + "step": 5305 + }, + { + "epoch": 0.4298444588464031, + "grad_norm": 0.031108953058719635, + "learning_rate": 0.00019834375984517758, + "loss": 0.3452, + "step": 5306 + }, + { + "epoch": 0.4299254698639015, + "grad_norm": 0.03230966255068779, + "learning_rate": 0.00019833925919258294, + "loss": 0.3784, + "step": 5307 + }, + { + "epoch": 0.43000648088139987, + "grad_norm": 0.03210729360580444, + "learning_rate": 0.0001983347585399883, + "loss": 0.3496, + "step": 5308 + }, + { + "epoch": 0.4300874918988983, + "grad_norm": 0.029922574758529663, + "learning_rate": 0.00019833025788739368, + "loss": 0.3956, + "step": 5309 + }, + { + "epoch": 0.43016850291639663, + "grad_norm": 0.03001343458890915, + "learning_rate": 0.00019832575723479904, + "loss": 0.3706, + "step": 5310 + }, + { + "epoch": 0.430249513933895, + "grad_norm": 0.030526647344231606, + "learning_rate": 0.00019832125658220443, + "loss": 0.3644, + "step": 5311 + }, + { + "epoch": 0.4303305249513934, + "grad_norm": 0.03420925512909889, + "learning_rate": 0.00019831675592960982, + "loss": 0.3832, + "step": 5312 + }, + { + "epoch": 0.43041153596889176, + "grad_norm": 0.03520764037966728, + "learning_rate": 0.00019831225527701518, + "loss": 0.3439, + "step": 5313 + }, + { + "epoch": 0.43049254698639017, + "grad_norm": 0.03564382344484329, + "learning_rate": 0.00019830775462442054, + "loss": 0.3472, + "step": 5314 + }, + { + "epoch": 0.4305735580038885, + "grad_norm": 0.03275568038225174, + "learning_rate": 0.00019830325397182593, + "loss": 0.408, + "step": 5315 + }, + { + "epoch": 0.43065456902138693, + "grad_norm": 0.03222983330488205, + "learning_rate": 0.00019829875331923129, + "loss": 0.3672, + "step": 5316 + }, + { + "epoch": 0.4307355800388853, + "grad_norm": 0.034696005284786224, + "learning_rate": 0.00019829425266663667, + "loss": 0.3478, + "step": 5317 + }, + { + "epoch": 0.43081659105638365, + "grad_norm": 
0.031081615015864372, + "learning_rate": 0.00019828975201404206, + "loss": 0.3881, + "step": 5318 + }, + { + "epoch": 0.43089760207388206, + "grad_norm": 0.04065845534205437, + "learning_rate": 0.00019828525136144742, + "loss": 0.3613, + "step": 5319 + }, + { + "epoch": 0.4309786130913804, + "grad_norm": 0.03409722447395325, + "learning_rate": 0.00019828075070885278, + "loss": 0.4114, + "step": 5320 + }, + { + "epoch": 0.4310596241088788, + "grad_norm": 0.03448516130447388, + "learning_rate": 0.00019827625005625817, + "loss": 0.3711, + "step": 5321 + }, + { + "epoch": 0.4311406351263772, + "grad_norm": 0.03240002319216728, + "learning_rate": 0.00019827174940366353, + "loss": 0.3646, + "step": 5322 + }, + { + "epoch": 0.4312216461438756, + "grad_norm": 0.031075075268745422, + "learning_rate": 0.00019826724875106891, + "loss": 0.3832, + "step": 5323 + }, + { + "epoch": 0.43130265716137395, + "grad_norm": 0.03438475355505943, + "learning_rate": 0.0001982627480984743, + "loss": 0.3838, + "step": 5324 + }, + { + "epoch": 0.4313836681788723, + "grad_norm": 0.03652133792638779, + "learning_rate": 0.00019825824744587966, + "loss": 0.421, + "step": 5325 + }, + { + "epoch": 0.4314646791963707, + "grad_norm": 0.029609760269522667, + "learning_rate": 0.00019825374679328502, + "loss": 0.3324, + "step": 5326 + }, + { + "epoch": 0.43154569021386907, + "grad_norm": 0.03364776447415352, + "learning_rate": 0.0001982492461406904, + "loss": 0.4306, + "step": 5327 + }, + { + "epoch": 0.4316267012313675, + "grad_norm": 0.03062380850315094, + "learning_rate": 0.00019824474548809577, + "loss": 0.3697, + "step": 5328 + }, + { + "epoch": 0.43170771224886584, + "grad_norm": 0.03233860060572624, + "learning_rate": 0.00019824024483550116, + "loss": 0.347, + "step": 5329 + }, + { + "epoch": 0.43178872326636425, + "grad_norm": 0.03236883133649826, + "learning_rate": 0.00019823574418290654, + "loss": 0.3684, + "step": 5330 + }, + { + "epoch": 0.4318697342838626, + "grad_norm": 0.029878897592425346, + "learning_rate": 0.0001982312435303119, + "loss": 0.3848, + "step": 5331 + }, + { + "epoch": 0.43195074530136096, + "grad_norm": 0.03530362248420715, + "learning_rate": 0.00019822674287771726, + "loss": 0.3679, + "step": 5332 + }, + { + "epoch": 0.4320317563188594, + "grad_norm": 0.03218994662165642, + "learning_rate": 0.00019822224222512265, + "loss": 0.3433, + "step": 5333 + }, + { + "epoch": 0.43211276733635773, + "grad_norm": 0.030317038297653198, + "learning_rate": 0.000198217741572528, + "loss": 0.3517, + "step": 5334 + }, + { + "epoch": 0.43219377835385614, + "grad_norm": 0.03286263346672058, + "learning_rate": 0.0001982132409199334, + "loss": 0.3677, + "step": 5335 + }, + { + "epoch": 0.4322747893713545, + "grad_norm": 0.03247006982564926, + "learning_rate": 0.00019820874026733879, + "loss": 0.3826, + "step": 5336 + }, + { + "epoch": 0.4323558003888529, + "grad_norm": 0.030288146808743477, + "learning_rate": 0.00019820423961474415, + "loss": 0.3334, + "step": 5337 + }, + { + "epoch": 0.43243681140635126, + "grad_norm": 0.034478385001420975, + "learning_rate": 0.0001981997389621495, + "loss": 0.3603, + "step": 5338 + }, + { + "epoch": 0.4325178224238496, + "grad_norm": 0.03148753196001053, + "learning_rate": 0.0001981952383095549, + "loss": 0.4004, + "step": 5339 + }, + { + "epoch": 0.43259883344134803, + "grad_norm": 0.03236803039908409, + "learning_rate": 0.00019819073765696028, + "loss": 0.3352, + "step": 5340 + }, + { + "epoch": 0.4326798444588464, + "grad_norm": 0.03081342577934265, + "learning_rate": 
0.00019818623700436564, + "loss": 0.3771, + "step": 5341 + }, + { + "epoch": 0.4327608554763448, + "grad_norm": 0.0314449742436409, + "learning_rate": 0.00019818173635177103, + "loss": 0.362, + "step": 5342 + }, + { + "epoch": 0.43284186649384315, + "grad_norm": 0.03407454490661621, + "learning_rate": 0.0001981772356991764, + "loss": 0.3476, + "step": 5343 + }, + { + "epoch": 0.43292287751134156, + "grad_norm": 0.03543641045689583, + "learning_rate": 0.00019817273504658175, + "loss": 0.3954, + "step": 5344 + }, + { + "epoch": 0.4330038885288399, + "grad_norm": 0.03256583213806152, + "learning_rate": 0.00019816823439398713, + "loss": 0.3339, + "step": 5345 + }, + { + "epoch": 0.4330848995463383, + "grad_norm": 0.0349603109061718, + "learning_rate": 0.00019816373374139252, + "loss": 0.3841, + "step": 5346 + }, + { + "epoch": 0.4331659105638367, + "grad_norm": 0.03182327374815941, + "learning_rate": 0.00019815923308879788, + "loss": 0.418, + "step": 5347 + }, + { + "epoch": 0.43324692158133504, + "grad_norm": 0.03906280919909477, + "learning_rate": 0.00019815473243620327, + "loss": 0.3678, + "step": 5348 + }, + { + "epoch": 0.43332793259883345, + "grad_norm": 0.02743501029908657, + "learning_rate": 0.00019815023178360863, + "loss": 0.3212, + "step": 5349 + }, + { + "epoch": 0.4334089436163318, + "grad_norm": 0.034895461052656174, + "learning_rate": 0.000198145731131014, + "loss": 0.3985, + "step": 5350 + }, + { + "epoch": 0.4334899546338302, + "grad_norm": 0.030332963913679123, + "learning_rate": 0.00019814123047841938, + "loss": 0.3461, + "step": 5351 + }, + { + "epoch": 0.4335709656513286, + "grad_norm": 0.03224902227520943, + "learning_rate": 0.00019813672982582476, + "loss": 0.3443, + "step": 5352 + }, + { + "epoch": 0.43365197666882693, + "grad_norm": 0.03049425408244133, + "learning_rate": 0.00019813222917323012, + "loss": 0.3866, + "step": 5353 + }, + { + "epoch": 0.43373298768632534, + "grad_norm": 0.036649126559495926, + "learning_rate": 0.0001981277285206355, + "loss": 0.3809, + "step": 5354 + }, + { + "epoch": 0.4338139987038237, + "grad_norm": 0.03557087481021881, + "learning_rate": 0.00019812322786804087, + "loss": 0.3624, + "step": 5355 + }, + { + "epoch": 0.4338950097213221, + "grad_norm": 0.04648400843143463, + "learning_rate": 0.00019811872721544623, + "loss": 0.4313, + "step": 5356 + }, + { + "epoch": 0.43397602073882047, + "grad_norm": 0.03537077084183693, + "learning_rate": 0.00019811422656285162, + "loss": 0.3906, + "step": 5357 + }, + { + "epoch": 0.4340570317563189, + "grad_norm": 0.03709874302148819, + "learning_rate": 0.000198109725910257, + "loss": 0.3984, + "step": 5358 + }, + { + "epoch": 0.43413804277381723, + "grad_norm": 0.03712714463472366, + "learning_rate": 0.00019810522525766236, + "loss": 0.365, + "step": 5359 + }, + { + "epoch": 0.43421905379131565, + "grad_norm": 0.03208884969353676, + "learning_rate": 0.00019810072460506775, + "loss": 0.3303, + "step": 5360 + }, + { + "epoch": 0.434300064808814, + "grad_norm": 0.03099757432937622, + "learning_rate": 0.0001980962239524731, + "loss": 0.3606, + "step": 5361 + }, + { + "epoch": 0.43438107582631236, + "grad_norm": 0.0307605043053627, + "learning_rate": 0.00019809172329987847, + "loss": 0.3973, + "step": 5362 + }, + { + "epoch": 0.43446208684381077, + "grad_norm": 0.036221764981746674, + "learning_rate": 0.00019808722264728389, + "loss": 0.3629, + "step": 5363 + }, + { + "epoch": 0.4345430978613091, + "grad_norm": 0.03021487593650818, + "learning_rate": 0.00019808272199468925, + "loss": 0.3539, + "step": 5364 
+ }, + { + "epoch": 0.43462410887880754, + "grad_norm": 0.038944393396377563, + "learning_rate": 0.0001980782213420946, + "loss": 0.3857, + "step": 5365 + }, + { + "epoch": 0.4347051198963059, + "grad_norm": 0.029537489637732506, + "learning_rate": 0.0001980737206895, + "loss": 0.3449, + "step": 5366 + }, + { + "epoch": 0.4347861309138043, + "grad_norm": 0.0289238803088665, + "learning_rate": 0.00019806922003690535, + "loss": 0.3115, + "step": 5367 + }, + { + "epoch": 0.43486714193130266, + "grad_norm": 0.027230558916926384, + "learning_rate": 0.0001980647193843107, + "loss": 0.3351, + "step": 5368 + }, + { + "epoch": 0.434948152948801, + "grad_norm": 0.032325826585292816, + "learning_rate": 0.00019806021873171613, + "loss": 0.3646, + "step": 5369 + }, + { + "epoch": 0.4350291639662994, + "grad_norm": 0.03655783087015152, + "learning_rate": 0.0001980557180791215, + "loss": 0.4564, + "step": 5370 + }, + { + "epoch": 0.4351101749837978, + "grad_norm": 0.028337333351373672, + "learning_rate": 0.00019805121742652685, + "loss": 0.3312, + "step": 5371 + }, + { + "epoch": 0.4351911860012962, + "grad_norm": 0.037807732820510864, + "learning_rate": 0.00019804671677393223, + "loss": 0.401, + "step": 5372 + }, + { + "epoch": 0.43527219701879455, + "grad_norm": 0.030689479783177376, + "learning_rate": 0.0001980422161213376, + "loss": 0.3411, + "step": 5373 + }, + { + "epoch": 0.43535320803629296, + "grad_norm": 0.03371885046362877, + "learning_rate": 0.00019803771546874295, + "loss": 0.3303, + "step": 5374 + }, + { + "epoch": 0.4354342190537913, + "grad_norm": 0.03178364410996437, + "learning_rate": 0.00019803321481614837, + "loss": 0.3628, + "step": 5375 + }, + { + "epoch": 0.43551523007128967, + "grad_norm": 0.03330547362565994, + "learning_rate": 0.00019802871416355373, + "loss": 0.3534, + "step": 5376 + }, + { + "epoch": 0.4355962410887881, + "grad_norm": 0.03564752638339996, + "learning_rate": 0.0001980242135109591, + "loss": 0.3938, + "step": 5377 + }, + { + "epoch": 0.43567725210628644, + "grad_norm": 0.038495369255542755, + "learning_rate": 0.00019801971285836448, + "loss": 0.34, + "step": 5378 + }, + { + "epoch": 0.43575826312378485, + "grad_norm": 0.03451234847307205, + "learning_rate": 0.00019801521220576984, + "loss": 0.3928, + "step": 5379 + }, + { + "epoch": 0.4358392741412832, + "grad_norm": 0.03760962560772896, + "learning_rate": 0.0001980107115531752, + "loss": 0.4047, + "step": 5380 + }, + { + "epoch": 0.4359202851587816, + "grad_norm": 0.034070443361997604, + "learning_rate": 0.0001980062109005806, + "loss": 0.3554, + "step": 5381 + }, + { + "epoch": 0.43600129617628, + "grad_norm": 0.03605116531252861, + "learning_rate": 0.00019800171024798597, + "loss": 0.3732, + "step": 5382 + }, + { + "epoch": 0.43608230719377833, + "grad_norm": 0.04791965335607529, + "learning_rate": 0.00019799720959539133, + "loss": 0.3819, + "step": 5383 + }, + { + "epoch": 0.43616331821127674, + "grad_norm": 0.03060603328049183, + "learning_rate": 0.00019799270894279672, + "loss": 0.3421, + "step": 5384 + }, + { + "epoch": 0.4362443292287751, + "grad_norm": 0.04278833419084549, + "learning_rate": 0.00019798820829020208, + "loss": 0.3558, + "step": 5385 + }, + { + "epoch": 0.4363253402462735, + "grad_norm": 0.037542033940553665, + "learning_rate": 0.00019798370763760747, + "loss": 0.4201, + "step": 5386 + }, + { + "epoch": 0.43640635126377186, + "grad_norm": 0.033243872225284576, + "learning_rate": 0.00019797920698501285, + "loss": 0.391, + "step": 5387 + }, + { + "epoch": 0.4364873622812703, + "grad_norm": 
0.032972801476716995, + "learning_rate": 0.0001979747063324182, + "loss": 0.3856, + "step": 5388 + }, + { + "epoch": 0.43656837329876863, + "grad_norm": 0.034915633499622345, + "learning_rate": 0.00019797020567982357, + "loss": 0.3655, + "step": 5389 + }, + { + "epoch": 0.436649384316267, + "grad_norm": 0.031369417905807495, + "learning_rate": 0.00019796570502722896, + "loss": 0.359, + "step": 5390 + }, + { + "epoch": 0.4367303953337654, + "grad_norm": 0.03334236517548561, + "learning_rate": 0.00019796120437463432, + "loss": 0.4029, + "step": 5391 + }, + { + "epoch": 0.43681140635126375, + "grad_norm": 0.030649734660983086, + "learning_rate": 0.0001979567037220397, + "loss": 0.343, + "step": 5392 + }, + { + "epoch": 0.43689241736876216, + "grad_norm": 0.034339819103479385, + "learning_rate": 0.0001979522030694451, + "loss": 0.3765, + "step": 5393 + }, + { + "epoch": 0.4369734283862605, + "grad_norm": 0.033588845282793045, + "learning_rate": 0.00019794770241685045, + "loss": 0.3586, + "step": 5394 + }, + { + "epoch": 0.43705443940375893, + "grad_norm": 0.029606353491544724, + "learning_rate": 0.00019794320176425581, + "loss": 0.3613, + "step": 5395 + }, + { + "epoch": 0.4371354504212573, + "grad_norm": 0.0318920835852623, + "learning_rate": 0.0001979387011116612, + "loss": 0.388, + "step": 5396 + }, + { + "epoch": 0.43721646143875564, + "grad_norm": 0.03575948253273964, + "learning_rate": 0.00019793420045906656, + "loss": 0.3707, + "step": 5397 + }, + { + "epoch": 0.43729747245625405, + "grad_norm": 0.02881302498281002, + "learning_rate": 0.00019792969980647195, + "loss": 0.3507, + "step": 5398 + }, + { + "epoch": 0.4373784834737524, + "grad_norm": 0.031063402071595192, + "learning_rate": 0.00019792519915387734, + "loss": 0.3456, + "step": 5399 + }, + { + "epoch": 0.4374594944912508, + "grad_norm": 0.030241835862398148, + "learning_rate": 0.0001979206985012827, + "loss": 0.3449, + "step": 5400 + }, + { + "epoch": 0.4375405055087492, + "grad_norm": 0.03187227621674538, + "learning_rate": 0.00019791619784868806, + "loss": 0.3276, + "step": 5401 + }, + { + "epoch": 0.4376215165262476, + "grad_norm": 0.032396234571933746, + "learning_rate": 0.00019791169719609344, + "loss": 0.333, + "step": 5402 + }, + { + "epoch": 0.43770252754374595, + "grad_norm": 0.033492933958768845, + "learning_rate": 0.0001979071965434988, + "loss": 0.3664, + "step": 5403 + }, + { + "epoch": 0.43778353856124436, + "grad_norm": 0.03617965802550316, + "learning_rate": 0.0001979026958909042, + "loss": 0.3862, + "step": 5404 + }, + { + "epoch": 0.4378645495787427, + "grad_norm": 0.03024153970181942, + "learning_rate": 0.00019789819523830958, + "loss": 0.3379, + "step": 5405 + }, + { + "epoch": 0.43794556059624107, + "grad_norm": 0.0315060131251812, + "learning_rate": 0.00019789369458571494, + "loss": 0.3679, + "step": 5406 + }, + { + "epoch": 0.4380265716137395, + "grad_norm": 0.03162820264697075, + "learning_rate": 0.0001978891939331203, + "loss": 0.3864, + "step": 5407 + }, + { + "epoch": 0.43810758263123784, + "grad_norm": 0.029816798865795135, + "learning_rate": 0.00019788469328052568, + "loss": 0.3392, + "step": 5408 + }, + { + "epoch": 0.43818859364873625, + "grad_norm": 0.03811095282435417, + "learning_rate": 0.00019788019262793104, + "loss": 0.3207, + "step": 5409 + }, + { + "epoch": 0.4382696046662346, + "grad_norm": 0.03211852163076401, + "learning_rate": 0.00019787569197533643, + "loss": 0.3174, + "step": 5410 + }, + { + "epoch": 0.438350615683733, + "grad_norm": 0.043832749128341675, + "learning_rate": 
0.00019787119132274182, + "loss": 0.3684, + "step": 5411 + }, + { + "epoch": 0.43843162670123137, + "grad_norm": 0.032680824398994446, + "learning_rate": 0.00019786669067014718, + "loss": 0.3895, + "step": 5412 + }, + { + "epoch": 0.4385126377187297, + "grad_norm": 0.03802061453461647, + "learning_rate": 0.00019786219001755254, + "loss": 0.3778, + "step": 5413 + }, + { + "epoch": 0.43859364873622814, + "grad_norm": 0.03763697296380997, + "learning_rate": 0.00019785768936495793, + "loss": 0.3794, + "step": 5414 + }, + { + "epoch": 0.4386746597537265, + "grad_norm": 0.027806898579001427, + "learning_rate": 0.0001978531887123633, + "loss": 0.3265, + "step": 5415 + }, + { + "epoch": 0.4387556707712249, + "grad_norm": 0.03902803733944893, + "learning_rate": 0.00019784868805976867, + "loss": 0.3538, + "step": 5416 + }, + { + "epoch": 0.43883668178872326, + "grad_norm": 0.03295661136507988, + "learning_rate": 0.00019784418740717406, + "loss": 0.3642, + "step": 5417 + }, + { + "epoch": 0.43891769280622167, + "grad_norm": 0.03418942913413048, + "learning_rate": 0.00019783968675457942, + "loss": 0.3758, + "step": 5418 + }, + { + "epoch": 0.43899870382372, + "grad_norm": 0.028170818462967873, + "learning_rate": 0.00019783518610198478, + "loss": 0.3357, + "step": 5419 + }, + { + "epoch": 0.4390797148412184, + "grad_norm": 0.0327889621257782, + "learning_rate": 0.00019783068544939017, + "loss": 0.3371, + "step": 5420 + }, + { + "epoch": 0.4391607258587168, + "grad_norm": 0.03259824588894844, + "learning_rate": 0.00019782618479679555, + "loss": 0.3319, + "step": 5421 + }, + { + "epoch": 0.43924173687621515, + "grad_norm": 0.030759098008275032, + "learning_rate": 0.00019782168414420091, + "loss": 0.3241, + "step": 5422 + }, + { + "epoch": 0.43932274789371356, + "grad_norm": 0.029848387464880943, + "learning_rate": 0.0001978171834916063, + "loss": 0.3527, + "step": 5423 + }, + { + "epoch": 0.4394037589112119, + "grad_norm": 0.02862711437046528, + "learning_rate": 0.00019781268283901166, + "loss": 0.3428, + "step": 5424 + }, + { + "epoch": 0.43948476992871033, + "grad_norm": 0.030342362821102142, + "learning_rate": 0.00019780818218641702, + "loss": 0.3593, + "step": 5425 + }, + { + "epoch": 0.4395657809462087, + "grad_norm": 0.03511106222867966, + "learning_rate": 0.0001978036815338224, + "loss": 0.4057, + "step": 5426 + }, + { + "epoch": 0.43964679196370704, + "grad_norm": 0.03200792893767357, + "learning_rate": 0.0001977991808812278, + "loss": 0.3096, + "step": 5427 + }, + { + "epoch": 0.43972780298120545, + "grad_norm": 0.03348712623119354, + "learning_rate": 0.00019779468022863316, + "loss": 0.3458, + "step": 5428 + }, + { + "epoch": 0.4398088139987038, + "grad_norm": 0.02808743715286255, + "learning_rate": 0.00019779017957603854, + "loss": 0.3225, + "step": 5429 + }, + { + "epoch": 0.4398898250162022, + "grad_norm": 0.037103742361068726, + "learning_rate": 0.0001977856789234439, + "loss": 0.3741, + "step": 5430 + }, + { + "epoch": 0.4399708360337006, + "grad_norm": 0.032084204256534576, + "learning_rate": 0.00019778117827084926, + "loss": 0.3507, + "step": 5431 + }, + { + "epoch": 0.440051847051199, + "grad_norm": 0.037166427820920944, + "learning_rate": 0.00019777667761825465, + "loss": 0.4298, + "step": 5432 + }, + { + "epoch": 0.44013285806869734, + "grad_norm": 0.03572225570678711, + "learning_rate": 0.00019777217696566004, + "loss": 0.3389, + "step": 5433 + }, + { + "epoch": 0.4402138690861957, + "grad_norm": 0.030854439362883568, + "learning_rate": 0.0001977676763130654, + "loss": 0.3668, + 
"step": 5434 + }, + { + "epoch": 0.4402948801036941, + "grad_norm": 0.029217591509222984, + "learning_rate": 0.00019776317566047079, + "loss": 0.3482, + "step": 5435 + }, + { + "epoch": 0.44037589112119246, + "grad_norm": 0.035711340606212616, + "learning_rate": 0.00019775867500787615, + "loss": 0.3429, + "step": 5436 + }, + { + "epoch": 0.4404569021386909, + "grad_norm": 0.03293585032224655, + "learning_rate": 0.0001977541743552815, + "loss": 0.3604, + "step": 5437 + }, + { + "epoch": 0.44053791315618923, + "grad_norm": 0.034553904086351395, + "learning_rate": 0.0001977496737026869, + "loss": 0.3921, + "step": 5438 + }, + { + "epoch": 0.44061892417368764, + "grad_norm": 0.03484650328755379, + "learning_rate": 0.00019774517305009228, + "loss": 0.418, + "step": 5439 + }, + { + "epoch": 0.440699935191186, + "grad_norm": 0.03234095871448517, + "learning_rate": 0.00019774067239749764, + "loss": 0.3329, + "step": 5440 + }, + { + "epoch": 0.44078094620868435, + "grad_norm": 0.03310469910502434, + "learning_rate": 0.00019773617174490303, + "loss": 0.3264, + "step": 5441 + }, + { + "epoch": 0.44086195722618277, + "grad_norm": 0.03455287218093872, + "learning_rate": 0.0001977316710923084, + "loss": 0.363, + "step": 5442 + }, + { + "epoch": 0.4409429682436811, + "grad_norm": 0.03075864352285862, + "learning_rate": 0.00019772717043971375, + "loss": 0.3325, + "step": 5443 + }, + { + "epoch": 0.44102397926117953, + "grad_norm": 0.04186461120843887, + "learning_rate": 0.00019772266978711916, + "loss": 0.396, + "step": 5444 + }, + { + "epoch": 0.4411049902786779, + "grad_norm": 0.03264939412474632, + "learning_rate": 0.00019771816913452452, + "loss": 0.3436, + "step": 5445 + }, + { + "epoch": 0.4411860012961763, + "grad_norm": 0.02961140312254429, + "learning_rate": 0.00019771366848192988, + "loss": 0.337, + "step": 5446 + }, + { + "epoch": 0.44126701231367466, + "grad_norm": 0.030168673023581505, + "learning_rate": 0.00019770916782933527, + "loss": 0.3311, + "step": 5447 + }, + { + "epoch": 0.44134802333117307, + "grad_norm": 0.029339507222175598, + "learning_rate": 0.00019770466717674063, + "loss": 0.3735, + "step": 5448 + }, + { + "epoch": 0.4414290343486714, + "grad_norm": 0.03168919309973717, + "learning_rate": 0.000197700166524146, + "loss": 0.383, + "step": 5449 + }, + { + "epoch": 0.4415100453661698, + "grad_norm": 0.031165389344096184, + "learning_rate": 0.0001976956658715514, + "loss": 0.3793, + "step": 5450 + }, + { + "epoch": 0.4415910563836682, + "grad_norm": 0.032787322998046875, + "learning_rate": 0.00019769116521895676, + "loss": 0.4, + "step": 5451 + }, + { + "epoch": 0.44167206740116655, + "grad_norm": 0.029846899211406708, + "learning_rate": 0.00019768666456636212, + "loss": 0.3531, + "step": 5452 + }, + { + "epoch": 0.44175307841866496, + "grad_norm": 0.034375566989183426, + "learning_rate": 0.0001976821639137675, + "loss": 0.3749, + "step": 5453 + }, + { + "epoch": 0.4418340894361633, + "grad_norm": 0.03918051719665527, + "learning_rate": 0.00019767766326117287, + "loss": 0.4003, + "step": 5454 + }, + { + "epoch": 0.4419151004536617, + "grad_norm": 0.03416171669960022, + "learning_rate": 0.00019767316260857826, + "loss": 0.3508, + "step": 5455 + }, + { + "epoch": 0.4419961114711601, + "grad_norm": 0.029899321496486664, + "learning_rate": 0.00019766866195598364, + "loss": 0.3539, + "step": 5456 + }, + { + "epoch": 0.44207712248865844, + "grad_norm": 0.033006105571985245, + "learning_rate": 0.000197664161303389, + "loss": 0.3613, + "step": 5457 + }, + { + "epoch": 0.44215813350615685, 
+ "grad_norm": 0.03142609819769859, + "learning_rate": 0.00019765966065079436, + "loss": 0.3513, + "step": 5458 + }, + { + "epoch": 0.4422391445236552, + "grad_norm": 0.03870227932929993, + "learning_rate": 0.00019765515999819975, + "loss": 0.3846, + "step": 5459 + }, + { + "epoch": 0.4423201555411536, + "grad_norm": 0.03481871262192726, + "learning_rate": 0.0001976506593456051, + "loss": 0.3803, + "step": 5460 + }, + { + "epoch": 0.44240116655865197, + "grad_norm": 0.030058806762099266, + "learning_rate": 0.0001976461586930105, + "loss": 0.3983, + "step": 5461 + }, + { + "epoch": 0.4424821775761504, + "grad_norm": 0.032486896961927414, + "learning_rate": 0.00019764165804041589, + "loss": 0.3837, + "step": 5462 + }, + { + "epoch": 0.44256318859364874, + "grad_norm": 0.03169577196240425, + "learning_rate": 0.00019763715738782125, + "loss": 0.3636, + "step": 5463 + }, + { + "epoch": 0.4426441996111471, + "grad_norm": 0.029622867703437805, + "learning_rate": 0.0001976326567352266, + "loss": 0.3682, + "step": 5464 + }, + { + "epoch": 0.4427252106286455, + "grad_norm": 0.030483927577733994, + "learning_rate": 0.000197628156082632, + "loss": 0.3773, + "step": 5465 + }, + { + "epoch": 0.44280622164614386, + "grad_norm": 0.034242622554302216, + "learning_rate": 0.00019762365543003735, + "loss": 0.3745, + "step": 5466 + }, + { + "epoch": 0.44288723266364227, + "grad_norm": 0.03172431141138077, + "learning_rate": 0.00019761915477744274, + "loss": 0.3659, + "step": 5467 + }, + { + "epoch": 0.4429682436811406, + "grad_norm": 0.03193086385726929, + "learning_rate": 0.00019761465412484813, + "loss": 0.3697, + "step": 5468 + }, + { + "epoch": 0.44304925469863904, + "grad_norm": 0.02869819663465023, + "learning_rate": 0.0001976101534722535, + "loss": 0.3666, + "step": 5469 + }, + { + "epoch": 0.4431302657161374, + "grad_norm": 0.03176020458340645, + "learning_rate": 0.00019760565281965885, + "loss": 0.3943, + "step": 5470 + }, + { + "epoch": 0.44321127673363575, + "grad_norm": 0.03435826301574707, + "learning_rate": 0.00019760115216706424, + "loss": 0.3416, + "step": 5471 + }, + { + "epoch": 0.44329228775113416, + "grad_norm": 0.033849213272333145, + "learning_rate": 0.0001975966515144696, + "loss": 0.3227, + "step": 5472 + }, + { + "epoch": 0.4433732987686325, + "grad_norm": 0.032498378306627274, + "learning_rate": 0.00019759215086187498, + "loss": 0.3608, + "step": 5473 + }, + { + "epoch": 0.44345430978613093, + "grad_norm": 0.029758907854557037, + "learning_rate": 0.00019758765020928037, + "loss": 0.3676, + "step": 5474 + }, + { + "epoch": 0.4435353208036293, + "grad_norm": 0.032321639358997345, + "learning_rate": 0.00019758314955668573, + "loss": 0.3969, + "step": 5475 + }, + { + "epoch": 0.4436163318211277, + "grad_norm": 0.03752026706933975, + "learning_rate": 0.0001975786489040911, + "loss": 0.4244, + "step": 5476 + }, + { + "epoch": 0.44369734283862605, + "grad_norm": 0.035951949656009674, + "learning_rate": 0.00019757414825149648, + "loss": 0.3564, + "step": 5477 + }, + { + "epoch": 0.4437783538561244, + "grad_norm": 0.03439652919769287, + "learning_rate": 0.00019756964759890184, + "loss": 0.3896, + "step": 5478 + }, + { + "epoch": 0.4438593648736228, + "grad_norm": 0.028514515608549118, + "learning_rate": 0.00019756514694630722, + "loss": 0.3491, + "step": 5479 + }, + { + "epoch": 0.4439403758911212, + "grad_norm": 0.03369169309735298, + "learning_rate": 0.0001975606462937126, + "loss": 0.3839, + "step": 5480 + }, + { + "epoch": 0.4440213869086196, + "grad_norm": 0.034968357533216476, + 
"learning_rate": 0.00019755614564111797, + "loss": 0.3321, + "step": 5481 + }, + { + "epoch": 0.44410239792611794, + "grad_norm": 0.041739922016859055, + "learning_rate": 0.00019755164498852333, + "loss": 0.3806, + "step": 5482 + }, + { + "epoch": 0.44418340894361635, + "grad_norm": 0.031739480793476105, + "learning_rate": 0.00019754714433592872, + "loss": 0.3084, + "step": 5483 + }, + { + "epoch": 0.4442644199611147, + "grad_norm": 0.035275254398584366, + "learning_rate": 0.00019754264368333408, + "loss": 0.3541, + "step": 5484 + }, + { + "epoch": 0.44434543097861307, + "grad_norm": 0.03895600512623787, + "learning_rate": 0.00019753814303073947, + "loss": 0.3913, + "step": 5485 + }, + { + "epoch": 0.4444264419961115, + "grad_norm": 0.03006908856332302, + "learning_rate": 0.00019753364237814485, + "loss": 0.3521, + "step": 5486 + }, + { + "epoch": 0.44450745301360983, + "grad_norm": 0.030135368928313255, + "learning_rate": 0.0001975291417255502, + "loss": 0.3438, + "step": 5487 + }, + { + "epoch": 0.44458846403110824, + "grad_norm": 0.02975146844983101, + "learning_rate": 0.00019752464107295557, + "loss": 0.3699, + "step": 5488 + }, + { + "epoch": 0.4446694750486066, + "grad_norm": 0.033303115516901016, + "learning_rate": 0.00019752014042036096, + "loss": 0.3425, + "step": 5489 + }, + { + "epoch": 0.444750486066105, + "grad_norm": 0.03164275363087654, + "learning_rate": 0.00019751563976776632, + "loss": 0.382, + "step": 5490 + }, + { + "epoch": 0.44483149708360337, + "grad_norm": 0.032723914831876755, + "learning_rate": 0.0001975111391151717, + "loss": 0.374, + "step": 5491 + }, + { + "epoch": 0.4449125081011017, + "grad_norm": 0.02998311258852482, + "learning_rate": 0.0001975066384625771, + "loss": 0.3281, + "step": 5492 + }, + { + "epoch": 0.44499351911860013, + "grad_norm": 0.0313972532749176, + "learning_rate": 0.00019750213780998245, + "loss": 0.3308, + "step": 5493 + }, + { + "epoch": 0.4450745301360985, + "grad_norm": 0.03089817613363266, + "learning_rate": 0.00019749763715738781, + "loss": 0.3538, + "step": 5494 + }, + { + "epoch": 0.4451555411535969, + "grad_norm": 0.03646509349346161, + "learning_rate": 0.0001974931365047932, + "loss": 0.3647, + "step": 5495 + }, + { + "epoch": 0.44523655217109526, + "grad_norm": 0.03091086447238922, + "learning_rate": 0.0001974886358521986, + "loss": 0.337, + "step": 5496 + }, + { + "epoch": 0.44531756318859367, + "grad_norm": 0.03153252229094505, + "learning_rate": 0.00019748413519960395, + "loss": 0.3539, + "step": 5497 + }, + { + "epoch": 0.445398574206092, + "grad_norm": 0.030094187706708908, + "learning_rate": 0.00019747963454700934, + "loss": 0.3462, + "step": 5498 + }, + { + "epoch": 0.44547958522359044, + "grad_norm": 0.03305754065513611, + "learning_rate": 0.0001974751338944147, + "loss": 0.3711, + "step": 5499 + }, + { + "epoch": 0.4455605962410888, + "grad_norm": 0.02947155386209488, + "learning_rate": 0.00019747063324182006, + "loss": 0.3559, + "step": 5500 + }, + { + "epoch": 0.44564160725858715, + "grad_norm": 0.037813130766153336, + "learning_rate": 0.00019746613258922544, + "loss": 0.4117, + "step": 5501 + }, + { + "epoch": 0.44572261827608556, + "grad_norm": 0.0321708545088768, + "learning_rate": 0.00019746163193663083, + "loss": 0.3622, + "step": 5502 + }, + { + "epoch": 0.4458036292935839, + "grad_norm": 0.03237539157271385, + "learning_rate": 0.0001974571312840362, + "loss": 0.3718, + "step": 5503 + }, + { + "epoch": 0.4458846403110823, + "grad_norm": 0.036956895142793655, + "learning_rate": 0.00019745263063144158, + "loss": 
0.3655, + "step": 5504 + }, + { + "epoch": 0.4459656513285807, + "grad_norm": 0.03339963033795357, + "learning_rate": 0.00019744812997884694, + "loss": 0.3751, + "step": 5505 + }, + { + "epoch": 0.4460466623460791, + "grad_norm": 0.030577365309000015, + "learning_rate": 0.0001974436293262523, + "loss": 0.3382, + "step": 5506 + }, + { + "epoch": 0.44612767336357745, + "grad_norm": 0.030274648219347, + "learning_rate": 0.00019743912867365768, + "loss": 0.3601, + "step": 5507 + }, + { + "epoch": 0.4462086843810758, + "grad_norm": 0.027877582237124443, + "learning_rate": 0.00019743462802106307, + "loss": 0.3226, + "step": 5508 + }, + { + "epoch": 0.4462896953985742, + "grad_norm": 0.029135365039110184, + "learning_rate": 0.00019743012736846843, + "loss": 0.3288, + "step": 5509 + }, + { + "epoch": 0.44637070641607257, + "grad_norm": 0.030737759545445442, + "learning_rate": 0.00019742562671587382, + "loss": 0.3263, + "step": 5510 + }, + { + "epoch": 0.446451717433571, + "grad_norm": 0.03417934849858284, + "learning_rate": 0.00019742112606327918, + "loss": 0.3272, + "step": 5511 + }, + { + "epoch": 0.44653272845106934, + "grad_norm": 0.03573647886514664, + "learning_rate": 0.00019741662541068454, + "loss": 0.3576, + "step": 5512 + }, + { + "epoch": 0.44661373946856775, + "grad_norm": 0.02828175760805607, + "learning_rate": 0.00019741212475808993, + "loss": 0.367, + "step": 5513 + }, + { + "epoch": 0.4466947504860661, + "grad_norm": 0.03438275679945946, + "learning_rate": 0.0001974076241054953, + "loss": 0.3431, + "step": 5514 + }, + { + "epoch": 0.44677576150356446, + "grad_norm": 0.029411716386675835, + "learning_rate": 0.00019740312345290067, + "loss": 0.3404, + "step": 5515 + }, + { + "epoch": 0.4468567725210629, + "grad_norm": 0.030819594860076904, + "learning_rate": 0.00019739862280030606, + "loss": 0.3567, + "step": 5516 + }, + { + "epoch": 0.44693778353856123, + "grad_norm": 0.036897122859954834, + "learning_rate": 0.00019739412214771142, + "loss": 0.4223, + "step": 5517 + }, + { + "epoch": 0.44701879455605964, + "grad_norm": 0.0320262610912323, + "learning_rate": 0.00019738962149511678, + "loss": 0.3399, + "step": 5518 + }, + { + "epoch": 0.447099805573558, + "grad_norm": 0.03475802764296532, + "learning_rate": 0.00019738512084252217, + "loss": 0.3839, + "step": 5519 + }, + { + "epoch": 0.4471808165910564, + "grad_norm": 0.031090576201677322, + "learning_rate": 0.00019738062018992756, + "loss": 0.3841, + "step": 5520 + }, + { + "epoch": 0.44726182760855476, + "grad_norm": 0.03029099851846695, + "learning_rate": 0.00019737611953733292, + "loss": 0.33, + "step": 5521 + }, + { + "epoch": 0.4473428386260531, + "grad_norm": 0.03051835112273693, + "learning_rate": 0.0001973716188847383, + "loss": 0.368, + "step": 5522 + }, + { + "epoch": 0.44742384964355153, + "grad_norm": 0.02957882173359394, + "learning_rate": 0.00019736711823214366, + "loss": 0.3619, + "step": 5523 + }, + { + "epoch": 0.4475048606610499, + "grad_norm": 0.027077723294496536, + "learning_rate": 0.00019736261757954905, + "loss": 0.3441, + "step": 5524 + }, + { + "epoch": 0.4475858716785483, + "grad_norm": 0.029595671221613884, + "learning_rate": 0.00019735811692695444, + "loss": 0.3311, + "step": 5525 + }, + { + "epoch": 0.44766688269604665, + "grad_norm": 0.02908790111541748, + "learning_rate": 0.0001973536162743598, + "loss": 0.3529, + "step": 5526 + }, + { + "epoch": 0.44774789371354506, + "grad_norm": 0.030402570962905884, + "learning_rate": 0.00019734911562176516, + "loss": 0.3176, + "step": 5527 + }, + { + "epoch": 
0.4478289047310434, + "grad_norm": 0.035793375223875046, + "learning_rate": 0.00019734461496917054, + "loss": 0.4044, + "step": 5528 + }, + { + "epoch": 0.4479099157485418, + "grad_norm": 0.03458891436457634, + "learning_rate": 0.0001973401143165759, + "loss": 0.3557, + "step": 5529 + }, + { + "epoch": 0.4479909267660402, + "grad_norm": 0.031998954713344574, + "learning_rate": 0.0001973356136639813, + "loss": 0.3524, + "step": 5530 + }, + { + "epoch": 0.44807193778353854, + "grad_norm": 0.03016434609889984, + "learning_rate": 0.00019733111301138668, + "loss": 0.3728, + "step": 5531 + }, + { + "epoch": 0.44815294880103695, + "grad_norm": 0.042123984545469284, + "learning_rate": 0.00019732661235879204, + "loss": 0.3771, + "step": 5532 + }, + { + "epoch": 0.4482339598185353, + "grad_norm": 0.028418881818652153, + "learning_rate": 0.0001973221117061974, + "loss": 0.3518, + "step": 5533 + }, + { + "epoch": 0.4483149708360337, + "grad_norm": 0.03634253889322281, + "learning_rate": 0.00019731761105360279, + "loss": 0.3659, + "step": 5534 + }, + { + "epoch": 0.4483959818535321, + "grad_norm": 0.033422552049160004, + "learning_rate": 0.00019731311040100815, + "loss": 0.3763, + "step": 5535 + }, + { + "epoch": 0.44847699287103043, + "grad_norm": 0.03268659487366676, + "learning_rate": 0.00019730860974841353, + "loss": 0.3353, + "step": 5536 + }, + { + "epoch": 0.44855800388852884, + "grad_norm": 0.038906682282686234, + "learning_rate": 0.00019730410909581892, + "loss": 0.4272, + "step": 5537 + }, + { + "epoch": 0.4486390149060272, + "grad_norm": 0.03435641527175903, + "learning_rate": 0.00019729960844322428, + "loss": 0.4166, + "step": 5538 + }, + { + "epoch": 0.4487200259235256, + "grad_norm": 0.03522764891386032, + "learning_rate": 0.00019729510779062964, + "loss": 0.3679, + "step": 5539 + }, + { + "epoch": 0.44880103694102397, + "grad_norm": 0.03225326165556908, + "learning_rate": 0.00019729060713803503, + "loss": 0.3456, + "step": 5540 + }, + { + "epoch": 0.4488820479585224, + "grad_norm": 0.029571063816547394, + "learning_rate": 0.0001972861064854404, + "loss": 0.3183, + "step": 5541 + }, + { + "epoch": 0.44896305897602073, + "grad_norm": 0.0386439673602581, + "learning_rate": 0.00019728160583284577, + "loss": 0.438, + "step": 5542 + }, + { + "epoch": 0.44904406999351915, + "grad_norm": 0.035687077790498734, + "learning_rate": 0.00019727710518025116, + "loss": 0.3606, + "step": 5543 + }, + { + "epoch": 0.4491250810110175, + "grad_norm": 0.03311741352081299, + "learning_rate": 0.00019727260452765652, + "loss": 0.3893, + "step": 5544 + }, + { + "epoch": 0.44920609202851586, + "grad_norm": 0.0325443334877491, + "learning_rate": 0.00019726810387506188, + "loss": 0.3824, + "step": 5545 + }, + { + "epoch": 0.44928710304601427, + "grad_norm": 0.03307313844561577, + "learning_rate": 0.00019726360322246727, + "loss": 0.3303, + "step": 5546 + }, + { + "epoch": 0.4493681140635126, + "grad_norm": 0.03364656865596771, + "learning_rate": 0.00019725910256987263, + "loss": 0.3653, + "step": 5547 + }, + { + "epoch": 0.44944912508101104, + "grad_norm": 0.03391062840819359, + "learning_rate": 0.00019725460191727802, + "loss": 0.346, + "step": 5548 + }, + { + "epoch": 0.4495301360985094, + "grad_norm": 0.03271664306521416, + "learning_rate": 0.0001972501012646834, + "loss": 0.3862, + "step": 5549 + }, + { + "epoch": 0.4496111471160078, + "grad_norm": 0.031478166580200195, + "learning_rate": 0.00019724560061208876, + "loss": 0.3557, + "step": 5550 + }, + { + "epoch": 0.44969215813350616, + "grad_norm": 
0.03432717174291611, + "learning_rate": 0.00019724109995949412, + "loss": 0.4148, + "step": 5551 + }, + { + "epoch": 0.4497731691510045, + "grad_norm": 0.031578414142131805, + "learning_rate": 0.0001972365993068995, + "loss": 0.3852, + "step": 5552 + }, + { + "epoch": 0.4498541801685029, + "grad_norm": 0.03257657587528229, + "learning_rate": 0.00019723209865430487, + "loss": 0.3939, + "step": 5553 + }, + { + "epoch": 0.4499351911860013, + "grad_norm": 0.029145896434783936, + "learning_rate": 0.00019722759800171026, + "loss": 0.3018, + "step": 5554 + }, + { + "epoch": 0.4500162022034997, + "grad_norm": 0.030488284304738045, + "learning_rate": 0.00019722309734911564, + "loss": 0.3805, + "step": 5555 + }, + { + "epoch": 0.45009721322099805, + "grad_norm": 0.03530951589345932, + "learning_rate": 0.000197218596696521, + "loss": 0.3785, + "step": 5556 + }, + { + "epoch": 0.45017822423849646, + "grad_norm": 0.029178744181990623, + "learning_rate": 0.00019721409604392636, + "loss": 0.3517, + "step": 5557 + }, + { + "epoch": 0.4502592352559948, + "grad_norm": 0.037996888160705566, + "learning_rate": 0.00019720959539133175, + "loss": 0.4228, + "step": 5558 + }, + { + "epoch": 0.45034024627349317, + "grad_norm": 0.03181913122534752, + "learning_rate": 0.0001972050947387371, + "loss": 0.4349, + "step": 5559 + }, + { + "epoch": 0.4504212572909916, + "grad_norm": 0.03445330634713173, + "learning_rate": 0.0001972005940861425, + "loss": 0.4087, + "step": 5560 + }, + { + "epoch": 0.45050226830848994, + "grad_norm": 0.0322774238884449, + "learning_rate": 0.00019719609343354789, + "loss": 0.3158, + "step": 5561 + }, + { + "epoch": 0.45058327932598835, + "grad_norm": 0.03479272872209549, + "learning_rate": 0.00019719159278095325, + "loss": 0.3869, + "step": 5562 + }, + { + "epoch": 0.4506642903434867, + "grad_norm": 0.03306402638554573, + "learning_rate": 0.0001971870921283586, + "loss": 0.3676, + "step": 5563 + }, + { + "epoch": 0.4507453013609851, + "grad_norm": 0.031178992241621017, + "learning_rate": 0.000197182591475764, + "loss": 0.3454, + "step": 5564 + }, + { + "epoch": 0.4508263123784835, + "grad_norm": 0.03359023854136467, + "learning_rate": 0.00019717809082316935, + "loss": 0.3874, + "step": 5565 + }, + { + "epoch": 0.45090732339598183, + "grad_norm": 0.0333256796002388, + "learning_rate": 0.00019717359017057474, + "loss": 0.3842, + "step": 5566 + }, + { + "epoch": 0.45098833441348024, + "grad_norm": 0.029935095459222794, + "learning_rate": 0.00019716908951798013, + "loss": 0.3551, + "step": 5567 + }, + { + "epoch": 0.4510693454309786, + "grad_norm": 0.029074864462018013, + "learning_rate": 0.0001971645888653855, + "loss": 0.3303, + "step": 5568 + }, + { + "epoch": 0.451150356448477, + "grad_norm": 0.03021335043013096, + "learning_rate": 0.00019716008821279085, + "loss": 0.3553, + "step": 5569 + }, + { + "epoch": 0.45123136746597536, + "grad_norm": 0.036455187946558, + "learning_rate": 0.00019715558756019624, + "loss": 0.3459, + "step": 5570 + }, + { + "epoch": 0.4513123784834738, + "grad_norm": 0.0361262671649456, + "learning_rate": 0.0001971510869076016, + "loss": 0.3783, + "step": 5571 + }, + { + "epoch": 0.45139338950097213, + "grad_norm": 0.0377480685710907, + "learning_rate": 0.00019714658625500698, + "loss": 0.3911, + "step": 5572 + }, + { + "epoch": 0.4514744005184705, + "grad_norm": 0.03418326750397682, + "learning_rate": 0.00019714208560241237, + "loss": 0.3826, + "step": 5573 + }, + { + "epoch": 0.4515554115359689, + "grad_norm": 0.035968709737062454, + "learning_rate": 
0.00019713758494981773, + "loss": 0.3411, + "step": 5574 + }, + { + "epoch": 0.45163642255346725, + "grad_norm": 0.03250642493367195, + "learning_rate": 0.0001971330842972231, + "loss": 0.4007, + "step": 5575 + }, + { + "epoch": 0.45171743357096567, + "grad_norm": 0.03724129498004913, + "learning_rate": 0.00019712858364462848, + "loss": 0.3675, + "step": 5576 + }, + { + "epoch": 0.451798444588464, + "grad_norm": 0.034993384033441544, + "learning_rate": 0.00019712408299203386, + "loss": 0.3965, + "step": 5577 + }, + { + "epoch": 0.45187945560596243, + "grad_norm": 0.03404005616903305, + "learning_rate": 0.00019711958233943922, + "loss": 0.3469, + "step": 5578 + }, + { + "epoch": 0.4519604666234608, + "grad_norm": 0.033056486397981644, + "learning_rate": 0.0001971150816868446, + "loss": 0.3306, + "step": 5579 + }, + { + "epoch": 0.45204147764095914, + "grad_norm": 0.04045993462204933, + "learning_rate": 0.00019711058103424997, + "loss": 0.3976, + "step": 5580 + }, + { + "epoch": 0.45212248865845756, + "grad_norm": 0.03009396232664585, + "learning_rate": 0.00019710608038165533, + "loss": 0.3595, + "step": 5581 + }, + { + "epoch": 0.4522034996759559, + "grad_norm": 0.03585132956504822, + "learning_rate": 0.00019710157972906072, + "loss": 0.3745, + "step": 5582 + }, + { + "epoch": 0.4522845106934543, + "grad_norm": 0.03336992487311363, + "learning_rate": 0.0001970970790764661, + "loss": 0.371, + "step": 5583 + }, + { + "epoch": 0.4523655217109527, + "grad_norm": 0.03080323152244091, + "learning_rate": 0.00019709257842387147, + "loss": 0.347, + "step": 5584 + }, + { + "epoch": 0.4524465327284511, + "grad_norm": 0.031140008941292763, + "learning_rate": 0.00019708807777127685, + "loss": 0.3609, + "step": 5585 + }, + { + "epoch": 0.45252754374594945, + "grad_norm": 0.029711594805121422, + "learning_rate": 0.0001970835771186822, + "loss": 0.3359, + "step": 5586 + }, + { + "epoch": 0.4526085547634478, + "grad_norm": 0.03211309760808945, + "learning_rate": 0.0001970790764660876, + "loss": 0.3801, + "step": 5587 + }, + { + "epoch": 0.4526895657809462, + "grad_norm": 0.028747573494911194, + "learning_rate": 0.00019707457581349296, + "loss": 0.3278, + "step": 5588 + }, + { + "epoch": 0.45277057679844457, + "grad_norm": 0.02893175184726715, + "learning_rate": 0.00019707007516089835, + "loss": 0.3294, + "step": 5589 + }, + { + "epoch": 0.452851587815943, + "grad_norm": 0.030056646093726158, + "learning_rate": 0.0001970655745083037, + "loss": 0.3809, + "step": 5590 + }, + { + "epoch": 0.45293259883344134, + "grad_norm": 0.03192294016480446, + "learning_rate": 0.0001970610738557091, + "loss": 0.3171, + "step": 5591 + }, + { + "epoch": 0.45301360985093975, + "grad_norm": 0.03785628080368042, + "learning_rate": 0.00019705657320311445, + "loss": 0.3788, + "step": 5592 + }, + { + "epoch": 0.4530946208684381, + "grad_norm": 0.032300058752298355, + "learning_rate": 0.00019705207255051984, + "loss": 0.373, + "step": 5593 + }, + { + "epoch": 0.4531756318859365, + "grad_norm": 0.03178001567721367, + "learning_rate": 0.0001970475718979252, + "loss": 0.343, + "step": 5594 + }, + { + "epoch": 0.45325664290343487, + "grad_norm": 0.028133099898695946, + "learning_rate": 0.0001970430712453306, + "loss": 0.3536, + "step": 5595 + }, + { + "epoch": 0.4533376539209332, + "grad_norm": 0.03809162229299545, + "learning_rate": 0.00019703857059273595, + "loss": 0.352, + "step": 5596 + }, + { + "epoch": 0.45341866493843164, + "grad_norm": 0.028956517577171326, + "learning_rate": 0.00019703406994014134, + "loss": 0.3468, + "step": 
5597 + }, + { + "epoch": 0.45349967595593, + "grad_norm": 0.033848222345113754, + "learning_rate": 0.0001970295692875467, + "loss": 0.3263, + "step": 5598 + }, + { + "epoch": 0.4535806869734284, + "grad_norm": 0.029715241864323616, + "learning_rate": 0.00019702506863495208, + "loss": 0.2873, + "step": 5599 + }, + { + "epoch": 0.45366169799092676, + "grad_norm": 0.037406545132398605, + "learning_rate": 0.00019702056798235744, + "loss": 0.3849, + "step": 5600 + }, + { + "epoch": 0.45374270900842517, + "grad_norm": 0.030025694519281387, + "learning_rate": 0.00019701606732976283, + "loss": 0.313, + "step": 5601 + }, + { + "epoch": 0.4538237200259235, + "grad_norm": 0.03170083835721016, + "learning_rate": 0.0001970115666771682, + "loss": 0.4076, + "step": 5602 + }, + { + "epoch": 0.4539047310434219, + "grad_norm": 0.03326018899679184, + "learning_rate": 0.00019700706602457358, + "loss": 0.3813, + "step": 5603 + }, + { + "epoch": 0.4539857420609203, + "grad_norm": 0.029004201292991638, + "learning_rate": 0.00019700256537197894, + "loss": 0.3126, + "step": 5604 + }, + { + "epoch": 0.45406675307841865, + "grad_norm": 0.030437350273132324, + "learning_rate": 0.00019699806471938432, + "loss": 0.3349, + "step": 5605 + }, + { + "epoch": 0.45414776409591706, + "grad_norm": 0.031204288825392723, + "learning_rate": 0.0001969935640667897, + "loss": 0.3757, + "step": 5606 + }, + { + "epoch": 0.4542287751134154, + "grad_norm": 0.03240903466939926, + "learning_rate": 0.00019698906341419507, + "loss": 0.35, + "step": 5607 + }, + { + "epoch": 0.45430978613091383, + "grad_norm": 0.03153548762202263, + "learning_rate": 0.00019698456276160043, + "loss": 0.3715, + "step": 5608 + }, + { + "epoch": 0.4543907971484122, + "grad_norm": 0.028369799256324768, + "learning_rate": 0.00019698006210900582, + "loss": 0.3137, + "step": 5609 + }, + { + "epoch": 0.45447180816591054, + "grad_norm": 0.03753526136279106, + "learning_rate": 0.00019697556145641118, + "loss": 0.4511, + "step": 5610 + }, + { + "epoch": 0.45455281918340895, + "grad_norm": 0.03225935623049736, + "learning_rate": 0.00019697106080381657, + "loss": 0.3959, + "step": 5611 + }, + { + "epoch": 0.4546338302009073, + "grad_norm": 0.03187123313546181, + "learning_rate": 0.00019696656015122195, + "loss": 0.3559, + "step": 5612 + }, + { + "epoch": 0.4547148412184057, + "grad_norm": 0.031548816710710526, + "learning_rate": 0.00019696205949862731, + "loss": 0.3414, + "step": 5613 + }, + { + "epoch": 0.4547958522359041, + "grad_norm": 0.027448706328868866, + "learning_rate": 0.00019695755884603267, + "loss": 0.3235, + "step": 5614 + }, + { + "epoch": 0.4548768632534025, + "grad_norm": 0.03475405275821686, + "learning_rate": 0.00019695305819343806, + "loss": 0.3632, + "step": 5615 + }, + { + "epoch": 0.45495787427090084, + "grad_norm": 0.03420334309339523, + "learning_rate": 0.00019694855754084342, + "loss": 0.3965, + "step": 5616 + }, + { + "epoch": 0.4550388852883992, + "grad_norm": 0.03267541527748108, + "learning_rate": 0.0001969440568882488, + "loss": 0.3534, + "step": 5617 + }, + { + "epoch": 0.4551198963058976, + "grad_norm": 0.03327197954058647, + "learning_rate": 0.0001969395562356542, + "loss": 0.3453, + "step": 5618 + }, + { + "epoch": 0.45520090732339596, + "grad_norm": 0.03551128879189491, + "learning_rate": 0.00019693505558305956, + "loss": 0.3759, + "step": 5619 + }, + { + "epoch": 0.4552819183408944, + "grad_norm": 0.03346748650074005, + "learning_rate": 0.00019693055493046492, + "loss": 0.3887, + "step": 5620 + }, + { + "epoch": 0.45536292935839273, + 
"grad_norm": 0.03371153026819229, + "learning_rate": 0.0001969260542778703, + "loss": 0.3671, + "step": 5621 + }, + { + "epoch": 0.45544394037589114, + "grad_norm": 0.030512280762195587, + "learning_rate": 0.00019692155362527566, + "loss": 0.3943, + "step": 5622 + }, + { + "epoch": 0.4555249513933895, + "grad_norm": 0.033492594957351685, + "learning_rate": 0.00019691705297268105, + "loss": 0.3823, + "step": 5623 + }, + { + "epoch": 0.45560596241088785, + "grad_norm": 0.033078644424676895, + "learning_rate": 0.00019691255232008644, + "loss": 0.3974, + "step": 5624 + }, + { + "epoch": 0.45568697342838627, + "grad_norm": 0.036155492067337036, + "learning_rate": 0.0001969080516674918, + "loss": 0.3953, + "step": 5625 + }, + { + "epoch": 0.4557679844458846, + "grad_norm": 0.03399474173784256, + "learning_rate": 0.00019690355101489716, + "loss": 0.4092, + "step": 5626 + }, + { + "epoch": 0.45584899546338303, + "grad_norm": 0.03158540651202202, + "learning_rate": 0.00019689905036230254, + "loss": 0.385, + "step": 5627 + }, + { + "epoch": 0.4559300064808814, + "grad_norm": 0.03434424847364426, + "learning_rate": 0.0001968945497097079, + "loss": 0.3455, + "step": 5628 + }, + { + "epoch": 0.4560110174983798, + "grad_norm": 0.033775173127651215, + "learning_rate": 0.0001968900490571133, + "loss": 0.3619, + "step": 5629 + }, + { + "epoch": 0.45609202851587816, + "grad_norm": 0.0323668047785759, + "learning_rate": 0.00019688554840451868, + "loss": 0.3301, + "step": 5630 + }, + { + "epoch": 0.4561730395333765, + "grad_norm": 0.031544625759124756, + "learning_rate": 0.00019688104775192404, + "loss": 0.3736, + "step": 5631 + }, + { + "epoch": 0.4562540505508749, + "grad_norm": 0.03242504969239235, + "learning_rate": 0.0001968765470993294, + "loss": 0.3973, + "step": 5632 + }, + { + "epoch": 0.4563350615683733, + "grad_norm": 0.03165287524461746, + "learning_rate": 0.00019687204644673479, + "loss": 0.3867, + "step": 5633 + }, + { + "epoch": 0.4564160725858717, + "grad_norm": 0.030601616948843002, + "learning_rate": 0.00019686754579414015, + "loss": 0.3521, + "step": 5634 + }, + { + "epoch": 0.45649708360337005, + "grad_norm": 0.029937176033854485, + "learning_rate": 0.00019686304514154553, + "loss": 0.4054, + "step": 5635 + }, + { + "epoch": 0.45657809462086846, + "grad_norm": 0.030966006219387054, + "learning_rate": 0.00019685854448895092, + "loss": 0.332, + "step": 5636 + }, + { + "epoch": 0.4566591056383668, + "grad_norm": 0.03530475124716759, + "learning_rate": 0.00019685404383635628, + "loss": 0.3658, + "step": 5637 + }, + { + "epoch": 0.4567401166558652, + "grad_norm": 0.02700497955083847, + "learning_rate": 0.00019684954318376164, + "loss": 0.3259, + "step": 5638 + }, + { + "epoch": 0.4568211276733636, + "grad_norm": 0.03353603556752205, + "learning_rate": 0.00019684504253116703, + "loss": 0.3527, + "step": 5639 + }, + { + "epoch": 0.45690213869086194, + "grad_norm": 0.029281500726938248, + "learning_rate": 0.0001968405418785724, + "loss": 0.353, + "step": 5640 + }, + { + "epoch": 0.45698314970836035, + "grad_norm": 0.032181404531002045, + "learning_rate": 0.00019683604122597777, + "loss": 0.379, + "step": 5641 + }, + { + "epoch": 0.4570641607258587, + "grad_norm": 0.03690697252750397, + "learning_rate": 0.00019683154057338316, + "loss": 0.4451, + "step": 5642 + }, + { + "epoch": 0.4571451717433571, + "grad_norm": 0.029781976714730263, + "learning_rate": 0.00019682703992078852, + "loss": 0.3974, + "step": 5643 + }, + { + "epoch": 0.45722618276085547, + "grad_norm": 0.03640514239668846, + 
"learning_rate": 0.00019682253926819388, + "loss": 0.3696, + "step": 5644 + }, + { + "epoch": 0.4573071937783539, + "grad_norm": 0.03272904083132744, + "learning_rate": 0.00019681803861559927, + "loss": 0.3926, + "step": 5645 + }, + { + "epoch": 0.45738820479585224, + "grad_norm": 0.026668280363082886, + "learning_rate": 0.00019681353796300463, + "loss": 0.3005, + "step": 5646 + }, + { + "epoch": 0.4574692158133506, + "grad_norm": 0.028446605429053307, + "learning_rate": 0.00019680903731041002, + "loss": 0.3242, + "step": 5647 + }, + { + "epoch": 0.457550226830849, + "grad_norm": 0.028360169380903244, + "learning_rate": 0.0001968045366578154, + "loss": 0.3312, + "step": 5648 + }, + { + "epoch": 0.45763123784834736, + "grad_norm": 0.02998286299407482, + "learning_rate": 0.00019680003600522076, + "loss": 0.3738, + "step": 5649 + }, + { + "epoch": 0.45771224886584577, + "grad_norm": 0.029424121603369713, + "learning_rate": 0.00019679553535262612, + "loss": 0.3192, + "step": 5650 + }, + { + "epoch": 0.45779325988334413, + "grad_norm": 0.03551146760582924, + "learning_rate": 0.0001967910347000315, + "loss": 0.3893, + "step": 5651 + }, + { + "epoch": 0.45787427090084254, + "grad_norm": 0.027763670310378075, + "learning_rate": 0.00019678653404743687, + "loss": 0.3126, + "step": 5652 + }, + { + "epoch": 0.4579552819183409, + "grad_norm": 0.030606647953391075, + "learning_rate": 0.00019678203339484226, + "loss": 0.3541, + "step": 5653 + }, + { + "epoch": 0.45803629293583925, + "grad_norm": 0.029099253937602043, + "learning_rate": 0.00019677753274224765, + "loss": 0.3799, + "step": 5654 + }, + { + "epoch": 0.45811730395333766, + "grad_norm": 0.030575638636946678, + "learning_rate": 0.000196773032089653, + "loss": 0.3879, + "step": 5655 + }, + { + "epoch": 0.458198314970836, + "grad_norm": 0.027737515047192574, + "learning_rate": 0.0001967685314370584, + "loss": 0.3454, + "step": 5656 + }, + { + "epoch": 0.45827932598833443, + "grad_norm": 0.03216322883963585, + "learning_rate": 0.00019676403078446375, + "loss": 0.4085, + "step": 5657 + }, + { + "epoch": 0.4583603370058328, + "grad_norm": 0.032551418989896774, + "learning_rate": 0.00019675953013186914, + "loss": 0.3722, + "step": 5658 + }, + { + "epoch": 0.4584413480233312, + "grad_norm": 0.03434035927057266, + "learning_rate": 0.0001967550294792745, + "loss": 0.3455, + "step": 5659 + }, + { + "epoch": 0.45852235904082955, + "grad_norm": 0.03802090883255005, + "learning_rate": 0.0001967505288266799, + "loss": 0.4339, + "step": 5660 + }, + { + "epoch": 0.4586033700583279, + "grad_norm": 0.03537151589989662, + "learning_rate": 0.00019674602817408525, + "loss": 0.3903, + "step": 5661 + }, + { + "epoch": 0.4586843810758263, + "grad_norm": 0.03610273450613022, + "learning_rate": 0.00019674152752149063, + "loss": 0.3985, + "step": 5662 + }, + { + "epoch": 0.4587653920933247, + "grad_norm": 0.0331452339887619, + "learning_rate": 0.000196737026868896, + "loss": 0.2959, + "step": 5663 + }, + { + "epoch": 0.4588464031108231, + "grad_norm": 0.037477314472198486, + "learning_rate": 0.00019673252621630138, + "loss": 0.3811, + "step": 5664 + }, + { + "epoch": 0.45892741412832144, + "grad_norm": 0.037166204303503036, + "learning_rate": 0.00019672802556370674, + "loss": 0.3617, + "step": 5665 + }, + { + "epoch": 0.45900842514581985, + "grad_norm": 0.03388550877571106, + "learning_rate": 0.00019672352491111213, + "loss": 0.3591, + "step": 5666 + }, + { + "epoch": 0.4590894361633182, + "grad_norm": 0.029943333938717842, + "learning_rate": 0.0001967190242585175, + 
"loss": 0.3568, + "step": 5667 + }, + { + "epoch": 0.45917044718081657, + "grad_norm": 0.031574398279190063, + "learning_rate": 0.00019671452360592288, + "loss": 0.3161, + "step": 5668 + }, + { + "epoch": 0.459251458198315, + "grad_norm": 0.032210998237133026, + "learning_rate": 0.00019671002295332824, + "loss": 0.4043, + "step": 5669 + }, + { + "epoch": 0.45933246921581333, + "grad_norm": 0.0306962039321661, + "learning_rate": 0.00019670552230073362, + "loss": 0.3695, + "step": 5670 + }, + { + "epoch": 0.45941348023331174, + "grad_norm": 0.03281569853425026, + "learning_rate": 0.00019670102164813898, + "loss": 0.3459, + "step": 5671 + }, + { + "epoch": 0.4594944912508101, + "grad_norm": 0.03295455873012543, + "learning_rate": 0.00019669652099554437, + "loss": 0.3773, + "step": 5672 + }, + { + "epoch": 0.4595755022683085, + "grad_norm": 0.03259165585041046, + "learning_rate": 0.00019669202034294973, + "loss": 0.365, + "step": 5673 + }, + { + "epoch": 0.45965651328580687, + "grad_norm": 0.03317999839782715, + "learning_rate": 0.00019668751969035512, + "loss": 0.3842, + "step": 5674 + }, + { + "epoch": 0.4597375243033052, + "grad_norm": 0.028184987604618073, + "learning_rate": 0.00019668301903776048, + "loss": 0.3677, + "step": 5675 + }, + { + "epoch": 0.45981853532080363, + "grad_norm": 0.03515704721212387, + "learning_rate": 0.00019667851838516586, + "loss": 0.3957, + "step": 5676 + }, + { + "epoch": 0.459899546338302, + "grad_norm": 0.03550994396209717, + "learning_rate": 0.00019667401773257122, + "loss": 0.3295, + "step": 5677 + }, + { + "epoch": 0.4599805573558004, + "grad_norm": 0.034394558519124985, + "learning_rate": 0.0001966695170799766, + "loss": 0.363, + "step": 5678 + }, + { + "epoch": 0.46006156837329876, + "grad_norm": 0.02899022586643696, + "learning_rate": 0.00019666501642738197, + "loss": 0.3353, + "step": 5679 + }, + { + "epoch": 0.46014257939079717, + "grad_norm": 0.029689837247133255, + "learning_rate": 0.00019666051577478736, + "loss": 0.3633, + "step": 5680 + }, + { + "epoch": 0.4602235904082955, + "grad_norm": 0.030063379555940628, + "learning_rate": 0.00019665601512219275, + "loss": 0.3576, + "step": 5681 + }, + { + "epoch": 0.46030460142579394, + "grad_norm": 0.028926603496074677, + "learning_rate": 0.0001966515144695981, + "loss": 0.374, + "step": 5682 + }, + { + "epoch": 0.4603856124432923, + "grad_norm": 0.02919875644147396, + "learning_rate": 0.00019664701381700347, + "loss": 0.3595, + "step": 5683 + }, + { + "epoch": 0.46046662346079065, + "grad_norm": 0.03260861337184906, + "learning_rate": 0.00019664251316440885, + "loss": 0.3483, + "step": 5684 + }, + { + "epoch": 0.46054763447828906, + "grad_norm": 0.03058563731610775, + "learning_rate": 0.0001966380125118142, + "loss": 0.3436, + "step": 5685 + }, + { + "epoch": 0.4606286454957874, + "grad_norm": 0.034815624356269836, + "learning_rate": 0.0001966335118592196, + "loss": 0.3441, + "step": 5686 + }, + { + "epoch": 0.4607096565132858, + "grad_norm": 0.030256694182753563, + "learning_rate": 0.000196629011206625, + "loss": 0.3332, + "step": 5687 + }, + { + "epoch": 0.4607906675307842, + "grad_norm": 0.0370088554918766, + "learning_rate": 0.00019662451055403035, + "loss": 0.4123, + "step": 5688 + }, + { + "epoch": 0.4608716785482826, + "grad_norm": 0.0312041062861681, + "learning_rate": 0.0001966200099014357, + "loss": 0.3864, + "step": 5689 + }, + { + "epoch": 0.46095268956578095, + "grad_norm": 0.03293803706765175, + "learning_rate": 0.0001966155092488411, + "loss": 0.3534, + "step": 5690 + }, + { + "epoch": 
0.4610337005832793, + "grad_norm": 0.03456052392721176, + "learning_rate": 0.00019661100859624645, + "loss": 0.3744, + "step": 5691 + }, + { + "epoch": 0.4611147116007777, + "grad_norm": 0.027828998863697052, + "learning_rate": 0.00019660650794365184, + "loss": 0.3319, + "step": 5692 + }, + { + "epoch": 0.46119572261827607, + "grad_norm": 0.0315227285027504, + "learning_rate": 0.00019660200729105723, + "loss": 0.3482, + "step": 5693 + }, + { + "epoch": 0.4612767336357745, + "grad_norm": 0.032845206558704376, + "learning_rate": 0.0001965975066384626, + "loss": 0.3641, + "step": 5694 + }, + { + "epoch": 0.46135774465327284, + "grad_norm": 0.036846909672021866, + "learning_rate": 0.00019659300598586795, + "loss": 0.4032, + "step": 5695 + }, + { + "epoch": 0.46143875567077125, + "grad_norm": 0.03248875215649605, + "learning_rate": 0.00019658850533327334, + "loss": 0.3364, + "step": 5696 + }, + { + "epoch": 0.4615197666882696, + "grad_norm": 0.029342476278543472, + "learning_rate": 0.0001965840046806787, + "loss": 0.3364, + "step": 5697 + }, + { + "epoch": 0.46160077770576796, + "grad_norm": 0.03533339127898216, + "learning_rate": 0.00019657950402808408, + "loss": 0.3959, + "step": 5698 + }, + { + "epoch": 0.4616817887232664, + "grad_norm": 0.03511074557900429, + "learning_rate": 0.00019657500337548947, + "loss": 0.3542, + "step": 5699 + }, + { + "epoch": 0.46176279974076473, + "grad_norm": 0.03128790482878685, + "learning_rate": 0.00019657050272289483, + "loss": 0.3457, + "step": 5700 + }, + { + "epoch": 0.46184381075826314, + "grad_norm": 0.03102973848581314, + "learning_rate": 0.0001965660020703002, + "loss": 0.3221, + "step": 5701 + }, + { + "epoch": 0.4619248217757615, + "grad_norm": 0.03910433501005173, + "learning_rate": 0.00019656150141770558, + "loss": 0.3338, + "step": 5702 + }, + { + "epoch": 0.4620058327932599, + "grad_norm": 0.030722856521606445, + "learning_rate": 0.00019655700076511094, + "loss": 0.3936, + "step": 5703 + }, + { + "epoch": 0.46208684381075826, + "grad_norm": 0.0336427241563797, + "learning_rate": 0.00019655250011251633, + "loss": 0.3546, + "step": 5704 + }, + { + "epoch": 0.4621678548282566, + "grad_norm": 0.033403877168893814, + "learning_rate": 0.0001965479994599217, + "loss": 0.3971, + "step": 5705 + }, + { + "epoch": 0.46224886584575503, + "grad_norm": 0.03144184499979019, + "learning_rate": 0.00019654349880732707, + "loss": 0.3449, + "step": 5706 + }, + { + "epoch": 0.4623298768632534, + "grad_norm": 0.03269330784678459, + "learning_rate": 0.00019653899815473243, + "loss": 0.3501, + "step": 5707 + }, + { + "epoch": 0.4624108878807518, + "grad_norm": 0.030189063400030136, + "learning_rate": 0.00019653449750213782, + "loss": 0.3574, + "step": 5708 + }, + { + "epoch": 0.46249189889825015, + "grad_norm": 0.03226552903652191, + "learning_rate": 0.00019652999684954318, + "loss": 0.3588, + "step": 5709 + }, + { + "epoch": 0.46257290991574856, + "grad_norm": 0.03771393373608589, + "learning_rate": 0.00019652549619694857, + "loss": 0.375, + "step": 5710 + }, + { + "epoch": 0.4626539209332469, + "grad_norm": 0.028019536286592484, + "learning_rate": 0.00019652099554435395, + "loss": 0.3059, + "step": 5711 + }, + { + "epoch": 0.4627349319507453, + "grad_norm": 0.03134125843644142, + "learning_rate": 0.00019651649489175931, + "loss": 0.3717, + "step": 5712 + }, + { + "epoch": 0.4628159429682437, + "grad_norm": 0.031846798956394196, + "learning_rate": 0.00019651199423916467, + "loss": 0.3623, + "step": 5713 + }, + { + "epoch": 0.46289695398574204, + "grad_norm": 
0.030864115804433823, + "learning_rate": 0.00019650749358657006, + "loss": 0.3826, + "step": 5714 + }, + { + "epoch": 0.46297796500324045, + "grad_norm": 0.0309914480894804, + "learning_rate": 0.00019650299293397542, + "loss": 0.3777, + "step": 5715 + }, + { + "epoch": 0.4630589760207388, + "grad_norm": 0.05822967737913132, + "learning_rate": 0.0001964984922813808, + "loss": 0.4019, + "step": 5716 + }, + { + "epoch": 0.4631399870382372, + "grad_norm": 0.034782227128744125, + "learning_rate": 0.0001964939916287862, + "loss": 0.3833, + "step": 5717 + }, + { + "epoch": 0.4632209980557356, + "grad_norm": 0.0274979081004858, + "learning_rate": 0.00019648949097619156, + "loss": 0.3377, + "step": 5718 + }, + { + "epoch": 0.46330200907323393, + "grad_norm": 0.04161456227302551, + "learning_rate": 0.00019648499032359692, + "loss": 0.3934, + "step": 5719 + }, + { + "epoch": 0.46338302009073234, + "grad_norm": 0.028479766100645065, + "learning_rate": 0.0001964804896710023, + "loss": 0.3514, + "step": 5720 + }, + { + "epoch": 0.4634640311082307, + "grad_norm": 0.03187427669763565, + "learning_rate": 0.00019647598901840766, + "loss": 0.3481, + "step": 5721 + }, + { + "epoch": 0.4635450421257291, + "grad_norm": 0.030612220987677574, + "learning_rate": 0.00019647148836581305, + "loss": 0.3584, + "step": 5722 + }, + { + "epoch": 0.46362605314322747, + "grad_norm": 0.0349239706993103, + "learning_rate": 0.00019646698771321844, + "loss": 0.3635, + "step": 5723 + }, + { + "epoch": 0.4637070641607259, + "grad_norm": 0.03317857161164284, + "learning_rate": 0.0001964624870606238, + "loss": 0.4113, + "step": 5724 + }, + { + "epoch": 0.46378807517822424, + "grad_norm": 0.030748484656214714, + "learning_rate": 0.00019645798640802918, + "loss": 0.3788, + "step": 5725 + }, + { + "epoch": 0.4638690861957226, + "grad_norm": 0.028064358979463577, + "learning_rate": 0.00019645348575543454, + "loss": 0.3366, + "step": 5726 + }, + { + "epoch": 0.463950097213221, + "grad_norm": 0.0314946211874485, + "learning_rate": 0.0001964489851028399, + "loss": 0.3536, + "step": 5727 + }, + { + "epoch": 0.46403110823071936, + "grad_norm": 0.03007393144071102, + "learning_rate": 0.0001964444844502453, + "loss": 0.3389, + "step": 5728 + }, + { + "epoch": 0.46411211924821777, + "grad_norm": 0.031874869018793106, + "learning_rate": 0.00019643998379765068, + "loss": 0.3696, + "step": 5729 + }, + { + "epoch": 0.4641931302657161, + "grad_norm": 0.033363260328769684, + "learning_rate": 0.00019643548314505604, + "loss": 0.3965, + "step": 5730 + }, + { + "epoch": 0.46427414128321454, + "grad_norm": 0.03128187730908394, + "learning_rate": 0.00019643098249246143, + "loss": 0.3264, + "step": 5731 + }, + { + "epoch": 0.4643551523007129, + "grad_norm": 0.03325207158923149, + "learning_rate": 0.00019642648183986679, + "loss": 0.3524, + "step": 5732 + }, + { + "epoch": 0.4644361633182113, + "grad_norm": 0.02750324457883835, + "learning_rate": 0.00019642198118727217, + "loss": 0.3199, + "step": 5733 + }, + { + "epoch": 0.46451717433570966, + "grad_norm": 0.03271065652370453, + "learning_rate": 0.00019641748053467753, + "loss": 0.3883, + "step": 5734 + }, + { + "epoch": 0.464598185353208, + "grad_norm": 0.03460625186562538, + "learning_rate": 0.00019641297988208292, + "loss": 0.3595, + "step": 5735 + }, + { + "epoch": 0.4646791963707064, + "grad_norm": 0.03765340894460678, + "learning_rate": 0.00019640847922948828, + "loss": 0.3955, + "step": 5736 + }, + { + "epoch": 0.4647602073882048, + "grad_norm": 0.03118179738521576, + "learning_rate": 
0.00019640397857689367, + "loss": 0.3503, + "step": 5737 + }, + { + "epoch": 0.4648412184057032, + "grad_norm": 0.030739959329366684, + "learning_rate": 0.00019639947792429903, + "loss": 0.3483, + "step": 5738 + }, + { + "epoch": 0.46492222942320155, + "grad_norm": 0.03348294273018837, + "learning_rate": 0.00019639497727170441, + "loss": 0.3952, + "step": 5739 + }, + { + "epoch": 0.46500324044069996, + "grad_norm": 0.03235217183828354, + "learning_rate": 0.00019639047661910977, + "loss": 0.4024, + "step": 5740 + }, + { + "epoch": 0.4650842514581983, + "grad_norm": 0.03641180694103241, + "learning_rate": 0.00019638597596651516, + "loss": 0.3705, + "step": 5741 + }, + { + "epoch": 0.4651652624756967, + "grad_norm": 0.029355797916650772, + "learning_rate": 0.00019638147531392052, + "loss": 0.3022, + "step": 5742 + }, + { + "epoch": 0.4652462734931951, + "grad_norm": 0.033131200820207596, + "learning_rate": 0.0001963769746613259, + "loss": 0.3295, + "step": 5743 + }, + { + "epoch": 0.46532728451069344, + "grad_norm": 0.0363098680973053, + "learning_rate": 0.00019637247400873127, + "loss": 0.4094, + "step": 5744 + }, + { + "epoch": 0.46540829552819185, + "grad_norm": 0.03575873747467995, + "learning_rate": 0.00019636797335613666, + "loss": 0.3672, + "step": 5745 + }, + { + "epoch": 0.4654893065456902, + "grad_norm": 0.030971111729741096, + "learning_rate": 0.00019636347270354202, + "loss": 0.3718, + "step": 5746 + }, + { + "epoch": 0.4655703175631886, + "grad_norm": 0.030273951590061188, + "learning_rate": 0.0001963589720509474, + "loss": 0.3134, + "step": 5747 + }, + { + "epoch": 0.465651328580687, + "grad_norm": 0.03387882933020592, + "learning_rate": 0.00019635447139835276, + "loss": 0.3703, + "step": 5748 + }, + { + "epoch": 0.46573233959818533, + "grad_norm": 0.03747584670782089, + "learning_rate": 0.00019634997074575815, + "loss": 0.4081, + "step": 5749 + }, + { + "epoch": 0.46581335061568374, + "grad_norm": 0.030774252489209175, + "learning_rate": 0.0001963454700931635, + "loss": 0.3737, + "step": 5750 + }, + { + "epoch": 0.4658943616331821, + "grad_norm": 0.03460761159658432, + "learning_rate": 0.0001963409694405689, + "loss": 0.3649, + "step": 5751 + }, + { + "epoch": 0.4659753726506805, + "grad_norm": 0.0365169532597065, + "learning_rate": 0.00019633646878797426, + "loss": 0.3755, + "step": 5752 + }, + { + "epoch": 0.46605638366817886, + "grad_norm": 0.03159398213028908, + "learning_rate": 0.00019633196813537965, + "loss": 0.3296, + "step": 5753 + }, + { + "epoch": 0.4661373946856773, + "grad_norm": 0.03654010221362114, + "learning_rate": 0.000196327467482785, + "loss": 0.3659, + "step": 5754 + }, + { + "epoch": 0.46621840570317563, + "grad_norm": 0.030611230060458183, + "learning_rate": 0.0001963229668301904, + "loss": 0.3642, + "step": 5755 + }, + { + "epoch": 0.466299416720674, + "grad_norm": 0.031239870935678482, + "learning_rate": 0.00019631846617759575, + "loss": 0.3329, + "step": 5756 + }, + { + "epoch": 0.4663804277381724, + "grad_norm": 0.03149476647377014, + "learning_rate": 0.00019631396552500114, + "loss": 0.3521, + "step": 5757 + }, + { + "epoch": 0.46646143875567075, + "grad_norm": 0.03362254425883293, + "learning_rate": 0.0001963094648724065, + "loss": 0.3426, + "step": 5758 + }, + { + "epoch": 0.46654244977316917, + "grad_norm": 0.03327491879463196, + "learning_rate": 0.0001963049642198119, + "loss": 0.3881, + "step": 5759 + }, + { + "epoch": 0.4666234607906675, + "grad_norm": 0.03225428983569145, + "learning_rate": 0.00019630046356721725, + "loss": 0.3441, + "step": 
5760 + }, + { + "epoch": 0.46670447180816593, + "grad_norm": 0.031021784991025925, + "learning_rate": 0.00019629596291462263, + "loss": 0.3446, + "step": 5761 + }, + { + "epoch": 0.4667854828256643, + "grad_norm": 0.02941407449543476, + "learning_rate": 0.00019629146226202802, + "loss": 0.3714, + "step": 5762 + }, + { + "epoch": 0.46686649384316264, + "grad_norm": 0.02663205750286579, + "learning_rate": 0.00019628696160943338, + "loss": 0.3546, + "step": 5763 + }, + { + "epoch": 0.46694750486066106, + "grad_norm": 0.032318115234375, + "learning_rate": 0.00019628246095683874, + "loss": 0.3536, + "step": 5764 + }, + { + "epoch": 0.4670285158781594, + "grad_norm": 0.029012607410550117, + "learning_rate": 0.00019627796030424413, + "loss": 0.3302, + "step": 5765 + }, + { + "epoch": 0.4671095268956578, + "grad_norm": 0.03587726131081581, + "learning_rate": 0.0001962734596516495, + "loss": 0.4192, + "step": 5766 + }, + { + "epoch": 0.4671905379131562, + "grad_norm": 0.03222670033574104, + "learning_rate": 0.00019626895899905488, + "loss": 0.3345, + "step": 5767 + }, + { + "epoch": 0.4672715489306546, + "grad_norm": 0.03298739716410637, + "learning_rate": 0.00019626445834646026, + "loss": 0.3644, + "step": 5768 + }, + { + "epoch": 0.46735255994815295, + "grad_norm": 0.030183693394064903, + "learning_rate": 0.00019625995769386562, + "loss": 0.3478, + "step": 5769 + }, + { + "epoch": 0.4674335709656513, + "grad_norm": 0.03015521913766861, + "learning_rate": 0.00019625545704127098, + "loss": 0.3734, + "step": 5770 + }, + { + "epoch": 0.4675145819831497, + "grad_norm": 0.033700130879879, + "learning_rate": 0.00019625095638867637, + "loss": 0.3917, + "step": 5771 + }, + { + "epoch": 0.46759559300064807, + "grad_norm": 0.034100547432899475, + "learning_rate": 0.00019624645573608173, + "loss": 0.3633, + "step": 5772 + }, + { + "epoch": 0.4676766040181465, + "grad_norm": 0.028375964611768723, + "learning_rate": 0.00019624195508348712, + "loss": 0.3957, + "step": 5773 + }, + { + "epoch": 0.46775761503564484, + "grad_norm": 0.03054911084473133, + "learning_rate": 0.0001962374544308925, + "loss": 0.36, + "step": 5774 + }, + { + "epoch": 0.46783862605314325, + "grad_norm": 0.029022803530097008, + "learning_rate": 0.00019623295377829786, + "loss": 0.3553, + "step": 5775 + }, + { + "epoch": 0.4679196370706416, + "grad_norm": 0.029407450929284096, + "learning_rate": 0.00019622845312570322, + "loss": 0.3464, + "step": 5776 + }, + { + "epoch": 0.46800064808814, + "grad_norm": 0.029288267716765404, + "learning_rate": 0.0001962239524731086, + "loss": 0.3328, + "step": 5777 + }, + { + "epoch": 0.46808165910563837, + "grad_norm": 0.028497010469436646, + "learning_rate": 0.00019621945182051397, + "loss": 0.3166, + "step": 5778 + }, + { + "epoch": 0.4681626701231367, + "grad_norm": 0.03194279596209526, + "learning_rate": 0.00019621495116791936, + "loss": 0.3442, + "step": 5779 + }, + { + "epoch": 0.46824368114063514, + "grad_norm": 0.026724031195044518, + "learning_rate": 0.00019621045051532475, + "loss": 0.3245, + "step": 5780 + }, + { + "epoch": 0.4683246921581335, + "grad_norm": 0.03415639325976372, + "learning_rate": 0.0001962059498627301, + "loss": 0.35, + "step": 5781 + }, + { + "epoch": 0.4684057031756319, + "grad_norm": 0.03118039481341839, + "learning_rate": 0.00019620144921013547, + "loss": 0.3783, + "step": 5782 + }, + { + "epoch": 0.46848671419313026, + "grad_norm": 0.035469118505716324, + "learning_rate": 0.00019619694855754085, + "loss": 0.3465, + "step": 5783 + }, + { + "epoch": 0.46856772521062867, + 
"grad_norm": 0.03357202932238579, + "learning_rate": 0.0001961924479049462, + "loss": 0.4097, + "step": 5784 + }, + { + "epoch": 0.468648736228127, + "grad_norm": 0.03250733017921448, + "learning_rate": 0.0001961879472523516, + "loss": 0.3894, + "step": 5785 + }, + { + "epoch": 0.4687297472456254, + "grad_norm": 0.03183455392718315, + "learning_rate": 0.000196183446599757, + "loss": 0.3643, + "step": 5786 + }, + { + "epoch": 0.4688107582631238, + "grad_norm": 0.035170722752809525, + "learning_rate": 0.00019617894594716235, + "loss": 0.339, + "step": 5787 + }, + { + "epoch": 0.46889176928062215, + "grad_norm": 0.034206192940473557, + "learning_rate": 0.0001961744452945677, + "loss": 0.4072, + "step": 5788 + }, + { + "epoch": 0.46897278029812056, + "grad_norm": 0.036689676344394684, + "learning_rate": 0.0001961699446419731, + "loss": 0.4151, + "step": 5789 + }, + { + "epoch": 0.4690537913156189, + "grad_norm": 0.031407617032527924, + "learning_rate": 0.00019616544398937846, + "loss": 0.3366, + "step": 5790 + }, + { + "epoch": 0.46913480233311733, + "grad_norm": 0.03071746416389942, + "learning_rate": 0.00019616094333678384, + "loss": 0.3637, + "step": 5791 + }, + { + "epoch": 0.4692158133506157, + "grad_norm": 0.030194265767931938, + "learning_rate": 0.00019615644268418923, + "loss": 0.3516, + "step": 5792 + }, + { + "epoch": 0.46929682436811404, + "grad_norm": 0.02895018830895424, + "learning_rate": 0.0001961519420315946, + "loss": 0.3574, + "step": 5793 + }, + { + "epoch": 0.46937783538561245, + "grad_norm": 0.03663880005478859, + "learning_rate": 0.00019614744137899998, + "loss": 0.3274, + "step": 5794 + }, + { + "epoch": 0.4694588464031108, + "grad_norm": 0.03252609446644783, + "learning_rate": 0.00019614294072640534, + "loss": 0.33, + "step": 5795 + }, + { + "epoch": 0.4695398574206092, + "grad_norm": 0.035485222935676575, + "learning_rate": 0.0001961384400738107, + "loss": 0.3711, + "step": 5796 + }, + { + "epoch": 0.4696208684381076, + "grad_norm": 0.0342998169362545, + "learning_rate": 0.00019613393942121608, + "loss": 0.3839, + "step": 5797 + }, + { + "epoch": 0.469701879455606, + "grad_norm": 0.032619740813970566, + "learning_rate": 0.00019612943876862147, + "loss": 0.3822, + "step": 5798 + }, + { + "epoch": 0.46978289047310434, + "grad_norm": 0.03156382218003273, + "learning_rate": 0.00019612493811602683, + "loss": 0.3753, + "step": 5799 + }, + { + "epoch": 0.4698639014906027, + "grad_norm": 0.02982146479189396, + "learning_rate": 0.00019612043746343222, + "loss": 0.3427, + "step": 5800 + }, + { + "epoch": 0.4699449125081011, + "grad_norm": 0.03202043101191521, + "learning_rate": 0.00019611593681083758, + "loss": 0.3244, + "step": 5801 + }, + { + "epoch": 0.47002592352559946, + "grad_norm": 0.031166845932602882, + "learning_rate": 0.00019611143615824294, + "loss": 0.3527, + "step": 5802 + }, + { + "epoch": 0.4701069345430979, + "grad_norm": 0.028590986505150795, + "learning_rate": 0.00019610693550564833, + "loss": 0.3551, + "step": 5803 + }, + { + "epoch": 0.47018794556059623, + "grad_norm": 0.03039284236729145, + "learning_rate": 0.0001961024348530537, + "loss": 0.3772, + "step": 5804 + }, + { + "epoch": 0.47026895657809464, + "grad_norm": 0.03211115673184395, + "learning_rate": 0.00019609793420045907, + "loss": 0.3392, + "step": 5805 + }, + { + "epoch": 0.470349967595593, + "grad_norm": 0.03226887807250023, + "learning_rate": 0.00019609343354786446, + "loss": 0.3898, + "step": 5806 + }, + { + "epoch": 0.47043097861309136, + "grad_norm": 0.03120870143175125, + "learning_rate": 
0.00019608893289526982, + "loss": 0.3335, + "step": 5807 + }, + { + "epoch": 0.47051198963058977, + "grad_norm": 0.03307618573307991, + "learning_rate": 0.00019608443224267518, + "loss": 0.4147, + "step": 5808 + }, + { + "epoch": 0.4705930006480881, + "grad_norm": 0.029730724170804024, + "learning_rate": 0.00019607993159008057, + "loss": 0.3283, + "step": 5809 + }, + { + "epoch": 0.47067401166558653, + "grad_norm": 0.030490990728139877, + "learning_rate": 0.00019607543093748595, + "loss": 0.3674, + "step": 5810 + }, + { + "epoch": 0.4707550226830849, + "grad_norm": 0.03562087193131447, + "learning_rate": 0.00019607093028489131, + "loss": 0.3826, + "step": 5811 + }, + { + "epoch": 0.4708360337005833, + "grad_norm": 0.033215828239917755, + "learning_rate": 0.0001960664296322967, + "loss": 0.3294, + "step": 5812 + }, + { + "epoch": 0.47091704471808166, + "grad_norm": 0.030647827312350273, + "learning_rate": 0.00019606192897970206, + "loss": 0.3758, + "step": 5813 + }, + { + "epoch": 0.47099805573558, + "grad_norm": 0.0350574292242527, + "learning_rate": 0.00019605742832710745, + "loss": 0.3849, + "step": 5814 + }, + { + "epoch": 0.4710790667530784, + "grad_norm": 0.03518437221646309, + "learning_rate": 0.0001960529276745128, + "loss": 0.3768, + "step": 5815 + }, + { + "epoch": 0.4711600777705768, + "grad_norm": 0.03374134749174118, + "learning_rate": 0.0001960484270219182, + "loss": 0.3908, + "step": 5816 + }, + { + "epoch": 0.4712410887880752, + "grad_norm": 0.02921205572783947, + "learning_rate": 0.00019604392636932356, + "loss": 0.3264, + "step": 5817 + }, + { + "epoch": 0.47132209980557355, + "grad_norm": 0.03081241063773632, + "learning_rate": 0.00019603942571672894, + "loss": 0.3214, + "step": 5818 + }, + { + "epoch": 0.47140311082307196, + "grad_norm": 0.0298855472356081, + "learning_rate": 0.0001960349250641343, + "loss": 0.3989, + "step": 5819 + }, + { + "epoch": 0.4714841218405703, + "grad_norm": 0.03593145310878754, + "learning_rate": 0.0001960304244115397, + "loss": 0.3994, + "step": 5820 + }, + { + "epoch": 0.47156513285806867, + "grad_norm": 0.03263883292675018, + "learning_rate": 0.00019602592375894505, + "loss": 0.4075, + "step": 5821 + }, + { + "epoch": 0.4716461438755671, + "grad_norm": 0.030939802527427673, + "learning_rate": 0.00019602142310635044, + "loss": 0.3339, + "step": 5822 + }, + { + "epoch": 0.47172715489306544, + "grad_norm": 0.032803893089294434, + "learning_rate": 0.0001960169224537558, + "loss": 0.4188, + "step": 5823 + }, + { + "epoch": 0.47180816591056385, + "grad_norm": 0.03152771666646004, + "learning_rate": 0.00019601242180116118, + "loss": 0.3906, + "step": 5824 + }, + { + "epoch": 0.4718891769280622, + "grad_norm": 0.030248772352933884, + "learning_rate": 0.00019600792114856654, + "loss": 0.3348, + "step": 5825 + }, + { + "epoch": 0.4719701879455606, + "grad_norm": 0.03298931196331978, + "learning_rate": 0.00019600342049597193, + "loss": 0.3274, + "step": 5826 + }, + { + "epoch": 0.47205119896305897, + "grad_norm": 0.030176391825079918, + "learning_rate": 0.0001959989198433773, + "loss": 0.3597, + "step": 5827 + }, + { + "epoch": 0.4721322099805574, + "grad_norm": 0.03578178584575653, + "learning_rate": 0.00019599441919078268, + "loss": 0.3713, + "step": 5828 + }, + { + "epoch": 0.47221322099805574, + "grad_norm": 0.035716280341148376, + "learning_rate": 0.00019598991853818804, + "loss": 0.3836, + "step": 5829 + }, + { + "epoch": 0.4722942320155541, + "grad_norm": 0.03593599051237106, + "learning_rate": 0.00019598541788559343, + "loss": 0.3151, + 
"step": 5830 + }, + { + "epoch": 0.4723752430330525, + "grad_norm": 0.03321794047951698, + "learning_rate": 0.00019598091723299879, + "loss": 0.3865, + "step": 5831 + }, + { + "epoch": 0.47245625405055086, + "grad_norm": 0.03270499035716057, + "learning_rate": 0.00019597641658040417, + "loss": 0.3526, + "step": 5832 + }, + { + "epoch": 0.4725372650680493, + "grad_norm": 0.03048735298216343, + "learning_rate": 0.00019597191592780953, + "loss": 0.3733, + "step": 5833 + }, + { + "epoch": 0.47261827608554763, + "grad_norm": 0.034730520099401474, + "learning_rate": 0.00019596741527521492, + "loss": 0.4253, + "step": 5834 + }, + { + "epoch": 0.47269928710304604, + "grad_norm": 0.028934409841895103, + "learning_rate": 0.00019596291462262028, + "loss": 0.3417, + "step": 5835 + }, + { + "epoch": 0.4727802981205444, + "grad_norm": 0.02618395909667015, + "learning_rate": 0.00019595841397002567, + "loss": 0.3397, + "step": 5836 + }, + { + "epoch": 0.47286130913804275, + "grad_norm": 0.0328238420188427, + "learning_rate": 0.00019595391331743103, + "loss": 0.349, + "step": 5837 + }, + { + "epoch": 0.47294232015554116, + "grad_norm": 0.03482384979724884, + "learning_rate": 0.00019594941266483642, + "loss": 0.33, + "step": 5838 + }, + { + "epoch": 0.4730233311730395, + "grad_norm": 0.030215738341212273, + "learning_rate": 0.00019594491201224178, + "loss": 0.3656, + "step": 5839 + }, + { + "epoch": 0.47310434219053793, + "grad_norm": 0.03280828148126602, + "learning_rate": 0.00019594041135964716, + "loss": 0.364, + "step": 5840 + }, + { + "epoch": 0.4731853532080363, + "grad_norm": 0.03489303216338158, + "learning_rate": 0.00019593591070705252, + "loss": 0.3834, + "step": 5841 + }, + { + "epoch": 0.4732663642255347, + "grad_norm": 0.029000435024499893, + "learning_rate": 0.0001959314100544579, + "loss": 0.3423, + "step": 5842 + }, + { + "epoch": 0.47334737524303305, + "grad_norm": 0.028401605784893036, + "learning_rate": 0.0001959269094018633, + "loss": 0.3591, + "step": 5843 + }, + { + "epoch": 0.4734283862605314, + "grad_norm": 0.031942471861839294, + "learning_rate": 0.00019592240874926866, + "loss": 0.3582, + "step": 5844 + }, + { + "epoch": 0.4735093972780298, + "grad_norm": 0.03147265687584877, + "learning_rate": 0.00019591790809667402, + "loss": 0.3321, + "step": 5845 + }, + { + "epoch": 0.4735904082955282, + "grad_norm": 0.030731940641999245, + "learning_rate": 0.0001959134074440794, + "loss": 0.3143, + "step": 5846 + }, + { + "epoch": 0.4736714193130266, + "grad_norm": 0.02728288061916828, + "learning_rate": 0.00019590890679148476, + "loss": 0.3787, + "step": 5847 + }, + { + "epoch": 0.47375243033052494, + "grad_norm": 0.02671802043914795, + "learning_rate": 0.00019590440613889015, + "loss": 0.3523, + "step": 5848 + }, + { + "epoch": 0.47383344134802335, + "grad_norm": 0.02398890070617199, + "learning_rate": 0.00019589990548629554, + "loss": 0.3107, + "step": 5849 + }, + { + "epoch": 0.4739144523655217, + "grad_norm": 0.028565047308802605, + "learning_rate": 0.0001958954048337009, + "loss": 0.355, + "step": 5850 + }, + { + "epoch": 0.47399546338302007, + "grad_norm": 0.026737749576568604, + "learning_rate": 0.00019589090418110626, + "loss": 0.3599, + "step": 5851 + }, + { + "epoch": 0.4740764744005185, + "grad_norm": 0.03038250282406807, + "learning_rate": 0.00019588640352851165, + "loss": 0.3882, + "step": 5852 + }, + { + "epoch": 0.47415748541801683, + "grad_norm": 0.036394696682691574, + "learning_rate": 0.000195881902875917, + "loss": 0.4081, + "step": 5853 + }, + { + "epoch": 
0.47423849643551524, + "grad_norm": 0.031558968126773834, + "learning_rate": 0.0001958774022233224, + "loss": 0.3896, + "step": 5854 + }, + { + "epoch": 0.4743195074530136, + "grad_norm": 0.029992422088980675, + "learning_rate": 0.00019587290157072778, + "loss": 0.3306, + "step": 5855 + }, + { + "epoch": 0.474400518470512, + "grad_norm": 0.02885795570909977, + "learning_rate": 0.00019586840091813314, + "loss": 0.3439, + "step": 5856 + }, + { + "epoch": 0.47448152948801037, + "grad_norm": 0.03568622097373009, + "learning_rate": 0.0001958639002655385, + "loss": 0.3697, + "step": 5857 + }, + { + "epoch": 0.4745625405055087, + "grad_norm": 0.03252142295241356, + "learning_rate": 0.0001958593996129439, + "loss": 0.3378, + "step": 5858 + }, + { + "epoch": 0.47464355152300713, + "grad_norm": 0.032284434884786606, + "learning_rate": 0.00019585489896034925, + "loss": 0.3845, + "step": 5859 + }, + { + "epoch": 0.4747245625405055, + "grad_norm": 0.0372265949845314, + "learning_rate": 0.00019585039830775463, + "loss": 0.3658, + "step": 5860 + }, + { + "epoch": 0.4748055735580039, + "grad_norm": 0.03087705560028553, + "learning_rate": 0.00019584589765516002, + "loss": 0.3932, + "step": 5861 + }, + { + "epoch": 0.47488658457550226, + "grad_norm": 0.032511867582798004, + "learning_rate": 0.00019584139700256538, + "loss": 0.3503, + "step": 5862 + }, + { + "epoch": 0.47496759559300067, + "grad_norm": 0.028408238664269447, + "learning_rate": 0.00019583689634997077, + "loss": 0.3346, + "step": 5863 + }, + { + "epoch": 0.475048606610499, + "grad_norm": 0.031963370740413666, + "learning_rate": 0.00019583239569737613, + "loss": 0.3491, + "step": 5864 + }, + { + "epoch": 0.4751296176279974, + "grad_norm": 0.031659066677093506, + "learning_rate": 0.0001958278950447815, + "loss": 0.363, + "step": 5865 + }, + { + "epoch": 0.4752106286454958, + "grad_norm": 0.033491283655166626, + "learning_rate": 0.00019582339439218688, + "loss": 0.3356, + "step": 5866 + }, + { + "epoch": 0.47529163966299415, + "grad_norm": 0.030474351719021797, + "learning_rate": 0.00019581889373959226, + "loss": 0.4148, + "step": 5867 + }, + { + "epoch": 0.47537265068049256, + "grad_norm": 0.0345035158097744, + "learning_rate": 0.00019581439308699762, + "loss": 0.3473, + "step": 5868 + }, + { + "epoch": 0.4754536616979909, + "grad_norm": 0.030190279707312584, + "learning_rate": 0.000195809892434403, + "loss": 0.3364, + "step": 5869 + }, + { + "epoch": 0.4755346727154893, + "grad_norm": 0.034467823803424835, + "learning_rate": 0.00019580539178180837, + "loss": 0.3863, + "step": 5870 + }, + { + "epoch": 0.4756156837329877, + "grad_norm": 0.034213580191135406, + "learning_rate": 0.00019580089112921373, + "loss": 0.3569, + "step": 5871 + }, + { + "epoch": 0.4756966947504861, + "grad_norm": 0.03731187433004379, + "learning_rate": 0.00019579639047661912, + "loss": 0.3884, + "step": 5872 + }, + { + "epoch": 0.47577770576798445, + "grad_norm": 0.030831869691610336, + "learning_rate": 0.0001957918898240245, + "loss": 0.3383, + "step": 5873 + }, + { + "epoch": 0.4758587167854828, + "grad_norm": 0.02833954058587551, + "learning_rate": 0.00019578738917142986, + "loss": 0.297, + "step": 5874 + }, + { + "epoch": 0.4759397278029812, + "grad_norm": 0.035140588879585266, + "learning_rate": 0.00019578288851883525, + "loss": 0.3608, + "step": 5875 + }, + { + "epoch": 0.47602073882047957, + "grad_norm": 0.029223179444670677, + "learning_rate": 0.0001957783878662406, + "loss": 0.3978, + "step": 5876 + }, + { + "epoch": 0.476101749837978, + "grad_norm": 
0.02845843695104122, + "learning_rate": 0.00019577388721364597, + "loss": 0.3281, + "step": 5877 + }, + { + "epoch": 0.47618276085547634, + "grad_norm": 0.03213419020175934, + "learning_rate": 0.00019576938656105136, + "loss": 0.3228, + "step": 5878 + }, + { + "epoch": 0.47626377187297475, + "grad_norm": 0.03073774464428425, + "learning_rate": 0.00019576488590845675, + "loss": 0.3533, + "step": 5879 + }, + { + "epoch": 0.4763447828904731, + "grad_norm": 0.028143148869276047, + "learning_rate": 0.0001957603852558621, + "loss": 0.3223, + "step": 5880 + }, + { + "epoch": 0.47642579390797146, + "grad_norm": 0.03042803891003132, + "learning_rate": 0.0001957558846032675, + "loss": 0.3501, + "step": 5881 + }, + { + "epoch": 0.4765068049254699, + "grad_norm": 0.02732650376856327, + "learning_rate": 0.00019575138395067285, + "loss": 0.3089, + "step": 5882 + }, + { + "epoch": 0.47658781594296823, + "grad_norm": 0.026975180953741074, + "learning_rate": 0.00019574688329807821, + "loss": 0.338, + "step": 5883 + }, + { + "epoch": 0.47666882696046664, + "grad_norm": 0.028704164549708366, + "learning_rate": 0.0001957423826454836, + "loss": 0.3402, + "step": 5884 + }, + { + "epoch": 0.476749837977965, + "grad_norm": 0.031123366206884384, + "learning_rate": 0.000195737881992889, + "loss": 0.3004, + "step": 5885 + }, + { + "epoch": 0.4768308489954634, + "grad_norm": 0.032670848071575165, + "learning_rate": 0.00019573338134029435, + "loss": 0.3459, + "step": 5886 + }, + { + "epoch": 0.47691186001296176, + "grad_norm": 0.027524948120117188, + "learning_rate": 0.00019572888068769974, + "loss": 0.3338, + "step": 5887 + }, + { + "epoch": 0.4769928710304601, + "grad_norm": 0.033073924481868744, + "learning_rate": 0.0001957243800351051, + "loss": 0.3398, + "step": 5888 + }, + { + "epoch": 0.47707388204795853, + "grad_norm": 0.033785343170166016, + "learning_rate": 0.00019571987938251046, + "loss": 0.3872, + "step": 5889 + }, + { + "epoch": 0.4771548930654569, + "grad_norm": 0.0337032824754715, + "learning_rate": 0.00019571537872991584, + "loss": 0.382, + "step": 5890 + }, + { + "epoch": 0.4772359040829553, + "grad_norm": 0.03290359675884247, + "learning_rate": 0.00019571087807732123, + "loss": 0.3664, + "step": 5891 + }, + { + "epoch": 0.47731691510045365, + "grad_norm": 0.03508547320961952, + "learning_rate": 0.0001957063774247266, + "loss": 0.398, + "step": 5892 + }, + { + "epoch": 0.47739792611795207, + "grad_norm": 0.026666609570384026, + "learning_rate": 0.00019570187677213198, + "loss": 0.3114, + "step": 5893 + }, + { + "epoch": 0.4774789371354504, + "grad_norm": 0.0331806018948555, + "learning_rate": 0.00019569737611953734, + "loss": 0.3685, + "step": 5894 + }, + { + "epoch": 0.4775599481529488, + "grad_norm": 0.03472977504134178, + "learning_rate": 0.00019569287546694272, + "loss": 0.3646, + "step": 5895 + }, + { + "epoch": 0.4776409591704472, + "grad_norm": 0.03430356830358505, + "learning_rate": 0.00019568837481434808, + "loss": 0.3676, + "step": 5896 + }, + { + "epoch": 0.47772197018794554, + "grad_norm": 0.0349530354142189, + "learning_rate": 0.00019568387416175347, + "loss": 0.3811, + "step": 5897 + }, + { + "epoch": 0.47780298120544396, + "grad_norm": 0.03207635134458542, + "learning_rate": 0.00019567937350915883, + "loss": 0.3917, + "step": 5898 + }, + { + "epoch": 0.4778839922229423, + "grad_norm": 0.03403360769152641, + "learning_rate": 0.00019567487285656422, + "loss": 0.3919, + "step": 5899 + }, + { + "epoch": 0.4779650032404407, + "grad_norm": 0.03717518225312233, + "learning_rate": 
0.00019567037220396958, + "loss": 0.4059, + "step": 5900 + }, + { + "epoch": 0.4780460142579391, + "grad_norm": 0.029602613300085068, + "learning_rate": 0.00019566587155137497, + "loss": 0.3616, + "step": 5901 + }, + { + "epoch": 0.47812702527543743, + "grad_norm": 0.03548922762274742, + "learning_rate": 0.00019566137089878033, + "loss": 0.387, + "step": 5902 + }, + { + "epoch": 0.47820803629293585, + "grad_norm": 0.030909236520528793, + "learning_rate": 0.0001956568702461857, + "loss": 0.3152, + "step": 5903 + }, + { + "epoch": 0.4782890473104342, + "grad_norm": 0.033036008477211, + "learning_rate": 0.00019565236959359107, + "loss": 0.3995, + "step": 5904 + }, + { + "epoch": 0.4783700583279326, + "grad_norm": 0.028769217431545258, + "learning_rate": 0.00019564786894099646, + "loss": 0.3319, + "step": 5905 + }, + { + "epoch": 0.47845106934543097, + "grad_norm": 0.03641336411237717, + "learning_rate": 0.00019564336828840182, + "loss": 0.3804, + "step": 5906 + }, + { + "epoch": 0.4785320803629294, + "grad_norm": 0.032285552471876144, + "learning_rate": 0.0001956388676358072, + "loss": 0.3149, + "step": 5907 + }, + { + "epoch": 0.47861309138042774, + "grad_norm": 0.028594225645065308, + "learning_rate": 0.00019563436698321257, + "loss": 0.3349, + "step": 5908 + }, + { + "epoch": 0.4786941023979261, + "grad_norm": 0.038507889956235886, + "learning_rate": 0.00019562986633061795, + "loss": 0.3708, + "step": 5909 + }, + { + "epoch": 0.4787751134154245, + "grad_norm": 0.033712759613990784, + "learning_rate": 0.00019562536567802331, + "loss": 0.3981, + "step": 5910 + }, + { + "epoch": 0.47885612443292286, + "grad_norm": 0.03205728530883789, + "learning_rate": 0.0001956208650254287, + "loss": 0.3889, + "step": 5911 + }, + { + "epoch": 0.47893713545042127, + "grad_norm": 0.032320015132427216, + "learning_rate": 0.00019561636437283406, + "loss": 0.3299, + "step": 5912 + }, + { + "epoch": 0.4790181464679196, + "grad_norm": 0.03484988957643509, + "learning_rate": 0.00019561186372023945, + "loss": 0.3899, + "step": 5913 + }, + { + "epoch": 0.47909915748541804, + "grad_norm": 0.036610767245292664, + "learning_rate": 0.0001956073630676448, + "loss": 0.3949, + "step": 5914 + }, + { + "epoch": 0.4791801685029164, + "grad_norm": 0.030673658475279808, + "learning_rate": 0.0001956028624150502, + "loss": 0.3216, + "step": 5915 + }, + { + "epoch": 0.4792611795204148, + "grad_norm": 0.03905686363577843, + "learning_rate": 0.00019559836176245556, + "loss": 0.3362, + "step": 5916 + }, + { + "epoch": 0.47934219053791316, + "grad_norm": 0.03621886670589447, + "learning_rate": 0.00019559386110986094, + "loss": 0.3495, + "step": 5917 + }, + { + "epoch": 0.4794232015554115, + "grad_norm": 0.03008181042969227, + "learning_rate": 0.00019558936045726633, + "loss": 0.36, + "step": 5918 + }, + { + "epoch": 0.4795042125729099, + "grad_norm": 0.02908833883702755, + "learning_rate": 0.0001955848598046717, + "loss": 0.3452, + "step": 5919 + }, + { + "epoch": 0.4795852235904083, + "grad_norm": 0.033341579139232635, + "learning_rate": 0.00019558035915207705, + "loss": 0.3276, + "step": 5920 + }, + { + "epoch": 0.4796662346079067, + "grad_norm": 0.028378089889883995, + "learning_rate": 0.00019557585849948244, + "loss": 0.3221, + "step": 5921 + }, + { + "epoch": 0.47974724562540505, + "grad_norm": 0.03122635930776596, + "learning_rate": 0.0001955713578468878, + "loss": 0.3407, + "step": 5922 + }, + { + "epoch": 0.47982825664290346, + "grad_norm": 0.03359805792570114, + "learning_rate": 0.00019556685719429319, + "loss": 0.3254, + 
"step": 5923 + }, + { + "epoch": 0.4799092676604018, + "grad_norm": 0.03142951428890228, + "learning_rate": 0.00019556235654169857, + "loss": 0.3226, + "step": 5924 + }, + { + "epoch": 0.4799902786779002, + "grad_norm": 0.02958759479224682, + "learning_rate": 0.00019555785588910393, + "loss": 0.3679, + "step": 5925 + }, + { + "epoch": 0.4800712896953986, + "grad_norm": 0.03471509367227554, + "learning_rate": 0.0001955533552365093, + "loss": 0.365, + "step": 5926 + }, + { + "epoch": 0.48015230071289694, + "grad_norm": 0.03234826773405075, + "learning_rate": 0.00019554885458391468, + "loss": 0.4251, + "step": 5927 + }, + { + "epoch": 0.48023331173039535, + "grad_norm": 0.03339417651295662, + "learning_rate": 0.00019554435393132004, + "loss": 0.3869, + "step": 5928 + }, + { + "epoch": 0.4803143227478937, + "grad_norm": 0.03371240943670273, + "learning_rate": 0.00019553985327872543, + "loss": 0.3468, + "step": 5929 + }, + { + "epoch": 0.4803953337653921, + "grad_norm": 0.03137330710887909, + "learning_rate": 0.00019553535262613081, + "loss": 0.3755, + "step": 5930 + }, + { + "epoch": 0.4804763447828905, + "grad_norm": 0.03640046343207359, + "learning_rate": 0.00019553085197353617, + "loss": 0.4203, + "step": 5931 + }, + { + "epoch": 0.48055735580038883, + "grad_norm": 0.030032861977815628, + "learning_rate": 0.00019552635132094156, + "loss": 0.3377, + "step": 5932 + }, + { + "epoch": 0.48063836681788724, + "grad_norm": 0.031163305044174194, + "learning_rate": 0.00019552185066834692, + "loss": 0.348, + "step": 5933 + }, + { + "epoch": 0.4807193778353856, + "grad_norm": 0.02940339781343937, + "learning_rate": 0.00019551735001575228, + "loss": 0.3285, + "step": 5934 + }, + { + "epoch": 0.480800388852884, + "grad_norm": 0.03373560681939125, + "learning_rate": 0.00019551284936315767, + "loss": 0.4155, + "step": 5935 + }, + { + "epoch": 0.48088139987038236, + "grad_norm": 0.03505878895521164, + "learning_rate": 0.00019550834871056306, + "loss": 0.3487, + "step": 5936 + }, + { + "epoch": 0.4809624108878808, + "grad_norm": 0.028804706409573555, + "learning_rate": 0.00019550384805796842, + "loss": 0.356, + "step": 5937 + }, + { + "epoch": 0.48104342190537913, + "grad_norm": 0.030923301354050636, + "learning_rate": 0.0001954993474053738, + "loss": 0.3703, + "step": 5938 + }, + { + "epoch": 0.4811244329228775, + "grad_norm": 0.035969726741313934, + "learning_rate": 0.00019549484675277916, + "loss": 0.3699, + "step": 5939 + }, + { + "epoch": 0.4812054439403759, + "grad_norm": 0.030231349170207977, + "learning_rate": 0.00019549034610018452, + "loss": 0.339, + "step": 5940 + }, + { + "epoch": 0.48128645495787425, + "grad_norm": 0.026125887408852577, + "learning_rate": 0.0001954858454475899, + "loss": 0.3463, + "step": 5941 + }, + { + "epoch": 0.48136746597537267, + "grad_norm": 0.03523105010390282, + "learning_rate": 0.0001954813447949953, + "loss": 0.3409, + "step": 5942 + }, + { + "epoch": 0.481448476992871, + "grad_norm": 0.03209773078560829, + "learning_rate": 0.00019547684414240066, + "loss": 0.3911, + "step": 5943 + }, + { + "epoch": 0.48152948801036943, + "grad_norm": 0.028588753193616867, + "learning_rate": 0.00019547234348980604, + "loss": 0.3157, + "step": 5944 + }, + { + "epoch": 0.4816104990278678, + "grad_norm": 0.03126561641693115, + "learning_rate": 0.0001954678428372114, + "loss": 0.3308, + "step": 5945 + }, + { + "epoch": 0.48169151004536614, + "grad_norm": 0.030155273154377937, + "learning_rate": 0.00019546334218461676, + "loss": 0.3458, + "step": 5946 + }, + { + "epoch": 
0.48177252106286456, + "grad_norm": 0.03246188163757324, + "learning_rate": 0.00019545884153202215, + "loss": 0.3463, + "step": 5947 + }, + { + "epoch": 0.4818535320803629, + "grad_norm": 0.03119421936571598, + "learning_rate": 0.00019545434087942754, + "loss": 0.3742, + "step": 5948 + }, + { + "epoch": 0.4819345430978613, + "grad_norm": 0.03162112459540367, + "learning_rate": 0.0001954498402268329, + "loss": 0.3589, + "step": 5949 + }, + { + "epoch": 0.4820155541153597, + "grad_norm": 0.02846745029091835, + "learning_rate": 0.00019544533957423829, + "loss": 0.3648, + "step": 5950 + }, + { + "epoch": 0.4820965651328581, + "grad_norm": 0.03139728680253029, + "learning_rate": 0.00019544083892164365, + "loss": 0.3772, + "step": 5951 + }, + { + "epoch": 0.48217757615035645, + "grad_norm": 0.028640342876315117, + "learning_rate": 0.000195436338269049, + "loss": 0.3831, + "step": 5952 + }, + { + "epoch": 0.4822585871678548, + "grad_norm": 0.030841778963804245, + "learning_rate": 0.0001954318376164544, + "loss": 0.3822, + "step": 5953 + }, + { + "epoch": 0.4823395981853532, + "grad_norm": 0.02715679071843624, + "learning_rate": 0.00019542733696385978, + "loss": 0.3489, + "step": 5954 + }, + { + "epoch": 0.48242060920285157, + "grad_norm": 0.03216562792658806, + "learning_rate": 0.00019542283631126514, + "loss": 0.3706, + "step": 5955 + }, + { + "epoch": 0.48250162022035, + "grad_norm": 0.03239520639181137, + "learning_rate": 0.00019541833565867053, + "loss": 0.3817, + "step": 5956 + }, + { + "epoch": 0.48258263123784834, + "grad_norm": 0.0378153957426548, + "learning_rate": 0.0001954138350060759, + "loss": 0.3877, + "step": 5957 + }, + { + "epoch": 0.48266364225534675, + "grad_norm": 0.037669822573661804, + "learning_rate": 0.00019540933435348125, + "loss": 0.3689, + "step": 5958 + }, + { + "epoch": 0.4827446532728451, + "grad_norm": 0.029966356232762337, + "learning_rate": 0.00019540483370088663, + "loss": 0.3522, + "step": 5959 + }, + { + "epoch": 0.48282566429034346, + "grad_norm": 0.03512732312083244, + "learning_rate": 0.00019540033304829202, + "loss": 0.3842, + "step": 5960 + }, + { + "epoch": 0.48290667530784187, + "grad_norm": 0.028936125338077545, + "learning_rate": 0.00019539583239569738, + "loss": 0.3423, + "step": 5961 + }, + { + "epoch": 0.4829876863253402, + "grad_norm": 0.029880812391638756, + "learning_rate": 0.00019539133174310277, + "loss": 0.362, + "step": 5962 + }, + { + "epoch": 0.48306869734283864, + "grad_norm": 0.030505992472171783, + "learning_rate": 0.00019538683109050813, + "loss": 0.3325, + "step": 5963 + }, + { + "epoch": 0.483149708360337, + "grad_norm": 0.030406080186367035, + "learning_rate": 0.0001953823304379135, + "loss": 0.3366, + "step": 5964 + }, + { + "epoch": 0.4832307193778354, + "grad_norm": 0.03137945011258125, + "learning_rate": 0.00019537782978531888, + "loss": 0.3203, + "step": 5965 + }, + { + "epoch": 0.48331173039533376, + "grad_norm": 0.03132305294275284, + "learning_rate": 0.00019537332913272426, + "loss": 0.3632, + "step": 5966 + }, + { + "epoch": 0.48339274141283217, + "grad_norm": 0.02978110872209072, + "learning_rate": 0.00019536882848012962, + "loss": 0.344, + "step": 5967 + }, + { + "epoch": 0.48347375243033053, + "grad_norm": 0.03157349303364754, + "learning_rate": 0.000195364327827535, + "loss": 0.3475, + "step": 5968 + }, + { + "epoch": 0.4835547634478289, + "grad_norm": 0.03231159225106239, + "learning_rate": 0.00019535982717494037, + "loss": 0.4046, + "step": 5969 + }, + { + "epoch": 0.4836357744653273, + "grad_norm": 
0.03151550516486168, + "learning_rate": 0.00019535532652234573, + "loss": 0.382, + "step": 5970 + }, + { + "epoch": 0.48371678548282565, + "grad_norm": 0.028865037485957146, + "learning_rate": 0.00019535082586975112, + "loss": 0.3196, + "step": 5971 + }, + { + "epoch": 0.48379779650032406, + "grad_norm": 0.028897427022457123, + "learning_rate": 0.0001953463252171565, + "loss": 0.2927, + "step": 5972 + }, + { + "epoch": 0.4838788075178224, + "grad_norm": 0.028442172333598137, + "learning_rate": 0.00019534182456456187, + "loss": 0.3318, + "step": 5973 + }, + { + "epoch": 0.48395981853532083, + "grad_norm": 0.02451622672379017, + "learning_rate": 0.00019533732391196725, + "loss": 0.3253, + "step": 5974 + }, + { + "epoch": 0.4840408295528192, + "grad_norm": 0.032599471509456635, + "learning_rate": 0.0001953328232593726, + "loss": 0.3427, + "step": 5975 + }, + { + "epoch": 0.48412184057031754, + "grad_norm": 0.030640697106719017, + "learning_rate": 0.000195328322606778, + "loss": 0.3665, + "step": 5976 + }, + { + "epoch": 0.48420285158781595, + "grad_norm": 0.02785688452422619, + "learning_rate": 0.00019532382195418336, + "loss": 0.3404, + "step": 5977 + }, + { + "epoch": 0.4842838626053143, + "grad_norm": 0.030281536281108856, + "learning_rate": 0.00019531932130158875, + "loss": 0.3297, + "step": 5978 + }, + { + "epoch": 0.4843648736228127, + "grad_norm": 0.0328233428299427, + "learning_rate": 0.0001953148206489941, + "loss": 0.3297, + "step": 5979 + }, + { + "epoch": 0.4844458846403111, + "grad_norm": 0.027729716151952744, + "learning_rate": 0.0001953103199963995, + "loss": 0.3383, + "step": 5980 + }, + { + "epoch": 0.4845268956578095, + "grad_norm": 0.03164516016840935, + "learning_rate": 0.00019530581934380485, + "loss": 0.3865, + "step": 5981 + }, + { + "epoch": 0.48460790667530784, + "grad_norm": 0.03192558512091637, + "learning_rate": 0.00019530131869121024, + "loss": 0.3368, + "step": 5982 + }, + { + "epoch": 0.4846889176928062, + "grad_norm": 0.03171934187412262, + "learning_rate": 0.0001952968180386156, + "loss": 0.3394, + "step": 5983 + }, + { + "epoch": 0.4847699287103046, + "grad_norm": 0.034574296325445175, + "learning_rate": 0.000195292317386021, + "loss": 0.4038, + "step": 5984 + }, + { + "epoch": 0.48485093972780297, + "grad_norm": 0.032755058258771896, + "learning_rate": 0.00019528781673342635, + "loss": 0.3144, + "step": 5985 + }, + { + "epoch": 0.4849319507453014, + "grad_norm": 0.03499835729598999, + "learning_rate": 0.00019528331608083174, + "loss": 0.3517, + "step": 5986 + }, + { + "epoch": 0.48501296176279973, + "grad_norm": 0.031813375651836395, + "learning_rate": 0.0001952788154282371, + "loss": 0.3153, + "step": 5987 + }, + { + "epoch": 0.48509397278029814, + "grad_norm": 0.03329386189579964, + "learning_rate": 0.00019527431477564248, + "loss": 0.3723, + "step": 5988 + }, + { + "epoch": 0.4851749837977965, + "grad_norm": 0.031179320067167282, + "learning_rate": 0.00019526981412304784, + "loss": 0.3379, + "step": 5989 + }, + { + "epoch": 0.48525599481529486, + "grad_norm": 0.03648926317691803, + "learning_rate": 0.00019526531347045323, + "loss": 0.403, + "step": 5990 + }, + { + "epoch": 0.48533700583279327, + "grad_norm": 0.0387888066470623, + "learning_rate": 0.0001952608128178586, + "loss": 0.4047, + "step": 5991 + }, + { + "epoch": 0.4854180168502916, + "grad_norm": 0.03588513657450676, + "learning_rate": 0.00019525631216526398, + "loss": 0.3659, + "step": 5992 + }, + { + "epoch": 0.48549902786779003, + "grad_norm": 0.0332111194729805, + "learning_rate": 
0.00019525181151266934, + "loss": 0.3611, + "step": 5993 + }, + { + "epoch": 0.4855800388852884, + "grad_norm": 0.03178248181939125, + "learning_rate": 0.00019524731086007472, + "loss": 0.3137, + "step": 5994 + }, + { + "epoch": 0.4856610499027868, + "grad_norm": 0.03442096710205078, + "learning_rate": 0.00019524281020748008, + "loss": 0.3655, + "step": 5995 + }, + { + "epoch": 0.48574206092028516, + "grad_norm": 0.028393248096108437, + "learning_rate": 0.00019523830955488547, + "loss": 0.3141, + "step": 5996 + }, + { + "epoch": 0.4858230719377835, + "grad_norm": 0.030575979501008987, + "learning_rate": 0.00019523380890229083, + "loss": 0.3729, + "step": 5997 + }, + { + "epoch": 0.4859040829552819, + "grad_norm": 0.0336354598402977, + "learning_rate": 0.00019522930824969622, + "loss": 0.4075, + "step": 5998 + }, + { + "epoch": 0.4859850939727803, + "grad_norm": 0.03110545314848423, + "learning_rate": 0.0001952248075971016, + "loss": 0.3841, + "step": 5999 + }, + { + "epoch": 0.4860661049902787, + "grad_norm": 0.03151804953813553, + "learning_rate": 0.00019522030694450697, + "loss": 0.3684, + "step": 6000 + }, + { + "epoch": 0.48614711600777705, + "grad_norm": 0.03389604762196541, + "learning_rate": 0.00019521580629191235, + "loss": 0.4194, + "step": 6001 + }, + { + "epoch": 0.48622812702527546, + "grad_norm": 0.02937939018011093, + "learning_rate": 0.0001952113056393177, + "loss": 0.3757, + "step": 6002 + }, + { + "epoch": 0.4863091380427738, + "grad_norm": 0.03905171900987625, + "learning_rate": 0.00019520680498672307, + "loss": 0.4182, + "step": 6003 + }, + { + "epoch": 0.48639014906027217, + "grad_norm": 0.02655964158475399, + "learning_rate": 0.00019520230433412846, + "loss": 0.3227, + "step": 6004 + }, + { + "epoch": 0.4864711600777706, + "grad_norm": 0.03764092177152634, + "learning_rate": 0.00019519780368153385, + "loss": 0.3575, + "step": 6005 + }, + { + "epoch": 0.48655217109526894, + "grad_norm": 0.031080730259418488, + "learning_rate": 0.0001951933030289392, + "loss": 0.3592, + "step": 6006 + }, + { + "epoch": 0.48663318211276735, + "grad_norm": 0.03204982727766037, + "learning_rate": 0.0001951888023763446, + "loss": 0.281, + "step": 6007 + }, + { + "epoch": 0.4867141931302657, + "grad_norm": 0.03170899301767349, + "learning_rate": 0.00019518430172374995, + "loss": 0.3886, + "step": 6008 + }, + { + "epoch": 0.4867952041477641, + "grad_norm": 0.0295356884598732, + "learning_rate": 0.00019517980107115531, + "loss": 0.3828, + "step": 6009 + }, + { + "epoch": 0.48687621516526247, + "grad_norm": 0.03186435624957085, + "learning_rate": 0.0001951753004185607, + "loss": 0.3921, + "step": 6010 + }, + { + "epoch": 0.4869572261827609, + "grad_norm": 0.0325162410736084, + "learning_rate": 0.0001951707997659661, + "loss": 0.3469, + "step": 6011 + }, + { + "epoch": 0.48703823720025924, + "grad_norm": 0.031725071370601654, + "learning_rate": 0.00019516629911337145, + "loss": 0.3444, + "step": 6012 + }, + { + "epoch": 0.4871192482177576, + "grad_norm": 0.029257534071803093, + "learning_rate": 0.00019516179846077684, + "loss": 0.3549, + "step": 6013 + }, + { + "epoch": 0.487200259235256, + "grad_norm": 0.029020339250564575, + "learning_rate": 0.0001951572978081822, + "loss": 0.3602, + "step": 6014 + }, + { + "epoch": 0.48728127025275436, + "grad_norm": 0.030635559931397438, + "learning_rate": 0.00019515279715558756, + "loss": 0.3246, + "step": 6015 + }, + { + "epoch": 0.4873622812702528, + "grad_norm": 0.029000451788306236, + "learning_rate": 0.00019514829650299294, + "loss": 0.3085, + "step": 
6016 + }, + { + "epoch": 0.48744329228775113, + "grad_norm": 0.03026675619184971, + "learning_rate": 0.00019514379585039833, + "loss": 0.3281, + "step": 6017 + }, + { + "epoch": 0.48752430330524954, + "grad_norm": 0.03477197140455246, + "learning_rate": 0.0001951392951978037, + "loss": 0.3268, + "step": 6018 + }, + { + "epoch": 0.4876053143227479, + "grad_norm": 0.031906504184007645, + "learning_rate": 0.00019513479454520908, + "loss": 0.3835, + "step": 6019 + }, + { + "epoch": 0.48768632534024625, + "grad_norm": 0.03229931741952896, + "learning_rate": 0.00019513029389261444, + "loss": 0.3055, + "step": 6020 + }, + { + "epoch": 0.48776733635774466, + "grad_norm": 0.039322223514318466, + "learning_rate": 0.0001951257932400198, + "loss": 0.3938, + "step": 6021 + }, + { + "epoch": 0.487848347375243, + "grad_norm": 0.030645597726106644, + "learning_rate": 0.00019512129258742519, + "loss": 0.3629, + "step": 6022 + }, + { + "epoch": 0.48792935839274143, + "grad_norm": 0.03114427626132965, + "learning_rate": 0.00019511679193483057, + "loss": 0.4017, + "step": 6023 + }, + { + "epoch": 0.4880103694102398, + "grad_norm": 0.027782771736383438, + "learning_rate": 0.00019511229128223593, + "loss": 0.3442, + "step": 6024 + }, + { + "epoch": 0.4880913804277382, + "grad_norm": 0.035519011318683624, + "learning_rate": 0.00019510779062964132, + "loss": 0.3857, + "step": 6025 + }, + { + "epoch": 0.48817239144523655, + "grad_norm": 0.036083195358514786, + "learning_rate": 0.00019510328997704668, + "loss": 0.4052, + "step": 6026 + }, + { + "epoch": 0.4882534024627349, + "grad_norm": 0.03073258511722088, + "learning_rate": 0.00019509878932445204, + "loss": 0.3382, + "step": 6027 + }, + { + "epoch": 0.4883344134802333, + "grad_norm": 0.02905021235346794, + "learning_rate": 0.00019509428867185743, + "loss": 0.3685, + "step": 6028 + }, + { + "epoch": 0.4884154244977317, + "grad_norm": 0.028884712606668472, + "learning_rate": 0.00019508978801926281, + "loss": 0.3323, + "step": 6029 + }, + { + "epoch": 0.4884964355152301, + "grad_norm": 0.032662998884916306, + "learning_rate": 0.00019508528736666817, + "loss": 0.3487, + "step": 6030 + }, + { + "epoch": 0.48857744653272844, + "grad_norm": 0.03282221034169197, + "learning_rate": 0.00019508078671407356, + "loss": 0.3565, + "step": 6031 + }, + { + "epoch": 0.48865845755022685, + "grad_norm": 0.03624732792377472, + "learning_rate": 0.00019507628606147892, + "loss": 0.371, + "step": 6032 + }, + { + "epoch": 0.4887394685677252, + "grad_norm": 0.037391260266304016, + "learning_rate": 0.00019507178540888428, + "loss": 0.3782, + "step": 6033 + }, + { + "epoch": 0.48882047958522357, + "grad_norm": 0.029727550223469734, + "learning_rate": 0.00019506728475628967, + "loss": 0.3644, + "step": 6034 + }, + { + "epoch": 0.488901490602722, + "grad_norm": 0.03331521898508072, + "learning_rate": 0.00019506278410369506, + "loss": 0.3554, + "step": 6035 + }, + { + "epoch": 0.48898250162022033, + "grad_norm": 0.030349144712090492, + "learning_rate": 0.00019505828345110042, + "loss": 0.3938, + "step": 6036 + }, + { + "epoch": 0.48906351263771874, + "grad_norm": 0.033913373947143555, + "learning_rate": 0.0001950537827985058, + "loss": 0.3725, + "step": 6037 + }, + { + "epoch": 0.4891445236552171, + "grad_norm": 0.031365133821964264, + "learning_rate": 0.00019504928214591116, + "loss": 0.3856, + "step": 6038 + }, + { + "epoch": 0.4892255346727155, + "grad_norm": 0.027343137189745903, + "learning_rate": 0.00019504478149331652, + "loss": 0.3286, + "step": 6039 + }, + { + "epoch": 
0.48930654569021387, + "grad_norm": 0.035865604877471924, + "learning_rate": 0.0001950402808407219, + "loss": 0.3906, + "step": 6040 + }, + { + "epoch": 0.4893875567077122, + "grad_norm": 0.030563244596123695, + "learning_rate": 0.0001950357801881273, + "loss": 0.3393, + "step": 6041 + }, + { + "epoch": 0.48946856772521063, + "grad_norm": 0.0307852104306221, + "learning_rate": 0.00019503127953553266, + "loss": 0.3799, + "step": 6042 + }, + { + "epoch": 0.489549578742709, + "grad_norm": 0.03347328305244446, + "learning_rate": 0.00019502677888293804, + "loss": 0.3705, + "step": 6043 + }, + { + "epoch": 0.4896305897602074, + "grad_norm": 0.03165578097105026, + "learning_rate": 0.0001950222782303434, + "loss": 0.3788, + "step": 6044 + }, + { + "epoch": 0.48971160077770576, + "grad_norm": 0.02846166305243969, + "learning_rate": 0.00019501777757774876, + "loss": 0.3469, + "step": 6045 + }, + { + "epoch": 0.48979261179520417, + "grad_norm": 0.03398086130619049, + "learning_rate": 0.00019501327692515415, + "loss": 0.4594, + "step": 6046 + }, + { + "epoch": 0.4898736228127025, + "grad_norm": 0.0365835502743721, + "learning_rate": 0.00019500877627255954, + "loss": 0.3812, + "step": 6047 + }, + { + "epoch": 0.4899546338302009, + "grad_norm": 0.02915862761437893, + "learning_rate": 0.0001950042756199649, + "loss": 0.3811, + "step": 6048 + }, + { + "epoch": 0.4900356448476993, + "grad_norm": 0.029644619673490524, + "learning_rate": 0.00019499977496737029, + "loss": 0.3548, + "step": 6049 + }, + { + "epoch": 0.49011665586519765, + "grad_norm": 0.030091730877757072, + "learning_rate": 0.00019499527431477565, + "loss": 0.3554, + "step": 6050 + }, + { + "epoch": 0.49019766688269606, + "grad_norm": 0.030911961570382118, + "learning_rate": 0.00019499077366218103, + "loss": 0.3683, + "step": 6051 + }, + { + "epoch": 0.4902786779001944, + "grad_norm": 0.031337589025497437, + "learning_rate": 0.0001949862730095864, + "loss": 0.3439, + "step": 6052 + }, + { + "epoch": 0.4903596889176928, + "grad_norm": 0.027552342042326927, + "learning_rate": 0.00019498177235699178, + "loss": 0.3247, + "step": 6053 + }, + { + "epoch": 0.4904406999351912, + "grad_norm": 0.0306177269667387, + "learning_rate": 0.00019497727170439714, + "loss": 0.3879, + "step": 6054 + }, + { + "epoch": 0.49052171095268954, + "grad_norm": 0.030022302642464638, + "learning_rate": 0.00019497277105180253, + "loss": 0.3767, + "step": 6055 + }, + { + "epoch": 0.49060272197018795, + "grad_norm": 0.03156706318259239, + "learning_rate": 0.0001949682703992079, + "loss": 0.3303, + "step": 6056 + }, + { + "epoch": 0.4906837329876863, + "grad_norm": 0.031678952276706696, + "learning_rate": 0.00019496376974661327, + "loss": 0.3664, + "step": 6057 + }, + { + "epoch": 0.4907647440051847, + "grad_norm": 0.029925771057605743, + "learning_rate": 0.00019495926909401864, + "loss": 0.3658, + "step": 6058 + }, + { + "epoch": 0.4908457550226831, + "grad_norm": 0.03150847181677818, + "learning_rate": 0.00019495476844142402, + "loss": 0.3788, + "step": 6059 + }, + { + "epoch": 0.4909267660401815, + "grad_norm": 0.03222969174385071, + "learning_rate": 0.00019495026778882938, + "loss": 0.4201, + "step": 6060 + }, + { + "epoch": 0.49100777705767984, + "grad_norm": 0.036666642874479294, + "learning_rate": 0.00019494576713623477, + "loss": 0.4088, + "step": 6061 + }, + { + "epoch": 0.49108878807517825, + "grad_norm": 0.031550489366054535, + "learning_rate": 0.00019494126648364013, + "loss": 0.3477, + "step": 6062 + }, + { + "epoch": 0.4911697990926766, + "grad_norm": 
0.02660016529262066, + "learning_rate": 0.00019493676583104552, + "loss": 0.3173, + "step": 6063 + }, + { + "epoch": 0.49125081011017496, + "grad_norm": 0.03121725469827652, + "learning_rate": 0.00019493226517845088, + "loss": 0.3218, + "step": 6064 + }, + { + "epoch": 0.4913318211276734, + "grad_norm": 0.03398605063557625, + "learning_rate": 0.00019492776452585626, + "loss": 0.4122, + "step": 6065 + }, + { + "epoch": 0.49141283214517173, + "grad_norm": 0.03151047229766846, + "learning_rate": 0.00019492326387326162, + "loss": 0.3821, + "step": 6066 + }, + { + "epoch": 0.49149384316267014, + "grad_norm": 0.03958398848772049, + "learning_rate": 0.000194918763220667, + "loss": 0.4201, + "step": 6067 + }, + { + "epoch": 0.4915748541801685, + "grad_norm": 0.03347938135266304, + "learning_rate": 0.00019491426256807237, + "loss": 0.3793, + "step": 6068 + }, + { + "epoch": 0.4916558651976669, + "grad_norm": 0.03312998265028, + "learning_rate": 0.00019490976191547776, + "loss": 0.3665, + "step": 6069 + }, + { + "epoch": 0.49173687621516526, + "grad_norm": 0.031087879091501236, + "learning_rate": 0.00019490526126288315, + "loss": 0.3767, + "step": 6070 + }, + { + "epoch": 0.4918178872326636, + "grad_norm": 0.03423725813627243, + "learning_rate": 0.0001949007606102885, + "loss": 0.4474, + "step": 6071 + }, + { + "epoch": 0.49189889825016203, + "grad_norm": 0.03192099183797836, + "learning_rate": 0.00019489625995769387, + "loss": 0.4102, + "step": 6072 + }, + { + "epoch": 0.4919799092676604, + "grad_norm": 0.03476814553141594, + "learning_rate": 0.00019489175930509925, + "loss": 0.414, + "step": 6073 + }, + { + "epoch": 0.4920609202851588, + "grad_norm": 0.03009425476193428, + "learning_rate": 0.0001948872586525046, + "loss": 0.3568, + "step": 6074 + }, + { + "epoch": 0.49214193130265715, + "grad_norm": 0.02842790260910988, + "learning_rate": 0.00019488275799991, + "loss": 0.344, + "step": 6075 + }, + { + "epoch": 0.49222294232015557, + "grad_norm": 0.03067297860980034, + "learning_rate": 0.0001948782573473154, + "loss": 0.3728, + "step": 6076 + }, + { + "epoch": 0.4923039533376539, + "grad_norm": 0.030920876190066338, + "learning_rate": 0.00019487375669472075, + "loss": 0.305, + "step": 6077 + }, + { + "epoch": 0.4923849643551523, + "grad_norm": 0.02858106978237629, + "learning_rate": 0.0001948692560421261, + "loss": 0.3724, + "step": 6078 + }, + { + "epoch": 0.4924659753726507, + "grad_norm": 0.03146613389253616, + "learning_rate": 0.0001948647553895315, + "loss": 0.3369, + "step": 6079 + }, + { + "epoch": 0.49254698639014904, + "grad_norm": 0.03171651437878609, + "learning_rate": 0.00019486025473693688, + "loss": 0.3992, + "step": 6080 + }, + { + "epoch": 0.49262799740764746, + "grad_norm": 0.027577511966228485, + "learning_rate": 0.00019485575408434224, + "loss": 0.3301, + "step": 6081 + }, + { + "epoch": 0.4927090084251458, + "grad_norm": 0.0313507542014122, + "learning_rate": 0.00019485125343174763, + "loss": 0.369, + "step": 6082 + }, + { + "epoch": 0.4927900194426442, + "grad_norm": 0.0450977124273777, + "learning_rate": 0.000194846752779153, + "loss": 0.3491, + "step": 6083 + }, + { + "epoch": 0.4928710304601426, + "grad_norm": 0.029169466346502304, + "learning_rate": 0.00019484225212655835, + "loss": 0.3506, + "step": 6084 + }, + { + "epoch": 0.49295204147764093, + "grad_norm": 0.026984231546521187, + "learning_rate": 0.00019483775147396374, + "loss": 0.3564, + "step": 6085 + }, + { + "epoch": 0.49303305249513935, + "grad_norm": 0.027847252786159515, + "learning_rate": 
0.00019483325082136912, + "loss": 0.3293, + "step": 6086 + }, + { + "epoch": 0.4931140635126377, + "grad_norm": 0.03478049486875534, + "learning_rate": 0.00019482875016877448, + "loss": 0.4064, + "step": 6087 + }, + { + "epoch": 0.4931950745301361, + "grad_norm": 0.032629888504743576, + "learning_rate": 0.00019482424951617987, + "loss": 0.3692, + "step": 6088 + }, + { + "epoch": 0.49327608554763447, + "grad_norm": 0.032630983740091324, + "learning_rate": 0.00019481974886358523, + "loss": 0.342, + "step": 6089 + }, + { + "epoch": 0.4933570965651329, + "grad_norm": 0.030905557796359062, + "learning_rate": 0.0001948152482109906, + "loss": 0.3336, + "step": 6090 + }, + { + "epoch": 0.49343810758263124, + "grad_norm": 0.03159333020448685, + "learning_rate": 0.00019481074755839598, + "loss": 0.3174, + "step": 6091 + }, + { + "epoch": 0.4935191186001296, + "grad_norm": 0.035348713397979736, + "learning_rate": 0.00019480624690580136, + "loss": 0.3726, + "step": 6092 + }, + { + "epoch": 0.493600129617628, + "grad_norm": 0.029344551265239716, + "learning_rate": 0.00019480174625320672, + "loss": 0.3581, + "step": 6093 + }, + { + "epoch": 0.49368114063512636, + "grad_norm": 0.033776115626096725, + "learning_rate": 0.0001947972456006121, + "loss": 0.3734, + "step": 6094 + }, + { + "epoch": 0.49376215165262477, + "grad_norm": 0.032239265739917755, + "learning_rate": 0.00019479274494801747, + "loss": 0.3807, + "step": 6095 + }, + { + "epoch": 0.4938431626701231, + "grad_norm": 0.035125527530908585, + "learning_rate": 0.00019478824429542283, + "loss": 0.3738, + "step": 6096 + }, + { + "epoch": 0.49392417368762154, + "grad_norm": 0.028467318043112755, + "learning_rate": 0.00019478374364282822, + "loss": 0.3764, + "step": 6097 + }, + { + "epoch": 0.4940051847051199, + "grad_norm": 0.029696613550186157, + "learning_rate": 0.0001947792429902336, + "loss": 0.3522, + "step": 6098 + }, + { + "epoch": 0.49408619572261825, + "grad_norm": 0.035356346517801285, + "learning_rate": 0.00019477474233763897, + "loss": 0.3416, + "step": 6099 + }, + { + "epoch": 0.49416720674011666, + "grad_norm": 0.03714694082736969, + "learning_rate": 0.00019477024168504435, + "loss": 0.3707, + "step": 6100 + }, + { + "epoch": 0.494248217757615, + "grad_norm": 0.027115581557154655, + "learning_rate": 0.0001947657410324497, + "loss": 0.3327, + "step": 6101 + }, + { + "epoch": 0.4943292287751134, + "grad_norm": 0.033466219902038574, + "learning_rate": 0.00019476124037985507, + "loss": 0.3518, + "step": 6102 + }, + { + "epoch": 0.4944102397926118, + "grad_norm": 0.030489971861243248, + "learning_rate": 0.00019475673972726046, + "loss": 0.3641, + "step": 6103 + }, + { + "epoch": 0.4944912508101102, + "grad_norm": 0.03086225688457489, + "learning_rate": 0.00019475223907466585, + "loss": 0.3086, + "step": 6104 + }, + { + "epoch": 0.49457226182760855, + "grad_norm": 0.033797189593315125, + "learning_rate": 0.0001947477384220712, + "loss": 0.3786, + "step": 6105 + }, + { + "epoch": 0.49465327284510696, + "grad_norm": 0.028506649658083916, + "learning_rate": 0.0001947432377694766, + "loss": 0.3215, + "step": 6106 + }, + { + "epoch": 0.4947342838626053, + "grad_norm": 0.029441583901643753, + "learning_rate": 0.00019473873711688196, + "loss": 0.3412, + "step": 6107 + }, + { + "epoch": 0.4948152948801037, + "grad_norm": 0.029824761673808098, + "learning_rate": 0.00019473423646428732, + "loss": 0.391, + "step": 6108 + }, + { + "epoch": 0.4948963058976021, + "grad_norm": 0.031150689348578453, + "learning_rate": 0.0001947297358116927, + "loss": 0.3726, 
+ "step": 6109 + }, + { + "epoch": 0.49497731691510044, + "grad_norm": 0.03557276353240013, + "learning_rate": 0.0001947252351590981, + "loss": 0.3477, + "step": 6110 + }, + { + "epoch": 0.49505832793259885, + "grad_norm": 0.03375493362545967, + "learning_rate": 0.00019472073450650345, + "loss": 0.4136, + "step": 6111 + }, + { + "epoch": 0.4951393389500972, + "grad_norm": 0.030482076108455658, + "learning_rate": 0.00019471623385390884, + "loss": 0.3726, + "step": 6112 + }, + { + "epoch": 0.4952203499675956, + "grad_norm": 0.03187844157218933, + "learning_rate": 0.0001947117332013142, + "loss": 0.4042, + "step": 6113 + }, + { + "epoch": 0.495301360985094, + "grad_norm": 0.0333397351205349, + "learning_rate": 0.00019470723254871956, + "loss": 0.3641, + "step": 6114 + }, + { + "epoch": 0.49538237200259233, + "grad_norm": 0.033241838216781616, + "learning_rate": 0.00019470273189612494, + "loss": 0.36, + "step": 6115 + }, + { + "epoch": 0.49546338302009074, + "grad_norm": 0.03735223412513733, + "learning_rate": 0.00019469823124353033, + "loss": 0.3981, + "step": 6116 + }, + { + "epoch": 0.4955443940375891, + "grad_norm": 0.0269224364310503, + "learning_rate": 0.0001946937305909357, + "loss": 0.2883, + "step": 6117 + }, + { + "epoch": 0.4956254050550875, + "grad_norm": 0.029367348179221153, + "learning_rate": 0.00019468922993834108, + "loss": 0.3345, + "step": 6118 + }, + { + "epoch": 0.49570641607258586, + "grad_norm": 0.02894587628543377, + "learning_rate": 0.00019468472928574644, + "loss": 0.326, + "step": 6119 + }, + { + "epoch": 0.4957874270900843, + "grad_norm": 0.029514474794268608, + "learning_rate": 0.0001946802286331518, + "loss": 0.3831, + "step": 6120 + }, + { + "epoch": 0.49586843810758263, + "grad_norm": 0.03847199305891991, + "learning_rate": 0.00019467572798055719, + "loss": 0.3559, + "step": 6121 + }, + { + "epoch": 0.495949449125081, + "grad_norm": 0.03183659166097641, + "learning_rate": 0.00019467122732796257, + "loss": 0.3779, + "step": 6122 + }, + { + "epoch": 0.4960304601425794, + "grad_norm": 0.031036654487252235, + "learning_rate": 0.00019466672667536793, + "loss": 0.3158, + "step": 6123 + }, + { + "epoch": 0.49611147116007775, + "grad_norm": 0.034427400678396225, + "learning_rate": 0.00019466222602277332, + "loss": 0.3801, + "step": 6124 + }, + { + "epoch": 0.49619248217757617, + "grad_norm": 0.03311762586236, + "learning_rate": 0.00019465772537017868, + "loss": 0.3867, + "step": 6125 + }, + { + "epoch": 0.4962734931950745, + "grad_norm": 0.03220412880182266, + "learning_rate": 0.00019465322471758404, + "loss": 0.3672, + "step": 6126 + }, + { + "epoch": 0.49635450421257293, + "grad_norm": 0.03264900669455528, + "learning_rate": 0.00019464872406498943, + "loss": 0.3424, + "step": 6127 + }, + { + "epoch": 0.4964355152300713, + "grad_norm": 0.033384453505277634, + "learning_rate": 0.00019464422341239481, + "loss": 0.3283, + "step": 6128 + }, + { + "epoch": 0.49651652624756965, + "grad_norm": 0.029015664011240005, + "learning_rate": 0.00019463972275980017, + "loss": 0.3281, + "step": 6129 + }, + { + "epoch": 0.49659753726506806, + "grad_norm": 0.033148493617773056, + "learning_rate": 0.00019463522210720556, + "loss": 0.3696, + "step": 6130 + }, + { + "epoch": 0.4966785482825664, + "grad_norm": 0.02691441774368286, + "learning_rate": 0.00019463072145461092, + "loss": 0.3583, + "step": 6131 + }, + { + "epoch": 0.4967595593000648, + "grad_norm": 0.03129500895738602, + "learning_rate": 0.0001946262208020163, + "loss": 0.3397, + "step": 6132 + }, + { + "epoch": 
0.4968405703175632, + "grad_norm": 0.028962457552552223, + "learning_rate": 0.0001946217201494217, + "loss": 0.3609, + "step": 6133 + }, + { + "epoch": 0.4969215813350616, + "grad_norm": 0.030767876654863358, + "learning_rate": 0.00019461721949682706, + "loss": 0.3421, + "step": 6134 + }, + { + "epoch": 0.49700259235255995, + "grad_norm": 0.03145063295960426, + "learning_rate": 0.00019461271884423242, + "loss": 0.387, + "step": 6135 + }, + { + "epoch": 0.4970836033700583, + "grad_norm": 0.033778853714466095, + "learning_rate": 0.0001946082181916378, + "loss": 0.3792, + "step": 6136 + }, + { + "epoch": 0.4971646143875567, + "grad_norm": 0.028447460383176804, + "learning_rate": 0.00019460371753904316, + "loss": 0.3694, + "step": 6137 + }, + { + "epoch": 0.49724562540505507, + "grad_norm": 0.027313074097037315, + "learning_rate": 0.00019459921688644855, + "loss": 0.334, + "step": 6138 + }, + { + "epoch": 0.4973266364225535, + "grad_norm": 0.027841804549098015, + "learning_rate": 0.00019459471623385394, + "loss": 0.3851, + "step": 6139 + }, + { + "epoch": 0.49740764744005184, + "grad_norm": 0.03338708356022835, + "learning_rate": 0.0001945902155812593, + "loss": 0.3966, + "step": 6140 + }, + { + "epoch": 0.49748865845755025, + "grad_norm": 0.031054025515913963, + "learning_rate": 0.00019458571492866466, + "loss": 0.3282, + "step": 6141 + }, + { + "epoch": 0.4975696694750486, + "grad_norm": 0.031216377392411232, + "learning_rate": 0.00019458121427607004, + "loss": 0.3998, + "step": 6142 + }, + { + "epoch": 0.49765068049254696, + "grad_norm": 0.03344246372580528, + "learning_rate": 0.0001945767136234754, + "loss": 0.3644, + "step": 6143 + }, + { + "epoch": 0.49773169151004537, + "grad_norm": 0.029684685170650482, + "learning_rate": 0.0001945722129708808, + "loss": 0.3209, + "step": 6144 + }, + { + "epoch": 0.4978127025275437, + "grad_norm": 0.02766270935535431, + "learning_rate": 0.00019456771231828618, + "loss": 0.3444, + "step": 6145 + }, + { + "epoch": 0.49789371354504214, + "grad_norm": 0.032080575823783875, + "learning_rate": 0.00019456321166569154, + "loss": 0.3416, + "step": 6146 + }, + { + "epoch": 0.4979747245625405, + "grad_norm": 0.030261723324656487, + "learning_rate": 0.0001945587110130969, + "loss": 0.3732, + "step": 6147 + }, + { + "epoch": 0.4980557355800389, + "grad_norm": 0.03317071497440338, + "learning_rate": 0.00019455421036050229, + "loss": 0.3109, + "step": 6148 + }, + { + "epoch": 0.49813674659753726, + "grad_norm": 0.03357579931616783, + "learning_rate": 0.00019454970970790765, + "loss": 0.3439, + "step": 6149 + }, + { + "epoch": 0.4982177576150357, + "grad_norm": 0.03235941380262375, + "learning_rate": 0.00019454520905531303, + "loss": 0.3818, + "step": 6150 + }, + { + "epoch": 0.49829876863253403, + "grad_norm": 0.035714928060770035, + "learning_rate": 0.00019454070840271842, + "loss": 0.3561, + "step": 6151 + }, + { + "epoch": 0.4983797796500324, + "grad_norm": 0.02831859700381756, + "learning_rate": 0.00019453620775012378, + "loss": 0.3232, + "step": 6152 + }, + { + "epoch": 0.4984607906675308, + "grad_norm": 0.03555682301521301, + "learning_rate": 0.00019453170709752914, + "loss": 0.3169, + "step": 6153 + }, + { + "epoch": 0.49854180168502915, + "grad_norm": 0.03351519629359245, + "learning_rate": 0.00019452720644493453, + "loss": 0.3638, + "step": 6154 + }, + { + "epoch": 0.49862281270252756, + "grad_norm": 0.03014187328517437, + "learning_rate": 0.0001945227057923399, + "loss": 0.3477, + "step": 6155 + }, + { + "epoch": 0.4987038237200259, + "grad_norm": 
0.0321093387901783, + "learning_rate": 0.00019451820513974528, + "loss": 0.3541, + "step": 6156 + }, + { + "epoch": 0.49878483473752433, + "grad_norm": 0.04117025434970856, + "learning_rate": 0.00019451370448715066, + "loss": 0.3738, + "step": 6157 + }, + { + "epoch": 0.4988658457550227, + "grad_norm": 0.029803169891238213, + "learning_rate": 0.00019450920383455602, + "loss": 0.3473, + "step": 6158 + }, + { + "epoch": 0.49894685677252104, + "grad_norm": 0.030828144401311874, + "learning_rate": 0.00019450470318196138, + "loss": 0.3796, + "step": 6159 + }, + { + "epoch": 0.49902786779001945, + "grad_norm": 0.03853604570031166, + "learning_rate": 0.00019450020252936677, + "loss": 0.37, + "step": 6160 + }, + { + "epoch": 0.4991088788075178, + "grad_norm": 0.03002386912703514, + "learning_rate": 0.00019449570187677216, + "loss": 0.347, + "step": 6161 + }, + { + "epoch": 0.4991898898250162, + "grad_norm": 0.0349125862121582, + "learning_rate": 0.00019449120122417752, + "loss": 0.4022, + "step": 6162 + }, + { + "epoch": 0.4992709008425146, + "grad_norm": 0.03078773058950901, + "learning_rate": 0.0001944867005715829, + "loss": 0.3651, + "step": 6163 + }, + { + "epoch": 0.499351911860013, + "grad_norm": 0.03070513904094696, + "learning_rate": 0.00019448219991898826, + "loss": 0.3402, + "step": 6164 + }, + { + "epoch": 0.49943292287751134, + "grad_norm": 0.03130587190389633, + "learning_rate": 0.00019447769926639362, + "loss": 0.3446, + "step": 6165 + }, + { + "epoch": 0.4995139338950097, + "grad_norm": 0.028303734958171844, + "learning_rate": 0.000194473198613799, + "loss": 0.3357, + "step": 6166 + }, + { + "epoch": 0.4995949449125081, + "grad_norm": 0.030292222276329994, + "learning_rate": 0.0001944686979612044, + "loss": 0.3322, + "step": 6167 + }, + { + "epoch": 0.49967595593000647, + "grad_norm": 0.03299104794859886, + "learning_rate": 0.00019446419730860976, + "loss": 0.3517, + "step": 6168 + }, + { + "epoch": 0.4997569669475049, + "grad_norm": 0.031113620847463608, + "learning_rate": 0.00019445969665601515, + "loss": 0.3741, + "step": 6169 + }, + { + "epoch": 0.49983797796500323, + "grad_norm": 0.031416017562150955, + "learning_rate": 0.0001944551960034205, + "loss": 0.3427, + "step": 6170 + }, + { + "epoch": 0.49991898898250164, + "grad_norm": 0.02874842658638954, + "learning_rate": 0.00019445069535082587, + "loss": 0.3353, + "step": 6171 + }, + { + "epoch": 0.5, + "grad_norm": 0.028396783396601677, + "learning_rate": 0.00019444619469823125, + "loss": 0.3563, + "step": 6172 + }, + { + "epoch": 0.5000810110174984, + "grad_norm": 0.029108745977282524, + "learning_rate": 0.00019444169404563664, + "loss": 0.3556, + "step": 6173 + }, + { + "epoch": 0.5001620220349967, + "grad_norm": 0.03376203402876854, + "learning_rate": 0.000194437193393042, + "loss": 0.3347, + "step": 6174 + }, + { + "epoch": 0.5002430330524952, + "grad_norm": 0.03200310841202736, + "learning_rate": 0.0001944326927404474, + "loss": 0.3773, + "step": 6175 + }, + { + "epoch": 0.5003240440699935, + "grad_norm": 0.03190843015909195, + "learning_rate": 0.00019442819208785275, + "loss": 0.3494, + "step": 6176 + }, + { + "epoch": 0.5004050550874919, + "grad_norm": 0.037339869886636734, + "learning_rate": 0.0001944236914352581, + "loss": 0.3953, + "step": 6177 + }, + { + "epoch": 0.5004860661049902, + "grad_norm": 0.028407022356987, + "learning_rate": 0.0001944191907826635, + "loss": 0.3594, + "step": 6178 + }, + { + "epoch": 0.5005670771224887, + "grad_norm": 0.02792937681078911, + "learning_rate": 0.00019441469013006888, + "loss": 
0.3103, + "step": 6179 + }, + { + "epoch": 0.5006480881399871, + "grad_norm": 0.03023194707930088, + "learning_rate": 0.00019441018947747424, + "loss": 0.3372, + "step": 6180 + }, + { + "epoch": 0.5007290991574854, + "grad_norm": 0.030646204948425293, + "learning_rate": 0.00019440568882487963, + "loss": 0.3651, + "step": 6181 + }, + { + "epoch": 0.5008101101749838, + "grad_norm": 0.03154953196644783, + "learning_rate": 0.000194401188172285, + "loss": 0.3742, + "step": 6182 + }, + { + "epoch": 0.5008911211924821, + "grad_norm": 0.03270383179187775, + "learning_rate": 0.00019439668751969035, + "loss": 0.3629, + "step": 6183 + }, + { + "epoch": 0.5009721322099806, + "grad_norm": 0.029822640120983124, + "learning_rate": 0.00019439218686709574, + "loss": 0.3848, + "step": 6184 + }, + { + "epoch": 0.501053143227479, + "grad_norm": 0.02989530935883522, + "learning_rate": 0.00019438768621450112, + "loss": 0.3755, + "step": 6185 + }, + { + "epoch": 0.5011341542449773, + "grad_norm": 0.029338372871279716, + "learning_rate": 0.00019438318556190648, + "loss": 0.3114, + "step": 6186 + }, + { + "epoch": 0.5012151652624757, + "grad_norm": 0.029134294018149376, + "learning_rate": 0.00019437868490931187, + "loss": 0.3457, + "step": 6187 + }, + { + "epoch": 0.501296176279974, + "grad_norm": 0.04486939311027527, + "learning_rate": 0.00019437418425671723, + "loss": 0.3917, + "step": 6188 + }, + { + "epoch": 0.5013771872974725, + "grad_norm": 0.03448181971907616, + "learning_rate": 0.0001943696836041226, + "loss": 0.3609, + "step": 6189 + }, + { + "epoch": 0.5014581983149708, + "grad_norm": 0.030938809737563133, + "learning_rate": 0.00019436518295152798, + "loss": 0.3559, + "step": 6190 + }, + { + "epoch": 0.5015392093324692, + "grad_norm": 0.030646713450551033, + "learning_rate": 0.00019436068229893336, + "loss": 0.3587, + "step": 6191 + }, + { + "epoch": 0.5016202203499676, + "grad_norm": 0.03244081139564514, + "learning_rate": 0.00019435618164633872, + "loss": 0.3761, + "step": 6192 + }, + { + "epoch": 0.501701231367466, + "grad_norm": 0.035014357417821884, + "learning_rate": 0.0001943516809937441, + "loss": 0.3652, + "step": 6193 + }, + { + "epoch": 0.5017822423849644, + "grad_norm": 0.031360477209091187, + "learning_rate": 0.00019434718034114947, + "loss": 0.3517, + "step": 6194 + }, + { + "epoch": 0.5018632534024627, + "grad_norm": 0.028555797412991524, + "learning_rate": 0.00019434267968855483, + "loss": 0.363, + "step": 6195 + }, + { + "epoch": 0.5019442644199611, + "grad_norm": 0.03647736459970474, + "learning_rate": 0.00019433817903596022, + "loss": 0.3547, + "step": 6196 + }, + { + "epoch": 0.5020252754374595, + "grad_norm": 0.03341953083872795, + "learning_rate": 0.0001943336783833656, + "loss": 0.4141, + "step": 6197 + }, + { + "epoch": 0.5021062864549579, + "grad_norm": 0.027791619300842285, + "learning_rate": 0.00019432917773077097, + "loss": 0.3209, + "step": 6198 + }, + { + "epoch": 0.5021872974724563, + "grad_norm": 0.03126957640051842, + "learning_rate": 0.00019432467707817635, + "loss": 0.3297, + "step": 6199 + }, + { + "epoch": 0.5022683084899546, + "grad_norm": 0.028604375198483467, + "learning_rate": 0.00019432017642558171, + "loss": 0.3534, + "step": 6200 + }, + { + "epoch": 0.502349319507453, + "grad_norm": 0.03173546493053436, + "learning_rate": 0.00019431567577298707, + "loss": 0.377, + "step": 6201 + }, + { + "epoch": 0.5024303305249513, + "grad_norm": 0.030360376462340355, + "learning_rate": 0.0001943111751203925, + "loss": 0.3215, + "step": 6202 + }, + { + "epoch": 
0.5025113415424498, + "grad_norm": 0.030665067955851555, + "learning_rate": 0.00019430667446779785, + "loss": 0.323, + "step": 6203 + }, + { + "epoch": 0.5025923525599482, + "grad_norm": 0.02950088307261467, + "learning_rate": 0.0001943021738152032, + "loss": 0.3481, + "step": 6204 + }, + { + "epoch": 0.5026733635774465, + "grad_norm": 0.026497744023799896, + "learning_rate": 0.0001942976731626086, + "loss": 0.338, + "step": 6205 + }, + { + "epoch": 0.5027543745949449, + "grad_norm": 0.032492347061634064, + "learning_rate": 0.00019429317251001396, + "loss": 0.3416, + "step": 6206 + }, + { + "epoch": 0.5028353856124433, + "grad_norm": 0.0336732417345047, + "learning_rate": 0.00019428867185741932, + "loss": 0.3219, + "step": 6207 + }, + { + "epoch": 0.5029163966299417, + "grad_norm": 0.029335761442780495, + "learning_rate": 0.00019428417120482473, + "loss": 0.2768, + "step": 6208 + }, + { + "epoch": 0.50299740764744, + "grad_norm": 0.03476308658719063, + "learning_rate": 0.0001942796705522301, + "loss": 0.3831, + "step": 6209 + }, + { + "epoch": 0.5030784186649384, + "grad_norm": 0.03170355036854744, + "learning_rate": 0.00019427516989963545, + "loss": 0.3257, + "step": 6210 + }, + { + "epoch": 0.5031594296824368, + "grad_norm": 0.035959310829639435, + "learning_rate": 0.00019427066924704084, + "loss": 0.3323, + "step": 6211 + }, + { + "epoch": 0.5032404406999352, + "grad_norm": 0.029416095465421677, + "learning_rate": 0.0001942661685944462, + "loss": 0.3845, + "step": 6212 + }, + { + "epoch": 0.5033214517174336, + "grad_norm": 0.03393389657139778, + "learning_rate": 0.00019426166794185158, + "loss": 0.3568, + "step": 6213 + }, + { + "epoch": 0.5034024627349319, + "grad_norm": 0.032064225524663925, + "learning_rate": 0.00019425716728925697, + "loss": 0.4063, + "step": 6214 + }, + { + "epoch": 0.5034834737524303, + "grad_norm": 0.03695201501250267, + "learning_rate": 0.00019425266663666233, + "loss": 0.3833, + "step": 6215 + }, + { + "epoch": 0.5035644847699287, + "grad_norm": 0.027572009712457657, + "learning_rate": 0.0001942481659840677, + "loss": 0.3293, + "step": 6216 + }, + { + "epoch": 0.5036454957874271, + "grad_norm": 0.03562445938587189, + "learning_rate": 0.00019424366533147308, + "loss": 0.3812, + "step": 6217 + }, + { + "epoch": 0.5037265068049255, + "grad_norm": 0.03267737105488777, + "learning_rate": 0.00019423916467887844, + "loss": 0.4063, + "step": 6218 + }, + { + "epoch": 0.5038075178224238, + "grad_norm": 0.03409668803215027, + "learning_rate": 0.00019423466402628383, + "loss": 0.4145, + "step": 6219 + }, + { + "epoch": 0.5038885288399222, + "grad_norm": 0.02712850458920002, + "learning_rate": 0.0001942301633736892, + "loss": 0.3519, + "step": 6220 + }, + { + "epoch": 0.5039695398574207, + "grad_norm": 0.03325193002820015, + "learning_rate": 0.00019422566272109457, + "loss": 0.3376, + "step": 6221 + }, + { + "epoch": 0.504050550874919, + "grad_norm": 0.031705718487501144, + "learning_rate": 0.00019422116206849993, + "loss": 0.3336, + "step": 6222 + }, + { + "epoch": 0.5041315618924174, + "grad_norm": 0.02508116513490677, + "learning_rate": 0.00019421666141590532, + "loss": 0.3163, + "step": 6223 + }, + { + "epoch": 0.5042125729099157, + "grad_norm": 0.03339695557951927, + "learning_rate": 0.00019421216076331068, + "loss": 0.4402, + "step": 6224 + }, + { + "epoch": 0.5042935839274141, + "grad_norm": 0.028711862862110138, + "learning_rate": 0.00019420766011071607, + "loss": 0.337, + "step": 6225 + }, + { + "epoch": 0.5043745949449125, + "grad_norm": 0.030767934396862984, + 
"learning_rate": 0.00019420315945812145, + "loss": 0.365, + "step": 6226 + }, + { + "epoch": 0.5044556059624109, + "grad_norm": 0.030805392190814018, + "learning_rate": 0.00019419865880552681, + "loss": 0.3305, + "step": 6227 + }, + { + "epoch": 0.5045366169799093, + "grad_norm": 0.029501011595129967, + "learning_rate": 0.00019419415815293217, + "loss": 0.3484, + "step": 6228 + }, + { + "epoch": 0.5046176279974076, + "grad_norm": 0.027413569390773773, + "learning_rate": 0.00019418965750033756, + "loss": 0.3342, + "step": 6229 + }, + { + "epoch": 0.5046986390149061, + "grad_norm": 0.031460799276828766, + "learning_rate": 0.00019418515684774292, + "loss": 0.3617, + "step": 6230 + }, + { + "epoch": 0.5047796500324044, + "grad_norm": 0.031566593796014786, + "learning_rate": 0.0001941806561951483, + "loss": 0.3821, + "step": 6231 + }, + { + "epoch": 0.5048606610499028, + "grad_norm": 0.029631730169057846, + "learning_rate": 0.0001941761555425537, + "loss": 0.3189, + "step": 6232 + }, + { + "epoch": 0.5049416720674011, + "grad_norm": 0.036956410855054855, + "learning_rate": 0.00019417165488995906, + "loss": 0.3916, + "step": 6233 + }, + { + "epoch": 0.5050226830848995, + "grad_norm": 0.029559465125203133, + "learning_rate": 0.00019416715423736442, + "loss": 0.3444, + "step": 6234 + }, + { + "epoch": 0.505103694102398, + "grad_norm": 0.02766140177845955, + "learning_rate": 0.0001941626535847698, + "loss": 0.3127, + "step": 6235 + }, + { + "epoch": 0.5051847051198963, + "grad_norm": 0.030541831627488136, + "learning_rate": 0.0001941581529321752, + "loss": 0.3341, + "step": 6236 + }, + { + "epoch": 0.5052657161373947, + "grad_norm": 0.029187265783548355, + "learning_rate": 0.00019415365227958055, + "loss": 0.3098, + "step": 6237 + }, + { + "epoch": 0.505346727154893, + "grad_norm": 0.03717184439301491, + "learning_rate": 0.00019414915162698594, + "loss": 0.3742, + "step": 6238 + }, + { + "epoch": 0.5054277381723914, + "grad_norm": 0.03266545757651329, + "learning_rate": 0.0001941446509743913, + "loss": 0.3678, + "step": 6239 + }, + { + "epoch": 0.5055087491898899, + "grad_norm": 0.03215375915169716, + "learning_rate": 0.00019414015032179666, + "loss": 0.3236, + "step": 6240 + }, + { + "epoch": 0.5055897602073882, + "grad_norm": 0.02944401279091835, + "learning_rate": 0.00019413564966920205, + "loss": 0.3693, + "step": 6241 + }, + { + "epoch": 0.5056707712248866, + "grad_norm": 0.030675627291202545, + "learning_rate": 0.00019413114901660743, + "loss": 0.3741, + "step": 6242 + }, + { + "epoch": 0.5057517822423849, + "grad_norm": 0.035757292062044144, + "learning_rate": 0.0001941266483640128, + "loss": 0.3646, + "step": 6243 + }, + { + "epoch": 0.5058327932598834, + "grad_norm": 0.032462820410728455, + "learning_rate": 0.00019412214771141818, + "loss": 0.3472, + "step": 6244 + }, + { + "epoch": 0.5059138042773818, + "grad_norm": 0.03389735147356987, + "learning_rate": 0.00019411764705882354, + "loss": 0.3818, + "step": 6245 + }, + { + "epoch": 0.5059948152948801, + "grad_norm": 0.028163554146885872, + "learning_rate": 0.0001941131464062289, + "loss": 0.3447, + "step": 6246 + }, + { + "epoch": 0.5060758263123785, + "grad_norm": 0.030725527554750443, + "learning_rate": 0.0001941086457536343, + "loss": 0.2915, + "step": 6247 + }, + { + "epoch": 0.5061568373298768, + "grad_norm": 0.030973976477980614, + "learning_rate": 0.00019410414510103967, + "loss": 0.3482, + "step": 6248 + }, + { + "epoch": 0.5062378483473753, + "grad_norm": 0.030928470194339752, + "learning_rate": 0.00019409964444844503, + "loss": 
0.3567, + "step": 6249 + }, + { + "epoch": 0.5063188593648736, + "grad_norm": 0.030242808163166046, + "learning_rate": 0.00019409514379585042, + "loss": 0.3557, + "step": 6250 + }, + { + "epoch": 0.506399870382372, + "grad_norm": 0.02889467217028141, + "learning_rate": 0.00019409064314325578, + "loss": 0.3522, + "step": 6251 + }, + { + "epoch": 0.5064808813998704, + "grad_norm": 0.030975107103586197, + "learning_rate": 0.00019408614249066114, + "loss": 0.3302, + "step": 6252 + }, + { + "epoch": 0.5065618924173687, + "grad_norm": 0.03164440020918846, + "learning_rate": 0.00019408164183806653, + "loss": 0.3643, + "step": 6253 + }, + { + "epoch": 0.5066429034348672, + "grad_norm": 0.03547287359833717, + "learning_rate": 0.00019407714118547192, + "loss": 0.3688, + "step": 6254 + }, + { + "epoch": 0.5067239144523655, + "grad_norm": 0.031200233846902847, + "learning_rate": 0.00019407264053287728, + "loss": 0.3747, + "step": 6255 + }, + { + "epoch": 0.5068049254698639, + "grad_norm": 0.028289861977100372, + "learning_rate": 0.00019406813988028266, + "loss": 0.3329, + "step": 6256 + }, + { + "epoch": 0.5068859364873622, + "grad_norm": 0.03076610527932644, + "learning_rate": 0.00019406363922768802, + "loss": 0.3163, + "step": 6257 + }, + { + "epoch": 0.5069669475048607, + "grad_norm": 0.03446902707219124, + "learning_rate": 0.00019405913857509338, + "loss": 0.3595, + "step": 6258 + }, + { + "epoch": 0.5070479585223591, + "grad_norm": 0.03079809434711933, + "learning_rate": 0.00019405463792249877, + "loss": 0.3493, + "step": 6259 + }, + { + "epoch": 0.5071289695398574, + "grad_norm": 0.03474215790629387, + "learning_rate": 0.00019405013726990416, + "loss": 0.4053, + "step": 6260 + }, + { + "epoch": 0.5072099805573558, + "grad_norm": 0.04012686014175415, + "learning_rate": 0.00019404563661730952, + "loss": 0.3085, + "step": 6261 + }, + { + "epoch": 0.5072909915748541, + "grad_norm": 0.029809720814228058, + "learning_rate": 0.0001940411359647149, + "loss": 0.3449, + "step": 6262 + }, + { + "epoch": 0.5073720025923526, + "grad_norm": 0.028845548629760742, + "learning_rate": 0.00019403663531212026, + "loss": 0.3286, + "step": 6263 + }, + { + "epoch": 0.507453013609851, + "grad_norm": 0.03084278292953968, + "learning_rate": 0.00019403213465952562, + "loss": 0.3731, + "step": 6264 + }, + { + "epoch": 0.5075340246273493, + "grad_norm": 0.0335809662938118, + "learning_rate": 0.000194027634006931, + "loss": 0.3481, + "step": 6265 + }, + { + "epoch": 0.5076150356448477, + "grad_norm": 0.04117157682776451, + "learning_rate": 0.0001940231333543364, + "loss": 0.3492, + "step": 6266 + }, + { + "epoch": 0.5076960466623461, + "grad_norm": 0.034561336040496826, + "learning_rate": 0.00019401863270174176, + "loss": 0.3986, + "step": 6267 + }, + { + "epoch": 0.5077770576798445, + "grad_norm": 0.033314213156700134, + "learning_rate": 0.00019401413204914715, + "loss": 0.412, + "step": 6268 + }, + { + "epoch": 0.5078580686973428, + "grad_norm": 0.029534365981817245, + "learning_rate": 0.0001940096313965525, + "loss": 0.3524, + "step": 6269 + }, + { + "epoch": 0.5079390797148412, + "grad_norm": 0.032147347927093506, + "learning_rate": 0.00019400513074395787, + "loss": 0.4007, + "step": 6270 + }, + { + "epoch": 0.5080200907323396, + "grad_norm": 0.02948775887489319, + "learning_rate": 0.00019400063009136328, + "loss": 0.3446, + "step": 6271 + }, + { + "epoch": 0.508101101749838, + "grad_norm": 0.03933773934841156, + "learning_rate": 0.00019399612943876864, + "loss": 0.3851, + "step": 6272 + }, + { + "epoch": 
0.5081821127673364, + "grad_norm": 0.02769440785050392, + "learning_rate": 0.000193991628786174, + "loss": 0.3081, + "step": 6273 + }, + { + "epoch": 0.5082631237848347, + "grad_norm": 0.03911726921796799, + "learning_rate": 0.0001939871281335794, + "loss": 0.3732, + "step": 6274 + }, + { + "epoch": 0.5083441348023331, + "grad_norm": 0.029643390327692032, + "learning_rate": 0.00019398262748098475, + "loss": 0.3612, + "step": 6275 + }, + { + "epoch": 0.5084251458198314, + "grad_norm": 0.036275800317525864, + "learning_rate": 0.0001939781268283901, + "loss": 0.3405, + "step": 6276 + }, + { + "epoch": 0.5085061568373299, + "grad_norm": 0.03233199194073677, + "learning_rate": 0.00019397362617579552, + "loss": 0.3421, + "step": 6277 + }, + { + "epoch": 0.5085871678548283, + "grad_norm": 0.03278407081961632, + "learning_rate": 0.00019396912552320088, + "loss": 0.3852, + "step": 6278 + }, + { + "epoch": 0.5086681788723266, + "grad_norm": 0.03374721109867096, + "learning_rate": 0.00019396462487060624, + "loss": 0.3024, + "step": 6279 + }, + { + "epoch": 0.508749189889825, + "grad_norm": 0.032265424728393555, + "learning_rate": 0.00019396012421801163, + "loss": 0.373, + "step": 6280 + }, + { + "epoch": 0.5088302009073234, + "grad_norm": 0.03495476022362709, + "learning_rate": 0.000193955623565417, + "loss": 0.3255, + "step": 6281 + }, + { + "epoch": 0.5089112119248218, + "grad_norm": 0.03319685533642769, + "learning_rate": 0.00019395112291282235, + "loss": 0.3884, + "step": 6282 + }, + { + "epoch": 0.5089922229423202, + "grad_norm": 0.030925067141652107, + "learning_rate": 0.00019394662226022776, + "loss": 0.3777, + "step": 6283 + }, + { + "epoch": 0.5090732339598185, + "grad_norm": 0.03134768828749657, + "learning_rate": 0.00019394212160763312, + "loss": 0.3875, + "step": 6284 + }, + { + "epoch": 0.5091542449773169, + "grad_norm": 0.034195996820926666, + "learning_rate": 0.00019393762095503848, + "loss": 0.3083, + "step": 6285 + }, + { + "epoch": 0.5092352559948153, + "grad_norm": 0.039229799062013626, + "learning_rate": 0.00019393312030244387, + "loss": 0.3646, + "step": 6286 + }, + { + "epoch": 0.5093162670123137, + "grad_norm": 0.03241065889596939, + "learning_rate": 0.00019392861964984923, + "loss": 0.3777, + "step": 6287 + }, + { + "epoch": 0.509397278029812, + "grad_norm": 0.031153660267591476, + "learning_rate": 0.0001939241189972546, + "loss": 0.3651, + "step": 6288 + }, + { + "epoch": 0.5094782890473104, + "grad_norm": 0.031294070184230804, + "learning_rate": 0.00019391961834466, + "loss": 0.3527, + "step": 6289 + }, + { + "epoch": 0.5095593000648088, + "grad_norm": 0.03369227051734924, + "learning_rate": 0.00019391511769206537, + "loss": 0.3764, + "step": 6290 + }, + { + "epoch": 0.5096403110823072, + "grad_norm": 0.03187112510204315, + "learning_rate": 0.00019391061703947073, + "loss": 0.3501, + "step": 6291 + }, + { + "epoch": 0.5097213220998056, + "grad_norm": 0.03397297486662865, + "learning_rate": 0.0001939061163868761, + "loss": 0.3805, + "step": 6292 + }, + { + "epoch": 0.5098023331173039, + "grad_norm": 0.027571003884077072, + "learning_rate": 0.00019390161573428147, + "loss": 0.3033, + "step": 6293 + }, + { + "epoch": 0.5098833441348023, + "grad_norm": 0.03471944108605385, + "learning_rate": 0.00019389711508168686, + "loss": 0.3677, + "step": 6294 + }, + { + "epoch": 0.5099643551523008, + "grad_norm": 0.032311148941516876, + "learning_rate": 0.00019389261442909225, + "loss": 0.3641, + "step": 6295 + }, + { + "epoch": 0.5100453661697991, + "grad_norm": 0.03428216651082039, + 
"learning_rate": 0.0001938881137764976, + "loss": 0.3387, + "step": 6296 + }, + { + "epoch": 0.5101263771872975, + "grad_norm": 0.03166080266237259, + "learning_rate": 0.00019388361312390297, + "loss": 0.3805, + "step": 6297 + }, + { + "epoch": 0.5102073882047958, + "grad_norm": 0.02960868738591671, + "learning_rate": 0.00019387911247130835, + "loss": 0.3455, + "step": 6298 + }, + { + "epoch": 0.5102883992222942, + "grad_norm": 0.031040215864777565, + "learning_rate": 0.00019387461181871371, + "loss": 0.3222, + "step": 6299 + }, + { + "epoch": 0.5103694102397927, + "grad_norm": 0.02788618393242359, + "learning_rate": 0.0001938701111661191, + "loss": 0.3552, + "step": 6300 + }, + { + "epoch": 0.510450421257291, + "grad_norm": 0.03101874515414238, + "learning_rate": 0.0001938656105135245, + "loss": 0.3212, + "step": 6301 + }, + { + "epoch": 0.5105314322747894, + "grad_norm": 0.03254443407058716, + "learning_rate": 0.00019386110986092985, + "loss": 0.3084, + "step": 6302 + }, + { + "epoch": 0.5106124432922877, + "grad_norm": 0.030638394877314568, + "learning_rate": 0.0001938566092083352, + "loss": 0.3793, + "step": 6303 + }, + { + "epoch": 0.5106934543097861, + "grad_norm": 0.03493942320346832, + "learning_rate": 0.0001938521085557406, + "loss": 0.3468, + "step": 6304 + }, + { + "epoch": 0.5107744653272845, + "grad_norm": 0.03019285574555397, + "learning_rate": 0.00019384760790314596, + "loss": 0.3434, + "step": 6305 + }, + { + "epoch": 0.5108554763447829, + "grad_norm": 0.03360366448760033, + "learning_rate": 0.00019384310725055134, + "loss": 0.3754, + "step": 6306 + }, + { + "epoch": 0.5109364873622813, + "grad_norm": 0.03600335493683815, + "learning_rate": 0.00019383860659795673, + "loss": 0.3806, + "step": 6307 + }, + { + "epoch": 0.5110174983797796, + "grad_norm": 0.027877481654286385, + "learning_rate": 0.0001938341059453621, + "loss": 0.3768, + "step": 6308 + }, + { + "epoch": 0.5110985093972781, + "grad_norm": 0.03358413279056549, + "learning_rate": 0.00019382960529276745, + "loss": 0.3755, + "step": 6309 + }, + { + "epoch": 0.5111795204147764, + "grad_norm": 0.02829788252711296, + "learning_rate": 0.00019382510464017284, + "loss": 0.338, + "step": 6310 + }, + { + "epoch": 0.5112605314322748, + "grad_norm": 0.03067375160753727, + "learning_rate": 0.0001938206039875782, + "loss": 0.3577, + "step": 6311 + }, + { + "epoch": 0.5113415424497731, + "grad_norm": 0.03363307565450668, + "learning_rate": 0.00019381610333498358, + "loss": 0.3685, + "step": 6312 + }, + { + "epoch": 0.5114225534672715, + "grad_norm": 0.03025786019861698, + "learning_rate": 0.00019381160268238897, + "loss": 0.3675, + "step": 6313 + }, + { + "epoch": 0.51150356448477, + "grad_norm": 0.028437191620469093, + "learning_rate": 0.00019380710202979433, + "loss": 0.3162, + "step": 6314 + }, + { + "epoch": 0.5115845755022683, + "grad_norm": 0.032357107847929, + "learning_rate": 0.0001938026013771997, + "loss": 0.3861, + "step": 6315 + }, + { + "epoch": 0.5116655865197667, + "grad_norm": 0.030493425205349922, + "learning_rate": 0.00019379810072460508, + "loss": 0.3729, + "step": 6316 + }, + { + "epoch": 0.511746597537265, + "grad_norm": 0.03158330172300339, + "learning_rate": 0.00019379360007201047, + "loss": 0.3112, + "step": 6317 + }, + { + "epoch": 0.5118276085547635, + "grad_norm": 0.029356788843870163, + "learning_rate": 0.00019378909941941583, + "loss": 0.4015, + "step": 6318 + }, + { + "epoch": 0.5119086195722619, + "grad_norm": 0.028847860172390938, + "learning_rate": 0.0001937845987668212, + "loss": 0.3186, + 
"step": 6319 + }, + { + "epoch": 0.5119896305897602, + "grad_norm": 0.036853089928627014, + "learning_rate": 0.00019378009811422657, + "loss": 0.4413, + "step": 6320 + }, + { + "epoch": 0.5120706416072586, + "grad_norm": 0.02763395383954048, + "learning_rate": 0.00019377559746163193, + "loss": 0.3072, + "step": 6321 + }, + { + "epoch": 0.5121516526247569, + "grad_norm": 0.03145894780755043, + "learning_rate": 0.00019377109680903732, + "loss": 0.395, + "step": 6322 + }, + { + "epoch": 0.5122326636422554, + "grad_norm": 0.02779548056423664, + "learning_rate": 0.0001937665961564427, + "loss": 0.3195, + "step": 6323 + }, + { + "epoch": 0.5123136746597537, + "grad_norm": 0.028829006478190422, + "learning_rate": 0.00019376209550384807, + "loss": 0.4011, + "step": 6324 + }, + { + "epoch": 0.5123946856772521, + "grad_norm": 0.03402452915906906, + "learning_rate": 0.00019375759485125345, + "loss": 0.369, + "step": 6325 + }, + { + "epoch": 0.5124756966947505, + "grad_norm": 0.03007756732404232, + "learning_rate": 0.00019375309419865881, + "loss": 0.3516, + "step": 6326 + }, + { + "epoch": 0.5125567077122488, + "grad_norm": 0.032619405537843704, + "learning_rate": 0.00019374859354606417, + "loss": 0.3723, + "step": 6327 + }, + { + "epoch": 0.5126377187297473, + "grad_norm": 0.029638774693012238, + "learning_rate": 0.00019374409289346956, + "loss": 0.3269, + "step": 6328 + }, + { + "epoch": 0.5127187297472456, + "grad_norm": 0.03590411692857742, + "learning_rate": 0.00019373959224087495, + "loss": 0.4164, + "step": 6329 + }, + { + "epoch": 0.512799740764744, + "grad_norm": 0.031840380281209946, + "learning_rate": 0.0001937350915882803, + "loss": 0.3834, + "step": 6330 + }, + { + "epoch": 0.5128807517822424, + "grad_norm": 0.03522587567567825, + "learning_rate": 0.0001937305909356857, + "loss": 0.3833, + "step": 6331 + }, + { + "epoch": 0.5129617627997408, + "grad_norm": 0.029918646439909935, + "learning_rate": 0.00019372609028309106, + "loss": 0.3557, + "step": 6332 + }, + { + "epoch": 0.5130427738172392, + "grad_norm": 0.0331931971013546, + "learning_rate": 0.00019372158963049642, + "loss": 0.3398, + "step": 6333 + }, + { + "epoch": 0.5131237848347375, + "grad_norm": 0.031502824276685715, + "learning_rate": 0.0001937170889779018, + "loss": 0.3731, + "step": 6334 + }, + { + "epoch": 0.5132047958522359, + "grad_norm": 0.029885951429605484, + "learning_rate": 0.0001937125883253072, + "loss": 0.3634, + "step": 6335 + }, + { + "epoch": 0.5132858068697342, + "grad_norm": 0.02981005609035492, + "learning_rate": 0.00019370808767271255, + "loss": 0.3435, + "step": 6336 + }, + { + "epoch": 0.5133668178872327, + "grad_norm": 0.032016005367040634, + "learning_rate": 0.00019370358702011794, + "loss": 0.3851, + "step": 6337 + }, + { + "epoch": 0.5134478289047311, + "grad_norm": 0.034068964421749115, + "learning_rate": 0.0001936990863675233, + "loss": 0.3678, + "step": 6338 + }, + { + "epoch": 0.5135288399222294, + "grad_norm": 0.031114788725972176, + "learning_rate": 0.00019369458571492866, + "loss": 0.3791, + "step": 6339 + }, + { + "epoch": 0.5136098509397278, + "grad_norm": 0.03359213471412659, + "learning_rate": 0.00019369008506233405, + "loss": 0.3876, + "step": 6340 + }, + { + "epoch": 0.5136908619572261, + "grad_norm": 0.03328750655055046, + "learning_rate": 0.00019368558440973943, + "loss": 0.3568, + "step": 6341 + }, + { + "epoch": 0.5137718729747246, + "grad_norm": 0.028766583651304245, + "learning_rate": 0.0001936810837571448, + "loss": 0.3137, + "step": 6342 + }, + { + "epoch": 0.513852883992223, + 
"grad_norm": 0.027417849749326706, + "learning_rate": 0.00019367658310455018, + "loss": 0.3123, + "step": 6343 + }, + { + "epoch": 0.5139338950097213, + "grad_norm": 0.030872000381350517, + "learning_rate": 0.00019367208245195554, + "loss": 0.3476, + "step": 6344 + }, + { + "epoch": 0.5140149060272197, + "grad_norm": 0.03216738626360893, + "learning_rate": 0.0001936675817993609, + "loss": 0.376, + "step": 6345 + }, + { + "epoch": 0.5140959170447181, + "grad_norm": 0.030741369351744652, + "learning_rate": 0.00019366308114676631, + "loss": 0.3195, + "step": 6346 + }, + { + "epoch": 0.5141769280622165, + "grad_norm": 0.029521172866225243, + "learning_rate": 0.00019365858049417167, + "loss": 0.3565, + "step": 6347 + }, + { + "epoch": 0.5142579390797148, + "grad_norm": 0.03402463719248772, + "learning_rate": 0.00019365407984157703, + "loss": 0.3921, + "step": 6348 + }, + { + "epoch": 0.5143389500972132, + "grad_norm": 0.033727966248989105, + "learning_rate": 0.00019364957918898242, + "loss": 0.3609, + "step": 6349 + }, + { + "epoch": 0.5144199611147116, + "grad_norm": 0.029165782034397125, + "learning_rate": 0.00019364507853638778, + "loss": 0.3594, + "step": 6350 + }, + { + "epoch": 0.51450097213221, + "grad_norm": 0.029388075694441795, + "learning_rate": 0.00019364057788379314, + "loss": 0.3183, + "step": 6351 + }, + { + "epoch": 0.5145819831497084, + "grad_norm": 0.03150533139705658, + "learning_rate": 0.00019363607723119856, + "loss": 0.3508, + "step": 6352 + }, + { + "epoch": 0.5146629941672067, + "grad_norm": 0.028366554528474808, + "learning_rate": 0.00019363157657860392, + "loss": 0.3234, + "step": 6353 + }, + { + "epoch": 0.5147440051847051, + "grad_norm": 0.0290334802120924, + "learning_rate": 0.00019362707592600928, + "loss": 0.3214, + "step": 6354 + }, + { + "epoch": 0.5148250162022034, + "grad_norm": 0.03059317171573639, + "learning_rate": 0.00019362257527341466, + "loss": 0.2869, + "step": 6355 + }, + { + "epoch": 0.5149060272197019, + "grad_norm": 0.03641689568758011, + "learning_rate": 0.00019361807462082002, + "loss": 0.4037, + "step": 6356 + }, + { + "epoch": 0.5149870382372003, + "grad_norm": 0.030488573014736176, + "learning_rate": 0.00019361357396822538, + "loss": 0.386, + "step": 6357 + }, + { + "epoch": 0.5150680492546986, + "grad_norm": 0.03700888529419899, + "learning_rate": 0.0001936090733156308, + "loss": 0.3534, + "step": 6358 + }, + { + "epoch": 0.515149060272197, + "grad_norm": 0.03261464461684227, + "learning_rate": 0.00019360457266303616, + "loss": 0.3377, + "step": 6359 + }, + { + "epoch": 0.5152300712896954, + "grad_norm": 0.036103613674640656, + "learning_rate": 0.00019360007201044152, + "loss": 0.3113, + "step": 6360 + }, + { + "epoch": 0.5153110823071938, + "grad_norm": 0.029434196650981903, + "learning_rate": 0.0001935955713578469, + "loss": 0.3724, + "step": 6361 + }, + { + "epoch": 0.5153920933246922, + "grad_norm": 0.03052251972258091, + "learning_rate": 0.00019359107070525226, + "loss": 0.3562, + "step": 6362 + }, + { + "epoch": 0.5154731043421905, + "grad_norm": 0.03182889148592949, + "learning_rate": 0.00019358657005265762, + "loss": 0.3399, + "step": 6363 + }, + { + "epoch": 0.5155541153596889, + "grad_norm": 0.031418126076459885, + "learning_rate": 0.00019358206940006304, + "loss": 0.337, + "step": 6364 + }, + { + "epoch": 0.5156351263771873, + "grad_norm": 0.028636738657951355, + "learning_rate": 0.0001935775687474684, + "loss": 0.349, + "step": 6365 + }, + { + "epoch": 0.5157161373946857, + "grad_norm": 0.03573239594697952, + "learning_rate": 
0.00019357306809487376, + "loss": 0.2985, + "step": 6366 + }, + { + "epoch": 0.515797148412184, + "grad_norm": 0.03263181447982788, + "learning_rate": 0.00019356856744227915, + "loss": 0.3869, + "step": 6367 + }, + { + "epoch": 0.5158781594296824, + "grad_norm": 0.03369957208633423, + "learning_rate": 0.0001935640667896845, + "loss": 0.365, + "step": 6368 + }, + { + "epoch": 0.5159591704471809, + "grad_norm": 0.02732839621603489, + "learning_rate": 0.0001935595661370899, + "loss": 0.3822, + "step": 6369 + }, + { + "epoch": 0.5160401814646792, + "grad_norm": 0.0374213382601738, + "learning_rate": 0.00019355506548449528, + "loss": 0.3375, + "step": 6370 + }, + { + "epoch": 0.5161211924821776, + "grad_norm": 0.03227487951517105, + "learning_rate": 0.00019355056483190064, + "loss": 0.3982, + "step": 6371 + }, + { + "epoch": 0.5162022034996759, + "grad_norm": 0.030356882140040398, + "learning_rate": 0.000193546064179306, + "loss": 0.3526, + "step": 6372 + }, + { + "epoch": 0.5162832145171743, + "grad_norm": 0.03413158282637596, + "learning_rate": 0.0001935415635267114, + "loss": 0.3768, + "step": 6373 + }, + { + "epoch": 0.5163642255346728, + "grad_norm": 0.02953726425766945, + "learning_rate": 0.00019353706287411675, + "loss": 0.3178, + "step": 6374 + }, + { + "epoch": 0.5164452365521711, + "grad_norm": 0.029602529481053352, + "learning_rate": 0.00019353256222152214, + "loss": 0.3066, + "step": 6375 + }, + { + "epoch": 0.5165262475696695, + "grad_norm": 0.03132502734661102, + "learning_rate": 0.00019352806156892752, + "loss": 0.3302, + "step": 6376 + }, + { + "epoch": 0.5166072585871678, + "grad_norm": 0.029297152534127235, + "learning_rate": 0.00019352356091633288, + "loss": 0.3797, + "step": 6377 + }, + { + "epoch": 0.5166882696046662, + "grad_norm": 0.030203981325030327, + "learning_rate": 0.00019351906026373824, + "loss": 0.3137, + "step": 6378 + }, + { + "epoch": 0.5167692806221647, + "grad_norm": 0.03112986497581005, + "learning_rate": 0.00019351455961114363, + "loss": 0.3897, + "step": 6379 + }, + { + "epoch": 0.516850291639663, + "grad_norm": 0.030600905418395996, + "learning_rate": 0.000193510058958549, + "loss": 0.406, + "step": 6380 + }, + { + "epoch": 0.5169313026571614, + "grad_norm": 0.03712606802582741, + "learning_rate": 0.00019350555830595438, + "loss": 0.3828, + "step": 6381 + }, + { + "epoch": 0.5170123136746597, + "grad_norm": 0.03237859159708023, + "learning_rate": 0.00019350105765335976, + "loss": 0.3284, + "step": 6382 + }, + { + "epoch": 0.5170933246921582, + "grad_norm": 0.033010292798280716, + "learning_rate": 0.00019349655700076512, + "loss": 0.3705, + "step": 6383 + }, + { + "epoch": 0.5171743357096565, + "grad_norm": 0.035950496792793274, + "learning_rate": 0.00019349205634817048, + "loss": 0.3146, + "step": 6384 + }, + { + "epoch": 0.5172553467271549, + "grad_norm": 0.03335532546043396, + "learning_rate": 0.00019348755569557587, + "loss": 0.3776, + "step": 6385 + }, + { + "epoch": 0.5173363577446533, + "grad_norm": 0.03218366205692291, + "learning_rate": 0.00019348305504298123, + "loss": 0.3743, + "step": 6386 + }, + { + "epoch": 0.5174173687621516, + "grad_norm": 0.03454633429646492, + "learning_rate": 0.00019347855439038662, + "loss": 0.371, + "step": 6387 + }, + { + "epoch": 0.5174983797796501, + "grad_norm": 0.03251352906227112, + "learning_rate": 0.000193474053737792, + "loss": 0.4115, + "step": 6388 + }, + { + "epoch": 0.5175793907971484, + "grad_norm": 0.03137648105621338, + "learning_rate": 0.00019346955308519737, + "loss": 0.3467, + "step": 6389 + }, + { 
+ "epoch": 0.5176604018146468, + "grad_norm": 0.03319603577256203, + "learning_rate": 0.00019346505243260273, + "loss": 0.4092, + "step": 6390 + }, + { + "epoch": 0.5177414128321451, + "grad_norm": 0.02964252606034279, + "learning_rate": 0.0001934605517800081, + "loss": 0.3554, + "step": 6391 + }, + { + "epoch": 0.5178224238496435, + "grad_norm": 0.03132447227835655, + "learning_rate": 0.00019345605112741347, + "loss": 0.3331, + "step": 6392 + }, + { + "epoch": 0.517903434867142, + "grad_norm": 0.033342160284519196, + "learning_rate": 0.00019345155047481886, + "loss": 0.36, + "step": 6393 + }, + { + "epoch": 0.5179844458846403, + "grad_norm": 0.03459905460476875, + "learning_rate": 0.00019344704982222425, + "loss": 0.3617, + "step": 6394 + }, + { + "epoch": 0.5180654569021387, + "grad_norm": 0.03589166700839996, + "learning_rate": 0.0001934425491696296, + "loss": 0.4322, + "step": 6395 + }, + { + "epoch": 0.518146467919637, + "grad_norm": 0.03061509132385254, + "learning_rate": 0.00019343804851703497, + "loss": 0.3421, + "step": 6396 + }, + { + "epoch": 0.5182274789371355, + "grad_norm": 0.029686959460377693, + "learning_rate": 0.00019343354786444035, + "loss": 0.3151, + "step": 6397 + }, + { + "epoch": 0.5183084899546339, + "grad_norm": 0.03125001862645149, + "learning_rate": 0.00019342904721184574, + "loss": 0.32, + "step": 6398 + }, + { + "epoch": 0.5183895009721322, + "grad_norm": 0.03100123070180416, + "learning_rate": 0.0001934245465592511, + "loss": 0.3775, + "step": 6399 + }, + { + "epoch": 0.5184705119896306, + "grad_norm": 0.03699709102511406, + "learning_rate": 0.0001934200459066565, + "loss": 0.4467, + "step": 6400 + }, + { + "epoch": 0.5185515230071289, + "grad_norm": 0.029937749728560448, + "learning_rate": 0.00019341554525406185, + "loss": 0.3409, + "step": 6401 + }, + { + "epoch": 0.5186325340246274, + "grad_norm": 0.026631394401192665, + "learning_rate": 0.0001934110446014672, + "loss": 0.3464, + "step": 6402 + }, + { + "epoch": 0.5187135450421257, + "grad_norm": 0.033784594386816025, + "learning_rate": 0.0001934065439488726, + "loss": 0.3718, + "step": 6403 + }, + { + "epoch": 0.5187945560596241, + "grad_norm": 0.03262433782219887, + "learning_rate": 0.00019340204329627798, + "loss": 0.3505, + "step": 6404 + }, + { + "epoch": 0.5188755670771225, + "grad_norm": 0.03220927715301514, + "learning_rate": 0.00019339754264368334, + "loss": 0.3633, + "step": 6405 + }, + { + "epoch": 0.5189565780946209, + "grad_norm": 0.02535347267985344, + "learning_rate": 0.00019339304199108873, + "loss": 0.3258, + "step": 6406 + }, + { + "epoch": 0.5190375891121193, + "grad_norm": 0.034237343817949295, + "learning_rate": 0.0001933885413384941, + "loss": 0.3755, + "step": 6407 + }, + { + "epoch": 0.5191186001296176, + "grad_norm": 0.03497825190424919, + "learning_rate": 0.00019338404068589945, + "loss": 0.3463, + "step": 6408 + }, + { + "epoch": 0.519199611147116, + "grad_norm": 0.03145504742860794, + "learning_rate": 0.00019337954003330484, + "loss": 0.3484, + "step": 6409 + }, + { + "epoch": 0.5192806221646143, + "grad_norm": 0.03155725449323654, + "learning_rate": 0.00019337503938071022, + "loss": 0.3341, + "step": 6410 + }, + { + "epoch": 0.5193616331821128, + "grad_norm": 0.03286818042397499, + "learning_rate": 0.00019337053872811558, + "loss": 0.3598, + "step": 6411 + }, + { + "epoch": 0.5194426441996112, + "grad_norm": 0.029384689405560493, + "learning_rate": 0.00019336603807552097, + "loss": 0.3601, + "step": 6412 + }, + { + "epoch": 0.5195236552171095, + "grad_norm": 
0.03224647790193558, + "learning_rate": 0.00019336153742292633, + "loss": 0.3387, + "step": 6413 + }, + { + "epoch": 0.5196046662346079, + "grad_norm": 0.03096454218029976, + "learning_rate": 0.0001933570367703317, + "loss": 0.3579, + "step": 6414 + }, + { + "epoch": 0.5196856772521062, + "grad_norm": 0.03217236325144768, + "learning_rate": 0.00019335253611773708, + "loss": 0.3518, + "step": 6415 + }, + { + "epoch": 0.5197666882696047, + "grad_norm": 0.030055413022637367, + "learning_rate": 0.00019334803546514247, + "loss": 0.3254, + "step": 6416 + }, + { + "epoch": 0.5198476992871031, + "grad_norm": 0.03201695904135704, + "learning_rate": 0.00019334353481254783, + "loss": 0.4164, + "step": 6417 + }, + { + "epoch": 0.5199287103046014, + "grad_norm": 0.033270809799432755, + "learning_rate": 0.0001933390341599532, + "loss": 0.3886, + "step": 6418 + }, + { + "epoch": 0.5200097213220998, + "grad_norm": 0.032824356108903885, + "learning_rate": 0.00019333453350735857, + "loss": 0.3481, + "step": 6419 + }, + { + "epoch": 0.5200907323395982, + "grad_norm": 0.03067057952284813, + "learning_rate": 0.00019333003285476393, + "loss": 0.3551, + "step": 6420 + }, + { + "epoch": 0.5201717433570966, + "grad_norm": 0.03185015544295311, + "learning_rate": 0.00019332553220216932, + "loss": 0.3296, + "step": 6421 + }, + { + "epoch": 0.520252754374595, + "grad_norm": 0.02917659282684326, + "learning_rate": 0.0001933210315495747, + "loss": 0.3056, + "step": 6422 + }, + { + "epoch": 0.5203337653920933, + "grad_norm": 0.02973158471286297, + "learning_rate": 0.00019331653089698007, + "loss": 0.3248, + "step": 6423 + }, + { + "epoch": 0.5204147764095917, + "grad_norm": 0.02465933933854103, + "learning_rate": 0.00019331203024438546, + "loss": 0.2779, + "step": 6424 + }, + { + "epoch": 0.5204957874270901, + "grad_norm": 0.02845362387597561, + "learning_rate": 0.00019330752959179082, + "loss": 0.3187, + "step": 6425 + }, + { + "epoch": 0.5205767984445885, + "grad_norm": 0.031649742275476456, + "learning_rate": 0.00019330302893919618, + "loss": 0.3582, + "step": 6426 + }, + { + "epoch": 0.5206578094620868, + "grad_norm": 0.029350902885198593, + "learning_rate": 0.0001932985282866016, + "loss": 0.3251, + "step": 6427 + }, + { + "epoch": 0.5207388204795852, + "grad_norm": 0.03055676445364952, + "learning_rate": 0.00019329402763400695, + "loss": 0.3295, + "step": 6428 + }, + { + "epoch": 0.5208198314970836, + "grad_norm": 0.03613782301545143, + "learning_rate": 0.0001932895269814123, + "loss": 0.384, + "step": 6429 + }, + { + "epoch": 0.520900842514582, + "grad_norm": 0.03135627508163452, + "learning_rate": 0.0001932850263288177, + "loss": 0.384, + "step": 6430 + }, + { + "epoch": 0.5209818535320804, + "grad_norm": 0.031382665038108826, + "learning_rate": 0.00019328052567622306, + "loss": 0.3862, + "step": 6431 + }, + { + "epoch": 0.5210628645495787, + "grad_norm": 0.031073955819010735, + "learning_rate": 0.00019327602502362842, + "loss": 0.374, + "step": 6432 + }, + { + "epoch": 0.5211438755670771, + "grad_norm": 0.030740221962332726, + "learning_rate": 0.00019327152437103383, + "loss": 0.4147, + "step": 6433 + }, + { + "epoch": 0.5212248865845756, + "grad_norm": 0.028593573719263077, + "learning_rate": 0.0001932670237184392, + "loss": 0.3231, + "step": 6434 + }, + { + "epoch": 0.5213058976020739, + "grad_norm": 0.028780462220311165, + "learning_rate": 0.00019326252306584455, + "loss": 0.3409, + "step": 6435 + }, + { + "epoch": 0.5213869086195723, + "grad_norm": 0.030874744057655334, + "learning_rate": 
0.00019325802241324994, + "loss": 0.3895, + "step": 6436 + }, + { + "epoch": 0.5214679196370706, + "grad_norm": 0.03160393610596657, + "learning_rate": 0.0001932535217606553, + "loss": 0.3526, + "step": 6437 + }, + { + "epoch": 0.521548930654569, + "grad_norm": 0.03185877203941345, + "learning_rate": 0.00019324902110806066, + "loss": 0.3005, + "step": 6438 + }, + { + "epoch": 0.5216299416720674, + "grad_norm": 0.03563063219189644, + "learning_rate": 0.00019324452045546607, + "loss": 0.3619, + "step": 6439 + }, + { + "epoch": 0.5217109526895658, + "grad_norm": 0.032077718526124954, + "learning_rate": 0.00019324001980287143, + "loss": 0.3913, + "step": 6440 + }, + { + "epoch": 0.5217919637070642, + "grad_norm": 0.028801945969462395, + "learning_rate": 0.0001932355191502768, + "loss": 0.3164, + "step": 6441 + }, + { + "epoch": 0.5218729747245625, + "grad_norm": 0.036385100334882736, + "learning_rate": 0.00019323101849768218, + "loss": 0.3347, + "step": 6442 + }, + { + "epoch": 0.5219539857420609, + "grad_norm": 0.02876608446240425, + "learning_rate": 0.00019322651784508754, + "loss": 0.35, + "step": 6443 + }, + { + "epoch": 0.5220349967595593, + "grad_norm": 0.031699031591415405, + "learning_rate": 0.0001932220171924929, + "loss": 0.3678, + "step": 6444 + }, + { + "epoch": 0.5221160077770577, + "grad_norm": 0.02782803401350975, + "learning_rate": 0.00019321751653989831, + "loss": 0.3406, + "step": 6445 + }, + { + "epoch": 0.522197018794556, + "grad_norm": 0.030642272904515266, + "learning_rate": 0.00019321301588730367, + "loss": 0.3659, + "step": 6446 + }, + { + "epoch": 0.5222780298120544, + "grad_norm": 0.033031292259693146, + "learning_rate": 0.00019320851523470903, + "loss": 0.3806, + "step": 6447 + }, + { + "epoch": 0.5223590408295529, + "grad_norm": 0.03382324054837227, + "learning_rate": 0.00019320401458211442, + "loss": 0.3542, + "step": 6448 + }, + { + "epoch": 0.5224400518470512, + "grad_norm": 0.03232893347740173, + "learning_rate": 0.00019319951392951978, + "loss": 0.3765, + "step": 6449 + }, + { + "epoch": 0.5225210628645496, + "grad_norm": 0.03362128511071205, + "learning_rate": 0.00019319501327692517, + "loss": 0.3558, + "step": 6450 + }, + { + "epoch": 0.5226020738820479, + "grad_norm": 0.033634036779403687, + "learning_rate": 0.00019319051262433056, + "loss": 0.3799, + "step": 6451 + }, + { + "epoch": 0.5226830848995463, + "grad_norm": 0.03131242096424103, + "learning_rate": 0.00019318601197173592, + "loss": 0.3444, + "step": 6452 + }, + { + "epoch": 0.5227640959170448, + "grad_norm": 0.028830870985984802, + "learning_rate": 0.00019318151131914128, + "loss": 0.3426, + "step": 6453 + }, + { + "epoch": 0.5228451069345431, + "grad_norm": 0.03265361115336418, + "learning_rate": 0.00019317701066654666, + "loss": 0.3594, + "step": 6454 + }, + { + "epoch": 0.5229261179520415, + "grad_norm": 0.028458425775170326, + "learning_rate": 0.00019317251001395202, + "loss": 0.3271, + "step": 6455 + }, + { + "epoch": 0.5230071289695398, + "grad_norm": 0.03274376317858696, + "learning_rate": 0.0001931680093613574, + "loss": 0.3295, + "step": 6456 + }, + { + "epoch": 0.5230881399870383, + "grad_norm": 0.029262587428092957, + "learning_rate": 0.0001931635087087628, + "loss": 0.3618, + "step": 6457 + }, + { + "epoch": 0.5231691510045366, + "grad_norm": 0.03436655178666115, + "learning_rate": 0.00019315900805616816, + "loss": 0.3552, + "step": 6458 + }, + { + "epoch": 0.523250162022035, + "grad_norm": 0.030990226194262505, + "learning_rate": 0.00019315450740357352, + "loss": 0.3752, + "step": 6459 
+ }, + { + "epoch": 0.5233311730395334, + "grad_norm": 0.028373638167977333, + "learning_rate": 0.0001931500067509789, + "loss": 0.3136, + "step": 6460 + }, + { + "epoch": 0.5234121840570317, + "grad_norm": 0.030110273510217667, + "learning_rate": 0.00019314550609838426, + "loss": 0.3447, + "step": 6461 + }, + { + "epoch": 0.5234931950745302, + "grad_norm": 0.03270327299833298, + "learning_rate": 0.00019314100544578965, + "loss": 0.4455, + "step": 6462 + }, + { + "epoch": 0.5235742060920285, + "grad_norm": 0.025852186605334282, + "learning_rate": 0.00019313650479319504, + "loss": 0.3235, + "step": 6463 + }, + { + "epoch": 0.5236552171095269, + "grad_norm": 0.027759717777371407, + "learning_rate": 0.0001931320041406004, + "loss": 0.3391, + "step": 6464 + }, + { + "epoch": 0.5237362281270252, + "grad_norm": 0.03059726394712925, + "learning_rate": 0.00019312750348800576, + "loss": 0.4047, + "step": 6465 + }, + { + "epoch": 0.5238172391445236, + "grad_norm": 0.03364939242601395, + "learning_rate": 0.00019312300283541115, + "loss": 0.4049, + "step": 6466 + }, + { + "epoch": 0.5238982501620221, + "grad_norm": 0.030493125319480896, + "learning_rate": 0.0001931185021828165, + "loss": 0.3328, + "step": 6467 + }, + { + "epoch": 0.5239792611795204, + "grad_norm": 0.031018385663628578, + "learning_rate": 0.0001931140015302219, + "loss": 0.3816, + "step": 6468 + }, + { + "epoch": 0.5240602721970188, + "grad_norm": 0.03358032926917076, + "learning_rate": 0.00019310950087762728, + "loss": 0.3498, + "step": 6469 + }, + { + "epoch": 0.5241412832145171, + "grad_norm": 0.033210817724466324, + "learning_rate": 0.00019310500022503264, + "loss": 0.3131, + "step": 6470 + }, + { + "epoch": 0.5242222942320156, + "grad_norm": 0.029688436537981033, + "learning_rate": 0.000193100499572438, + "loss": 0.3267, + "step": 6471 + }, + { + "epoch": 0.524303305249514, + "grad_norm": 0.035529136657714844, + "learning_rate": 0.0001930959989198434, + "loss": 0.3478, + "step": 6472 + }, + { + "epoch": 0.5243843162670123, + "grad_norm": 0.03089592605829239, + "learning_rate": 0.00019309149826724875, + "loss": 0.3441, + "step": 6473 + }, + { + "epoch": 0.5244653272845107, + "grad_norm": 0.03198695927858353, + "learning_rate": 0.00019308699761465414, + "loss": 0.3006, + "step": 6474 + }, + { + "epoch": 0.524546338302009, + "grad_norm": 0.03530554845929146, + "learning_rate": 0.00019308249696205952, + "loss": 0.4121, + "step": 6475 + }, + { + "epoch": 0.5246273493195075, + "grad_norm": 0.032105859369039536, + "learning_rate": 0.00019307799630946488, + "loss": 0.3807, + "step": 6476 + }, + { + "epoch": 0.5247083603370059, + "grad_norm": 0.028861409053206444, + "learning_rate": 0.00019307349565687024, + "loss": 0.3244, + "step": 6477 + }, + { + "epoch": 0.5247893713545042, + "grad_norm": 0.032058510929346085, + "learning_rate": 0.00019306899500427563, + "loss": 0.3674, + "step": 6478 + }, + { + "epoch": 0.5248703823720026, + "grad_norm": 0.03300195559859276, + "learning_rate": 0.00019306449435168102, + "loss": 0.384, + "step": 6479 + }, + { + "epoch": 0.5249513933895009, + "grad_norm": 0.03070252574980259, + "learning_rate": 0.00019305999369908638, + "loss": 0.3162, + "step": 6480 + }, + { + "epoch": 0.5250324044069994, + "grad_norm": 0.029741229489445686, + "learning_rate": 0.00019305549304649176, + "loss": 0.3501, + "step": 6481 + }, + { + "epoch": 0.5251134154244977, + "grad_norm": 0.0281755980104208, + "learning_rate": 0.00019305099239389712, + "loss": 0.3101, + "step": 6482 + }, + { + "epoch": 0.5251944264419961, + "grad_norm": 
0.032033056020736694, + "learning_rate": 0.00019304649174130248, + "loss": 0.3721, + "step": 6483 + }, + { + "epoch": 0.5252754374594945, + "grad_norm": 0.030394721776247025, + "learning_rate": 0.00019304199108870787, + "loss": 0.3891, + "step": 6484 + }, + { + "epoch": 0.5253564484769929, + "grad_norm": 0.0320914126932621, + "learning_rate": 0.00019303749043611326, + "loss": 0.374, + "step": 6485 + }, + { + "epoch": 0.5254374594944913, + "grad_norm": 0.02836509793996811, + "learning_rate": 0.00019303298978351862, + "loss": 0.3517, + "step": 6486 + }, + { + "epoch": 0.5255184705119896, + "grad_norm": 0.03872750699520111, + "learning_rate": 0.000193028489130924, + "loss": 0.4366, + "step": 6487 + }, + { + "epoch": 0.525599481529488, + "grad_norm": 0.028343653306365013, + "learning_rate": 0.00019302398847832937, + "loss": 0.3436, + "step": 6488 + }, + { + "epoch": 0.5256804925469863, + "grad_norm": 0.030183296650648117, + "learning_rate": 0.00019301948782573473, + "loss": 0.338, + "step": 6489 + }, + { + "epoch": 0.5257615035644848, + "grad_norm": 0.03298697993159294, + "learning_rate": 0.0001930149871731401, + "loss": 0.3999, + "step": 6490 + }, + { + "epoch": 0.5258425145819832, + "grad_norm": 0.02960672415792942, + "learning_rate": 0.0001930104865205455, + "loss": 0.3683, + "step": 6491 + }, + { + "epoch": 0.5259235255994815, + "grad_norm": 0.02848297730088234, + "learning_rate": 0.00019300598586795086, + "loss": 0.3567, + "step": 6492 + }, + { + "epoch": 0.5260045366169799, + "grad_norm": 0.026230011135339737, + "learning_rate": 0.00019300148521535625, + "loss": 0.3399, + "step": 6493 + }, + { + "epoch": 0.5260855476344782, + "grad_norm": 0.032105475664138794, + "learning_rate": 0.0001929969845627616, + "loss": 0.3805, + "step": 6494 + }, + { + "epoch": 0.5261665586519767, + "grad_norm": 0.036830004304647446, + "learning_rate": 0.00019299248391016697, + "loss": 0.3794, + "step": 6495 + }, + { + "epoch": 0.5262475696694751, + "grad_norm": 0.0314621739089489, + "learning_rate": 0.00019298798325757235, + "loss": 0.364, + "step": 6496 + }, + { + "epoch": 0.5263285806869734, + "grad_norm": 0.035781923681497574, + "learning_rate": 0.00019298348260497774, + "loss": 0.3669, + "step": 6497 + }, + { + "epoch": 0.5264095917044718, + "grad_norm": 0.03248453512787819, + "learning_rate": 0.0001929789819523831, + "loss": 0.3862, + "step": 6498 + }, + { + "epoch": 0.5264906027219702, + "grad_norm": 0.030936148017644882, + "learning_rate": 0.0001929744812997885, + "loss": 0.317, + "step": 6499 + }, + { + "epoch": 0.5265716137394686, + "grad_norm": 0.03752376511693001, + "learning_rate": 0.00019296998064719385, + "loss": 0.3976, + "step": 6500 + }, + { + "epoch": 0.526652624756967, + "grad_norm": 0.028330406174063683, + "learning_rate": 0.0001929654799945992, + "loss": 0.3245, + "step": 6501 + }, + { + "epoch": 0.5267336357744653, + "grad_norm": 0.031165296211838722, + "learning_rate": 0.00019296097934200462, + "loss": 0.3627, + "step": 6502 + }, + { + "epoch": 0.5268146467919637, + "grad_norm": 0.0313921794295311, + "learning_rate": 0.00019295647868940998, + "loss": 0.3257, + "step": 6503 + }, + { + "epoch": 0.5268956578094621, + "grad_norm": 0.027600150555372238, + "learning_rate": 0.00019295197803681534, + "loss": 0.311, + "step": 6504 + }, + { + "epoch": 0.5269766688269605, + "grad_norm": 0.031687039881944656, + "learning_rate": 0.00019294747738422073, + "loss": 0.3638, + "step": 6505 + }, + { + "epoch": 0.5270576798444588, + "grad_norm": 0.031069139018654823, + "learning_rate": 0.0001929429767316261, 
+ "loss": 0.3466, + "step": 6506 + }, + { + "epoch": 0.5271386908619572, + "grad_norm": 0.033416252583265305, + "learning_rate": 0.00019293847607903145, + "loss": 0.393, + "step": 6507 + }, + { + "epoch": 0.5272197018794557, + "grad_norm": 0.03066958859562874, + "learning_rate": 0.00019293397542643686, + "loss": 0.352, + "step": 6508 + }, + { + "epoch": 0.527300712896954, + "grad_norm": 0.0345117412507534, + "learning_rate": 0.00019292947477384222, + "loss": 0.3584, + "step": 6509 + }, + { + "epoch": 0.5273817239144524, + "grad_norm": 0.031254980713129044, + "learning_rate": 0.00019292497412124759, + "loss": 0.3252, + "step": 6510 + }, + { + "epoch": 0.5274627349319507, + "grad_norm": 0.04023100808262825, + "learning_rate": 0.00019292047346865297, + "loss": 0.3742, + "step": 6511 + }, + { + "epoch": 0.5275437459494491, + "grad_norm": 0.03604496642947197, + "learning_rate": 0.00019291597281605833, + "loss": 0.4139, + "step": 6512 + }, + { + "epoch": 0.5276247569669476, + "grad_norm": 0.02851884625852108, + "learning_rate": 0.0001929114721634637, + "loss": 0.3424, + "step": 6513 + }, + { + "epoch": 0.5277057679844459, + "grad_norm": 0.029921604320406914, + "learning_rate": 0.0001929069715108691, + "loss": 0.3254, + "step": 6514 + }, + { + "epoch": 0.5277867790019443, + "grad_norm": 0.03419069945812225, + "learning_rate": 0.00019290247085827447, + "loss": 0.416, + "step": 6515 + }, + { + "epoch": 0.5278677900194426, + "grad_norm": 0.034494705498218536, + "learning_rate": 0.00019289797020567983, + "loss": 0.355, + "step": 6516 + }, + { + "epoch": 0.527948801036941, + "grad_norm": 0.02677858993411064, + "learning_rate": 0.00019289346955308521, + "loss": 0.2843, + "step": 6517 + }, + { + "epoch": 0.5280298120544394, + "grad_norm": 0.03140726312994957, + "learning_rate": 0.00019288896890049057, + "loss": 0.3116, + "step": 6518 + }, + { + "epoch": 0.5281108230719378, + "grad_norm": 0.030908901244401932, + "learning_rate": 0.00019288446824789593, + "loss": 0.3422, + "step": 6519 + }, + { + "epoch": 0.5281918340894362, + "grad_norm": 0.030447175726294518, + "learning_rate": 0.00019287996759530135, + "loss": 0.3347, + "step": 6520 + }, + { + "epoch": 0.5282728451069345, + "grad_norm": 0.030320830643177032, + "learning_rate": 0.0001928754669427067, + "loss": 0.3295, + "step": 6521 + }, + { + "epoch": 0.528353856124433, + "grad_norm": 0.03139973431825638, + "learning_rate": 0.00019287096629011207, + "loss": 0.363, + "step": 6522 + }, + { + "epoch": 0.5284348671419313, + "grad_norm": 0.03257003426551819, + "learning_rate": 0.00019286646563751746, + "loss": 0.3209, + "step": 6523 + }, + { + "epoch": 0.5285158781594297, + "grad_norm": 0.029077529907226562, + "learning_rate": 0.00019286196498492282, + "loss": 0.2955, + "step": 6524 + }, + { + "epoch": 0.528596889176928, + "grad_norm": 0.035417500883340836, + "learning_rate": 0.00019285746433232818, + "loss": 0.3413, + "step": 6525 + }, + { + "epoch": 0.5286779001944264, + "grad_norm": 0.031534593552351, + "learning_rate": 0.0001928529636797336, + "loss": 0.3356, + "step": 6526 + }, + { + "epoch": 0.5287589112119249, + "grad_norm": 0.0340360589325428, + "learning_rate": 0.00019284846302713895, + "loss": 0.3489, + "step": 6527 + }, + { + "epoch": 0.5288399222294232, + "grad_norm": 0.033144641667604446, + "learning_rate": 0.0001928439623745443, + "loss": 0.373, + "step": 6528 + }, + { + "epoch": 0.5289209332469216, + "grad_norm": 0.0342666432261467, + "learning_rate": 0.0001928394617219497, + "loss": 0.38, + "step": 6529 + }, + { + "epoch": 
0.5290019442644199, + "grad_norm": 0.03981400281190872, + "learning_rate": 0.00019283496106935506, + "loss": 0.4201, + "step": 6530 + }, + { + "epoch": 0.5290829552819183, + "grad_norm": 0.03236046060919762, + "learning_rate": 0.00019283046041676044, + "loss": 0.3718, + "step": 6531 + }, + { + "epoch": 0.5291639662994168, + "grad_norm": 0.029104501008987427, + "learning_rate": 0.00019282595976416583, + "loss": 0.3399, + "step": 6532 + }, + { + "epoch": 0.5292449773169151, + "grad_norm": 0.02896728925406933, + "learning_rate": 0.0001928214591115712, + "loss": 0.3597, + "step": 6533 + }, + { + "epoch": 0.5293259883344135, + "grad_norm": 0.028998127207159996, + "learning_rate": 0.00019281695845897655, + "loss": 0.2995, + "step": 6534 + }, + { + "epoch": 0.5294069993519118, + "grad_norm": 0.030537080019712448, + "learning_rate": 0.00019281245780638194, + "loss": 0.367, + "step": 6535 + }, + { + "epoch": 0.5294880103694103, + "grad_norm": 0.030855638906359673, + "learning_rate": 0.0001928079571537873, + "loss": 0.3648, + "step": 6536 + }, + { + "epoch": 0.5295690213869086, + "grad_norm": 0.03002474457025528, + "learning_rate": 0.00019280345650119269, + "loss": 0.3456, + "step": 6537 + }, + { + "epoch": 0.529650032404407, + "grad_norm": 0.03153291717171669, + "learning_rate": 0.00019279895584859807, + "loss": 0.362, + "step": 6538 + }, + { + "epoch": 0.5297310434219054, + "grad_norm": 0.035764485597610474, + "learning_rate": 0.00019279445519600343, + "loss": 0.3527, + "step": 6539 + }, + { + "epoch": 0.5298120544394037, + "grad_norm": 0.028985558077692986, + "learning_rate": 0.0001927899545434088, + "loss": 0.3014, + "step": 6540 + }, + { + "epoch": 0.5298930654569022, + "grad_norm": 0.03560829162597656, + "learning_rate": 0.00019278545389081418, + "loss": 0.3515, + "step": 6541 + }, + { + "epoch": 0.5299740764744005, + "grad_norm": 0.034974757581949234, + "learning_rate": 0.00019278095323821954, + "loss": 0.3566, + "step": 6542 + }, + { + "epoch": 0.5300550874918989, + "grad_norm": 0.0336134247481823, + "learning_rate": 0.00019277645258562493, + "loss": 0.3807, + "step": 6543 + }, + { + "epoch": 0.5301360985093972, + "grad_norm": 0.031493667513132095, + "learning_rate": 0.00019277195193303031, + "loss": 0.3494, + "step": 6544 + }, + { + "epoch": 0.5302171095268956, + "grad_norm": 0.033709052950143814, + "learning_rate": 0.00019276745128043567, + "loss": 0.3652, + "step": 6545 + }, + { + "epoch": 0.5302981205443941, + "grad_norm": 0.033176809549331665, + "learning_rate": 0.00019276295062784103, + "loss": 0.4018, + "step": 6546 + }, + { + "epoch": 0.5303791315618924, + "grad_norm": 0.03512312099337578, + "learning_rate": 0.00019275844997524642, + "loss": 0.396, + "step": 6547 + }, + { + "epoch": 0.5304601425793908, + "grad_norm": 0.03072085976600647, + "learning_rate": 0.00019275394932265178, + "loss": 0.3562, + "step": 6548 + }, + { + "epoch": 0.5305411535968891, + "grad_norm": 0.02936788834631443, + "learning_rate": 0.00019274944867005717, + "loss": 0.3802, + "step": 6549 + }, + { + "epoch": 0.5306221646143876, + "grad_norm": 0.028300996869802475, + "learning_rate": 0.00019274494801746256, + "loss": 0.3461, + "step": 6550 + }, + { + "epoch": 0.530703175631886, + "grad_norm": 0.032628510147333145, + "learning_rate": 0.00019274044736486792, + "loss": 0.3159, + "step": 6551 + }, + { + "epoch": 0.5307841866493843, + "grad_norm": 0.028229277580976486, + "learning_rate": 0.00019273594671227328, + "loss": 0.3754, + "step": 6552 + }, + { + "epoch": 0.5308651976668827, + "grad_norm": 
0.029050737619400024, + "learning_rate": 0.00019273144605967866, + "loss": 0.3378, + "step": 6553 + }, + { + "epoch": 0.530946208684381, + "grad_norm": 0.034775856882333755, + "learning_rate": 0.00019272694540708405, + "loss": 0.3415, + "step": 6554 + }, + { + "epoch": 0.5310272197018795, + "grad_norm": 0.03148787468671799, + "learning_rate": 0.0001927224447544894, + "loss": 0.3355, + "step": 6555 + }, + { + "epoch": 0.5311082307193778, + "grad_norm": 0.032840415835380554, + "learning_rate": 0.0001927179441018948, + "loss": 0.3908, + "step": 6556 + }, + { + "epoch": 0.5311892417368762, + "grad_norm": 0.03288496658205986, + "learning_rate": 0.00019271344344930016, + "loss": 0.3472, + "step": 6557 + }, + { + "epoch": 0.5312702527543746, + "grad_norm": 0.032925862818956375, + "learning_rate": 0.00019270894279670552, + "loss": 0.3843, + "step": 6558 + }, + { + "epoch": 0.531351263771873, + "grad_norm": 0.02724250592291355, + "learning_rate": 0.0001927044421441109, + "loss": 0.3, + "step": 6559 + }, + { + "epoch": 0.5314322747893714, + "grad_norm": 0.03701140731573105, + "learning_rate": 0.0001926999414915163, + "loss": 0.4249, + "step": 6560 + }, + { + "epoch": 0.5315132858068697, + "grad_norm": 0.02940862812101841, + "learning_rate": 0.00019269544083892165, + "loss": 0.3136, + "step": 6561 + }, + { + "epoch": 0.5315942968243681, + "grad_norm": 0.03185543790459633, + "learning_rate": 0.00019269094018632704, + "loss": 0.3823, + "step": 6562 + }, + { + "epoch": 0.5316753078418665, + "grad_norm": 0.035884223878383636, + "learning_rate": 0.0001926864395337324, + "loss": 0.3259, + "step": 6563 + }, + { + "epoch": 0.5317563188593649, + "grad_norm": 0.0321180559694767, + "learning_rate": 0.00019268193888113776, + "loss": 0.4205, + "step": 6564 + }, + { + "epoch": 0.5318373298768633, + "grad_norm": 0.029376648366451263, + "learning_rate": 0.00019267743822854315, + "loss": 0.3507, + "step": 6565 + }, + { + "epoch": 0.5319183408943616, + "grad_norm": 0.030484110116958618, + "learning_rate": 0.00019267293757594853, + "loss": 0.3006, + "step": 6566 + }, + { + "epoch": 0.53199935191186, + "grad_norm": 0.031638842076063156, + "learning_rate": 0.0001926684369233539, + "loss": 0.3321, + "step": 6567 + }, + { + "epoch": 0.5320803629293583, + "grad_norm": 0.02748614177107811, + "learning_rate": 0.00019266393627075928, + "loss": 0.323, + "step": 6568 + }, + { + "epoch": 0.5321613739468568, + "grad_norm": 0.03108358010649681, + "learning_rate": 0.00019265943561816464, + "loss": 0.3184, + "step": 6569 + }, + { + "epoch": 0.5322423849643552, + "grad_norm": 0.033117424696683884, + "learning_rate": 0.00019265493496557, + "loss": 0.3431, + "step": 6570 + }, + { + "epoch": 0.5323233959818535, + "grad_norm": 0.03042810969054699, + "learning_rate": 0.0001926504343129754, + "loss": 0.3439, + "step": 6571 + }, + { + "epoch": 0.5324044069993519, + "grad_norm": 0.029681673273444176, + "learning_rate": 0.00019264593366038078, + "loss": 0.3546, + "step": 6572 + }, + { + "epoch": 0.5324854180168503, + "grad_norm": 0.03175319731235504, + "learning_rate": 0.00019264143300778614, + "loss": 0.3422, + "step": 6573 + }, + { + "epoch": 0.5325664290343487, + "grad_norm": 0.02874058112502098, + "learning_rate": 0.00019263693235519152, + "loss": 0.3849, + "step": 6574 + }, + { + "epoch": 0.532647440051847, + "grad_norm": 0.030932355672121048, + "learning_rate": 0.00019263243170259688, + "loss": 0.3795, + "step": 6575 + }, + { + "epoch": 0.5327284510693454, + "grad_norm": 0.029642682522535324, + "learning_rate": 0.00019262793105000224, + 
"loss": 0.3304, + "step": 6576 + }, + { + "epoch": 0.5328094620868438, + "grad_norm": 0.029917187988758087, + "learning_rate": 0.00019262343039740763, + "loss": 0.3417, + "step": 6577 + }, + { + "epoch": 0.5328904731043422, + "grad_norm": 0.03342309966683388, + "learning_rate": 0.00019261892974481302, + "loss": 0.3096, + "step": 6578 + }, + { + "epoch": 0.5329714841218406, + "grad_norm": 0.03146572411060333, + "learning_rate": 0.00019261442909221838, + "loss": 0.327, + "step": 6579 + }, + { + "epoch": 0.5330524951393389, + "grad_norm": 0.030505899339914322, + "learning_rate": 0.00019260992843962376, + "loss": 0.3738, + "step": 6580 + }, + { + "epoch": 0.5331335061568373, + "grad_norm": 0.032242707908153534, + "learning_rate": 0.00019260542778702912, + "loss": 0.3503, + "step": 6581 + }, + { + "epoch": 0.5332145171743357, + "grad_norm": 0.029962383210659027, + "learning_rate": 0.00019260092713443448, + "loss": 0.3407, + "step": 6582 + }, + { + "epoch": 0.5332955281918341, + "grad_norm": 0.03191268444061279, + "learning_rate": 0.0001925964264818399, + "loss": 0.3599, + "step": 6583 + }, + { + "epoch": 0.5333765392093325, + "grad_norm": 0.032974135130643845, + "learning_rate": 0.00019259192582924526, + "loss": 0.3547, + "step": 6584 + }, + { + "epoch": 0.5334575502268308, + "grad_norm": 0.03472721949219704, + "learning_rate": 0.00019258742517665062, + "loss": 0.3788, + "step": 6585 + }, + { + "epoch": 0.5335385612443292, + "grad_norm": 0.03367713838815689, + "learning_rate": 0.000192582924524056, + "loss": 0.3431, + "step": 6586 + }, + { + "epoch": 0.5336195722618277, + "grad_norm": 0.0349598303437233, + "learning_rate": 0.00019257842387146137, + "loss": 0.3453, + "step": 6587 + }, + { + "epoch": 0.533700583279326, + "grad_norm": 0.03054990991950035, + "learning_rate": 0.00019257392321886673, + "loss": 0.3561, + "step": 6588 + }, + { + "epoch": 0.5337815942968244, + "grad_norm": 0.0322737917304039, + "learning_rate": 0.00019256942256627214, + "loss": 0.3745, + "step": 6589 + }, + { + "epoch": 0.5338626053143227, + "grad_norm": 0.03227555379271507, + "learning_rate": 0.0001925649219136775, + "loss": 0.3677, + "step": 6590 + }, + { + "epoch": 0.5339436163318211, + "grad_norm": 0.031967077404260635, + "learning_rate": 0.00019256042126108286, + "loss": 0.3392, + "step": 6591 + }, + { + "epoch": 0.5340246273493195, + "grad_norm": 0.03309684991836548, + "learning_rate": 0.00019255592060848825, + "loss": 0.3521, + "step": 6592 + }, + { + "epoch": 0.5341056383668179, + "grad_norm": 0.033072203397750854, + "learning_rate": 0.0001925514199558936, + "loss": 0.3997, + "step": 6593 + }, + { + "epoch": 0.5341866493843163, + "grad_norm": 0.031067723408341408, + "learning_rate": 0.00019254691930329897, + "loss": 0.3181, + "step": 6594 + }, + { + "epoch": 0.5342676604018146, + "grad_norm": 0.030648205429315567, + "learning_rate": 0.00019254241865070438, + "loss": 0.3506, + "step": 6595 + }, + { + "epoch": 0.5343486714193131, + "grad_norm": 0.030988864600658417, + "learning_rate": 0.00019253791799810974, + "loss": 0.3297, + "step": 6596 + }, + { + "epoch": 0.5344296824368114, + "grad_norm": 0.03229081630706787, + "learning_rate": 0.0001925334173455151, + "loss": 0.3435, + "step": 6597 + }, + { + "epoch": 0.5345106934543098, + "grad_norm": 0.036464571952819824, + "learning_rate": 0.0001925289166929205, + "loss": 0.3873, + "step": 6598 + }, + { + "epoch": 0.5345917044718081, + "grad_norm": 0.03050421178340912, + "learning_rate": 0.00019252441604032585, + "loss": 0.3157, + "step": 6599 + }, + { + "epoch": 
0.5346727154893065, + "grad_norm": 0.032148249447345734, + "learning_rate": 0.0001925199153877312, + "loss": 0.3353, + "step": 6600 + }, + { + "epoch": 0.534753726506805, + "grad_norm": 0.038223471492528915, + "learning_rate": 0.00019251541473513662, + "loss": 0.3748, + "step": 6601 + }, + { + "epoch": 0.5348347375243033, + "grad_norm": 0.035033948719501495, + "learning_rate": 0.00019251091408254198, + "loss": 0.3619, + "step": 6602 + }, + { + "epoch": 0.5349157485418017, + "grad_norm": 0.028229834511876106, + "learning_rate": 0.00019250641342994734, + "loss": 0.341, + "step": 6603 + }, + { + "epoch": 0.5349967595593, + "grad_norm": 0.030927540734410286, + "learning_rate": 0.00019250191277735273, + "loss": 0.3958, + "step": 6604 + }, + { + "epoch": 0.5350777705767984, + "grad_norm": 0.03158433735370636, + "learning_rate": 0.0001924974121247581, + "loss": 0.3543, + "step": 6605 + }, + { + "epoch": 0.5351587815942969, + "grad_norm": 0.03204088285565376, + "learning_rate": 0.00019249291147216345, + "loss": 0.3538, + "step": 6606 + }, + { + "epoch": 0.5352397926117952, + "grad_norm": 0.028660694137215614, + "learning_rate": 0.00019248841081956887, + "loss": 0.3721, + "step": 6607 + }, + { + "epoch": 0.5353208036292936, + "grad_norm": 0.030562499538064003, + "learning_rate": 0.00019248391016697423, + "loss": 0.337, + "step": 6608 + }, + { + "epoch": 0.5354018146467919, + "grad_norm": 0.03120999038219452, + "learning_rate": 0.00019247940951437959, + "loss": 0.3575, + "step": 6609 + }, + { + "epoch": 0.5354828256642904, + "grad_norm": 0.030365686863660812, + "learning_rate": 0.00019247490886178497, + "loss": 0.3578, + "step": 6610 + }, + { + "epoch": 0.5355638366817888, + "grad_norm": 0.03199993818998337, + "learning_rate": 0.00019247040820919033, + "loss": 0.3185, + "step": 6611 + }, + { + "epoch": 0.5356448476992871, + "grad_norm": 0.030325790867209435, + "learning_rate": 0.00019246590755659572, + "loss": 0.3804, + "step": 6612 + }, + { + "epoch": 0.5357258587167855, + "grad_norm": 0.030955428257584572, + "learning_rate": 0.0001924614069040011, + "loss": 0.361, + "step": 6613 + }, + { + "epoch": 0.5358068697342838, + "grad_norm": 0.030837949365377426, + "learning_rate": 0.00019245690625140647, + "loss": 0.3887, + "step": 6614 + }, + { + "epoch": 0.5358878807517823, + "grad_norm": 0.029817679896950722, + "learning_rate": 0.00019245240559881183, + "loss": 0.363, + "step": 6615 + }, + { + "epoch": 0.5359688917692806, + "grad_norm": 0.029662759974598885, + "learning_rate": 0.00019244790494621721, + "loss": 0.3621, + "step": 6616 + }, + { + "epoch": 0.536049902786779, + "grad_norm": 0.028333280235528946, + "learning_rate": 0.00019244340429362257, + "loss": 0.3288, + "step": 6617 + }, + { + "epoch": 0.5361309138042774, + "grad_norm": 0.03501308336853981, + "learning_rate": 0.00019243890364102796, + "loss": 0.3274, + "step": 6618 + }, + { + "epoch": 0.5362119248217757, + "grad_norm": 0.03427104651927948, + "learning_rate": 0.00019243440298843335, + "loss": 0.3607, + "step": 6619 + }, + { + "epoch": 0.5362929358392742, + "grad_norm": 0.028717700392007828, + "learning_rate": 0.0001924299023358387, + "loss": 0.333, + "step": 6620 + }, + { + "epoch": 0.5363739468567725, + "grad_norm": 0.03295782208442688, + "learning_rate": 0.00019242540168324407, + "loss": 0.3858, + "step": 6621 + }, + { + "epoch": 0.5364549578742709, + "grad_norm": 0.035240571945905685, + "learning_rate": 0.00019242090103064946, + "loss": 0.3862, + "step": 6622 + }, + { + "epoch": 0.5365359688917692, + "grad_norm": 0.032001230865716934, 
+ "learning_rate": 0.00019241640037805482, + "loss": 0.3435, + "step": 6623 + }, + { + "epoch": 0.5366169799092677, + "grad_norm": 0.031990185379981995, + "learning_rate": 0.0001924118997254602, + "loss": 0.3606, + "step": 6624 + }, + { + "epoch": 0.5366979909267661, + "grad_norm": 0.03834180533885956, + "learning_rate": 0.0001924073990728656, + "loss": 0.355, + "step": 6625 + }, + { + "epoch": 0.5367790019442644, + "grad_norm": 0.030459698289632797, + "learning_rate": 0.00019240289842027095, + "loss": 0.3294, + "step": 6626 + }, + { + "epoch": 0.5368600129617628, + "grad_norm": 0.034062933176755905, + "learning_rate": 0.0001923983977676763, + "loss": 0.4105, + "step": 6627 + }, + { + "epoch": 0.5369410239792611, + "grad_norm": 0.02938479371368885, + "learning_rate": 0.0001923938971150817, + "loss": 0.3429, + "step": 6628 + }, + { + "epoch": 0.5370220349967596, + "grad_norm": 0.03130931779742241, + "learning_rate": 0.00019238939646248706, + "loss": 0.3225, + "step": 6629 + }, + { + "epoch": 0.537103046014258, + "grad_norm": 0.03414647653698921, + "learning_rate": 0.00019238489580989244, + "loss": 0.3901, + "step": 6630 + }, + { + "epoch": 0.5371840570317563, + "grad_norm": 0.029339836910367012, + "learning_rate": 0.00019238039515729783, + "loss": 0.3331, + "step": 6631 + }, + { + "epoch": 0.5372650680492547, + "grad_norm": 0.028171516954898834, + "learning_rate": 0.0001923758945047032, + "loss": 0.308, + "step": 6632 + }, + { + "epoch": 0.537346079066753, + "grad_norm": 0.03288857266306877, + "learning_rate": 0.00019237139385210855, + "loss": 0.3761, + "step": 6633 + }, + { + "epoch": 0.5374270900842515, + "grad_norm": 0.028602857142686844, + "learning_rate": 0.00019236689319951394, + "loss": 0.3238, + "step": 6634 + }, + { + "epoch": 0.5375081011017498, + "grad_norm": 0.02909966930747032, + "learning_rate": 0.00019236239254691933, + "loss": 0.3295, + "step": 6635 + }, + { + "epoch": 0.5375891121192482, + "grad_norm": 0.03496485948562622, + "learning_rate": 0.00019235789189432469, + "loss": 0.3576, + "step": 6636 + }, + { + "epoch": 0.5376701231367466, + "grad_norm": 0.03171485289931297, + "learning_rate": 0.00019235339124173007, + "loss": 0.346, + "step": 6637 + }, + { + "epoch": 0.537751134154245, + "grad_norm": 0.028812715783715248, + "learning_rate": 0.00019234889058913543, + "loss": 0.2998, + "step": 6638 + }, + { + "epoch": 0.5378321451717434, + "grad_norm": 0.030826063826680183, + "learning_rate": 0.0001923443899365408, + "loss": 0.3188, + "step": 6639 + }, + { + "epoch": 0.5379131561892417, + "grad_norm": 0.03311021998524666, + "learning_rate": 0.00019233988928394618, + "loss": 0.3608, + "step": 6640 + }, + { + "epoch": 0.5379941672067401, + "grad_norm": 0.030214812606573105, + "learning_rate": 0.00019233538863135157, + "loss": 0.373, + "step": 6641 + }, + { + "epoch": 0.5380751782242384, + "grad_norm": 0.030127253383398056, + "learning_rate": 0.00019233088797875693, + "loss": 0.3482, + "step": 6642 + }, + { + "epoch": 0.5381561892417369, + "grad_norm": 0.030809765681624413, + "learning_rate": 0.00019232638732616231, + "loss": 0.336, + "step": 6643 + }, + { + "epoch": 0.5382372002592353, + "grad_norm": 0.02976158633828163, + "learning_rate": 0.00019232188667356767, + "loss": 0.3253, + "step": 6644 + }, + { + "epoch": 0.5383182112767336, + "grad_norm": 0.03155311197042465, + "learning_rate": 0.00019231738602097304, + "loss": 0.3899, + "step": 6645 + }, + { + "epoch": 0.538399222294232, + "grad_norm": 0.02847321890294552, + "learning_rate": 0.00019231288536837842, + "loss": 0.3457, + 
"step": 6646 + }, + { + "epoch": 0.5384802333117304, + "grad_norm": 0.025865985080599785, + "learning_rate": 0.0001923083847157838, + "loss": 0.3404, + "step": 6647 + }, + { + "epoch": 0.5385612443292288, + "grad_norm": 0.03417167067527771, + "learning_rate": 0.00019230388406318917, + "loss": 0.3644, + "step": 6648 + }, + { + "epoch": 0.5386422553467272, + "grad_norm": 0.03334147855639458, + "learning_rate": 0.00019229938341059456, + "loss": 0.3967, + "step": 6649 + }, + { + "epoch": 0.5387232663642255, + "grad_norm": 0.03236551582813263, + "learning_rate": 0.00019229488275799992, + "loss": 0.3303, + "step": 6650 + }, + { + "epoch": 0.5388042773817239, + "grad_norm": 0.030227910727262497, + "learning_rate": 0.00019229038210540528, + "loss": 0.3705, + "step": 6651 + }, + { + "epoch": 0.5388852883992223, + "grad_norm": 0.03478400781750679, + "learning_rate": 0.00019228588145281066, + "loss": 0.3776, + "step": 6652 + }, + { + "epoch": 0.5389662994167207, + "grad_norm": 0.03594156354665756, + "learning_rate": 0.00019228138080021605, + "loss": 0.3548, + "step": 6653 + }, + { + "epoch": 0.539047310434219, + "grad_norm": 0.02861880138516426, + "learning_rate": 0.0001922768801476214, + "loss": 0.3769, + "step": 6654 + }, + { + "epoch": 0.5391283214517174, + "grad_norm": 0.047353651374578476, + "learning_rate": 0.0001922723794950268, + "loss": 0.3352, + "step": 6655 + }, + { + "epoch": 0.5392093324692158, + "grad_norm": 0.030897874385118484, + "learning_rate": 0.00019226787884243216, + "loss": 0.3666, + "step": 6656 + }, + { + "epoch": 0.5392903434867142, + "grad_norm": 0.031044932082295418, + "learning_rate": 0.00019226337818983752, + "loss": 0.3569, + "step": 6657 + }, + { + "epoch": 0.5393713545042126, + "grad_norm": 0.02874862030148506, + "learning_rate": 0.0001922588775372429, + "loss": 0.302, + "step": 6658 + }, + { + "epoch": 0.5394523655217109, + "grad_norm": 0.032438814640045166, + "learning_rate": 0.0001922543768846483, + "loss": 0.3438, + "step": 6659 + }, + { + "epoch": 0.5395333765392093, + "grad_norm": 0.03133894130587578, + "learning_rate": 0.00019224987623205365, + "loss": 0.3543, + "step": 6660 + }, + { + "epoch": 0.5396143875567078, + "grad_norm": 0.029364844784140587, + "learning_rate": 0.00019224537557945904, + "loss": 0.351, + "step": 6661 + }, + { + "epoch": 0.5396953985742061, + "grad_norm": 0.024655230343341827, + "learning_rate": 0.0001922408749268644, + "loss": 0.2798, + "step": 6662 + }, + { + "epoch": 0.5397764095917045, + "grad_norm": 0.03577417880296707, + "learning_rate": 0.00019223637427426976, + "loss": 0.3507, + "step": 6663 + }, + { + "epoch": 0.5398574206092028, + "grad_norm": 0.028531793504953384, + "learning_rate": 0.00019223187362167517, + "loss": 0.3693, + "step": 6664 + }, + { + "epoch": 0.5399384316267012, + "grad_norm": 0.027541454881429672, + "learning_rate": 0.00019222737296908053, + "loss": 0.3099, + "step": 6665 + }, + { + "epoch": 0.5400194426441997, + "grad_norm": 0.02836167998611927, + "learning_rate": 0.0001922228723164859, + "loss": 0.3692, + "step": 6666 + }, + { + "epoch": 0.540100453661698, + "grad_norm": 0.030694272369146347, + "learning_rate": 0.00019221837166389128, + "loss": 0.3601, + "step": 6667 + }, + { + "epoch": 0.5401814646791964, + "grad_norm": 0.028306683525443077, + "learning_rate": 0.00019221387101129664, + "loss": 0.3149, + "step": 6668 + }, + { + "epoch": 0.5402624756966947, + "grad_norm": 0.0326666533946991, + "learning_rate": 0.000192209370358702, + "loss": 0.3668, + "step": 6669 + }, + { + "epoch": 0.5403434867141931, + 
"grad_norm": 0.02991878241300583, + "learning_rate": 0.00019220486970610742, + "loss": 0.306, + "step": 6670 + }, + { + "epoch": 0.5404244977316915, + "grad_norm": 0.03173675015568733, + "learning_rate": 0.00019220036905351278, + "loss": 0.3454, + "step": 6671 + }, + { + "epoch": 0.5405055087491899, + "grad_norm": 0.030415697023272514, + "learning_rate": 0.00019219586840091814, + "loss": 0.3276, + "step": 6672 + }, + { + "epoch": 0.5405865197666883, + "grad_norm": 0.0356595441699028, + "learning_rate": 0.00019219136774832352, + "loss": 0.3416, + "step": 6673 + }, + { + "epoch": 0.5406675307841866, + "grad_norm": 0.03170002996921539, + "learning_rate": 0.00019218686709572888, + "loss": 0.308, + "step": 6674 + }, + { + "epoch": 0.5407485418016851, + "grad_norm": 0.03190978989005089, + "learning_rate": 0.00019218236644313424, + "loss": 0.3746, + "step": 6675 + }, + { + "epoch": 0.5408295528191834, + "grad_norm": 0.029476001858711243, + "learning_rate": 0.00019217786579053966, + "loss": 0.3553, + "step": 6676 + }, + { + "epoch": 0.5409105638366818, + "grad_norm": 0.02875671535730362, + "learning_rate": 0.00019217336513794502, + "loss": 0.333, + "step": 6677 + }, + { + "epoch": 0.5409915748541801, + "grad_norm": 0.03290131688117981, + "learning_rate": 0.00019216886448535038, + "loss": 0.3558, + "step": 6678 + }, + { + "epoch": 0.5410725858716785, + "grad_norm": 0.0369555726647377, + "learning_rate": 0.00019216436383275576, + "loss": 0.4028, + "step": 6679 + }, + { + "epoch": 0.541153596889177, + "grad_norm": 0.03486994653940201, + "learning_rate": 0.00019215986318016112, + "loss": 0.346, + "step": 6680 + }, + { + "epoch": 0.5412346079066753, + "grad_norm": 0.030410174280405045, + "learning_rate": 0.00019215536252756648, + "loss": 0.3199, + "step": 6681 + }, + { + "epoch": 0.5413156189241737, + "grad_norm": 0.031043974682688713, + "learning_rate": 0.0001921508618749719, + "loss": 0.3585, + "step": 6682 + }, + { + "epoch": 0.541396629941672, + "grad_norm": 0.03518590331077576, + "learning_rate": 0.00019214636122237726, + "loss": 0.3196, + "step": 6683 + }, + { + "epoch": 0.5414776409591704, + "grad_norm": 0.030820732936263084, + "learning_rate": 0.00019214186056978262, + "loss": 0.338, + "step": 6684 + }, + { + "epoch": 0.5415586519766689, + "grad_norm": 0.02911275252699852, + "learning_rate": 0.000192137359917188, + "loss": 0.3443, + "step": 6685 + }, + { + "epoch": 0.5416396629941672, + "grad_norm": 0.02568489871919155, + "learning_rate": 0.00019213285926459337, + "loss": 0.2971, + "step": 6686 + }, + { + "epoch": 0.5417206740116656, + "grad_norm": 0.031865790486335754, + "learning_rate": 0.00019212835861199875, + "loss": 0.3521, + "step": 6687 + }, + { + "epoch": 0.5418016850291639, + "grad_norm": 0.02974405325949192, + "learning_rate": 0.00019212385795940414, + "loss": 0.3538, + "step": 6688 + }, + { + "epoch": 0.5418826960466624, + "grad_norm": 0.034740883857011795, + "learning_rate": 0.0001921193573068095, + "loss": 0.3696, + "step": 6689 + }, + { + "epoch": 0.5419637070641607, + "grad_norm": 0.03494054079055786, + "learning_rate": 0.00019211485665421486, + "loss": 0.3256, + "step": 6690 + }, + { + "epoch": 0.5420447180816591, + "grad_norm": 0.031966958194971085, + "learning_rate": 0.00019211035600162025, + "loss": 0.3667, + "step": 6691 + }, + { + "epoch": 0.5421257290991575, + "grad_norm": 0.029948875308036804, + "learning_rate": 0.0001921058553490256, + "loss": 0.3557, + "step": 6692 + }, + { + "epoch": 0.5422067401166558, + "grad_norm": 0.02980083040893078, + "learning_rate": 
0.000192101354696431, + "loss": 0.3299, + "step": 6693 + }, + { + "epoch": 0.5422877511341543, + "grad_norm": 0.029759561643004417, + "learning_rate": 0.00019209685404383638, + "loss": 0.3587, + "step": 6694 + }, + { + "epoch": 0.5423687621516526, + "grad_norm": 0.031510476022958755, + "learning_rate": 0.00019209235339124174, + "loss": 0.3741, + "step": 6695 + }, + { + "epoch": 0.542449773169151, + "grad_norm": 0.03240029513835907, + "learning_rate": 0.0001920878527386471, + "loss": 0.3901, + "step": 6696 + }, + { + "epoch": 0.5425307841866494, + "grad_norm": 0.03050781786441803, + "learning_rate": 0.0001920833520860525, + "loss": 0.3457, + "step": 6697 + }, + { + "epoch": 0.5426117952041478, + "grad_norm": 0.030108438804745674, + "learning_rate": 0.00019207885143345785, + "loss": 0.3498, + "step": 6698 + }, + { + "epoch": 0.5426928062216462, + "grad_norm": 0.0339641310274601, + "learning_rate": 0.00019207435078086324, + "loss": 0.3551, + "step": 6699 + }, + { + "epoch": 0.5427738172391445, + "grad_norm": 0.030506672337651253, + "learning_rate": 0.00019206985012826862, + "loss": 0.343, + "step": 6700 + }, + { + "epoch": 0.5428548282566429, + "grad_norm": 0.03556285798549652, + "learning_rate": 0.00019206534947567398, + "loss": 0.3685, + "step": 6701 + }, + { + "epoch": 0.5429358392741412, + "grad_norm": 0.033519964665174484, + "learning_rate": 0.00019206084882307934, + "loss": 0.3473, + "step": 6702 + }, + { + "epoch": 0.5430168502916397, + "grad_norm": 0.030523715540766716, + "learning_rate": 0.00019205634817048473, + "loss": 0.3449, + "step": 6703 + }, + { + "epoch": 0.5430978613091381, + "grad_norm": 0.03125949949026108, + "learning_rate": 0.0001920518475178901, + "loss": 0.3239, + "step": 6704 + }, + { + "epoch": 0.5431788723266364, + "grad_norm": 0.03124059922993183, + "learning_rate": 0.00019204734686529548, + "loss": 0.3408, + "step": 6705 + }, + { + "epoch": 0.5432598833441348, + "grad_norm": 0.029732635244727135, + "learning_rate": 0.00019204284621270087, + "loss": 0.3434, + "step": 6706 + }, + { + "epoch": 0.5433408943616331, + "grad_norm": 0.03286097198724747, + "learning_rate": 0.00019203834556010623, + "loss": 0.3496, + "step": 6707 + }, + { + "epoch": 0.5434219053791316, + "grad_norm": 0.03427884355187416, + "learning_rate": 0.00019203384490751159, + "loss": 0.3767, + "step": 6708 + }, + { + "epoch": 0.54350291639663, + "grad_norm": 0.02995665930211544, + "learning_rate": 0.00019202934425491697, + "loss": 0.3233, + "step": 6709 + }, + { + "epoch": 0.5435839274141283, + "grad_norm": 0.031802188605070114, + "learning_rate": 0.00019202484360232233, + "loss": 0.2948, + "step": 6710 + }, + { + "epoch": 0.5436649384316267, + "grad_norm": 0.03708088397979736, + "learning_rate": 0.00019202034294972772, + "loss": 0.3585, + "step": 6711 + }, + { + "epoch": 0.5437459494491251, + "grad_norm": 0.03856736421585083, + "learning_rate": 0.0001920158422971331, + "loss": 0.3743, + "step": 6712 + }, + { + "epoch": 0.5438269604666235, + "grad_norm": 0.027673332020640373, + "learning_rate": 0.00019201134164453847, + "loss": 0.3266, + "step": 6713 + }, + { + "epoch": 0.5439079714841218, + "grad_norm": 0.02969091199338436, + "learning_rate": 0.00019200684099194383, + "loss": 0.3063, + "step": 6714 + }, + { + "epoch": 0.5439889825016202, + "grad_norm": 0.03220966458320618, + "learning_rate": 0.00019200234033934921, + "loss": 0.375, + "step": 6715 + }, + { + "epoch": 0.5440699935191186, + "grad_norm": 0.032075174152851105, + "learning_rate": 0.0001919978396867546, + "loss": 0.3759, + "step": 6716 + }, 
+ { + "epoch": 0.544151004536617, + "grad_norm": 0.029770556837320328, + "learning_rate": 0.00019199333903415996, + "loss": 0.3245, + "step": 6717 + }, + { + "epoch": 0.5442320155541154, + "grad_norm": 0.030717231333255768, + "learning_rate": 0.00019198883838156535, + "loss": 0.3231, + "step": 6718 + }, + { + "epoch": 0.5443130265716137, + "grad_norm": 0.037240952253341675, + "learning_rate": 0.0001919843377289707, + "loss": 0.3672, + "step": 6719 + }, + { + "epoch": 0.5443940375891121, + "grad_norm": 0.029282161965966225, + "learning_rate": 0.00019197983707637607, + "loss": 0.3592, + "step": 6720 + }, + { + "epoch": 0.5444750486066104, + "grad_norm": 0.03133406862616539, + "learning_rate": 0.00019197533642378146, + "loss": 0.3544, + "step": 6721 + }, + { + "epoch": 0.5445560596241089, + "grad_norm": 0.03557344898581505, + "learning_rate": 0.00019197083577118684, + "loss": 0.3298, + "step": 6722 + }, + { + "epoch": 0.5446370706416073, + "grad_norm": 0.03574608638882637, + "learning_rate": 0.0001919663351185922, + "loss": 0.3927, + "step": 6723 + }, + { + "epoch": 0.5447180816591056, + "grad_norm": 0.03001011349260807, + "learning_rate": 0.0001919618344659976, + "loss": 0.3612, + "step": 6724 + }, + { + "epoch": 0.544799092676604, + "grad_norm": 0.030254319310188293, + "learning_rate": 0.00019195733381340295, + "loss": 0.3542, + "step": 6725 + }, + { + "epoch": 0.5448801036941024, + "grad_norm": 0.032271239906549454, + "learning_rate": 0.0001919528331608083, + "loss": 0.3552, + "step": 6726 + }, + { + "epoch": 0.5449611147116008, + "grad_norm": 0.03296198323369026, + "learning_rate": 0.0001919483325082137, + "loss": 0.3746, + "step": 6727 + }, + { + "epoch": 0.5450421257290992, + "grad_norm": 0.03257940709590912, + "learning_rate": 0.00019194383185561908, + "loss": 0.3433, + "step": 6728 + }, + { + "epoch": 0.5451231367465975, + "grad_norm": 0.03006012551486492, + "learning_rate": 0.00019193933120302444, + "loss": 0.3734, + "step": 6729 + }, + { + "epoch": 0.5452041477640959, + "grad_norm": 0.032573532313108444, + "learning_rate": 0.00019193483055042983, + "loss": 0.3438, + "step": 6730 + }, + { + "epoch": 0.5452851587815943, + "grad_norm": 0.05190266668796539, + "learning_rate": 0.0001919303298978352, + "loss": 0.3493, + "step": 6731 + }, + { + "epoch": 0.5453661697990927, + "grad_norm": 0.029056159779429436, + "learning_rate": 0.00019192582924524055, + "loss": 0.3305, + "step": 6732 + }, + { + "epoch": 0.545447180816591, + "grad_norm": 0.02668183483183384, + "learning_rate": 0.00019192132859264594, + "loss": 0.3358, + "step": 6733 + }, + { + "epoch": 0.5455281918340894, + "grad_norm": 0.035193026065826416, + "learning_rate": 0.00019191682794005133, + "loss": 0.3606, + "step": 6734 + }, + { + "epoch": 0.5456092028515879, + "grad_norm": 0.031068945303559303, + "learning_rate": 0.0001919123272874567, + "loss": 0.3423, + "step": 6735 + }, + { + "epoch": 0.5456902138690862, + "grad_norm": 0.03662171959877014, + "learning_rate": 0.00019190782663486207, + "loss": 0.4027, + "step": 6736 + }, + { + "epoch": 0.5457712248865846, + "grad_norm": 0.02608422189950943, + "learning_rate": 0.00019190332598226743, + "loss": 0.3066, + "step": 6737 + }, + { + "epoch": 0.5458522359040829, + "grad_norm": 0.02757170982658863, + "learning_rate": 0.0001918988253296728, + "loss": 0.3084, + "step": 6738 + }, + { + "epoch": 0.5459332469215813, + "grad_norm": 0.03273205831646919, + "learning_rate": 0.0001918943246770782, + "loss": 0.3567, + "step": 6739 + }, + { + "epoch": 0.5460142579390798, + "grad_norm": 
0.035114750266075134, + "learning_rate": 0.00019188982402448357, + "loss": 0.3655, + "step": 6740 + }, + { + "epoch": 0.5460952689565781, + "grad_norm": 0.027389245107769966, + "learning_rate": 0.00019188532337188893, + "loss": 0.3515, + "step": 6741 + }, + { + "epoch": 0.5461762799740765, + "grad_norm": 0.03340989723801613, + "learning_rate": 0.00019188082271929432, + "loss": 0.3579, + "step": 6742 + }, + { + "epoch": 0.5462572909915748, + "grad_norm": 0.029114002361893654, + "learning_rate": 0.00019187632206669968, + "loss": 0.3715, + "step": 6743 + }, + { + "epoch": 0.5463383020090732, + "grad_norm": 0.028086459264159203, + "learning_rate": 0.00019187182141410504, + "loss": 0.3447, + "step": 6744 + }, + { + "epoch": 0.5464193130265717, + "grad_norm": 0.03181225061416626, + "learning_rate": 0.00019186732076151045, + "loss": 0.3363, + "step": 6745 + }, + { + "epoch": 0.54650032404407, + "grad_norm": 0.03511791676282883, + "learning_rate": 0.0001918628201089158, + "loss": 0.378, + "step": 6746 + }, + { + "epoch": 0.5465813350615684, + "grad_norm": 0.031299296766519547, + "learning_rate": 0.00019185831945632117, + "loss": 0.3503, + "step": 6747 + }, + { + "epoch": 0.5466623460790667, + "grad_norm": 0.025090057402849197, + "learning_rate": 0.00019185381880372656, + "loss": 0.2959, + "step": 6748 + }, + { + "epoch": 0.5467433570965652, + "grad_norm": 0.030407389625906944, + "learning_rate": 0.00019184931815113192, + "loss": 0.3149, + "step": 6749 + }, + { + "epoch": 0.5468243681140635, + "grad_norm": 0.03297053650021553, + "learning_rate": 0.00019184481749853728, + "loss": 0.3589, + "step": 6750 + }, + { + "epoch": 0.5469053791315619, + "grad_norm": 0.02942829765379429, + "learning_rate": 0.0001918403168459427, + "loss": 0.3376, + "step": 6751 + }, + { + "epoch": 0.5469863901490603, + "grad_norm": 0.027623575180768967, + "learning_rate": 0.00019183581619334805, + "loss": 0.3155, + "step": 6752 + }, + { + "epoch": 0.5470674011665586, + "grad_norm": 0.029620088636875153, + "learning_rate": 0.0001918313155407534, + "loss": 0.3602, + "step": 6753 + }, + { + "epoch": 0.5471484121840571, + "grad_norm": 0.035274140536785126, + "learning_rate": 0.0001918268148881588, + "loss": 0.3609, + "step": 6754 + }, + { + "epoch": 0.5472294232015554, + "grad_norm": 0.039904288947582245, + "learning_rate": 0.00019182231423556416, + "loss": 0.4022, + "step": 6755 + }, + { + "epoch": 0.5473104342190538, + "grad_norm": 0.028668256476521492, + "learning_rate": 0.00019181781358296952, + "loss": 0.3206, + "step": 6756 + }, + { + "epoch": 0.5473914452365521, + "grad_norm": 0.030775291845202446, + "learning_rate": 0.00019181331293037493, + "loss": 0.3432, + "step": 6757 + }, + { + "epoch": 0.5474724562540505, + "grad_norm": 0.03100411407649517, + "learning_rate": 0.0001918088122777803, + "loss": 0.3562, + "step": 6758 + }, + { + "epoch": 0.547553467271549, + "grad_norm": 0.03083074279129505, + "learning_rate": 0.00019180431162518565, + "loss": 0.3479, + "step": 6759 + }, + { + "epoch": 0.5476344782890473, + "grad_norm": 0.029746398329734802, + "learning_rate": 0.00019179981097259104, + "loss": 0.3299, + "step": 6760 + }, + { + "epoch": 0.5477154893065457, + "grad_norm": 0.029062218964099884, + "learning_rate": 0.0001917953103199964, + "loss": 0.3753, + "step": 6761 + }, + { + "epoch": 0.547796500324044, + "grad_norm": 0.03341421112418175, + "learning_rate": 0.00019179080966740176, + "loss": 0.3101, + "step": 6762 + }, + { + "epoch": 0.5478775113415425, + "grad_norm": 0.03206118568778038, + "learning_rate": 
0.00019178630901480717, + "loss": 0.3661, + "step": 6763 + }, + { + "epoch": 0.5479585223590409, + "grad_norm": 0.029552146792411804, + "learning_rate": 0.00019178180836221253, + "loss": 0.3376, + "step": 6764 + }, + { + "epoch": 0.5480395333765392, + "grad_norm": 0.028868060559034348, + "learning_rate": 0.0001917773077096179, + "loss": 0.3225, + "step": 6765 + }, + { + "epoch": 0.5481205443940376, + "grad_norm": 0.02880750596523285, + "learning_rate": 0.00019177280705702328, + "loss": 0.3137, + "step": 6766 + }, + { + "epoch": 0.5482015554115359, + "grad_norm": 0.03132180869579315, + "learning_rate": 0.00019176830640442864, + "loss": 0.3225, + "step": 6767 + }, + { + "epoch": 0.5482825664290344, + "grad_norm": 0.030915534123778343, + "learning_rate": 0.00019176380575183403, + "loss": 0.337, + "step": 6768 + }, + { + "epoch": 0.5483635774465327, + "grad_norm": 0.02946346439421177, + "learning_rate": 0.00019175930509923942, + "loss": 0.3765, + "step": 6769 + }, + { + "epoch": 0.5484445884640311, + "grad_norm": 0.029647020623087883, + "learning_rate": 0.00019175480444664478, + "loss": 0.3635, + "step": 6770 + }, + { + "epoch": 0.5485255994815295, + "grad_norm": 0.03362290561199188, + "learning_rate": 0.00019175030379405014, + "loss": 0.3469, + "step": 6771 + }, + { + "epoch": 0.5486066104990278, + "grad_norm": 0.034770358353853226, + "learning_rate": 0.00019174580314145552, + "loss": 0.3524, + "step": 6772 + }, + { + "epoch": 0.5486876215165263, + "grad_norm": 0.04310256242752075, + "learning_rate": 0.00019174130248886088, + "loss": 0.3756, + "step": 6773 + }, + { + "epoch": 0.5487686325340246, + "grad_norm": 0.03369515761733055, + "learning_rate": 0.00019173680183626627, + "loss": 0.382, + "step": 6774 + }, + { + "epoch": 0.548849643551523, + "grad_norm": 0.032771673053503036, + "learning_rate": 0.00019173230118367166, + "loss": 0.3968, + "step": 6775 + }, + { + "epoch": 0.5489306545690213, + "grad_norm": 0.03280429169535637, + "learning_rate": 0.00019172780053107702, + "loss": 0.3491, + "step": 6776 + }, + { + "epoch": 0.5490116655865198, + "grad_norm": 0.031371165066957474, + "learning_rate": 0.00019172329987848238, + "loss": 0.3224, + "step": 6777 + }, + { + "epoch": 0.5490926766040182, + "grad_norm": 0.029993250966072083, + "learning_rate": 0.00019171879922588776, + "loss": 0.3669, + "step": 6778 + }, + { + "epoch": 0.5491736876215165, + "grad_norm": 0.03484036773443222, + "learning_rate": 0.00019171429857329312, + "loss": 0.3361, + "step": 6779 + }, + { + "epoch": 0.5492546986390149, + "grad_norm": 0.03162126988172531, + "learning_rate": 0.0001917097979206985, + "loss": 0.3849, + "step": 6780 + }, + { + "epoch": 0.5493357096565132, + "grad_norm": 0.035372983664274216, + "learning_rate": 0.0001917052972681039, + "loss": 0.3978, + "step": 6781 + }, + { + "epoch": 0.5494167206740117, + "grad_norm": 0.032826296985149384, + "learning_rate": 0.00019170079661550926, + "loss": 0.3882, + "step": 6782 + }, + { + "epoch": 0.5494977316915101, + "grad_norm": 0.027257384732365608, + "learning_rate": 0.00019169629596291462, + "loss": 0.3338, + "step": 6783 + }, + { + "epoch": 0.5495787427090084, + "grad_norm": 0.0283949077129364, + "learning_rate": 0.00019169179531032, + "loss": 0.2983, + "step": 6784 + }, + { + "epoch": 0.5496597537265068, + "grad_norm": 0.03267160430550575, + "learning_rate": 0.00019168729465772537, + "loss": 0.3423, + "step": 6785 + }, + { + "epoch": 0.5497407647440052, + "grad_norm": 0.03289078548550606, + "learning_rate": 0.00019168279400513075, + "loss": 0.311, + "step": 6786 + 
}, + { + "epoch": 0.5498217757615036, + "grad_norm": 0.03308109566569328, + "learning_rate": 0.00019167829335253614, + "loss": 0.3191, + "step": 6787 + }, + { + "epoch": 0.549902786779002, + "grad_norm": 0.034293193370103836, + "learning_rate": 0.0001916737926999415, + "loss": 0.3856, + "step": 6788 + }, + { + "epoch": 0.5499837977965003, + "grad_norm": 0.03392151743173599, + "learning_rate": 0.00019166929204734686, + "loss": 0.3347, + "step": 6789 + }, + { + "epoch": 0.5500648088139987, + "grad_norm": 0.031559936702251434, + "learning_rate": 0.00019166479139475225, + "loss": 0.3762, + "step": 6790 + }, + { + "epoch": 0.5501458198314971, + "grad_norm": 0.03242883458733559, + "learning_rate": 0.0001916602907421576, + "loss": 0.3907, + "step": 6791 + }, + { + "epoch": 0.5502268308489955, + "grad_norm": 0.02734988182783127, + "learning_rate": 0.000191655790089563, + "loss": 0.3148, + "step": 6792 + }, + { + "epoch": 0.5503078418664938, + "grad_norm": 0.032763585448265076, + "learning_rate": 0.00019165128943696838, + "loss": 0.4039, + "step": 6793 + }, + { + "epoch": 0.5503888528839922, + "grad_norm": 0.03262895345687866, + "learning_rate": 0.00019164678878437374, + "loss": 0.3682, + "step": 6794 + }, + { + "epoch": 0.5504698639014906, + "grad_norm": 0.03161659091711044, + "learning_rate": 0.0001916422881317791, + "loss": 0.3162, + "step": 6795 + }, + { + "epoch": 0.550550874918989, + "grad_norm": 0.03022010624408722, + "learning_rate": 0.0001916377874791845, + "loss": 0.3902, + "step": 6796 + }, + { + "epoch": 0.5506318859364874, + "grad_norm": 0.03609905764460564, + "learning_rate": 0.00019163328682658988, + "loss": 0.3424, + "step": 6797 + }, + { + "epoch": 0.5507128969539857, + "grad_norm": 0.03280745819211006, + "learning_rate": 0.00019162878617399524, + "loss": 0.4093, + "step": 6798 + }, + { + "epoch": 0.5507939079714841, + "grad_norm": 0.03054327517747879, + "learning_rate": 0.00019162428552140062, + "loss": 0.3166, + "step": 6799 + }, + { + "epoch": 0.5508749189889826, + "grad_norm": 0.030223997309803963, + "learning_rate": 0.00019161978486880598, + "loss": 0.3185, + "step": 6800 + }, + { + "epoch": 0.5509559300064809, + "grad_norm": 0.029921215027570724, + "learning_rate": 0.00019161528421621134, + "loss": 0.3583, + "step": 6801 + }, + { + "epoch": 0.5510369410239793, + "grad_norm": 0.030760707333683968, + "learning_rate": 0.00019161078356361673, + "loss": 0.3484, + "step": 6802 + }, + { + "epoch": 0.5511179520414776, + "grad_norm": 0.02772986702620983, + "learning_rate": 0.00019160628291102212, + "loss": 0.3056, + "step": 6803 + }, + { + "epoch": 0.551198963058976, + "grad_norm": 0.032765943557024, + "learning_rate": 0.00019160178225842748, + "loss": 0.3425, + "step": 6804 + }, + { + "epoch": 0.5512799740764744, + "grad_norm": 0.028970535844564438, + "learning_rate": 0.00019159728160583287, + "loss": 0.3272, + "step": 6805 + }, + { + "epoch": 0.5513609850939728, + "grad_norm": 0.03425658121705055, + "learning_rate": 0.00019159278095323823, + "loss": 0.3468, + "step": 6806 + }, + { + "epoch": 0.5514419961114712, + "grad_norm": 0.03469391167163849, + "learning_rate": 0.00019158828030064359, + "loss": 0.4086, + "step": 6807 + }, + { + "epoch": 0.5515230071289695, + "grad_norm": 0.031067317351698875, + "learning_rate": 0.00019158377964804897, + "loss": 0.3424, + "step": 6808 + }, + { + "epoch": 0.5516040181464679, + "grad_norm": 0.03129751980304718, + "learning_rate": 0.00019157927899545436, + "loss": 0.3564, + "step": 6809 + }, + { + "epoch": 0.5516850291639663, + "grad_norm": 
0.02611689455807209, + "learning_rate": 0.00019157477834285972, + "loss": 0.3438, + "step": 6810 + }, + { + "epoch": 0.5517660401814647, + "grad_norm": 0.030698176473379135, + "learning_rate": 0.0001915702776902651, + "loss": 0.3252, + "step": 6811 + }, + { + "epoch": 0.551847051198963, + "grad_norm": 0.029681215062737465, + "learning_rate": 0.00019156577703767047, + "loss": 0.3413, + "step": 6812 + }, + { + "epoch": 0.5519280622164614, + "grad_norm": 0.03175154700875282, + "learning_rate": 0.00019156127638507583, + "loss": 0.3418, + "step": 6813 + }, + { + "epoch": 0.5520090732339599, + "grad_norm": 0.03310735896229744, + "learning_rate": 0.00019155677573248121, + "loss": 0.3601, + "step": 6814 + }, + { + "epoch": 0.5520900842514582, + "grad_norm": 0.02929147146642208, + "learning_rate": 0.0001915522750798866, + "loss": 0.3651, + "step": 6815 + }, + { + "epoch": 0.5521710952689566, + "grad_norm": 0.03329383581876755, + "learning_rate": 0.00019154777442729196, + "loss": 0.3508, + "step": 6816 + }, + { + "epoch": 0.5522521062864549, + "grad_norm": 0.027802016586065292, + "learning_rate": 0.00019154327377469735, + "loss": 0.3346, + "step": 6817 + }, + { + "epoch": 0.5523331173039533, + "grad_norm": 0.03057532198727131, + "learning_rate": 0.0001915387731221027, + "loss": 0.3249, + "step": 6818 + }, + { + "epoch": 0.5524141283214518, + "grad_norm": 0.030911043286323547, + "learning_rate": 0.00019153427246950807, + "loss": 0.3036, + "step": 6819 + }, + { + "epoch": 0.5524951393389501, + "grad_norm": 0.028215307742357254, + "learning_rate": 0.00019152977181691348, + "loss": 0.3631, + "step": 6820 + }, + { + "epoch": 0.5525761503564485, + "grad_norm": 0.03021448478102684, + "learning_rate": 0.00019152527116431884, + "loss": 0.3646, + "step": 6821 + }, + { + "epoch": 0.5526571613739468, + "grad_norm": 0.03648442029953003, + "learning_rate": 0.0001915207705117242, + "loss": 0.3783, + "step": 6822 + }, + { + "epoch": 0.5527381723914452, + "grad_norm": 0.030310718342661858, + "learning_rate": 0.0001915162698591296, + "loss": 0.3367, + "step": 6823 + }, + { + "epoch": 0.5528191834089436, + "grad_norm": 0.030238375067710876, + "learning_rate": 0.00019151176920653495, + "loss": 0.3696, + "step": 6824 + }, + { + "epoch": 0.552900194426442, + "grad_norm": 0.03566858544945717, + "learning_rate": 0.0001915072685539403, + "loss": 0.4048, + "step": 6825 + }, + { + "epoch": 0.5529812054439404, + "grad_norm": 0.030715545639395714, + "learning_rate": 0.00019150276790134573, + "loss": 0.354, + "step": 6826 + }, + { + "epoch": 0.5530622164614387, + "grad_norm": 0.028971727937459946, + "learning_rate": 0.00019149826724875109, + "loss": 0.319, + "step": 6827 + }, + { + "epoch": 0.5531432274789372, + "grad_norm": 0.028171943500638008, + "learning_rate": 0.00019149376659615645, + "loss": 0.3677, + "step": 6828 + }, + { + "epoch": 0.5532242384964355, + "grad_norm": 0.03259630873799324, + "learning_rate": 0.00019148926594356183, + "loss": 0.3665, + "step": 6829 + }, + { + "epoch": 0.5533052495139339, + "grad_norm": 0.034019727259874344, + "learning_rate": 0.0001914847652909672, + "loss": 0.362, + "step": 6830 + }, + { + "epoch": 0.5533862605314323, + "grad_norm": 0.027019208297133446, + "learning_rate": 0.00019148026463837255, + "loss": 0.3097, + "step": 6831 + }, + { + "epoch": 0.5534672715489306, + "grad_norm": 0.029250169172883034, + "learning_rate": 0.00019147576398577797, + "loss": 0.3312, + "step": 6832 + }, + { + "epoch": 0.5535482825664291, + "grad_norm": 0.03057018667459488, + "learning_rate": 
0.00019147126333318333, + "loss": 0.3799, + "step": 6833 + }, + { + "epoch": 0.5536292935839274, + "grad_norm": 0.03147272393107414, + "learning_rate": 0.0001914667626805887, + "loss": 0.3241, + "step": 6834 + }, + { + "epoch": 0.5537103046014258, + "grad_norm": 0.029593253508210182, + "learning_rate": 0.00019146226202799407, + "loss": 0.3662, + "step": 6835 + }, + { + "epoch": 0.5537913156189241, + "grad_norm": 0.03245329111814499, + "learning_rate": 0.00019145776137539943, + "loss": 0.3119, + "step": 6836 + }, + { + "epoch": 0.5538723266364226, + "grad_norm": 0.03435903042554855, + "learning_rate": 0.0001914532607228048, + "loss": 0.3854, + "step": 6837 + }, + { + "epoch": 0.553953337653921, + "grad_norm": 0.036790501326322556, + "learning_rate": 0.0001914487600702102, + "loss": 0.4024, + "step": 6838 + }, + { + "epoch": 0.5540343486714193, + "grad_norm": 0.02854119800031185, + "learning_rate": 0.00019144425941761557, + "loss": 0.3576, + "step": 6839 + }, + { + "epoch": 0.5541153596889177, + "grad_norm": 0.032489001750946045, + "learning_rate": 0.00019143975876502093, + "loss": 0.3809, + "step": 6840 + }, + { + "epoch": 0.554196370706416, + "grad_norm": 0.03234725818037987, + "learning_rate": 0.00019143525811242632, + "loss": 0.3146, + "step": 6841 + }, + { + "epoch": 0.5542773817239145, + "grad_norm": 0.031654514372348785, + "learning_rate": 0.00019143075745983168, + "loss": 0.3793, + "step": 6842 + }, + { + "epoch": 0.5543583927414129, + "grad_norm": 0.02953651174902916, + "learning_rate": 0.00019142625680723704, + "loss": 0.3314, + "step": 6843 + }, + { + "epoch": 0.5544394037589112, + "grad_norm": 0.03533756360411644, + "learning_rate": 0.00019142175615464245, + "loss": 0.3854, + "step": 6844 + }, + { + "epoch": 0.5545204147764096, + "grad_norm": 0.029549822211265564, + "learning_rate": 0.0001914172555020478, + "loss": 0.3541, + "step": 6845 + }, + { + "epoch": 0.5546014257939079, + "grad_norm": 0.0319569893181324, + "learning_rate": 0.00019141275484945317, + "loss": 0.3656, + "step": 6846 + }, + { + "epoch": 0.5546824368114064, + "grad_norm": 0.026602884754538536, + "learning_rate": 0.00019140825419685856, + "loss": 0.3507, + "step": 6847 + }, + { + "epoch": 0.5547634478289047, + "grad_norm": 0.03057567961513996, + "learning_rate": 0.00019140375354426392, + "loss": 0.3551, + "step": 6848 + }, + { + "epoch": 0.5548444588464031, + "grad_norm": 0.029442768543958664, + "learning_rate": 0.0001913992528916693, + "loss": 0.3521, + "step": 6849 + }, + { + "epoch": 0.5549254698639015, + "grad_norm": 0.0324246883392334, + "learning_rate": 0.0001913947522390747, + "loss": 0.3345, + "step": 6850 + }, + { + "epoch": 0.5550064808813999, + "grad_norm": 0.03182468190789223, + "learning_rate": 0.00019139025158648005, + "loss": 0.3794, + "step": 6851 + }, + { + "epoch": 0.5550874918988983, + "grad_norm": 0.033127471804618835, + "learning_rate": 0.0001913857509338854, + "loss": 0.3408, + "step": 6852 + }, + { + "epoch": 0.5551685029163966, + "grad_norm": 0.027491910383105278, + "learning_rate": 0.0001913812502812908, + "loss": 0.336, + "step": 6853 + }, + { + "epoch": 0.555249513933895, + "grad_norm": 0.031573060899972916, + "learning_rate": 0.00019137674962869616, + "loss": 0.3205, + "step": 6854 + }, + { + "epoch": 0.5553305249513933, + "grad_norm": 0.02980121038854122, + "learning_rate": 0.00019137224897610155, + "loss": 0.3579, + "step": 6855 + }, + { + "epoch": 0.5554115359688918, + "grad_norm": 0.0305420383810997, + "learning_rate": 0.00019136774832350693, + "loss": 0.3647, + "step": 6856 + }, + 
{ + "epoch": 0.5554925469863902, + "grad_norm": 0.02846667729318142, + "learning_rate": 0.0001913632476709123, + "loss": 0.3339, + "step": 6857 + }, + { + "epoch": 0.5555735580038885, + "grad_norm": 0.03440249711275101, + "learning_rate": 0.00019135874701831765, + "loss": 0.3948, + "step": 6858 + }, + { + "epoch": 0.5556545690213869, + "grad_norm": 0.030161675065755844, + "learning_rate": 0.00019135424636572304, + "loss": 0.3686, + "step": 6859 + }, + { + "epoch": 0.5557355800388852, + "grad_norm": 0.030002431944012642, + "learning_rate": 0.0001913497457131284, + "loss": 0.3303, + "step": 6860 + }, + { + "epoch": 0.5558165910563837, + "grad_norm": 0.029472051188349724, + "learning_rate": 0.0001913452450605338, + "loss": 0.3622, + "step": 6861 + }, + { + "epoch": 0.5558976020738821, + "grad_norm": 0.02902107685804367, + "learning_rate": 0.00019134074440793917, + "loss": 0.3077, + "step": 6862 + }, + { + "epoch": 0.5559786130913804, + "grad_norm": 0.031501494348049164, + "learning_rate": 0.00019133624375534453, + "loss": 0.3397, + "step": 6863 + }, + { + "epoch": 0.5560596241088788, + "grad_norm": 0.0340961217880249, + "learning_rate": 0.0001913317431027499, + "loss": 0.3326, + "step": 6864 + }, + { + "epoch": 0.5561406351263772, + "grad_norm": 0.032292790710926056, + "learning_rate": 0.00019132724245015528, + "loss": 0.3664, + "step": 6865 + }, + { + "epoch": 0.5562216461438756, + "grad_norm": 0.03301149979233742, + "learning_rate": 0.00019132274179756064, + "loss": 0.3626, + "step": 6866 + }, + { + "epoch": 0.556302657161374, + "grad_norm": 0.02956380322575569, + "learning_rate": 0.00019131824114496603, + "loss": 0.335, + "step": 6867 + }, + { + "epoch": 0.5563836681788723, + "grad_norm": 0.03247414156794548, + "learning_rate": 0.00019131374049237142, + "loss": 0.4058, + "step": 6868 + }, + { + "epoch": 0.5564646791963707, + "grad_norm": 0.028093554079532623, + "learning_rate": 0.00019130923983977678, + "loss": 0.3335, + "step": 6869 + }, + { + "epoch": 0.5565456902138691, + "grad_norm": 0.02879849076271057, + "learning_rate": 0.00019130473918718214, + "loss": 0.3434, + "step": 6870 + }, + { + "epoch": 0.5566267012313675, + "grad_norm": 0.02984016016125679, + "learning_rate": 0.00019130023853458752, + "loss": 0.3348, + "step": 6871 + }, + { + "epoch": 0.5567077122488658, + "grad_norm": 0.026220617815852165, + "learning_rate": 0.0001912957378819929, + "loss": 0.2983, + "step": 6872 + }, + { + "epoch": 0.5567887232663642, + "grad_norm": 0.03234969452023506, + "learning_rate": 0.00019129123722939827, + "loss": 0.3404, + "step": 6873 + }, + { + "epoch": 0.5568697342838627, + "grad_norm": 0.030391644686460495, + "learning_rate": 0.00019128673657680366, + "loss": 0.3228, + "step": 6874 + }, + { + "epoch": 0.556950745301361, + "grad_norm": 0.031873755156993866, + "learning_rate": 0.00019128223592420902, + "loss": 0.3734, + "step": 6875 + }, + { + "epoch": 0.5570317563188594, + "grad_norm": 0.02982131578028202, + "learning_rate": 0.00019127773527161438, + "loss": 0.3209, + "step": 6876 + }, + { + "epoch": 0.5571127673363577, + "grad_norm": 0.029721610248088837, + "learning_rate": 0.00019127323461901977, + "loss": 0.3013, + "step": 6877 + }, + { + "epoch": 0.5571937783538561, + "grad_norm": 0.04121321812272072, + "learning_rate": 0.00019126873396642515, + "loss": 0.3376, + "step": 6878 + }, + { + "epoch": 0.5572747893713546, + "grad_norm": 0.032671697437763214, + "learning_rate": 0.0001912642333138305, + "loss": 0.3472, + "step": 6879 + }, + { + "epoch": 0.5573558003888529, + "grad_norm": 
0.03484325855970383, + "learning_rate": 0.0001912597326612359, + "loss": 0.3852, + "step": 6880 + }, + { + "epoch": 0.5574368114063513, + "grad_norm": 0.02832927368581295, + "learning_rate": 0.00019125523200864126, + "loss": 0.3094, + "step": 6881 + }, + { + "epoch": 0.5575178224238496, + "grad_norm": 0.03815742954611778, + "learning_rate": 0.00019125073135604662, + "loss": 0.3853, + "step": 6882 + }, + { + "epoch": 0.557598833441348, + "grad_norm": 0.03191991522908211, + "learning_rate": 0.000191246230703452, + "loss": 0.3514, + "step": 6883 + }, + { + "epoch": 0.5576798444588464, + "grad_norm": 0.03721702843904495, + "learning_rate": 0.0001912417300508574, + "loss": 0.3489, + "step": 6884 + }, + { + "epoch": 0.5577608554763448, + "grad_norm": 0.02670007012784481, + "learning_rate": 0.00019123722939826275, + "loss": 0.282, + "step": 6885 + }, + { + "epoch": 0.5578418664938432, + "grad_norm": 0.03530779480934143, + "learning_rate": 0.00019123272874566814, + "loss": 0.3499, + "step": 6886 + }, + { + "epoch": 0.5579228775113415, + "grad_norm": 0.028200553730130196, + "learning_rate": 0.0001912282280930735, + "loss": 0.3153, + "step": 6887 + }, + { + "epoch": 0.55800388852884, + "grad_norm": 0.034294601529836655, + "learning_rate": 0.00019122372744047886, + "loss": 0.3672, + "step": 6888 + }, + { + "epoch": 0.5580848995463383, + "grad_norm": 0.03616178408265114, + "learning_rate": 0.00019121922678788425, + "loss": 0.425, + "step": 6889 + }, + { + "epoch": 0.5581659105638367, + "grad_norm": 0.03213857114315033, + "learning_rate": 0.00019121472613528964, + "loss": 0.316, + "step": 6890 + }, + { + "epoch": 0.558246921581335, + "grad_norm": 0.031516559422016144, + "learning_rate": 0.000191210225482695, + "loss": 0.3513, + "step": 6891 + }, + { + "epoch": 0.5583279325988334, + "grad_norm": 0.03412635996937752, + "learning_rate": 0.00019120572483010038, + "loss": 0.3695, + "step": 6892 + }, + { + "epoch": 0.5584089436163319, + "grad_norm": 0.032149460166692734, + "learning_rate": 0.00019120122417750574, + "loss": 0.4148, + "step": 6893 + }, + { + "epoch": 0.5584899546338302, + "grad_norm": 0.03549889475107193, + "learning_rate": 0.0001911967235249111, + "loss": 0.3817, + "step": 6894 + }, + { + "epoch": 0.5585709656513286, + "grad_norm": 0.027554932981729507, + "learning_rate": 0.0001911922228723165, + "loss": 0.2947, + "step": 6895 + }, + { + "epoch": 0.5586519766688269, + "grad_norm": 0.040073104202747345, + "learning_rate": 0.00019118772221972188, + "loss": 0.3383, + "step": 6896 + }, + { + "epoch": 0.5587329876863253, + "grad_norm": 0.03099329210817814, + "learning_rate": 0.00019118322156712724, + "loss": 0.2951, + "step": 6897 + }, + { + "epoch": 0.5588139987038238, + "grad_norm": 0.029820909723639488, + "learning_rate": 0.00019117872091453262, + "loss": 0.3435, + "step": 6898 + }, + { + "epoch": 0.5588950097213221, + "grad_norm": 0.030790315940976143, + "learning_rate": 0.00019117422026193798, + "loss": 0.3651, + "step": 6899 + }, + { + "epoch": 0.5589760207388205, + "grad_norm": 0.03345898538827896, + "learning_rate": 0.00019116971960934334, + "loss": 0.3387, + "step": 6900 + }, + { + "epoch": 0.5590570317563188, + "grad_norm": 0.031085064634680748, + "learning_rate": 0.00019116521895674876, + "loss": 0.3167, + "step": 6901 + }, + { + "epoch": 0.5591380427738173, + "grad_norm": 0.03324053809046745, + "learning_rate": 0.00019116071830415412, + "loss": 0.3762, + "step": 6902 + }, + { + "epoch": 0.5592190537913156, + "grad_norm": 0.03265579044818878, + "learning_rate": 0.00019115621765155948, + 
"loss": 0.3682, + "step": 6903 + }, + { + "epoch": 0.559300064808814, + "grad_norm": 0.03241958096623421, + "learning_rate": 0.00019115171699896487, + "loss": 0.3431, + "step": 6904 + }, + { + "epoch": 0.5593810758263124, + "grad_norm": 0.027653567492961884, + "learning_rate": 0.00019114721634637023, + "loss": 0.3514, + "step": 6905 + }, + { + "epoch": 0.5594620868438107, + "grad_norm": 0.03009861335158348, + "learning_rate": 0.00019114271569377559, + "loss": 0.3885, + "step": 6906 + }, + { + "epoch": 0.5595430978613092, + "grad_norm": 0.030470533296465874, + "learning_rate": 0.000191138215041181, + "loss": 0.3478, + "step": 6907 + }, + { + "epoch": 0.5596241088788075, + "grad_norm": 0.03243311867117882, + "learning_rate": 0.00019113371438858636, + "loss": 0.3719, + "step": 6908 + }, + { + "epoch": 0.5597051198963059, + "grad_norm": 0.03046790510416031, + "learning_rate": 0.00019112921373599172, + "loss": 0.3193, + "step": 6909 + }, + { + "epoch": 0.5597861309138042, + "grad_norm": 0.02883930876851082, + "learning_rate": 0.0001911247130833971, + "loss": 0.3559, + "step": 6910 + }, + { + "epoch": 0.5598671419313026, + "grad_norm": 0.03061991184949875, + "learning_rate": 0.00019112021243080247, + "loss": 0.3398, + "step": 6911 + }, + { + "epoch": 0.5599481529488011, + "grad_norm": 0.027453241869807243, + "learning_rate": 0.00019111571177820783, + "loss": 0.3302, + "step": 6912 + }, + { + "epoch": 0.5600291639662994, + "grad_norm": 0.034282706677913666, + "learning_rate": 0.00019111121112561324, + "loss": 0.3877, + "step": 6913 + }, + { + "epoch": 0.5601101749837978, + "grad_norm": 0.028397291898727417, + "learning_rate": 0.0001911067104730186, + "loss": 0.3199, + "step": 6914 + }, + { + "epoch": 0.5601911860012961, + "grad_norm": 0.029234793037176132, + "learning_rate": 0.00019110220982042396, + "loss": 0.3374, + "step": 6915 + }, + { + "epoch": 0.5602721970187946, + "grad_norm": 0.02638567052781582, + "learning_rate": 0.00019109770916782935, + "loss": 0.3036, + "step": 6916 + }, + { + "epoch": 0.560353208036293, + "grad_norm": 0.03524412959814072, + "learning_rate": 0.0001910932085152347, + "loss": 0.3744, + "step": 6917 + }, + { + "epoch": 0.5604342190537913, + "grad_norm": 0.02868202142417431, + "learning_rate": 0.00019108870786264007, + "loss": 0.3432, + "step": 6918 + }, + { + "epoch": 0.5605152300712897, + "grad_norm": 0.03504109010100365, + "learning_rate": 0.00019108420721004548, + "loss": 0.3756, + "step": 6919 + }, + { + "epoch": 0.560596241088788, + "grad_norm": 0.03579915314912796, + "learning_rate": 0.00019107970655745084, + "loss": 0.3528, + "step": 6920 + }, + { + "epoch": 0.5606772521062865, + "grad_norm": 0.03431423008441925, + "learning_rate": 0.0001910752059048562, + "loss": 0.3997, + "step": 6921 + }, + { + "epoch": 0.5607582631237849, + "grad_norm": 0.02899775095283985, + "learning_rate": 0.0001910707052522616, + "loss": 0.3264, + "step": 6922 + }, + { + "epoch": 0.5608392741412832, + "grad_norm": 0.031370386481285095, + "learning_rate": 0.00019106620459966695, + "loss": 0.3265, + "step": 6923 + }, + { + "epoch": 0.5609202851587816, + "grad_norm": 0.02682410180568695, + "learning_rate": 0.0001910617039470723, + "loss": 0.3327, + "step": 6924 + }, + { + "epoch": 0.56100129617628, + "grad_norm": 0.041649095714092255, + "learning_rate": 0.00019105720329447773, + "loss": 0.4209, + "step": 6925 + }, + { + "epoch": 0.5610823071937784, + "grad_norm": 0.026437275111675262, + "learning_rate": 0.00019105270264188309, + "loss": 0.2907, + "step": 6926 + }, + { + "epoch": 
0.5611633182112767, + "grad_norm": 0.029267890378832817, + "learning_rate": 0.00019104820198928845, + "loss": 0.3051, + "step": 6927 + }, + { + "epoch": 0.5612443292287751, + "grad_norm": 0.029865281656384468, + "learning_rate": 0.00019104370133669383, + "loss": 0.364, + "step": 6928 + }, + { + "epoch": 0.5613253402462735, + "grad_norm": 0.03844697028398514, + "learning_rate": 0.0001910392006840992, + "loss": 0.3835, + "step": 6929 + }, + { + "epoch": 0.5614063512637719, + "grad_norm": 0.03238433972001076, + "learning_rate": 0.00019103470003150458, + "loss": 0.346, + "step": 6930 + }, + { + "epoch": 0.5614873622812703, + "grad_norm": 0.032621171325445175, + "learning_rate": 0.00019103019937890997, + "loss": 0.3792, + "step": 6931 + }, + { + "epoch": 0.5615683732987686, + "grad_norm": 0.030204012989997864, + "learning_rate": 0.00019102569872631533, + "loss": 0.2853, + "step": 6932 + }, + { + "epoch": 0.561649384316267, + "grad_norm": 0.03270819038152695, + "learning_rate": 0.0001910211980737207, + "loss": 0.3452, + "step": 6933 + }, + { + "epoch": 0.5617303953337653, + "grad_norm": 0.028208116069436073, + "learning_rate": 0.00019101669742112607, + "loss": 0.3446, + "step": 6934 + }, + { + "epoch": 0.5618114063512638, + "grad_norm": 0.0340554341673851, + "learning_rate": 0.00019101219676853143, + "loss": 0.4057, + "step": 6935 + }, + { + "epoch": 0.5618924173687622, + "grad_norm": 0.033202871680259705, + "learning_rate": 0.00019100769611593682, + "loss": 0.3742, + "step": 6936 + }, + { + "epoch": 0.5619734283862605, + "grad_norm": 0.035153090953826904, + "learning_rate": 0.0001910031954633422, + "loss": 0.3307, + "step": 6937 + }, + { + "epoch": 0.5620544394037589, + "grad_norm": 0.03547315672039986, + "learning_rate": 0.00019099869481074757, + "loss": 0.4135, + "step": 6938 + }, + { + "epoch": 0.5621354504212573, + "grad_norm": 0.03163422644138336, + "learning_rate": 0.00019099419415815293, + "loss": 0.3767, + "step": 6939 + }, + { + "epoch": 0.5622164614387557, + "grad_norm": 0.030638709664344788, + "learning_rate": 0.00019098969350555832, + "loss": 0.3512, + "step": 6940 + }, + { + "epoch": 0.562297472456254, + "grad_norm": 0.033417392522096634, + "learning_rate": 0.00019098519285296368, + "loss": 0.311, + "step": 6941 + }, + { + "epoch": 0.5623784834737524, + "grad_norm": 0.032544467598199844, + "learning_rate": 0.00019098069220036906, + "loss": 0.3455, + "step": 6942 + }, + { + "epoch": 0.5624594944912508, + "grad_norm": 0.03068172000348568, + "learning_rate": 0.00019097619154777445, + "loss": 0.3728, + "step": 6943 + }, + { + "epoch": 0.5625405055087492, + "grad_norm": 0.026925476267933846, + "learning_rate": 0.0001909716908951798, + "loss": 0.3251, + "step": 6944 + }, + { + "epoch": 0.5626215165262476, + "grad_norm": 0.03054945543408394, + "learning_rate": 0.00019096719024258517, + "loss": 0.3691, + "step": 6945 + }, + { + "epoch": 0.562702527543746, + "grad_norm": 0.030231181532144547, + "learning_rate": 0.00019096268958999056, + "loss": 0.3419, + "step": 6946 + }, + { + "epoch": 0.5627835385612443, + "grad_norm": 0.029685189947485924, + "learning_rate": 0.00019095818893739592, + "loss": 0.2949, + "step": 6947 + }, + { + "epoch": 0.5628645495787427, + "grad_norm": 0.030578652396798134, + "learning_rate": 0.0001909536882848013, + "loss": 0.3367, + "step": 6948 + }, + { + "epoch": 0.5629455605962411, + "grad_norm": 0.029063817113637924, + "learning_rate": 0.0001909491876322067, + "loss": 0.3408, + "step": 6949 + }, + { + "epoch": 0.5630265716137395, + "grad_norm": 0.03256330266594887, 
+ "learning_rate": 0.00019094468697961205, + "loss": 0.3879, + "step": 6950 + }, + { + "epoch": 0.5631075826312378, + "grad_norm": 0.0308829378336668, + "learning_rate": 0.0001909401863270174, + "loss": 0.3672, + "step": 6951 + }, + { + "epoch": 0.5631885936487362, + "grad_norm": 0.029668381437659264, + "learning_rate": 0.0001909356856744228, + "loss": 0.3685, + "step": 6952 + }, + { + "epoch": 0.5632696046662347, + "grad_norm": 0.030312536284327507, + "learning_rate": 0.00019093118502182819, + "loss": 0.336, + "step": 6953 + }, + { + "epoch": 0.563350615683733, + "grad_norm": 0.031126992776989937, + "learning_rate": 0.00019092668436923355, + "loss": 0.3314, + "step": 6954 + }, + { + "epoch": 0.5634316267012314, + "grad_norm": 0.03099069371819496, + "learning_rate": 0.00019092218371663893, + "loss": 0.3329, + "step": 6955 + }, + { + "epoch": 0.5635126377187297, + "grad_norm": 0.03418287634849548, + "learning_rate": 0.0001909176830640443, + "loss": 0.3692, + "step": 6956 + }, + { + "epoch": 0.5635936487362281, + "grad_norm": 0.03272226080298424, + "learning_rate": 0.00019091318241144965, + "loss": 0.3692, + "step": 6957 + }, + { + "epoch": 0.5636746597537265, + "grad_norm": 0.031897593289613724, + "learning_rate": 0.00019090868175885504, + "loss": 0.3494, + "step": 6958 + }, + { + "epoch": 0.5637556707712249, + "grad_norm": 0.02550220675766468, + "learning_rate": 0.00019090418110626043, + "loss": 0.3067, + "step": 6959 + }, + { + "epoch": 0.5638366817887233, + "grad_norm": 0.03189469128847122, + "learning_rate": 0.0001908996804536658, + "loss": 0.3767, + "step": 6960 + }, + { + "epoch": 0.5639176928062216, + "grad_norm": 0.02977280505001545, + "learning_rate": 0.00019089517980107118, + "loss": 0.3379, + "step": 6961 + }, + { + "epoch": 0.56399870382372, + "grad_norm": 0.029872236773371696, + "learning_rate": 0.00019089067914847654, + "loss": 0.3221, + "step": 6962 + }, + { + "epoch": 0.5640797148412184, + "grad_norm": 0.03274345397949219, + "learning_rate": 0.0001908861784958819, + "loss": 0.4027, + "step": 6963 + }, + { + "epoch": 0.5641607258587168, + "grad_norm": 0.03244532644748688, + "learning_rate": 0.00019088167784328728, + "loss": 0.3528, + "step": 6964 + }, + { + "epoch": 0.5642417368762151, + "grad_norm": 0.03851190581917763, + "learning_rate": 0.00019087717719069267, + "loss": 0.3393, + "step": 6965 + }, + { + "epoch": 0.5643227478937135, + "grad_norm": 0.032855067402124405, + "learning_rate": 0.00019087267653809803, + "loss": 0.4021, + "step": 6966 + }, + { + "epoch": 0.564403758911212, + "grad_norm": 0.029516946524381638, + "learning_rate": 0.00019086817588550342, + "loss": 0.3405, + "step": 6967 + }, + { + "epoch": 0.5644847699287103, + "grad_norm": 0.027933668345212936, + "learning_rate": 0.00019086367523290878, + "loss": 0.3783, + "step": 6968 + }, + { + "epoch": 0.5645657809462087, + "grad_norm": 0.032324567437171936, + "learning_rate": 0.00019085917458031414, + "loss": 0.2918, + "step": 6969 + }, + { + "epoch": 0.564646791963707, + "grad_norm": 0.03222862258553505, + "learning_rate": 0.00019085467392771952, + "loss": 0.3378, + "step": 6970 + }, + { + "epoch": 0.5647278029812054, + "grad_norm": 0.030286898836493492, + "learning_rate": 0.0001908501732751249, + "loss": 0.3657, + "step": 6971 + }, + { + "epoch": 0.5648088139987039, + "grad_norm": 0.0321059413254261, + "learning_rate": 0.00019084567262253027, + "loss": 0.364, + "step": 6972 + }, + { + "epoch": 0.5648898250162022, + "grad_norm": 0.02939450554549694, + "learning_rate": 0.00019084117196993566, + "loss": 0.3024, + 
"step": 6973 + }, + { + "epoch": 0.5649708360337006, + "grad_norm": 0.030884001404047012, + "learning_rate": 0.00019083667131734102, + "loss": 0.3682, + "step": 6974 + }, + { + "epoch": 0.5650518470511989, + "grad_norm": 0.026744620874524117, + "learning_rate": 0.00019083217066474638, + "loss": 0.3452, + "step": 6975 + }, + { + "epoch": 0.5651328580686974, + "grad_norm": 0.03520316630601883, + "learning_rate": 0.00019082767001215177, + "loss": 0.391, + "step": 6976 + }, + { + "epoch": 0.5652138690861958, + "grad_norm": 0.030200695618987083, + "learning_rate": 0.00019082316935955715, + "loss": 0.3403, + "step": 6977 + }, + { + "epoch": 0.5652948801036941, + "grad_norm": 0.03265073522925377, + "learning_rate": 0.0001908186687069625, + "loss": 0.3498, + "step": 6978 + }, + { + "epoch": 0.5653758911211925, + "grad_norm": 0.030164245516061783, + "learning_rate": 0.0001908141680543679, + "loss": 0.3537, + "step": 6979 + }, + { + "epoch": 0.5654569021386908, + "grad_norm": 0.031043274328112602, + "learning_rate": 0.00019080966740177326, + "loss": 0.3244, + "step": 6980 + }, + { + "epoch": 0.5655379131561893, + "grad_norm": 0.03142424300312996, + "learning_rate": 0.00019080516674917862, + "loss": 0.3523, + "step": 6981 + }, + { + "epoch": 0.5656189241736876, + "grad_norm": 0.03224541246891022, + "learning_rate": 0.00019080066609658403, + "loss": 0.4044, + "step": 6982 + }, + { + "epoch": 0.565699935191186, + "grad_norm": 0.03293000906705856, + "learning_rate": 0.0001907961654439894, + "loss": 0.3533, + "step": 6983 + }, + { + "epoch": 0.5657809462086844, + "grad_norm": 0.028464820235967636, + "learning_rate": 0.00019079166479139475, + "loss": 0.3517, + "step": 6984 + }, + { + "epoch": 0.5658619572261827, + "grad_norm": 0.029161132872104645, + "learning_rate": 0.00019078716413880014, + "loss": 0.2946, + "step": 6985 + }, + { + "epoch": 0.5659429682436812, + "grad_norm": 0.031095337122678757, + "learning_rate": 0.0001907826634862055, + "loss": 0.2974, + "step": 6986 + }, + { + "epoch": 0.5660239792611795, + "grad_norm": 0.03502573445439339, + "learning_rate": 0.00019077816283361086, + "loss": 0.3453, + "step": 6987 + }, + { + "epoch": 0.5661049902786779, + "grad_norm": 0.03055300936102867, + "learning_rate": 0.00019077366218101628, + "loss": 0.3246, + "step": 6988 + }, + { + "epoch": 0.5661860012961762, + "grad_norm": 0.029000548645853996, + "learning_rate": 0.00019076916152842164, + "loss": 0.3267, + "step": 6989 + }, + { + "epoch": 0.5662670123136747, + "grad_norm": 0.03340502828359604, + "learning_rate": 0.000190764660875827, + "loss": 0.3524, + "step": 6990 + }, + { + "epoch": 0.5663480233311731, + "grad_norm": 0.028636228293180466, + "learning_rate": 0.00019076016022323238, + "loss": 0.3332, + "step": 6991 + }, + { + "epoch": 0.5664290343486714, + "grad_norm": 0.02928844839334488, + "learning_rate": 0.00019075565957063774, + "loss": 0.349, + "step": 6992 + }, + { + "epoch": 0.5665100453661698, + "grad_norm": 0.030798735097050667, + "learning_rate": 0.0001907511589180431, + "loss": 0.3215, + "step": 6993 + }, + { + "epoch": 0.5665910563836681, + "grad_norm": 0.030463317409157753, + "learning_rate": 0.00019074665826544852, + "loss": 0.3882, + "step": 6994 + }, + { + "epoch": 0.5666720674011666, + "grad_norm": 0.029138967394828796, + "learning_rate": 0.00019074215761285388, + "loss": 0.3598, + "step": 6995 + }, + { + "epoch": 0.566753078418665, + "grad_norm": 0.032293178141117096, + "learning_rate": 0.00019073765696025924, + "loss": 0.3721, + "step": 6996 + }, + { + "epoch": 0.5668340894361633, + 
"grad_norm": 0.029973942786455154, + "learning_rate": 0.00019073315630766462, + "loss": 0.3442, + "step": 6997 + }, + { + "epoch": 0.5669151004536617, + "grad_norm": 0.03126822039484978, + "learning_rate": 0.00019072865565506998, + "loss": 0.3765, + "step": 6998 + }, + { + "epoch": 0.56699611147116, + "grad_norm": 0.030398758128285408, + "learning_rate": 0.00019072415500247534, + "loss": 0.3642, + "step": 6999 + }, + { + "epoch": 0.5670771224886585, + "grad_norm": 0.03205035626888275, + "learning_rate": 0.00019071965434988076, + "loss": 0.3284, + "step": 7000 + }, + { + "epoch": 0.5671581335061568, + "grad_norm": 0.027481509372591972, + "learning_rate": 0.00019071515369728612, + "loss": 0.3222, + "step": 7001 + }, + { + "epoch": 0.5672391445236552, + "grad_norm": 0.03299596533179283, + "learning_rate": 0.00019071065304469148, + "loss": 0.352, + "step": 7002 + }, + { + "epoch": 0.5673201555411536, + "grad_norm": 0.03149527683854103, + "learning_rate": 0.00019070615239209687, + "loss": 0.3751, + "step": 7003 + }, + { + "epoch": 0.567401166558652, + "grad_norm": 0.030225474387407303, + "learning_rate": 0.00019070165173950223, + "loss": 0.3298, + "step": 7004 + }, + { + "epoch": 0.5674821775761504, + "grad_norm": 0.028560619801282883, + "learning_rate": 0.00019069715108690761, + "loss": 0.2978, + "step": 7005 + }, + { + "epoch": 0.5675631885936487, + "grad_norm": 0.03127693012356758, + "learning_rate": 0.000190692650434313, + "loss": 0.3485, + "step": 7006 + }, + { + "epoch": 0.5676441996111471, + "grad_norm": 0.032159753143787384, + "learning_rate": 0.00019068814978171836, + "loss": 0.3145, + "step": 7007 + }, + { + "epoch": 0.5677252106286454, + "grad_norm": 0.027168098837137222, + "learning_rate": 0.00019068364912912372, + "loss": 0.3509, + "step": 7008 + }, + { + "epoch": 0.5678062216461439, + "grad_norm": 0.03163224831223488, + "learning_rate": 0.0001906791484765291, + "loss": 0.3411, + "step": 7009 + }, + { + "epoch": 0.5678872326636423, + "grad_norm": 0.034824687987565994, + "learning_rate": 0.00019067464782393447, + "loss": 0.4088, + "step": 7010 + }, + { + "epoch": 0.5679682436811406, + "grad_norm": 0.028746243566274643, + "learning_rate": 0.00019067014717133986, + "loss": 0.3432, + "step": 7011 + }, + { + "epoch": 0.568049254698639, + "grad_norm": 0.029679277911782265, + "learning_rate": 0.00019066564651874524, + "loss": 0.3717, + "step": 7012 + }, + { + "epoch": 0.5681302657161373, + "grad_norm": 0.03528760373592377, + "learning_rate": 0.0001906611458661506, + "loss": 0.3522, + "step": 7013 + }, + { + "epoch": 0.5682112767336358, + "grad_norm": 0.027924789115786552, + "learning_rate": 0.00019065664521355596, + "loss": 0.3783, + "step": 7014 + }, + { + "epoch": 0.5682922877511342, + "grad_norm": 0.027431324124336243, + "learning_rate": 0.00019065214456096135, + "loss": 0.3478, + "step": 7015 + }, + { + "epoch": 0.5683732987686325, + "grad_norm": 0.027628622949123383, + "learning_rate": 0.0001906476439083667, + "loss": 0.3212, + "step": 7016 + }, + { + "epoch": 0.5684543097861309, + "grad_norm": 0.03062502108514309, + "learning_rate": 0.0001906431432557721, + "loss": 0.3789, + "step": 7017 + }, + { + "epoch": 0.5685353208036293, + "grad_norm": 0.03195822983980179, + "learning_rate": 0.00019063864260317748, + "loss": 0.3883, + "step": 7018 + }, + { + "epoch": 0.5686163318211277, + "grad_norm": 0.028229979798197746, + "learning_rate": 0.00019063414195058284, + "loss": 0.3258, + "step": 7019 + }, + { + "epoch": 0.568697342838626, + "grad_norm": 0.027966110035777092, + "learning_rate": 
0.0001906296412979882, + "loss": 0.328, + "step": 7020 + }, + { + "epoch": 0.5687783538561244, + "grad_norm": 0.03365035727620125, + "learning_rate": 0.0001906251406453936, + "loss": 0.3187, + "step": 7021 + }, + { + "epoch": 0.5688593648736228, + "grad_norm": 0.030285336077213287, + "learning_rate": 0.00019062063999279895, + "loss": 0.351, + "step": 7022 + }, + { + "epoch": 0.5689403758911212, + "grad_norm": 0.03437206521630287, + "learning_rate": 0.00019061613934020434, + "loss": 0.385, + "step": 7023 + }, + { + "epoch": 0.5690213869086196, + "grad_norm": 0.02868664637207985, + "learning_rate": 0.00019061163868760973, + "loss": 0.3158, + "step": 7024 + }, + { + "epoch": 0.5691023979261179, + "grad_norm": 0.02950209379196167, + "learning_rate": 0.00019060713803501509, + "loss": 0.3166, + "step": 7025 + }, + { + "epoch": 0.5691834089436163, + "grad_norm": 0.03166711702942848, + "learning_rate": 0.00019060263738242045, + "loss": 0.3657, + "step": 7026 + }, + { + "epoch": 0.5692644199611148, + "grad_norm": 0.030810615047812462, + "learning_rate": 0.00019059813672982583, + "loss": 0.3119, + "step": 7027 + }, + { + "epoch": 0.5693454309786131, + "grad_norm": 0.03254614397883415, + "learning_rate": 0.0001905936360772312, + "loss": 0.3126, + "step": 7028 + }, + { + "epoch": 0.5694264419961115, + "grad_norm": 0.03156366944313049, + "learning_rate": 0.00019058913542463658, + "loss": 0.3276, + "step": 7029 + }, + { + "epoch": 0.5695074530136098, + "grad_norm": 0.0318727120757103, + "learning_rate": 0.00019058463477204197, + "loss": 0.3988, + "step": 7030 + }, + { + "epoch": 0.5695884640311082, + "grad_norm": 0.033699046820402145, + "learning_rate": 0.00019058013411944733, + "loss": 0.3496, + "step": 7031 + }, + { + "epoch": 0.5696694750486067, + "grad_norm": 0.02864360436797142, + "learning_rate": 0.0001905756334668527, + "loss": 0.3686, + "step": 7032 + }, + { + "epoch": 0.569750486066105, + "grad_norm": 0.03384806588292122, + "learning_rate": 0.00019057113281425807, + "loss": 0.3722, + "step": 7033 + }, + { + "epoch": 0.5698314970836034, + "grad_norm": 0.032431185245513916, + "learning_rate": 0.00019056663216166346, + "loss": 0.3765, + "step": 7034 + }, + { + "epoch": 0.5699125081011017, + "grad_norm": 0.02844301052391529, + "learning_rate": 0.00019056213150906882, + "loss": 0.3759, + "step": 7035 + }, + { + "epoch": 0.5699935191186001, + "grad_norm": 0.03484733775258064, + "learning_rate": 0.0001905576308564742, + "loss": 0.3515, + "step": 7036 + }, + { + "epoch": 0.5700745301360985, + "grad_norm": 0.03190077468752861, + "learning_rate": 0.00019055313020387957, + "loss": 0.3424, + "step": 7037 + }, + { + "epoch": 0.5701555411535969, + "grad_norm": 0.030969643965363503, + "learning_rate": 0.00019054862955128493, + "loss": 0.3274, + "step": 7038 + }, + { + "epoch": 0.5702365521710953, + "grad_norm": 0.029254410415887833, + "learning_rate": 0.00019054412889869032, + "loss": 0.3644, + "step": 7039 + }, + { + "epoch": 0.5703175631885936, + "grad_norm": 0.028428638353943825, + "learning_rate": 0.0001905396282460957, + "loss": 0.3494, + "step": 7040 + }, + { + "epoch": 0.5703985742060921, + "grad_norm": 0.02732779271900654, + "learning_rate": 0.00019053512759350106, + "loss": 0.3039, + "step": 7041 + }, + { + "epoch": 0.5704795852235904, + "grad_norm": 0.03077746368944645, + "learning_rate": 0.00019053062694090645, + "loss": 0.342, + "step": 7042 + }, + { + "epoch": 0.5705605962410888, + "grad_norm": 0.0320584736764431, + "learning_rate": 0.0001905261262883118, + "loss": 0.3728, + "step": 7043 + }, + { 
+ "epoch": 0.5706416072585871, + "grad_norm": 0.030684838071465492, + "learning_rate": 0.00019052162563571717, + "loss": 0.3453, + "step": 7044 + }, + { + "epoch": 0.5707226182760855, + "grad_norm": 0.03280287981033325, + "learning_rate": 0.00019051712498312256, + "loss": 0.321, + "step": 7045 + }, + { + "epoch": 0.570803629293584, + "grad_norm": 0.030638406053185463, + "learning_rate": 0.00019051262433052794, + "loss": 0.3576, + "step": 7046 + }, + { + "epoch": 0.5708846403110823, + "grad_norm": 0.029015205800533295, + "learning_rate": 0.0001905081236779333, + "loss": 0.3577, + "step": 7047 + }, + { + "epoch": 0.5709656513285807, + "grad_norm": 0.03286876901984215, + "learning_rate": 0.0001905036230253387, + "loss": 0.3475, + "step": 7048 + }, + { + "epoch": 0.571046662346079, + "grad_norm": 0.029815932735800743, + "learning_rate": 0.00019049912237274405, + "loss": 0.3719, + "step": 7049 + }, + { + "epoch": 0.5711276733635774, + "grad_norm": 0.0388667918741703, + "learning_rate": 0.0001904946217201494, + "loss": 0.3749, + "step": 7050 + }, + { + "epoch": 0.5712086843810759, + "grad_norm": 0.04013926908373833, + "learning_rate": 0.0001904901210675548, + "loss": 0.4188, + "step": 7051 + }, + { + "epoch": 0.5712896953985742, + "grad_norm": 0.030169228091835976, + "learning_rate": 0.0001904856204149602, + "loss": 0.3483, + "step": 7052 + }, + { + "epoch": 0.5713707064160726, + "grad_norm": 0.032555319368839264, + "learning_rate": 0.00019048111976236555, + "loss": 0.357, + "step": 7053 + }, + { + "epoch": 0.5714517174335709, + "grad_norm": 0.029092267155647278, + "learning_rate": 0.00019047661910977093, + "loss": 0.3342, + "step": 7054 + }, + { + "epoch": 0.5715327284510694, + "grad_norm": 0.032974276691675186, + "learning_rate": 0.0001904721184571763, + "loss": 0.3531, + "step": 7055 + }, + { + "epoch": 0.5716137394685677, + "grad_norm": 0.030939916148781776, + "learning_rate": 0.00019046761780458165, + "loss": 0.363, + "step": 7056 + }, + { + "epoch": 0.5716947504860661, + "grad_norm": 0.02596089616417885, + "learning_rate": 0.00019046311715198707, + "loss": 0.2696, + "step": 7057 + }, + { + "epoch": 0.5717757615035645, + "grad_norm": 0.029358698055148125, + "learning_rate": 0.00019045861649939243, + "loss": 0.3477, + "step": 7058 + }, + { + "epoch": 0.5718567725210628, + "grad_norm": 0.02909529209136963, + "learning_rate": 0.0001904541158467978, + "loss": 0.3049, + "step": 7059 + }, + { + "epoch": 0.5719377835385613, + "grad_norm": 0.0325615294277668, + "learning_rate": 0.00019044961519420318, + "loss": 0.4295, + "step": 7060 + }, + { + "epoch": 0.5720187945560596, + "grad_norm": 0.027235547080636024, + "learning_rate": 0.00019044511454160854, + "loss": 0.2808, + "step": 7061 + }, + { + "epoch": 0.572099805573558, + "grad_norm": 0.03443726897239685, + "learning_rate": 0.0001904406138890139, + "loss": 0.3784, + "step": 7062 + }, + { + "epoch": 0.5721808165910564, + "grad_norm": 0.02712845802307129, + "learning_rate": 0.0001904361132364193, + "loss": 0.35, + "step": 7063 + }, + { + "epoch": 0.5722618276085548, + "grad_norm": 0.0366659052670002, + "learning_rate": 0.00019043161258382467, + "loss": 0.3964, + "step": 7064 + }, + { + "epoch": 0.5723428386260532, + "grad_norm": 0.029586222022771835, + "learning_rate": 0.00019042711193123003, + "loss": 0.361, + "step": 7065 + }, + { + "epoch": 0.5724238496435515, + "grad_norm": 0.027487218379974365, + "learning_rate": 0.00019042261127863542, + "loss": 0.3015, + "step": 7066 + }, + { + "epoch": 0.5725048606610499, + "grad_norm": 
0.031158898025751114, + "learning_rate": 0.00019041811062604078, + "loss": 0.3368, + "step": 7067 + }, + { + "epoch": 0.5725858716785482, + "grad_norm": 0.031523216515779495, + "learning_rate": 0.00019041360997344614, + "loss": 0.3424, + "step": 7068 + }, + { + "epoch": 0.5726668826960467, + "grad_norm": 0.035301659256219864, + "learning_rate": 0.00019040910932085155, + "loss": 0.3952, + "step": 7069 + }, + { + "epoch": 0.5727478937135451, + "grad_norm": 0.03992548957467079, + "learning_rate": 0.0001904046086682569, + "loss": 0.4048, + "step": 7070 + }, + { + "epoch": 0.5728289047310434, + "grad_norm": 0.030019249767065048, + "learning_rate": 0.00019040010801566227, + "loss": 0.3521, + "step": 7071 + }, + { + "epoch": 0.5729099157485418, + "grad_norm": 0.030296877026557922, + "learning_rate": 0.00019039560736306766, + "loss": 0.3257, + "step": 7072 + }, + { + "epoch": 0.5729909267660401, + "grad_norm": 0.029138630256056786, + "learning_rate": 0.00019039110671047302, + "loss": 0.3359, + "step": 7073 + }, + { + "epoch": 0.5730719377835386, + "grad_norm": 0.03025815263390541, + "learning_rate": 0.00019038660605787838, + "loss": 0.3194, + "step": 7074 + }, + { + "epoch": 0.573152948801037, + "grad_norm": 0.03255913779139519, + "learning_rate": 0.0001903821054052838, + "loss": 0.3692, + "step": 7075 + }, + { + "epoch": 0.5732339598185353, + "grad_norm": 0.031642988324165344, + "learning_rate": 0.00019037760475268915, + "loss": 0.3636, + "step": 7076 + }, + { + "epoch": 0.5733149708360337, + "grad_norm": 0.02967303805053234, + "learning_rate": 0.0001903731041000945, + "loss": 0.3692, + "step": 7077 + }, + { + "epoch": 0.5733959818535321, + "grad_norm": 0.029209032654762268, + "learning_rate": 0.0001903686034474999, + "loss": 0.3398, + "step": 7078 + }, + { + "epoch": 0.5734769928710305, + "grad_norm": 0.03341130539774895, + "learning_rate": 0.00019036410279490526, + "loss": 0.3902, + "step": 7079 + }, + { + "epoch": 0.5735580038885288, + "grad_norm": 0.03325894474983215, + "learning_rate": 0.00019035960214231062, + "loss": 0.3536, + "step": 7080 + }, + { + "epoch": 0.5736390149060272, + "grad_norm": 0.0343037024140358, + "learning_rate": 0.00019035510148971603, + "loss": 0.4056, + "step": 7081 + }, + { + "epoch": 0.5737200259235256, + "grad_norm": 0.028654051944613457, + "learning_rate": 0.0001903506008371214, + "loss": 0.3126, + "step": 7082 + }, + { + "epoch": 0.573801036941024, + "grad_norm": 0.03184157609939575, + "learning_rate": 0.00019034610018452675, + "loss": 0.3597, + "step": 7083 + }, + { + "epoch": 0.5738820479585224, + "grad_norm": 0.03419782966375351, + "learning_rate": 0.00019034159953193214, + "loss": 0.3551, + "step": 7084 + }, + { + "epoch": 0.5739630589760207, + "grad_norm": 0.029133254662156105, + "learning_rate": 0.0001903370988793375, + "loss": 0.2879, + "step": 7085 + }, + { + "epoch": 0.5740440699935191, + "grad_norm": 0.031131476163864136, + "learning_rate": 0.0001903325982267429, + "loss": 0.3761, + "step": 7086 + }, + { + "epoch": 0.5741250810110174, + "grad_norm": 0.031101131811738014, + "learning_rate": 0.00019032809757414828, + "loss": 0.3361, + "step": 7087 + }, + { + "epoch": 0.5742060920285159, + "grad_norm": 0.02945782244205475, + "learning_rate": 0.00019032359692155364, + "loss": 0.3903, + "step": 7088 + }, + { + "epoch": 0.5742871030460143, + "grad_norm": 0.029181165620684624, + "learning_rate": 0.000190319096268959, + "loss": 0.3134, + "step": 7089 + }, + { + "epoch": 0.5743681140635126, + "grad_norm": 0.028031928464770317, + "learning_rate": 
0.00019031459561636438, + "loss": 0.3439, + "step": 7090 + }, + { + "epoch": 0.574449125081011, + "grad_norm": 0.032897673547267914, + "learning_rate": 0.00019031009496376974, + "loss": 0.396, + "step": 7091 + }, + { + "epoch": 0.5745301360985094, + "grad_norm": 0.031826216727495193, + "learning_rate": 0.00019030559431117513, + "loss": 0.3312, + "step": 7092 + }, + { + "epoch": 0.5746111471160078, + "grad_norm": 0.028905490413308144, + "learning_rate": 0.00019030109365858052, + "loss": 0.3671, + "step": 7093 + }, + { + "epoch": 0.5746921581335062, + "grad_norm": 0.03372209519147873, + "learning_rate": 0.00019029659300598588, + "loss": 0.3251, + "step": 7094 + }, + { + "epoch": 0.5747731691510045, + "grad_norm": 0.03266318887472153, + "learning_rate": 0.00019029209235339124, + "loss": 0.3785, + "step": 7095 + }, + { + "epoch": 0.5748541801685029, + "grad_norm": 0.034305647015571594, + "learning_rate": 0.00019028759170079663, + "loss": 0.3506, + "step": 7096 + }, + { + "epoch": 0.5749351911860013, + "grad_norm": 0.031909115612506866, + "learning_rate": 0.00019028309104820199, + "loss": 0.3645, + "step": 7097 + }, + { + "epoch": 0.5750162022034997, + "grad_norm": 0.029492471367120743, + "learning_rate": 0.00019027859039560737, + "loss": 0.3546, + "step": 7098 + }, + { + "epoch": 0.575097213220998, + "grad_norm": 0.027777908369898796, + "learning_rate": 0.00019027408974301276, + "loss": 0.3045, + "step": 7099 + }, + { + "epoch": 0.5751782242384964, + "grad_norm": 0.029632898047566414, + "learning_rate": 0.00019026958909041812, + "loss": 0.376, + "step": 7100 + }, + { + "epoch": 0.5752592352559948, + "grad_norm": 0.031298041343688965, + "learning_rate": 0.00019026508843782348, + "loss": 0.3796, + "step": 7101 + }, + { + "epoch": 0.5753402462734932, + "grad_norm": 0.031095469370484352, + "learning_rate": 0.00019026058778522887, + "loss": 0.2957, + "step": 7102 + }, + { + "epoch": 0.5754212572909916, + "grad_norm": 0.02930424176156521, + "learning_rate": 0.00019025608713263423, + "loss": 0.3761, + "step": 7103 + }, + { + "epoch": 0.5755022683084899, + "grad_norm": 0.030215157195925713, + "learning_rate": 0.00019025158648003961, + "loss": 0.3833, + "step": 7104 + }, + { + "epoch": 0.5755832793259883, + "grad_norm": 0.033876512199640274, + "learning_rate": 0.000190247085827445, + "loss": 0.3075, + "step": 7105 + }, + { + "epoch": 0.5756642903434868, + "grad_norm": 0.03203409165143967, + "learning_rate": 0.00019024258517485036, + "loss": 0.3537, + "step": 7106 + }, + { + "epoch": 0.5757453013609851, + "grad_norm": 0.029567353427410126, + "learning_rate": 0.00019023808452225572, + "loss": 0.3328, + "step": 7107 + }, + { + "epoch": 0.5758263123784835, + "grad_norm": 0.0335480161011219, + "learning_rate": 0.0001902335838696611, + "loss": 0.3695, + "step": 7108 + }, + { + "epoch": 0.5759073233959818, + "grad_norm": 0.03027655929327011, + "learning_rate": 0.00019022908321706647, + "loss": 0.2899, + "step": 7109 + }, + { + "epoch": 0.5759883344134802, + "grad_norm": 0.03336027264595032, + "learning_rate": 0.00019022458256447186, + "loss": 0.3664, + "step": 7110 + }, + { + "epoch": 0.5760693454309787, + "grad_norm": 0.03019268438220024, + "learning_rate": 0.00019022008191187724, + "loss": 0.3169, + "step": 7111 + }, + { + "epoch": 0.576150356448477, + "grad_norm": 0.02925017476081848, + "learning_rate": 0.0001902155812592826, + "loss": 0.3746, + "step": 7112 + }, + { + "epoch": 0.5762313674659754, + "grad_norm": 0.03173941746354103, + "learning_rate": 0.00019021108060668796, + "loss": 0.3079, + "step": 
7113 + }, + { + "epoch": 0.5763123784834737, + "grad_norm": 0.027620505541563034, + "learning_rate": 0.00019020657995409335, + "loss": 0.3396, + "step": 7114 + }, + { + "epoch": 0.5763933895009722, + "grad_norm": 0.032209865748882294, + "learning_rate": 0.00019020207930149874, + "loss": 0.3093, + "step": 7115 + }, + { + "epoch": 0.5764744005184705, + "grad_norm": 0.033973708748817444, + "learning_rate": 0.0001901975786489041, + "loss": 0.3483, + "step": 7116 + }, + { + "epoch": 0.5765554115359689, + "grad_norm": 0.03276333212852478, + "learning_rate": 0.00019019307799630948, + "loss": 0.373, + "step": 7117 + }, + { + "epoch": 0.5766364225534673, + "grad_norm": 0.02871558628976345, + "learning_rate": 0.00019018857734371484, + "loss": 0.3484, + "step": 7118 + }, + { + "epoch": 0.5767174335709656, + "grad_norm": 0.03591687232255936, + "learning_rate": 0.0001901840766911202, + "loss": 0.3226, + "step": 7119 + }, + { + "epoch": 0.5767984445884641, + "grad_norm": 0.02873329259455204, + "learning_rate": 0.0001901795760385256, + "loss": 0.3407, + "step": 7120 + }, + { + "epoch": 0.5768794556059624, + "grad_norm": 0.02535742148756981, + "learning_rate": 0.00019017507538593098, + "loss": 0.3217, + "step": 7121 + }, + { + "epoch": 0.5769604666234608, + "grad_norm": 0.03068891353905201, + "learning_rate": 0.00019017057473333634, + "loss": 0.3338, + "step": 7122 + }, + { + "epoch": 0.5770414776409591, + "grad_norm": 0.033300936222076416, + "learning_rate": 0.00019016607408074173, + "loss": 0.3581, + "step": 7123 + }, + { + "epoch": 0.5771224886584575, + "grad_norm": 0.029896607622504234, + "learning_rate": 0.00019016157342814709, + "loss": 0.3226, + "step": 7124 + }, + { + "epoch": 0.577203499675956, + "grad_norm": 0.03473919630050659, + "learning_rate": 0.00019015707277555245, + "loss": 0.3508, + "step": 7125 + }, + { + "epoch": 0.5772845106934543, + "grad_norm": 0.03545752912759781, + "learning_rate": 0.00019015257212295783, + "loss": 0.3609, + "step": 7126 + }, + { + "epoch": 0.5773655217109527, + "grad_norm": 0.03342561423778534, + "learning_rate": 0.00019014807147036322, + "loss": 0.3596, + "step": 7127 + }, + { + "epoch": 0.577446532728451, + "grad_norm": 0.032791558653116226, + "learning_rate": 0.00019014357081776858, + "loss": 0.3319, + "step": 7128 + }, + { + "epoch": 0.5775275437459495, + "grad_norm": 0.029370320960879326, + "learning_rate": 0.00019013907016517397, + "loss": 0.355, + "step": 7129 + }, + { + "epoch": 0.5776085547634479, + "grad_norm": 0.03259564936161041, + "learning_rate": 0.00019013456951257933, + "loss": 0.386, + "step": 7130 + }, + { + "epoch": 0.5776895657809462, + "grad_norm": 0.03333577513694763, + "learning_rate": 0.0001901300688599847, + "loss": 0.3386, + "step": 7131 + }, + { + "epoch": 0.5777705767984446, + "grad_norm": 0.029573451727628708, + "learning_rate": 0.00019012556820739007, + "loss": 0.3498, + "step": 7132 + }, + { + "epoch": 0.5778515878159429, + "grad_norm": 0.030514564365148544, + "learning_rate": 0.00019012106755479546, + "loss": 0.3264, + "step": 7133 + }, + { + "epoch": 0.5779325988334414, + "grad_norm": 0.029826374724507332, + "learning_rate": 0.00019011656690220082, + "loss": 0.3528, + "step": 7134 + }, + { + "epoch": 0.5780136098509397, + "grad_norm": 0.02787317894399166, + "learning_rate": 0.0001901120662496062, + "loss": 0.3244, + "step": 7135 + }, + { + "epoch": 0.5780946208684381, + "grad_norm": 0.03055379167199135, + "learning_rate": 0.00019010756559701157, + "loss": 0.3541, + "step": 7136 + }, + { + "epoch": 0.5781756318859365, + 
"grad_norm": 0.02805292047560215, + "learning_rate": 0.00019010306494441693, + "loss": 0.3204, + "step": 7137 + }, + { + "epoch": 0.5782566429034348, + "grad_norm": 0.036902476102113724, + "learning_rate": 0.00019009856429182234, + "loss": 0.3539, + "step": 7138 + }, + { + "epoch": 0.5783376539209333, + "grad_norm": 0.031508274376392365, + "learning_rate": 0.0001900940636392277, + "loss": 0.3322, + "step": 7139 + }, + { + "epoch": 0.5784186649384316, + "grad_norm": 0.03548916056752205, + "learning_rate": 0.00019008956298663306, + "loss": 0.3459, + "step": 7140 + }, + { + "epoch": 0.57849967595593, + "grad_norm": 0.03395074978470802, + "learning_rate": 0.00019008506233403845, + "loss": 0.3777, + "step": 7141 + }, + { + "epoch": 0.5785806869734283, + "grad_norm": 0.029324263334274292, + "learning_rate": 0.0001900805616814438, + "loss": 0.3407, + "step": 7142 + }, + { + "epoch": 0.5786616979909268, + "grad_norm": 0.033915892243385315, + "learning_rate": 0.00019007606102884917, + "loss": 0.3588, + "step": 7143 + }, + { + "epoch": 0.5787427090084252, + "grad_norm": 0.03298439085483551, + "learning_rate": 0.00019007156037625459, + "loss": 0.3362, + "step": 7144 + }, + { + "epoch": 0.5788237200259235, + "grad_norm": 0.034808751195669174, + "learning_rate": 0.00019006705972365995, + "loss": 0.3884, + "step": 7145 + }, + { + "epoch": 0.5789047310434219, + "grad_norm": 0.03707476705312729, + "learning_rate": 0.0001900625590710653, + "loss": 0.4259, + "step": 7146 + }, + { + "epoch": 0.5789857420609202, + "grad_norm": 0.03482748568058014, + "learning_rate": 0.0001900580584184707, + "loss": 0.355, + "step": 7147 + }, + { + "epoch": 0.5790667530784187, + "grad_norm": 0.028616994619369507, + "learning_rate": 0.00019005355776587605, + "loss": 0.3333, + "step": 7148 + }, + { + "epoch": 0.5791477640959171, + "grad_norm": 0.03026905469596386, + "learning_rate": 0.0001900490571132814, + "loss": 0.2797, + "step": 7149 + }, + { + "epoch": 0.5792287751134154, + "grad_norm": 0.03337970748543739, + "learning_rate": 0.00019004455646068683, + "loss": 0.3674, + "step": 7150 + }, + { + "epoch": 0.5793097861309138, + "grad_norm": 0.030933627858757973, + "learning_rate": 0.0001900400558080922, + "loss": 0.3401, + "step": 7151 + }, + { + "epoch": 0.5793907971484121, + "grad_norm": 0.03672805428504944, + "learning_rate": 0.00019003555515549755, + "loss": 0.3895, + "step": 7152 + }, + { + "epoch": 0.5794718081659106, + "grad_norm": 0.034799739718437195, + "learning_rate": 0.00019003105450290293, + "loss": 0.3476, + "step": 7153 + }, + { + "epoch": 0.579552819183409, + "grad_norm": 0.02711411938071251, + "learning_rate": 0.0001900265538503083, + "loss": 0.3195, + "step": 7154 + }, + { + "epoch": 0.5796338302009073, + "grad_norm": 0.026923276484012604, + "learning_rate": 0.00019002205319771365, + "loss": 0.3521, + "step": 7155 + }, + { + "epoch": 0.5797148412184057, + "grad_norm": 0.03141951188445091, + "learning_rate": 0.00019001755254511907, + "loss": 0.3422, + "step": 7156 + }, + { + "epoch": 0.5797958522359041, + "grad_norm": 0.02795341983437538, + "learning_rate": 0.00019001305189252443, + "loss": 0.3449, + "step": 7157 + }, + { + "epoch": 0.5798768632534025, + "grad_norm": 0.028946880251169205, + "learning_rate": 0.0001900085512399298, + "loss": 0.3549, + "step": 7158 + }, + { + "epoch": 0.5799578742709008, + "grad_norm": 0.029874345287680626, + "learning_rate": 0.00019000405058733518, + "loss": 0.3188, + "step": 7159 + }, + { + "epoch": 0.5800388852883992, + "grad_norm": 0.029896339401602745, + "learning_rate": 
0.00018999954993474054, + "loss": 0.3976, + "step": 7160 + }, + { + "epoch": 0.5801198963058976, + "grad_norm": 0.030478930100798607, + "learning_rate": 0.00018999504928214592, + "loss": 0.3394, + "step": 7161 + }, + { + "epoch": 0.580200907323396, + "grad_norm": 0.02973802201449871, + "learning_rate": 0.0001899905486295513, + "loss": 0.3528, + "step": 7162 + }, + { + "epoch": 0.5802819183408944, + "grad_norm": 0.02996090054512024, + "learning_rate": 0.00018998604797695667, + "loss": 0.3468, + "step": 7163 + }, + { + "epoch": 0.5803629293583927, + "grad_norm": 0.026388095691800117, + "learning_rate": 0.00018998154732436203, + "loss": 0.3132, + "step": 7164 + }, + { + "epoch": 0.5804439403758911, + "grad_norm": 0.029814952984452248, + "learning_rate": 0.00018997704667176742, + "loss": 0.3203, + "step": 7165 + }, + { + "epoch": 0.5805249513933896, + "grad_norm": 0.033563531935214996, + "learning_rate": 0.00018997254601917278, + "loss": 0.3301, + "step": 7166 + }, + { + "epoch": 0.5806059624108879, + "grad_norm": 0.03099728748202324, + "learning_rate": 0.00018996804536657816, + "loss": 0.3577, + "step": 7167 + }, + { + "epoch": 0.5806869734283863, + "grad_norm": 0.03393007069826126, + "learning_rate": 0.00018996354471398355, + "loss": 0.3309, + "step": 7168 + }, + { + "epoch": 0.5807679844458846, + "grad_norm": 0.02811267040669918, + "learning_rate": 0.0001899590440613889, + "loss": 0.3344, + "step": 7169 + }, + { + "epoch": 0.580848995463383, + "grad_norm": 0.030411504209041595, + "learning_rate": 0.00018995454340879427, + "loss": 0.3786, + "step": 7170 + }, + { + "epoch": 0.5809300064808814, + "grad_norm": 0.0271066315472126, + "learning_rate": 0.00018995004275619966, + "loss": 0.3355, + "step": 7171 + }, + { + "epoch": 0.5810110174983798, + "grad_norm": 0.03240623325109482, + "learning_rate": 0.00018994554210360502, + "loss": 0.3215, + "step": 7172 + }, + { + "epoch": 0.5810920285158782, + "grad_norm": 0.03191445395350456, + "learning_rate": 0.0001899410414510104, + "loss": 0.3229, + "step": 7173 + }, + { + "epoch": 0.5811730395333765, + "grad_norm": 0.029774196445941925, + "learning_rate": 0.0001899365407984158, + "loss": 0.3283, + "step": 7174 + }, + { + "epoch": 0.5812540505508749, + "grad_norm": 0.03459349274635315, + "learning_rate": 0.00018993204014582115, + "loss": 0.3482, + "step": 7175 + }, + { + "epoch": 0.5813350615683733, + "grad_norm": 0.033518653362989426, + "learning_rate": 0.0001899275394932265, + "loss": 0.3709, + "step": 7176 + }, + { + "epoch": 0.5814160725858717, + "grad_norm": 0.03485652804374695, + "learning_rate": 0.0001899230388406319, + "loss": 0.3757, + "step": 7177 + }, + { + "epoch": 0.58149708360337, + "grad_norm": 0.03507082536816597, + "learning_rate": 0.00018991853818803726, + "loss": 0.3428, + "step": 7178 + }, + { + "epoch": 0.5815780946208684, + "grad_norm": 0.028515536338090897, + "learning_rate": 0.00018991403753544265, + "loss": 0.3469, + "step": 7179 + }, + { + "epoch": 0.5816591056383669, + "grad_norm": 0.0331462137401104, + "learning_rate": 0.00018990953688284803, + "loss": 0.3641, + "step": 7180 + }, + { + "epoch": 0.5817401166558652, + "grad_norm": 0.028841599822044373, + "learning_rate": 0.0001899050362302534, + "loss": 0.3821, + "step": 7181 + }, + { + "epoch": 0.5818211276733636, + "grad_norm": 0.03024439513683319, + "learning_rate": 0.00018990053557765875, + "loss": 0.4041, + "step": 7182 + }, + { + "epoch": 0.5819021386908619, + "grad_norm": 0.03342823311686516, + "learning_rate": 0.00018989603492506414, + "loss": 0.3355, + "step": 7183 + }, 
+ { + "epoch": 0.5819831497083603, + "grad_norm": 0.0266738161444664, + "learning_rate": 0.0001898915342724695, + "loss": 0.3015, + "step": 7184 + }, + { + "epoch": 0.5820641607258588, + "grad_norm": 0.03139263391494751, + "learning_rate": 0.0001898870336198749, + "loss": 0.3446, + "step": 7185 + }, + { + "epoch": 0.5821451717433571, + "grad_norm": 0.029045145958662033, + "learning_rate": 0.00018988253296728028, + "loss": 0.3601, + "step": 7186 + }, + { + "epoch": 0.5822261827608555, + "grad_norm": 0.028520744293928146, + "learning_rate": 0.00018987803231468564, + "loss": 0.3602, + "step": 7187 + }, + { + "epoch": 0.5823071937783538, + "grad_norm": 0.02972353622317314, + "learning_rate": 0.000189873531662091, + "loss": 0.3549, + "step": 7188 + }, + { + "epoch": 0.5823882047958522, + "grad_norm": 0.029112640768289566, + "learning_rate": 0.00018986903100949638, + "loss": 0.2799, + "step": 7189 + }, + { + "epoch": 0.5824692158133506, + "grad_norm": 0.029921172186732292, + "learning_rate": 0.00018986453035690177, + "loss": 0.353, + "step": 7190 + }, + { + "epoch": 0.582550226830849, + "grad_norm": 0.03464297205209732, + "learning_rate": 0.00018986002970430713, + "loss": 0.3872, + "step": 7191 + }, + { + "epoch": 0.5826312378483474, + "grad_norm": 0.028278376907110214, + "learning_rate": 0.00018985552905171252, + "loss": 0.3366, + "step": 7192 + }, + { + "epoch": 0.5827122488658457, + "grad_norm": 0.0291961207985878, + "learning_rate": 0.00018985102839911788, + "loss": 0.3643, + "step": 7193 + }, + { + "epoch": 0.5827932598833442, + "grad_norm": 0.029754353687167168, + "learning_rate": 0.00018984652774652324, + "loss": 0.2975, + "step": 7194 + }, + { + "epoch": 0.5828742709008425, + "grad_norm": 0.036267850548028946, + "learning_rate": 0.00018984202709392863, + "loss": 0.405, + "step": 7195 + }, + { + "epoch": 0.5829552819183409, + "grad_norm": 0.03386032581329346, + "learning_rate": 0.000189837526441334, + "loss": 0.396, + "step": 7196 + }, + { + "epoch": 0.5830362929358393, + "grad_norm": 0.030160658061504364, + "learning_rate": 0.00018983302578873937, + "loss": 0.3308, + "step": 7197 + }, + { + "epoch": 0.5831173039533376, + "grad_norm": 0.03126070648431778, + "learning_rate": 0.00018982852513614476, + "loss": 0.377, + "step": 7198 + }, + { + "epoch": 0.5831983149708361, + "grad_norm": 0.031703200191259384, + "learning_rate": 0.00018982402448355012, + "loss": 0.3651, + "step": 7199 + }, + { + "epoch": 0.5832793259883344, + "grad_norm": 0.03930109739303589, + "learning_rate": 0.00018981952383095548, + "loss": 0.3751, + "step": 7200 + }, + { + "epoch": 0.5833603370058328, + "grad_norm": 0.033295638859272, + "learning_rate": 0.00018981502317836087, + "loss": 0.3573, + "step": 7201 + }, + { + "epoch": 0.5834413480233311, + "grad_norm": 0.030004773288965225, + "learning_rate": 0.00018981052252576625, + "loss": 0.3387, + "step": 7202 + }, + { + "epoch": 0.5835223590408296, + "grad_norm": 0.028721129521727562, + "learning_rate": 0.00018980602187317161, + "loss": 0.3108, + "step": 7203 + }, + { + "epoch": 0.583603370058328, + "grad_norm": 0.030161743983626366, + "learning_rate": 0.000189801521220577, + "loss": 0.3435, + "step": 7204 + }, + { + "epoch": 0.5836843810758263, + "grad_norm": 0.02853168360888958, + "learning_rate": 0.00018979702056798236, + "loss": 0.3471, + "step": 7205 + }, + { + "epoch": 0.5837653920933247, + "grad_norm": 0.030113236978650093, + "learning_rate": 0.00018979251991538772, + "loss": 0.3293, + "step": 7206 + }, + { + "epoch": 0.583846403110823, + "grad_norm": 
0.03496495261788368, + "learning_rate": 0.0001897880192627931, + "loss": 0.389, + "step": 7207 + }, + { + "epoch": 0.5839274141283215, + "grad_norm": 0.03152168542146683, + "learning_rate": 0.0001897835186101985, + "loss": 0.3555, + "step": 7208 + }, + { + "epoch": 0.5840084251458199, + "grad_norm": 0.03420482948422432, + "learning_rate": 0.00018977901795760386, + "loss": 0.3499, + "step": 7209 + }, + { + "epoch": 0.5840894361633182, + "grad_norm": 0.02827419713139534, + "learning_rate": 0.00018977451730500924, + "loss": 0.3431, + "step": 7210 + }, + { + "epoch": 0.5841704471808166, + "grad_norm": 0.032837990671396255, + "learning_rate": 0.0001897700166524146, + "loss": 0.3529, + "step": 7211 + }, + { + "epoch": 0.5842514581983149, + "grad_norm": 0.0367714948952198, + "learning_rate": 0.00018976551599981996, + "loss": 0.3061, + "step": 7212 + }, + { + "epoch": 0.5843324692158134, + "grad_norm": 0.02832326851785183, + "learning_rate": 0.00018976101534722535, + "loss": 0.312, + "step": 7213 + }, + { + "epoch": 0.5844134802333117, + "grad_norm": 0.03139325976371765, + "learning_rate": 0.00018975651469463074, + "loss": 0.3408, + "step": 7214 + }, + { + "epoch": 0.5844944912508101, + "grad_norm": 0.03237767145037651, + "learning_rate": 0.0001897520140420361, + "loss": 0.3624, + "step": 7215 + }, + { + "epoch": 0.5845755022683085, + "grad_norm": 0.03246236592531204, + "learning_rate": 0.00018974751338944148, + "loss": 0.344, + "step": 7216 + }, + { + "epoch": 0.5846565132858069, + "grad_norm": 0.033114735037088394, + "learning_rate": 0.00018974301273684684, + "loss": 0.3782, + "step": 7217 + }, + { + "epoch": 0.5847375243033053, + "grad_norm": 0.037286460399627686, + "learning_rate": 0.0001897385120842522, + "loss": 0.3727, + "step": 7218 + }, + { + "epoch": 0.5848185353208036, + "grad_norm": 0.03592041879892349, + "learning_rate": 0.00018973401143165762, + "loss": 0.4344, + "step": 7219 + }, + { + "epoch": 0.584899546338302, + "grad_norm": 0.03247380256652832, + "learning_rate": 0.00018972951077906298, + "loss": 0.3834, + "step": 7220 + }, + { + "epoch": 0.5849805573558003, + "grad_norm": 0.03495294228196144, + "learning_rate": 0.00018972501012646834, + "loss": 0.4064, + "step": 7221 + }, + { + "epoch": 0.5850615683732988, + "grad_norm": 0.028913801535964012, + "learning_rate": 0.00018972050947387373, + "loss": 0.3313, + "step": 7222 + }, + { + "epoch": 0.5851425793907972, + "grad_norm": 0.028750916942954063, + "learning_rate": 0.00018971600882127909, + "loss": 0.3515, + "step": 7223 + }, + { + "epoch": 0.5852235904082955, + "grad_norm": 0.03291678428649902, + "learning_rate": 0.00018971150816868445, + "loss": 0.3918, + "step": 7224 + }, + { + "epoch": 0.5853046014257939, + "grad_norm": 0.030185990035533905, + "learning_rate": 0.00018970700751608986, + "loss": 0.3017, + "step": 7225 + }, + { + "epoch": 0.5853856124432922, + "grad_norm": 0.03135082498192787, + "learning_rate": 0.00018970250686349522, + "loss": 0.3407, + "step": 7226 + }, + { + "epoch": 0.5854666234607907, + "grad_norm": 0.03142891451716423, + "learning_rate": 0.00018969800621090058, + "loss": 0.3675, + "step": 7227 + }, + { + "epoch": 0.5855476344782891, + "grad_norm": 0.03312395513057709, + "learning_rate": 0.00018969350555830597, + "loss": 0.3129, + "step": 7228 + }, + { + "epoch": 0.5856286454957874, + "grad_norm": 0.030316771939396858, + "learning_rate": 0.00018968900490571133, + "loss": 0.3024, + "step": 7229 + }, + { + "epoch": 0.5857096565132858, + "grad_norm": 0.03373432531952858, + "learning_rate": 
0.00018968450425311671, + "loss": 0.3566, + "step": 7230 + }, + { + "epoch": 0.5857906675307842, + "grad_norm": 0.03295702487230301, + "learning_rate": 0.0001896800036005221, + "loss": 0.3486, + "step": 7231 + }, + { + "epoch": 0.5858716785482826, + "grad_norm": 0.03206154331564903, + "learning_rate": 0.00018967550294792746, + "loss": 0.3414, + "step": 7232 + }, + { + "epoch": 0.585952689565781, + "grad_norm": 0.036674872040748596, + "learning_rate": 0.00018967100229533282, + "loss": 0.3827, + "step": 7233 + }, + { + "epoch": 0.5860337005832793, + "grad_norm": 0.028908833861351013, + "learning_rate": 0.0001896665016427382, + "loss": 0.3596, + "step": 7234 + }, + { + "epoch": 0.5861147116007777, + "grad_norm": 0.03456667810678482, + "learning_rate": 0.00018966200099014357, + "loss": 0.3575, + "step": 7235 + }, + { + "epoch": 0.5861957226182761, + "grad_norm": 0.02778259664773941, + "learning_rate": 0.00018965750033754896, + "loss": 0.3151, + "step": 7236 + }, + { + "epoch": 0.5862767336357745, + "grad_norm": 0.0309758223593235, + "learning_rate": 0.00018965299968495434, + "loss": 0.3201, + "step": 7237 + }, + { + "epoch": 0.5863577446532728, + "grad_norm": 0.02835778519511223, + "learning_rate": 0.0001896484990323597, + "loss": 0.3391, + "step": 7238 + }, + { + "epoch": 0.5864387556707712, + "grad_norm": 0.03552369028329849, + "learning_rate": 0.00018964399837976506, + "loss": 0.4067, + "step": 7239 + }, + { + "epoch": 0.5865197666882696, + "grad_norm": 0.03288602828979492, + "learning_rate": 0.00018963949772717045, + "loss": 0.3515, + "step": 7240 + }, + { + "epoch": 0.586600777705768, + "grad_norm": 0.03445067256689072, + "learning_rate": 0.0001896349970745758, + "loss": 0.3746, + "step": 7241 + }, + { + "epoch": 0.5866817887232664, + "grad_norm": 0.02727990224957466, + "learning_rate": 0.0001896304964219812, + "loss": 0.3459, + "step": 7242 + }, + { + "epoch": 0.5867627997407647, + "grad_norm": 0.027717137709259987, + "learning_rate": 0.00018962599576938659, + "loss": 0.3723, + "step": 7243 + }, + { + "epoch": 0.5868438107582631, + "grad_norm": 0.03080059587955475, + "learning_rate": 0.00018962149511679195, + "loss": 0.3556, + "step": 7244 + }, + { + "epoch": 0.5869248217757616, + "grad_norm": 0.03022826835513115, + "learning_rate": 0.0001896169944641973, + "loss": 0.3416, + "step": 7245 + }, + { + "epoch": 0.5870058327932599, + "grad_norm": 0.03362521901726723, + "learning_rate": 0.0001896124938116027, + "loss": 0.3765, + "step": 7246 + }, + { + "epoch": 0.5870868438107583, + "grad_norm": 0.02809925563633442, + "learning_rate": 0.00018960799315900805, + "loss": 0.3707, + "step": 7247 + }, + { + "epoch": 0.5871678548282566, + "grad_norm": 0.02911153994500637, + "learning_rate": 0.00018960349250641344, + "loss": 0.326, + "step": 7248 + }, + { + "epoch": 0.587248865845755, + "grad_norm": 0.027382275089621544, + "learning_rate": 0.00018959899185381883, + "loss": 0.2986, + "step": 7249 + }, + { + "epoch": 0.5873298768632534, + "grad_norm": 0.03484692424535751, + "learning_rate": 0.0001895944912012242, + "loss": 0.3613, + "step": 7250 + }, + { + "epoch": 0.5874108878807518, + "grad_norm": 0.03420795127749443, + "learning_rate": 0.00018958999054862955, + "loss": 0.3925, + "step": 7251 + }, + { + "epoch": 0.5874918988982502, + "grad_norm": 0.03072218969464302, + "learning_rate": 0.00018958548989603493, + "loss": 0.3445, + "step": 7252 + }, + { + "epoch": 0.5875729099157485, + "grad_norm": 0.028487669304013252, + "learning_rate": 0.0001895809892434403, + "loss": 0.3237, + "step": 7253 + }, + { + 
"epoch": 0.587653920933247, + "grad_norm": 0.030667198821902275, + "learning_rate": 0.00018957648859084568, + "loss": 0.3768, + "step": 7254 + }, + { + "epoch": 0.5877349319507453, + "grad_norm": 0.03093591146171093, + "learning_rate": 0.00018957198793825107, + "loss": 0.3227, + "step": 7255 + }, + { + "epoch": 0.5878159429682437, + "grad_norm": 0.02597019262611866, + "learning_rate": 0.00018956748728565643, + "loss": 0.3214, + "step": 7256 + }, + { + "epoch": 0.587896953985742, + "grad_norm": 0.02928873524069786, + "learning_rate": 0.0001895629866330618, + "loss": 0.35, + "step": 7257 + }, + { + "epoch": 0.5879779650032404, + "grad_norm": 0.03613502159714699, + "learning_rate": 0.00018955848598046718, + "loss": 0.3712, + "step": 7258 + }, + { + "epoch": 0.5880589760207389, + "grad_norm": 0.0315786674618721, + "learning_rate": 0.00018955398532787254, + "loss": 0.3383, + "step": 7259 + }, + { + "epoch": 0.5881399870382372, + "grad_norm": 0.03728878125548363, + "learning_rate": 0.00018954948467527792, + "loss": 0.3634, + "step": 7260 + }, + { + "epoch": 0.5882209980557356, + "grad_norm": 0.03164689242839813, + "learning_rate": 0.0001895449840226833, + "loss": 0.3308, + "step": 7261 + }, + { + "epoch": 0.5883020090732339, + "grad_norm": 0.03446730971336365, + "learning_rate": 0.00018954048337008867, + "loss": 0.4361, + "step": 7262 + }, + { + "epoch": 0.5883830200907323, + "grad_norm": 0.036188770085573196, + "learning_rate": 0.00018953598271749403, + "loss": 0.4244, + "step": 7263 + }, + { + "epoch": 0.5884640311082308, + "grad_norm": 0.031748078763484955, + "learning_rate": 0.00018953148206489942, + "loss": 0.365, + "step": 7264 + }, + { + "epoch": 0.5885450421257291, + "grad_norm": 0.03347034379839897, + "learning_rate": 0.00018952698141230478, + "loss": 0.3854, + "step": 7265 + }, + { + "epoch": 0.5886260531432275, + "grad_norm": 0.03278379887342453, + "learning_rate": 0.00018952248075971016, + "loss": 0.3878, + "step": 7266 + }, + { + "epoch": 0.5887070641607258, + "grad_norm": 0.032228413969278336, + "learning_rate": 0.00018951798010711555, + "loss": 0.3348, + "step": 7267 + }, + { + "epoch": 0.5887880751782243, + "grad_norm": 0.03474995121359825, + "learning_rate": 0.0001895134794545209, + "loss": 0.4198, + "step": 7268 + }, + { + "epoch": 0.5888690861957226, + "grad_norm": 0.03140547126531601, + "learning_rate": 0.00018950897880192627, + "loss": 0.3601, + "step": 7269 + }, + { + "epoch": 0.588950097213221, + "grad_norm": 0.02940104901790619, + "learning_rate": 0.00018950447814933166, + "loss": 0.3501, + "step": 7270 + }, + { + "epoch": 0.5890311082307194, + "grad_norm": 0.044192470610141754, + "learning_rate": 0.00018949997749673705, + "loss": 0.3985, + "step": 7271 + }, + { + "epoch": 0.5891121192482177, + "grad_norm": 0.030684489756822586, + "learning_rate": 0.0001894954768441424, + "loss": 0.359, + "step": 7272 + }, + { + "epoch": 0.5891931302657162, + "grad_norm": 0.0336628220975399, + "learning_rate": 0.0001894909761915478, + "loss": 0.3745, + "step": 7273 + }, + { + "epoch": 0.5892741412832145, + "grad_norm": 0.028922492638230324, + "learning_rate": 0.00018948647553895315, + "loss": 0.2946, + "step": 7274 + }, + { + "epoch": 0.5893551523007129, + "grad_norm": 0.032071251422166824, + "learning_rate": 0.00018948197488635851, + "loss": 0.3145, + "step": 7275 + }, + { + "epoch": 0.5894361633182112, + "grad_norm": 0.03068896383047104, + "learning_rate": 0.0001894774742337639, + "loss": 0.3355, + "step": 7276 + }, + { + "epoch": 0.5895171743357096, + "grad_norm": 0.03323281556367874, 
+ "learning_rate": 0.0001894729735811693, + "loss": 0.3663, + "step": 7277 + }, + { + "epoch": 0.5895981853532081, + "grad_norm": 0.035299479961395264, + "learning_rate": 0.00018946847292857465, + "loss": 0.3555, + "step": 7278 + }, + { + "epoch": 0.5896791963707064, + "grad_norm": 0.03173135221004486, + "learning_rate": 0.00018946397227598004, + "loss": 0.3478, + "step": 7279 + }, + { + "epoch": 0.5897602073882048, + "grad_norm": 0.03322529047727585, + "learning_rate": 0.0001894594716233854, + "loss": 0.399, + "step": 7280 + }, + { + "epoch": 0.5898412184057031, + "grad_norm": 0.031474243849515915, + "learning_rate": 0.00018945497097079076, + "loss": 0.3601, + "step": 7281 + }, + { + "epoch": 0.5899222294232016, + "grad_norm": 0.03509430214762688, + "learning_rate": 0.00018945047031819614, + "loss": 0.3971, + "step": 7282 + }, + { + "epoch": 0.5900032404407, + "grad_norm": 0.033451810479164124, + "learning_rate": 0.00018944596966560153, + "loss": 0.3539, + "step": 7283 + }, + { + "epoch": 0.5900842514581983, + "grad_norm": 0.028561068698763847, + "learning_rate": 0.0001894414690130069, + "loss": 0.3101, + "step": 7284 + }, + { + "epoch": 0.5901652624756967, + "grad_norm": 0.0287350844591856, + "learning_rate": 0.00018943696836041228, + "loss": 0.3632, + "step": 7285 + }, + { + "epoch": 0.590246273493195, + "grad_norm": 0.030074043199419975, + "learning_rate": 0.00018943246770781764, + "loss": 0.334, + "step": 7286 + }, + { + "epoch": 0.5903272845106935, + "grad_norm": 0.03051457181572914, + "learning_rate": 0.000189427967055223, + "loss": 0.341, + "step": 7287 + }, + { + "epoch": 0.5904082955281919, + "grad_norm": 0.028257975354790688, + "learning_rate": 0.00018942346640262838, + "loss": 0.3435, + "step": 7288 + }, + { + "epoch": 0.5904893065456902, + "grad_norm": 0.026414060965180397, + "learning_rate": 0.00018941896575003377, + "loss": 0.2944, + "step": 7289 + }, + { + "epoch": 0.5905703175631886, + "grad_norm": 0.029365338385105133, + "learning_rate": 0.00018941446509743913, + "loss": 0.3395, + "step": 7290 + }, + { + "epoch": 0.5906513285806869, + "grad_norm": 0.03021685592830181, + "learning_rate": 0.00018940996444484452, + "loss": 0.3159, + "step": 7291 + }, + { + "epoch": 0.5907323395981854, + "grad_norm": 0.026979271322488785, + "learning_rate": 0.00018940546379224988, + "loss": 0.3154, + "step": 7292 + }, + { + "epoch": 0.5908133506156837, + "grad_norm": 0.03209391236305237, + "learning_rate": 0.00018940096313965524, + "loss": 0.368, + "step": 7293 + }, + { + "epoch": 0.5908943616331821, + "grad_norm": 0.032004281878471375, + "learning_rate": 0.00018939646248706063, + "loss": 0.3188, + "step": 7294 + }, + { + "epoch": 0.5909753726506805, + "grad_norm": 0.032680317759513855, + "learning_rate": 0.000189391961834466, + "loss": 0.3584, + "step": 7295 + }, + { + "epoch": 0.5910563836681789, + "grad_norm": 0.03193636238574982, + "learning_rate": 0.00018938746118187137, + "loss": 0.3295, + "step": 7296 + }, + { + "epoch": 0.5911373946856773, + "grad_norm": 0.029368802905082703, + "learning_rate": 0.00018938296052927676, + "loss": 0.3596, + "step": 7297 + }, + { + "epoch": 0.5912184057031756, + "grad_norm": 0.030950840562582016, + "learning_rate": 0.00018937845987668212, + "loss": 0.3813, + "step": 7298 + }, + { + "epoch": 0.591299416720674, + "grad_norm": 0.03713708743453026, + "learning_rate": 0.0001893739592240875, + "loss": 0.415, + "step": 7299 + }, + { + "epoch": 0.5913804277381723, + "grad_norm": 0.02990836836397648, + "learning_rate": 0.0001893694585714929, + "loss": 0.3518, + 
"step": 7300 + }, + { + "epoch": 0.5914614387556708, + "grad_norm": 0.029787639155983925, + "learning_rate": 0.00018936495791889825, + "loss": 0.3212, + "step": 7301 + }, + { + "epoch": 0.5915424497731692, + "grad_norm": 0.03178705275058746, + "learning_rate": 0.00018936045726630361, + "loss": 0.3802, + "step": 7302 + }, + { + "epoch": 0.5916234607906675, + "grad_norm": 0.034910041838884354, + "learning_rate": 0.000189355956613709, + "loss": 0.3656, + "step": 7303 + }, + { + "epoch": 0.5917044718081659, + "grad_norm": 0.02986535057425499, + "learning_rate": 0.00018935145596111436, + "loss": 0.322, + "step": 7304 + }, + { + "epoch": 0.5917854828256643, + "grad_norm": 0.03602313622832298, + "learning_rate": 0.00018934695530851975, + "loss": 0.3598, + "step": 7305 + }, + { + "epoch": 0.5918664938431627, + "grad_norm": 0.028011471033096313, + "learning_rate": 0.00018934245465592514, + "loss": 0.3045, + "step": 7306 + }, + { + "epoch": 0.5919475048606611, + "grad_norm": 0.03489091992378235, + "learning_rate": 0.0001893379540033305, + "loss": 0.3552, + "step": 7307 + }, + { + "epoch": 0.5920285158781594, + "grad_norm": 0.033708952367305756, + "learning_rate": 0.00018933345335073586, + "loss": 0.3914, + "step": 7308 + }, + { + "epoch": 0.5921095268956578, + "grad_norm": 0.03242769464850426, + "learning_rate": 0.00018932895269814124, + "loss": 0.35, + "step": 7309 + }, + { + "epoch": 0.5921905379131562, + "grad_norm": 0.03306904435157776, + "learning_rate": 0.0001893244520455466, + "loss": 0.3893, + "step": 7310 + }, + { + "epoch": 0.5922715489306546, + "grad_norm": 0.033809393644332886, + "learning_rate": 0.000189319951392952, + "loss": 0.4143, + "step": 7311 + }, + { + "epoch": 0.592352559948153, + "grad_norm": 0.030917203053832054, + "learning_rate": 0.00018931545074035738, + "loss": 0.3322, + "step": 7312 + }, + { + "epoch": 0.5924335709656513, + "grad_norm": 0.03382820636034012, + "learning_rate": 0.00018931095008776274, + "loss": 0.3669, + "step": 7313 + }, + { + "epoch": 0.5925145819831497, + "grad_norm": 0.03001822717487812, + "learning_rate": 0.0001893064494351681, + "loss": 0.3484, + "step": 7314 + }, + { + "epoch": 0.5925955930006481, + "grad_norm": 0.03102373518049717, + "learning_rate": 0.00018930194878257348, + "loss": 0.4064, + "step": 7315 + }, + { + "epoch": 0.5926766040181465, + "grad_norm": 0.03309882804751396, + "learning_rate": 0.00018929744812997884, + "loss": 0.3638, + "step": 7316 + }, + { + "epoch": 0.5927576150356448, + "grad_norm": 0.03323843702673912, + "learning_rate": 0.00018929294747738423, + "loss": 0.4001, + "step": 7317 + }, + { + "epoch": 0.5928386260531432, + "grad_norm": 0.030602650716900826, + "learning_rate": 0.00018928844682478962, + "loss": 0.3165, + "step": 7318 + }, + { + "epoch": 0.5929196370706417, + "grad_norm": 0.025571037083864212, + "learning_rate": 0.00018928394617219498, + "loss": 0.2903, + "step": 7319 + }, + { + "epoch": 0.59300064808814, + "grad_norm": 0.03153804689645767, + "learning_rate": 0.00018927944551960034, + "loss": 0.3626, + "step": 7320 + }, + { + "epoch": 0.5930816591056384, + "grad_norm": 0.030782267451286316, + "learning_rate": 0.00018927494486700573, + "loss": 0.3972, + "step": 7321 + }, + { + "epoch": 0.5931626701231367, + "grad_norm": 0.030006734654307365, + "learning_rate": 0.0001892704442144111, + "loss": 0.3545, + "step": 7322 + }, + { + "epoch": 0.5932436811406351, + "grad_norm": 0.028132524341344833, + "learning_rate": 0.00018926594356181647, + "loss": 0.2896, + "step": 7323 + }, + { + "epoch": 0.5933246921581335, + 
"grad_norm": 0.033880989998579025, + "learning_rate": 0.00018926144290922186, + "loss": 0.4088, + "step": 7324 + }, + { + "epoch": 0.5934057031756319, + "grad_norm": 0.03421669453382492, + "learning_rate": 0.00018925694225662722, + "loss": 0.3562, + "step": 7325 + }, + { + "epoch": 0.5934867141931303, + "grad_norm": 0.03325583040714264, + "learning_rate": 0.00018925244160403258, + "loss": 0.3458, + "step": 7326 + }, + { + "epoch": 0.5935677252106286, + "grad_norm": 0.030235623940825462, + "learning_rate": 0.00018924794095143797, + "loss": 0.3524, + "step": 7327 + }, + { + "epoch": 0.593648736228127, + "grad_norm": 0.02768390066921711, + "learning_rate": 0.00018924344029884333, + "loss": 0.3424, + "step": 7328 + }, + { + "epoch": 0.5937297472456254, + "grad_norm": 0.03422306105494499, + "learning_rate": 0.00018923893964624872, + "loss": 0.3298, + "step": 7329 + }, + { + "epoch": 0.5938107582631238, + "grad_norm": 0.029942166060209274, + "learning_rate": 0.0001892344389936541, + "loss": 0.3097, + "step": 7330 + }, + { + "epoch": 0.5938917692806222, + "grad_norm": 0.03228619322180748, + "learning_rate": 0.00018922993834105946, + "loss": 0.3228, + "step": 7331 + }, + { + "epoch": 0.5939727802981205, + "grad_norm": 0.029120970517396927, + "learning_rate": 0.00018922543768846482, + "loss": 0.3278, + "step": 7332 + }, + { + "epoch": 0.594053791315619, + "grad_norm": 0.03116501122713089, + "learning_rate": 0.0001892209370358702, + "loss": 0.3299, + "step": 7333 + }, + { + "epoch": 0.5941348023331173, + "grad_norm": 0.026393789798021317, + "learning_rate": 0.00018921643638327557, + "loss": 0.296, + "step": 7334 + }, + { + "epoch": 0.5942158133506157, + "grad_norm": 0.03156831115484238, + "learning_rate": 0.00018921193573068096, + "loss": 0.3492, + "step": 7335 + }, + { + "epoch": 0.594296824368114, + "grad_norm": 0.028718816116452217, + "learning_rate": 0.00018920743507808634, + "loss": 0.3266, + "step": 7336 + }, + { + "epoch": 0.5943778353856124, + "grad_norm": 0.032878655940294266, + "learning_rate": 0.0001892029344254917, + "loss": 0.3585, + "step": 7337 + }, + { + "epoch": 0.5944588464031109, + "grad_norm": 0.028278445824980736, + "learning_rate": 0.00018919843377289706, + "loss": 0.3349, + "step": 7338 + }, + { + "epoch": 0.5945398574206092, + "grad_norm": 0.03184044361114502, + "learning_rate": 0.00018919393312030245, + "loss": 0.345, + "step": 7339 + }, + { + "epoch": 0.5946208684381076, + "grad_norm": 0.034407783299684525, + "learning_rate": 0.0001891894324677078, + "loss": 0.3698, + "step": 7340 + }, + { + "epoch": 0.5947018794556059, + "grad_norm": 0.02938602864742279, + "learning_rate": 0.0001891849318151132, + "loss": 0.322, + "step": 7341 + }, + { + "epoch": 0.5947828904731044, + "grad_norm": 0.02893815189599991, + "learning_rate": 0.00018918043116251859, + "loss": 0.3289, + "step": 7342 + }, + { + "epoch": 0.5948639014906028, + "grad_norm": 0.03124549798667431, + "learning_rate": 0.00018917593050992395, + "loss": 0.3528, + "step": 7343 + }, + { + "epoch": 0.5949449125081011, + "grad_norm": 0.03118320368230343, + "learning_rate": 0.0001891714298573293, + "loss": 0.3395, + "step": 7344 + }, + { + "epoch": 0.5950259235255995, + "grad_norm": 0.034015022218227386, + "learning_rate": 0.0001891669292047347, + "loss": 0.3687, + "step": 7345 + }, + { + "epoch": 0.5951069345430978, + "grad_norm": 0.028784219175577164, + "learning_rate": 0.00018916242855214005, + "loss": 0.3328, + "step": 7346 + }, + { + "epoch": 0.5951879455605963, + "grad_norm": 0.030091838911175728, + "learning_rate": 
0.00018915792789954544, + "loss": 0.3041, + "step": 7347 + }, + { + "epoch": 0.5952689565780946, + "grad_norm": 0.03068556636571884, + "learning_rate": 0.00018915342724695083, + "loss": 0.3472, + "step": 7348 + }, + { + "epoch": 0.595349967595593, + "grad_norm": 0.031809356063604355, + "learning_rate": 0.0001891489265943562, + "loss": 0.3515, + "step": 7349 + }, + { + "epoch": 0.5954309786130914, + "grad_norm": 0.03569292649626732, + "learning_rate": 0.00018914442594176155, + "loss": 0.3772, + "step": 7350 + }, + { + "epoch": 0.5955119896305897, + "grad_norm": 0.029809661209583282, + "learning_rate": 0.00018913992528916693, + "loss": 0.3893, + "step": 7351 + }, + { + "epoch": 0.5955930006480882, + "grad_norm": 0.03562194108963013, + "learning_rate": 0.00018913542463657232, + "loss": 0.3478, + "step": 7352 + }, + { + "epoch": 0.5956740116655865, + "grad_norm": 0.030969291925430298, + "learning_rate": 0.00018913092398397768, + "loss": 0.3484, + "step": 7353 + }, + { + "epoch": 0.5957550226830849, + "grad_norm": 0.02741086110472679, + "learning_rate": 0.00018912642333138307, + "loss": 0.3232, + "step": 7354 + }, + { + "epoch": 0.5958360337005832, + "grad_norm": 0.03458785638213158, + "learning_rate": 0.00018912192267878843, + "loss": 0.3574, + "step": 7355 + }, + { + "epoch": 0.5959170447180817, + "grad_norm": 0.03107859566807747, + "learning_rate": 0.0001891174220261938, + "loss": 0.3316, + "step": 7356 + }, + { + "epoch": 0.5959980557355801, + "grad_norm": 0.032016459852457047, + "learning_rate": 0.00018911292137359918, + "loss": 0.3892, + "step": 7357 + }, + { + "epoch": 0.5960790667530784, + "grad_norm": 0.02998381108045578, + "learning_rate": 0.00018910842072100456, + "loss": 0.3514, + "step": 7358 + }, + { + "epoch": 0.5961600777705768, + "grad_norm": 0.03130162134766579, + "learning_rate": 0.00018910392006840992, + "loss": 0.3637, + "step": 7359 + }, + { + "epoch": 0.5962410887880751, + "grad_norm": 0.030972221866250038, + "learning_rate": 0.0001890994194158153, + "loss": 0.3611, + "step": 7360 + }, + { + "epoch": 0.5963220998055736, + "grad_norm": 0.03162587434053421, + "learning_rate": 0.00018909491876322067, + "loss": 0.4011, + "step": 7361 + }, + { + "epoch": 0.596403110823072, + "grad_norm": 0.030603982508182526, + "learning_rate": 0.00018909041811062606, + "loss": 0.3557, + "step": 7362 + }, + { + "epoch": 0.5964841218405703, + "grad_norm": 0.029461462050676346, + "learning_rate": 0.00018908591745803142, + "loss": 0.3157, + "step": 7363 + }, + { + "epoch": 0.5965651328580687, + "grad_norm": 0.03398451954126358, + "learning_rate": 0.0001890814168054368, + "loss": 0.346, + "step": 7364 + }, + { + "epoch": 0.596646143875567, + "grad_norm": 0.02900012768805027, + "learning_rate": 0.00018907691615284216, + "loss": 0.321, + "step": 7365 + }, + { + "epoch": 0.5967271548930655, + "grad_norm": 0.03144826740026474, + "learning_rate": 0.00018907241550024755, + "loss": 0.3189, + "step": 7366 + }, + { + "epoch": 0.5968081659105638, + "grad_norm": 0.02788534387946129, + "learning_rate": 0.0001890679148476529, + "loss": 0.3219, + "step": 7367 + }, + { + "epoch": 0.5968891769280622, + "grad_norm": 0.030930208042263985, + "learning_rate": 0.0001890634141950583, + "loss": 0.3584, + "step": 7368 + }, + { + "epoch": 0.5969701879455606, + "grad_norm": 0.03033963218331337, + "learning_rate": 0.00018905891354246366, + "loss": 0.3441, + "step": 7369 + }, + { + "epoch": 0.597051198963059, + "grad_norm": 0.0332016684114933, + "learning_rate": 0.00018905441288986905, + "loss": 0.3553, + "step": 7370 + }, + 
{ + "epoch": 0.5971322099805574, + "grad_norm": 0.028269365429878235, + "learning_rate": 0.0001890499122372744, + "loss": 0.2937, + "step": 7371 + }, + { + "epoch": 0.5972132209980557, + "grad_norm": 0.029664890840649605, + "learning_rate": 0.0001890454115846798, + "loss": 0.3122, + "step": 7372 + }, + { + "epoch": 0.5972942320155541, + "grad_norm": 0.032333459705114365, + "learning_rate": 0.00018904091093208515, + "loss": 0.3575, + "step": 7373 + }, + { + "epoch": 0.5973752430330524, + "grad_norm": 0.03276979178190231, + "learning_rate": 0.00018903641027949054, + "loss": 0.3543, + "step": 7374 + }, + { + "epoch": 0.5974562540505509, + "grad_norm": 0.030932294204831123, + "learning_rate": 0.00018903190962689593, + "loss": 0.3839, + "step": 7375 + }, + { + "epoch": 0.5975372650680493, + "grad_norm": 0.034941308200359344, + "learning_rate": 0.0001890274089743013, + "loss": 0.3879, + "step": 7376 + }, + { + "epoch": 0.5976182760855476, + "grad_norm": 0.03369393199682236, + "learning_rate": 0.00018902290832170665, + "loss": 0.347, + "step": 7377 + }, + { + "epoch": 0.597699287103046, + "grad_norm": 0.035528942942619324, + "learning_rate": 0.00018901840766911204, + "loss": 0.38, + "step": 7378 + }, + { + "epoch": 0.5977802981205443, + "grad_norm": 0.03295198827981949, + "learning_rate": 0.0001890139070165174, + "loss": 0.3525, + "step": 7379 + }, + { + "epoch": 0.5978613091380428, + "grad_norm": 0.029161540791392326, + "learning_rate": 0.00018900940636392278, + "loss": 0.3543, + "step": 7380 + }, + { + "epoch": 0.5979423201555412, + "grad_norm": 0.026549329981207848, + "learning_rate": 0.00018900490571132817, + "loss": 0.2638, + "step": 7381 + }, + { + "epoch": 0.5980233311730395, + "grad_norm": 0.03204452246427536, + "learning_rate": 0.00018900040505873353, + "loss": 0.3266, + "step": 7382 + }, + { + "epoch": 0.5981043421905379, + "grad_norm": 0.02657189778983593, + "learning_rate": 0.0001889959044061389, + "loss": 0.304, + "step": 7383 + }, + { + "epoch": 0.5981853532080363, + "grad_norm": 0.03293665871024132, + "learning_rate": 0.00018899140375354428, + "loss": 0.3721, + "step": 7384 + }, + { + "epoch": 0.5982663642255347, + "grad_norm": 0.032227154821157455, + "learning_rate": 0.00018898690310094964, + "loss": 0.3504, + "step": 7385 + }, + { + "epoch": 0.598347375243033, + "grad_norm": 0.03022935800254345, + "learning_rate": 0.00018898240244835502, + "loss": 0.388, + "step": 7386 + }, + { + "epoch": 0.5984283862605314, + "grad_norm": 0.032245948910713196, + "learning_rate": 0.0001889779017957604, + "loss": 0.3542, + "step": 7387 + }, + { + "epoch": 0.5985093972780298, + "grad_norm": 0.029357144609093666, + "learning_rate": 0.00018897340114316577, + "loss": 0.3575, + "step": 7388 + }, + { + "epoch": 0.5985904082955282, + "grad_norm": 0.03233001008629799, + "learning_rate": 0.00018896890049057113, + "loss": 0.388, + "step": 7389 + }, + { + "epoch": 0.5986714193130266, + "grad_norm": 0.033273451030254364, + "learning_rate": 0.00018896439983797652, + "loss": 0.3334, + "step": 7390 + }, + { + "epoch": 0.5987524303305249, + "grad_norm": 0.03183024749159813, + "learning_rate": 0.00018895989918538188, + "loss": 0.3427, + "step": 7391 + }, + { + "epoch": 0.5988334413480233, + "grad_norm": 0.0347750298678875, + "learning_rate": 0.00018895539853278727, + "loss": 0.3704, + "step": 7392 + }, + { + "epoch": 0.5989144523655218, + "grad_norm": 0.03242835775017738, + "learning_rate": 0.00018895089788019265, + "loss": 0.369, + "step": 7393 + }, + { + "epoch": 0.5989954633830201, + "grad_norm": 
0.026880014687776566, + "learning_rate": 0.000188946397227598, + "loss": 0.3017, + "step": 7394 + }, + { + "epoch": 0.5990764744005185, + "grad_norm": 0.03177474066615105, + "learning_rate": 0.00018894189657500337, + "loss": 0.3304, + "step": 7395 + }, + { + "epoch": 0.5991574854180168, + "grad_norm": 0.03396380692720413, + "learning_rate": 0.00018893739592240876, + "loss": 0.3638, + "step": 7396 + }, + { + "epoch": 0.5992384964355152, + "grad_norm": 0.02995561808347702, + "learning_rate": 0.00018893289526981412, + "loss": 0.3706, + "step": 7397 + }, + { + "epoch": 0.5993195074530137, + "grad_norm": 0.030224697664380074, + "learning_rate": 0.0001889283946172195, + "loss": 0.3403, + "step": 7398 + }, + { + "epoch": 0.599400518470512, + "grad_norm": 0.0329534150660038, + "learning_rate": 0.0001889238939646249, + "loss": 0.327, + "step": 7399 + }, + { + "epoch": 0.5994815294880104, + "grad_norm": 0.03500063344836235, + "learning_rate": 0.00018891939331203025, + "loss": 0.365, + "step": 7400 + }, + { + "epoch": 0.5995625405055087, + "grad_norm": 0.03050912730395794, + "learning_rate": 0.00018891489265943561, + "loss": 0.3489, + "step": 7401 + }, + { + "epoch": 0.5996435515230071, + "grad_norm": 0.03011389449238777, + "learning_rate": 0.000188910392006841, + "loss": 0.3636, + "step": 7402 + }, + { + "epoch": 0.5997245625405055, + "grad_norm": 0.03238969296216965, + "learning_rate": 0.00018890589135424636, + "loss": 0.3729, + "step": 7403 + }, + { + "epoch": 0.5998055735580039, + "grad_norm": 0.02938798815011978, + "learning_rate": 0.00018890139070165175, + "loss": 0.346, + "step": 7404 + }, + { + "epoch": 0.5998865845755023, + "grad_norm": 0.03192545101046562, + "learning_rate": 0.00018889689004905714, + "loss": 0.3871, + "step": 7405 + }, + { + "epoch": 0.5999675955930006, + "grad_norm": 0.03070968948304653, + "learning_rate": 0.0001888923893964625, + "loss": 0.3447, + "step": 7406 + }, + { + "epoch": 0.6000486066104991, + "grad_norm": 0.0291475597769022, + "learning_rate": 0.00018888788874386786, + "loss": 0.3284, + "step": 7407 + }, + { + "epoch": 0.6001296176279974, + "grad_norm": 0.029424374923110008, + "learning_rate": 0.00018888338809127324, + "loss": 0.3387, + "step": 7408 + }, + { + "epoch": 0.6002106286454958, + "grad_norm": 0.029002133756875992, + "learning_rate": 0.0001888788874386786, + "loss": 0.3533, + "step": 7409 + }, + { + "epoch": 0.6002916396629941, + "grad_norm": 0.03278941661119461, + "learning_rate": 0.000188874386786084, + "loss": 0.3748, + "step": 7410 + }, + { + "epoch": 0.6003726506804925, + "grad_norm": 0.02870086394250393, + "learning_rate": 0.00018886988613348938, + "loss": 0.3564, + "step": 7411 + }, + { + "epoch": 0.600453661697991, + "grad_norm": 0.033800091594457626, + "learning_rate": 0.00018886538548089474, + "loss": 0.416, + "step": 7412 + }, + { + "epoch": 0.6005346727154893, + "grad_norm": 0.02814219333231449, + "learning_rate": 0.0001888608848283001, + "loss": 0.3087, + "step": 7413 + }, + { + "epoch": 0.6006156837329877, + "grad_norm": 0.031001700088381767, + "learning_rate": 0.00018885638417570549, + "loss": 0.3408, + "step": 7414 + }, + { + "epoch": 0.600696694750486, + "grad_norm": 0.030194900929927826, + "learning_rate": 0.00018885188352311085, + "loss": 0.3389, + "step": 7415 + }, + { + "epoch": 0.6007777057679844, + "grad_norm": 0.03206025809049606, + "learning_rate": 0.00018884738287051623, + "loss": 0.3463, + "step": 7416 + }, + { + "epoch": 0.6008587167854829, + "grad_norm": 0.0302756205201149, + "learning_rate": 0.00018884288221792162, + 
"loss": 0.3369, + "step": 7417 + }, + { + "epoch": 0.6009397278029812, + "grad_norm": 0.0310045275837183, + "learning_rate": 0.00018883838156532698, + "loss": 0.4104, + "step": 7418 + }, + { + "epoch": 0.6010207388204796, + "grad_norm": 0.029064510017633438, + "learning_rate": 0.00018883388091273234, + "loss": 0.3764, + "step": 7419 + }, + { + "epoch": 0.6011017498379779, + "grad_norm": 0.02953030914068222, + "learning_rate": 0.00018882938026013773, + "loss": 0.3594, + "step": 7420 + }, + { + "epoch": 0.6011827608554764, + "grad_norm": 0.03259488195180893, + "learning_rate": 0.0001888248796075431, + "loss": 0.3665, + "step": 7421 + }, + { + "epoch": 0.6012637718729748, + "grad_norm": 0.03570137545466423, + "learning_rate": 0.00018882037895494847, + "loss": 0.3738, + "step": 7422 + }, + { + "epoch": 0.6013447828904731, + "grad_norm": 0.03246882185339928, + "learning_rate": 0.00018881587830235386, + "loss": 0.3403, + "step": 7423 + }, + { + "epoch": 0.6014257939079715, + "grad_norm": 0.03044661320745945, + "learning_rate": 0.00018881137764975922, + "loss": 0.3397, + "step": 7424 + }, + { + "epoch": 0.6015068049254698, + "grad_norm": 0.033912573009729385, + "learning_rate": 0.00018880687699716458, + "loss": 0.3188, + "step": 7425 + }, + { + "epoch": 0.6015878159429683, + "grad_norm": 0.03139291703701019, + "learning_rate": 0.00018880237634456997, + "loss": 0.4062, + "step": 7426 + }, + { + "epoch": 0.6016688269604666, + "grad_norm": 0.025072293356060982, + "learning_rate": 0.00018879787569197533, + "loss": 0.2751, + "step": 7427 + }, + { + "epoch": 0.601749837977965, + "grad_norm": 0.0371062196791172, + "learning_rate": 0.00018879337503938072, + "loss": 0.3948, + "step": 7428 + }, + { + "epoch": 0.6018308489954634, + "grad_norm": 0.03248792514204979, + "learning_rate": 0.0001887888743867861, + "loss": 0.3191, + "step": 7429 + }, + { + "epoch": 0.6019118600129617, + "grad_norm": 0.029657980427145958, + "learning_rate": 0.00018878437373419146, + "loss": 0.3028, + "step": 7430 + }, + { + "epoch": 0.6019928710304602, + "grad_norm": 0.029270123690366745, + "learning_rate": 0.00018877987308159685, + "loss": 0.3518, + "step": 7431 + }, + { + "epoch": 0.6020738820479585, + "grad_norm": 0.032467812299728394, + "learning_rate": 0.0001887753724290022, + "loss": 0.3462, + "step": 7432 + }, + { + "epoch": 0.6021548930654569, + "grad_norm": 0.028843361884355545, + "learning_rate": 0.0001887708717764076, + "loss": 0.317, + "step": 7433 + }, + { + "epoch": 0.6022359040829552, + "grad_norm": 0.029680926352739334, + "learning_rate": 0.00018876637112381296, + "loss": 0.3395, + "step": 7434 + }, + { + "epoch": 0.6023169151004537, + "grad_norm": 0.03853990137577057, + "learning_rate": 0.00018876187047121834, + "loss": 0.39, + "step": 7435 + }, + { + "epoch": 0.6023979261179521, + "grad_norm": 0.034090928733348846, + "learning_rate": 0.0001887573698186237, + "loss": 0.355, + "step": 7436 + }, + { + "epoch": 0.6024789371354504, + "grad_norm": 0.030918769538402557, + "learning_rate": 0.0001887528691660291, + "loss": 0.3414, + "step": 7437 + }, + { + "epoch": 0.6025599481529488, + "grad_norm": 0.029954658821225166, + "learning_rate": 0.00018874836851343445, + "loss": 0.3191, + "step": 7438 + }, + { + "epoch": 0.6026409591704471, + "grad_norm": 0.03110465221107006, + "learning_rate": 0.00018874386786083984, + "loss": 0.3761, + "step": 7439 + }, + { + "epoch": 0.6027219701879456, + "grad_norm": 0.029768571257591248, + "learning_rate": 0.0001887393672082452, + "loss": 0.3439, + "step": 7440 + }, + { + "epoch": 
0.602802981205444, + "grad_norm": 0.029363000765442848, + "learning_rate": 0.00018873486655565059, + "loss": 0.3499, + "step": 7441 + }, + { + "epoch": 0.6028839922229423, + "grad_norm": 0.03165175020694733, + "learning_rate": 0.00018873036590305595, + "loss": 0.4128, + "step": 7442 + }, + { + "epoch": 0.6029650032404407, + "grad_norm": 0.03060912899672985, + "learning_rate": 0.00018872586525046133, + "loss": 0.3343, + "step": 7443 + }, + { + "epoch": 0.6030460142579391, + "grad_norm": 0.03013191744685173, + "learning_rate": 0.0001887213645978667, + "loss": 0.3377, + "step": 7444 + }, + { + "epoch": 0.6031270252754375, + "grad_norm": 0.030229419469833374, + "learning_rate": 0.00018871686394527208, + "loss": 0.3538, + "step": 7445 + }, + { + "epoch": 0.6032080362929358, + "grad_norm": 0.028185758739709854, + "learning_rate": 0.00018871236329267744, + "loss": 0.3352, + "step": 7446 + }, + { + "epoch": 0.6032890473104342, + "grad_norm": 0.029758667573332787, + "learning_rate": 0.00018870786264008283, + "loss": 0.3567, + "step": 7447 + }, + { + "epoch": 0.6033700583279326, + "grad_norm": 0.028832679614424706, + "learning_rate": 0.0001887033619874882, + "loss": 0.3317, + "step": 7448 + }, + { + "epoch": 0.603451069345431, + "grad_norm": 0.03557024523615837, + "learning_rate": 0.00018869886133489357, + "loss": 0.3251, + "step": 7449 + }, + { + "epoch": 0.6035320803629294, + "grad_norm": 0.0323876328766346, + "learning_rate": 0.00018869436068229893, + "loss": 0.35, + "step": 7450 + }, + { + "epoch": 0.6036130913804277, + "grad_norm": 0.031438153237104416, + "learning_rate": 0.00018868986002970432, + "loss": 0.3505, + "step": 7451 + }, + { + "epoch": 0.6036941023979261, + "grad_norm": 0.031943872570991516, + "learning_rate": 0.00018868535937710968, + "loss": 0.3453, + "step": 7452 + }, + { + "epoch": 0.6037751134154244, + "grad_norm": 0.03199175372719765, + "learning_rate": 0.00018868085872451507, + "loss": 0.3101, + "step": 7453 + }, + { + "epoch": 0.6038561244329229, + "grad_norm": 0.03276806324720383, + "learning_rate": 0.00018867635807192043, + "loss": 0.3513, + "step": 7454 + }, + { + "epoch": 0.6039371354504213, + "grad_norm": 0.03356426954269409, + "learning_rate": 0.00018867185741932582, + "loss": 0.3261, + "step": 7455 + }, + { + "epoch": 0.6040181464679196, + "grad_norm": 0.03232236206531525, + "learning_rate": 0.0001886673567667312, + "loss": 0.3478, + "step": 7456 + }, + { + "epoch": 0.604099157485418, + "grad_norm": 0.0322866290807724, + "learning_rate": 0.00018866285611413656, + "loss": 0.3869, + "step": 7457 + }, + { + "epoch": 0.6041801685029164, + "grad_norm": 0.03561306744813919, + "learning_rate": 0.00018865835546154192, + "loss": 0.3321, + "step": 7458 + }, + { + "epoch": 0.6042611795204148, + "grad_norm": 0.027576128020882607, + "learning_rate": 0.0001886538548089473, + "loss": 0.3066, + "step": 7459 + }, + { + "epoch": 0.6043421905379132, + "grad_norm": 0.036662038415670395, + "learning_rate": 0.00018864935415635267, + "loss": 0.3476, + "step": 7460 + }, + { + "epoch": 0.6044232015554115, + "grad_norm": 0.032418131828308105, + "learning_rate": 0.00018864485350375806, + "loss": 0.3525, + "step": 7461 + }, + { + "epoch": 0.6045042125729099, + "grad_norm": 0.03187812119722366, + "learning_rate": 0.00018864035285116345, + "loss": 0.3518, + "step": 7462 + }, + { + "epoch": 0.6045852235904083, + "grad_norm": 0.03083401545882225, + "learning_rate": 0.0001886358521985688, + "loss": 0.346, + "step": 7463 + }, + { + "epoch": 0.6046662346079067, + "grad_norm": 0.030447332188487053, + 
"learning_rate": 0.00018863135154597417, + "loss": 0.3357, + "step": 7464 + }, + { + "epoch": 0.604747245625405, + "grad_norm": 0.02773471362888813, + "learning_rate": 0.00018862685089337955, + "loss": 0.3616, + "step": 7465 + }, + { + "epoch": 0.6048282566429034, + "grad_norm": 0.029380742460489273, + "learning_rate": 0.0001886223502407849, + "loss": 0.3325, + "step": 7466 + }, + { + "epoch": 0.6049092676604018, + "grad_norm": 0.03036758117377758, + "learning_rate": 0.0001886178495881903, + "loss": 0.3378, + "step": 7467 + }, + { + "epoch": 0.6049902786779002, + "grad_norm": 0.028923461213707924, + "learning_rate": 0.0001886133489355957, + "loss": 0.3138, + "step": 7468 + }, + { + "epoch": 0.6050712896953986, + "grad_norm": 0.02898426540195942, + "learning_rate": 0.00018860884828300105, + "loss": 0.3135, + "step": 7469 + }, + { + "epoch": 0.6051523007128969, + "grad_norm": 0.0326344259083271, + "learning_rate": 0.0001886043476304064, + "loss": 0.406, + "step": 7470 + }, + { + "epoch": 0.6052333117303953, + "grad_norm": 0.029875773936510086, + "learning_rate": 0.0001885998469778118, + "loss": 0.355, + "step": 7471 + }, + { + "epoch": 0.6053143227478938, + "grad_norm": 0.03648769482970238, + "learning_rate": 0.00018859534632521715, + "loss": 0.4147, + "step": 7472 + }, + { + "epoch": 0.6053953337653921, + "grad_norm": 0.02821452170610428, + "learning_rate": 0.00018859084567262254, + "loss": 0.3285, + "step": 7473 + }, + { + "epoch": 0.6054763447828905, + "grad_norm": 0.034381620585918427, + "learning_rate": 0.00018858634502002793, + "loss": 0.39, + "step": 7474 + }, + { + "epoch": 0.6055573558003888, + "grad_norm": 0.032561976462602615, + "learning_rate": 0.0001885818443674333, + "loss": 0.3234, + "step": 7475 + }, + { + "epoch": 0.6056383668178872, + "grad_norm": 0.030314048752188683, + "learning_rate": 0.00018857734371483865, + "loss": 0.3129, + "step": 7476 + }, + { + "epoch": 0.6057193778353857, + "grad_norm": 0.027895336970686913, + "learning_rate": 0.00018857284306224404, + "loss": 0.3218, + "step": 7477 + }, + { + "epoch": 0.605800388852884, + "grad_norm": 0.029156718403100967, + "learning_rate": 0.0001885683424096494, + "loss": 0.2981, + "step": 7478 + }, + { + "epoch": 0.6058813998703824, + "grad_norm": 0.030389677733182907, + "learning_rate": 0.00018856384175705478, + "loss": 0.3639, + "step": 7479 + }, + { + "epoch": 0.6059624108878807, + "grad_norm": 0.03531801328063011, + "learning_rate": 0.00018855934110446017, + "loss": 0.3279, + "step": 7480 + }, + { + "epoch": 0.6060434219053791, + "grad_norm": 0.03205801919102669, + "learning_rate": 0.00018855484045186553, + "loss": 0.3109, + "step": 7481 + }, + { + "epoch": 0.6061244329228775, + "grad_norm": 0.02922491915524006, + "learning_rate": 0.0001885503397992709, + "loss": 0.3292, + "step": 7482 + }, + { + "epoch": 0.6062054439403759, + "grad_norm": 0.03221955522894859, + "learning_rate": 0.00018854583914667628, + "loss": 0.3552, + "step": 7483 + }, + { + "epoch": 0.6062864549578743, + "grad_norm": 0.035640884190797806, + "learning_rate": 0.00018854133849408164, + "loss": 0.381, + "step": 7484 + }, + { + "epoch": 0.6063674659753726, + "grad_norm": 0.03220059350132942, + "learning_rate": 0.00018853683784148702, + "loss": 0.3554, + "step": 7485 + }, + { + "epoch": 0.6064484769928711, + "grad_norm": 0.02993176132440567, + "learning_rate": 0.0001885323371888924, + "loss": 0.3594, + "step": 7486 + }, + { + "epoch": 0.6065294880103694, + "grad_norm": 0.03339843451976776, + "learning_rate": 0.00018852783653629777, + "loss": 0.3492, + 
"step": 7487 + }, + { + "epoch": 0.6066104990278678, + "grad_norm": 0.032218873500823975, + "learning_rate": 0.00018852333588370313, + "loss": 0.3189, + "step": 7488 + }, + { + "epoch": 0.6066915100453661, + "grad_norm": 0.02685026451945305, + "learning_rate": 0.00018851883523110852, + "loss": 0.3296, + "step": 7489 + }, + { + "epoch": 0.6067725210628645, + "grad_norm": 0.027700411155819893, + "learning_rate": 0.00018851433457851388, + "loss": 0.3303, + "step": 7490 + }, + { + "epoch": 0.606853532080363, + "grad_norm": 0.04086620360612869, + "learning_rate": 0.00018850983392591927, + "loss": 0.429, + "step": 7491 + }, + { + "epoch": 0.6069345430978613, + "grad_norm": 0.036509573459625244, + "learning_rate": 0.00018850533327332465, + "loss": 0.3302, + "step": 7492 + }, + { + "epoch": 0.6070155541153597, + "grad_norm": 0.03572225570678711, + "learning_rate": 0.00018850083262073, + "loss": 0.3754, + "step": 7493 + }, + { + "epoch": 0.607096565132858, + "grad_norm": 0.029395904392004013, + "learning_rate": 0.00018849633196813537, + "loss": 0.3518, + "step": 7494 + }, + { + "epoch": 0.6071775761503565, + "grad_norm": 0.03418650105595589, + "learning_rate": 0.00018849183131554076, + "loss": 0.3434, + "step": 7495 + }, + { + "epoch": 0.6072585871678549, + "grad_norm": 0.030631721019744873, + "learning_rate": 0.00018848733066294612, + "loss": 0.3968, + "step": 7496 + }, + { + "epoch": 0.6073395981853532, + "grad_norm": 0.035159096121788025, + "learning_rate": 0.0001884828300103515, + "loss": 0.392, + "step": 7497 + }, + { + "epoch": 0.6074206092028516, + "grad_norm": 0.03384539857506752, + "learning_rate": 0.0001884783293577569, + "loss": 0.3973, + "step": 7498 + }, + { + "epoch": 0.6075016202203499, + "grad_norm": 0.03203663229942322, + "learning_rate": 0.00018847382870516225, + "loss": 0.3499, + "step": 7499 + }, + { + "epoch": 0.6075826312378484, + "grad_norm": 0.03463354334235191, + "learning_rate": 0.00018846932805256764, + "loss": 0.371, + "step": 7500 + }, + { + "epoch": 0.6076636422553467, + "grad_norm": 0.034394148737192154, + "learning_rate": 0.000188464827399973, + "loss": 0.3824, + "step": 7501 + }, + { + "epoch": 0.6077446532728451, + "grad_norm": 0.03237493708729744, + "learning_rate": 0.00018846032674737836, + "loss": 0.338, + "step": 7502 + }, + { + "epoch": 0.6078256642903435, + "grad_norm": 0.032026149332523346, + "learning_rate": 0.00018845582609478375, + "loss": 0.3727, + "step": 7503 + }, + { + "epoch": 0.6079066753078418, + "grad_norm": 0.0315285362303257, + "learning_rate": 0.00018845132544218914, + "loss": 0.343, + "step": 7504 + }, + { + "epoch": 0.6079876863253403, + "grad_norm": 0.028412126004695892, + "learning_rate": 0.0001884468247895945, + "loss": 0.3156, + "step": 7505 + }, + { + "epoch": 0.6080686973428386, + "grad_norm": 0.03015442192554474, + "learning_rate": 0.00018844232413699988, + "loss": 0.3419, + "step": 7506 + }, + { + "epoch": 0.608149708360337, + "grad_norm": 0.03320285677909851, + "learning_rate": 0.00018843782348440524, + "loss": 0.4232, + "step": 7507 + }, + { + "epoch": 0.6082307193778353, + "grad_norm": 0.03172954171895981, + "learning_rate": 0.00018843332283181063, + "loss": 0.3471, + "step": 7508 + }, + { + "epoch": 0.6083117303953338, + "grad_norm": 0.03579653054475784, + "learning_rate": 0.000188428822179216, + "loss": 0.3192, + "step": 7509 + }, + { + "epoch": 0.6083927414128322, + "grad_norm": 0.031022492796182632, + "learning_rate": 0.00018842432152662138, + "loss": 0.3419, + "step": 7510 + }, + { + "epoch": 0.6084737524303305, + "grad_norm": 
0.029089458286762238, + "learning_rate": 0.00018841982087402674, + "loss": 0.3166, + "step": 7511 + }, + { + "epoch": 0.6085547634478289, + "grad_norm": 0.02824055217206478, + "learning_rate": 0.00018841532022143213, + "loss": 0.3633, + "step": 7512 + }, + { + "epoch": 0.6086357744653272, + "grad_norm": 0.03523353487253189, + "learning_rate": 0.00018841081956883749, + "loss": 0.3655, + "step": 7513 + }, + { + "epoch": 0.6087167854828257, + "grad_norm": 0.03349660709500313, + "learning_rate": 0.00018840631891624287, + "loss": 0.3749, + "step": 7514 + }, + { + "epoch": 0.6087977965003241, + "grad_norm": 0.02907554619014263, + "learning_rate": 0.00018840181826364823, + "loss": 0.3263, + "step": 7515 + }, + { + "epoch": 0.6088788075178224, + "grad_norm": 0.03144342452287674, + "learning_rate": 0.00018839731761105362, + "loss": 0.4028, + "step": 7516 + }, + { + "epoch": 0.6089598185353208, + "grad_norm": 0.037257906049489975, + "learning_rate": 0.00018839281695845898, + "loss": 0.3366, + "step": 7517 + }, + { + "epoch": 0.6090408295528191, + "grad_norm": 0.03802880272269249, + "learning_rate": 0.00018838831630586437, + "loss": 0.366, + "step": 7518 + }, + { + "epoch": 0.6091218405703176, + "grad_norm": 0.03249386325478554, + "learning_rate": 0.00018838381565326973, + "loss": 0.3743, + "step": 7519 + }, + { + "epoch": 0.609202851587816, + "grad_norm": 0.031599901616573334, + "learning_rate": 0.00018837931500067511, + "loss": 0.3632, + "step": 7520 + }, + { + "epoch": 0.6092838626053143, + "grad_norm": 0.032178837805986404, + "learning_rate": 0.00018837481434808047, + "loss": 0.3521, + "step": 7521 + }, + { + "epoch": 0.6093648736228127, + "grad_norm": 0.042940713465213776, + "learning_rate": 0.00018837031369548586, + "loss": 0.3405, + "step": 7522 + }, + { + "epoch": 0.6094458846403111, + "grad_norm": 0.03091159276664257, + "learning_rate": 0.00018836581304289122, + "loss": 0.3694, + "step": 7523 + }, + { + "epoch": 0.6095268956578095, + "grad_norm": 0.03217526152729988, + "learning_rate": 0.0001883613123902966, + "loss": 0.3212, + "step": 7524 + }, + { + "epoch": 0.6096079066753078, + "grad_norm": 0.0323089137673378, + "learning_rate": 0.00018835681173770197, + "loss": 0.3987, + "step": 7525 + }, + { + "epoch": 0.6096889176928062, + "grad_norm": 0.033394478261470795, + "learning_rate": 0.00018835231108510736, + "loss": 0.3771, + "step": 7526 + }, + { + "epoch": 0.6097699287103046, + "grad_norm": 0.02519734390079975, + "learning_rate": 0.00018834781043251272, + "loss": 0.3115, + "step": 7527 + }, + { + "epoch": 0.609850939727803, + "grad_norm": 0.030933715403079987, + "learning_rate": 0.0001883433097799181, + "loss": 0.3129, + "step": 7528 + }, + { + "epoch": 0.6099319507453014, + "grad_norm": 0.03571024537086487, + "learning_rate": 0.00018833880912732346, + "loss": 0.3683, + "step": 7529 + }, + { + "epoch": 0.6100129617627997, + "grad_norm": 0.03200255334377289, + "learning_rate": 0.00018833430847472885, + "loss": 0.3812, + "step": 7530 + }, + { + "epoch": 0.6100939727802981, + "grad_norm": 0.02796841971576214, + "learning_rate": 0.0001883298078221342, + "loss": 0.3288, + "step": 7531 + }, + { + "epoch": 0.6101749837977966, + "grad_norm": 0.037334516644477844, + "learning_rate": 0.0001883253071695396, + "loss": 0.3238, + "step": 7532 + }, + { + "epoch": 0.6102559948152949, + "grad_norm": 0.03164404258131981, + "learning_rate": 0.00018832080651694496, + "loss": 0.306, + "step": 7533 + }, + { + "epoch": 0.6103370058327933, + "grad_norm": 0.028371304273605347, + "learning_rate": 
0.00018831630586435034, + "loss": 0.3295, + "step": 7534 + }, + { + "epoch": 0.6104180168502916, + "grad_norm": 0.03163844719529152, + "learning_rate": 0.0001883118052117557, + "loss": 0.3692, + "step": 7535 + }, + { + "epoch": 0.61049902786779, + "grad_norm": 0.029623981565237045, + "learning_rate": 0.0001883073045591611, + "loss": 0.3555, + "step": 7536 + }, + { + "epoch": 0.6105800388852884, + "grad_norm": 0.0332329086959362, + "learning_rate": 0.00018830280390656648, + "loss": 0.2943, + "step": 7537 + }, + { + "epoch": 0.6106610499027868, + "grad_norm": 0.031672000885009766, + "learning_rate": 0.00018829830325397184, + "loss": 0.3936, + "step": 7538 + }, + { + "epoch": 0.6107420609202852, + "grad_norm": 0.030289702117443085, + "learning_rate": 0.0001882938026013772, + "loss": 0.3335, + "step": 7539 + }, + { + "epoch": 0.6108230719377835, + "grad_norm": 0.03449969366192818, + "learning_rate": 0.00018828930194878259, + "loss": 0.378, + "step": 7540 + }, + { + "epoch": 0.6109040829552819, + "grad_norm": 0.03260310739278793, + "learning_rate": 0.00018828480129618795, + "loss": 0.3875, + "step": 7541 + }, + { + "epoch": 0.6109850939727803, + "grad_norm": 0.026728922501206398, + "learning_rate": 0.00018828030064359333, + "loss": 0.287, + "step": 7542 + }, + { + "epoch": 0.6110661049902787, + "grad_norm": 0.02870047651231289, + "learning_rate": 0.00018827579999099872, + "loss": 0.3006, + "step": 7543 + }, + { + "epoch": 0.611147116007777, + "grad_norm": 0.030876463279128075, + "learning_rate": 0.00018827129933840408, + "loss": 0.3794, + "step": 7544 + }, + { + "epoch": 0.6112281270252754, + "grad_norm": 0.034999217838048935, + "learning_rate": 0.00018826679868580944, + "loss": 0.3356, + "step": 7545 + }, + { + "epoch": 0.6113091380427739, + "grad_norm": 0.03449104726314545, + "learning_rate": 0.00018826229803321483, + "loss": 0.4207, + "step": 7546 + }, + { + "epoch": 0.6113901490602722, + "grad_norm": 0.03226851671934128, + "learning_rate": 0.0001882577973806202, + "loss": 0.3652, + "step": 7547 + }, + { + "epoch": 0.6114711600777706, + "grad_norm": 0.027200650423765182, + "learning_rate": 0.00018825329672802558, + "loss": 0.3235, + "step": 7548 + }, + { + "epoch": 0.6115521710952689, + "grad_norm": 0.029985858127474785, + "learning_rate": 0.00018824879607543096, + "loss": 0.3626, + "step": 7549 + }, + { + "epoch": 0.6116331821127673, + "grad_norm": 0.033420026302337646, + "learning_rate": 0.00018824429542283632, + "loss": 0.3958, + "step": 7550 + }, + { + "epoch": 0.6117141931302658, + "grad_norm": 0.028016239404678345, + "learning_rate": 0.00018823979477024168, + "loss": 0.2803, + "step": 7551 + }, + { + "epoch": 0.6117952041477641, + "grad_norm": 0.034927189350128174, + "learning_rate": 0.00018823529411764707, + "loss": 0.3053, + "step": 7552 + }, + { + "epoch": 0.6118762151652625, + "grad_norm": 0.031104566529393196, + "learning_rate": 0.00018823079346505243, + "loss": 0.325, + "step": 7553 + }, + { + "epoch": 0.6119572261827608, + "grad_norm": 0.03127651661634445, + "learning_rate": 0.00018822629281245782, + "loss": 0.3617, + "step": 7554 + }, + { + "epoch": 0.6120382372002592, + "grad_norm": 0.03370172902941704, + "learning_rate": 0.0001882217921598632, + "loss": 0.3424, + "step": 7555 + }, + { + "epoch": 0.6121192482177576, + "grad_norm": 0.03209322318434715, + "learning_rate": 0.00018821729150726856, + "loss": 0.3977, + "step": 7556 + }, + { + "epoch": 0.612200259235256, + "grad_norm": 0.030257215723395348, + "learning_rate": 0.00018821279085467392, + "loss": 0.3375, + "step": 7557 + 
}, + { + "epoch": 0.6122812702527544, + "grad_norm": 0.03481970727443695, + "learning_rate": 0.0001882082902020793, + "loss": 0.3502, + "step": 7558 + }, + { + "epoch": 0.6123622812702527, + "grad_norm": 0.03415451943874359, + "learning_rate": 0.00018820378954948467, + "loss": 0.3839, + "step": 7559 + }, + { + "epoch": 0.6124432922877512, + "grad_norm": 0.02852708287537098, + "learning_rate": 0.00018819928889689006, + "loss": 0.328, + "step": 7560 + }, + { + "epoch": 0.6125243033052495, + "grad_norm": 0.03239717334508896, + "learning_rate": 0.00018819478824429545, + "loss": 0.3375, + "step": 7561 + }, + { + "epoch": 0.6126053143227479, + "grad_norm": 0.03367842733860016, + "learning_rate": 0.0001881902875917008, + "loss": 0.3581, + "step": 7562 + }, + { + "epoch": 0.6126863253402463, + "grad_norm": 0.029296424239873886, + "learning_rate": 0.00018818578693910617, + "loss": 0.2953, + "step": 7563 + }, + { + "epoch": 0.6127673363577446, + "grad_norm": 0.028935760259628296, + "learning_rate": 0.00018818128628651155, + "loss": 0.3405, + "step": 7564 + }, + { + "epoch": 0.6128483473752431, + "grad_norm": 0.04053902253508568, + "learning_rate": 0.0001881767856339169, + "loss": 0.394, + "step": 7565 + }, + { + "epoch": 0.6129293583927414, + "grad_norm": 0.03760660067200661, + "learning_rate": 0.0001881722849813223, + "loss": 0.3728, + "step": 7566 + }, + { + "epoch": 0.6130103694102398, + "grad_norm": 0.03172369301319122, + "learning_rate": 0.0001881677843287277, + "loss": 0.3449, + "step": 7567 + }, + { + "epoch": 0.6130913804277381, + "grad_norm": 0.037709664553403854, + "learning_rate": 0.00018816328367613305, + "loss": 0.3574, + "step": 7568 + }, + { + "epoch": 0.6131723914452365, + "grad_norm": 0.03204488754272461, + "learning_rate": 0.00018815878302353843, + "loss": 0.3516, + "step": 7569 + }, + { + "epoch": 0.613253402462735, + "grad_norm": 0.03243907913565636, + "learning_rate": 0.0001881542823709438, + "loss": 0.3128, + "step": 7570 + }, + { + "epoch": 0.6133344134802333, + "grad_norm": 0.030570013448596, + "learning_rate": 0.00018814978171834915, + "loss": 0.3163, + "step": 7571 + }, + { + "epoch": 0.6134154244977317, + "grad_norm": 0.03219159319996834, + "learning_rate": 0.00018814528106575454, + "loss": 0.3558, + "step": 7572 + }, + { + "epoch": 0.61349643551523, + "grad_norm": 0.04272877797484398, + "learning_rate": 0.00018814078041315993, + "loss": 0.3984, + "step": 7573 + }, + { + "epoch": 0.6135774465327285, + "grad_norm": 0.030328383669257164, + "learning_rate": 0.0001881362797605653, + "loss": 0.3269, + "step": 7574 + }, + { + "epoch": 0.6136584575502269, + "grad_norm": 0.03045385703444481, + "learning_rate": 0.00018813177910797068, + "loss": 0.3806, + "step": 7575 + }, + { + "epoch": 0.6137394685677252, + "grad_norm": 0.027790486812591553, + "learning_rate": 0.00018812727845537604, + "loss": 0.3117, + "step": 7576 + }, + { + "epoch": 0.6138204795852236, + "grad_norm": 0.03376387804746628, + "learning_rate": 0.0001881227778027814, + "loss": 0.3243, + "step": 7577 + }, + { + "epoch": 0.6139014906027219, + "grad_norm": 0.03374413028359413, + "learning_rate": 0.00018811827715018678, + "loss": 0.3735, + "step": 7578 + }, + { + "epoch": 0.6139825016202204, + "grad_norm": 0.025805898010730743, + "learning_rate": 0.00018811377649759217, + "loss": 0.3219, + "step": 7579 + }, + { + "epoch": 0.6140635126377187, + "grad_norm": 0.031635358929634094, + "learning_rate": 0.00018810927584499753, + "loss": 0.3219, + "step": 7580 + }, + { + "epoch": 0.6141445236552171, + "grad_norm": 
0.029126780107617378, + "learning_rate": 0.00018810477519240292, + "loss": 0.3237, + "step": 7581 + }, + { + "epoch": 0.6142255346727155, + "grad_norm": 0.026430238038301468, + "learning_rate": 0.00018810027453980828, + "loss": 0.2978, + "step": 7582 + }, + { + "epoch": 0.6143065456902139, + "grad_norm": 0.03805132582783699, + "learning_rate": 0.00018809577388721364, + "loss": 0.4294, + "step": 7583 + }, + { + "epoch": 0.6143875567077123, + "grad_norm": 0.03655115142464638, + "learning_rate": 0.00018809127323461902, + "loss": 0.3913, + "step": 7584 + }, + { + "epoch": 0.6144685677252106, + "grad_norm": 0.03136638179421425, + "learning_rate": 0.0001880867725820244, + "loss": 0.405, + "step": 7585 + }, + { + "epoch": 0.614549578742709, + "grad_norm": 0.03049628436565399, + "learning_rate": 0.00018808227192942977, + "loss": 0.3337, + "step": 7586 + }, + { + "epoch": 0.6146305897602073, + "grad_norm": 0.030468054115772247, + "learning_rate": 0.00018807777127683516, + "loss": 0.343, + "step": 7587 + }, + { + "epoch": 0.6147116007777058, + "grad_norm": 0.031013580039143562, + "learning_rate": 0.00018807327062424052, + "loss": 0.3543, + "step": 7588 + }, + { + "epoch": 0.6147926117952042, + "grad_norm": 0.030559582635760307, + "learning_rate": 0.0001880687699716459, + "loss": 0.3742, + "step": 7589 + }, + { + "epoch": 0.6148736228127025, + "grad_norm": 0.03500194847583771, + "learning_rate": 0.00018806426931905127, + "loss": 0.3978, + "step": 7590 + }, + { + "epoch": 0.6149546338302009, + "grad_norm": 0.029537200927734375, + "learning_rate": 0.00018805976866645665, + "loss": 0.3563, + "step": 7591 + }, + { + "epoch": 0.6150356448476992, + "grad_norm": 0.03477493301033974, + "learning_rate": 0.00018805526801386201, + "loss": 0.3144, + "step": 7592 + }, + { + "epoch": 0.6151166558651977, + "grad_norm": 0.02951614372432232, + "learning_rate": 0.0001880507673612674, + "loss": 0.3141, + "step": 7593 + }, + { + "epoch": 0.6151976668826961, + "grad_norm": 0.03100944310426712, + "learning_rate": 0.00018804626670867276, + "loss": 0.3837, + "step": 7594 + }, + { + "epoch": 0.6152786779001944, + "grad_norm": 0.041775528341531754, + "learning_rate": 0.00018804176605607815, + "loss": 0.3651, + "step": 7595 + }, + { + "epoch": 0.6153596889176928, + "grad_norm": 0.031040530651807785, + "learning_rate": 0.0001880372654034835, + "loss": 0.3317, + "step": 7596 + }, + { + "epoch": 0.6154406999351912, + "grad_norm": 0.0322146899998188, + "learning_rate": 0.0001880327647508889, + "loss": 0.3203, + "step": 7597 + }, + { + "epoch": 0.6155217109526896, + "grad_norm": 0.02917400188744068, + "learning_rate": 0.00018802826409829426, + "loss": 0.3457, + "step": 7598 + }, + { + "epoch": 0.615602721970188, + "grad_norm": 0.028559932485222816, + "learning_rate": 0.00018802376344569964, + "loss": 0.3112, + "step": 7599 + }, + { + "epoch": 0.6156837329876863, + "grad_norm": 0.030218252912163734, + "learning_rate": 0.000188019262793105, + "loss": 0.3319, + "step": 7600 + }, + { + "epoch": 0.6157647440051847, + "grad_norm": 0.03289555385708809, + "learning_rate": 0.0001880147621405104, + "loss": 0.3458, + "step": 7601 + }, + { + "epoch": 0.6158457550226831, + "grad_norm": 0.031023845076560974, + "learning_rate": 0.00018801026148791575, + "loss": 0.345, + "step": 7602 + }, + { + "epoch": 0.6159267660401815, + "grad_norm": 0.02872307039797306, + "learning_rate": 0.00018800576083532114, + "loss": 0.3196, + "step": 7603 + }, + { + "epoch": 0.6160077770576798, + "grad_norm": 0.030780525878071785, + "learning_rate": 
0.0001880012601827265, + "loss": 0.3173, + "step": 7604 + }, + { + "epoch": 0.6160887880751782, + "grad_norm": 0.033606596291065216, + "learning_rate": 0.00018799675953013188, + "loss": 0.3704, + "step": 7605 + }, + { + "epoch": 0.6161697990926766, + "grad_norm": 0.028881927952170372, + "learning_rate": 0.00018799225887753724, + "loss": 0.3254, + "step": 7606 + }, + { + "epoch": 0.616250810110175, + "grad_norm": 0.033225275576114655, + "learning_rate": 0.00018798775822494263, + "loss": 0.3329, + "step": 7607 + }, + { + "epoch": 0.6163318211276734, + "grad_norm": 0.03254341334104538, + "learning_rate": 0.000187983257572348, + "loss": 0.358, + "step": 7608 + }, + { + "epoch": 0.6164128321451717, + "grad_norm": 0.033052753657102585, + "learning_rate": 0.00018797875691975338, + "loss": 0.341, + "step": 7609 + }, + { + "epoch": 0.6164938431626701, + "grad_norm": 0.03443504869937897, + "learning_rate": 0.00018797425626715874, + "loss": 0.3831, + "step": 7610 + }, + { + "epoch": 0.6165748541801686, + "grad_norm": 0.02747352235019207, + "learning_rate": 0.00018796975561456413, + "loss": 0.3274, + "step": 7611 + }, + { + "epoch": 0.6166558651976669, + "grad_norm": 0.03181469440460205, + "learning_rate": 0.00018796525496196949, + "loss": 0.3884, + "step": 7612 + }, + { + "epoch": 0.6167368762151653, + "grad_norm": 0.033690501004457474, + "learning_rate": 0.00018796075430937487, + "loss": 0.398, + "step": 7613 + }, + { + "epoch": 0.6168178872326636, + "grad_norm": 0.03067837283015251, + "learning_rate": 0.00018795625365678023, + "loss": 0.3083, + "step": 7614 + }, + { + "epoch": 0.616898898250162, + "grad_norm": 0.027984514832496643, + "learning_rate": 0.00018795175300418562, + "loss": 0.2947, + "step": 7615 + }, + { + "epoch": 0.6169799092676604, + "grad_norm": 0.02972647361457348, + "learning_rate": 0.00018794725235159098, + "loss": 0.3436, + "step": 7616 + }, + { + "epoch": 0.6170609202851588, + "grad_norm": 0.031133973971009254, + "learning_rate": 0.00018794275169899637, + "loss": 0.353, + "step": 7617 + }, + { + "epoch": 0.6171419313026572, + "grad_norm": 0.031696900725364685, + "learning_rate": 0.00018793825104640175, + "loss": 0.3642, + "step": 7618 + }, + { + "epoch": 0.6172229423201555, + "grad_norm": 0.03247181698679924, + "learning_rate": 0.00018793375039380711, + "loss": 0.3375, + "step": 7619 + }, + { + "epoch": 0.6173039533376539, + "grad_norm": 0.030099336057901382, + "learning_rate": 0.00018792924974121247, + "loss": 0.31, + "step": 7620 + }, + { + "epoch": 0.6173849643551523, + "grad_norm": 0.028563443571329117, + "learning_rate": 0.00018792474908861786, + "loss": 0.339, + "step": 7621 + }, + { + "epoch": 0.6174659753726507, + "grad_norm": 0.030126718804240227, + "learning_rate": 0.00018792024843602322, + "loss": 0.3602, + "step": 7622 + }, + { + "epoch": 0.617546986390149, + "grad_norm": 0.029762936756014824, + "learning_rate": 0.0001879157477834286, + "loss": 0.3322, + "step": 7623 + }, + { + "epoch": 0.6176279974076474, + "grad_norm": 0.029128240421414375, + "learning_rate": 0.000187911247130834, + "loss": 0.2891, + "step": 7624 + }, + { + "epoch": 0.6177090084251459, + "grad_norm": 0.028977826237678528, + "learning_rate": 0.00018790674647823936, + "loss": 0.3313, + "step": 7625 + }, + { + "epoch": 0.6177900194426442, + "grad_norm": 0.029257269576191902, + "learning_rate": 0.00018790224582564472, + "loss": 0.2927, + "step": 7626 + }, + { + "epoch": 0.6178710304601426, + "grad_norm": 0.03232651203870773, + "learning_rate": 0.0001878977451730501, + "loss": 0.3648, + "step": 7627 + 
}, + { + "epoch": 0.6179520414776409, + "grad_norm": 0.02883930690586567, + "learning_rate": 0.00018789324452045546, + "loss": 0.3391, + "step": 7628 + }, + { + "epoch": 0.6180330524951393, + "grad_norm": 0.03152291476726532, + "learning_rate": 0.00018788874386786085, + "loss": 0.319, + "step": 7629 + }, + { + "epoch": 0.6181140635126378, + "grad_norm": 0.03525060415267944, + "learning_rate": 0.00018788424321526624, + "loss": 0.3852, + "step": 7630 + }, + { + "epoch": 0.6181950745301361, + "grad_norm": 0.03459348902106285, + "learning_rate": 0.0001878797425626716, + "loss": 0.3838, + "step": 7631 + }, + { + "epoch": 0.6182760855476345, + "grad_norm": 0.030856028199195862, + "learning_rate": 0.00018787524191007696, + "loss": 0.3459, + "step": 7632 + }, + { + "epoch": 0.6183570965651328, + "grad_norm": 0.029476845636963844, + "learning_rate": 0.00018787074125748234, + "loss": 0.3599, + "step": 7633 + }, + { + "epoch": 0.6184381075826313, + "grad_norm": 0.029246686026453972, + "learning_rate": 0.0001878662406048877, + "loss": 0.3057, + "step": 7634 + }, + { + "epoch": 0.6185191186001296, + "grad_norm": 0.03888120874762535, + "learning_rate": 0.0001878617399522931, + "loss": 0.4396, + "step": 7635 + }, + { + "epoch": 0.618600129617628, + "grad_norm": 0.03742235526442528, + "learning_rate": 0.00018785723929969848, + "loss": 0.3666, + "step": 7636 + }, + { + "epoch": 0.6186811406351264, + "grad_norm": 0.029268227517604828, + "learning_rate": 0.00018785273864710384, + "loss": 0.3696, + "step": 7637 + }, + { + "epoch": 0.6187621516526247, + "grad_norm": 0.02961529977619648, + "learning_rate": 0.00018784823799450923, + "loss": 0.3066, + "step": 7638 + }, + { + "epoch": 0.6188431626701232, + "grad_norm": 0.03180916979908943, + "learning_rate": 0.0001878437373419146, + "loss": 0.3737, + "step": 7639 + }, + { + "epoch": 0.6189241736876215, + "grad_norm": 0.027828671038150787, + "learning_rate": 0.00018783923668931995, + "loss": 0.2885, + "step": 7640 + }, + { + "epoch": 0.6190051847051199, + "grad_norm": 0.034832458943128586, + "learning_rate": 0.00018783473603672533, + "loss": 0.3623, + "step": 7641 + }, + { + "epoch": 0.6190861957226182, + "grad_norm": 0.03153369948267937, + "learning_rate": 0.00018783023538413072, + "loss": 0.3357, + "step": 7642 + }, + { + "epoch": 0.6191672067401166, + "grad_norm": 0.032440654933452606, + "learning_rate": 0.00018782573473153608, + "loss": 0.321, + "step": 7643 + }, + { + "epoch": 0.6192482177576151, + "grad_norm": 0.03241625428199768, + "learning_rate": 0.00018782123407894147, + "loss": 0.3742, + "step": 7644 + }, + { + "epoch": 0.6193292287751134, + "grad_norm": 0.032071325927972794, + "learning_rate": 0.00018781673342634683, + "loss": 0.3665, + "step": 7645 + }, + { + "epoch": 0.6194102397926118, + "grad_norm": 0.030852077528834343, + "learning_rate": 0.0001878122327737522, + "loss": 0.3976, + "step": 7646 + }, + { + "epoch": 0.6194912508101101, + "grad_norm": 0.027692748233675957, + "learning_rate": 0.00018780773212115758, + "loss": 0.3431, + "step": 7647 + }, + { + "epoch": 0.6195722618276086, + "grad_norm": 0.03268222138285637, + "learning_rate": 0.00018780323146856296, + "loss": 0.3486, + "step": 7648 + }, + { + "epoch": 0.619653272845107, + "grad_norm": 0.03294394537806511, + "learning_rate": 0.00018779873081596832, + "loss": 0.3759, + "step": 7649 + }, + { + "epoch": 0.6197342838626053, + "grad_norm": 0.0347847044467926, + "learning_rate": 0.0001877942301633737, + "loss": 0.3716, + "step": 7650 + }, + { + "epoch": 0.6198152948801037, + "grad_norm": 
0.03194839507341385, + "learning_rate": 0.00018778972951077907, + "loss": 0.3021, + "step": 7651 + }, + { + "epoch": 0.619896305897602, + "grad_norm": 0.038782477378845215, + "learning_rate": 0.00018778522885818443, + "loss": 0.3673, + "step": 7652 + }, + { + "epoch": 0.6199773169151005, + "grad_norm": 0.029995804652571678, + "learning_rate": 0.00018778072820558982, + "loss": 0.324, + "step": 7653 + }, + { + "epoch": 0.6200583279325989, + "grad_norm": 0.03394217789173126, + "learning_rate": 0.0001877762275529952, + "loss": 0.3304, + "step": 7654 + }, + { + "epoch": 0.6201393389500972, + "grad_norm": 0.03123002126812935, + "learning_rate": 0.00018777172690040056, + "loss": 0.3457, + "step": 7655 + }, + { + "epoch": 0.6202203499675956, + "grad_norm": 0.029245445504784584, + "learning_rate": 0.00018776722624780595, + "loss": 0.3354, + "step": 7656 + }, + { + "epoch": 0.6203013609850939, + "grad_norm": 0.03261977806687355, + "learning_rate": 0.0001877627255952113, + "loss": 0.3578, + "step": 7657 + }, + { + "epoch": 0.6203823720025924, + "grad_norm": 0.03168657794594765, + "learning_rate": 0.00018775822494261667, + "loss": 0.3391, + "step": 7658 + }, + { + "epoch": 0.6204633830200907, + "grad_norm": 0.02707383781671524, + "learning_rate": 0.00018775372429002206, + "loss": 0.3067, + "step": 7659 + }, + { + "epoch": 0.6205443940375891, + "grad_norm": 0.034147847443819046, + "learning_rate": 0.00018774922363742745, + "loss": 0.3351, + "step": 7660 + }, + { + "epoch": 0.6206254050550875, + "grad_norm": 0.02819540537893772, + "learning_rate": 0.0001877447229848328, + "loss": 0.3439, + "step": 7661 + }, + { + "epoch": 0.6207064160725859, + "grad_norm": 0.032641153782606125, + "learning_rate": 0.0001877402223322382, + "loss": 0.3474, + "step": 7662 + }, + { + "epoch": 0.6207874270900843, + "grad_norm": 0.031857844442129135, + "learning_rate": 0.00018773572167964355, + "loss": 0.3122, + "step": 7663 + }, + { + "epoch": 0.6208684381075826, + "grad_norm": 0.03519831597805023, + "learning_rate": 0.0001877312210270489, + "loss": 0.3792, + "step": 7664 + }, + { + "epoch": 0.620949449125081, + "grad_norm": 0.028352493420243263, + "learning_rate": 0.0001877267203744543, + "loss": 0.334, + "step": 7665 + }, + { + "epoch": 0.6210304601425793, + "grad_norm": 0.029478255659341812, + "learning_rate": 0.0001877222197218597, + "loss": 0.3336, + "step": 7666 + }, + { + "epoch": 0.6211114711600778, + "grad_norm": 0.030945241451263428, + "learning_rate": 0.00018771771906926505, + "loss": 0.3041, + "step": 7667 + }, + { + "epoch": 0.6211924821775762, + "grad_norm": 0.03310845419764519, + "learning_rate": 0.00018771321841667043, + "loss": 0.3457, + "step": 7668 + }, + { + "epoch": 0.6212734931950745, + "grad_norm": 0.030136924237012863, + "learning_rate": 0.0001877087177640758, + "loss": 0.3792, + "step": 7669 + }, + { + "epoch": 0.6213545042125729, + "grad_norm": 0.035356342792510986, + "learning_rate": 0.00018770421711148118, + "loss": 0.3677, + "step": 7670 + }, + { + "epoch": 0.6214355152300713, + "grad_norm": 0.028500456362962723, + "learning_rate": 0.00018769971645888654, + "loss": 0.3355, + "step": 7671 + }, + { + "epoch": 0.6215165262475697, + "grad_norm": 0.032192058861255646, + "learning_rate": 0.00018769521580629193, + "loss": 0.3773, + "step": 7672 + }, + { + "epoch": 0.6215975372650681, + "grad_norm": 0.031182562932372093, + "learning_rate": 0.0001876907151536973, + "loss": 0.3782, + "step": 7673 + }, + { + "epoch": 0.6216785482825664, + "grad_norm": 0.027648283168673515, + "learning_rate": 
0.00018768621450110268, + "loss": 0.3275, + "step": 7674 + }, + { + "epoch": 0.6217595593000648, + "grad_norm": 0.02733623795211315, + "learning_rate": 0.00018768171384850804, + "loss": 0.3194, + "step": 7675 + }, + { + "epoch": 0.6218405703175632, + "grad_norm": 0.031033562496304512, + "learning_rate": 0.00018767721319591342, + "loss": 0.3608, + "step": 7676 + }, + { + "epoch": 0.6219215813350616, + "grad_norm": 0.031006228178739548, + "learning_rate": 0.00018767271254331878, + "loss": 0.3551, + "step": 7677 + }, + { + "epoch": 0.62200259235256, + "grad_norm": 0.029379108920693398, + "learning_rate": 0.00018766821189072417, + "loss": 0.3467, + "step": 7678 + }, + { + "epoch": 0.6220836033700583, + "grad_norm": 0.02919275499880314, + "learning_rate": 0.00018766371123812953, + "loss": 0.3358, + "step": 7679 + }, + { + "epoch": 0.6221646143875567, + "grad_norm": 0.028002485632896423, + "learning_rate": 0.00018765921058553492, + "loss": 0.3253, + "step": 7680 + }, + { + "epoch": 0.6222456254050551, + "grad_norm": 0.03162265196442604, + "learning_rate": 0.00018765470993294028, + "loss": 0.3578, + "step": 7681 + }, + { + "epoch": 0.6223266364225535, + "grad_norm": 0.0298593882471323, + "learning_rate": 0.00018765020928034566, + "loss": 0.3134, + "step": 7682 + }, + { + "epoch": 0.6224076474400518, + "grad_norm": 0.03213672339916229, + "learning_rate": 0.00018764570862775103, + "loss": 0.3407, + "step": 7683 + }, + { + "epoch": 0.6224886584575502, + "grad_norm": 0.02828054688870907, + "learning_rate": 0.0001876412079751564, + "loss": 0.3089, + "step": 7684 + }, + { + "epoch": 0.6225696694750487, + "grad_norm": 0.03332474082708359, + "learning_rate": 0.00018763670732256177, + "loss": 0.3777, + "step": 7685 + }, + { + "epoch": 0.622650680492547, + "grad_norm": 0.035733021795749664, + "learning_rate": 0.00018763220666996716, + "loss": 0.3843, + "step": 7686 + }, + { + "epoch": 0.6227316915100454, + "grad_norm": 0.03247293457388878, + "learning_rate": 0.00018762770601737252, + "loss": 0.3606, + "step": 7687 + }, + { + "epoch": 0.6228127025275437, + "grad_norm": 0.034030482172966, + "learning_rate": 0.0001876232053647779, + "loss": 0.4188, + "step": 7688 + }, + { + "epoch": 0.6228937135450421, + "grad_norm": 0.03167286515235901, + "learning_rate": 0.00018761870471218327, + "loss": 0.3326, + "step": 7689 + }, + { + "epoch": 0.6229747245625405, + "grad_norm": 0.03559541329741478, + "learning_rate": 0.00018761420405958865, + "loss": 0.4577, + "step": 7690 + }, + { + "epoch": 0.6230557355800389, + "grad_norm": 0.03855305537581444, + "learning_rate": 0.00018760970340699401, + "loss": 0.3574, + "step": 7691 + }, + { + "epoch": 0.6231367465975373, + "grad_norm": 0.03338480740785599, + "learning_rate": 0.0001876052027543994, + "loss": 0.347, + "step": 7692 + }, + { + "epoch": 0.6232177576150356, + "grad_norm": 0.032319311052560806, + "learning_rate": 0.0001876007021018048, + "loss": 0.321, + "step": 7693 + }, + { + "epoch": 0.623298768632534, + "grad_norm": 0.03374495357275009, + "learning_rate": 0.00018759620144921015, + "loss": 0.3338, + "step": 7694 + }, + { + "epoch": 0.6233797796500324, + "grad_norm": 0.030150065198540688, + "learning_rate": 0.0001875917007966155, + "loss": 0.4023, + "step": 7695 + }, + { + "epoch": 0.6234607906675308, + "grad_norm": 0.02881564199924469, + "learning_rate": 0.0001875872001440209, + "loss": 0.3131, + "step": 7696 + }, + { + "epoch": 0.6235418016850292, + "grad_norm": 0.03331762179732323, + "learning_rate": 0.00018758269949142626, + "loss": 0.377, + "step": 7697 + }, + { + 
"epoch": 0.6236228127025275, + "grad_norm": 0.031027859076857567, + "learning_rate": 0.00018757819883883164, + "loss": 0.3222, + "step": 7698 + }, + { + "epoch": 0.623703823720026, + "grad_norm": 0.03061489760875702, + "learning_rate": 0.00018757369818623703, + "loss": 0.3708, + "step": 7699 + }, + { + "epoch": 0.6237848347375243, + "grad_norm": 0.02916574850678444, + "learning_rate": 0.0001875691975336424, + "loss": 0.3638, + "step": 7700 + }, + { + "epoch": 0.6238658457550227, + "grad_norm": 0.0307163055986166, + "learning_rate": 0.00018756469688104775, + "loss": 0.3371, + "step": 7701 + }, + { + "epoch": 0.623946856772521, + "grad_norm": 0.02987971156835556, + "learning_rate": 0.00018756019622845314, + "loss": 0.3574, + "step": 7702 + }, + { + "epoch": 0.6240278677900194, + "grad_norm": 0.043917465955019, + "learning_rate": 0.0001875556955758585, + "loss": 0.3741, + "step": 7703 + }, + { + "epoch": 0.6241088788075179, + "grad_norm": 0.0349520780146122, + "learning_rate": 0.00018755119492326388, + "loss": 0.3873, + "step": 7704 + }, + { + "epoch": 0.6241898898250162, + "grad_norm": 0.03076990880072117, + "learning_rate": 0.00018754669427066927, + "loss": 0.3318, + "step": 7705 + }, + { + "epoch": 0.6242709008425146, + "grad_norm": 0.031307514756917953, + "learning_rate": 0.00018754219361807463, + "loss": 0.3338, + "step": 7706 + }, + { + "epoch": 0.6243519118600129, + "grad_norm": 0.033534422516822815, + "learning_rate": 0.00018753769296548002, + "loss": 0.368, + "step": 7707 + }, + { + "epoch": 0.6244329228775113, + "grad_norm": 0.03319765254855156, + "learning_rate": 0.00018753319231288538, + "loss": 0.3658, + "step": 7708 + }, + { + "epoch": 0.6245139338950098, + "grad_norm": 0.03517692908644676, + "learning_rate": 0.00018752869166029074, + "loss": 0.4121, + "step": 7709 + }, + { + "epoch": 0.6245949449125081, + "grad_norm": 0.044076353311538696, + "learning_rate": 0.00018752419100769613, + "loss": 0.3644, + "step": 7710 + }, + { + "epoch": 0.6246759559300065, + "grad_norm": 0.03173117712140083, + "learning_rate": 0.0001875196903551015, + "loss": 0.3017, + "step": 7711 + }, + { + "epoch": 0.6247569669475048, + "grad_norm": 0.02849593758583069, + "learning_rate": 0.00018751518970250687, + "loss": 0.3117, + "step": 7712 + }, + { + "epoch": 0.6248379779650033, + "grad_norm": 0.036401744931936264, + "learning_rate": 0.00018751068904991226, + "loss": 0.3876, + "step": 7713 + }, + { + "epoch": 0.6249189889825016, + "grad_norm": 0.034901734441518784, + "learning_rate": 0.00018750618839731762, + "loss": 0.4069, + "step": 7714 + }, + { + "epoch": 0.625, + "grad_norm": 0.031746748834848404, + "learning_rate": 0.00018750168774472298, + "loss": 0.3244, + "step": 7715 + }, + { + "epoch": 0.6250810110174984, + "grad_norm": 0.03301462158560753, + "learning_rate": 0.00018749718709212837, + "loss": 0.3879, + "step": 7716 + }, + { + "epoch": 0.6251620220349967, + "grad_norm": 0.029027685523033142, + "learning_rate": 0.00018749268643953375, + "loss": 0.3395, + "step": 7717 + }, + { + "epoch": 0.6252430330524952, + "grad_norm": 0.02682817541062832, + "learning_rate": 0.00018748818578693911, + "loss": 0.319, + "step": 7718 + }, + { + "epoch": 0.6253240440699935, + "grad_norm": 0.02966170758008957, + "learning_rate": 0.0001874836851343445, + "loss": 0.3461, + "step": 7719 + }, + { + "epoch": 0.6254050550874919, + "grad_norm": 0.029544176533818245, + "learning_rate": 0.00018747918448174986, + "loss": 0.3465, + "step": 7720 + }, + { + "epoch": 0.6254860661049902, + "grad_norm": 0.03321243077516556, + 
"learning_rate": 0.00018747468382915522, + "loss": 0.3737, + "step": 7721 + }, + { + "epoch": 0.6255670771224887, + "grad_norm": 0.03554457798600197, + "learning_rate": 0.0001874701831765606, + "loss": 0.3616, + "step": 7722 + }, + { + "epoch": 0.6256480881399871, + "grad_norm": 0.030386896803975105, + "learning_rate": 0.000187465682523966, + "loss": 0.3075, + "step": 7723 + }, + { + "epoch": 0.6257290991574854, + "grad_norm": 0.02811279334127903, + "learning_rate": 0.00018746118187137136, + "loss": 0.3463, + "step": 7724 + }, + { + "epoch": 0.6258101101749838, + "grad_norm": 0.033893097192049026, + "learning_rate": 0.00018745668121877674, + "loss": 0.4574, + "step": 7725 + }, + { + "epoch": 0.6258911211924821, + "grad_norm": 0.030976947396993637, + "learning_rate": 0.0001874521805661821, + "loss": 0.3689, + "step": 7726 + }, + { + "epoch": 0.6259721322099806, + "grad_norm": 0.03269078955054283, + "learning_rate": 0.00018744767991358746, + "loss": 0.421, + "step": 7727 + }, + { + "epoch": 0.626053143227479, + "grad_norm": 0.028709085658192635, + "learning_rate": 0.00018744317926099285, + "loss": 0.3167, + "step": 7728 + }, + { + "epoch": 0.6261341542449773, + "grad_norm": 0.02881304919719696, + "learning_rate": 0.00018743867860839824, + "loss": 0.3373, + "step": 7729 + }, + { + "epoch": 0.6262151652624757, + "grad_norm": 0.030857166275382042, + "learning_rate": 0.0001874341779558036, + "loss": 0.3754, + "step": 7730 + }, + { + "epoch": 0.626296176279974, + "grad_norm": 0.0322621688246727, + "learning_rate": 0.00018742967730320899, + "loss": 0.3685, + "step": 7731 + }, + { + "epoch": 0.6263771872974725, + "grad_norm": 0.03219965472817421, + "learning_rate": 0.00018742517665061435, + "loss": 0.3557, + "step": 7732 + }, + { + "epoch": 0.6264581983149708, + "grad_norm": 0.033386070281267166, + "learning_rate": 0.0001874206759980197, + "loss": 0.3894, + "step": 7733 + }, + { + "epoch": 0.6265392093324692, + "grad_norm": 0.031703755259513855, + "learning_rate": 0.0001874161753454251, + "loss": 0.3368, + "step": 7734 + }, + { + "epoch": 0.6266202203499676, + "grad_norm": 0.03309629485011101, + "learning_rate": 0.00018741167469283048, + "loss": 0.4076, + "step": 7735 + }, + { + "epoch": 0.626701231367466, + "grad_norm": 0.030072150751948357, + "learning_rate": 0.00018740717404023584, + "loss": 0.3714, + "step": 7736 + }, + { + "epoch": 0.6267822423849644, + "grad_norm": 0.03196774795651436, + "learning_rate": 0.00018740267338764123, + "loss": 0.3403, + "step": 7737 + }, + { + "epoch": 0.6268632534024627, + "grad_norm": 0.03893179073929787, + "learning_rate": 0.0001873981727350466, + "loss": 0.3679, + "step": 7738 + }, + { + "epoch": 0.6269442644199611, + "grad_norm": 0.027131741866469383, + "learning_rate": 0.00018739367208245195, + "loss": 0.3135, + "step": 7739 + }, + { + "epoch": 0.6270252754374595, + "grad_norm": 0.03447889909148216, + "learning_rate": 0.00018738917142985733, + "loss": 0.3686, + "step": 7740 + }, + { + "epoch": 0.6271062864549579, + "grad_norm": 0.0332777164876461, + "learning_rate": 0.00018738467077726272, + "loss": 0.3708, + "step": 7741 + }, + { + "epoch": 0.6271872974724563, + "grad_norm": 0.03280436247587204, + "learning_rate": 0.00018738017012466808, + "loss": 0.3166, + "step": 7742 + }, + { + "epoch": 0.6272683084899546, + "grad_norm": 0.031332600861787796, + "learning_rate": 0.00018737566947207347, + "loss": 0.315, + "step": 7743 + }, + { + "epoch": 0.627349319507453, + "grad_norm": 0.03338506817817688, + "learning_rate": 0.00018737116881947883, + "loss": 0.3947, + 
"step": 7744 + }, + { + "epoch": 0.6274303305249513, + "grad_norm": 0.03453171253204346, + "learning_rate": 0.0001873666681668842, + "loss": 0.3335, + "step": 7745 + }, + { + "epoch": 0.6275113415424498, + "grad_norm": 0.032016437500715256, + "learning_rate": 0.00018736216751428958, + "loss": 0.3425, + "step": 7746 + }, + { + "epoch": 0.6275923525599482, + "grad_norm": 0.026858022436499596, + "learning_rate": 0.00018735766686169496, + "loss": 0.3468, + "step": 7747 + }, + { + "epoch": 0.6276733635774465, + "grad_norm": 0.028182942420244217, + "learning_rate": 0.00018735316620910032, + "loss": 0.3507, + "step": 7748 + }, + { + "epoch": 0.6277543745949449, + "grad_norm": 0.027926893904805183, + "learning_rate": 0.0001873486655565057, + "loss": 0.2843, + "step": 7749 + }, + { + "epoch": 0.6278353856124433, + "grad_norm": 0.028946882113814354, + "learning_rate": 0.00018734416490391107, + "loss": 0.3046, + "step": 7750 + }, + { + "epoch": 0.6279163966299417, + "grad_norm": 0.03570643439888954, + "learning_rate": 0.00018733966425131646, + "loss": 0.3791, + "step": 7751 + }, + { + "epoch": 0.62799740764744, + "grad_norm": 0.03203314542770386, + "learning_rate": 0.00018733516359872182, + "loss": 0.357, + "step": 7752 + }, + { + "epoch": 0.6280784186649384, + "grad_norm": 0.029860027134418488, + "learning_rate": 0.0001873306629461272, + "loss": 0.3278, + "step": 7753 + }, + { + "epoch": 0.6281594296824368, + "grad_norm": 0.033992551267147064, + "learning_rate": 0.00018732616229353256, + "loss": 0.3851, + "step": 7754 + }, + { + "epoch": 0.6282404406999352, + "grad_norm": 0.03233917057514191, + "learning_rate": 0.00018732166164093795, + "loss": 0.3922, + "step": 7755 + }, + { + "epoch": 0.6283214517174336, + "grad_norm": 0.03380677103996277, + "learning_rate": 0.0001873171609883433, + "loss": 0.3704, + "step": 7756 + }, + { + "epoch": 0.6284024627349319, + "grad_norm": 0.03246676176786423, + "learning_rate": 0.0001873126603357487, + "loss": 0.3935, + "step": 7757 + }, + { + "epoch": 0.6284834737524303, + "grad_norm": 0.029080143198370934, + "learning_rate": 0.00018730815968315406, + "loss": 0.3525, + "step": 7758 + }, + { + "epoch": 0.6285644847699287, + "grad_norm": 0.028809476643800735, + "learning_rate": 0.00018730365903055945, + "loss": 0.2819, + "step": 7759 + }, + { + "epoch": 0.6286454957874271, + "grad_norm": 0.028122996911406517, + "learning_rate": 0.0001872991583779648, + "loss": 0.3095, + "step": 7760 + }, + { + "epoch": 0.6287265068049255, + "grad_norm": 0.0307499747723341, + "learning_rate": 0.0001872946577253702, + "loss": 0.36, + "step": 7761 + }, + { + "epoch": 0.6288075178224238, + "grad_norm": 0.03517741337418556, + "learning_rate": 0.00018729015707277555, + "loss": 0.3515, + "step": 7762 + }, + { + "epoch": 0.6288885288399222, + "grad_norm": 0.03172396123409271, + "learning_rate": 0.00018728565642018094, + "loss": 0.3623, + "step": 7763 + }, + { + "epoch": 0.6289695398574207, + "grad_norm": 0.02997780777513981, + "learning_rate": 0.0001872811557675863, + "loss": 0.3266, + "step": 7764 + }, + { + "epoch": 0.629050550874919, + "grad_norm": 0.02853599190711975, + "learning_rate": 0.0001872766551149917, + "loss": 0.3123, + "step": 7765 + }, + { + "epoch": 0.6291315618924174, + "grad_norm": 0.02735970728099346, + "learning_rate": 0.00018727215446239705, + "loss": 0.3058, + "step": 7766 + }, + { + "epoch": 0.6292125729099157, + "grad_norm": 0.03487173095345497, + "learning_rate": 0.00018726765380980243, + "loss": 0.3259, + "step": 7767 + }, + { + "epoch": 0.6292935839274141, + 
"grad_norm": 0.032258134335279465, + "learning_rate": 0.0001872631531572078, + "loss": 0.3393, + "step": 7768 + }, + { + "epoch": 0.6293745949449125, + "grad_norm": 0.034025609493255615, + "learning_rate": 0.00018725865250461318, + "loss": 0.3721, + "step": 7769 + }, + { + "epoch": 0.6294556059624109, + "grad_norm": 0.03420078381896019, + "learning_rate": 0.00018725415185201854, + "loss": 0.3673, + "step": 7770 + }, + { + "epoch": 0.6295366169799093, + "grad_norm": 0.034589920192956924, + "learning_rate": 0.00018724965119942393, + "loss": 0.3744, + "step": 7771 + }, + { + "epoch": 0.6296176279974076, + "grad_norm": 0.029632601886987686, + "learning_rate": 0.0001872451505468293, + "loss": 0.3097, + "step": 7772 + }, + { + "epoch": 0.6296986390149061, + "grad_norm": 0.030544817447662354, + "learning_rate": 0.00018724064989423468, + "loss": 0.2801, + "step": 7773 + }, + { + "epoch": 0.6297796500324044, + "grad_norm": 0.029614605009555817, + "learning_rate": 0.00018723614924164006, + "loss": 0.3636, + "step": 7774 + }, + { + "epoch": 0.6298606610499028, + "grad_norm": 0.030968619510531425, + "learning_rate": 0.00018723164858904542, + "loss": 0.3664, + "step": 7775 + }, + { + "epoch": 0.6299416720674011, + "grad_norm": 0.026602793484926224, + "learning_rate": 0.0001872271479364508, + "loss": 0.3372, + "step": 7776 + }, + { + "epoch": 0.6300226830848995, + "grad_norm": 0.033030539751052856, + "learning_rate": 0.00018722264728385617, + "loss": 0.3602, + "step": 7777 + }, + { + "epoch": 0.630103694102398, + "grad_norm": 0.03224138543009758, + "learning_rate": 0.00018721814663126153, + "loss": 0.3378, + "step": 7778 + }, + { + "epoch": 0.6301847051198963, + "grad_norm": 0.032139744609594345, + "learning_rate": 0.00018721364597866692, + "loss": 0.3274, + "step": 7779 + }, + { + "epoch": 0.6302657161373947, + "grad_norm": 0.0309866052120924, + "learning_rate": 0.0001872091453260723, + "loss": 0.3763, + "step": 7780 + }, + { + "epoch": 0.630346727154893, + "grad_norm": 0.030568985268473625, + "learning_rate": 0.00018720464467347767, + "loss": 0.3292, + "step": 7781 + }, + { + "epoch": 0.6304277381723914, + "grad_norm": 0.036407891660928726, + "learning_rate": 0.00018720014402088305, + "loss": 0.3756, + "step": 7782 + }, + { + "epoch": 0.6305087491898899, + "grad_norm": 0.029157212004065514, + "learning_rate": 0.0001871956433682884, + "loss": 0.3273, + "step": 7783 + }, + { + "epoch": 0.6305897602073882, + "grad_norm": 0.031077854335308075, + "learning_rate": 0.00018719114271569377, + "loss": 0.3667, + "step": 7784 + }, + { + "epoch": 0.6306707712248866, + "grad_norm": 0.027137432247400284, + "learning_rate": 0.00018718664206309916, + "loss": 0.2917, + "step": 7785 + }, + { + "epoch": 0.6307517822423849, + "grad_norm": 0.03677370026707649, + "learning_rate": 0.00018718214141050455, + "loss": 0.3672, + "step": 7786 + }, + { + "epoch": 0.6308327932598834, + "grad_norm": 0.029398785904049873, + "learning_rate": 0.0001871776407579099, + "loss": 0.3165, + "step": 7787 + }, + { + "epoch": 0.6309138042773818, + "grad_norm": 0.028446868062019348, + "learning_rate": 0.0001871731401053153, + "loss": 0.3239, + "step": 7788 + }, + { + "epoch": 0.6309948152948801, + "grad_norm": 0.032294776290655136, + "learning_rate": 0.00018716863945272065, + "loss": 0.3201, + "step": 7789 + }, + { + "epoch": 0.6310758263123785, + "grad_norm": 0.030610181391239166, + "learning_rate": 0.00018716413880012601, + "loss": 0.3521, + "step": 7790 + }, + { + "epoch": 0.6311568373298768, + "grad_norm": 0.029071614146232605, + 
"learning_rate": 0.0001871596381475314, + "loss": 0.3449, + "step": 7791 + }, + { + "epoch": 0.6312378483473753, + "grad_norm": 0.0313870944082737, + "learning_rate": 0.0001871551374949368, + "loss": 0.327, + "step": 7792 + }, + { + "epoch": 0.6313188593648736, + "grad_norm": 0.028792880475521088, + "learning_rate": 0.00018715063684234215, + "loss": 0.3437, + "step": 7793 + }, + { + "epoch": 0.631399870382372, + "grad_norm": 0.02864512801170349, + "learning_rate": 0.00018714613618974754, + "loss": 0.3288, + "step": 7794 + }, + { + "epoch": 0.6314808813998704, + "grad_norm": 0.0337587371468544, + "learning_rate": 0.0001871416355371529, + "loss": 0.3454, + "step": 7795 + }, + { + "epoch": 0.6315618924173687, + "grad_norm": 0.03164323791861534, + "learning_rate": 0.00018713713488455826, + "loss": 0.3309, + "step": 7796 + }, + { + "epoch": 0.6316429034348672, + "grad_norm": 0.032779909670352936, + "learning_rate": 0.00018713263423196364, + "loss": 0.3963, + "step": 7797 + }, + { + "epoch": 0.6317239144523655, + "grad_norm": 0.031031692400574684, + "learning_rate": 0.00018712813357936903, + "loss": 0.3399, + "step": 7798 + }, + { + "epoch": 0.6318049254698639, + "grad_norm": 0.028976168483495712, + "learning_rate": 0.0001871236329267744, + "loss": 0.3275, + "step": 7799 + }, + { + "epoch": 0.6318859364873622, + "grad_norm": 0.03551590442657471, + "learning_rate": 0.00018711913227417978, + "loss": 0.3604, + "step": 7800 + }, + { + "epoch": 0.6319669475048607, + "grad_norm": 0.0324922576546669, + "learning_rate": 0.00018711463162158514, + "loss": 0.4056, + "step": 7801 + }, + { + "epoch": 0.6320479585223591, + "grad_norm": 0.030459264293313026, + "learning_rate": 0.0001871101309689905, + "loss": 0.3151, + "step": 7802 + }, + { + "epoch": 0.6321289695398574, + "grad_norm": 0.03610960766673088, + "learning_rate": 0.00018710563031639588, + "loss": 0.3772, + "step": 7803 + }, + { + "epoch": 0.6322099805573558, + "grad_norm": 0.033341750502586365, + "learning_rate": 0.00018710112966380127, + "loss": 0.3688, + "step": 7804 + }, + { + "epoch": 0.6322909915748541, + "grad_norm": 0.029451124370098114, + "learning_rate": 0.00018709662901120663, + "loss": 0.3213, + "step": 7805 + }, + { + "epoch": 0.6323720025923526, + "grad_norm": 0.03107476606965065, + "learning_rate": 0.00018709212835861202, + "loss": 0.351, + "step": 7806 + }, + { + "epoch": 0.632453013609851, + "grad_norm": 0.03220190480351448, + "learning_rate": 0.00018708762770601738, + "loss": 0.3563, + "step": 7807 + }, + { + "epoch": 0.6325340246273493, + "grad_norm": 0.028155187144875526, + "learning_rate": 0.00018708312705342274, + "loss": 0.3067, + "step": 7808 + }, + { + "epoch": 0.6326150356448477, + "grad_norm": 0.03451511636376381, + "learning_rate": 0.00018707862640082813, + "loss": 0.406, + "step": 7809 + }, + { + "epoch": 0.6326960466623461, + "grad_norm": 0.03255141153931618, + "learning_rate": 0.0001870741257482335, + "loss": 0.3816, + "step": 7810 + }, + { + "epoch": 0.6327770576798445, + "grad_norm": 0.027588123455643654, + "learning_rate": 0.00018706962509563887, + "loss": 0.3374, + "step": 7811 + }, + { + "epoch": 0.6328580686973428, + "grad_norm": 0.03637088090181351, + "learning_rate": 0.00018706512444304426, + "loss": 0.3403, + "step": 7812 + }, + { + "epoch": 0.6329390797148412, + "grad_norm": 0.0299672894179821, + "learning_rate": 0.00018706062379044962, + "loss": 0.3079, + "step": 7813 + }, + { + "epoch": 0.6330200907323396, + "grad_norm": 0.03295118361711502, + "learning_rate": 0.00018705612313785498, + "loss": 0.3351, + 
"step": 7814 + }, + { + "epoch": 0.633101101749838, + "grad_norm": 0.034902893006801605, + "learning_rate": 0.00018705162248526037, + "loss": 0.354, + "step": 7815 + }, + { + "epoch": 0.6331821127673364, + "grad_norm": 0.03641962260007858, + "learning_rate": 0.00018704712183266575, + "loss": 0.3717, + "step": 7816 + }, + { + "epoch": 0.6332631237848347, + "grad_norm": 0.028225794434547424, + "learning_rate": 0.00018704262118007111, + "loss": 0.3003, + "step": 7817 + }, + { + "epoch": 0.6333441348023331, + "grad_norm": 0.033281516283750534, + "learning_rate": 0.0001870381205274765, + "loss": 0.3613, + "step": 7818 + }, + { + "epoch": 0.6334251458198314, + "grad_norm": 0.03376254066824913, + "learning_rate": 0.00018703361987488186, + "loss": 0.3418, + "step": 7819 + }, + { + "epoch": 0.6335061568373299, + "grad_norm": 0.02990681864321232, + "learning_rate": 0.00018702911922228722, + "loss": 0.35, + "step": 7820 + }, + { + "epoch": 0.6335871678548283, + "grad_norm": 0.029667438939213753, + "learning_rate": 0.0001870246185696926, + "loss": 0.3329, + "step": 7821 + }, + { + "epoch": 0.6336681788723266, + "grad_norm": 0.0334978923201561, + "learning_rate": 0.000187020117917098, + "loss": 0.353, + "step": 7822 + }, + { + "epoch": 0.633749189889825, + "grad_norm": 0.033578719943761826, + "learning_rate": 0.00018701561726450336, + "loss": 0.3596, + "step": 7823 + }, + { + "epoch": 0.6338302009073234, + "grad_norm": 0.033971913158893585, + "learning_rate": 0.00018701111661190874, + "loss": 0.3467, + "step": 7824 + }, + { + "epoch": 0.6339112119248218, + "grad_norm": 0.034550849348306656, + "learning_rate": 0.0001870066159593141, + "loss": 0.3418, + "step": 7825 + }, + { + "epoch": 0.6339922229423202, + "grad_norm": 0.02897421084344387, + "learning_rate": 0.0001870021153067195, + "loss": 0.3507, + "step": 7826 + }, + { + "epoch": 0.6340732339598185, + "grad_norm": 0.028931085020303726, + "learning_rate": 0.00018699761465412485, + "loss": 0.3433, + "step": 7827 + }, + { + "epoch": 0.6341542449773169, + "grad_norm": 0.034571584314107895, + "learning_rate": 0.00018699311400153024, + "loss": 0.3595, + "step": 7828 + }, + { + "epoch": 0.6342352559948153, + "grad_norm": 0.03314541280269623, + "learning_rate": 0.0001869886133489356, + "loss": 0.3272, + "step": 7829 + }, + { + "epoch": 0.6343162670123137, + "grad_norm": 0.031858861446380615, + "learning_rate": 0.00018698411269634099, + "loss": 0.3978, + "step": 7830 + }, + { + "epoch": 0.634397278029812, + "grad_norm": 0.031506266444921494, + "learning_rate": 0.00018697961204374635, + "loss": 0.3604, + "step": 7831 + }, + { + "epoch": 0.6344782890473104, + "grad_norm": 0.028984401375055313, + "learning_rate": 0.00018697511139115173, + "loss": 0.3421, + "step": 7832 + }, + { + "epoch": 0.6345593000648088, + "grad_norm": 0.029323618859052658, + "learning_rate": 0.0001869706107385571, + "loss": 0.324, + "step": 7833 + }, + { + "epoch": 0.6346403110823072, + "grad_norm": 0.02964533492922783, + "learning_rate": 0.00018696611008596248, + "loss": 0.371, + "step": 7834 + }, + { + "epoch": 0.6347213220998056, + "grad_norm": 0.029131043702363968, + "learning_rate": 0.00018696160943336784, + "loss": 0.3247, + "step": 7835 + }, + { + "epoch": 0.6348023331173039, + "grad_norm": 0.032547157257795334, + "learning_rate": 0.00018695710878077323, + "loss": 0.3574, + "step": 7836 + }, + { + "epoch": 0.6348833441348023, + "grad_norm": 0.03153744339942932, + "learning_rate": 0.0001869526081281786, + "loss": 0.363, + "step": 7837 + }, + { + "epoch": 0.6349643551523008, + 
"grad_norm": 0.02874799631536007, + "learning_rate": 0.00018694810747558397, + "loss": 0.3587, + "step": 7838 + }, + { + "epoch": 0.6350453661697991, + "grad_norm": 0.02552749775350094, + "learning_rate": 0.00018694360682298936, + "loss": 0.3196, + "step": 7839 + }, + { + "epoch": 0.6351263771872975, + "grad_norm": 0.03303792327642441, + "learning_rate": 0.00018693910617039472, + "loss": 0.4216, + "step": 7840 + }, + { + "epoch": 0.6352073882047958, + "grad_norm": 0.03018331155180931, + "learning_rate": 0.00018693460551780008, + "loss": 0.3297, + "step": 7841 + }, + { + "epoch": 0.6352883992222942, + "grad_norm": 0.034510936588048935, + "learning_rate": 0.00018693010486520547, + "loss": 0.3494, + "step": 7842 + }, + { + "epoch": 0.6353694102397927, + "grad_norm": 0.031153760850429535, + "learning_rate": 0.00018692560421261083, + "loss": 0.3, + "step": 7843 + }, + { + "epoch": 0.635450421257291, + "grad_norm": 0.03127124160528183, + "learning_rate": 0.00018692110356001622, + "loss": 0.3681, + "step": 7844 + }, + { + "epoch": 0.6355314322747894, + "grad_norm": 0.028756583109498024, + "learning_rate": 0.0001869166029074216, + "loss": 0.3521, + "step": 7845 + }, + { + "epoch": 0.6356124432922877, + "grad_norm": 0.03337053209543228, + "learning_rate": 0.00018691210225482696, + "loss": 0.3818, + "step": 7846 + }, + { + "epoch": 0.6356934543097861, + "grad_norm": 0.032148126512765884, + "learning_rate": 0.00018690760160223232, + "loss": 0.3168, + "step": 7847 + }, + { + "epoch": 0.6357744653272845, + "grad_norm": 0.03170749545097351, + "learning_rate": 0.0001869031009496377, + "loss": 0.3426, + "step": 7848 + }, + { + "epoch": 0.6358554763447829, + "grad_norm": 0.03537492826581001, + "learning_rate": 0.00018689860029704307, + "loss": 0.41, + "step": 7849 + }, + { + "epoch": 0.6359364873622813, + "grad_norm": 0.032313790172338486, + "learning_rate": 0.00018689409964444846, + "loss": 0.3755, + "step": 7850 + }, + { + "epoch": 0.6360174983797796, + "grad_norm": 0.029278071597218513, + "learning_rate": 0.00018688959899185384, + "loss": 0.349, + "step": 7851 + }, + { + "epoch": 0.6360985093972781, + "grad_norm": 0.032198451459407806, + "learning_rate": 0.0001868850983392592, + "loss": 0.3568, + "step": 7852 + }, + { + "epoch": 0.6361795204147764, + "grad_norm": 0.03112160414457321, + "learning_rate": 0.00018688059768666456, + "loss": 0.3429, + "step": 7853 + }, + { + "epoch": 0.6362605314322748, + "grad_norm": 0.03611839562654495, + "learning_rate": 0.00018687609703406995, + "loss": 0.3426, + "step": 7854 + }, + { + "epoch": 0.6363415424497731, + "grad_norm": 0.030980095267295837, + "learning_rate": 0.00018687159638147534, + "loss": 0.3306, + "step": 7855 + }, + { + "epoch": 0.6364225534672715, + "grad_norm": 0.028781326487660408, + "learning_rate": 0.0001868670957288807, + "loss": 0.324, + "step": 7856 + }, + { + "epoch": 0.63650356448477, + "grad_norm": 0.033035773783922195, + "learning_rate": 0.00018686259507628609, + "loss": 0.3372, + "step": 7857 + }, + { + "epoch": 0.6365845755022683, + "grad_norm": 0.03632812201976776, + "learning_rate": 0.00018685809442369145, + "loss": 0.4069, + "step": 7858 + }, + { + "epoch": 0.6366655865197667, + "grad_norm": 0.03556077554821968, + "learning_rate": 0.0001868535937710968, + "loss": 0.3416, + "step": 7859 + }, + { + "epoch": 0.636746597537265, + "grad_norm": 0.04521572217345238, + "learning_rate": 0.0001868490931185022, + "loss": 0.3711, + "step": 7860 + }, + { + "epoch": 0.6368276085547635, + "grad_norm": 0.031015830114483833, + "learning_rate": 
0.00018684459246590758, + "loss": 0.3253, + "step": 7861 + }, + { + "epoch": 0.6369086195722619, + "grad_norm": 0.03212182968854904, + "learning_rate": 0.00018684009181331294, + "loss": 0.3253, + "step": 7862 + }, + { + "epoch": 0.6369896305897602, + "grad_norm": 0.02970615215599537, + "learning_rate": 0.00018683559116071833, + "loss": 0.3117, + "step": 7863 + }, + { + "epoch": 0.6370706416072586, + "grad_norm": 0.03084675595164299, + "learning_rate": 0.0001868310905081237, + "loss": 0.3269, + "step": 7864 + }, + { + "epoch": 0.6371516526247569, + "grad_norm": 0.029871240258216858, + "learning_rate": 0.00018682658985552905, + "loss": 0.3584, + "step": 7865 + }, + { + "epoch": 0.6372326636422554, + "grad_norm": 0.02989710122346878, + "learning_rate": 0.00018682208920293444, + "loss": 0.3301, + "step": 7866 + }, + { + "epoch": 0.6373136746597537, + "grad_norm": 0.0319591723382473, + "learning_rate": 0.00018681758855033982, + "loss": 0.3443, + "step": 7867 + }, + { + "epoch": 0.6373946856772521, + "grad_norm": 0.03004080429673195, + "learning_rate": 0.00018681308789774518, + "loss": 0.3612, + "step": 7868 + }, + { + "epoch": 0.6374756966947505, + "grad_norm": 0.0289496760815382, + "learning_rate": 0.00018680858724515057, + "loss": 0.3276, + "step": 7869 + }, + { + "epoch": 0.6375567077122488, + "grad_norm": 0.032016411423683167, + "learning_rate": 0.00018680408659255593, + "loss": 0.3369, + "step": 7870 + }, + { + "epoch": 0.6376377187297473, + "grad_norm": 0.031566351652145386, + "learning_rate": 0.0001867995859399613, + "loss": 0.2975, + "step": 7871 + }, + { + "epoch": 0.6377187297472456, + "grad_norm": 0.030370811000466347, + "learning_rate": 0.00018679508528736668, + "loss": 0.3555, + "step": 7872 + }, + { + "epoch": 0.637799740764744, + "grad_norm": 0.031356330960989, + "learning_rate": 0.00018679058463477206, + "loss": 0.3226, + "step": 7873 + }, + { + "epoch": 0.6378807517822424, + "grad_norm": 0.025103889405727386, + "learning_rate": 0.00018678608398217742, + "loss": 0.3158, + "step": 7874 + }, + { + "epoch": 0.6379617627997408, + "grad_norm": 0.036442261189222336, + "learning_rate": 0.0001867815833295828, + "loss": 0.3843, + "step": 7875 + }, + { + "epoch": 0.6380427738172392, + "grad_norm": 0.03667015582323074, + "learning_rate": 0.00018677708267698817, + "loss": 0.4306, + "step": 7876 + }, + { + "epoch": 0.6381237848347375, + "grad_norm": 0.03109908103942871, + "learning_rate": 0.00018677258202439353, + "loss": 0.344, + "step": 7877 + }, + { + "epoch": 0.6382047958522359, + "grad_norm": 0.028864892199635506, + "learning_rate": 0.00018676808137179892, + "loss": 0.3103, + "step": 7878 + }, + { + "epoch": 0.6382858068697342, + "grad_norm": 0.031213095411658287, + "learning_rate": 0.0001867635807192043, + "loss": 0.343, + "step": 7879 + }, + { + "epoch": 0.6383668178872327, + "grad_norm": 0.030226996168494225, + "learning_rate": 0.00018675908006660967, + "loss": 0.3656, + "step": 7880 + }, + { + "epoch": 0.6384478289047311, + "grad_norm": 0.031217820942401886, + "learning_rate": 0.00018675457941401505, + "loss": 0.3583, + "step": 7881 + }, + { + "epoch": 0.6385288399222294, + "grad_norm": 0.032496050000190735, + "learning_rate": 0.0001867500787614204, + "loss": 0.3667, + "step": 7882 + }, + { + "epoch": 0.6386098509397278, + "grad_norm": 0.02922358550131321, + "learning_rate": 0.00018674557810882577, + "loss": 0.3303, + "step": 7883 + }, + { + "epoch": 0.6386908619572261, + "grad_norm": 0.03338933363556862, + "learning_rate": 0.00018674107745623116, + "loss": 0.3584, + "step": 7884 + 
}, + { + "epoch": 0.6387718729747246, + "grad_norm": 0.033961571753025055, + "learning_rate": 0.00018673657680363655, + "loss": 0.3591, + "step": 7885 + }, + { + "epoch": 0.638852883992223, + "grad_norm": 0.028723198920488358, + "learning_rate": 0.0001867320761510419, + "loss": 0.3674, + "step": 7886 + }, + { + "epoch": 0.6389338950097213, + "grad_norm": 0.036117613315582275, + "learning_rate": 0.0001867275754984473, + "loss": 0.4079, + "step": 7887 + }, + { + "epoch": 0.6390149060272197, + "grad_norm": 0.028472524136304855, + "learning_rate": 0.00018672307484585265, + "loss": 0.3171, + "step": 7888 + }, + { + "epoch": 0.6390959170447181, + "grad_norm": 0.03445472568273544, + "learning_rate": 0.00018671857419325801, + "loss": 0.3583, + "step": 7889 + }, + { + "epoch": 0.6391769280622165, + "grad_norm": 0.030528537929058075, + "learning_rate": 0.0001867140735406634, + "loss": 0.3694, + "step": 7890 + }, + { + "epoch": 0.6392579390797148, + "grad_norm": 0.029722632840275764, + "learning_rate": 0.0001867095728880688, + "loss": 0.3617, + "step": 7891 + }, + { + "epoch": 0.6393389500972132, + "grad_norm": 0.03173793479800224, + "learning_rate": 0.00018670507223547415, + "loss": 0.3909, + "step": 7892 + }, + { + "epoch": 0.6394199611147116, + "grad_norm": 0.030716672539711, + "learning_rate": 0.00018670057158287954, + "loss": 0.3438, + "step": 7893 + }, + { + "epoch": 0.63950097213221, + "grad_norm": 0.02941414900124073, + "learning_rate": 0.0001866960709302849, + "loss": 0.3159, + "step": 7894 + }, + { + "epoch": 0.6395819831497084, + "grad_norm": 0.0353083461523056, + "learning_rate": 0.00018669157027769026, + "loss": 0.2895, + "step": 7895 + }, + { + "epoch": 0.6396629941672067, + "grad_norm": 0.03264469653367996, + "learning_rate": 0.00018668706962509564, + "loss": 0.3612, + "step": 7896 + }, + { + "epoch": 0.6397440051847051, + "grad_norm": 0.02999110147356987, + "learning_rate": 0.00018668256897250103, + "loss": 0.3251, + "step": 7897 + }, + { + "epoch": 0.6398250162022034, + "grad_norm": 0.029968377202749252, + "learning_rate": 0.0001866780683199064, + "loss": 0.3717, + "step": 7898 + }, + { + "epoch": 0.6399060272197019, + "grad_norm": 0.030061427503824234, + "learning_rate": 0.00018667356766731178, + "loss": 0.3565, + "step": 7899 + }, + { + "epoch": 0.6399870382372003, + "grad_norm": 0.029621459543704987, + "learning_rate": 0.00018666906701471714, + "loss": 0.3551, + "step": 7900 + }, + { + "epoch": 0.6400680492546986, + "grad_norm": 0.03125537186861038, + "learning_rate": 0.0001866645663621225, + "loss": 0.3238, + "step": 7901 + }, + { + "epoch": 0.640149060272197, + "grad_norm": 0.030191486701369286, + "learning_rate": 0.00018666006570952788, + "loss": 0.3346, + "step": 7902 + }, + { + "epoch": 0.6402300712896954, + "grad_norm": 0.03128298744559288, + "learning_rate": 0.00018665556505693327, + "loss": 0.2971, + "step": 7903 + }, + { + "epoch": 0.6403110823071938, + "grad_norm": 0.03299170359969139, + "learning_rate": 0.00018665106440433863, + "loss": 0.329, + "step": 7904 + }, + { + "epoch": 0.6403920933246922, + "grad_norm": 0.03404494374990463, + "learning_rate": 0.00018664656375174402, + "loss": 0.3708, + "step": 7905 + }, + { + "epoch": 0.6404731043421905, + "grad_norm": 0.031427644193172455, + "learning_rate": 0.00018664206309914938, + "loss": 0.329, + "step": 7906 + }, + { + "epoch": 0.6405541153596889, + "grad_norm": 0.030701419338583946, + "learning_rate": 0.00018663756244655477, + "loss": 0.37, + "step": 7907 + }, + { + "epoch": 0.6406351263771873, + "grad_norm": 
0.030969882383942604, + "learning_rate": 0.00018663306179396015, + "loss": 0.3344, + "step": 7908 + }, + { + "epoch": 0.6407161373946857, + "grad_norm": 0.03479425981640816, + "learning_rate": 0.00018662856114136551, + "loss": 0.3623, + "step": 7909 + }, + { + "epoch": 0.640797148412184, + "grad_norm": 0.03328912705183029, + "learning_rate": 0.00018662406048877087, + "loss": 0.3736, + "step": 7910 + }, + { + "epoch": 0.6408781594296824, + "grad_norm": 0.029649941250681877, + "learning_rate": 0.00018661955983617626, + "loss": 0.3327, + "step": 7911 + }, + { + "epoch": 0.6409591704471809, + "grad_norm": 0.03194352239370346, + "learning_rate": 0.00018661505918358162, + "loss": 0.3356, + "step": 7912 + }, + { + "epoch": 0.6410401814646792, + "grad_norm": 0.029047805815935135, + "learning_rate": 0.000186610558530987, + "loss": 0.3159, + "step": 7913 + }, + { + "epoch": 0.6411211924821776, + "grad_norm": 0.02954118326306343, + "learning_rate": 0.0001866060578783924, + "loss": 0.3533, + "step": 7914 + }, + { + "epoch": 0.6412022034996759, + "grad_norm": 0.03956909477710724, + "learning_rate": 0.00018660155722579776, + "loss": 0.3732, + "step": 7915 + }, + { + "epoch": 0.6412832145171743, + "grad_norm": 0.03225688263773918, + "learning_rate": 0.00018659705657320312, + "loss": 0.3361, + "step": 7916 + }, + { + "epoch": 0.6413642255346728, + "grad_norm": 0.03178594261407852, + "learning_rate": 0.0001865925559206085, + "loss": 0.3506, + "step": 7917 + }, + { + "epoch": 0.6414452365521711, + "grad_norm": 0.025829335674643517, + "learning_rate": 0.00018658805526801386, + "loss": 0.2901, + "step": 7918 + }, + { + "epoch": 0.6415262475696695, + "grad_norm": 0.031244834885001183, + "learning_rate": 0.00018658355461541925, + "loss": 0.3615, + "step": 7919 + }, + { + "epoch": 0.6416072585871678, + "grad_norm": 0.03178013116121292, + "learning_rate": 0.00018657905396282464, + "loss": 0.314, + "step": 7920 + }, + { + "epoch": 0.6416882696046662, + "grad_norm": 0.03225788101553917, + "learning_rate": 0.00018657455331023, + "loss": 0.3678, + "step": 7921 + }, + { + "epoch": 0.6417692806221647, + "grad_norm": 0.03708713874220848, + "learning_rate": 0.00018657005265763536, + "loss": 0.4256, + "step": 7922 + }, + { + "epoch": 0.641850291639663, + "grad_norm": 0.029338078573346138, + "learning_rate": 0.00018656555200504074, + "loss": 0.3137, + "step": 7923 + }, + { + "epoch": 0.6419313026571614, + "grad_norm": 0.032095056027173996, + "learning_rate": 0.0001865610513524461, + "loss": 0.3916, + "step": 7924 + }, + { + "epoch": 0.6420123136746597, + "grad_norm": 0.03248443081974983, + "learning_rate": 0.0001865565506998515, + "loss": 0.3505, + "step": 7925 + }, + { + "epoch": 0.6420933246921582, + "grad_norm": 0.03844140097498894, + "learning_rate": 0.00018655205004725688, + "loss": 0.3834, + "step": 7926 + }, + { + "epoch": 0.6421743357096565, + "grad_norm": 0.032976362854242325, + "learning_rate": 0.00018654754939466224, + "loss": 0.3248, + "step": 7927 + }, + { + "epoch": 0.6422553467271549, + "grad_norm": 0.03275357559323311, + "learning_rate": 0.0001865430487420676, + "loss": 0.327, + "step": 7928 + }, + { + "epoch": 0.6423363577446533, + "grad_norm": 0.03425171598792076, + "learning_rate": 0.00018653854808947299, + "loss": 0.3443, + "step": 7929 + }, + { + "epoch": 0.6424173687621516, + "grad_norm": 0.03208903223276138, + "learning_rate": 0.00018653404743687835, + "loss": 0.3217, + "step": 7930 + }, + { + "epoch": 0.6424983797796501, + "grad_norm": 0.028323756530880928, + "learning_rate": 0.00018652954678428373, 
+ "loss": 0.3016, + "step": 7931 + }, + { + "epoch": 0.6425793907971484, + "grad_norm": 0.0322665199637413, + "learning_rate": 0.00018652504613168912, + "loss": 0.3503, + "step": 7932 + }, + { + "epoch": 0.6426604018146468, + "grad_norm": 0.02915794961154461, + "learning_rate": 0.00018652054547909448, + "loss": 0.305, + "step": 7933 + }, + { + "epoch": 0.6427414128321451, + "grad_norm": 0.03422217071056366, + "learning_rate": 0.00018651604482649984, + "loss": 0.3062, + "step": 7934 + }, + { + "epoch": 0.6428224238496435, + "grad_norm": 0.032783396542072296, + "learning_rate": 0.00018651154417390523, + "loss": 0.3768, + "step": 7935 + }, + { + "epoch": 0.642903434867142, + "grad_norm": 0.045881237834692, + "learning_rate": 0.00018650704352131061, + "loss": 0.3574, + "step": 7936 + }, + { + "epoch": 0.6429844458846403, + "grad_norm": 0.031124461442232132, + "learning_rate": 0.00018650254286871597, + "loss": 0.3339, + "step": 7937 + }, + { + "epoch": 0.6430654569021387, + "grad_norm": 0.03581704944372177, + "learning_rate": 0.00018649804221612136, + "loss": 0.3366, + "step": 7938 + }, + { + "epoch": 0.643146467919637, + "grad_norm": 0.03543675318360329, + "learning_rate": 0.00018649354156352672, + "loss": 0.412, + "step": 7939 + }, + { + "epoch": 0.6432274789371355, + "grad_norm": 0.0327875129878521, + "learning_rate": 0.00018648904091093208, + "loss": 0.3712, + "step": 7940 + }, + { + "epoch": 0.6433084899546339, + "grad_norm": 0.033418960869312286, + "learning_rate": 0.00018648454025833747, + "loss": 0.3482, + "step": 7941 + }, + { + "epoch": 0.6433895009721322, + "grad_norm": 0.029584798961877823, + "learning_rate": 0.00018648003960574286, + "loss": 0.3146, + "step": 7942 + }, + { + "epoch": 0.6434705119896306, + "grad_norm": 0.033126283437013626, + "learning_rate": 0.00018647553895314822, + "loss": 0.3722, + "step": 7943 + }, + { + "epoch": 0.6435515230071289, + "grad_norm": 0.03137566149234772, + "learning_rate": 0.0001864710383005536, + "loss": 0.3713, + "step": 7944 + }, + { + "epoch": 0.6436325340246274, + "grad_norm": 0.03218172490596771, + "learning_rate": 0.00018646653764795896, + "loss": 0.3373, + "step": 7945 + }, + { + "epoch": 0.6437135450421257, + "grad_norm": 0.03332442045211792, + "learning_rate": 0.00018646203699536432, + "loss": 0.3969, + "step": 7946 + }, + { + "epoch": 0.6437945560596241, + "grad_norm": 0.029476812109351158, + "learning_rate": 0.0001864575363427697, + "loss": 0.3237, + "step": 7947 + }, + { + "epoch": 0.6438755670771225, + "grad_norm": 0.029915448278188705, + "learning_rate": 0.0001864530356901751, + "loss": 0.3233, + "step": 7948 + }, + { + "epoch": 0.6439565780946209, + "grad_norm": 0.028523625805974007, + "learning_rate": 0.00018644853503758046, + "loss": 0.3583, + "step": 7949 + }, + { + "epoch": 0.6440375891121193, + "grad_norm": 0.03365233168005943, + "learning_rate": 0.00018644403438498584, + "loss": 0.3135, + "step": 7950 + }, + { + "epoch": 0.6441186001296176, + "grad_norm": 0.029672134667634964, + "learning_rate": 0.0001864395337323912, + "loss": 0.3326, + "step": 7951 + }, + { + "epoch": 0.644199611147116, + "grad_norm": 0.03292734920978546, + "learning_rate": 0.00018643503307979656, + "loss": 0.3933, + "step": 7952 + }, + { + "epoch": 0.6442806221646143, + "grad_norm": 0.02732773683965206, + "learning_rate": 0.00018643053242720195, + "loss": 0.3157, + "step": 7953 + }, + { + "epoch": 0.6443616331821128, + "grad_norm": 0.02848062478005886, + "learning_rate": 0.00018642603177460734, + "loss": 0.3228, + "step": 7954 + }, + { + "epoch": 
0.6444426441996112, + "grad_norm": 0.03194410353899002, + "learning_rate": 0.0001864215311220127, + "loss": 0.3175, + "step": 7955 + }, + { + "epoch": 0.6445236552171095, + "grad_norm": 0.031859688460826874, + "learning_rate": 0.0001864170304694181, + "loss": 0.3438, + "step": 7956 + }, + { + "epoch": 0.6446046662346079, + "grad_norm": 0.03289076313376427, + "learning_rate": 0.00018641252981682345, + "loss": 0.3058, + "step": 7957 + }, + { + "epoch": 0.6446856772521062, + "grad_norm": 0.02861196920275688, + "learning_rate": 0.0001864080291642288, + "loss": 0.3345, + "step": 7958 + }, + { + "epoch": 0.6447666882696047, + "grad_norm": 0.03252119943499565, + "learning_rate": 0.0001864035285116342, + "loss": 0.3565, + "step": 7959 + }, + { + "epoch": 0.6448476992871031, + "grad_norm": 0.03106110729277134, + "learning_rate": 0.00018639902785903958, + "loss": 0.3154, + "step": 7960 + }, + { + "epoch": 0.6449287103046014, + "grad_norm": 0.032401468604803085, + "learning_rate": 0.00018639452720644494, + "loss": 0.3781, + "step": 7961 + }, + { + "epoch": 0.6450097213220998, + "grad_norm": 0.03558249771595001, + "learning_rate": 0.00018639002655385033, + "loss": 0.3246, + "step": 7962 + }, + { + "epoch": 0.6450907323395982, + "grad_norm": 0.02981211617588997, + "learning_rate": 0.0001863855259012557, + "loss": 0.3168, + "step": 7963 + }, + { + "epoch": 0.6451717433570966, + "grad_norm": 0.031262945383787155, + "learning_rate": 0.00018638102524866105, + "loss": 0.3403, + "step": 7964 + }, + { + "epoch": 0.645252754374595, + "grad_norm": 0.036666229367256165, + "learning_rate": 0.00018637652459606644, + "loss": 0.3721, + "step": 7965 + }, + { + "epoch": 0.6453337653920933, + "grad_norm": 0.03510688245296478, + "learning_rate": 0.00018637202394347182, + "loss": 0.3607, + "step": 7966 + }, + { + "epoch": 0.6454147764095917, + "grad_norm": 0.032817598432302475, + "learning_rate": 0.00018636752329087718, + "loss": 0.3372, + "step": 7967 + }, + { + "epoch": 0.6454957874270901, + "grad_norm": 0.02903442457318306, + "learning_rate": 0.00018636302263828257, + "loss": 0.3263, + "step": 7968 + }, + { + "epoch": 0.6455767984445885, + "grad_norm": 0.03062974475324154, + "learning_rate": 0.00018635852198568793, + "loss": 0.3696, + "step": 7969 + }, + { + "epoch": 0.6456578094620868, + "grad_norm": 0.032218314707279205, + "learning_rate": 0.0001863540213330933, + "loss": 0.3409, + "step": 7970 + }, + { + "epoch": 0.6457388204795852, + "grad_norm": 0.031189072877168655, + "learning_rate": 0.00018634952068049868, + "loss": 0.3022, + "step": 7971 + }, + { + "epoch": 0.6458198314970836, + "grad_norm": 0.02876521833240986, + "learning_rate": 0.00018634502002790406, + "loss": 0.3585, + "step": 7972 + }, + { + "epoch": 0.645900842514582, + "grad_norm": 0.029905641451478004, + "learning_rate": 0.00018634051937530942, + "loss": 0.3379, + "step": 7973 + }, + { + "epoch": 0.6459818535320804, + "grad_norm": 0.038259658962488174, + "learning_rate": 0.0001863360187227148, + "loss": 0.4316, + "step": 7974 + }, + { + "epoch": 0.6460628645495787, + "grad_norm": 0.03205183893442154, + "learning_rate": 0.00018633151807012017, + "loss": 0.3666, + "step": 7975 + }, + { + "epoch": 0.6461438755670771, + "grad_norm": 0.031414229422807693, + "learning_rate": 0.00018632701741752553, + "loss": 0.3861, + "step": 7976 + }, + { + "epoch": 0.6462248865845756, + "grad_norm": 0.03359837830066681, + "learning_rate": 0.00018632251676493095, + "loss": 0.4069, + "step": 7977 + }, + { + "epoch": 0.6463058976020739, + "grad_norm": 0.03183357045054436, + 
"learning_rate": 0.0001863180161123363, + "loss": 0.3205, + "step": 7978 + }, + { + "epoch": 0.6463869086195723, + "grad_norm": 0.03323815017938614, + "learning_rate": 0.00018631351545974167, + "loss": 0.3296, + "step": 7979 + }, + { + "epoch": 0.6464679196370706, + "grad_norm": 0.0322708860039711, + "learning_rate": 0.00018630901480714705, + "loss": 0.4201, + "step": 7980 + }, + { + "epoch": 0.646548930654569, + "grad_norm": 0.025975298136472702, + "learning_rate": 0.0001863045141545524, + "loss": 0.2796, + "step": 7981 + }, + { + "epoch": 0.6466299416720674, + "grad_norm": 0.03103083372116089, + "learning_rate": 0.00018630001350195777, + "loss": 0.3357, + "step": 7982 + }, + { + "epoch": 0.6467109526895658, + "grad_norm": 0.03107629343867302, + "learning_rate": 0.0001862955128493632, + "loss": 0.3631, + "step": 7983 + }, + { + "epoch": 0.6467919637070642, + "grad_norm": 0.028790367767214775, + "learning_rate": 0.00018629101219676855, + "loss": 0.3155, + "step": 7984 + }, + { + "epoch": 0.6468729747245625, + "grad_norm": 0.029491981491446495, + "learning_rate": 0.0001862865115441739, + "loss": 0.3385, + "step": 7985 + }, + { + "epoch": 0.6469539857420609, + "grad_norm": 0.03248732164502144, + "learning_rate": 0.0001862820108915793, + "loss": 0.3698, + "step": 7986 + }, + { + "epoch": 0.6470349967595593, + "grad_norm": 0.030972251668572426, + "learning_rate": 0.00018627751023898465, + "loss": 0.3182, + "step": 7987 + }, + { + "epoch": 0.6471160077770577, + "grad_norm": 0.029722899198532104, + "learning_rate": 0.00018627300958639004, + "loss": 0.2785, + "step": 7988 + }, + { + "epoch": 0.647197018794556, + "grad_norm": 0.03773869574069977, + "learning_rate": 0.00018626850893379543, + "loss": 0.3493, + "step": 7989 + }, + { + "epoch": 0.6472780298120544, + "grad_norm": 0.031442657113075256, + "learning_rate": 0.0001862640082812008, + "loss": 0.3414, + "step": 7990 + }, + { + "epoch": 0.6473590408295529, + "grad_norm": 0.030141377821564674, + "learning_rate": 0.00018625950762860615, + "loss": 0.2964, + "step": 7991 + }, + { + "epoch": 0.6474400518470512, + "grad_norm": 0.03201432153582573, + "learning_rate": 0.00018625500697601154, + "loss": 0.355, + "step": 7992 + }, + { + "epoch": 0.6475210628645496, + "grad_norm": 0.027599439024925232, + "learning_rate": 0.0001862505063234169, + "loss": 0.2955, + "step": 7993 + }, + { + "epoch": 0.6476020738820479, + "grad_norm": 0.035235144197940826, + "learning_rate": 0.00018624600567082228, + "loss": 0.343, + "step": 7994 + }, + { + "epoch": 0.6476830848995463, + "grad_norm": 0.03464788198471069, + "learning_rate": 0.00018624150501822767, + "loss": 0.3726, + "step": 7995 + }, + { + "epoch": 0.6477640959170448, + "grad_norm": 0.03147656470537186, + "learning_rate": 0.00018623700436563303, + "loss": 0.3669, + "step": 7996 + }, + { + "epoch": 0.6478451069345431, + "grad_norm": 0.030429191887378693, + "learning_rate": 0.0001862325037130384, + "loss": 0.3533, + "step": 7997 + }, + { + "epoch": 0.6479261179520415, + "grad_norm": 0.03295878693461418, + "learning_rate": 0.00018622800306044378, + "loss": 0.298, + "step": 7998 + }, + { + "epoch": 0.6480071289695398, + "grad_norm": 0.03022496961057186, + "learning_rate": 0.00018622350240784914, + "loss": 0.3597, + "step": 7999 + }, + { + "epoch": 0.6480881399870383, + "grad_norm": 0.03181470185518265, + "learning_rate": 0.00018621900175525453, + "loss": 0.3484, + "step": 8000 + }, + { + "epoch": 0.6481691510045366, + "grad_norm": 0.040238749235868454, + "learning_rate": 0.0001862145011026599, + "loss": 0.2987, + 
"step": 8001 + }, + { + "epoch": 0.648250162022035, + "grad_norm": 0.027749789878726006, + "learning_rate": 0.00018621000045006527, + "loss": 0.3122, + "step": 8002 + }, + { + "epoch": 0.6483311730395334, + "grad_norm": 0.028344471007585526, + "learning_rate": 0.00018620549979747063, + "loss": 0.2646, + "step": 8003 + }, + { + "epoch": 0.6484121840570317, + "grad_norm": 0.028361354023218155, + "learning_rate": 0.00018620099914487602, + "loss": 0.3586, + "step": 8004 + }, + { + "epoch": 0.6484931950745302, + "grad_norm": 0.030784590169787407, + "learning_rate": 0.00018619649849228138, + "loss": 0.336, + "step": 8005 + }, + { + "epoch": 0.6485742060920285, + "grad_norm": 0.027588283643126488, + "learning_rate": 0.00018619199783968677, + "loss": 0.297, + "step": 8006 + }, + { + "epoch": 0.6486552171095269, + "grad_norm": 0.03066411241889, + "learning_rate": 0.00018618749718709215, + "loss": 0.3539, + "step": 8007 + }, + { + "epoch": 0.6487362281270252, + "grad_norm": 0.033937420696020126, + "learning_rate": 0.00018618299653449751, + "loss": 0.3963, + "step": 8008 + }, + { + "epoch": 0.6488172391445236, + "grad_norm": 0.03501052036881447, + "learning_rate": 0.00018617849588190287, + "loss": 0.3579, + "step": 8009 + }, + { + "epoch": 0.6488982501620221, + "grad_norm": 0.03133450448513031, + "learning_rate": 0.00018617399522930826, + "loss": 0.3442, + "step": 8010 + }, + { + "epoch": 0.6489792611795204, + "grad_norm": 0.03345509245991707, + "learning_rate": 0.00018616949457671365, + "loss": 0.3604, + "step": 8011 + }, + { + "epoch": 0.6490602721970188, + "grad_norm": 0.033844854682683945, + "learning_rate": 0.000186164993924119, + "loss": 0.3981, + "step": 8012 + }, + { + "epoch": 0.6491412832145171, + "grad_norm": 0.03028370812535286, + "learning_rate": 0.0001861604932715244, + "loss": 0.3166, + "step": 8013 + }, + { + "epoch": 0.6492222942320156, + "grad_norm": 0.026143895462155342, + "learning_rate": 0.00018615599261892976, + "loss": 0.3192, + "step": 8014 + }, + { + "epoch": 0.649303305249514, + "grad_norm": 0.034095458686351776, + "learning_rate": 0.00018615149196633512, + "loss": 0.4142, + "step": 8015 + }, + { + "epoch": 0.6493843162670123, + "grad_norm": 0.03332599624991417, + "learning_rate": 0.0001861469913137405, + "loss": 0.3668, + "step": 8016 + }, + { + "epoch": 0.6494653272845107, + "grad_norm": 0.03390030190348625, + "learning_rate": 0.0001861424906611459, + "loss": 0.3198, + "step": 8017 + }, + { + "epoch": 0.649546338302009, + "grad_norm": 0.029879910871386528, + "learning_rate": 0.00018613799000855125, + "loss": 0.3186, + "step": 8018 + }, + { + "epoch": 0.6496273493195075, + "grad_norm": 0.03314916417002678, + "learning_rate": 0.00018613348935595664, + "loss": 0.3611, + "step": 8019 + }, + { + "epoch": 0.6497083603370059, + "grad_norm": 0.03120088204741478, + "learning_rate": 0.000186128988703362, + "loss": 0.3322, + "step": 8020 + }, + { + "epoch": 0.6497893713545042, + "grad_norm": 0.029262244701385498, + "learning_rate": 0.00018612448805076736, + "loss": 0.3151, + "step": 8021 + }, + { + "epoch": 0.6498703823720026, + "grad_norm": 0.03213749825954437, + "learning_rate": 0.00018611998739817274, + "loss": 0.3447, + "step": 8022 + }, + { + "epoch": 0.6499513933895009, + "grad_norm": 0.030877763405442238, + "learning_rate": 0.00018611548674557813, + "loss": 0.3584, + "step": 8023 + }, + { + "epoch": 0.6500324044069994, + "grad_norm": 0.031971901655197144, + "learning_rate": 0.0001861109860929835, + "loss": 0.3481, + "step": 8024 + }, + { + "epoch": 0.6501134154244977, + 
"grad_norm": 0.03156515210866928, + "learning_rate": 0.00018610648544038888, + "loss": 0.3788, + "step": 8025 + }, + { + "epoch": 0.6501944264419961, + "grad_norm": 0.03007367067039013, + "learning_rate": 0.00018610198478779424, + "loss": 0.3403, + "step": 8026 + }, + { + "epoch": 0.6502754374594945, + "grad_norm": 0.0311642587184906, + "learning_rate": 0.0001860974841351996, + "loss": 0.3279, + "step": 8027 + }, + { + "epoch": 0.6503564484769929, + "grad_norm": 0.03929786756634712, + "learning_rate": 0.00018609298348260499, + "loss": 0.3465, + "step": 8028 + }, + { + "epoch": 0.6504374594944913, + "grad_norm": 0.02747124247252941, + "learning_rate": 0.00018608848283001037, + "loss": 0.2891, + "step": 8029 + }, + { + "epoch": 0.6505184705119896, + "grad_norm": 0.03623297065496445, + "learning_rate": 0.00018608398217741573, + "loss": 0.418, + "step": 8030 + }, + { + "epoch": 0.650599481529488, + "grad_norm": 0.03151100128889084, + "learning_rate": 0.00018607948152482112, + "loss": 0.3487, + "step": 8031 + }, + { + "epoch": 0.6506804925469863, + "grad_norm": 0.028096545487642288, + "learning_rate": 0.00018607498087222648, + "loss": 0.3142, + "step": 8032 + }, + { + "epoch": 0.6507615035644848, + "grad_norm": 0.03158478066325188, + "learning_rate": 0.00018607048021963184, + "loss": 0.3316, + "step": 8033 + }, + { + "epoch": 0.6508425145819832, + "grad_norm": 0.029952632263302803, + "learning_rate": 0.00018606597956703723, + "loss": 0.339, + "step": 8034 + }, + { + "epoch": 0.6509235255994815, + "grad_norm": 0.03018800914287567, + "learning_rate": 0.00018606147891444261, + "loss": 0.3186, + "step": 8035 + }, + { + "epoch": 0.6510045366169799, + "grad_norm": 0.028747960925102234, + "learning_rate": 0.00018605697826184797, + "loss": 0.3322, + "step": 8036 + }, + { + "epoch": 0.6510855476344782, + "grad_norm": 0.030452804639935493, + "learning_rate": 0.00018605247760925336, + "loss": 0.3833, + "step": 8037 + }, + { + "epoch": 0.6511665586519767, + "grad_norm": 0.030978774651885033, + "learning_rate": 0.00018604797695665872, + "loss": 0.3378, + "step": 8038 + }, + { + "epoch": 0.6512475696694751, + "grad_norm": 0.030736364424228668, + "learning_rate": 0.00018604347630406408, + "loss": 0.3492, + "step": 8039 + }, + { + "epoch": 0.6513285806869734, + "grad_norm": 0.03373675048351288, + "learning_rate": 0.00018603897565146947, + "loss": 0.3894, + "step": 8040 + }, + { + "epoch": 0.6514095917044718, + "grad_norm": 0.03460337966680527, + "learning_rate": 0.00018603447499887486, + "loss": 0.3471, + "step": 8041 + }, + { + "epoch": 0.6514906027219702, + "grad_norm": 0.031132934615015984, + "learning_rate": 0.00018602997434628022, + "loss": 0.3235, + "step": 8042 + }, + { + "epoch": 0.6515716137394686, + "grad_norm": 0.03173597529530525, + "learning_rate": 0.0001860254736936856, + "loss": 0.3369, + "step": 8043 + }, + { + "epoch": 0.651652624756967, + "grad_norm": 0.030315032228827477, + "learning_rate": 0.00018602097304109096, + "loss": 0.3247, + "step": 8044 + }, + { + "epoch": 0.6517336357744653, + "grad_norm": 0.034832924604415894, + "learning_rate": 0.00018601647238849632, + "loss": 0.3624, + "step": 8045 + }, + { + "epoch": 0.6518146467919637, + "grad_norm": 0.031159505248069763, + "learning_rate": 0.00018601197173590174, + "loss": 0.3391, + "step": 8046 + }, + { + "epoch": 0.6518956578094621, + "grad_norm": 0.03149305656552315, + "learning_rate": 0.0001860074710833071, + "loss": 0.3688, + "step": 8047 + }, + { + "epoch": 0.6519766688269605, + "grad_norm": 0.03061858005821705, + "learning_rate": 
0.00018600297043071246, + "loss": 0.3645, + "step": 8048 + }, + { + "epoch": 0.6520576798444588, + "grad_norm": 0.03019680269062519, + "learning_rate": 0.00018599846977811785, + "loss": 0.3284, + "step": 8049 + }, + { + "epoch": 0.6521386908619572, + "grad_norm": 0.029862701892852783, + "learning_rate": 0.0001859939691255232, + "loss": 0.3286, + "step": 8050 + }, + { + "epoch": 0.6522197018794557, + "grad_norm": 0.03851095587015152, + "learning_rate": 0.00018598946847292857, + "loss": 0.3709, + "step": 8051 + }, + { + "epoch": 0.652300712896954, + "grad_norm": 0.034326620399951935, + "learning_rate": 0.00018598496782033398, + "loss": 0.3327, + "step": 8052 + }, + { + "epoch": 0.6523817239144524, + "grad_norm": 0.02795341983437538, + "learning_rate": 0.00018598046716773934, + "loss": 0.3188, + "step": 8053 + }, + { + "epoch": 0.6524627349319507, + "grad_norm": 0.03119768388569355, + "learning_rate": 0.0001859759665151447, + "loss": 0.3795, + "step": 8054 + }, + { + "epoch": 0.6525437459494491, + "grad_norm": 0.031861960887908936, + "learning_rate": 0.0001859714658625501, + "loss": 0.3837, + "step": 8055 + }, + { + "epoch": 0.6526247569669476, + "grad_norm": 0.028594138100743294, + "learning_rate": 0.00018596696520995545, + "loss": 0.3313, + "step": 8056 + }, + { + "epoch": 0.6527057679844459, + "grad_norm": 0.033589012920856476, + "learning_rate": 0.0001859624645573608, + "loss": 0.354, + "step": 8057 + }, + { + "epoch": 0.6527867790019443, + "grad_norm": 0.03334236517548561, + "learning_rate": 0.00018595796390476622, + "loss": 0.3248, + "step": 8058 + }, + { + "epoch": 0.6528677900194426, + "grad_norm": 0.03135516494512558, + "learning_rate": 0.00018595346325217158, + "loss": 0.3385, + "step": 8059 + }, + { + "epoch": 0.652948801036941, + "grad_norm": 0.03384825587272644, + "learning_rate": 0.00018594896259957694, + "loss": 0.3494, + "step": 8060 + }, + { + "epoch": 0.6530298120544394, + "grad_norm": 0.02878384292125702, + "learning_rate": 0.00018594446194698233, + "loss": 0.3105, + "step": 8061 + }, + { + "epoch": 0.6531108230719378, + "grad_norm": 0.03048752248287201, + "learning_rate": 0.0001859399612943877, + "loss": 0.3408, + "step": 8062 + }, + { + "epoch": 0.6531918340894362, + "grad_norm": 0.03223764896392822, + "learning_rate": 0.00018593546064179308, + "loss": 0.3665, + "step": 8063 + }, + { + "epoch": 0.6532728451069345, + "grad_norm": 0.03150539472699165, + "learning_rate": 0.00018593095998919846, + "loss": 0.3255, + "step": 8064 + }, + { + "epoch": 0.653353856124433, + "grad_norm": 0.03526121377944946, + "learning_rate": 0.00018592645933660382, + "loss": 0.4035, + "step": 8065 + }, + { + "epoch": 0.6534348671419313, + "grad_norm": 0.028970615938305855, + "learning_rate": 0.00018592195868400918, + "loss": 0.365, + "step": 8066 + }, + { + "epoch": 0.6535158781594297, + "grad_norm": 0.034498874098062515, + "learning_rate": 0.00018591745803141457, + "loss": 0.3666, + "step": 8067 + }, + { + "epoch": 0.653596889176928, + "grad_norm": 0.030491981655359268, + "learning_rate": 0.00018591295737881993, + "loss": 0.3711, + "step": 8068 + }, + { + "epoch": 0.6536779001944264, + "grad_norm": 0.029132137075066566, + "learning_rate": 0.00018590845672622532, + "loss": 0.303, + "step": 8069 + }, + { + "epoch": 0.6537589112119249, + "grad_norm": 0.02738095261156559, + "learning_rate": 0.0001859039560736307, + "loss": 0.3228, + "step": 8070 + }, + { + "epoch": 0.6538399222294232, + "grad_norm": 0.03282042592763901, + "learning_rate": 0.00018589945542103606, + "loss": 0.3893, + "step": 8071 + }, 
+ { + "epoch": 0.6539209332469216, + "grad_norm": 0.03241070732474327, + "learning_rate": 0.00018589495476844142, + "loss": 0.3724, + "step": 8072 + }, + { + "epoch": 0.6540019442644199, + "grad_norm": 0.032008569687604904, + "learning_rate": 0.0001858904541158468, + "loss": 0.3184, + "step": 8073 + }, + { + "epoch": 0.6540829552819183, + "grad_norm": 0.03105410560965538, + "learning_rate": 0.00018588595346325217, + "loss": 0.354, + "step": 8074 + }, + { + "epoch": 0.6541639662994168, + "grad_norm": 0.028940342366695404, + "learning_rate": 0.00018588145281065756, + "loss": 0.3252, + "step": 8075 + }, + { + "epoch": 0.6542449773169151, + "grad_norm": 0.03389149159193039, + "learning_rate": 0.00018587695215806295, + "loss": 0.3761, + "step": 8076 + }, + { + "epoch": 0.6543259883344135, + "grad_norm": 0.028859373182058334, + "learning_rate": 0.0001858724515054683, + "loss": 0.3609, + "step": 8077 + }, + { + "epoch": 0.6544069993519118, + "grad_norm": 0.03037380985915661, + "learning_rate": 0.00018586795085287367, + "loss": 0.3317, + "step": 8078 + }, + { + "epoch": 0.6544880103694103, + "grad_norm": 0.03146057575941086, + "learning_rate": 0.00018586345020027905, + "loss": 0.3699, + "step": 8079 + }, + { + "epoch": 0.6545690213869086, + "grad_norm": 0.031758084893226624, + "learning_rate": 0.0001858589495476844, + "loss": 0.3537, + "step": 8080 + }, + { + "epoch": 0.654650032404407, + "grad_norm": 0.03247644007205963, + "learning_rate": 0.0001858544488950898, + "loss": 0.3575, + "step": 8081 + }, + { + "epoch": 0.6547310434219054, + "grad_norm": 0.03290760517120361, + "learning_rate": 0.0001858499482424952, + "loss": 0.3372, + "step": 8082 + }, + { + "epoch": 0.6548120544394037, + "grad_norm": 0.029254132881760597, + "learning_rate": 0.00018584544758990055, + "loss": 0.3084, + "step": 8083 + }, + { + "epoch": 0.6548930654569022, + "grad_norm": 0.03385884314775467, + "learning_rate": 0.0001858409469373059, + "loss": 0.3641, + "step": 8084 + }, + { + "epoch": 0.6549740764744005, + "grad_norm": 0.032351624220609665, + "learning_rate": 0.0001858364462847113, + "loss": 0.3702, + "step": 8085 + }, + { + "epoch": 0.6550550874918989, + "grad_norm": 0.03416949883103371, + "learning_rate": 0.00018583194563211665, + "loss": 0.3482, + "step": 8086 + }, + { + "epoch": 0.6551360985093972, + "grad_norm": 0.03164816275238991, + "learning_rate": 0.00018582744497952204, + "loss": 0.3747, + "step": 8087 + }, + { + "epoch": 0.6552171095268956, + "grad_norm": 0.029934266582131386, + "learning_rate": 0.00018582294432692743, + "loss": 0.3367, + "step": 8088 + }, + { + "epoch": 0.6552981205443941, + "grad_norm": 0.03290180861949921, + "learning_rate": 0.0001858184436743328, + "loss": 0.3688, + "step": 8089 + }, + { + "epoch": 0.6553791315618924, + "grad_norm": 0.032323382794857025, + "learning_rate": 0.00018581394302173815, + "loss": 0.3525, + "step": 8090 + }, + { + "epoch": 0.6554601425793908, + "grad_norm": 0.033882077783346176, + "learning_rate": 0.00018580944236914354, + "loss": 0.3368, + "step": 8091 + }, + { + "epoch": 0.6555411535968891, + "grad_norm": 0.02703363448381424, + "learning_rate": 0.00018580494171654892, + "loss": 0.3239, + "step": 8092 + }, + { + "epoch": 0.6556221646143876, + "grad_norm": 0.03567137196660042, + "learning_rate": 0.00018580044106395428, + "loss": 0.349, + "step": 8093 + }, + { + "epoch": 0.655703175631886, + "grad_norm": 0.03596005216240883, + "learning_rate": 0.00018579594041135967, + "loss": 0.3806, + "step": 8094 + }, + { + "epoch": 0.6557841866493843, + "grad_norm": 
0.02762463130056858, + "learning_rate": 0.00018579143975876503, + "loss": 0.3162, + "step": 8095 + }, + { + "epoch": 0.6558651976668827, + "grad_norm": 0.02889867126941681, + "learning_rate": 0.0001857869391061704, + "loss": 0.3202, + "step": 8096 + }, + { + "epoch": 0.655946208684381, + "grad_norm": 0.03050253912806511, + "learning_rate": 0.00018578243845357578, + "loss": 0.3443, + "step": 8097 + }, + { + "epoch": 0.6560272197018795, + "grad_norm": 0.03690246492624283, + "learning_rate": 0.00018577793780098117, + "loss": 0.3787, + "step": 8098 + }, + { + "epoch": 0.6561082307193778, + "grad_norm": 0.03827010095119476, + "learning_rate": 0.00018577343714838653, + "loss": 0.346, + "step": 8099 + }, + { + "epoch": 0.6561892417368762, + "grad_norm": 0.031673092395067215, + "learning_rate": 0.0001857689364957919, + "loss": 0.3205, + "step": 8100 + }, + { + "epoch": 0.6562702527543746, + "grad_norm": 0.027814095839858055, + "learning_rate": 0.00018576443584319727, + "loss": 0.3237, + "step": 8101 + }, + { + "epoch": 0.656351263771873, + "grad_norm": 0.035531893372535706, + "learning_rate": 0.00018575993519060263, + "loss": 0.3353, + "step": 8102 + }, + { + "epoch": 0.6564322747893714, + "grad_norm": 0.029679805040359497, + "learning_rate": 0.00018575543453800802, + "loss": 0.2917, + "step": 8103 + }, + { + "epoch": 0.6565132858068697, + "grad_norm": 0.035042133182287216, + "learning_rate": 0.0001857509338854134, + "loss": 0.3663, + "step": 8104 + }, + { + "epoch": 0.6565942968243681, + "grad_norm": 0.03472837433218956, + "learning_rate": 0.00018574643323281877, + "loss": 0.3746, + "step": 8105 + }, + { + "epoch": 0.6566753078418665, + "grad_norm": 0.033048685640096664, + "learning_rate": 0.00018574193258022415, + "loss": 0.3643, + "step": 8106 + }, + { + "epoch": 0.6567563188593649, + "grad_norm": 0.028987662866711617, + "learning_rate": 0.00018573743192762951, + "loss": 0.367, + "step": 8107 + }, + { + "epoch": 0.6568373298768633, + "grad_norm": 0.03321503847837448, + "learning_rate": 0.00018573293127503487, + "loss": 0.3682, + "step": 8108 + }, + { + "epoch": 0.6569183408943616, + "grad_norm": 0.02941848896443844, + "learning_rate": 0.00018572843062244026, + "loss": 0.3408, + "step": 8109 + }, + { + "epoch": 0.65699935191186, + "grad_norm": 0.03230408951640129, + "learning_rate": 0.00018572392996984565, + "loss": 0.3347, + "step": 8110 + }, + { + "epoch": 0.6570803629293583, + "grad_norm": 0.028914272785186768, + "learning_rate": 0.000185719429317251, + "loss": 0.3017, + "step": 8111 + }, + { + "epoch": 0.6571613739468568, + "grad_norm": 0.03766142949461937, + "learning_rate": 0.0001857149286646564, + "loss": 0.3322, + "step": 8112 + }, + { + "epoch": 0.6572423849643552, + "grad_norm": 0.03311333432793617, + "learning_rate": 0.00018571042801206176, + "loss": 0.3262, + "step": 8113 + }, + { + "epoch": 0.6573233959818535, + "grad_norm": 0.02609996497631073, + "learning_rate": 0.00018570592735946712, + "loss": 0.3018, + "step": 8114 + }, + { + "epoch": 0.6574044069993519, + "grad_norm": 0.03328759968280792, + "learning_rate": 0.0001857014267068725, + "loss": 0.3557, + "step": 8115 + }, + { + "epoch": 0.6574854180168503, + "grad_norm": 0.0315190814435482, + "learning_rate": 0.0001856969260542779, + "loss": 0.3634, + "step": 8116 + }, + { + "epoch": 0.6575664290343487, + "grad_norm": 0.03723594918847084, + "learning_rate": 0.00018569242540168325, + "loss": 0.3691, + "step": 8117 + }, + { + "epoch": 0.657647440051847, + "grad_norm": 0.03286529704928398, + "learning_rate": 0.00018568792474908864, + 
"loss": 0.3265, + "step": 8118 + }, + { + "epoch": 0.6577284510693454, + "grad_norm": 0.026647770777344704, + "learning_rate": 0.000185683424096494, + "loss": 0.3176, + "step": 8119 + }, + { + "epoch": 0.6578094620868438, + "grad_norm": 0.02880423702299595, + "learning_rate": 0.00018567892344389936, + "loss": 0.3488, + "step": 8120 + }, + { + "epoch": 0.6578904731043422, + "grad_norm": 0.03130809962749481, + "learning_rate": 0.00018567442279130477, + "loss": 0.3071, + "step": 8121 + }, + { + "epoch": 0.6579714841218406, + "grad_norm": 0.03377383574843407, + "learning_rate": 0.00018566992213871013, + "loss": 0.3177, + "step": 8122 + }, + { + "epoch": 0.6580524951393389, + "grad_norm": 0.03247836232185364, + "learning_rate": 0.0001856654214861155, + "loss": 0.31, + "step": 8123 + }, + { + "epoch": 0.6581335061568373, + "grad_norm": 0.031876444816589355, + "learning_rate": 0.00018566092083352088, + "loss": 0.3618, + "step": 8124 + }, + { + "epoch": 0.6582145171743357, + "grad_norm": 0.03695908561348915, + "learning_rate": 0.00018565642018092624, + "loss": 0.3867, + "step": 8125 + }, + { + "epoch": 0.6582955281918341, + "grad_norm": 0.032040636986494064, + "learning_rate": 0.0001856519195283316, + "loss": 0.3652, + "step": 8126 + }, + { + "epoch": 0.6583765392093325, + "grad_norm": 0.02838469110429287, + "learning_rate": 0.000185647418875737, + "loss": 0.3076, + "step": 8127 + }, + { + "epoch": 0.6584575502268308, + "grad_norm": 0.030958110466599464, + "learning_rate": 0.00018564291822314237, + "loss": 0.3494, + "step": 8128 + }, + { + "epoch": 0.6585385612443292, + "grad_norm": 0.0332995280623436, + "learning_rate": 0.00018563841757054773, + "loss": 0.3514, + "step": 8129 + }, + { + "epoch": 0.6586195722618277, + "grad_norm": 0.03220449760556221, + "learning_rate": 0.00018563391691795312, + "loss": 0.379, + "step": 8130 + }, + { + "epoch": 0.658700583279326, + "grad_norm": 0.03334110975265503, + "learning_rate": 0.00018562941626535848, + "loss": 0.3494, + "step": 8131 + }, + { + "epoch": 0.6587815942968244, + "grad_norm": 0.0317775122821331, + "learning_rate": 0.00018562491561276384, + "loss": 0.3264, + "step": 8132 + }, + { + "epoch": 0.6588626053143227, + "grad_norm": 0.03261169418692589, + "learning_rate": 0.00018562041496016925, + "loss": 0.3918, + "step": 8133 + }, + { + "epoch": 0.6589436163318211, + "grad_norm": 0.033578574657440186, + "learning_rate": 0.00018561591430757461, + "loss": 0.3693, + "step": 8134 + }, + { + "epoch": 0.6590246273493195, + "grad_norm": 0.02868012897670269, + "learning_rate": 0.00018561141365497998, + "loss": 0.3405, + "step": 8135 + }, + { + "epoch": 0.6591056383668179, + "grad_norm": 0.03563758358359337, + "learning_rate": 0.00018560691300238536, + "loss": 0.3384, + "step": 8136 + }, + { + "epoch": 0.6591866493843163, + "grad_norm": 0.030504295602440834, + "learning_rate": 0.00018560241234979072, + "loss": 0.3529, + "step": 8137 + }, + { + "epoch": 0.6592676604018146, + "grad_norm": 0.030688825994729996, + "learning_rate": 0.00018559791169719608, + "loss": 0.3393, + "step": 8138 + }, + { + "epoch": 0.6593486714193131, + "grad_norm": 0.02721015177667141, + "learning_rate": 0.0001855934110446015, + "loss": 0.3117, + "step": 8139 + }, + { + "epoch": 0.6594296824368114, + "grad_norm": 0.0309552401304245, + "learning_rate": 0.00018558891039200686, + "loss": 0.3516, + "step": 8140 + }, + { + "epoch": 0.6595106934543098, + "grad_norm": 0.03438500314950943, + "learning_rate": 0.00018558440973941222, + "loss": 0.2714, + "step": 8141 + }, + { + "epoch": 
0.6595917044718081, + "grad_norm": 0.0357445664703846, + "learning_rate": 0.0001855799090868176, + "loss": 0.3761, + "step": 8142 + }, + { + "epoch": 0.6596727154893065, + "grad_norm": 0.031231852248311043, + "learning_rate": 0.00018557540843422296, + "loss": 0.3256, + "step": 8143 + }, + { + "epoch": 0.659753726506805, + "grad_norm": 0.03136920928955078, + "learning_rate": 0.00018557090778162835, + "loss": 0.3326, + "step": 8144 + }, + { + "epoch": 0.6598347375243033, + "grad_norm": 0.030037716031074524, + "learning_rate": 0.00018556640712903374, + "loss": 0.3229, + "step": 8145 + }, + { + "epoch": 0.6599157485418017, + "grad_norm": 0.031204765662550926, + "learning_rate": 0.0001855619064764391, + "loss": 0.3507, + "step": 8146 + }, + { + "epoch": 0.6599967595593, + "grad_norm": 0.030983639881014824, + "learning_rate": 0.00018555740582384446, + "loss": 0.3427, + "step": 8147 + }, + { + "epoch": 0.6600777705767984, + "grad_norm": 0.03538487106561661, + "learning_rate": 0.00018555290517124985, + "loss": 0.3719, + "step": 8148 + }, + { + "epoch": 0.6601587815942969, + "grad_norm": 0.036955807358026505, + "learning_rate": 0.0001855484045186552, + "loss": 0.3721, + "step": 8149 + }, + { + "epoch": 0.6602397926117952, + "grad_norm": 0.03064347244799137, + "learning_rate": 0.0001855439038660606, + "loss": 0.3213, + "step": 8150 + }, + { + "epoch": 0.6603208036292936, + "grad_norm": 0.036659225821495056, + "learning_rate": 0.00018553940321346598, + "loss": 0.3554, + "step": 8151 + }, + { + "epoch": 0.6604018146467919, + "grad_norm": 0.0313510037958622, + "learning_rate": 0.00018553490256087134, + "loss": 0.3794, + "step": 8152 + }, + { + "epoch": 0.6604828256642904, + "grad_norm": 0.03193637356162071, + "learning_rate": 0.0001855304019082767, + "loss": 0.3221, + "step": 8153 + }, + { + "epoch": 0.6605638366817888, + "grad_norm": 0.031372442841529846, + "learning_rate": 0.0001855259012556821, + "loss": 0.2984, + "step": 8154 + }, + { + "epoch": 0.6606448476992871, + "grad_norm": 0.034660547971725464, + "learning_rate": 0.00018552140060308745, + "loss": 0.3699, + "step": 8155 + }, + { + "epoch": 0.6607258587167855, + "grad_norm": 0.03529341146349907, + "learning_rate": 0.00018551689995049283, + "loss": 0.3348, + "step": 8156 + }, + { + "epoch": 0.6608068697342838, + "grad_norm": 0.02971823327243328, + "learning_rate": 0.00018551239929789822, + "loss": 0.3282, + "step": 8157 + }, + { + "epoch": 0.6608878807517823, + "grad_norm": 0.03183675929903984, + "learning_rate": 0.00018550789864530358, + "loss": 0.3902, + "step": 8158 + }, + { + "epoch": 0.6609688917692806, + "grad_norm": 0.0275438129901886, + "learning_rate": 0.00018550339799270894, + "loss": 0.3099, + "step": 8159 + }, + { + "epoch": 0.661049902786779, + "grad_norm": 0.036063600331544876, + "learning_rate": 0.00018549889734011433, + "loss": 0.4058, + "step": 8160 + }, + { + "epoch": 0.6611309138042774, + "grad_norm": 0.028650248423218727, + "learning_rate": 0.0001854943966875197, + "loss": 0.3305, + "step": 8161 + }, + { + "epoch": 0.6612119248217757, + "grad_norm": 0.03210330381989479, + "learning_rate": 0.00018548989603492508, + "loss": 0.3486, + "step": 8162 + }, + { + "epoch": 0.6612929358392742, + "grad_norm": 0.02795662172138691, + "learning_rate": 0.00018548539538233046, + "loss": 0.3106, + "step": 8163 + }, + { + "epoch": 0.6613739468567725, + "grad_norm": 0.03249041736125946, + "learning_rate": 0.00018548089472973582, + "loss": 0.3766, + "step": 8164 + }, + { + "epoch": 0.6614549578742709, + "grad_norm": 0.0343630425632, + 
"learning_rate": 0.00018547639407714118, + "loss": 0.3537, + "step": 8165 + }, + { + "epoch": 0.6615359688917692, + "grad_norm": 0.030740728601813316, + "learning_rate": 0.00018547189342454657, + "loss": 0.3559, + "step": 8166 + }, + { + "epoch": 0.6616169799092677, + "grad_norm": 0.029429970309138298, + "learning_rate": 0.00018546739277195193, + "loss": 0.3559, + "step": 8167 + }, + { + "epoch": 0.6616979909267661, + "grad_norm": 0.03364703431725502, + "learning_rate": 0.00018546289211935732, + "loss": 0.3855, + "step": 8168 + }, + { + "epoch": 0.6617790019442644, + "grad_norm": 0.028997313231229782, + "learning_rate": 0.0001854583914667627, + "loss": 0.3238, + "step": 8169 + }, + { + "epoch": 0.6618600129617628, + "grad_norm": 0.027220774441957474, + "learning_rate": 0.00018545389081416806, + "loss": 0.2873, + "step": 8170 + }, + { + "epoch": 0.6619410239792611, + "grad_norm": 0.036889396607875824, + "learning_rate": 0.00018544939016157342, + "loss": 0.3443, + "step": 8171 + }, + { + "epoch": 0.6620220349967596, + "grad_norm": 0.03279150277376175, + "learning_rate": 0.0001854448895089788, + "loss": 0.3325, + "step": 8172 + }, + { + "epoch": 0.662103046014258, + "grad_norm": 0.029903072863817215, + "learning_rate": 0.0001854403888563842, + "loss": 0.3398, + "step": 8173 + }, + { + "epoch": 0.6621840570317563, + "grad_norm": 0.03309585154056549, + "learning_rate": 0.00018543588820378956, + "loss": 0.3411, + "step": 8174 + }, + { + "epoch": 0.6622650680492547, + "grad_norm": 0.027920817956328392, + "learning_rate": 0.00018543138755119495, + "loss": 0.3086, + "step": 8175 + }, + { + "epoch": 0.662346079066753, + "grad_norm": 0.03215247392654419, + "learning_rate": 0.0001854268868986003, + "loss": 0.3609, + "step": 8176 + }, + { + "epoch": 0.6624270900842515, + "grad_norm": 0.032950956374406815, + "learning_rate": 0.00018542238624600567, + "loss": 0.3389, + "step": 8177 + }, + { + "epoch": 0.6625081011017498, + "grad_norm": 0.03412731736898422, + "learning_rate": 0.00018541788559341105, + "loss": 0.3539, + "step": 8178 + }, + { + "epoch": 0.6625891121192482, + "grad_norm": 0.029734715819358826, + "learning_rate": 0.00018541338494081644, + "loss": 0.3468, + "step": 8179 + }, + { + "epoch": 0.6626701231367466, + "grad_norm": 0.0334324948489666, + "learning_rate": 0.0001854088842882218, + "loss": 0.3548, + "step": 8180 + }, + { + "epoch": 0.662751134154245, + "grad_norm": 0.029621506109833717, + "learning_rate": 0.0001854043836356272, + "loss": 0.3672, + "step": 8181 + }, + { + "epoch": 0.6628321451717434, + "grad_norm": 0.033380456268787384, + "learning_rate": 0.00018539988298303255, + "loss": 0.3399, + "step": 8182 + }, + { + "epoch": 0.6629131561892417, + "grad_norm": 0.032133206725120544, + "learning_rate": 0.0001853953823304379, + "loss": 0.4174, + "step": 8183 + }, + { + "epoch": 0.6629941672067401, + "grad_norm": 0.03078743815422058, + "learning_rate": 0.0001853908816778433, + "loss": 0.3531, + "step": 8184 + }, + { + "epoch": 0.6630751782242384, + "grad_norm": 0.03292972594499588, + "learning_rate": 0.00018538638102524868, + "loss": 0.3256, + "step": 8185 + }, + { + "epoch": 0.6631561892417369, + "grad_norm": 0.03613421693444252, + "learning_rate": 0.00018538188037265404, + "loss": 0.3266, + "step": 8186 + }, + { + "epoch": 0.6632372002592353, + "grad_norm": 0.030362753197550774, + "learning_rate": 0.00018537737972005943, + "loss": 0.3285, + "step": 8187 + }, + { + "epoch": 0.6633182112767336, + "grad_norm": 0.03654602915048599, + "learning_rate": 0.0001853728790674648, + "loss": 0.3572, 
+ "step": 8188 + }, + { + "epoch": 0.663399222294232, + "grad_norm": 0.028821973130106926, + "learning_rate": 0.00018536837841487015, + "loss": 0.3269, + "step": 8189 + }, + { + "epoch": 0.6634802333117304, + "grad_norm": 0.03146751970052719, + "learning_rate": 0.00018536387776227554, + "loss": 0.3292, + "step": 8190 + }, + { + "epoch": 0.6635612443292288, + "grad_norm": 0.026077449321746826, + "learning_rate": 0.00018535937710968092, + "loss": 0.3013, + "step": 8191 + }, + { + "epoch": 0.6636422553467272, + "grad_norm": 0.02873549982905388, + "learning_rate": 0.00018535487645708628, + "loss": 0.3095, + "step": 8192 + }, + { + "epoch": 0.6637232663642255, + "grad_norm": 0.03518807142972946, + "learning_rate": 0.00018535037580449167, + "loss": 0.3682, + "step": 8193 + }, + { + "epoch": 0.6638042773817239, + "grad_norm": 0.02819172665476799, + "learning_rate": 0.00018534587515189703, + "loss": 0.3173, + "step": 8194 + }, + { + "epoch": 0.6638852883992223, + "grad_norm": 0.0349283330142498, + "learning_rate": 0.0001853413744993024, + "loss": 0.3611, + "step": 8195 + }, + { + "epoch": 0.6639662994167207, + "grad_norm": 0.03043026104569435, + "learning_rate": 0.0001853368738467078, + "loss": 0.3297, + "step": 8196 + }, + { + "epoch": 0.664047310434219, + "grad_norm": 0.03040027804672718, + "learning_rate": 0.00018533237319411317, + "loss": 0.3362, + "step": 8197 + }, + { + "epoch": 0.6641283214517174, + "grad_norm": 0.030086055397987366, + "learning_rate": 0.00018532787254151853, + "loss": 0.3285, + "step": 8198 + }, + { + "epoch": 0.6642093324692158, + "grad_norm": 0.03521738573908806, + "learning_rate": 0.0001853233718889239, + "loss": 0.3297, + "step": 8199 + }, + { + "epoch": 0.6642903434867142, + "grad_norm": 0.029127212241292, + "learning_rate": 0.00018531887123632927, + "loss": 0.2985, + "step": 8200 + }, + { + "epoch": 0.6643713545042126, + "grad_norm": 0.031439512968063354, + "learning_rate": 0.00018531437058373463, + "loss": 0.3424, + "step": 8201 + }, + { + "epoch": 0.6644523655217109, + "grad_norm": 0.028876887634396553, + "learning_rate": 0.00018530986993114005, + "loss": 0.3458, + "step": 8202 + }, + { + "epoch": 0.6645333765392093, + "grad_norm": 0.027514170855283737, + "learning_rate": 0.0001853053692785454, + "loss": 0.3167, + "step": 8203 + }, + { + "epoch": 0.6646143875567078, + "grad_norm": 0.031886518001556396, + "learning_rate": 0.00018530086862595077, + "loss": 0.3359, + "step": 8204 + }, + { + "epoch": 0.6646953985742061, + "grad_norm": 0.032745279371738434, + "learning_rate": 0.00018529636797335615, + "loss": 0.3414, + "step": 8205 + }, + { + "epoch": 0.6647764095917045, + "grad_norm": 0.03147970885038376, + "learning_rate": 0.00018529186732076151, + "loss": 0.3224, + "step": 8206 + }, + { + "epoch": 0.6648574206092028, + "grad_norm": 0.030254552140831947, + "learning_rate": 0.00018528736666816687, + "loss": 0.355, + "step": 8207 + }, + { + "epoch": 0.6649384316267012, + "grad_norm": 0.03358277678489685, + "learning_rate": 0.0001852828660155723, + "loss": 0.4045, + "step": 8208 + }, + { + "epoch": 0.6650194426441997, + "grad_norm": 0.028438769280910492, + "learning_rate": 0.00018527836536297765, + "loss": 0.3173, + "step": 8209 + }, + { + "epoch": 0.665100453661698, + "grad_norm": 0.03194088488817215, + "learning_rate": 0.000185273864710383, + "loss": 0.3578, + "step": 8210 + }, + { + "epoch": 0.6651814646791964, + "grad_norm": 0.034081485122442245, + "learning_rate": 0.0001852693640577884, + "loss": 0.3131, + "step": 8211 + }, + { + "epoch": 0.6652624756966947, + 
"grad_norm": 0.027561400085687637, + "learning_rate": 0.00018526486340519376, + "loss": 0.3138, + "step": 8212 + }, + { + "epoch": 0.6653434867141931, + "grad_norm": 0.030994897708296776, + "learning_rate": 0.00018526036275259912, + "loss": 0.3247, + "step": 8213 + }, + { + "epoch": 0.6654244977316915, + "grad_norm": 0.031196480616927147, + "learning_rate": 0.00018525586210000453, + "loss": 0.3538, + "step": 8214 + }, + { + "epoch": 0.6655055087491899, + "grad_norm": 0.030344417318701744, + "learning_rate": 0.0001852513614474099, + "loss": 0.334, + "step": 8215 + }, + { + "epoch": 0.6655865197666883, + "grad_norm": 0.03306497633457184, + "learning_rate": 0.00018524686079481525, + "loss": 0.3763, + "step": 8216 + }, + { + "epoch": 0.6656675307841866, + "grad_norm": 0.03008945658802986, + "learning_rate": 0.00018524236014222064, + "loss": 0.3221, + "step": 8217 + }, + { + "epoch": 0.6657485418016851, + "grad_norm": 0.03661755844950676, + "learning_rate": 0.000185237859489626, + "loss": 0.367, + "step": 8218 + }, + { + "epoch": 0.6658295528191834, + "grad_norm": 0.031068217009305954, + "learning_rate": 0.00018523335883703136, + "loss": 0.3296, + "step": 8219 + }, + { + "epoch": 0.6659105638366818, + "grad_norm": 0.032785579562187195, + "learning_rate": 0.00018522885818443677, + "loss": 0.3532, + "step": 8220 + }, + { + "epoch": 0.6659915748541801, + "grad_norm": 0.035862911492586136, + "learning_rate": 0.00018522435753184213, + "loss": 0.3114, + "step": 8221 + }, + { + "epoch": 0.6660725858716785, + "grad_norm": 0.030160140246152878, + "learning_rate": 0.0001852198568792475, + "loss": 0.3415, + "step": 8222 + }, + { + "epoch": 0.666153596889177, + "grad_norm": 0.029396269470453262, + "learning_rate": 0.00018521535622665288, + "loss": 0.3189, + "step": 8223 + }, + { + "epoch": 0.6662346079066753, + "grad_norm": 0.02842693403363228, + "learning_rate": 0.00018521085557405824, + "loss": 0.3364, + "step": 8224 + }, + { + "epoch": 0.6663156189241737, + "grad_norm": 0.03332171589136124, + "learning_rate": 0.00018520635492146363, + "loss": 0.3405, + "step": 8225 + }, + { + "epoch": 0.666396629941672, + "grad_norm": 0.030777165666222572, + "learning_rate": 0.00018520185426886901, + "loss": 0.3495, + "step": 8226 + }, + { + "epoch": 0.6664776409591704, + "grad_norm": 0.031115055084228516, + "learning_rate": 0.00018519735361627437, + "loss": 0.2807, + "step": 8227 + }, + { + "epoch": 0.6665586519766689, + "grad_norm": 0.029170986264944077, + "learning_rate": 0.00018519285296367973, + "loss": 0.3295, + "step": 8228 + }, + { + "epoch": 0.6666396629941672, + "grad_norm": 0.030726497992873192, + "learning_rate": 0.00018518835231108512, + "loss": 0.3405, + "step": 8229 + }, + { + "epoch": 0.6667206740116656, + "grad_norm": 0.031001776456832886, + "learning_rate": 0.00018518385165849048, + "loss": 0.3136, + "step": 8230 + }, + { + "epoch": 0.6668016850291639, + "grad_norm": 0.03498663753271103, + "learning_rate": 0.00018517935100589587, + "loss": 0.3649, + "step": 8231 + }, + { + "epoch": 0.6668826960466624, + "grad_norm": 0.029227035120129585, + "learning_rate": 0.00018517485035330126, + "loss": 0.3397, + "step": 8232 + }, + { + "epoch": 0.6669637070641607, + "grad_norm": 0.034446362406015396, + "learning_rate": 0.00018517034970070662, + "loss": 0.3718, + "step": 8233 + }, + { + "epoch": 0.6670447180816591, + "grad_norm": 0.03119107149541378, + "learning_rate": 0.00018516584904811198, + "loss": 0.3188, + "step": 8234 + }, + { + "epoch": 0.6671257290991575, + "grad_norm": 0.036731645464897156, + 
"learning_rate": 0.00018516134839551736, + "loss": 0.3484, + "step": 8235 + }, + { + "epoch": 0.6672067401166558, + "grad_norm": 0.031873635947704315, + "learning_rate": 0.00018515684774292272, + "loss": 0.356, + "step": 8236 + }, + { + "epoch": 0.6672877511341543, + "grad_norm": 0.03385842964053154, + "learning_rate": 0.0001851523470903281, + "loss": 0.3968, + "step": 8237 + }, + { + "epoch": 0.6673687621516526, + "grad_norm": 0.03451808542013168, + "learning_rate": 0.0001851478464377335, + "loss": 0.347, + "step": 8238 + }, + { + "epoch": 0.667449773169151, + "grad_norm": 0.03075961396098137, + "learning_rate": 0.00018514334578513886, + "loss": 0.3388, + "step": 8239 + }, + { + "epoch": 0.6675307841866494, + "grad_norm": 0.028750881552696228, + "learning_rate": 0.00018513884513254422, + "loss": 0.3216, + "step": 8240 + }, + { + "epoch": 0.6676117952041478, + "grad_norm": 0.028194140642881393, + "learning_rate": 0.0001851343444799496, + "loss": 0.3215, + "step": 8241 + }, + { + "epoch": 0.6676928062216462, + "grad_norm": 0.028071299195289612, + "learning_rate": 0.00018512984382735496, + "loss": 0.3221, + "step": 8242 + }, + { + "epoch": 0.6677738172391445, + "grad_norm": 0.03381093963980675, + "learning_rate": 0.00018512534317476035, + "loss": 0.3777, + "step": 8243 + }, + { + "epoch": 0.6678548282566429, + "grad_norm": 0.03282523527741432, + "learning_rate": 0.00018512084252216574, + "loss": 0.3485, + "step": 8244 + }, + { + "epoch": 0.6679358392741412, + "grad_norm": 0.030734248459339142, + "learning_rate": 0.0001851163418695711, + "loss": 0.3075, + "step": 8245 + }, + { + "epoch": 0.6680168502916397, + "grad_norm": 0.031647514551877975, + "learning_rate": 0.00018511184121697646, + "loss": 0.3644, + "step": 8246 + }, + { + "epoch": 0.6680978613091381, + "grad_norm": 0.03423544764518738, + "learning_rate": 0.00018510734056438185, + "loss": 0.3365, + "step": 8247 + }, + { + "epoch": 0.6681788723266364, + "grad_norm": 0.03142764791846275, + "learning_rate": 0.0001851028399117872, + "loss": 0.3527, + "step": 8248 + }, + { + "epoch": 0.6682598833441348, + "grad_norm": 0.03567124903202057, + "learning_rate": 0.0001850983392591926, + "loss": 0.323, + "step": 8249 + }, + { + "epoch": 0.6683408943616331, + "grad_norm": 0.033891335129737854, + "learning_rate": 0.00018509383860659798, + "loss": 0.3747, + "step": 8250 + }, + { + "epoch": 0.6684219053791316, + "grad_norm": 0.02757412940263748, + "learning_rate": 0.00018508933795400334, + "loss": 0.3186, + "step": 8251 + }, + { + "epoch": 0.66850291639663, + "grad_norm": 0.030500739812850952, + "learning_rate": 0.0001850848373014087, + "loss": 0.3387, + "step": 8252 + }, + { + "epoch": 0.6685839274141283, + "grad_norm": 0.03229675069451332, + "learning_rate": 0.0001850803366488141, + "loss": 0.333, + "step": 8253 + }, + { + "epoch": 0.6686649384316267, + "grad_norm": 0.031556446105241776, + "learning_rate": 0.00018507583599621947, + "loss": 0.3234, + "step": 8254 + }, + { + "epoch": 0.6687459494491251, + "grad_norm": 0.03277357295155525, + "learning_rate": 0.00018507133534362483, + "loss": 0.3282, + "step": 8255 + }, + { + "epoch": 0.6688269604666235, + "grad_norm": 0.030226023867726326, + "learning_rate": 0.00018506683469103022, + "loss": 0.3975, + "step": 8256 + }, + { + "epoch": 0.6689079714841218, + "grad_norm": 0.03333505243062973, + "learning_rate": 0.00018506233403843558, + "loss": 0.3657, + "step": 8257 + }, + { + "epoch": 0.6689889825016202, + "grad_norm": 0.030513428151607513, + "learning_rate": 0.00018505783338584094, + "loss": 0.2682, + 
"step": 8258 + }, + { + "epoch": 0.6690699935191186, + "grad_norm": 0.02662210538983345, + "learning_rate": 0.00018505333273324633, + "loss": 0.3014, + "step": 8259 + }, + { + "epoch": 0.669151004536617, + "grad_norm": 0.033432211726903915, + "learning_rate": 0.00018504883208065172, + "loss": 0.3837, + "step": 8260 + }, + { + "epoch": 0.6692320155541154, + "grad_norm": 0.028870683163404465, + "learning_rate": 0.00018504433142805708, + "loss": 0.3765, + "step": 8261 + }, + { + "epoch": 0.6693130265716137, + "grad_norm": 0.03151526302099228, + "learning_rate": 0.00018503983077546246, + "loss": 0.3777, + "step": 8262 + }, + { + "epoch": 0.6693940375891121, + "grad_norm": 0.029690612107515335, + "learning_rate": 0.00018503533012286782, + "loss": 0.3425, + "step": 8263 + }, + { + "epoch": 0.6694750486066104, + "grad_norm": 0.03496913984417915, + "learning_rate": 0.00018503082947027318, + "loss": 0.3598, + "step": 8264 + }, + { + "epoch": 0.6695560596241089, + "grad_norm": 0.03096066601574421, + "learning_rate": 0.00018502632881767857, + "loss": 0.33, + "step": 8265 + }, + { + "epoch": 0.6696370706416073, + "grad_norm": 0.032060880213975906, + "learning_rate": 0.00018502182816508396, + "loss": 0.3653, + "step": 8266 + }, + { + "epoch": 0.6697180816591056, + "grad_norm": 0.031135201454162598, + "learning_rate": 0.00018501732751248932, + "loss": 0.3451, + "step": 8267 + }, + { + "epoch": 0.669799092676604, + "grad_norm": 0.03153124824166298, + "learning_rate": 0.0001850128268598947, + "loss": 0.3761, + "step": 8268 + }, + { + "epoch": 0.6698801036941024, + "grad_norm": 0.03391728922724724, + "learning_rate": 0.00018500832620730006, + "loss": 0.3559, + "step": 8269 + }, + { + "epoch": 0.6699611147116008, + "grad_norm": 0.033236369490623474, + "learning_rate": 0.00018500382555470543, + "loss": 0.3162, + "step": 8270 + }, + { + "epoch": 0.6700421257290992, + "grad_norm": 0.041579004377126694, + "learning_rate": 0.0001849993249021108, + "loss": 0.4065, + "step": 8271 + }, + { + "epoch": 0.6701231367465975, + "grad_norm": 0.02709999307990074, + "learning_rate": 0.0001849948242495162, + "loss": 0.338, + "step": 8272 + }, + { + "epoch": 0.6702041477640959, + "grad_norm": 0.033023085445165634, + "learning_rate": 0.00018499032359692156, + "loss": 0.3523, + "step": 8273 + }, + { + "epoch": 0.6702851587815943, + "grad_norm": 0.026633795350790024, + "learning_rate": 0.00018498582294432695, + "loss": 0.2917, + "step": 8274 + }, + { + "epoch": 0.6703661697990927, + "grad_norm": 0.030886471271514893, + "learning_rate": 0.0001849813222917323, + "loss": 0.3144, + "step": 8275 + }, + { + "epoch": 0.670447180816591, + "grad_norm": 0.02772926539182663, + "learning_rate": 0.00018497682163913767, + "loss": 0.311, + "step": 8276 + }, + { + "epoch": 0.6705281918340894, + "grad_norm": 0.03047368861734867, + "learning_rate": 0.00018497232098654308, + "loss": 0.3737, + "step": 8277 + }, + { + "epoch": 0.6706092028515879, + "grad_norm": 0.030798176303505898, + "learning_rate": 0.00018496782033394844, + "loss": 0.3343, + "step": 8278 + }, + { + "epoch": 0.6706902138690862, + "grad_norm": 0.03067445196211338, + "learning_rate": 0.0001849633196813538, + "loss": 0.3801, + "step": 8279 + }, + { + "epoch": 0.6707712248865846, + "grad_norm": 0.03117203153669834, + "learning_rate": 0.0001849588190287592, + "loss": 0.3651, + "step": 8280 + }, + { + "epoch": 0.6708522359040829, + "grad_norm": 0.031838901340961456, + "learning_rate": 0.00018495431837616455, + "loss": 0.3705, + "step": 8281 + }, + { + "epoch": 0.6709332469215813, + 
"grad_norm": 0.031705450266599655, + "learning_rate": 0.0001849498177235699, + "loss": 0.399, + "step": 8282 + }, + { + "epoch": 0.6710142579390798, + "grad_norm": 0.030952507629990578, + "learning_rate": 0.00018494531707097532, + "loss": 0.3718, + "step": 8283 + }, + { + "epoch": 0.6710952689565781, + "grad_norm": 0.03643830493092537, + "learning_rate": 0.00018494081641838068, + "loss": 0.3151, + "step": 8284 + }, + { + "epoch": 0.6711762799740765, + "grad_norm": 0.030212044715881348, + "learning_rate": 0.00018493631576578604, + "loss": 0.3665, + "step": 8285 + }, + { + "epoch": 0.6712572909915748, + "grad_norm": 0.0286360252648592, + "learning_rate": 0.00018493181511319143, + "loss": 0.2955, + "step": 8286 + }, + { + "epoch": 0.6713383020090732, + "grad_norm": 0.0322282612323761, + "learning_rate": 0.0001849273144605968, + "loss": 0.3847, + "step": 8287 + }, + { + "epoch": 0.6714193130265717, + "grad_norm": 0.03221067413687706, + "learning_rate": 0.00018492281380800215, + "loss": 0.3419, + "step": 8288 + }, + { + "epoch": 0.67150032404407, + "grad_norm": 0.03293781727552414, + "learning_rate": 0.00018491831315540756, + "loss": 0.3746, + "step": 8289 + }, + { + "epoch": 0.6715813350615684, + "grad_norm": 0.03397240489721298, + "learning_rate": 0.00018491381250281292, + "loss": 0.326, + "step": 8290 + }, + { + "epoch": 0.6716623460790667, + "grad_norm": 0.027765223756432533, + "learning_rate": 0.00018490931185021828, + "loss": 0.2949, + "step": 8291 + }, + { + "epoch": 0.6717433570965652, + "grad_norm": 0.030565418303012848, + "learning_rate": 0.00018490481119762367, + "loss": 0.3155, + "step": 8292 + }, + { + "epoch": 0.6718243681140635, + "grad_norm": 0.02972385101020336, + "learning_rate": 0.00018490031054502903, + "loss": 0.3434, + "step": 8293 + }, + { + "epoch": 0.6719053791315619, + "grad_norm": 0.03217386454343796, + "learning_rate": 0.0001848958098924344, + "loss": 0.3643, + "step": 8294 + }, + { + "epoch": 0.6719863901490603, + "grad_norm": 0.03395373001694679, + "learning_rate": 0.0001848913092398398, + "loss": 0.3811, + "step": 8295 + }, + { + "epoch": 0.6720674011665586, + "grad_norm": 0.0331270694732666, + "learning_rate": 0.00018488680858724517, + "loss": 0.312, + "step": 8296 + }, + { + "epoch": 0.6721484121840571, + "grad_norm": 0.032217953354120255, + "learning_rate": 0.00018488230793465053, + "loss": 0.3978, + "step": 8297 + }, + { + "epoch": 0.6722294232015554, + "grad_norm": 0.03126753121614456, + "learning_rate": 0.0001848778072820559, + "loss": 0.3478, + "step": 8298 + }, + { + "epoch": 0.6723104342190538, + "grad_norm": 0.030235685408115387, + "learning_rate": 0.00018487330662946127, + "loss": 0.3219, + "step": 8299 + }, + { + "epoch": 0.6723914452365521, + "grad_norm": 0.032127559185028076, + "learning_rate": 0.00018486880597686663, + "loss": 0.3244, + "step": 8300 + }, + { + "epoch": 0.6724724562540505, + "grad_norm": 0.029689908027648926, + "learning_rate": 0.00018486430532427205, + "loss": 0.3231, + "step": 8301 + }, + { + "epoch": 0.672553467271549, + "grad_norm": 0.032204680144786835, + "learning_rate": 0.0001848598046716774, + "loss": 0.3704, + "step": 8302 + }, + { + "epoch": 0.6726344782890473, + "grad_norm": 0.033696506172418594, + "learning_rate": 0.00018485530401908277, + "loss": 0.3683, + "step": 8303 + }, + { + "epoch": 0.6727154893065457, + "grad_norm": 0.030479637905955315, + "learning_rate": 0.00018485080336648815, + "loss": 0.3205, + "step": 8304 + }, + { + "epoch": 0.672796500324044, + "grad_norm": 0.02611781284213066, + "learning_rate": 
0.00018484630271389351, + "loss": 0.298, + "step": 8305 + }, + { + "epoch": 0.6728775113415425, + "grad_norm": 0.03304387629032135, + "learning_rate": 0.0001848418020612989, + "loss": 0.3671, + "step": 8306 + }, + { + "epoch": 0.6729585223590409, + "grad_norm": 0.030400149524211884, + "learning_rate": 0.0001848373014087043, + "loss": 0.3655, + "step": 8307 + }, + { + "epoch": 0.6730395333765392, + "grad_norm": 0.035930197685956955, + "learning_rate": 0.00018483280075610965, + "loss": 0.3748, + "step": 8308 + }, + { + "epoch": 0.6731205443940376, + "grad_norm": 0.028552010655403137, + "learning_rate": 0.000184828300103515, + "loss": 0.3544, + "step": 8309 + }, + { + "epoch": 0.6732015554115359, + "grad_norm": 0.031653061509132385, + "learning_rate": 0.0001848237994509204, + "loss": 0.3633, + "step": 8310 + }, + { + "epoch": 0.6732825664290344, + "grad_norm": 0.029851393774151802, + "learning_rate": 0.00018481929879832576, + "loss": 0.3013, + "step": 8311 + }, + { + "epoch": 0.6733635774465327, + "grad_norm": 0.034095648676157, + "learning_rate": 0.00018481479814573114, + "loss": 0.3643, + "step": 8312 + }, + { + "epoch": 0.6734445884640311, + "grad_norm": 0.029064346104860306, + "learning_rate": 0.00018481029749313653, + "loss": 0.3238, + "step": 8313 + }, + { + "epoch": 0.6735255994815295, + "grad_norm": 0.03601328283548355, + "learning_rate": 0.0001848057968405419, + "loss": 0.3288, + "step": 8314 + }, + { + "epoch": 0.6736066104990278, + "grad_norm": 0.033166661858558655, + "learning_rate": 0.00018480129618794725, + "loss": 0.3303, + "step": 8315 + }, + { + "epoch": 0.6736876215165263, + "grad_norm": 0.03911403939127922, + "learning_rate": 0.00018479679553535264, + "loss": 0.3487, + "step": 8316 + }, + { + "epoch": 0.6737686325340246, + "grad_norm": 0.03566893935203552, + "learning_rate": 0.000184792294882758, + "loss": 0.3874, + "step": 8317 + }, + { + "epoch": 0.673849643551523, + "grad_norm": 0.03268427029252052, + "learning_rate": 0.00018478779423016339, + "loss": 0.3461, + "step": 8318 + }, + { + "epoch": 0.6739306545690213, + "grad_norm": 0.03220139816403389, + "learning_rate": 0.00018478329357756877, + "loss": 0.3425, + "step": 8319 + }, + { + "epoch": 0.6740116655865198, + "grad_norm": 0.033977918326854706, + "learning_rate": 0.00018477879292497413, + "loss": 0.3653, + "step": 8320 + }, + { + "epoch": 0.6740926766040182, + "grad_norm": 0.0333586148917675, + "learning_rate": 0.0001847742922723795, + "loss": 0.3588, + "step": 8321 + }, + { + "epoch": 0.6741736876215165, + "grad_norm": 0.03474782779812813, + "learning_rate": 0.00018476979161978488, + "loss": 0.3246, + "step": 8322 + }, + { + "epoch": 0.6742546986390149, + "grad_norm": 0.03482431918382645, + "learning_rate": 0.00018476529096719024, + "loss": 0.3209, + "step": 8323 + }, + { + "epoch": 0.6743357096565132, + "grad_norm": 0.03185975179076195, + "learning_rate": 0.00018476079031459563, + "loss": 0.347, + "step": 8324 + }, + { + "epoch": 0.6744167206740117, + "grad_norm": 0.03250424191355705, + "learning_rate": 0.00018475628966200101, + "loss": 0.3224, + "step": 8325 + }, + { + "epoch": 0.6744977316915101, + "grad_norm": 0.03220516815781593, + "learning_rate": 0.00018475178900940637, + "loss": 0.3783, + "step": 8326 + }, + { + "epoch": 0.6745787427090084, + "grad_norm": 0.03147347643971443, + "learning_rate": 0.00018474728835681173, + "loss": 0.3235, + "step": 8327 + }, + { + "epoch": 0.6746597537265068, + "grad_norm": 0.03596566244959831, + "learning_rate": 0.00018474278770421712, + "loss": 0.3858, + "step": 8328 + }, + { 
+ "epoch": 0.6747407647440052, + "grad_norm": 0.03046397678554058, + "learning_rate": 0.0001847382870516225, + "loss": 0.3356, + "step": 8329 + }, + { + "epoch": 0.6748217757615036, + "grad_norm": 0.030585501343011856, + "learning_rate": 0.00018473378639902787, + "loss": 0.3516, + "step": 8330 + }, + { + "epoch": 0.674902786779002, + "grad_norm": 0.03178463876247406, + "learning_rate": 0.00018472928574643326, + "loss": 0.3332, + "step": 8331 + }, + { + "epoch": 0.6749837977965003, + "grad_norm": 0.03212091699242592, + "learning_rate": 0.00018472478509383862, + "loss": 0.3483, + "step": 8332 + }, + { + "epoch": 0.6750648088139987, + "grad_norm": 0.03113105148077011, + "learning_rate": 0.00018472028444124398, + "loss": 0.313, + "step": 8333 + }, + { + "epoch": 0.6751458198314971, + "grad_norm": 0.03049953654408455, + "learning_rate": 0.00018471578378864936, + "loss": 0.3194, + "step": 8334 + }, + { + "epoch": 0.6752268308489955, + "grad_norm": 0.03434020280838013, + "learning_rate": 0.00018471128313605475, + "loss": 0.3494, + "step": 8335 + }, + { + "epoch": 0.6753078418664938, + "grad_norm": 0.03446441516280174, + "learning_rate": 0.0001847067824834601, + "loss": 0.3756, + "step": 8336 + }, + { + "epoch": 0.6753888528839922, + "grad_norm": 0.02943461760878563, + "learning_rate": 0.0001847022818308655, + "loss": 0.3164, + "step": 8337 + }, + { + "epoch": 0.6754698639014906, + "grad_norm": 0.03737198933959007, + "learning_rate": 0.00018469778117827086, + "loss": 0.3463, + "step": 8338 + }, + { + "epoch": 0.675550874918989, + "grad_norm": 0.026089653372764587, + "learning_rate": 0.00018469328052567622, + "loss": 0.2937, + "step": 8339 + }, + { + "epoch": 0.6756318859364874, + "grad_norm": 0.03368382155895233, + "learning_rate": 0.0001846887798730816, + "loss": 0.3888, + "step": 8340 + }, + { + "epoch": 0.6757128969539857, + "grad_norm": 0.034756697714328766, + "learning_rate": 0.000184684279220487, + "loss": 0.3741, + "step": 8341 + }, + { + "epoch": 0.6757939079714841, + "grad_norm": 0.035592664033174515, + "learning_rate": 0.00018467977856789235, + "loss": 0.3442, + "step": 8342 + }, + { + "epoch": 0.6758749189889826, + "grad_norm": 0.028692577034235, + "learning_rate": 0.00018467527791529774, + "loss": 0.3846, + "step": 8343 + }, + { + "epoch": 0.6759559300064809, + "grad_norm": 0.03535209223628044, + "learning_rate": 0.0001846707772627031, + "loss": 0.3569, + "step": 8344 + }, + { + "epoch": 0.6760369410239793, + "grad_norm": 0.03144112229347229, + "learning_rate": 0.00018466627661010846, + "loss": 0.3458, + "step": 8345 + }, + { + "epoch": 0.6761179520414776, + "grad_norm": 0.03163455054163933, + "learning_rate": 0.00018466177595751385, + "loss": 0.3408, + "step": 8346 + }, + { + "epoch": 0.676198963058976, + "grad_norm": 0.03496427834033966, + "learning_rate": 0.00018465727530491923, + "loss": 0.3751, + "step": 8347 + }, + { + "epoch": 0.6762799740764744, + "grad_norm": 0.029395341873168945, + "learning_rate": 0.0001846527746523246, + "loss": 0.3297, + "step": 8348 + }, + { + "epoch": 0.6763609850939728, + "grad_norm": 0.03403710573911667, + "learning_rate": 0.00018464827399972998, + "loss": 0.3831, + "step": 8349 + }, + { + "epoch": 0.6764419961114712, + "grad_norm": 0.028789136558771133, + "learning_rate": 0.00018464377334713534, + "loss": 0.3725, + "step": 8350 + }, + { + "epoch": 0.6765230071289695, + "grad_norm": 0.03976523131132126, + "learning_rate": 0.0001846392726945407, + "loss": 0.4335, + "step": 8351 + }, + { + "epoch": 0.6766040181464679, + "grad_norm": 0.02864331193268299, 
+ "learning_rate": 0.0001846347720419461, + "loss": 0.3131, + "step": 8352 + }, + { + "epoch": 0.6766850291639663, + "grad_norm": 0.030334942042827606, + "learning_rate": 0.00018463027138935147, + "loss": 0.3114, + "step": 8353 + }, + { + "epoch": 0.6767660401814647, + "grad_norm": 0.03436267375946045, + "learning_rate": 0.00018462577073675683, + "loss": 0.3356, + "step": 8354 + }, + { + "epoch": 0.676847051198963, + "grad_norm": 0.032568447291851044, + "learning_rate": 0.00018462127008416222, + "loss": 0.3685, + "step": 8355 + }, + { + "epoch": 0.6769280622164614, + "grad_norm": 0.03440425917506218, + "learning_rate": 0.00018461676943156758, + "loss": 0.3552, + "step": 8356 + }, + { + "epoch": 0.6770090732339599, + "grad_norm": 0.03378186374902725, + "learning_rate": 0.00018461226877897294, + "loss": 0.3257, + "step": 8357 + }, + { + "epoch": 0.6770900842514582, + "grad_norm": 0.03300986438989639, + "learning_rate": 0.00018460776812637836, + "loss": 0.3587, + "step": 8358 + }, + { + "epoch": 0.6771710952689566, + "grad_norm": 0.03187122195959091, + "learning_rate": 0.00018460326747378372, + "loss": 0.3304, + "step": 8359 + }, + { + "epoch": 0.6772521062864549, + "grad_norm": 0.030382633209228516, + "learning_rate": 0.00018459876682118908, + "loss": 0.3617, + "step": 8360 + }, + { + "epoch": 0.6773331173039533, + "grad_norm": 0.030665893107652664, + "learning_rate": 0.00018459426616859446, + "loss": 0.3531, + "step": 8361 + }, + { + "epoch": 0.6774141283214518, + "grad_norm": 0.032478105276823044, + "learning_rate": 0.00018458976551599982, + "loss": 0.3567, + "step": 8362 + }, + { + "epoch": 0.6774951393389501, + "grad_norm": 0.031920671463012695, + "learning_rate": 0.00018458526486340518, + "loss": 0.3899, + "step": 8363 + }, + { + "epoch": 0.6775761503564485, + "grad_norm": 0.0325034074485302, + "learning_rate": 0.0001845807642108106, + "loss": 0.3315, + "step": 8364 + }, + { + "epoch": 0.6776571613739468, + "grad_norm": 0.028699731454253197, + "learning_rate": 0.00018457626355821596, + "loss": 0.2991, + "step": 8365 + }, + { + "epoch": 0.6777381723914452, + "grad_norm": 0.031557776033878326, + "learning_rate": 0.00018457176290562132, + "loss": 0.385, + "step": 8366 + }, + { + "epoch": 0.6778191834089436, + "grad_norm": 0.031747668981552124, + "learning_rate": 0.0001845672622530267, + "loss": 0.3662, + "step": 8367 + }, + { + "epoch": 0.677900194426442, + "grad_norm": 0.03107340633869171, + "learning_rate": 0.00018456276160043207, + "loss": 0.3672, + "step": 8368 + }, + { + "epoch": 0.6779812054439404, + "grad_norm": 0.029906675219535828, + "learning_rate": 0.00018455826094783743, + "loss": 0.3334, + "step": 8369 + }, + { + "epoch": 0.6780622164614387, + "grad_norm": 0.029467299580574036, + "learning_rate": 0.00018455376029524284, + "loss": 0.3618, + "step": 8370 + }, + { + "epoch": 0.6781432274789372, + "grad_norm": 0.027794750407338142, + "learning_rate": 0.0001845492596426482, + "loss": 0.2963, + "step": 8371 + }, + { + "epoch": 0.6782242384964355, + "grad_norm": 0.029678959399461746, + "learning_rate": 0.00018454475899005356, + "loss": 0.3364, + "step": 8372 + }, + { + "epoch": 0.6783052495139339, + "grad_norm": 0.029389753937721252, + "learning_rate": 0.00018454025833745895, + "loss": 0.3207, + "step": 8373 + }, + { + "epoch": 0.6783862605314323, + "grad_norm": 0.031182603910565376, + "learning_rate": 0.0001845357576848643, + "loss": 0.3015, + "step": 8374 + }, + { + "epoch": 0.6784672715489306, + "grad_norm": 0.0318613238632679, + "learning_rate": 0.00018453125703226967, + "loss": 
0.3365, + "step": 8375 + }, + { + "epoch": 0.6785482825664291, + "grad_norm": 0.03271263837814331, + "learning_rate": 0.00018452675637967508, + "loss": 0.382, + "step": 8376 + }, + { + "epoch": 0.6786292935839274, + "grad_norm": 0.03947818651795387, + "learning_rate": 0.00018452225572708044, + "loss": 0.3519, + "step": 8377 + }, + { + "epoch": 0.6787103046014258, + "grad_norm": 0.029119271785020828, + "learning_rate": 0.0001845177550744858, + "loss": 0.3464, + "step": 8378 + }, + { + "epoch": 0.6787913156189241, + "grad_norm": 0.026837065815925598, + "learning_rate": 0.0001845132544218912, + "loss": 0.3269, + "step": 8379 + }, + { + "epoch": 0.6788723266364226, + "grad_norm": 0.03194974735379219, + "learning_rate": 0.00018450875376929655, + "loss": 0.3412, + "step": 8380 + }, + { + "epoch": 0.678953337653921, + "grad_norm": 0.031717248260974884, + "learning_rate": 0.00018450425311670194, + "loss": 0.3267, + "step": 8381 + }, + { + "epoch": 0.6790343486714193, + "grad_norm": 0.03393028676509857, + "learning_rate": 0.00018449975246410732, + "loss": 0.3378, + "step": 8382 + }, + { + "epoch": 0.6791153596889177, + "grad_norm": 0.02875995635986328, + "learning_rate": 0.00018449525181151268, + "loss": 0.2964, + "step": 8383 + }, + { + "epoch": 0.679196370706416, + "grad_norm": 0.03428420424461365, + "learning_rate": 0.00018449075115891804, + "loss": 0.345, + "step": 8384 + }, + { + "epoch": 0.6792773817239145, + "grad_norm": 0.033055152744054794, + "learning_rate": 0.00018448625050632343, + "loss": 0.334, + "step": 8385 + }, + { + "epoch": 0.6793583927414129, + "grad_norm": 0.02540542371571064, + "learning_rate": 0.0001844817498537288, + "loss": 0.3016, + "step": 8386 + }, + { + "epoch": 0.6794394037589112, + "grad_norm": 0.027096103876829147, + "learning_rate": 0.00018447724920113418, + "loss": 0.3058, + "step": 8387 + }, + { + "epoch": 0.6795204147764096, + "grad_norm": 0.033959295600652695, + "learning_rate": 0.00018447274854853956, + "loss": 0.3308, + "step": 8388 + }, + { + "epoch": 0.6796014257939079, + "grad_norm": 0.037247296422719955, + "learning_rate": 0.00018446824789594492, + "loss": 0.4068, + "step": 8389 + }, + { + "epoch": 0.6796824368114064, + "grad_norm": 0.029535872861742973, + "learning_rate": 0.00018446374724335028, + "loss": 0.3042, + "step": 8390 + }, + { + "epoch": 0.6797634478289047, + "grad_norm": 0.030849486589431763, + "learning_rate": 0.00018445924659075567, + "loss": 0.3305, + "step": 8391 + }, + { + "epoch": 0.6798444588464031, + "grad_norm": 0.036509912461042404, + "learning_rate": 0.00018445474593816103, + "loss": 0.3492, + "step": 8392 + }, + { + "epoch": 0.6799254698639015, + "grad_norm": 0.036684028804302216, + "learning_rate": 0.00018445024528556642, + "loss": 0.4093, + "step": 8393 + }, + { + "epoch": 0.6800064808813999, + "grad_norm": 0.029528088867664337, + "learning_rate": 0.0001844457446329718, + "loss": 0.3526, + "step": 8394 + }, + { + "epoch": 0.6800874918988983, + "grad_norm": 0.029075436294078827, + "learning_rate": 0.00018444124398037717, + "loss": 0.3485, + "step": 8395 + }, + { + "epoch": 0.6801685029163966, + "grad_norm": 0.02978142723441124, + "learning_rate": 0.00018443674332778253, + "loss": 0.2908, + "step": 8396 + }, + { + "epoch": 0.680249513933895, + "grad_norm": 0.034966953098773956, + "learning_rate": 0.0001844322426751879, + "loss": 0.3829, + "step": 8397 + }, + { + "epoch": 0.6803305249513933, + "grad_norm": 0.03151193633675575, + "learning_rate": 0.00018442774202259327, + "loss": 0.3413, + "step": 8398 + }, + { + "epoch": 
0.6804115359688918, + "grad_norm": 0.0359109602868557, + "learning_rate": 0.00018442324136999866, + "loss": 0.3949, + "step": 8399 + }, + { + "epoch": 0.6804925469863902, + "grad_norm": 0.03561898693442345, + "learning_rate": 0.00018441874071740405, + "loss": 0.3485, + "step": 8400 + }, + { + "epoch": 0.6805735580038885, + "grad_norm": 0.027923088520765305, + "learning_rate": 0.0001844142400648094, + "loss": 0.3571, + "step": 8401 + }, + { + "epoch": 0.6806545690213869, + "grad_norm": 0.03303304314613342, + "learning_rate": 0.00018440973941221477, + "loss": 0.3832, + "step": 8402 + }, + { + "epoch": 0.6807355800388852, + "grad_norm": 0.028605502098798752, + "learning_rate": 0.00018440523875962015, + "loss": 0.3472, + "step": 8403 + }, + { + "epoch": 0.6808165910563837, + "grad_norm": 0.03851553052663803, + "learning_rate": 0.00018440073810702551, + "loss": 0.3674, + "step": 8404 + }, + { + "epoch": 0.6808976020738821, + "grad_norm": 0.03283935785293579, + "learning_rate": 0.0001843962374544309, + "loss": 0.3333, + "step": 8405 + }, + { + "epoch": 0.6809786130913804, + "grad_norm": 0.03001883625984192, + "learning_rate": 0.0001843917368018363, + "loss": 0.3492, + "step": 8406 + }, + { + "epoch": 0.6810596241088788, + "grad_norm": 0.03826170414686203, + "learning_rate": 0.00018438723614924165, + "loss": 0.3458, + "step": 8407 + }, + { + "epoch": 0.6811406351263772, + "grad_norm": 0.033792685717344284, + "learning_rate": 0.000184382735496647, + "loss": 0.3444, + "step": 8408 + }, + { + "epoch": 0.6812216461438756, + "grad_norm": 0.032256945967674255, + "learning_rate": 0.0001843782348440524, + "loss": 0.3727, + "step": 8409 + }, + { + "epoch": 0.681302657161374, + "grad_norm": 0.032892536371946335, + "learning_rate": 0.00018437373419145778, + "loss": 0.3385, + "step": 8410 + }, + { + "epoch": 0.6813836681788723, + "grad_norm": 0.03750142082571983, + "learning_rate": 0.00018436923353886314, + "loss": 0.373, + "step": 8411 + }, + { + "epoch": 0.6814646791963707, + "grad_norm": 0.03716817870736122, + "learning_rate": 0.00018436473288626853, + "loss": 0.3735, + "step": 8412 + }, + { + "epoch": 0.6815456902138691, + "grad_norm": 0.02930922619998455, + "learning_rate": 0.0001843602322336739, + "loss": 0.3086, + "step": 8413 + }, + { + "epoch": 0.6816267012313675, + "grad_norm": 0.03268182650208473, + "learning_rate": 0.00018435573158107925, + "loss": 0.349, + "step": 8414 + }, + { + "epoch": 0.6817077122488658, + "grad_norm": 0.03252032399177551, + "learning_rate": 0.00018435123092848464, + "loss": 0.3054, + "step": 8415 + }, + { + "epoch": 0.6817887232663642, + "grad_norm": 0.03493228182196617, + "learning_rate": 0.00018434673027589003, + "loss": 0.3688, + "step": 8416 + }, + { + "epoch": 0.6818697342838627, + "grad_norm": 0.033659689128398895, + "learning_rate": 0.00018434222962329539, + "loss": 0.3069, + "step": 8417 + }, + { + "epoch": 0.681950745301361, + "grad_norm": 0.0300731398165226, + "learning_rate": 0.00018433772897070077, + "loss": 0.2972, + "step": 8418 + }, + { + "epoch": 0.6820317563188594, + "grad_norm": 0.035994041711091995, + "learning_rate": 0.00018433322831810613, + "loss": 0.324, + "step": 8419 + }, + { + "epoch": 0.6821127673363577, + "grad_norm": 0.03320501744747162, + "learning_rate": 0.0001843287276655115, + "loss": 0.3607, + "step": 8420 + }, + { + "epoch": 0.6821937783538561, + "grad_norm": 0.02966352365911007, + "learning_rate": 0.00018432422701291688, + "loss": 0.3363, + "step": 8421 + }, + { + "epoch": 0.6822747893713546, + "grad_norm": 0.034290432929992676, + 
"learning_rate": 0.00018431972636032227, + "loss": 0.351, + "step": 8422 + }, + { + "epoch": 0.6823558003888529, + "grad_norm": 0.031076081097126007, + "learning_rate": 0.00018431522570772763, + "loss": 0.3386, + "step": 8423 + }, + { + "epoch": 0.6824368114063513, + "grad_norm": 0.04111815616488457, + "learning_rate": 0.00018431072505513301, + "loss": 0.3651, + "step": 8424 + }, + { + "epoch": 0.6825178224238496, + "grad_norm": 0.0348864309489727, + "learning_rate": 0.00018430622440253837, + "loss": 0.4214, + "step": 8425 + }, + { + "epoch": 0.682598833441348, + "grad_norm": 0.030616192147135735, + "learning_rate": 0.00018430172374994373, + "loss": 0.3693, + "step": 8426 + }, + { + "epoch": 0.6826798444588464, + "grad_norm": 0.030621033161878586, + "learning_rate": 0.00018429722309734912, + "loss": 0.3228, + "step": 8427 + }, + { + "epoch": 0.6827608554763448, + "grad_norm": 0.032740604132413864, + "learning_rate": 0.0001842927224447545, + "loss": 0.3253, + "step": 8428 + }, + { + "epoch": 0.6828418664938432, + "grad_norm": 0.032946109771728516, + "learning_rate": 0.00018428822179215987, + "loss": 0.3695, + "step": 8429 + }, + { + "epoch": 0.6829228775113415, + "grad_norm": 0.03482314199209213, + "learning_rate": 0.00018428372113956526, + "loss": 0.3573, + "step": 8430 + }, + { + "epoch": 0.68300388852884, + "grad_norm": 0.03567078337073326, + "learning_rate": 0.00018427922048697062, + "loss": 0.3514, + "step": 8431 + }, + { + "epoch": 0.6830848995463383, + "grad_norm": 0.040290411561727524, + "learning_rate": 0.00018427471983437598, + "loss": 0.3861, + "step": 8432 + }, + { + "epoch": 0.6831659105638367, + "grad_norm": 0.039115242660045624, + "learning_rate": 0.00018427021918178136, + "loss": 0.3529, + "step": 8433 + }, + { + "epoch": 0.683246921581335, + "grad_norm": 0.03234872967004776, + "learning_rate": 0.00018426571852918675, + "loss": 0.3497, + "step": 8434 + }, + { + "epoch": 0.6833279325988334, + "grad_norm": 0.029645444825291634, + "learning_rate": 0.0001842612178765921, + "loss": 0.3215, + "step": 8435 + }, + { + "epoch": 0.6834089436163319, + "grad_norm": 0.03823678195476532, + "learning_rate": 0.0001842567172239975, + "loss": 0.3486, + "step": 8436 + }, + { + "epoch": 0.6834899546338302, + "grad_norm": 0.033858608454465866, + "learning_rate": 0.00018425221657140286, + "loss": 0.3571, + "step": 8437 + }, + { + "epoch": 0.6835709656513286, + "grad_norm": 0.029137929901480675, + "learning_rate": 0.00018424771591880822, + "loss": 0.3132, + "step": 8438 + }, + { + "epoch": 0.6836519766688269, + "grad_norm": 0.030431587249040604, + "learning_rate": 0.00018424321526621363, + "loss": 0.3677, + "step": 8439 + }, + { + "epoch": 0.6837329876863253, + "grad_norm": 0.03393100947141647, + "learning_rate": 0.000184238714613619, + "loss": 0.3538, + "step": 8440 + }, + { + "epoch": 0.6838139987038238, + "grad_norm": 0.033213693648576736, + "learning_rate": 0.00018423421396102435, + "loss": 0.3299, + "step": 8441 + }, + { + "epoch": 0.6838950097213221, + "grad_norm": 0.03575395420193672, + "learning_rate": 0.00018422971330842974, + "loss": 0.3084, + "step": 8442 + }, + { + "epoch": 0.6839760207388205, + "grad_norm": 0.033678457140922546, + "learning_rate": 0.0001842252126558351, + "loss": 0.3386, + "step": 8443 + }, + { + "epoch": 0.6840570317563188, + "grad_norm": 0.030759470537304878, + "learning_rate": 0.00018422071200324046, + "loss": 0.3371, + "step": 8444 + }, + { + "epoch": 0.6841380427738173, + "grad_norm": 0.033590931445360184, + "learning_rate": 0.00018421621135064587, + "loss": 
0.3781, + "step": 8445 + }, + { + "epoch": 0.6842190537913156, + "grad_norm": 0.03153382986783981, + "learning_rate": 0.00018421171069805123, + "loss": 0.3207, + "step": 8446 + }, + { + "epoch": 0.684300064808814, + "grad_norm": 0.03318261355161667, + "learning_rate": 0.0001842072100454566, + "loss": 0.3428, + "step": 8447 + }, + { + "epoch": 0.6843810758263124, + "grad_norm": 0.03142682835459709, + "learning_rate": 0.00018420270939286198, + "loss": 0.3678, + "step": 8448 + }, + { + "epoch": 0.6844620868438107, + "grad_norm": 0.03981975093483925, + "learning_rate": 0.00018419820874026734, + "loss": 0.3752, + "step": 8449 + }, + { + "epoch": 0.6845430978613092, + "grad_norm": 0.027852242812514305, + "learning_rate": 0.0001841937080876727, + "loss": 0.3113, + "step": 8450 + }, + { + "epoch": 0.6846241088788075, + "grad_norm": 0.03192290663719177, + "learning_rate": 0.00018418920743507812, + "loss": 0.3583, + "step": 8451 + }, + { + "epoch": 0.6847051198963059, + "grad_norm": 0.03634326532483101, + "learning_rate": 0.00018418470678248348, + "loss": 0.3149, + "step": 8452 + }, + { + "epoch": 0.6847861309138042, + "grad_norm": 0.03109004907310009, + "learning_rate": 0.00018418020612988884, + "loss": 0.3133, + "step": 8453 + }, + { + "epoch": 0.6848671419313026, + "grad_norm": 0.03400532156229019, + "learning_rate": 0.00018417570547729422, + "loss": 0.3254, + "step": 8454 + }, + { + "epoch": 0.6849481529488011, + "grad_norm": 0.029681047424674034, + "learning_rate": 0.00018417120482469958, + "loss": 0.3686, + "step": 8455 + }, + { + "epoch": 0.6850291639662994, + "grad_norm": 0.029582427814602852, + "learning_rate": 0.00018416670417210494, + "loss": 0.308, + "step": 8456 + }, + { + "epoch": 0.6851101749837978, + "grad_norm": 0.030207743868231773, + "learning_rate": 0.00018416220351951036, + "loss": 0.3149, + "step": 8457 + }, + { + "epoch": 0.6851911860012961, + "grad_norm": 0.02820783481001854, + "learning_rate": 0.00018415770286691572, + "loss": 0.2731, + "step": 8458 + }, + { + "epoch": 0.6852721970187946, + "grad_norm": 0.030728664249181747, + "learning_rate": 0.00018415320221432108, + "loss": 0.3038, + "step": 8459 + }, + { + "epoch": 0.685353208036293, + "grad_norm": 0.03349175676703453, + "learning_rate": 0.00018414870156172646, + "loss": 0.39, + "step": 8460 + }, + { + "epoch": 0.6854342190537913, + "grad_norm": 0.03066192753612995, + "learning_rate": 0.00018414420090913182, + "loss": 0.3464, + "step": 8461 + }, + { + "epoch": 0.6855152300712897, + "grad_norm": 0.030717190355062485, + "learning_rate": 0.0001841397002565372, + "loss": 0.3424, + "step": 8462 + }, + { + "epoch": 0.685596241088788, + "grad_norm": 0.028941521421074867, + "learning_rate": 0.0001841351996039426, + "loss": 0.3517, + "step": 8463 + }, + { + "epoch": 0.6856772521062865, + "grad_norm": 0.031130949035286903, + "learning_rate": 0.00018413069895134796, + "loss": 0.3115, + "step": 8464 + }, + { + "epoch": 0.6857582631237849, + "grad_norm": 0.029154805466532707, + "learning_rate": 0.00018412619829875332, + "loss": 0.3008, + "step": 8465 + }, + { + "epoch": 0.6858392741412832, + "grad_norm": 0.03485098108649254, + "learning_rate": 0.0001841216976461587, + "loss": 0.3451, + "step": 8466 + }, + { + "epoch": 0.6859202851587816, + "grad_norm": 0.030184783041477203, + "learning_rate": 0.00018411719699356407, + "loss": 0.3554, + "step": 8467 + }, + { + "epoch": 0.68600129617628, + "grad_norm": 0.02812212146818638, + "learning_rate": 0.00018411269634096945, + "loss": 0.33, + "step": 8468 + }, + { + "epoch": 0.6860823071937784, 
+ "grad_norm": 0.0370754636824131, + "learning_rate": 0.00018410819568837484, + "loss": 0.3428, + "step": 8469 + }, + { + "epoch": 0.6861633182112767, + "grad_norm": 0.032499175518751144, + "learning_rate": 0.0001841036950357802, + "loss": 0.3263, + "step": 8470 + }, + { + "epoch": 0.6862443292287751, + "grad_norm": 0.028472188860177994, + "learning_rate": 0.00018409919438318556, + "loss": 0.3552, + "step": 8471 + }, + { + "epoch": 0.6863253402462735, + "grad_norm": 0.0350298248231411, + "learning_rate": 0.00018409469373059095, + "loss": 0.3764, + "step": 8472 + }, + { + "epoch": 0.6864063512637719, + "grad_norm": 0.0346914604306221, + "learning_rate": 0.0001840901930779963, + "loss": 0.3325, + "step": 8473 + }, + { + "epoch": 0.6864873622812703, + "grad_norm": 0.031484149396419525, + "learning_rate": 0.0001840856924254017, + "loss": 0.3588, + "step": 8474 + }, + { + "epoch": 0.6865683732987686, + "grad_norm": 0.03626801818609238, + "learning_rate": 0.00018408119177280708, + "loss": 0.3355, + "step": 8475 + }, + { + "epoch": 0.686649384316267, + "grad_norm": 0.0336102731525898, + "learning_rate": 0.00018407669112021244, + "loss": 0.408, + "step": 8476 + }, + { + "epoch": 0.6867303953337653, + "grad_norm": 0.0309014730155468, + "learning_rate": 0.0001840721904676178, + "loss": 0.3403, + "step": 8477 + }, + { + "epoch": 0.6868114063512638, + "grad_norm": 0.035775888711214066, + "learning_rate": 0.0001840676898150232, + "loss": 0.3254, + "step": 8478 + }, + { + "epoch": 0.6868924173687622, + "grad_norm": 0.03134749084711075, + "learning_rate": 0.00018406318916242855, + "loss": 0.3703, + "step": 8479 + }, + { + "epoch": 0.6869734283862605, + "grad_norm": 0.03138517588376999, + "learning_rate": 0.00018405868850983394, + "loss": 0.3614, + "step": 8480 + }, + { + "epoch": 0.6870544394037589, + "grad_norm": 0.030462482944130898, + "learning_rate": 0.00018405418785723932, + "loss": 0.2867, + "step": 8481 + }, + { + "epoch": 0.6871354504212573, + "grad_norm": 0.029852738603949547, + "learning_rate": 0.00018404968720464468, + "loss": 0.312, + "step": 8482 + }, + { + "epoch": 0.6872164614387557, + "grad_norm": 0.033276185393333435, + "learning_rate": 0.00018404518655205004, + "loss": 0.3144, + "step": 8483 + }, + { + "epoch": 0.687297472456254, + "grad_norm": 0.03628126531839371, + "learning_rate": 0.00018404068589945543, + "loss": 0.3954, + "step": 8484 + }, + { + "epoch": 0.6873784834737524, + "grad_norm": 0.03308907151222229, + "learning_rate": 0.0001840361852468608, + "loss": 0.3578, + "step": 8485 + }, + { + "epoch": 0.6874594944912508, + "grad_norm": 0.029518576338887215, + "learning_rate": 0.00018403168459426618, + "loss": 0.3736, + "step": 8486 + }, + { + "epoch": 0.6875405055087492, + "grad_norm": 0.031972698867321014, + "learning_rate": 0.00018402718394167156, + "loss": 0.3777, + "step": 8487 + }, + { + "epoch": 0.6876215165262476, + "grad_norm": 0.036778468638658524, + "learning_rate": 0.00018402268328907692, + "loss": 0.409, + "step": 8488 + }, + { + "epoch": 0.687702527543746, + "grad_norm": 0.033233892172575, + "learning_rate": 0.00018401818263648228, + "loss": 0.4315, + "step": 8489 + }, + { + "epoch": 0.6877835385612443, + "grad_norm": 0.033360805362463, + "learning_rate": 0.00018401368198388767, + "loss": 0.3999, + "step": 8490 + }, + { + "epoch": 0.6878645495787427, + "grad_norm": 0.035713110119104385, + "learning_rate": 0.00018400918133129306, + "loss": 0.3866, + "step": 8491 + }, + { + "epoch": 0.6879455605962411, + "grad_norm": 0.029528697952628136, + "learning_rate": 
0.00018400468067869842, + "loss": 0.2948, + "step": 8492 + }, + { + "epoch": 0.6880265716137395, + "grad_norm": 0.03073948062956333, + "learning_rate": 0.0001840001800261038, + "loss": 0.4063, + "step": 8493 + }, + { + "epoch": 0.6881075826312378, + "grad_norm": 0.033134352415800095, + "learning_rate": 0.00018399567937350917, + "loss": 0.3205, + "step": 8494 + }, + { + "epoch": 0.6881885936487362, + "grad_norm": 0.03518693521618843, + "learning_rate": 0.00018399117872091453, + "loss": 0.4068, + "step": 8495 + }, + { + "epoch": 0.6882696046662347, + "grad_norm": 0.030890299007296562, + "learning_rate": 0.00018398667806831991, + "loss": 0.3325, + "step": 8496 + }, + { + "epoch": 0.688350615683733, + "grad_norm": 0.03359353542327881, + "learning_rate": 0.0001839821774157253, + "loss": 0.3043, + "step": 8497 + }, + { + "epoch": 0.6884316267012314, + "grad_norm": 0.028195347636938095, + "learning_rate": 0.00018397767676313066, + "loss": 0.2898, + "step": 8498 + }, + { + "epoch": 0.6885126377187297, + "grad_norm": 0.031246118247509003, + "learning_rate": 0.00018397317611053605, + "loss": 0.373, + "step": 8499 + }, + { + "epoch": 0.6885936487362281, + "grad_norm": 0.035277605056762695, + "learning_rate": 0.0001839686754579414, + "loss": 0.329, + "step": 8500 + }, + { + "epoch": 0.6886746597537265, + "grad_norm": 0.030519159510731697, + "learning_rate": 0.00018396417480534677, + "loss": 0.3366, + "step": 8501 + }, + { + "epoch": 0.6887556707712249, + "grad_norm": 0.03541049733757973, + "learning_rate": 0.00018395967415275216, + "loss": 0.3487, + "step": 8502 + }, + { + "epoch": 0.6888366817887233, + "grad_norm": 0.03722112998366356, + "learning_rate": 0.00018395517350015754, + "loss": 0.358, + "step": 8503 + }, + { + "epoch": 0.6889176928062216, + "grad_norm": 0.028488805517554283, + "learning_rate": 0.0001839506728475629, + "loss": 0.289, + "step": 8504 + }, + { + "epoch": 0.68899870382372, + "grad_norm": 0.030860982835292816, + "learning_rate": 0.0001839461721949683, + "loss": 0.353, + "step": 8505 + }, + { + "epoch": 0.6890797148412184, + "grad_norm": 0.0341181643307209, + "learning_rate": 0.00018394167154237365, + "loss": 0.3434, + "step": 8506 + }, + { + "epoch": 0.6891607258587168, + "grad_norm": 0.030584245920181274, + "learning_rate": 0.000183937170889779, + "loss": 0.3334, + "step": 8507 + }, + { + "epoch": 0.6892417368762151, + "grad_norm": 0.027770398184657097, + "learning_rate": 0.0001839326702371844, + "loss": 0.3031, + "step": 8508 + }, + { + "epoch": 0.6893227478937135, + "grad_norm": 0.030859703198075294, + "learning_rate": 0.00018392816958458978, + "loss": 0.3319, + "step": 8509 + }, + { + "epoch": 0.689403758911212, + "grad_norm": 0.03638660907745361, + "learning_rate": 0.00018392366893199514, + "loss": 0.4019, + "step": 8510 + }, + { + "epoch": 0.6894847699287103, + "grad_norm": 0.028055081143975258, + "learning_rate": 0.00018391916827940053, + "loss": 0.3395, + "step": 8511 + }, + { + "epoch": 0.6895657809462087, + "grad_norm": 0.031972650438547134, + "learning_rate": 0.0001839146676268059, + "loss": 0.3476, + "step": 8512 + }, + { + "epoch": 0.689646791963707, + "grad_norm": 0.03158603608608246, + "learning_rate": 0.00018391016697421125, + "loss": 0.3258, + "step": 8513 + }, + { + "epoch": 0.6897278029812054, + "grad_norm": 0.03295688331127167, + "learning_rate": 0.00018390566632161667, + "loss": 0.3516, + "step": 8514 + }, + { + "epoch": 0.6898088139987039, + "grad_norm": 0.03116844780743122, + "learning_rate": 0.00018390116566902203, + "loss": 0.3671, + "step": 8515 + }, + { 
+ "epoch": 0.6898898250162022, + "grad_norm": 0.02899783104658127, + "learning_rate": 0.00018389666501642739, + "loss": 0.3082, + "step": 8516 + }, + { + "epoch": 0.6899708360337006, + "grad_norm": 0.030172912403941154, + "learning_rate": 0.00018389216436383277, + "loss": 0.3322, + "step": 8517 + }, + { + "epoch": 0.6900518470511989, + "grad_norm": 0.030508432537317276, + "learning_rate": 0.00018388766371123813, + "loss": 0.3723, + "step": 8518 + }, + { + "epoch": 0.6901328580686974, + "grad_norm": 0.032022520899772644, + "learning_rate": 0.0001838831630586435, + "loss": 0.3673, + "step": 8519 + }, + { + "epoch": 0.6902138690861958, + "grad_norm": 0.0314970389008522, + "learning_rate": 0.0001838786624060489, + "loss": 0.3344, + "step": 8520 + }, + { + "epoch": 0.6902948801036941, + "grad_norm": 0.027550017461180687, + "learning_rate": 0.00018387416175345427, + "loss": 0.2939, + "step": 8521 + }, + { + "epoch": 0.6903758911211925, + "grad_norm": 0.035641346126794815, + "learning_rate": 0.00018386966110085963, + "loss": 0.3855, + "step": 8522 + }, + { + "epoch": 0.6904569021386908, + "grad_norm": 0.03057931363582611, + "learning_rate": 0.00018386516044826501, + "loss": 0.3432, + "step": 8523 + }, + { + "epoch": 0.6905379131561893, + "grad_norm": 0.03537129983305931, + "learning_rate": 0.00018386065979567037, + "loss": 0.3375, + "step": 8524 + }, + { + "epoch": 0.6906189241736876, + "grad_norm": 0.03352898359298706, + "learning_rate": 0.00018385615914307573, + "loss": 0.3071, + "step": 8525 + }, + { + "epoch": 0.690699935191186, + "grad_norm": 0.028735622763633728, + "learning_rate": 0.00018385165849048115, + "loss": 0.3063, + "step": 8526 + }, + { + "epoch": 0.6907809462086844, + "grad_norm": 0.027944544330239296, + "learning_rate": 0.0001838471578378865, + "loss": 0.3412, + "step": 8527 + }, + { + "epoch": 0.6908619572261827, + "grad_norm": 0.026370206847786903, + "learning_rate": 0.00018384265718529187, + "loss": 0.3208, + "step": 8528 + }, + { + "epoch": 0.6909429682436812, + "grad_norm": 0.035538118332624435, + "learning_rate": 0.00018383815653269726, + "loss": 0.3605, + "step": 8529 + }, + { + "epoch": 0.6910239792611795, + "grad_norm": 0.030197838321328163, + "learning_rate": 0.00018383365588010262, + "loss": 0.36, + "step": 8530 + }, + { + "epoch": 0.6911049902786779, + "grad_norm": 0.026188869029283524, + "learning_rate": 0.00018382915522750798, + "loss": 0.297, + "step": 8531 + }, + { + "epoch": 0.6911860012961762, + "grad_norm": 0.032884180545806885, + "learning_rate": 0.0001838246545749134, + "loss": 0.3684, + "step": 8532 + }, + { + "epoch": 0.6912670123136747, + "grad_norm": 0.027330821380019188, + "learning_rate": 0.00018382015392231875, + "loss": 0.2778, + "step": 8533 + }, + { + "epoch": 0.6913480233311731, + "grad_norm": 0.03134836256504059, + "learning_rate": 0.0001838156532697241, + "loss": 0.3519, + "step": 8534 + }, + { + "epoch": 0.6914290343486714, + "grad_norm": 0.03451668098568916, + "learning_rate": 0.0001838111526171295, + "loss": 0.408, + "step": 8535 + }, + { + "epoch": 0.6915100453661698, + "grad_norm": 0.037174392491579056, + "learning_rate": 0.00018380665196453486, + "loss": 0.3307, + "step": 8536 + }, + { + "epoch": 0.6915910563836681, + "grad_norm": 0.030446277931332588, + "learning_rate": 0.00018380215131194022, + "loss": 0.3321, + "step": 8537 + }, + { + "epoch": 0.6916720674011666, + "grad_norm": 0.031310826539993286, + "learning_rate": 0.00018379765065934563, + "loss": 0.3356, + "step": 8538 + }, + { + "epoch": 0.691753078418665, + "grad_norm": 
0.036476824432611465, + "learning_rate": 0.000183793150006751, + "loss": 0.3875, + "step": 8539 + }, + { + "epoch": 0.6918340894361633, + "grad_norm": 0.027982639148831367, + "learning_rate": 0.00018378864935415635, + "loss": 0.3096, + "step": 8540 + }, + { + "epoch": 0.6919151004536617, + "grad_norm": 0.02935284748673439, + "learning_rate": 0.00018378414870156174, + "loss": 0.3416, + "step": 8541 + }, + { + "epoch": 0.69199611147116, + "grad_norm": 0.03340727463364601, + "learning_rate": 0.0001837796480489671, + "loss": 0.3885, + "step": 8542 + }, + { + "epoch": 0.6920771224886585, + "grad_norm": 0.030995968729257584, + "learning_rate": 0.0001837751473963725, + "loss": 0.3612, + "step": 8543 + }, + { + "epoch": 0.6921581335061568, + "grad_norm": 0.030012935400009155, + "learning_rate": 0.00018377064674377787, + "loss": 0.3599, + "step": 8544 + }, + { + "epoch": 0.6922391445236552, + "grad_norm": 0.033467911183834076, + "learning_rate": 0.00018376614609118323, + "loss": 0.3333, + "step": 8545 + }, + { + "epoch": 0.6923201555411536, + "grad_norm": 0.03058590553700924, + "learning_rate": 0.0001837616454385886, + "loss": 0.3182, + "step": 8546 + }, + { + "epoch": 0.692401166558652, + "grad_norm": 0.028448492288589478, + "learning_rate": 0.00018375714478599398, + "loss": 0.3114, + "step": 8547 + }, + { + "epoch": 0.6924821775761504, + "grad_norm": 0.04189586266875267, + "learning_rate": 0.00018375264413339934, + "loss": 0.378, + "step": 8548 + }, + { + "epoch": 0.6925631885936487, + "grad_norm": 0.032369352877140045, + "learning_rate": 0.00018374814348080473, + "loss": 0.3884, + "step": 8549 + }, + { + "epoch": 0.6926441996111471, + "grad_norm": 0.02977856807410717, + "learning_rate": 0.00018374364282821012, + "loss": 0.3471, + "step": 8550 + }, + { + "epoch": 0.6927252106286454, + "grad_norm": 0.0340099073946476, + "learning_rate": 0.00018373914217561548, + "loss": 0.3973, + "step": 8551 + }, + { + "epoch": 0.6928062216461439, + "grad_norm": 0.026700271293520927, + "learning_rate": 0.00018373464152302084, + "loss": 0.3254, + "step": 8552 + }, + { + "epoch": 0.6928872326636423, + "grad_norm": 0.03509001433849335, + "learning_rate": 0.00018373014087042622, + "loss": 0.3497, + "step": 8553 + }, + { + "epoch": 0.6929682436811406, + "grad_norm": 0.03207175433635712, + "learning_rate": 0.00018372564021783158, + "loss": 0.3451, + "step": 8554 + }, + { + "epoch": 0.693049254698639, + "grad_norm": 0.032138824462890625, + "learning_rate": 0.00018372113956523697, + "loss": 0.3531, + "step": 8555 + }, + { + "epoch": 0.6931302657161373, + "grad_norm": 0.033208414912223816, + "learning_rate": 0.00018371663891264236, + "loss": 0.348, + "step": 8556 + }, + { + "epoch": 0.6932112767336358, + "grad_norm": 0.03227568790316582, + "learning_rate": 0.00018371213826004772, + "loss": 0.3497, + "step": 8557 + }, + { + "epoch": 0.6932922877511342, + "grad_norm": 0.03891857713460922, + "learning_rate": 0.00018370763760745308, + "loss": 0.3617, + "step": 8558 + }, + { + "epoch": 0.6933732987686325, + "grad_norm": 0.031320828944444656, + "learning_rate": 0.00018370313695485846, + "loss": 0.3262, + "step": 8559 + }, + { + "epoch": 0.6934543097861309, + "grad_norm": 0.03595731779932976, + "learning_rate": 0.00018369863630226382, + "loss": 0.3573, + "step": 8560 + }, + { + "epoch": 0.6935353208036293, + "grad_norm": 0.0347558967769146, + "learning_rate": 0.0001836941356496692, + "loss": 0.3578, + "step": 8561 + }, + { + "epoch": 0.6936163318211277, + "grad_norm": 0.02745945379137993, + "learning_rate": 
0.0001836896349970746, + "loss": 0.3237, + "step": 8562 + }, + { + "epoch": 0.693697342838626, + "grad_norm": 0.031957414001226425, + "learning_rate": 0.00018368513434447996, + "loss": 0.3488, + "step": 8563 + }, + { + "epoch": 0.6937783538561244, + "grad_norm": 0.03347107395529747, + "learning_rate": 0.00018368063369188532, + "loss": 0.4016, + "step": 8564 + }, + { + "epoch": 0.6938593648736228, + "grad_norm": 0.03430968523025513, + "learning_rate": 0.0001836761330392907, + "loss": 0.3403, + "step": 8565 + }, + { + "epoch": 0.6939403758911212, + "grad_norm": 0.03299398720264435, + "learning_rate": 0.00018367163238669607, + "loss": 0.3568, + "step": 8566 + }, + { + "epoch": 0.6940213869086196, + "grad_norm": 0.03152777627110481, + "learning_rate": 0.00018366713173410145, + "loss": 0.3231, + "step": 8567 + }, + { + "epoch": 0.6941023979261179, + "grad_norm": 0.0307911466807127, + "learning_rate": 0.00018366263108150684, + "loss": 0.3596, + "step": 8568 + }, + { + "epoch": 0.6941834089436163, + "grad_norm": 0.029098432511091232, + "learning_rate": 0.0001836581304289122, + "loss": 0.3651, + "step": 8569 + }, + { + "epoch": 0.6942644199611148, + "grad_norm": 0.02972140535712242, + "learning_rate": 0.00018365362977631756, + "loss": 0.3355, + "step": 8570 + }, + { + "epoch": 0.6943454309786131, + "grad_norm": 0.027684060856699944, + "learning_rate": 0.00018364912912372295, + "loss": 0.3261, + "step": 8571 + }, + { + "epoch": 0.6944264419961115, + "grad_norm": 0.034204453229904175, + "learning_rate": 0.00018364462847112833, + "loss": 0.4012, + "step": 8572 + }, + { + "epoch": 0.6945074530136098, + "grad_norm": 0.03092847764492035, + "learning_rate": 0.0001836401278185337, + "loss": 0.3891, + "step": 8573 + }, + { + "epoch": 0.6945884640311082, + "grad_norm": 0.0319262258708477, + "learning_rate": 0.00018363562716593908, + "loss": 0.3884, + "step": 8574 + }, + { + "epoch": 0.6946694750486067, + "grad_norm": 0.030752461403608322, + "learning_rate": 0.00018363112651334444, + "loss": 0.3154, + "step": 8575 + }, + { + "epoch": 0.694750486066105, + "grad_norm": 0.03386900946497917, + "learning_rate": 0.0001836266258607498, + "loss": 0.3706, + "step": 8576 + }, + { + "epoch": 0.6948314970836034, + "grad_norm": 0.034681811928749084, + "learning_rate": 0.0001836221252081552, + "loss": 0.3592, + "step": 8577 + }, + { + "epoch": 0.6949125081011017, + "grad_norm": 0.027019526809453964, + "learning_rate": 0.00018361762455556058, + "loss": 0.3067, + "step": 8578 + }, + { + "epoch": 0.6949935191186001, + "grad_norm": 0.031102588400244713, + "learning_rate": 0.00018361312390296594, + "loss": 0.3359, + "step": 8579 + }, + { + "epoch": 0.6950745301360985, + "grad_norm": 0.030616452917456627, + "learning_rate": 0.00018360862325037132, + "loss": 0.3852, + "step": 8580 + }, + { + "epoch": 0.6951555411535969, + "grad_norm": 0.044032927602529526, + "learning_rate": 0.00018360412259777668, + "loss": 0.3905, + "step": 8581 + }, + { + "epoch": 0.6952365521710953, + "grad_norm": 0.029699688777327538, + "learning_rate": 0.00018359962194518204, + "loss": 0.3783, + "step": 8582 + }, + { + "epoch": 0.6953175631885936, + "grad_norm": 0.026341859251260757, + "learning_rate": 0.00018359512129258743, + "loss": 0.2967, + "step": 8583 + }, + { + "epoch": 0.6953985742060921, + "grad_norm": 0.035602036863565445, + "learning_rate": 0.00018359062063999282, + "loss": 0.3264, + "step": 8584 + }, + { + "epoch": 0.6954795852235904, + "grad_norm": 0.03353574126958847, + "learning_rate": 0.00018358611998739818, + "loss": 0.3803, + "step": 
8585 + }, + { + "epoch": 0.6955605962410888, + "grad_norm": 0.0328943133354187, + "learning_rate": 0.00018358161933480357, + "loss": 0.3333, + "step": 8586 + }, + { + "epoch": 0.6956416072585871, + "grad_norm": 0.032131899148225784, + "learning_rate": 0.00018357711868220893, + "loss": 0.3782, + "step": 8587 + }, + { + "epoch": 0.6957226182760855, + "grad_norm": 0.02701735869050026, + "learning_rate": 0.00018357261802961429, + "loss": 0.3116, + "step": 8588 + }, + { + "epoch": 0.695803629293584, + "grad_norm": 0.03216202184557915, + "learning_rate": 0.00018356811737701967, + "loss": 0.3545, + "step": 8589 + }, + { + "epoch": 0.6958846403110823, + "grad_norm": 0.028192881494760513, + "learning_rate": 0.00018356361672442506, + "loss": 0.306, + "step": 8590 + }, + { + "epoch": 0.6959656513285807, + "grad_norm": 0.02953306771814823, + "learning_rate": 0.00018355911607183042, + "loss": 0.3099, + "step": 8591 + }, + { + "epoch": 0.696046662346079, + "grad_norm": 0.03247429430484772, + "learning_rate": 0.0001835546154192358, + "loss": 0.3325, + "step": 8592 + }, + { + "epoch": 0.6961276733635774, + "grad_norm": 0.032983459532260895, + "learning_rate": 0.00018355011476664117, + "loss": 0.3769, + "step": 8593 + }, + { + "epoch": 0.6962086843810759, + "grad_norm": 0.03049176186323166, + "learning_rate": 0.00018354561411404653, + "loss": 0.2843, + "step": 8594 + }, + { + "epoch": 0.6962896953985742, + "grad_norm": 0.025875264778733253, + "learning_rate": 0.00018354111346145194, + "loss": 0.3501, + "step": 8595 + }, + { + "epoch": 0.6963707064160726, + "grad_norm": 0.030669327825307846, + "learning_rate": 0.0001835366128088573, + "loss": 0.3537, + "step": 8596 + }, + { + "epoch": 0.6964517174335709, + "grad_norm": 0.028857417404651642, + "learning_rate": 0.00018353211215626266, + "loss": 0.2993, + "step": 8597 + }, + { + "epoch": 0.6965327284510694, + "grad_norm": 0.030767953023314476, + "learning_rate": 0.00018352761150366805, + "loss": 0.3266, + "step": 8598 + }, + { + "epoch": 0.6966137394685677, + "grad_norm": 0.035894282162189484, + "learning_rate": 0.0001835231108510734, + "loss": 0.3871, + "step": 8599 + }, + { + "epoch": 0.6966947504860661, + "grad_norm": 0.03249775990843773, + "learning_rate": 0.00018351861019847877, + "loss": 0.345, + "step": 8600 + }, + { + "epoch": 0.6967757615035645, + "grad_norm": 0.03632766380906105, + "learning_rate": 0.00018351410954588418, + "loss": 0.405, + "step": 8601 + }, + { + "epoch": 0.6968567725210628, + "grad_norm": 0.029767895117402077, + "learning_rate": 0.00018350960889328954, + "loss": 0.3026, + "step": 8602 + }, + { + "epoch": 0.6969377835385613, + "grad_norm": 0.03096391074359417, + "learning_rate": 0.0001835051082406949, + "loss": 0.2842, + "step": 8603 + }, + { + "epoch": 0.6970187945560596, + "grad_norm": 0.03562683239579201, + "learning_rate": 0.0001835006075881003, + "loss": 0.3859, + "step": 8604 + }, + { + "epoch": 0.697099805573558, + "grad_norm": 0.030624883249402046, + "learning_rate": 0.00018349610693550565, + "loss": 0.3152, + "step": 8605 + }, + { + "epoch": 0.6971808165910564, + "grad_norm": 0.035233303904533386, + "learning_rate": 0.000183491606282911, + "loss": 0.3304, + "step": 8606 + }, + { + "epoch": 0.6972618276085548, + "grad_norm": 0.031249161809682846, + "learning_rate": 0.00018348710563031642, + "loss": 0.3441, + "step": 8607 + }, + { + "epoch": 0.6973428386260532, + "grad_norm": 0.0314306803047657, + "learning_rate": 0.00018348260497772178, + "loss": 0.3646, + "step": 8608 + }, + { + "epoch": 0.6974238496435515, + "grad_norm": 
0.03200050815939903, + "learning_rate": 0.00018347810432512714, + "loss": 0.3381, + "step": 8609 + }, + { + "epoch": 0.6975048606610499, + "grad_norm": 0.04529730975627899, + "learning_rate": 0.00018347360367253253, + "loss": 0.3428, + "step": 8610 + }, + { + "epoch": 0.6975858716785482, + "grad_norm": 0.029167592525482178, + "learning_rate": 0.0001834691030199379, + "loss": 0.2873, + "step": 8611 + }, + { + "epoch": 0.6976668826960467, + "grad_norm": 0.03315237909555435, + "learning_rate": 0.00018346460236734325, + "loss": 0.3004, + "step": 8612 + }, + { + "epoch": 0.6977478937135451, + "grad_norm": 0.033364132046699524, + "learning_rate": 0.00018346010171474867, + "loss": 0.3412, + "step": 8613 + }, + { + "epoch": 0.6978289047310434, + "grad_norm": 0.03286145254969597, + "learning_rate": 0.00018345560106215403, + "loss": 0.3819, + "step": 8614 + }, + { + "epoch": 0.6979099157485418, + "grad_norm": 0.030075784772634506, + "learning_rate": 0.00018345110040955939, + "loss": 0.3545, + "step": 8615 + }, + { + "epoch": 0.6979909267660401, + "grad_norm": 0.0326153002679348, + "learning_rate": 0.00018344659975696477, + "loss": 0.2943, + "step": 8616 + }, + { + "epoch": 0.6980719377835386, + "grad_norm": 0.031880997121334076, + "learning_rate": 0.00018344209910437013, + "loss": 0.3736, + "step": 8617 + }, + { + "epoch": 0.698152948801037, + "grad_norm": 0.034937191754579544, + "learning_rate": 0.0001834375984517755, + "loss": 0.3137, + "step": 8618 + }, + { + "epoch": 0.6982339598185353, + "grad_norm": 0.03716542199254036, + "learning_rate": 0.0001834330977991809, + "loss": 0.3653, + "step": 8619 + }, + { + "epoch": 0.6983149708360337, + "grad_norm": 0.03413591533899307, + "learning_rate": 0.00018342859714658627, + "loss": 0.3028, + "step": 8620 + }, + { + "epoch": 0.6983959818535321, + "grad_norm": 0.030354049056768417, + "learning_rate": 0.00018342409649399163, + "loss": 0.3092, + "step": 8621 + }, + { + "epoch": 0.6984769928710305, + "grad_norm": 0.026400646194815636, + "learning_rate": 0.00018341959584139701, + "loss": 0.3403, + "step": 8622 + }, + { + "epoch": 0.6985580038885288, + "grad_norm": 0.03018624521791935, + "learning_rate": 0.00018341509518880237, + "loss": 0.3382, + "step": 8623 + }, + { + "epoch": 0.6986390149060272, + "grad_norm": 0.028660321608185768, + "learning_rate": 0.00018341059453620776, + "loss": 0.3172, + "step": 8624 + }, + { + "epoch": 0.6987200259235256, + "grad_norm": 0.03087625838816166, + "learning_rate": 0.00018340609388361315, + "loss": 0.3266, + "step": 8625 + }, + { + "epoch": 0.698801036941024, + "grad_norm": 0.030530283227562904, + "learning_rate": 0.0001834015932310185, + "loss": 0.3811, + "step": 8626 + }, + { + "epoch": 0.6988820479585224, + "grad_norm": 0.030472485348582268, + "learning_rate": 0.00018339709257842387, + "loss": 0.3513, + "step": 8627 + }, + { + "epoch": 0.6989630589760207, + "grad_norm": 0.035994235426187515, + "learning_rate": 0.00018339259192582926, + "loss": 0.4433, + "step": 8628 + }, + { + "epoch": 0.6990440699935191, + "grad_norm": 0.031035959720611572, + "learning_rate": 0.00018338809127323462, + "loss": 0.319, + "step": 8629 + }, + { + "epoch": 0.6991250810110174, + "grad_norm": 0.03235156834125519, + "learning_rate": 0.00018338359062064, + "loss": 0.3806, + "step": 8630 + }, + { + "epoch": 0.6992060920285159, + "grad_norm": 0.027372000738978386, + "learning_rate": 0.0001833790899680454, + "loss": 0.3305, + "step": 8631 + }, + { + "epoch": 0.6992871030460143, + "grad_norm": 0.032422930002212524, + "learning_rate": 
0.00018337458931545075, + "loss": 0.3677, + "step": 8632 + }, + { + "epoch": 0.6993681140635126, + "grad_norm": 0.028228141367435455, + "learning_rate": 0.0001833700886628561, + "loss": 0.3047, + "step": 8633 + }, + { + "epoch": 0.699449125081011, + "grad_norm": 0.033268511295318604, + "learning_rate": 0.0001833655880102615, + "loss": 0.3476, + "step": 8634 + }, + { + "epoch": 0.6995301360985094, + "grad_norm": 0.03443459793925285, + "learning_rate": 0.00018336108735766686, + "loss": 0.3696, + "step": 8635 + }, + { + "epoch": 0.6996111471160078, + "grad_norm": 0.03373868018388748, + "learning_rate": 0.00018335658670507225, + "loss": 0.3547, + "step": 8636 + }, + { + "epoch": 0.6996921581335062, + "grad_norm": 0.03315461426973343, + "learning_rate": 0.00018335208605247763, + "loss": 0.3661, + "step": 8637 + }, + { + "epoch": 0.6997731691510045, + "grad_norm": 0.034669872373342514, + "learning_rate": 0.000183347585399883, + "loss": 0.3788, + "step": 8638 + }, + { + "epoch": 0.6998541801685029, + "grad_norm": 0.03502920642495155, + "learning_rate": 0.00018334308474728835, + "loss": 0.3149, + "step": 8639 + }, + { + "epoch": 0.6999351911860013, + "grad_norm": 0.02984611876308918, + "learning_rate": 0.00018333858409469374, + "loss": 0.3106, + "step": 8640 + }, + { + "epoch": 0.7000162022034997, + "grad_norm": 0.03439144045114517, + "learning_rate": 0.0001833340834420991, + "loss": 0.3083, + "step": 8641 + }, + { + "epoch": 0.700097213220998, + "grad_norm": 0.03035905957221985, + "learning_rate": 0.0001833295827895045, + "loss": 0.3185, + "step": 8642 + }, + { + "epoch": 0.7001782242384964, + "grad_norm": 0.02940438501536846, + "learning_rate": 0.00018332508213690987, + "loss": 0.3162, + "step": 8643 + }, + { + "epoch": 0.7002592352559948, + "grad_norm": 0.031967442482709885, + "learning_rate": 0.00018332058148431523, + "loss": 0.3732, + "step": 8644 + }, + { + "epoch": 0.7003402462734932, + "grad_norm": 0.03425975888967514, + "learning_rate": 0.0001833160808317206, + "loss": 0.3661, + "step": 8645 + }, + { + "epoch": 0.7004212572909916, + "grad_norm": 0.03647081181406975, + "learning_rate": 0.00018331158017912598, + "loss": 0.3241, + "step": 8646 + }, + { + "epoch": 0.7005022683084899, + "grad_norm": 0.03272273391485214, + "learning_rate": 0.00018330707952653137, + "loss": 0.3178, + "step": 8647 + }, + { + "epoch": 0.7005832793259883, + "grad_norm": 0.03189370036125183, + "learning_rate": 0.00018330257887393673, + "loss": 0.3577, + "step": 8648 + }, + { + "epoch": 0.7006642903434868, + "grad_norm": 0.03181673586368561, + "learning_rate": 0.00018329807822134212, + "loss": 0.355, + "step": 8649 + }, + { + "epoch": 0.7007453013609851, + "grad_norm": 0.034718107432127, + "learning_rate": 0.00018329357756874748, + "loss": 0.3608, + "step": 8650 + }, + { + "epoch": 0.7008263123784835, + "grad_norm": 0.027240954339504242, + "learning_rate": 0.00018328907691615284, + "loss": 0.3099, + "step": 8651 + }, + { + "epoch": 0.7009073233959818, + "grad_norm": 0.03302300348877907, + "learning_rate": 0.00018328457626355822, + "loss": 0.3498, + "step": 8652 + }, + { + "epoch": 0.7009883344134802, + "grad_norm": 0.031916722655296326, + "learning_rate": 0.0001832800756109636, + "loss": 0.3537, + "step": 8653 + }, + { + "epoch": 0.7010693454309787, + "grad_norm": 0.03751882165670395, + "learning_rate": 0.00018327557495836897, + "loss": 0.3693, + "step": 8654 + }, + { + "epoch": 0.701150356448477, + "grad_norm": 0.03253519907593727, + "learning_rate": 0.00018327107430577436, + "loss": 0.3914, + "step": 8655 + }, + { 
+ "epoch": 0.7012313674659754, + "grad_norm": 0.029587239027023315, + "learning_rate": 0.00018326657365317972, + "loss": 0.3344, + "step": 8656 + }, + { + "epoch": 0.7013123784834737, + "grad_norm": 0.025954367592930794, + "learning_rate": 0.00018326207300058508, + "loss": 0.3117, + "step": 8657 + }, + { + "epoch": 0.7013933895009722, + "grad_norm": 0.030775019899010658, + "learning_rate": 0.00018325757234799046, + "loss": 0.3515, + "step": 8658 + }, + { + "epoch": 0.7014744005184705, + "grad_norm": 0.0295817069709301, + "learning_rate": 0.00018325307169539585, + "loss": 0.3184, + "step": 8659 + }, + { + "epoch": 0.7015554115359689, + "grad_norm": 0.03209137171506882, + "learning_rate": 0.0001832485710428012, + "loss": 0.3163, + "step": 8660 + }, + { + "epoch": 0.7016364225534673, + "grad_norm": 0.03405177965760231, + "learning_rate": 0.0001832440703902066, + "loss": 0.3536, + "step": 8661 + }, + { + "epoch": 0.7017174335709656, + "grad_norm": 0.034785494208335876, + "learning_rate": 0.00018323956973761196, + "loss": 0.3315, + "step": 8662 + }, + { + "epoch": 0.7017984445884641, + "grad_norm": 0.030162710696458817, + "learning_rate": 0.00018323506908501732, + "loss": 0.3357, + "step": 8663 + }, + { + "epoch": 0.7018794556059624, + "grad_norm": 0.03082972951233387, + "learning_rate": 0.0001832305684324227, + "loss": 0.3536, + "step": 8664 + }, + { + "epoch": 0.7019604666234608, + "grad_norm": 0.03256779909133911, + "learning_rate": 0.0001832260677798281, + "loss": 0.3666, + "step": 8665 + }, + { + "epoch": 0.7020414776409591, + "grad_norm": 0.035849783569574356, + "learning_rate": 0.00018322156712723345, + "loss": 0.3553, + "step": 8666 + }, + { + "epoch": 0.7021224886584575, + "grad_norm": 0.030329683795571327, + "learning_rate": 0.00018321706647463884, + "loss": 0.3572, + "step": 8667 + }, + { + "epoch": 0.702203499675956, + "grad_norm": 0.03349778801202774, + "learning_rate": 0.0001832125658220442, + "loss": 0.391, + "step": 8668 + }, + { + "epoch": 0.7022845106934543, + "grad_norm": 0.029244735836982727, + "learning_rate": 0.00018320806516944956, + "loss": 0.3374, + "step": 8669 + }, + { + "epoch": 0.7023655217109527, + "grad_norm": 0.03602052479982376, + "learning_rate": 0.00018320356451685495, + "loss": 0.3542, + "step": 8670 + }, + { + "epoch": 0.702446532728451, + "grad_norm": 0.032265808433294296, + "learning_rate": 0.00018319906386426033, + "loss": 0.3379, + "step": 8671 + }, + { + "epoch": 0.7025275437459495, + "grad_norm": 0.03190753981471062, + "learning_rate": 0.0001831945632116657, + "loss": 0.3775, + "step": 8672 + }, + { + "epoch": 0.7026085547634479, + "grad_norm": 0.03543262183666229, + "learning_rate": 0.00018319006255907108, + "loss": 0.3369, + "step": 8673 + }, + { + "epoch": 0.7026895657809462, + "grad_norm": 0.030671080574393272, + "learning_rate": 0.00018318556190647644, + "loss": 0.3364, + "step": 8674 + }, + { + "epoch": 0.7027705767984446, + "grad_norm": 0.031106283888220787, + "learning_rate": 0.0001831810612538818, + "loss": 0.2999, + "step": 8675 + }, + { + "epoch": 0.7028515878159429, + "grad_norm": 0.03163905814290047, + "learning_rate": 0.00018317656060128722, + "loss": 0.3724, + "step": 8676 + }, + { + "epoch": 0.7029325988334414, + "grad_norm": 0.02911280281841755, + "learning_rate": 0.00018317205994869258, + "loss": 0.3077, + "step": 8677 + }, + { + "epoch": 0.7030136098509397, + "grad_norm": 0.0325298085808754, + "learning_rate": 0.00018316755929609794, + "loss": 0.3562, + "step": 8678 + }, + { + "epoch": 0.7030946208684381, + "grad_norm": 
0.027824802324175835, + "learning_rate": 0.00018316305864350332, + "loss": 0.317, + "step": 8679 + }, + { + "epoch": 0.7031756318859365, + "grad_norm": 0.027131253853440285, + "learning_rate": 0.00018315855799090868, + "loss": 0.3124, + "step": 8680 + }, + { + "epoch": 0.7032566429034348, + "grad_norm": 0.033741287887096405, + "learning_rate": 0.00018315405733831404, + "loss": 0.3374, + "step": 8681 + }, + { + "epoch": 0.7033376539209333, + "grad_norm": 0.03055637702345848, + "learning_rate": 0.00018314955668571946, + "loss": 0.3231, + "step": 8682 + }, + { + "epoch": 0.7034186649384316, + "grad_norm": 0.03051404468715191, + "learning_rate": 0.00018314505603312482, + "loss": 0.3192, + "step": 8683 + }, + { + "epoch": 0.70349967595593, + "grad_norm": 0.029856974259018898, + "learning_rate": 0.00018314055538053018, + "loss": 0.3274, + "step": 8684 + }, + { + "epoch": 0.7035806869734283, + "grad_norm": 0.0327875092625618, + "learning_rate": 0.00018313605472793557, + "loss": 0.3225, + "step": 8685 + }, + { + "epoch": 0.7036616979909268, + "grad_norm": 0.03562135621905327, + "learning_rate": 0.00018313155407534093, + "loss": 0.3994, + "step": 8686 + }, + { + "epoch": 0.7037427090084252, + "grad_norm": 0.03183677792549133, + "learning_rate": 0.00018312705342274629, + "loss": 0.3577, + "step": 8687 + }, + { + "epoch": 0.7038237200259235, + "grad_norm": 0.034511812031269073, + "learning_rate": 0.0001831225527701517, + "loss": 0.3383, + "step": 8688 + }, + { + "epoch": 0.7039047310434219, + "grad_norm": 0.034360211342573166, + "learning_rate": 0.00018311805211755706, + "loss": 0.3884, + "step": 8689 + }, + { + "epoch": 0.7039857420609202, + "grad_norm": 0.03103705681860447, + "learning_rate": 0.00018311355146496242, + "loss": 0.2922, + "step": 8690 + }, + { + "epoch": 0.7040667530784187, + "grad_norm": 0.029840853065252304, + "learning_rate": 0.0001831090508123678, + "loss": 0.3547, + "step": 8691 + }, + { + "epoch": 0.7041477640959171, + "grad_norm": 0.03040078841149807, + "learning_rate": 0.00018310455015977317, + "loss": 0.3452, + "step": 8692 + }, + { + "epoch": 0.7042287751134154, + "grad_norm": 0.030439404770731926, + "learning_rate": 0.00018310004950717853, + "loss": 0.3216, + "step": 8693 + }, + { + "epoch": 0.7043097861309138, + "grad_norm": 0.0319373719394207, + "learning_rate": 0.00018309554885458394, + "loss": 0.3025, + "step": 8694 + }, + { + "epoch": 0.7043907971484121, + "grad_norm": 0.02826685458421707, + "learning_rate": 0.0001830910482019893, + "loss": 0.3383, + "step": 8695 + }, + { + "epoch": 0.7044718081659106, + "grad_norm": 0.030664505437016487, + "learning_rate": 0.00018308654754939466, + "loss": 0.3165, + "step": 8696 + }, + { + "epoch": 0.704552819183409, + "grad_norm": 0.03440213203430176, + "learning_rate": 0.00018308204689680005, + "loss": 0.3911, + "step": 8697 + }, + { + "epoch": 0.7046338302009073, + "grad_norm": 0.03921644017100334, + "learning_rate": 0.0001830775462442054, + "loss": 0.3432, + "step": 8698 + }, + { + "epoch": 0.7047148412184057, + "grad_norm": 0.033602550625801086, + "learning_rate": 0.0001830730455916108, + "loss": 0.3138, + "step": 8699 + }, + { + "epoch": 0.7047958522359041, + "grad_norm": 0.03248998522758484, + "learning_rate": 0.00018306854493901618, + "loss": 0.3435, + "step": 8700 + }, + { + "epoch": 0.7048768632534025, + "grad_norm": 0.030209265649318695, + "learning_rate": 0.00018306404428642154, + "loss": 0.3108, + "step": 8701 + }, + { + "epoch": 0.7049578742709008, + "grad_norm": 0.0281781367957592, + "learning_rate": 
0.0001830595436338269, + "loss": 0.2978, + "step": 8702 + }, + { + "epoch": 0.7050388852883992, + "grad_norm": 0.03777993470430374, + "learning_rate": 0.0001830550429812323, + "loss": 0.4091, + "step": 8703 + }, + { + "epoch": 0.7051198963058976, + "grad_norm": 0.030923016369342804, + "learning_rate": 0.00018305054232863765, + "loss": 0.3174, + "step": 8704 + }, + { + "epoch": 0.705200907323396, + "grad_norm": 0.03093315102159977, + "learning_rate": 0.00018304604167604304, + "loss": 0.324, + "step": 8705 + }, + { + "epoch": 0.7052819183408944, + "grad_norm": 0.03139530122280121, + "learning_rate": 0.00018304154102344842, + "loss": 0.3232, + "step": 8706 + }, + { + "epoch": 0.7053629293583927, + "grad_norm": 0.029654931277036667, + "learning_rate": 0.00018303704037085378, + "loss": 0.3483, + "step": 8707 + }, + { + "epoch": 0.7054439403758911, + "grad_norm": 0.03465942665934563, + "learning_rate": 0.00018303253971825914, + "loss": 0.3657, + "step": 8708 + }, + { + "epoch": 0.7055249513933896, + "grad_norm": 0.028565427288413048, + "learning_rate": 0.00018302803906566453, + "loss": 0.2924, + "step": 8709 + }, + { + "epoch": 0.7056059624108879, + "grad_norm": 0.03348225727677345, + "learning_rate": 0.0001830235384130699, + "loss": 0.3582, + "step": 8710 + }, + { + "epoch": 0.7056869734283863, + "grad_norm": 0.032475586980581284, + "learning_rate": 0.00018301903776047528, + "loss": 0.377, + "step": 8711 + }, + { + "epoch": 0.7057679844458846, + "grad_norm": 0.03190094232559204, + "learning_rate": 0.00018301453710788067, + "loss": 0.3596, + "step": 8712 + }, + { + "epoch": 0.705848995463383, + "grad_norm": 0.03844151273369789, + "learning_rate": 0.00018301003645528603, + "loss": 0.3868, + "step": 8713 + }, + { + "epoch": 0.7059300064808814, + "grad_norm": 0.03228616714477539, + "learning_rate": 0.00018300553580269139, + "loss": 0.3538, + "step": 8714 + }, + { + "epoch": 0.7060110174983798, + "grad_norm": 0.02916380949318409, + "learning_rate": 0.00018300103515009677, + "loss": 0.34, + "step": 8715 + }, + { + "epoch": 0.7060920285158782, + "grad_norm": 0.03413008525967598, + "learning_rate": 0.00018299653449750213, + "loss": 0.3049, + "step": 8716 + }, + { + "epoch": 0.7061730395333765, + "grad_norm": 0.033210255205631256, + "learning_rate": 0.00018299203384490752, + "loss": 0.3668, + "step": 8717 + }, + { + "epoch": 0.7062540505508749, + "grad_norm": 0.033286821097135544, + "learning_rate": 0.0001829875331923129, + "loss": 0.3584, + "step": 8718 + }, + { + "epoch": 0.7063350615683733, + "grad_norm": 0.029751164838671684, + "learning_rate": 0.00018298303253971827, + "loss": 0.301, + "step": 8719 + }, + { + "epoch": 0.7064160725858717, + "grad_norm": 0.030733181163668633, + "learning_rate": 0.00018297853188712363, + "loss": 0.3435, + "step": 8720 + }, + { + "epoch": 0.70649708360337, + "grad_norm": 0.03550855442881584, + "learning_rate": 0.00018297403123452902, + "loss": 0.4085, + "step": 8721 + }, + { + "epoch": 0.7065780946208684, + "grad_norm": 0.0320737287402153, + "learning_rate": 0.00018296953058193438, + "loss": 0.325, + "step": 8722 + }, + { + "epoch": 0.7066591056383669, + "grad_norm": 0.03291591629385948, + "learning_rate": 0.00018296502992933976, + "loss": 0.4128, + "step": 8723 + }, + { + "epoch": 0.7067401166558652, + "grad_norm": 0.031303029507398605, + "learning_rate": 0.00018296052927674515, + "loss": 0.3524, + "step": 8724 + }, + { + "epoch": 0.7068211276733636, + "grad_norm": 0.0403035506606102, + "learning_rate": 0.0001829560286241505, + "loss": 0.4219, + "step": 8725 + }, + { 
+ "epoch": 0.7069021386908619, + "grad_norm": 0.03615202009677887, + "learning_rate": 0.00018295152797155587, + "loss": 0.4069, + "step": 8726 + }, + { + "epoch": 0.7069831497083603, + "grad_norm": 0.03636353090405464, + "learning_rate": 0.00018294702731896126, + "loss": 0.3371, + "step": 8727 + }, + { + "epoch": 0.7070641607258588, + "grad_norm": 0.03489084169268608, + "learning_rate": 0.00018294252666636664, + "loss": 0.4238, + "step": 8728 + }, + { + "epoch": 0.7071451717433571, + "grad_norm": 0.028424890711903572, + "learning_rate": 0.000182938026013772, + "loss": 0.3169, + "step": 8729 + }, + { + "epoch": 0.7072261827608555, + "grad_norm": 0.03434450924396515, + "learning_rate": 0.0001829335253611774, + "loss": 0.3473, + "step": 8730 + }, + { + "epoch": 0.7073071937783538, + "grad_norm": 0.02949521876871586, + "learning_rate": 0.00018292902470858275, + "loss": 0.312, + "step": 8731 + }, + { + "epoch": 0.7073882047958522, + "grad_norm": 0.03468482196331024, + "learning_rate": 0.0001829245240559881, + "loss": 0.3502, + "step": 8732 + }, + { + "epoch": 0.7074692158133506, + "grad_norm": 0.03378097712993622, + "learning_rate": 0.0001829200234033935, + "loss": 0.3398, + "step": 8733 + }, + { + "epoch": 0.707550226830849, + "grad_norm": 0.030609462410211563, + "learning_rate": 0.00018291552275079889, + "loss": 0.3552, + "step": 8734 + }, + { + "epoch": 0.7076312378483474, + "grad_norm": 0.028447598218917847, + "learning_rate": 0.00018291102209820425, + "loss": 0.3073, + "step": 8735 + }, + { + "epoch": 0.7077122488658457, + "grad_norm": 0.02937387302517891, + "learning_rate": 0.00018290652144560963, + "loss": 0.3475, + "step": 8736 + }, + { + "epoch": 0.7077932598833442, + "grad_norm": 0.03563931584358215, + "learning_rate": 0.000182902020793015, + "loss": 0.3395, + "step": 8737 + }, + { + "epoch": 0.7078742709008425, + "grad_norm": 0.02817721478641033, + "learning_rate": 0.00018289752014042035, + "loss": 0.33, + "step": 8738 + }, + { + "epoch": 0.7079552819183409, + "grad_norm": 0.03269148990511894, + "learning_rate": 0.00018289301948782574, + "loss": 0.3608, + "step": 8739 + }, + { + "epoch": 0.7080362929358393, + "grad_norm": 0.03341267630457878, + "learning_rate": 0.00018288851883523113, + "loss": 0.393, + "step": 8740 + }, + { + "epoch": 0.7081173039533376, + "grad_norm": 0.03010784648358822, + "learning_rate": 0.0001828840181826365, + "loss": 0.3202, + "step": 8741 + }, + { + "epoch": 0.7081983149708361, + "grad_norm": 0.04232911765575409, + "learning_rate": 0.00018287951753004187, + "loss": 0.3817, + "step": 8742 + }, + { + "epoch": 0.7082793259883344, + "grad_norm": 0.03095560148358345, + "learning_rate": 0.00018287501687744723, + "loss": 0.346, + "step": 8743 + }, + { + "epoch": 0.7083603370058328, + "grad_norm": 0.031654082238674164, + "learning_rate": 0.0001828705162248526, + "loss": 0.3696, + "step": 8744 + }, + { + "epoch": 0.7084413480233311, + "grad_norm": 0.03190073370933533, + "learning_rate": 0.00018286601557225798, + "loss": 0.3623, + "step": 8745 + }, + { + "epoch": 0.7085223590408296, + "grad_norm": 0.03220773860812187, + "learning_rate": 0.00018286151491966337, + "loss": 0.36, + "step": 8746 + }, + { + "epoch": 0.708603370058328, + "grad_norm": 0.031856559216976166, + "learning_rate": 0.00018285701426706873, + "loss": 0.3149, + "step": 8747 + }, + { + "epoch": 0.7086843810758263, + "grad_norm": 0.03256354480981827, + "learning_rate": 0.00018285251361447412, + "loss": 0.3331, + "step": 8748 + }, + { + "epoch": 0.7087653920933247, + "grad_norm": 0.03743598982691765, + 
"learning_rate": 0.00018284801296187948, + "loss": 0.3936, + "step": 8749 + }, + { + "epoch": 0.708846403110823, + "grad_norm": 0.03272002190351486, + "learning_rate": 0.00018284351230928484, + "loss": 0.3475, + "step": 8750 + }, + { + "epoch": 0.7089274141283215, + "grad_norm": 0.03147174045443535, + "learning_rate": 0.00018283901165669022, + "loss": 0.3302, + "step": 8751 + }, + { + "epoch": 0.7090084251458199, + "grad_norm": 0.030662400647997856, + "learning_rate": 0.0001828345110040956, + "loss": 0.3505, + "step": 8752 + }, + { + "epoch": 0.7090894361633182, + "grad_norm": 0.029921991750597954, + "learning_rate": 0.00018283001035150097, + "loss": 0.3221, + "step": 8753 + }, + { + "epoch": 0.7091704471808166, + "grad_norm": 0.03617072105407715, + "learning_rate": 0.00018282550969890636, + "loss": 0.3553, + "step": 8754 + }, + { + "epoch": 0.7092514581983149, + "grad_norm": 0.030982332304120064, + "learning_rate": 0.00018282100904631172, + "loss": 0.3824, + "step": 8755 + }, + { + "epoch": 0.7093324692158134, + "grad_norm": 0.03520418331027031, + "learning_rate": 0.00018281650839371708, + "loss": 0.3946, + "step": 8756 + }, + { + "epoch": 0.7094134802333117, + "grad_norm": 0.03270168974995613, + "learning_rate": 0.0001828120077411225, + "loss": 0.3001, + "step": 8757 + }, + { + "epoch": 0.7094944912508101, + "grad_norm": 0.033834751695394516, + "learning_rate": 0.00018280750708852785, + "loss": 0.412, + "step": 8758 + }, + { + "epoch": 0.7095755022683085, + "grad_norm": 0.028466802090406418, + "learning_rate": 0.0001828030064359332, + "loss": 0.3104, + "step": 8759 + }, + { + "epoch": 0.7096565132858069, + "grad_norm": 0.032070502638816833, + "learning_rate": 0.0001827985057833386, + "loss": 0.3134, + "step": 8760 + }, + { + "epoch": 0.7097375243033053, + "grad_norm": 0.03090481273829937, + "learning_rate": 0.00018279400513074396, + "loss": 0.3136, + "step": 8761 + }, + { + "epoch": 0.7098185353208036, + "grad_norm": 0.030255356803536415, + "learning_rate": 0.00018278950447814932, + "loss": 0.3208, + "step": 8762 + }, + { + "epoch": 0.709899546338302, + "grad_norm": 0.03436191380023956, + "learning_rate": 0.00018278500382555473, + "loss": 0.3745, + "step": 8763 + }, + { + "epoch": 0.7099805573558003, + "grad_norm": 0.03800062835216522, + "learning_rate": 0.0001827805031729601, + "loss": 0.3415, + "step": 8764 + }, + { + "epoch": 0.7100615683732988, + "grad_norm": 0.03259582817554474, + "learning_rate": 0.00018277600252036545, + "loss": 0.3683, + "step": 8765 + }, + { + "epoch": 0.7101425793907972, + "grad_norm": 0.030449820682406425, + "learning_rate": 0.00018277150186777084, + "loss": 0.2866, + "step": 8766 + }, + { + "epoch": 0.7102235904082955, + "grad_norm": 0.035687729716300964, + "learning_rate": 0.0001827670012151762, + "loss": 0.321, + "step": 8767 + }, + { + "epoch": 0.7103046014257939, + "grad_norm": 0.0287131629884243, + "learning_rate": 0.00018276250056258156, + "loss": 0.3534, + "step": 8768 + }, + { + "epoch": 0.7103856124432922, + "grad_norm": 0.030978770926594734, + "learning_rate": 0.00018275799990998698, + "loss": 0.3558, + "step": 8769 + }, + { + "epoch": 0.7104666234607907, + "grad_norm": 0.03859172388911247, + "learning_rate": 0.00018275349925739234, + "loss": 0.3639, + "step": 8770 + }, + { + "epoch": 0.7105476344782891, + "grad_norm": 0.029355809092521667, + "learning_rate": 0.0001827489986047977, + "loss": 0.3119, + "step": 8771 + }, + { + "epoch": 0.7106286454957874, + "grad_norm": 0.03254703804850578, + "learning_rate": 0.00018274449795220308, + "loss": 0.3357, 
+ "step": 8772 + }, + { + "epoch": 0.7107096565132858, + "grad_norm": 0.029273660853505135, + "learning_rate": 0.00018273999729960844, + "loss": 0.3378, + "step": 8773 + }, + { + "epoch": 0.7107906675307842, + "grad_norm": 0.034787289798259735, + "learning_rate": 0.0001827354966470138, + "loss": 0.3994, + "step": 8774 + }, + { + "epoch": 0.7108716785482826, + "grad_norm": 0.034370459616184235, + "learning_rate": 0.00018273099599441922, + "loss": 0.3461, + "step": 8775 + }, + { + "epoch": 0.710952689565781, + "grad_norm": 0.03280269354581833, + "learning_rate": 0.00018272649534182458, + "loss": 0.3364, + "step": 8776 + }, + { + "epoch": 0.7110337005832793, + "grad_norm": 0.03019985742866993, + "learning_rate": 0.00018272199468922994, + "loss": 0.3456, + "step": 8777 + }, + { + "epoch": 0.7111147116007777, + "grad_norm": 0.038414500653743744, + "learning_rate": 0.00018271749403663532, + "loss": 0.3545, + "step": 8778 + }, + { + "epoch": 0.7111957226182761, + "grad_norm": 0.03398662805557251, + "learning_rate": 0.00018271299338404068, + "loss": 0.3426, + "step": 8779 + }, + { + "epoch": 0.7112767336357745, + "grad_norm": 0.03139416128396988, + "learning_rate": 0.00018270849273144607, + "loss": 0.3325, + "step": 8780 + }, + { + "epoch": 0.7113577446532728, + "grad_norm": 0.027758194133639336, + "learning_rate": 0.00018270399207885146, + "loss": 0.299, + "step": 8781 + }, + { + "epoch": 0.7114387556707712, + "grad_norm": 0.03087412379682064, + "learning_rate": 0.00018269949142625682, + "loss": 0.3143, + "step": 8782 + }, + { + "epoch": 0.7115197666882696, + "grad_norm": 0.03145689144730568, + "learning_rate": 0.00018269499077366218, + "loss": 0.3453, + "step": 8783 + }, + { + "epoch": 0.711600777705768, + "grad_norm": 0.035226672887802124, + "learning_rate": 0.00018269049012106757, + "loss": 0.375, + "step": 8784 + }, + { + "epoch": 0.7116817887232664, + "grad_norm": 0.03201188147068024, + "learning_rate": 0.00018268598946847293, + "loss": 0.3434, + "step": 8785 + }, + { + "epoch": 0.7117627997407647, + "grad_norm": 0.03402809053659439, + "learning_rate": 0.0001826814888158783, + "loss": 0.3468, + "step": 8786 + }, + { + "epoch": 0.7118438107582631, + "grad_norm": 0.034564413130283356, + "learning_rate": 0.0001826769881632837, + "loss": 0.3548, + "step": 8787 + }, + { + "epoch": 0.7119248217757616, + "grad_norm": 0.030737025663256645, + "learning_rate": 0.00018267248751068906, + "loss": 0.3013, + "step": 8788 + }, + { + "epoch": 0.7120058327932599, + "grad_norm": 0.02977253682911396, + "learning_rate": 0.00018266798685809442, + "loss": 0.3244, + "step": 8789 + }, + { + "epoch": 0.7120868438107583, + "grad_norm": 0.028753027319908142, + "learning_rate": 0.0001826634862054998, + "loss": 0.321, + "step": 8790 + }, + { + "epoch": 0.7121678548282566, + "grad_norm": 0.028307564556598663, + "learning_rate": 0.00018265898555290517, + "loss": 0.3137, + "step": 8791 + }, + { + "epoch": 0.712248865845755, + "grad_norm": 0.030974021181464195, + "learning_rate": 0.00018265448490031055, + "loss": 0.3008, + "step": 8792 + }, + { + "epoch": 0.7123298768632534, + "grad_norm": 0.028313852846622467, + "learning_rate": 0.00018264998424771594, + "loss": 0.3281, + "step": 8793 + }, + { + "epoch": 0.7124108878807518, + "grad_norm": 0.03233834356069565, + "learning_rate": 0.0001826454835951213, + "loss": 0.302, + "step": 8794 + }, + { + "epoch": 0.7124918988982502, + "grad_norm": 0.030246617272496223, + "learning_rate": 0.00018264098294252666, + "loss": 0.3111, + "step": 8795 + }, + { + "epoch": 0.7125729099157485, + 
"grad_norm": 0.03288370370864868, + "learning_rate": 0.00018263648228993205, + "loss": 0.3461, + "step": 8796 + }, + { + "epoch": 0.712653920933247, + "grad_norm": 0.03088957816362381, + "learning_rate": 0.0001826319816373374, + "loss": 0.34, + "step": 8797 + }, + { + "epoch": 0.7127349319507453, + "grad_norm": 0.032008424401283264, + "learning_rate": 0.0001826274809847428, + "loss": 0.3632, + "step": 8798 + }, + { + "epoch": 0.7128159429682437, + "grad_norm": 0.03967854380607605, + "learning_rate": 0.00018262298033214818, + "loss": 0.3617, + "step": 8799 + }, + { + "epoch": 0.712896953985742, + "grad_norm": 0.03548341989517212, + "learning_rate": 0.00018261847967955354, + "loss": 0.3606, + "step": 8800 + }, + { + "epoch": 0.7129779650032404, + "grad_norm": 0.033855367451906204, + "learning_rate": 0.0001826139790269589, + "loss": 0.3144, + "step": 8801 + }, + { + "epoch": 0.7130589760207389, + "grad_norm": 0.0331764742732048, + "learning_rate": 0.0001826094783743643, + "loss": 0.4024, + "step": 8802 + }, + { + "epoch": 0.7131399870382372, + "grad_norm": 0.036456841975450516, + "learning_rate": 0.00018260497772176965, + "loss": 0.3421, + "step": 8803 + }, + { + "epoch": 0.7132209980557356, + "grad_norm": 0.03295794501900673, + "learning_rate": 0.00018260047706917504, + "loss": 0.3488, + "step": 8804 + }, + { + "epoch": 0.7133020090732339, + "grad_norm": 0.03683172166347504, + "learning_rate": 0.00018259597641658042, + "loss": 0.3197, + "step": 8805 + }, + { + "epoch": 0.7133830200907323, + "grad_norm": 0.02898084558546543, + "learning_rate": 0.00018259147576398578, + "loss": 0.3493, + "step": 8806 + }, + { + "epoch": 0.7134640311082308, + "grad_norm": 0.03242986276745796, + "learning_rate": 0.00018258697511139114, + "loss": 0.3182, + "step": 8807 + }, + { + "epoch": 0.7135450421257291, + "grad_norm": 0.03240397945046425, + "learning_rate": 0.00018258247445879653, + "loss": 0.3224, + "step": 8808 + }, + { + "epoch": 0.7136260531432275, + "grad_norm": 0.03218116611242294, + "learning_rate": 0.00018257797380620192, + "loss": 0.3398, + "step": 8809 + }, + { + "epoch": 0.7137070641607258, + "grad_norm": 0.03495160490274429, + "learning_rate": 0.00018257347315360728, + "loss": 0.3756, + "step": 8810 + }, + { + "epoch": 0.7137880751782243, + "grad_norm": 0.035370394587516785, + "learning_rate": 0.00018256897250101267, + "loss": 0.3639, + "step": 8811 + }, + { + "epoch": 0.7138690861957226, + "grad_norm": 0.029185624793171883, + "learning_rate": 0.00018256447184841803, + "loss": 0.34, + "step": 8812 + }, + { + "epoch": 0.713950097213221, + "grad_norm": 0.034185972064733505, + "learning_rate": 0.0001825599711958234, + "loss": 0.3777, + "step": 8813 + }, + { + "epoch": 0.7140311082307194, + "grad_norm": 0.03374910727143288, + "learning_rate": 0.00018255547054322877, + "loss": 0.3375, + "step": 8814 + }, + { + "epoch": 0.7141121192482177, + "grad_norm": 0.029371364042162895, + "learning_rate": 0.00018255096989063416, + "loss": 0.3024, + "step": 8815 + }, + { + "epoch": 0.7141931302657162, + "grad_norm": 0.03439253568649292, + "learning_rate": 0.00018254646923803952, + "loss": 0.3091, + "step": 8816 + }, + { + "epoch": 0.7142741412832145, + "grad_norm": 0.042450908571481705, + "learning_rate": 0.0001825419685854449, + "loss": 0.3589, + "step": 8817 + }, + { + "epoch": 0.7143551523007129, + "grad_norm": 0.03298235312104225, + "learning_rate": 0.00018253746793285027, + "loss": 0.3122, + "step": 8818 + }, + { + "epoch": 0.7144361633182112, + "grad_norm": 0.028749145567417145, + "learning_rate": 
0.00018253296728025563, + "loss": 0.3337, + "step": 8819 + }, + { + "epoch": 0.7145171743357096, + "grad_norm": 0.02993011847138405, + "learning_rate": 0.00018252846662766102, + "loss": 0.354, + "step": 8820 + }, + { + "epoch": 0.7145981853532081, + "grad_norm": 0.030420931056141853, + "learning_rate": 0.0001825239659750664, + "loss": 0.3231, + "step": 8821 + }, + { + "epoch": 0.7146791963707064, + "grad_norm": 0.03501799702644348, + "learning_rate": 0.00018251946532247176, + "loss": 0.3781, + "step": 8822 + }, + { + "epoch": 0.7147602073882048, + "grad_norm": 0.030919726938009262, + "learning_rate": 0.00018251496466987715, + "loss": 0.3437, + "step": 8823 + }, + { + "epoch": 0.7148412184057031, + "grad_norm": 0.03802415356040001, + "learning_rate": 0.0001825104640172825, + "loss": 0.3688, + "step": 8824 + }, + { + "epoch": 0.7149222294232016, + "grad_norm": 0.03371911495923996, + "learning_rate": 0.00018250596336468787, + "loss": 0.3584, + "step": 8825 + }, + { + "epoch": 0.7150032404407, + "grad_norm": 0.03014080785214901, + "learning_rate": 0.00018250146271209326, + "loss": 0.3006, + "step": 8826 + }, + { + "epoch": 0.7150842514581983, + "grad_norm": 0.032793935388326645, + "learning_rate": 0.00018249696205949864, + "loss": 0.3479, + "step": 8827 + }, + { + "epoch": 0.7151652624756967, + "grad_norm": 0.02692987769842148, + "learning_rate": 0.000182492461406904, + "loss": 0.3172, + "step": 8828 + }, + { + "epoch": 0.715246273493195, + "grad_norm": 0.031958647072315216, + "learning_rate": 0.0001824879607543094, + "loss": 0.34, + "step": 8829 + }, + { + "epoch": 0.7153272845106935, + "grad_norm": 0.03293073922395706, + "learning_rate": 0.00018248346010171475, + "loss": 0.3599, + "step": 8830 + }, + { + "epoch": 0.7154082955281919, + "grad_norm": 0.037021856755018234, + "learning_rate": 0.0001824789594491201, + "loss": 0.3591, + "step": 8831 + }, + { + "epoch": 0.7154893065456902, + "grad_norm": 0.034067876636981964, + "learning_rate": 0.00018247445879652553, + "loss": 0.3729, + "step": 8832 + }, + { + "epoch": 0.7155703175631886, + "grad_norm": 0.030054716393351555, + "learning_rate": 0.00018246995814393089, + "loss": 0.3042, + "step": 8833 + }, + { + "epoch": 0.7156513285806869, + "grad_norm": 0.030154969543218613, + "learning_rate": 0.00018246545749133625, + "loss": 0.334, + "step": 8834 + }, + { + "epoch": 0.7157323395981854, + "grad_norm": 0.0348099060356617, + "learning_rate": 0.00018246095683874163, + "loss": 0.3847, + "step": 8835 + }, + { + "epoch": 0.7158133506156837, + "grad_norm": 0.029983345419168472, + "learning_rate": 0.000182456456186147, + "loss": 0.2999, + "step": 8836 + }, + { + "epoch": 0.7158943616331821, + "grad_norm": 0.03360939770936966, + "learning_rate": 0.00018245195553355235, + "loss": 0.3884, + "step": 8837 + }, + { + "epoch": 0.7159753726506805, + "grad_norm": 0.03326816111803055, + "learning_rate": 0.00018244745488095777, + "loss": 0.3523, + "step": 8838 + }, + { + "epoch": 0.7160563836681789, + "grad_norm": 0.0402449294924736, + "learning_rate": 0.00018244295422836313, + "loss": 0.4038, + "step": 8839 + }, + { + "epoch": 0.7161373946856773, + "grad_norm": 0.031454093754291534, + "learning_rate": 0.0001824384535757685, + "loss": 0.3471, + "step": 8840 + }, + { + "epoch": 0.7162184057031756, + "grad_norm": 0.03565017133951187, + "learning_rate": 0.00018243395292317387, + "loss": 0.341, + "step": 8841 + }, + { + "epoch": 0.716299416720674, + "grad_norm": 0.03517875447869301, + "learning_rate": 0.00018242945227057923, + "loss": 0.3271, + "step": 8842 + }, + { + 
"epoch": 0.7163804277381723, + "grad_norm": 0.03396892547607422, + "learning_rate": 0.0001824249516179846, + "loss": 0.3345, + "step": 8843 + }, + { + "epoch": 0.7164614387556708, + "grad_norm": 0.03428976982831955, + "learning_rate": 0.00018242045096539, + "loss": 0.3868, + "step": 8844 + }, + { + "epoch": 0.7165424497731692, + "grad_norm": 0.03712880238890648, + "learning_rate": 0.00018241595031279537, + "loss": 0.3605, + "step": 8845 + }, + { + "epoch": 0.7166234607906675, + "grad_norm": 0.030677849426865578, + "learning_rate": 0.00018241144966020073, + "loss": 0.289, + "step": 8846 + }, + { + "epoch": 0.7167044718081659, + "grad_norm": 0.03206459432840347, + "learning_rate": 0.00018240694900760612, + "loss": 0.3164, + "step": 8847 + }, + { + "epoch": 0.7167854828256643, + "grad_norm": 0.033618632704019547, + "learning_rate": 0.00018240244835501148, + "loss": 0.3338, + "step": 8848 + }, + { + "epoch": 0.7168664938431627, + "grad_norm": 0.031833041459321976, + "learning_rate": 0.00018239794770241684, + "loss": 0.3573, + "step": 8849 + }, + { + "epoch": 0.7169475048606611, + "grad_norm": 0.03609463945031166, + "learning_rate": 0.00018239344704982225, + "loss": 0.3515, + "step": 8850 + }, + { + "epoch": 0.7170285158781594, + "grad_norm": 0.030079221352934837, + "learning_rate": 0.0001823889463972276, + "loss": 0.3107, + "step": 8851 + }, + { + "epoch": 0.7171095268956578, + "grad_norm": 0.03663382679224014, + "learning_rate": 0.00018238444574463297, + "loss": 0.3602, + "step": 8852 + }, + { + "epoch": 0.7171905379131562, + "grad_norm": 0.03408436477184296, + "learning_rate": 0.00018237994509203836, + "loss": 0.3563, + "step": 8853 + }, + { + "epoch": 0.7172715489306546, + "grad_norm": 0.033193428069353104, + "learning_rate": 0.00018237544443944372, + "loss": 0.3681, + "step": 8854 + }, + { + "epoch": 0.717352559948153, + "grad_norm": 0.031171130016446114, + "learning_rate": 0.00018237094378684908, + "loss": 0.3607, + "step": 8855 + }, + { + "epoch": 0.7174335709656513, + "grad_norm": 0.034352436661720276, + "learning_rate": 0.0001823664431342545, + "loss": 0.384, + "step": 8856 + }, + { + "epoch": 0.7175145819831497, + "grad_norm": 0.0330151803791523, + "learning_rate": 0.00018236194248165985, + "loss": 0.3193, + "step": 8857 + }, + { + "epoch": 0.7175955930006481, + "grad_norm": 0.027999212965369225, + "learning_rate": 0.0001823574418290652, + "loss": 0.2807, + "step": 8858 + }, + { + "epoch": 0.7176766040181465, + "grad_norm": 0.02855718694627285, + "learning_rate": 0.0001823529411764706, + "loss": 0.3139, + "step": 8859 + }, + { + "epoch": 0.7177576150356448, + "grad_norm": 0.03285132721066475, + "learning_rate": 0.00018234844052387596, + "loss": 0.3573, + "step": 8860 + }, + { + "epoch": 0.7178386260531432, + "grad_norm": 0.030333518981933594, + "learning_rate": 0.00018234393987128135, + "loss": 0.3014, + "step": 8861 + }, + { + "epoch": 0.7179196370706417, + "grad_norm": 0.03320077806711197, + "learning_rate": 0.00018233943921868673, + "loss": 0.3319, + "step": 8862 + }, + { + "epoch": 0.71800064808814, + "grad_norm": 0.031354501843452454, + "learning_rate": 0.0001823349385660921, + "loss": 0.3028, + "step": 8863 + }, + { + "epoch": 0.7180816591056384, + "grad_norm": 0.03711242601275444, + "learning_rate": 0.00018233043791349745, + "loss": 0.357, + "step": 8864 + }, + { + "epoch": 0.7181626701231367, + "grad_norm": 0.030047696083784103, + "learning_rate": 0.00018232593726090284, + "loss": 0.315, + "step": 8865 + }, + { + "epoch": 0.7182436811406351, + "grad_norm": 0.03306373208761215, 
+ "learning_rate": 0.0001823214366083082, + "loss": 0.3628, + "step": 8866 + }, + { + "epoch": 0.7183246921581335, + "grad_norm": 0.03267044574022293, + "learning_rate": 0.0001823169359557136, + "loss": 0.2989, + "step": 8867 + }, + { + "epoch": 0.7184057031756319, + "grad_norm": 0.02751380018889904, + "learning_rate": 0.00018231243530311898, + "loss": 0.3077, + "step": 8868 + }, + { + "epoch": 0.7184867141931303, + "grad_norm": 0.029321538284420967, + "learning_rate": 0.00018230793465052434, + "loss": 0.3114, + "step": 8869 + }, + { + "epoch": 0.7185677252106286, + "grad_norm": 0.03440944105386734, + "learning_rate": 0.0001823034339979297, + "loss": 0.3438, + "step": 8870 + }, + { + "epoch": 0.718648736228127, + "grad_norm": 0.03138351812958717, + "learning_rate": 0.00018229893334533508, + "loss": 0.296, + "step": 8871 + }, + { + "epoch": 0.7187297472456254, + "grad_norm": 0.034056250005960464, + "learning_rate": 0.00018229443269274044, + "loss": 0.3643, + "step": 8872 + }, + { + "epoch": 0.7188107582631238, + "grad_norm": 0.032820384949445724, + "learning_rate": 0.00018228993204014583, + "loss": 0.3449, + "step": 8873 + }, + { + "epoch": 0.7188917692806222, + "grad_norm": 0.030815161764621735, + "learning_rate": 0.00018228543138755122, + "loss": 0.3456, + "step": 8874 + }, + { + "epoch": 0.7189727802981205, + "grad_norm": 0.03418358787894249, + "learning_rate": 0.00018228093073495658, + "loss": 0.3523, + "step": 8875 + }, + { + "epoch": 0.719053791315619, + "grad_norm": 0.029444202780723572, + "learning_rate": 0.00018227643008236194, + "loss": 0.3286, + "step": 8876 + }, + { + "epoch": 0.7191348023331173, + "grad_norm": 0.047734808176755905, + "learning_rate": 0.00018227192942976732, + "loss": 0.4268, + "step": 8877 + }, + { + "epoch": 0.7192158133506157, + "grad_norm": 0.03228252753615379, + "learning_rate": 0.00018226742877717268, + "loss": 0.349, + "step": 8878 + }, + { + "epoch": 0.719296824368114, + "grad_norm": 0.034021634608507156, + "learning_rate": 0.00018226292812457807, + "loss": 0.3693, + "step": 8879 + }, + { + "epoch": 0.7193778353856124, + "grad_norm": 0.03035876527428627, + "learning_rate": 0.00018225842747198346, + "loss": 0.3519, + "step": 8880 + }, + { + "epoch": 0.7194588464031109, + "grad_norm": 0.02635454386472702, + "learning_rate": 0.00018225392681938882, + "loss": 0.3025, + "step": 8881 + }, + { + "epoch": 0.7195398574206092, + "grad_norm": 0.028250424191355705, + "learning_rate": 0.00018224942616679418, + "loss": 0.3595, + "step": 8882 + }, + { + "epoch": 0.7196208684381076, + "grad_norm": 0.02870979905128479, + "learning_rate": 0.00018224492551419957, + "loss": 0.3139, + "step": 8883 + }, + { + "epoch": 0.7197018794556059, + "grad_norm": 0.032601818442344666, + "learning_rate": 0.00018224042486160495, + "loss": 0.3251, + "step": 8884 + }, + { + "epoch": 0.7197828904731044, + "grad_norm": 0.03433837369084358, + "learning_rate": 0.0001822359242090103, + "loss": 0.3472, + "step": 8885 + }, + { + "epoch": 0.7198639014906028, + "grad_norm": 0.03213446959853172, + "learning_rate": 0.0001822314235564157, + "loss": 0.3386, + "step": 8886 + }, + { + "epoch": 0.7199449125081011, + "grad_norm": 0.03527262806892395, + "learning_rate": 0.00018222692290382106, + "loss": 0.3702, + "step": 8887 + }, + { + "epoch": 0.7200259235255995, + "grad_norm": 0.032607708126306534, + "learning_rate": 0.00018222242225122642, + "loss": 0.332, + "step": 8888 + }, + { + "epoch": 0.7201069345430978, + "grad_norm": 0.03458176553249359, + "learning_rate": 0.0001822179215986318, + "loss": 0.3398, 
+ "step": 8889 + }, + { + "epoch": 0.7201879455605963, + "grad_norm": 0.033175062388181686, + "learning_rate": 0.0001822134209460372, + "loss": 0.3496, + "step": 8890 + }, + { + "epoch": 0.7202689565780946, + "grad_norm": 0.03210841119289398, + "learning_rate": 0.00018220892029344255, + "loss": 0.3456, + "step": 8891 + }, + { + "epoch": 0.720349967595593, + "grad_norm": 0.03000030666589737, + "learning_rate": 0.00018220441964084794, + "loss": 0.3228, + "step": 8892 + }, + { + "epoch": 0.7204309786130914, + "grad_norm": 0.037756647914648056, + "learning_rate": 0.0001821999189882533, + "loss": 0.3243, + "step": 8893 + }, + { + "epoch": 0.7205119896305897, + "grad_norm": 0.03188977390527725, + "learning_rate": 0.00018219541833565866, + "loss": 0.2873, + "step": 8894 + }, + { + "epoch": 0.7205930006480882, + "grad_norm": 0.03612763434648514, + "learning_rate": 0.00018219091768306405, + "loss": 0.3652, + "step": 8895 + }, + { + "epoch": 0.7206740116655865, + "grad_norm": 0.032409533858299255, + "learning_rate": 0.00018218641703046944, + "loss": 0.3268, + "step": 8896 + }, + { + "epoch": 0.7207550226830849, + "grad_norm": 0.03179197385907173, + "learning_rate": 0.0001821819163778748, + "loss": 0.3328, + "step": 8897 + }, + { + "epoch": 0.7208360337005832, + "grad_norm": 0.03408501669764519, + "learning_rate": 0.00018217741572528018, + "loss": 0.3547, + "step": 8898 + }, + { + "epoch": 0.7209170447180817, + "grad_norm": 0.031727708876132965, + "learning_rate": 0.00018217291507268554, + "loss": 0.3515, + "step": 8899 + }, + { + "epoch": 0.7209980557355801, + "grad_norm": 0.033311303704977036, + "learning_rate": 0.0001821684144200909, + "loss": 0.3462, + "step": 8900 + }, + { + "epoch": 0.7210790667530784, + "grad_norm": 0.03319597244262695, + "learning_rate": 0.0001821639137674963, + "loss": 0.288, + "step": 8901 + }, + { + "epoch": 0.7211600777705768, + "grad_norm": 0.03720049187541008, + "learning_rate": 0.00018215941311490168, + "loss": 0.3932, + "step": 8902 + }, + { + "epoch": 0.7212410887880751, + "grad_norm": 0.034940432757139206, + "learning_rate": 0.00018215491246230704, + "loss": 0.3307, + "step": 8903 + }, + { + "epoch": 0.7213220998055736, + "grad_norm": 0.030946683138608932, + "learning_rate": 0.00018215041180971243, + "loss": 0.3118, + "step": 8904 + }, + { + "epoch": 0.721403110823072, + "grad_norm": 0.03591233864426613, + "learning_rate": 0.00018214591115711779, + "loss": 0.3601, + "step": 8905 + }, + { + "epoch": 0.7214841218405703, + "grad_norm": 0.03742532059550285, + "learning_rate": 0.00018214141050452315, + "loss": 0.334, + "step": 8906 + }, + { + "epoch": 0.7215651328580687, + "grad_norm": 0.032043810933828354, + "learning_rate": 0.00018213690985192853, + "loss": 0.3403, + "step": 8907 + }, + { + "epoch": 0.721646143875567, + "grad_norm": 0.031024204567074776, + "learning_rate": 0.00018213240919933392, + "loss": 0.3284, + "step": 8908 + }, + { + "epoch": 0.7217271548930655, + "grad_norm": 0.03355749696493149, + "learning_rate": 0.00018212790854673928, + "loss": 0.3646, + "step": 8909 + }, + { + "epoch": 0.7218081659105638, + "grad_norm": 0.033181991428136826, + "learning_rate": 0.00018212340789414467, + "loss": 0.3336, + "step": 8910 + }, + { + "epoch": 0.7218891769280622, + "grad_norm": 0.0307527594268322, + "learning_rate": 0.00018211890724155003, + "loss": 0.3282, + "step": 8911 + }, + { + "epoch": 0.7219701879455606, + "grad_norm": 0.035663995891809464, + "learning_rate": 0.0001821144065889554, + "loss": 0.3281, + "step": 8912 + }, + { + "epoch": 0.722051198963059, + 
"grad_norm": 0.030169153586030006, + "learning_rate": 0.0001821099059363608, + "loss": 0.293, + "step": 8913 + }, + { + "epoch": 0.7221322099805574, + "grad_norm": 0.03254105895757675, + "learning_rate": 0.00018210540528376616, + "loss": 0.3449, + "step": 8914 + }, + { + "epoch": 0.7222132209980557, + "grad_norm": 0.033432308584451675, + "learning_rate": 0.00018210090463117152, + "loss": 0.3908, + "step": 8915 + }, + { + "epoch": 0.7222942320155541, + "grad_norm": 0.03857411816716194, + "learning_rate": 0.0001820964039785769, + "loss": 0.3751, + "step": 8916 + }, + { + "epoch": 0.7223752430330524, + "grad_norm": 0.03752148523926735, + "learning_rate": 0.00018209190332598227, + "loss": 0.3848, + "step": 8917 + }, + { + "epoch": 0.7224562540505509, + "grad_norm": 0.03653761371970177, + "learning_rate": 0.00018208740267338763, + "loss": 0.3679, + "step": 8918 + }, + { + "epoch": 0.7225372650680493, + "grad_norm": 0.034680552780628204, + "learning_rate": 0.00018208290202079304, + "loss": 0.3571, + "step": 8919 + }, + { + "epoch": 0.7226182760855476, + "grad_norm": 0.028377162292599678, + "learning_rate": 0.0001820784013681984, + "loss": 0.337, + "step": 8920 + }, + { + "epoch": 0.722699287103046, + "grad_norm": 0.029166093096137047, + "learning_rate": 0.00018207390071560376, + "loss": 0.312, + "step": 8921 + }, + { + "epoch": 0.7227802981205443, + "grad_norm": 0.034398071467876434, + "learning_rate": 0.00018206940006300915, + "loss": 0.3587, + "step": 8922 + }, + { + "epoch": 0.7228613091380428, + "grad_norm": 0.03137093037366867, + "learning_rate": 0.0001820648994104145, + "loss": 0.3204, + "step": 8923 + }, + { + "epoch": 0.7229423201555412, + "grad_norm": 0.029238324612379074, + "learning_rate": 0.00018206039875781987, + "loss": 0.3021, + "step": 8924 + }, + { + "epoch": 0.7230233311730395, + "grad_norm": 0.028114039450883865, + "learning_rate": 0.00018205589810522528, + "loss": 0.3014, + "step": 8925 + }, + { + "epoch": 0.7231043421905379, + "grad_norm": 0.03557706996798515, + "learning_rate": 0.00018205139745263064, + "loss": 0.3371, + "step": 8926 + }, + { + "epoch": 0.7231853532080363, + "grad_norm": 0.03201264515519142, + "learning_rate": 0.000182046896800036, + "loss": 0.3082, + "step": 8927 + }, + { + "epoch": 0.7232663642255347, + "grad_norm": 0.03449628874659538, + "learning_rate": 0.0001820423961474414, + "loss": 0.3353, + "step": 8928 + }, + { + "epoch": 0.723347375243033, + "grad_norm": 0.029499752447009087, + "learning_rate": 0.00018203789549484675, + "loss": 0.3292, + "step": 8929 + }, + { + "epoch": 0.7234283862605314, + "grad_norm": 0.0337161123752594, + "learning_rate": 0.0001820333948422521, + "loss": 0.408, + "step": 8930 + }, + { + "epoch": 0.7235093972780298, + "grad_norm": 0.02877146005630493, + "learning_rate": 0.00018202889418965753, + "loss": 0.3312, + "step": 8931 + }, + { + "epoch": 0.7235904082955282, + "grad_norm": 0.03432083502411842, + "learning_rate": 0.00018202439353706289, + "loss": 0.3652, + "step": 8932 + }, + { + "epoch": 0.7236714193130266, + "grad_norm": 0.03515870124101639, + "learning_rate": 0.00018201989288446825, + "loss": 0.3677, + "step": 8933 + }, + { + "epoch": 0.7237524303305249, + "grad_norm": 0.028151020407676697, + "learning_rate": 0.00018201539223187363, + "loss": 0.3109, + "step": 8934 + }, + { + "epoch": 0.7238334413480233, + "grad_norm": 0.0321088470518589, + "learning_rate": 0.000182010891579279, + "loss": 0.34, + "step": 8935 + }, + { + "epoch": 0.7239144523655218, + "grad_norm": 0.030212152749300003, + "learning_rate": 
0.00018200639092668438, + "loss": 0.33, + "step": 8936 + }, + { + "epoch": 0.7239954633830201, + "grad_norm": 0.030875565484166145, + "learning_rate": 0.00018200189027408977, + "loss": 0.3162, + "step": 8937 + }, + { + "epoch": 0.7240764744005185, + "grad_norm": 0.02767636999487877, + "learning_rate": 0.00018199738962149513, + "loss": 0.2755, + "step": 8938 + }, + { + "epoch": 0.7241574854180168, + "grad_norm": 0.03224732726812363, + "learning_rate": 0.0001819928889689005, + "loss": 0.3359, + "step": 8939 + }, + { + "epoch": 0.7242384964355152, + "grad_norm": 0.027454540133476257, + "learning_rate": 0.00018198838831630587, + "loss": 0.3014, + "step": 8940 + }, + { + "epoch": 0.7243195074530137, + "grad_norm": 0.028916843235492706, + "learning_rate": 0.00018198388766371123, + "loss": 0.3156, + "step": 8941 + }, + { + "epoch": 0.724400518470512, + "grad_norm": 0.0313744843006134, + "learning_rate": 0.00018197938701111662, + "loss": 0.3091, + "step": 8942 + }, + { + "epoch": 0.7244815294880104, + "grad_norm": 0.029769916087388992, + "learning_rate": 0.000181974886358522, + "loss": 0.304, + "step": 8943 + }, + { + "epoch": 0.7245625405055087, + "grad_norm": 0.028017953038215637, + "learning_rate": 0.00018197038570592737, + "loss": 0.3212, + "step": 8944 + }, + { + "epoch": 0.7246435515230071, + "grad_norm": 0.02983788400888443, + "learning_rate": 0.00018196588505333273, + "loss": 0.3384, + "step": 8945 + }, + { + "epoch": 0.7247245625405055, + "grad_norm": 0.03687024116516113, + "learning_rate": 0.00018196138440073812, + "loss": 0.4235, + "step": 8946 + }, + { + "epoch": 0.7248055735580039, + "grad_norm": 0.02993650548160076, + "learning_rate": 0.00018195688374814348, + "loss": 0.3374, + "step": 8947 + }, + { + "epoch": 0.7248865845755023, + "grad_norm": 0.038119856268167496, + "learning_rate": 0.00018195238309554886, + "loss": 0.4032, + "step": 8948 + }, + { + "epoch": 0.7249675955930006, + "grad_norm": 0.02985922433435917, + "learning_rate": 0.00018194788244295425, + "loss": 0.3356, + "step": 8949 + }, + { + "epoch": 0.7250486066104991, + "grad_norm": 0.03387674316763878, + "learning_rate": 0.0001819433817903596, + "loss": 0.3578, + "step": 8950 + }, + { + "epoch": 0.7251296176279974, + "grad_norm": 0.03529488295316696, + "learning_rate": 0.00018193888113776497, + "loss": 0.3207, + "step": 8951 + }, + { + "epoch": 0.7252106286454958, + "grad_norm": 0.03560759872198105, + "learning_rate": 0.00018193438048517036, + "loss": 0.3062, + "step": 8952 + }, + { + "epoch": 0.7252916396629941, + "grad_norm": 0.02996155433356762, + "learning_rate": 0.00018192987983257572, + "loss": 0.3398, + "step": 8953 + }, + { + "epoch": 0.7253726506804925, + "grad_norm": 0.036760371178388596, + "learning_rate": 0.0001819253791799811, + "loss": 0.3428, + "step": 8954 + }, + { + "epoch": 0.725453661697991, + "grad_norm": 0.03608694672584534, + "learning_rate": 0.0001819208785273865, + "loss": 0.3686, + "step": 8955 + }, + { + "epoch": 0.7255346727154893, + "grad_norm": 0.03458326682448387, + "learning_rate": 0.00018191637787479185, + "loss": 0.3414, + "step": 8956 + }, + { + "epoch": 0.7256156837329877, + "grad_norm": 0.032023146748542786, + "learning_rate": 0.0001819118772221972, + "loss": 0.3464, + "step": 8957 + }, + { + "epoch": 0.725696694750486, + "grad_norm": 0.03176487609744072, + "learning_rate": 0.0001819073765696026, + "loss": 0.3124, + "step": 8958 + }, + { + "epoch": 0.7257777057679844, + "grad_norm": 0.0323118157684803, + "learning_rate": 0.00018190287591700796, + "loss": 0.3714, + "step": 8959 + }, + { 
+ "epoch": 0.7258587167854829, + "grad_norm": 0.034501854330301285, + "learning_rate": 0.00018189837526441335, + "loss": 0.3555, + "step": 8960 + }, + { + "epoch": 0.7259397278029812, + "grad_norm": 0.032414842396974564, + "learning_rate": 0.00018189387461181873, + "loss": 0.3498, + "step": 8961 + }, + { + "epoch": 0.7260207388204796, + "grad_norm": 0.03302488103508949, + "learning_rate": 0.0001818893739592241, + "loss": 0.3768, + "step": 8962 + }, + { + "epoch": 0.7261017498379779, + "grad_norm": 0.027776505798101425, + "learning_rate": 0.00018188487330662945, + "loss": 0.3008, + "step": 8963 + }, + { + "epoch": 0.7261827608554764, + "grad_norm": 0.03728478029370308, + "learning_rate": 0.00018188037265403484, + "loss": 0.4108, + "step": 8964 + }, + { + "epoch": 0.7262637718729748, + "grad_norm": 0.03290198743343353, + "learning_rate": 0.00018187587200144023, + "loss": 0.3348, + "step": 8965 + }, + { + "epoch": 0.7263447828904731, + "grad_norm": 0.03009003959596157, + "learning_rate": 0.0001818713713488456, + "loss": 0.3328, + "step": 8966 + }, + { + "epoch": 0.7264257939079715, + "grad_norm": 0.041864633560180664, + "learning_rate": 0.00018186687069625098, + "loss": 0.3214, + "step": 8967 + }, + { + "epoch": 0.7265068049254698, + "grad_norm": 0.03529899939894676, + "learning_rate": 0.00018186237004365634, + "loss": 0.3512, + "step": 8968 + }, + { + "epoch": 0.7265878159429683, + "grad_norm": 0.030811816453933716, + "learning_rate": 0.0001818578693910617, + "loss": 0.3534, + "step": 8969 + }, + { + "epoch": 0.7266688269604666, + "grad_norm": 0.02578016184270382, + "learning_rate": 0.00018185336873846708, + "loss": 0.3255, + "step": 8970 + }, + { + "epoch": 0.726749837977965, + "grad_norm": 0.03245696797966957, + "learning_rate": 0.00018184886808587247, + "loss": 0.2953, + "step": 8971 + }, + { + "epoch": 0.7268308489954634, + "grad_norm": 0.03241654485464096, + "learning_rate": 0.00018184436743327783, + "loss": 0.3336, + "step": 8972 + }, + { + "epoch": 0.7269118600129617, + "grad_norm": 0.032060008496046066, + "learning_rate": 0.00018183986678068322, + "loss": 0.3119, + "step": 8973 + }, + { + "epoch": 0.7269928710304602, + "grad_norm": 0.03207783028483391, + "learning_rate": 0.00018183536612808858, + "loss": 0.3818, + "step": 8974 + }, + { + "epoch": 0.7270738820479585, + "grad_norm": 0.03175025433301926, + "learning_rate": 0.00018183086547549394, + "loss": 0.3239, + "step": 8975 + }, + { + "epoch": 0.7271548930654569, + "grad_norm": 0.030761821195483208, + "learning_rate": 0.00018182636482289932, + "loss": 0.3228, + "step": 8976 + }, + { + "epoch": 0.7272359040829552, + "grad_norm": 0.03948160260915756, + "learning_rate": 0.0001818218641703047, + "loss": 0.4037, + "step": 8977 + }, + { + "epoch": 0.7273169151004537, + "grad_norm": 0.032243337482213974, + "learning_rate": 0.00018181736351771007, + "loss": 0.3502, + "step": 8978 + }, + { + "epoch": 0.7273979261179521, + "grad_norm": 0.03181379660964012, + "learning_rate": 0.00018181286286511546, + "loss": 0.3384, + "step": 8979 + }, + { + "epoch": 0.7274789371354504, + "grad_norm": 0.034360308200120926, + "learning_rate": 0.00018180836221252082, + "loss": 0.3473, + "step": 8980 + }, + { + "epoch": 0.7275599481529488, + "grad_norm": 0.03012571483850479, + "learning_rate": 0.00018180386155992618, + "loss": 0.3076, + "step": 8981 + }, + { + "epoch": 0.7276409591704471, + "grad_norm": 0.03172267973423004, + "learning_rate": 0.00018179936090733157, + "loss": 0.3409, + "step": 8982 + }, + { + "epoch": 0.7277219701879456, + "grad_norm": 
0.03266200050711632, + "learning_rate": 0.00018179486025473695, + "loss": 0.3604, + "step": 8983 + }, + { + "epoch": 0.727802981205444, + "grad_norm": 0.03298585116863251, + "learning_rate": 0.0001817903596021423, + "loss": 0.3856, + "step": 8984 + }, + { + "epoch": 0.7278839922229423, + "grad_norm": 0.032270126044750214, + "learning_rate": 0.0001817858589495477, + "loss": 0.3444, + "step": 8985 + }, + { + "epoch": 0.7279650032404407, + "grad_norm": 0.030016066506505013, + "learning_rate": 0.00018178135829695306, + "loss": 0.3222, + "step": 8986 + }, + { + "epoch": 0.7280460142579391, + "grad_norm": 0.038993485271930695, + "learning_rate": 0.00018177685764435842, + "loss": 0.3631, + "step": 8987 + }, + { + "epoch": 0.7281270252754375, + "grad_norm": 0.03430574759840965, + "learning_rate": 0.0001817723569917638, + "loss": 0.338, + "step": 8988 + }, + { + "epoch": 0.7282080362929358, + "grad_norm": 0.031583987176418304, + "learning_rate": 0.0001817678563391692, + "loss": 0.3672, + "step": 8989 + }, + { + "epoch": 0.7282890473104342, + "grad_norm": 0.02773544006049633, + "learning_rate": 0.00018176335568657455, + "loss": 0.3033, + "step": 8990 + }, + { + "epoch": 0.7283700583279326, + "grad_norm": 0.03437868878245354, + "learning_rate": 0.00018175885503397994, + "loss": 0.3621, + "step": 8991 + }, + { + "epoch": 0.728451069345431, + "grad_norm": 0.03022439405322075, + "learning_rate": 0.0001817543543813853, + "loss": 0.3064, + "step": 8992 + }, + { + "epoch": 0.7285320803629294, + "grad_norm": 0.032135121524333954, + "learning_rate": 0.00018174985372879066, + "loss": 0.3562, + "step": 8993 + }, + { + "epoch": 0.7286130913804277, + "grad_norm": 0.03385743498802185, + "learning_rate": 0.00018174535307619608, + "loss": 0.3733, + "step": 8994 + }, + { + "epoch": 0.7286941023979261, + "grad_norm": 0.031596675515174866, + "learning_rate": 0.00018174085242360144, + "loss": 0.3298, + "step": 8995 + }, + { + "epoch": 0.7287751134154244, + "grad_norm": 0.029566364362835884, + "learning_rate": 0.0001817363517710068, + "loss": 0.3548, + "step": 8996 + }, + { + "epoch": 0.7288561244329229, + "grad_norm": 0.031259965151548386, + "learning_rate": 0.00018173185111841218, + "loss": 0.3474, + "step": 8997 + }, + { + "epoch": 0.7289371354504213, + "grad_norm": 0.03048304282128811, + "learning_rate": 0.00018172735046581754, + "loss": 0.3289, + "step": 8998 + }, + { + "epoch": 0.7290181464679196, + "grad_norm": 0.03218503296375275, + "learning_rate": 0.0001817228498132229, + "loss": 0.2826, + "step": 8999 + }, + { + "epoch": 0.729099157485418, + "grad_norm": 0.030204620212316513, + "learning_rate": 0.00018171834916062832, + "loss": 0.3565, + "step": 9000 + }, + { + "epoch": 0.7291801685029164, + "grad_norm": 0.03479839116334915, + "learning_rate": 0.00018171384850803368, + "loss": 0.3533, + "step": 9001 + }, + { + "epoch": 0.7292611795204148, + "grad_norm": 0.035329870879650116, + "learning_rate": 0.00018170934785543904, + "loss": 0.3319, + "step": 9002 + }, + { + "epoch": 0.7293421905379132, + "grad_norm": 0.03221248835325241, + "learning_rate": 0.00018170484720284443, + "loss": 0.3336, + "step": 9003 + }, + { + "epoch": 0.7294232015554115, + "grad_norm": 0.031972408294677734, + "learning_rate": 0.00018170034655024979, + "loss": 0.3342, + "step": 9004 + }, + { + "epoch": 0.7295042125729099, + "grad_norm": 0.03126239404082298, + "learning_rate": 0.00018169584589765517, + "loss": 0.3445, + "step": 9005 + }, + { + "epoch": 0.7295852235904083, + "grad_norm": 0.030894698575139046, + "learning_rate": 
0.00018169134524506056, + "loss": 0.3464, + "step": 9006 + }, + { + "epoch": 0.7296662346079067, + "grad_norm": 0.032019056379795074, + "learning_rate": 0.00018168684459246592, + "loss": 0.3407, + "step": 9007 + }, + { + "epoch": 0.729747245625405, + "grad_norm": 0.03275280445814133, + "learning_rate": 0.00018168234393987128, + "loss": 0.3025, + "step": 9008 + }, + { + "epoch": 0.7298282566429034, + "grad_norm": 0.02743900939822197, + "learning_rate": 0.00018167784328727667, + "loss": 0.3116, + "step": 9009 + }, + { + "epoch": 0.7299092676604018, + "grad_norm": 0.02820882946252823, + "learning_rate": 0.00018167334263468203, + "loss": 0.2914, + "step": 9010 + }, + { + "epoch": 0.7299902786779002, + "grad_norm": 0.03033355250954628, + "learning_rate": 0.00018166884198208741, + "loss": 0.3626, + "step": 9011 + }, + { + "epoch": 0.7300712896953986, + "grad_norm": 0.03902474045753479, + "learning_rate": 0.0001816643413294928, + "loss": 0.3859, + "step": 9012 + }, + { + "epoch": 0.7301523007128969, + "grad_norm": 0.03890910744667053, + "learning_rate": 0.00018165984067689816, + "loss": 0.3857, + "step": 9013 + }, + { + "epoch": 0.7302333117303953, + "grad_norm": 0.03715941682457924, + "learning_rate": 0.00018165534002430352, + "loss": 0.3821, + "step": 9014 + }, + { + "epoch": 0.7303143227478938, + "grad_norm": 0.039741143584251404, + "learning_rate": 0.0001816508393717089, + "loss": 0.3778, + "step": 9015 + }, + { + "epoch": 0.7303953337653921, + "grad_norm": 0.03291528299450874, + "learning_rate": 0.00018164633871911427, + "loss": 0.3162, + "step": 9016 + }, + { + "epoch": 0.7304763447828905, + "grad_norm": 0.03132300078868866, + "learning_rate": 0.00018164183806651966, + "loss": 0.3653, + "step": 9017 + }, + { + "epoch": 0.7305573558003888, + "grad_norm": 0.03031282313168049, + "learning_rate": 0.00018163733741392504, + "loss": 0.3354, + "step": 9018 + }, + { + "epoch": 0.7306383668178872, + "grad_norm": 0.03178204596042633, + "learning_rate": 0.0001816328367613304, + "loss": 0.324, + "step": 9019 + }, + { + "epoch": 0.7307193778353857, + "grad_norm": 0.03401080146431923, + "learning_rate": 0.00018162833610873576, + "loss": 0.3569, + "step": 9020 + }, + { + "epoch": 0.730800388852884, + "grad_norm": 0.03225165233016014, + "learning_rate": 0.00018162383545614115, + "loss": 0.3675, + "step": 9021 + }, + { + "epoch": 0.7308813998703824, + "grad_norm": 0.03994085639715195, + "learning_rate": 0.0001816193348035465, + "loss": 0.3501, + "step": 9022 + }, + { + "epoch": 0.7309624108878807, + "grad_norm": 0.03080485202372074, + "learning_rate": 0.0001816148341509519, + "loss": 0.3166, + "step": 9023 + }, + { + "epoch": 0.7310434219053791, + "grad_norm": 0.03139037266373634, + "learning_rate": 0.00018161033349835728, + "loss": 0.3216, + "step": 9024 + }, + { + "epoch": 0.7311244329228775, + "grad_norm": 0.03789443522691727, + "learning_rate": 0.00018160583284576264, + "loss": 0.3481, + "step": 9025 + }, + { + "epoch": 0.7312054439403759, + "grad_norm": 0.029310574755072594, + "learning_rate": 0.000181601332193168, + "loss": 0.2941, + "step": 9026 + }, + { + "epoch": 0.7312864549578743, + "grad_norm": 0.03206866607069969, + "learning_rate": 0.0001815968315405734, + "loss": 0.3174, + "step": 9027 + }, + { + "epoch": 0.7313674659753726, + "grad_norm": 0.031277187168598175, + "learning_rate": 0.00018159233088797875, + "loss": 0.2838, + "step": 9028 + }, + { + "epoch": 0.7314484769928711, + "grad_norm": 0.03462287411093712, + "learning_rate": 0.00018158783023538414, + "loss": 0.367, + "step": 9029 + }, + { 
+ "epoch": 0.7315294880103694, + "grad_norm": 0.031755078583955765, + "learning_rate": 0.00018158332958278953, + "loss": 0.3271, + "step": 9030 + }, + { + "epoch": 0.7316104990278678, + "grad_norm": 0.03572135418653488, + "learning_rate": 0.00018157882893019489, + "loss": 0.3138, + "step": 9031 + }, + { + "epoch": 0.7316915100453661, + "grad_norm": 0.03189965337514877, + "learning_rate": 0.00018157432827760025, + "loss": 0.2958, + "step": 9032 + }, + { + "epoch": 0.7317725210628645, + "grad_norm": 0.033261414617300034, + "learning_rate": 0.00018156982762500563, + "loss": 0.3294, + "step": 9033 + }, + { + "epoch": 0.731853532080363, + "grad_norm": 0.0299600288271904, + "learning_rate": 0.000181565326972411, + "loss": 0.3578, + "step": 9034 + }, + { + "epoch": 0.7319345430978613, + "grad_norm": 0.0337442122399807, + "learning_rate": 0.00018156082631981638, + "loss": 0.3425, + "step": 9035 + }, + { + "epoch": 0.7320155541153597, + "grad_norm": 0.0325690433382988, + "learning_rate": 0.00018155632566722177, + "loss": 0.3504, + "step": 9036 + }, + { + "epoch": 0.732096565132858, + "grad_norm": 0.029540935531258583, + "learning_rate": 0.00018155182501462713, + "loss": 0.3249, + "step": 9037 + }, + { + "epoch": 0.7321775761503565, + "grad_norm": 0.03266843780875206, + "learning_rate": 0.0001815473243620325, + "loss": 0.3712, + "step": 9038 + }, + { + "epoch": 0.7322585871678549, + "grad_norm": 0.031022358685731888, + "learning_rate": 0.00018154282370943788, + "loss": 0.3624, + "step": 9039 + }, + { + "epoch": 0.7323395981853532, + "grad_norm": 0.03409833088517189, + "learning_rate": 0.00018153832305684324, + "loss": 0.3533, + "step": 9040 + }, + { + "epoch": 0.7324206092028516, + "grad_norm": 0.02747887372970581, + "learning_rate": 0.00018153382240424862, + "loss": 0.2489, + "step": 9041 + }, + { + "epoch": 0.7325016202203499, + "grad_norm": 0.03128967061638832, + "learning_rate": 0.000181529321751654, + "loss": 0.3403, + "step": 9042 + }, + { + "epoch": 0.7325826312378484, + "grad_norm": 0.03193770721554756, + "learning_rate": 0.00018152482109905937, + "loss": 0.3551, + "step": 9043 + }, + { + "epoch": 0.7326636422553467, + "grad_norm": 0.03336536884307861, + "learning_rate": 0.00018152032044646473, + "loss": 0.3658, + "step": 9044 + }, + { + "epoch": 0.7327446532728451, + "grad_norm": 0.029813924804329872, + "learning_rate": 0.00018151581979387012, + "loss": 0.3186, + "step": 9045 + }, + { + "epoch": 0.7328256642903435, + "grad_norm": 0.03379664942622185, + "learning_rate": 0.0001815113191412755, + "loss": 0.3077, + "step": 9046 + }, + { + "epoch": 0.7329066753078418, + "grad_norm": 0.032446689903736115, + "learning_rate": 0.00018150681848868086, + "loss": 0.3538, + "step": 9047 + }, + { + "epoch": 0.7329876863253403, + "grad_norm": 0.03147300332784653, + "learning_rate": 0.00018150231783608625, + "loss": 0.3358, + "step": 9048 + }, + { + "epoch": 0.7330686973428386, + "grad_norm": 0.031243788078427315, + "learning_rate": 0.0001814978171834916, + "loss": 0.324, + "step": 9049 + }, + { + "epoch": 0.733149708360337, + "grad_norm": 0.031456124037504196, + "learning_rate": 0.00018149331653089697, + "loss": 0.354, + "step": 9050 + }, + { + "epoch": 0.7332307193778353, + "grad_norm": 0.030385632067918777, + "learning_rate": 0.00018148881587830236, + "loss": 0.3145, + "step": 9051 + }, + { + "epoch": 0.7333117303953338, + "grad_norm": 0.03331984207034111, + "learning_rate": 0.00018148431522570775, + "loss": 0.3716, + "step": 9052 + }, + { + "epoch": 0.7333927414128322, + "grad_norm": 
0.02952803298830986, + "learning_rate": 0.0001814798145731131, + "loss": 0.3105, + "step": 9053 + }, + { + "epoch": 0.7334737524303305, + "grad_norm": 0.031981199979782104, + "learning_rate": 0.0001814753139205185, + "loss": 0.369, + "step": 9054 + }, + { + "epoch": 0.7335547634478289, + "grad_norm": 0.03284860774874687, + "learning_rate": 0.00018147081326792385, + "loss": 0.3362, + "step": 9055 + }, + { + "epoch": 0.7336357744653272, + "grad_norm": 0.03138614073395729, + "learning_rate": 0.0001814663126153292, + "loss": 0.3132, + "step": 9056 + }, + { + "epoch": 0.7337167854828257, + "grad_norm": 0.032481104135513306, + "learning_rate": 0.0001814618119627346, + "loss": 0.3668, + "step": 9057 + }, + { + "epoch": 0.7337977965003241, + "grad_norm": 0.0347520187497139, + "learning_rate": 0.00018145731131014, + "loss": 0.327, + "step": 9058 + }, + { + "epoch": 0.7338788075178224, + "grad_norm": 0.034793466329574585, + "learning_rate": 0.00018145281065754535, + "loss": 0.3724, + "step": 9059 + }, + { + "epoch": 0.7339598185353208, + "grad_norm": 0.04012209549546242, + "learning_rate": 0.00018144831000495073, + "loss": 0.346, + "step": 9060 + }, + { + "epoch": 0.7340408295528191, + "grad_norm": 0.029469860717654228, + "learning_rate": 0.0001814438093523561, + "loss": 0.3149, + "step": 9061 + }, + { + "epoch": 0.7341218405703176, + "grad_norm": 0.028441384434700012, + "learning_rate": 0.00018143930869976145, + "loss": 0.2956, + "step": 9062 + }, + { + "epoch": 0.734202851587816, + "grad_norm": 0.03293197229504585, + "learning_rate": 0.00018143480804716684, + "loss": 0.3505, + "step": 9063 + }, + { + "epoch": 0.7342838626053143, + "grad_norm": 0.03378473222255707, + "learning_rate": 0.00018143030739457223, + "loss": 0.3419, + "step": 9064 + }, + { + "epoch": 0.7343648736228127, + "grad_norm": 0.0314219705760479, + "learning_rate": 0.0001814258067419776, + "loss": 0.3001, + "step": 9065 + }, + { + "epoch": 0.7344458846403111, + "grad_norm": 0.029219215735793114, + "learning_rate": 0.00018142130608938298, + "loss": 0.3235, + "step": 9066 + }, + { + "epoch": 0.7345268956578095, + "grad_norm": 0.029307449236512184, + "learning_rate": 0.00018141680543678834, + "loss": 0.3082, + "step": 9067 + }, + { + "epoch": 0.7346079066753078, + "grad_norm": 0.029544701799750328, + "learning_rate": 0.0001814123047841937, + "loss": 0.3379, + "step": 9068 + }, + { + "epoch": 0.7346889176928062, + "grad_norm": 0.039076659828424454, + "learning_rate": 0.00018140780413159908, + "loss": 0.377, + "step": 9069 + }, + { + "epoch": 0.7347699287103046, + "grad_norm": 0.029530270025134087, + "learning_rate": 0.00018140330347900447, + "loss": 0.3219, + "step": 9070 + }, + { + "epoch": 0.734850939727803, + "grad_norm": 0.03346060961484909, + "learning_rate": 0.00018139880282640983, + "loss": 0.3302, + "step": 9071 + }, + { + "epoch": 0.7349319507453014, + "grad_norm": 0.03790099173784256, + "learning_rate": 0.00018139430217381522, + "loss": 0.3431, + "step": 9072 + }, + { + "epoch": 0.7350129617627997, + "grad_norm": 0.031614601612091064, + "learning_rate": 0.00018138980152122058, + "loss": 0.3386, + "step": 9073 + }, + { + "epoch": 0.7350939727802981, + "grad_norm": 0.031407468020915985, + "learning_rate": 0.00018138530086862596, + "loss": 0.3031, + "step": 9074 + }, + { + "epoch": 0.7351749837977966, + "grad_norm": 0.03491174429655075, + "learning_rate": 0.00018138080021603135, + "loss": 0.3829, + "step": 9075 + }, + { + "epoch": 0.7352559948152949, + "grad_norm": 0.03396368399262428, + "learning_rate": 0.0001813762995634367, + 
"loss": 0.3787, + "step": 9076 + }, + { + "epoch": 0.7353370058327933, + "grad_norm": 0.037365756928920746, + "learning_rate": 0.00018137179891084207, + "loss": 0.3225, + "step": 9077 + }, + { + "epoch": 0.7354180168502916, + "grad_norm": 0.031209466978907585, + "learning_rate": 0.00018136729825824746, + "loss": 0.3593, + "step": 9078 + }, + { + "epoch": 0.73549902786779, + "grad_norm": 0.0321301631629467, + "learning_rate": 0.00018136279760565282, + "loss": 0.3472, + "step": 9079 + }, + { + "epoch": 0.7355800388852884, + "grad_norm": 0.031743042171001434, + "learning_rate": 0.0001813582969530582, + "loss": 0.3606, + "step": 9080 + }, + { + "epoch": 0.7356610499027868, + "grad_norm": 0.030633676797151566, + "learning_rate": 0.0001813537963004636, + "loss": 0.3498, + "step": 9081 + }, + { + "epoch": 0.7357420609202852, + "grad_norm": 0.031029678881168365, + "learning_rate": 0.00018134929564786895, + "loss": 0.3216, + "step": 9082 + }, + { + "epoch": 0.7358230719377835, + "grad_norm": 0.030585667118430138, + "learning_rate": 0.00018134479499527431, + "loss": 0.3398, + "step": 9083 + }, + { + "epoch": 0.7359040829552819, + "grad_norm": 0.031699955463409424, + "learning_rate": 0.0001813402943426797, + "loss": 0.3434, + "step": 9084 + }, + { + "epoch": 0.7359850939727803, + "grad_norm": 0.03134704753756523, + "learning_rate": 0.00018133579369008506, + "loss": 0.3384, + "step": 9085 + }, + { + "epoch": 0.7360661049902787, + "grad_norm": 0.03340228646993637, + "learning_rate": 0.00018133129303749045, + "loss": 0.3712, + "step": 9086 + }, + { + "epoch": 0.736147116007777, + "grad_norm": 0.03207501396536827, + "learning_rate": 0.00018132679238489584, + "loss": 0.3361, + "step": 9087 + }, + { + "epoch": 0.7362281270252754, + "grad_norm": 0.03203867748379707, + "learning_rate": 0.0001813222917323012, + "loss": 0.3264, + "step": 9088 + }, + { + "epoch": 0.7363091380427739, + "grad_norm": 0.028404178097844124, + "learning_rate": 0.00018131779107970656, + "loss": 0.3341, + "step": 9089 + }, + { + "epoch": 0.7363901490602722, + "grad_norm": 0.028629284352064133, + "learning_rate": 0.00018131329042711194, + "loss": 0.3268, + "step": 9090 + }, + { + "epoch": 0.7364711600777706, + "grad_norm": 0.03248591721057892, + "learning_rate": 0.0001813087897745173, + "loss": 0.3821, + "step": 9091 + }, + { + "epoch": 0.7365521710952689, + "grad_norm": 0.0325838178396225, + "learning_rate": 0.0001813042891219227, + "loss": 0.359, + "step": 9092 + }, + { + "epoch": 0.7366331821127673, + "grad_norm": 0.034078460186719894, + "learning_rate": 0.00018129978846932808, + "loss": 0.3479, + "step": 9093 + }, + { + "epoch": 0.7367141931302658, + "grad_norm": 0.04527908191084862, + "learning_rate": 0.00018129528781673344, + "loss": 0.37, + "step": 9094 + }, + { + "epoch": 0.7367952041477641, + "grad_norm": 0.031279996037483215, + "learning_rate": 0.0001812907871641388, + "loss": 0.3571, + "step": 9095 + }, + { + "epoch": 0.7368762151652625, + "grad_norm": 0.02940979227423668, + "learning_rate": 0.00018128628651154418, + "loss": 0.3211, + "step": 9096 + }, + { + "epoch": 0.7369572261827608, + "grad_norm": 0.03120758943259716, + "learning_rate": 0.00018128178585894954, + "loss": 0.365, + "step": 9097 + }, + { + "epoch": 0.7370382372002592, + "grad_norm": 0.029934510588645935, + "learning_rate": 0.00018127728520635493, + "loss": 0.3587, + "step": 9098 + }, + { + "epoch": 0.7371192482177576, + "grad_norm": 0.02927226759493351, + "learning_rate": 0.00018127278455376032, + "loss": 0.3283, + "step": 9099 + }, + { + "epoch": 
0.737200259235256, + "grad_norm": 0.03264165297150612, + "learning_rate": 0.00018126828390116568, + "loss": 0.3582, + "step": 9100 + }, + { + "epoch": 0.7372812702527544, + "grad_norm": 0.027434570714831352, + "learning_rate": 0.00018126378324857104, + "loss": 0.3311, + "step": 9101 + }, + { + "epoch": 0.7373622812702527, + "grad_norm": 0.02903290092945099, + "learning_rate": 0.00018125928259597643, + "loss": 0.3428, + "step": 9102 + }, + { + "epoch": 0.7374432922877512, + "grad_norm": 0.03369829058647156, + "learning_rate": 0.00018125478194338179, + "loss": 0.3104, + "step": 9103 + }, + { + "epoch": 0.7375243033052495, + "grad_norm": 0.03176895156502724, + "learning_rate": 0.00018125028129078717, + "loss": 0.3222, + "step": 9104 + }, + { + "epoch": 0.7376053143227479, + "grad_norm": 0.03280607610940933, + "learning_rate": 0.00018124578063819256, + "loss": 0.3583, + "step": 9105 + }, + { + "epoch": 0.7376863253402463, + "grad_norm": 0.033629160374403, + "learning_rate": 0.00018124127998559792, + "loss": 0.3244, + "step": 9106 + }, + { + "epoch": 0.7377673363577446, + "grad_norm": 0.03247423097491264, + "learning_rate": 0.00018123677933300328, + "loss": 0.3439, + "step": 9107 + }, + { + "epoch": 0.7378483473752431, + "grad_norm": 0.04204316437244415, + "learning_rate": 0.00018123227868040867, + "loss": 0.3505, + "step": 9108 + }, + { + "epoch": 0.7379293583927414, + "grad_norm": 0.03633909299969673, + "learning_rate": 0.00018122777802781403, + "loss": 0.3615, + "step": 9109 + }, + { + "epoch": 0.7380103694102398, + "grad_norm": 0.031753137707710266, + "learning_rate": 0.00018122327737521941, + "loss": 0.3329, + "step": 9110 + }, + { + "epoch": 0.7380913804277381, + "grad_norm": 0.033632081001996994, + "learning_rate": 0.0001812187767226248, + "loss": 0.3854, + "step": 9111 + }, + { + "epoch": 0.7381723914452365, + "grad_norm": 0.03247777372598648, + "learning_rate": 0.00018121427607003016, + "loss": 0.3689, + "step": 9112 + }, + { + "epoch": 0.738253402462735, + "grad_norm": 0.033727072179317474, + "learning_rate": 0.00018120977541743552, + "loss": 0.39, + "step": 9113 + }, + { + "epoch": 0.7383344134802333, + "grad_norm": 0.03229176253080368, + "learning_rate": 0.0001812052747648409, + "loss": 0.3513, + "step": 9114 + }, + { + "epoch": 0.7384154244977317, + "grad_norm": 0.028008291497826576, + "learning_rate": 0.00018120077411224627, + "loss": 0.3261, + "step": 9115 + }, + { + "epoch": 0.73849643551523, + "grad_norm": 0.03333825245499611, + "learning_rate": 0.00018119627345965166, + "loss": 0.3434, + "step": 9116 + }, + { + "epoch": 0.7385774465327285, + "grad_norm": 0.033771634101867676, + "learning_rate": 0.00018119177280705704, + "loss": 0.3979, + "step": 9117 + }, + { + "epoch": 0.7386584575502269, + "grad_norm": 0.02980533428490162, + "learning_rate": 0.0001811872721544624, + "loss": 0.3241, + "step": 9118 + }, + { + "epoch": 0.7387394685677252, + "grad_norm": 0.030567575246095657, + "learning_rate": 0.00018118277150186776, + "loss": 0.3496, + "step": 9119 + }, + { + "epoch": 0.7388204795852236, + "grad_norm": 0.03218156844377518, + "learning_rate": 0.00018117827084927315, + "loss": 0.3302, + "step": 9120 + }, + { + "epoch": 0.7389014906027219, + "grad_norm": 0.02872970513999462, + "learning_rate": 0.0001811737701966785, + "loss": 0.2925, + "step": 9121 + }, + { + "epoch": 0.7389825016202204, + "grad_norm": 0.0280693881213665, + "learning_rate": 0.0001811692695440839, + "loss": 0.2888, + "step": 9122 + }, + { + "epoch": 0.7390635126377187, + "grad_norm": 0.03380297124385834, + 
"learning_rate": 0.00018116476889148928, + "loss": 0.3334, + "step": 9123 + }, + { + "epoch": 0.7391445236552171, + "grad_norm": 0.029702387750148773, + "learning_rate": 0.00018116026823889464, + "loss": 0.3116, + "step": 9124 + }, + { + "epoch": 0.7392255346727155, + "grad_norm": 0.03500824049115181, + "learning_rate": 0.0001811557675863, + "loss": 0.3944, + "step": 9125 + }, + { + "epoch": 0.7393065456902139, + "grad_norm": 0.03508531674742699, + "learning_rate": 0.0001811512669337054, + "loss": 0.3393, + "step": 9126 + }, + { + "epoch": 0.7393875567077123, + "grad_norm": 0.03019407019019127, + "learning_rate": 0.00018114676628111078, + "loss": 0.3625, + "step": 9127 + }, + { + "epoch": 0.7394685677252106, + "grad_norm": 0.02897045575082302, + "learning_rate": 0.00018114226562851614, + "loss": 0.3062, + "step": 9128 + }, + { + "epoch": 0.739549578742709, + "grad_norm": 0.03262624889612198, + "learning_rate": 0.00018113776497592153, + "loss": 0.3678, + "step": 9129 + }, + { + "epoch": 0.7396305897602073, + "grad_norm": 0.031078308820724487, + "learning_rate": 0.0001811332643233269, + "loss": 0.3602, + "step": 9130 + }, + { + "epoch": 0.7397116007777058, + "grad_norm": 0.034019969403743744, + "learning_rate": 0.00018112876367073225, + "loss": 0.33, + "step": 9131 + }, + { + "epoch": 0.7397926117952042, + "grad_norm": 0.033825770020484924, + "learning_rate": 0.00018112426301813763, + "loss": 0.3672, + "step": 9132 + }, + { + "epoch": 0.7398736228127025, + "grad_norm": 0.03215663880109787, + "learning_rate": 0.00018111976236554302, + "loss": 0.3426, + "step": 9133 + }, + { + "epoch": 0.7399546338302009, + "grad_norm": 0.031370263546705246, + "learning_rate": 0.00018111526171294838, + "loss": 0.3459, + "step": 9134 + }, + { + "epoch": 0.7400356448476992, + "grad_norm": 0.031213058158755302, + "learning_rate": 0.00018111076106035377, + "loss": 0.3464, + "step": 9135 + }, + { + "epoch": 0.7401166558651977, + "grad_norm": 0.031317610293626785, + "learning_rate": 0.00018110626040775913, + "loss": 0.3533, + "step": 9136 + }, + { + "epoch": 0.7401976668826961, + "grad_norm": 0.028887102380394936, + "learning_rate": 0.00018110175975516452, + "loss": 0.2923, + "step": 9137 + }, + { + "epoch": 0.7402786779001944, + "grad_norm": 0.030203672125935555, + "learning_rate": 0.00018109725910256988, + "loss": 0.3149, + "step": 9138 + }, + { + "epoch": 0.7403596889176928, + "grad_norm": 0.03250570967793465, + "learning_rate": 0.00018109275844997526, + "loss": 0.3381, + "step": 9139 + }, + { + "epoch": 0.7404406999351912, + "grad_norm": 0.02843714877963066, + "learning_rate": 0.00018108825779738062, + "loss": 0.2859, + "step": 9140 + }, + { + "epoch": 0.7405217109526896, + "grad_norm": 0.038352809846401215, + "learning_rate": 0.000181083757144786, + "loss": 0.3449, + "step": 9141 + }, + { + "epoch": 0.740602721970188, + "grad_norm": 0.030800314620137215, + "learning_rate": 0.00018107925649219137, + "loss": 0.3251, + "step": 9142 + }, + { + "epoch": 0.7406837329876863, + "grad_norm": 0.031433526426553726, + "learning_rate": 0.00018107475583959676, + "loss": 0.3338, + "step": 9143 + }, + { + "epoch": 0.7407647440051847, + "grad_norm": 0.038461629301309586, + "learning_rate": 0.00018107025518700212, + "loss": 0.3632, + "step": 9144 + }, + { + "epoch": 0.7408457550226831, + "grad_norm": 0.03290737792849541, + "learning_rate": 0.0001810657545344075, + "loss": 0.3538, + "step": 9145 + }, + { + "epoch": 0.7409267660401815, + "grad_norm": 0.02991539239883423, + "learning_rate": 0.00018106125388181286, + "loss": 0.3025, 
+ "step": 9146 + }, + { + "epoch": 0.7410077770576798, + "grad_norm": 0.03495603799819946, + "learning_rate": 0.00018105675322921825, + "loss": 0.3859, + "step": 9147 + }, + { + "epoch": 0.7410887880751782, + "grad_norm": 0.031578708440065384, + "learning_rate": 0.0001810522525766236, + "loss": 0.3463, + "step": 9148 + }, + { + "epoch": 0.7411697990926766, + "grad_norm": 0.031590063124895096, + "learning_rate": 0.000181047751924029, + "loss": 0.3333, + "step": 9149 + }, + { + "epoch": 0.741250810110175, + "grad_norm": 0.03381989896297455, + "learning_rate": 0.00018104325127143439, + "loss": 0.3665, + "step": 9150 + }, + { + "epoch": 0.7413318211276734, + "grad_norm": 0.03611378371715546, + "learning_rate": 0.00018103875061883975, + "loss": 0.3665, + "step": 9151 + }, + { + "epoch": 0.7414128321451717, + "grad_norm": 0.03055768832564354, + "learning_rate": 0.0001810342499662451, + "loss": 0.3569, + "step": 9152 + }, + { + "epoch": 0.7414938431626701, + "grad_norm": 0.03822949901223183, + "learning_rate": 0.0001810297493136505, + "loss": 0.3644, + "step": 9153 + }, + { + "epoch": 0.7415748541801686, + "grad_norm": 0.03513355553150177, + "learning_rate": 0.00018102524866105585, + "loss": 0.3741, + "step": 9154 + }, + { + "epoch": 0.7416558651976669, + "grad_norm": 0.028875524178147316, + "learning_rate": 0.00018102074800846124, + "loss": 0.2756, + "step": 9155 + }, + { + "epoch": 0.7417368762151653, + "grad_norm": 0.03183237463235855, + "learning_rate": 0.00018101624735586663, + "loss": 0.3582, + "step": 9156 + }, + { + "epoch": 0.7418178872326636, + "grad_norm": 0.03398773819208145, + "learning_rate": 0.000181011746703272, + "loss": 0.362, + "step": 9157 + }, + { + "epoch": 0.741898898250162, + "grad_norm": 0.03188676759600639, + "learning_rate": 0.00018100724605067735, + "loss": 0.3328, + "step": 9158 + }, + { + "epoch": 0.7419799092676604, + "grad_norm": 0.03117707185447216, + "learning_rate": 0.00018100274539808273, + "loss": 0.3177, + "step": 9159 + }, + { + "epoch": 0.7420609202851588, + "grad_norm": 0.03517724573612213, + "learning_rate": 0.0001809982447454881, + "loss": 0.3785, + "step": 9160 + }, + { + "epoch": 0.7421419313026572, + "grad_norm": 0.03325748071074486, + "learning_rate": 0.00018099374409289348, + "loss": 0.3567, + "step": 9161 + }, + { + "epoch": 0.7422229423201555, + "grad_norm": 0.03816870599985123, + "learning_rate": 0.00018098924344029887, + "loss": 0.4106, + "step": 9162 + }, + { + "epoch": 0.7423039533376539, + "grad_norm": 0.032118070870637894, + "learning_rate": 0.00018098474278770423, + "loss": 0.399, + "step": 9163 + }, + { + "epoch": 0.7423849643551523, + "grad_norm": 0.032386768609285355, + "learning_rate": 0.0001809802421351096, + "loss": 0.3135, + "step": 9164 + }, + { + "epoch": 0.7424659753726507, + "grad_norm": 0.0341571681201458, + "learning_rate": 0.00018097574148251498, + "loss": 0.3677, + "step": 9165 + }, + { + "epoch": 0.742546986390149, + "grad_norm": 0.03333856537938118, + "learning_rate": 0.00018097124082992034, + "loss": 0.3186, + "step": 9166 + }, + { + "epoch": 0.7426279974076474, + "grad_norm": 0.028618505224585533, + "learning_rate": 0.00018096674017732572, + "loss": 0.3031, + "step": 9167 + }, + { + "epoch": 0.7427090084251459, + "grad_norm": 0.029666319489479065, + "learning_rate": 0.0001809622395247311, + "loss": 0.3142, + "step": 9168 + }, + { + "epoch": 0.7427900194426442, + "grad_norm": 0.036109957844018936, + "learning_rate": 0.00018095773887213647, + "loss": 0.3375, + "step": 9169 + }, + { + "epoch": 0.7428710304601426, + 
"grad_norm": 0.03363575413823128, + "learning_rate": 0.00018095323821954183, + "loss": 0.3346, + "step": 9170 + }, + { + "epoch": 0.7429520414776409, + "grad_norm": 0.03813802823424339, + "learning_rate": 0.00018094873756694722, + "loss": 0.3885, + "step": 9171 + }, + { + "epoch": 0.7430330524951393, + "grad_norm": 0.03305772319436073, + "learning_rate": 0.00018094423691435258, + "loss": 0.3521, + "step": 9172 + }, + { + "epoch": 0.7431140635126378, + "grad_norm": 0.031648457050323486, + "learning_rate": 0.00018093973626175797, + "loss": 0.3652, + "step": 9173 + }, + { + "epoch": 0.7431950745301361, + "grad_norm": 0.03677235171198845, + "learning_rate": 0.00018093523560916335, + "loss": 0.3655, + "step": 9174 + }, + { + "epoch": 0.7432760855476345, + "grad_norm": 0.032715436071157455, + "learning_rate": 0.0001809307349565687, + "loss": 0.3218, + "step": 9175 + }, + { + "epoch": 0.7433570965651328, + "grad_norm": 0.028857214376330376, + "learning_rate": 0.00018092623430397407, + "loss": 0.3114, + "step": 9176 + }, + { + "epoch": 0.7434381075826313, + "grad_norm": 0.029268058016896248, + "learning_rate": 0.00018092173365137946, + "loss": 0.3041, + "step": 9177 + }, + { + "epoch": 0.7435191186001296, + "grad_norm": 0.03330998495221138, + "learning_rate": 0.00018091723299878482, + "loss": 0.3391, + "step": 9178 + }, + { + "epoch": 0.743600129617628, + "grad_norm": 0.038648154586553574, + "learning_rate": 0.0001809127323461902, + "loss": 0.4163, + "step": 9179 + }, + { + "epoch": 0.7436811406351264, + "grad_norm": 0.028641097247600555, + "learning_rate": 0.0001809082316935956, + "loss": 0.3762, + "step": 9180 + }, + { + "epoch": 0.7437621516526247, + "grad_norm": 0.032774269580841064, + "learning_rate": 0.00018090373104100095, + "loss": 0.3515, + "step": 9181 + }, + { + "epoch": 0.7438431626701232, + "grad_norm": 0.029507745057344437, + "learning_rate": 0.00018089923038840631, + "loss": 0.3251, + "step": 9182 + }, + { + "epoch": 0.7439241736876215, + "grad_norm": 0.030811132863163948, + "learning_rate": 0.0001808947297358117, + "loss": 0.305, + "step": 9183 + }, + { + "epoch": 0.7440051847051199, + "grad_norm": 0.029300684109330177, + "learning_rate": 0.00018089022908321706, + "loss": 0.2801, + "step": 9184 + }, + { + "epoch": 0.7440861957226182, + "grad_norm": 0.03540259227156639, + "learning_rate": 0.00018088572843062245, + "loss": 0.3535, + "step": 9185 + }, + { + "epoch": 0.7441672067401166, + "grad_norm": 0.03168850019574165, + "learning_rate": 0.00018088122777802784, + "loss": 0.3148, + "step": 9186 + }, + { + "epoch": 0.7442482177576151, + "grad_norm": 0.035167254507541656, + "learning_rate": 0.0001808767271254332, + "loss": 0.3692, + "step": 9187 + }, + { + "epoch": 0.7443292287751134, + "grad_norm": 0.032082077115774155, + "learning_rate": 0.00018087222647283856, + "loss": 0.357, + "step": 9188 + }, + { + "epoch": 0.7444102397926118, + "grad_norm": 0.037227094173431396, + "learning_rate": 0.00018086772582024394, + "loss": 0.4317, + "step": 9189 + }, + { + "epoch": 0.7444912508101101, + "grad_norm": 0.029042791575193405, + "learning_rate": 0.0001808632251676493, + "loss": 0.3345, + "step": 9190 + }, + { + "epoch": 0.7445722618276086, + "grad_norm": 0.031363196671009064, + "learning_rate": 0.0001808587245150547, + "loss": 0.2798, + "step": 9191 + }, + { + "epoch": 0.744653272845107, + "grad_norm": 0.03424050286412239, + "learning_rate": 0.00018085422386246008, + "loss": 0.4305, + "step": 9192 + }, + { + "epoch": 0.7447342838626053, + "grad_norm": 0.03143888711929321, + "learning_rate": 
0.00018084972320986544, + "loss": 0.3572, + "step": 9193 + }, + { + "epoch": 0.7448152948801037, + "grad_norm": 0.02935538813471794, + "learning_rate": 0.0001808452225572708, + "loss": 0.319, + "step": 9194 + }, + { + "epoch": 0.744896305897602, + "grad_norm": 0.029158979654312134, + "learning_rate": 0.00018084072190467618, + "loss": 0.3371, + "step": 9195 + }, + { + "epoch": 0.7449773169151005, + "grad_norm": 0.030298082157969475, + "learning_rate": 0.00018083622125208154, + "loss": 0.3047, + "step": 9196 + }, + { + "epoch": 0.7450583279325989, + "grad_norm": 0.034861814230680466, + "learning_rate": 0.00018083172059948693, + "loss": 0.3294, + "step": 9197 + }, + { + "epoch": 0.7451393389500972, + "grad_norm": 0.027019919827580452, + "learning_rate": 0.00018082721994689232, + "loss": 0.2687, + "step": 9198 + }, + { + "epoch": 0.7452203499675956, + "grad_norm": 0.028821589425206184, + "learning_rate": 0.00018082271929429768, + "loss": 0.3444, + "step": 9199 + }, + { + "epoch": 0.7453013609850939, + "grad_norm": 0.03376943618059158, + "learning_rate": 0.00018081821864170304, + "loss": 0.3363, + "step": 9200 + }, + { + "epoch": 0.7453823720025924, + "grad_norm": 0.029261022806167603, + "learning_rate": 0.00018081371798910843, + "loss": 0.3549, + "step": 9201 + }, + { + "epoch": 0.7454633830200907, + "grad_norm": 0.03731764853000641, + "learning_rate": 0.0001808092173365138, + "loss": 0.3923, + "step": 9202 + }, + { + "epoch": 0.7455443940375891, + "grad_norm": 0.03250044956803322, + "learning_rate": 0.00018080471668391917, + "loss": 0.3124, + "step": 9203 + }, + { + "epoch": 0.7456254050550875, + "grad_norm": 0.032679181545972824, + "learning_rate": 0.00018080021603132456, + "loss": 0.3426, + "step": 9204 + }, + { + "epoch": 0.7457064160725859, + "grad_norm": 0.034462928771972656, + "learning_rate": 0.00018079571537872992, + "loss": 0.343, + "step": 9205 + }, + { + "epoch": 0.7457874270900843, + "grad_norm": 0.03392226994037628, + "learning_rate": 0.0001807912147261353, + "loss": 0.3827, + "step": 9206 + }, + { + "epoch": 0.7458684381075826, + "grad_norm": 0.02565544657409191, + "learning_rate": 0.00018078671407354067, + "loss": 0.2889, + "step": 9207 + }, + { + "epoch": 0.745949449125081, + "grad_norm": 0.03116597980260849, + "learning_rate": 0.00018078221342094605, + "loss": 0.353, + "step": 9208 + }, + { + "epoch": 0.7460304601425793, + "grad_norm": 0.037584736943244934, + "learning_rate": 0.00018077771276835141, + "loss": 0.2985, + "step": 9209 + }, + { + "epoch": 0.7461114711600778, + "grad_norm": 0.03558696433901787, + "learning_rate": 0.0001807732121157568, + "loss": 0.3611, + "step": 9210 + }, + { + "epoch": 0.7461924821775762, + "grad_norm": 0.027648333460092545, + "learning_rate": 0.00018076871146316216, + "loss": 0.3073, + "step": 9211 + }, + { + "epoch": 0.7462734931950745, + "grad_norm": 0.0350969135761261, + "learning_rate": 0.00018076421081056755, + "loss": 0.3525, + "step": 9212 + }, + { + "epoch": 0.7463545042125729, + "grad_norm": 0.04231669008731842, + "learning_rate": 0.0001807597101579729, + "loss": 0.3325, + "step": 9213 + }, + { + "epoch": 0.7464355152300713, + "grad_norm": 0.02886810339987278, + "learning_rate": 0.0001807552095053783, + "loss": 0.33, + "step": 9214 + }, + { + "epoch": 0.7465165262475697, + "grad_norm": 0.035336822271347046, + "learning_rate": 0.00018075070885278366, + "loss": 0.3424, + "step": 9215 + }, + { + "epoch": 0.7465975372650681, + "grad_norm": 0.026189319789409637, + "learning_rate": 0.00018074620820018904, + "loss": 0.301, + "step": 9216 + }, 
+ { + "epoch": 0.7466785482825664, + "grad_norm": 0.029928090050816536, + "learning_rate": 0.0001807417075475944, + "loss": 0.3352, + "step": 9217 + }, + { + "epoch": 0.7467595593000648, + "grad_norm": 0.030840082094073296, + "learning_rate": 0.0001807372068949998, + "loss": 0.3578, + "step": 9218 + }, + { + "epoch": 0.7468405703175632, + "grad_norm": 0.03217003121972084, + "learning_rate": 0.00018073270624240515, + "loss": 0.3904, + "step": 9219 + }, + { + "epoch": 0.7469215813350616, + "grad_norm": 0.028606245294213295, + "learning_rate": 0.00018072820558981054, + "loss": 0.3082, + "step": 9220 + }, + { + "epoch": 0.74700259235256, + "grad_norm": 0.028930075466632843, + "learning_rate": 0.0001807237049372159, + "loss": 0.3107, + "step": 9221 + }, + { + "epoch": 0.7470836033700583, + "grad_norm": 0.03044108673930168, + "learning_rate": 0.00018071920428462129, + "loss": 0.3099, + "step": 9222 + }, + { + "epoch": 0.7471646143875567, + "grad_norm": 0.02919185906648636, + "learning_rate": 0.00018071470363202665, + "loss": 0.3367, + "step": 9223 + }, + { + "epoch": 0.7472456254050551, + "grad_norm": 0.03341258689761162, + "learning_rate": 0.00018071020297943203, + "loss": 0.3635, + "step": 9224 + }, + { + "epoch": 0.7473266364225535, + "grad_norm": 0.03471376374363899, + "learning_rate": 0.0001807057023268374, + "loss": 0.3618, + "step": 9225 + }, + { + "epoch": 0.7474076474400518, + "grad_norm": 0.03482995182275772, + "learning_rate": 0.00018070120167424278, + "loss": 0.3518, + "step": 9226 + }, + { + "epoch": 0.7474886584575502, + "grad_norm": 0.033793773502111435, + "learning_rate": 0.00018069670102164814, + "loss": 0.3788, + "step": 9227 + }, + { + "epoch": 0.7475696694750487, + "grad_norm": 0.03680787235498428, + "learning_rate": 0.00018069220036905353, + "loss": 0.3649, + "step": 9228 + }, + { + "epoch": 0.747650680492547, + "grad_norm": 0.03281555324792862, + "learning_rate": 0.0001806876997164589, + "loss": 0.3062, + "step": 9229 + }, + { + "epoch": 0.7477316915100454, + "grad_norm": 0.03725738078355789, + "learning_rate": 0.00018068319906386427, + "loss": 0.3573, + "step": 9230 + }, + { + "epoch": 0.7478127025275437, + "grad_norm": 0.02526703290641308, + "learning_rate": 0.00018067869841126966, + "loss": 0.2774, + "step": 9231 + }, + { + "epoch": 0.7478937135450421, + "grad_norm": 0.030975675210356712, + "learning_rate": 0.00018067419775867502, + "loss": 0.3356, + "step": 9232 + }, + { + "epoch": 0.7479747245625405, + "grad_norm": 0.030292998999357224, + "learning_rate": 0.00018066969710608038, + "loss": 0.3183, + "step": 9233 + }, + { + "epoch": 0.7480557355800389, + "grad_norm": 0.03010285459458828, + "learning_rate": 0.00018066519645348577, + "loss": 0.3192, + "step": 9234 + }, + { + "epoch": 0.7481367465975373, + "grad_norm": 0.03180127218365669, + "learning_rate": 0.00018066069580089113, + "loss": 0.3286, + "step": 9235 + }, + { + "epoch": 0.7482177576150356, + "grad_norm": 0.03309743478894234, + "learning_rate": 0.00018065619514829652, + "loss": 0.3836, + "step": 9236 + }, + { + "epoch": 0.748298768632534, + "grad_norm": 0.02979223243892193, + "learning_rate": 0.0001806516944957019, + "loss": 0.2981, + "step": 9237 + }, + { + "epoch": 0.7483797796500324, + "grad_norm": 0.0348726287484169, + "learning_rate": 0.00018064719384310726, + "loss": 0.3966, + "step": 9238 + }, + { + "epoch": 0.7484607906675308, + "grad_norm": 0.03552888706326485, + "learning_rate": 0.00018064269319051262, + "loss": 0.3175, + "step": 9239 + }, + { + "epoch": 0.7485418016850292, + "grad_norm": 
0.03508378192782402, + "learning_rate": 0.000180638192537918, + "loss": 0.3756, + "step": 9240 + }, + { + "epoch": 0.7486228127025275, + "grad_norm": 0.03183341398835182, + "learning_rate": 0.00018063369188532337, + "loss": 0.3585, + "step": 9241 + }, + { + "epoch": 0.748703823720026, + "grad_norm": 0.035623688250780106, + "learning_rate": 0.00018062919123272876, + "loss": 0.3623, + "step": 9242 + }, + { + "epoch": 0.7487848347375243, + "grad_norm": 0.03101072646677494, + "learning_rate": 0.00018062469058013414, + "loss": 0.3117, + "step": 9243 + }, + { + "epoch": 0.7488658457550227, + "grad_norm": 0.03312772512435913, + "learning_rate": 0.0001806201899275395, + "loss": 0.315, + "step": 9244 + }, + { + "epoch": 0.748946856772521, + "grad_norm": 0.03041679970920086, + "learning_rate": 0.00018061568927494486, + "loss": 0.3006, + "step": 9245 + }, + { + "epoch": 0.7490278677900194, + "grad_norm": 0.03130163997411728, + "learning_rate": 0.00018061118862235025, + "loss": 0.3769, + "step": 9246 + }, + { + "epoch": 0.7491088788075179, + "grad_norm": 0.031065113842487335, + "learning_rate": 0.0001806066879697556, + "loss": 0.314, + "step": 9247 + }, + { + "epoch": 0.7491898898250162, + "grad_norm": 0.0288732647895813, + "learning_rate": 0.000180602187317161, + "loss": 0.2995, + "step": 9248 + }, + { + "epoch": 0.7492709008425146, + "grad_norm": 0.036610234528779984, + "learning_rate": 0.00018059768666456639, + "loss": 0.3862, + "step": 9249 + }, + { + "epoch": 0.7493519118600129, + "grad_norm": 0.029909661039710045, + "learning_rate": 0.00018059318601197175, + "loss": 0.2936, + "step": 9250 + }, + { + "epoch": 0.7494329228775113, + "grad_norm": 0.029548456892371178, + "learning_rate": 0.0001805886853593771, + "loss": 0.3241, + "step": 9251 + }, + { + "epoch": 0.7495139338950098, + "grad_norm": 0.03577359765768051, + "learning_rate": 0.0001805841847067825, + "loss": 0.3205, + "step": 9252 + }, + { + "epoch": 0.7495949449125081, + "grad_norm": 0.02838653326034546, + "learning_rate": 0.00018057968405418785, + "loss": 0.3168, + "step": 9253 + }, + { + "epoch": 0.7496759559300065, + "grad_norm": 0.034319330006837845, + "learning_rate": 0.00018057518340159324, + "loss": 0.354, + "step": 9254 + }, + { + "epoch": 0.7497569669475048, + "grad_norm": 0.033737730234861374, + "learning_rate": 0.00018057068274899863, + "loss": 0.392, + "step": 9255 + }, + { + "epoch": 0.7498379779650033, + "grad_norm": 0.029997693374753, + "learning_rate": 0.000180566182096404, + "loss": 0.3458, + "step": 9256 + }, + { + "epoch": 0.7499189889825016, + "grad_norm": 0.030124330893158913, + "learning_rate": 0.00018056168144380935, + "loss": 0.3336, + "step": 9257 + }, + { + "epoch": 0.75, + "grad_norm": 0.03392377868294716, + "learning_rate": 0.00018055718079121473, + "loss": 0.3515, + "step": 9258 + }, + { + "epoch": 0.7500810110174984, + "grad_norm": 0.03187362104654312, + "learning_rate": 0.0001805526801386201, + "loss": 0.3582, + "step": 9259 + }, + { + "epoch": 0.7501620220349967, + "grad_norm": 0.03177661448717117, + "learning_rate": 0.00018054817948602548, + "loss": 0.3308, + "step": 9260 + }, + { + "epoch": 0.7502430330524952, + "grad_norm": 0.02912510186433792, + "learning_rate": 0.00018054367883343087, + "loss": 0.3492, + "step": 9261 + }, + { + "epoch": 0.7503240440699935, + "grad_norm": 0.027753859758377075, + "learning_rate": 0.00018053917818083623, + "loss": 0.3043, + "step": 9262 + }, + { + "epoch": 0.7504050550874919, + "grad_norm": 0.03045106679201126, + "learning_rate": 0.0001805346775282416, + "loss": 0.3586, + 
"step": 9263 + }, + { + "epoch": 0.7504860661049902, + "grad_norm": 0.03325005993247032, + "learning_rate": 0.00018053017687564698, + "loss": 0.3423, + "step": 9264 + }, + { + "epoch": 0.7505670771224887, + "grad_norm": 0.029894059523940086, + "learning_rate": 0.00018052567622305234, + "loss": 0.3433, + "step": 9265 + }, + { + "epoch": 0.7506480881399871, + "grad_norm": 0.03508707880973816, + "learning_rate": 0.00018052117557045772, + "loss": 0.3774, + "step": 9266 + }, + { + "epoch": 0.7507290991574854, + "grad_norm": 0.02966720052063465, + "learning_rate": 0.0001805166749178631, + "loss": 0.2835, + "step": 9267 + }, + { + "epoch": 0.7508101101749838, + "grad_norm": 0.03271722048521042, + "learning_rate": 0.00018051217426526847, + "loss": 0.3535, + "step": 9268 + }, + { + "epoch": 0.7508911211924821, + "grad_norm": 0.03025810234248638, + "learning_rate": 0.00018050767361267383, + "loss": 0.3233, + "step": 9269 + }, + { + "epoch": 0.7509721322099806, + "grad_norm": 0.030399255454540253, + "learning_rate": 0.00018050317296007922, + "loss": 0.3071, + "step": 9270 + }, + { + "epoch": 0.751053143227479, + "grad_norm": 0.034914761781692505, + "learning_rate": 0.00018049867230748458, + "loss": 0.3435, + "step": 9271 + }, + { + "epoch": 0.7511341542449773, + "grad_norm": 0.028581790626049042, + "learning_rate": 0.00018049417165488997, + "loss": 0.3035, + "step": 9272 + }, + { + "epoch": 0.7512151652624757, + "grad_norm": 0.034693315625190735, + "learning_rate": 0.00018048967100229535, + "loss": 0.3154, + "step": 9273 + }, + { + "epoch": 0.751296176279974, + "grad_norm": 0.030193015933036804, + "learning_rate": 0.0001804851703497007, + "loss": 0.3567, + "step": 9274 + }, + { + "epoch": 0.7513771872974725, + "grad_norm": 0.03147919476032257, + "learning_rate": 0.0001804806696971061, + "loss": 0.3303, + "step": 9275 + }, + { + "epoch": 0.7514581983149708, + "grad_norm": 0.037018414586782455, + "learning_rate": 0.00018047616904451146, + "loss": 0.351, + "step": 9276 + }, + { + "epoch": 0.7515392093324692, + "grad_norm": 0.028499385342001915, + "learning_rate": 0.00018047166839191682, + "loss": 0.2793, + "step": 9277 + }, + { + "epoch": 0.7516202203499676, + "grad_norm": 0.029187791049480438, + "learning_rate": 0.0001804671677393222, + "loss": 0.3322, + "step": 9278 + }, + { + "epoch": 0.751701231367466, + "grad_norm": 0.034434039145708084, + "learning_rate": 0.0001804626670867276, + "loss": 0.3262, + "step": 9279 + }, + { + "epoch": 0.7517822423849644, + "grad_norm": 0.033068154007196426, + "learning_rate": 0.00018045816643413295, + "loss": 0.3139, + "step": 9280 + }, + { + "epoch": 0.7518632534024627, + "grad_norm": 0.03756197914481163, + "learning_rate": 0.00018045366578153834, + "loss": 0.3722, + "step": 9281 + }, + { + "epoch": 0.7519442644199611, + "grad_norm": 0.03321055322885513, + "learning_rate": 0.0001804491651289437, + "loss": 0.319, + "step": 9282 + }, + { + "epoch": 0.7520252754374595, + "grad_norm": 0.03404540941119194, + "learning_rate": 0.0001804446644763491, + "loss": 0.3463, + "step": 9283 + }, + { + "epoch": 0.7521062864549579, + "grad_norm": 0.03360103443264961, + "learning_rate": 0.00018044016382375445, + "loss": 0.3457, + "step": 9284 + }, + { + "epoch": 0.7521872974724563, + "grad_norm": 0.03434904292225838, + "learning_rate": 0.00018043566317115984, + "loss": 0.333, + "step": 9285 + }, + { + "epoch": 0.7522683084899546, + "grad_norm": 0.029737526550889015, + "learning_rate": 0.0001804311625185652, + "loss": 0.3167, + "step": 9286 + }, + { + "epoch": 0.752349319507453, + 
"grad_norm": 0.03503742441534996, + "learning_rate": 0.00018042666186597058, + "loss": 0.3429, + "step": 9287 + }, + { + "epoch": 0.7524303305249513, + "grad_norm": 0.030850261449813843, + "learning_rate": 0.00018042216121337594, + "loss": 0.3236, + "step": 9288 + }, + { + "epoch": 0.7525113415424498, + "grad_norm": 0.03333286568522453, + "learning_rate": 0.00018041766056078133, + "loss": 0.3372, + "step": 9289 + }, + { + "epoch": 0.7525923525599482, + "grad_norm": 0.03200223669409752, + "learning_rate": 0.0001804131599081867, + "loss": 0.3486, + "step": 9290 + }, + { + "epoch": 0.7526733635774465, + "grad_norm": 0.0319368951022625, + "learning_rate": 0.00018040865925559208, + "loss": 0.3156, + "step": 9291 + }, + { + "epoch": 0.7527543745949449, + "grad_norm": 0.030817804858088493, + "learning_rate": 0.00018040415860299744, + "loss": 0.3128, + "step": 9292 + }, + { + "epoch": 0.7528353856124433, + "grad_norm": 0.03215906769037247, + "learning_rate": 0.00018039965795040282, + "loss": 0.3327, + "step": 9293 + }, + { + "epoch": 0.7529163966299417, + "grad_norm": 0.03907868638634682, + "learning_rate": 0.00018039515729780818, + "loss": 0.3626, + "step": 9294 + }, + { + "epoch": 0.75299740764744, + "grad_norm": 0.02961132489144802, + "learning_rate": 0.00018039065664521357, + "loss": 0.3254, + "step": 9295 + }, + { + "epoch": 0.7530784186649384, + "grad_norm": 0.03202762454748154, + "learning_rate": 0.00018038615599261893, + "loss": 0.3301, + "step": 9296 + }, + { + "epoch": 0.7531594296824368, + "grad_norm": 0.036983706057071686, + "learning_rate": 0.00018038165534002432, + "loss": 0.3997, + "step": 9297 + }, + { + "epoch": 0.7532404406999352, + "grad_norm": 0.031749822199344635, + "learning_rate": 0.00018037715468742968, + "loss": 0.3464, + "step": 9298 + }, + { + "epoch": 0.7533214517174336, + "grad_norm": 0.030583178624510765, + "learning_rate": 0.00018037265403483507, + "loss": 0.3031, + "step": 9299 + }, + { + "epoch": 0.7534024627349319, + "grad_norm": 0.02886773645877838, + "learning_rate": 0.00018036815338224043, + "loss": 0.3287, + "step": 9300 + }, + { + "epoch": 0.7534834737524303, + "grad_norm": 0.03687857836484909, + "learning_rate": 0.0001803636527296458, + "loss": 0.354, + "step": 9301 + }, + { + "epoch": 0.7535644847699287, + "grad_norm": 0.03391767665743828, + "learning_rate": 0.00018035915207705117, + "loss": 0.3836, + "step": 9302 + }, + { + "epoch": 0.7536454957874271, + "grad_norm": 0.02986670285463333, + "learning_rate": 0.00018035465142445656, + "loss": 0.2838, + "step": 9303 + }, + { + "epoch": 0.7537265068049255, + "grad_norm": 0.034182414412498474, + "learning_rate": 0.00018035015077186192, + "loss": 0.3159, + "step": 9304 + }, + { + "epoch": 0.7538075178224238, + "grad_norm": 0.03301328048110008, + "learning_rate": 0.0001803456501192673, + "loss": 0.3639, + "step": 9305 + }, + { + "epoch": 0.7538885288399222, + "grad_norm": 0.030668480321764946, + "learning_rate": 0.00018034114946667267, + "loss": 0.3159, + "step": 9306 + }, + { + "epoch": 0.7539695398574207, + "grad_norm": 0.03611016646027565, + "learning_rate": 0.00018033664881407805, + "loss": 0.3313, + "step": 9307 + }, + { + "epoch": 0.754050550874919, + "grad_norm": 0.030725689604878426, + "learning_rate": 0.00018033214816148342, + "loss": 0.3352, + "step": 9308 + }, + { + "epoch": 0.7541315618924174, + "grad_norm": 0.0315089076757431, + "learning_rate": 0.0001803276475088888, + "loss": 0.3145, + "step": 9309 + }, + { + "epoch": 0.7542125729099157, + "grad_norm": 0.02995385229587555, + "learning_rate": 
0.00018032314685629416, + "loss": 0.3134, + "step": 9310 + }, + { + "epoch": 0.7542935839274141, + "grad_norm": 0.02921799197793007, + "learning_rate": 0.00018031864620369955, + "loss": 0.361, + "step": 9311 + }, + { + "epoch": 0.7543745949449125, + "grad_norm": 0.033073678612709045, + "learning_rate": 0.00018031414555110494, + "loss": 0.3352, + "step": 9312 + }, + { + "epoch": 0.7544556059624109, + "grad_norm": 0.03068387322127819, + "learning_rate": 0.0001803096448985103, + "loss": 0.3258, + "step": 9313 + }, + { + "epoch": 0.7545366169799093, + "grad_norm": 0.031815964728593826, + "learning_rate": 0.00018030514424591566, + "loss": 0.3541, + "step": 9314 + }, + { + "epoch": 0.7546176279974076, + "grad_norm": 0.03938791900873184, + "learning_rate": 0.00018030064359332104, + "loss": 0.3434, + "step": 9315 + }, + { + "epoch": 0.7546986390149061, + "grad_norm": 0.03221413120627403, + "learning_rate": 0.0001802961429407264, + "loss": 0.3123, + "step": 9316 + }, + { + "epoch": 0.7547796500324044, + "grad_norm": 0.030221721157431602, + "learning_rate": 0.0001802916422881318, + "loss": 0.3206, + "step": 9317 + }, + { + "epoch": 0.7548606610499028, + "grad_norm": 0.029876358807086945, + "learning_rate": 0.00018028714163553718, + "loss": 0.3194, + "step": 9318 + }, + { + "epoch": 0.7549416720674011, + "grad_norm": 0.03315850347280502, + "learning_rate": 0.00018028264098294254, + "loss": 0.3337, + "step": 9319 + }, + { + "epoch": 0.7550226830848995, + "grad_norm": 0.03478917479515076, + "learning_rate": 0.0001802781403303479, + "loss": 0.3457, + "step": 9320 + }, + { + "epoch": 0.755103694102398, + "grad_norm": 0.03456339240074158, + "learning_rate": 0.00018027363967775329, + "loss": 0.3242, + "step": 9321 + }, + { + "epoch": 0.7551847051198963, + "grad_norm": 0.03251144289970398, + "learning_rate": 0.00018026913902515865, + "loss": 0.3594, + "step": 9322 + }, + { + "epoch": 0.7552657161373947, + "grad_norm": 0.0323588103055954, + "learning_rate": 0.00018026463837256403, + "loss": 0.3321, + "step": 9323 + }, + { + "epoch": 0.755346727154893, + "grad_norm": 0.034761857241392136, + "learning_rate": 0.00018026013771996942, + "loss": 0.3711, + "step": 9324 + }, + { + "epoch": 0.7554277381723914, + "grad_norm": 0.032338641583919525, + "learning_rate": 0.00018025563706737478, + "loss": 0.3045, + "step": 9325 + }, + { + "epoch": 0.7555087491898899, + "grad_norm": 0.030599815770983696, + "learning_rate": 0.00018025113641478014, + "loss": 0.3054, + "step": 9326 + }, + { + "epoch": 0.7555897602073882, + "grad_norm": 0.03238639235496521, + "learning_rate": 0.00018024663576218553, + "loss": 0.3283, + "step": 9327 + }, + { + "epoch": 0.7556707712248866, + "grad_norm": 0.0294541846960783, + "learning_rate": 0.0001802421351095909, + "loss": 0.3159, + "step": 9328 + }, + { + "epoch": 0.7557517822423849, + "grad_norm": 0.03155055642127991, + "learning_rate": 0.00018023763445699627, + "loss": 0.2916, + "step": 9329 + }, + { + "epoch": 0.7558327932598834, + "grad_norm": 0.02897864580154419, + "learning_rate": 0.00018023313380440166, + "loss": 0.2919, + "step": 9330 + }, + { + "epoch": 0.7559138042773818, + "grad_norm": 0.03341088071465492, + "learning_rate": 0.00018022863315180702, + "loss": 0.3239, + "step": 9331 + }, + { + "epoch": 0.7559948152948801, + "grad_norm": 0.03133474290370941, + "learning_rate": 0.00018022413249921238, + "loss": 0.371, + "step": 9332 + }, + { + "epoch": 0.7560758263123785, + "grad_norm": 0.034195493906736374, + "learning_rate": 0.00018021963184661777, + "loss": 0.3424, + "step": 9333 + }, 
+ { + "epoch": 0.7561568373298768, + "grad_norm": 0.026915594935417175, + "learning_rate": 0.00018021513119402313, + "loss": 0.2903, + "step": 9334 + }, + { + "epoch": 0.7562378483473753, + "grad_norm": 0.030386967584490776, + "learning_rate": 0.00018021063054142852, + "loss": 0.334, + "step": 9335 + }, + { + "epoch": 0.7563188593648736, + "grad_norm": 0.029541438445448875, + "learning_rate": 0.0001802061298888339, + "loss": 0.2776, + "step": 9336 + }, + { + "epoch": 0.756399870382372, + "grad_norm": 0.03158644214272499, + "learning_rate": 0.00018020162923623926, + "loss": 0.3009, + "step": 9337 + }, + { + "epoch": 0.7564808813998704, + "grad_norm": 0.03073226846754551, + "learning_rate": 0.00018019712858364462, + "loss": 0.3201, + "step": 9338 + }, + { + "epoch": 0.7565618924173687, + "grad_norm": 0.0377579927444458, + "learning_rate": 0.00018019262793105, + "loss": 0.3696, + "step": 9339 + }, + { + "epoch": 0.7566429034348672, + "grad_norm": 0.03365955501794815, + "learning_rate": 0.00018018812727845537, + "loss": 0.3081, + "step": 9340 + }, + { + "epoch": 0.7567239144523655, + "grad_norm": 0.032374121248722076, + "learning_rate": 0.00018018362662586076, + "loss": 0.343, + "step": 9341 + }, + { + "epoch": 0.7568049254698639, + "grad_norm": 0.03342792019248009, + "learning_rate": 0.00018017912597326614, + "loss": 0.3669, + "step": 9342 + }, + { + "epoch": 0.7568859364873622, + "grad_norm": 0.03184104710817337, + "learning_rate": 0.0001801746253206715, + "loss": 0.3532, + "step": 9343 + }, + { + "epoch": 0.7569669475048607, + "grad_norm": 0.03416401892900467, + "learning_rate": 0.0001801701246680769, + "loss": 0.4207, + "step": 9344 + }, + { + "epoch": 0.7570479585223591, + "grad_norm": 0.03579508140683174, + "learning_rate": 0.00018016562401548225, + "loss": 0.3759, + "step": 9345 + }, + { + "epoch": 0.7571289695398574, + "grad_norm": 0.03344917297363281, + "learning_rate": 0.0001801611233628876, + "loss": 0.3761, + "step": 9346 + }, + { + "epoch": 0.7572099805573558, + "grad_norm": 0.03486419841647148, + "learning_rate": 0.000180156622710293, + "loss": 0.3805, + "step": 9347 + }, + { + "epoch": 0.7572909915748541, + "grad_norm": 0.03962663188576698, + "learning_rate": 0.00018015212205769839, + "loss": 0.3267, + "step": 9348 + }, + { + "epoch": 0.7573720025923526, + "grad_norm": 0.03494442626833916, + "learning_rate": 0.00018014762140510375, + "loss": 0.3187, + "step": 9349 + }, + { + "epoch": 0.757453013609851, + "grad_norm": 0.02679494582116604, + "learning_rate": 0.00018014312075250913, + "loss": 0.292, + "step": 9350 + }, + { + "epoch": 0.7575340246273493, + "grad_norm": 0.03419162333011627, + "learning_rate": 0.0001801386200999145, + "loss": 0.3537, + "step": 9351 + }, + { + "epoch": 0.7576150356448477, + "grad_norm": 0.031362127512693405, + "learning_rate": 0.00018013411944731985, + "loss": 0.3032, + "step": 9352 + }, + { + "epoch": 0.7576960466623461, + "grad_norm": 0.03217475116252899, + "learning_rate": 0.00018012961879472524, + "loss": 0.3378, + "step": 9353 + }, + { + "epoch": 0.7577770576798445, + "grad_norm": 0.030293717980384827, + "learning_rate": 0.00018012511814213063, + "loss": 0.3262, + "step": 9354 + }, + { + "epoch": 0.7578580686973428, + "grad_norm": 0.03205201402306557, + "learning_rate": 0.000180120617489536, + "loss": 0.3296, + "step": 9355 + }, + { + "epoch": 0.7579390797148412, + "grad_norm": 0.03602740168571472, + "learning_rate": 0.00018011611683694138, + "loss": 0.3174, + "step": 9356 + }, + { + "epoch": 0.7580200907323396, + "grad_norm": 
0.029463456943631172, + "learning_rate": 0.00018011161618434674, + "loss": 0.2948, + "step": 9357 + }, + { + "epoch": 0.758101101749838, + "grad_norm": 0.03281380981206894, + "learning_rate": 0.0001801071155317521, + "loss": 0.3417, + "step": 9358 + }, + { + "epoch": 0.7581821127673364, + "grad_norm": 0.031789641827344894, + "learning_rate": 0.00018010261487915748, + "loss": 0.3238, + "step": 9359 + }, + { + "epoch": 0.7582631237848347, + "grad_norm": 0.03468538448214531, + "learning_rate": 0.00018009811422656287, + "loss": 0.3624, + "step": 9360 + }, + { + "epoch": 0.7583441348023331, + "grad_norm": 0.02855679579079151, + "learning_rate": 0.00018009361357396823, + "loss": 0.2951, + "step": 9361 + }, + { + "epoch": 0.7584251458198314, + "grad_norm": 0.03410002589225769, + "learning_rate": 0.00018008911292137362, + "loss": 0.3651, + "step": 9362 + }, + { + "epoch": 0.7585061568373299, + "grad_norm": 0.037260934710502625, + "learning_rate": 0.00018008461226877898, + "loss": 0.3393, + "step": 9363 + }, + { + "epoch": 0.7585871678548283, + "grad_norm": 0.03792154788970947, + "learning_rate": 0.00018008011161618436, + "loss": 0.3471, + "step": 9364 + }, + { + "epoch": 0.7586681788723266, + "grad_norm": 0.030598971992731094, + "learning_rate": 0.00018007561096358972, + "loss": 0.3217, + "step": 9365 + }, + { + "epoch": 0.758749189889825, + "grad_norm": 0.0321931317448616, + "learning_rate": 0.0001800711103109951, + "loss": 0.3284, + "step": 9366 + }, + { + "epoch": 0.7588302009073234, + "grad_norm": 0.032547708600759506, + "learning_rate": 0.00018006660965840047, + "loss": 0.3484, + "step": 9367 + }, + { + "epoch": 0.7589112119248218, + "grad_norm": 0.034850314259529114, + "learning_rate": 0.00018006210900580586, + "loss": 0.3422, + "step": 9368 + }, + { + "epoch": 0.7589922229423202, + "grad_norm": 0.03074449487030506, + "learning_rate": 0.00018005760835321122, + "loss": 0.3437, + "step": 9369 + }, + { + "epoch": 0.7590732339598185, + "grad_norm": 0.03898514434695244, + "learning_rate": 0.0001800531077006166, + "loss": 0.3742, + "step": 9370 + }, + { + "epoch": 0.7591542449773169, + "grad_norm": 0.033423904329538345, + "learning_rate": 0.00018004860704802197, + "loss": 0.3415, + "step": 9371 + }, + { + "epoch": 0.7592352559948153, + "grad_norm": 0.030887460336089134, + "learning_rate": 0.00018004410639542735, + "loss": 0.3329, + "step": 9372 + }, + { + "epoch": 0.7593162670123137, + "grad_norm": 0.031972624361515045, + "learning_rate": 0.0001800396057428327, + "loss": 0.3321, + "step": 9373 + }, + { + "epoch": 0.759397278029812, + "grad_norm": 0.03002900630235672, + "learning_rate": 0.0001800351050902381, + "loss": 0.3058, + "step": 9374 + }, + { + "epoch": 0.7594782890473104, + "grad_norm": 0.03169652074575424, + "learning_rate": 0.00018003060443764346, + "loss": 0.3284, + "step": 9375 + }, + { + "epoch": 0.7595593000648088, + "grad_norm": 0.030031373724341393, + "learning_rate": 0.00018002610378504885, + "loss": 0.2913, + "step": 9376 + }, + { + "epoch": 0.7596403110823072, + "grad_norm": 0.029552794992923737, + "learning_rate": 0.0001800216031324542, + "loss": 0.3228, + "step": 9377 + }, + { + "epoch": 0.7597213220998056, + "grad_norm": 0.029546670615673065, + "learning_rate": 0.0001800171024798596, + "loss": 0.3027, + "step": 9378 + }, + { + "epoch": 0.7598023331173039, + "grad_norm": 0.028050288558006287, + "learning_rate": 0.00018001260182726495, + "loss": 0.2832, + "step": 9379 + }, + { + "epoch": 0.7598833441348023, + "grad_norm": 0.03491856902837753, + "learning_rate": 
0.00018000810117467034, + "loss": 0.3216, + "step": 9380 + }, + { + "epoch": 0.7599643551523008, + "grad_norm": 0.027808241546154022, + "learning_rate": 0.0001800036005220757, + "loss": 0.2929, + "step": 9381 + }, + { + "epoch": 0.7600453661697991, + "grad_norm": 0.031897857785224915, + "learning_rate": 0.0001799990998694811, + "loss": 0.3254, + "step": 9382 + }, + { + "epoch": 0.7601263771872975, + "grad_norm": 0.03805245831608772, + "learning_rate": 0.00017999459921688645, + "loss": 0.4039, + "step": 9383 + }, + { + "epoch": 0.7602073882047958, + "grad_norm": 0.03206348046660423, + "learning_rate": 0.00017999009856429184, + "loss": 0.3588, + "step": 9384 + }, + { + "epoch": 0.7602883992222942, + "grad_norm": 0.03059943951666355, + "learning_rate": 0.0001799855979116972, + "loss": 0.2983, + "step": 9385 + }, + { + "epoch": 0.7603694102397927, + "grad_norm": 0.03752979636192322, + "learning_rate": 0.00017998109725910258, + "loss": 0.3583, + "step": 9386 + }, + { + "epoch": 0.760450421257291, + "grad_norm": 0.03344959393143654, + "learning_rate": 0.00017997659660650794, + "loss": 0.3581, + "step": 9387 + }, + { + "epoch": 0.7605314322747894, + "grad_norm": 0.039618462324142456, + "learning_rate": 0.00017997209595391333, + "loss": 0.3975, + "step": 9388 + }, + { + "epoch": 0.7606124432922877, + "grad_norm": 0.030214224010705948, + "learning_rate": 0.0001799675953013187, + "loss": 0.353, + "step": 9389 + }, + { + "epoch": 0.7606934543097861, + "grad_norm": 0.03098473511636257, + "learning_rate": 0.00017996309464872408, + "loss": 0.3169, + "step": 9390 + }, + { + "epoch": 0.7607744653272845, + "grad_norm": 0.029453137889504433, + "learning_rate": 0.00017995859399612944, + "loss": 0.3098, + "step": 9391 + }, + { + "epoch": 0.7608554763447829, + "grad_norm": 0.03248664364218712, + "learning_rate": 0.00017995409334353482, + "loss": 0.3471, + "step": 9392 + }, + { + "epoch": 0.7609364873622813, + "grad_norm": 0.03373479098081589, + "learning_rate": 0.0001799495926909402, + "loss": 0.3884, + "step": 9393 + }, + { + "epoch": 0.7610174983797796, + "grad_norm": 0.03235981985926628, + "learning_rate": 0.00017994509203834557, + "loss": 0.3485, + "step": 9394 + }, + { + "epoch": 0.7610985093972781, + "grad_norm": 0.0278353039175272, + "learning_rate": 0.00017994059138575093, + "loss": 0.3181, + "step": 9395 + }, + { + "epoch": 0.7611795204147764, + "grad_norm": 0.03313388302922249, + "learning_rate": 0.00017993609073315632, + "loss": 0.3985, + "step": 9396 + }, + { + "epoch": 0.7612605314322748, + "grad_norm": 0.0328248105943203, + "learning_rate": 0.00017993159008056168, + "loss": 0.3568, + "step": 9397 + }, + { + "epoch": 0.7613415424497731, + "grad_norm": 0.032912254333496094, + "learning_rate": 0.00017992708942796707, + "loss": 0.3182, + "step": 9398 + }, + { + "epoch": 0.7614225534672715, + "grad_norm": 0.03135635331273079, + "learning_rate": 0.00017992258877537245, + "loss": 0.3237, + "step": 9399 + }, + { + "epoch": 0.76150356448477, + "grad_norm": 0.02806262858211994, + "learning_rate": 0.00017991808812277781, + "loss": 0.3339, + "step": 9400 + }, + { + "epoch": 0.7615845755022683, + "grad_norm": 0.033621639013290405, + "learning_rate": 0.00017991358747018317, + "loss": 0.3376, + "step": 9401 + }, + { + "epoch": 0.7616655865197667, + "grad_norm": 0.03141626715660095, + "learning_rate": 0.00017990908681758856, + "loss": 0.3296, + "step": 9402 + }, + { + "epoch": 0.761746597537265, + "grad_norm": 0.030434491112828255, + "learning_rate": 0.00017990458616499392, + "loss": 0.2825, + "step": 9403 + }, 
+ { + "epoch": 0.7618276085547635, + "grad_norm": 0.029175328090786934, + "learning_rate": 0.0001799000855123993, + "loss": 0.3109, + "step": 9404 + }, + { + "epoch": 0.7619086195722619, + "grad_norm": 0.031339529901742935, + "learning_rate": 0.0001798955848598047, + "loss": 0.3267, + "step": 9405 + }, + { + "epoch": 0.7619896305897602, + "grad_norm": 0.031228726729750633, + "learning_rate": 0.00017989108420721006, + "loss": 0.3275, + "step": 9406 + }, + { + "epoch": 0.7620706416072586, + "grad_norm": 0.032652441412210464, + "learning_rate": 0.00017988658355461542, + "loss": 0.3489, + "step": 9407 + }, + { + "epoch": 0.7621516526247569, + "grad_norm": 0.03801897168159485, + "learning_rate": 0.0001798820829020208, + "loss": 0.3678, + "step": 9408 + }, + { + "epoch": 0.7622326636422554, + "grad_norm": 0.03199564665555954, + "learning_rate": 0.00017987758224942616, + "loss": 0.3903, + "step": 9409 + }, + { + "epoch": 0.7623136746597537, + "grad_norm": 0.03358980268239975, + "learning_rate": 0.00017987308159683155, + "loss": 0.3262, + "step": 9410 + }, + { + "epoch": 0.7623946856772521, + "grad_norm": 0.03218737989664078, + "learning_rate": 0.00017986858094423694, + "loss": 0.351, + "step": 9411 + }, + { + "epoch": 0.7624756966947505, + "grad_norm": 0.028905585408210754, + "learning_rate": 0.0001798640802916423, + "loss": 0.3422, + "step": 9412 + }, + { + "epoch": 0.7625567077122488, + "grad_norm": 0.03379735350608826, + "learning_rate": 0.00017985957963904768, + "loss": 0.3237, + "step": 9413 + }, + { + "epoch": 0.7626377187297473, + "grad_norm": 0.034211091697216034, + "learning_rate": 0.00017985507898645304, + "loss": 0.3419, + "step": 9414 + }, + { + "epoch": 0.7627187297472456, + "grad_norm": 0.03187450021505356, + "learning_rate": 0.0001798505783338584, + "loss": 0.3476, + "step": 9415 + }, + { + "epoch": 0.762799740764744, + "grad_norm": 0.031442102044820786, + "learning_rate": 0.0001798460776812638, + "loss": 0.3068, + "step": 9416 + }, + { + "epoch": 0.7628807517822424, + "grad_norm": 0.03673580288887024, + "learning_rate": 0.00017984157702866918, + "loss": 0.3134, + "step": 9417 + }, + { + "epoch": 0.7629617627997408, + "grad_norm": 0.028350602835416794, + "learning_rate": 0.00017983707637607454, + "loss": 0.3164, + "step": 9418 + }, + { + "epoch": 0.7630427738172392, + "grad_norm": 0.03699628636240959, + "learning_rate": 0.00017983257572347993, + "loss": 0.3638, + "step": 9419 + }, + { + "epoch": 0.7631237848347375, + "grad_norm": 0.03153945133090019, + "learning_rate": 0.00017982807507088529, + "loss": 0.3471, + "step": 9420 + }, + { + "epoch": 0.7632047958522359, + "grad_norm": 0.03328992798924446, + "learning_rate": 0.00017982357441829065, + "loss": 0.3345, + "step": 9421 + }, + { + "epoch": 0.7632858068697342, + "grad_norm": 0.03313197195529938, + "learning_rate": 0.00017981907376569603, + "loss": 0.3277, + "step": 9422 + }, + { + "epoch": 0.7633668178872327, + "grad_norm": 0.033376265317201614, + "learning_rate": 0.00017981457311310142, + "loss": 0.3647, + "step": 9423 + }, + { + "epoch": 0.7634478289047311, + "grad_norm": 0.029434161260724068, + "learning_rate": 0.00017981007246050678, + "loss": 0.3253, + "step": 9424 + }, + { + "epoch": 0.7635288399222294, + "grad_norm": 0.035335421562194824, + "learning_rate": 0.00017980557180791217, + "loss": 0.3435, + "step": 9425 + }, + { + "epoch": 0.7636098509397278, + "grad_norm": 0.033159878104925156, + "learning_rate": 0.00017980107115531753, + "loss": 0.3497, + "step": 9426 + }, + { + "epoch": 0.7636908619572261, + "grad_norm": 
0.035425253212451935, + "learning_rate": 0.0001797965705027229, + "loss": 0.3333, + "step": 9427 + }, + { + "epoch": 0.7637718729747246, + "grad_norm": 0.032743070274591446, + "learning_rate": 0.00017979206985012827, + "loss": 0.3062, + "step": 9428 + }, + { + "epoch": 0.763852883992223, + "grad_norm": 0.032274626195430756, + "learning_rate": 0.00017978756919753366, + "loss": 0.3587, + "step": 9429 + }, + { + "epoch": 0.7639338950097213, + "grad_norm": 0.032194703817367554, + "learning_rate": 0.00017978306854493902, + "loss": 0.3454, + "step": 9430 + }, + { + "epoch": 0.7640149060272197, + "grad_norm": 0.028805602341890335, + "learning_rate": 0.0001797785678923444, + "loss": 0.3055, + "step": 9431 + }, + { + "epoch": 0.7640959170447181, + "grad_norm": 0.03278293088078499, + "learning_rate": 0.00017977406723974977, + "loss": 0.3308, + "step": 9432 + }, + { + "epoch": 0.7641769280622165, + "grad_norm": 0.03008844517171383, + "learning_rate": 0.00017976956658715513, + "loss": 0.3352, + "step": 9433 + }, + { + "epoch": 0.7642579390797148, + "grad_norm": 0.03664538636803627, + "learning_rate": 0.00017976506593456052, + "loss": 0.3172, + "step": 9434 + }, + { + "epoch": 0.7643389500972132, + "grad_norm": 0.034973543137311935, + "learning_rate": 0.0001797605652819659, + "loss": 0.3243, + "step": 9435 + }, + { + "epoch": 0.7644199611147116, + "grad_norm": 0.02906326949596405, + "learning_rate": 0.00017975606462937126, + "loss": 0.3168, + "step": 9436 + }, + { + "epoch": 0.76450097213221, + "grad_norm": 0.028848815709352493, + "learning_rate": 0.00017975156397677665, + "loss": 0.3766, + "step": 9437 + }, + { + "epoch": 0.7645819831497084, + "grad_norm": 0.033257681876420975, + "learning_rate": 0.000179747063324182, + "loss": 0.3484, + "step": 9438 + }, + { + "epoch": 0.7646629941672067, + "grad_norm": 0.03166605159640312, + "learning_rate": 0.00017974256267158737, + "loss": 0.3636, + "step": 9439 + }, + { + "epoch": 0.7647440051847051, + "grad_norm": 0.03185746818780899, + "learning_rate": 0.00017973806201899276, + "loss": 0.3032, + "step": 9440 + }, + { + "epoch": 0.7648250162022034, + "grad_norm": 0.028180088847875595, + "learning_rate": 0.00017973356136639814, + "loss": 0.3359, + "step": 9441 + }, + { + "epoch": 0.7649060272197019, + "grad_norm": 0.03395479917526245, + "learning_rate": 0.0001797290607138035, + "loss": 0.3566, + "step": 9442 + }, + { + "epoch": 0.7649870382372003, + "grad_norm": 0.03671841323375702, + "learning_rate": 0.0001797245600612089, + "loss": 0.3645, + "step": 9443 + }, + { + "epoch": 0.7650680492546986, + "grad_norm": 0.03072306700050831, + "learning_rate": 0.00017972005940861425, + "loss": 0.295, + "step": 9444 + }, + { + "epoch": 0.765149060272197, + "grad_norm": 0.03294285759329796, + "learning_rate": 0.00017971555875601964, + "loss": 0.355, + "step": 9445 + }, + { + "epoch": 0.7652300712896954, + "grad_norm": 0.03324880078434944, + "learning_rate": 0.000179711058103425, + "loss": 0.3283, + "step": 9446 + }, + { + "epoch": 0.7653110823071938, + "grad_norm": 0.02983265370130539, + "learning_rate": 0.0001797065574508304, + "loss": 0.2818, + "step": 9447 + }, + { + "epoch": 0.7653920933246922, + "grad_norm": 0.03002902865409851, + "learning_rate": 0.00017970205679823575, + "loss": 0.3334, + "step": 9448 + }, + { + "epoch": 0.7654731043421905, + "grad_norm": 0.03378603234887123, + "learning_rate": 0.00017969755614564113, + "loss": 0.3495, + "step": 9449 + }, + { + "epoch": 0.7655541153596889, + "grad_norm": 0.02799426205456257, + "learning_rate": 0.0001796930554930465, + 
"loss": 0.293, + "step": 9450 + }, + { + "epoch": 0.7656351263771873, + "grad_norm": 0.0365104004740715, + "learning_rate": 0.00017968855484045188, + "loss": 0.3561, + "step": 9451 + }, + { + "epoch": 0.7657161373946857, + "grad_norm": 0.030740009620785713, + "learning_rate": 0.00017968405418785724, + "loss": 0.3388, + "step": 9452 + }, + { + "epoch": 0.765797148412184, + "grad_norm": 0.029759861528873444, + "learning_rate": 0.00017967955353526263, + "loss": 0.2985, + "step": 9453 + }, + { + "epoch": 0.7658781594296824, + "grad_norm": 0.03421476483345032, + "learning_rate": 0.000179675052882668, + "loss": 0.3903, + "step": 9454 + }, + { + "epoch": 0.7659591704471809, + "grad_norm": 0.03636905923485756, + "learning_rate": 0.00017967055223007338, + "loss": 0.3207, + "step": 9455 + }, + { + "epoch": 0.7660401814646792, + "grad_norm": 0.030458511784672737, + "learning_rate": 0.00017966605157747874, + "loss": 0.3277, + "step": 9456 + }, + { + "epoch": 0.7661211924821776, + "grad_norm": 0.03039832040667534, + "learning_rate": 0.00017966155092488412, + "loss": 0.3439, + "step": 9457 + }, + { + "epoch": 0.7662022034996759, + "grad_norm": 0.03035557270050049, + "learning_rate": 0.00017965705027228948, + "loss": 0.3679, + "step": 9458 + }, + { + "epoch": 0.7662832145171743, + "grad_norm": 0.028992289677262306, + "learning_rate": 0.00017965254961969487, + "loss": 0.3216, + "step": 9459 + }, + { + "epoch": 0.7663642255346728, + "grad_norm": 0.029640663415193558, + "learning_rate": 0.00017964804896710023, + "loss": 0.3101, + "step": 9460 + }, + { + "epoch": 0.7664452365521711, + "grad_norm": 0.03412988781929016, + "learning_rate": 0.00017964354831450562, + "loss": 0.3398, + "step": 9461 + }, + { + "epoch": 0.7665262475696695, + "grad_norm": 0.02864237129688263, + "learning_rate": 0.00017963904766191098, + "loss": 0.3201, + "step": 9462 + }, + { + "epoch": 0.7666072585871678, + "grad_norm": 0.03264410048723221, + "learning_rate": 0.00017963454700931636, + "loss": 0.3465, + "step": 9463 + }, + { + "epoch": 0.7666882696046662, + "grad_norm": 0.03233474865555763, + "learning_rate": 0.00017963004635672172, + "loss": 0.3573, + "step": 9464 + }, + { + "epoch": 0.7667692806221647, + "grad_norm": 0.03645838797092438, + "learning_rate": 0.0001796255457041271, + "loss": 0.3595, + "step": 9465 + }, + { + "epoch": 0.766850291639663, + "grad_norm": 0.03317071124911308, + "learning_rate": 0.00017962104505153247, + "loss": 0.348, + "step": 9466 + }, + { + "epoch": 0.7669313026571614, + "grad_norm": 0.02817850187420845, + "learning_rate": 0.00017961654439893786, + "loss": 0.2987, + "step": 9467 + }, + { + "epoch": 0.7670123136746597, + "grad_norm": 0.028615185990929604, + "learning_rate": 0.00017961204374634325, + "loss": 0.29, + "step": 9468 + }, + { + "epoch": 0.7670933246921582, + "grad_norm": 0.039165839552879333, + "learning_rate": 0.0001796075430937486, + "loss": 0.3669, + "step": 9469 + }, + { + "epoch": 0.7671743357096565, + "grad_norm": 0.0342554971575737, + "learning_rate": 0.00017960304244115397, + "loss": 0.3504, + "step": 9470 + }, + { + "epoch": 0.7672553467271549, + "grad_norm": 0.030537201091647148, + "learning_rate": 0.00017959854178855935, + "loss": 0.2925, + "step": 9471 + }, + { + "epoch": 0.7673363577446533, + "grad_norm": 0.033227209001779556, + "learning_rate": 0.0001795940411359647, + "loss": 0.3262, + "step": 9472 + }, + { + "epoch": 0.7674173687621516, + "grad_norm": 0.03859534487128258, + "learning_rate": 0.0001795895404833701, + "loss": 0.3416, + "step": 9473 + }, + { + "epoch": 
0.7674983797796501, + "grad_norm": 0.031956154853105545, + "learning_rate": 0.0001795850398307755, + "loss": 0.3614, + "step": 9474 + }, + { + "epoch": 0.7675793907971484, + "grad_norm": 0.032025307416915894, + "learning_rate": 0.00017958053917818085, + "loss": 0.3267, + "step": 9475 + }, + { + "epoch": 0.7676604018146468, + "grad_norm": 0.03213995695114136, + "learning_rate": 0.0001795760385255862, + "loss": 0.3687, + "step": 9476 + }, + { + "epoch": 0.7677414128321451, + "grad_norm": 0.03206200152635574, + "learning_rate": 0.0001795715378729916, + "loss": 0.3219, + "step": 9477 + }, + { + "epoch": 0.7678224238496435, + "grad_norm": 0.03265627846121788, + "learning_rate": 0.00017956703722039695, + "loss": 0.3215, + "step": 9478 + }, + { + "epoch": 0.767903434867142, + "grad_norm": 0.03244243562221527, + "learning_rate": 0.00017956253656780234, + "loss": 0.3481, + "step": 9479 + }, + { + "epoch": 0.7679844458846403, + "grad_norm": 0.037454694509506226, + "learning_rate": 0.00017955803591520773, + "loss": 0.4056, + "step": 9480 + }, + { + "epoch": 0.7680654569021387, + "grad_norm": 0.03103182651102543, + "learning_rate": 0.0001795535352626131, + "loss": 0.3104, + "step": 9481 + }, + { + "epoch": 0.768146467919637, + "grad_norm": 0.03464951366186142, + "learning_rate": 0.00017954903461001848, + "loss": 0.3621, + "step": 9482 + }, + { + "epoch": 0.7682274789371355, + "grad_norm": 0.03253443166613579, + "learning_rate": 0.00017954453395742384, + "loss": 0.3356, + "step": 9483 + }, + { + "epoch": 0.7683084899546339, + "grad_norm": 0.03156166151165962, + "learning_rate": 0.0001795400333048292, + "loss": 0.3175, + "step": 9484 + }, + { + "epoch": 0.7683895009721322, + "grad_norm": 0.030760960653424263, + "learning_rate": 0.00017953553265223458, + "loss": 0.3421, + "step": 9485 + }, + { + "epoch": 0.7684705119896306, + "grad_norm": 0.031788572669029236, + "learning_rate": 0.00017953103199963997, + "loss": 0.2998, + "step": 9486 + }, + { + "epoch": 0.7685515230071289, + "grad_norm": 0.02940155379474163, + "learning_rate": 0.00017952653134704533, + "loss": 0.301, + "step": 9487 + }, + { + "epoch": 0.7686325340246274, + "grad_norm": 0.03467442840337753, + "learning_rate": 0.00017952203069445072, + "loss": 0.3603, + "step": 9488 + }, + { + "epoch": 0.7687135450421257, + "grad_norm": 0.0318903923034668, + "learning_rate": 0.00017951753004185608, + "loss": 0.3347, + "step": 9489 + }, + { + "epoch": 0.7687945560596241, + "grad_norm": 0.03219975158572197, + "learning_rate": 0.00017951302938926144, + "loss": 0.3164, + "step": 9490 + }, + { + "epoch": 0.7688755670771225, + "grad_norm": 0.03198402747511864, + "learning_rate": 0.00017950852873666683, + "loss": 0.3344, + "step": 9491 + }, + { + "epoch": 0.7689565780946209, + "grad_norm": 0.029385024681687355, + "learning_rate": 0.0001795040280840722, + "loss": 0.2955, + "step": 9492 + }, + { + "epoch": 0.7690375891121193, + "grad_norm": 0.03244990482926369, + "learning_rate": 0.00017949952743147757, + "loss": 0.3284, + "step": 9493 + }, + { + "epoch": 0.7691186001296176, + "grad_norm": 0.0280169527977705, + "learning_rate": 0.00017949502677888296, + "loss": 0.2783, + "step": 9494 + }, + { + "epoch": 0.769199611147116, + "grad_norm": 0.035642508417367935, + "learning_rate": 0.00017949052612628832, + "loss": 0.3739, + "step": 9495 + }, + { + "epoch": 0.7692806221646143, + "grad_norm": 0.03269866853952408, + "learning_rate": 0.00017948602547369368, + "loss": 0.315, + "step": 9496 + }, + { + "epoch": 0.7693616331821128, + "grad_norm": 0.03235873579978943, + 
"learning_rate": 0.00017948152482109907, + "loss": 0.3678, + "step": 9497 + }, + { + "epoch": 0.7694426441996112, + "grad_norm": 0.03456562012434006, + "learning_rate": 0.00017947702416850445, + "loss": 0.322, + "step": 9498 + }, + { + "epoch": 0.7695236552171095, + "grad_norm": 0.036083102226257324, + "learning_rate": 0.00017947252351590981, + "loss": 0.3906, + "step": 9499 + }, + { + "epoch": 0.7696046662346079, + "grad_norm": 0.03269893676042557, + "learning_rate": 0.0001794680228633152, + "loss": 0.334, + "step": 9500 + }, + { + "epoch": 0.7696856772521062, + "grad_norm": 0.033791083842515945, + "learning_rate": 0.00017946352221072056, + "loss": 0.3306, + "step": 9501 + }, + { + "epoch": 0.7697666882696047, + "grad_norm": 0.03444517031311989, + "learning_rate": 0.00017945902155812592, + "loss": 0.3183, + "step": 9502 + }, + { + "epoch": 0.7698476992871031, + "grad_norm": 0.03153405338525772, + "learning_rate": 0.0001794545209055313, + "loss": 0.3229, + "step": 9503 + }, + { + "epoch": 0.7699287103046014, + "grad_norm": 0.03601398691534996, + "learning_rate": 0.0001794500202529367, + "loss": 0.3615, + "step": 9504 + }, + { + "epoch": 0.7700097213220998, + "grad_norm": 0.03581347316503525, + "learning_rate": 0.00017944551960034206, + "loss": 0.3464, + "step": 9505 + }, + { + "epoch": 0.7700907323395982, + "grad_norm": 0.031124750152230263, + "learning_rate": 0.00017944101894774744, + "loss": 0.3338, + "step": 9506 + }, + { + "epoch": 0.7701717433570966, + "grad_norm": 0.031031455844640732, + "learning_rate": 0.0001794365182951528, + "loss": 0.3398, + "step": 9507 + }, + { + "epoch": 0.770252754374595, + "grad_norm": 0.031076917424798012, + "learning_rate": 0.00017943201764255816, + "loss": 0.31, + "step": 9508 + }, + { + "epoch": 0.7703337653920933, + "grad_norm": 0.037329282611608505, + "learning_rate": 0.00017942751698996355, + "loss": 0.3188, + "step": 9509 + }, + { + "epoch": 0.7704147764095917, + "grad_norm": 0.036704570055007935, + "learning_rate": 0.00017942301633736894, + "loss": 0.3305, + "step": 9510 + }, + { + "epoch": 0.7704957874270901, + "grad_norm": 0.036286912858486176, + "learning_rate": 0.0001794185156847743, + "loss": 0.3318, + "step": 9511 + }, + { + "epoch": 0.7705767984445885, + "grad_norm": 0.033339180052280426, + "learning_rate": 0.00017941401503217968, + "loss": 0.3735, + "step": 9512 + }, + { + "epoch": 0.7706578094620868, + "grad_norm": 0.026896296069025993, + "learning_rate": 0.00017940951437958504, + "loss": 0.274, + "step": 9513 + }, + { + "epoch": 0.7707388204795852, + "grad_norm": 0.03523967042565346, + "learning_rate": 0.0001794050137269904, + "loss": 0.3305, + "step": 9514 + }, + { + "epoch": 0.7708198314970836, + "grad_norm": 0.034618232399225235, + "learning_rate": 0.0001794005130743958, + "loss": 0.3455, + "step": 9515 + }, + { + "epoch": 0.770900842514582, + "grad_norm": 0.034029290080070496, + "learning_rate": 0.00017939601242180118, + "loss": 0.3354, + "step": 9516 + }, + { + "epoch": 0.7709818535320804, + "grad_norm": 0.034780219197273254, + "learning_rate": 0.00017939151176920654, + "loss": 0.365, + "step": 9517 + }, + { + "epoch": 0.7710628645495787, + "grad_norm": 0.03206856921315193, + "learning_rate": 0.00017938701111661193, + "loss": 0.367, + "step": 9518 + }, + { + "epoch": 0.7711438755670771, + "grad_norm": 0.03862106427550316, + "learning_rate": 0.00017938251046401729, + "loss": 0.384, + "step": 9519 + }, + { + "epoch": 0.7712248865845756, + "grad_norm": 0.03470539674162865, + "learning_rate": 0.00017937800981142267, + "loss": 0.3311, + 
"step": 9520 + }, + { + "epoch": 0.7713058976020739, + "grad_norm": 0.02929665707051754, + "learning_rate": 0.00017937350915882803, + "loss": 0.3018, + "step": 9521 + }, + { + "epoch": 0.7713869086195723, + "grad_norm": 0.03172649070620537, + "learning_rate": 0.00017936900850623342, + "loss": 0.3377, + "step": 9522 + }, + { + "epoch": 0.7714679196370706, + "grad_norm": 0.03501424938440323, + "learning_rate": 0.00017936450785363878, + "loss": 0.388, + "step": 9523 + }, + { + "epoch": 0.771548930654569, + "grad_norm": 0.03126747906208038, + "learning_rate": 0.00017936000720104417, + "loss": 0.3486, + "step": 9524 + }, + { + "epoch": 0.7716299416720674, + "grad_norm": 0.02895498462021351, + "learning_rate": 0.00017935550654844953, + "loss": 0.3092, + "step": 9525 + }, + { + "epoch": 0.7717109526895658, + "grad_norm": 0.03243519365787506, + "learning_rate": 0.00017935100589585491, + "loss": 0.3233, + "step": 9526 + }, + { + "epoch": 0.7717919637070642, + "grad_norm": 0.03405696526169777, + "learning_rate": 0.00017934650524326027, + "loss": 0.2988, + "step": 9527 + }, + { + "epoch": 0.7718729747245625, + "grad_norm": 0.03410395607352257, + "learning_rate": 0.00017934200459066566, + "loss": 0.3578, + "step": 9528 + }, + { + "epoch": 0.7719539857420609, + "grad_norm": 0.030598022043704987, + "learning_rate": 0.00017933750393807102, + "loss": 0.3091, + "step": 9529 + }, + { + "epoch": 0.7720349967595593, + "grad_norm": 0.0366281159222126, + "learning_rate": 0.0001793330032854764, + "loss": 0.3863, + "step": 9530 + }, + { + "epoch": 0.7721160077770577, + "grad_norm": 0.036652810871601105, + "learning_rate": 0.00017932850263288177, + "loss": 0.3833, + "step": 9531 + }, + { + "epoch": 0.772197018794556, + "grad_norm": 0.03007068671286106, + "learning_rate": 0.00017932400198028716, + "loss": 0.3465, + "step": 9532 + }, + { + "epoch": 0.7722780298120544, + "grad_norm": 0.031867507845163345, + "learning_rate": 0.00017931950132769252, + "loss": 0.3191, + "step": 9533 + }, + { + "epoch": 0.7723590408295529, + "grad_norm": 0.031599272042512894, + "learning_rate": 0.0001793150006750979, + "loss": 0.3866, + "step": 9534 + }, + { + "epoch": 0.7724400518470512, + "grad_norm": 0.03572344034910202, + "learning_rate": 0.00017931050002250326, + "loss": 0.3925, + "step": 9535 + }, + { + "epoch": 0.7725210628645496, + "grad_norm": 0.033920932561159134, + "learning_rate": 0.00017930599936990865, + "loss": 0.3545, + "step": 9536 + }, + { + "epoch": 0.7726020738820479, + "grad_norm": 0.032732460647821426, + "learning_rate": 0.000179301498717314, + "loss": 0.3351, + "step": 9537 + }, + { + "epoch": 0.7726830848995463, + "grad_norm": 0.030768388882279396, + "learning_rate": 0.0001792969980647194, + "loss": 0.3133, + "step": 9538 + }, + { + "epoch": 0.7727640959170448, + "grad_norm": 0.028435923159122467, + "learning_rate": 0.00017929249741212476, + "loss": 0.325, + "step": 9539 + }, + { + "epoch": 0.7728451069345431, + "grad_norm": 0.030626384541392326, + "learning_rate": 0.00017928799675953015, + "loss": 0.2976, + "step": 9540 + }, + { + "epoch": 0.7729261179520415, + "grad_norm": 0.03307020291686058, + "learning_rate": 0.0001792834961069355, + "loss": 0.3515, + "step": 9541 + }, + { + "epoch": 0.7730071289695398, + "grad_norm": 0.030811350792646408, + "learning_rate": 0.0001792789954543409, + "loss": 0.3202, + "step": 9542 + }, + { + "epoch": 0.7730881399870383, + "grad_norm": 0.03180959075689316, + "learning_rate": 0.00017927449480174625, + "loss": 0.3461, + "step": 9543 + }, + { + "epoch": 0.7731691510045366, + 
"grad_norm": 0.03422219678759575, + "learning_rate": 0.00017926999414915164, + "loss": 0.3214, + "step": 9544 + }, + { + "epoch": 0.773250162022035, + "grad_norm": 0.03370879590511322, + "learning_rate": 0.000179265493496557, + "loss": 0.3632, + "step": 9545 + }, + { + "epoch": 0.7733311730395334, + "grad_norm": 0.028467241674661636, + "learning_rate": 0.0001792609928439624, + "loss": 0.2876, + "step": 9546 + }, + { + "epoch": 0.7734121840570317, + "grad_norm": 0.034185174852609634, + "learning_rate": 0.00017925649219136775, + "loss": 0.3264, + "step": 9547 + }, + { + "epoch": 0.7734931950745302, + "grad_norm": 0.030525004491209984, + "learning_rate": 0.00017925199153877313, + "loss": 0.3245, + "step": 9548 + }, + { + "epoch": 0.7735742060920285, + "grad_norm": 0.03793109208345413, + "learning_rate": 0.00017924749088617852, + "loss": 0.3661, + "step": 9549 + }, + { + "epoch": 0.7736552171095269, + "grad_norm": 0.03561738505959511, + "learning_rate": 0.00017924299023358388, + "loss": 0.397, + "step": 9550 + }, + { + "epoch": 0.7737362281270252, + "grad_norm": 0.029582394286990166, + "learning_rate": 0.00017923848958098927, + "loss": 0.2889, + "step": 9551 + }, + { + "epoch": 0.7738172391445236, + "grad_norm": 0.030550727620720863, + "learning_rate": 0.00017923398892839463, + "loss": 0.3313, + "step": 9552 + }, + { + "epoch": 0.7738982501620221, + "grad_norm": 0.040079787373542786, + "learning_rate": 0.0001792294882758, + "loss": 0.3366, + "step": 9553 + }, + { + "epoch": 0.7739792611795204, + "grad_norm": 0.033145103603601456, + "learning_rate": 0.00017922498762320538, + "loss": 0.3652, + "step": 9554 + }, + { + "epoch": 0.7740602721970188, + "grad_norm": 0.03565409779548645, + "learning_rate": 0.00017922048697061076, + "loss": 0.3709, + "step": 9555 + }, + { + "epoch": 0.7741412832145171, + "grad_norm": 0.03129686042666435, + "learning_rate": 0.00017921598631801612, + "loss": 0.3, + "step": 9556 + }, + { + "epoch": 0.7742222942320156, + "grad_norm": 0.033627647906541824, + "learning_rate": 0.0001792114856654215, + "loss": 0.3353, + "step": 9557 + }, + { + "epoch": 0.774303305249514, + "grad_norm": 0.03130074590444565, + "learning_rate": 0.00017920698501282687, + "loss": 0.3693, + "step": 9558 + }, + { + "epoch": 0.7743843162670123, + "grad_norm": 0.031756918877363205, + "learning_rate": 0.00017920248436023223, + "loss": 0.3226, + "step": 9559 + }, + { + "epoch": 0.7744653272845107, + "grad_norm": 0.027761923149228096, + "learning_rate": 0.00017919798370763762, + "loss": 0.3002, + "step": 9560 + }, + { + "epoch": 0.774546338302009, + "grad_norm": 0.030556004494428635, + "learning_rate": 0.000179193483055043, + "loss": 0.3502, + "step": 9561 + }, + { + "epoch": 0.7746273493195075, + "grad_norm": 0.03501565754413605, + "learning_rate": 0.00017918898240244836, + "loss": 0.3477, + "step": 9562 + }, + { + "epoch": 0.7747083603370059, + "grad_norm": 0.035923443734645844, + "learning_rate": 0.00017918448174985375, + "loss": 0.3512, + "step": 9563 + }, + { + "epoch": 0.7747893713545042, + "grad_norm": 0.037229202687740326, + "learning_rate": 0.0001791799810972591, + "loss": 0.3648, + "step": 9564 + }, + { + "epoch": 0.7748703823720026, + "grad_norm": 0.029280126094818115, + "learning_rate": 0.00017917548044466447, + "loss": 0.3377, + "step": 9565 + }, + { + "epoch": 0.7749513933895009, + "grad_norm": 0.03008767031133175, + "learning_rate": 0.00017917097979206986, + "loss": 0.3225, + "step": 9566 + }, + { + "epoch": 0.7750324044069994, + "grad_norm": 0.030362514778971672, + "learning_rate": 
0.00017916647913947525, + "loss": 0.3643, + "step": 9567 + }, + { + "epoch": 0.7751134154244977, + "grad_norm": 0.029189782217144966, + "learning_rate": 0.0001791619784868806, + "loss": 0.3192, + "step": 9568 + }, + { + "epoch": 0.7751944264419961, + "grad_norm": 0.030873114243149757, + "learning_rate": 0.000179157477834286, + "loss": 0.3158, + "step": 9569 + }, + { + "epoch": 0.7752754374594945, + "grad_norm": 0.03154481202363968, + "learning_rate": 0.00017915297718169135, + "loss": 0.3176, + "step": 9570 + }, + { + "epoch": 0.7753564484769929, + "grad_norm": 0.03424935042858124, + "learning_rate": 0.0001791484765290967, + "loss": 0.3161, + "step": 9571 + }, + { + "epoch": 0.7754374594944913, + "grad_norm": 0.0363914854824543, + "learning_rate": 0.0001791439758765021, + "loss": 0.3239, + "step": 9572 + }, + { + "epoch": 0.7755184705119896, + "grad_norm": 0.031175851821899414, + "learning_rate": 0.0001791394752239075, + "loss": 0.3578, + "step": 9573 + }, + { + "epoch": 0.775599481529488, + "grad_norm": 0.033968258649110794, + "learning_rate": 0.00017913497457131285, + "loss": 0.3488, + "step": 9574 + }, + { + "epoch": 0.7756804925469863, + "grad_norm": 0.03248978033661842, + "learning_rate": 0.00017913047391871823, + "loss": 0.3245, + "step": 9575 + }, + { + "epoch": 0.7757615035644848, + "grad_norm": 0.036548539996147156, + "learning_rate": 0.0001791259732661236, + "loss": 0.3343, + "step": 9576 + }, + { + "epoch": 0.7758425145819832, + "grad_norm": 0.031858112663030624, + "learning_rate": 0.00017912147261352895, + "loss": 0.3543, + "step": 9577 + }, + { + "epoch": 0.7759235255994815, + "grad_norm": 0.032914452254772186, + "learning_rate": 0.00017911697196093434, + "loss": 0.3456, + "step": 9578 + }, + { + "epoch": 0.7760045366169799, + "grad_norm": 0.02853773906826973, + "learning_rate": 0.00017911247130833973, + "loss": 0.3068, + "step": 9579 + }, + { + "epoch": 0.7760855476344782, + "grad_norm": 0.036762382835149765, + "learning_rate": 0.0001791079706557451, + "loss": 0.3823, + "step": 9580 + }, + { + "epoch": 0.7761665586519767, + "grad_norm": 0.0352318175137043, + "learning_rate": 0.00017910347000315048, + "loss": 0.3595, + "step": 9581 + }, + { + "epoch": 0.7762475696694751, + "grad_norm": 0.03461692854762077, + "learning_rate": 0.00017909896935055584, + "loss": 0.3355, + "step": 9582 + }, + { + "epoch": 0.7763285806869734, + "grad_norm": 0.036282241344451904, + "learning_rate": 0.0001790944686979612, + "loss": 0.3133, + "step": 9583 + }, + { + "epoch": 0.7764095917044718, + "grad_norm": 0.03350308537483215, + "learning_rate": 0.00017908996804536658, + "loss": 0.3294, + "step": 9584 + }, + { + "epoch": 0.7764906027219702, + "grad_norm": 0.029508352279663086, + "learning_rate": 0.00017908546739277197, + "loss": 0.325, + "step": 9585 + }, + { + "epoch": 0.7765716137394686, + "grad_norm": 0.03445051982998848, + "learning_rate": 0.00017908096674017733, + "loss": 0.3428, + "step": 9586 + }, + { + "epoch": 0.776652624756967, + "grad_norm": 0.029148763045668602, + "learning_rate": 0.00017907646608758272, + "loss": 0.3175, + "step": 9587 + }, + { + "epoch": 0.7767336357744653, + "grad_norm": 0.037063565105199814, + "learning_rate": 0.00017907196543498808, + "loss": 0.3849, + "step": 9588 + }, + { + "epoch": 0.7768146467919637, + "grad_norm": 0.033602166920900345, + "learning_rate": 0.00017906746478239344, + "loss": 0.3114, + "step": 9589 + }, + { + "epoch": 0.7768956578094621, + "grad_norm": 0.03480584919452667, + "learning_rate": 0.00017906296412979883, + "loss": 0.3049, + "step": 9590 + 
}, + { + "epoch": 0.7769766688269605, + "grad_norm": 0.02770129404962063, + "learning_rate": 0.0001790584634772042, + "loss": 0.34, + "step": 9591 + }, + { + "epoch": 0.7770576798444588, + "grad_norm": 0.03278804570436478, + "learning_rate": 0.00017905396282460957, + "loss": 0.3414, + "step": 9592 + }, + { + "epoch": 0.7771386908619572, + "grad_norm": 0.02992059662938118, + "learning_rate": 0.00017904946217201496, + "loss": 0.3476, + "step": 9593 + }, + { + "epoch": 0.7772197018794557, + "grad_norm": 0.028599180281162262, + "learning_rate": 0.00017904496151942032, + "loss": 0.3124, + "step": 9594 + }, + { + "epoch": 0.777300712896954, + "grad_norm": 0.033986471593379974, + "learning_rate": 0.00017904046086682568, + "loss": 0.3397, + "step": 9595 + }, + { + "epoch": 0.7773817239144524, + "grad_norm": 0.032153479754924774, + "learning_rate": 0.00017903596021423107, + "loss": 0.3471, + "step": 9596 + }, + { + "epoch": 0.7774627349319507, + "grad_norm": 0.030729763209819794, + "learning_rate": 0.00017903145956163645, + "loss": 0.2989, + "step": 9597 + }, + { + "epoch": 0.7775437459494491, + "grad_norm": 0.0319899320602417, + "learning_rate": 0.00017902695890904181, + "loss": 0.3418, + "step": 9598 + }, + { + "epoch": 0.7776247569669476, + "grad_norm": 0.042746610939502716, + "learning_rate": 0.0001790224582564472, + "loss": 0.3612, + "step": 9599 + }, + { + "epoch": 0.7777057679844459, + "grad_norm": 0.031451623886823654, + "learning_rate": 0.00017901795760385256, + "loss": 0.3429, + "step": 9600 + }, + { + "epoch": 0.7777867790019443, + "grad_norm": 0.034506361931562424, + "learning_rate": 0.00017901345695125795, + "loss": 0.3292, + "step": 9601 + }, + { + "epoch": 0.7778677900194426, + "grad_norm": 0.027968307957053185, + "learning_rate": 0.0001790089562986633, + "loss": 0.2688, + "step": 9602 + }, + { + "epoch": 0.777948801036941, + "grad_norm": 0.035632938146591187, + "learning_rate": 0.0001790044556460687, + "loss": 0.3485, + "step": 9603 + }, + { + "epoch": 0.7780298120544394, + "grad_norm": 0.03224572911858559, + "learning_rate": 0.00017899995499347406, + "loss": 0.3222, + "step": 9604 + }, + { + "epoch": 0.7781108230719378, + "grad_norm": 0.03422819823026657, + "learning_rate": 0.00017899545434087944, + "loss": 0.3058, + "step": 9605 + }, + { + "epoch": 0.7781918340894362, + "grad_norm": 0.032302357256412506, + "learning_rate": 0.0001789909536882848, + "loss": 0.3048, + "step": 9606 + }, + { + "epoch": 0.7782728451069345, + "grad_norm": 0.04375520721077919, + "learning_rate": 0.0001789864530356902, + "loss": 0.3714, + "step": 9607 + }, + { + "epoch": 0.778353856124433, + "grad_norm": 0.031402699649333954, + "learning_rate": 0.00017898195238309555, + "loss": 0.3035, + "step": 9608 + }, + { + "epoch": 0.7784348671419313, + "grad_norm": 0.03309689834713936, + "learning_rate": 0.00017897745173050094, + "loss": 0.3285, + "step": 9609 + }, + { + "epoch": 0.7785158781594297, + "grad_norm": 0.03338422253727913, + "learning_rate": 0.0001789729510779063, + "loss": 0.3332, + "step": 9610 + }, + { + "epoch": 0.778596889176928, + "grad_norm": 0.03191295266151428, + "learning_rate": 0.00017896845042531168, + "loss": 0.3559, + "step": 9611 + }, + { + "epoch": 0.7786779001944264, + "grad_norm": 0.02891041338443756, + "learning_rate": 0.00017896394977271704, + "loss": 0.319, + "step": 9612 + }, + { + "epoch": 0.7787589112119249, + "grad_norm": 0.03670676797628403, + "learning_rate": 0.00017895944912012243, + "loss": 0.3447, + "step": 9613 + }, + { + "epoch": 0.7788399222294232, + "grad_norm": 
0.03295731917023659, + "learning_rate": 0.00017895494846752782, + "loss": 0.3487, + "step": 9614 + }, + { + "epoch": 0.7789209332469216, + "grad_norm": 0.032904211431741714, + "learning_rate": 0.00017895044781493318, + "loss": 0.3538, + "step": 9615 + }, + { + "epoch": 0.7790019442644199, + "grad_norm": 0.040504857897758484, + "learning_rate": 0.00017894594716233854, + "loss": 0.374, + "step": 9616 + }, + { + "epoch": 0.7790829552819183, + "grad_norm": 0.03470660373568535, + "learning_rate": 0.00017894144650974393, + "loss": 0.3646, + "step": 9617 + }, + { + "epoch": 0.7791639662994168, + "grad_norm": 0.03370751813054085, + "learning_rate": 0.00017893694585714929, + "loss": 0.3769, + "step": 9618 + }, + { + "epoch": 0.7792449773169151, + "grad_norm": 0.03140348568558693, + "learning_rate": 0.00017893244520455467, + "loss": 0.2578, + "step": 9619 + }, + { + "epoch": 0.7793259883344135, + "grad_norm": 0.033045411109924316, + "learning_rate": 0.00017892794455196006, + "loss": 0.3683, + "step": 9620 + }, + { + "epoch": 0.7794069993519118, + "grad_norm": 0.030752673745155334, + "learning_rate": 0.00017892344389936542, + "loss": 0.3476, + "step": 9621 + }, + { + "epoch": 0.7794880103694103, + "grad_norm": 0.035168688744306564, + "learning_rate": 0.00017891894324677078, + "loss": 0.3032, + "step": 9622 + }, + { + "epoch": 0.7795690213869086, + "grad_norm": 0.03096243180334568, + "learning_rate": 0.00017891444259417617, + "loss": 0.3188, + "step": 9623 + }, + { + "epoch": 0.779650032404407, + "grad_norm": 0.0341077521443367, + "learning_rate": 0.00017890994194158153, + "loss": 0.358, + "step": 9624 + }, + { + "epoch": 0.7797310434219054, + "grad_norm": 0.031084155663847923, + "learning_rate": 0.00017890544128898692, + "loss": 0.3123, + "step": 9625 + }, + { + "epoch": 0.7798120544394037, + "grad_norm": 0.032854244112968445, + "learning_rate": 0.0001789009406363923, + "loss": 0.3659, + "step": 9626 + }, + { + "epoch": 0.7798930654569022, + "grad_norm": 0.033259496092796326, + "learning_rate": 0.00017889643998379766, + "loss": 0.3104, + "step": 9627 + }, + { + "epoch": 0.7799740764744005, + "grad_norm": 0.03488700091838837, + "learning_rate": 0.00017889193933120302, + "loss": 0.2932, + "step": 9628 + }, + { + "epoch": 0.7800550874918989, + "grad_norm": 0.03404168039560318, + "learning_rate": 0.0001788874386786084, + "loss": 0.3103, + "step": 9629 + }, + { + "epoch": 0.7801360985093972, + "grad_norm": 0.03553805127739906, + "learning_rate": 0.0001788829380260138, + "loss": 0.3551, + "step": 9630 + }, + { + "epoch": 0.7802171095268956, + "grad_norm": 0.03256218135356903, + "learning_rate": 0.00017887843737341916, + "loss": 0.3292, + "step": 9631 + }, + { + "epoch": 0.7802981205443941, + "grad_norm": 0.031110748648643494, + "learning_rate": 0.00017887393672082454, + "loss": 0.3527, + "step": 9632 + }, + { + "epoch": 0.7803791315618924, + "grad_norm": 0.03429475054144859, + "learning_rate": 0.0001788694360682299, + "loss": 0.2781, + "step": 9633 + }, + { + "epoch": 0.7804601425793908, + "grad_norm": 0.03137778490781784, + "learning_rate": 0.00017886493541563526, + "loss": 0.3287, + "step": 9634 + }, + { + "epoch": 0.7805411535968891, + "grad_norm": 0.03139874339103699, + "learning_rate": 0.00017886043476304065, + "loss": 0.3391, + "step": 9635 + }, + { + "epoch": 0.7806221646143876, + "grad_norm": 0.03607549890875816, + "learning_rate": 0.00017885593411044604, + "loss": 0.3364, + "step": 9636 + }, + { + "epoch": 0.780703175631886, + "grad_norm": 0.03316293656826019, + "learning_rate": 
0.0001788514334578514, + "loss": 0.3261, + "step": 9637 + }, + { + "epoch": 0.7807841866493843, + "grad_norm": 0.031096961349248886, + "learning_rate": 0.00017884693280525679, + "loss": 0.325, + "step": 9638 + }, + { + "epoch": 0.7808651976668827, + "grad_norm": 0.029852962121367455, + "learning_rate": 0.00017884243215266215, + "loss": 0.3336, + "step": 9639 + }, + { + "epoch": 0.780946208684381, + "grad_norm": 0.035916849970817566, + "learning_rate": 0.0001788379315000675, + "loss": 0.3333, + "step": 9640 + }, + { + "epoch": 0.7810272197018795, + "grad_norm": 0.03139684349298477, + "learning_rate": 0.0001788334308474729, + "loss": 0.3193, + "step": 9641 + }, + { + "epoch": 0.7811082307193778, + "grad_norm": 0.030681351199746132, + "learning_rate": 0.00017882893019487828, + "loss": 0.3389, + "step": 9642 + }, + { + "epoch": 0.7811892417368762, + "grad_norm": 0.033301860094070435, + "learning_rate": 0.00017882442954228364, + "loss": 0.3235, + "step": 9643 + }, + { + "epoch": 0.7812702527543746, + "grad_norm": 0.03338354825973511, + "learning_rate": 0.00017881992888968903, + "loss": 0.348, + "step": 9644 + }, + { + "epoch": 0.781351263771873, + "grad_norm": 0.04342193156480789, + "learning_rate": 0.0001788154282370944, + "loss": 0.3586, + "step": 9645 + }, + { + "epoch": 0.7814322747893714, + "grad_norm": 0.035788752138614655, + "learning_rate": 0.00017881092758449975, + "loss": 0.2832, + "step": 9646 + }, + { + "epoch": 0.7815132858068697, + "grad_norm": 0.032872676849365234, + "learning_rate": 0.00017880642693190513, + "loss": 0.3529, + "step": 9647 + }, + { + "epoch": 0.7815942968243681, + "grad_norm": 0.03165189176797867, + "learning_rate": 0.00017880192627931052, + "loss": 0.3199, + "step": 9648 + }, + { + "epoch": 0.7816753078418665, + "grad_norm": 0.0314420610666275, + "learning_rate": 0.00017879742562671588, + "loss": 0.348, + "step": 9649 + }, + { + "epoch": 0.7817563188593649, + "grad_norm": 0.030172783881425858, + "learning_rate": 0.00017879292497412127, + "loss": 0.2972, + "step": 9650 + }, + { + "epoch": 0.7818373298768633, + "grad_norm": 0.029220450669527054, + "learning_rate": 0.00017878842432152663, + "loss": 0.332, + "step": 9651 + }, + { + "epoch": 0.7819183408943616, + "grad_norm": 0.030133794993162155, + "learning_rate": 0.000178783923668932, + "loss": 0.319, + "step": 9652 + }, + { + "epoch": 0.78199935191186, + "grad_norm": 0.0334329754114151, + "learning_rate": 0.00017877942301633738, + "loss": 0.3412, + "step": 9653 + }, + { + "epoch": 0.7820803629293583, + "grad_norm": 0.03315896913409233, + "learning_rate": 0.00017877492236374276, + "loss": 0.3074, + "step": 9654 + }, + { + "epoch": 0.7821613739468568, + "grad_norm": 0.03713025152683258, + "learning_rate": 0.00017877042171114812, + "loss": 0.3226, + "step": 9655 + }, + { + "epoch": 0.7822423849643552, + "grad_norm": 0.03378988802433014, + "learning_rate": 0.0001787659210585535, + "loss": 0.359, + "step": 9656 + }, + { + "epoch": 0.7823233959818535, + "grad_norm": 0.03350520133972168, + "learning_rate": 0.00017876142040595887, + "loss": 0.3346, + "step": 9657 + }, + { + "epoch": 0.7824044069993519, + "grad_norm": 0.03147077187895775, + "learning_rate": 0.00017875691975336423, + "loss": 0.3423, + "step": 9658 + }, + { + "epoch": 0.7824854180168503, + "grad_norm": 0.02769390493631363, + "learning_rate": 0.00017875241910076962, + "loss": 0.3126, + "step": 9659 + }, + { + "epoch": 0.7825664290343487, + "grad_norm": 0.03711729869246483, + "learning_rate": 0.000178747918448175, + "loss": 0.2779, + "step": 9660 + }, + { + 
"epoch": 0.782647440051847, + "grad_norm": 0.03212438523769379, + "learning_rate": 0.00017874341779558036, + "loss": 0.2873, + "step": 9661 + }, + { + "epoch": 0.7827284510693454, + "grad_norm": 0.03071577101945877, + "learning_rate": 0.00017873891714298575, + "loss": 0.2942, + "step": 9662 + }, + { + "epoch": 0.7828094620868438, + "grad_norm": 0.03193732351064682, + "learning_rate": 0.0001787344164903911, + "loss": 0.3417, + "step": 9663 + }, + { + "epoch": 0.7828904731043422, + "grad_norm": 0.03078223206102848, + "learning_rate": 0.00017872991583779647, + "loss": 0.3429, + "step": 9664 + }, + { + "epoch": 0.7829714841218406, + "grad_norm": 0.034121815115213394, + "learning_rate": 0.00017872541518520186, + "loss": 0.3843, + "step": 9665 + }, + { + "epoch": 0.7830524951393389, + "grad_norm": 0.03557079657912254, + "learning_rate": 0.00017872091453260725, + "loss": 0.3732, + "step": 9666 + }, + { + "epoch": 0.7831335061568373, + "grad_norm": 0.032793350517749786, + "learning_rate": 0.0001787164138800126, + "loss": 0.3637, + "step": 9667 + }, + { + "epoch": 0.7832145171743357, + "grad_norm": 0.03170783817768097, + "learning_rate": 0.000178711913227418, + "loss": 0.3299, + "step": 9668 + }, + { + "epoch": 0.7832955281918341, + "grad_norm": 0.03139970824122429, + "learning_rate": 0.00017870741257482335, + "loss": 0.3519, + "step": 9669 + }, + { + "epoch": 0.7833765392093325, + "grad_norm": 0.03657853603363037, + "learning_rate": 0.00017870291192222871, + "loss": 0.3497, + "step": 9670 + }, + { + "epoch": 0.7834575502268308, + "grad_norm": 0.03443003445863724, + "learning_rate": 0.0001786984112696341, + "loss": 0.3633, + "step": 9671 + }, + { + "epoch": 0.7835385612443292, + "grad_norm": 0.03483427315950394, + "learning_rate": 0.0001786939106170395, + "loss": 0.3797, + "step": 9672 + }, + { + "epoch": 0.7836195722618277, + "grad_norm": 0.031927600502967834, + "learning_rate": 0.00017868940996444485, + "loss": 0.301, + "step": 9673 + }, + { + "epoch": 0.783700583279326, + "grad_norm": 0.03260060027241707, + "learning_rate": 0.00017868490931185024, + "loss": 0.3489, + "step": 9674 + }, + { + "epoch": 0.7837815942968244, + "grad_norm": 0.027573350816965103, + "learning_rate": 0.0001786804086592556, + "loss": 0.2931, + "step": 9675 + }, + { + "epoch": 0.7838626053143227, + "grad_norm": 0.02995600365102291, + "learning_rate": 0.00017867590800666096, + "loss": 0.2981, + "step": 9676 + }, + { + "epoch": 0.7839436163318211, + "grad_norm": 0.03223692625761032, + "learning_rate": 0.00017867140735406634, + "loss": 0.3589, + "step": 9677 + }, + { + "epoch": 0.7840246273493195, + "grad_norm": 0.03705068677663803, + "learning_rate": 0.00017866690670147173, + "loss": 0.3344, + "step": 9678 + }, + { + "epoch": 0.7841056383668179, + "grad_norm": 0.03044736385345459, + "learning_rate": 0.0001786624060488771, + "loss": 0.3734, + "step": 9679 + }, + { + "epoch": 0.7841866493843163, + "grad_norm": 0.031635627150535583, + "learning_rate": 0.00017865790539628248, + "loss": 0.342, + "step": 9680 + }, + { + "epoch": 0.7842676604018146, + "grad_norm": 0.028439925983548164, + "learning_rate": 0.00017865340474368784, + "loss": 0.2895, + "step": 9681 + }, + { + "epoch": 0.7843486714193131, + "grad_norm": 0.03232768177986145, + "learning_rate": 0.00017864890409109322, + "loss": 0.3541, + "step": 9682 + }, + { + "epoch": 0.7844296824368114, + "grad_norm": 0.030741017311811447, + "learning_rate": 0.0001786444034384986, + "loss": 0.3588, + "step": 9683 + }, + { + "epoch": 0.7845106934543098, + "grad_norm": 0.03578881546854973, 
+ "learning_rate": 0.00017863990278590397, + "loss": 0.3798, + "step": 9684 + }, + { + "epoch": 0.7845917044718081, + "grad_norm": 0.029170949012041092, + "learning_rate": 0.00017863540213330933, + "loss": 0.3481, + "step": 9685 + }, + { + "epoch": 0.7846727154893065, + "grad_norm": 0.031627245247364044, + "learning_rate": 0.00017863090148071472, + "loss": 0.3582, + "step": 9686 + }, + { + "epoch": 0.784753726506805, + "grad_norm": 0.03248753771185875, + "learning_rate": 0.00017862640082812008, + "loss": 0.3353, + "step": 9687 + }, + { + "epoch": 0.7848347375243033, + "grad_norm": 0.030072558671236038, + "learning_rate": 0.00017862190017552547, + "loss": 0.3435, + "step": 9688 + }, + { + "epoch": 0.7849157485418017, + "grad_norm": 0.032075896859169006, + "learning_rate": 0.00017861739952293085, + "loss": 0.3313, + "step": 9689 + }, + { + "epoch": 0.7849967595593, + "grad_norm": 0.032127607613801956, + "learning_rate": 0.0001786128988703362, + "loss": 0.3554, + "step": 9690 + }, + { + "epoch": 0.7850777705767984, + "grad_norm": 0.028677096590399742, + "learning_rate": 0.00017860839821774157, + "loss": 0.2903, + "step": 9691 + }, + { + "epoch": 0.7851587815942969, + "grad_norm": 0.030471572652459145, + "learning_rate": 0.00017860389756514696, + "loss": 0.34, + "step": 9692 + }, + { + "epoch": 0.7852397926117952, + "grad_norm": 0.035030148923397064, + "learning_rate": 0.00017859939691255232, + "loss": 0.3296, + "step": 9693 + }, + { + "epoch": 0.7853208036292936, + "grad_norm": 0.03215445205569267, + "learning_rate": 0.0001785948962599577, + "loss": 0.3515, + "step": 9694 + }, + { + "epoch": 0.7854018146467919, + "grad_norm": 0.0335715189576149, + "learning_rate": 0.0001785903956073631, + "loss": 0.3452, + "step": 9695 + }, + { + "epoch": 0.7854828256642904, + "grad_norm": 0.03462142124772072, + "learning_rate": 0.00017858589495476845, + "loss": 0.3536, + "step": 9696 + }, + { + "epoch": 0.7855638366817888, + "grad_norm": 0.030049508437514305, + "learning_rate": 0.00017858139430217381, + "loss": 0.306, + "step": 9697 + }, + { + "epoch": 0.7856448476992871, + "grad_norm": 0.03550144284963608, + "learning_rate": 0.0001785768936495792, + "loss": 0.3499, + "step": 9698 + }, + { + "epoch": 0.7857258587167855, + "grad_norm": 0.031812943518161774, + "learning_rate": 0.00017857239299698456, + "loss": 0.3201, + "step": 9699 + }, + { + "epoch": 0.7858068697342838, + "grad_norm": 0.03390384465456009, + "learning_rate": 0.00017856789234438995, + "loss": 0.3668, + "step": 9700 + }, + { + "epoch": 0.7858878807517823, + "grad_norm": 0.03422324359416962, + "learning_rate": 0.00017856339169179534, + "loss": 0.3161, + "step": 9701 + }, + { + "epoch": 0.7859688917692806, + "grad_norm": 0.037070129066705704, + "learning_rate": 0.0001785588910392007, + "loss": 0.3561, + "step": 9702 + }, + { + "epoch": 0.786049902786779, + "grad_norm": 0.032369211316108704, + "learning_rate": 0.00017855439038660606, + "loss": 0.3626, + "step": 9703 + }, + { + "epoch": 0.7861309138042774, + "grad_norm": 0.03542504832148552, + "learning_rate": 0.00017854988973401144, + "loss": 0.4142, + "step": 9704 + }, + { + "epoch": 0.7862119248217757, + "grad_norm": 0.03197111189365387, + "learning_rate": 0.00017854538908141683, + "loss": 0.3509, + "step": 9705 + }, + { + "epoch": 0.7862929358392742, + "grad_norm": 0.032501548528671265, + "learning_rate": 0.0001785408884288222, + "loss": 0.353, + "step": 9706 + }, + { + "epoch": 0.7863739468567725, + "grad_norm": 0.03382599353790283, + "learning_rate": 0.00017853638777622758, + "loss": 0.3428, 
+ "step": 9707 + }, + { + "epoch": 0.7864549578742709, + "grad_norm": 0.035178374499082565, + "learning_rate": 0.00017853188712363294, + "loss": 0.3423, + "step": 9708 + }, + { + "epoch": 0.7865359688917692, + "grad_norm": 0.0341215543448925, + "learning_rate": 0.0001785273864710383, + "loss": 0.3263, + "step": 9709 + }, + { + "epoch": 0.7866169799092677, + "grad_norm": 0.030356815084815025, + "learning_rate": 0.00017852288581844368, + "loss": 0.3085, + "step": 9710 + }, + { + "epoch": 0.7866979909267661, + "grad_norm": 0.03421403467655182, + "learning_rate": 0.00017851838516584907, + "loss": 0.3651, + "step": 9711 + }, + { + "epoch": 0.7867790019442644, + "grad_norm": 0.03532508388161659, + "learning_rate": 0.00017851388451325443, + "loss": 0.3753, + "step": 9712 + }, + { + "epoch": 0.7868600129617628, + "grad_norm": 0.03950536251068115, + "learning_rate": 0.00017850938386065982, + "loss": 0.3683, + "step": 9713 + }, + { + "epoch": 0.7869410239792611, + "grad_norm": 0.031308356672525406, + "learning_rate": 0.00017850488320806518, + "loss": 0.3372, + "step": 9714 + }, + { + "epoch": 0.7870220349967596, + "grad_norm": 0.03673476725816727, + "learning_rate": 0.00017850038255547054, + "loss": 0.3931, + "step": 9715 + }, + { + "epoch": 0.787103046014258, + "grad_norm": 0.03149571642279625, + "learning_rate": 0.00017849588190287593, + "loss": 0.3008, + "step": 9716 + }, + { + "epoch": 0.7871840570317563, + "grad_norm": 0.02885325625538826, + "learning_rate": 0.00017849138125028131, + "loss": 0.3077, + "step": 9717 + }, + { + "epoch": 0.7872650680492547, + "grad_norm": 0.031650837510824203, + "learning_rate": 0.00017848688059768667, + "loss": 0.3456, + "step": 9718 + }, + { + "epoch": 0.787346079066753, + "grad_norm": 0.03272511065006256, + "learning_rate": 0.00017848237994509206, + "loss": 0.3169, + "step": 9719 + }, + { + "epoch": 0.7874270900842515, + "grad_norm": 0.028613831847906113, + "learning_rate": 0.00017847787929249742, + "loss": 0.2844, + "step": 9720 + }, + { + "epoch": 0.7875081011017498, + "grad_norm": 0.0399998277425766, + "learning_rate": 0.00017847337863990278, + "loss": 0.3512, + "step": 9721 + }, + { + "epoch": 0.7875891121192482, + "grad_norm": 0.03618674352765083, + "learning_rate": 0.00017846887798730817, + "loss": 0.3317, + "step": 9722 + }, + { + "epoch": 0.7876701231367466, + "grad_norm": 0.03203833848237991, + "learning_rate": 0.00017846437733471356, + "loss": 0.3709, + "step": 9723 + }, + { + "epoch": 0.787751134154245, + "grad_norm": 0.03079145960509777, + "learning_rate": 0.00017845987668211892, + "loss": 0.3337, + "step": 9724 + }, + { + "epoch": 0.7878321451717434, + "grad_norm": 0.031215449795126915, + "learning_rate": 0.0001784553760295243, + "loss": 0.301, + "step": 9725 + }, + { + "epoch": 0.7879131561892417, + "grad_norm": 0.032910317182540894, + "learning_rate": 0.00017845087537692966, + "loss": 0.3394, + "step": 9726 + }, + { + "epoch": 0.7879941672067401, + "grad_norm": 0.029228439554572105, + "learning_rate": 0.00017844637472433502, + "loss": 0.3028, + "step": 9727 + }, + { + "epoch": 0.7880751782242384, + "grad_norm": 0.03681560978293419, + "learning_rate": 0.0001784418740717404, + "loss": 0.366, + "step": 9728 + }, + { + "epoch": 0.7881561892417369, + "grad_norm": 0.02952171303331852, + "learning_rate": 0.0001784373734191458, + "loss": 0.3399, + "step": 9729 + }, + { + "epoch": 0.7882372002592353, + "grad_norm": 0.03275004029273987, + "learning_rate": 0.00017843287276655116, + "loss": 0.345, + "step": 9730 + }, + { + "epoch": 0.7883182112767336, + 
"grad_norm": 0.0315512977540493, + "learning_rate": 0.00017842837211395654, + "loss": 0.3281, + "step": 9731 + }, + { + "epoch": 0.788399222294232, + "grad_norm": 0.03237222507596016, + "learning_rate": 0.0001784238714613619, + "loss": 0.3266, + "step": 9732 + }, + { + "epoch": 0.7884802333117304, + "grad_norm": 0.03108234703540802, + "learning_rate": 0.00017841937080876726, + "loss": 0.3206, + "step": 9733 + }, + { + "epoch": 0.7885612443292288, + "grad_norm": 0.03223662078380585, + "learning_rate": 0.00017841487015617265, + "loss": 0.3309, + "step": 9734 + }, + { + "epoch": 0.7886422553467272, + "grad_norm": 0.03800920397043228, + "learning_rate": 0.00017841036950357804, + "loss": 0.3842, + "step": 9735 + }, + { + "epoch": 0.7887232663642255, + "grad_norm": 0.03151298314332962, + "learning_rate": 0.0001784058688509834, + "loss": 0.314, + "step": 9736 + }, + { + "epoch": 0.7888042773817239, + "grad_norm": 0.03089182637631893, + "learning_rate": 0.00017840136819838879, + "loss": 0.3261, + "step": 9737 + }, + { + "epoch": 0.7888852883992223, + "grad_norm": 0.03249596431851387, + "learning_rate": 0.00017839686754579415, + "loss": 0.2848, + "step": 9738 + }, + { + "epoch": 0.7889662994167207, + "grad_norm": 0.03219173103570938, + "learning_rate": 0.0001783923668931995, + "loss": 0.3275, + "step": 9739 + }, + { + "epoch": 0.789047310434219, + "grad_norm": 0.033864930272102356, + "learning_rate": 0.0001783878662406049, + "loss": 0.3618, + "step": 9740 + }, + { + "epoch": 0.7891283214517174, + "grad_norm": 0.031739283353090286, + "learning_rate": 0.00017838336558801028, + "loss": 0.3355, + "step": 9741 + }, + { + "epoch": 0.7892093324692158, + "grad_norm": 0.029978927224874496, + "learning_rate": 0.00017837886493541564, + "loss": 0.3614, + "step": 9742 + }, + { + "epoch": 0.7892903434867142, + "grad_norm": 0.03132997453212738, + "learning_rate": 0.00017837436428282103, + "loss": 0.3458, + "step": 9743 + }, + { + "epoch": 0.7893713545042126, + "grad_norm": 0.0318823978304863, + "learning_rate": 0.0001783698636302264, + "loss": 0.3637, + "step": 9744 + }, + { + "epoch": 0.7894523655217109, + "grad_norm": 0.031721170991659164, + "learning_rate": 0.00017836536297763175, + "loss": 0.3446, + "step": 9745 + }, + { + "epoch": 0.7895333765392093, + "grad_norm": 0.036005645990371704, + "learning_rate": 0.00017836086232503713, + "loss": 0.3616, + "step": 9746 + }, + { + "epoch": 0.7896143875567078, + "grad_norm": 0.029349252581596375, + "learning_rate": 0.00017835636167244252, + "loss": 0.3128, + "step": 9747 + }, + { + "epoch": 0.7896953985742061, + "grad_norm": 0.03366193547844887, + "learning_rate": 0.00017835186101984788, + "loss": 0.3768, + "step": 9748 + }, + { + "epoch": 0.7897764095917045, + "grad_norm": 0.03656681627035141, + "learning_rate": 0.00017834736036725327, + "loss": 0.3455, + "step": 9749 + }, + { + "epoch": 0.7898574206092028, + "grad_norm": 0.035946715623140335, + "learning_rate": 0.00017834285971465863, + "loss": 0.3624, + "step": 9750 + }, + { + "epoch": 0.7899384316267012, + "grad_norm": 0.033776670694351196, + "learning_rate": 0.000178338359062064, + "loss": 0.416, + "step": 9751 + }, + { + "epoch": 0.7900194426441997, + "grad_norm": 0.033469364047050476, + "learning_rate": 0.0001783338584094694, + "loss": 0.4144, + "step": 9752 + }, + { + "epoch": 0.790100453661698, + "grad_norm": 0.029603907838463783, + "learning_rate": 0.00017832935775687476, + "loss": 0.3114, + "step": 9753 + }, + { + "epoch": 0.7901814646791964, + "grad_norm": 0.031154030933976173, + "learning_rate": 
0.00017832485710428012, + "loss": 0.3165, + "step": 9754 + }, + { + "epoch": 0.7902624756966947, + "grad_norm": 0.03153345733880997, + "learning_rate": 0.0001783203564516855, + "loss": 0.3285, + "step": 9755 + }, + { + "epoch": 0.7903434867141931, + "grad_norm": 0.033831704407930374, + "learning_rate": 0.00017831585579909087, + "loss": 0.3407, + "step": 9756 + }, + { + "epoch": 0.7904244977316915, + "grad_norm": 0.02914523147046566, + "learning_rate": 0.00017831135514649623, + "loss": 0.3039, + "step": 9757 + }, + { + "epoch": 0.7905055087491899, + "grad_norm": 0.02872067503631115, + "learning_rate": 0.00017830685449390164, + "loss": 0.3161, + "step": 9758 + }, + { + "epoch": 0.7905865197666883, + "grad_norm": 0.03430357947945595, + "learning_rate": 0.000178302353841307, + "loss": 0.3809, + "step": 9759 + }, + { + "epoch": 0.7906675307841866, + "grad_norm": 0.0303533636033535, + "learning_rate": 0.00017829785318871237, + "loss": 0.3368, + "step": 9760 + }, + { + "epoch": 0.7907485418016851, + "grad_norm": 0.03114994615316391, + "learning_rate": 0.00017829335253611775, + "loss": 0.3374, + "step": 9761 + }, + { + "epoch": 0.7908295528191834, + "grad_norm": 0.032797671854496, + "learning_rate": 0.0001782888518835231, + "loss": 0.3391, + "step": 9762 + }, + { + "epoch": 0.7909105638366818, + "grad_norm": 0.034057628363370895, + "learning_rate": 0.0001782843512309285, + "loss": 0.3473, + "step": 9763 + }, + { + "epoch": 0.7909915748541801, + "grad_norm": 0.03768150880932808, + "learning_rate": 0.0001782798505783339, + "loss": 0.3216, + "step": 9764 + }, + { + "epoch": 0.7910725858716785, + "grad_norm": 0.03195664659142494, + "learning_rate": 0.00017827534992573925, + "loss": 0.3239, + "step": 9765 + }, + { + "epoch": 0.791153596889177, + "grad_norm": 0.034110911190509796, + "learning_rate": 0.0001782708492731446, + "loss": 0.2988, + "step": 9766 + }, + { + "epoch": 0.7912346079066753, + "grad_norm": 0.03776692971587181, + "learning_rate": 0.00017826634862055, + "loss": 0.3188, + "step": 9767 + }, + { + "epoch": 0.7913156189241737, + "grad_norm": 0.03164956718683243, + "learning_rate": 0.00017826184796795535, + "loss": 0.3367, + "step": 9768 + }, + { + "epoch": 0.791396629941672, + "grad_norm": 0.034085437655448914, + "learning_rate": 0.00017825734731536074, + "loss": 0.3762, + "step": 9769 + }, + { + "epoch": 0.7914776409591704, + "grad_norm": 0.059056442230939865, + "learning_rate": 0.00017825284666276613, + "loss": 0.3315, + "step": 9770 + }, + { + "epoch": 0.7915586519766689, + "grad_norm": 0.03333378955721855, + "learning_rate": 0.0001782483460101715, + "loss": 0.3224, + "step": 9771 + }, + { + "epoch": 0.7916396629941672, + "grad_norm": 0.035385195165872574, + "learning_rate": 0.00017824384535757685, + "loss": 0.3564, + "step": 9772 + }, + { + "epoch": 0.7917206740116656, + "grad_norm": 0.03095993772149086, + "learning_rate": 0.00017823934470498224, + "loss": 0.306, + "step": 9773 + }, + { + "epoch": 0.7918016850291639, + "grad_norm": 0.035980224609375, + "learning_rate": 0.0001782348440523876, + "loss": 0.321, + "step": 9774 + }, + { + "epoch": 0.7918826960466624, + "grad_norm": 0.03179408237338066, + "learning_rate": 0.00017823034339979298, + "loss": 0.3073, + "step": 9775 + }, + { + "epoch": 0.7919637070641607, + "grad_norm": 0.0315295048058033, + "learning_rate": 0.00017822584274719837, + "loss": 0.2791, + "step": 9776 + }, + { + "epoch": 0.7920447180816591, + "grad_norm": 0.03435314819216728, + "learning_rate": 0.00017822134209460373, + "loss": 0.3752, + "step": 9777 + }, + { + 
"epoch": 0.7921257290991575, + "grad_norm": 0.0332770012319088, + "learning_rate": 0.0001782168414420091, + "loss": 0.3101, + "step": 9778 + }, + { + "epoch": 0.7922067401166558, + "grad_norm": 0.031906016170978546, + "learning_rate": 0.00017821234078941448, + "loss": 0.2801, + "step": 9779 + }, + { + "epoch": 0.7922877511341543, + "grad_norm": 0.03557458519935608, + "learning_rate": 0.00017820784013681984, + "loss": 0.3043, + "step": 9780 + }, + { + "epoch": 0.7923687621516526, + "grad_norm": 0.031249122694134712, + "learning_rate": 0.00017820333948422522, + "loss": 0.323, + "step": 9781 + }, + { + "epoch": 0.792449773169151, + "grad_norm": 0.027938146144151688, + "learning_rate": 0.0001781988388316306, + "loss": 0.2701, + "step": 9782 + }, + { + "epoch": 0.7925307841866494, + "grad_norm": 0.038791898638010025, + "learning_rate": 0.00017819433817903597, + "loss": 0.3682, + "step": 9783 + }, + { + "epoch": 0.7926117952041478, + "grad_norm": 0.03866951912641525, + "learning_rate": 0.00017818983752644133, + "loss": 0.3229, + "step": 9784 + }, + { + "epoch": 0.7926928062216462, + "grad_norm": 0.03439750149846077, + "learning_rate": 0.00017818533687384672, + "loss": 0.3426, + "step": 9785 + }, + { + "epoch": 0.7927738172391445, + "grad_norm": 0.03489131107926369, + "learning_rate": 0.0001781808362212521, + "loss": 0.3889, + "step": 9786 + }, + { + "epoch": 0.7928548282566429, + "grad_norm": 0.031454797834157944, + "learning_rate": 0.00017817633556865747, + "loss": 0.3119, + "step": 9787 + }, + { + "epoch": 0.7929358392741412, + "grad_norm": 0.0329572819173336, + "learning_rate": 0.00017817183491606285, + "loss": 0.3671, + "step": 9788 + }, + { + "epoch": 0.7930168502916397, + "grad_norm": 0.03671346977353096, + "learning_rate": 0.0001781673342634682, + "loss": 0.3855, + "step": 9789 + }, + { + "epoch": 0.7930978613091381, + "grad_norm": 0.03178151696920395, + "learning_rate": 0.00017816283361087357, + "loss": 0.326, + "step": 9790 + }, + { + "epoch": 0.7931788723266364, + "grad_norm": 0.03052881546318531, + "learning_rate": 0.00017815833295827896, + "loss": 0.3398, + "step": 9791 + }, + { + "epoch": 0.7932598833441348, + "grad_norm": 0.04047226905822754, + "learning_rate": 0.00017815383230568435, + "loss": 0.3886, + "step": 9792 + }, + { + "epoch": 0.7933408943616331, + "grad_norm": 0.034657519310712814, + "learning_rate": 0.0001781493316530897, + "loss": 0.3747, + "step": 9793 + }, + { + "epoch": 0.7934219053791316, + "grad_norm": 0.0302236657589674, + "learning_rate": 0.0001781448310004951, + "loss": 0.3105, + "step": 9794 + }, + { + "epoch": 0.79350291639663, + "grad_norm": 0.035705894231796265, + "learning_rate": 0.00017814033034790045, + "loss": 0.4141, + "step": 9795 + }, + { + "epoch": 0.7935839274141283, + "grad_norm": 0.03531669080257416, + "learning_rate": 0.00017813582969530581, + "loss": 0.3131, + "step": 9796 + }, + { + "epoch": 0.7936649384316267, + "grad_norm": 0.03963521495461464, + "learning_rate": 0.0001781313290427112, + "loss": 0.3832, + "step": 9797 + }, + { + "epoch": 0.7937459494491251, + "grad_norm": 0.03380154073238373, + "learning_rate": 0.0001781268283901166, + "loss": 0.3527, + "step": 9798 + }, + { + "epoch": 0.7938269604666235, + "grad_norm": 0.03466390073299408, + "learning_rate": 0.00017812232773752195, + "loss": 0.3342, + "step": 9799 + }, + { + "epoch": 0.7939079714841218, + "grad_norm": 0.031100409105420113, + "learning_rate": 0.00017811782708492734, + "loss": 0.2889, + "step": 9800 + }, + { + "epoch": 0.7939889825016202, + "grad_norm": 0.03908621892333031, + 
"learning_rate": 0.0001781133264323327, + "loss": 0.3044, + "step": 9801 + }, + { + "epoch": 0.7940699935191186, + "grad_norm": 0.03332590311765671, + "learning_rate": 0.00017810882577973806, + "loss": 0.3317, + "step": 9802 + }, + { + "epoch": 0.794151004536617, + "grad_norm": 0.03381446376442909, + "learning_rate": 0.00017810432512714344, + "loss": 0.3451, + "step": 9803 + }, + { + "epoch": 0.7942320155541154, + "grad_norm": 0.032552119344472885, + "learning_rate": 0.00017809982447454883, + "loss": 0.3047, + "step": 9804 + }, + { + "epoch": 0.7943130265716137, + "grad_norm": 0.03203838691115379, + "learning_rate": 0.0001780953238219542, + "loss": 0.3456, + "step": 9805 + }, + { + "epoch": 0.7943940375891121, + "grad_norm": 0.03360417112708092, + "learning_rate": 0.00017809082316935958, + "loss": 0.3442, + "step": 9806 + }, + { + "epoch": 0.7944750486066104, + "grad_norm": 0.03208596259355545, + "learning_rate": 0.00017808632251676494, + "loss": 0.329, + "step": 9807 + }, + { + "epoch": 0.7945560596241089, + "grad_norm": 0.032233938574790955, + "learning_rate": 0.0001780818218641703, + "loss": 0.3261, + "step": 9808 + }, + { + "epoch": 0.7946370706416073, + "grad_norm": 0.034315407276153564, + "learning_rate": 0.00017807732121157569, + "loss": 0.3596, + "step": 9809 + }, + { + "epoch": 0.7947180816591056, + "grad_norm": 0.033394258469343185, + "learning_rate": 0.00017807282055898107, + "loss": 0.3577, + "step": 9810 + }, + { + "epoch": 0.794799092676604, + "grad_norm": 0.0287623293697834, + "learning_rate": 0.00017806831990638643, + "loss": 0.3068, + "step": 9811 + }, + { + "epoch": 0.7948801036941024, + "grad_norm": 0.03077227808535099, + "learning_rate": 0.00017806381925379182, + "loss": 0.2974, + "step": 9812 + }, + { + "epoch": 0.7949611147116008, + "grad_norm": 0.0382005013525486, + "learning_rate": 0.00017805931860119718, + "loss": 0.4317, + "step": 9813 + }, + { + "epoch": 0.7950421257290992, + "grad_norm": 0.03552140295505524, + "learning_rate": 0.00017805481794860254, + "loss": 0.3376, + "step": 9814 + }, + { + "epoch": 0.7951231367465975, + "grad_norm": 0.03667037934064865, + "learning_rate": 0.00017805031729600793, + "loss": 0.3729, + "step": 9815 + }, + { + "epoch": 0.7952041477640959, + "grad_norm": 0.030753176659345627, + "learning_rate": 0.00017804581664341331, + "loss": 0.3105, + "step": 9816 + }, + { + "epoch": 0.7952851587815943, + "grad_norm": 0.03592974320054054, + "learning_rate": 0.00017804131599081867, + "loss": 0.3686, + "step": 9817 + }, + { + "epoch": 0.7953661697990927, + "grad_norm": 0.034059733152389526, + "learning_rate": 0.00017803681533822406, + "loss": 0.3527, + "step": 9818 + }, + { + "epoch": 0.795447180816591, + "grad_norm": 0.03542809560894966, + "learning_rate": 0.00017803231468562942, + "loss": 0.3431, + "step": 9819 + }, + { + "epoch": 0.7955281918340894, + "grad_norm": 0.03323250636458397, + "learning_rate": 0.00017802781403303478, + "loss": 0.3434, + "step": 9820 + }, + { + "epoch": 0.7956092028515879, + "grad_norm": 0.03204082325100899, + "learning_rate": 0.0001780233133804402, + "loss": 0.3056, + "step": 9821 + }, + { + "epoch": 0.7956902138690862, + "grad_norm": 0.031638313084840775, + "learning_rate": 0.00017801881272784556, + "loss": 0.3473, + "step": 9822 + }, + { + "epoch": 0.7957712248865846, + "grad_norm": 0.027752108871936798, + "learning_rate": 0.00017801431207525092, + "loss": 0.3222, + "step": 9823 + }, + { + "epoch": 0.7958522359040829, + "grad_norm": 0.037686675786972046, + "learning_rate": 0.0001780098114226563, + "loss": 0.3285, + 
"step": 9824 + }, + { + "epoch": 0.7959332469215813, + "grad_norm": 0.040860336273908615, + "learning_rate": 0.00017800531077006166, + "loss": 0.3929, + "step": 9825 + }, + { + "epoch": 0.7960142579390798, + "grad_norm": 0.03692979738116264, + "learning_rate": 0.00017800081011746702, + "loss": 0.3687, + "step": 9826 + }, + { + "epoch": 0.7960952689565781, + "grad_norm": 0.03794914856553078, + "learning_rate": 0.00017799630946487244, + "loss": 0.4021, + "step": 9827 + }, + { + "epoch": 0.7961762799740765, + "grad_norm": 0.032314639538526535, + "learning_rate": 0.0001779918088122778, + "loss": 0.384, + "step": 9828 + }, + { + "epoch": 0.7962572909915748, + "grad_norm": 0.03454400226473808, + "learning_rate": 0.00017798730815968316, + "loss": 0.3652, + "step": 9829 + }, + { + "epoch": 0.7963383020090732, + "grad_norm": 0.033590447157621384, + "learning_rate": 0.00017798280750708854, + "loss": 0.3331, + "step": 9830 + }, + { + "epoch": 0.7964193130265717, + "grad_norm": 0.032915979623794556, + "learning_rate": 0.0001779783068544939, + "loss": 0.3188, + "step": 9831 + }, + { + "epoch": 0.79650032404407, + "grad_norm": 0.034886036068201065, + "learning_rate": 0.00017797380620189926, + "loss": 0.3538, + "step": 9832 + }, + { + "epoch": 0.7965813350615684, + "grad_norm": 0.03149457648396492, + "learning_rate": 0.00017796930554930468, + "loss": 0.2962, + "step": 9833 + }, + { + "epoch": 0.7966623460790667, + "grad_norm": 0.03190891072154045, + "learning_rate": 0.00017796480489671004, + "loss": 0.3518, + "step": 9834 + }, + { + "epoch": 0.7967433570965652, + "grad_norm": 0.03555048629641533, + "learning_rate": 0.0001779603042441154, + "loss": 0.3769, + "step": 9835 + }, + { + "epoch": 0.7968243681140635, + "grad_norm": 0.03308168426156044, + "learning_rate": 0.00017795580359152079, + "loss": 0.3265, + "step": 9836 + }, + { + "epoch": 0.7969053791315619, + "grad_norm": 0.0301455520093441, + "learning_rate": 0.00017795130293892615, + "loss": 0.3352, + "step": 9837 + }, + { + "epoch": 0.7969863901490603, + "grad_norm": 0.0303630530834198, + "learning_rate": 0.00017794680228633153, + "loss": 0.3216, + "step": 9838 + }, + { + "epoch": 0.7970674011665586, + "grad_norm": 0.03650288283824921, + "learning_rate": 0.00017794230163373692, + "loss": 0.3274, + "step": 9839 + }, + { + "epoch": 0.7971484121840571, + "grad_norm": 0.03212815895676613, + "learning_rate": 0.00017793780098114228, + "loss": 0.3295, + "step": 9840 + }, + { + "epoch": 0.7972294232015554, + "grad_norm": 0.02993020787835121, + "learning_rate": 0.00017793330032854764, + "loss": 0.3319, + "step": 9841 + }, + { + "epoch": 0.7973104342190538, + "grad_norm": 0.0330897755920887, + "learning_rate": 0.00017792879967595303, + "loss": 0.3578, + "step": 9842 + }, + { + "epoch": 0.7973914452365521, + "grad_norm": 0.033456459641456604, + "learning_rate": 0.0001779242990233584, + "loss": 0.3331, + "step": 9843 + }, + { + "epoch": 0.7974724562540505, + "grad_norm": 0.030742675065994263, + "learning_rate": 0.00017791979837076377, + "loss": 0.3283, + "step": 9844 + }, + { + "epoch": 0.797553467271549, + "grad_norm": 0.03512030839920044, + "learning_rate": 0.00017791529771816916, + "loss": 0.3325, + "step": 9845 + }, + { + "epoch": 0.7976344782890473, + "grad_norm": 0.03695352002978325, + "learning_rate": 0.00017791079706557452, + "loss": 0.3491, + "step": 9846 + }, + { + "epoch": 0.7977154893065457, + "grad_norm": 0.03346193954348564, + "learning_rate": 0.00017790629641297988, + "loss": 0.3334, + "step": 9847 + }, + { + "epoch": 0.797796500324044, + 
"grad_norm": 0.032957058399915695, + "learning_rate": 0.00017790179576038527, + "loss": 0.3567, + "step": 9848 + }, + { + "epoch": 0.7978775113415425, + "grad_norm": 0.034393198788166046, + "learning_rate": 0.00017789729510779063, + "loss": 0.3555, + "step": 9849 + }, + { + "epoch": 0.7979585223590409, + "grad_norm": 0.03429095074534416, + "learning_rate": 0.00017789279445519602, + "loss": 0.344, + "step": 9850 + }, + { + "epoch": 0.7980395333765392, + "grad_norm": 0.03974846377968788, + "learning_rate": 0.0001778882938026014, + "loss": 0.3608, + "step": 9851 + }, + { + "epoch": 0.7981205443940376, + "grad_norm": 0.02912759967148304, + "learning_rate": 0.00017788379315000676, + "loss": 0.2945, + "step": 9852 + }, + { + "epoch": 0.7982015554115359, + "grad_norm": 0.03488336130976677, + "learning_rate": 0.00017787929249741212, + "loss": 0.3598, + "step": 9853 + }, + { + "epoch": 0.7982825664290344, + "grad_norm": 0.0349576361477375, + "learning_rate": 0.0001778747918448175, + "loss": 0.3569, + "step": 9854 + }, + { + "epoch": 0.7983635774465327, + "grad_norm": 0.031084474176168442, + "learning_rate": 0.00017787029119222287, + "loss": 0.2779, + "step": 9855 + }, + { + "epoch": 0.7984445884640311, + "grad_norm": 0.04295002669095993, + "learning_rate": 0.00017786579053962826, + "loss": 0.3446, + "step": 9856 + }, + { + "epoch": 0.7985255994815295, + "grad_norm": 0.03482022136449814, + "learning_rate": 0.00017786128988703365, + "loss": 0.2966, + "step": 9857 + }, + { + "epoch": 0.7986066104990278, + "grad_norm": 0.032117944210767746, + "learning_rate": 0.000177856789234439, + "loss": 0.3177, + "step": 9858 + }, + { + "epoch": 0.7986876215165263, + "grad_norm": 0.03325260058045387, + "learning_rate": 0.00017785228858184437, + "loss": 0.3332, + "step": 9859 + }, + { + "epoch": 0.7987686325340246, + "grad_norm": 0.033402219414711, + "learning_rate": 0.00017784778792924975, + "loss": 0.356, + "step": 9860 + }, + { + "epoch": 0.798849643551523, + "grad_norm": 0.02923320233821869, + "learning_rate": 0.0001778432872766551, + "loss": 0.2805, + "step": 9861 + }, + { + "epoch": 0.7989306545690213, + "grad_norm": 0.03429705277085304, + "learning_rate": 0.0001778387866240605, + "loss": 0.3445, + "step": 9862 + }, + { + "epoch": 0.7990116655865198, + "grad_norm": 0.03572859242558479, + "learning_rate": 0.0001778342859714659, + "loss": 0.3237, + "step": 9863 + }, + { + "epoch": 0.7990926766040182, + "grad_norm": 0.03505443409085274, + "learning_rate": 0.00017782978531887125, + "loss": 0.3418, + "step": 9864 + }, + { + "epoch": 0.7991736876215165, + "grad_norm": 0.03291371092200279, + "learning_rate": 0.0001778252846662766, + "loss": 0.3022, + "step": 9865 + }, + { + "epoch": 0.7992546986390149, + "grad_norm": 0.04195602238178253, + "learning_rate": 0.000177820784013682, + "loss": 0.3914, + "step": 9866 + }, + { + "epoch": 0.7993357096565132, + "grad_norm": 0.030259612947702408, + "learning_rate": 0.00017781628336108738, + "loss": 0.3484, + "step": 9867 + }, + { + "epoch": 0.7994167206740117, + "grad_norm": 0.03401730954647064, + "learning_rate": 0.00017781178270849274, + "loss": 0.3492, + "step": 9868 + }, + { + "epoch": 0.7994977316915101, + "grad_norm": 0.02929941564798355, + "learning_rate": 0.00017780728205589813, + "loss": 0.3135, + "step": 9869 + }, + { + "epoch": 0.7995787427090084, + "grad_norm": 0.03494355082511902, + "learning_rate": 0.0001778027814033035, + "loss": 0.3343, + "step": 9870 + }, + { + "epoch": 0.7996597537265068, + "grad_norm": 0.035004131495952606, + "learning_rate": 
0.00017779828075070885, + "loss": 0.3092, + "step": 9871 + }, + { + "epoch": 0.7997407647440052, + "grad_norm": 0.03391312435269356, + "learning_rate": 0.00017779378009811424, + "loss": 0.3313, + "step": 9872 + }, + { + "epoch": 0.7998217757615036, + "grad_norm": 0.032676223665475845, + "learning_rate": 0.00017778927944551962, + "loss": 0.3053, + "step": 9873 + }, + { + "epoch": 0.799902786779002, + "grad_norm": 0.03063894808292389, + "learning_rate": 0.00017778477879292498, + "loss": 0.3541, + "step": 9874 + }, + { + "epoch": 0.7999837977965003, + "grad_norm": 0.030848519876599312, + "learning_rate": 0.00017778027814033037, + "loss": 0.3774, + "step": 9875 + }, + { + "epoch": 0.8000648088139987, + "grad_norm": 0.03999854996800423, + "learning_rate": 0.00017777577748773573, + "loss": 0.3959, + "step": 9876 + }, + { + "epoch": 0.8001458198314971, + "grad_norm": 0.031490426510572433, + "learning_rate": 0.0001777712768351411, + "loss": 0.3219, + "step": 9877 + }, + { + "epoch": 0.8002268308489955, + "grad_norm": 0.03349410369992256, + "learning_rate": 0.00017776677618254648, + "loss": 0.3838, + "step": 9878 + }, + { + "epoch": 0.8003078418664938, + "grad_norm": 0.03521084040403366, + "learning_rate": 0.00017776227552995186, + "loss": 0.3527, + "step": 9879 + }, + { + "epoch": 0.8003888528839922, + "grad_norm": 0.03576532378792763, + "learning_rate": 0.00017775777487735722, + "loss": 0.3284, + "step": 9880 + }, + { + "epoch": 0.8004698639014906, + "grad_norm": 0.033710140734910965, + "learning_rate": 0.0001777532742247626, + "loss": 0.3252, + "step": 9881 + }, + { + "epoch": 0.800550874918989, + "grad_norm": 0.03096635453402996, + "learning_rate": 0.00017774877357216797, + "loss": 0.3208, + "step": 9882 + }, + { + "epoch": 0.8006318859364874, + "grad_norm": 0.03235981613397598, + "learning_rate": 0.00017774427291957333, + "loss": 0.3158, + "step": 9883 + }, + { + "epoch": 0.8007128969539857, + "grad_norm": 0.03289040923118591, + "learning_rate": 0.00017773977226697872, + "loss": 0.3225, + "step": 9884 + }, + { + "epoch": 0.8007939079714841, + "grad_norm": 0.03425959497690201, + "learning_rate": 0.0001777352716143841, + "loss": 0.3607, + "step": 9885 + }, + { + "epoch": 0.8008749189889826, + "grad_norm": 0.03126666322350502, + "learning_rate": 0.00017773077096178947, + "loss": 0.3311, + "step": 9886 + }, + { + "epoch": 0.8009559300064809, + "grad_norm": 0.03363567590713501, + "learning_rate": 0.00017772627030919485, + "loss": 0.3225, + "step": 9887 + }, + { + "epoch": 0.8010369410239793, + "grad_norm": 0.03323504328727722, + "learning_rate": 0.0001777217696566002, + "loss": 0.3588, + "step": 9888 + }, + { + "epoch": 0.8011179520414776, + "grad_norm": 0.03272292762994766, + "learning_rate": 0.00017771726900400557, + "loss": 0.3388, + "step": 9889 + }, + { + "epoch": 0.801198963058976, + "grad_norm": 0.035594481974840164, + "learning_rate": 0.00017771276835141096, + "loss": 0.3581, + "step": 9890 + }, + { + "epoch": 0.8012799740764744, + "grad_norm": 0.03276817500591278, + "learning_rate": 0.00017770826769881635, + "loss": 0.3742, + "step": 9891 + }, + { + "epoch": 0.8013609850939728, + "grad_norm": 0.03029584512114525, + "learning_rate": 0.0001777037670462217, + "loss": 0.2998, + "step": 9892 + }, + { + "epoch": 0.8014419961114712, + "grad_norm": 0.030790219083428383, + "learning_rate": 0.0001776992663936271, + "loss": 0.3391, + "step": 9893 + }, + { + "epoch": 0.8015230071289695, + "grad_norm": 0.040925610810518265, + "learning_rate": 0.00017769476574103245, + "loss": 0.3516, + "step": 9894 + 
}, + { + "epoch": 0.8016040181464679, + "grad_norm": 0.03036440722644329, + "learning_rate": 0.00017769026508843782, + "loss": 0.339, + "step": 9895 + }, + { + "epoch": 0.8016850291639663, + "grad_norm": 0.0375761054456234, + "learning_rate": 0.00017768576443584323, + "loss": 0.3519, + "step": 9896 + }, + { + "epoch": 0.8017660401814647, + "grad_norm": 0.0340602844953537, + "learning_rate": 0.0001776812637832486, + "loss": 0.3145, + "step": 9897 + }, + { + "epoch": 0.801847051198963, + "grad_norm": 0.033066585659980774, + "learning_rate": 0.00017767676313065395, + "loss": 0.3557, + "step": 9898 + }, + { + "epoch": 0.8019280622164614, + "grad_norm": 0.0324365571141243, + "learning_rate": 0.00017767226247805934, + "loss": 0.3625, + "step": 9899 + }, + { + "epoch": 0.8020090732339599, + "grad_norm": 0.02973266690969467, + "learning_rate": 0.0001776677618254647, + "loss": 0.3006, + "step": 9900 + }, + { + "epoch": 0.8020900842514582, + "grad_norm": 0.032307885587215424, + "learning_rate": 0.00017766326117287006, + "loss": 0.3152, + "step": 9901 + }, + { + "epoch": 0.8021710952689566, + "grad_norm": 0.029426628723740578, + "learning_rate": 0.00017765876052027547, + "loss": 0.2689, + "step": 9902 + }, + { + "epoch": 0.8022521062864549, + "grad_norm": 0.03942330554127693, + "learning_rate": 0.00017765425986768083, + "loss": 0.3758, + "step": 9903 + }, + { + "epoch": 0.8023331173039533, + "grad_norm": 0.03324083238840103, + "learning_rate": 0.0001776497592150862, + "loss": 0.3391, + "step": 9904 + }, + { + "epoch": 0.8024141283214518, + "grad_norm": 0.03555333986878395, + "learning_rate": 0.00017764525856249158, + "loss": 0.3283, + "step": 9905 + }, + { + "epoch": 0.8024951393389501, + "grad_norm": 0.03296968713402748, + "learning_rate": 0.00017764075790989694, + "loss": 0.3561, + "step": 9906 + }, + { + "epoch": 0.8025761503564485, + "grad_norm": 0.0356278233230114, + "learning_rate": 0.0001776362572573023, + "loss": 0.3368, + "step": 9907 + }, + { + "epoch": 0.8026571613739468, + "grad_norm": 0.03892393410205841, + "learning_rate": 0.0001776317566047077, + "loss": 0.4099, + "step": 9908 + }, + { + "epoch": 0.8027381723914452, + "grad_norm": 0.035411980003118515, + "learning_rate": 0.00017762725595211307, + "loss": 0.377, + "step": 9909 + }, + { + "epoch": 0.8028191834089436, + "grad_norm": 0.03715914860367775, + "learning_rate": 0.00017762275529951843, + "loss": 0.3227, + "step": 9910 + }, + { + "epoch": 0.802900194426442, + "grad_norm": 0.036011308431625366, + "learning_rate": 0.00017761825464692382, + "loss": 0.3568, + "step": 9911 + }, + { + "epoch": 0.8029812054439404, + "grad_norm": 0.03165072947740555, + "learning_rate": 0.00017761375399432918, + "loss": 0.3277, + "step": 9912 + }, + { + "epoch": 0.8030622164614387, + "grad_norm": 0.02398240566253662, + "learning_rate": 0.00017760925334173454, + "loss": 0.2622, + "step": 9913 + }, + { + "epoch": 0.8031432274789372, + "grad_norm": 0.036116186529397964, + "learning_rate": 0.00017760475268913995, + "loss": 0.3896, + "step": 9914 + }, + { + "epoch": 0.8032242384964355, + "grad_norm": 0.033894285559654236, + "learning_rate": 0.00017760025203654531, + "loss": 0.3327, + "step": 9915 + }, + { + "epoch": 0.8033052495139339, + "grad_norm": 0.030235815793275833, + "learning_rate": 0.00017759575138395067, + "loss": 0.342, + "step": 9916 + }, + { + "epoch": 0.8033862605314323, + "grad_norm": 0.031925395131111145, + "learning_rate": 0.00017759125073135606, + "loss": 0.3369, + "step": 9917 + }, + { + "epoch": 0.8034672715489306, + "grad_norm": 
0.032986488193273544, + "learning_rate": 0.00017758675007876142, + "loss": 0.333, + "step": 9918 + }, + { + "epoch": 0.8035482825664291, + "grad_norm": 0.030651472508907318, + "learning_rate": 0.0001775822494261668, + "loss": 0.3445, + "step": 9919 + }, + { + "epoch": 0.8036292935839274, + "grad_norm": 0.03186076134443283, + "learning_rate": 0.0001775777487735722, + "loss": 0.3913, + "step": 9920 + }, + { + "epoch": 0.8037103046014258, + "grad_norm": 0.03048551455140114, + "learning_rate": 0.00017757324812097756, + "loss": 0.31, + "step": 9921 + }, + { + "epoch": 0.8037913156189241, + "grad_norm": 0.028865564614534378, + "learning_rate": 0.00017756874746838292, + "loss": 0.3382, + "step": 9922 + }, + { + "epoch": 0.8038723266364226, + "grad_norm": 0.032060034573078156, + "learning_rate": 0.0001775642468157883, + "loss": 0.3355, + "step": 9923 + }, + { + "epoch": 0.803953337653921, + "grad_norm": 0.03471523895859718, + "learning_rate": 0.00017755974616319366, + "loss": 0.3643, + "step": 9924 + }, + { + "epoch": 0.8040343486714193, + "grad_norm": 0.030344653874635696, + "learning_rate": 0.00017755524551059905, + "loss": 0.3015, + "step": 9925 + }, + { + "epoch": 0.8041153596889177, + "grad_norm": 0.033976975828409195, + "learning_rate": 0.00017755074485800444, + "loss": 0.3253, + "step": 9926 + }, + { + "epoch": 0.804196370706416, + "grad_norm": 0.034605782479047775, + "learning_rate": 0.0001775462442054098, + "loss": 0.3234, + "step": 9927 + }, + { + "epoch": 0.8042773817239145, + "grad_norm": 0.03288121148943901, + "learning_rate": 0.00017754174355281516, + "loss": 0.3525, + "step": 9928 + }, + { + "epoch": 0.8043583927414129, + "grad_norm": 0.0387084074318409, + "learning_rate": 0.00017753724290022054, + "loss": 0.3987, + "step": 9929 + }, + { + "epoch": 0.8044394037589112, + "grad_norm": 0.02676405757665634, + "learning_rate": 0.0001775327422476259, + "loss": 0.2951, + "step": 9930 + }, + { + "epoch": 0.8045204147764096, + "grad_norm": 0.032487448304891586, + "learning_rate": 0.0001775282415950313, + "loss": 0.3576, + "step": 9931 + }, + { + "epoch": 0.8046014257939079, + "grad_norm": 0.03143838793039322, + "learning_rate": 0.00017752374094243668, + "loss": 0.3081, + "step": 9932 + }, + { + "epoch": 0.8046824368114064, + "grad_norm": 0.030801311135292053, + "learning_rate": 0.00017751924028984204, + "loss": 0.3141, + "step": 9933 + }, + { + "epoch": 0.8047634478289047, + "grad_norm": 0.033451713621616364, + "learning_rate": 0.0001775147396372474, + "loss": 0.3355, + "step": 9934 + }, + { + "epoch": 0.8048444588464031, + "grad_norm": 0.03344229608774185, + "learning_rate": 0.00017751023898465279, + "loss": 0.3176, + "step": 9935 + }, + { + "epoch": 0.8049254698639015, + "grad_norm": 0.037789855152368546, + "learning_rate": 0.00017750573833205815, + "loss": 0.3356, + "step": 9936 + }, + { + "epoch": 0.8050064808813999, + "grad_norm": 0.027795903384685516, + "learning_rate": 0.00017750123767946353, + "loss": 0.2775, + "step": 9937 + }, + { + "epoch": 0.8050874918988983, + "grad_norm": 0.035118166357278824, + "learning_rate": 0.00017749673702686892, + "loss": 0.3245, + "step": 9938 + }, + { + "epoch": 0.8051685029163966, + "grad_norm": 0.03690321743488312, + "learning_rate": 0.00017749223637427428, + "loss": 0.3588, + "step": 9939 + }, + { + "epoch": 0.805249513933895, + "grad_norm": 0.03635449334979057, + "learning_rate": 0.00017748773572167964, + "loss": 0.3713, + "step": 9940 + }, + { + "epoch": 0.8053305249513933, + "grad_norm": 0.038405247032642365, + "learning_rate": 
0.00017748323506908503, + "loss": 0.3332, + "step": 9941 + }, + { + "epoch": 0.8054115359688918, + "grad_norm": 0.03047071024775505, + "learning_rate": 0.0001774787344164904, + "loss": 0.309, + "step": 9942 + }, + { + "epoch": 0.8054925469863902, + "grad_norm": 0.03196856752038002, + "learning_rate": 0.00017747423376389578, + "loss": 0.3169, + "step": 9943 + }, + { + "epoch": 0.8055735580038885, + "grad_norm": 0.032038189470767975, + "learning_rate": 0.00017746973311130116, + "loss": 0.3235, + "step": 9944 + }, + { + "epoch": 0.8056545690213869, + "grad_norm": 0.03425036743283272, + "learning_rate": 0.00017746523245870652, + "loss": 0.3052, + "step": 9945 + }, + { + "epoch": 0.8057355800388852, + "grad_norm": 0.03385873883962631, + "learning_rate": 0.00017746073180611188, + "loss": 0.3353, + "step": 9946 + }, + { + "epoch": 0.8058165910563837, + "grad_norm": 0.030192499980330467, + "learning_rate": 0.00017745623115351727, + "loss": 0.3524, + "step": 9947 + }, + { + "epoch": 0.8058976020738821, + "grad_norm": 0.0328400619328022, + "learning_rate": 0.00017745173050092266, + "loss": 0.2959, + "step": 9948 + }, + { + "epoch": 0.8059786130913804, + "grad_norm": 0.036807384341955185, + "learning_rate": 0.00017744722984832802, + "loss": 0.3319, + "step": 9949 + }, + { + "epoch": 0.8060596241088788, + "grad_norm": 0.038014866411685944, + "learning_rate": 0.0001774427291957334, + "loss": 0.3558, + "step": 9950 + }, + { + "epoch": 0.8061406351263772, + "grad_norm": 0.04583004489541054, + "learning_rate": 0.00017743822854313876, + "loss": 0.3558, + "step": 9951 + }, + { + "epoch": 0.8062216461438756, + "grad_norm": 0.03124105930328369, + "learning_rate": 0.00017743372789054412, + "loss": 0.2939, + "step": 9952 + }, + { + "epoch": 0.806302657161374, + "grad_norm": 0.03650376573204994, + "learning_rate": 0.0001774292272379495, + "loss": 0.3778, + "step": 9953 + }, + { + "epoch": 0.8063836681788723, + "grad_norm": 0.038504041731357574, + "learning_rate": 0.0001774247265853549, + "loss": 0.4135, + "step": 9954 + }, + { + "epoch": 0.8064646791963707, + "grad_norm": 0.02942444011569023, + "learning_rate": 0.00017742022593276026, + "loss": 0.2947, + "step": 9955 + }, + { + "epoch": 0.8065456902138691, + "grad_norm": 0.03135211765766144, + "learning_rate": 0.00017741572528016565, + "loss": 0.3236, + "step": 9956 + }, + { + "epoch": 0.8066267012313675, + "grad_norm": 0.026941142976284027, + "learning_rate": 0.000177411224627571, + "loss": 0.2815, + "step": 9957 + }, + { + "epoch": 0.8067077122488658, + "grad_norm": 0.03467633202672005, + "learning_rate": 0.00017740672397497637, + "loss": 0.329, + "step": 9958 + }, + { + "epoch": 0.8067887232663642, + "grad_norm": 0.034031517803668976, + "learning_rate": 0.00017740222332238175, + "loss": 0.3436, + "step": 9959 + }, + { + "epoch": 0.8068697342838627, + "grad_norm": 0.041298530995845795, + "learning_rate": 0.00017739772266978714, + "loss": 0.3865, + "step": 9960 + }, + { + "epoch": 0.806950745301361, + "grad_norm": 0.029256189242005348, + "learning_rate": 0.0001773932220171925, + "loss": 0.2955, + "step": 9961 + }, + { + "epoch": 0.8070317563188594, + "grad_norm": 0.033907074481248856, + "learning_rate": 0.0001773887213645979, + "loss": 0.3672, + "step": 9962 + }, + { + "epoch": 0.8071127673363577, + "grad_norm": 0.03336393088102341, + "learning_rate": 0.00017738422071200325, + "loss": 0.382, + "step": 9963 + }, + { + "epoch": 0.8071937783538561, + "grad_norm": 0.03110332041978836, + "learning_rate": 0.0001773797200594086, + "loss": 0.3467, + "step": 9964 + }, + 
{ + "epoch": 0.8072747893713546, + "grad_norm": 0.0371004119515419, + "learning_rate": 0.000177375219406814, + "loss": 0.3596, + "step": 9965 + }, + { + "epoch": 0.8073558003888529, + "grad_norm": 0.033193279057741165, + "learning_rate": 0.00017737071875421938, + "loss": 0.31, + "step": 9966 + }, + { + "epoch": 0.8074368114063513, + "grad_norm": 0.030885837972164154, + "learning_rate": 0.00017736621810162474, + "loss": 0.3003, + "step": 9967 + }, + { + "epoch": 0.8075178224238496, + "grad_norm": 0.033284615725278854, + "learning_rate": 0.00017736171744903013, + "loss": 0.3384, + "step": 9968 + }, + { + "epoch": 0.807598833441348, + "grad_norm": 0.02768583968281746, + "learning_rate": 0.0001773572167964355, + "loss": 0.2796, + "step": 9969 + }, + { + "epoch": 0.8076798444588464, + "grad_norm": 0.03884153440594673, + "learning_rate": 0.00017735271614384085, + "loss": 0.3976, + "step": 9970 + }, + { + "epoch": 0.8077608554763448, + "grad_norm": 0.03400607034564018, + "learning_rate": 0.00017734821549124626, + "loss": 0.3267, + "step": 9971 + }, + { + "epoch": 0.8078418664938432, + "grad_norm": 0.030479488894343376, + "learning_rate": 0.00017734371483865162, + "loss": 0.3018, + "step": 9972 + }, + { + "epoch": 0.8079228775113415, + "grad_norm": 0.032650839537382126, + "learning_rate": 0.00017733921418605698, + "loss": 0.3255, + "step": 9973 + }, + { + "epoch": 0.80800388852884, + "grad_norm": 0.03217211365699768, + "learning_rate": 0.00017733471353346237, + "loss": 0.3618, + "step": 9974 + }, + { + "epoch": 0.8080848995463383, + "grad_norm": 0.030187807977199554, + "learning_rate": 0.00017733021288086773, + "loss": 0.3099, + "step": 9975 + }, + { + "epoch": 0.8081659105638367, + "grad_norm": 0.027642568573355675, + "learning_rate": 0.0001773257122282731, + "loss": 0.3518, + "step": 9976 + }, + { + "epoch": 0.808246921581335, + "grad_norm": 0.036619480699300766, + "learning_rate": 0.0001773212115756785, + "loss": 0.381, + "step": 9977 + }, + { + "epoch": 0.8083279325988334, + "grad_norm": 0.03958069160580635, + "learning_rate": 0.00017731671092308386, + "loss": 0.4041, + "step": 9978 + }, + { + "epoch": 0.8084089436163319, + "grad_norm": 0.03649488463997841, + "learning_rate": 0.00017731221027048922, + "loss": 0.3567, + "step": 9979 + }, + { + "epoch": 0.8084899546338302, + "grad_norm": 0.03155497834086418, + "learning_rate": 0.0001773077096178946, + "loss": 0.3163, + "step": 9980 + }, + { + "epoch": 0.8085709656513286, + "grad_norm": 0.02980252169072628, + "learning_rate": 0.00017730320896529997, + "loss": 0.3462, + "step": 9981 + }, + { + "epoch": 0.8086519766688269, + "grad_norm": 0.03588375821709633, + "learning_rate": 0.00017729870831270533, + "loss": 0.3528, + "step": 9982 + }, + { + "epoch": 0.8087329876863253, + "grad_norm": 0.031652361154556274, + "learning_rate": 0.00017729420766011075, + "loss": 0.3122, + "step": 9983 + }, + { + "epoch": 0.8088139987038238, + "grad_norm": 0.03213665261864662, + "learning_rate": 0.0001772897070075161, + "loss": 0.3334, + "step": 9984 + }, + { + "epoch": 0.8088950097213221, + "grad_norm": 0.03248204290866852, + "learning_rate": 0.00017728520635492147, + "loss": 0.3086, + "step": 9985 + }, + { + "epoch": 0.8089760207388205, + "grad_norm": 0.03270760923624039, + "learning_rate": 0.00017728070570232685, + "loss": 0.3195, + "step": 9986 + }, + { + "epoch": 0.8090570317563188, + "grad_norm": 0.035143230110406876, + "learning_rate": 0.00017727620504973221, + "loss": 0.3604, + "step": 9987 + }, + { + "epoch": 0.8091380427738173, + "grad_norm": 
0.030242938548326492, + "learning_rate": 0.00017727170439713757, + "loss": 0.3452, + "step": 9988 + }, + { + "epoch": 0.8092190537913156, + "grad_norm": 0.031948208808898926, + "learning_rate": 0.000177267203744543, + "loss": 0.311, + "step": 9989 + }, + { + "epoch": 0.809300064808814, + "grad_norm": 0.03433237969875336, + "learning_rate": 0.00017726270309194835, + "loss": 0.352, + "step": 9990 + }, + { + "epoch": 0.8093810758263124, + "grad_norm": 0.036170594394207, + "learning_rate": 0.0001772582024393537, + "loss": 0.3987, + "step": 9991 + }, + { + "epoch": 0.8094620868438107, + "grad_norm": 0.029423857107758522, + "learning_rate": 0.0001772537017867591, + "loss": 0.2831, + "step": 9992 + }, + { + "epoch": 0.8095430978613092, + "grad_norm": 0.03154407814145088, + "learning_rate": 0.00017724920113416446, + "loss": 0.3293, + "step": 9993 + }, + { + "epoch": 0.8096241088788075, + "grad_norm": 0.03517995402216911, + "learning_rate": 0.00017724470048156982, + "loss": 0.3704, + "step": 9994 + }, + { + "epoch": 0.8097051198963059, + "grad_norm": 0.03546958789229393, + "learning_rate": 0.00017724019982897523, + "loss": 0.3842, + "step": 9995 + }, + { + "epoch": 0.8097861309138042, + "grad_norm": 0.03441493958234787, + "learning_rate": 0.0001772356991763806, + "loss": 0.3228, + "step": 9996 + }, + { + "epoch": 0.8098671419313026, + "grad_norm": 0.03107467293739319, + "learning_rate": 0.00017723119852378595, + "loss": 0.2757, + "step": 9997 + }, + { + "epoch": 0.8099481529488011, + "grad_norm": 0.02883220836520195, + "learning_rate": 0.00017722669787119134, + "loss": 0.296, + "step": 9998 + }, + { + "epoch": 0.8100291639662994, + "grad_norm": 0.030075686052441597, + "learning_rate": 0.0001772221972185967, + "loss": 0.3419, + "step": 9999 + }, + { + "epoch": 0.8101101749837978, + "grad_norm": 0.03298826888203621, + "learning_rate": 0.00017721769656600208, + "loss": 0.3751, + "step": 10000 + }, + { + "epoch": 0.8101911860012961, + "grad_norm": 0.03761919215321541, + "learning_rate": 0.00017721319591340747, + "loss": 0.3478, + "step": 10001 + }, + { + "epoch": 0.8102721970187946, + "grad_norm": 0.03876710310578346, + "learning_rate": 0.00017720869526081283, + "loss": 0.3734, + "step": 10002 + }, + { + "epoch": 0.810353208036293, + "grad_norm": 0.02835431881248951, + "learning_rate": 0.0001772041946082182, + "loss": 0.2926, + "step": 10003 + }, + { + "epoch": 0.8104342190537913, + "grad_norm": 0.03384508937597275, + "learning_rate": 0.00017719969395562358, + "loss": 0.3082, + "step": 10004 + }, + { + "epoch": 0.8105152300712897, + "grad_norm": 0.0412626713514328, + "learning_rate": 0.00017719519330302894, + "loss": 0.3824, + "step": 10005 + }, + { + "epoch": 0.810596241088788, + "grad_norm": 0.027186481282114983, + "learning_rate": 0.00017719069265043433, + "loss": 0.2845, + "step": 10006 + }, + { + "epoch": 0.8106772521062865, + "grad_norm": 0.03608010709285736, + "learning_rate": 0.0001771861919978397, + "loss": 0.343, + "step": 10007 + }, + { + "epoch": 0.8107582631237849, + "grad_norm": 0.034267593175172806, + "learning_rate": 0.00017718169134524507, + "loss": 0.3317, + "step": 10008 + }, + { + "epoch": 0.8108392741412832, + "grad_norm": 0.03193562477827072, + "learning_rate": 0.00017717719069265043, + "loss": 0.3081, + "step": 10009 + }, + { + "epoch": 0.8109202851587816, + "grad_norm": 0.029407281428575516, + "learning_rate": 0.00017717269004005582, + "loss": 0.3008, + "step": 10010 + }, + { + "epoch": 0.81100129617628, + "grad_norm": 0.0316649004817009, + "learning_rate": 
0.00017716818938746118, + "loss": 0.3247, + "step": 10011 + }, + { + "epoch": 0.8110823071937784, + "grad_norm": 0.03278086706995964, + "learning_rate": 0.00017716368873486657, + "loss": 0.3495, + "step": 10012 + }, + { + "epoch": 0.8111633182112767, + "grad_norm": 0.02849617227911949, + "learning_rate": 0.00017715918808227195, + "loss": 0.3042, + "step": 10013 + }, + { + "epoch": 0.8112443292287751, + "grad_norm": 0.03393520787358284, + "learning_rate": 0.00017715468742967731, + "loss": 0.3218, + "step": 10014 + }, + { + "epoch": 0.8113253402462735, + "grad_norm": 0.03271503746509552, + "learning_rate": 0.00017715018677708267, + "loss": 0.329, + "step": 10015 + }, + { + "epoch": 0.8114063512637719, + "grad_norm": 0.035968679934740067, + "learning_rate": 0.00017714568612448806, + "loss": 0.3237, + "step": 10016 + }, + { + "epoch": 0.8114873622812703, + "grad_norm": 0.03288400173187256, + "learning_rate": 0.00017714118547189342, + "loss": 0.3547, + "step": 10017 + }, + { + "epoch": 0.8115683732987686, + "grad_norm": 0.032152384519577026, + "learning_rate": 0.0001771366848192988, + "loss": 0.3109, + "step": 10018 + }, + { + "epoch": 0.811649384316267, + "grad_norm": 0.03296571597456932, + "learning_rate": 0.0001771321841667042, + "loss": 0.3286, + "step": 10019 + }, + { + "epoch": 0.8117303953337653, + "grad_norm": 0.03662797063589096, + "learning_rate": 0.00017712768351410956, + "loss": 0.3494, + "step": 10020 + }, + { + "epoch": 0.8118114063512638, + "grad_norm": 0.030709531158208847, + "learning_rate": 0.00017712318286151492, + "loss": 0.3459, + "step": 10021 + }, + { + "epoch": 0.8118924173687622, + "grad_norm": 0.03615014627575874, + "learning_rate": 0.0001771186822089203, + "loss": 0.3693, + "step": 10022 + }, + { + "epoch": 0.8119734283862605, + "grad_norm": 0.034624602645635605, + "learning_rate": 0.0001771141815563257, + "loss": 0.3282, + "step": 10023 + }, + { + "epoch": 0.8120544394037589, + "grad_norm": 0.03422831743955612, + "learning_rate": 0.00017710968090373105, + "loss": 0.3027, + "step": 10024 + }, + { + "epoch": 0.8121354504212573, + "grad_norm": 0.03537493571639061, + "learning_rate": 0.00017710518025113644, + "loss": 0.357, + "step": 10025 + }, + { + "epoch": 0.8122164614387557, + "grad_norm": 0.03175731748342514, + "learning_rate": 0.0001771006795985418, + "loss": 0.3397, + "step": 10026 + }, + { + "epoch": 0.812297472456254, + "grad_norm": 0.0324130654335022, + "learning_rate": 0.00017709617894594716, + "loss": 0.3181, + "step": 10027 + }, + { + "epoch": 0.8123784834737524, + "grad_norm": 0.03641660511493683, + "learning_rate": 0.00017709167829335254, + "loss": 0.3697, + "step": 10028 + }, + { + "epoch": 0.8124594944912508, + "grad_norm": 0.028983507305383682, + "learning_rate": 0.00017708717764075793, + "loss": 0.2776, + "step": 10029 + }, + { + "epoch": 0.8125405055087492, + "grad_norm": 0.02874094247817993, + "learning_rate": 0.0001770826769881633, + "loss": 0.3062, + "step": 10030 + }, + { + "epoch": 0.8126215165262476, + "grad_norm": 0.03256585821509361, + "learning_rate": 0.00017707817633556868, + "loss": 0.2878, + "step": 10031 + }, + { + "epoch": 0.812702527543746, + "grad_norm": 0.03371914476156235, + "learning_rate": 0.00017707367568297404, + "loss": 0.3744, + "step": 10032 + }, + { + "epoch": 0.8127835385612443, + "grad_norm": 0.03364307060837746, + "learning_rate": 0.0001770691750303794, + "loss": 0.3373, + "step": 10033 + }, + { + "epoch": 0.8128645495787427, + "grad_norm": 0.03251344710588455, + "learning_rate": 0.0001770646743777848, + "loss": 0.3503, + 
"step": 10034 + }, + { + "epoch": 0.8129455605962411, + "grad_norm": 0.03772956505417824, + "learning_rate": 0.00017706017372519017, + "loss": 0.3693, + "step": 10035 + }, + { + "epoch": 0.8130265716137395, + "grad_norm": 0.03383497893810272, + "learning_rate": 0.00017705567307259553, + "loss": 0.3337, + "step": 10036 + }, + { + "epoch": 0.8131075826312378, + "grad_norm": 0.030101200565695763, + "learning_rate": 0.00017705117242000092, + "loss": 0.3006, + "step": 10037 + }, + { + "epoch": 0.8131885936487362, + "grad_norm": 0.03550754860043526, + "learning_rate": 0.00017704667176740628, + "loss": 0.3391, + "step": 10038 + }, + { + "epoch": 0.8132696046662347, + "grad_norm": 0.033610474318265915, + "learning_rate": 0.00017704217111481164, + "loss": 0.3198, + "step": 10039 + }, + { + "epoch": 0.813350615683733, + "grad_norm": 0.033679138869047165, + "learning_rate": 0.00017703767046221703, + "loss": 0.3716, + "step": 10040 + }, + { + "epoch": 0.8134316267012314, + "grad_norm": 0.03023621067404747, + "learning_rate": 0.00017703316980962242, + "loss": 0.3359, + "step": 10041 + }, + { + "epoch": 0.8135126377187297, + "grad_norm": 0.030499301850795746, + "learning_rate": 0.00017702866915702778, + "loss": 0.2694, + "step": 10042 + }, + { + "epoch": 0.8135936487362281, + "grad_norm": 0.03403441235423088, + "learning_rate": 0.00017702416850443316, + "loss": 0.352, + "step": 10043 + }, + { + "epoch": 0.8136746597537265, + "grad_norm": 0.02972903847694397, + "learning_rate": 0.00017701966785183852, + "loss": 0.298, + "step": 10044 + }, + { + "epoch": 0.8137556707712249, + "grad_norm": 0.03293878212571144, + "learning_rate": 0.00017701516719924388, + "loss": 0.3617, + "step": 10045 + }, + { + "epoch": 0.8138366817887233, + "grad_norm": 0.03338270261883736, + "learning_rate": 0.00017701066654664927, + "loss": 0.3154, + "step": 10046 + }, + { + "epoch": 0.8139176928062216, + "grad_norm": 0.03914094343781471, + "learning_rate": 0.00017700616589405466, + "loss": 0.3046, + "step": 10047 + }, + { + "epoch": 0.81399870382372, + "grad_norm": 0.03613367676734924, + "learning_rate": 0.00017700166524146002, + "loss": 0.3574, + "step": 10048 + }, + { + "epoch": 0.8140797148412184, + "grad_norm": 0.04005276784300804, + "learning_rate": 0.0001769971645888654, + "loss": 0.3526, + "step": 10049 + }, + { + "epoch": 0.8141607258587168, + "grad_norm": 0.032530419528484344, + "learning_rate": 0.00017699266393627076, + "loss": 0.2935, + "step": 10050 + }, + { + "epoch": 0.8142417368762151, + "grad_norm": 0.02893023192882538, + "learning_rate": 0.00017698816328367612, + "loss": 0.2801, + "step": 10051 + }, + { + "epoch": 0.8143227478937135, + "grad_norm": 0.03815500810742378, + "learning_rate": 0.00017698366263108154, + "loss": 0.3827, + "step": 10052 + }, + { + "epoch": 0.814403758911212, + "grad_norm": 0.03357556834816933, + "learning_rate": 0.0001769791619784869, + "loss": 0.3487, + "step": 10053 + }, + { + "epoch": 0.8144847699287103, + "grad_norm": 0.03304484114050865, + "learning_rate": 0.00017697466132589226, + "loss": 0.36, + "step": 10054 + }, + { + "epoch": 0.8145657809462087, + "grad_norm": 0.03409358486533165, + "learning_rate": 0.00017697016067329765, + "loss": 0.3653, + "step": 10055 + }, + { + "epoch": 0.814646791963707, + "grad_norm": 0.0298635121434927, + "learning_rate": 0.000176965660020703, + "loss": 0.2991, + "step": 10056 + }, + { + "epoch": 0.8147278029812054, + "grad_norm": 0.03272094205021858, + "learning_rate": 0.00017696115936810837, + "loss": 0.3561, + "step": 10057 + }, + { + "epoch": 
0.8148088139987039, + "grad_norm": 0.031532127410173416, + "learning_rate": 0.00017695665871551378, + "loss": 0.3426, + "step": 10058 + }, + { + "epoch": 0.8148898250162022, + "grad_norm": 0.029812676832079887, + "learning_rate": 0.00017695215806291914, + "loss": 0.3313, + "step": 10059 + }, + { + "epoch": 0.8149708360337006, + "grad_norm": 0.032686151564121246, + "learning_rate": 0.0001769476574103245, + "loss": 0.3582, + "step": 10060 + }, + { + "epoch": 0.8150518470511989, + "grad_norm": 0.02958156354725361, + "learning_rate": 0.0001769431567577299, + "loss": 0.3346, + "step": 10061 + }, + { + "epoch": 0.8151328580686974, + "grad_norm": 0.03275395929813385, + "learning_rate": 0.00017693865610513525, + "loss": 0.3527, + "step": 10062 + }, + { + "epoch": 0.8152138690861958, + "grad_norm": 0.03696022182703018, + "learning_rate": 0.0001769341554525406, + "loss": 0.3439, + "step": 10063 + }, + { + "epoch": 0.8152948801036941, + "grad_norm": 0.036947984248399734, + "learning_rate": 0.00017692965479994602, + "loss": 0.3293, + "step": 10064 + }, + { + "epoch": 0.8153758911211925, + "grad_norm": 0.034265387803316116, + "learning_rate": 0.00017692515414735138, + "loss": 0.3234, + "step": 10065 + }, + { + "epoch": 0.8154569021386908, + "grad_norm": 0.03201863914728165, + "learning_rate": 0.00017692065349475674, + "loss": 0.3489, + "step": 10066 + }, + { + "epoch": 0.8155379131561893, + "grad_norm": 0.0339871384203434, + "learning_rate": 0.00017691615284216213, + "loss": 0.3307, + "step": 10067 + }, + { + "epoch": 0.8156189241736876, + "grad_norm": 0.0365617498755455, + "learning_rate": 0.0001769116521895675, + "loss": 0.3246, + "step": 10068 + }, + { + "epoch": 0.815699935191186, + "grad_norm": 0.032406214624643326, + "learning_rate": 0.00017690715153697285, + "loss": 0.3279, + "step": 10069 + }, + { + "epoch": 0.8157809462086844, + "grad_norm": 0.034895364195108414, + "learning_rate": 0.00017690265088437826, + "loss": 0.3606, + "step": 10070 + }, + { + "epoch": 0.8158619572261827, + "grad_norm": 0.03705539554357529, + "learning_rate": 0.00017689815023178362, + "loss": 0.3373, + "step": 10071 + }, + { + "epoch": 0.8159429682436812, + "grad_norm": 0.03331300616264343, + "learning_rate": 0.00017689364957918898, + "loss": 0.3623, + "step": 10072 + }, + { + "epoch": 0.8160239792611795, + "grad_norm": 0.03506195545196533, + "learning_rate": 0.00017688914892659437, + "loss": 0.3565, + "step": 10073 + }, + { + "epoch": 0.8161049902786779, + "grad_norm": 0.033705681562423706, + "learning_rate": 0.00017688464827399973, + "loss": 0.3304, + "step": 10074 + }, + { + "epoch": 0.8161860012961762, + "grad_norm": 0.03693459555506706, + "learning_rate": 0.0001768801476214051, + "loss": 0.4013, + "step": 10075 + }, + { + "epoch": 0.8162670123136747, + "grad_norm": 0.034374382346868515, + "learning_rate": 0.0001768756469688105, + "loss": 0.3793, + "step": 10076 + }, + { + "epoch": 0.8163480233311731, + "grad_norm": 0.029740922152996063, + "learning_rate": 0.00017687114631621587, + "loss": 0.3023, + "step": 10077 + }, + { + "epoch": 0.8164290343486714, + "grad_norm": 0.03397730737924576, + "learning_rate": 0.00017686664566362123, + "loss": 0.3225, + "step": 10078 + }, + { + "epoch": 0.8165100453661698, + "grad_norm": 0.03439175710082054, + "learning_rate": 0.0001768621450110266, + "loss": 0.3202, + "step": 10079 + }, + { + "epoch": 0.8165910563836681, + "grad_norm": 0.03943687304854393, + "learning_rate": 0.00017685764435843197, + "loss": 0.3415, + "step": 10080 + }, + { + "epoch": 0.8166720674011666, + "grad_norm": 
0.03075510449707508, + "learning_rate": 0.00017685314370583736, + "loss": 0.3429, + "step": 10081 + }, + { + "epoch": 0.816753078418665, + "grad_norm": 0.03713309392333031, + "learning_rate": 0.00017684864305324275, + "loss": 0.3386, + "step": 10082 + }, + { + "epoch": 0.8168340894361633, + "grad_norm": 0.03184698894619942, + "learning_rate": 0.0001768441424006481, + "loss": 0.3186, + "step": 10083 + }, + { + "epoch": 0.8169151004536617, + "grad_norm": 0.028865061700344086, + "learning_rate": 0.00017683964174805347, + "loss": 0.287, + "step": 10084 + }, + { + "epoch": 0.81699611147116, + "grad_norm": 0.02944747731089592, + "learning_rate": 0.00017683514109545885, + "loss": 0.2863, + "step": 10085 + }, + { + "epoch": 0.8170771224886585, + "grad_norm": 0.03916551172733307, + "learning_rate": 0.00017683064044286421, + "loss": 0.3541, + "step": 10086 + }, + { + "epoch": 0.8171581335061568, + "grad_norm": 0.030422579497098923, + "learning_rate": 0.0001768261397902696, + "loss": 0.3448, + "step": 10087 + }, + { + "epoch": 0.8172391445236552, + "grad_norm": 0.03345503285527229, + "learning_rate": 0.000176821639137675, + "loss": 0.3954, + "step": 10088 + }, + { + "epoch": 0.8173201555411536, + "grad_norm": 0.03584202006459236, + "learning_rate": 0.00017681713848508035, + "loss": 0.3315, + "step": 10089 + }, + { + "epoch": 0.817401166558652, + "grad_norm": 0.03900199756026268, + "learning_rate": 0.0001768126378324857, + "loss": 0.382, + "step": 10090 + }, + { + "epoch": 0.8174821775761504, + "grad_norm": 0.03155552223324776, + "learning_rate": 0.0001768081371798911, + "loss": 0.3376, + "step": 10091 + }, + { + "epoch": 0.8175631885936487, + "grad_norm": 0.03717714920639992, + "learning_rate": 0.00017680363652729646, + "loss": 0.3707, + "step": 10092 + }, + { + "epoch": 0.8176441996111471, + "grad_norm": 0.036984365433454514, + "learning_rate": 0.00017679913587470184, + "loss": 0.3643, + "step": 10093 + }, + { + "epoch": 0.8177252106286454, + "grad_norm": 0.03007912077009678, + "learning_rate": 0.00017679463522210723, + "loss": 0.3161, + "step": 10094 + }, + { + "epoch": 0.8178062216461439, + "grad_norm": 0.030122673138976097, + "learning_rate": 0.0001767901345695126, + "loss": 0.3519, + "step": 10095 + }, + { + "epoch": 0.8178872326636423, + "grad_norm": 0.033813752233982086, + "learning_rate": 0.00017678563391691795, + "loss": 0.3217, + "step": 10096 + }, + { + "epoch": 0.8179682436811406, + "grad_norm": 0.03976140543818474, + "learning_rate": 0.00017678113326432334, + "loss": 0.3755, + "step": 10097 + }, + { + "epoch": 0.818049254698639, + "grad_norm": 0.02783932164311409, + "learning_rate": 0.0001767766326117287, + "loss": 0.2846, + "step": 10098 + }, + { + "epoch": 0.8181302657161373, + "grad_norm": 0.03258772939443588, + "learning_rate": 0.00017677213195913408, + "loss": 0.329, + "step": 10099 + }, + { + "epoch": 0.8182112767336358, + "grad_norm": 0.02766154520213604, + "learning_rate": 0.00017676763130653947, + "loss": 0.3437, + "step": 10100 + }, + { + "epoch": 0.8182922877511342, + "grad_norm": 0.03032521903514862, + "learning_rate": 0.00017676313065394483, + "loss": 0.3026, + "step": 10101 + }, + { + "epoch": 0.8183732987686325, + "grad_norm": 0.03306029364466667, + "learning_rate": 0.0001767586300013502, + "loss": 0.3731, + "step": 10102 + }, + { + "epoch": 0.8184543097861309, + "grad_norm": 0.03311099484562874, + "learning_rate": 0.00017675412934875558, + "loss": 0.3349, + "step": 10103 + }, + { + "epoch": 0.8185353208036293, + "grad_norm": 0.03536633029580116, + "learning_rate": 
0.00017674962869616097, + "loss": 0.3336, + "step": 10104 + }, + { + "epoch": 0.8186163318211277, + "grad_norm": 0.034599754959344864, + "learning_rate": 0.00017674512804356633, + "loss": 0.3893, + "step": 10105 + }, + { + "epoch": 0.818697342838626, + "grad_norm": 0.035283416509628296, + "learning_rate": 0.0001767406273909717, + "loss": 0.3381, + "step": 10106 + }, + { + "epoch": 0.8187783538561244, + "grad_norm": 0.035050179809331894, + "learning_rate": 0.00017673612673837707, + "loss": 0.356, + "step": 10107 + }, + { + "epoch": 0.8188593648736228, + "grad_norm": 0.032310303300619125, + "learning_rate": 0.00017673162608578243, + "loss": 0.3146, + "step": 10108 + }, + { + "epoch": 0.8189403758911212, + "grad_norm": 0.03521250560879707, + "learning_rate": 0.00017672712543318782, + "loss": 0.3217, + "step": 10109 + }, + { + "epoch": 0.8190213869086196, + "grad_norm": 0.0328909195959568, + "learning_rate": 0.0001767226247805932, + "loss": 0.3146, + "step": 10110 + }, + { + "epoch": 0.8191023979261179, + "grad_norm": 0.027794158086180687, + "learning_rate": 0.00017671812412799857, + "loss": 0.3128, + "step": 10111 + }, + { + "epoch": 0.8191834089436163, + "grad_norm": 0.032471973448991776, + "learning_rate": 0.00017671362347540395, + "loss": 0.3042, + "step": 10112 + }, + { + "epoch": 0.8192644199611148, + "grad_norm": 0.03750569745898247, + "learning_rate": 0.00017670912282280931, + "loss": 0.3869, + "step": 10113 + }, + { + "epoch": 0.8193454309786131, + "grad_norm": 0.035073671489953995, + "learning_rate": 0.00017670462217021467, + "loss": 0.3423, + "step": 10114 + }, + { + "epoch": 0.8194264419961115, + "grad_norm": 0.033087559044361115, + "learning_rate": 0.00017670012151762006, + "loss": 0.3588, + "step": 10115 + }, + { + "epoch": 0.8195074530136098, + "grad_norm": 0.030937805771827698, + "learning_rate": 0.00017669562086502545, + "loss": 0.3604, + "step": 10116 + }, + { + "epoch": 0.8195884640311082, + "grad_norm": 0.03352980315685272, + "learning_rate": 0.0001766911202124308, + "loss": 0.3375, + "step": 10117 + }, + { + "epoch": 0.8196694750486067, + "grad_norm": 0.030550308525562286, + "learning_rate": 0.0001766866195598362, + "loss": 0.316, + "step": 10118 + }, + { + "epoch": 0.819750486066105, + "grad_norm": 0.030858267098665237, + "learning_rate": 0.00017668211890724156, + "loss": 0.3223, + "step": 10119 + }, + { + "epoch": 0.8198314970836034, + "grad_norm": 0.03076436184346676, + "learning_rate": 0.00017667761825464692, + "loss": 0.3202, + "step": 10120 + }, + { + "epoch": 0.8199125081011017, + "grad_norm": 0.029977362602949142, + "learning_rate": 0.0001766731176020523, + "loss": 0.3316, + "step": 10121 + }, + { + "epoch": 0.8199935191186001, + "grad_norm": 0.03315039351582527, + "learning_rate": 0.0001766686169494577, + "loss": 0.3452, + "step": 10122 + }, + { + "epoch": 0.8200745301360985, + "grad_norm": 0.0308857299387455, + "learning_rate": 0.00017666411629686305, + "loss": 0.3231, + "step": 10123 + }, + { + "epoch": 0.8201555411535969, + "grad_norm": 0.036206457763910294, + "learning_rate": 0.00017665961564426844, + "loss": 0.358, + "step": 10124 + }, + { + "epoch": 0.8202365521710953, + "grad_norm": 0.037194930016994476, + "learning_rate": 0.0001766551149916738, + "loss": 0.3359, + "step": 10125 + }, + { + "epoch": 0.8203175631885936, + "grad_norm": 0.032011616975069046, + "learning_rate": 0.00017665061433907916, + "loss": 0.3539, + "step": 10126 + }, + { + "epoch": 0.8203985742060921, + "grad_norm": 0.03428258001804352, + "learning_rate": 0.00017664611368648455, + "loss": 
0.3126, + "step": 10127 + }, + { + "epoch": 0.8204795852235904, + "grad_norm": 0.03403123840689659, + "learning_rate": 0.00017664161303388993, + "loss": 0.3002, + "step": 10128 + }, + { + "epoch": 0.8205605962410888, + "grad_norm": 0.03198724612593651, + "learning_rate": 0.0001766371123812953, + "loss": 0.3712, + "step": 10129 + }, + { + "epoch": 0.8206416072585871, + "grad_norm": 0.036207620054483414, + "learning_rate": 0.00017663261172870068, + "loss": 0.3321, + "step": 10130 + }, + { + "epoch": 0.8207226182760855, + "grad_norm": 0.029472071677446365, + "learning_rate": 0.00017662811107610604, + "loss": 0.3188, + "step": 10131 + }, + { + "epoch": 0.820803629293584, + "grad_norm": 0.035696543753147125, + "learning_rate": 0.0001766236104235114, + "loss": 0.3607, + "step": 10132 + }, + { + "epoch": 0.8208846403110823, + "grad_norm": 0.03706861287355423, + "learning_rate": 0.00017661910977091681, + "loss": 0.3192, + "step": 10133 + }, + { + "epoch": 0.8209656513285807, + "grad_norm": 0.03445323556661606, + "learning_rate": 0.00017661460911832217, + "loss": 0.3363, + "step": 10134 + }, + { + "epoch": 0.821046662346079, + "grad_norm": 0.0338570699095726, + "learning_rate": 0.00017661010846572753, + "loss": 0.4025, + "step": 10135 + }, + { + "epoch": 0.8211276733635774, + "grad_norm": 0.032541487365961075, + "learning_rate": 0.00017660560781313292, + "loss": 0.3042, + "step": 10136 + }, + { + "epoch": 0.8212086843810759, + "grad_norm": 0.03224986419081688, + "learning_rate": 0.00017660110716053828, + "loss": 0.3385, + "step": 10137 + }, + { + "epoch": 0.8212896953985742, + "grad_norm": 0.03629942238330841, + "learning_rate": 0.00017659660650794364, + "loss": 0.3329, + "step": 10138 + }, + { + "epoch": 0.8213707064160726, + "grad_norm": 0.02894359454512596, + "learning_rate": 0.00017659210585534906, + "loss": 0.2813, + "step": 10139 + }, + { + "epoch": 0.8214517174335709, + "grad_norm": 0.033627405762672424, + "learning_rate": 0.00017658760520275442, + "loss": 0.3437, + "step": 10140 + }, + { + "epoch": 0.8215327284510694, + "grad_norm": 0.03281831368803978, + "learning_rate": 0.00017658310455015978, + "loss": 0.3184, + "step": 10141 + }, + { + "epoch": 0.8216137394685677, + "grad_norm": 0.03470015153288841, + "learning_rate": 0.00017657860389756516, + "loss": 0.3477, + "step": 10142 + }, + { + "epoch": 0.8216947504860661, + "grad_norm": 0.03491885960102081, + "learning_rate": 0.00017657410324497052, + "loss": 0.3619, + "step": 10143 + }, + { + "epoch": 0.8217757615035645, + "grad_norm": 0.03157167136669159, + "learning_rate": 0.00017656960259237588, + "loss": 0.2922, + "step": 10144 + }, + { + "epoch": 0.8218567725210628, + "grad_norm": 0.030484763905405998, + "learning_rate": 0.0001765651019397813, + "loss": 0.3078, + "step": 10145 + }, + { + "epoch": 0.8219377835385613, + "grad_norm": 0.030474934726953506, + "learning_rate": 0.00017656060128718666, + "loss": 0.3291, + "step": 10146 + }, + { + "epoch": 0.8220187945560596, + "grad_norm": 0.030692746862769127, + "learning_rate": 0.00017655610063459202, + "loss": 0.3524, + "step": 10147 + }, + { + "epoch": 0.822099805573558, + "grad_norm": 0.0413612499833107, + "learning_rate": 0.0001765515999819974, + "loss": 0.3379, + "step": 10148 + }, + { + "epoch": 0.8221808165910564, + "grad_norm": 0.03265791013836861, + "learning_rate": 0.00017654709932940276, + "loss": 0.3039, + "step": 10149 + }, + { + "epoch": 0.8222618276085548, + "grad_norm": 0.03477988764643669, + "learning_rate": 0.00017654259867680812, + "loss": 0.4187, + "step": 10150 + }, + { + 
"epoch": 0.8223428386260532, + "grad_norm": 0.03015865385532379, + "learning_rate": 0.00017653809802421354, + "loss": 0.2651, + "step": 10151 + }, + { + "epoch": 0.8224238496435515, + "grad_norm": 0.03432740271091461, + "learning_rate": 0.0001765335973716189, + "loss": 0.3499, + "step": 10152 + }, + { + "epoch": 0.8225048606610499, + "grad_norm": 0.033863767981529236, + "learning_rate": 0.00017652909671902426, + "loss": 0.3242, + "step": 10153 + }, + { + "epoch": 0.8225858716785482, + "grad_norm": 0.03144434094429016, + "learning_rate": 0.00017652459606642965, + "loss": 0.3114, + "step": 10154 + }, + { + "epoch": 0.8226668826960467, + "grad_norm": 0.03368362784385681, + "learning_rate": 0.000176520095413835, + "loss": 0.3216, + "step": 10155 + }, + { + "epoch": 0.8227478937135451, + "grad_norm": 0.032564952969551086, + "learning_rate": 0.0001765155947612404, + "loss": 0.3055, + "step": 10156 + }, + { + "epoch": 0.8228289047310434, + "grad_norm": 0.03535742685198784, + "learning_rate": 0.00017651109410864578, + "loss": 0.3472, + "step": 10157 + }, + { + "epoch": 0.8229099157485418, + "grad_norm": 0.033608000725507736, + "learning_rate": 0.00017650659345605114, + "loss": 0.3403, + "step": 10158 + }, + { + "epoch": 0.8229909267660401, + "grad_norm": 0.03379244729876518, + "learning_rate": 0.0001765020928034565, + "loss": 0.3519, + "step": 10159 + }, + { + "epoch": 0.8230719377835386, + "grad_norm": 0.03284695744514465, + "learning_rate": 0.0001764975921508619, + "loss": 0.3404, + "step": 10160 + }, + { + "epoch": 0.823152948801037, + "grad_norm": 0.03292253240942955, + "learning_rate": 0.00017649309149826725, + "loss": 0.3426, + "step": 10161 + }, + { + "epoch": 0.8232339598185353, + "grad_norm": 0.03824080526828766, + "learning_rate": 0.00017648859084567263, + "loss": 0.3025, + "step": 10162 + }, + { + "epoch": 0.8233149708360337, + "grad_norm": 0.033393364399671555, + "learning_rate": 0.00017648409019307802, + "loss": 0.3542, + "step": 10163 + }, + { + "epoch": 0.8233959818535321, + "grad_norm": 0.03220822662115097, + "learning_rate": 0.00017647958954048338, + "loss": 0.3412, + "step": 10164 + }, + { + "epoch": 0.8234769928710305, + "grad_norm": 0.033468231558799744, + "learning_rate": 0.00017647508888788874, + "loss": 0.3682, + "step": 10165 + }, + { + "epoch": 0.8235580038885288, + "grad_norm": 0.03393976017832756, + "learning_rate": 0.00017647058823529413, + "loss": 0.3251, + "step": 10166 + }, + { + "epoch": 0.8236390149060272, + "grad_norm": 0.03448769450187683, + "learning_rate": 0.0001764660875826995, + "loss": 0.3865, + "step": 10167 + }, + { + "epoch": 0.8237200259235256, + "grad_norm": 0.03279362991452217, + "learning_rate": 0.00017646158693010488, + "loss": 0.3293, + "step": 10168 + }, + { + "epoch": 0.823801036941024, + "grad_norm": 0.03309918940067291, + "learning_rate": 0.00017645708627751026, + "loss": 0.3241, + "step": 10169 + }, + { + "epoch": 0.8238820479585224, + "grad_norm": 0.03760819882154465, + "learning_rate": 0.00017645258562491562, + "loss": 0.3705, + "step": 10170 + }, + { + "epoch": 0.8239630589760207, + "grad_norm": 0.03316948935389519, + "learning_rate": 0.00017644808497232098, + "loss": 0.3367, + "step": 10171 + }, + { + "epoch": 0.8240440699935191, + "grad_norm": 0.034577999264001846, + "learning_rate": 0.00017644358431972637, + "loss": 0.3283, + "step": 10172 + }, + { + "epoch": 0.8241250810110174, + "grad_norm": 0.031563322991132736, + "learning_rate": 0.00017643908366713173, + "loss": 0.3797, + "step": 10173 + }, + { + "epoch": 0.8242060920285159, + 
"grad_norm": 0.03220498561859131, + "learning_rate": 0.00017643458301453712, + "loss": 0.2818, + "step": 10174 + }, + { + "epoch": 0.8242871030460143, + "grad_norm": 0.036133624613285065, + "learning_rate": 0.0001764300823619425, + "loss": 0.3076, + "step": 10175 + }, + { + "epoch": 0.8243681140635126, + "grad_norm": 0.0324624739587307, + "learning_rate": 0.00017642558170934787, + "loss": 0.3597, + "step": 10176 + }, + { + "epoch": 0.824449125081011, + "grad_norm": 0.0329521968960762, + "learning_rate": 0.00017642108105675323, + "loss": 0.3029, + "step": 10177 + }, + { + "epoch": 0.8245301360985094, + "grad_norm": 0.03469759225845337, + "learning_rate": 0.0001764165804041586, + "loss": 0.3496, + "step": 10178 + }, + { + "epoch": 0.8246111471160078, + "grad_norm": 0.03664855659008026, + "learning_rate": 0.00017641207975156397, + "loss": 0.3336, + "step": 10179 + }, + { + "epoch": 0.8246921581335062, + "grad_norm": 0.03370808809995651, + "learning_rate": 0.00017640757909896936, + "loss": 0.3523, + "step": 10180 + }, + { + "epoch": 0.8247731691510045, + "grad_norm": 0.029433516785502434, + "learning_rate": 0.00017640307844637475, + "loss": 0.3281, + "step": 10181 + }, + { + "epoch": 0.8248541801685029, + "grad_norm": 0.029438281431794167, + "learning_rate": 0.0001763985777937801, + "loss": 0.2875, + "step": 10182 + }, + { + "epoch": 0.8249351911860013, + "grad_norm": 0.03132937103509903, + "learning_rate": 0.00017639407714118547, + "loss": 0.3136, + "step": 10183 + }, + { + "epoch": 0.8250162022034997, + "grad_norm": 0.03620680794119835, + "learning_rate": 0.00017638957648859085, + "loss": 0.3434, + "step": 10184 + }, + { + "epoch": 0.825097213220998, + "grad_norm": 0.031460925936698914, + "learning_rate": 0.00017638507583599624, + "loss": 0.3506, + "step": 10185 + }, + { + "epoch": 0.8251782242384964, + "grad_norm": 0.03671254590153694, + "learning_rate": 0.0001763805751834016, + "loss": 0.3396, + "step": 10186 + }, + { + "epoch": 0.8252592352559948, + "grad_norm": 0.040558792650699615, + "learning_rate": 0.000176376074530807, + "loss": 0.371, + "step": 10187 + }, + { + "epoch": 0.8253402462734932, + "grad_norm": 0.040832262486219406, + "learning_rate": 0.00017637157387821235, + "loss": 0.3513, + "step": 10188 + }, + { + "epoch": 0.8254212572909916, + "grad_norm": 0.030765267089009285, + "learning_rate": 0.0001763670732256177, + "loss": 0.3731, + "step": 10189 + }, + { + "epoch": 0.8255022683084899, + "grad_norm": 0.032495588064193726, + "learning_rate": 0.0001763625725730231, + "loss": 0.3384, + "step": 10190 + }, + { + "epoch": 0.8255832793259883, + "grad_norm": 0.035736821591854095, + "learning_rate": 0.00017635807192042848, + "loss": 0.3452, + "step": 10191 + }, + { + "epoch": 0.8256642903434868, + "grad_norm": 0.03253974765539169, + "learning_rate": 0.00017635357126783384, + "loss": 0.3396, + "step": 10192 + }, + { + "epoch": 0.8257453013609851, + "grad_norm": 0.033621449023485184, + "learning_rate": 0.00017634907061523923, + "loss": 0.3445, + "step": 10193 + }, + { + "epoch": 0.8258263123784835, + "grad_norm": 0.035593949258327484, + "learning_rate": 0.0001763445699626446, + "loss": 0.3359, + "step": 10194 + }, + { + "epoch": 0.8259073233959818, + "grad_norm": 0.03543344885110855, + "learning_rate": 0.00017634006931004995, + "loss": 0.3429, + "step": 10195 + }, + { + "epoch": 0.8259883344134802, + "grad_norm": 0.03207477554678917, + "learning_rate": 0.00017633556865745534, + "loss": 0.3537, + "step": 10196 + }, + { + "epoch": 0.8260693454309787, + "grad_norm": 0.03806965425610542, + 
"learning_rate": 0.00017633106800486072, + "loss": 0.317, + "step": 10197 + }, + { + "epoch": 0.826150356448477, + "grad_norm": 0.048110056668519974, + "learning_rate": 0.00017632656735226608, + "loss": 0.3788, + "step": 10198 + }, + { + "epoch": 0.8262313674659754, + "grad_norm": 0.034069642424583435, + "learning_rate": 0.00017632206669967147, + "loss": 0.3338, + "step": 10199 + }, + { + "epoch": 0.8263123784834737, + "grad_norm": 0.03619742393493652, + "learning_rate": 0.00017631756604707683, + "loss": 0.344, + "step": 10200 + }, + { + "epoch": 0.8263933895009722, + "grad_norm": 0.03356582671403885, + "learning_rate": 0.0001763130653944822, + "loss": 0.3025, + "step": 10201 + }, + { + "epoch": 0.8264744005184705, + "grad_norm": 0.030345069244503975, + "learning_rate": 0.00017630856474188758, + "loss": 0.3266, + "step": 10202 + }, + { + "epoch": 0.8265554115359689, + "grad_norm": 0.029872342944145203, + "learning_rate": 0.00017630406408929297, + "loss": 0.3091, + "step": 10203 + }, + { + "epoch": 0.8266364225534673, + "grad_norm": 0.03376317024230957, + "learning_rate": 0.00017629956343669833, + "loss": 0.3665, + "step": 10204 + }, + { + "epoch": 0.8267174335709656, + "grad_norm": 0.03272249549627304, + "learning_rate": 0.0001762950627841037, + "loss": 0.3275, + "step": 10205 + }, + { + "epoch": 0.8267984445884641, + "grad_norm": 0.035184647887945175, + "learning_rate": 0.00017629056213150907, + "loss": 0.367, + "step": 10206 + }, + { + "epoch": 0.8268794556059624, + "grad_norm": 0.02668853849172592, + "learning_rate": 0.00017628606147891443, + "loss": 0.2825, + "step": 10207 + }, + { + "epoch": 0.8269604666234608, + "grad_norm": 0.0330558642745018, + "learning_rate": 0.00017628156082631982, + "loss": 0.3166, + "step": 10208 + }, + { + "epoch": 0.8270414776409591, + "grad_norm": 0.03391389548778534, + "learning_rate": 0.0001762770601737252, + "loss": 0.3426, + "step": 10209 + }, + { + "epoch": 0.8271224886584575, + "grad_norm": 0.03699477016925812, + "learning_rate": 0.00017627255952113057, + "loss": 0.3551, + "step": 10210 + }, + { + "epoch": 0.827203499675956, + "grad_norm": 0.028039144352078438, + "learning_rate": 0.00017626805886853595, + "loss": 0.2904, + "step": 10211 + }, + { + "epoch": 0.8272845106934543, + "grad_norm": 0.031306397169828415, + "learning_rate": 0.00017626355821594132, + "loss": 0.3135, + "step": 10212 + }, + { + "epoch": 0.8273655217109527, + "grad_norm": 0.032856930047273636, + "learning_rate": 0.00017625905756334668, + "loss": 0.3002, + "step": 10213 + }, + { + "epoch": 0.827446532728451, + "grad_norm": 0.032040588557720184, + "learning_rate": 0.0001762545569107521, + "loss": 0.329, + "step": 10214 + }, + { + "epoch": 0.8275275437459495, + "grad_norm": 0.03887049853801727, + "learning_rate": 0.00017625005625815745, + "loss": 0.3489, + "step": 10215 + }, + { + "epoch": 0.8276085547634479, + "grad_norm": 0.03505093976855278, + "learning_rate": 0.0001762455556055628, + "loss": 0.3306, + "step": 10216 + }, + { + "epoch": 0.8276895657809462, + "grad_norm": 0.032911516726017, + "learning_rate": 0.0001762410549529682, + "loss": 0.3399, + "step": 10217 + }, + { + "epoch": 0.8277705767984446, + "grad_norm": 0.029951702803373337, + "learning_rate": 0.00017623655430037356, + "loss": 0.2972, + "step": 10218 + }, + { + "epoch": 0.8278515878159429, + "grad_norm": 0.039747558534145355, + "learning_rate": 0.00017623205364777892, + "loss": 0.3619, + "step": 10219 + }, + { + "epoch": 0.8279325988334414, + "grad_norm": 0.03002556785941124, + "learning_rate": 0.00017622755299518433, 
+ "loss": 0.3075, + "step": 10220 + }, + { + "epoch": 0.8280136098509397, + "grad_norm": 0.031569890677928925, + "learning_rate": 0.0001762230523425897, + "loss": 0.2906, + "step": 10221 + }, + { + "epoch": 0.8280946208684381, + "grad_norm": 0.036268286406993866, + "learning_rate": 0.00017621855168999505, + "loss": 0.365, + "step": 10222 + }, + { + "epoch": 0.8281756318859365, + "grad_norm": 0.0369952954351902, + "learning_rate": 0.00017621405103740044, + "loss": 0.341, + "step": 10223 + }, + { + "epoch": 0.8282566429034348, + "grad_norm": 0.03524234518408775, + "learning_rate": 0.0001762095503848058, + "loss": 0.3623, + "step": 10224 + }, + { + "epoch": 0.8283376539209333, + "grad_norm": 0.03595670685172081, + "learning_rate": 0.00017620504973221116, + "loss": 0.3357, + "step": 10225 + }, + { + "epoch": 0.8284186649384316, + "grad_norm": 0.03600858896970749, + "learning_rate": 0.00017620054907961657, + "loss": 0.3593, + "step": 10226 + }, + { + "epoch": 0.82849967595593, + "grad_norm": 0.0327393002808094, + "learning_rate": 0.00017619604842702193, + "loss": 0.3542, + "step": 10227 + }, + { + "epoch": 0.8285806869734283, + "grad_norm": 0.03379761055111885, + "learning_rate": 0.0001761915477744273, + "loss": 0.3669, + "step": 10228 + }, + { + "epoch": 0.8286616979909268, + "grad_norm": 0.03441119194030762, + "learning_rate": 0.00017618704712183268, + "loss": 0.3492, + "step": 10229 + }, + { + "epoch": 0.8287427090084252, + "grad_norm": 0.03550002723932266, + "learning_rate": 0.00017618254646923804, + "loss": 0.3634, + "step": 10230 + }, + { + "epoch": 0.8288237200259235, + "grad_norm": 0.029466258361935616, + "learning_rate": 0.0001761780458166434, + "loss": 0.3013, + "step": 10231 + }, + { + "epoch": 0.8289047310434219, + "grad_norm": 0.03521445021033287, + "learning_rate": 0.00017617354516404881, + "loss": 0.3302, + "step": 10232 + }, + { + "epoch": 0.8289857420609202, + "grad_norm": 0.03384638950228691, + "learning_rate": 0.00017616904451145417, + "loss": 0.3117, + "step": 10233 + }, + { + "epoch": 0.8290667530784187, + "grad_norm": 0.03205539658665657, + "learning_rate": 0.00017616454385885953, + "loss": 0.2758, + "step": 10234 + }, + { + "epoch": 0.8291477640959171, + "grad_norm": 0.034657254815101624, + "learning_rate": 0.00017616004320626492, + "loss": 0.3505, + "step": 10235 + }, + { + "epoch": 0.8292287751134154, + "grad_norm": 0.03520963340997696, + "learning_rate": 0.00017615554255367028, + "loss": 0.3589, + "step": 10236 + }, + { + "epoch": 0.8293097861309138, + "grad_norm": 0.03619067370891571, + "learning_rate": 0.00017615104190107567, + "loss": 0.3584, + "step": 10237 + }, + { + "epoch": 0.8293907971484121, + "grad_norm": 0.03255670145153999, + "learning_rate": 0.00017614654124848106, + "loss": 0.3254, + "step": 10238 + }, + { + "epoch": 0.8294718081659106, + "grad_norm": 0.037719033658504486, + "learning_rate": 0.00017614204059588642, + "loss": 0.3668, + "step": 10239 + }, + { + "epoch": 0.829552819183409, + "grad_norm": 0.03547034412622452, + "learning_rate": 0.00017613753994329178, + "loss": 0.348, + "step": 10240 + }, + { + "epoch": 0.8296338302009073, + "grad_norm": 0.03199911117553711, + "learning_rate": 0.00017613303929069716, + "loss": 0.346, + "step": 10241 + }, + { + "epoch": 0.8297148412184057, + "grad_norm": 0.032698921859264374, + "learning_rate": 0.00017612853863810252, + "loss": 0.3688, + "step": 10242 + }, + { + "epoch": 0.8297958522359041, + "grad_norm": 0.03255463391542435, + "learning_rate": 0.0001761240379855079, + "loss": 0.3056, + "step": 10243 + }, + { 
+ "epoch": 0.8298768632534025, + "grad_norm": 0.03685171157121658, + "learning_rate": 0.0001761195373329133, + "loss": 0.3088, + "step": 10244 + }, + { + "epoch": 0.8299578742709008, + "grad_norm": 0.035410698503255844, + "learning_rate": 0.00017611503668031866, + "loss": 0.3413, + "step": 10245 + }, + { + "epoch": 0.8300388852883992, + "grad_norm": 0.036075409501791, + "learning_rate": 0.00017611053602772402, + "loss": 0.3515, + "step": 10246 + }, + { + "epoch": 0.8301198963058976, + "grad_norm": 0.04027034714818001, + "learning_rate": 0.0001761060353751294, + "loss": 0.3424, + "step": 10247 + }, + { + "epoch": 0.830200907323396, + "grad_norm": 0.03788850083947182, + "learning_rate": 0.00017610153472253476, + "loss": 0.3113, + "step": 10248 + }, + { + "epoch": 0.8302819183408944, + "grad_norm": 0.030312340706586838, + "learning_rate": 0.00017609703406994015, + "loss": 0.3107, + "step": 10249 + }, + { + "epoch": 0.8303629293583927, + "grad_norm": 0.0325654000043869, + "learning_rate": 0.00017609253341734554, + "loss": 0.3605, + "step": 10250 + }, + { + "epoch": 0.8304439403758911, + "grad_norm": 0.032030895352363586, + "learning_rate": 0.0001760880327647509, + "loss": 0.3274, + "step": 10251 + }, + { + "epoch": 0.8305249513933896, + "grad_norm": 0.03195015341043472, + "learning_rate": 0.00017608353211215626, + "loss": 0.3199, + "step": 10252 + }, + { + "epoch": 0.8306059624108879, + "grad_norm": 0.03378359600901604, + "learning_rate": 0.00017607903145956165, + "loss": 0.3425, + "step": 10253 + }, + { + "epoch": 0.8306869734283863, + "grad_norm": 0.030721347779035568, + "learning_rate": 0.000176074530806967, + "loss": 0.3134, + "step": 10254 + }, + { + "epoch": 0.8307679844458846, + "grad_norm": 0.037971507757902145, + "learning_rate": 0.0001760700301543724, + "loss": 0.3284, + "step": 10255 + }, + { + "epoch": 0.830848995463383, + "grad_norm": 0.03493460640311241, + "learning_rate": 0.00017606552950177778, + "loss": 0.3524, + "step": 10256 + }, + { + "epoch": 0.8309300064808814, + "grad_norm": 0.026955394074320793, + "learning_rate": 0.00017606102884918314, + "loss": 0.2959, + "step": 10257 + }, + { + "epoch": 0.8310110174983798, + "grad_norm": 0.03566323220729828, + "learning_rate": 0.0001760565281965885, + "loss": 0.3519, + "step": 10258 + }, + { + "epoch": 0.8310920285158782, + "grad_norm": 0.035410210490226746, + "learning_rate": 0.0001760520275439939, + "loss": 0.3128, + "step": 10259 + }, + { + "epoch": 0.8311730395333765, + "grad_norm": 0.037130143493413925, + "learning_rate": 0.00017604752689139925, + "loss": 0.3271, + "step": 10260 + }, + { + "epoch": 0.8312540505508749, + "grad_norm": 0.0340542234480381, + "learning_rate": 0.00017604302623880464, + "loss": 0.3247, + "step": 10261 + }, + { + "epoch": 0.8313350615683733, + "grad_norm": 0.03176279738545418, + "learning_rate": 0.00017603852558621002, + "loss": 0.3623, + "step": 10262 + }, + { + "epoch": 0.8314160725858717, + "grad_norm": 0.034670326858758926, + "learning_rate": 0.00017603402493361538, + "loss": 0.3571, + "step": 10263 + }, + { + "epoch": 0.83149708360337, + "grad_norm": 0.03411025553941727, + "learning_rate": 0.00017602952428102074, + "loss": 0.3358, + "step": 10264 + }, + { + "epoch": 0.8315780946208684, + "grad_norm": 0.03430251032114029, + "learning_rate": 0.00017602502362842613, + "loss": 0.3704, + "step": 10265 + }, + { + "epoch": 0.8316591056383669, + "grad_norm": 0.034683723002672195, + "learning_rate": 0.00017602052297583152, + "loss": 0.3604, + "step": 10266 + }, + { + "epoch": 0.8317401166558652, + 
"grad_norm": 0.027798432856798172, + "learning_rate": 0.00017601602232323688, + "loss": 0.2718, + "step": 10267 + }, + { + "epoch": 0.8318211276733636, + "grad_norm": 0.029978135600686073, + "learning_rate": 0.00017601152167064226, + "loss": 0.3384, + "step": 10268 + }, + { + "epoch": 0.8319021386908619, + "grad_norm": 0.03305796906352043, + "learning_rate": 0.00017600702101804762, + "loss": 0.3121, + "step": 10269 + }, + { + "epoch": 0.8319831497083603, + "grad_norm": 0.036289576441049576, + "learning_rate": 0.00017600252036545298, + "loss": 0.3989, + "step": 10270 + }, + { + "epoch": 0.8320641607258588, + "grad_norm": 0.0353555865585804, + "learning_rate": 0.00017599801971285837, + "loss": 0.3777, + "step": 10271 + }, + { + "epoch": 0.8321451717433571, + "grad_norm": 0.030521580949425697, + "learning_rate": 0.00017599351906026376, + "loss": 0.367, + "step": 10272 + }, + { + "epoch": 0.8322261827608555, + "grad_norm": 0.04173492640256882, + "learning_rate": 0.00017598901840766912, + "loss": 0.3639, + "step": 10273 + }, + { + "epoch": 0.8323071937783538, + "grad_norm": 0.02986193262040615, + "learning_rate": 0.0001759845177550745, + "loss": 0.3024, + "step": 10274 + }, + { + "epoch": 0.8323882047958522, + "grad_norm": 0.032112687826156616, + "learning_rate": 0.00017598001710247987, + "loss": 0.3203, + "step": 10275 + }, + { + "epoch": 0.8324692158133506, + "grad_norm": 0.03689032047986984, + "learning_rate": 0.00017597551644988523, + "loss": 0.3443, + "step": 10276 + }, + { + "epoch": 0.832550226830849, + "grad_norm": 0.0335177518427372, + "learning_rate": 0.0001759710157972906, + "loss": 0.3579, + "step": 10277 + }, + { + "epoch": 0.8326312378483474, + "grad_norm": 0.03220372647047043, + "learning_rate": 0.000175966515144696, + "loss": 0.3264, + "step": 10278 + }, + { + "epoch": 0.8327122488658457, + "grad_norm": 0.03158966824412346, + "learning_rate": 0.00017596201449210136, + "loss": 0.3068, + "step": 10279 + }, + { + "epoch": 0.8327932598833442, + "grad_norm": 0.035905398428440094, + "learning_rate": 0.00017595751383950675, + "loss": 0.3317, + "step": 10280 + }, + { + "epoch": 0.8328742709008425, + "grad_norm": 0.04016716405749321, + "learning_rate": 0.0001759530131869121, + "loss": 0.372, + "step": 10281 + }, + { + "epoch": 0.8329552819183409, + "grad_norm": 0.033283233642578125, + "learning_rate": 0.00017594851253431747, + "loss": 0.3443, + "step": 10282 + }, + { + "epoch": 0.8330362929358393, + "grad_norm": 0.04536040499806404, + "learning_rate": 0.00017594401188172285, + "loss": 0.3507, + "step": 10283 + }, + { + "epoch": 0.8331173039533376, + "grad_norm": 0.037573862820863724, + "learning_rate": 0.00017593951122912824, + "loss": 0.3726, + "step": 10284 + }, + { + "epoch": 0.8331983149708361, + "grad_norm": 0.036852750927209854, + "learning_rate": 0.0001759350105765336, + "loss": 0.3693, + "step": 10285 + }, + { + "epoch": 0.8332793259883344, + "grad_norm": 0.036361176520586014, + "learning_rate": 0.000175930509923939, + "loss": 0.3116, + "step": 10286 + }, + { + "epoch": 0.8333603370058328, + "grad_norm": 0.032356224954128265, + "learning_rate": 0.00017592600927134435, + "loss": 0.3175, + "step": 10287 + }, + { + "epoch": 0.8334413480233311, + "grad_norm": 0.03756551444530487, + "learning_rate": 0.0001759215086187497, + "loss": 0.3124, + "step": 10288 + }, + { + "epoch": 0.8335223590408296, + "grad_norm": 0.029808249324560165, + "learning_rate": 0.00017591700796615512, + "loss": 0.3145, + "step": 10289 + }, + { + "epoch": 0.833603370058328, + "grad_norm": 0.03625890985131264, + 
"learning_rate": 0.00017591250731356048, + "loss": 0.3583, + "step": 10290 + }, + { + "epoch": 0.8336843810758263, + "grad_norm": 0.030652465298771858, + "learning_rate": 0.00017590800666096584, + "loss": 0.3323, + "step": 10291 + }, + { + "epoch": 0.8337653920933247, + "grad_norm": 0.03474270552396774, + "learning_rate": 0.00017590350600837123, + "loss": 0.3724, + "step": 10292 + }, + { + "epoch": 0.833846403110823, + "grad_norm": 0.03331577777862549, + "learning_rate": 0.0001758990053557766, + "loss": 0.3337, + "step": 10293 + }, + { + "epoch": 0.8339274141283215, + "grad_norm": 0.034953150898218155, + "learning_rate": 0.00017589450470318195, + "loss": 0.3337, + "step": 10294 + }, + { + "epoch": 0.8340084251458199, + "grad_norm": 0.0396929569542408, + "learning_rate": 0.00017589000405058736, + "loss": 0.378, + "step": 10295 + }, + { + "epoch": 0.8340894361633182, + "grad_norm": 0.03195977956056595, + "learning_rate": 0.00017588550339799272, + "loss": 0.3093, + "step": 10296 + }, + { + "epoch": 0.8341704471808166, + "grad_norm": 0.03176313266158104, + "learning_rate": 0.00017588100274539808, + "loss": 0.35, + "step": 10297 + }, + { + "epoch": 0.8342514581983149, + "grad_norm": 0.03105923719704151, + "learning_rate": 0.00017587650209280347, + "loss": 0.3063, + "step": 10298 + }, + { + "epoch": 0.8343324692158134, + "grad_norm": 0.03520163521170616, + "learning_rate": 0.00017587200144020883, + "loss": 0.3734, + "step": 10299 + }, + { + "epoch": 0.8344134802333117, + "grad_norm": 0.0330432653427124, + "learning_rate": 0.0001758675007876142, + "loss": 0.3141, + "step": 10300 + }, + { + "epoch": 0.8344944912508101, + "grad_norm": 0.02842598967254162, + "learning_rate": 0.0001758630001350196, + "loss": 0.312, + "step": 10301 + }, + { + "epoch": 0.8345755022683085, + "grad_norm": 0.036401763558387756, + "learning_rate": 0.00017585849948242497, + "loss": 0.4048, + "step": 10302 + }, + { + "epoch": 0.8346565132858069, + "grad_norm": 0.03183615207672119, + "learning_rate": 0.00017585399882983033, + "loss": 0.3077, + "step": 10303 + }, + { + "epoch": 0.8347375243033053, + "grad_norm": 0.030855121091008186, + "learning_rate": 0.00017584949817723571, + "loss": 0.3145, + "step": 10304 + }, + { + "epoch": 0.8348185353208036, + "grad_norm": 0.0320095457136631, + "learning_rate": 0.00017584499752464107, + "loss": 0.3457, + "step": 10305 + }, + { + "epoch": 0.834899546338302, + "grad_norm": 0.0305408276617527, + "learning_rate": 0.00017584049687204643, + "loss": 0.3296, + "step": 10306 + }, + { + "epoch": 0.8349805573558003, + "grad_norm": 0.03183217719197273, + "learning_rate": 0.00017583599621945185, + "loss": 0.3173, + "step": 10307 + }, + { + "epoch": 0.8350615683732988, + "grad_norm": 0.03041781298816204, + "learning_rate": 0.0001758314955668572, + "loss": 0.3178, + "step": 10308 + }, + { + "epoch": 0.8351425793907972, + "grad_norm": 0.035926688462495804, + "learning_rate": 0.00017582699491426257, + "loss": 0.3723, + "step": 10309 + }, + { + "epoch": 0.8352235904082955, + "grad_norm": 0.034372445195913315, + "learning_rate": 0.00017582249426166796, + "loss": 0.3116, + "step": 10310 + }, + { + "epoch": 0.8353046014257939, + "grad_norm": 0.03208107501268387, + "learning_rate": 0.00017581799360907332, + "loss": 0.2975, + "step": 10311 + }, + { + "epoch": 0.8353856124432922, + "grad_norm": 0.0312693789601326, + "learning_rate": 0.00017581349295647868, + "loss": 0.3534, + "step": 10312 + }, + { + "epoch": 0.8354666234607907, + "grad_norm": 0.02982800267636776, + "learning_rate": 0.0001758089923038841, + 
"loss": 0.2737, + "step": 10313 + }, + { + "epoch": 0.8355476344782891, + "grad_norm": 0.03189459815621376, + "learning_rate": 0.00017580449165128945, + "loss": 0.325, + "step": 10314 + }, + { + "epoch": 0.8356286454957874, + "grad_norm": 0.032958321273326874, + "learning_rate": 0.0001757999909986948, + "loss": 0.3049, + "step": 10315 + }, + { + "epoch": 0.8357096565132858, + "grad_norm": 0.0360761396586895, + "learning_rate": 0.0001757954903461002, + "loss": 0.3337, + "step": 10316 + }, + { + "epoch": 0.8357906675307842, + "grad_norm": 0.03744465112686157, + "learning_rate": 0.00017579098969350556, + "loss": 0.3731, + "step": 10317 + }, + { + "epoch": 0.8358716785482826, + "grad_norm": 0.02999812364578247, + "learning_rate": 0.00017578648904091094, + "loss": 0.3192, + "step": 10318 + }, + { + "epoch": 0.835952689565781, + "grad_norm": 0.03587619215250015, + "learning_rate": 0.00017578198838831633, + "loss": 0.3333, + "step": 10319 + }, + { + "epoch": 0.8360337005832793, + "grad_norm": 0.03604312986135483, + "learning_rate": 0.0001757774877357217, + "loss": 0.3697, + "step": 10320 + }, + { + "epoch": 0.8361147116007777, + "grad_norm": 0.03261552378535271, + "learning_rate": 0.00017577298708312705, + "loss": 0.3659, + "step": 10321 + }, + { + "epoch": 0.8361957226182761, + "grad_norm": 0.03550964221358299, + "learning_rate": 0.00017576848643053244, + "loss": 0.3168, + "step": 10322 + }, + { + "epoch": 0.8362767336357745, + "grad_norm": 0.03809160739183426, + "learning_rate": 0.0001757639857779378, + "loss": 0.3518, + "step": 10323 + }, + { + "epoch": 0.8363577446532728, + "grad_norm": 0.03714146092534065, + "learning_rate": 0.00017575948512534319, + "loss": 0.3622, + "step": 10324 + }, + { + "epoch": 0.8364387556707712, + "grad_norm": 0.03425988182425499, + "learning_rate": 0.00017575498447274857, + "loss": 0.3196, + "step": 10325 + }, + { + "epoch": 0.8365197666882696, + "grad_norm": 0.03243013843894005, + "learning_rate": 0.00017575048382015393, + "loss": 0.2836, + "step": 10326 + }, + { + "epoch": 0.836600777705768, + "grad_norm": 0.03315865993499756, + "learning_rate": 0.0001757459831675593, + "loss": 0.3028, + "step": 10327 + }, + { + "epoch": 0.8366817887232664, + "grad_norm": 0.03174262493848801, + "learning_rate": 0.00017574148251496468, + "loss": 0.3402, + "step": 10328 + }, + { + "epoch": 0.8367627997407647, + "grad_norm": 0.03199068829417229, + "learning_rate": 0.00017573698186237004, + "loss": 0.3028, + "step": 10329 + }, + { + "epoch": 0.8368438107582631, + "grad_norm": 0.03462492674589157, + "learning_rate": 0.00017573248120977543, + "loss": 0.3441, + "step": 10330 + }, + { + "epoch": 0.8369248217757616, + "grad_norm": 0.031907789409160614, + "learning_rate": 0.00017572798055718081, + "loss": 0.3094, + "step": 10331 + }, + { + "epoch": 0.8370058327932599, + "grad_norm": 0.03425981476902962, + "learning_rate": 0.00017572347990458617, + "loss": 0.3765, + "step": 10332 + }, + { + "epoch": 0.8370868438107583, + "grad_norm": 0.030974196270108223, + "learning_rate": 0.00017571897925199153, + "loss": 0.3041, + "step": 10333 + }, + { + "epoch": 0.8371678548282566, + "grad_norm": 0.033227961510419846, + "learning_rate": 0.00017571447859939692, + "loss": 0.3493, + "step": 10334 + }, + { + "epoch": 0.837248865845755, + "grad_norm": 0.03223063424229622, + "learning_rate": 0.00017570997794680228, + "loss": 0.3668, + "step": 10335 + }, + { + "epoch": 0.8373298768632534, + "grad_norm": 0.03262501582503319, + "learning_rate": 0.00017570547729420767, + "loss": 0.3485, + "step": 10336 + }, + { 
+ "epoch": 0.8374108878807518, + "grad_norm": 0.03690466657280922, + "learning_rate": 0.00017570097664161306, + "loss": 0.3207, + "step": 10337 + }, + { + "epoch": 0.8374918988982502, + "grad_norm": 0.042168475687503815, + "learning_rate": 0.00017569647598901842, + "loss": 0.326, + "step": 10338 + }, + { + "epoch": 0.8375729099157485, + "grad_norm": 0.03320693597197533, + "learning_rate": 0.00017569197533642378, + "loss": 0.3514, + "step": 10339 + }, + { + "epoch": 0.837653920933247, + "grad_norm": 0.03084513545036316, + "learning_rate": 0.00017568747468382916, + "loss": 0.3072, + "step": 10340 + }, + { + "epoch": 0.8377349319507453, + "grad_norm": 0.03049416095018387, + "learning_rate": 0.00017568297403123455, + "loss": 0.3292, + "step": 10341 + }, + { + "epoch": 0.8378159429682437, + "grad_norm": 0.03144636005163193, + "learning_rate": 0.0001756784733786399, + "loss": 0.3068, + "step": 10342 + }, + { + "epoch": 0.837896953985742, + "grad_norm": 0.032478101551532745, + "learning_rate": 0.0001756739727260453, + "loss": 0.3345, + "step": 10343 + }, + { + "epoch": 0.8379779650032404, + "grad_norm": 0.034912481904029846, + "learning_rate": 0.00017566947207345066, + "loss": 0.3603, + "step": 10344 + }, + { + "epoch": 0.8380589760207389, + "grad_norm": 0.02963745966553688, + "learning_rate": 0.00017566497142085602, + "loss": 0.3777, + "step": 10345 + }, + { + "epoch": 0.8381399870382372, + "grad_norm": 0.029963036999106407, + "learning_rate": 0.0001756604707682614, + "loss": 0.3091, + "step": 10346 + }, + { + "epoch": 0.8382209980557356, + "grad_norm": 0.033851027488708496, + "learning_rate": 0.0001756559701156668, + "loss": 0.3792, + "step": 10347 + }, + { + "epoch": 0.8383020090732339, + "grad_norm": 0.03498721122741699, + "learning_rate": 0.00017565146946307215, + "loss": 0.3297, + "step": 10348 + }, + { + "epoch": 0.8383830200907323, + "grad_norm": 0.032017070800065994, + "learning_rate": 0.00017564696881047754, + "loss": 0.3312, + "step": 10349 + }, + { + "epoch": 0.8384640311082308, + "grad_norm": 0.036637600511312485, + "learning_rate": 0.0001756424681578829, + "loss": 0.3858, + "step": 10350 + }, + { + "epoch": 0.8385450421257291, + "grad_norm": 0.034795694053173065, + "learning_rate": 0.00017563796750528826, + "loss": 0.3326, + "step": 10351 + }, + { + "epoch": 0.8386260531432275, + "grad_norm": 0.04814450815320015, + "learning_rate": 0.00017563346685269365, + "loss": 0.362, + "step": 10352 + }, + { + "epoch": 0.8387070641607258, + "grad_norm": 0.034067779779434204, + "learning_rate": 0.00017562896620009903, + "loss": 0.3023, + "step": 10353 + }, + { + "epoch": 0.8387880751782243, + "grad_norm": 0.04440617933869362, + "learning_rate": 0.0001756244655475044, + "loss": 0.3306, + "step": 10354 + }, + { + "epoch": 0.8388690861957226, + "grad_norm": 0.033861372619867325, + "learning_rate": 0.00017561996489490978, + "loss": 0.3611, + "step": 10355 + }, + { + "epoch": 0.838950097213221, + "grad_norm": 0.0317358635365963, + "learning_rate": 0.00017561546424231514, + "loss": 0.3187, + "step": 10356 + }, + { + "epoch": 0.8390311082307194, + "grad_norm": 0.030743295326828957, + "learning_rate": 0.0001756109635897205, + "loss": 0.2925, + "step": 10357 + }, + { + "epoch": 0.8391121192482177, + "grad_norm": 0.03173130378127098, + "learning_rate": 0.0001756064629371259, + "loss": 0.341, + "step": 10358 + }, + { + "epoch": 0.8391931302657162, + "grad_norm": 0.03562208637595177, + "learning_rate": 0.00017560196228453128, + "loss": 0.2951, + "step": 10359 + }, + { + "epoch": 0.8392741412832145, + 
"grad_norm": 0.0299212709069252, + "learning_rate": 0.00017559746163193664, + "loss": 0.37, + "step": 10360 + }, + { + "epoch": 0.8393551523007129, + "grad_norm": 0.03546148166060448, + "learning_rate": 0.00017559296097934202, + "loss": 0.3246, + "step": 10361 + }, + { + "epoch": 0.8394361633182112, + "grad_norm": 0.027700217440724373, + "learning_rate": 0.00017558846032674738, + "loss": 0.2963, + "step": 10362 + }, + { + "epoch": 0.8395171743357096, + "grad_norm": 0.0363149456679821, + "learning_rate": 0.00017558395967415274, + "loss": 0.3417, + "step": 10363 + }, + { + "epoch": 0.8395981853532081, + "grad_norm": 0.032922759652137756, + "learning_rate": 0.00017557945902155813, + "loss": 0.3704, + "step": 10364 + }, + { + "epoch": 0.8396791963707064, + "grad_norm": 0.0318986251950264, + "learning_rate": 0.00017557495836896352, + "loss": 0.3125, + "step": 10365 + }, + { + "epoch": 0.8397602073882048, + "grad_norm": 0.03699880093336105, + "learning_rate": 0.00017557045771636888, + "loss": 0.3382, + "step": 10366 + }, + { + "epoch": 0.8398412184057031, + "grad_norm": 0.03683356195688248, + "learning_rate": 0.00017556595706377426, + "loss": 0.3457, + "step": 10367 + }, + { + "epoch": 0.8399222294232016, + "grad_norm": 0.0369548536837101, + "learning_rate": 0.00017556145641117962, + "loss": 0.3256, + "step": 10368 + }, + { + "epoch": 0.8400032404407, + "grad_norm": 0.03477805480360985, + "learning_rate": 0.00017555695575858498, + "loss": 0.3288, + "step": 10369 + }, + { + "epoch": 0.8400842514581983, + "grad_norm": 0.03249835595488548, + "learning_rate": 0.0001755524551059904, + "loss": 0.3157, + "step": 10370 + }, + { + "epoch": 0.8401652624756967, + "grad_norm": 0.035522498190402985, + "learning_rate": 0.00017554795445339576, + "loss": 0.3019, + "step": 10371 + }, + { + "epoch": 0.840246273493195, + "grad_norm": 0.039500270038843155, + "learning_rate": 0.00017554345380080112, + "loss": 0.3576, + "step": 10372 + }, + { + "epoch": 0.8403272845106935, + "grad_norm": 0.03279853239655495, + "learning_rate": 0.0001755389531482065, + "loss": 0.3433, + "step": 10373 + }, + { + "epoch": 0.8404082955281919, + "grad_norm": 0.03178710490465164, + "learning_rate": 0.00017553445249561187, + "loss": 0.3494, + "step": 10374 + }, + { + "epoch": 0.8404893065456902, + "grad_norm": 0.04652286320924759, + "learning_rate": 0.00017552995184301723, + "loss": 0.3443, + "step": 10375 + }, + { + "epoch": 0.8405703175631886, + "grad_norm": 0.03587408363819122, + "learning_rate": 0.00017552545119042264, + "loss": 0.366, + "step": 10376 + }, + { + "epoch": 0.8406513285806869, + "grad_norm": 0.03234461694955826, + "learning_rate": 0.000175520950537828, + "loss": 0.3429, + "step": 10377 + }, + { + "epoch": 0.8407323395981854, + "grad_norm": 0.028930282220244408, + "learning_rate": 0.00017551644988523336, + "loss": 0.252, + "step": 10378 + }, + { + "epoch": 0.8408133506156837, + "grad_norm": 0.03449437767267227, + "learning_rate": 0.00017551194923263875, + "loss": 0.3081, + "step": 10379 + }, + { + "epoch": 0.8408943616331821, + "grad_norm": 0.037620704621076584, + "learning_rate": 0.0001755074485800441, + "loss": 0.3037, + "step": 10380 + }, + { + "epoch": 0.8409753726506805, + "grad_norm": 0.035372767597436905, + "learning_rate": 0.00017550294792744947, + "loss": 0.3238, + "step": 10381 + }, + { + "epoch": 0.8410563836681789, + "grad_norm": 0.037627965211868286, + "learning_rate": 0.00017549844727485488, + "loss": 0.3703, + "step": 10382 + }, + { + "epoch": 0.8411373946856773, + "grad_norm": 0.037020884454250336, + 
"learning_rate": 0.00017549394662226024, + "loss": 0.3649, + "step": 10383 + }, + { + "epoch": 0.8412184057031756, + "grad_norm": 0.0348907932639122, + "learning_rate": 0.0001754894459696656, + "loss": 0.331, + "step": 10384 + }, + { + "epoch": 0.841299416720674, + "grad_norm": 0.0314050018787384, + "learning_rate": 0.000175484945317071, + "loss": 0.349, + "step": 10385 + }, + { + "epoch": 0.8413804277381723, + "grad_norm": 0.02945929765701294, + "learning_rate": 0.00017548044466447635, + "loss": 0.3174, + "step": 10386 + }, + { + "epoch": 0.8414614387556708, + "grad_norm": 0.03195718303322792, + "learning_rate": 0.0001754759440118817, + "loss": 0.3399, + "step": 10387 + }, + { + "epoch": 0.8415424497731692, + "grad_norm": 0.029619552195072174, + "learning_rate": 0.00017547144335928712, + "loss": 0.3145, + "step": 10388 + }, + { + "epoch": 0.8416234607906675, + "grad_norm": 0.033388081938028336, + "learning_rate": 0.00017546694270669248, + "loss": 0.371, + "step": 10389 + }, + { + "epoch": 0.8417044718081659, + "grad_norm": 0.034243419766426086, + "learning_rate": 0.00017546244205409784, + "loss": 0.3269, + "step": 10390 + }, + { + "epoch": 0.8417854828256643, + "grad_norm": 0.03303029388189316, + "learning_rate": 0.00017545794140150323, + "loss": 0.361, + "step": 10391 + }, + { + "epoch": 0.8418664938431627, + "grad_norm": 0.03691716119647026, + "learning_rate": 0.0001754534407489086, + "loss": 0.3715, + "step": 10392 + }, + { + "epoch": 0.8419475048606611, + "grad_norm": 0.03396623581647873, + "learning_rate": 0.00017544894009631395, + "loss": 0.3227, + "step": 10393 + }, + { + "epoch": 0.8420285158781594, + "grad_norm": 0.03716912865638733, + "learning_rate": 0.00017544443944371937, + "loss": 0.3289, + "step": 10394 + }, + { + "epoch": 0.8421095268956578, + "grad_norm": 0.038433950394392014, + "learning_rate": 0.00017543993879112473, + "loss": 0.3227, + "step": 10395 + }, + { + "epoch": 0.8421905379131562, + "grad_norm": 0.03316536918282509, + "learning_rate": 0.00017543543813853009, + "loss": 0.3503, + "step": 10396 + }, + { + "epoch": 0.8422715489306546, + "grad_norm": 0.03443404287099838, + "learning_rate": 0.00017543093748593547, + "loss": 0.3407, + "step": 10397 + }, + { + "epoch": 0.842352559948153, + "grad_norm": 0.032896075397729874, + "learning_rate": 0.00017542643683334083, + "loss": 0.356, + "step": 10398 + }, + { + "epoch": 0.8424335709656513, + "grad_norm": 0.03154495730996132, + "learning_rate": 0.00017542193618074622, + "loss": 0.2861, + "step": 10399 + }, + { + "epoch": 0.8425145819831497, + "grad_norm": 0.031888049095869064, + "learning_rate": 0.0001754174355281516, + "loss": 0.3499, + "step": 10400 + }, + { + "epoch": 0.8425955930006481, + "grad_norm": 0.035155389457941055, + "learning_rate": 0.00017541293487555697, + "loss": 0.3075, + "step": 10401 + }, + { + "epoch": 0.8426766040181465, + "grad_norm": 0.03185368701815605, + "learning_rate": 0.00017540843422296233, + "loss": 0.3378, + "step": 10402 + }, + { + "epoch": 0.8427576150356448, + "grad_norm": 0.03206276893615723, + "learning_rate": 0.00017540393357036771, + "loss": 0.2845, + "step": 10403 + }, + { + "epoch": 0.8428386260531432, + "grad_norm": 0.033448535948991776, + "learning_rate": 0.00017539943291777307, + "loss": 0.3136, + "step": 10404 + }, + { + "epoch": 0.8429196370706417, + "grad_norm": 0.03267358988523483, + "learning_rate": 0.00017539493226517846, + "loss": 0.2805, + "step": 10405 + }, + { + "epoch": 0.84300064808814, + "grad_norm": 0.031093450263142586, + "learning_rate": 0.00017539043161258385, + 
"loss": 0.3217, + "step": 10406 + }, + { + "epoch": 0.8430816591056384, + "grad_norm": 0.03250857815146446, + "learning_rate": 0.0001753859309599892, + "loss": 0.3299, + "step": 10407 + }, + { + "epoch": 0.8431626701231367, + "grad_norm": 0.030597645789384842, + "learning_rate": 0.00017538143030739457, + "loss": 0.2808, + "step": 10408 + }, + { + "epoch": 0.8432436811406351, + "grad_norm": 0.032161563634872437, + "learning_rate": 0.00017537692965479996, + "loss": 0.321, + "step": 10409 + }, + { + "epoch": 0.8433246921581335, + "grad_norm": 0.03388415649533272, + "learning_rate": 0.00017537242900220532, + "loss": 0.3399, + "step": 10410 + }, + { + "epoch": 0.8434057031756319, + "grad_norm": 0.038028065115213394, + "learning_rate": 0.0001753679283496107, + "loss": 0.3773, + "step": 10411 + }, + { + "epoch": 0.8434867141931303, + "grad_norm": 0.030943255871534348, + "learning_rate": 0.0001753634276970161, + "loss": 0.326, + "step": 10412 + }, + { + "epoch": 0.8435677252106286, + "grad_norm": 0.0353950560092926, + "learning_rate": 0.00017535892704442145, + "loss": 0.3228, + "step": 10413 + }, + { + "epoch": 0.843648736228127, + "grad_norm": 0.034129682928323746, + "learning_rate": 0.0001753544263918268, + "loss": 0.3608, + "step": 10414 + }, + { + "epoch": 0.8437297472456254, + "grad_norm": 0.031029662117362022, + "learning_rate": 0.0001753499257392322, + "loss": 0.3247, + "step": 10415 + }, + { + "epoch": 0.8438107582631238, + "grad_norm": 0.0346672423183918, + "learning_rate": 0.00017534542508663756, + "loss": 0.3631, + "step": 10416 + }, + { + "epoch": 0.8438917692806222, + "grad_norm": 0.03463343158364296, + "learning_rate": 0.00017534092443404294, + "loss": 0.3119, + "step": 10417 + }, + { + "epoch": 0.8439727802981205, + "grad_norm": 0.027508914470672607, + "learning_rate": 0.00017533642378144833, + "loss": 0.3191, + "step": 10418 + }, + { + "epoch": 0.844053791315619, + "grad_norm": 0.032727956771850586, + "learning_rate": 0.0001753319231288537, + "loss": 0.3611, + "step": 10419 + }, + { + "epoch": 0.8441348023331173, + "grad_norm": 0.038039255887269974, + "learning_rate": 0.00017532742247625905, + "loss": 0.3386, + "step": 10420 + }, + { + "epoch": 0.8442158133506157, + "grad_norm": 0.029720397666096687, + "learning_rate": 0.00017532292182366444, + "loss": 0.3256, + "step": 10421 + }, + { + "epoch": 0.844296824368114, + "grad_norm": 0.033293940126895905, + "learning_rate": 0.00017531842117106983, + "loss": 0.357, + "step": 10422 + }, + { + "epoch": 0.8443778353856124, + "grad_norm": 0.03200975060462952, + "learning_rate": 0.00017531392051847519, + "loss": 0.2927, + "step": 10423 + }, + { + "epoch": 0.8444588464031109, + "grad_norm": 0.034486331045627594, + "learning_rate": 0.00017530941986588057, + "loss": 0.3625, + "step": 10424 + }, + { + "epoch": 0.8445398574206092, + "grad_norm": 0.03146517276763916, + "learning_rate": 0.00017530491921328593, + "loss": 0.2829, + "step": 10425 + }, + { + "epoch": 0.8446208684381076, + "grad_norm": 0.03595639392733574, + "learning_rate": 0.0001753004185606913, + "loss": 0.3401, + "step": 10426 + }, + { + "epoch": 0.8447018794556059, + "grad_norm": 0.031279925256967545, + "learning_rate": 0.00017529591790809668, + "loss": 0.3248, + "step": 10427 + }, + { + "epoch": 0.8447828904731044, + "grad_norm": 0.02898409217596054, + "learning_rate": 0.00017529141725550207, + "loss": 0.3023, + "step": 10428 + }, + { + "epoch": 0.8448639014906028, + "grad_norm": 0.03126189485192299, + "learning_rate": 0.00017528691660290743, + "loss": 0.3171, + "step": 10429 + }, 
+ { + "epoch": 0.8449449125081011, + "grad_norm": 0.03708004206418991, + "learning_rate": 0.00017528241595031281, + "loss": 0.3685, + "step": 10430 + }, + { + "epoch": 0.8450259235255995, + "grad_norm": 0.03363499045372009, + "learning_rate": 0.00017527791529771817, + "loss": 0.3127, + "step": 10431 + }, + { + "epoch": 0.8451069345430978, + "grad_norm": 0.031809255480766296, + "learning_rate": 0.00017527341464512353, + "loss": 0.2751, + "step": 10432 + }, + { + "epoch": 0.8451879455605963, + "grad_norm": 0.03433963283896446, + "learning_rate": 0.00017526891399252892, + "loss": 0.3465, + "step": 10433 + }, + { + "epoch": 0.8452689565780946, + "grad_norm": 0.030995117500424385, + "learning_rate": 0.0001752644133399343, + "loss": 0.3213, + "step": 10434 + }, + { + "epoch": 0.845349967595593, + "grad_norm": 0.035280533134937286, + "learning_rate": 0.00017525991268733967, + "loss": 0.3344, + "step": 10435 + }, + { + "epoch": 0.8454309786130914, + "grad_norm": 0.030559923499822617, + "learning_rate": 0.00017525541203474506, + "loss": 0.3134, + "step": 10436 + }, + { + "epoch": 0.8455119896305897, + "grad_norm": 0.04106120392680168, + "learning_rate": 0.00017525091138215042, + "loss": 0.3434, + "step": 10437 + }, + { + "epoch": 0.8455930006480882, + "grad_norm": 0.0386023111641407, + "learning_rate": 0.00017524641072955578, + "loss": 0.3193, + "step": 10438 + }, + { + "epoch": 0.8456740116655865, + "grad_norm": 0.03373211249709129, + "learning_rate": 0.00017524191007696116, + "loss": 0.3165, + "step": 10439 + }, + { + "epoch": 0.8457550226830849, + "grad_norm": 0.03274044767022133, + "learning_rate": 0.00017523740942436655, + "loss": 0.3494, + "step": 10440 + }, + { + "epoch": 0.8458360337005832, + "grad_norm": 0.036142315715551376, + "learning_rate": 0.0001752329087717719, + "loss": 0.324, + "step": 10441 + }, + { + "epoch": 0.8459170447180817, + "grad_norm": 0.03499079868197441, + "learning_rate": 0.0001752284081191773, + "loss": 0.3474, + "step": 10442 + }, + { + "epoch": 0.8459980557355801, + "grad_norm": 0.03600572049617767, + "learning_rate": 0.00017522390746658266, + "loss": 0.3471, + "step": 10443 + }, + { + "epoch": 0.8460790667530784, + "grad_norm": 0.031942158937454224, + "learning_rate": 0.00017521940681398802, + "loss": 0.3209, + "step": 10444 + }, + { + "epoch": 0.8461600777705768, + "grad_norm": 0.03473243489861488, + "learning_rate": 0.0001752149061613934, + "loss": 0.306, + "step": 10445 + }, + { + "epoch": 0.8462410887880751, + "grad_norm": 0.033768992871046066, + "learning_rate": 0.0001752104055087988, + "loss": 0.354, + "step": 10446 + }, + { + "epoch": 0.8463220998055736, + "grad_norm": 0.03379128873348236, + "learning_rate": 0.00017520590485620415, + "loss": 0.3339, + "step": 10447 + }, + { + "epoch": 0.846403110823072, + "grad_norm": 0.035574495792388916, + "learning_rate": 0.00017520140420360954, + "loss": 0.3418, + "step": 10448 + }, + { + "epoch": 0.8464841218405703, + "grad_norm": 0.03196391090750694, + "learning_rate": 0.0001751969035510149, + "loss": 0.3601, + "step": 10449 + }, + { + "epoch": 0.8465651328580687, + "grad_norm": 0.03568727895617485, + "learning_rate": 0.00017519240289842026, + "loss": 0.3679, + "step": 10450 + }, + { + "epoch": 0.846646143875567, + "grad_norm": 0.034467652440071106, + "learning_rate": 0.00017518790224582567, + "loss": 0.3713, + "step": 10451 + }, + { + "epoch": 0.8467271548930655, + "grad_norm": 0.029677096754312515, + "learning_rate": 0.00017518340159323103, + "loss": 0.3436, + "step": 10452 + }, + { + "epoch": 0.8468081659105638, + 
"grad_norm": 0.03185281902551651, + "learning_rate": 0.0001751789009406364, + "loss": 0.3117, + "step": 10453 + }, + { + "epoch": 0.8468891769280622, + "grad_norm": 0.0332181490957737, + "learning_rate": 0.00017517440028804178, + "loss": 0.3082, + "step": 10454 + }, + { + "epoch": 0.8469701879455606, + "grad_norm": 0.034928590059280396, + "learning_rate": 0.00017516989963544714, + "loss": 0.284, + "step": 10455 + }, + { + "epoch": 0.847051198963059, + "grad_norm": 0.036926280707120895, + "learning_rate": 0.0001751653989828525, + "loss": 0.3399, + "step": 10456 + }, + { + "epoch": 0.8471322099805574, + "grad_norm": 0.033530738204717636, + "learning_rate": 0.00017516089833025792, + "loss": 0.3319, + "step": 10457 + }, + { + "epoch": 0.8472132209980557, + "grad_norm": 0.03474058955907822, + "learning_rate": 0.00017515639767766328, + "loss": 0.3459, + "step": 10458 + }, + { + "epoch": 0.8472942320155541, + "grad_norm": 0.03582526743412018, + "learning_rate": 0.00017515189702506864, + "loss": 0.3388, + "step": 10459 + }, + { + "epoch": 0.8473752430330524, + "grad_norm": 0.030622687190771103, + "learning_rate": 0.00017514739637247402, + "loss": 0.2943, + "step": 10460 + }, + { + "epoch": 0.8474562540505509, + "grad_norm": 0.03136473521590233, + "learning_rate": 0.00017514289571987938, + "loss": 0.3152, + "step": 10461 + }, + { + "epoch": 0.8475372650680493, + "grad_norm": 0.03528090938925743, + "learning_rate": 0.00017513839506728474, + "loss": 0.3494, + "step": 10462 + }, + { + "epoch": 0.8476182760855476, + "grad_norm": 0.03588859736919403, + "learning_rate": 0.00017513389441469016, + "loss": 0.3291, + "step": 10463 + }, + { + "epoch": 0.847699287103046, + "grad_norm": 0.031587012112140656, + "learning_rate": 0.00017512939376209552, + "loss": 0.3038, + "step": 10464 + }, + { + "epoch": 0.8477802981205443, + "grad_norm": 0.03372107073664665, + "learning_rate": 0.00017512489310950088, + "loss": 0.3606, + "step": 10465 + }, + { + "epoch": 0.8478613091380428, + "grad_norm": 0.03304535523056984, + "learning_rate": 0.00017512039245690626, + "loss": 0.3513, + "step": 10466 + }, + { + "epoch": 0.8479423201555412, + "grad_norm": 0.03352010250091553, + "learning_rate": 0.00017511589180431162, + "loss": 0.3307, + "step": 10467 + }, + { + "epoch": 0.8480233311730395, + "grad_norm": 0.03563792631030083, + "learning_rate": 0.00017511139115171698, + "loss": 0.2981, + "step": 10468 + }, + { + "epoch": 0.8481043421905379, + "grad_norm": 0.031774427741765976, + "learning_rate": 0.0001751068904991224, + "loss": 0.3627, + "step": 10469 + }, + { + "epoch": 0.8481853532080363, + "grad_norm": 0.030490538105368614, + "learning_rate": 0.00017510238984652776, + "loss": 0.3148, + "step": 10470 + }, + { + "epoch": 0.8482663642255347, + "grad_norm": 0.03514517843723297, + "learning_rate": 0.00017509788919393312, + "loss": 0.3667, + "step": 10471 + }, + { + "epoch": 0.848347375243033, + "grad_norm": 0.035855576395988464, + "learning_rate": 0.0001750933885413385, + "loss": 0.3683, + "step": 10472 + }, + { + "epoch": 0.8484283862605314, + "grad_norm": 0.03173260763287544, + "learning_rate": 0.00017508888788874387, + "loss": 0.3434, + "step": 10473 + }, + { + "epoch": 0.8485093972780298, + "grad_norm": 0.029711991548538208, + "learning_rate": 0.00017508438723614925, + "loss": 0.3222, + "step": 10474 + }, + { + "epoch": 0.8485904082955282, + "grad_norm": 0.038392074406147, + "learning_rate": 0.00017507988658355464, + "loss": 0.329, + "step": 10475 + }, + { + "epoch": 0.8486714193130266, + "grad_norm": 0.03413744643330574, + 
"learning_rate": 0.00017507538593096, + "loss": 0.2911, + "step": 10476 + }, + { + "epoch": 0.8487524303305249, + "grad_norm": 0.03350900858640671, + "learning_rate": 0.00017507088527836536, + "loss": 0.3408, + "step": 10477 + }, + { + "epoch": 0.8488334413480233, + "grad_norm": 0.03474394232034683, + "learning_rate": 0.00017506638462577075, + "loss": 0.3367, + "step": 10478 + }, + { + "epoch": 0.8489144523655218, + "grad_norm": 0.032444316893815994, + "learning_rate": 0.0001750618839731761, + "loss": 0.3016, + "step": 10479 + }, + { + "epoch": 0.8489954633830201, + "grad_norm": 0.035130154341459274, + "learning_rate": 0.0001750573833205815, + "loss": 0.3083, + "step": 10480 + }, + { + "epoch": 0.8490764744005185, + "grad_norm": 0.03369656205177307, + "learning_rate": 0.00017505288266798688, + "loss": 0.3756, + "step": 10481 + }, + { + "epoch": 0.8491574854180168, + "grad_norm": 0.03448232635855675, + "learning_rate": 0.00017504838201539224, + "loss": 0.3186, + "step": 10482 + }, + { + "epoch": 0.8492384964355152, + "grad_norm": 0.031114330515265465, + "learning_rate": 0.0001750438813627976, + "loss": 0.3067, + "step": 10483 + }, + { + "epoch": 0.8493195074530137, + "grad_norm": 0.039805684238672256, + "learning_rate": 0.000175039380710203, + "loss": 0.3406, + "step": 10484 + }, + { + "epoch": 0.849400518470512, + "grad_norm": 0.03329307585954666, + "learning_rate": 0.00017503488005760835, + "loss": 0.3063, + "step": 10485 + }, + { + "epoch": 0.8494815294880104, + "grad_norm": 0.03293229639530182, + "learning_rate": 0.00017503037940501374, + "loss": 0.302, + "step": 10486 + }, + { + "epoch": 0.8495625405055087, + "grad_norm": 0.03178049623966217, + "learning_rate": 0.00017502587875241912, + "loss": 0.3079, + "step": 10487 + }, + { + "epoch": 0.8496435515230071, + "grad_norm": 0.04478497803211212, + "learning_rate": 0.00017502137809982448, + "loss": 0.3592, + "step": 10488 + }, + { + "epoch": 0.8497245625405055, + "grad_norm": 0.03788404539227486, + "learning_rate": 0.00017501687744722984, + "loss": 0.3441, + "step": 10489 + }, + { + "epoch": 0.8498055735580039, + "grad_norm": 0.031607985496520996, + "learning_rate": 0.00017501237679463523, + "loss": 0.3114, + "step": 10490 + }, + { + "epoch": 0.8498865845755023, + "grad_norm": 0.03063138946890831, + "learning_rate": 0.0001750078761420406, + "loss": 0.2909, + "step": 10491 + }, + { + "epoch": 0.8499675955930006, + "grad_norm": 0.030662069097161293, + "learning_rate": 0.00017500337548944598, + "loss": 0.3031, + "step": 10492 + }, + { + "epoch": 0.8500486066104991, + "grad_norm": 0.031277742236852646, + "learning_rate": 0.00017499887483685137, + "loss": 0.3337, + "step": 10493 + }, + { + "epoch": 0.8501296176279974, + "grad_norm": 0.029651375487446785, + "learning_rate": 0.00017499437418425673, + "loss": 0.3225, + "step": 10494 + }, + { + "epoch": 0.8502106286454958, + "grad_norm": 0.03209708631038666, + "learning_rate": 0.00017498987353166209, + "loss": 0.2684, + "step": 10495 + }, + { + "epoch": 0.8502916396629941, + "grad_norm": 0.03412383422255516, + "learning_rate": 0.00017498537287906747, + "loss": 0.3399, + "step": 10496 + }, + { + "epoch": 0.8503726506804925, + "grad_norm": 0.037482064217329025, + "learning_rate": 0.00017498087222647283, + "loss": 0.3399, + "step": 10497 + }, + { + "epoch": 0.850453661697991, + "grad_norm": 0.030939724296331406, + "learning_rate": 0.00017497637157387822, + "loss": 0.3098, + "step": 10498 + }, + { + "epoch": 0.8505346727154893, + "grad_norm": 0.034733328968286514, + "learning_rate": 
0.0001749718709212836, + "loss": 0.3347, + "step": 10499 + }, + { + "epoch": 0.8506156837329877, + "grad_norm": 0.027981078252196312, + "learning_rate": 0.00017496737026868897, + "loss": 0.2766, + "step": 10500 + }, + { + "epoch": 0.850696694750486, + "grad_norm": 0.03607220947742462, + "learning_rate": 0.00017496286961609433, + "loss": 0.3655, + "step": 10501 + }, + { + "epoch": 0.8507777057679844, + "grad_norm": 0.030028050765395164, + "learning_rate": 0.00017495836896349971, + "loss": 0.3004, + "step": 10502 + }, + { + "epoch": 0.8508587167854829, + "grad_norm": 0.03431127220392227, + "learning_rate": 0.0001749538683109051, + "loss": 0.2987, + "step": 10503 + }, + { + "epoch": 0.8509397278029812, + "grad_norm": 0.035340793430805206, + "learning_rate": 0.00017494936765831046, + "loss": 0.3555, + "step": 10504 + }, + { + "epoch": 0.8510207388204796, + "grad_norm": 0.034876782447099686, + "learning_rate": 0.00017494486700571585, + "loss": 0.3196, + "step": 10505 + }, + { + "epoch": 0.8511017498379779, + "grad_norm": 0.03613532707095146, + "learning_rate": 0.0001749403663531212, + "loss": 0.2934, + "step": 10506 + }, + { + "epoch": 0.8511827608554764, + "grad_norm": 0.03943546861410141, + "learning_rate": 0.00017493586570052657, + "loss": 0.3606, + "step": 10507 + }, + { + "epoch": 0.8512637718729748, + "grad_norm": 0.028644273057579994, + "learning_rate": 0.00017493136504793196, + "loss": 0.2971, + "step": 10508 + }, + { + "epoch": 0.8513447828904731, + "grad_norm": 0.03636188432574272, + "learning_rate": 0.00017492686439533734, + "loss": 0.3565, + "step": 10509 + }, + { + "epoch": 0.8514257939079715, + "grad_norm": 0.03777918219566345, + "learning_rate": 0.0001749223637427427, + "loss": 0.3487, + "step": 10510 + }, + { + "epoch": 0.8515068049254698, + "grad_norm": 0.03311832621693611, + "learning_rate": 0.0001749178630901481, + "loss": 0.3344, + "step": 10511 + }, + { + "epoch": 0.8515878159429683, + "grad_norm": 0.037984661757946014, + "learning_rate": 0.00017491336243755345, + "loss": 0.3333, + "step": 10512 + }, + { + "epoch": 0.8516688269604666, + "grad_norm": 0.034046683460474014, + "learning_rate": 0.0001749088617849588, + "loss": 0.3901, + "step": 10513 + }, + { + "epoch": 0.851749837977965, + "grad_norm": 0.03571656718850136, + "learning_rate": 0.0001749043611323642, + "loss": 0.335, + "step": 10514 + }, + { + "epoch": 0.8518308489954634, + "grad_norm": 0.030205728486180305, + "learning_rate": 0.00017489986047976958, + "loss": 0.3063, + "step": 10515 + }, + { + "epoch": 0.8519118600129617, + "grad_norm": 0.029302062466740608, + "learning_rate": 0.00017489535982717494, + "loss": 0.2921, + "step": 10516 + }, + { + "epoch": 0.8519928710304602, + "grad_norm": 0.035452794283628464, + "learning_rate": 0.00017489085917458033, + "loss": 0.3215, + "step": 10517 + }, + { + "epoch": 0.8520738820479585, + "grad_norm": 0.02976333722472191, + "learning_rate": 0.0001748863585219857, + "loss": 0.2924, + "step": 10518 + }, + { + "epoch": 0.8521548930654569, + "grad_norm": 0.03485938161611557, + "learning_rate": 0.00017488185786939105, + "loss": 0.3418, + "step": 10519 + }, + { + "epoch": 0.8522359040829552, + "grad_norm": 0.03175254538655281, + "learning_rate": 0.00017487735721679644, + "loss": 0.3056, + "step": 10520 + }, + { + "epoch": 0.8523169151004537, + "grad_norm": 0.029901329427957535, + "learning_rate": 0.00017487285656420183, + "loss": 0.2967, + "step": 10521 + }, + { + "epoch": 0.8523979261179521, + "grad_norm": 0.029504723846912384, + "learning_rate": 0.00017486835591160719, + "loss": 
0.2951, + "step": 10522 + }, + { + "epoch": 0.8524789371354504, + "grad_norm": 0.03407463803887367, + "learning_rate": 0.00017486385525901257, + "loss": 0.3007, + "step": 10523 + }, + { + "epoch": 0.8525599481529488, + "grad_norm": 0.029503922909498215, + "learning_rate": 0.00017485935460641793, + "loss": 0.2701, + "step": 10524 + }, + { + "epoch": 0.8526409591704471, + "grad_norm": 0.03525906056165695, + "learning_rate": 0.0001748548539538233, + "loss": 0.3396, + "step": 10525 + }, + { + "epoch": 0.8527219701879456, + "grad_norm": 0.035471223294734955, + "learning_rate": 0.0001748503533012287, + "loss": 0.3647, + "step": 10526 + }, + { + "epoch": 0.852802981205444, + "grad_norm": 0.031278256326913834, + "learning_rate": 0.00017484585264863407, + "loss": 0.2727, + "step": 10527 + }, + { + "epoch": 0.8528839922229423, + "grad_norm": 0.0368519127368927, + "learning_rate": 0.00017484135199603943, + "loss": 0.2909, + "step": 10528 + }, + { + "epoch": 0.8529650032404407, + "grad_norm": 0.034553319215774536, + "learning_rate": 0.00017483685134344482, + "loss": 0.3483, + "step": 10529 + }, + { + "epoch": 0.8530460142579391, + "grad_norm": 0.03882437199354172, + "learning_rate": 0.00017483235069085018, + "loss": 0.3794, + "step": 10530 + }, + { + "epoch": 0.8531270252754375, + "grad_norm": 0.029881006106734276, + "learning_rate": 0.00017482785003825554, + "loss": 0.3027, + "step": 10531 + }, + { + "epoch": 0.8532080362929358, + "grad_norm": 0.03130098059773445, + "learning_rate": 0.00017482334938566095, + "loss": 0.3028, + "step": 10532 + }, + { + "epoch": 0.8532890473104342, + "grad_norm": 0.032947756350040436, + "learning_rate": 0.0001748188487330663, + "loss": 0.328, + "step": 10533 + }, + { + "epoch": 0.8533700583279326, + "grad_norm": 0.03265698254108429, + "learning_rate": 0.00017481434808047167, + "loss": 0.289, + "step": 10534 + }, + { + "epoch": 0.853451069345431, + "grad_norm": 0.0320311076939106, + "learning_rate": 0.00017480984742787706, + "loss": 0.3247, + "step": 10535 + }, + { + "epoch": 0.8535320803629294, + "grad_norm": 0.03273462876677513, + "learning_rate": 0.00017480534677528242, + "loss": 0.303, + "step": 10536 + }, + { + "epoch": 0.8536130913804277, + "grad_norm": 0.03394000604748726, + "learning_rate": 0.00017480084612268778, + "loss": 0.3492, + "step": 10537 + }, + { + "epoch": 0.8536941023979261, + "grad_norm": 0.03348904103040695, + "learning_rate": 0.0001747963454700932, + "loss": 0.3686, + "step": 10538 + }, + { + "epoch": 0.8537751134154244, + "grad_norm": 0.03144761547446251, + "learning_rate": 0.00017479184481749855, + "loss": 0.2992, + "step": 10539 + }, + { + "epoch": 0.8538561244329229, + "grad_norm": 0.03692207485437393, + "learning_rate": 0.0001747873441649039, + "loss": 0.3173, + "step": 10540 + }, + { + "epoch": 0.8539371354504213, + "grad_norm": 0.032168686389923096, + "learning_rate": 0.0001747828435123093, + "loss": 0.2916, + "step": 10541 + }, + { + "epoch": 0.8540181464679196, + "grad_norm": 0.03592012822628021, + "learning_rate": 0.00017477834285971466, + "loss": 0.3574, + "step": 10542 + }, + { + "epoch": 0.854099157485418, + "grad_norm": 0.03215634450316429, + "learning_rate": 0.00017477384220712002, + "loss": 0.3042, + "step": 10543 + }, + { + "epoch": 0.8541801685029164, + "grad_norm": 0.030979136005043983, + "learning_rate": 0.00017476934155452543, + "loss": 0.312, + "step": 10544 + }, + { + "epoch": 0.8542611795204148, + "grad_norm": 0.03511403873562813, + "learning_rate": 0.0001747648409019308, + "loss": 0.3502, + "step": 10545 + }, + { + "epoch": 
0.8543421905379132, + "grad_norm": 0.03832389414310455, + "learning_rate": 0.00017476034024933615, + "loss": 0.3195, + "step": 10546 + }, + { + "epoch": 0.8544232015554115, + "grad_norm": 0.036246124655008316, + "learning_rate": 0.00017475583959674154, + "loss": 0.3676, + "step": 10547 + }, + { + "epoch": 0.8545042125729099, + "grad_norm": 0.0348796620965004, + "learning_rate": 0.0001747513389441469, + "loss": 0.3575, + "step": 10548 + }, + { + "epoch": 0.8545852235904083, + "grad_norm": 0.031146438792347908, + "learning_rate": 0.00017474683829155226, + "loss": 0.3296, + "step": 10549 + }, + { + "epoch": 0.8546662346079067, + "grad_norm": 0.03914908692240715, + "learning_rate": 0.00017474233763895767, + "loss": 0.3367, + "step": 10550 + }, + { + "epoch": 0.854747245625405, + "grad_norm": 0.045957714319229126, + "learning_rate": 0.00017473783698636303, + "loss": 0.4161, + "step": 10551 + }, + { + "epoch": 0.8548282566429034, + "grad_norm": 0.03508540242910385, + "learning_rate": 0.0001747333363337684, + "loss": 0.3005, + "step": 10552 + }, + { + "epoch": 0.8549092676604018, + "grad_norm": 0.03551001101732254, + "learning_rate": 0.00017472883568117378, + "loss": 0.3527, + "step": 10553 + }, + { + "epoch": 0.8549902786779002, + "grad_norm": 0.03131183981895447, + "learning_rate": 0.00017472433502857914, + "loss": 0.3136, + "step": 10554 + }, + { + "epoch": 0.8550712896953986, + "grad_norm": 0.02973882667720318, + "learning_rate": 0.00017471983437598453, + "loss": 0.2885, + "step": 10555 + }, + { + "epoch": 0.8551523007128969, + "grad_norm": 0.03016551584005356, + "learning_rate": 0.00017471533372338992, + "loss": 0.3037, + "step": 10556 + }, + { + "epoch": 0.8552333117303953, + "grad_norm": 0.033437248319387436, + "learning_rate": 0.00017471083307079528, + "loss": 0.3456, + "step": 10557 + }, + { + "epoch": 0.8553143227478938, + "grad_norm": 0.03692424297332764, + "learning_rate": 0.00017470633241820064, + "loss": 0.3644, + "step": 10558 + }, + { + "epoch": 0.8553953337653921, + "grad_norm": 0.03292136266827583, + "learning_rate": 0.00017470183176560602, + "loss": 0.3453, + "step": 10559 + }, + { + "epoch": 0.8554763447828905, + "grad_norm": 0.03032582625746727, + "learning_rate": 0.00017469733111301138, + "loss": 0.3173, + "step": 10560 + }, + { + "epoch": 0.8555573558003888, + "grad_norm": 0.03443504869937897, + "learning_rate": 0.00017469283046041677, + "loss": 0.349, + "step": 10561 + }, + { + "epoch": 0.8556383668178872, + "grad_norm": 0.03508608415722847, + "learning_rate": 0.00017468832980782216, + "loss": 0.3351, + "step": 10562 + }, + { + "epoch": 0.8557193778353857, + "grad_norm": 0.03486845642328262, + "learning_rate": 0.00017468382915522752, + "loss": 0.3078, + "step": 10563 + }, + { + "epoch": 0.855800388852884, + "grad_norm": 0.036465246230363846, + "learning_rate": 0.00017467932850263288, + "loss": 0.4078, + "step": 10564 + }, + { + "epoch": 0.8558813998703824, + "grad_norm": 0.03374059870839119, + "learning_rate": 0.00017467482785003826, + "loss": 0.3448, + "step": 10565 + }, + { + "epoch": 0.8559624108878807, + "grad_norm": 0.03241916373372078, + "learning_rate": 0.00017467032719744362, + "loss": 0.2998, + "step": 10566 + }, + { + "epoch": 0.8560434219053791, + "grad_norm": 0.029919980093836784, + "learning_rate": 0.000174665826544849, + "loss": 0.3075, + "step": 10567 + }, + { + "epoch": 0.8561244329228775, + "grad_norm": 0.037165362387895584, + "learning_rate": 0.0001746613258922544, + "loss": 0.437, + "step": 10568 + }, + { + "epoch": 0.8562054439403759, + "grad_norm": 
0.03804047033190727, + "learning_rate": 0.00017465682523965976, + "loss": 0.3562, + "step": 10569 + }, + { + "epoch": 0.8562864549578743, + "grad_norm": 0.029782762750983238, + "learning_rate": 0.00017465232458706512, + "loss": 0.3238, + "step": 10570 + }, + { + "epoch": 0.8563674659753726, + "grad_norm": 0.03304661065340042, + "learning_rate": 0.0001746478239344705, + "loss": 0.3193, + "step": 10571 + }, + { + "epoch": 0.8564484769928711, + "grad_norm": 0.03296075761318207, + "learning_rate": 0.00017464332328187587, + "loss": 0.3256, + "step": 10572 + }, + { + "epoch": 0.8565294880103694, + "grad_norm": 0.0316382572054863, + "learning_rate": 0.00017463882262928125, + "loss": 0.3024, + "step": 10573 + }, + { + "epoch": 0.8566104990278678, + "grad_norm": 0.032189108431339264, + "learning_rate": 0.00017463432197668664, + "loss": 0.3169, + "step": 10574 + }, + { + "epoch": 0.8566915100453661, + "grad_norm": 0.03579109162092209, + "learning_rate": 0.000174629821324092, + "loss": 0.3472, + "step": 10575 + }, + { + "epoch": 0.8567725210628645, + "grad_norm": 0.037844136357307434, + "learning_rate": 0.00017462532067149736, + "loss": 0.305, + "step": 10576 + }, + { + "epoch": 0.856853532080363, + "grad_norm": 0.030680108815431595, + "learning_rate": 0.00017462082001890275, + "loss": 0.3346, + "step": 10577 + }, + { + "epoch": 0.8569345430978613, + "grad_norm": 0.030500588938593864, + "learning_rate": 0.0001746163193663081, + "loss": 0.3188, + "step": 10578 + }, + { + "epoch": 0.8570155541153597, + "grad_norm": 0.031004825606942177, + "learning_rate": 0.0001746118187137135, + "loss": 0.3341, + "step": 10579 + }, + { + "epoch": 0.857096565132858, + "grad_norm": 0.038353081792593, + "learning_rate": 0.00017460731806111888, + "loss": 0.3641, + "step": 10580 + }, + { + "epoch": 0.8571775761503565, + "grad_norm": 0.03716079890727997, + "learning_rate": 0.00017460281740852424, + "loss": 0.3545, + "step": 10581 + }, + { + "epoch": 0.8572585871678549, + "grad_norm": 0.038967959582805634, + "learning_rate": 0.0001745983167559296, + "loss": 0.3494, + "step": 10582 + }, + { + "epoch": 0.8573395981853532, + "grad_norm": 0.032367292791604996, + "learning_rate": 0.000174593816103335, + "loss": 0.3105, + "step": 10583 + }, + { + "epoch": 0.8574206092028516, + "grad_norm": 0.03238913044333458, + "learning_rate": 0.00017458931545074038, + "loss": 0.3067, + "step": 10584 + }, + { + "epoch": 0.8575016202203499, + "grad_norm": 0.03460938110947609, + "learning_rate": 0.00017458481479814574, + "loss": 0.3403, + "step": 10585 + }, + { + "epoch": 0.8575826312378484, + "grad_norm": 0.031412165611982346, + "learning_rate": 0.00017458031414555112, + "loss": 0.3467, + "step": 10586 + }, + { + "epoch": 0.8576636422553467, + "grad_norm": 0.03444996848702431, + "learning_rate": 0.00017457581349295648, + "loss": 0.3632, + "step": 10587 + }, + { + "epoch": 0.8577446532728451, + "grad_norm": 0.035433437675237656, + "learning_rate": 0.00017457131284036184, + "loss": 0.3549, + "step": 10588 + }, + { + "epoch": 0.8578256642903435, + "grad_norm": 0.03140580281615257, + "learning_rate": 0.00017456681218776723, + "loss": 0.3354, + "step": 10589 + }, + { + "epoch": 0.8579066753078418, + "grad_norm": 0.03797769546508789, + "learning_rate": 0.00017456231153517262, + "loss": 0.3699, + "step": 10590 + }, + { + "epoch": 0.8579876863253403, + "grad_norm": 0.03754464536905289, + "learning_rate": 0.00017455781088257798, + "loss": 0.3747, + "step": 10591 + }, + { + "epoch": 0.8580686973428386, + "grad_norm": 0.04102705046534538, + "learning_rate": 
0.00017455331022998337, + "loss": 0.3145, + "step": 10592 + }, + { + "epoch": 0.858149708360337, + "grad_norm": 0.02819933369755745, + "learning_rate": 0.00017454880957738873, + "loss": 0.3025, + "step": 10593 + }, + { + "epoch": 0.8582307193778353, + "grad_norm": 0.03205656632781029, + "learning_rate": 0.00017454430892479409, + "loss": 0.3274, + "step": 10594 + }, + { + "epoch": 0.8583117303953338, + "grad_norm": 0.03971799835562706, + "learning_rate": 0.00017453980827219947, + "loss": 0.3275, + "step": 10595 + }, + { + "epoch": 0.8583927414128322, + "grad_norm": 0.03544100001454353, + "learning_rate": 0.00017453530761960486, + "loss": 0.3637, + "step": 10596 + }, + { + "epoch": 0.8584737524303305, + "grad_norm": 0.03206910938024521, + "learning_rate": 0.00017453080696701022, + "loss": 0.3333, + "step": 10597 + }, + { + "epoch": 0.8585547634478289, + "grad_norm": 0.03851524367928505, + "learning_rate": 0.0001745263063144156, + "loss": 0.3343, + "step": 10598 + }, + { + "epoch": 0.8586357744653272, + "grad_norm": 0.03511196747422218, + "learning_rate": 0.00017452180566182097, + "loss": 0.3406, + "step": 10599 + }, + { + "epoch": 0.8587167854828257, + "grad_norm": 0.033691514283418655, + "learning_rate": 0.00017451730500922633, + "loss": 0.3267, + "step": 10600 + }, + { + "epoch": 0.8587977965003241, + "grad_norm": 0.03204326704144478, + "learning_rate": 0.00017451280435663171, + "loss": 0.3155, + "step": 10601 + }, + { + "epoch": 0.8588788075178224, + "grad_norm": 0.033211641013622284, + "learning_rate": 0.0001745083037040371, + "loss": 0.3562, + "step": 10602 + }, + { + "epoch": 0.8589598185353208, + "grad_norm": 0.03935149312019348, + "learning_rate": 0.00017450380305144246, + "loss": 0.3606, + "step": 10603 + }, + { + "epoch": 0.8590408295528191, + "grad_norm": 0.034088894724845886, + "learning_rate": 0.00017449930239884785, + "loss": 0.3354, + "step": 10604 + }, + { + "epoch": 0.8591218405703176, + "grad_norm": 0.03058660961687565, + "learning_rate": 0.0001744948017462532, + "loss": 0.295, + "step": 10605 + }, + { + "epoch": 0.859202851587816, + "grad_norm": 0.03271281719207764, + "learning_rate": 0.00017449030109365857, + "loss": 0.2943, + "step": 10606 + }, + { + "epoch": 0.8592838626053143, + "grad_norm": 0.036279551684856415, + "learning_rate": 0.00017448580044106398, + "loss": 0.3883, + "step": 10607 + }, + { + "epoch": 0.8593648736228127, + "grad_norm": 0.03347590193152428, + "learning_rate": 0.00017448129978846934, + "loss": 0.3241, + "step": 10608 + }, + { + "epoch": 0.8594458846403111, + "grad_norm": 0.03564797714352608, + "learning_rate": 0.0001744767991358747, + "loss": 0.3127, + "step": 10609 + }, + { + "epoch": 0.8595268956578095, + "grad_norm": 0.03396541625261307, + "learning_rate": 0.0001744722984832801, + "loss": 0.3016, + "step": 10610 + }, + { + "epoch": 0.8596079066753078, + "grad_norm": 0.028714140877127647, + "learning_rate": 0.00017446779783068545, + "loss": 0.2933, + "step": 10611 + }, + { + "epoch": 0.8596889176928062, + "grad_norm": 0.030577784404158592, + "learning_rate": 0.0001744632971780908, + "loss": 0.3014, + "step": 10612 + }, + { + "epoch": 0.8597699287103046, + "grad_norm": 0.03310084715485573, + "learning_rate": 0.00017445879652549622, + "loss": 0.316, + "step": 10613 + }, + { + "epoch": 0.859850939727803, + "grad_norm": 0.03821340203285217, + "learning_rate": 0.00017445429587290158, + "loss": 0.3858, + "step": 10614 + }, + { + "epoch": 0.8599319507453014, + "grad_norm": 0.03301851078867912, + "learning_rate": 0.00017444979522030694, + "loss": 
0.3136, + "step": 10615 + }, + { + "epoch": 0.8600129617627997, + "grad_norm": 0.03597554191946983, + "learning_rate": 0.00017444529456771233, + "loss": 0.3253, + "step": 10616 + }, + { + "epoch": 0.8600939727802981, + "grad_norm": 0.03361279517412186, + "learning_rate": 0.0001744407939151177, + "loss": 0.3394, + "step": 10617 + }, + { + "epoch": 0.8601749837977966, + "grad_norm": 0.03018251433968544, + "learning_rate": 0.00017443629326252305, + "loss": 0.3303, + "step": 10618 + }, + { + "epoch": 0.8602559948152949, + "grad_norm": 0.03364105150103569, + "learning_rate": 0.00017443179260992847, + "loss": 0.3443, + "step": 10619 + }, + { + "epoch": 0.8603370058327933, + "grad_norm": 0.03645896166563034, + "learning_rate": 0.00017442729195733383, + "loss": 0.3852, + "step": 10620 + }, + { + "epoch": 0.8604180168502916, + "grad_norm": 0.03501100465655327, + "learning_rate": 0.0001744227913047392, + "loss": 0.3107, + "step": 10621 + }, + { + "epoch": 0.86049902786779, + "grad_norm": 0.034452538937330246, + "learning_rate": 0.00017441829065214457, + "loss": 0.3213, + "step": 10622 + }, + { + "epoch": 0.8605800388852884, + "grad_norm": 0.030402205884456635, + "learning_rate": 0.00017441378999954993, + "loss": 0.2975, + "step": 10623 + }, + { + "epoch": 0.8606610499027868, + "grad_norm": 0.03308364003896713, + "learning_rate": 0.0001744092893469553, + "loss": 0.3142, + "step": 10624 + }, + { + "epoch": 0.8607420609202852, + "grad_norm": 0.0385962538421154, + "learning_rate": 0.0001744047886943607, + "loss": 0.3829, + "step": 10625 + }, + { + "epoch": 0.8608230719377835, + "grad_norm": 0.034132879227399826, + "learning_rate": 0.00017440028804176607, + "loss": 0.346, + "step": 10626 + }, + { + "epoch": 0.8609040829552819, + "grad_norm": 0.039486926048994064, + "learning_rate": 0.00017439578738917143, + "loss": 0.3325, + "step": 10627 + }, + { + "epoch": 0.8609850939727803, + "grad_norm": 0.03526204451918602, + "learning_rate": 0.00017439128673657682, + "loss": 0.3385, + "step": 10628 + }, + { + "epoch": 0.8610661049902787, + "grad_norm": 0.03386912867426872, + "learning_rate": 0.00017438678608398218, + "loss": 0.3401, + "step": 10629 + }, + { + "epoch": 0.861147116007777, + "grad_norm": 0.03569406270980835, + "learning_rate": 0.00017438228543138754, + "loss": 0.3134, + "step": 10630 + }, + { + "epoch": 0.8612281270252754, + "grad_norm": 0.031888384371995926, + "learning_rate": 0.00017437778477879295, + "loss": 0.3116, + "step": 10631 + }, + { + "epoch": 0.8613091380427739, + "grad_norm": 0.04027692601084709, + "learning_rate": 0.0001743732841261983, + "loss": 0.3985, + "step": 10632 + }, + { + "epoch": 0.8613901490602722, + "grad_norm": 0.03750056028366089, + "learning_rate": 0.00017436878347360367, + "loss": 0.3761, + "step": 10633 + }, + { + "epoch": 0.8614711600777706, + "grad_norm": 0.035463955253362656, + "learning_rate": 0.00017436428282100906, + "loss": 0.355, + "step": 10634 + }, + { + "epoch": 0.8615521710952689, + "grad_norm": 0.03540815785527229, + "learning_rate": 0.00017435978216841442, + "loss": 0.3559, + "step": 10635 + }, + { + "epoch": 0.8616331821127673, + "grad_norm": 0.032286386936903, + "learning_rate": 0.0001743552815158198, + "loss": 0.3134, + "step": 10636 + }, + { + "epoch": 0.8617141931302658, + "grad_norm": 0.039775971323251724, + "learning_rate": 0.0001743507808632252, + "loss": 0.379, + "step": 10637 + }, + { + "epoch": 0.8617952041477641, + "grad_norm": 0.03340979665517807, + "learning_rate": 0.00017434628021063055, + "loss": 0.3042, + "step": 10638 + }, + { + "epoch": 
0.8618762151652625, + "grad_norm": 0.03356197848916054, + "learning_rate": 0.0001743417795580359, + "loss": 0.3474, + "step": 10639 + }, + { + "epoch": 0.8619572261827608, + "grad_norm": 0.030694514513015747, + "learning_rate": 0.0001743372789054413, + "loss": 0.2946, + "step": 10640 + }, + { + "epoch": 0.8620382372002592, + "grad_norm": 0.034636642783880234, + "learning_rate": 0.00017433277825284666, + "loss": 0.3392, + "step": 10641 + }, + { + "epoch": 0.8621192482177576, + "grad_norm": 0.03530239313840866, + "learning_rate": 0.00017432827760025205, + "loss": 0.3405, + "step": 10642 + }, + { + "epoch": 0.862200259235256, + "grad_norm": 0.03424142301082611, + "learning_rate": 0.00017432377694765743, + "loss": 0.3384, + "step": 10643 + }, + { + "epoch": 0.8622812702527544, + "grad_norm": 0.02966010756790638, + "learning_rate": 0.0001743192762950628, + "loss": 0.3191, + "step": 10644 + }, + { + "epoch": 0.8623622812702527, + "grad_norm": 0.035714924335479736, + "learning_rate": 0.00017431477564246815, + "loss": 0.3296, + "step": 10645 + }, + { + "epoch": 0.8624432922877512, + "grad_norm": 0.033053092658519745, + "learning_rate": 0.00017431027498987354, + "loss": 0.3273, + "step": 10646 + }, + { + "epoch": 0.8625243033052495, + "grad_norm": 0.030531764030456543, + "learning_rate": 0.0001743057743372789, + "loss": 0.3126, + "step": 10647 + }, + { + "epoch": 0.8626053143227479, + "grad_norm": 0.03648407757282257, + "learning_rate": 0.0001743012736846843, + "loss": 0.3023, + "step": 10648 + }, + { + "epoch": 0.8626863253402463, + "grad_norm": 0.030021555721759796, + "learning_rate": 0.00017429677303208967, + "loss": 0.2931, + "step": 10649 + }, + { + "epoch": 0.8627673363577446, + "grad_norm": 0.03257157281041145, + "learning_rate": 0.00017429227237949503, + "loss": 0.313, + "step": 10650 + }, + { + "epoch": 0.8628483473752431, + "grad_norm": 0.03287028893828392, + "learning_rate": 0.0001742877717269004, + "loss": 0.3271, + "step": 10651 + }, + { + "epoch": 0.8629293583927414, + "grad_norm": 0.0382518544793129, + "learning_rate": 0.00017428327107430578, + "loss": 0.2954, + "step": 10652 + }, + { + "epoch": 0.8630103694102398, + "grad_norm": 0.03195209801197052, + "learning_rate": 0.00017427877042171114, + "loss": 0.3085, + "step": 10653 + }, + { + "epoch": 0.8630913804277381, + "grad_norm": 0.028789833188056946, + "learning_rate": 0.00017427426976911653, + "loss": 0.3043, + "step": 10654 + }, + { + "epoch": 0.8631723914452365, + "grad_norm": 0.03181454911828041, + "learning_rate": 0.00017426976911652192, + "loss": 0.2868, + "step": 10655 + }, + { + "epoch": 0.863253402462735, + "grad_norm": 0.03049142099916935, + "learning_rate": 0.00017426526846392728, + "loss": 0.3284, + "step": 10656 + }, + { + "epoch": 0.8633344134802333, + "grad_norm": 0.03163653239607811, + "learning_rate": 0.00017426076781133264, + "loss": 0.2851, + "step": 10657 + }, + { + "epoch": 0.8634154244977317, + "grad_norm": 0.032725002616643906, + "learning_rate": 0.00017425626715873802, + "loss": 0.3044, + "step": 10658 + }, + { + "epoch": 0.86349643551523, + "grad_norm": 0.03800255060195923, + "learning_rate": 0.0001742517665061434, + "loss": 0.352, + "step": 10659 + }, + { + "epoch": 0.8635774465327285, + "grad_norm": 0.043231409043073654, + "learning_rate": 0.00017424726585354877, + "loss": 0.3637, + "step": 10660 + }, + { + "epoch": 0.8636584575502269, + "grad_norm": 0.037208881229162216, + "learning_rate": 0.00017424276520095416, + "loss": 0.3086, + "step": 10661 + }, + { + "epoch": 0.8637394685677252, + "grad_norm": 
0.03458251804113388, + "learning_rate": 0.00017423826454835952, + "loss": 0.2955, + "step": 10662 + }, + { + "epoch": 0.8638204795852236, + "grad_norm": 0.03293118625879288, + "learning_rate": 0.00017423376389576488, + "loss": 0.304, + "step": 10663 + }, + { + "epoch": 0.8639014906027219, + "grad_norm": 0.034051090478897095, + "learning_rate": 0.00017422926324317027, + "loss": 0.3041, + "step": 10664 + }, + { + "epoch": 0.8639825016202204, + "grad_norm": 0.03171685338020325, + "learning_rate": 0.00017422476259057565, + "loss": 0.3146, + "step": 10665 + }, + { + "epoch": 0.8640635126377187, + "grad_norm": 0.035040758550167084, + "learning_rate": 0.000174220261937981, + "loss": 0.3429, + "step": 10666 + }, + { + "epoch": 0.8641445236552171, + "grad_norm": 0.029530083760619164, + "learning_rate": 0.0001742157612853864, + "loss": 0.3277, + "step": 10667 + }, + { + "epoch": 0.8642255346727155, + "grad_norm": 0.036100320518016815, + "learning_rate": 0.00017421126063279176, + "loss": 0.3547, + "step": 10668 + }, + { + "epoch": 0.8643065456902139, + "grad_norm": 0.033129528164863586, + "learning_rate": 0.00017420675998019712, + "loss": 0.3367, + "step": 10669 + }, + { + "epoch": 0.8643875567077123, + "grad_norm": 0.03324815630912781, + "learning_rate": 0.0001742022593276025, + "loss": 0.3198, + "step": 10670 + }, + { + "epoch": 0.8644685677252106, + "grad_norm": 0.03402934595942497, + "learning_rate": 0.0001741977586750079, + "loss": 0.3392, + "step": 10671 + }, + { + "epoch": 0.864549578742709, + "grad_norm": 0.03805414214730263, + "learning_rate": 0.00017419325802241325, + "loss": 0.3836, + "step": 10672 + }, + { + "epoch": 0.8646305897602073, + "grad_norm": 0.03230415657162666, + "learning_rate": 0.00017418875736981864, + "loss": 0.3108, + "step": 10673 + }, + { + "epoch": 0.8647116007777058, + "grad_norm": 0.03507767990231514, + "learning_rate": 0.000174184256717224, + "loss": 0.2988, + "step": 10674 + }, + { + "epoch": 0.8647926117952042, + "grad_norm": 0.03984934464097023, + "learning_rate": 0.00017417975606462936, + "loss": 0.344, + "step": 10675 + }, + { + "epoch": 0.8648736228127025, + "grad_norm": 0.041479699313640594, + "learning_rate": 0.00017417525541203475, + "loss": 0.3316, + "step": 10676 + }, + { + "epoch": 0.8649546338302009, + "grad_norm": 0.0320621095597744, + "learning_rate": 0.00017417075475944014, + "loss": 0.3287, + "step": 10677 + }, + { + "epoch": 0.8650356448476992, + "grad_norm": 0.0352802574634552, + "learning_rate": 0.0001741662541068455, + "loss": 0.3131, + "step": 10678 + }, + { + "epoch": 0.8651166558651977, + "grad_norm": 0.0304051972925663, + "learning_rate": 0.00017416175345425088, + "loss": 0.2471, + "step": 10679 + }, + { + "epoch": 0.8651976668826961, + "grad_norm": 0.036712054163217545, + "learning_rate": 0.00017415725280165624, + "loss": 0.38, + "step": 10680 + }, + { + "epoch": 0.8652786779001944, + "grad_norm": 0.03605759143829346, + "learning_rate": 0.0001741527521490616, + "loss": 0.3202, + "step": 10681 + }, + { + "epoch": 0.8653596889176928, + "grad_norm": 0.032023441046476364, + "learning_rate": 0.000174148251496467, + "loss": 0.3106, + "step": 10682 + }, + { + "epoch": 0.8654406999351912, + "grad_norm": 0.03299464285373688, + "learning_rate": 0.00017414375084387238, + "loss": 0.2842, + "step": 10683 + }, + { + "epoch": 0.8655217109526896, + "grad_norm": 0.036962252110242844, + "learning_rate": 0.00017413925019127774, + "loss": 0.3104, + "step": 10684 + }, + { + "epoch": 0.865602721970188, + "grad_norm": 0.0329434871673584, + "learning_rate": 
0.00017413474953868312, + "loss": 0.3073, + "step": 10685 + }, + { + "epoch": 0.8656837329876863, + "grad_norm": 0.050255440175533295, + "learning_rate": 0.00017413024888608848, + "loss": 0.3358, + "step": 10686 + }, + { + "epoch": 0.8657647440051847, + "grad_norm": 0.029616905376315117, + "learning_rate": 0.00017412574823349384, + "loss": 0.306, + "step": 10687 + }, + { + "epoch": 0.8658457550226831, + "grad_norm": 0.029950309544801712, + "learning_rate": 0.00017412124758089926, + "loss": 0.3012, + "step": 10688 + }, + { + "epoch": 0.8659267660401815, + "grad_norm": 0.03012317605316639, + "learning_rate": 0.00017411674692830462, + "loss": 0.2814, + "step": 10689 + }, + { + "epoch": 0.8660077770576798, + "grad_norm": 0.03209361433982849, + "learning_rate": 0.00017411224627570998, + "loss": 0.3218, + "step": 10690 + }, + { + "epoch": 0.8660887880751782, + "grad_norm": 0.03671686351299286, + "learning_rate": 0.00017410774562311537, + "loss": 0.3721, + "step": 10691 + }, + { + "epoch": 0.8661697990926766, + "grad_norm": 0.029738424345850945, + "learning_rate": 0.00017410324497052073, + "loss": 0.2891, + "step": 10692 + }, + { + "epoch": 0.866250810110175, + "grad_norm": 0.039622507989406586, + "learning_rate": 0.00017409874431792609, + "loss": 0.323, + "step": 10693 + }, + { + "epoch": 0.8663318211276734, + "grad_norm": 0.03439989686012268, + "learning_rate": 0.0001740942436653315, + "loss": 0.3709, + "step": 10694 + }, + { + "epoch": 0.8664128321451717, + "grad_norm": 0.03073376975953579, + "learning_rate": 0.00017408974301273686, + "loss": 0.3305, + "step": 10695 + }, + { + "epoch": 0.8664938431626701, + "grad_norm": 0.031084010377526283, + "learning_rate": 0.00017408524236014222, + "loss": 0.3121, + "step": 10696 + }, + { + "epoch": 0.8665748541801686, + "grad_norm": 0.032315392047166824, + "learning_rate": 0.0001740807417075476, + "loss": 0.327, + "step": 10697 + }, + { + "epoch": 0.8666558651976669, + "grad_norm": 0.029009966179728508, + "learning_rate": 0.00017407624105495297, + "loss": 0.3286, + "step": 10698 + }, + { + "epoch": 0.8667368762151653, + "grad_norm": 0.03397389501333237, + "learning_rate": 0.00017407174040235833, + "loss": 0.3512, + "step": 10699 + }, + { + "epoch": 0.8668178872326636, + "grad_norm": 0.03850146010518074, + "learning_rate": 0.00017406723974976374, + "loss": 0.3224, + "step": 10700 + }, + { + "epoch": 0.866898898250162, + "grad_norm": 0.031645797193050385, + "learning_rate": 0.0001740627390971691, + "loss": 0.3326, + "step": 10701 + }, + { + "epoch": 0.8669799092676604, + "grad_norm": 0.030226711183786392, + "learning_rate": 0.00017405823844457446, + "loss": 0.3079, + "step": 10702 + }, + { + "epoch": 0.8670609202851588, + "grad_norm": 0.03322417661547661, + "learning_rate": 0.00017405373779197985, + "loss": 0.3504, + "step": 10703 + }, + { + "epoch": 0.8671419313026572, + "grad_norm": 0.03156125918030739, + "learning_rate": 0.0001740492371393852, + "loss": 0.3623, + "step": 10704 + }, + { + "epoch": 0.8672229423201555, + "grad_norm": 0.03021164797246456, + "learning_rate": 0.00017404473648679057, + "loss": 0.3162, + "step": 10705 + }, + { + "epoch": 0.8673039533376539, + "grad_norm": 0.028827734291553497, + "learning_rate": 0.00017404023583419598, + "loss": 0.2772, + "step": 10706 + }, + { + "epoch": 0.8673849643551523, + "grad_norm": 0.03488578647375107, + "learning_rate": 0.00017403573518160134, + "loss": 0.3003, + "step": 10707 + }, + { + "epoch": 0.8674659753726507, + "grad_norm": 0.03198534995317459, + "learning_rate": 0.0001740312345290067, + "loss": 
0.3357, + "step": 10708 + }, + { + "epoch": 0.867546986390149, + "grad_norm": 0.034626707434654236, + "learning_rate": 0.0001740267338764121, + "loss": 0.3404, + "step": 10709 + }, + { + "epoch": 0.8676279974076474, + "grad_norm": 0.033544886857271194, + "learning_rate": 0.00017402223322381745, + "loss": 0.3644, + "step": 10710 + }, + { + "epoch": 0.8677090084251459, + "grad_norm": 0.03247756510972977, + "learning_rate": 0.00017401773257122284, + "loss": 0.2922, + "step": 10711 + }, + { + "epoch": 0.8677900194426442, + "grad_norm": 0.030314233154058456, + "learning_rate": 0.00017401323191862823, + "loss": 0.2931, + "step": 10712 + }, + { + "epoch": 0.8678710304601426, + "grad_norm": 0.032003603875637054, + "learning_rate": 0.00017400873126603359, + "loss": 0.272, + "step": 10713 + }, + { + "epoch": 0.8679520414776409, + "grad_norm": 0.034450311213731766, + "learning_rate": 0.00017400423061343895, + "loss": 0.3421, + "step": 10714 + }, + { + "epoch": 0.8680330524951393, + "grad_norm": 0.029790136963129044, + "learning_rate": 0.00017399972996084433, + "loss": 0.2894, + "step": 10715 + }, + { + "epoch": 0.8681140635126378, + "grad_norm": 0.03776300325989723, + "learning_rate": 0.0001739952293082497, + "loss": 0.3921, + "step": 10716 + }, + { + "epoch": 0.8681950745301361, + "grad_norm": 0.03808490186929703, + "learning_rate": 0.00017399072865565508, + "loss": 0.3456, + "step": 10717 + }, + { + "epoch": 0.8682760855476345, + "grad_norm": 0.03374411538243294, + "learning_rate": 0.00017398622800306047, + "loss": 0.3152, + "step": 10718 + }, + { + "epoch": 0.8683570965651328, + "grad_norm": 0.04357757046818733, + "learning_rate": 0.00017398172735046583, + "loss": 0.3132, + "step": 10719 + }, + { + "epoch": 0.8684381075826313, + "grad_norm": 0.032372117042541504, + "learning_rate": 0.0001739772266978712, + "loss": 0.3417, + "step": 10720 + }, + { + "epoch": 0.8685191186001296, + "grad_norm": 0.033516012132167816, + "learning_rate": 0.00017397272604527657, + "loss": 0.3177, + "step": 10721 + }, + { + "epoch": 0.868600129617628, + "grad_norm": 0.03316143900156021, + "learning_rate": 0.00017396822539268193, + "loss": 0.3479, + "step": 10722 + }, + { + "epoch": 0.8686811406351264, + "grad_norm": 0.03665821626782417, + "learning_rate": 0.00017396372474008732, + "loss": 0.41, + "step": 10723 + }, + { + "epoch": 0.8687621516526247, + "grad_norm": 0.03130485117435455, + "learning_rate": 0.0001739592240874927, + "loss": 0.3165, + "step": 10724 + }, + { + "epoch": 0.8688431626701232, + "grad_norm": 0.040841370820999146, + "learning_rate": 0.00017395472343489807, + "loss": 0.363, + "step": 10725 + }, + { + "epoch": 0.8689241736876215, + "grad_norm": 0.03354490175843239, + "learning_rate": 0.00017395022278230343, + "loss": 0.3516, + "step": 10726 + }, + { + "epoch": 0.8690051847051199, + "grad_norm": 0.03047320991754532, + "learning_rate": 0.00017394572212970882, + "loss": 0.2943, + "step": 10727 + }, + { + "epoch": 0.8690861957226182, + "grad_norm": 0.03401746600866318, + "learning_rate": 0.00017394122147711418, + "loss": 0.3112, + "step": 10728 + }, + { + "epoch": 0.8691672067401166, + "grad_norm": 0.03398209437727928, + "learning_rate": 0.00017393672082451956, + "loss": 0.3462, + "step": 10729 + }, + { + "epoch": 0.8692482177576151, + "grad_norm": 0.03289905935525894, + "learning_rate": 0.00017393222017192495, + "loss": 0.3224, + "step": 10730 + }, + { + "epoch": 0.8693292287751134, + "grad_norm": 0.03407776728272438, + "learning_rate": 0.0001739277195193303, + "loss": 0.3419, + "step": 10731 + }, + { + 
"epoch": 0.8694102397926118, + "grad_norm": 0.035825107246637344, + "learning_rate": 0.00017392321886673567, + "loss": 0.3768, + "step": 10732 + }, + { + "epoch": 0.8694912508101101, + "grad_norm": 0.03510098159313202, + "learning_rate": 0.00017391871821414106, + "loss": 0.3312, + "step": 10733 + }, + { + "epoch": 0.8695722618276086, + "grad_norm": 0.03410264849662781, + "learning_rate": 0.00017391421756154642, + "loss": 0.3689, + "step": 10734 + }, + { + "epoch": 0.869653272845107, + "grad_norm": 0.03720330819487572, + "learning_rate": 0.0001739097169089518, + "loss": 0.3598, + "step": 10735 + }, + { + "epoch": 0.8697342838626053, + "grad_norm": 0.03458027541637421, + "learning_rate": 0.0001739052162563572, + "loss": 0.3532, + "step": 10736 + }, + { + "epoch": 0.8698152948801037, + "grad_norm": 0.03295762836933136, + "learning_rate": 0.00017390071560376255, + "loss": 0.3231, + "step": 10737 + }, + { + "epoch": 0.869896305897602, + "grad_norm": 0.033154476433992386, + "learning_rate": 0.0001738962149511679, + "loss": 0.3661, + "step": 10738 + }, + { + "epoch": 0.8699773169151005, + "grad_norm": 0.03453659638762474, + "learning_rate": 0.0001738917142985733, + "loss": 0.3342, + "step": 10739 + }, + { + "epoch": 0.8700583279325989, + "grad_norm": 0.037514906376600266, + "learning_rate": 0.00017388721364597869, + "loss": 0.3413, + "step": 10740 + }, + { + "epoch": 0.8701393389500972, + "grad_norm": 0.03465067967772484, + "learning_rate": 0.00017388271299338405, + "loss": 0.3946, + "step": 10741 + }, + { + "epoch": 0.8702203499675956, + "grad_norm": 0.030446641147136688, + "learning_rate": 0.00017387821234078943, + "loss": 0.3085, + "step": 10742 + }, + { + "epoch": 0.8703013609850939, + "grad_norm": 0.03456110879778862, + "learning_rate": 0.0001738737116881948, + "loss": 0.3391, + "step": 10743 + }, + { + "epoch": 0.8703823720025924, + "grad_norm": 0.03867906332015991, + "learning_rate": 0.00017386921103560015, + "loss": 0.346, + "step": 10744 + }, + { + "epoch": 0.8704633830200907, + "grad_norm": 0.036336660385131836, + "learning_rate": 0.00017386471038300554, + "loss": 0.3707, + "step": 10745 + }, + { + "epoch": 0.8705443940375891, + "grad_norm": 0.03098193369805813, + "learning_rate": 0.00017386020973041093, + "loss": 0.3021, + "step": 10746 + }, + { + "epoch": 0.8706254050550875, + "grad_norm": 0.03248054161667824, + "learning_rate": 0.0001738557090778163, + "loss": 0.3275, + "step": 10747 + }, + { + "epoch": 0.8707064160725859, + "grad_norm": 0.034962303936481476, + "learning_rate": 0.00017385120842522167, + "loss": 0.3374, + "step": 10748 + }, + { + "epoch": 0.8707874270900843, + "grad_norm": 0.033861901611089706, + "learning_rate": 0.00017384670777262703, + "loss": 0.3022, + "step": 10749 + }, + { + "epoch": 0.8708684381075826, + "grad_norm": 0.03983207419514656, + "learning_rate": 0.0001738422071200324, + "loss": 0.3946, + "step": 10750 + }, + { + "epoch": 0.870949449125081, + "grad_norm": 0.032978422939777374, + "learning_rate": 0.00017383770646743778, + "loss": 0.3489, + "step": 10751 + }, + { + "epoch": 0.8710304601425793, + "grad_norm": 0.03442319110035896, + "learning_rate": 0.00017383320581484317, + "loss": 0.3146, + "step": 10752 + }, + { + "epoch": 0.8711114711600778, + "grad_norm": 0.029209937900304794, + "learning_rate": 0.00017382870516224853, + "loss": 0.2718, + "step": 10753 + }, + { + "epoch": 0.8711924821775762, + "grad_norm": 0.03570421040058136, + "learning_rate": 0.00017382420450965392, + "loss": 0.3383, + "step": 10754 + }, + { + "epoch": 0.8712734931950745, + 
"grad_norm": 0.04989160969853401, + "learning_rate": 0.00017381970385705928, + "loss": 0.3582, + "step": 10755 + }, + { + "epoch": 0.8713545042125729, + "grad_norm": 0.031460873782634735, + "learning_rate": 0.00017381520320446464, + "loss": 0.3243, + "step": 10756 + }, + { + "epoch": 0.8714355152300713, + "grad_norm": 0.03317393362522125, + "learning_rate": 0.00017381070255187002, + "loss": 0.3197, + "step": 10757 + }, + { + "epoch": 0.8715165262475697, + "grad_norm": 0.036221351474523544, + "learning_rate": 0.0001738062018992754, + "loss": 0.3549, + "step": 10758 + }, + { + "epoch": 0.8715975372650681, + "grad_norm": 0.036588188260793686, + "learning_rate": 0.00017380170124668077, + "loss": 0.3305, + "step": 10759 + }, + { + "epoch": 0.8716785482825664, + "grad_norm": 0.038899991661310196, + "learning_rate": 0.00017379720059408616, + "loss": 0.3229, + "step": 10760 + }, + { + "epoch": 0.8717595593000648, + "grad_norm": 0.03450969606637955, + "learning_rate": 0.00017379269994149152, + "loss": 0.3075, + "step": 10761 + }, + { + "epoch": 0.8718405703175632, + "grad_norm": 0.03251654654741287, + "learning_rate": 0.00017378819928889688, + "loss": 0.3405, + "step": 10762 + }, + { + "epoch": 0.8719215813350616, + "grad_norm": 0.033687327057123184, + "learning_rate": 0.00017378369863630227, + "loss": 0.324, + "step": 10763 + }, + { + "epoch": 0.87200259235256, + "grad_norm": 0.03482494875788689, + "learning_rate": 0.00017377919798370765, + "loss": 0.333, + "step": 10764 + }, + { + "epoch": 0.8720836033700583, + "grad_norm": 0.03824484348297119, + "learning_rate": 0.000173774697331113, + "loss": 0.3509, + "step": 10765 + }, + { + "epoch": 0.8721646143875567, + "grad_norm": 0.03036884032189846, + "learning_rate": 0.0001737701966785184, + "loss": 0.3204, + "step": 10766 + }, + { + "epoch": 0.8722456254050551, + "grad_norm": 0.03216506168246269, + "learning_rate": 0.00017376569602592376, + "loss": 0.3263, + "step": 10767 + }, + { + "epoch": 0.8723266364225535, + "grad_norm": 0.03733062371611595, + "learning_rate": 0.00017376119537332912, + "loss": 0.3485, + "step": 10768 + }, + { + "epoch": 0.8724076474400518, + "grad_norm": 0.035098589956760406, + "learning_rate": 0.00017375669472073453, + "loss": 0.3384, + "step": 10769 + }, + { + "epoch": 0.8724886584575502, + "grad_norm": 0.032889917492866516, + "learning_rate": 0.0001737521940681399, + "loss": 0.2968, + "step": 10770 + }, + { + "epoch": 0.8725696694750487, + "grad_norm": 0.034361012279987335, + "learning_rate": 0.00017374769341554525, + "loss": 0.3312, + "step": 10771 + }, + { + "epoch": 0.872650680492547, + "grad_norm": 0.03438268601894379, + "learning_rate": 0.00017374319276295064, + "loss": 0.3398, + "step": 10772 + }, + { + "epoch": 0.8727316915100454, + "grad_norm": 0.032490380108356476, + "learning_rate": 0.000173738692110356, + "loss": 0.3235, + "step": 10773 + }, + { + "epoch": 0.8728127025275437, + "grad_norm": 0.03424321487545967, + "learning_rate": 0.00017373419145776136, + "loss": 0.3227, + "step": 10774 + }, + { + "epoch": 0.8728937135450421, + "grad_norm": 0.03552704304456711, + "learning_rate": 0.00017372969080516678, + "loss": 0.3065, + "step": 10775 + }, + { + "epoch": 0.8729747245625405, + "grad_norm": 0.03595755994319916, + "learning_rate": 0.00017372519015257214, + "loss": 0.3538, + "step": 10776 + }, + { + "epoch": 0.8730557355800389, + "grad_norm": 0.0303262397646904, + "learning_rate": 0.0001737206894999775, + "loss": 0.3143, + "step": 10777 + }, + { + "epoch": 0.8731367465975373, + "grad_norm": 0.02945934422314167, + 
"learning_rate": 0.00017371618884738288, + "loss": 0.303, + "step": 10778 + }, + { + "epoch": 0.8732177576150356, + "grad_norm": 0.03361841291189194, + "learning_rate": 0.00017371168819478824, + "loss": 0.3539, + "step": 10779 + }, + { + "epoch": 0.873298768632534, + "grad_norm": 0.02983030490577221, + "learning_rate": 0.00017370718754219363, + "loss": 0.3095, + "step": 10780 + }, + { + "epoch": 0.8733797796500324, + "grad_norm": 0.032523952424526215, + "learning_rate": 0.00017370268688959902, + "loss": 0.3488, + "step": 10781 + }, + { + "epoch": 0.8734607906675308, + "grad_norm": 0.03318939730525017, + "learning_rate": 0.00017369818623700438, + "loss": 0.3189, + "step": 10782 + }, + { + "epoch": 0.8735418016850292, + "grad_norm": 0.03516692668199539, + "learning_rate": 0.00017369368558440974, + "loss": 0.3287, + "step": 10783 + }, + { + "epoch": 0.8736228127025275, + "grad_norm": 0.03213590756058693, + "learning_rate": 0.00017368918493181512, + "loss": 0.2941, + "step": 10784 + }, + { + "epoch": 0.873703823720026, + "grad_norm": 0.03252600133419037, + "learning_rate": 0.00017368468427922048, + "loss": 0.3446, + "step": 10785 + }, + { + "epoch": 0.8737848347375243, + "grad_norm": 0.03503256291151047, + "learning_rate": 0.00017368018362662587, + "loss": 0.3329, + "step": 10786 + }, + { + "epoch": 0.8738658457550227, + "grad_norm": 0.033437907695770264, + "learning_rate": 0.00017367568297403126, + "loss": 0.3219, + "step": 10787 + }, + { + "epoch": 0.873946856772521, + "grad_norm": 0.03503791615366936, + "learning_rate": 0.00017367118232143662, + "loss": 0.3627, + "step": 10788 + }, + { + "epoch": 0.8740278677900194, + "grad_norm": 0.037898220121860504, + "learning_rate": 0.00017366668166884198, + "loss": 0.3716, + "step": 10789 + }, + { + "epoch": 0.8741088788075179, + "grad_norm": 0.0397721491754055, + "learning_rate": 0.00017366218101624737, + "loss": 0.361, + "step": 10790 + }, + { + "epoch": 0.8741898898250162, + "grad_norm": 0.037444885820150375, + "learning_rate": 0.00017365768036365273, + "loss": 0.3577, + "step": 10791 + }, + { + "epoch": 0.8742709008425146, + "grad_norm": 0.036365706473588943, + "learning_rate": 0.0001736531797110581, + "loss": 0.364, + "step": 10792 + }, + { + "epoch": 0.8743519118600129, + "grad_norm": 0.03337782248854637, + "learning_rate": 0.0001736486790584635, + "loss": 0.3322, + "step": 10793 + }, + { + "epoch": 0.8744329228775113, + "grad_norm": 0.037269555032253265, + "learning_rate": 0.00017364417840586886, + "loss": 0.3359, + "step": 10794 + }, + { + "epoch": 0.8745139338950098, + "grad_norm": 0.03035913221538067, + "learning_rate": 0.00017363967775327422, + "loss": 0.3126, + "step": 10795 + }, + { + "epoch": 0.8745949449125081, + "grad_norm": 0.028627660125494003, + "learning_rate": 0.0001736351771006796, + "loss": 0.2996, + "step": 10796 + }, + { + "epoch": 0.8746759559300065, + "grad_norm": 0.03484364226460457, + "learning_rate": 0.00017363067644808497, + "loss": 0.3422, + "step": 10797 + }, + { + "epoch": 0.8747569669475048, + "grad_norm": 0.03468231111764908, + "learning_rate": 0.00017362617579549036, + "loss": 0.3615, + "step": 10798 + }, + { + "epoch": 0.8748379779650033, + "grad_norm": 0.03218459337949753, + "learning_rate": 0.00017362167514289574, + "loss": 0.3283, + "step": 10799 + }, + { + "epoch": 0.8749189889825016, + "grad_norm": 0.029084214940667152, + "learning_rate": 0.0001736171744903011, + "loss": 0.3068, + "step": 10800 + }, + { + "epoch": 0.875, + "grad_norm": 0.03318289667367935, + "learning_rate": 0.00017361267383770646, + "loss": 
0.3468, + "step": 10801 + }, + { + "epoch": 0.8750810110174984, + "grad_norm": 0.032937172800302505, + "learning_rate": 0.00017360817318511185, + "loss": 0.3316, + "step": 10802 + }, + { + "epoch": 0.8751620220349967, + "grad_norm": 0.02804538980126381, + "learning_rate": 0.0001736036725325172, + "loss": 0.2894, + "step": 10803 + }, + { + "epoch": 0.8752430330524952, + "grad_norm": 0.03209392726421356, + "learning_rate": 0.0001735991718799226, + "loss": 0.318, + "step": 10804 + }, + { + "epoch": 0.8753240440699935, + "grad_norm": 0.0320693776011467, + "learning_rate": 0.00017359467122732798, + "loss": 0.3094, + "step": 10805 + }, + { + "epoch": 0.8754050550874919, + "grad_norm": 0.030849037691950798, + "learning_rate": 0.00017359017057473334, + "loss": 0.3057, + "step": 10806 + }, + { + "epoch": 0.8754860661049902, + "grad_norm": 0.032688066363334656, + "learning_rate": 0.0001735856699221387, + "loss": 0.316, + "step": 10807 + }, + { + "epoch": 0.8755670771224887, + "grad_norm": 0.03459910303354263, + "learning_rate": 0.0001735811692695441, + "loss": 0.3298, + "step": 10808 + }, + { + "epoch": 0.8756480881399871, + "grad_norm": 0.02897426299750805, + "learning_rate": 0.00017357666861694945, + "loss": 0.2906, + "step": 10809 + }, + { + "epoch": 0.8757290991574854, + "grad_norm": 0.03555037081241608, + "learning_rate": 0.00017357216796435484, + "loss": 0.3439, + "step": 10810 + }, + { + "epoch": 0.8758101101749838, + "grad_norm": 0.037332337349653244, + "learning_rate": 0.00017356766731176023, + "loss": 0.345, + "step": 10811 + }, + { + "epoch": 0.8758911211924821, + "grad_norm": 0.037951596081256866, + "learning_rate": 0.00017356316665916559, + "loss": 0.3423, + "step": 10812 + }, + { + "epoch": 0.8759721322099806, + "grad_norm": 0.030132196843624115, + "learning_rate": 0.00017355866600657095, + "loss": 0.2823, + "step": 10813 + }, + { + "epoch": 0.876053143227479, + "grad_norm": 0.041832491755485535, + "learning_rate": 0.00017355416535397633, + "loss": 0.3397, + "step": 10814 + }, + { + "epoch": 0.8761341542449773, + "grad_norm": 0.03547552973031998, + "learning_rate": 0.0001735496647013817, + "loss": 0.3229, + "step": 10815 + }, + { + "epoch": 0.8762151652624757, + "grad_norm": 0.031897854059934616, + "learning_rate": 0.00017354516404878708, + "loss": 0.2958, + "step": 10816 + }, + { + "epoch": 0.876296176279974, + "grad_norm": 0.03446755185723305, + "learning_rate": 0.00017354066339619247, + "loss": 0.3402, + "step": 10817 + }, + { + "epoch": 0.8763771872974725, + "grad_norm": 0.03411189094185829, + "learning_rate": 0.00017353616274359783, + "loss": 0.393, + "step": 10818 + }, + { + "epoch": 0.8764581983149708, + "grad_norm": 0.03914996236562729, + "learning_rate": 0.0001735316620910032, + "loss": 0.3346, + "step": 10819 + }, + { + "epoch": 0.8765392093324692, + "grad_norm": 0.03950195014476776, + "learning_rate": 0.00017352716143840857, + "loss": 0.3642, + "step": 10820 + }, + { + "epoch": 0.8766202203499676, + "grad_norm": 0.0327347032725811, + "learning_rate": 0.00017352266078581396, + "loss": 0.3027, + "step": 10821 + }, + { + "epoch": 0.876701231367466, + "grad_norm": 0.040154580026865005, + "learning_rate": 0.00017351816013321932, + "loss": 0.3457, + "step": 10822 + }, + { + "epoch": 0.8767822423849644, + "grad_norm": 0.03418951481580734, + "learning_rate": 0.0001735136594806247, + "loss": 0.314, + "step": 10823 + }, + { + "epoch": 0.8768632534024627, + "grad_norm": 0.03217468410730362, + "learning_rate": 0.00017350915882803007, + "loss": 0.3392, + "step": 10824 + }, + { + "epoch": 
0.8769442644199611, + "grad_norm": 0.034714192152023315, + "learning_rate": 0.00017350465817543543, + "loss": 0.3374, + "step": 10825 + }, + { + "epoch": 0.8770252754374595, + "grad_norm": 0.03474406152963638, + "learning_rate": 0.00017350015752284082, + "loss": 0.3819, + "step": 10826 + }, + { + "epoch": 0.8771062864549579, + "grad_norm": 0.03752182424068451, + "learning_rate": 0.0001734956568702462, + "loss": 0.3416, + "step": 10827 + }, + { + "epoch": 0.8771872974724563, + "grad_norm": 0.031079022213816643, + "learning_rate": 0.00017349115621765156, + "loss": 0.3083, + "step": 10828 + }, + { + "epoch": 0.8772683084899546, + "grad_norm": 0.038465503603219986, + "learning_rate": 0.00017348665556505695, + "loss": 0.3686, + "step": 10829 + }, + { + "epoch": 0.877349319507453, + "grad_norm": 0.03562534973025322, + "learning_rate": 0.0001734821549124623, + "loss": 0.3569, + "step": 10830 + }, + { + "epoch": 0.8774303305249513, + "grad_norm": 0.030780943110585213, + "learning_rate": 0.00017347765425986767, + "loss": 0.3092, + "step": 10831 + }, + { + "epoch": 0.8775113415424498, + "grad_norm": 0.03834667429327965, + "learning_rate": 0.00017347315360727306, + "loss": 0.3336, + "step": 10832 + }, + { + "epoch": 0.8775923525599482, + "grad_norm": 0.032464317977428436, + "learning_rate": 0.00017346865295467844, + "loss": 0.3284, + "step": 10833 + }, + { + "epoch": 0.8776733635774465, + "grad_norm": 0.033732820302248, + "learning_rate": 0.0001734641523020838, + "loss": 0.3481, + "step": 10834 + }, + { + "epoch": 0.8777543745949449, + "grad_norm": 0.0332949161529541, + "learning_rate": 0.0001734596516494892, + "loss": 0.3244, + "step": 10835 + }, + { + "epoch": 0.8778353856124433, + "grad_norm": 0.030543554574251175, + "learning_rate": 0.00017345515099689455, + "loss": 0.3044, + "step": 10836 + }, + { + "epoch": 0.8779163966299417, + "grad_norm": 0.030538873746991158, + "learning_rate": 0.0001734506503442999, + "loss": 0.2957, + "step": 10837 + }, + { + "epoch": 0.87799740764744, + "grad_norm": 0.03430887684226036, + "learning_rate": 0.0001734461496917053, + "loss": 0.3731, + "step": 10838 + }, + { + "epoch": 0.8780784186649384, + "grad_norm": 0.02899688109755516, + "learning_rate": 0.00017344164903911069, + "loss": 0.3316, + "step": 10839 + }, + { + "epoch": 0.8781594296824368, + "grad_norm": 0.035096172243356705, + "learning_rate": 0.00017343714838651605, + "loss": 0.3175, + "step": 10840 + }, + { + "epoch": 0.8782404406999352, + "grad_norm": 0.03392845019698143, + "learning_rate": 0.00017343264773392143, + "loss": 0.3243, + "step": 10841 + }, + { + "epoch": 0.8783214517174336, + "grad_norm": 0.04853187873959541, + "learning_rate": 0.0001734281470813268, + "loss": 0.3504, + "step": 10842 + }, + { + "epoch": 0.8784024627349319, + "grad_norm": 0.02897292748093605, + "learning_rate": 0.00017342364642873218, + "loss": 0.307, + "step": 10843 + }, + { + "epoch": 0.8784834737524303, + "grad_norm": 0.035371195524930954, + "learning_rate": 0.00017341914577613757, + "loss": 0.3441, + "step": 10844 + }, + { + "epoch": 0.8785644847699287, + "grad_norm": 0.03544081747531891, + "learning_rate": 0.00017341464512354293, + "loss": 0.3592, + "step": 10845 + }, + { + "epoch": 0.8786454957874271, + "grad_norm": 0.029531244188547134, + "learning_rate": 0.0001734101444709483, + "loss": 0.2998, + "step": 10846 + }, + { + "epoch": 0.8787265068049255, + "grad_norm": 0.034727614372968674, + "learning_rate": 0.00017340564381835368, + "loss": 0.3093, + "step": 10847 + }, + { + "epoch": 0.8788075178224238, + "grad_norm": 
0.029829610139131546, + "learning_rate": 0.00017340114316575904, + "loss": 0.3056, + "step": 10848 + }, + { + "epoch": 0.8788885288399222, + "grad_norm": 0.026881849393248558, + "learning_rate": 0.00017339664251316442, + "loss": 0.2613, + "step": 10849 + }, + { + "epoch": 0.8789695398574207, + "grad_norm": 0.03386076167225838, + "learning_rate": 0.0001733921418605698, + "loss": 0.3473, + "step": 10850 + }, + { + "epoch": 0.879050550874919, + "grad_norm": 0.045226048678159714, + "learning_rate": 0.00017338764120797517, + "loss": 0.3694, + "step": 10851 + }, + { + "epoch": 0.8791315618924174, + "grad_norm": 0.03687673062086105, + "learning_rate": 0.00017338314055538053, + "loss": 0.3564, + "step": 10852 + }, + { + "epoch": 0.8792125729099157, + "grad_norm": 0.03726038336753845, + "learning_rate": 0.00017337863990278592, + "loss": 0.3299, + "step": 10853 + }, + { + "epoch": 0.8792935839274141, + "grad_norm": 0.034257080405950546, + "learning_rate": 0.00017337413925019128, + "loss": 0.3573, + "step": 10854 + }, + { + "epoch": 0.8793745949449125, + "grad_norm": 0.0355970673263073, + "learning_rate": 0.00017336963859759666, + "loss": 0.3459, + "step": 10855 + }, + { + "epoch": 0.8794556059624109, + "grad_norm": 0.03855161368846893, + "learning_rate": 0.00017336513794500205, + "loss": 0.3271, + "step": 10856 + }, + { + "epoch": 0.8795366169799093, + "grad_norm": 0.033906761556863785, + "learning_rate": 0.0001733606372924074, + "loss": 0.3253, + "step": 10857 + }, + { + "epoch": 0.8796176279974076, + "grad_norm": 0.04081016033887863, + "learning_rate": 0.00017335613663981277, + "loss": 0.3626, + "step": 10858 + }, + { + "epoch": 0.8796986390149061, + "grad_norm": 0.03275776654481888, + "learning_rate": 0.00017335163598721816, + "loss": 0.3439, + "step": 10859 + }, + { + "epoch": 0.8797796500324044, + "grad_norm": 0.037287477403879166, + "learning_rate": 0.00017334713533462352, + "loss": 0.3505, + "step": 10860 + }, + { + "epoch": 0.8798606610499028, + "grad_norm": 0.03666910529136658, + "learning_rate": 0.0001733426346820289, + "loss": 0.3528, + "step": 10861 + }, + { + "epoch": 0.8799416720674011, + "grad_norm": 0.03178151696920395, + "learning_rate": 0.0001733381340294343, + "loss": 0.3151, + "step": 10862 + }, + { + "epoch": 0.8800226830848995, + "grad_norm": 0.03164476528763771, + "learning_rate": 0.00017333363337683965, + "loss": 0.3157, + "step": 10863 + }, + { + "epoch": 0.880103694102398, + "grad_norm": 0.03677202761173248, + "learning_rate": 0.000173329132724245, + "loss": 0.3508, + "step": 10864 + }, + { + "epoch": 0.8801847051198963, + "grad_norm": 0.03549559414386749, + "learning_rate": 0.0001733246320716504, + "loss": 0.325, + "step": 10865 + }, + { + "epoch": 0.8802657161373947, + "grad_norm": 0.03302009403705597, + "learning_rate": 0.00017332013141905576, + "loss": 0.3393, + "step": 10866 + }, + { + "epoch": 0.880346727154893, + "grad_norm": 0.03544938564300537, + "learning_rate": 0.00017331563076646115, + "loss": 0.3309, + "step": 10867 + }, + { + "epoch": 0.8804277381723914, + "grad_norm": 0.03859511390328407, + "learning_rate": 0.00017331113011386653, + "loss": 0.3496, + "step": 10868 + }, + { + "epoch": 0.8805087491898899, + "grad_norm": 0.03511954843997955, + "learning_rate": 0.0001733066294612719, + "loss": 0.3354, + "step": 10869 + }, + { + "epoch": 0.8805897602073882, + "grad_norm": 0.03728478029370308, + "learning_rate": 0.00017330212880867725, + "loss": 0.344, + "step": 10870 + }, + { + "epoch": 0.8806707712248866, + "grad_norm": 0.03648219630122185, + "learning_rate": 
0.00017329762815608264, + "loss": 0.3614, + "step": 10871 + }, + { + "epoch": 0.8807517822423849, + "grad_norm": 0.04043553024530411, + "learning_rate": 0.000173293127503488, + "loss": 0.3726, + "step": 10872 + }, + { + "epoch": 0.8808327932598834, + "grad_norm": 0.037512943148612976, + "learning_rate": 0.0001732886268508934, + "loss": 0.3654, + "step": 10873 + }, + { + "epoch": 0.8809138042773818, + "grad_norm": 0.03332279995083809, + "learning_rate": 0.00017328412619829878, + "loss": 0.3645, + "step": 10874 + }, + { + "epoch": 0.8809948152948801, + "grad_norm": 0.03482624888420105, + "learning_rate": 0.00017327962554570414, + "loss": 0.3478, + "step": 10875 + }, + { + "epoch": 0.8810758263123785, + "grad_norm": 0.031530581414699554, + "learning_rate": 0.0001732751248931095, + "loss": 0.3207, + "step": 10876 + }, + { + "epoch": 0.8811568373298768, + "grad_norm": 0.03738391399383545, + "learning_rate": 0.00017327062424051488, + "loss": 0.3668, + "step": 10877 + }, + { + "epoch": 0.8812378483473753, + "grad_norm": 0.03019733913242817, + "learning_rate": 0.00017326612358792024, + "loss": 0.3161, + "step": 10878 + }, + { + "epoch": 0.8813188593648736, + "grad_norm": 0.0378187894821167, + "learning_rate": 0.00017326162293532563, + "loss": 0.3082, + "step": 10879 + }, + { + "epoch": 0.881399870382372, + "grad_norm": 0.03067973628640175, + "learning_rate": 0.00017325712228273102, + "loss": 0.3149, + "step": 10880 + }, + { + "epoch": 0.8814808813998704, + "grad_norm": 0.03693872317671776, + "learning_rate": 0.00017325262163013638, + "loss": 0.3524, + "step": 10881 + }, + { + "epoch": 0.8815618924173687, + "grad_norm": 0.036563705652952194, + "learning_rate": 0.00017324812097754174, + "loss": 0.3549, + "step": 10882 + }, + { + "epoch": 0.8816429034348672, + "grad_norm": 0.031392719596624374, + "learning_rate": 0.00017324362032494712, + "loss": 0.2875, + "step": 10883 + }, + { + "epoch": 0.8817239144523655, + "grad_norm": 0.035123202949762344, + "learning_rate": 0.00017323911967235248, + "loss": 0.3157, + "step": 10884 + }, + { + "epoch": 0.8818049254698639, + "grad_norm": 0.0297453124076128, + "learning_rate": 0.00017323461901975787, + "loss": 0.3111, + "step": 10885 + }, + { + "epoch": 0.8818859364873622, + "grad_norm": 0.030868038535118103, + "learning_rate": 0.00017323011836716326, + "loss": 0.3521, + "step": 10886 + }, + { + "epoch": 0.8819669475048607, + "grad_norm": 0.03564748540520668, + "learning_rate": 0.00017322561771456862, + "loss": 0.3233, + "step": 10887 + }, + { + "epoch": 0.8820479585223591, + "grad_norm": 0.03591104969382286, + "learning_rate": 0.00017322111706197398, + "loss": 0.3787, + "step": 10888 + }, + { + "epoch": 0.8821289695398574, + "grad_norm": 0.03601299971342087, + "learning_rate": 0.00017321661640937937, + "loss": 0.2917, + "step": 10889 + }, + { + "epoch": 0.8822099805573558, + "grad_norm": 0.03527918457984924, + "learning_rate": 0.00017321211575678473, + "loss": 0.3529, + "step": 10890 + }, + { + "epoch": 0.8822909915748541, + "grad_norm": 0.029023386538028717, + "learning_rate": 0.00017320761510419011, + "loss": 0.2677, + "step": 10891 + }, + { + "epoch": 0.8823720025923526, + "grad_norm": 0.039927802979946136, + "learning_rate": 0.0001732031144515955, + "loss": 0.3724, + "step": 10892 + }, + { + "epoch": 0.882453013609851, + "grad_norm": 0.03533445671200752, + "learning_rate": 0.00017319861379900086, + "loss": 0.3346, + "step": 10893 + }, + { + "epoch": 0.8825340246273493, + "grad_norm": 0.033590059727430344, + "learning_rate": 0.00017319411314640622, + "loss": 
0.2974, + "step": 10894 + }, + { + "epoch": 0.8826150356448477, + "grad_norm": 0.031166430562734604, + "learning_rate": 0.0001731896124938116, + "loss": 0.3015, + "step": 10895 + }, + { + "epoch": 0.8826960466623461, + "grad_norm": 0.032051268965005875, + "learning_rate": 0.00017318511184121697, + "loss": 0.3282, + "step": 10896 + }, + { + "epoch": 0.8827770576798445, + "grad_norm": 0.03437373787164688, + "learning_rate": 0.00017318061118862236, + "loss": 0.3441, + "step": 10897 + }, + { + "epoch": 0.8828580686973428, + "grad_norm": 0.03485589846968651, + "learning_rate": 0.00017317611053602774, + "loss": 0.3577, + "step": 10898 + }, + { + "epoch": 0.8829390797148412, + "grad_norm": 0.033468667417764664, + "learning_rate": 0.0001731716098834331, + "loss": 0.3135, + "step": 10899 + }, + { + "epoch": 0.8830200907323396, + "grad_norm": 0.036351680755615234, + "learning_rate": 0.00017316710923083846, + "loss": 0.3505, + "step": 10900 + }, + { + "epoch": 0.883101101749838, + "grad_norm": 0.03218887001276016, + "learning_rate": 0.00017316260857824385, + "loss": 0.3124, + "step": 10901 + }, + { + "epoch": 0.8831821127673364, + "grad_norm": 0.03348303586244583, + "learning_rate": 0.00017315810792564924, + "loss": 0.3012, + "step": 10902 + }, + { + "epoch": 0.8832631237848347, + "grad_norm": 0.029188605025410652, + "learning_rate": 0.0001731536072730546, + "loss": 0.3096, + "step": 10903 + }, + { + "epoch": 0.8833441348023331, + "grad_norm": 0.033830948173999786, + "learning_rate": 0.00017314910662045998, + "loss": 0.3506, + "step": 10904 + }, + { + "epoch": 0.8834251458198314, + "grad_norm": 0.03579388186335564, + "learning_rate": 0.00017314460596786534, + "loss": 0.3254, + "step": 10905 + }, + { + "epoch": 0.8835061568373299, + "grad_norm": 0.03170622140169144, + "learning_rate": 0.0001731401053152707, + "loss": 0.3513, + "step": 10906 + }, + { + "epoch": 0.8835871678548283, + "grad_norm": 0.03922513499855995, + "learning_rate": 0.0001731356046626761, + "loss": 0.3259, + "step": 10907 + }, + { + "epoch": 0.8836681788723266, + "grad_norm": 0.034124623984098434, + "learning_rate": 0.00017313110401008148, + "loss": 0.4014, + "step": 10908 + }, + { + "epoch": 0.883749189889825, + "grad_norm": 0.031138645485043526, + "learning_rate": 0.00017312660335748684, + "loss": 0.3222, + "step": 10909 + }, + { + "epoch": 0.8838302009073234, + "grad_norm": 0.03142575919628143, + "learning_rate": 0.00017312210270489223, + "loss": 0.3274, + "step": 10910 + }, + { + "epoch": 0.8839112119248218, + "grad_norm": 0.036674655973911285, + "learning_rate": 0.00017311760205229759, + "loss": 0.3872, + "step": 10911 + }, + { + "epoch": 0.8839922229423202, + "grad_norm": 0.03667711466550827, + "learning_rate": 0.00017311310139970297, + "loss": 0.3524, + "step": 10912 + }, + { + "epoch": 0.8840732339598185, + "grad_norm": 0.03456597402691841, + "learning_rate": 0.00017310860074710833, + "loss": 0.3658, + "step": 10913 + }, + { + "epoch": 0.8841542449773169, + "grad_norm": 0.0352916345000267, + "learning_rate": 0.00017310410009451372, + "loss": 0.3236, + "step": 10914 + }, + { + "epoch": 0.8842352559948153, + "grad_norm": 0.03116772323846817, + "learning_rate": 0.00017309959944191908, + "loss": 0.2683, + "step": 10915 + }, + { + "epoch": 0.8843162670123137, + "grad_norm": 0.0329105444252491, + "learning_rate": 0.00017309509878932447, + "loss": 0.3473, + "step": 10916 + }, + { + "epoch": 0.884397278029812, + "grad_norm": 0.0340874008834362, + "learning_rate": 0.00017309059813672983, + "loss": 0.3575, + "step": 10917 + }, + { + 
"epoch": 0.8844782890473104, + "grad_norm": 0.03931410610675812, + "learning_rate": 0.00017308609748413521, + "loss": 0.3495, + "step": 10918 + }, + { + "epoch": 0.8845593000648088, + "grad_norm": 0.037920136004686356, + "learning_rate": 0.00017308159683154057, + "loss": 0.3809, + "step": 10919 + }, + { + "epoch": 0.8846403110823072, + "grad_norm": 0.037753473967313766, + "learning_rate": 0.00017307709617894596, + "loss": 0.3371, + "step": 10920 + }, + { + "epoch": 0.8847213220998056, + "grad_norm": 0.03462715074419975, + "learning_rate": 0.00017307259552635132, + "loss": 0.3513, + "step": 10921 + }, + { + "epoch": 0.8848023331173039, + "grad_norm": 0.03926094248890877, + "learning_rate": 0.0001730680948737567, + "loss": 0.365, + "step": 10922 + }, + { + "epoch": 0.8848833441348023, + "grad_norm": 0.02999107912182808, + "learning_rate": 0.00017306359422116207, + "loss": 0.3364, + "step": 10923 + }, + { + "epoch": 0.8849643551523008, + "grad_norm": 0.02969319559633732, + "learning_rate": 0.00017305909356856746, + "loss": 0.324, + "step": 10924 + }, + { + "epoch": 0.8850453661697991, + "grad_norm": 0.03290926665067673, + "learning_rate": 0.00017305459291597284, + "loss": 0.3514, + "step": 10925 + }, + { + "epoch": 0.8851263771872975, + "grad_norm": 0.034250661730766296, + "learning_rate": 0.0001730500922633782, + "loss": 0.3367, + "step": 10926 + }, + { + "epoch": 0.8852073882047958, + "grad_norm": 0.040132924914360046, + "learning_rate": 0.00017304559161078356, + "loss": 0.3632, + "step": 10927 + }, + { + "epoch": 0.8852883992222942, + "grad_norm": 0.036323387175798416, + "learning_rate": 0.00017304109095818895, + "loss": 0.3199, + "step": 10928 + }, + { + "epoch": 0.8853694102397927, + "grad_norm": 0.03790266811847687, + "learning_rate": 0.0001730365903055943, + "loss": 0.3318, + "step": 10929 + }, + { + "epoch": 0.885450421257291, + "grad_norm": 0.03556707128882408, + "learning_rate": 0.0001730320896529997, + "loss": 0.3132, + "step": 10930 + }, + { + "epoch": 0.8855314322747894, + "grad_norm": 0.03412071615457535, + "learning_rate": 0.00017302758900040508, + "loss": 0.3475, + "step": 10931 + }, + { + "epoch": 0.8856124432922877, + "grad_norm": 0.03974820300936699, + "learning_rate": 0.00017302308834781044, + "loss": 0.3767, + "step": 10932 + }, + { + "epoch": 0.8856934543097861, + "grad_norm": 0.03283865377306938, + "learning_rate": 0.0001730185876952158, + "loss": 0.2992, + "step": 10933 + }, + { + "epoch": 0.8857744653272845, + "grad_norm": 0.03139305114746094, + "learning_rate": 0.0001730140870426212, + "loss": 0.3106, + "step": 10934 + }, + { + "epoch": 0.8858554763447829, + "grad_norm": 0.03446220979094505, + "learning_rate": 0.00017300958639002655, + "loss": 0.3128, + "step": 10935 + }, + { + "epoch": 0.8859364873622813, + "grad_norm": 0.03159172087907791, + "learning_rate": 0.00017300508573743194, + "loss": 0.2882, + "step": 10936 + }, + { + "epoch": 0.8860174983797796, + "grad_norm": 0.03451040759682655, + "learning_rate": 0.00017300058508483733, + "loss": 0.3288, + "step": 10937 + }, + { + "epoch": 0.8860985093972781, + "grad_norm": 0.029484622180461884, + "learning_rate": 0.0001729960844322427, + "loss": 0.2764, + "step": 10938 + }, + { + "epoch": 0.8861795204147764, + "grad_norm": 0.0305886659771204, + "learning_rate": 0.00017299158377964805, + "loss": 0.331, + "step": 10939 + }, + { + "epoch": 0.8862605314322748, + "grad_norm": 0.03970607370138168, + "learning_rate": 0.00017298708312705343, + "loss": 0.3571, + "step": 10940 + }, + { + "epoch": 0.8863415424497731, + 
"grad_norm": 0.032614629715681076, + "learning_rate": 0.0001729825824744588, + "loss": 0.3438, + "step": 10941 + }, + { + "epoch": 0.8864225534672715, + "grad_norm": 0.03078024461865425, + "learning_rate": 0.00017297808182186418, + "loss": 0.3229, + "step": 10942 + }, + { + "epoch": 0.88650356448477, + "grad_norm": 0.03623131290078163, + "learning_rate": 0.00017297358116926957, + "loss": 0.38, + "step": 10943 + }, + { + "epoch": 0.8865845755022683, + "grad_norm": 0.03395291045308113, + "learning_rate": 0.00017296908051667493, + "loss": 0.3349, + "step": 10944 + }, + { + "epoch": 0.8866655865197667, + "grad_norm": 0.031880639493465424, + "learning_rate": 0.0001729645798640803, + "loss": 0.2841, + "step": 10945 + }, + { + "epoch": 0.886746597537265, + "grad_norm": 0.033075135201215744, + "learning_rate": 0.00017296007921148568, + "loss": 0.31, + "step": 10946 + }, + { + "epoch": 0.8868276085547635, + "grad_norm": 0.03080565668642521, + "learning_rate": 0.00017295557855889104, + "loss": 0.2977, + "step": 10947 + }, + { + "epoch": 0.8869086195722619, + "grad_norm": 0.039215441793203354, + "learning_rate": 0.00017295107790629642, + "loss": 0.3396, + "step": 10948 + }, + { + "epoch": 0.8869896305897602, + "grad_norm": 0.03834895044565201, + "learning_rate": 0.0001729465772537018, + "loss": 0.3419, + "step": 10949 + }, + { + "epoch": 0.8870706416072586, + "grad_norm": 0.03201017901301384, + "learning_rate": 0.00017294207660110717, + "loss": 0.3251, + "step": 10950 + }, + { + "epoch": 0.8871516526247569, + "grad_norm": 0.028811967000365257, + "learning_rate": 0.00017293757594851253, + "loss": 0.3065, + "step": 10951 + }, + { + "epoch": 0.8872326636422554, + "grad_norm": 0.03520270437002182, + "learning_rate": 0.00017293307529591792, + "loss": 0.3207, + "step": 10952 + }, + { + "epoch": 0.8873136746597537, + "grad_norm": 0.033660054206848145, + "learning_rate": 0.00017292857464332328, + "loss": 0.322, + "step": 10953 + }, + { + "epoch": 0.8873946856772521, + "grad_norm": 0.029386943206191063, + "learning_rate": 0.00017292407399072866, + "loss": 0.2527, + "step": 10954 + }, + { + "epoch": 0.8874756966947505, + "grad_norm": 0.03373291343450546, + "learning_rate": 0.00017291957333813405, + "loss": 0.3207, + "step": 10955 + }, + { + "epoch": 0.8875567077122488, + "grad_norm": 0.03560221195220947, + "learning_rate": 0.0001729150726855394, + "loss": 0.355, + "step": 10956 + }, + { + "epoch": 0.8876377187297473, + "grad_norm": 0.030335064977407455, + "learning_rate": 0.00017291057203294477, + "loss": 0.2609, + "step": 10957 + }, + { + "epoch": 0.8877187297472456, + "grad_norm": 0.027602190151810646, + "learning_rate": 0.00017290607138035016, + "loss": 0.2777, + "step": 10958 + }, + { + "epoch": 0.887799740764744, + "grad_norm": 0.040721770375967026, + "learning_rate": 0.00017290157072775552, + "loss": 0.3634, + "step": 10959 + }, + { + "epoch": 0.8878807517822424, + "grad_norm": 0.037371281534433365, + "learning_rate": 0.0001728970700751609, + "loss": 0.3755, + "step": 10960 + }, + { + "epoch": 0.8879617627997408, + "grad_norm": 0.03600545600056648, + "learning_rate": 0.0001728925694225663, + "loss": 0.3431, + "step": 10961 + }, + { + "epoch": 0.8880427738172392, + "grad_norm": 0.03449264541268349, + "learning_rate": 0.00017288806876997165, + "loss": 0.3277, + "step": 10962 + }, + { + "epoch": 0.8881237848347375, + "grad_norm": 0.033802565187215805, + "learning_rate": 0.000172883568117377, + "loss": 0.3102, + "step": 10963 + }, + { + "epoch": 0.8882047958522359, + "grad_norm": 0.031701382249593735, + 
"learning_rate": 0.0001728790674647824, + "loss": 0.2825, + "step": 10964 + }, + { + "epoch": 0.8882858068697342, + "grad_norm": 0.03377435356378555, + "learning_rate": 0.00017287456681218776, + "loss": 0.3627, + "step": 10965 + }, + { + "epoch": 0.8883668178872327, + "grad_norm": 0.029154321178793907, + "learning_rate": 0.00017287006615959315, + "loss": 0.2998, + "step": 10966 + }, + { + "epoch": 0.8884478289047311, + "grad_norm": 0.03917527571320534, + "learning_rate": 0.00017286556550699853, + "loss": 0.3575, + "step": 10967 + }, + { + "epoch": 0.8885288399222294, + "grad_norm": 0.03809036687016487, + "learning_rate": 0.0001728610648544039, + "loss": 0.3551, + "step": 10968 + }, + { + "epoch": 0.8886098509397278, + "grad_norm": 0.028501780703663826, + "learning_rate": 0.00017285656420180925, + "loss": 0.2913, + "step": 10969 + }, + { + "epoch": 0.8886908619572261, + "grad_norm": 0.04084382951259613, + "learning_rate": 0.00017285206354921464, + "loss": 0.3357, + "step": 10970 + }, + { + "epoch": 0.8887718729747246, + "grad_norm": 0.032277580350637436, + "learning_rate": 0.00017284756289662, + "loss": 0.3286, + "step": 10971 + }, + { + "epoch": 0.888852883992223, + "grad_norm": 0.03367283195257187, + "learning_rate": 0.0001728430622440254, + "loss": 0.2913, + "step": 10972 + }, + { + "epoch": 0.8889338950097213, + "grad_norm": 0.035349562764167786, + "learning_rate": 0.00017283856159143078, + "loss": 0.3036, + "step": 10973 + }, + { + "epoch": 0.8890149060272197, + "grad_norm": 0.035391971468925476, + "learning_rate": 0.00017283406093883614, + "loss": 0.3499, + "step": 10974 + }, + { + "epoch": 0.8890959170447181, + "grad_norm": 0.03234266862273216, + "learning_rate": 0.0001728295602862415, + "loss": 0.3213, + "step": 10975 + }, + { + "epoch": 0.8891769280622165, + "grad_norm": 0.032877422869205475, + "learning_rate": 0.00017282505963364688, + "loss": 0.3347, + "step": 10976 + }, + { + "epoch": 0.8892579390797148, + "grad_norm": 0.030273448675870895, + "learning_rate": 0.00017282055898105227, + "loss": 0.3127, + "step": 10977 + }, + { + "epoch": 0.8893389500972132, + "grad_norm": 0.035193730145692825, + "learning_rate": 0.00017281605832845763, + "loss": 0.3409, + "step": 10978 + }, + { + "epoch": 0.8894199611147116, + "grad_norm": 0.030384352430701256, + "learning_rate": 0.00017281155767586302, + "loss": 0.315, + "step": 10979 + }, + { + "epoch": 0.88950097213221, + "grad_norm": 0.030477603897452354, + "learning_rate": 0.00017280705702326838, + "loss": 0.2877, + "step": 10980 + }, + { + "epoch": 0.8895819831497084, + "grad_norm": 0.03655509278178215, + "learning_rate": 0.00017280255637067377, + "loss": 0.3396, + "step": 10981 + }, + { + "epoch": 0.8896629941672067, + "grad_norm": 0.03506140410900116, + "learning_rate": 0.00017279805571807913, + "loss": 0.3275, + "step": 10982 + }, + { + "epoch": 0.8897440051847051, + "grad_norm": 0.0328826829791069, + "learning_rate": 0.0001727935550654845, + "loss": 0.3162, + "step": 10983 + }, + { + "epoch": 0.8898250162022034, + "grad_norm": 0.03526667132973671, + "learning_rate": 0.00017278905441288987, + "loss": 0.3562, + "step": 10984 + }, + { + "epoch": 0.8899060272197019, + "grad_norm": 0.031116103753447533, + "learning_rate": 0.00017278455376029526, + "loss": 0.3126, + "step": 10985 + }, + { + "epoch": 0.8899870382372003, + "grad_norm": 0.03344593569636345, + "learning_rate": 0.00017278005310770062, + "loss": 0.3167, + "step": 10986 + }, + { + "epoch": 0.8900680492546986, + "grad_norm": 0.0322323776781559, + "learning_rate": 0.000172775552455106, 
+ "loss": 0.2934, + "step": 10987 + }, + { + "epoch": 0.890149060272197, + "grad_norm": 0.037336114794015884, + "learning_rate": 0.00017277105180251137, + "loss": 0.3233, + "step": 10988 + }, + { + "epoch": 0.8902300712896954, + "grad_norm": 0.042128514498472214, + "learning_rate": 0.00017276655114991675, + "loss": 0.3382, + "step": 10989 + }, + { + "epoch": 0.8903110823071938, + "grad_norm": 0.042901746928691864, + "learning_rate": 0.00017276205049732211, + "loss": 0.3601, + "step": 10990 + }, + { + "epoch": 0.8903920933246922, + "grad_norm": 0.034772779792547226, + "learning_rate": 0.0001727575498447275, + "loss": 0.3356, + "step": 10991 + }, + { + "epoch": 0.8904731043421905, + "grad_norm": 0.04229419305920601, + "learning_rate": 0.00017275304919213286, + "loss": 0.3381, + "step": 10992 + }, + { + "epoch": 0.8905541153596889, + "grad_norm": 0.0391651950776577, + "learning_rate": 0.00017274854853953825, + "loss": 0.3807, + "step": 10993 + }, + { + "epoch": 0.8906351263771873, + "grad_norm": 0.03737068176269531, + "learning_rate": 0.0001727440478869436, + "loss": 0.3548, + "step": 10994 + }, + { + "epoch": 0.8907161373946857, + "grad_norm": 0.03788726031780243, + "learning_rate": 0.000172739547234349, + "loss": 0.3282, + "step": 10995 + }, + { + "epoch": 0.890797148412184, + "grad_norm": 0.03355458006262779, + "learning_rate": 0.00017273504658175436, + "loss": 0.3528, + "step": 10996 + }, + { + "epoch": 0.8908781594296824, + "grad_norm": 0.034730155020952225, + "learning_rate": 0.00017273054592915974, + "loss": 0.3806, + "step": 10997 + }, + { + "epoch": 0.8909591704471809, + "grad_norm": 0.034574177116155624, + "learning_rate": 0.0001727260452765651, + "loss": 0.3267, + "step": 10998 + }, + { + "epoch": 0.8910401814646792, + "grad_norm": 0.03197081759572029, + "learning_rate": 0.0001727215446239705, + "loss": 0.3088, + "step": 10999 + }, + { + "epoch": 0.8911211924821776, + "grad_norm": 0.029353322461247444, + "learning_rate": 0.00017271704397137585, + "loss": 0.2935, + "step": 11000 + }, + { + "epoch": 0.8912022034996759, + "grad_norm": 0.031723231077194214, + "learning_rate": 0.00017271254331878124, + "loss": 0.3083, + "step": 11001 + }, + { + "epoch": 0.8912832145171743, + "grad_norm": 0.03682200238108635, + "learning_rate": 0.0001727080426661866, + "loss": 0.3472, + "step": 11002 + }, + { + "epoch": 0.8913642255346728, + "grad_norm": 0.031014375388622284, + "learning_rate": 0.00017270354201359198, + "loss": 0.3053, + "step": 11003 + }, + { + "epoch": 0.8914452365521711, + "grad_norm": 0.030643856152892113, + "learning_rate": 0.00017269904136099734, + "loss": 0.3239, + "step": 11004 + }, + { + "epoch": 0.8915262475696695, + "grad_norm": 0.03582141920924187, + "learning_rate": 0.00017269454070840273, + "loss": 0.3672, + "step": 11005 + }, + { + "epoch": 0.8916072585871678, + "grad_norm": 0.03675571084022522, + "learning_rate": 0.00017269004005580812, + "loss": 0.3595, + "step": 11006 + }, + { + "epoch": 0.8916882696046662, + "grad_norm": 0.03711831569671631, + "learning_rate": 0.00017268553940321348, + "loss": 0.325, + "step": 11007 + }, + { + "epoch": 0.8917692806221647, + "grad_norm": 0.031868595629930496, + "learning_rate": 0.00017268103875061884, + "loss": 0.3189, + "step": 11008 + }, + { + "epoch": 0.891850291639663, + "grad_norm": 0.03502114117145538, + "learning_rate": 0.00017267653809802423, + "loss": 0.355, + "step": 11009 + }, + { + "epoch": 0.8919313026571614, + "grad_norm": 0.032476335763931274, + "learning_rate": 0.00017267203744542959, + "loss": 0.3506, + "step": 11010 + 
}, + { + "epoch": 0.8920123136746597, + "grad_norm": 0.03275424987077713, + "learning_rate": 0.00017266753679283497, + "loss": 0.2934, + "step": 11011 + }, + { + "epoch": 0.8920933246921582, + "grad_norm": 0.03534052148461342, + "learning_rate": 0.00017266303614024036, + "loss": 0.3401, + "step": 11012 + }, + { + "epoch": 0.8921743357096565, + "grad_norm": 0.03902437165379524, + "learning_rate": 0.00017265853548764572, + "loss": 0.3376, + "step": 11013 + }, + { + "epoch": 0.8922553467271549, + "grad_norm": 0.036552559584379196, + "learning_rate": 0.00017265403483505108, + "loss": 0.3468, + "step": 11014 + }, + { + "epoch": 0.8923363577446533, + "grad_norm": 0.03513502702116966, + "learning_rate": 0.00017264953418245647, + "loss": 0.3471, + "step": 11015 + }, + { + "epoch": 0.8924173687621516, + "grad_norm": 0.033258937299251556, + "learning_rate": 0.00017264503352986183, + "loss": 0.3179, + "step": 11016 + }, + { + "epoch": 0.8924983797796501, + "grad_norm": 0.034728024154901505, + "learning_rate": 0.00017264053287726721, + "loss": 0.3294, + "step": 11017 + }, + { + "epoch": 0.8925793907971484, + "grad_norm": 0.03324027359485626, + "learning_rate": 0.0001726360322246726, + "loss": 0.3556, + "step": 11018 + }, + { + "epoch": 0.8926604018146468, + "grad_norm": 0.03504133224487305, + "learning_rate": 0.00017263153157207796, + "loss": 0.3237, + "step": 11019 + }, + { + "epoch": 0.8927414128321451, + "grad_norm": 0.03394149988889694, + "learning_rate": 0.00017262703091948332, + "loss": 0.3378, + "step": 11020 + }, + { + "epoch": 0.8928224238496435, + "grad_norm": 0.04117913171648979, + "learning_rate": 0.0001726225302668887, + "loss": 0.3468, + "step": 11021 + }, + { + "epoch": 0.892903434867142, + "grad_norm": 0.034105632454156876, + "learning_rate": 0.00017261802961429407, + "loss": 0.3601, + "step": 11022 + }, + { + "epoch": 0.8929844458846403, + "grad_norm": 0.03052723966538906, + "learning_rate": 0.00017261352896169946, + "loss": 0.3062, + "step": 11023 + }, + { + "epoch": 0.8930654569021387, + "grad_norm": 0.03380640968680382, + "learning_rate": 0.00017260902830910484, + "loss": 0.3059, + "step": 11024 + }, + { + "epoch": 0.893146467919637, + "grad_norm": 0.034155409783124924, + "learning_rate": 0.0001726045276565102, + "loss": 0.2754, + "step": 11025 + }, + { + "epoch": 0.8932274789371355, + "grad_norm": 0.03457861393690109, + "learning_rate": 0.00017260002700391556, + "loss": 0.3358, + "step": 11026 + }, + { + "epoch": 0.8933084899546339, + "grad_norm": 0.03328819200396538, + "learning_rate": 0.00017259552635132095, + "loss": 0.3069, + "step": 11027 + }, + { + "epoch": 0.8933895009721322, + "grad_norm": 0.033737119287252426, + "learning_rate": 0.0001725910256987263, + "loss": 0.3335, + "step": 11028 + }, + { + "epoch": 0.8934705119896306, + "grad_norm": 0.03014223463833332, + "learning_rate": 0.0001725865250461317, + "loss": 0.2744, + "step": 11029 + }, + { + "epoch": 0.8935515230071289, + "grad_norm": 0.030346151441335678, + "learning_rate": 0.00017258202439353709, + "loss": 0.299, + "step": 11030 + }, + { + "epoch": 0.8936325340246274, + "grad_norm": 0.03608626872301102, + "learning_rate": 0.00017257752374094245, + "loss": 0.3276, + "step": 11031 + }, + { + "epoch": 0.8937135450421257, + "grad_norm": 0.03907999023795128, + "learning_rate": 0.0001725730230883478, + "loss": 0.3778, + "step": 11032 + }, + { + "epoch": 0.8937945560596241, + "grad_norm": 0.03341106325387955, + "learning_rate": 0.0001725685224357532, + "loss": 0.3848, + "step": 11033 + }, + { + "epoch": 0.8938755670771225, 
+ "grad_norm": 0.04504524916410446, + "learning_rate": 0.00017256402178315855, + "loss": 0.4034, + "step": 11034 + }, + { + "epoch": 0.8939565780946209, + "grad_norm": 0.037876032292842865, + "learning_rate": 0.00017255952113056394, + "loss": 0.3598, + "step": 11035 + }, + { + "epoch": 0.8940375891121193, + "grad_norm": 0.03509625792503357, + "learning_rate": 0.00017255502047796933, + "loss": 0.3249, + "step": 11036 + }, + { + "epoch": 0.8941186001296176, + "grad_norm": 0.03736604377627373, + "learning_rate": 0.0001725505198253747, + "loss": 0.3802, + "step": 11037 + }, + { + "epoch": 0.894199611147116, + "grad_norm": 0.03288540616631508, + "learning_rate": 0.00017254601917278005, + "loss": 0.3542, + "step": 11038 + }, + { + "epoch": 0.8942806221646143, + "grad_norm": 0.030276447534561157, + "learning_rate": 0.00017254151852018543, + "loss": 0.3297, + "step": 11039 + }, + { + "epoch": 0.8943616331821128, + "grad_norm": 0.03415091335773468, + "learning_rate": 0.0001725370178675908, + "loss": 0.3715, + "step": 11040 + }, + { + "epoch": 0.8944426441996112, + "grad_norm": 0.035904526710510254, + "learning_rate": 0.00017253251721499618, + "loss": 0.3552, + "step": 11041 + }, + { + "epoch": 0.8945236552171095, + "grad_norm": 0.03375767543911934, + "learning_rate": 0.00017252801656240157, + "loss": 0.346, + "step": 11042 + }, + { + "epoch": 0.8946046662346079, + "grad_norm": 0.03472788259387016, + "learning_rate": 0.00017252351590980693, + "loss": 0.3534, + "step": 11043 + }, + { + "epoch": 0.8946856772521062, + "grad_norm": 0.03359057009220123, + "learning_rate": 0.0001725190152572123, + "loss": 0.2999, + "step": 11044 + }, + { + "epoch": 0.8947666882696047, + "grad_norm": 0.03802908584475517, + "learning_rate": 0.00017251451460461768, + "loss": 0.2986, + "step": 11045 + }, + { + "epoch": 0.8948476992871031, + "grad_norm": 0.031641487032175064, + "learning_rate": 0.00017251001395202304, + "loss": 0.3225, + "step": 11046 + }, + { + "epoch": 0.8949287103046014, + "grad_norm": 0.037392813712358475, + "learning_rate": 0.00017250551329942842, + "loss": 0.3274, + "step": 11047 + }, + { + "epoch": 0.8950097213220998, + "grad_norm": 0.032086845487356186, + "learning_rate": 0.0001725010126468338, + "loss": 0.3255, + "step": 11048 + }, + { + "epoch": 0.8950907323395982, + "grad_norm": 0.035567983984947205, + "learning_rate": 0.00017249651199423917, + "loss": 0.3345, + "step": 11049 + }, + { + "epoch": 0.8951717433570966, + "grad_norm": 0.03568422794342041, + "learning_rate": 0.00017249201134164456, + "loss": 0.3835, + "step": 11050 + }, + { + "epoch": 0.895252754374595, + "grad_norm": 0.032751090824604034, + "learning_rate": 0.00017248751068904992, + "loss": 0.2823, + "step": 11051 + }, + { + "epoch": 0.8953337653920933, + "grad_norm": 0.03469252213835716, + "learning_rate": 0.00017248301003645528, + "loss": 0.3056, + "step": 11052 + }, + { + "epoch": 0.8954147764095917, + "grad_norm": 0.03916336968541145, + "learning_rate": 0.00017247850938386066, + "loss": 0.3917, + "step": 11053 + }, + { + "epoch": 0.8954957874270901, + "grad_norm": 0.034609757363796234, + "learning_rate": 0.00017247400873126605, + "loss": 0.3444, + "step": 11054 + }, + { + "epoch": 0.8955767984445885, + "grad_norm": 0.03403623029589653, + "learning_rate": 0.0001724695080786714, + "loss": 0.3524, + "step": 11055 + }, + { + "epoch": 0.8956578094620868, + "grad_norm": 0.03477954864501953, + "learning_rate": 0.0001724650074260768, + "loss": 0.3344, + "step": 11056 + }, + { + "epoch": 0.8957388204795852, + "grad_norm": 
0.037678562104701996, + "learning_rate": 0.00017246050677348216, + "loss": 0.3426, + "step": 11057 + }, + { + "epoch": 0.8958198314970836, + "grad_norm": 0.029955245554447174, + "learning_rate": 0.00017245600612088755, + "loss": 0.3056, + "step": 11058 + }, + { + "epoch": 0.895900842514582, + "grad_norm": 0.03784172981977463, + "learning_rate": 0.0001724515054682929, + "loss": 0.3624, + "step": 11059 + }, + { + "epoch": 0.8959818535320804, + "grad_norm": 0.03630274161696434, + "learning_rate": 0.0001724470048156983, + "loss": 0.3516, + "step": 11060 + }, + { + "epoch": 0.8960628645495787, + "grad_norm": 0.0336889885365963, + "learning_rate": 0.00017244250416310365, + "loss": 0.3306, + "step": 11061 + }, + { + "epoch": 0.8961438755670771, + "grad_norm": 0.036519281566143036, + "learning_rate": 0.00017243800351050904, + "loss": 0.3459, + "step": 11062 + }, + { + "epoch": 0.8962248865845756, + "grad_norm": 0.032081425189971924, + "learning_rate": 0.0001724335028579144, + "loss": 0.3077, + "step": 11063 + }, + { + "epoch": 0.8963058976020739, + "grad_norm": 0.03331870585680008, + "learning_rate": 0.0001724290022053198, + "loss": 0.3369, + "step": 11064 + }, + { + "epoch": 0.8963869086195723, + "grad_norm": 0.032635319977998734, + "learning_rate": 0.00017242450155272515, + "loss": 0.3273, + "step": 11065 + }, + { + "epoch": 0.8964679196370706, + "grad_norm": 0.03170187398791313, + "learning_rate": 0.00017242000090013053, + "loss": 0.2604, + "step": 11066 + }, + { + "epoch": 0.896548930654569, + "grad_norm": 0.03879852965474129, + "learning_rate": 0.0001724155002475359, + "loss": 0.3501, + "step": 11067 + }, + { + "epoch": 0.8966299416720674, + "grad_norm": 0.03534620627760887, + "learning_rate": 0.00017241099959494128, + "loss": 0.3419, + "step": 11068 + }, + { + "epoch": 0.8967109526895658, + "grad_norm": 0.03365003690123558, + "learning_rate": 0.00017240649894234664, + "loss": 0.3471, + "step": 11069 + }, + { + "epoch": 0.8967919637070642, + "grad_norm": 0.037116698920726776, + "learning_rate": 0.00017240199828975203, + "loss": 0.3562, + "step": 11070 + }, + { + "epoch": 0.8968729747245625, + "grad_norm": 0.031580351293087006, + "learning_rate": 0.0001723974976371574, + "loss": 0.3336, + "step": 11071 + }, + { + "epoch": 0.8969539857420609, + "grad_norm": 0.03468446806073189, + "learning_rate": 0.00017239299698456278, + "loss": 0.341, + "step": 11072 + }, + { + "epoch": 0.8970349967595593, + "grad_norm": 0.03346063196659088, + "learning_rate": 0.00017238849633196814, + "loss": 0.318, + "step": 11073 + }, + { + "epoch": 0.8971160077770577, + "grad_norm": 0.030881447717547417, + "learning_rate": 0.00017238399567937352, + "loss": 0.3021, + "step": 11074 + }, + { + "epoch": 0.897197018794556, + "grad_norm": 0.03686084598302841, + "learning_rate": 0.00017237949502677888, + "loss": 0.3768, + "step": 11075 + }, + { + "epoch": 0.8972780298120544, + "grad_norm": 0.0324772372841835, + "learning_rate": 0.00017237499437418427, + "loss": 0.3233, + "step": 11076 + }, + { + "epoch": 0.8973590408295529, + "grad_norm": 0.03122434765100479, + "learning_rate": 0.00017237049372158963, + "loss": 0.2928, + "step": 11077 + }, + { + "epoch": 0.8974400518470512, + "grad_norm": 0.03259376436471939, + "learning_rate": 0.00017236599306899502, + "loss": 0.3438, + "step": 11078 + }, + { + "epoch": 0.8975210628645496, + "grad_norm": 0.03402949869632721, + "learning_rate": 0.00017236149241640038, + "loss": 0.3401, + "step": 11079 + }, + { + "epoch": 0.8976020738820479, + "grad_norm": 0.041615165770053864, + "learning_rate": 
0.00017235699176380577, + "loss": 0.4367, + "step": 11080 + }, + { + "epoch": 0.8976830848995463, + "grad_norm": 0.034159209579229355, + "learning_rate": 0.00017235249111121113, + "loss": 0.362, + "step": 11081 + }, + { + "epoch": 0.8977640959170448, + "grad_norm": 0.03641688451170921, + "learning_rate": 0.0001723479904586165, + "loss": 0.3262, + "step": 11082 + }, + { + "epoch": 0.8978451069345431, + "grad_norm": 0.03852277249097824, + "learning_rate": 0.00017234348980602187, + "loss": 0.3482, + "step": 11083 + }, + { + "epoch": 0.8979261179520415, + "grad_norm": 0.03586672246456146, + "learning_rate": 0.00017233898915342726, + "loss": 0.3441, + "step": 11084 + }, + { + "epoch": 0.8980071289695398, + "grad_norm": 0.034681983292102814, + "learning_rate": 0.00017233448850083262, + "loss": 0.3424, + "step": 11085 + }, + { + "epoch": 0.8980881399870383, + "grad_norm": 0.03471839800477028, + "learning_rate": 0.000172329987848238, + "loss": 0.3411, + "step": 11086 + }, + { + "epoch": 0.8981691510045366, + "grad_norm": 0.03647003322839737, + "learning_rate": 0.0001723254871956434, + "loss": 0.3706, + "step": 11087 + }, + { + "epoch": 0.898250162022035, + "grad_norm": 0.03368555009365082, + "learning_rate": 0.00017232098654304875, + "loss": 0.3682, + "step": 11088 + }, + { + "epoch": 0.8983311730395334, + "grad_norm": 0.03130301833152771, + "learning_rate": 0.00017231648589045411, + "loss": 0.2777, + "step": 11089 + }, + { + "epoch": 0.8984121840570317, + "grad_norm": 0.03784201294183731, + "learning_rate": 0.0001723119852378595, + "loss": 0.3645, + "step": 11090 + }, + { + "epoch": 0.8984931950745302, + "grad_norm": 0.03592813014984131, + "learning_rate": 0.00017230748458526486, + "loss": 0.35, + "step": 11091 + }, + { + "epoch": 0.8985742060920285, + "grad_norm": 0.04151960462331772, + "learning_rate": 0.00017230298393267025, + "loss": 0.296, + "step": 11092 + }, + { + "epoch": 0.8986552171095269, + "grad_norm": 0.040424298495054245, + "learning_rate": 0.00017229848328007564, + "loss": 0.3996, + "step": 11093 + }, + { + "epoch": 0.8987362281270252, + "grad_norm": 0.0319959856569767, + "learning_rate": 0.000172293982627481, + "loss": 0.2948, + "step": 11094 + }, + { + "epoch": 0.8988172391445236, + "grad_norm": 0.03344470262527466, + "learning_rate": 0.00017228948197488636, + "loss": 0.3116, + "step": 11095 + }, + { + "epoch": 0.8988982501620221, + "grad_norm": 0.035156577825546265, + "learning_rate": 0.00017228498132229174, + "loss": 0.3508, + "step": 11096 + }, + { + "epoch": 0.8989792611795204, + "grad_norm": 0.035696208477020264, + "learning_rate": 0.0001722804806696971, + "loss": 0.3711, + "step": 11097 + }, + { + "epoch": 0.8990602721970188, + "grad_norm": 0.0365951843559742, + "learning_rate": 0.0001722759800171025, + "loss": 0.3306, + "step": 11098 + }, + { + "epoch": 0.8991412832145171, + "grad_norm": 0.034049030393362045, + "learning_rate": 0.00017227147936450788, + "loss": 0.3341, + "step": 11099 + }, + { + "epoch": 0.8992222942320156, + "grad_norm": 0.031511444598436356, + "learning_rate": 0.00017226697871191324, + "loss": 0.2812, + "step": 11100 + }, + { + "epoch": 0.899303305249514, + "grad_norm": 0.038552649319171906, + "learning_rate": 0.0001722624780593186, + "loss": 0.308, + "step": 11101 + }, + { + "epoch": 0.8993843162670123, + "grad_norm": 0.03560515120625496, + "learning_rate": 0.00017225797740672398, + "loss": 0.341, + "step": 11102 + }, + { + "epoch": 0.8994653272845107, + "grad_norm": 0.031026704236865044, + "learning_rate": 0.00017225347675412934, + "loss": 0.2962, + 
"step": 11103 + }, + { + "epoch": 0.899546338302009, + "grad_norm": 0.03231107443571091, + "learning_rate": 0.00017224897610153473, + "loss": 0.3072, + "step": 11104 + }, + { + "epoch": 0.8996273493195075, + "grad_norm": 0.033269885927438736, + "learning_rate": 0.00017224447544894012, + "loss": 0.3487, + "step": 11105 + }, + { + "epoch": 0.8997083603370059, + "grad_norm": 0.03612830489873886, + "learning_rate": 0.00017223997479634548, + "loss": 0.3468, + "step": 11106 + }, + { + "epoch": 0.8997893713545042, + "grad_norm": 0.03197575733065605, + "learning_rate": 0.00017223547414375084, + "loss": 0.3024, + "step": 11107 + }, + { + "epoch": 0.8998703823720026, + "grad_norm": 0.030702589079737663, + "learning_rate": 0.00017223097349115623, + "loss": 0.2801, + "step": 11108 + }, + { + "epoch": 0.8999513933895009, + "grad_norm": 0.03103921003639698, + "learning_rate": 0.00017222647283856159, + "loss": 0.3345, + "step": 11109 + }, + { + "epoch": 0.9000324044069994, + "grad_norm": 0.029365181922912598, + "learning_rate": 0.00017222197218596697, + "loss": 0.3055, + "step": 11110 + }, + { + "epoch": 0.9001134154244977, + "grad_norm": 0.031475115567445755, + "learning_rate": 0.00017221747153337236, + "loss": 0.3088, + "step": 11111 + }, + { + "epoch": 0.9001944264419961, + "grad_norm": 0.032637711614370346, + "learning_rate": 0.00017221297088077772, + "loss": 0.306, + "step": 11112 + }, + { + "epoch": 0.9002754374594945, + "grad_norm": 0.0325482040643692, + "learning_rate": 0.00017220847022818308, + "loss": 0.3099, + "step": 11113 + }, + { + "epoch": 0.9003564484769929, + "grad_norm": 0.037501901388168335, + "learning_rate": 0.00017220396957558847, + "loss": 0.3421, + "step": 11114 + }, + { + "epoch": 0.9004374594944913, + "grad_norm": 0.030759846791625023, + "learning_rate": 0.00017219946892299383, + "loss": 0.3364, + "step": 11115 + }, + { + "epoch": 0.9005184705119896, + "grad_norm": 0.0340886265039444, + "learning_rate": 0.00017219496827039922, + "loss": 0.3079, + "step": 11116 + }, + { + "epoch": 0.900599481529488, + "grad_norm": 0.03348534554243088, + "learning_rate": 0.0001721904676178046, + "loss": 0.3003, + "step": 11117 + }, + { + "epoch": 0.9006804925469863, + "grad_norm": 0.03073684312403202, + "learning_rate": 0.00017218596696520996, + "loss": 0.297, + "step": 11118 + }, + { + "epoch": 0.9007615035644848, + "grad_norm": 0.03075311705470085, + "learning_rate": 0.00017218146631261535, + "loss": 0.3001, + "step": 11119 + }, + { + "epoch": 0.9008425145819832, + "grad_norm": 0.035695288330316544, + "learning_rate": 0.0001721769656600207, + "loss": 0.3263, + "step": 11120 + }, + { + "epoch": 0.9009235255994815, + "grad_norm": 0.03151996061205864, + "learning_rate": 0.00017217246500742607, + "loss": 0.3215, + "step": 11121 + }, + { + "epoch": 0.9010045366169799, + "grad_norm": 0.03588097542524338, + "learning_rate": 0.00017216796435483146, + "loss": 0.3328, + "step": 11122 + }, + { + "epoch": 0.9010855476344782, + "grad_norm": 0.03737993165850639, + "learning_rate": 0.00017216346370223684, + "loss": 0.3409, + "step": 11123 + }, + { + "epoch": 0.9011665586519767, + "grad_norm": 0.03557238727807999, + "learning_rate": 0.0001721589630496422, + "loss": 0.346, + "step": 11124 + }, + { + "epoch": 0.9012475696694751, + "grad_norm": 0.030679596588015556, + "learning_rate": 0.0001721544623970476, + "loss": 0.3092, + "step": 11125 + }, + { + "epoch": 0.9013285806869734, + "grad_norm": 0.02960105612874031, + "learning_rate": 0.00017214996174445295, + "loss": 0.2876, + "step": 11126 + }, + { + "epoch": 
0.9014095917044718, + "grad_norm": 0.03361353278160095, + "learning_rate": 0.0001721454610918583, + "loss": 0.3638, + "step": 11127 + }, + { + "epoch": 0.9014906027219702, + "grad_norm": 0.039340220391750336, + "learning_rate": 0.0001721409604392637, + "loss": 0.3688, + "step": 11128 + }, + { + "epoch": 0.9015716137394686, + "grad_norm": 0.03614228218793869, + "learning_rate": 0.00017213645978666909, + "loss": 0.3435, + "step": 11129 + }, + { + "epoch": 0.901652624756967, + "grad_norm": 0.029744107276201248, + "learning_rate": 0.00017213195913407445, + "loss": 0.2884, + "step": 11130 + }, + { + "epoch": 0.9017336357744653, + "grad_norm": 0.03261115774512291, + "learning_rate": 0.00017212745848147983, + "loss": 0.2932, + "step": 11131 + }, + { + "epoch": 0.9018146467919637, + "grad_norm": 0.03626837581396103, + "learning_rate": 0.0001721229578288852, + "loss": 0.3324, + "step": 11132 + }, + { + "epoch": 0.9018956578094621, + "grad_norm": 0.02938266471028328, + "learning_rate": 0.00017211845717629055, + "loss": 0.3234, + "step": 11133 + }, + { + "epoch": 0.9019766688269605, + "grad_norm": 0.03389137610793114, + "learning_rate": 0.00017211395652369594, + "loss": 0.3378, + "step": 11134 + }, + { + "epoch": 0.9020576798444588, + "grad_norm": 0.03858252987265587, + "learning_rate": 0.00017210945587110133, + "loss": 0.332, + "step": 11135 + }, + { + "epoch": 0.9021386908619572, + "grad_norm": 0.036512937396764755, + "learning_rate": 0.0001721049552185067, + "loss": 0.3257, + "step": 11136 + }, + { + "epoch": 0.9022197018794557, + "grad_norm": 0.040067460387945175, + "learning_rate": 0.00017210045456591207, + "loss": 0.4024, + "step": 11137 + }, + { + "epoch": 0.902300712896954, + "grad_norm": 0.03473897650837898, + "learning_rate": 0.00017209595391331743, + "loss": 0.3167, + "step": 11138 + }, + { + "epoch": 0.9023817239144524, + "grad_norm": 0.03343284875154495, + "learning_rate": 0.00017209145326072282, + "loss": 0.3048, + "step": 11139 + }, + { + "epoch": 0.9024627349319507, + "grad_norm": 0.03314567729830742, + "learning_rate": 0.00017208695260812818, + "loss": 0.3384, + "step": 11140 + }, + { + "epoch": 0.9025437459494491, + "grad_norm": 0.029058843851089478, + "learning_rate": 0.00017208245195553357, + "loss": 0.2856, + "step": 11141 + }, + { + "epoch": 0.9026247569669476, + "grad_norm": 0.03524141386151314, + "learning_rate": 0.00017207795130293893, + "loss": 0.3532, + "step": 11142 + }, + { + "epoch": 0.9027057679844459, + "grad_norm": 0.03530716150999069, + "learning_rate": 0.00017207345065034432, + "loss": 0.3738, + "step": 11143 + }, + { + "epoch": 0.9027867790019443, + "grad_norm": 0.030532442033290863, + "learning_rate": 0.00017206894999774968, + "loss": 0.3049, + "step": 11144 + }, + { + "epoch": 0.9028677900194426, + "grad_norm": 0.03170107305049896, + "learning_rate": 0.00017206444934515506, + "loss": 0.3551, + "step": 11145 + }, + { + "epoch": 0.902948801036941, + "grad_norm": 0.03420290723443031, + "learning_rate": 0.00017205994869256042, + "loss": 0.36, + "step": 11146 + }, + { + "epoch": 0.9030298120544394, + "grad_norm": 0.036962270736694336, + "learning_rate": 0.0001720554480399658, + "loss": 0.3437, + "step": 11147 + }, + { + "epoch": 0.9031108230719378, + "grad_norm": 0.0335623174905777, + "learning_rate": 0.00017205094738737117, + "loss": 0.3373, + "step": 11148 + }, + { + "epoch": 0.9031918340894362, + "grad_norm": 0.03984922543168068, + "learning_rate": 0.00017204644673477656, + "loss": 0.3905, + "step": 11149 + }, + { + "epoch": 0.9032728451069345, + "grad_norm": 
0.04073641821742058, + "learning_rate": 0.00017204194608218192, + "loss": 0.3668, + "step": 11150 + }, + { + "epoch": 0.903353856124433, + "grad_norm": 0.033864449709653854, + "learning_rate": 0.0001720374454295873, + "loss": 0.3345, + "step": 11151 + }, + { + "epoch": 0.9034348671419313, + "grad_norm": 0.036866188049316406, + "learning_rate": 0.00017203294477699266, + "loss": 0.3107, + "step": 11152 + }, + { + "epoch": 0.9035158781594297, + "grad_norm": 0.03417964652180672, + "learning_rate": 0.00017202844412439805, + "loss": 0.3361, + "step": 11153 + }, + { + "epoch": 0.903596889176928, + "grad_norm": 0.03552941605448723, + "learning_rate": 0.0001720239434718034, + "loss": 0.3485, + "step": 11154 + }, + { + "epoch": 0.9036779001944264, + "grad_norm": 0.030915522947907448, + "learning_rate": 0.0001720194428192088, + "loss": 0.3156, + "step": 11155 + }, + { + "epoch": 0.9037589112119249, + "grad_norm": 0.03367985785007477, + "learning_rate": 0.00017201494216661416, + "loss": 0.2865, + "step": 11156 + }, + { + "epoch": 0.9038399222294232, + "grad_norm": 0.031156057491898537, + "learning_rate": 0.00017201044151401955, + "loss": 0.318, + "step": 11157 + }, + { + "epoch": 0.9039209332469216, + "grad_norm": 0.031971532851457596, + "learning_rate": 0.0001720059408614249, + "loss": 0.3321, + "step": 11158 + }, + { + "epoch": 0.9040019442644199, + "grad_norm": 0.0335182249546051, + "learning_rate": 0.0001720014402088303, + "loss": 0.3206, + "step": 11159 + }, + { + "epoch": 0.9040829552819183, + "grad_norm": 0.0367172434926033, + "learning_rate": 0.00017199693955623565, + "loss": 0.342, + "step": 11160 + }, + { + "epoch": 0.9041639662994168, + "grad_norm": 0.030467765405774117, + "learning_rate": 0.00017199243890364104, + "loss": 0.2973, + "step": 11161 + }, + { + "epoch": 0.9042449773169151, + "grad_norm": 0.0324619859457016, + "learning_rate": 0.00017198793825104643, + "loss": 0.3715, + "step": 11162 + }, + { + "epoch": 0.9043259883344135, + "grad_norm": 0.03537648171186447, + "learning_rate": 0.0001719834375984518, + "loss": 0.3652, + "step": 11163 + }, + { + "epoch": 0.9044069993519118, + "grad_norm": 0.03005852736532688, + "learning_rate": 0.00017197893694585715, + "loss": 0.3367, + "step": 11164 + }, + { + "epoch": 0.9044880103694103, + "grad_norm": 0.033330272883176804, + "learning_rate": 0.00017197443629326254, + "loss": 0.3327, + "step": 11165 + }, + { + "epoch": 0.9045690213869086, + "grad_norm": 0.03628942742943764, + "learning_rate": 0.0001719699356406679, + "loss": 0.3349, + "step": 11166 + }, + { + "epoch": 0.904650032404407, + "grad_norm": 0.032023970037698746, + "learning_rate": 0.00017196543498807328, + "loss": 0.3235, + "step": 11167 + }, + { + "epoch": 0.9047310434219054, + "grad_norm": 0.03418853133916855, + "learning_rate": 0.00017196093433547867, + "loss": 0.3044, + "step": 11168 + }, + { + "epoch": 0.9048120544394037, + "grad_norm": 0.03541678190231323, + "learning_rate": 0.00017195643368288403, + "loss": 0.3405, + "step": 11169 + }, + { + "epoch": 0.9048930654569022, + "grad_norm": 0.0324513241648674, + "learning_rate": 0.0001719519330302894, + "loss": 0.328, + "step": 11170 + }, + { + "epoch": 0.9049740764744005, + "grad_norm": 0.03757879137992859, + "learning_rate": 0.00017194743237769478, + "loss": 0.3464, + "step": 11171 + }, + { + "epoch": 0.9050550874918989, + "grad_norm": 0.03542768955230713, + "learning_rate": 0.00017194293172510014, + "loss": 0.3238, + "step": 11172 + }, + { + "epoch": 0.9051360985093972, + "grad_norm": 0.040335994213819504, + "learning_rate": 
0.00017193843107250552, + "loss": 0.3683, + "step": 11173 + }, + { + "epoch": 0.9052171095268956, + "grad_norm": 0.038330186158418655, + "learning_rate": 0.0001719339304199109, + "loss": 0.3151, + "step": 11174 + }, + { + "epoch": 0.9052981205443941, + "grad_norm": 0.0329168438911438, + "learning_rate": 0.00017192942976731627, + "loss": 0.3149, + "step": 11175 + }, + { + "epoch": 0.9053791315618924, + "grad_norm": 0.036200057715177536, + "learning_rate": 0.00017192492911472163, + "loss": 0.3267, + "step": 11176 + }, + { + "epoch": 0.9054601425793908, + "grad_norm": 0.038981903344392776, + "learning_rate": 0.00017192042846212702, + "loss": 0.329, + "step": 11177 + }, + { + "epoch": 0.9055411535968891, + "grad_norm": 0.04097919166088104, + "learning_rate": 0.00017191592780953238, + "loss": 0.3441, + "step": 11178 + }, + { + "epoch": 0.9056221646143876, + "grad_norm": 0.03847785294055939, + "learning_rate": 0.00017191142715693777, + "loss": 0.3578, + "step": 11179 + }, + { + "epoch": 0.905703175631886, + "grad_norm": 0.03198770061135292, + "learning_rate": 0.00017190692650434315, + "loss": 0.309, + "step": 11180 + }, + { + "epoch": 0.9057841866493843, + "grad_norm": 0.037624429911375046, + "learning_rate": 0.0001719024258517485, + "loss": 0.3443, + "step": 11181 + }, + { + "epoch": 0.9058651976668827, + "grad_norm": 0.0348743237555027, + "learning_rate": 0.00017189792519915387, + "loss": 0.3141, + "step": 11182 + }, + { + "epoch": 0.905946208684381, + "grad_norm": 0.036382030695676804, + "learning_rate": 0.00017189342454655926, + "loss": 0.3801, + "step": 11183 + }, + { + "epoch": 0.9060272197018795, + "grad_norm": 0.03109712339937687, + "learning_rate": 0.00017188892389396462, + "loss": 0.3263, + "step": 11184 + }, + { + "epoch": 0.9061082307193778, + "grad_norm": 0.029653826728463173, + "learning_rate": 0.00017188442324137, + "loss": 0.2775, + "step": 11185 + }, + { + "epoch": 0.9061892417368762, + "grad_norm": 0.0371454581618309, + "learning_rate": 0.0001718799225887754, + "loss": 0.3024, + "step": 11186 + }, + { + "epoch": 0.9062702527543746, + "grad_norm": 0.034754831343889236, + "learning_rate": 0.00017187542193618075, + "loss": 0.3296, + "step": 11187 + }, + { + "epoch": 0.906351263771873, + "grad_norm": 0.038768380880355835, + "learning_rate": 0.00017187092128358614, + "loss": 0.374, + "step": 11188 + }, + { + "epoch": 0.9064322747893714, + "grad_norm": 0.03388087823987007, + "learning_rate": 0.0001718664206309915, + "loss": 0.3241, + "step": 11189 + }, + { + "epoch": 0.9065132858068697, + "grad_norm": 0.032058071345090866, + "learning_rate": 0.00017186191997839686, + "loss": 0.2732, + "step": 11190 + }, + { + "epoch": 0.9065942968243681, + "grad_norm": 0.03243362531065941, + "learning_rate": 0.00017185741932580225, + "loss": 0.3347, + "step": 11191 + }, + { + "epoch": 0.9066753078418665, + "grad_norm": 0.03509004786610603, + "learning_rate": 0.00017185291867320764, + "loss": 0.3421, + "step": 11192 + }, + { + "epoch": 0.9067563188593649, + "grad_norm": 0.03373107314109802, + "learning_rate": 0.000171848418020613, + "loss": 0.332, + "step": 11193 + }, + { + "epoch": 0.9068373298768633, + "grad_norm": 0.03082781843841076, + "learning_rate": 0.00017184391736801838, + "loss": 0.3062, + "step": 11194 + }, + { + "epoch": 0.9069183408943616, + "grad_norm": 0.03341514617204666, + "learning_rate": 0.00017183941671542374, + "loss": 0.3813, + "step": 11195 + }, + { + "epoch": 0.90699935191186, + "grad_norm": 0.033063508570194244, + "learning_rate": 0.0001718349160628291, + "loss": 0.3548, + 
"step": 11196 + }, + { + "epoch": 0.9070803629293583, + "grad_norm": 0.033410415053367615, + "learning_rate": 0.0001718304154102345, + "loss": 0.3382, + "step": 11197 + }, + { + "epoch": 0.9071613739468568, + "grad_norm": 0.032123420387506485, + "learning_rate": 0.00017182591475763988, + "loss": 0.316, + "step": 11198 + }, + { + "epoch": 0.9072423849643552, + "grad_norm": 0.03485981002449989, + "learning_rate": 0.00017182141410504524, + "loss": 0.3348, + "step": 11199 + }, + { + "epoch": 0.9073233959818535, + "grad_norm": 0.04091982915997505, + "learning_rate": 0.00017181691345245062, + "loss": 0.3457, + "step": 11200 + }, + { + "epoch": 0.9074044069993519, + "grad_norm": 0.029208384454250336, + "learning_rate": 0.00017181241279985598, + "loss": 0.3125, + "step": 11201 + }, + { + "epoch": 0.9074854180168503, + "grad_norm": 0.035110436379909515, + "learning_rate": 0.00017180791214726134, + "loss": 0.3414, + "step": 11202 + }, + { + "epoch": 0.9075664290343487, + "grad_norm": 0.03841705992817879, + "learning_rate": 0.00017180341149466673, + "loss": 0.3394, + "step": 11203 + }, + { + "epoch": 0.907647440051847, + "grad_norm": 0.034956254065036774, + "learning_rate": 0.00017179891084207212, + "loss": 0.3441, + "step": 11204 + }, + { + "epoch": 0.9077284510693454, + "grad_norm": 0.03248672932386398, + "learning_rate": 0.00017179441018947748, + "loss": 0.3605, + "step": 11205 + }, + { + "epoch": 0.9078094620868438, + "grad_norm": 0.03265668451786041, + "learning_rate": 0.00017178990953688287, + "loss": 0.362, + "step": 11206 + }, + { + "epoch": 0.9078904731043422, + "grad_norm": 0.033454276621341705, + "learning_rate": 0.00017178540888428823, + "loss": 0.3425, + "step": 11207 + }, + { + "epoch": 0.9079714841218406, + "grad_norm": 0.0342661514878273, + "learning_rate": 0.0001717809082316936, + "loss": 0.3545, + "step": 11208 + }, + { + "epoch": 0.9080524951393389, + "grad_norm": 0.02960863895714283, + "learning_rate": 0.00017177640757909897, + "loss": 0.3217, + "step": 11209 + }, + { + "epoch": 0.9081335061568373, + "grad_norm": 0.033228784799575806, + "learning_rate": 0.00017177190692650436, + "loss": 0.2741, + "step": 11210 + }, + { + "epoch": 0.9082145171743357, + "grad_norm": 0.030154986307024956, + "learning_rate": 0.00017176740627390972, + "loss": 0.2771, + "step": 11211 + }, + { + "epoch": 0.9082955281918341, + "grad_norm": 0.03683822229504585, + "learning_rate": 0.0001717629056213151, + "loss": 0.3257, + "step": 11212 + }, + { + "epoch": 0.9083765392093325, + "grad_norm": 0.03959393501281738, + "learning_rate": 0.00017175840496872047, + "loss": 0.3753, + "step": 11213 + }, + { + "epoch": 0.9084575502268308, + "grad_norm": 0.03272836282849312, + "learning_rate": 0.00017175390431612583, + "loss": 0.3436, + "step": 11214 + }, + { + "epoch": 0.9085385612443292, + "grad_norm": 0.03133765608072281, + "learning_rate": 0.00017174940366353122, + "loss": 0.3141, + "step": 11215 + }, + { + "epoch": 0.9086195722618277, + "grad_norm": 0.03695807605981827, + "learning_rate": 0.0001717449030109366, + "loss": 0.3312, + "step": 11216 + }, + { + "epoch": 0.908700583279326, + "grad_norm": 0.03227134048938751, + "learning_rate": 0.00017174040235834196, + "loss": 0.3215, + "step": 11217 + }, + { + "epoch": 0.9087815942968244, + "grad_norm": 0.034273579716682434, + "learning_rate": 0.00017173590170574735, + "loss": 0.3459, + "step": 11218 + }, + { + "epoch": 0.9088626053143227, + "grad_norm": 0.041489578783512115, + "learning_rate": 0.0001717314010531527, + "loss": 0.397, + "step": 11219 + }, + { + "epoch": 
0.9089436163318211, + "grad_norm": 0.033856477588415146, + "learning_rate": 0.0001717269004005581, + "loss": 0.2967, + "step": 11220 + }, + { + "epoch": 0.9090246273493195, + "grad_norm": 0.03919477388262749, + "learning_rate": 0.00017172239974796346, + "loss": 0.3253, + "step": 11221 + }, + { + "epoch": 0.9091056383668179, + "grad_norm": 0.03546192869544029, + "learning_rate": 0.00017171789909536884, + "loss": 0.3376, + "step": 11222 + }, + { + "epoch": 0.9091866493843163, + "grad_norm": 0.03431613743305206, + "learning_rate": 0.0001717133984427742, + "loss": 0.3302, + "step": 11223 + }, + { + "epoch": 0.9092676604018146, + "grad_norm": 0.032933030277490616, + "learning_rate": 0.0001717088977901796, + "loss": 0.3175, + "step": 11224 + }, + { + "epoch": 0.9093486714193131, + "grad_norm": 0.03571391850709915, + "learning_rate": 0.00017170439713758495, + "loss": 0.3086, + "step": 11225 + }, + { + "epoch": 0.9094296824368114, + "grad_norm": 0.03721616417169571, + "learning_rate": 0.00017169989648499034, + "loss": 0.3678, + "step": 11226 + }, + { + "epoch": 0.9095106934543098, + "grad_norm": 0.033615391701459885, + "learning_rate": 0.0001716953958323957, + "loss": 0.3555, + "step": 11227 + }, + { + "epoch": 0.9095917044718081, + "grad_norm": 0.028835784643888474, + "learning_rate": 0.00017169089517980109, + "loss": 0.3028, + "step": 11228 + }, + { + "epoch": 0.9096727154893065, + "grad_norm": 0.03430754318833351, + "learning_rate": 0.00017168639452720645, + "loss": 0.3465, + "step": 11229 + }, + { + "epoch": 0.909753726506805, + "grad_norm": 0.03223668411374092, + "learning_rate": 0.00017168189387461183, + "loss": 0.2622, + "step": 11230 + }, + { + "epoch": 0.9098347375243033, + "grad_norm": 0.031379371881484985, + "learning_rate": 0.0001716773932220172, + "loss": 0.3265, + "step": 11231 + }, + { + "epoch": 0.9099157485418017, + "grad_norm": 0.04087335988879204, + "learning_rate": 0.00017167289256942258, + "loss": 0.4356, + "step": 11232 + }, + { + "epoch": 0.9099967595593, + "grad_norm": 0.03372536227107048, + "learning_rate": 0.00017166839191682794, + "loss": 0.3474, + "step": 11233 + }, + { + "epoch": 0.9100777705767984, + "grad_norm": 0.0366133376955986, + "learning_rate": 0.00017166389126423333, + "loss": 0.3522, + "step": 11234 + }, + { + "epoch": 0.9101587815942969, + "grad_norm": 0.03601552173495293, + "learning_rate": 0.0001716593906116387, + "loss": 0.2926, + "step": 11235 + }, + { + "epoch": 0.9102397926117952, + "grad_norm": 0.03612074628472328, + "learning_rate": 0.00017165488995904407, + "loss": 0.3286, + "step": 11236 + }, + { + "epoch": 0.9103208036292936, + "grad_norm": 0.032681941986083984, + "learning_rate": 0.00017165038930644943, + "loss": 0.3076, + "step": 11237 + }, + { + "epoch": 0.9104018146467919, + "grad_norm": 0.03919201344251633, + "learning_rate": 0.00017164588865385482, + "loss": 0.3569, + "step": 11238 + }, + { + "epoch": 0.9104828256642904, + "grad_norm": 0.03580492362380028, + "learning_rate": 0.00017164138800126018, + "loss": 0.325, + "step": 11239 + }, + { + "epoch": 0.9105638366817888, + "grad_norm": 0.03724316507577896, + "learning_rate": 0.00017163688734866557, + "loss": 0.3448, + "step": 11240 + }, + { + "epoch": 0.9106448476992871, + "grad_norm": 0.0412566177546978, + "learning_rate": 0.00017163238669607093, + "loss": 0.3547, + "step": 11241 + }, + { + "epoch": 0.9107258587167855, + "grad_norm": 0.038360945880413055, + "learning_rate": 0.00017162788604347632, + "loss": 0.3004, + "step": 11242 + }, + { + "epoch": 0.9108068697342838, + "grad_norm": 
0.03706123307347298, + "learning_rate": 0.0001716233853908817, + "loss": 0.3773, + "step": 11243 + }, + { + "epoch": 0.9108878807517823, + "grad_norm": 0.03696974739432335, + "learning_rate": 0.00017161888473828706, + "loss": 0.3565, + "step": 11244 + }, + { + "epoch": 0.9109688917692806, + "grad_norm": 0.03767175227403641, + "learning_rate": 0.00017161438408569242, + "loss": 0.3378, + "step": 11245 + }, + { + "epoch": 0.911049902786779, + "grad_norm": 0.03733237832784653, + "learning_rate": 0.0001716098834330978, + "loss": 0.3499, + "step": 11246 + }, + { + "epoch": 0.9111309138042774, + "grad_norm": 0.03460073471069336, + "learning_rate": 0.00017160538278050317, + "loss": 0.3479, + "step": 11247 + }, + { + "epoch": 0.9112119248217757, + "grad_norm": 0.037753161042928696, + "learning_rate": 0.00017160088212790856, + "loss": 0.3499, + "step": 11248 + }, + { + "epoch": 0.9112929358392742, + "grad_norm": 0.03585457429289818, + "learning_rate": 0.00017159638147531394, + "loss": 0.3369, + "step": 11249 + }, + { + "epoch": 0.9113739468567725, + "grad_norm": 0.0369020476937294, + "learning_rate": 0.0001715918808227193, + "loss": 0.289, + "step": 11250 + }, + { + "epoch": 0.9114549578742709, + "grad_norm": 0.036454979330301285, + "learning_rate": 0.00017158738017012467, + "loss": 0.3271, + "step": 11251 + }, + { + "epoch": 0.9115359688917692, + "grad_norm": 0.034425899386405945, + "learning_rate": 0.00017158287951753005, + "loss": 0.372, + "step": 11252 + }, + { + "epoch": 0.9116169799092677, + "grad_norm": 0.03651287779211998, + "learning_rate": 0.0001715783788649354, + "loss": 0.3525, + "step": 11253 + }, + { + "epoch": 0.9116979909267661, + "grad_norm": 0.03623446449637413, + "learning_rate": 0.0001715738782123408, + "loss": 0.3519, + "step": 11254 + }, + { + "epoch": 0.9117790019442644, + "grad_norm": 0.03538570553064346, + "learning_rate": 0.0001715693775597462, + "loss": 0.3524, + "step": 11255 + }, + { + "epoch": 0.9118600129617628, + "grad_norm": 0.038185957819223404, + "learning_rate": 0.00017156487690715155, + "loss": 0.3425, + "step": 11256 + }, + { + "epoch": 0.9119410239792611, + "grad_norm": 0.0360933318734169, + "learning_rate": 0.00017156037625455693, + "loss": 0.3282, + "step": 11257 + }, + { + "epoch": 0.9120220349967596, + "grad_norm": 0.03948797285556793, + "learning_rate": 0.0001715558756019623, + "loss": 0.4002, + "step": 11258 + }, + { + "epoch": 0.912103046014258, + "grad_norm": 0.039010416716337204, + "learning_rate": 0.00017155137494936765, + "loss": 0.3217, + "step": 11259 + }, + { + "epoch": 0.9121840570317563, + "grad_norm": 0.03200877457857132, + "learning_rate": 0.00017154687429677304, + "loss": 0.3044, + "step": 11260 + }, + { + "epoch": 0.9122650680492547, + "grad_norm": 0.03211471065878868, + "learning_rate": 0.00017154237364417843, + "loss": 0.3395, + "step": 11261 + }, + { + "epoch": 0.912346079066753, + "grad_norm": 0.030386080965399742, + "learning_rate": 0.0001715378729915838, + "loss": 0.2794, + "step": 11262 + }, + { + "epoch": 0.9124270900842515, + "grad_norm": 0.037416622042655945, + "learning_rate": 0.00017153337233898918, + "loss": 0.3557, + "step": 11263 + }, + { + "epoch": 0.9125081011017498, + "grad_norm": 0.03650791198015213, + "learning_rate": 0.00017152887168639454, + "loss": 0.3542, + "step": 11264 + }, + { + "epoch": 0.9125891121192482, + "grad_norm": 0.03503980487585068, + "learning_rate": 0.0001715243710337999, + "loss": 0.3502, + "step": 11265 + }, + { + "epoch": 0.9126701231367466, + "grad_norm": 0.033888183534145355, + "learning_rate": 
0.00017151987038120528, + "loss": 0.3508, + "step": 11266 + }, + { + "epoch": 0.912751134154245, + "grad_norm": 0.03710508719086647, + "learning_rate": 0.00017151536972861067, + "loss": 0.3347, + "step": 11267 + }, + { + "epoch": 0.9128321451717434, + "grad_norm": 0.03084034100174904, + "learning_rate": 0.00017151086907601603, + "loss": 0.2781, + "step": 11268 + }, + { + "epoch": 0.9129131561892417, + "grad_norm": 0.03391794487833977, + "learning_rate": 0.00017150636842342142, + "loss": 0.3406, + "step": 11269 + }, + { + "epoch": 0.9129941672067401, + "grad_norm": 0.03140241652727127, + "learning_rate": 0.00017150186777082678, + "loss": 0.3048, + "step": 11270 + }, + { + "epoch": 0.9130751782242384, + "grad_norm": 0.04186585173010826, + "learning_rate": 0.00017149736711823214, + "loss": 0.3957, + "step": 11271 + }, + { + "epoch": 0.9131561892417369, + "grad_norm": 0.03869124501943588, + "learning_rate": 0.00017149286646563752, + "loss": 0.3725, + "step": 11272 + }, + { + "epoch": 0.9132372002592353, + "grad_norm": 0.038467105478048325, + "learning_rate": 0.0001714883658130429, + "loss": 0.3514, + "step": 11273 + }, + { + "epoch": 0.9133182112767336, + "grad_norm": 0.03266032412648201, + "learning_rate": 0.00017148386516044827, + "loss": 0.3406, + "step": 11274 + }, + { + "epoch": 0.913399222294232, + "grad_norm": 0.03216484189033508, + "learning_rate": 0.00017147936450785366, + "loss": 0.3249, + "step": 11275 + }, + { + "epoch": 0.9134802333117304, + "grad_norm": 0.03364231809973717, + "learning_rate": 0.00017147486385525902, + "loss": 0.309, + "step": 11276 + }, + { + "epoch": 0.9135612443292288, + "grad_norm": 0.03598964586853981, + "learning_rate": 0.00017147036320266438, + "loss": 0.3246, + "step": 11277 + }, + { + "epoch": 0.9136422553467272, + "grad_norm": 0.03309572860598564, + "learning_rate": 0.00017146586255006977, + "loss": 0.2848, + "step": 11278 + }, + { + "epoch": 0.9137232663642255, + "grad_norm": 0.03358837962150574, + "learning_rate": 0.00017146136189747515, + "loss": 0.3364, + "step": 11279 + }, + { + "epoch": 0.9138042773817239, + "grad_norm": 0.03198292478919029, + "learning_rate": 0.0001714568612448805, + "loss": 0.2818, + "step": 11280 + }, + { + "epoch": 0.9138852883992223, + "grad_norm": 0.03653620928525925, + "learning_rate": 0.0001714523605922859, + "loss": 0.4019, + "step": 11281 + }, + { + "epoch": 0.9139662994167207, + "grad_norm": 0.031275875866413116, + "learning_rate": 0.00017144785993969126, + "loss": 0.3208, + "step": 11282 + }, + { + "epoch": 0.914047310434219, + "grad_norm": 0.040892936289310455, + "learning_rate": 0.00017144335928709662, + "loss": 0.3516, + "step": 11283 + }, + { + "epoch": 0.9141283214517174, + "grad_norm": 0.032283004373311996, + "learning_rate": 0.000171438858634502, + "loss": 0.3202, + "step": 11284 + }, + { + "epoch": 0.9142093324692158, + "grad_norm": 0.03630262613296509, + "learning_rate": 0.0001714343579819074, + "loss": 0.3574, + "step": 11285 + }, + { + "epoch": 0.9142903434867142, + "grad_norm": 0.03689377382397652, + "learning_rate": 0.00017142985732931275, + "loss": 0.3583, + "step": 11286 + }, + { + "epoch": 0.9143713545042126, + "grad_norm": 0.0360412560403347, + "learning_rate": 0.00017142535667671814, + "loss": 0.3132, + "step": 11287 + }, + { + "epoch": 0.9144523655217109, + "grad_norm": 0.035145342350006104, + "learning_rate": 0.0001714208560241235, + "loss": 0.3105, + "step": 11288 + }, + { + "epoch": 0.9145333765392093, + "grad_norm": 0.02984432876110077, + "learning_rate": 0.00017141635537152886, + "loss": 0.3126, 
+ "step": 11289 + }, + { + "epoch": 0.9146143875567078, + "grad_norm": 0.03315803408622742, + "learning_rate": 0.00017141185471893425, + "loss": 0.3362, + "step": 11290 + }, + { + "epoch": 0.9146953985742061, + "grad_norm": 0.031095709651708603, + "learning_rate": 0.00017140735406633964, + "loss": 0.3677, + "step": 11291 + }, + { + "epoch": 0.9147764095917045, + "grad_norm": 0.03344155102968216, + "learning_rate": 0.000171402853413745, + "loss": 0.3581, + "step": 11292 + }, + { + "epoch": 0.9148574206092028, + "grad_norm": 0.03367290273308754, + "learning_rate": 0.00017139835276115038, + "loss": 0.3454, + "step": 11293 + }, + { + "epoch": 0.9149384316267012, + "grad_norm": 0.035224586725234985, + "learning_rate": 0.00017139385210855574, + "loss": 0.3567, + "step": 11294 + }, + { + "epoch": 0.9150194426441997, + "grad_norm": 0.033881694078445435, + "learning_rate": 0.00017138935145596113, + "loss": 0.2955, + "step": 11295 + }, + { + "epoch": 0.915100453661698, + "grad_norm": 0.037541892379522324, + "learning_rate": 0.0001713848508033665, + "loss": 0.3421, + "step": 11296 + }, + { + "epoch": 0.9151814646791964, + "grad_norm": 0.03299560025334358, + "learning_rate": 0.00017138035015077188, + "loss": 0.2979, + "step": 11297 + }, + { + "epoch": 0.9152624756966947, + "grad_norm": 0.04041426256299019, + "learning_rate": 0.00017137584949817724, + "loss": 0.3696, + "step": 11298 + }, + { + "epoch": 0.9153434867141931, + "grad_norm": 0.03309905156493187, + "learning_rate": 0.00017137134884558263, + "loss": 0.2828, + "step": 11299 + }, + { + "epoch": 0.9154244977316915, + "grad_norm": 0.031124910339713097, + "learning_rate": 0.00017136684819298799, + "loss": 0.2948, + "step": 11300 + }, + { + "epoch": 0.9155055087491899, + "grad_norm": 0.038336314260959625, + "learning_rate": 0.00017136234754039337, + "loss": 0.3329, + "step": 11301 + }, + { + "epoch": 0.9155865197666883, + "grad_norm": 0.034365057945251465, + "learning_rate": 0.00017135784688779873, + "loss": 0.3208, + "step": 11302 + }, + { + "epoch": 0.9156675307841866, + "grad_norm": 0.032800037413835526, + "learning_rate": 0.00017135334623520412, + "loss": 0.3003, + "step": 11303 + }, + { + "epoch": 0.9157485418016851, + "grad_norm": 0.03155320882797241, + "learning_rate": 0.00017134884558260948, + "loss": 0.2989, + "step": 11304 + }, + { + "epoch": 0.9158295528191834, + "grad_norm": 0.040037475526332855, + "learning_rate": 0.00017134434493001487, + "loss": 0.3596, + "step": 11305 + }, + { + "epoch": 0.9159105638366818, + "grad_norm": 0.03675394132733345, + "learning_rate": 0.00017133984427742023, + "loss": 0.3357, + "step": 11306 + }, + { + "epoch": 0.9159915748541801, + "grad_norm": 0.03477318212389946, + "learning_rate": 0.00017133534362482561, + "loss": 0.3089, + "step": 11307 + }, + { + "epoch": 0.9160725858716785, + "grad_norm": 0.036784540861845016, + "learning_rate": 0.00017133084297223097, + "loss": 0.3537, + "step": 11308 + }, + { + "epoch": 0.916153596889177, + "grad_norm": 0.03151548653841019, + "learning_rate": 0.00017132634231963636, + "loss": 0.3111, + "step": 11309 + }, + { + "epoch": 0.9162346079066753, + "grad_norm": 0.03540056571364403, + "learning_rate": 0.00017132184166704172, + "loss": 0.3867, + "step": 11310 + }, + { + "epoch": 0.9163156189241737, + "grad_norm": 0.035191185772418976, + "learning_rate": 0.0001713173410144471, + "loss": 0.3107, + "step": 11311 + }, + { + "epoch": 0.916396629941672, + "grad_norm": 0.03517398610711098, + "learning_rate": 0.00017131284036185247, + "loss": 0.3428, + "step": 11312 + }, + { + 
"epoch": 0.9164776409591704, + "grad_norm": 0.03177648410201073, + "learning_rate": 0.00017130833970925786, + "loss": 0.296, + "step": 11313 + }, + { + "epoch": 0.9165586519766689, + "grad_norm": 0.034015148878097534, + "learning_rate": 0.00017130383905666322, + "loss": 0.3476, + "step": 11314 + }, + { + "epoch": 0.9166396629941672, + "grad_norm": 0.029152886942029, + "learning_rate": 0.0001712993384040686, + "loss": 0.2802, + "step": 11315 + }, + { + "epoch": 0.9167206740116656, + "grad_norm": 0.030134642496705055, + "learning_rate": 0.00017129483775147396, + "loss": 0.3336, + "step": 11316 + }, + { + "epoch": 0.9168016850291639, + "grad_norm": 0.037936773151159286, + "learning_rate": 0.00017129033709887935, + "loss": 0.3464, + "step": 11317 + }, + { + "epoch": 0.9168826960466624, + "grad_norm": 0.037356019020080566, + "learning_rate": 0.0001712858364462847, + "loss": 0.4063, + "step": 11318 + }, + { + "epoch": 0.9169637070641607, + "grad_norm": 0.04031597450375557, + "learning_rate": 0.0001712813357936901, + "loss": 0.3689, + "step": 11319 + }, + { + "epoch": 0.9170447180816591, + "grad_norm": 0.035003118216991425, + "learning_rate": 0.00017127683514109546, + "loss": 0.3517, + "step": 11320 + }, + { + "epoch": 0.9171257290991575, + "grad_norm": 0.04067349061369896, + "learning_rate": 0.00017127233448850084, + "loss": 0.3245, + "step": 11321 + }, + { + "epoch": 0.9172067401166558, + "grad_norm": 0.037310849875211716, + "learning_rate": 0.0001712678338359062, + "loss": 0.3059, + "step": 11322 + }, + { + "epoch": 0.9172877511341543, + "grad_norm": 0.033759478479623795, + "learning_rate": 0.0001712633331833116, + "loss": 0.3194, + "step": 11323 + }, + { + "epoch": 0.9173687621516526, + "grad_norm": 0.03483540192246437, + "learning_rate": 0.00017125883253071698, + "loss": 0.3311, + "step": 11324 + }, + { + "epoch": 0.917449773169151, + "grad_norm": 0.03583363816142082, + "learning_rate": 0.00017125433187812234, + "loss": 0.3828, + "step": 11325 + }, + { + "epoch": 0.9175307841866494, + "grad_norm": 0.03208068013191223, + "learning_rate": 0.00017124983122552773, + "loss": 0.2907, + "step": 11326 + }, + { + "epoch": 0.9176117952041478, + "grad_norm": 0.03374846279621124, + "learning_rate": 0.00017124533057293309, + "loss": 0.2758, + "step": 11327 + }, + { + "epoch": 0.9176928062216462, + "grad_norm": 0.03522154688835144, + "learning_rate": 0.00017124082992033845, + "loss": 0.3436, + "step": 11328 + }, + { + "epoch": 0.9177738172391445, + "grad_norm": 0.03557585924863815, + "learning_rate": 0.00017123632926774383, + "loss": 0.3376, + "step": 11329 + }, + { + "epoch": 0.9178548282566429, + "grad_norm": 0.03580590337514877, + "learning_rate": 0.00017123182861514922, + "loss": 0.3489, + "step": 11330 + }, + { + "epoch": 0.9179358392741412, + "grad_norm": 0.036937177181243896, + "learning_rate": 0.00017122732796255458, + "loss": 0.2889, + "step": 11331 + }, + { + "epoch": 0.9180168502916397, + "grad_norm": 0.03509330749511719, + "learning_rate": 0.00017122282730995997, + "loss": 0.318, + "step": 11332 + }, + { + "epoch": 0.9180978613091381, + "grad_norm": 0.03411509841680527, + "learning_rate": 0.00017121832665736533, + "loss": 0.321, + "step": 11333 + }, + { + "epoch": 0.9181788723266364, + "grad_norm": 0.034354668110609055, + "learning_rate": 0.0001712138260047707, + "loss": 0.3503, + "step": 11334 + }, + { + "epoch": 0.9182598833441348, + "grad_norm": 0.03821565955877304, + "learning_rate": 0.00017120932535217607, + "loss": 0.3174, + "step": 11335 + }, + { + "epoch": 0.9183408943616331, + 
"grad_norm": 0.03345862030982971, + "learning_rate": 0.00017120482469958146, + "loss": 0.3462, + "step": 11336 + }, + { + "epoch": 0.9184219053791316, + "grad_norm": 0.03985141962766647, + "learning_rate": 0.00017120032404698682, + "loss": 0.3578, + "step": 11337 + }, + { + "epoch": 0.91850291639663, + "grad_norm": 0.04216769337654114, + "learning_rate": 0.0001711958233943922, + "loss": 0.3522, + "step": 11338 + }, + { + "epoch": 0.9185839274141283, + "grad_norm": 0.030603772029280663, + "learning_rate": 0.00017119132274179757, + "loss": 0.3129, + "step": 11339 + }, + { + "epoch": 0.9186649384316267, + "grad_norm": 0.03371666744351387, + "learning_rate": 0.00017118682208920293, + "loss": 0.3322, + "step": 11340 + }, + { + "epoch": 0.9187459494491251, + "grad_norm": 0.03615809231996536, + "learning_rate": 0.00017118232143660832, + "loss": 0.3615, + "step": 11341 + }, + { + "epoch": 0.9188269604666235, + "grad_norm": 0.03493629768490791, + "learning_rate": 0.0001711778207840137, + "loss": 0.3396, + "step": 11342 + }, + { + "epoch": 0.9189079714841218, + "grad_norm": 0.03149208053946495, + "learning_rate": 0.00017117332013141906, + "loss": 0.3103, + "step": 11343 + }, + { + "epoch": 0.9189889825016202, + "grad_norm": 0.03813861310482025, + "learning_rate": 0.00017116881947882445, + "loss": 0.3619, + "step": 11344 + }, + { + "epoch": 0.9190699935191186, + "grad_norm": 0.034407857805490494, + "learning_rate": 0.0001711643188262298, + "loss": 0.3453, + "step": 11345 + }, + { + "epoch": 0.919151004536617, + "grad_norm": 0.034576304256916046, + "learning_rate": 0.00017115981817363517, + "loss": 0.3239, + "step": 11346 + }, + { + "epoch": 0.9192320155541154, + "grad_norm": 0.03313819319009781, + "learning_rate": 0.00017115531752104056, + "loss": 0.2812, + "step": 11347 + }, + { + "epoch": 0.9193130265716137, + "grad_norm": 0.032415058463811874, + "learning_rate": 0.00017115081686844595, + "loss": 0.3224, + "step": 11348 + }, + { + "epoch": 0.9193940375891121, + "grad_norm": 0.03585551306605339, + "learning_rate": 0.0001711463162158513, + "loss": 0.3718, + "step": 11349 + }, + { + "epoch": 0.9194750486066104, + "grad_norm": 0.03217494115233421, + "learning_rate": 0.0001711418155632567, + "loss": 0.3295, + "step": 11350 + }, + { + "epoch": 0.9195560596241089, + "grad_norm": 0.03694257512688637, + "learning_rate": 0.00017113731491066205, + "loss": 0.348, + "step": 11351 + }, + { + "epoch": 0.9196370706416073, + "grad_norm": 0.031091250479221344, + "learning_rate": 0.0001711328142580674, + "loss": 0.3289, + "step": 11352 + }, + { + "epoch": 0.9197180816591056, + "grad_norm": 0.034045275300741196, + "learning_rate": 0.0001711283136054728, + "loss": 0.3563, + "step": 11353 + }, + { + "epoch": 0.919799092676604, + "grad_norm": 0.03653084114193916, + "learning_rate": 0.0001711238129528782, + "loss": 0.3752, + "step": 11354 + }, + { + "epoch": 0.9198801036941024, + "grad_norm": 0.03138238564133644, + "learning_rate": 0.00017111931230028355, + "loss": 0.3176, + "step": 11355 + }, + { + "epoch": 0.9199611147116008, + "grad_norm": 0.030700651928782463, + "learning_rate": 0.00017111481164768893, + "loss": 0.2794, + "step": 11356 + }, + { + "epoch": 0.9200421257290992, + "grad_norm": 0.03846675902605057, + "learning_rate": 0.0001711103109950943, + "loss": 0.3192, + "step": 11357 + }, + { + "epoch": 0.9201231367465975, + "grad_norm": 0.03891396149992943, + "learning_rate": 0.00017110581034249965, + "loss": 0.3276, + "step": 11358 + }, + { + "epoch": 0.9202041477640959, + "grad_norm": 0.03765661641955376, + 
"learning_rate": 0.00017110130968990504, + "loss": 0.3326, + "step": 11359 + }, + { + "epoch": 0.9202851587815943, + "grad_norm": 0.03686787560582161, + "learning_rate": 0.00017109680903731043, + "loss": 0.3327, + "step": 11360 + }, + { + "epoch": 0.9203661697990927, + "grad_norm": 0.03930029645562172, + "learning_rate": 0.0001710923083847158, + "loss": 0.3086, + "step": 11361 + }, + { + "epoch": 0.920447180816591, + "grad_norm": 0.032593511044979095, + "learning_rate": 0.00017108780773212118, + "loss": 0.2982, + "step": 11362 + }, + { + "epoch": 0.9205281918340894, + "grad_norm": 0.04143695533275604, + "learning_rate": 0.00017108330707952654, + "loss": 0.3586, + "step": 11363 + }, + { + "epoch": 0.9206092028515879, + "grad_norm": 0.0322968028485775, + "learning_rate": 0.0001710788064269319, + "loss": 0.3224, + "step": 11364 + }, + { + "epoch": 0.9206902138690862, + "grad_norm": 0.037244465202093124, + "learning_rate": 0.00017107430577433728, + "loss": 0.3114, + "step": 11365 + }, + { + "epoch": 0.9207712248865846, + "grad_norm": 0.03762952238321304, + "learning_rate": 0.00017106980512174267, + "loss": 0.3404, + "step": 11366 + }, + { + "epoch": 0.9208522359040829, + "grad_norm": 0.03405272588133812, + "learning_rate": 0.00017106530446914803, + "loss": 0.3496, + "step": 11367 + }, + { + "epoch": 0.9209332469215813, + "grad_norm": 0.03447031229734421, + "learning_rate": 0.00017106080381655342, + "loss": 0.3883, + "step": 11368 + }, + { + "epoch": 0.9210142579390798, + "grad_norm": 0.04121264070272446, + "learning_rate": 0.00017105630316395878, + "loss": 0.3286, + "step": 11369 + }, + { + "epoch": 0.9210952689565781, + "grad_norm": 0.03879169002175331, + "learning_rate": 0.00017105180251136414, + "loss": 0.3051, + "step": 11370 + }, + { + "epoch": 0.9211762799740765, + "grad_norm": 0.03608115017414093, + "learning_rate": 0.00017104730185876952, + "loss": 0.318, + "step": 11371 + }, + { + "epoch": 0.9212572909915748, + "grad_norm": 0.032185833901166916, + "learning_rate": 0.0001710428012061749, + "loss": 0.3164, + "step": 11372 + }, + { + "epoch": 0.9213383020090732, + "grad_norm": 0.035732269287109375, + "learning_rate": 0.00017103830055358027, + "loss": 0.3474, + "step": 11373 + }, + { + "epoch": 0.9214193130265717, + "grad_norm": 0.03568696603178978, + "learning_rate": 0.00017103379990098566, + "loss": 0.3533, + "step": 11374 + }, + { + "epoch": 0.92150032404407, + "grad_norm": 0.03487107902765274, + "learning_rate": 0.00017102929924839102, + "loss": 0.2939, + "step": 11375 + }, + { + "epoch": 0.9215813350615684, + "grad_norm": 0.03586133196949959, + "learning_rate": 0.0001710247985957964, + "loss": 0.3227, + "step": 11376 + }, + { + "epoch": 0.9216623460790667, + "grad_norm": 0.039440032094717026, + "learning_rate": 0.00017102029794320177, + "loss": 0.3714, + "step": 11377 + }, + { + "epoch": 0.9217433570965652, + "grad_norm": 0.03375830128788948, + "learning_rate": 0.00017101579729060715, + "loss": 0.3298, + "step": 11378 + }, + { + "epoch": 0.9218243681140635, + "grad_norm": 0.03659455105662346, + "learning_rate": 0.0001710112966380125, + "loss": 0.3109, + "step": 11379 + }, + { + "epoch": 0.9219053791315619, + "grad_norm": 0.030281485989689827, + "learning_rate": 0.0001710067959854179, + "loss": 0.321, + "step": 11380 + }, + { + "epoch": 0.9219863901490603, + "grad_norm": 0.03286823630332947, + "learning_rate": 0.00017100229533282326, + "loss": 0.3255, + "step": 11381 + }, + { + "epoch": 0.9220674011665586, + "grad_norm": 0.02968735620379448, + "learning_rate": 0.00017099779468022865, 
+ "loss": 0.2788, + "step": 11382 + }, + { + "epoch": 0.9221484121840571, + "grad_norm": 0.034185122698545456, + "learning_rate": 0.000170993294027634, + "loss": 0.3215, + "step": 11383 + }, + { + "epoch": 0.9222294232015554, + "grad_norm": 0.030537785962224007, + "learning_rate": 0.0001709887933750394, + "loss": 0.3023, + "step": 11384 + }, + { + "epoch": 0.9223104342190538, + "grad_norm": 0.03613237664103508, + "learning_rate": 0.00017098429272244476, + "loss": 0.338, + "step": 11385 + }, + { + "epoch": 0.9223914452365521, + "grad_norm": 0.03522944450378418, + "learning_rate": 0.00017097979206985014, + "loss": 0.3225, + "step": 11386 + }, + { + "epoch": 0.9224724562540505, + "grad_norm": 0.0329560749232769, + "learning_rate": 0.0001709752914172555, + "loss": 0.3556, + "step": 11387 + }, + { + "epoch": 0.922553467271549, + "grad_norm": 0.03431179001927376, + "learning_rate": 0.0001709707907646609, + "loss": 0.3191, + "step": 11388 + }, + { + "epoch": 0.9226344782890473, + "grad_norm": 0.03724316507577896, + "learning_rate": 0.00017096629011206628, + "loss": 0.3533, + "step": 11389 + }, + { + "epoch": 0.9227154893065457, + "grad_norm": 0.034799784421920776, + "learning_rate": 0.00017096178945947164, + "loss": 0.3598, + "step": 11390 + }, + { + "epoch": 0.922796500324044, + "grad_norm": 0.03583141788840294, + "learning_rate": 0.000170957288806877, + "loss": 0.3348, + "step": 11391 + }, + { + "epoch": 0.9228775113415425, + "grad_norm": 0.033143218606710434, + "learning_rate": 0.00017095278815428238, + "loss": 0.3405, + "step": 11392 + }, + { + "epoch": 0.9229585223590409, + "grad_norm": 0.03624225780367851, + "learning_rate": 0.00017094828750168774, + "loss": 0.3299, + "step": 11393 + }, + { + "epoch": 0.9230395333765392, + "grad_norm": 0.04120873287320137, + "learning_rate": 0.00017094378684909313, + "loss": 0.3725, + "step": 11394 + }, + { + "epoch": 0.9231205443940376, + "grad_norm": 0.038186412304639816, + "learning_rate": 0.00017093928619649852, + "loss": 0.3405, + "step": 11395 + }, + { + "epoch": 0.9232015554115359, + "grad_norm": 0.03770510479807854, + "learning_rate": 0.00017093478554390388, + "loss": 0.3575, + "step": 11396 + }, + { + "epoch": 0.9232825664290344, + "grad_norm": 0.033232200890779495, + "learning_rate": 0.00017093028489130924, + "loss": 0.2977, + "step": 11397 + }, + { + "epoch": 0.9233635774465327, + "grad_norm": 0.02936842478811741, + "learning_rate": 0.00017092578423871463, + "loss": 0.2745, + "step": 11398 + }, + { + "epoch": 0.9234445884640311, + "grad_norm": 0.03786390647292137, + "learning_rate": 0.00017092128358611999, + "loss": 0.3699, + "step": 11399 + }, + { + "epoch": 0.9235255994815295, + "grad_norm": 0.03299250081181526, + "learning_rate": 0.00017091678293352537, + "loss": 0.3151, + "step": 11400 + }, + { + "epoch": 0.9236066104990278, + "grad_norm": 0.03264123201370239, + "learning_rate": 0.00017091228228093076, + "loss": 0.3312, + "step": 11401 + }, + { + "epoch": 0.9236876215165263, + "grad_norm": 0.039083484560251236, + "learning_rate": 0.00017090778162833612, + "loss": 0.3418, + "step": 11402 + }, + { + "epoch": 0.9237686325340246, + "grad_norm": 0.03410564363002777, + "learning_rate": 0.00017090328097574148, + "loss": 0.345, + "step": 11403 + }, + { + "epoch": 0.923849643551523, + "grad_norm": 0.036052338778972626, + "learning_rate": 0.00017089878032314687, + "loss": 0.3506, + "step": 11404 + }, + { + "epoch": 0.9239306545690213, + "grad_norm": 0.03703179582953453, + "learning_rate": 0.00017089427967055225, + "loss": 0.3332, + "step": 11405 + }, + 
{ + "epoch": 0.9240116655865198, + "grad_norm": 0.03270883858203888, + "learning_rate": 0.00017088977901795761, + "loss": 0.3088, + "step": 11406 + }, + { + "epoch": 0.9240926766040182, + "grad_norm": 0.03068448416888714, + "learning_rate": 0.000170885278365363, + "loss": 0.3115, + "step": 11407 + }, + { + "epoch": 0.9241736876215165, + "grad_norm": 0.037590015679597855, + "learning_rate": 0.00017088077771276836, + "loss": 0.3389, + "step": 11408 + }, + { + "epoch": 0.9242546986390149, + "grad_norm": 0.035884857177734375, + "learning_rate": 0.00017087627706017372, + "loss": 0.3257, + "step": 11409 + }, + { + "epoch": 0.9243357096565132, + "grad_norm": 0.03558656573295593, + "learning_rate": 0.0001708717764075791, + "loss": 0.3096, + "step": 11410 + }, + { + "epoch": 0.9244167206740117, + "grad_norm": 0.03657744452357292, + "learning_rate": 0.0001708672757549845, + "loss": 0.3602, + "step": 11411 + }, + { + "epoch": 0.9244977316915101, + "grad_norm": 0.03377017378807068, + "learning_rate": 0.00017086277510238986, + "loss": 0.253, + "step": 11412 + }, + { + "epoch": 0.9245787427090084, + "grad_norm": 0.03472224622964859, + "learning_rate": 0.00017085827444979524, + "loss": 0.3189, + "step": 11413 + }, + { + "epoch": 0.9246597537265068, + "grad_norm": 0.033812422305345535, + "learning_rate": 0.0001708537737972006, + "loss": 0.3387, + "step": 11414 + }, + { + "epoch": 0.9247407647440052, + "grad_norm": 0.03222503140568733, + "learning_rate": 0.00017084927314460596, + "loss": 0.3508, + "step": 11415 + }, + { + "epoch": 0.9248217757615036, + "grad_norm": 0.04204294830560684, + "learning_rate": 0.00017084477249201135, + "loss": 0.3582, + "step": 11416 + }, + { + "epoch": 0.924902786779002, + "grad_norm": 0.034957461059093475, + "learning_rate": 0.00017084027183941674, + "loss": 0.3077, + "step": 11417 + }, + { + "epoch": 0.9249837977965003, + "grad_norm": 0.039951667189598083, + "learning_rate": 0.0001708357711868221, + "loss": 0.37, + "step": 11418 + }, + { + "epoch": 0.9250648088139987, + "grad_norm": 0.034192297607660294, + "learning_rate": 0.00017083127053422748, + "loss": 0.2993, + "step": 11419 + }, + { + "epoch": 0.9251458198314971, + "grad_norm": 0.03962968662381172, + "learning_rate": 0.00017082676988163284, + "loss": 0.4001, + "step": 11420 + }, + { + "epoch": 0.9252268308489955, + "grad_norm": 0.03515026345849037, + "learning_rate": 0.0001708222692290382, + "loss": 0.3177, + "step": 11421 + }, + { + "epoch": 0.9253078418664938, + "grad_norm": 0.038349054753780365, + "learning_rate": 0.0001708177685764436, + "loss": 0.3432, + "step": 11422 + }, + { + "epoch": 0.9253888528839922, + "grad_norm": 0.035203397274017334, + "learning_rate": 0.00017081326792384898, + "loss": 0.3304, + "step": 11423 + }, + { + "epoch": 0.9254698639014906, + "grad_norm": 0.029140474274754524, + "learning_rate": 0.00017080876727125434, + "loss": 0.2933, + "step": 11424 + }, + { + "epoch": 0.925550874918989, + "grad_norm": 0.03449895232915878, + "learning_rate": 0.00017080426661865973, + "loss": 0.3045, + "step": 11425 + }, + { + "epoch": 0.9256318859364874, + "grad_norm": 0.03640986233949661, + "learning_rate": 0.00017079976596606509, + "loss": 0.3374, + "step": 11426 + }, + { + "epoch": 0.9257128969539857, + "grad_norm": 0.043093591928482056, + "learning_rate": 0.00017079526531347045, + "loss": 0.3548, + "step": 11427 + }, + { + "epoch": 0.9257939079714841, + "grad_norm": 0.030513029545545578, + "learning_rate": 0.00017079076466087583, + "loss": 0.2871, + "step": 11428 + }, + { + "epoch": 0.9258749189889826, + 
"grad_norm": 0.029281042516231537, + "learning_rate": 0.00017078626400828122, + "loss": 0.2716, + "step": 11429 + }, + { + "epoch": 0.9259559300064809, + "grad_norm": 0.04084150120615959, + "learning_rate": 0.00017078176335568658, + "loss": 0.3118, + "step": 11430 + }, + { + "epoch": 0.9260369410239793, + "grad_norm": 0.03145955502986908, + "learning_rate": 0.00017077726270309197, + "loss": 0.319, + "step": 11431 + }, + { + "epoch": 0.9261179520414776, + "grad_norm": 0.03483365476131439, + "learning_rate": 0.00017077276205049733, + "loss": 0.3576, + "step": 11432 + }, + { + "epoch": 0.926198963058976, + "grad_norm": 0.038508594036102295, + "learning_rate": 0.0001707682613979027, + "loss": 0.3321, + "step": 11433 + }, + { + "epoch": 0.9262799740764744, + "grad_norm": 0.03275780379772186, + "learning_rate": 0.00017076376074530808, + "loss": 0.3241, + "step": 11434 + }, + { + "epoch": 0.9263609850939728, + "grad_norm": 0.028733495622873306, + "learning_rate": 0.00017075926009271346, + "loss": 0.28, + "step": 11435 + }, + { + "epoch": 0.9264419961114712, + "grad_norm": 0.03230396658182144, + "learning_rate": 0.00017075475944011882, + "loss": 0.2993, + "step": 11436 + }, + { + "epoch": 0.9265230071289695, + "grad_norm": 0.03491473197937012, + "learning_rate": 0.0001707502587875242, + "loss": 0.3475, + "step": 11437 + }, + { + "epoch": 0.9266040181464679, + "grad_norm": 0.03464370220899582, + "learning_rate": 0.00017074575813492957, + "loss": 0.3335, + "step": 11438 + }, + { + "epoch": 0.9266850291639663, + "grad_norm": 0.032484348863363266, + "learning_rate": 0.00017074125748233493, + "loss": 0.3357, + "step": 11439 + }, + { + "epoch": 0.9267660401814647, + "grad_norm": 0.035478755831718445, + "learning_rate": 0.00017073675682974032, + "loss": 0.3395, + "step": 11440 + }, + { + "epoch": 0.926847051198963, + "grad_norm": 0.03466634079813957, + "learning_rate": 0.0001707322561771457, + "loss": 0.3336, + "step": 11441 + }, + { + "epoch": 0.9269280622164614, + "grad_norm": 0.0307964738458395, + "learning_rate": 0.00017072775552455106, + "loss": 0.291, + "step": 11442 + }, + { + "epoch": 0.9270090732339599, + "grad_norm": 0.036242470145225525, + "learning_rate": 0.00017072325487195645, + "loss": 0.3682, + "step": 11443 + }, + { + "epoch": 0.9270900842514582, + "grad_norm": 0.03543122857809067, + "learning_rate": 0.0001707187542193618, + "loss": 0.3305, + "step": 11444 + }, + { + "epoch": 0.9271710952689566, + "grad_norm": 0.03091447241604328, + "learning_rate": 0.00017071425356676717, + "loss": 0.3281, + "step": 11445 + }, + { + "epoch": 0.9272521062864549, + "grad_norm": 0.03639169782400131, + "learning_rate": 0.00017070975291417256, + "loss": 0.351, + "step": 11446 + }, + { + "epoch": 0.9273331173039533, + "grad_norm": 0.03385458141565323, + "learning_rate": 0.00017070525226157795, + "loss": 0.3301, + "step": 11447 + }, + { + "epoch": 0.9274141283214518, + "grad_norm": 0.034979742020368576, + "learning_rate": 0.0001707007516089833, + "loss": 0.3356, + "step": 11448 + }, + { + "epoch": 0.9274951393389501, + "grad_norm": 0.03415704146027565, + "learning_rate": 0.0001706962509563887, + "loss": 0.3231, + "step": 11449 + }, + { + "epoch": 0.9275761503564485, + "grad_norm": 0.03785685822367668, + "learning_rate": 0.00017069175030379405, + "loss": 0.3528, + "step": 11450 + }, + { + "epoch": 0.9276571613739468, + "grad_norm": 0.03500499576330185, + "learning_rate": 0.0001706872496511994, + "loss": 0.3426, + "step": 11451 + }, + { + "epoch": 0.9277381723914452, + "grad_norm": 0.036131519824266434, + 
"learning_rate": 0.0001706827489986048, + "loss": 0.3472, + "step": 11452 + }, + { + "epoch": 0.9278191834089436, + "grad_norm": 0.03363850712776184, + "learning_rate": 0.0001706782483460102, + "loss": 0.3279, + "step": 11453 + }, + { + "epoch": 0.927900194426442, + "grad_norm": 0.035742562264204025, + "learning_rate": 0.00017067374769341555, + "loss": 0.3214, + "step": 11454 + }, + { + "epoch": 0.9279812054439404, + "grad_norm": 0.03095664642751217, + "learning_rate": 0.00017066924704082093, + "loss": 0.3056, + "step": 11455 + }, + { + "epoch": 0.9280622164614387, + "grad_norm": 0.03762781620025635, + "learning_rate": 0.0001706647463882263, + "loss": 0.3286, + "step": 11456 + }, + { + "epoch": 0.9281432274789372, + "grad_norm": 0.038630541414022446, + "learning_rate": 0.00017066024573563168, + "loss": 0.3846, + "step": 11457 + }, + { + "epoch": 0.9282242384964355, + "grad_norm": 0.038850728422403336, + "learning_rate": 0.00017065574508303707, + "loss": 0.3631, + "step": 11458 + }, + { + "epoch": 0.9283052495139339, + "grad_norm": 0.03348441794514656, + "learning_rate": 0.00017065124443044243, + "loss": 0.3132, + "step": 11459 + }, + { + "epoch": 0.9283862605314323, + "grad_norm": 0.0356043204665184, + "learning_rate": 0.0001706467437778478, + "loss": 0.3085, + "step": 11460 + }, + { + "epoch": 0.9284672715489306, + "grad_norm": 0.03675344958901405, + "learning_rate": 0.00017064224312525318, + "loss": 0.3378, + "step": 11461 + }, + { + "epoch": 0.9285482825664291, + "grad_norm": 0.03896648809313774, + "learning_rate": 0.00017063774247265854, + "loss": 0.3529, + "step": 11462 + }, + { + "epoch": 0.9286292935839274, + "grad_norm": 0.03709070757031441, + "learning_rate": 0.00017063324182006392, + "loss": 0.3499, + "step": 11463 + }, + { + "epoch": 0.9287103046014258, + "grad_norm": 0.03415576368570328, + "learning_rate": 0.0001706287411674693, + "loss": 0.3342, + "step": 11464 + }, + { + "epoch": 0.9287913156189241, + "grad_norm": 0.03336158022284508, + "learning_rate": 0.00017062424051487467, + "loss": 0.3098, + "step": 11465 + }, + { + "epoch": 0.9288723266364226, + "grad_norm": 0.03390232473611832, + "learning_rate": 0.00017061973986228003, + "loss": 0.3064, + "step": 11466 + }, + { + "epoch": 0.928953337653921, + "grad_norm": 0.03544805198907852, + "learning_rate": 0.00017061523920968542, + "loss": 0.3387, + "step": 11467 + }, + { + "epoch": 0.9290343486714193, + "grad_norm": 0.03590017184615135, + "learning_rate": 0.00017061073855709078, + "loss": 0.3063, + "step": 11468 + }, + { + "epoch": 0.9291153596889177, + "grad_norm": 0.031386904418468475, + "learning_rate": 0.00017060623790449616, + "loss": 0.2848, + "step": 11469 + }, + { + "epoch": 0.929196370706416, + "grad_norm": 0.03534477576613426, + "learning_rate": 0.00017060173725190155, + "loss": 0.36, + "step": 11470 + }, + { + "epoch": 0.9292773817239145, + "grad_norm": 0.033903393894433975, + "learning_rate": 0.0001705972365993069, + "loss": 0.2919, + "step": 11471 + }, + { + "epoch": 0.9293583927414129, + "grad_norm": 0.03623591363430023, + "learning_rate": 0.00017059273594671227, + "loss": 0.3357, + "step": 11472 + }, + { + "epoch": 0.9294394037589112, + "grad_norm": 0.03716109320521355, + "learning_rate": 0.00017058823529411766, + "loss": 0.3357, + "step": 11473 + }, + { + "epoch": 0.9295204147764096, + "grad_norm": 0.03469749540090561, + "learning_rate": 0.00017058373464152302, + "loss": 0.3176, + "step": 11474 + }, + { + "epoch": 0.9296014257939079, + "grad_norm": 0.03799695894122124, + "learning_rate": 0.0001705792339889284, + 
"loss": 0.3297, + "step": 11475 + }, + { + "epoch": 0.9296824368114064, + "grad_norm": 0.03861555457115173, + "learning_rate": 0.0001705747333363338, + "loss": 0.3193, + "step": 11476 + }, + { + "epoch": 0.9297634478289047, + "grad_norm": 0.02883889339864254, + "learning_rate": 0.00017057023268373915, + "loss": 0.2863, + "step": 11477 + }, + { + "epoch": 0.9298444588464031, + "grad_norm": 0.03888789564371109, + "learning_rate": 0.00017056573203114451, + "loss": 0.343, + "step": 11478 + }, + { + "epoch": 0.9299254698639015, + "grad_norm": 0.03772765025496483, + "learning_rate": 0.0001705612313785499, + "loss": 0.3638, + "step": 11479 + }, + { + "epoch": 0.9300064808813999, + "grad_norm": 0.03691214695572853, + "learning_rate": 0.0001705567307259553, + "loss": 0.3235, + "step": 11480 + }, + { + "epoch": 0.9300874918988983, + "grad_norm": 0.03321418911218643, + "learning_rate": 0.00017055223007336065, + "loss": 0.3747, + "step": 11481 + }, + { + "epoch": 0.9301685029163966, + "grad_norm": 0.03231301158666611, + "learning_rate": 0.00017054772942076604, + "loss": 0.3113, + "step": 11482 + }, + { + "epoch": 0.930249513933895, + "grad_norm": 0.03366365283727646, + "learning_rate": 0.0001705432287681714, + "loss": 0.3078, + "step": 11483 + }, + { + "epoch": 0.9303305249513933, + "grad_norm": 0.03471920266747475, + "learning_rate": 0.00017053872811557676, + "loss": 0.3472, + "step": 11484 + }, + { + "epoch": 0.9304115359688918, + "grad_norm": 0.03448682278394699, + "learning_rate": 0.00017053422746298214, + "loss": 0.3388, + "step": 11485 + }, + { + "epoch": 0.9304925469863902, + "grad_norm": 0.033348411321640015, + "learning_rate": 0.00017052972681038753, + "loss": 0.358, + "step": 11486 + }, + { + "epoch": 0.9305735580038885, + "grad_norm": 0.03339369595050812, + "learning_rate": 0.0001705252261577929, + "loss": 0.3165, + "step": 11487 + }, + { + "epoch": 0.9306545690213869, + "grad_norm": 0.033238619565963745, + "learning_rate": 0.00017052072550519828, + "loss": 0.3264, + "step": 11488 + }, + { + "epoch": 0.9307355800388852, + "grad_norm": 0.03498007729649544, + "learning_rate": 0.00017051622485260364, + "loss": 0.3196, + "step": 11489 + }, + { + "epoch": 0.9308165910563837, + "grad_norm": 0.03177583962678909, + "learning_rate": 0.000170511724200009, + "loss": 0.3158, + "step": 11490 + }, + { + "epoch": 0.9308976020738821, + "grad_norm": 0.036919716745615005, + "learning_rate": 0.00017050722354741438, + "loss": 0.3724, + "step": 11491 + }, + { + "epoch": 0.9309786130913804, + "grad_norm": 0.03319632634520531, + "learning_rate": 0.00017050272289481977, + "loss": 0.3294, + "step": 11492 + }, + { + "epoch": 0.9310596241088788, + "grad_norm": 0.03813881799578667, + "learning_rate": 0.00017049822224222513, + "loss": 0.3445, + "step": 11493 + }, + { + "epoch": 0.9311406351263772, + "grad_norm": 0.03665956109762192, + "learning_rate": 0.00017049372158963052, + "loss": 0.3549, + "step": 11494 + }, + { + "epoch": 0.9312216461438756, + "grad_norm": 0.033054448664188385, + "learning_rate": 0.00017048922093703588, + "loss": 0.3153, + "step": 11495 + }, + { + "epoch": 0.931302657161374, + "grad_norm": 0.033781781792640686, + "learning_rate": 0.00017048472028444124, + "loss": 0.3292, + "step": 11496 + }, + { + "epoch": 0.9313836681788723, + "grad_norm": 0.030734725296497345, + "learning_rate": 0.00017048021963184663, + "loss": 0.2866, + "step": 11497 + }, + { + "epoch": 0.9314646791963707, + "grad_norm": 0.0317283570766449, + "learning_rate": 0.000170475718979252, + "loss": 0.3138, + "step": 11498 + }, + { + 
"epoch": 0.9315456902138691, + "grad_norm": 0.035663917660713196, + "learning_rate": 0.00017047121832665737, + "loss": 0.329, + "step": 11499 + }, + { + "epoch": 0.9316267012313675, + "grad_norm": 0.03555869683623314, + "learning_rate": 0.00017046671767406276, + "loss": 0.3159, + "step": 11500 + }, + { + "epoch": 0.9317077122488658, + "grad_norm": 0.035777702927589417, + "learning_rate": 0.00017046221702146812, + "loss": 0.3305, + "step": 11501 + }, + { + "epoch": 0.9317887232663642, + "grad_norm": 0.029906559735536575, + "learning_rate": 0.00017045771636887348, + "loss": 0.3008, + "step": 11502 + }, + { + "epoch": 0.9318697342838627, + "grad_norm": 0.03649575635790825, + "learning_rate": 0.00017045321571627887, + "loss": 0.3771, + "step": 11503 + }, + { + "epoch": 0.931950745301361, + "grad_norm": 0.033256951719522476, + "learning_rate": 0.00017044871506368425, + "loss": 0.3219, + "step": 11504 + }, + { + "epoch": 0.9320317563188594, + "grad_norm": 0.0365782156586647, + "learning_rate": 0.00017044421441108961, + "loss": 0.3143, + "step": 11505 + }, + { + "epoch": 0.9321127673363577, + "grad_norm": 0.03269030153751373, + "learning_rate": 0.000170439713758495, + "loss": 0.2937, + "step": 11506 + }, + { + "epoch": 0.9321937783538561, + "grad_norm": 0.03456766530871391, + "learning_rate": 0.00017043521310590036, + "loss": 0.3096, + "step": 11507 + }, + { + "epoch": 0.9322747893713546, + "grad_norm": 0.035860177129507065, + "learning_rate": 0.00017043071245330572, + "loss": 0.3588, + "step": 11508 + }, + { + "epoch": 0.9323558003888529, + "grad_norm": 0.034222107380628586, + "learning_rate": 0.0001704262118007111, + "loss": 0.3318, + "step": 11509 + }, + { + "epoch": 0.9324368114063513, + "grad_norm": 0.04168862849473953, + "learning_rate": 0.0001704217111481165, + "loss": 0.3149, + "step": 11510 + }, + { + "epoch": 0.9325178224238496, + "grad_norm": 0.03862250596284866, + "learning_rate": 0.00017041721049552186, + "loss": 0.3747, + "step": 11511 + }, + { + "epoch": 0.932598833441348, + "grad_norm": 0.030834008008241653, + "learning_rate": 0.00017041270984292724, + "loss": 0.3011, + "step": 11512 + }, + { + "epoch": 0.9326798444588464, + "grad_norm": 0.03323963284492493, + "learning_rate": 0.0001704082091903326, + "loss": 0.3053, + "step": 11513 + }, + { + "epoch": 0.9327608554763448, + "grad_norm": 0.03129046410322189, + "learning_rate": 0.00017040370853773796, + "loss": 0.3059, + "step": 11514 + }, + { + "epoch": 0.9328418664938432, + "grad_norm": 0.038254525512456894, + "learning_rate": 0.00017039920788514335, + "loss": 0.3393, + "step": 11515 + }, + { + "epoch": 0.9329228775113415, + "grad_norm": 0.033706698566675186, + "learning_rate": 0.00017039470723254874, + "loss": 0.3213, + "step": 11516 + }, + { + "epoch": 0.93300388852884, + "grad_norm": 0.032409097999334335, + "learning_rate": 0.0001703902065799541, + "loss": 0.2629, + "step": 11517 + }, + { + "epoch": 0.9330848995463383, + "grad_norm": 0.0412583090364933, + "learning_rate": 0.00017038570592735948, + "loss": 0.397, + "step": 11518 + }, + { + "epoch": 0.9331659105638367, + "grad_norm": 0.030987797304987907, + "learning_rate": 0.00017038120527476484, + "loss": 0.3079, + "step": 11519 + }, + { + "epoch": 0.933246921581335, + "grad_norm": 0.03425063192844391, + "learning_rate": 0.0001703767046221702, + "loss": 0.3438, + "step": 11520 + }, + { + "epoch": 0.9333279325988334, + "grad_norm": 0.032528121024370193, + "learning_rate": 0.0001703722039695756, + "loss": 0.339, + "step": 11521 + }, + { + "epoch": 0.9334089436163319, + 
"grad_norm": 0.03559907153248787, + "learning_rate": 0.00017036770331698098, + "loss": 0.3516, + "step": 11522 + }, + { + "epoch": 0.9334899546338302, + "grad_norm": 0.03166068717837334, + "learning_rate": 0.00017036320266438634, + "loss": 0.309, + "step": 11523 + }, + { + "epoch": 0.9335709656513286, + "grad_norm": 0.034030213952064514, + "learning_rate": 0.00017035870201179173, + "loss": 0.3057, + "step": 11524 + }, + { + "epoch": 0.9336519766688269, + "grad_norm": 0.03300134092569351, + "learning_rate": 0.0001703542013591971, + "loss": 0.3307, + "step": 11525 + }, + { + "epoch": 0.9337329876863253, + "grad_norm": 0.03915845975279808, + "learning_rate": 0.00017034970070660245, + "loss": 0.3698, + "step": 11526 + }, + { + "epoch": 0.9338139987038238, + "grad_norm": 0.032462168484926224, + "learning_rate": 0.00017034520005400786, + "loss": 0.3215, + "step": 11527 + }, + { + "epoch": 0.9338950097213221, + "grad_norm": 0.03716924414038658, + "learning_rate": 0.00017034069940141322, + "loss": 0.3183, + "step": 11528 + }, + { + "epoch": 0.9339760207388205, + "grad_norm": 0.03721143305301666, + "learning_rate": 0.00017033619874881858, + "loss": 0.3399, + "step": 11529 + }, + { + "epoch": 0.9340570317563188, + "grad_norm": 0.034695371985435486, + "learning_rate": 0.00017033169809622397, + "loss": 0.2887, + "step": 11530 + }, + { + "epoch": 0.9341380427738173, + "grad_norm": 0.03291356563568115, + "learning_rate": 0.00017032719744362933, + "loss": 0.3333, + "step": 11531 + }, + { + "epoch": 0.9342190537913156, + "grad_norm": 0.04289603978395462, + "learning_rate": 0.0001703226967910347, + "loss": 0.3503, + "step": 11532 + }, + { + "epoch": 0.934300064808814, + "grad_norm": 0.03919799253344536, + "learning_rate": 0.0001703181961384401, + "loss": 0.3218, + "step": 11533 + }, + { + "epoch": 0.9343810758263124, + "grad_norm": 0.03670763596892357, + "learning_rate": 0.00017031369548584546, + "loss": 0.3216, + "step": 11534 + }, + { + "epoch": 0.9344620868438107, + "grad_norm": 0.03523210808634758, + "learning_rate": 0.00017030919483325082, + "loss": 0.2918, + "step": 11535 + }, + { + "epoch": 0.9345430978613092, + "grad_norm": 0.035426367074251175, + "learning_rate": 0.0001703046941806562, + "loss": 0.3414, + "step": 11536 + }, + { + "epoch": 0.9346241088788075, + "grad_norm": 0.034479837864637375, + "learning_rate": 0.00017030019352806157, + "loss": 0.3218, + "step": 11537 + }, + { + "epoch": 0.9347051198963059, + "grad_norm": 0.0363641120493412, + "learning_rate": 0.00017029569287546696, + "loss": 0.3709, + "step": 11538 + }, + { + "epoch": 0.9347861309138042, + "grad_norm": 0.037798330187797546, + "learning_rate": 0.00017029119222287234, + "loss": 0.333, + "step": 11539 + }, + { + "epoch": 0.9348671419313026, + "grad_norm": 0.03117884136736393, + "learning_rate": 0.0001702866915702777, + "loss": 0.315, + "step": 11540 + }, + { + "epoch": 0.9349481529488011, + "grad_norm": 0.035172026604413986, + "learning_rate": 0.00017028219091768306, + "loss": 0.365, + "step": 11541 + }, + { + "epoch": 0.9350291639662994, + "grad_norm": 0.03705654665827751, + "learning_rate": 0.00017027769026508845, + "loss": 0.3459, + "step": 11542 + }, + { + "epoch": 0.9351101749837978, + "grad_norm": 0.03678044676780701, + "learning_rate": 0.0001702731896124938, + "loss": 0.3268, + "step": 11543 + }, + { + "epoch": 0.9351911860012961, + "grad_norm": 0.03360627591609955, + "learning_rate": 0.0001702686889598992, + "loss": 0.3017, + "step": 11544 + }, + { + "epoch": 0.9352721970187946, + "grad_norm": 0.038903381675481796, + 
"learning_rate": 0.00017026418830730459, + "loss": 0.3278, + "step": 11545 + }, + { + "epoch": 0.935353208036293, + "grad_norm": 0.033496957272291183, + "learning_rate": 0.00017025968765470995, + "loss": 0.3388, + "step": 11546 + }, + { + "epoch": 0.9354342190537913, + "grad_norm": 0.03374259173870087, + "learning_rate": 0.0001702551870021153, + "loss": 0.3332, + "step": 11547 + }, + { + "epoch": 0.9355152300712897, + "grad_norm": 0.03631717339158058, + "learning_rate": 0.0001702506863495207, + "loss": 0.3125, + "step": 11548 + }, + { + "epoch": 0.935596241088788, + "grad_norm": 0.032183315604925156, + "learning_rate": 0.00017024618569692605, + "loss": 0.3005, + "step": 11549 + }, + { + "epoch": 0.9356772521062865, + "grad_norm": 0.03625780716538429, + "learning_rate": 0.00017024168504433144, + "loss": 0.3246, + "step": 11550 + }, + { + "epoch": 0.9357582631237849, + "grad_norm": 0.03256357088685036, + "learning_rate": 0.00017023718439173683, + "loss": 0.3119, + "step": 11551 + }, + { + "epoch": 0.9358392741412832, + "grad_norm": 0.046169064939022064, + "learning_rate": 0.0001702326837391422, + "loss": 0.4229, + "step": 11552 + }, + { + "epoch": 0.9359202851587816, + "grad_norm": 0.03766137734055519, + "learning_rate": 0.00017022818308654755, + "loss": 0.3421, + "step": 11553 + }, + { + "epoch": 0.93600129617628, + "grad_norm": 0.029175125062465668, + "learning_rate": 0.00017022368243395293, + "loss": 0.2945, + "step": 11554 + }, + { + "epoch": 0.9360823071937784, + "grad_norm": 0.03890814259648323, + "learning_rate": 0.0001702191817813583, + "loss": 0.3493, + "step": 11555 + }, + { + "epoch": 0.9361633182112767, + "grad_norm": 0.030468717217445374, + "learning_rate": 0.00017021468112876368, + "loss": 0.2837, + "step": 11556 + }, + { + "epoch": 0.9362443292287751, + "grad_norm": 0.03299061954021454, + "learning_rate": 0.00017021018047616907, + "loss": 0.3475, + "step": 11557 + }, + { + "epoch": 0.9363253402462735, + "grad_norm": 0.033994968980550766, + "learning_rate": 0.00017020567982357443, + "loss": 0.359, + "step": 11558 + }, + { + "epoch": 0.9364063512637719, + "grad_norm": 0.03398413211107254, + "learning_rate": 0.0001702011791709798, + "loss": 0.34, + "step": 11559 + }, + { + "epoch": 0.9364873622812703, + "grad_norm": 0.03915158659219742, + "learning_rate": 0.00017019667851838518, + "loss": 0.3156, + "step": 11560 + }, + { + "epoch": 0.9365683732987686, + "grad_norm": 0.033859699964523315, + "learning_rate": 0.00017019217786579056, + "loss": 0.3224, + "step": 11561 + }, + { + "epoch": 0.936649384316267, + "grad_norm": 0.0342167429625988, + "learning_rate": 0.00017018767721319592, + "loss": 0.3362, + "step": 11562 + }, + { + "epoch": 0.9367303953337653, + "grad_norm": 0.03582310676574707, + "learning_rate": 0.0001701831765606013, + "loss": 0.342, + "step": 11563 + }, + { + "epoch": 0.9368114063512638, + "grad_norm": 0.03299706429243088, + "learning_rate": 0.00017017867590800667, + "loss": 0.3339, + "step": 11564 + }, + { + "epoch": 0.9368924173687622, + "grad_norm": 0.03426812216639519, + "learning_rate": 0.00017017417525541203, + "loss": 0.3237, + "step": 11565 + }, + { + "epoch": 0.9369734283862605, + "grad_norm": 0.03299387916922569, + "learning_rate": 0.00017016967460281742, + "loss": 0.305, + "step": 11566 + }, + { + "epoch": 0.9370544394037589, + "grad_norm": 0.03170327469706535, + "learning_rate": 0.0001701651739502228, + "loss": 0.3085, + "step": 11567 + }, + { + "epoch": 0.9371354504212573, + "grad_norm": 0.03617962822318077, + "learning_rate": 0.00017016067329762817, + 
"loss": 0.3366, + "step": 11568 + }, + { + "epoch": 0.9372164614387557, + "grad_norm": 0.03227115795016289, + "learning_rate": 0.00017015617264503355, + "loss": 0.3222, + "step": 11569 + }, + { + "epoch": 0.937297472456254, + "grad_norm": 0.03870026767253876, + "learning_rate": 0.0001701516719924389, + "loss": 0.3413, + "step": 11570 + }, + { + "epoch": 0.9373784834737524, + "grad_norm": 0.03202787786722183, + "learning_rate": 0.00017014717133984427, + "loss": 0.3302, + "step": 11571 + }, + { + "epoch": 0.9374594944912508, + "grad_norm": 0.034028373658657074, + "learning_rate": 0.00017014267068724966, + "loss": 0.324, + "step": 11572 + }, + { + "epoch": 0.9375405055087492, + "grad_norm": 0.03828026354312897, + "learning_rate": 0.00017013817003465505, + "loss": 0.3441, + "step": 11573 + }, + { + "epoch": 0.9376215165262476, + "grad_norm": 0.035729967057704926, + "learning_rate": 0.0001701336693820604, + "loss": 0.288, + "step": 11574 + }, + { + "epoch": 0.937702527543746, + "grad_norm": 0.02852979488670826, + "learning_rate": 0.0001701291687294658, + "loss": 0.2586, + "step": 11575 + }, + { + "epoch": 0.9377835385612443, + "grad_norm": 0.03379920497536659, + "learning_rate": 0.00017012466807687115, + "loss": 0.2936, + "step": 11576 + }, + { + "epoch": 0.9378645495787427, + "grad_norm": 0.03683913126587868, + "learning_rate": 0.00017012016742427651, + "loss": 0.3608, + "step": 11577 + }, + { + "epoch": 0.9379455605962411, + "grad_norm": 0.03738532215356827, + "learning_rate": 0.0001701156667716819, + "loss": 0.3765, + "step": 11578 + }, + { + "epoch": 0.9380265716137395, + "grad_norm": 0.03998420760035515, + "learning_rate": 0.0001701111661190873, + "loss": 0.3014, + "step": 11579 + }, + { + "epoch": 0.9381075826312378, + "grad_norm": 0.03761008381843567, + "learning_rate": 0.00017010666546649265, + "loss": 0.3352, + "step": 11580 + }, + { + "epoch": 0.9381885936487362, + "grad_norm": 0.03799549490213394, + "learning_rate": 0.00017010216481389804, + "loss": 0.326, + "step": 11581 + }, + { + "epoch": 0.9382696046662347, + "grad_norm": 0.03644454851746559, + "learning_rate": 0.0001700976641613034, + "loss": 0.3306, + "step": 11582 + }, + { + "epoch": 0.938350615683733, + "grad_norm": 0.0304781012237072, + "learning_rate": 0.00017009316350870876, + "loss": 0.2902, + "step": 11583 + }, + { + "epoch": 0.9384316267012314, + "grad_norm": 0.0314607247710228, + "learning_rate": 0.00017008866285611414, + "loss": 0.3029, + "step": 11584 + }, + { + "epoch": 0.9385126377187297, + "grad_norm": 0.03330124914646149, + "learning_rate": 0.00017008416220351953, + "loss": 0.3073, + "step": 11585 + }, + { + "epoch": 0.9385936487362281, + "grad_norm": 0.038247060030698776, + "learning_rate": 0.0001700796615509249, + "loss": 0.3443, + "step": 11586 + }, + { + "epoch": 0.9386746597537265, + "grad_norm": 0.03827197477221489, + "learning_rate": 0.00017007516089833028, + "loss": 0.3741, + "step": 11587 + }, + { + "epoch": 0.9387556707712249, + "grad_norm": 0.035438474267721176, + "learning_rate": 0.00017007066024573564, + "loss": 0.3206, + "step": 11588 + }, + { + "epoch": 0.9388366817887233, + "grad_norm": 0.03596781566739082, + "learning_rate": 0.000170066159593141, + "loss": 0.3348, + "step": 11589 + }, + { + "epoch": 0.9389176928062216, + "grad_norm": 0.031203007325530052, + "learning_rate": 0.00017006165894054638, + "loss": 0.3496, + "step": 11590 + }, + { + "epoch": 0.93899870382372, + "grad_norm": 0.03926529362797737, + "learning_rate": 0.00017005715828795177, + "loss": 0.3612, + "step": 11591 + }, + { + 
"epoch": 0.9390797148412184, + "grad_norm": 0.04273316264152527, + "learning_rate": 0.00017005265763535713, + "loss": 0.364, + "step": 11592 + }, + { + "epoch": 0.9391607258587168, + "grad_norm": 0.033596985042095184, + "learning_rate": 0.00017004815698276252, + "loss": 0.3308, + "step": 11593 + }, + { + "epoch": 0.9392417368762151, + "grad_norm": 0.03773493692278862, + "learning_rate": 0.00017004365633016788, + "loss": 0.3731, + "step": 11594 + }, + { + "epoch": 0.9393227478937135, + "grad_norm": 0.03566722571849823, + "learning_rate": 0.00017003915567757324, + "loss": 0.3478, + "step": 11595 + }, + { + "epoch": 0.939403758911212, + "grad_norm": 0.029825484380126, + "learning_rate": 0.00017003465502497865, + "loss": 0.2838, + "step": 11596 + }, + { + "epoch": 0.9394847699287103, + "grad_norm": 0.034685224294662476, + "learning_rate": 0.000170030154372384, + "loss": 0.3065, + "step": 11597 + }, + { + "epoch": 0.9395657809462087, + "grad_norm": 0.03669968619942665, + "learning_rate": 0.00017002565371978937, + "loss": 0.3638, + "step": 11598 + }, + { + "epoch": 0.939646791963707, + "grad_norm": 0.035428546369075775, + "learning_rate": 0.00017002115306719476, + "loss": 0.2985, + "step": 11599 + }, + { + "epoch": 0.9397278029812054, + "grad_norm": 0.032245926558971405, + "learning_rate": 0.00017001665241460012, + "loss": 0.3037, + "step": 11600 + }, + { + "epoch": 0.9398088139987039, + "grad_norm": 0.03529680520296097, + "learning_rate": 0.00017001215176200548, + "loss": 0.3433, + "step": 11601 + }, + { + "epoch": 0.9398898250162022, + "grad_norm": 0.035892363637685776, + "learning_rate": 0.0001700076511094109, + "loss": 0.3487, + "step": 11602 + }, + { + "epoch": 0.9399708360337006, + "grad_norm": 0.03226469084620476, + "learning_rate": 0.00017000315045681625, + "loss": 0.3295, + "step": 11603 + }, + { + "epoch": 0.9400518470511989, + "grad_norm": 0.030066296458244324, + "learning_rate": 0.00016999864980422161, + "loss": 0.2968, + "step": 11604 + }, + { + "epoch": 0.9401328580686974, + "grad_norm": 0.035130247473716736, + "learning_rate": 0.000169994149151627, + "loss": 0.3375, + "step": 11605 + }, + { + "epoch": 0.9402138690861958, + "grad_norm": 0.033153027296066284, + "learning_rate": 0.00016998964849903236, + "loss": 0.3437, + "step": 11606 + }, + { + "epoch": 0.9402948801036941, + "grad_norm": 0.03841940313577652, + "learning_rate": 0.00016998514784643772, + "loss": 0.3726, + "step": 11607 + }, + { + "epoch": 0.9403758911211925, + "grad_norm": 0.036327630281448364, + "learning_rate": 0.00016998064719384314, + "loss": 0.332, + "step": 11608 + }, + { + "epoch": 0.9404569021386908, + "grad_norm": 0.03609196096658707, + "learning_rate": 0.0001699761465412485, + "loss": 0.3465, + "step": 11609 + }, + { + "epoch": 0.9405379131561893, + "grad_norm": 0.034757182002067566, + "learning_rate": 0.00016997164588865386, + "loss": 0.3241, + "step": 11610 + }, + { + "epoch": 0.9406189241736876, + "grad_norm": 0.04242127388715744, + "learning_rate": 0.00016996714523605924, + "loss": 0.3737, + "step": 11611 + }, + { + "epoch": 0.940699935191186, + "grad_norm": 0.037196334451436996, + "learning_rate": 0.0001699626445834646, + "loss": 0.3549, + "step": 11612 + }, + { + "epoch": 0.9407809462086844, + "grad_norm": 0.03243549168109894, + "learning_rate": 0.00016995814393087, + "loss": 0.3365, + "step": 11613 + }, + { + "epoch": 0.9408619572261827, + "grad_norm": 0.03829599916934967, + "learning_rate": 0.00016995364327827538, + "loss": 0.3163, + "step": 11614 + }, + { + "epoch": 0.9409429682436812, + 
"grad_norm": 0.03744173422455788, + "learning_rate": 0.00016994914262568074, + "loss": 0.3281, + "step": 11615 + }, + { + "epoch": 0.9410239792611795, + "grad_norm": 0.035117655992507935, + "learning_rate": 0.0001699446419730861, + "loss": 0.337, + "step": 11616 + }, + { + "epoch": 0.9411049902786779, + "grad_norm": 0.037259314209222794, + "learning_rate": 0.00016994014132049149, + "loss": 0.3937, + "step": 11617 + }, + { + "epoch": 0.9411860012961762, + "grad_norm": 0.0373586043715477, + "learning_rate": 0.00016993564066789685, + "loss": 0.3403, + "step": 11618 + }, + { + "epoch": 0.9412670123136747, + "grad_norm": 0.03547549992799759, + "learning_rate": 0.00016993114001530223, + "loss": 0.333, + "step": 11619 + }, + { + "epoch": 0.9413480233311731, + "grad_norm": 0.03412862494587898, + "learning_rate": 0.00016992663936270762, + "loss": 0.3196, + "step": 11620 + }, + { + "epoch": 0.9414290343486714, + "grad_norm": 0.03266661614179611, + "learning_rate": 0.00016992213871011298, + "loss": 0.319, + "step": 11621 + }, + { + "epoch": 0.9415100453661698, + "grad_norm": 0.035066504031419754, + "learning_rate": 0.00016991763805751834, + "loss": 0.343, + "step": 11622 + }, + { + "epoch": 0.9415910563836681, + "grad_norm": 0.03465098515152931, + "learning_rate": 0.00016991313740492373, + "loss": 0.3424, + "step": 11623 + }, + { + "epoch": 0.9416720674011666, + "grad_norm": 0.03283773362636566, + "learning_rate": 0.0001699086367523291, + "loss": 0.3125, + "step": 11624 + }, + { + "epoch": 0.941753078418665, + "grad_norm": 0.033413149416446686, + "learning_rate": 0.00016990413609973447, + "loss": 0.3532, + "step": 11625 + }, + { + "epoch": 0.9418340894361633, + "grad_norm": 0.029487356543540955, + "learning_rate": 0.00016989963544713986, + "loss": 0.2655, + "step": 11626 + }, + { + "epoch": 0.9419151004536617, + "grad_norm": 0.04271164536476135, + "learning_rate": 0.00016989513479454522, + "loss": 0.3881, + "step": 11627 + }, + { + "epoch": 0.94199611147116, + "grad_norm": 0.03455482795834541, + "learning_rate": 0.00016989063414195058, + "loss": 0.3244, + "step": 11628 + }, + { + "epoch": 0.9420771224886585, + "grad_norm": 0.03296886757016182, + "learning_rate": 0.00016988613348935597, + "loss": 0.3237, + "step": 11629 + }, + { + "epoch": 0.9421581335061568, + "grad_norm": 0.03511711582541466, + "learning_rate": 0.00016988163283676133, + "loss": 0.3444, + "step": 11630 + }, + { + "epoch": 0.9422391445236552, + "grad_norm": 0.03849519416689873, + "learning_rate": 0.00016987713218416672, + "loss": 0.3313, + "step": 11631 + }, + { + "epoch": 0.9423201555411536, + "grad_norm": 0.037213683128356934, + "learning_rate": 0.0001698726315315721, + "loss": 0.32, + "step": 11632 + }, + { + "epoch": 0.942401166558652, + "grad_norm": 0.03361336514353752, + "learning_rate": 0.00016986813087897746, + "loss": 0.3153, + "step": 11633 + }, + { + "epoch": 0.9424821775761504, + "grad_norm": 0.03675851225852966, + "learning_rate": 0.00016986363022638282, + "loss": 0.3525, + "step": 11634 + }, + { + "epoch": 0.9425631885936487, + "grad_norm": 0.0376199372112751, + "learning_rate": 0.0001698591295737882, + "loss": 0.3468, + "step": 11635 + }, + { + "epoch": 0.9426441996111471, + "grad_norm": 0.0355244055390358, + "learning_rate": 0.00016985462892119357, + "loss": 0.3516, + "step": 11636 + }, + { + "epoch": 0.9427252106286454, + "grad_norm": 0.03431635722517967, + "learning_rate": 0.00016985012826859896, + "loss": 0.2943, + "step": 11637 + }, + { + "epoch": 0.9428062216461439, + "grad_norm": 0.032604239881038666, + 
"learning_rate": 0.00016984562761600434, + "loss": 0.3412, + "step": 11638 + }, + { + "epoch": 0.9428872326636423, + "grad_norm": 0.0344766266644001, + "learning_rate": 0.0001698411269634097, + "loss": 0.3224, + "step": 11639 + }, + { + "epoch": 0.9429682436811406, + "grad_norm": 0.03054865449666977, + "learning_rate": 0.00016983662631081506, + "loss": 0.284, + "step": 11640 + }, + { + "epoch": 0.943049254698639, + "grad_norm": 0.034912411123514175, + "learning_rate": 0.00016983212565822045, + "loss": 0.3106, + "step": 11641 + }, + { + "epoch": 0.9431302657161373, + "grad_norm": 0.03549661487340927, + "learning_rate": 0.00016982762500562584, + "loss": 0.3144, + "step": 11642 + }, + { + "epoch": 0.9432112767336358, + "grad_norm": 0.03743693605065346, + "learning_rate": 0.0001698231243530312, + "loss": 0.3596, + "step": 11643 + }, + { + "epoch": 0.9432922877511342, + "grad_norm": 0.03865379840135574, + "learning_rate": 0.00016981862370043659, + "loss": 0.3242, + "step": 11644 + }, + { + "epoch": 0.9433732987686325, + "grad_norm": 0.030137626454234123, + "learning_rate": 0.00016981412304784195, + "loss": 0.2934, + "step": 11645 + }, + { + "epoch": 0.9434543097861309, + "grad_norm": 0.03273788467049599, + "learning_rate": 0.0001698096223952473, + "loss": 0.2888, + "step": 11646 + }, + { + "epoch": 0.9435353208036293, + "grad_norm": 0.03315626084804535, + "learning_rate": 0.0001698051217426527, + "loss": 0.2767, + "step": 11647 + }, + { + "epoch": 0.9436163318211277, + "grad_norm": 0.03848462179303169, + "learning_rate": 0.00016980062109005808, + "loss": 0.3099, + "step": 11648 + }, + { + "epoch": 0.943697342838626, + "grad_norm": 0.030941886827349663, + "learning_rate": 0.00016979612043746344, + "loss": 0.2987, + "step": 11649 + }, + { + "epoch": 0.9437783538561244, + "grad_norm": 0.04029727354645729, + "learning_rate": 0.00016979161978486883, + "loss": 0.3577, + "step": 11650 + }, + { + "epoch": 0.9438593648736228, + "grad_norm": 0.041079651564359665, + "learning_rate": 0.0001697871191322742, + "loss": 0.3579, + "step": 11651 + }, + { + "epoch": 0.9439403758911212, + "grad_norm": 0.033457282930612564, + "learning_rate": 0.00016978261847967955, + "loss": 0.3292, + "step": 11652 + }, + { + "epoch": 0.9440213869086196, + "grad_norm": 0.03081076219677925, + "learning_rate": 0.00016977811782708493, + "loss": 0.2804, + "step": 11653 + }, + { + "epoch": 0.9441023979261179, + "grad_norm": 0.034133411943912506, + "learning_rate": 0.00016977361717449032, + "loss": 0.2907, + "step": 11654 + }, + { + "epoch": 0.9441834089436163, + "grad_norm": 0.030764104798436165, + "learning_rate": 0.00016976911652189568, + "loss": 0.2712, + "step": 11655 + }, + { + "epoch": 0.9442644199611148, + "grad_norm": 0.03566576540470123, + "learning_rate": 0.00016976461586930107, + "loss": 0.3491, + "step": 11656 + }, + { + "epoch": 0.9443454309786131, + "grad_norm": 0.038679346442222595, + "learning_rate": 0.00016976011521670643, + "loss": 0.3152, + "step": 11657 + }, + { + "epoch": 0.9444264419961115, + "grad_norm": 0.03520083427429199, + "learning_rate": 0.0001697556145641118, + "loss": 0.2969, + "step": 11658 + }, + { + "epoch": 0.9445074530136098, + "grad_norm": 0.03682069107890129, + "learning_rate": 0.00016975111391151718, + "loss": 0.3412, + "step": 11659 + }, + { + "epoch": 0.9445884640311082, + "grad_norm": 0.034318264573812485, + "learning_rate": 0.00016974661325892256, + "loss": 0.3412, + "step": 11660 + }, + { + "epoch": 0.9446694750486067, + "grad_norm": 0.03663177788257599, + "learning_rate": 
0.00016974211260632792, + "loss": 0.3447, + "step": 11661 + }, + { + "epoch": 0.944750486066105, + "grad_norm": 0.03302030265331268, + "learning_rate": 0.0001697376119537333, + "loss": 0.3214, + "step": 11662 + }, + { + "epoch": 0.9448314970836034, + "grad_norm": 0.034949298948049545, + "learning_rate": 0.00016973311130113867, + "loss": 0.3263, + "step": 11663 + }, + { + "epoch": 0.9449125081011017, + "grad_norm": 0.03676712140440941, + "learning_rate": 0.00016972861064854403, + "loss": 0.3366, + "step": 11664 + }, + { + "epoch": 0.9449935191186001, + "grad_norm": 0.03827500343322754, + "learning_rate": 0.00016972410999594945, + "loss": 0.3845, + "step": 11665 + }, + { + "epoch": 0.9450745301360985, + "grad_norm": 0.028249388560652733, + "learning_rate": 0.0001697196093433548, + "loss": 0.2928, + "step": 11666 + }, + { + "epoch": 0.9451555411535969, + "grad_norm": 0.035137057304382324, + "learning_rate": 0.00016971510869076017, + "loss": 0.3038, + "step": 11667 + }, + { + "epoch": 0.9452365521710953, + "grad_norm": 0.03510432690382004, + "learning_rate": 0.00016971060803816555, + "loss": 0.3282, + "step": 11668 + }, + { + "epoch": 0.9453175631885936, + "grad_norm": 0.033443015068769455, + "learning_rate": 0.0001697061073855709, + "loss": 0.3412, + "step": 11669 + }, + { + "epoch": 0.9453985742060921, + "grad_norm": 0.03263236582279205, + "learning_rate": 0.00016970160673297627, + "loss": 0.3158, + "step": 11670 + }, + { + "epoch": 0.9454795852235904, + "grad_norm": 0.038706421852111816, + "learning_rate": 0.0001696971060803817, + "loss": 0.3624, + "step": 11671 + }, + { + "epoch": 0.9455605962410888, + "grad_norm": 0.029902534559369087, + "learning_rate": 0.00016969260542778705, + "loss": 0.3152, + "step": 11672 + }, + { + "epoch": 0.9456416072585871, + "grad_norm": 0.0366581529378891, + "learning_rate": 0.0001696881047751924, + "loss": 0.3915, + "step": 11673 + }, + { + "epoch": 0.9457226182760855, + "grad_norm": 0.03990044817328453, + "learning_rate": 0.0001696836041225978, + "loss": 0.3217, + "step": 11674 + }, + { + "epoch": 0.945803629293584, + "grad_norm": 0.034472111612558365, + "learning_rate": 0.00016967910347000315, + "loss": 0.3514, + "step": 11675 + }, + { + "epoch": 0.9458846403110823, + "grad_norm": 0.0318378284573555, + "learning_rate": 0.00016967460281740851, + "loss": 0.3206, + "step": 11676 + }, + { + "epoch": 0.9459656513285807, + "grad_norm": 0.03478666767477989, + "learning_rate": 0.00016967010216481393, + "loss": 0.3141, + "step": 11677 + }, + { + "epoch": 0.946046662346079, + "grad_norm": 0.03769747540354729, + "learning_rate": 0.0001696656015122193, + "loss": 0.3969, + "step": 11678 + }, + { + "epoch": 0.9461276733635774, + "grad_norm": 0.036210544407367706, + "learning_rate": 0.00016966110085962465, + "loss": 0.3578, + "step": 11679 + }, + { + "epoch": 0.9462086843810759, + "grad_norm": 0.03335197642445564, + "learning_rate": 0.00016965660020703004, + "loss": 0.2924, + "step": 11680 + }, + { + "epoch": 0.9462896953985742, + "grad_norm": 0.031656358391046524, + "learning_rate": 0.0001696520995544354, + "loss": 0.3034, + "step": 11681 + }, + { + "epoch": 0.9463707064160726, + "grad_norm": 0.033151499927043915, + "learning_rate": 0.00016964759890184076, + "loss": 0.3418, + "step": 11682 + }, + { + "epoch": 0.9464517174335709, + "grad_norm": 0.037764452397823334, + "learning_rate": 0.00016964309824924617, + "loss": 0.365, + "step": 11683 + }, + { + "epoch": 0.9465327284510694, + "grad_norm": 0.03604017570614815, + "learning_rate": 0.00016963859759665153, + "loss": 
0.3216, + "step": 11684 + }, + { + "epoch": 0.9466137394685677, + "grad_norm": 0.03398576378822327, + "learning_rate": 0.0001696340969440569, + "loss": 0.2693, + "step": 11685 + }, + { + "epoch": 0.9466947504860661, + "grad_norm": 0.04054470360279083, + "learning_rate": 0.00016962959629146228, + "loss": 0.381, + "step": 11686 + }, + { + "epoch": 0.9467757615035645, + "grad_norm": 0.03417252004146576, + "learning_rate": 0.00016962509563886764, + "loss": 0.2849, + "step": 11687 + }, + { + "epoch": 0.9468567725210628, + "grad_norm": 0.03717213496565819, + "learning_rate": 0.000169620594986273, + "loss": 0.3327, + "step": 11688 + }, + { + "epoch": 0.9469377835385613, + "grad_norm": 0.03692119941115379, + "learning_rate": 0.0001696160943336784, + "loss": 0.3463, + "step": 11689 + }, + { + "epoch": 0.9470187945560596, + "grad_norm": 0.03388385847210884, + "learning_rate": 0.00016961159368108377, + "loss": 0.312, + "step": 11690 + }, + { + "epoch": 0.947099805573558, + "grad_norm": 0.03554116189479828, + "learning_rate": 0.00016960709302848913, + "loss": 0.3774, + "step": 11691 + }, + { + "epoch": 0.9471808165910564, + "grad_norm": 0.032233826816082, + "learning_rate": 0.00016960259237589452, + "loss": 0.3052, + "step": 11692 + }, + { + "epoch": 0.9472618276085548, + "grad_norm": 0.03355953097343445, + "learning_rate": 0.00016959809172329988, + "loss": 0.3092, + "step": 11693 + }, + { + "epoch": 0.9473428386260532, + "grad_norm": 0.0348745621740818, + "learning_rate": 0.00016959359107070527, + "loss": 0.2982, + "step": 11694 + }, + { + "epoch": 0.9474238496435515, + "grad_norm": 0.04244668781757355, + "learning_rate": 0.00016958909041811065, + "loss": 0.3179, + "step": 11695 + }, + { + "epoch": 0.9475048606610499, + "grad_norm": 0.03393128514289856, + "learning_rate": 0.000169584589765516, + "loss": 0.3385, + "step": 11696 + }, + { + "epoch": 0.9475858716785482, + "grad_norm": 0.03268209099769592, + "learning_rate": 0.00016958008911292137, + "loss": 0.2917, + "step": 11697 + }, + { + "epoch": 0.9476668826960467, + "grad_norm": 0.03826176002621651, + "learning_rate": 0.00016957558846032676, + "loss": 0.3272, + "step": 11698 + }, + { + "epoch": 0.9477478937135451, + "grad_norm": 0.03487805277109146, + "learning_rate": 0.00016957108780773212, + "loss": 0.3428, + "step": 11699 + }, + { + "epoch": 0.9478289047310434, + "grad_norm": 0.029413651674985886, + "learning_rate": 0.0001695665871551375, + "loss": 0.2864, + "step": 11700 + }, + { + "epoch": 0.9479099157485418, + "grad_norm": 0.041242972016334534, + "learning_rate": 0.0001695620865025429, + "loss": 0.3314, + "step": 11701 + }, + { + "epoch": 0.9479909267660401, + "grad_norm": 0.03674250841140747, + "learning_rate": 0.00016955758584994826, + "loss": 0.3567, + "step": 11702 + }, + { + "epoch": 0.9480719377835386, + "grad_norm": 0.0323912687599659, + "learning_rate": 0.00016955308519735362, + "loss": 0.3049, + "step": 11703 + }, + { + "epoch": 0.948152948801037, + "grad_norm": 0.03778766468167305, + "learning_rate": 0.000169548584544759, + "loss": 0.2959, + "step": 11704 + }, + { + "epoch": 0.9482339598185353, + "grad_norm": 0.034497227519750595, + "learning_rate": 0.00016954408389216436, + "loss": 0.347, + "step": 11705 + }, + { + "epoch": 0.9483149708360337, + "grad_norm": 0.03679029643535614, + "learning_rate": 0.00016953958323956975, + "loss": 0.3651, + "step": 11706 + }, + { + "epoch": 0.9483959818535321, + "grad_norm": 0.04503096267580986, + "learning_rate": 0.00016953508258697514, + "loss": 0.378, + "step": 11707 + }, + { + "epoch": 
0.9484769928710305, + "grad_norm": 0.03016662783920765, + "learning_rate": 0.0001695305819343805, + "loss": 0.2731, + "step": 11708 + }, + { + "epoch": 0.9485580038885288, + "grad_norm": 0.03268558531999588, + "learning_rate": 0.00016952608128178586, + "loss": 0.2694, + "step": 11709 + }, + { + "epoch": 0.9486390149060272, + "grad_norm": 0.03649129718542099, + "learning_rate": 0.00016952158062919124, + "loss": 0.3221, + "step": 11710 + }, + { + "epoch": 0.9487200259235256, + "grad_norm": 0.03539815917611122, + "learning_rate": 0.0001695170799765966, + "loss": 0.3662, + "step": 11711 + }, + { + "epoch": 0.948801036941024, + "grad_norm": 0.0350770466029644, + "learning_rate": 0.000169512579324002, + "loss": 0.3065, + "step": 11712 + }, + { + "epoch": 0.9488820479585224, + "grad_norm": 0.03900324925780296, + "learning_rate": 0.00016950807867140738, + "loss": 0.3552, + "step": 11713 + }, + { + "epoch": 0.9489630589760207, + "grad_norm": 0.03432242199778557, + "learning_rate": 0.00016950357801881274, + "loss": 0.314, + "step": 11714 + }, + { + "epoch": 0.9490440699935191, + "grad_norm": 0.03255319967865944, + "learning_rate": 0.0001694990773662181, + "loss": 0.2936, + "step": 11715 + }, + { + "epoch": 0.9491250810110174, + "grad_norm": 0.03342980891466141, + "learning_rate": 0.00016949457671362349, + "loss": 0.3205, + "step": 11716 + }, + { + "epoch": 0.9492060920285159, + "grad_norm": 0.03174217417836189, + "learning_rate": 0.00016949007606102885, + "loss": 0.3155, + "step": 11717 + }, + { + "epoch": 0.9492871030460143, + "grad_norm": 0.04021428897976875, + "learning_rate": 0.00016948557540843423, + "loss": 0.3229, + "step": 11718 + }, + { + "epoch": 0.9493681140635126, + "grad_norm": 0.03540870547294617, + "learning_rate": 0.00016948107475583962, + "loss": 0.3245, + "step": 11719 + }, + { + "epoch": 0.949449125081011, + "grad_norm": 0.039765365421772, + "learning_rate": 0.00016947657410324498, + "loss": 0.3519, + "step": 11720 + }, + { + "epoch": 0.9495301360985094, + "grad_norm": 0.039002127945423126, + "learning_rate": 0.00016947207345065034, + "loss": 0.372, + "step": 11721 + }, + { + "epoch": 0.9496111471160078, + "grad_norm": 0.031765103340148926, + "learning_rate": 0.00016946757279805573, + "loss": 0.3012, + "step": 11722 + }, + { + "epoch": 0.9496921581335062, + "grad_norm": 0.031993210315704346, + "learning_rate": 0.00016946307214546111, + "loss": 0.3037, + "step": 11723 + }, + { + "epoch": 0.9497731691510045, + "grad_norm": 0.03384470194578171, + "learning_rate": 0.00016945857149286647, + "loss": 0.3278, + "step": 11724 + }, + { + "epoch": 0.9498541801685029, + "grad_norm": 0.03596783056855202, + "learning_rate": 0.00016945407084027186, + "loss": 0.331, + "step": 11725 + }, + { + "epoch": 0.9499351911860013, + "grad_norm": 0.029281174764037132, + "learning_rate": 0.00016944957018767722, + "loss": 0.2366, + "step": 11726 + }, + { + "epoch": 0.9500162022034997, + "grad_norm": 0.03412824124097824, + "learning_rate": 0.00016944506953508258, + "loss": 0.3133, + "step": 11727 + }, + { + "epoch": 0.950097213220998, + "grad_norm": 0.03497035428881645, + "learning_rate": 0.00016944056888248797, + "loss": 0.3024, + "step": 11728 + }, + { + "epoch": 0.9501782242384964, + "grad_norm": 0.03496416285634041, + "learning_rate": 0.00016943606822989336, + "loss": 0.3502, + "step": 11729 + }, + { + "epoch": 0.9502592352559948, + "grad_norm": 0.036330707371234894, + "learning_rate": 0.00016943156757729872, + "loss": 0.3328, + "step": 11730 + }, + { + "epoch": 0.9503402462734932, + "grad_norm": 
0.03646592050790787, + "learning_rate": 0.0001694270669247041, + "loss": 0.3299, + "step": 11731 + }, + { + "epoch": 0.9504212572909916, + "grad_norm": 0.033516980707645416, + "learning_rate": 0.00016942256627210946, + "loss": 0.3106, + "step": 11732 + }, + { + "epoch": 0.9505022683084899, + "grad_norm": 0.03749392181634903, + "learning_rate": 0.00016941806561951482, + "loss": 0.3687, + "step": 11733 + }, + { + "epoch": 0.9505832793259883, + "grad_norm": 0.03816765919327736, + "learning_rate": 0.0001694135649669202, + "loss": 0.3196, + "step": 11734 + }, + { + "epoch": 0.9506642903434868, + "grad_norm": 0.032571665942668915, + "learning_rate": 0.0001694090643143256, + "loss": 0.2962, + "step": 11735 + }, + { + "epoch": 0.9507453013609851, + "grad_norm": 0.034489717334508896, + "learning_rate": 0.00016940456366173096, + "loss": 0.3183, + "step": 11736 + }, + { + "epoch": 0.9508263123784835, + "grad_norm": 0.03572354093194008, + "learning_rate": 0.00016940006300913634, + "loss": 0.3061, + "step": 11737 + }, + { + "epoch": 0.9509073233959818, + "grad_norm": 0.034814946353435516, + "learning_rate": 0.0001693955623565417, + "loss": 0.3264, + "step": 11738 + }, + { + "epoch": 0.9509883344134802, + "grad_norm": 0.03461809828877449, + "learning_rate": 0.00016939106170394706, + "loss": 0.3138, + "step": 11739 + }, + { + "epoch": 0.9510693454309787, + "grad_norm": 0.03412418067455292, + "learning_rate": 0.00016938656105135245, + "loss": 0.3284, + "step": 11740 + }, + { + "epoch": 0.951150356448477, + "grad_norm": 0.03454767167568207, + "learning_rate": 0.00016938206039875784, + "loss": 0.3381, + "step": 11741 + }, + { + "epoch": 0.9512313674659754, + "grad_norm": 0.03485751524567604, + "learning_rate": 0.0001693775597461632, + "loss": 0.3043, + "step": 11742 + }, + { + "epoch": 0.9513123784834737, + "grad_norm": 0.03471321612596512, + "learning_rate": 0.00016937305909356859, + "loss": 0.3475, + "step": 11743 + }, + { + "epoch": 0.9513933895009722, + "grad_norm": 0.03523659333586693, + "learning_rate": 0.00016936855844097395, + "loss": 0.3372, + "step": 11744 + }, + { + "epoch": 0.9514744005184705, + "grad_norm": 0.0338854119181633, + "learning_rate": 0.0001693640577883793, + "loss": 0.311, + "step": 11745 + }, + { + "epoch": 0.9515554115359689, + "grad_norm": 0.0323248915374279, + "learning_rate": 0.00016935955713578472, + "loss": 0.3268, + "step": 11746 + }, + { + "epoch": 0.9516364225534673, + "grad_norm": 0.03338275104761124, + "learning_rate": 0.00016935505648319008, + "loss": 0.2595, + "step": 11747 + }, + { + "epoch": 0.9517174335709656, + "grad_norm": 0.04064786061644554, + "learning_rate": 0.00016935055583059544, + "loss": 0.3568, + "step": 11748 + }, + { + "epoch": 0.9517984445884641, + "grad_norm": 0.03257180005311966, + "learning_rate": 0.00016934605517800083, + "loss": 0.2703, + "step": 11749 + }, + { + "epoch": 0.9518794556059624, + "grad_norm": 0.03152458369731903, + "learning_rate": 0.0001693415545254062, + "loss": 0.2934, + "step": 11750 + }, + { + "epoch": 0.9519604666234608, + "grad_norm": 0.03133175149559975, + "learning_rate": 0.00016933705387281155, + "loss": 0.2884, + "step": 11751 + }, + { + "epoch": 0.9520414776409591, + "grad_norm": 0.035321589559316635, + "learning_rate": 0.00016933255322021696, + "loss": 0.3162, + "step": 11752 + }, + { + "epoch": 0.9521224886584575, + "grad_norm": 0.04080462455749512, + "learning_rate": 0.00016932805256762232, + "loss": 0.354, + "step": 11753 + }, + { + "epoch": 0.952203499675956, + "grad_norm": 0.0328151099383831, + "learning_rate": 
0.00016932355191502768, + "loss": 0.3083, + "step": 11754 + }, + { + "epoch": 0.9522845106934543, + "grad_norm": 0.03481130301952362, + "learning_rate": 0.00016931905126243307, + "loss": 0.3128, + "step": 11755 + }, + { + "epoch": 0.9523655217109527, + "grad_norm": 0.03485405072569847, + "learning_rate": 0.00016931455060983843, + "loss": 0.3458, + "step": 11756 + }, + { + "epoch": 0.952446532728451, + "grad_norm": 0.03622061014175415, + "learning_rate": 0.0001693100499572438, + "loss": 0.3733, + "step": 11757 + }, + { + "epoch": 0.9525275437459495, + "grad_norm": 0.040508657693862915, + "learning_rate": 0.0001693055493046492, + "loss": 0.3509, + "step": 11758 + }, + { + "epoch": 0.9526085547634479, + "grad_norm": 0.03345491737127304, + "learning_rate": 0.00016930104865205456, + "loss": 0.3284, + "step": 11759 + }, + { + "epoch": 0.9526895657809462, + "grad_norm": 0.03780923783779144, + "learning_rate": 0.00016929654799945992, + "loss": 0.3215, + "step": 11760 + }, + { + "epoch": 0.9527705767984446, + "grad_norm": 0.02839631587266922, + "learning_rate": 0.0001692920473468653, + "loss": 0.2715, + "step": 11761 + }, + { + "epoch": 0.9528515878159429, + "grad_norm": 0.0344814732670784, + "learning_rate": 0.00016928754669427067, + "loss": 0.3353, + "step": 11762 + }, + { + "epoch": 0.9529325988334414, + "grad_norm": 0.0311529953032732, + "learning_rate": 0.00016928304604167603, + "loss": 0.2887, + "step": 11763 + }, + { + "epoch": 0.9530136098509397, + "grad_norm": 0.042994722723960876, + "learning_rate": 0.00016927854538908145, + "loss": 0.4151, + "step": 11764 + }, + { + "epoch": 0.9530946208684381, + "grad_norm": 0.030827734619379044, + "learning_rate": 0.0001692740447364868, + "loss": 0.3259, + "step": 11765 + }, + { + "epoch": 0.9531756318859365, + "grad_norm": 0.03870192915201187, + "learning_rate": 0.00016926954408389217, + "loss": 0.3981, + "step": 11766 + }, + { + "epoch": 0.9532566429034348, + "grad_norm": 0.033833280205726624, + "learning_rate": 0.00016926504343129755, + "loss": 0.3258, + "step": 11767 + }, + { + "epoch": 0.9533376539209333, + "grad_norm": 0.03414291888475418, + "learning_rate": 0.0001692605427787029, + "loss": 0.3437, + "step": 11768 + }, + { + "epoch": 0.9534186649384316, + "grad_norm": 0.03322215750813484, + "learning_rate": 0.00016925604212610827, + "loss": 0.3217, + "step": 11769 + }, + { + "epoch": 0.95349967595593, + "grad_norm": 0.0363927036523819, + "learning_rate": 0.0001692515414735137, + "loss": 0.322, + "step": 11770 + }, + { + "epoch": 0.9535806869734283, + "grad_norm": 0.03173397481441498, + "learning_rate": 0.00016924704082091905, + "loss": 0.311, + "step": 11771 + }, + { + "epoch": 0.9536616979909268, + "grad_norm": 0.03396917134523392, + "learning_rate": 0.0001692425401683244, + "loss": 0.3083, + "step": 11772 + }, + { + "epoch": 0.9537427090084252, + "grad_norm": 0.03490575775504112, + "learning_rate": 0.0001692380395157298, + "loss": 0.3028, + "step": 11773 + }, + { + "epoch": 0.9538237200259235, + "grad_norm": 0.03380593657493591, + "learning_rate": 0.00016923353886313515, + "loss": 0.3213, + "step": 11774 + }, + { + "epoch": 0.9539047310434219, + "grad_norm": 0.03855833038687706, + "learning_rate": 0.00016922903821054054, + "loss": 0.3698, + "step": 11775 + }, + { + "epoch": 0.9539857420609202, + "grad_norm": 0.03649449720978737, + "learning_rate": 0.00016922453755794593, + "loss": 0.3911, + "step": 11776 + }, + { + "epoch": 0.9540667530784187, + "grad_norm": 0.032006874680519104, + "learning_rate": 0.0001692200369053513, + "loss": 0.3117, + 
"step": 11777 + }, + { + "epoch": 0.9541477640959171, + "grad_norm": 0.037732262164354324, + "learning_rate": 0.00016921553625275665, + "loss": 0.328, + "step": 11778 + }, + { + "epoch": 0.9542287751134154, + "grad_norm": 0.034876108169555664, + "learning_rate": 0.00016921103560016204, + "loss": 0.3054, + "step": 11779 + }, + { + "epoch": 0.9543097861309138, + "grad_norm": 0.03947483375668526, + "learning_rate": 0.0001692065349475674, + "loss": 0.3248, + "step": 11780 + }, + { + "epoch": 0.9543907971484121, + "grad_norm": 0.033376701176166534, + "learning_rate": 0.00016920203429497278, + "loss": 0.2932, + "step": 11781 + }, + { + "epoch": 0.9544718081659106, + "grad_norm": 0.03565700352191925, + "learning_rate": 0.00016919753364237817, + "loss": 0.3275, + "step": 11782 + }, + { + "epoch": 0.954552819183409, + "grad_norm": 0.036495938897132874, + "learning_rate": 0.00016919303298978353, + "loss": 0.2915, + "step": 11783 + }, + { + "epoch": 0.9546338302009073, + "grad_norm": 0.036930181086063385, + "learning_rate": 0.0001691885323371889, + "loss": 0.3189, + "step": 11784 + }, + { + "epoch": 0.9547148412184057, + "grad_norm": 0.03827503323554993, + "learning_rate": 0.00016918403168459428, + "loss": 0.321, + "step": 11785 + }, + { + "epoch": 0.9547958522359041, + "grad_norm": 0.03679677098989487, + "learning_rate": 0.00016917953103199964, + "loss": 0.2892, + "step": 11786 + }, + { + "epoch": 0.9548768632534025, + "grad_norm": 0.03638848662376404, + "learning_rate": 0.00016917503037940502, + "loss": 0.3257, + "step": 11787 + }, + { + "epoch": 0.9549578742709008, + "grad_norm": 0.03703775629401207, + "learning_rate": 0.0001691705297268104, + "loss": 0.3547, + "step": 11788 + }, + { + "epoch": 0.9550388852883992, + "grad_norm": 0.03831619769334793, + "learning_rate": 0.00016916602907421577, + "loss": 0.3986, + "step": 11789 + }, + { + "epoch": 0.9551198963058976, + "grad_norm": 0.03259553760290146, + "learning_rate": 0.00016916152842162113, + "loss": 0.3045, + "step": 11790 + }, + { + "epoch": 0.955200907323396, + "grad_norm": 0.03698387369513512, + "learning_rate": 0.00016915702776902652, + "loss": 0.2949, + "step": 11791 + }, + { + "epoch": 0.9552819183408944, + "grad_norm": 0.03987254947423935, + "learning_rate": 0.00016915252711643188, + "loss": 0.3408, + "step": 11792 + }, + { + "epoch": 0.9553629293583927, + "grad_norm": 0.0366261750459671, + "learning_rate": 0.00016914802646383727, + "loss": 0.3705, + "step": 11793 + }, + { + "epoch": 0.9554439403758911, + "grad_norm": 0.03159358724951744, + "learning_rate": 0.00016914352581124265, + "loss": 0.2974, + "step": 11794 + }, + { + "epoch": 0.9555249513933896, + "grad_norm": 0.03561757504940033, + "learning_rate": 0.00016913902515864801, + "loss": 0.3321, + "step": 11795 + }, + { + "epoch": 0.9556059624108879, + "grad_norm": 0.030125150457024574, + "learning_rate": 0.00016913452450605337, + "loss": 0.2685, + "step": 11796 + }, + { + "epoch": 0.9556869734283863, + "grad_norm": 0.04590509086847305, + "learning_rate": 0.00016913002385345876, + "loss": 0.3406, + "step": 11797 + }, + { + "epoch": 0.9557679844458846, + "grad_norm": 0.03866960480809212, + "learning_rate": 0.00016912552320086415, + "loss": 0.3871, + "step": 11798 + }, + { + "epoch": 0.955848995463383, + "grad_norm": 0.03353342413902283, + "learning_rate": 0.0001691210225482695, + "loss": 0.3109, + "step": 11799 + }, + { + "epoch": 0.9559300064808814, + "grad_norm": 0.03562798351049423, + "learning_rate": 0.0001691165218956749, + "loss": 0.3502, + "step": 11800 + }, + { + "epoch": 
0.9560110174983798, + "grad_norm": 0.03377343341708183, + "learning_rate": 0.00016911202124308026, + "loss": 0.3315, + "step": 11801 + }, + { + "epoch": 0.9560920285158782, + "grad_norm": 0.034913334995508194, + "learning_rate": 0.00016910752059048562, + "loss": 0.3222, + "step": 11802 + }, + { + "epoch": 0.9561730395333765, + "grad_norm": 0.03448820114135742, + "learning_rate": 0.000169103019937891, + "loss": 0.3303, + "step": 11803 + }, + { + "epoch": 0.9562540505508749, + "grad_norm": 0.028959903866052628, + "learning_rate": 0.0001690985192852964, + "loss": 0.2945, + "step": 11804 + }, + { + "epoch": 0.9563350615683733, + "grad_norm": 0.03772129863500595, + "learning_rate": 0.00016909401863270175, + "loss": 0.3726, + "step": 11805 + }, + { + "epoch": 0.9564160725858717, + "grad_norm": 0.03504560515284538, + "learning_rate": 0.00016908951798010714, + "loss": 0.3372, + "step": 11806 + }, + { + "epoch": 0.95649708360337, + "grad_norm": 0.03733687847852707, + "learning_rate": 0.0001690850173275125, + "loss": 0.3983, + "step": 11807 + }, + { + "epoch": 0.9565780946208684, + "grad_norm": 0.03707200288772583, + "learning_rate": 0.00016908051667491786, + "loss": 0.3513, + "step": 11808 + }, + { + "epoch": 0.9566591056383669, + "grad_norm": 0.03620739281177521, + "learning_rate": 0.00016907601602232324, + "loss": 0.3467, + "step": 11809 + }, + { + "epoch": 0.9567401166558652, + "grad_norm": 0.03168788552284241, + "learning_rate": 0.00016907151536972863, + "loss": 0.2961, + "step": 11810 + }, + { + "epoch": 0.9568211276733636, + "grad_norm": 0.03348376229405403, + "learning_rate": 0.000169067014717134, + "loss": 0.2898, + "step": 11811 + }, + { + "epoch": 0.9569021386908619, + "grad_norm": 0.03537198528647423, + "learning_rate": 0.00016906251406453938, + "loss": 0.3066, + "step": 11812 + }, + { + "epoch": 0.9569831497083603, + "grad_norm": 0.037128619849681854, + "learning_rate": 0.00016905801341194474, + "loss": 0.3561, + "step": 11813 + }, + { + "epoch": 0.9570641607258588, + "grad_norm": 0.03736058622598648, + "learning_rate": 0.0001690535127593501, + "loss": 0.37, + "step": 11814 + }, + { + "epoch": 0.9571451717433571, + "grad_norm": 0.033690933138132095, + "learning_rate": 0.00016904901210675549, + "loss": 0.3121, + "step": 11815 + }, + { + "epoch": 0.9572261827608555, + "grad_norm": 0.0378740020096302, + "learning_rate": 0.00016904451145416087, + "loss": 0.3614, + "step": 11816 + }, + { + "epoch": 0.9573071937783538, + "grad_norm": 0.033114880323410034, + "learning_rate": 0.00016904001080156623, + "loss": 0.2907, + "step": 11817 + }, + { + "epoch": 0.9573882047958522, + "grad_norm": 0.04144694283604622, + "learning_rate": 0.00016903551014897162, + "loss": 0.3657, + "step": 11818 + }, + { + "epoch": 0.9574692158133506, + "grad_norm": 0.035634271800518036, + "learning_rate": 0.00016903100949637698, + "loss": 0.3649, + "step": 11819 + }, + { + "epoch": 0.957550226830849, + "grad_norm": 0.03484927490353584, + "learning_rate": 0.00016902650884378234, + "loss": 0.344, + "step": 11820 + }, + { + "epoch": 0.9576312378483474, + "grad_norm": 0.03607147932052612, + "learning_rate": 0.00016902200819118773, + "loss": 0.3054, + "step": 11821 + }, + { + "epoch": 0.9577122488658457, + "grad_norm": 0.030834076926112175, + "learning_rate": 0.00016901750753859311, + "loss": 0.3091, + "step": 11822 + }, + { + "epoch": 0.9577932598833442, + "grad_norm": 0.03185582533478737, + "learning_rate": 0.00016901300688599847, + "loss": 0.28, + "step": 11823 + }, + { + "epoch": 0.9578742709008425, + "grad_norm": 
0.038030751049518585, + "learning_rate": 0.00016900850623340386, + "loss": 0.2906, + "step": 11824 + }, + { + "epoch": 0.9579552819183409, + "grad_norm": 0.034883469343185425, + "learning_rate": 0.00016900400558080922, + "loss": 0.339, + "step": 11825 + }, + { + "epoch": 0.9580362929358393, + "grad_norm": 0.04374323785305023, + "learning_rate": 0.00016899950492821458, + "loss": 0.3658, + "step": 11826 + }, + { + "epoch": 0.9581173039533376, + "grad_norm": 0.03804289177060127, + "learning_rate": 0.00016899500427562, + "loss": 0.3321, + "step": 11827 + }, + { + "epoch": 0.9581983149708361, + "grad_norm": 0.033111900091171265, + "learning_rate": 0.00016899050362302536, + "loss": 0.3215, + "step": 11828 + }, + { + "epoch": 0.9582793259883344, + "grad_norm": 0.033424705266952515, + "learning_rate": 0.00016898600297043072, + "loss": 0.3278, + "step": 11829 + }, + { + "epoch": 0.9583603370058328, + "grad_norm": 0.032798223197460175, + "learning_rate": 0.0001689815023178361, + "loss": 0.2936, + "step": 11830 + }, + { + "epoch": 0.9584413480233311, + "grad_norm": 0.03180072829127312, + "learning_rate": 0.00016897700166524146, + "loss": 0.3061, + "step": 11831 + }, + { + "epoch": 0.9585223590408296, + "grad_norm": 0.030326934531331062, + "learning_rate": 0.00016897250101264682, + "loss": 0.3027, + "step": 11832 + }, + { + "epoch": 0.958603370058328, + "grad_norm": 0.037974853068590164, + "learning_rate": 0.00016896800036005224, + "loss": 0.309, + "step": 11833 + }, + { + "epoch": 0.9586843810758263, + "grad_norm": 0.03329726681113243, + "learning_rate": 0.0001689634997074576, + "loss": 0.2979, + "step": 11834 + }, + { + "epoch": 0.9587653920933247, + "grad_norm": 0.03679864853620529, + "learning_rate": 0.00016895899905486296, + "loss": 0.3325, + "step": 11835 + }, + { + "epoch": 0.958846403110823, + "grad_norm": 0.03249267116189003, + "learning_rate": 0.00016895449840226834, + "loss": 0.2676, + "step": 11836 + }, + { + "epoch": 0.9589274141283215, + "grad_norm": 0.035779163241386414, + "learning_rate": 0.0001689499977496737, + "loss": 0.3245, + "step": 11837 + }, + { + "epoch": 0.9590084251458199, + "grad_norm": 0.038208309561014175, + "learning_rate": 0.00016894549709707907, + "loss": 0.3432, + "step": 11838 + }, + { + "epoch": 0.9590894361633182, + "grad_norm": 0.034464698284864426, + "learning_rate": 0.00016894099644448448, + "loss": 0.2977, + "step": 11839 + }, + { + "epoch": 0.9591704471808166, + "grad_norm": 0.03685634583234787, + "learning_rate": 0.00016893649579188984, + "loss": 0.3223, + "step": 11840 + }, + { + "epoch": 0.9592514581983149, + "grad_norm": 0.036837100982666016, + "learning_rate": 0.0001689319951392952, + "loss": 0.2679, + "step": 11841 + }, + { + "epoch": 0.9593324692158134, + "grad_norm": 0.03724145144224167, + "learning_rate": 0.0001689274944867006, + "loss": 0.2949, + "step": 11842 + }, + { + "epoch": 0.9594134802333117, + "grad_norm": 0.03809111937880516, + "learning_rate": 0.00016892299383410595, + "loss": 0.3518, + "step": 11843 + }, + { + "epoch": 0.9594944912508101, + "grad_norm": 0.03469406068325043, + "learning_rate": 0.0001689184931815113, + "loss": 0.3644, + "step": 11844 + }, + { + "epoch": 0.9595755022683085, + "grad_norm": 0.03754259645938873, + "learning_rate": 0.00016891399252891672, + "loss": 0.3545, + "step": 11845 + }, + { + "epoch": 0.9596565132858069, + "grad_norm": 0.03645508363842964, + "learning_rate": 0.00016890949187632208, + "loss": 0.3223, + "step": 11846 + }, + { + "epoch": 0.9597375243033053, + "grad_norm": 0.03522754833102226, + 
"learning_rate": 0.00016890499122372744, + "loss": 0.353, + "step": 11847 + }, + { + "epoch": 0.9598185353208036, + "grad_norm": 0.036083534359931946, + "learning_rate": 0.00016890049057113283, + "loss": 0.3477, + "step": 11848 + }, + { + "epoch": 0.959899546338302, + "grad_norm": 0.03646800294518471, + "learning_rate": 0.0001688959899185382, + "loss": 0.3192, + "step": 11849 + }, + { + "epoch": 0.9599805573558003, + "grad_norm": 0.03613121807575226, + "learning_rate": 0.00016889148926594358, + "loss": 0.3567, + "step": 11850 + }, + { + "epoch": 0.9600615683732988, + "grad_norm": 0.03716088458895683, + "learning_rate": 0.00016888698861334896, + "loss": 0.3071, + "step": 11851 + }, + { + "epoch": 0.9601425793907972, + "grad_norm": 0.0373094417154789, + "learning_rate": 0.00016888248796075432, + "loss": 0.333, + "step": 11852 + }, + { + "epoch": 0.9602235904082955, + "grad_norm": 0.034615617245435715, + "learning_rate": 0.00016887798730815968, + "loss": 0.3143, + "step": 11853 + }, + { + "epoch": 0.9603046014257939, + "grad_norm": 0.041054386645555496, + "learning_rate": 0.00016887348665556507, + "loss": 0.3148, + "step": 11854 + }, + { + "epoch": 0.9603856124432922, + "grad_norm": 0.03472714498639107, + "learning_rate": 0.00016886898600297043, + "loss": 0.3177, + "step": 11855 + }, + { + "epoch": 0.9604666234607907, + "grad_norm": 0.032427918165922165, + "learning_rate": 0.00016886448535037582, + "loss": 0.279, + "step": 11856 + }, + { + "epoch": 0.9605476344782891, + "grad_norm": 0.036566201597452164, + "learning_rate": 0.0001688599846977812, + "loss": 0.3376, + "step": 11857 + }, + { + "epoch": 0.9606286454957874, + "grad_norm": 0.030958328396081924, + "learning_rate": 0.00016885548404518656, + "loss": 0.3252, + "step": 11858 + }, + { + "epoch": 0.9607096565132858, + "grad_norm": 0.02840495854616165, + "learning_rate": 0.00016885098339259192, + "loss": 0.2626, + "step": 11859 + }, + { + "epoch": 0.9607906675307842, + "grad_norm": 0.03243676573038101, + "learning_rate": 0.0001688464827399973, + "loss": 0.3303, + "step": 11860 + }, + { + "epoch": 0.9608716785482826, + "grad_norm": 0.034418750554323196, + "learning_rate": 0.00016884198208740267, + "loss": 0.3564, + "step": 11861 + }, + { + "epoch": 0.960952689565781, + "grad_norm": 0.038227930665016174, + "learning_rate": 0.00016883748143480806, + "loss": 0.3468, + "step": 11862 + }, + { + "epoch": 0.9610337005832793, + "grad_norm": 0.034565817564725876, + "learning_rate": 0.00016883298078221345, + "loss": 0.2818, + "step": 11863 + }, + { + "epoch": 0.9611147116007777, + "grad_norm": 0.03184519335627556, + "learning_rate": 0.0001688284801296188, + "loss": 0.3059, + "step": 11864 + }, + { + "epoch": 0.9611957226182761, + "grad_norm": 0.03285719081759453, + "learning_rate": 0.00016882397947702417, + "loss": 0.3113, + "step": 11865 + }, + { + "epoch": 0.9612767336357745, + "grad_norm": 0.037567757070064545, + "learning_rate": 0.00016881947882442955, + "loss": 0.2988, + "step": 11866 + }, + { + "epoch": 0.9613577446532728, + "grad_norm": 0.039061255753040314, + "learning_rate": 0.0001688149781718349, + "loss": 0.323, + "step": 11867 + }, + { + "epoch": 0.9614387556707712, + "grad_norm": 0.03476432338356972, + "learning_rate": 0.0001688104775192403, + "loss": 0.347, + "step": 11868 + }, + { + "epoch": 0.9615197666882696, + "grad_norm": 0.032409682869911194, + "learning_rate": 0.0001688059768666457, + "loss": 0.307, + "step": 11869 + }, + { + "epoch": 0.961600777705768, + "grad_norm": 0.03706350550055504, + "learning_rate": 0.00016880147621405105, 
+ "loss": 0.3321, + "step": 11870 + }, + { + "epoch": 0.9616817887232664, + "grad_norm": 0.03343446925282478, + "learning_rate": 0.0001687969755614564, + "loss": 0.3036, + "step": 11871 + }, + { + "epoch": 0.9617627997407647, + "grad_norm": 0.031622350215911865, + "learning_rate": 0.0001687924749088618, + "loss": 0.3016, + "step": 11872 + }, + { + "epoch": 0.9618438107582631, + "grad_norm": 0.032086245715618134, + "learning_rate": 0.00016878797425626715, + "loss": 0.3114, + "step": 11873 + }, + { + "epoch": 0.9619248217757616, + "grad_norm": 0.03357468545436859, + "learning_rate": 0.00016878347360367254, + "loss": 0.309, + "step": 11874 + }, + { + "epoch": 0.9620058327932599, + "grad_norm": 0.03183509781956673, + "learning_rate": 0.00016877897295107793, + "loss": 0.3006, + "step": 11875 + }, + { + "epoch": 0.9620868438107583, + "grad_norm": 0.03746958449482918, + "learning_rate": 0.0001687744722984833, + "loss": 0.3443, + "step": 11876 + }, + { + "epoch": 0.9621678548282566, + "grad_norm": 0.03041190281510353, + "learning_rate": 0.00016876997164588865, + "loss": 0.3113, + "step": 11877 + }, + { + "epoch": 0.962248865845755, + "grad_norm": 0.03378698229789734, + "learning_rate": 0.00016876547099329404, + "loss": 0.3163, + "step": 11878 + }, + { + "epoch": 0.9623298768632534, + "grad_norm": 0.03692770004272461, + "learning_rate": 0.00016876097034069942, + "loss": 0.2977, + "step": 11879 + }, + { + "epoch": 0.9624108878807518, + "grad_norm": 0.038222331553697586, + "learning_rate": 0.00016875646968810478, + "loss": 0.3146, + "step": 11880 + }, + { + "epoch": 0.9624918988982502, + "grad_norm": 0.04049944877624512, + "learning_rate": 0.00016875196903551017, + "loss": 0.3635, + "step": 11881 + }, + { + "epoch": 0.9625729099157485, + "grad_norm": 0.03273358568549156, + "learning_rate": 0.00016874746838291553, + "loss": 0.3228, + "step": 11882 + }, + { + "epoch": 0.962653920933247, + "grad_norm": 0.03491583839058876, + "learning_rate": 0.0001687429677303209, + "loss": 0.2915, + "step": 11883 + }, + { + "epoch": 0.9627349319507453, + "grad_norm": 0.037491586059331894, + "learning_rate": 0.00016873846707772628, + "loss": 0.3297, + "step": 11884 + }, + { + "epoch": 0.9628159429682437, + "grad_norm": 0.03505074232816696, + "learning_rate": 0.00016873396642513167, + "loss": 0.3111, + "step": 11885 + }, + { + "epoch": 0.962896953985742, + "grad_norm": 0.036803267896175385, + "learning_rate": 0.00016872946577253703, + "loss": 0.3451, + "step": 11886 + }, + { + "epoch": 0.9629779650032404, + "grad_norm": 0.03782041743397713, + "learning_rate": 0.0001687249651199424, + "loss": 0.3575, + "step": 11887 + }, + { + "epoch": 0.9630589760207389, + "grad_norm": 0.0339006632566452, + "learning_rate": 0.00016872046446734777, + "loss": 0.2886, + "step": 11888 + }, + { + "epoch": 0.9631399870382372, + "grad_norm": 0.038778334856033325, + "learning_rate": 0.00016871596381475313, + "loss": 0.3627, + "step": 11889 + }, + { + "epoch": 0.9632209980557356, + "grad_norm": 0.033467769622802734, + "learning_rate": 0.00016871146316215852, + "loss": 0.3282, + "step": 11890 + }, + { + "epoch": 0.9633020090732339, + "grad_norm": 0.04024284705519676, + "learning_rate": 0.0001687069625095639, + "loss": 0.3378, + "step": 11891 + }, + { + "epoch": 0.9633830200907323, + "grad_norm": 0.032425012439489365, + "learning_rate": 0.00016870246185696927, + "loss": 0.318, + "step": 11892 + }, + { + "epoch": 0.9634640311082308, + "grad_norm": 0.033856865018606186, + "learning_rate": 0.00016869796120437465, + "loss": 0.3001, + "step": 11893 + }, 
+ { + "epoch": 0.9635450421257291, + "grad_norm": 0.0412677526473999, + "learning_rate": 0.00016869346055178001, + "loss": 0.325, + "step": 11894 + }, + { + "epoch": 0.9636260531432275, + "grad_norm": 0.037516701966524124, + "learning_rate": 0.00016868895989918537, + "loss": 0.3859, + "step": 11895 + }, + { + "epoch": 0.9637070641607258, + "grad_norm": 0.03334256261587143, + "learning_rate": 0.00016868445924659076, + "loss": 0.3469, + "step": 11896 + }, + { + "epoch": 0.9637880751782243, + "grad_norm": 0.0391136072576046, + "learning_rate": 0.00016867995859399615, + "loss": 0.3106, + "step": 11897 + }, + { + "epoch": 0.9638690861957226, + "grad_norm": 0.03461690992116928, + "learning_rate": 0.0001686754579414015, + "loss": 0.3162, + "step": 11898 + }, + { + "epoch": 0.963950097213221, + "grad_norm": 0.03258664906024933, + "learning_rate": 0.0001686709572888069, + "loss": 0.3114, + "step": 11899 + }, + { + "epoch": 0.9640311082307194, + "grad_norm": 0.03400241583585739, + "learning_rate": 0.00016866645663621226, + "loss": 0.3152, + "step": 11900 + }, + { + "epoch": 0.9641121192482177, + "grad_norm": 0.034580402076244354, + "learning_rate": 0.00016866195598361762, + "loss": 0.3116, + "step": 11901 + }, + { + "epoch": 0.9641931302657162, + "grad_norm": 0.032911915332078934, + "learning_rate": 0.000168657455331023, + "loss": 0.326, + "step": 11902 + }, + { + "epoch": 0.9642741412832145, + "grad_norm": 0.035322003066539764, + "learning_rate": 0.0001686529546784284, + "loss": 0.3314, + "step": 11903 + }, + { + "epoch": 0.9643551523007129, + "grad_norm": 0.044158484786748886, + "learning_rate": 0.00016864845402583375, + "loss": 0.3966, + "step": 11904 + }, + { + "epoch": 0.9644361633182112, + "grad_norm": 0.03736729547381401, + "learning_rate": 0.00016864395337323914, + "loss": 0.2828, + "step": 11905 + }, + { + "epoch": 0.9645171743357096, + "grad_norm": 0.03226647526025772, + "learning_rate": 0.0001686394527206445, + "loss": 0.3046, + "step": 11906 + }, + { + "epoch": 0.9645981853532081, + "grad_norm": 0.03562768176198006, + "learning_rate": 0.00016863495206804986, + "loss": 0.3168, + "step": 11907 + }, + { + "epoch": 0.9646791963707064, + "grad_norm": 0.03386708348989487, + "learning_rate": 0.00016863045141545527, + "loss": 0.3114, + "step": 11908 + }, + { + "epoch": 0.9647602073882048, + "grad_norm": 0.04202236980199814, + "learning_rate": 0.00016862595076286063, + "loss": 0.3351, + "step": 11909 + }, + { + "epoch": 0.9648412184057031, + "grad_norm": 0.0363929346203804, + "learning_rate": 0.000168621450110266, + "loss": 0.3274, + "step": 11910 + }, + { + "epoch": 0.9649222294232016, + "grad_norm": 0.041646115481853485, + "learning_rate": 0.00016861694945767138, + "loss": 0.3577, + "step": 11911 + }, + { + "epoch": 0.9650032404407, + "grad_norm": 0.03489062562584877, + "learning_rate": 0.00016861244880507674, + "loss": 0.3014, + "step": 11912 + }, + { + "epoch": 0.9650842514581983, + "grad_norm": 0.03679816424846649, + "learning_rate": 0.0001686079481524821, + "loss": 0.3269, + "step": 11913 + }, + { + "epoch": 0.9651652624756967, + "grad_norm": 0.030343173071742058, + "learning_rate": 0.0001686034474998875, + "loss": 0.2979, + "step": 11914 + }, + { + "epoch": 0.965246273493195, + "grad_norm": 0.03204835578799248, + "learning_rate": 0.00016859894684729287, + "loss": 0.2832, + "step": 11915 + }, + { + "epoch": 0.9653272845106935, + "grad_norm": 0.03680717572569847, + "learning_rate": 0.00016859444619469823, + "loss": 0.3436, + "step": 11916 + }, + { + "epoch": 0.9654082955281919, + 
"grad_norm": 0.0416983887553215, + "learning_rate": 0.00016858994554210362, + "loss": 0.3516, + "step": 11917 + }, + { + "epoch": 0.9654893065456902, + "grad_norm": 0.0383131317794323, + "learning_rate": 0.00016858544488950898, + "loss": 0.3122, + "step": 11918 + }, + { + "epoch": 0.9655703175631886, + "grad_norm": 0.03952203318476677, + "learning_rate": 0.00016858094423691434, + "loss": 0.3272, + "step": 11919 + }, + { + "epoch": 0.9656513285806869, + "grad_norm": 0.03892749920487404, + "learning_rate": 0.00016857644358431975, + "loss": 0.3626, + "step": 11920 + }, + { + "epoch": 0.9657323395981854, + "grad_norm": 0.04064110666513443, + "learning_rate": 0.00016857194293172511, + "loss": 0.347, + "step": 11921 + }, + { + "epoch": 0.9658133506156837, + "grad_norm": 0.03372594714164734, + "learning_rate": 0.00016856744227913047, + "loss": 0.294, + "step": 11922 + }, + { + "epoch": 0.9658943616331821, + "grad_norm": 0.036753252148628235, + "learning_rate": 0.00016856294162653586, + "loss": 0.3506, + "step": 11923 + }, + { + "epoch": 0.9659753726506805, + "grad_norm": 0.035451460629701614, + "learning_rate": 0.00016855844097394122, + "loss": 0.327, + "step": 11924 + }, + { + "epoch": 0.9660563836681789, + "grad_norm": 0.038048360496759415, + "learning_rate": 0.00016855394032134658, + "loss": 0.3112, + "step": 11925 + }, + { + "epoch": 0.9661373946856773, + "grad_norm": 0.032932937145233154, + "learning_rate": 0.000168549439668752, + "loss": 0.3371, + "step": 11926 + }, + { + "epoch": 0.9662184057031756, + "grad_norm": 0.034424230456352234, + "learning_rate": 0.00016854493901615736, + "loss": 0.312, + "step": 11927 + }, + { + "epoch": 0.966299416720674, + "grad_norm": 0.03948157653212547, + "learning_rate": 0.00016854043836356272, + "loss": 0.3457, + "step": 11928 + }, + { + "epoch": 0.9663804277381723, + "grad_norm": 0.03489063307642937, + "learning_rate": 0.0001685359377109681, + "loss": 0.3272, + "step": 11929 + }, + { + "epoch": 0.9664614387556708, + "grad_norm": 0.038051407784223557, + "learning_rate": 0.00016853143705837346, + "loss": 0.3468, + "step": 11930 + }, + { + "epoch": 0.9665424497731692, + "grad_norm": 0.036160316318273544, + "learning_rate": 0.00016852693640577885, + "loss": 0.3286, + "step": 11931 + }, + { + "epoch": 0.9666234607906675, + "grad_norm": 0.043658457696437836, + "learning_rate": 0.00016852243575318424, + "loss": 0.3146, + "step": 11932 + }, + { + "epoch": 0.9667044718081659, + "grad_norm": 0.0352136455476284, + "learning_rate": 0.0001685179351005896, + "loss": 0.2906, + "step": 11933 + }, + { + "epoch": 0.9667854828256643, + "grad_norm": 0.03503730893135071, + "learning_rate": 0.00016851343444799496, + "loss": 0.3469, + "step": 11934 + }, + { + "epoch": 0.9668664938431627, + "grad_norm": 0.03400106355547905, + "learning_rate": 0.00016850893379540035, + "loss": 0.2711, + "step": 11935 + }, + { + "epoch": 0.9669475048606611, + "grad_norm": 0.03846592828631401, + "learning_rate": 0.0001685044331428057, + "loss": 0.343, + "step": 11936 + }, + { + "epoch": 0.9670285158781594, + "grad_norm": 0.037533823400735855, + "learning_rate": 0.0001684999324902111, + "loss": 0.3429, + "step": 11937 + }, + { + "epoch": 0.9671095268956578, + "grad_norm": 0.03524777293205261, + "learning_rate": 0.00016849543183761648, + "loss": 0.3448, + "step": 11938 + }, + { + "epoch": 0.9671905379131562, + "grad_norm": 0.0354623906314373, + "learning_rate": 0.00016849093118502184, + "loss": 0.3177, + "step": 11939 + }, + { + "epoch": 0.9672715489306546, + "grad_norm": 0.03387176990509033, + 
"learning_rate": 0.0001684864305324272, + "loss": 0.3028, + "step": 11940 + }, + { + "epoch": 0.967352559948153, + "grad_norm": 0.033984046429395676, + "learning_rate": 0.0001684819298798326, + "loss": 0.3333, + "step": 11941 + }, + { + "epoch": 0.9674335709656513, + "grad_norm": 0.034461379051208496, + "learning_rate": 0.00016847742922723795, + "loss": 0.2981, + "step": 11942 + }, + { + "epoch": 0.9675145819831497, + "grad_norm": 0.0331692174077034, + "learning_rate": 0.00016847292857464333, + "loss": 0.3362, + "step": 11943 + }, + { + "epoch": 0.9675955930006481, + "grad_norm": 0.030690953135490417, + "learning_rate": 0.00016846842792204872, + "loss": 0.2703, + "step": 11944 + }, + { + "epoch": 0.9676766040181465, + "grad_norm": 0.03597325086593628, + "learning_rate": 0.00016846392726945408, + "loss": 0.3525, + "step": 11945 + }, + { + "epoch": 0.9677576150356448, + "grad_norm": 0.03336905315518379, + "learning_rate": 0.00016845942661685944, + "loss": 0.3418, + "step": 11946 + }, + { + "epoch": 0.9678386260531432, + "grad_norm": 0.03322875499725342, + "learning_rate": 0.00016845492596426483, + "loss": 0.3311, + "step": 11947 + }, + { + "epoch": 0.9679196370706417, + "grad_norm": 0.03448665514588356, + "learning_rate": 0.0001684504253116702, + "loss": 0.3301, + "step": 11948 + }, + { + "epoch": 0.96800064808814, + "grad_norm": 0.03418492153286934, + "learning_rate": 0.00016844592465907558, + "loss": 0.3148, + "step": 11949 + }, + { + "epoch": 0.9680816591056384, + "grad_norm": 0.040306005626916885, + "learning_rate": 0.00016844142400648096, + "loss": 0.3654, + "step": 11950 + }, + { + "epoch": 0.9681626701231367, + "grad_norm": 0.0345134511590004, + "learning_rate": 0.00016843692335388632, + "loss": 0.324, + "step": 11951 + }, + { + "epoch": 0.9682436811406351, + "grad_norm": 0.03828649967908859, + "learning_rate": 0.00016843242270129168, + "loss": 0.312, + "step": 11952 + }, + { + "epoch": 0.9683246921581335, + "grad_norm": 0.03464636951684952, + "learning_rate": 0.00016842792204869707, + "loss": 0.3217, + "step": 11953 + }, + { + "epoch": 0.9684057031756319, + "grad_norm": 0.03320741280913353, + "learning_rate": 0.00016842342139610243, + "loss": 0.3334, + "step": 11954 + }, + { + "epoch": 0.9684867141931303, + "grad_norm": 0.03954046592116356, + "learning_rate": 0.00016841892074350782, + "loss": 0.3285, + "step": 11955 + }, + { + "epoch": 0.9685677252106286, + "grad_norm": 0.03934956341981888, + "learning_rate": 0.0001684144200909132, + "loss": 0.3164, + "step": 11956 + }, + { + "epoch": 0.968648736228127, + "grad_norm": 0.031792640686035156, + "learning_rate": 0.00016840991943831856, + "loss": 0.2543, + "step": 11957 + }, + { + "epoch": 0.9687297472456254, + "grad_norm": 0.037689026445150375, + "learning_rate": 0.00016840541878572392, + "loss": 0.3358, + "step": 11958 + }, + { + "epoch": 0.9688107582631238, + "grad_norm": 0.03448779508471489, + "learning_rate": 0.0001684009181331293, + "loss": 0.3659, + "step": 11959 + }, + { + "epoch": 0.9688917692806222, + "grad_norm": 0.03423665836453438, + "learning_rate": 0.0001683964174805347, + "loss": 0.314, + "step": 11960 + }, + { + "epoch": 0.9689727802981205, + "grad_norm": 0.03471982851624489, + "learning_rate": 0.00016839191682794006, + "loss": 0.3302, + "step": 11961 + }, + { + "epoch": 0.969053791315619, + "grad_norm": 0.03254758566617966, + "learning_rate": 0.00016838741617534545, + "loss": 0.3197, + "step": 11962 + }, + { + "epoch": 0.9691348023331173, + "grad_norm": 0.03508211299777031, + "learning_rate": 0.0001683829155227508, + 
"loss": 0.3282, + "step": 11963 + }, + { + "epoch": 0.9692158133506157, + "grad_norm": 0.037181735038757324, + "learning_rate": 0.00016837841487015617, + "loss": 0.3467, + "step": 11964 + }, + { + "epoch": 0.969296824368114, + "grad_norm": 0.03629257529973984, + "learning_rate": 0.00016837391421756155, + "loss": 0.3154, + "step": 11965 + }, + { + "epoch": 0.9693778353856124, + "grad_norm": 0.03312021866440773, + "learning_rate": 0.00016836941356496694, + "loss": 0.3009, + "step": 11966 + }, + { + "epoch": 0.9694588464031109, + "grad_norm": 0.0314815379679203, + "learning_rate": 0.0001683649129123723, + "loss": 0.3232, + "step": 11967 + }, + { + "epoch": 0.9695398574206092, + "grad_norm": 0.03045765683054924, + "learning_rate": 0.0001683604122597777, + "loss": 0.2831, + "step": 11968 + }, + { + "epoch": 0.9696208684381076, + "grad_norm": 0.03353416547179222, + "learning_rate": 0.00016835591160718305, + "loss": 0.2955, + "step": 11969 + }, + { + "epoch": 0.9697018794556059, + "grad_norm": 0.035733599215745926, + "learning_rate": 0.0001683514109545884, + "loss": 0.3178, + "step": 11970 + }, + { + "epoch": 0.9697828904731044, + "grad_norm": 0.03448764607310295, + "learning_rate": 0.0001683469103019938, + "loss": 0.336, + "step": 11971 + }, + { + "epoch": 0.9698639014906028, + "grad_norm": 0.035881828516721725, + "learning_rate": 0.00016834240964939918, + "loss": 0.2854, + "step": 11972 + }, + { + "epoch": 0.9699449125081011, + "grad_norm": 0.03701227903366089, + "learning_rate": 0.00016833790899680454, + "loss": 0.3392, + "step": 11973 + }, + { + "epoch": 0.9700259235255995, + "grad_norm": 0.03427772596478462, + "learning_rate": 0.00016833340834420993, + "loss": 0.3131, + "step": 11974 + }, + { + "epoch": 0.9701069345430978, + "grad_norm": 0.03733328357338905, + "learning_rate": 0.0001683289076916153, + "loss": 0.3246, + "step": 11975 + }, + { + "epoch": 0.9701879455605963, + "grad_norm": 0.03317997232079506, + "learning_rate": 0.00016832440703902065, + "loss": 0.3102, + "step": 11976 + }, + { + "epoch": 0.9702689565780946, + "grad_norm": 0.032740697264671326, + "learning_rate": 0.00016831990638642604, + "loss": 0.3, + "step": 11977 + }, + { + "epoch": 0.970349967595593, + "grad_norm": 0.03583207353949547, + "learning_rate": 0.00016831540573383142, + "loss": 0.3076, + "step": 11978 + }, + { + "epoch": 0.9704309786130914, + "grad_norm": 0.04081375524401665, + "learning_rate": 0.00016831090508123678, + "loss": 0.356, + "step": 11979 + }, + { + "epoch": 0.9705119896305897, + "grad_norm": 0.039508093148469925, + "learning_rate": 0.00016830640442864217, + "loss": 0.2998, + "step": 11980 + }, + { + "epoch": 0.9705930006480882, + "grad_norm": 0.03704408183693886, + "learning_rate": 0.00016830190377604753, + "loss": 0.3403, + "step": 11981 + }, + { + "epoch": 0.9706740116655865, + "grad_norm": 0.038443513214588165, + "learning_rate": 0.0001682974031234529, + "loss": 0.3315, + "step": 11982 + }, + { + "epoch": 0.9707550226830849, + "grad_norm": 0.03290339931845665, + "learning_rate": 0.0001682929024708583, + "loss": 0.3021, + "step": 11983 + }, + { + "epoch": 0.9708360337005832, + "grad_norm": 0.03160303458571434, + "learning_rate": 0.00016828840181826367, + "loss": 0.2866, + "step": 11984 + }, + { + "epoch": 0.9709170447180817, + "grad_norm": 0.03647925704717636, + "learning_rate": 0.00016828390116566903, + "loss": 0.3198, + "step": 11985 + }, + { + "epoch": 0.9709980557355801, + "grad_norm": 0.038130663335323334, + "learning_rate": 0.0001682794005130744, + "loss": 0.3386, + "step": 11986 + }, + { + 
"epoch": 0.9710790667530784, + "grad_norm": 0.036003727465867996, + "learning_rate": 0.00016827489986047977, + "loss": 0.2976, + "step": 11987 + }, + { + "epoch": 0.9711600777705768, + "grad_norm": 0.03716365620493889, + "learning_rate": 0.00016827039920788513, + "loss": 0.369, + "step": 11988 + }, + { + "epoch": 0.9712410887880751, + "grad_norm": 0.03195514902472496, + "learning_rate": 0.00016826589855529055, + "loss": 0.3043, + "step": 11989 + }, + { + "epoch": 0.9713220998055736, + "grad_norm": 0.03346724435687065, + "learning_rate": 0.0001682613979026959, + "loss": 0.3165, + "step": 11990 + }, + { + "epoch": 0.971403110823072, + "grad_norm": 0.029482269659638405, + "learning_rate": 0.00016825689725010127, + "loss": 0.2691, + "step": 11991 + }, + { + "epoch": 0.9714841218405703, + "grad_norm": 0.031241513788700104, + "learning_rate": 0.00016825239659750665, + "loss": 0.3227, + "step": 11992 + }, + { + "epoch": 0.9715651328580687, + "grad_norm": 0.03511514514684677, + "learning_rate": 0.00016824789594491201, + "loss": 0.3189, + "step": 11993 + }, + { + "epoch": 0.971646143875567, + "grad_norm": 0.03130088746547699, + "learning_rate": 0.00016824339529231737, + "loss": 0.3331, + "step": 11994 + }, + { + "epoch": 0.9717271548930655, + "grad_norm": 0.034547485411167145, + "learning_rate": 0.0001682388946397228, + "loss": 0.3485, + "step": 11995 + }, + { + "epoch": 0.9718081659105638, + "grad_norm": 0.04532260447740555, + "learning_rate": 0.00016823439398712815, + "loss": 0.3239, + "step": 11996 + }, + { + "epoch": 0.9718891769280622, + "grad_norm": 0.03304055333137512, + "learning_rate": 0.0001682298933345335, + "loss": 0.3066, + "step": 11997 + }, + { + "epoch": 0.9719701879455606, + "grad_norm": 0.031891681253910065, + "learning_rate": 0.0001682253926819389, + "loss": 0.3086, + "step": 11998 + }, + { + "epoch": 0.972051198963059, + "grad_norm": 0.03132178634405136, + "learning_rate": 0.00016822089202934426, + "loss": 0.2935, + "step": 11999 + }, + { + "epoch": 0.9721322099805574, + "grad_norm": 0.03411552309989929, + "learning_rate": 0.00016821639137674962, + "loss": 0.3179, + "step": 12000 + }, + { + "epoch": 0.9722132209980557, + "grad_norm": 0.03236962482333183, + "learning_rate": 0.00016821189072415503, + "loss": 0.3086, + "step": 12001 + }, + { + "epoch": 0.9722942320155541, + "grad_norm": 0.03681671991944313, + "learning_rate": 0.0001682073900715604, + "loss": 0.3545, + "step": 12002 + }, + { + "epoch": 0.9723752430330524, + "grad_norm": 0.0358586311340332, + "learning_rate": 0.00016820288941896575, + "loss": 0.2965, + "step": 12003 + }, + { + "epoch": 0.9724562540505509, + "grad_norm": 0.036946721374988556, + "learning_rate": 0.00016819838876637114, + "loss": 0.3616, + "step": 12004 + }, + { + "epoch": 0.9725372650680493, + "grad_norm": 0.037640947848558426, + "learning_rate": 0.0001681938881137765, + "loss": 0.3717, + "step": 12005 + }, + { + "epoch": 0.9726182760855476, + "grad_norm": 0.039124995470047, + "learning_rate": 0.00016818938746118186, + "loss": 0.3478, + "step": 12006 + }, + { + "epoch": 0.972699287103046, + "grad_norm": 0.03698194399476051, + "learning_rate": 0.00016818488680858727, + "loss": 0.3166, + "step": 12007 + }, + { + "epoch": 0.9727802981205443, + "grad_norm": 0.0408819317817688, + "learning_rate": 0.00016818038615599263, + "loss": 0.3266, + "step": 12008 + }, + { + "epoch": 0.9728613091380428, + "grad_norm": 0.03516923636198044, + "learning_rate": 0.000168175885503398, + "loss": 0.3507, + "step": 12009 + }, + { + "epoch": 0.9729423201555412, + "grad_norm": 
0.03453219681978226, + "learning_rate": 0.00016817138485080338, + "loss": 0.2954, + "step": 12010 + }, + { + "epoch": 0.9730233311730395, + "grad_norm": 0.04005401208996773, + "learning_rate": 0.00016816688419820874, + "loss": 0.3521, + "step": 12011 + }, + { + "epoch": 0.9731043421905379, + "grad_norm": 0.03611200675368309, + "learning_rate": 0.00016816238354561413, + "loss": 0.3156, + "step": 12012 + }, + { + "epoch": 0.9731853532080363, + "grad_norm": 0.031062688678503036, + "learning_rate": 0.0001681578828930195, + "loss": 0.2817, + "step": 12013 + }, + { + "epoch": 0.9732663642255347, + "grad_norm": 0.033956896513700485, + "learning_rate": 0.00016815338224042487, + "loss": 0.3013, + "step": 12014 + }, + { + "epoch": 0.973347375243033, + "grad_norm": 0.03631037846207619, + "learning_rate": 0.00016814888158783023, + "loss": 0.3182, + "step": 12015 + }, + { + "epoch": 0.9734283862605314, + "grad_norm": 0.035651616752147675, + "learning_rate": 0.00016814438093523562, + "loss": 0.3212, + "step": 12016 + }, + { + "epoch": 0.9735093972780298, + "grad_norm": 0.036742936819791794, + "learning_rate": 0.00016813988028264098, + "loss": 0.3165, + "step": 12017 + }, + { + "epoch": 0.9735904082955282, + "grad_norm": 0.03688966482877731, + "learning_rate": 0.00016813537963004637, + "loss": 0.3841, + "step": 12018 + }, + { + "epoch": 0.9736714193130266, + "grad_norm": 0.03511689230799675, + "learning_rate": 0.00016813087897745176, + "loss": 0.29, + "step": 12019 + }, + { + "epoch": 0.9737524303305249, + "grad_norm": 0.0386803075671196, + "learning_rate": 0.00016812637832485712, + "loss": 0.3592, + "step": 12020 + }, + { + "epoch": 0.9738334413480233, + "grad_norm": 0.03346274420619011, + "learning_rate": 0.00016812187767226248, + "loss": 0.2931, + "step": 12021 + }, + { + "epoch": 0.9739144523655218, + "grad_norm": 0.03622061759233475, + "learning_rate": 0.00016811737701966786, + "loss": 0.3199, + "step": 12022 + }, + { + "epoch": 0.9739954633830201, + "grad_norm": 0.043558187782764435, + "learning_rate": 0.00016811287636707322, + "loss": 0.3601, + "step": 12023 + }, + { + "epoch": 0.9740764744005185, + "grad_norm": 0.03961658477783203, + "learning_rate": 0.0001681083757144786, + "loss": 0.3545, + "step": 12024 + }, + { + "epoch": 0.9741574854180168, + "grad_norm": 0.032546672970056534, + "learning_rate": 0.000168103875061884, + "loss": 0.2833, + "step": 12025 + }, + { + "epoch": 0.9742384964355152, + "grad_norm": 0.03565071523189545, + "learning_rate": 0.00016809937440928936, + "loss": 0.3108, + "step": 12026 + }, + { + "epoch": 0.9743195074530137, + "grad_norm": 0.03935694321990013, + "learning_rate": 0.00016809487375669472, + "loss": 0.3424, + "step": 12027 + }, + { + "epoch": 0.974400518470512, + "grad_norm": 0.03298564627766609, + "learning_rate": 0.0001680903731041001, + "loss": 0.2936, + "step": 12028 + }, + { + "epoch": 0.9744815294880104, + "grad_norm": 0.029612941667437553, + "learning_rate": 0.00016808587245150546, + "loss": 0.2605, + "step": 12029 + }, + { + "epoch": 0.9745625405055087, + "grad_norm": 0.03617319092154503, + "learning_rate": 0.00016808137179891085, + "loss": 0.3319, + "step": 12030 + }, + { + "epoch": 0.9746435515230071, + "grad_norm": 0.036087553948163986, + "learning_rate": 0.00016807687114631624, + "loss": 0.3346, + "step": 12031 + }, + { + "epoch": 0.9747245625405055, + "grad_norm": 0.033548567444086075, + "learning_rate": 0.0001680723704937216, + "loss": 0.3214, + "step": 12032 + }, + { + "epoch": 0.9748055735580039, + "grad_norm": 0.03468198701739311, + 
"learning_rate": 0.00016806786984112696, + "loss": 0.3056, + "step": 12033 + }, + { + "epoch": 0.9748865845755023, + "grad_norm": 0.03659326583147049, + "learning_rate": 0.00016806336918853235, + "loss": 0.3159, + "step": 12034 + }, + { + "epoch": 0.9749675955930006, + "grad_norm": 0.03924743831157684, + "learning_rate": 0.0001680588685359377, + "loss": 0.3788, + "step": 12035 + }, + { + "epoch": 0.9750486066104991, + "grad_norm": 0.03203437477350235, + "learning_rate": 0.0001680543678833431, + "loss": 0.2802, + "step": 12036 + }, + { + "epoch": 0.9751296176279974, + "grad_norm": 0.034819379448890686, + "learning_rate": 0.00016804986723074848, + "loss": 0.3311, + "step": 12037 + }, + { + "epoch": 0.9752106286454958, + "grad_norm": 0.033765386790037155, + "learning_rate": 0.00016804536657815384, + "loss": 0.3156, + "step": 12038 + }, + { + "epoch": 0.9752916396629941, + "grad_norm": 0.035257935523986816, + "learning_rate": 0.0001680408659255592, + "loss": 0.2908, + "step": 12039 + }, + { + "epoch": 0.9753726506804925, + "grad_norm": 0.03877081349492073, + "learning_rate": 0.0001680363652729646, + "loss": 0.337, + "step": 12040 + }, + { + "epoch": 0.975453661697991, + "grad_norm": 0.03562001883983612, + "learning_rate": 0.00016803186462036997, + "loss": 0.2947, + "step": 12041 + }, + { + "epoch": 0.9755346727154893, + "grad_norm": 0.036863259971141815, + "learning_rate": 0.00016802736396777533, + "loss": 0.3313, + "step": 12042 + }, + { + "epoch": 0.9756156837329877, + "grad_norm": 0.03537532687187195, + "learning_rate": 0.00016802286331518072, + "loss": 0.3271, + "step": 12043 + }, + { + "epoch": 0.975696694750486, + "grad_norm": 0.031441546976566315, + "learning_rate": 0.00016801836266258608, + "loss": 0.2997, + "step": 12044 + }, + { + "epoch": 0.9757777057679844, + "grad_norm": 0.029675081372261047, + "learning_rate": 0.00016801386200999144, + "loss": 0.2649, + "step": 12045 + }, + { + "epoch": 0.9758587167854829, + "grad_norm": 0.038790564984083176, + "learning_rate": 0.00016800936135739683, + "loss": 0.3773, + "step": 12046 + }, + { + "epoch": 0.9759397278029812, + "grad_norm": 0.031189288944005966, + "learning_rate": 0.00016800486070480222, + "loss": 0.3211, + "step": 12047 + }, + { + "epoch": 0.9760207388204796, + "grad_norm": 0.03335803374648094, + "learning_rate": 0.00016800036005220758, + "loss": 0.3334, + "step": 12048 + }, + { + "epoch": 0.9761017498379779, + "grad_norm": 0.033088475465774536, + "learning_rate": 0.00016799585939961296, + "loss": 0.3224, + "step": 12049 + }, + { + "epoch": 0.9761827608554764, + "grad_norm": 0.037221070379018784, + "learning_rate": 0.00016799135874701832, + "loss": 0.3163, + "step": 12050 + }, + { + "epoch": 0.9762637718729748, + "grad_norm": 0.03499598801136017, + "learning_rate": 0.00016798685809442368, + "loss": 0.3521, + "step": 12051 + }, + { + "epoch": 0.9763447828904731, + "grad_norm": 0.03871636092662811, + "learning_rate": 0.00016798235744182907, + "loss": 0.3824, + "step": 12052 + }, + { + "epoch": 0.9764257939079715, + "grad_norm": 0.03844713792204857, + "learning_rate": 0.00016797785678923446, + "loss": 0.4189, + "step": 12053 + }, + { + "epoch": 0.9765068049254698, + "grad_norm": 0.037115678191185, + "learning_rate": 0.00016797335613663982, + "loss": 0.3243, + "step": 12054 + }, + { + "epoch": 0.9765878159429683, + "grad_norm": 0.040090303868055344, + "learning_rate": 0.0001679688554840452, + "loss": 0.3462, + "step": 12055 + }, + { + "epoch": 0.9766688269604666, + "grad_norm": 0.030491787940263748, + "learning_rate": 
0.00016796435483145056, + "loss": 0.3018, + "step": 12056 + }, + { + "epoch": 0.976749837977965, + "grad_norm": 0.033908918499946594, + "learning_rate": 0.00016795985417885592, + "loss": 0.2919, + "step": 12057 + }, + { + "epoch": 0.9768308489954634, + "grad_norm": 0.034717120230197906, + "learning_rate": 0.0001679553535262613, + "loss": 0.2919, + "step": 12058 + }, + { + "epoch": 0.9769118600129617, + "grad_norm": 0.031752314418554306, + "learning_rate": 0.0001679508528736667, + "loss": 0.2668, + "step": 12059 + }, + { + "epoch": 0.9769928710304602, + "grad_norm": 0.03231941908597946, + "learning_rate": 0.00016794635222107206, + "loss": 0.2891, + "step": 12060 + }, + { + "epoch": 0.9770738820479585, + "grad_norm": 0.038936734199523926, + "learning_rate": 0.00016794185156847745, + "loss": 0.3659, + "step": 12061 + }, + { + "epoch": 0.9771548930654569, + "grad_norm": 0.03846294805407524, + "learning_rate": 0.0001679373509158828, + "loss": 0.395, + "step": 12062 + }, + { + "epoch": 0.9772359040829552, + "grad_norm": 0.04152911528944969, + "learning_rate": 0.00016793285026328817, + "loss": 0.4063, + "step": 12063 + }, + { + "epoch": 0.9773169151004537, + "grad_norm": 0.03231941536068916, + "learning_rate": 0.00016792834961069358, + "loss": 0.2817, + "step": 12064 + }, + { + "epoch": 0.9773979261179521, + "grad_norm": 0.0361601822078228, + "learning_rate": 0.00016792384895809894, + "loss": 0.3337, + "step": 12065 + }, + { + "epoch": 0.9774789371354504, + "grad_norm": 0.0319807194173336, + "learning_rate": 0.0001679193483055043, + "loss": 0.3655, + "step": 12066 + }, + { + "epoch": 0.9775599481529488, + "grad_norm": 0.039843834936618805, + "learning_rate": 0.0001679148476529097, + "loss": 0.3795, + "step": 12067 + }, + { + "epoch": 0.9776409591704471, + "grad_norm": 0.03642282634973526, + "learning_rate": 0.00016791034700031505, + "loss": 0.3392, + "step": 12068 + }, + { + "epoch": 0.9777219701879456, + "grad_norm": 0.036994218826293945, + "learning_rate": 0.0001679058463477204, + "loss": 0.315, + "step": 12069 + }, + { + "epoch": 0.977802981205444, + "grad_norm": 0.040465034544467926, + "learning_rate": 0.00016790134569512582, + "loss": 0.3278, + "step": 12070 + }, + { + "epoch": 0.9778839922229423, + "grad_norm": 0.03564620018005371, + "learning_rate": 0.00016789684504253118, + "loss": 0.3344, + "step": 12071 + }, + { + "epoch": 0.9779650032404407, + "grad_norm": 0.03501134738326073, + "learning_rate": 0.00016789234438993654, + "loss": 0.3137, + "step": 12072 + }, + { + "epoch": 0.9780460142579391, + "grad_norm": 0.03337648883461952, + "learning_rate": 0.00016788784373734193, + "loss": 0.3183, + "step": 12073 + }, + { + "epoch": 0.9781270252754375, + "grad_norm": 0.0345093309879303, + "learning_rate": 0.0001678833430847473, + "loss": 0.264, + "step": 12074 + }, + { + "epoch": 0.9782080362929358, + "grad_norm": 0.0420089066028595, + "learning_rate": 0.00016787884243215265, + "loss": 0.3451, + "step": 12075 + }, + { + "epoch": 0.9782890473104342, + "grad_norm": 0.02996142767369747, + "learning_rate": 0.00016787434177955806, + "loss": 0.3257, + "step": 12076 + }, + { + "epoch": 0.9783700583279326, + "grad_norm": 0.03614578768610954, + "learning_rate": 0.00016786984112696342, + "loss": 0.3039, + "step": 12077 + }, + { + "epoch": 0.978451069345431, + "grad_norm": 0.03654995560646057, + "learning_rate": 0.00016786534047436878, + "loss": 0.3459, + "step": 12078 + }, + { + "epoch": 0.9785320803629294, + "grad_norm": 0.03231252729892731, + "learning_rate": 0.00016786083982177417, + "loss": 0.2984, + 
"step": 12079 + }, + { + "epoch": 0.9786130913804277, + "grad_norm": 0.03695613890886307, + "learning_rate": 0.00016785633916917953, + "loss": 0.3186, + "step": 12080 + }, + { + "epoch": 0.9786941023979261, + "grad_norm": 0.03418729826807976, + "learning_rate": 0.0001678518385165849, + "loss": 0.304, + "step": 12081 + }, + { + "epoch": 0.9787751134154244, + "grad_norm": 0.031289901584386826, + "learning_rate": 0.0001678473378639903, + "loss": 0.3236, + "step": 12082 + }, + { + "epoch": 0.9788561244329229, + "grad_norm": 0.03405127674341202, + "learning_rate": 0.00016784283721139567, + "loss": 0.3379, + "step": 12083 + }, + { + "epoch": 0.9789371354504213, + "grad_norm": 0.040710579603910446, + "learning_rate": 0.00016783833655880103, + "loss": 0.3391, + "step": 12084 + }, + { + "epoch": 0.9790181464679196, + "grad_norm": 0.03499474376440048, + "learning_rate": 0.0001678338359062064, + "loss": 0.3436, + "step": 12085 + }, + { + "epoch": 0.979099157485418, + "grad_norm": 0.04072335362434387, + "learning_rate": 0.00016782933525361177, + "loss": 0.3629, + "step": 12086 + }, + { + "epoch": 0.9791801685029164, + "grad_norm": 0.039114028215408325, + "learning_rate": 0.00016782483460101713, + "loss": 0.3565, + "step": 12087 + }, + { + "epoch": 0.9792611795204148, + "grad_norm": 0.03882155194878578, + "learning_rate": 0.00016782033394842255, + "loss": 0.3352, + "step": 12088 + }, + { + "epoch": 0.9793421905379132, + "grad_norm": 0.04164344444870949, + "learning_rate": 0.0001678158332958279, + "loss": 0.3681, + "step": 12089 + }, + { + "epoch": 0.9794232015554115, + "grad_norm": 0.03964466229081154, + "learning_rate": 0.00016781133264323327, + "loss": 0.3562, + "step": 12090 + }, + { + "epoch": 0.9795042125729099, + "grad_norm": 0.03374399617314339, + "learning_rate": 0.00016780683199063865, + "loss": 0.3174, + "step": 12091 + }, + { + "epoch": 0.9795852235904083, + "grad_norm": 0.03571087867021561, + "learning_rate": 0.00016780233133804401, + "loss": 0.3276, + "step": 12092 + }, + { + "epoch": 0.9796662346079067, + "grad_norm": 0.037918128073215485, + "learning_rate": 0.0001677978306854494, + "loss": 0.2998, + "step": 12093 + }, + { + "epoch": 0.979747245625405, + "grad_norm": 0.03598501533269882, + "learning_rate": 0.0001677933300328548, + "loss": 0.3824, + "step": 12094 + }, + { + "epoch": 0.9798282566429034, + "grad_norm": 0.030146285891532898, + "learning_rate": 0.00016778882938026015, + "loss": 0.2893, + "step": 12095 + }, + { + "epoch": 0.9799092676604018, + "grad_norm": 0.038079775869846344, + "learning_rate": 0.0001677843287276655, + "loss": 0.3156, + "step": 12096 + }, + { + "epoch": 0.9799902786779002, + "grad_norm": 0.0377010740339756, + "learning_rate": 0.0001677798280750709, + "loss": 0.3612, + "step": 12097 + }, + { + "epoch": 0.9800712896953986, + "grad_norm": 0.03465786576271057, + "learning_rate": 0.00016777532742247626, + "loss": 0.3, + "step": 12098 + }, + { + "epoch": 0.9801523007128969, + "grad_norm": 0.03254738450050354, + "learning_rate": 0.00016777082676988164, + "loss": 0.2917, + "step": 12099 + }, + { + "epoch": 0.9802333117303953, + "grad_norm": 0.04372397065162659, + "learning_rate": 0.00016776632611728703, + "loss": 0.3801, + "step": 12100 + }, + { + "epoch": 0.9803143227478938, + "grad_norm": 0.03171698376536369, + "learning_rate": 0.0001677618254646924, + "loss": 0.2705, + "step": 12101 + }, + { + "epoch": 0.9803953337653921, + "grad_norm": 0.03690371662378311, + "learning_rate": 0.00016775732481209775, + "loss": 0.3376, + "step": 12102 + }, + { + "epoch": 
0.9804763447828905, + "grad_norm": 0.037281621247529984, + "learning_rate": 0.00016775282415950314, + "loss": 0.3308, + "step": 12103 + }, + { + "epoch": 0.9805573558003888, + "grad_norm": 0.038886263966560364, + "learning_rate": 0.0001677483235069085, + "loss": 0.3401, + "step": 12104 + }, + { + "epoch": 0.9806383668178872, + "grad_norm": 0.032279498875141144, + "learning_rate": 0.00016774382285431388, + "loss": 0.3005, + "step": 12105 + }, + { + "epoch": 0.9807193778353857, + "grad_norm": 0.04319687932729721, + "learning_rate": 0.00016773932220171927, + "loss": 0.3647, + "step": 12106 + }, + { + "epoch": 0.980800388852884, + "grad_norm": 0.03467544540762901, + "learning_rate": 0.00016773482154912463, + "loss": 0.3023, + "step": 12107 + }, + { + "epoch": 0.9808813998703824, + "grad_norm": 0.03290662169456482, + "learning_rate": 0.00016773032089653, + "loss": 0.3045, + "step": 12108 + }, + { + "epoch": 0.9809624108878807, + "grad_norm": 0.03559955209493637, + "learning_rate": 0.00016772582024393538, + "loss": 0.3516, + "step": 12109 + }, + { + "epoch": 0.9810434219053791, + "grad_norm": 0.035643838346004486, + "learning_rate": 0.00016772131959134074, + "loss": 0.3338, + "step": 12110 + }, + { + "epoch": 0.9811244329228775, + "grad_norm": 0.03310045227408409, + "learning_rate": 0.00016771681893874613, + "loss": 0.3109, + "step": 12111 + }, + { + "epoch": 0.9812054439403759, + "grad_norm": 0.031548332422971725, + "learning_rate": 0.00016771231828615151, + "loss": 0.2958, + "step": 12112 + }, + { + "epoch": 0.9812864549578743, + "grad_norm": 0.0302834864705801, + "learning_rate": 0.00016770781763355687, + "loss": 0.305, + "step": 12113 + }, + { + "epoch": 0.9813674659753726, + "grad_norm": 0.045317355543375015, + "learning_rate": 0.00016770331698096223, + "loss": 0.383, + "step": 12114 + }, + { + "epoch": 0.9814484769928711, + "grad_norm": 0.03549464792013168, + "learning_rate": 0.00016769881632836762, + "loss": 0.3062, + "step": 12115 + }, + { + "epoch": 0.9815294880103694, + "grad_norm": 0.042516469955444336, + "learning_rate": 0.000167694315675773, + "loss": 0.3586, + "step": 12116 + }, + { + "epoch": 0.9816104990278678, + "grad_norm": 0.035244572907686234, + "learning_rate": 0.00016768981502317837, + "loss": 0.3084, + "step": 12117 + }, + { + "epoch": 0.9816915100453661, + "grad_norm": 0.037299104034900665, + "learning_rate": 0.00016768531437058376, + "loss": 0.3453, + "step": 12118 + }, + { + "epoch": 0.9817725210628645, + "grad_norm": 0.042558420449495316, + "learning_rate": 0.00016768081371798912, + "loss": 0.3437, + "step": 12119 + }, + { + "epoch": 0.981853532080363, + "grad_norm": 0.034451086074113846, + "learning_rate": 0.00016767631306539448, + "loss": 0.3098, + "step": 12120 + }, + { + "epoch": 0.9819345430978613, + "grad_norm": 0.03349757939577103, + "learning_rate": 0.00016767181241279986, + "loss": 0.3304, + "step": 12121 + }, + { + "epoch": 0.9820155541153597, + "grad_norm": 0.034323133528232574, + "learning_rate": 0.00016766731176020525, + "loss": 0.3007, + "step": 12122 + }, + { + "epoch": 0.982096565132858, + "grad_norm": 0.03829013928771019, + "learning_rate": 0.0001676628111076106, + "loss": 0.3474, + "step": 12123 + }, + { + "epoch": 0.9821775761503565, + "grad_norm": 0.03527158126235008, + "learning_rate": 0.000167658310455016, + "loss": 0.3264, + "step": 12124 + }, + { + "epoch": 0.9822585871678549, + "grad_norm": 0.03139869496226311, + "learning_rate": 0.00016765380980242136, + "loss": 0.3016, + "step": 12125 + }, + { + "epoch": 0.9823395981853532, + "grad_norm": 
0.034956566989421844, + "learning_rate": 0.00016764930914982672, + "loss": 0.3475, + "step": 12126 + }, + { + "epoch": 0.9824206092028516, + "grad_norm": 0.03701526299118996, + "learning_rate": 0.0001676448084972321, + "loss": 0.3376, + "step": 12127 + }, + { + "epoch": 0.9825016202203499, + "grad_norm": 0.03835771232843399, + "learning_rate": 0.0001676403078446375, + "loss": 0.3733, + "step": 12128 + }, + { + "epoch": 0.9825826312378484, + "grad_norm": 0.03585449978709221, + "learning_rate": 0.00016763580719204285, + "loss": 0.3314, + "step": 12129 + }, + { + "epoch": 0.9826636422553467, + "grad_norm": 0.033534858375787735, + "learning_rate": 0.00016763130653944824, + "loss": 0.3812, + "step": 12130 + }, + { + "epoch": 0.9827446532728451, + "grad_norm": 0.03601505607366562, + "learning_rate": 0.0001676268058868536, + "loss": 0.2844, + "step": 12131 + }, + { + "epoch": 0.9828256642903435, + "grad_norm": 0.03342147171497345, + "learning_rate": 0.00016762230523425896, + "loss": 0.3221, + "step": 12132 + }, + { + "epoch": 0.9829066753078418, + "grad_norm": 0.03414756432175636, + "learning_rate": 0.00016761780458166435, + "loss": 0.3119, + "step": 12133 + }, + { + "epoch": 0.9829876863253403, + "grad_norm": 0.033940937370061874, + "learning_rate": 0.00016761330392906973, + "loss": 0.3078, + "step": 12134 + }, + { + "epoch": 0.9830686973428386, + "grad_norm": 0.03797852620482445, + "learning_rate": 0.0001676088032764751, + "loss": 0.3289, + "step": 12135 + }, + { + "epoch": 0.983149708360337, + "grad_norm": 0.03421001508831978, + "learning_rate": 0.00016760430262388048, + "loss": 0.3313, + "step": 12136 + }, + { + "epoch": 0.9832307193778353, + "grad_norm": 0.034913770854473114, + "learning_rate": 0.00016759980197128584, + "loss": 0.3149, + "step": 12137 + }, + { + "epoch": 0.9833117303953338, + "grad_norm": 0.03968416526913643, + "learning_rate": 0.0001675953013186912, + "loss": 0.3632, + "step": 12138 + }, + { + "epoch": 0.9833927414128322, + "grad_norm": 0.035320572555065155, + "learning_rate": 0.0001675908006660966, + "loss": 0.3232, + "step": 12139 + }, + { + "epoch": 0.9834737524303305, + "grad_norm": 0.033722467720508575, + "learning_rate": 0.00016758630001350197, + "loss": 0.2897, + "step": 12140 + }, + { + "epoch": 0.9835547634478289, + "grad_norm": 0.03793656826019287, + "learning_rate": 0.00016758179936090733, + "loss": 0.322, + "step": 12141 + }, + { + "epoch": 0.9836357744653272, + "grad_norm": 0.032825589179992676, + "learning_rate": 0.00016757729870831272, + "loss": 0.3085, + "step": 12142 + }, + { + "epoch": 0.9837167854828257, + "grad_norm": 0.03451994061470032, + "learning_rate": 0.00016757279805571808, + "loss": 0.3335, + "step": 12143 + }, + { + "epoch": 0.9837977965003241, + "grad_norm": 0.03874523565173149, + "learning_rate": 0.00016756829740312344, + "loss": 0.3385, + "step": 12144 + }, + { + "epoch": 0.9838788075178224, + "grad_norm": 0.038094766438007355, + "learning_rate": 0.00016756379675052886, + "loss": 0.3322, + "step": 12145 + }, + { + "epoch": 0.9839598185353208, + "grad_norm": 0.036857735365629196, + "learning_rate": 0.00016755929609793422, + "loss": 0.3054, + "step": 12146 + }, + { + "epoch": 0.9840408295528191, + "grad_norm": 0.04062890261411667, + "learning_rate": 0.00016755479544533958, + "loss": 0.3367, + "step": 12147 + }, + { + "epoch": 0.9841218405703176, + "grad_norm": 0.032043006271123886, + "learning_rate": 0.00016755029479274496, + "loss": 0.3227, + "step": 12148 + }, + { + "epoch": 0.984202851587816, + "grad_norm": 0.0364932045340538, + 
"learning_rate": 0.00016754579414015032, + "loss": 0.3285, + "step": 12149 + }, + { + "epoch": 0.9842838626053143, + "grad_norm": 0.03572333604097366, + "learning_rate": 0.00016754129348755568, + "loss": 0.3439, + "step": 12150 + }, + { + "epoch": 0.9843648736228127, + "grad_norm": 0.039901476353406906, + "learning_rate": 0.0001675367928349611, + "loss": 0.3169, + "step": 12151 + }, + { + "epoch": 0.9844458846403111, + "grad_norm": 0.042402952909469604, + "learning_rate": 0.00016753229218236646, + "loss": 0.3442, + "step": 12152 + }, + { + "epoch": 0.9845268956578095, + "grad_norm": 0.036621030420064926, + "learning_rate": 0.00016752779152977182, + "loss": 0.2791, + "step": 12153 + }, + { + "epoch": 0.9846079066753078, + "grad_norm": 0.036967337131500244, + "learning_rate": 0.0001675232908771772, + "loss": 0.2982, + "step": 12154 + }, + { + "epoch": 0.9846889176928062, + "grad_norm": 0.039531875401735306, + "learning_rate": 0.00016751879022458257, + "loss": 0.3337, + "step": 12155 + }, + { + "epoch": 0.9847699287103046, + "grad_norm": 0.03411232680082321, + "learning_rate": 0.00016751428957198793, + "loss": 0.3021, + "step": 12156 + }, + { + "epoch": 0.984850939727803, + "grad_norm": 0.03600526601076126, + "learning_rate": 0.00016750978891939334, + "loss": 0.3203, + "step": 12157 + }, + { + "epoch": 0.9849319507453014, + "grad_norm": 0.04176699370145798, + "learning_rate": 0.0001675052882667987, + "loss": 0.32, + "step": 12158 + }, + { + "epoch": 0.9850129617627997, + "grad_norm": 0.036995962262153625, + "learning_rate": 0.00016750078761420406, + "loss": 0.3388, + "step": 12159 + }, + { + "epoch": 0.9850939727802981, + "grad_norm": 0.039030853658914566, + "learning_rate": 0.00016749628696160945, + "loss": 0.3352, + "step": 12160 + }, + { + "epoch": 0.9851749837977966, + "grad_norm": 0.032328519970178604, + "learning_rate": 0.0001674917863090148, + "loss": 0.276, + "step": 12161 + }, + { + "epoch": 0.9852559948152949, + "grad_norm": 0.03794671595096588, + "learning_rate": 0.00016748728565642017, + "loss": 0.3641, + "step": 12162 + }, + { + "epoch": 0.9853370058327933, + "grad_norm": 0.03538952022790909, + "learning_rate": 0.00016748278500382558, + "loss": 0.331, + "step": 12163 + }, + { + "epoch": 0.9854180168502916, + "grad_norm": 0.035941652953624725, + "learning_rate": 0.00016747828435123094, + "loss": 0.3681, + "step": 12164 + }, + { + "epoch": 0.98549902786779, + "grad_norm": 0.03243683651089668, + "learning_rate": 0.0001674737836986363, + "loss": 0.2974, + "step": 12165 + }, + { + "epoch": 0.9855800388852884, + "grad_norm": 0.03725123032927513, + "learning_rate": 0.0001674692830460417, + "loss": 0.3333, + "step": 12166 + }, + { + "epoch": 0.9856610499027868, + "grad_norm": 0.041863132268190384, + "learning_rate": 0.00016746478239344705, + "loss": 0.3771, + "step": 12167 + }, + { + "epoch": 0.9857420609202852, + "grad_norm": 0.03277840465307236, + "learning_rate": 0.00016746028174085244, + "loss": 0.3114, + "step": 12168 + }, + { + "epoch": 0.9858230719377835, + "grad_norm": 0.035245537757873535, + "learning_rate": 0.00016745578108825782, + "loss": 0.3384, + "step": 12169 + }, + { + "epoch": 0.9859040829552819, + "grad_norm": 0.04435291886329651, + "learning_rate": 0.00016745128043566318, + "loss": 0.3479, + "step": 12170 + }, + { + "epoch": 0.9859850939727803, + "grad_norm": 0.03915781155228615, + "learning_rate": 0.00016744677978306854, + "loss": 0.3298, + "step": 12171 + }, + { + "epoch": 0.9860661049902787, + "grad_norm": 0.037133507430553436, + "learning_rate": 
0.00016744227913047393, + "loss": 0.3671, + "step": 12172 + }, + { + "epoch": 0.986147116007777, + "grad_norm": 0.03917395696043968, + "learning_rate": 0.0001674377784778793, + "loss": 0.3151, + "step": 12173 + }, + { + "epoch": 0.9862281270252754, + "grad_norm": 0.03865097463130951, + "learning_rate": 0.00016743327782528468, + "loss": 0.3627, + "step": 12174 + }, + { + "epoch": 0.9863091380427739, + "grad_norm": 0.0351426862180233, + "learning_rate": 0.00016742877717269006, + "loss": 0.302, + "step": 12175 + }, + { + "epoch": 0.9863901490602722, + "grad_norm": 0.03547579050064087, + "learning_rate": 0.00016742427652009542, + "loss": 0.3213, + "step": 12176 + }, + { + "epoch": 0.9864711600777706, + "grad_norm": 0.034349165856838226, + "learning_rate": 0.00016741977586750078, + "loss": 0.3177, + "step": 12177 + }, + { + "epoch": 0.9865521710952689, + "grad_norm": 0.03572181612253189, + "learning_rate": 0.00016741527521490617, + "loss": 0.2843, + "step": 12178 + }, + { + "epoch": 0.9866331821127673, + "grad_norm": 0.03393491730093956, + "learning_rate": 0.00016741077456231153, + "loss": 0.3074, + "step": 12179 + }, + { + "epoch": 0.9867141931302658, + "grad_norm": 0.037414468824863434, + "learning_rate": 0.00016740627390971692, + "loss": 0.3719, + "step": 12180 + }, + { + "epoch": 0.9867952041477641, + "grad_norm": 0.03554844483733177, + "learning_rate": 0.0001674017732571223, + "loss": 0.3543, + "step": 12181 + }, + { + "epoch": 0.9868762151652625, + "grad_norm": 0.03349216282367706, + "learning_rate": 0.00016739727260452767, + "loss": 0.323, + "step": 12182 + }, + { + "epoch": 0.9869572261827608, + "grad_norm": 0.03510260209441185, + "learning_rate": 0.00016739277195193303, + "loss": 0.3536, + "step": 12183 + }, + { + "epoch": 0.9870382372002592, + "grad_norm": 0.033853139728307724, + "learning_rate": 0.0001673882712993384, + "loss": 0.3127, + "step": 12184 + }, + { + "epoch": 0.9871192482177576, + "grad_norm": 0.036612339317798615, + "learning_rate": 0.00016738377064674377, + "loss": 0.3265, + "step": 12185 + }, + { + "epoch": 0.987200259235256, + "grad_norm": 0.03170229122042656, + "learning_rate": 0.00016737926999414916, + "loss": 0.3193, + "step": 12186 + }, + { + "epoch": 0.9872812702527544, + "grad_norm": 0.03500431403517723, + "learning_rate": 0.00016737476934155455, + "loss": 0.3057, + "step": 12187 + }, + { + "epoch": 0.9873622812702527, + "grad_norm": 0.032831739634275436, + "learning_rate": 0.0001673702686889599, + "loss": 0.3013, + "step": 12188 + }, + { + "epoch": 0.9874432922877512, + "grad_norm": 0.03684018552303314, + "learning_rate": 0.00016736576803636527, + "loss": 0.33, + "step": 12189 + }, + { + "epoch": 0.9875243033052495, + "grad_norm": 0.037288084626197815, + "learning_rate": 0.00016736126738377065, + "loss": 0.3402, + "step": 12190 + }, + { + "epoch": 0.9876053143227479, + "grad_norm": 0.03490764647722244, + "learning_rate": 0.00016735676673117601, + "loss": 0.3314, + "step": 12191 + }, + { + "epoch": 0.9876863253402463, + "grad_norm": 0.03680797666311264, + "learning_rate": 0.0001673522660785814, + "loss": 0.3295, + "step": 12192 + }, + { + "epoch": 0.9877673363577446, + "grad_norm": 0.03332938253879547, + "learning_rate": 0.0001673477654259868, + "loss": 0.3503, + "step": 12193 + }, + { + "epoch": 0.9878483473752431, + "grad_norm": 0.037796203047037125, + "learning_rate": 0.00016734326477339215, + "loss": 0.3532, + "step": 12194 + }, + { + "epoch": 0.9879293583927414, + "grad_norm": 0.035063859075307846, + "learning_rate": 0.0001673387641207975, + "loss": 0.3151, 
+ "step": 12195 + }, + { + "epoch": 0.9880103694102398, + "grad_norm": 0.03438085690140724, + "learning_rate": 0.0001673342634682029, + "loss": 0.3169, + "step": 12196 + }, + { + "epoch": 0.9880913804277381, + "grad_norm": 0.03508453816175461, + "learning_rate": 0.00016732976281560828, + "loss": 0.3033, + "step": 12197 + }, + { + "epoch": 0.9881723914452365, + "grad_norm": 0.036688365042209625, + "learning_rate": 0.00016732526216301364, + "loss": 0.2952, + "step": 12198 + }, + { + "epoch": 0.988253402462735, + "grad_norm": 0.03475033864378929, + "learning_rate": 0.00016732076151041903, + "loss": 0.3235, + "step": 12199 + }, + { + "epoch": 0.9883344134802333, + "grad_norm": 0.03848425671458244, + "learning_rate": 0.0001673162608578244, + "loss": 0.254, + "step": 12200 + }, + { + "epoch": 0.9884154244977317, + "grad_norm": 0.036981042474508286, + "learning_rate": 0.00016731176020522975, + "loss": 0.3465, + "step": 12201 + }, + { + "epoch": 0.98849643551523, + "grad_norm": 0.03264646604657173, + "learning_rate": 0.00016730725955263514, + "loss": 0.2708, + "step": 12202 + }, + { + "epoch": 0.9885774465327285, + "grad_norm": 0.0368381030857563, + "learning_rate": 0.00016730275890004053, + "loss": 0.314, + "step": 12203 + }, + { + "epoch": 0.9886584575502269, + "grad_norm": 0.04556920379400253, + "learning_rate": 0.00016729825824744589, + "loss": 0.3608, + "step": 12204 + }, + { + "epoch": 0.9887394685677252, + "grad_norm": 0.03348538279533386, + "learning_rate": 0.00016729375759485127, + "loss": 0.2967, + "step": 12205 + }, + { + "epoch": 0.9888204795852236, + "grad_norm": 0.03768174350261688, + "learning_rate": 0.00016728925694225663, + "loss": 0.3023, + "step": 12206 + }, + { + "epoch": 0.9889014906027219, + "grad_norm": 0.036302268505096436, + "learning_rate": 0.000167284756289662, + "loss": 0.32, + "step": 12207 + }, + { + "epoch": 0.9889825016202204, + "grad_norm": 0.03752124309539795, + "learning_rate": 0.00016728025563706738, + "loss": 0.3545, + "step": 12208 + }, + { + "epoch": 0.9890635126377187, + "grad_norm": 0.04945499822497368, + "learning_rate": 0.00016727575498447277, + "loss": 0.3927, + "step": 12209 + }, + { + "epoch": 0.9891445236552171, + "grad_norm": 0.03695805370807648, + "learning_rate": 0.00016727125433187813, + "loss": 0.3258, + "step": 12210 + }, + { + "epoch": 0.9892255346727155, + "grad_norm": 0.03560155630111694, + "learning_rate": 0.00016726675367928351, + "loss": 0.3048, + "step": 12211 + }, + { + "epoch": 0.9893065456902139, + "grad_norm": 0.04134264215826988, + "learning_rate": 0.00016726225302668887, + "loss": 0.3407, + "step": 12212 + }, + { + "epoch": 0.9893875567077123, + "grad_norm": 0.035698454827070236, + "learning_rate": 0.00016725775237409423, + "loss": 0.332, + "step": 12213 + }, + { + "epoch": 0.9894685677252106, + "grad_norm": 0.03139304742217064, + "learning_rate": 0.00016725325172149962, + "loss": 0.3244, + "step": 12214 + }, + { + "epoch": 0.989549578742709, + "grad_norm": 0.03921914100646973, + "learning_rate": 0.000167248751068905, + "loss": 0.3876, + "step": 12215 + }, + { + "epoch": 0.9896305897602073, + "grad_norm": 0.036349404603242874, + "learning_rate": 0.00016724425041631037, + "loss": 0.3254, + "step": 12216 + }, + { + "epoch": 0.9897116007777058, + "grad_norm": 0.037594377994537354, + "learning_rate": 0.00016723974976371576, + "loss": 0.3609, + "step": 12217 + }, + { + "epoch": 0.9897926117952042, + "grad_norm": 0.03646136447787285, + "learning_rate": 0.00016723524911112112, + "loss": 0.3481, + "step": 12218 + }, + { + "epoch": 
0.9898736228127025, + "grad_norm": 0.031988371163606644, + "learning_rate": 0.00016723074845852648, + "loss": 0.2894, + "step": 12219 + }, + { + "epoch": 0.9899546338302009, + "grad_norm": 0.03128928318619728, + "learning_rate": 0.00016722624780593186, + "loss": 0.3097, + "step": 12220 + }, + { + "epoch": 0.9900356448476992, + "grad_norm": 0.03760599344968796, + "learning_rate": 0.00016722174715333725, + "loss": 0.3216, + "step": 12221 + }, + { + "epoch": 0.9901166558651977, + "grad_norm": 0.034738097339868546, + "learning_rate": 0.0001672172465007426, + "loss": 0.3486, + "step": 12222 + }, + { + "epoch": 0.9901976668826961, + "grad_norm": 0.03593483567237854, + "learning_rate": 0.000167212745848148, + "loss": 0.3223, + "step": 12223 + }, + { + "epoch": 0.9902786779001944, + "grad_norm": 0.034317515790462494, + "learning_rate": 0.00016720824519555336, + "loss": 0.3191, + "step": 12224 + }, + { + "epoch": 0.9903596889176928, + "grad_norm": 0.034238673746585846, + "learning_rate": 0.00016720374454295872, + "loss": 0.3271, + "step": 12225 + }, + { + "epoch": 0.9904406999351912, + "grad_norm": 0.040035225450992584, + "learning_rate": 0.00016719924389036413, + "loss": 0.3203, + "step": 12226 + }, + { + "epoch": 0.9905217109526896, + "grad_norm": 0.036540232598781586, + "learning_rate": 0.0001671947432377695, + "loss": 0.307, + "step": 12227 + }, + { + "epoch": 0.990602721970188, + "grad_norm": 0.0375128798186779, + "learning_rate": 0.00016719024258517485, + "loss": 0.3247, + "step": 12228 + }, + { + "epoch": 0.9906837329876863, + "grad_norm": 0.03315060958266258, + "learning_rate": 0.00016718574193258024, + "loss": 0.3176, + "step": 12229 + }, + { + "epoch": 0.9907647440051847, + "grad_norm": 0.03492714464664459, + "learning_rate": 0.0001671812412799856, + "loss": 0.3182, + "step": 12230 + }, + { + "epoch": 0.9908457550226831, + "grad_norm": 0.036610666662454605, + "learning_rate": 0.00016717674062739096, + "loss": 0.3409, + "step": 12231 + }, + { + "epoch": 0.9909267660401815, + "grad_norm": 0.03414812311530113, + "learning_rate": 0.00016717223997479637, + "loss": 0.3163, + "step": 12232 + }, + { + "epoch": 0.9910077770576798, + "grad_norm": 0.03545241430401802, + "learning_rate": 0.00016716773932220173, + "loss": 0.3071, + "step": 12233 + }, + { + "epoch": 0.9910887880751782, + "grad_norm": 0.04074105620384216, + "learning_rate": 0.0001671632386696071, + "loss": 0.3231, + "step": 12234 + }, + { + "epoch": 0.9911697990926766, + "grad_norm": 0.03582175076007843, + "learning_rate": 0.00016715873801701248, + "loss": 0.3559, + "step": 12235 + }, + { + "epoch": 0.991250810110175, + "grad_norm": 0.038345787674188614, + "learning_rate": 0.00016715423736441784, + "loss": 0.3599, + "step": 12236 + }, + { + "epoch": 0.9913318211276734, + "grad_norm": 0.033684276044368744, + "learning_rate": 0.0001671497367118232, + "loss": 0.3029, + "step": 12237 + }, + { + "epoch": 0.9914128321451717, + "grad_norm": 0.03453746438026428, + "learning_rate": 0.00016714523605922861, + "loss": 0.2979, + "step": 12238 + }, + { + "epoch": 0.9914938431626701, + "grad_norm": 0.03586553782224655, + "learning_rate": 0.00016714073540663397, + "loss": 0.3443, + "step": 12239 + }, + { + "epoch": 0.9915748541801686, + "grad_norm": 0.03529420495033264, + "learning_rate": 0.00016713623475403933, + "loss": 0.3262, + "step": 12240 + }, + { + "epoch": 0.9916558651976669, + "grad_norm": 0.0342470221221447, + "learning_rate": 0.00016713173410144472, + "loss": 0.3247, + "step": 12241 + }, + { + "epoch": 0.9917368762151653, + "grad_norm": 
0.03083527274429798, + "learning_rate": 0.00016712723344885008, + "loss": 0.2956, + "step": 12242 + }, + { + "epoch": 0.9918178872326636, + "grad_norm": 0.03758340701460838, + "learning_rate": 0.00016712273279625544, + "loss": 0.31, + "step": 12243 + }, + { + "epoch": 0.991898898250162, + "grad_norm": 0.03650803864002228, + "learning_rate": 0.00016711823214366086, + "loss": 0.3517, + "step": 12244 + }, + { + "epoch": 0.9919799092676604, + "grad_norm": 0.03343210741877556, + "learning_rate": 0.00016711373149106622, + "loss": 0.2898, + "step": 12245 + }, + { + "epoch": 0.9920609202851588, + "grad_norm": 0.03352636843919754, + "learning_rate": 0.00016710923083847158, + "loss": 0.2622, + "step": 12246 + }, + { + "epoch": 0.9921419313026572, + "grad_norm": 0.03206267207860947, + "learning_rate": 0.00016710473018587696, + "loss": 0.2606, + "step": 12247 + }, + { + "epoch": 0.9922229423201555, + "grad_norm": 0.03664777800440788, + "learning_rate": 0.00016710022953328232, + "loss": 0.3447, + "step": 12248 + }, + { + "epoch": 0.9923039533376539, + "grad_norm": 0.0360250361263752, + "learning_rate": 0.0001670957288806877, + "loss": 0.3173, + "step": 12249 + }, + { + "epoch": 0.9923849643551523, + "grad_norm": 0.032525595277547836, + "learning_rate": 0.0001670912282280931, + "loss": 0.2989, + "step": 12250 + }, + { + "epoch": 0.9924659753726507, + "grad_norm": 0.03685523942112923, + "learning_rate": 0.00016708672757549846, + "loss": 0.3359, + "step": 12251 + }, + { + "epoch": 0.992546986390149, + "grad_norm": 0.039908938109874725, + "learning_rate": 0.00016708222692290382, + "loss": 0.3506, + "step": 12252 + }, + { + "epoch": 0.9926279974076474, + "grad_norm": 0.03686125949025154, + "learning_rate": 0.0001670777262703092, + "loss": 0.2967, + "step": 12253 + }, + { + "epoch": 0.9927090084251459, + "grad_norm": 0.03284912183880806, + "learning_rate": 0.00016707322561771457, + "loss": 0.3224, + "step": 12254 + }, + { + "epoch": 0.9927900194426442, + "grad_norm": 0.037038374692201614, + "learning_rate": 0.00016706872496511995, + "loss": 0.3101, + "step": 12255 + }, + { + "epoch": 0.9928710304601426, + "grad_norm": 0.03874828293919563, + "learning_rate": 0.00016706422431252534, + "loss": 0.3035, + "step": 12256 + }, + { + "epoch": 0.9929520414776409, + "grad_norm": 0.030359134078025818, + "learning_rate": 0.0001670597236599307, + "loss": 0.3111, + "step": 12257 + }, + { + "epoch": 0.9930330524951393, + "grad_norm": 0.040259990841150284, + "learning_rate": 0.00016705522300733606, + "loss": 0.3559, + "step": 12258 + }, + { + "epoch": 0.9931140635126378, + "grad_norm": 0.03642154484987259, + "learning_rate": 0.00016705072235474145, + "loss": 0.3568, + "step": 12259 + }, + { + "epoch": 0.9931950745301361, + "grad_norm": 0.042818669229745865, + "learning_rate": 0.0001670462217021468, + "loss": 0.409, + "step": 12260 + }, + { + "epoch": 0.9932760855476345, + "grad_norm": 0.03275144472718239, + "learning_rate": 0.0001670417210495522, + "loss": 0.2907, + "step": 12261 + }, + { + "epoch": 0.9933570965651328, + "grad_norm": 0.03294292464852333, + "learning_rate": 0.00016703722039695758, + "loss": 0.3168, + "step": 12262 + }, + { + "epoch": 0.9934381075826313, + "grad_norm": 0.03793772682547569, + "learning_rate": 0.00016703271974436294, + "loss": 0.3324, + "step": 12263 + }, + { + "epoch": 0.9935191186001296, + "grad_norm": 0.03757007420063019, + "learning_rate": 0.0001670282190917683, + "loss": 0.3266, + "step": 12264 + }, + { + "epoch": 0.993600129617628, + "grad_norm": 0.03577961027622223, + "learning_rate": 
0.0001670237184391737, + "loss": 0.2696, + "step": 12265 + }, + { + "epoch": 0.9936811406351264, + "grad_norm": 0.036166343837976456, + "learning_rate": 0.00016701921778657905, + "loss": 0.2861, + "step": 12266 + }, + { + "epoch": 0.9937621516526247, + "grad_norm": 0.03126160427927971, + "learning_rate": 0.00016701471713398444, + "loss": 0.3055, + "step": 12267 + }, + { + "epoch": 0.9938431626701232, + "grad_norm": 0.04049358144402504, + "learning_rate": 0.00016701021648138982, + "loss": 0.3118, + "step": 12268 + }, + { + "epoch": 0.9939241736876215, + "grad_norm": 0.03834820166230202, + "learning_rate": 0.00016700571582879518, + "loss": 0.3451, + "step": 12269 + }, + { + "epoch": 0.9940051847051199, + "grad_norm": 0.03607412055134773, + "learning_rate": 0.00016700121517620054, + "loss": 0.325, + "step": 12270 + }, + { + "epoch": 0.9940861957226182, + "grad_norm": 0.03167823702096939, + "learning_rate": 0.00016699671452360593, + "loss": 0.3014, + "step": 12271 + }, + { + "epoch": 0.9941672067401166, + "grad_norm": 0.038372207432985306, + "learning_rate": 0.0001669922138710113, + "loss": 0.3583, + "step": 12272 + }, + { + "epoch": 0.9942482177576151, + "grad_norm": 0.03487422317266464, + "learning_rate": 0.00016698771321841668, + "loss": 0.3359, + "step": 12273 + }, + { + "epoch": 0.9943292287751134, + "grad_norm": 0.03847598284482956, + "learning_rate": 0.00016698321256582206, + "loss": 0.335, + "step": 12274 + }, + { + "epoch": 0.9944102397926118, + "grad_norm": 0.03265409916639328, + "learning_rate": 0.00016697871191322742, + "loss": 0.2944, + "step": 12275 + }, + { + "epoch": 0.9944912508101101, + "grad_norm": 0.03902866318821907, + "learning_rate": 0.00016697421126063278, + "loss": 0.3686, + "step": 12276 + }, + { + "epoch": 0.9945722618276086, + "grad_norm": 0.03776983544230461, + "learning_rate": 0.00016696971060803817, + "loss": 0.3537, + "step": 12277 + }, + { + "epoch": 0.994653272845107, + "grad_norm": 0.03842049464583397, + "learning_rate": 0.00016696520995544356, + "loss": 0.2971, + "step": 12278 + }, + { + "epoch": 0.9947342838626053, + "grad_norm": 0.03293439745903015, + "learning_rate": 0.00016696070930284892, + "loss": 0.2864, + "step": 12279 + }, + { + "epoch": 0.9948152948801037, + "grad_norm": 0.033829886466264725, + "learning_rate": 0.0001669562086502543, + "loss": 0.2979, + "step": 12280 + }, + { + "epoch": 0.994896305897602, + "grad_norm": 0.04007561877369881, + "learning_rate": 0.00016695170799765967, + "loss": 0.298, + "step": 12281 + }, + { + "epoch": 0.9949773169151005, + "grad_norm": 0.03744842857122421, + "learning_rate": 0.00016694720734506503, + "loss": 0.299, + "step": 12282 + }, + { + "epoch": 0.9950583279325989, + "grad_norm": 0.034444358199834824, + "learning_rate": 0.0001669427066924704, + "loss": 0.2833, + "step": 12283 + }, + { + "epoch": 0.9951393389500972, + "grad_norm": 0.044794388115406036, + "learning_rate": 0.0001669382060398758, + "loss": 0.338, + "step": 12284 + }, + { + "epoch": 0.9952203499675956, + "grad_norm": 0.0318923182785511, + "learning_rate": 0.00016693370538728116, + "loss": 0.2646, + "step": 12285 + }, + { + "epoch": 0.9953013609850939, + "grad_norm": 0.032464563846588135, + "learning_rate": 0.00016692920473468655, + "loss": 0.2779, + "step": 12286 + }, + { + "epoch": 0.9953823720025924, + "grad_norm": 0.03734520822763443, + "learning_rate": 0.0001669247040820919, + "loss": 0.3252, + "step": 12287 + }, + { + "epoch": 0.9954633830200907, + "grad_norm": 0.038119930773973465, + "learning_rate": 0.00016692020342949727, + "loss": 0.3091, 
+ "step": 12288 + }, + { + "epoch": 0.9955443940375891, + "grad_norm": 0.03723182529211044, + "learning_rate": 0.00016691570277690266, + "loss": 0.3309, + "step": 12289 + }, + { + "epoch": 0.9956254050550875, + "grad_norm": 0.04023761674761772, + "learning_rate": 0.00016691120212430804, + "loss": 0.3339, + "step": 12290 + }, + { + "epoch": 0.9957064160725859, + "grad_norm": 0.03981214016675949, + "learning_rate": 0.0001669067014717134, + "loss": 0.3338, + "step": 12291 + }, + { + "epoch": 0.9957874270900843, + "grad_norm": 0.04064285010099411, + "learning_rate": 0.0001669022008191188, + "loss": 0.3596, + "step": 12292 + }, + { + "epoch": 0.9958684381075826, + "grad_norm": 0.03574245050549507, + "learning_rate": 0.00016689770016652415, + "loss": 0.2799, + "step": 12293 + }, + { + "epoch": 0.995949449125081, + "grad_norm": 0.03438296541571617, + "learning_rate": 0.0001668931995139295, + "loss": 0.3576, + "step": 12294 + }, + { + "epoch": 0.9960304601425793, + "grad_norm": 0.034820858389139175, + "learning_rate": 0.0001668886988613349, + "loss": 0.2916, + "step": 12295 + }, + { + "epoch": 0.9961114711600778, + "grad_norm": 0.0366164855659008, + "learning_rate": 0.00016688419820874028, + "loss": 0.3207, + "step": 12296 + }, + { + "epoch": 0.9961924821775762, + "grad_norm": 0.0425511971116066, + "learning_rate": 0.00016687969755614564, + "loss": 0.348, + "step": 12297 + }, + { + "epoch": 0.9962734931950745, + "grad_norm": 0.03484644740819931, + "learning_rate": 0.00016687519690355103, + "loss": 0.3605, + "step": 12298 + }, + { + "epoch": 0.9963545042125729, + "grad_norm": 0.03128006309270859, + "learning_rate": 0.0001668706962509564, + "loss": 0.3131, + "step": 12299 + }, + { + "epoch": 0.9964355152300713, + "grad_norm": 0.039280328899621964, + "learning_rate": 0.00016686619559836175, + "loss": 0.3344, + "step": 12300 + }, + { + "epoch": 0.9965165262475697, + "grad_norm": 0.03059135191142559, + "learning_rate": 0.00016686169494576717, + "loss": 0.3399, + "step": 12301 + }, + { + "epoch": 0.9965975372650681, + "grad_norm": 0.033593568950891495, + "learning_rate": 0.00016685719429317253, + "loss": 0.2888, + "step": 12302 + }, + { + "epoch": 0.9966785482825664, + "grad_norm": 0.037966132164001465, + "learning_rate": 0.00016685269364057789, + "loss": 0.3288, + "step": 12303 + }, + { + "epoch": 0.9967595593000648, + "grad_norm": 0.03207383677363396, + "learning_rate": 0.00016684819298798327, + "loss": 0.2973, + "step": 12304 + }, + { + "epoch": 0.9968405703175632, + "grad_norm": 0.03187767416238785, + "learning_rate": 0.00016684369233538863, + "loss": 0.3199, + "step": 12305 + }, + { + "epoch": 0.9969215813350616, + "grad_norm": 0.03184106573462486, + "learning_rate": 0.000166839191682794, + "loss": 0.2718, + "step": 12306 + }, + { + "epoch": 0.99700259235256, + "grad_norm": 0.039083920419216156, + "learning_rate": 0.0001668346910301994, + "loss": 0.3683, + "step": 12307 + }, + { + "epoch": 0.9970836033700583, + "grad_norm": 0.033640190958976746, + "learning_rate": 0.00016683019037760477, + "loss": 0.3077, + "step": 12308 + }, + { + "epoch": 0.9971646143875567, + "grad_norm": 0.038430314511060715, + "learning_rate": 0.00016682568972501013, + "loss": 0.315, + "step": 12309 + }, + { + "epoch": 0.9972456254050551, + "grad_norm": 0.03640434890985489, + "learning_rate": 0.00016682118907241551, + "loss": 0.3046, + "step": 12310 + }, + { + "epoch": 0.9973266364225535, + "grad_norm": 0.03566000610589981, + "learning_rate": 0.00016681668841982087, + "loss": 0.3054, + "step": 12311 + }, + { + "epoch": 
0.9974076474400518, + "grad_norm": 0.04168863222002983, + "learning_rate": 0.00016681218776722623, + "loss": 0.393, + "step": 12312 + }, + { + "epoch": 0.9974886584575502, + "grad_norm": 0.03590601310133934, + "learning_rate": 0.00016680768711463165, + "loss": 0.2891, + "step": 12313 + }, + { + "epoch": 0.9975696694750487, + "grad_norm": 0.03797917440533638, + "learning_rate": 0.000166803186462037, + "loss": 0.3614, + "step": 12314 + }, + { + "epoch": 0.997650680492547, + "grad_norm": 0.03066432662308216, + "learning_rate": 0.00016679868580944237, + "loss": 0.2477, + "step": 12315 + }, + { + "epoch": 0.9977316915100454, + "grad_norm": 0.03818430006504059, + "learning_rate": 0.00016679418515684776, + "loss": 0.3103, + "step": 12316 + }, + { + "epoch": 0.9978127025275437, + "grad_norm": 0.043079014867544174, + "learning_rate": 0.00016678968450425312, + "loss": 0.294, + "step": 12317 + }, + { + "epoch": 0.9978937135450421, + "grad_norm": 0.03821693733334541, + "learning_rate": 0.00016678518385165848, + "loss": 0.3659, + "step": 12318 + }, + { + "epoch": 0.9979747245625405, + "grad_norm": 0.03290431946516037, + "learning_rate": 0.0001667806831990639, + "loss": 0.2696, + "step": 12319 + }, + { + "epoch": 0.9980557355800389, + "grad_norm": 0.037341225892305374, + "learning_rate": 0.00016677618254646925, + "loss": 0.3211, + "step": 12320 + }, + { + "epoch": 0.9981367465975373, + "grad_norm": 0.037481408566236496, + "learning_rate": 0.0001667716818938746, + "loss": 0.332, + "step": 12321 + }, + { + "epoch": 0.9982177576150356, + "grad_norm": 0.03560345619916916, + "learning_rate": 0.00016676718124128, + "loss": 0.3281, + "step": 12322 + }, + { + "epoch": 0.998298768632534, + "grad_norm": 0.036946386098861694, + "learning_rate": 0.00016676268058868536, + "loss": 0.326, + "step": 12323 + }, + { + "epoch": 0.9983797796500324, + "grad_norm": 0.03420798480510712, + "learning_rate": 0.00016675817993609072, + "loss": 0.3193, + "step": 12324 + }, + { + "epoch": 0.9984607906675308, + "grad_norm": 0.03938150405883789, + "learning_rate": 0.00016675367928349613, + "loss": 0.3319, + "step": 12325 + }, + { + "epoch": 0.9985418016850292, + "grad_norm": 0.03574543818831444, + "learning_rate": 0.0001667491786309015, + "loss": 0.3217, + "step": 12326 + }, + { + "epoch": 0.9986228127025275, + "grad_norm": 0.037979912012815475, + "learning_rate": 0.00016674467797830685, + "loss": 0.3628, + "step": 12327 + }, + { + "epoch": 0.998703823720026, + "grad_norm": 0.038943979889154434, + "learning_rate": 0.00016674017732571224, + "loss": 0.3481, + "step": 12328 + }, + { + "epoch": 0.9987848347375243, + "grad_norm": 0.03603614494204521, + "learning_rate": 0.0001667356766731176, + "loss": 0.2906, + "step": 12329 + }, + { + "epoch": 0.9988658457550227, + "grad_norm": 0.030259212478995323, + "learning_rate": 0.00016673117602052299, + "loss": 0.2556, + "step": 12330 + }, + { + "epoch": 0.998946856772521, + "grad_norm": 0.03213247284293175, + "learning_rate": 0.00016672667536792837, + "loss": 0.3307, + "step": 12331 + }, + { + "epoch": 0.9990278677900194, + "grad_norm": 0.03017602674663067, + "learning_rate": 0.00016672217471533373, + "loss": 0.2732, + "step": 12332 + }, + { + "epoch": 0.9991088788075179, + "grad_norm": 0.04762164503335953, + "learning_rate": 0.0001667176740627391, + "loss": 0.3062, + "step": 12333 + }, + { + "epoch": 0.9991898898250162, + "grad_norm": 0.03786870837211609, + "learning_rate": 0.00016671317341014448, + "loss": 0.4087, + "step": 12334 + }, + { + "epoch": 0.9992709008425146, + "grad_norm": 
0.030480682849884033, + "learning_rate": 0.00016670867275754984, + "loss": 0.2739, + "step": 12335 + }, + { + "epoch": 0.9993519118600129, + "grad_norm": 0.030645664781332016, + "learning_rate": 0.00016670417210495523, + "loss": 0.2811, + "step": 12336 + }, + { + "epoch": 0.9994329228775113, + "grad_norm": 0.033013977110385895, + "learning_rate": 0.00016669967145236062, + "loss": 0.2941, + "step": 12337 + }, + { + "epoch": 0.9995139338950098, + "grad_norm": 0.03498419001698494, + "learning_rate": 0.00016669517079976598, + "loss": 0.3035, + "step": 12338 + }, + { + "epoch": 0.9995949449125081, + "grad_norm": 0.035238780081272125, + "learning_rate": 0.00016669067014717134, + "loss": 0.3082, + "step": 12339 + }, + { + "epoch": 0.9996759559300065, + "grad_norm": 0.036749210208654404, + "learning_rate": 0.00016668616949457672, + "loss": 0.3347, + "step": 12340 + }, + { + "epoch": 0.9997569669475048, + "grad_norm": 0.037299081683158875, + "learning_rate": 0.00016668166884198208, + "loss": 0.3616, + "step": 12341 + }, + { + "epoch": 0.9998379779650033, + "grad_norm": 0.03310994803905487, + "learning_rate": 0.00016667716818938747, + "loss": 0.2927, + "step": 12342 + }, + { + "epoch": 0.9999189889825016, + "grad_norm": 0.03382963687181473, + "learning_rate": 0.00016667266753679286, + "loss": 0.3183, + "step": 12343 + }, + { + "epoch": 1.0, + "grad_norm": 0.03378660976886749, + "learning_rate": 0.00016666816688419822, + "loss": 0.3154, + "step": 12344 + }, + { + "epoch": 1.0000810110174985, + "grad_norm": 0.03642452135682106, + "learning_rate": 0.00016666366623160358, + "loss": 0.3174, + "step": 12345 + }, + { + "epoch": 1.0001620220349967, + "grad_norm": 0.03315473720431328, + "learning_rate": 0.00016665916557900896, + "loss": 0.2641, + "step": 12346 + }, + { + "epoch": 1.0002430330524952, + "grad_norm": 0.037134990096092224, + "learning_rate": 0.00016665466492641432, + "loss": 0.3233, + "step": 12347 + }, + { + "epoch": 1.0003240440699934, + "grad_norm": 0.03000638820230961, + "learning_rate": 0.0001666501642738197, + "loss": 0.269, + "step": 12348 + }, + { + "epoch": 1.000405055087492, + "grad_norm": 0.038361046463251114, + "learning_rate": 0.0001666456636212251, + "loss": 0.2948, + "step": 12349 + }, + { + "epoch": 1.0004860661049904, + "grad_norm": 0.03925226256251335, + "learning_rate": 0.00016664116296863046, + "loss": 0.3089, + "step": 12350 + }, + { + "epoch": 1.0005670771224886, + "grad_norm": 0.03771889954805374, + "learning_rate": 0.00016663666231603582, + "loss": 0.3116, + "step": 12351 + }, + { + "epoch": 1.000648088139987, + "grad_norm": 0.03906197100877762, + "learning_rate": 0.0001666321616634412, + "loss": 0.3379, + "step": 12352 + }, + { + "epoch": 1.0007290991574853, + "grad_norm": 0.03280739486217499, + "learning_rate": 0.00016662766101084657, + "loss": 0.3226, + "step": 12353 + }, + { + "epoch": 1.0008101101749838, + "grad_norm": 0.03842942789196968, + "learning_rate": 0.00016662316035825195, + "loss": 0.3435, + "step": 12354 + }, + { + "epoch": 1.0008911211924822, + "grad_norm": 0.043962206691503525, + "learning_rate": 0.00016661865970565734, + "loss": 0.3883, + "step": 12355 + }, + { + "epoch": 1.0009721322099805, + "grad_norm": 0.031128423288464546, + "learning_rate": 0.0001666141590530627, + "loss": 0.2722, + "step": 12356 + }, + { + "epoch": 1.001053143227479, + "grad_norm": 0.03812249004840851, + "learning_rate": 0.00016660965840046806, + "loss": 0.3343, + "step": 12357 + }, + { + "epoch": 1.0011341542449774, + "grad_norm": 0.035646624863147736, + "learning_rate": 
0.00016660515774787345, + "loss": 0.308, + "step": 12358 + }, + { + "epoch": 1.0012151652624757, + "grad_norm": 0.032420992851257324, + "learning_rate": 0.00016660065709527883, + "loss": 0.2966, + "step": 12359 + }, + { + "epoch": 1.0012961762799741, + "grad_norm": 0.0395088866353035, + "learning_rate": 0.0001665961564426842, + "loss": 0.387, + "step": 12360 + }, + { + "epoch": 1.0013771872974724, + "grad_norm": 0.03513415530323982, + "learning_rate": 0.00016659165579008958, + "loss": 0.3448, + "step": 12361 + }, + { + "epoch": 1.0014581983149708, + "grad_norm": 0.03443319723010063, + "learning_rate": 0.00016658715513749494, + "loss": 0.3102, + "step": 12362 + }, + { + "epoch": 1.0015392093324693, + "grad_norm": 0.03467671200633049, + "learning_rate": 0.0001665826544849003, + "loss": 0.2769, + "step": 12363 + }, + { + "epoch": 1.0016202203499676, + "grad_norm": 0.03799883648753166, + "learning_rate": 0.0001665781538323057, + "loss": 0.336, + "step": 12364 + }, + { + "epoch": 1.001701231367466, + "grad_norm": 0.046157464385032654, + "learning_rate": 0.00016657365317971108, + "loss": 0.3336, + "step": 12365 + }, + { + "epoch": 1.0017822423849643, + "grad_norm": 0.03803658112883568, + "learning_rate": 0.00016656915252711644, + "loss": 0.2913, + "step": 12366 + }, + { + "epoch": 1.0018632534024627, + "grad_norm": 0.032749395817518234, + "learning_rate": 0.00016656465187452182, + "loss": 0.2897, + "step": 12367 + }, + { + "epoch": 1.0019442644199612, + "grad_norm": 0.04132390394806862, + "learning_rate": 0.00016656015122192718, + "loss": 0.3353, + "step": 12368 + }, + { + "epoch": 1.0020252754374595, + "grad_norm": 0.03762146458029747, + "learning_rate": 0.00016655565056933254, + "loss": 0.3598, + "step": 12369 + }, + { + "epoch": 1.002106286454958, + "grad_norm": 0.038307737559080124, + "learning_rate": 0.00016655114991673793, + "loss": 0.3231, + "step": 12370 + }, + { + "epoch": 1.0021872974724562, + "grad_norm": 0.04003438353538513, + "learning_rate": 0.00016654664926414332, + "loss": 0.3427, + "step": 12371 + }, + { + "epoch": 1.0022683084899546, + "grad_norm": 0.040446821600198746, + "learning_rate": 0.00016654214861154868, + "loss": 0.3412, + "step": 12372 + }, + { + "epoch": 1.002349319507453, + "grad_norm": 0.02912214770913124, + "learning_rate": 0.00016653764795895406, + "loss": 0.2351, + "step": 12373 + }, + { + "epoch": 1.0024303305249513, + "grad_norm": 0.03535939007997513, + "learning_rate": 0.00016653314730635942, + "loss": 0.289, + "step": 12374 + }, + { + "epoch": 1.0025113415424498, + "grad_norm": 0.04004836454987526, + "learning_rate": 0.00016652864665376478, + "loss": 0.3261, + "step": 12375 + }, + { + "epoch": 1.002592352559948, + "grad_norm": 0.034039877355098724, + "learning_rate": 0.00016652414600117017, + "loss": 0.2957, + "step": 12376 + }, + { + "epoch": 1.0026733635774465, + "grad_norm": 0.03624784201383591, + "learning_rate": 0.00016651964534857556, + "loss": 0.3374, + "step": 12377 + }, + { + "epoch": 1.002754374594945, + "grad_norm": 0.03206413611769676, + "learning_rate": 0.00016651514469598092, + "loss": 0.2958, + "step": 12378 + }, + { + "epoch": 1.0028353856124432, + "grad_norm": 0.04170289263129234, + "learning_rate": 0.0001665106440433863, + "loss": 0.3392, + "step": 12379 + }, + { + "epoch": 1.0029163966299417, + "grad_norm": 0.036238156259059906, + "learning_rate": 0.00016650614339079167, + "loss": 0.3556, + "step": 12380 + }, + { + "epoch": 1.00299740764744, + "grad_norm": 0.03602059930562973, + "learning_rate": 0.00016650164273819703, + "loss": 0.3006, + 
"step": 12381 + }, + { + "epoch": 1.0030784186649384, + "grad_norm": 0.039142753928899765, + "learning_rate": 0.00016649714208560244, + "loss": 0.3518, + "step": 12382 + }, + { + "epoch": 1.0031594296824369, + "grad_norm": 0.03825153782963753, + "learning_rate": 0.0001664926414330078, + "loss": 0.3141, + "step": 12383 + }, + { + "epoch": 1.0032404406999351, + "grad_norm": 0.033279530704021454, + "learning_rate": 0.00016648814078041316, + "loss": 0.2877, + "step": 12384 + }, + { + "epoch": 1.0033214517174336, + "grad_norm": 0.03614173084497452, + "learning_rate": 0.00016648364012781855, + "loss": 0.3441, + "step": 12385 + }, + { + "epoch": 1.003402462734932, + "grad_norm": 0.039306920021772385, + "learning_rate": 0.0001664791394752239, + "loss": 0.3253, + "step": 12386 + }, + { + "epoch": 1.0034834737524303, + "grad_norm": 0.03397579491138458, + "learning_rate": 0.00016647463882262927, + "loss": 0.2791, + "step": 12387 + }, + { + "epoch": 1.0035644847699288, + "grad_norm": 0.0360901802778244, + "learning_rate": 0.00016647013817003468, + "loss": 0.3041, + "step": 12388 + }, + { + "epoch": 1.003645495787427, + "grad_norm": 0.03415064141154289, + "learning_rate": 0.00016646563751744004, + "loss": 0.2809, + "step": 12389 + }, + { + "epoch": 1.0037265068049255, + "grad_norm": 0.03701115772128105, + "learning_rate": 0.0001664611368648454, + "loss": 0.298, + "step": 12390 + }, + { + "epoch": 1.003807517822424, + "grad_norm": 0.03698696568608284, + "learning_rate": 0.0001664566362122508, + "loss": 0.3428, + "step": 12391 + }, + { + "epoch": 1.0038885288399222, + "grad_norm": 0.03689829260110855, + "learning_rate": 0.00016645213555965615, + "loss": 0.2951, + "step": 12392 + }, + { + "epoch": 1.0039695398574207, + "grad_norm": 0.03843197971582413, + "learning_rate": 0.0001664476349070615, + "loss": 0.3324, + "step": 12393 + }, + { + "epoch": 1.004050550874919, + "grad_norm": 0.03739136457443237, + "learning_rate": 0.00016644313425446692, + "loss": 0.2794, + "step": 12394 + }, + { + "epoch": 1.0041315618924174, + "grad_norm": 0.03774438798427582, + "learning_rate": 0.00016643863360187228, + "loss": 0.3869, + "step": 12395 + }, + { + "epoch": 1.0042125729099158, + "grad_norm": 0.0323587991297245, + "learning_rate": 0.00016643413294927764, + "loss": 0.283, + "step": 12396 + }, + { + "epoch": 1.004293583927414, + "grad_norm": 0.030298395082354546, + "learning_rate": 0.00016642963229668303, + "loss": 0.2823, + "step": 12397 + }, + { + "epoch": 1.0043745949449125, + "grad_norm": 0.03403405472636223, + "learning_rate": 0.0001664251316440884, + "loss": 0.3001, + "step": 12398 + }, + { + "epoch": 1.0044556059624108, + "grad_norm": 0.03376943618059158, + "learning_rate": 0.00016642063099149375, + "loss": 0.2832, + "step": 12399 + }, + { + "epoch": 1.0045366169799093, + "grad_norm": 0.039719048887491226, + "learning_rate": 0.00016641613033889917, + "loss": 0.3748, + "step": 12400 + }, + { + "epoch": 1.0046176279974077, + "grad_norm": 0.034429267048835754, + "learning_rate": 0.00016641162968630453, + "loss": 0.3099, + "step": 12401 + }, + { + "epoch": 1.004698639014906, + "grad_norm": 0.03809856250882149, + "learning_rate": 0.00016640712903370989, + "loss": 0.3077, + "step": 12402 + }, + { + "epoch": 1.0047796500324044, + "grad_norm": 0.035106975585222244, + "learning_rate": 0.00016640262838111527, + "loss": 0.3042, + "step": 12403 + }, + { + "epoch": 1.0048606610499027, + "grad_norm": 0.034927934408187866, + "learning_rate": 0.00016639812772852063, + "loss": 0.3166, + "step": 12404 + }, + { + "epoch": 
1.0049416720674011, + "grad_norm": 0.03464159741997719, + "learning_rate": 0.000166393627075926, + "loss": 0.3071, + "step": 12405 + }, + { + "epoch": 1.0050226830848996, + "grad_norm": 0.036465100944042206, + "learning_rate": 0.0001663891264233314, + "loss": 0.309, + "step": 12406 + }, + { + "epoch": 1.0051036941023979, + "grad_norm": 0.035587284713983536, + "learning_rate": 0.00016638462577073677, + "loss": 0.3246, + "step": 12407 + }, + { + "epoch": 1.0051847051198963, + "grad_norm": 0.03844556584954262, + "learning_rate": 0.00016638012511814213, + "loss": 0.2976, + "step": 12408 + }, + { + "epoch": 1.0052657161373948, + "grad_norm": 0.03252662718296051, + "learning_rate": 0.00016637562446554751, + "loss": 0.271, + "step": 12409 + }, + { + "epoch": 1.005346727154893, + "grad_norm": 0.03310299292206764, + "learning_rate": 0.00016637112381295287, + "loss": 0.3237, + "step": 12410 + }, + { + "epoch": 1.0054277381723915, + "grad_norm": 0.03731595352292061, + "learning_rate": 0.00016636662316035826, + "loss": 0.2918, + "step": 12411 + }, + { + "epoch": 1.0055087491898898, + "grad_norm": 0.03584537282586098, + "learning_rate": 0.00016636212250776365, + "loss": 0.3427, + "step": 12412 + }, + { + "epoch": 1.0055897602073882, + "grad_norm": 0.03774617984890938, + "learning_rate": 0.000166357621855169, + "loss": 0.3049, + "step": 12413 + }, + { + "epoch": 1.0056707712248867, + "grad_norm": 0.03594934195280075, + "learning_rate": 0.00016635312120257437, + "loss": 0.3343, + "step": 12414 + }, + { + "epoch": 1.005751782242385, + "grad_norm": 0.03343866392970085, + "learning_rate": 0.00016634862054997976, + "loss": 0.2872, + "step": 12415 + }, + { + "epoch": 1.0058327932598834, + "grad_norm": 0.036227621138095856, + "learning_rate": 0.00016634411989738512, + "loss": 0.3081, + "step": 12416 + }, + { + "epoch": 1.0059138042773816, + "grad_norm": 0.039095692336559296, + "learning_rate": 0.0001663396192447905, + "loss": 0.3371, + "step": 12417 + }, + { + "epoch": 1.00599481529488, + "grad_norm": 0.03556566312909126, + "learning_rate": 0.0001663351185921959, + "loss": 0.3305, + "step": 12418 + }, + { + "epoch": 1.0060758263123786, + "grad_norm": 0.04125303030014038, + "learning_rate": 0.00016633061793960125, + "loss": 0.3408, + "step": 12419 + }, + { + "epoch": 1.0061568373298768, + "grad_norm": 0.0392603874206543, + "learning_rate": 0.0001663261172870066, + "loss": 0.304, + "step": 12420 + }, + { + "epoch": 1.0062378483473753, + "grad_norm": 0.0386536680161953, + "learning_rate": 0.000166321616634412, + "loss": 0.3502, + "step": 12421 + }, + { + "epoch": 1.0063188593648735, + "grad_norm": 0.03425775468349457, + "learning_rate": 0.00016631711598181736, + "loss": 0.3169, + "step": 12422 + }, + { + "epoch": 1.006399870382372, + "grad_norm": 0.032373424619436264, + "learning_rate": 0.00016631261532922275, + "loss": 0.2838, + "step": 12423 + }, + { + "epoch": 1.0064808813998705, + "grad_norm": 0.03774702921509743, + "learning_rate": 0.00016630811467662813, + "loss": 0.3258, + "step": 12424 + }, + { + "epoch": 1.0065618924173687, + "grad_norm": 0.03462809696793556, + "learning_rate": 0.0001663036140240335, + "loss": 0.3146, + "step": 12425 + }, + { + "epoch": 1.0066429034348672, + "grad_norm": 0.03529775142669678, + "learning_rate": 0.00016629911337143885, + "loss": 0.3624, + "step": 12426 + }, + { + "epoch": 1.0067239144523654, + "grad_norm": 0.03785771131515503, + "learning_rate": 0.00016629461271884424, + "loss": 0.3217, + "step": 12427 + }, + { + "epoch": 1.0068049254698639, + "grad_norm": 
0.03252212703227997, + "learning_rate": 0.0001662901120662496, + "loss": 0.2931, + "step": 12428 + }, + { + "epoch": 1.0068859364873624, + "grad_norm": 0.03546149656176567, + "learning_rate": 0.000166285611413655, + "loss": 0.3254, + "step": 12429 + }, + { + "epoch": 1.0069669475048606, + "grad_norm": 0.034065522253513336, + "learning_rate": 0.00016628111076106037, + "loss": 0.3355, + "step": 12430 + }, + { + "epoch": 1.007047958522359, + "grad_norm": 0.03405478969216347, + "learning_rate": 0.00016627661010846573, + "loss": 0.2963, + "step": 12431 + }, + { + "epoch": 1.0071289695398573, + "grad_norm": 0.039432525634765625, + "learning_rate": 0.0001662721094558711, + "loss": 0.3565, + "step": 12432 + }, + { + "epoch": 1.0072099805573558, + "grad_norm": 0.042425453662872314, + "learning_rate": 0.00016626760880327648, + "loss": 0.3517, + "step": 12433 + }, + { + "epoch": 1.0072909915748542, + "grad_norm": 0.03917066380381584, + "learning_rate": 0.00016626310815068187, + "loss": 0.3633, + "step": 12434 + }, + { + "epoch": 1.0073720025923525, + "grad_norm": 0.0367281399667263, + "learning_rate": 0.00016625860749808723, + "loss": 0.3253, + "step": 12435 + }, + { + "epoch": 1.007453013609851, + "grad_norm": 0.03884049132466316, + "learning_rate": 0.00016625410684549262, + "loss": 0.328, + "step": 12436 + }, + { + "epoch": 1.0075340246273494, + "grad_norm": 0.04218551516532898, + "learning_rate": 0.00016624960619289798, + "loss": 0.3234, + "step": 12437 + }, + { + "epoch": 1.0076150356448477, + "grad_norm": 0.03948225453495979, + "learning_rate": 0.00016624510554030334, + "loss": 0.3129, + "step": 12438 + }, + { + "epoch": 1.0076960466623461, + "grad_norm": 0.03605709224939346, + "learning_rate": 0.00016624060488770872, + "loss": 0.3264, + "step": 12439 + }, + { + "epoch": 1.0077770576798444, + "grad_norm": 0.04017477482557297, + "learning_rate": 0.0001662361042351141, + "loss": 0.3624, + "step": 12440 + }, + { + "epoch": 1.0078580686973428, + "grad_norm": 0.03859005868434906, + "learning_rate": 0.00016623160358251947, + "loss": 0.2997, + "step": 12441 + }, + { + "epoch": 1.0079390797148413, + "grad_norm": 0.03483903780579567, + "learning_rate": 0.00016622710292992486, + "loss": 0.2916, + "step": 12442 + }, + { + "epoch": 1.0080200907323396, + "grad_norm": 0.034934330731630325, + "learning_rate": 0.00016622260227733022, + "loss": 0.3259, + "step": 12443 + }, + { + "epoch": 1.008101101749838, + "grad_norm": 0.029897112399339676, + "learning_rate": 0.00016621810162473558, + "loss": 0.2831, + "step": 12444 + }, + { + "epoch": 1.0081821127673363, + "grad_norm": 0.03647159785032272, + "learning_rate": 0.00016621360097214096, + "loss": 0.3101, + "step": 12445 + }, + { + "epoch": 1.0082631237848347, + "grad_norm": 0.035187117755413055, + "learning_rate": 0.00016620910031954635, + "loss": 0.3158, + "step": 12446 + }, + { + "epoch": 1.0083441348023332, + "grad_norm": 0.04031701385974884, + "learning_rate": 0.0001662045996669517, + "loss": 0.3255, + "step": 12447 + }, + { + "epoch": 1.0084251458198314, + "grad_norm": 0.041569486260414124, + "learning_rate": 0.0001662000990143571, + "loss": 0.3324, + "step": 12448 + }, + { + "epoch": 1.00850615683733, + "grad_norm": 0.04560425877571106, + "learning_rate": 0.00016619559836176246, + "loss": 0.3304, + "step": 12449 + }, + { + "epoch": 1.0085871678548282, + "grad_norm": 0.04038703069090843, + "learning_rate": 0.00016619109770916782, + "loss": 0.313, + "step": 12450 + }, + { + "epoch": 1.0086681788723266, + "grad_norm": 0.03961900994181633, + "learning_rate": 
0.0001661865970565732, + "loss": 0.3351, + "step": 12451 + }, + { + "epoch": 1.008749189889825, + "grad_norm": 0.03639005124568939, + "learning_rate": 0.0001661820964039786, + "loss": 0.3294, + "step": 12452 + }, + { + "epoch": 1.0088302009073233, + "grad_norm": 0.0363776758313179, + "learning_rate": 0.00016617759575138395, + "loss": 0.2832, + "step": 12453 + }, + { + "epoch": 1.0089112119248218, + "grad_norm": 0.033802758902311325, + "learning_rate": 0.00016617309509878934, + "loss": 0.3063, + "step": 12454 + }, + { + "epoch": 1.00899222294232, + "grad_norm": 0.04182208329439163, + "learning_rate": 0.0001661685944461947, + "loss": 0.3325, + "step": 12455 + }, + { + "epoch": 1.0090732339598185, + "grad_norm": 0.03720999136567116, + "learning_rate": 0.00016616409379360006, + "loss": 0.3423, + "step": 12456 + }, + { + "epoch": 1.009154244977317, + "grad_norm": 0.046621162444353104, + "learning_rate": 0.00016615959314100545, + "loss": 0.3477, + "step": 12457 + }, + { + "epoch": 1.0092352559948152, + "grad_norm": 0.0410497672855854, + "learning_rate": 0.00016615509248841083, + "loss": 0.3279, + "step": 12458 + }, + { + "epoch": 1.0093162670123137, + "grad_norm": 0.032629966735839844, + "learning_rate": 0.0001661505918358162, + "loss": 0.2904, + "step": 12459 + }, + { + "epoch": 1.0093972780298122, + "grad_norm": 0.040290966629981995, + "learning_rate": 0.00016614609118322158, + "loss": 0.3397, + "step": 12460 + }, + { + "epoch": 1.0094782890473104, + "grad_norm": 0.042767442762851715, + "learning_rate": 0.00016614159053062694, + "loss": 0.3455, + "step": 12461 + }, + { + "epoch": 1.0095593000648089, + "grad_norm": 0.03545527160167694, + "learning_rate": 0.0001661370898780323, + "loss": 0.3493, + "step": 12462 + }, + { + "epoch": 1.0096403110823071, + "grad_norm": 0.03302915021777153, + "learning_rate": 0.00016613258922543772, + "loss": 0.2907, + "step": 12463 + }, + { + "epoch": 1.0097213220998056, + "grad_norm": 0.031950343400239944, + "learning_rate": 0.00016612808857284308, + "loss": 0.3082, + "step": 12464 + }, + { + "epoch": 1.009802333117304, + "grad_norm": 0.040452003479003906, + "learning_rate": 0.00016612358792024844, + "loss": 0.3198, + "step": 12465 + }, + { + "epoch": 1.0098833441348023, + "grad_norm": 0.04159904643893242, + "learning_rate": 0.00016611908726765382, + "loss": 0.3603, + "step": 12466 + }, + { + "epoch": 1.0099643551523008, + "grad_norm": 0.0391816720366478, + "learning_rate": 0.00016611458661505918, + "loss": 0.3622, + "step": 12467 + }, + { + "epoch": 1.010045366169799, + "grad_norm": 0.03843311592936516, + "learning_rate": 0.00016611008596246454, + "loss": 0.3093, + "step": 12468 + }, + { + "epoch": 1.0101263771872975, + "grad_norm": 0.03490731120109558, + "learning_rate": 0.00016610558530986996, + "loss": 0.2906, + "step": 12469 + }, + { + "epoch": 1.010207388204796, + "grad_norm": 0.036694154143333435, + "learning_rate": 0.00016610108465727532, + "loss": 0.296, + "step": 12470 + }, + { + "epoch": 1.0102883992222942, + "grad_norm": 0.03656122460961342, + "learning_rate": 0.00016609658400468068, + "loss": 0.3082, + "step": 12471 + }, + { + "epoch": 1.0103694102397927, + "grad_norm": 0.03301248699426651, + "learning_rate": 0.00016609208335208607, + "loss": 0.3047, + "step": 12472 + }, + { + "epoch": 1.010450421257291, + "grad_norm": 0.03141988813877106, + "learning_rate": 0.00016608758269949143, + "loss": 0.3005, + "step": 12473 + }, + { + "epoch": 1.0105314322747894, + "grad_norm": 0.03507707640528679, + "learning_rate": 0.00016608308204689679, + "loss": 0.3019, + 
"step": 12474 + }, + { + "epoch": 1.0106124432922878, + "grad_norm": 0.04133852943778038, + "learning_rate": 0.0001660785813943022, + "loss": 0.3315, + "step": 12475 + }, + { + "epoch": 1.010693454309786, + "grad_norm": 0.04655103757977486, + "learning_rate": 0.00016607408074170756, + "loss": 0.3819, + "step": 12476 + }, + { + "epoch": 1.0107744653272845, + "grad_norm": 0.03938587009906769, + "learning_rate": 0.00016606958008911292, + "loss": 0.3146, + "step": 12477 + }, + { + "epoch": 1.0108554763447828, + "grad_norm": 0.03262874856591225, + "learning_rate": 0.0001660650794365183, + "loss": 0.2895, + "step": 12478 + }, + { + "epoch": 1.0109364873622813, + "grad_norm": 0.0301420446485281, + "learning_rate": 0.00016606057878392367, + "loss": 0.2891, + "step": 12479 + }, + { + "epoch": 1.0110174983797797, + "grad_norm": 0.03468109294772148, + "learning_rate": 0.00016605607813132903, + "loss": 0.3424, + "step": 12480 + }, + { + "epoch": 1.011098509397278, + "grad_norm": 0.03772737830877304, + "learning_rate": 0.00016605157747873444, + "loss": 0.3303, + "step": 12481 + }, + { + "epoch": 1.0111795204147764, + "grad_norm": 0.03408652916550636, + "learning_rate": 0.0001660470768261398, + "loss": 0.3008, + "step": 12482 + }, + { + "epoch": 1.0112605314322747, + "grad_norm": 0.03782588988542557, + "learning_rate": 0.00016604257617354516, + "loss": 0.3382, + "step": 12483 + }, + { + "epoch": 1.0113415424497731, + "grad_norm": 0.03254377096891403, + "learning_rate": 0.00016603807552095055, + "loss": 0.2517, + "step": 12484 + }, + { + "epoch": 1.0114225534672716, + "grad_norm": 0.046504005789756775, + "learning_rate": 0.0001660335748683559, + "loss": 0.3585, + "step": 12485 + }, + { + "epoch": 1.0115035644847699, + "grad_norm": 0.039096616208553314, + "learning_rate": 0.0001660290742157613, + "loss": 0.3187, + "step": 12486 + }, + { + "epoch": 1.0115845755022683, + "grad_norm": 0.04427647963166237, + "learning_rate": 0.00016602457356316668, + "loss": 0.3409, + "step": 12487 + }, + { + "epoch": 1.0116655865197668, + "grad_norm": 0.0376652292907238, + "learning_rate": 0.00016602007291057204, + "loss": 0.3174, + "step": 12488 + }, + { + "epoch": 1.011746597537265, + "grad_norm": 0.044266387820243835, + "learning_rate": 0.0001660155722579774, + "loss": 0.3298, + "step": 12489 + }, + { + "epoch": 1.0118276085547635, + "grad_norm": 0.03599374368786812, + "learning_rate": 0.0001660110716053828, + "loss": 0.307, + "step": 12490 + }, + { + "epoch": 1.0119086195722617, + "grad_norm": 0.03235464543104172, + "learning_rate": 0.00016600657095278815, + "loss": 0.2877, + "step": 12491 + }, + { + "epoch": 1.0119896305897602, + "grad_norm": 0.0392383337020874, + "learning_rate": 0.00016600207030019354, + "loss": 0.3156, + "step": 12492 + }, + { + "epoch": 1.0120706416072587, + "grad_norm": 0.030660009011626244, + "learning_rate": 0.00016599756964759892, + "loss": 0.277, + "step": 12493 + }, + { + "epoch": 1.012151652624757, + "grad_norm": 0.03609098121523857, + "learning_rate": 0.00016599306899500428, + "loss": 0.3084, + "step": 12494 + }, + { + "epoch": 1.0122326636422554, + "grad_norm": 0.032300084829330444, + "learning_rate": 0.00016598856834240964, + "loss": 0.281, + "step": 12495 + }, + { + "epoch": 1.0123136746597536, + "grad_norm": 0.04154512658715248, + "learning_rate": 0.00016598406768981503, + "loss": 0.3069, + "step": 12496 + }, + { + "epoch": 1.012394685677252, + "grad_norm": 0.04292042553424835, + "learning_rate": 0.0001659795670372204, + "loss": 0.3163, + "step": 12497 + }, + { + "epoch": 
1.0124756966947506, + "grad_norm": 0.038710352033376694, + "learning_rate": 0.00016597506638462578, + "loss": 0.3267, + "step": 12498 + }, + { + "epoch": 1.0125567077122488, + "grad_norm": 0.040433041751384735, + "learning_rate": 0.00016597056573203117, + "loss": 0.3697, + "step": 12499 + }, + { + "epoch": 1.0126377187297473, + "grad_norm": 0.03633313998579979, + "learning_rate": 0.00016596606507943653, + "loss": 0.3227, + "step": 12500 + }, + { + "epoch": 1.0127187297472455, + "grad_norm": 0.037605587393045425, + "learning_rate": 0.00016596156442684189, + "loss": 0.3401, + "step": 12501 + }, + { + "epoch": 1.012799740764744, + "grad_norm": 0.042344413697719574, + "learning_rate": 0.00016595706377424727, + "loss": 0.3238, + "step": 12502 + }, + { + "epoch": 1.0128807517822425, + "grad_norm": 0.038133446127176285, + "learning_rate": 0.00016595256312165263, + "loss": 0.3116, + "step": 12503 + }, + { + "epoch": 1.0129617627997407, + "grad_norm": 0.03859379515051842, + "learning_rate": 0.00016594806246905802, + "loss": 0.2812, + "step": 12504 + }, + { + "epoch": 1.0130427738172392, + "grad_norm": 0.03487183526158333, + "learning_rate": 0.0001659435618164634, + "loss": 0.3167, + "step": 12505 + }, + { + "epoch": 1.0131237848347374, + "grad_norm": 0.03911701962351799, + "learning_rate": 0.00016593906116386877, + "loss": 0.334, + "step": 12506 + }, + { + "epoch": 1.0132047958522359, + "grad_norm": 0.03974626958370209, + "learning_rate": 0.00016593456051127413, + "loss": 0.2963, + "step": 12507 + }, + { + "epoch": 1.0132858068697344, + "grad_norm": 0.03957384079694748, + "learning_rate": 0.00016593005985867951, + "loss": 0.3501, + "step": 12508 + }, + { + "epoch": 1.0133668178872326, + "grad_norm": 0.03603899106383324, + "learning_rate": 0.00016592555920608487, + "loss": 0.2898, + "step": 12509 + }, + { + "epoch": 1.013447828904731, + "grad_norm": 0.03210967779159546, + "learning_rate": 0.00016592105855349026, + "loss": 0.3226, + "step": 12510 + }, + { + "epoch": 1.0135288399222295, + "grad_norm": 0.03462260589003563, + "learning_rate": 0.00016591655790089565, + "loss": 0.3106, + "step": 12511 + }, + { + "epoch": 1.0136098509397278, + "grad_norm": 0.03556382656097412, + "learning_rate": 0.000165912057248301, + "loss": 0.301, + "step": 12512 + }, + { + "epoch": 1.0136908619572262, + "grad_norm": 0.03905484452843666, + "learning_rate": 0.00016590755659570637, + "loss": 0.2933, + "step": 12513 + }, + { + "epoch": 1.0137718729747245, + "grad_norm": 0.03452567011117935, + "learning_rate": 0.00016590305594311176, + "loss": 0.3317, + "step": 12514 + }, + { + "epoch": 1.013852883992223, + "grad_norm": 0.038224294781684875, + "learning_rate": 0.00016589855529051714, + "loss": 0.3469, + "step": 12515 + }, + { + "epoch": 1.0139338950097214, + "grad_norm": 0.03787916898727417, + "learning_rate": 0.0001658940546379225, + "loss": 0.3032, + "step": 12516 + }, + { + "epoch": 1.0140149060272197, + "grad_norm": 0.0391772985458374, + "learning_rate": 0.0001658895539853279, + "loss": 0.3541, + "step": 12517 + }, + { + "epoch": 1.0140959170447181, + "grad_norm": 0.04080815985798836, + "learning_rate": 0.00016588505333273325, + "loss": 0.3605, + "step": 12518 + }, + { + "epoch": 1.0141769280622164, + "grad_norm": 0.04079817235469818, + "learning_rate": 0.0001658805526801386, + "loss": 0.3003, + "step": 12519 + }, + { + "epoch": 1.0142579390797148, + "grad_norm": 0.035111624747514725, + "learning_rate": 0.000165876052027544, + "loss": 0.3102, + "step": 12520 + }, + { + "epoch": 1.0143389500972133, + "grad_norm": 
0.03829184174537659, + "learning_rate": 0.00016587155137494939, + "loss": 0.3625, + "step": 12521 + }, + { + "epoch": 1.0144199611147116, + "grad_norm": 0.039515420794487, + "learning_rate": 0.00016586705072235475, + "loss": 0.3352, + "step": 12522 + }, + { + "epoch": 1.01450097213221, + "grad_norm": 0.035139210522174835, + "learning_rate": 0.00016586255006976013, + "loss": 0.2576, + "step": 12523 + }, + { + "epoch": 1.0145819831497083, + "grad_norm": 0.04180475324392319, + "learning_rate": 0.0001658580494171655, + "loss": 0.3453, + "step": 12524 + }, + { + "epoch": 1.0146629941672067, + "grad_norm": 0.039726074784994125, + "learning_rate": 0.00016585354876457085, + "loss": 0.3493, + "step": 12525 + }, + { + "epoch": 1.0147440051847052, + "grad_norm": 0.03973781317472458, + "learning_rate": 0.00016584904811197624, + "loss": 0.3633, + "step": 12526 + }, + { + "epoch": 1.0148250162022034, + "grad_norm": 0.03894086182117462, + "learning_rate": 0.00016584454745938163, + "loss": 0.3417, + "step": 12527 + }, + { + "epoch": 1.014906027219702, + "grad_norm": 0.0374528244137764, + "learning_rate": 0.000165840046806787, + "loss": 0.3058, + "step": 12528 + }, + { + "epoch": 1.0149870382372002, + "grad_norm": 0.038687460124492645, + "learning_rate": 0.00016583554615419237, + "loss": 0.3745, + "step": 12529 + }, + { + "epoch": 1.0150680492546986, + "grad_norm": 0.05107789486646652, + "learning_rate": 0.00016583104550159773, + "loss": 0.3321, + "step": 12530 + }, + { + "epoch": 1.015149060272197, + "grad_norm": 0.03942408785223961, + "learning_rate": 0.0001658265448490031, + "loss": 0.3309, + "step": 12531 + }, + { + "epoch": 1.0152300712896953, + "grad_norm": 0.037755463272333145, + "learning_rate": 0.00016582204419640848, + "loss": 0.3305, + "step": 12532 + }, + { + "epoch": 1.0153110823071938, + "grad_norm": 0.03136523813009262, + "learning_rate": 0.00016581754354381387, + "loss": 0.2528, + "step": 12533 + }, + { + "epoch": 1.0153920933246923, + "grad_norm": 0.042500585317611694, + "learning_rate": 0.00016581304289121923, + "loss": 0.3954, + "step": 12534 + }, + { + "epoch": 1.0154731043421905, + "grad_norm": 0.03316302224993706, + "learning_rate": 0.00016580854223862462, + "loss": 0.3228, + "step": 12535 + }, + { + "epoch": 1.015554115359689, + "grad_norm": 0.03290242329239845, + "learning_rate": 0.00016580404158602998, + "loss": 0.2799, + "step": 12536 + }, + { + "epoch": 1.0156351263771872, + "grad_norm": 0.043563112616539, + "learning_rate": 0.00016579954093343534, + "loss": 0.3374, + "step": 12537 + }, + { + "epoch": 1.0157161373946857, + "grad_norm": 0.04143699258565903, + "learning_rate": 0.00016579504028084072, + "loss": 0.3435, + "step": 12538 + }, + { + "epoch": 1.0157971484121842, + "grad_norm": 0.03597777336835861, + "learning_rate": 0.0001657905396282461, + "loss": 0.3279, + "step": 12539 + }, + { + "epoch": 1.0158781594296824, + "grad_norm": 0.03355882689356804, + "learning_rate": 0.00016578603897565147, + "loss": 0.3118, + "step": 12540 + }, + { + "epoch": 1.0159591704471809, + "grad_norm": 0.0339498370885849, + "learning_rate": 0.00016578153832305686, + "loss": 0.2899, + "step": 12541 + }, + { + "epoch": 1.0160401814646791, + "grad_norm": 0.03829336538910866, + "learning_rate": 0.00016577703767046222, + "loss": 0.3292, + "step": 12542 + }, + { + "epoch": 1.0161211924821776, + "grad_norm": 0.034184329211711884, + "learning_rate": 0.00016577253701786758, + "loss": 0.3025, + "step": 12543 + }, + { + "epoch": 1.016202203499676, + "grad_norm": 0.03295601159334183, + "learning_rate": 
0.000165768036365273, + "loss": 0.2878, + "step": 12544 + }, + { + "epoch": 1.0162832145171743, + "grad_norm": 0.03263118490576744, + "learning_rate": 0.00016576353571267835, + "loss": 0.3457, + "step": 12545 + }, + { + "epoch": 1.0163642255346728, + "grad_norm": 0.03363572061061859, + "learning_rate": 0.0001657590350600837, + "loss": 0.2976, + "step": 12546 + }, + { + "epoch": 1.016445236552171, + "grad_norm": 0.037825170904397964, + "learning_rate": 0.0001657545344074891, + "loss": 0.3104, + "step": 12547 + }, + { + "epoch": 1.0165262475696695, + "grad_norm": 0.04601071774959564, + "learning_rate": 0.00016575003375489446, + "loss": 0.3304, + "step": 12548 + }, + { + "epoch": 1.016607258587168, + "grad_norm": 0.035257138311862946, + "learning_rate": 0.00016574553310229982, + "loss": 0.3397, + "step": 12549 + }, + { + "epoch": 1.0166882696046662, + "grad_norm": 0.03136202692985535, + "learning_rate": 0.00016574103244970523, + "loss": 0.2956, + "step": 12550 + }, + { + "epoch": 1.0167692806221647, + "grad_norm": 0.033236436545848846, + "learning_rate": 0.0001657365317971106, + "loss": 0.3206, + "step": 12551 + }, + { + "epoch": 1.016850291639663, + "grad_norm": 0.03548416122794151, + "learning_rate": 0.00016573203114451595, + "loss": 0.3247, + "step": 12552 + }, + { + "epoch": 1.0169313026571614, + "grad_norm": 0.036083072423934937, + "learning_rate": 0.00016572753049192134, + "loss": 0.3196, + "step": 12553 + }, + { + "epoch": 1.0170123136746598, + "grad_norm": 0.03948529064655304, + "learning_rate": 0.0001657230298393267, + "loss": 0.3812, + "step": 12554 + }, + { + "epoch": 1.017093324692158, + "grad_norm": 0.03718789666891098, + "learning_rate": 0.0001657185291867321, + "loss": 0.3371, + "step": 12555 + }, + { + "epoch": 1.0171743357096565, + "grad_norm": 0.036488890647888184, + "learning_rate": 0.00016571402853413747, + "loss": 0.3426, + "step": 12556 + }, + { + "epoch": 1.0172553467271548, + "grad_norm": 0.03828705847263336, + "learning_rate": 0.00016570952788154283, + "loss": 0.3322, + "step": 12557 + }, + { + "epoch": 1.0173363577446533, + "grad_norm": 0.03848506510257721, + "learning_rate": 0.0001657050272289482, + "loss": 0.3401, + "step": 12558 + }, + { + "epoch": 1.0174173687621517, + "grad_norm": 0.037985723465681076, + "learning_rate": 0.00016570052657635358, + "loss": 0.3494, + "step": 12559 + }, + { + "epoch": 1.01749837977965, + "grad_norm": 0.03458062931895256, + "learning_rate": 0.00016569602592375894, + "loss": 0.3056, + "step": 12560 + }, + { + "epoch": 1.0175793907971484, + "grad_norm": 0.037043794989585876, + "learning_rate": 0.00016569152527116433, + "loss": 0.3423, + "step": 12561 + }, + { + "epoch": 1.017660401814647, + "grad_norm": 0.036882054060697556, + "learning_rate": 0.00016568702461856972, + "loss": 0.323, + "step": 12562 + }, + { + "epoch": 1.0177414128321451, + "grad_norm": 0.036688271909952164, + "learning_rate": 0.00016568252396597508, + "loss": 0.3355, + "step": 12563 + }, + { + "epoch": 1.0178224238496436, + "grad_norm": 0.03149713948369026, + "learning_rate": 0.00016567802331338044, + "loss": 0.2675, + "step": 12564 + }, + { + "epoch": 1.0179034348671419, + "grad_norm": 0.035839568823575974, + "learning_rate": 0.00016567352266078582, + "loss": 0.3237, + "step": 12565 + }, + { + "epoch": 1.0179844458846403, + "grad_norm": 0.036895159631967545, + "learning_rate": 0.00016566902200819118, + "loss": 0.3171, + "step": 12566 + }, + { + "epoch": 1.0180654569021388, + "grad_norm": 0.033506982028484344, + "learning_rate": 0.00016566452135559657, + "loss": 
0.304, + "step": 12567 + }, + { + "epoch": 1.018146467919637, + "grad_norm": 0.04358444735407829, + "learning_rate": 0.00016566002070300196, + "loss": 0.3285, + "step": 12568 + }, + { + "epoch": 1.0182274789371355, + "grad_norm": 0.03842248022556305, + "learning_rate": 0.00016565552005040732, + "loss": 0.3214, + "step": 12569 + }, + { + "epoch": 1.0183084899546337, + "grad_norm": 0.03867550566792488, + "learning_rate": 0.00016565101939781268, + "loss": 0.3095, + "step": 12570 + }, + { + "epoch": 1.0183895009721322, + "grad_norm": 0.037699684500694275, + "learning_rate": 0.00016564651874521807, + "loss": 0.3342, + "step": 12571 + }, + { + "epoch": 1.0184705119896307, + "grad_norm": 0.033856455236673355, + "learning_rate": 0.00016564201809262343, + "loss": 0.2795, + "step": 12572 + }, + { + "epoch": 1.018551523007129, + "grad_norm": 0.03416990861296654, + "learning_rate": 0.0001656375174400288, + "loss": 0.2877, + "step": 12573 + }, + { + "epoch": 1.0186325340246274, + "grad_norm": 0.03922758251428604, + "learning_rate": 0.0001656330167874342, + "loss": 0.351, + "step": 12574 + }, + { + "epoch": 1.0187135450421256, + "grad_norm": 0.03650622069835663, + "learning_rate": 0.00016562851613483956, + "loss": 0.3001, + "step": 12575 + }, + { + "epoch": 1.018794556059624, + "grad_norm": 0.040765952318906784, + "learning_rate": 0.00016562401548224492, + "loss": 0.3258, + "step": 12576 + }, + { + "epoch": 1.0188755670771226, + "grad_norm": 0.03666510432958603, + "learning_rate": 0.0001656195148296503, + "loss": 0.316, + "step": 12577 + }, + { + "epoch": 1.0189565780946208, + "grad_norm": 0.0341981016099453, + "learning_rate": 0.00016561501417705567, + "loss": 0.331, + "step": 12578 + }, + { + "epoch": 1.0190375891121193, + "grad_norm": 0.035523395985364914, + "learning_rate": 0.00016561051352446105, + "loss": 0.3054, + "step": 12579 + }, + { + "epoch": 1.0191186001296175, + "grad_norm": 0.0412684865295887, + "learning_rate": 0.00016560601287186644, + "loss": 0.3498, + "step": 12580 + }, + { + "epoch": 1.019199611147116, + "grad_norm": 0.04070408269762993, + "learning_rate": 0.0001656015122192718, + "loss": 0.3212, + "step": 12581 + }, + { + "epoch": 1.0192806221646145, + "grad_norm": 0.03692404925823212, + "learning_rate": 0.00016559701156667716, + "loss": 0.2891, + "step": 12582 + }, + { + "epoch": 1.0193616331821127, + "grad_norm": 0.037462033331394196, + "learning_rate": 0.00016559251091408255, + "loss": 0.3539, + "step": 12583 + }, + { + "epoch": 1.0194426441996112, + "grad_norm": 0.03793095424771309, + "learning_rate": 0.0001655880102614879, + "loss": 0.3357, + "step": 12584 + }, + { + "epoch": 1.0195236552171094, + "grad_norm": 0.04043465107679367, + "learning_rate": 0.0001655835096088933, + "loss": 0.3406, + "step": 12585 + }, + { + "epoch": 1.0196046662346079, + "grad_norm": 0.03590738773345947, + "learning_rate": 0.00016557900895629868, + "loss": 0.3509, + "step": 12586 + }, + { + "epoch": 1.0196856772521063, + "grad_norm": 0.04093816131353378, + "learning_rate": 0.00016557450830370404, + "loss": 0.3627, + "step": 12587 + }, + { + "epoch": 1.0197666882696046, + "grad_norm": 0.04243616387248039, + "learning_rate": 0.0001655700076511094, + "loss": 0.3127, + "step": 12588 + }, + { + "epoch": 1.019847699287103, + "grad_norm": 0.03815116733312607, + "learning_rate": 0.0001655655069985148, + "loss": 0.306, + "step": 12589 + }, + { + "epoch": 1.0199287103046015, + "grad_norm": 0.03636857122182846, + "learning_rate": 0.00016556100634592015, + "loss": 0.3488, + "step": 12590 + }, + { + "epoch": 
1.0200097213220998, + "grad_norm": 0.03637069836258888, + "learning_rate": 0.00016555650569332554, + "loss": 0.3454, + "step": 12591 + }, + { + "epoch": 1.0200907323395982, + "grad_norm": 0.04027386009693146, + "learning_rate": 0.00016555200504073092, + "loss": 0.3202, + "step": 12592 + }, + { + "epoch": 1.0201717433570965, + "grad_norm": 0.029983578249812126, + "learning_rate": 0.00016554750438813628, + "loss": 0.2673, + "step": 12593 + }, + { + "epoch": 1.020252754374595, + "grad_norm": 0.03252333030104637, + "learning_rate": 0.00016554300373554164, + "loss": 0.2997, + "step": 12594 + }, + { + "epoch": 1.0203337653920934, + "grad_norm": 0.03685925528407097, + "learning_rate": 0.00016553850308294703, + "loss": 0.3156, + "step": 12595 + }, + { + "epoch": 1.0204147764095917, + "grad_norm": 0.04175392538309097, + "learning_rate": 0.00016553400243035242, + "loss": 0.3562, + "step": 12596 + }, + { + "epoch": 1.0204957874270901, + "grad_norm": 0.04049062356352806, + "learning_rate": 0.00016552950177775778, + "loss": 0.3742, + "step": 12597 + }, + { + "epoch": 1.0205767984445884, + "grad_norm": 0.03493852540850639, + "learning_rate": 0.00016552500112516317, + "loss": 0.3111, + "step": 12598 + }, + { + "epoch": 1.0206578094620868, + "grad_norm": 0.04160711541771889, + "learning_rate": 0.00016552050047256853, + "loss": 0.3593, + "step": 12599 + }, + { + "epoch": 1.0207388204795853, + "grad_norm": 0.03934788703918457, + "learning_rate": 0.00016551599981997389, + "loss": 0.3271, + "step": 12600 + }, + { + "epoch": 1.0208198314970836, + "grad_norm": 0.038866519927978516, + "learning_rate": 0.00016551149916737927, + "loss": 0.3023, + "step": 12601 + }, + { + "epoch": 1.020900842514582, + "grad_norm": 0.04214729741215706, + "learning_rate": 0.00016550699851478466, + "loss": 0.346, + "step": 12602 + }, + { + "epoch": 1.0209818535320803, + "grad_norm": 0.0387173555791378, + "learning_rate": 0.00016550249786219002, + "loss": 0.3135, + "step": 12603 + }, + { + "epoch": 1.0210628645495787, + "grad_norm": 0.035196542739868164, + "learning_rate": 0.0001654979972095954, + "loss": 0.3233, + "step": 12604 + }, + { + "epoch": 1.0211438755670772, + "grad_norm": 0.036050982773303986, + "learning_rate": 0.00016549349655700077, + "loss": 0.2754, + "step": 12605 + }, + { + "epoch": 1.0212248865845754, + "grad_norm": 0.039573341608047485, + "learning_rate": 0.00016548899590440613, + "loss": 0.3403, + "step": 12606 + }, + { + "epoch": 1.021305897602074, + "grad_norm": 0.034855302423238754, + "learning_rate": 0.00016548449525181152, + "loss": 0.3194, + "step": 12607 + }, + { + "epoch": 1.0213869086195722, + "grad_norm": 0.042670026421546936, + "learning_rate": 0.0001654799945992169, + "loss": 0.3411, + "step": 12608 + }, + { + "epoch": 1.0214679196370706, + "grad_norm": 0.03425101190805435, + "learning_rate": 0.00016547549394662226, + "loss": 0.294, + "step": 12609 + }, + { + "epoch": 1.021548930654569, + "grad_norm": 0.035355765372514725, + "learning_rate": 0.00016547099329402765, + "loss": 0.3115, + "step": 12610 + }, + { + "epoch": 1.0216299416720673, + "grad_norm": 0.043876733630895615, + "learning_rate": 0.000165466492641433, + "loss": 0.3033, + "step": 12611 + }, + { + "epoch": 1.0217109526895658, + "grad_norm": 0.04104451835155487, + "learning_rate": 0.00016546199198883837, + "loss": 0.3531, + "step": 12612 + }, + { + "epoch": 1.0217919637070643, + "grad_norm": 0.038133054971694946, + "learning_rate": 0.00016545749133624376, + "loss": 0.3153, + "step": 12613 + }, + { + "epoch": 1.0218729747245625, + "grad_norm": 
0.039468154311180115, + "learning_rate": 0.00016545299068364914, + "loss": 0.3387, + "step": 12614 + }, + { + "epoch": 1.021953985742061, + "grad_norm": 0.04036974534392357, + "learning_rate": 0.0001654484900310545, + "loss": 0.3256, + "step": 12615 + }, + { + "epoch": 1.0220349967595592, + "grad_norm": 0.03471231833100319, + "learning_rate": 0.0001654439893784599, + "loss": 0.2947, + "step": 12616 + }, + { + "epoch": 1.0221160077770577, + "grad_norm": 0.04138367995619774, + "learning_rate": 0.00016543948872586525, + "loss": 0.3119, + "step": 12617 + }, + { + "epoch": 1.0221970187945562, + "grad_norm": 0.04733224958181381, + "learning_rate": 0.00016543498807327064, + "loss": 0.3432, + "step": 12618 + }, + { + "epoch": 1.0222780298120544, + "grad_norm": 0.03255488723516464, + "learning_rate": 0.00016543048742067603, + "loss": 0.2945, + "step": 12619 + }, + { + "epoch": 1.0223590408295529, + "grad_norm": 0.04019397124648094, + "learning_rate": 0.00016542598676808139, + "loss": 0.3161, + "step": 12620 + }, + { + "epoch": 1.0224400518470511, + "grad_norm": 0.03219766542315483, + "learning_rate": 0.00016542148611548675, + "loss": 0.2848, + "step": 12621 + }, + { + "epoch": 1.0225210628645496, + "grad_norm": 0.041206423193216324, + "learning_rate": 0.00016541698546289213, + "loss": 0.3047, + "step": 12622 + }, + { + "epoch": 1.022602073882048, + "grad_norm": 0.043796516954898834, + "learning_rate": 0.0001654124848102975, + "loss": 0.3757, + "step": 12623 + }, + { + "epoch": 1.0226830848995463, + "grad_norm": 0.038199108093976974, + "learning_rate": 0.00016540798415770288, + "loss": 0.3825, + "step": 12624 + }, + { + "epoch": 1.0227640959170448, + "grad_norm": 0.03842081502079964, + "learning_rate": 0.00016540348350510827, + "loss": 0.3104, + "step": 12625 + }, + { + "epoch": 1.022845106934543, + "grad_norm": 0.03330446034669876, + "learning_rate": 0.00016539898285251363, + "loss": 0.2845, + "step": 12626 + }, + { + "epoch": 1.0229261179520415, + "grad_norm": 0.03181296959519386, + "learning_rate": 0.000165394482199919, + "loss": 0.299, + "step": 12627 + }, + { + "epoch": 1.02300712896954, + "grad_norm": 0.03798550367355347, + "learning_rate": 0.00016538998154732437, + "loss": 0.3271, + "step": 12628 + }, + { + "epoch": 1.0230881399870382, + "grad_norm": 0.035294223576784134, + "learning_rate": 0.00016538548089472973, + "loss": 0.2878, + "step": 12629 + }, + { + "epoch": 1.0231691510045366, + "grad_norm": 0.04210999608039856, + "learning_rate": 0.00016538098024213512, + "loss": 0.3353, + "step": 12630 + }, + { + "epoch": 1.023250162022035, + "grad_norm": 0.039759013801813126, + "learning_rate": 0.0001653764795895405, + "loss": 0.4063, + "step": 12631 + }, + { + "epoch": 1.0233311730395334, + "grad_norm": 0.036347441375255585, + "learning_rate": 0.00016537197893694587, + "loss": 0.3421, + "step": 12632 + }, + { + "epoch": 1.0234121840570318, + "grad_norm": 0.032233960926532745, + "learning_rate": 0.00016536747828435123, + "loss": 0.291, + "step": 12633 + }, + { + "epoch": 1.02349319507453, + "grad_norm": 0.030994798988103867, + "learning_rate": 0.00016536297763175662, + "loss": 0.2779, + "step": 12634 + }, + { + "epoch": 1.0235742060920285, + "grad_norm": 0.03740865737199783, + "learning_rate": 0.00016535847697916198, + "loss": 0.3453, + "step": 12635 + }, + { + "epoch": 1.023655217109527, + "grad_norm": 0.03850658982992172, + "learning_rate": 0.00016535397632656736, + "loss": 0.321, + "step": 12636 + }, + { + "epoch": 1.0237362281270252, + "grad_norm": 0.03866725414991379, + "learning_rate": 
0.00016534947567397275, + "loss": 0.3467, + "step": 12637 + }, + { + "epoch": 1.0238172391445237, + "grad_norm": 0.04454237222671509, + "learning_rate": 0.0001653449750213781, + "loss": 0.3599, + "step": 12638 + }, + { + "epoch": 1.023898250162022, + "grad_norm": 0.036081261932849884, + "learning_rate": 0.00016534047436878347, + "loss": 0.2961, + "step": 12639 + }, + { + "epoch": 1.0239792611795204, + "grad_norm": 0.03776612877845764, + "learning_rate": 0.00016533597371618886, + "loss": 0.338, + "step": 12640 + }, + { + "epoch": 1.024060272197019, + "grad_norm": 0.0352783240377903, + "learning_rate": 0.00016533147306359422, + "loss": 0.3215, + "step": 12641 + }, + { + "epoch": 1.0241412832145171, + "grad_norm": 0.03597136586904526, + "learning_rate": 0.0001653269724109996, + "loss": 0.3075, + "step": 12642 + }, + { + "epoch": 1.0242222942320156, + "grad_norm": 0.037728115916252136, + "learning_rate": 0.000165322471758405, + "loss": 0.3122, + "step": 12643 + }, + { + "epoch": 1.0243033052495139, + "grad_norm": 0.039568349719047546, + "learning_rate": 0.00016531797110581035, + "loss": 0.32, + "step": 12644 + }, + { + "epoch": 1.0243843162670123, + "grad_norm": 0.039027776569128036, + "learning_rate": 0.0001653134704532157, + "loss": 0.3213, + "step": 12645 + }, + { + "epoch": 1.0244653272845108, + "grad_norm": 0.03547472506761551, + "learning_rate": 0.0001653089698006211, + "loss": 0.3103, + "step": 12646 + }, + { + "epoch": 1.024546338302009, + "grad_norm": 0.03676692023873329, + "learning_rate": 0.00016530446914802646, + "loss": 0.3289, + "step": 12647 + }, + { + "epoch": 1.0246273493195075, + "grad_norm": 0.04158513993024826, + "learning_rate": 0.00016529996849543185, + "loss": 0.3409, + "step": 12648 + }, + { + "epoch": 1.0247083603370057, + "grad_norm": 0.040971919894218445, + "learning_rate": 0.00016529546784283723, + "loss": 0.3207, + "step": 12649 + }, + { + "epoch": 1.0247893713545042, + "grad_norm": 0.03509372100234032, + "learning_rate": 0.0001652909671902426, + "loss": 0.3189, + "step": 12650 + }, + { + "epoch": 1.0248703823720027, + "grad_norm": 0.043991539627313614, + "learning_rate": 0.00016528646653764795, + "loss": 0.3416, + "step": 12651 + }, + { + "epoch": 1.024951393389501, + "grad_norm": 0.0418696291744709, + "learning_rate": 0.00016528196588505334, + "loss": 0.3327, + "step": 12652 + }, + { + "epoch": 1.0250324044069994, + "grad_norm": 0.03739069774746895, + "learning_rate": 0.0001652774652324587, + "loss": 0.3865, + "step": 12653 + }, + { + "epoch": 1.0251134154244976, + "grad_norm": 0.03675004467368126, + "learning_rate": 0.0001652729645798641, + "loss": 0.3498, + "step": 12654 + }, + { + "epoch": 1.025194426441996, + "grad_norm": 0.03807086870074272, + "learning_rate": 0.00016526846392726948, + "loss": 0.331, + "step": 12655 + }, + { + "epoch": 1.0252754374594946, + "grad_norm": 0.03432943671941757, + "learning_rate": 0.00016526396327467484, + "loss": 0.3042, + "step": 12656 + }, + { + "epoch": 1.0253564484769928, + "grad_norm": 0.03586483374238014, + "learning_rate": 0.0001652594626220802, + "loss": 0.2827, + "step": 12657 + }, + { + "epoch": 1.0254374594944913, + "grad_norm": 0.03931160271167755, + "learning_rate": 0.00016525496196948558, + "loss": 0.3446, + "step": 12658 + }, + { + "epoch": 1.0255184705119895, + "grad_norm": 0.036072421818971634, + "learning_rate": 0.00016525046131689094, + "loss": 0.3451, + "step": 12659 + }, + { + "epoch": 1.025599481529488, + "grad_norm": 0.04291589930653572, + "learning_rate": 0.00016524596066429633, + "loss": 0.3378, + 
"step": 12660 + }, + { + "epoch": 1.0256804925469865, + "grad_norm": 0.03836192935705185, + "learning_rate": 0.00016524146001170172, + "loss": 0.3498, + "step": 12661 + }, + { + "epoch": 1.0257615035644847, + "grad_norm": 0.038849957287311554, + "learning_rate": 0.00016523695935910708, + "loss": 0.3155, + "step": 12662 + }, + { + "epoch": 1.0258425145819832, + "grad_norm": 0.035676270723342896, + "learning_rate": 0.00016523245870651244, + "loss": 0.3029, + "step": 12663 + }, + { + "epoch": 1.0259235255994816, + "grad_norm": 0.04012364521622658, + "learning_rate": 0.00016522795805391782, + "loss": 0.3858, + "step": 12664 + }, + { + "epoch": 1.0260045366169799, + "grad_norm": 0.037432339042425156, + "learning_rate": 0.00016522345740132318, + "loss": 0.322, + "step": 12665 + }, + { + "epoch": 1.0260855476344783, + "grad_norm": 0.036835625767707825, + "learning_rate": 0.00016521895674872857, + "loss": 0.3359, + "step": 12666 + }, + { + "epoch": 1.0261665586519766, + "grad_norm": 0.03636383265256882, + "learning_rate": 0.00016521445609613396, + "loss": 0.2959, + "step": 12667 + }, + { + "epoch": 1.026247569669475, + "grad_norm": 0.03819280117750168, + "learning_rate": 0.00016520995544353932, + "loss": 0.2792, + "step": 12668 + }, + { + "epoch": 1.0263285806869735, + "grad_norm": 0.039029382169246674, + "learning_rate": 0.00016520545479094468, + "loss": 0.3051, + "step": 12669 + }, + { + "epoch": 1.0264095917044718, + "grad_norm": 0.03400282561779022, + "learning_rate": 0.00016520095413835007, + "loss": 0.2967, + "step": 12670 + }, + { + "epoch": 1.0264906027219702, + "grad_norm": 0.03828096762299538, + "learning_rate": 0.00016519645348575545, + "loss": 0.3152, + "step": 12671 + }, + { + "epoch": 1.0265716137394685, + "grad_norm": 0.039998847991228104, + "learning_rate": 0.0001651919528331608, + "loss": 0.3238, + "step": 12672 + }, + { + "epoch": 1.026652624756967, + "grad_norm": 0.03841540217399597, + "learning_rate": 0.0001651874521805662, + "loss": 0.3157, + "step": 12673 + }, + { + "epoch": 1.0267336357744654, + "grad_norm": 0.03829701989889145, + "learning_rate": 0.00016518295152797156, + "loss": 0.3399, + "step": 12674 + }, + { + "epoch": 1.0268146467919637, + "grad_norm": 0.03337425738573074, + "learning_rate": 0.00016517845087537692, + "loss": 0.3033, + "step": 12675 + }, + { + "epoch": 1.0268956578094621, + "grad_norm": 0.03805439919233322, + "learning_rate": 0.0001651739502227823, + "loss": 0.3487, + "step": 12676 + }, + { + "epoch": 1.0269766688269604, + "grad_norm": 0.039699751883745193, + "learning_rate": 0.0001651694495701877, + "loss": 0.324, + "step": 12677 + }, + { + "epoch": 1.0270576798444588, + "grad_norm": 0.03741767629981041, + "learning_rate": 0.00016516494891759305, + "loss": 0.297, + "step": 12678 + }, + { + "epoch": 1.0271386908619573, + "grad_norm": 0.03826878219842911, + "learning_rate": 0.00016516044826499844, + "loss": 0.3311, + "step": 12679 + }, + { + "epoch": 1.0272197018794555, + "grad_norm": 0.041468605399131775, + "learning_rate": 0.0001651559476124038, + "loss": 0.3181, + "step": 12680 + }, + { + "epoch": 1.027300712896954, + "grad_norm": 0.03656802698969841, + "learning_rate": 0.00016515144695980916, + "loss": 0.3039, + "step": 12681 + }, + { + "epoch": 1.0273817239144523, + "grad_norm": 0.03783682361245155, + "learning_rate": 0.00016514694630721455, + "loss": 0.3318, + "step": 12682 + }, + { + "epoch": 1.0274627349319507, + "grad_norm": 0.042352866381406784, + "learning_rate": 0.00016514244565461994, + "loss": 0.3418, + "step": 12683 + }, + { + "epoch": 
1.0275437459494492, + "grad_norm": 0.039490267634391785, + "learning_rate": 0.0001651379450020253, + "loss": 0.3186, + "step": 12684 + }, + { + "epoch": 1.0276247569669474, + "grad_norm": 0.04535860940814018, + "learning_rate": 0.00016513344434943068, + "loss": 0.3763, + "step": 12685 + }, + { + "epoch": 1.027705767984446, + "grad_norm": 0.039086759090423584, + "learning_rate": 0.00016512894369683604, + "loss": 0.3221, + "step": 12686 + }, + { + "epoch": 1.0277867790019444, + "grad_norm": 0.03470296785235405, + "learning_rate": 0.00016512444304424143, + "loss": 0.2955, + "step": 12687 + }, + { + "epoch": 1.0278677900194426, + "grad_norm": 0.03623901680111885, + "learning_rate": 0.0001651199423916468, + "loss": 0.3199, + "step": 12688 + }, + { + "epoch": 1.027948801036941, + "grad_norm": 0.03384559229016304, + "learning_rate": 0.00016511544173905218, + "loss": 0.3335, + "step": 12689 + }, + { + "epoch": 1.0280298120544393, + "grad_norm": 0.03289582580327988, + "learning_rate": 0.00016511094108645754, + "loss": 0.3128, + "step": 12690 + }, + { + "epoch": 1.0281108230719378, + "grad_norm": 0.042164016515016556, + "learning_rate": 0.00016510644043386292, + "loss": 0.3708, + "step": 12691 + }, + { + "epoch": 1.0281918340894363, + "grad_norm": 0.035185497254133224, + "learning_rate": 0.00016510193978126828, + "loss": 0.3108, + "step": 12692 + }, + { + "epoch": 1.0282728451069345, + "grad_norm": 0.039614636451005936, + "learning_rate": 0.00016509743912867367, + "loss": 0.3313, + "step": 12693 + }, + { + "epoch": 1.028353856124433, + "grad_norm": 0.03454237058758736, + "learning_rate": 0.00016509293847607903, + "loss": 0.3086, + "step": 12694 + }, + { + "epoch": 1.0284348671419312, + "grad_norm": 0.043382588773965836, + "learning_rate": 0.00016508843782348442, + "loss": 0.368, + "step": 12695 + }, + { + "epoch": 1.0285158781594297, + "grad_norm": 0.034158188849687576, + "learning_rate": 0.00016508393717088978, + "loss": 0.2759, + "step": 12696 + }, + { + "epoch": 1.0285968891769282, + "grad_norm": 0.03920743614435196, + "learning_rate": 0.00016507943651829517, + "loss": 0.3234, + "step": 12697 + }, + { + "epoch": 1.0286779001944264, + "grad_norm": 0.03635401278734207, + "learning_rate": 0.00016507493586570053, + "loss": 0.3071, + "step": 12698 + }, + { + "epoch": 1.0287589112119249, + "grad_norm": 0.037762533873319626, + "learning_rate": 0.00016507043521310591, + "loss": 0.3268, + "step": 12699 + }, + { + "epoch": 1.028839922229423, + "grad_norm": 0.034993529319763184, + "learning_rate": 0.0001650659345605113, + "loss": 0.2665, + "step": 12700 + }, + { + "epoch": 1.0289209332469216, + "grad_norm": 0.03569914773106575, + "learning_rate": 0.00016506143390791666, + "loss": 0.2778, + "step": 12701 + }, + { + "epoch": 1.02900194426442, + "grad_norm": 0.03663048893213272, + "learning_rate": 0.00016505693325532202, + "loss": 0.3148, + "step": 12702 + }, + { + "epoch": 1.0290829552819183, + "grad_norm": 0.03235512226819992, + "learning_rate": 0.0001650524326027274, + "loss": 0.2983, + "step": 12703 + }, + { + "epoch": 1.0291639662994168, + "grad_norm": 0.03489063307642937, + "learning_rate": 0.00016504793195013277, + "loss": 0.3131, + "step": 12704 + }, + { + "epoch": 1.029244977316915, + "grad_norm": 0.03714337944984436, + "learning_rate": 0.00016504343129753816, + "loss": 0.3165, + "step": 12705 + }, + { + "epoch": 1.0293259883344135, + "grad_norm": 0.032778069376945496, + "learning_rate": 0.00016503893064494354, + "loss": 0.2898, + "step": 12706 + }, + { + "epoch": 1.029406999351912, + "grad_norm": 
0.04097514972090721, + "learning_rate": 0.0001650344299923489, + "loss": 0.3605, + "step": 12707 + }, + { + "epoch": 1.0294880103694102, + "grad_norm": 0.03973684087395668, + "learning_rate": 0.00016502992933975426, + "loss": 0.3336, + "step": 12708 + }, + { + "epoch": 1.0295690213869086, + "grad_norm": 0.03882990777492523, + "learning_rate": 0.00016502542868715965, + "loss": 0.3164, + "step": 12709 + }, + { + "epoch": 1.029650032404407, + "grad_norm": 0.03919418901205063, + "learning_rate": 0.000165020928034565, + "loss": 0.3582, + "step": 12710 + }, + { + "epoch": 1.0297310434219054, + "grad_norm": 0.03751210868358612, + "learning_rate": 0.0001650164273819704, + "loss": 0.2969, + "step": 12711 + }, + { + "epoch": 1.0298120544394038, + "grad_norm": 0.034326836466789246, + "learning_rate": 0.00016501192672937578, + "loss": 0.3231, + "step": 12712 + }, + { + "epoch": 1.029893065456902, + "grad_norm": 0.02914053201675415, + "learning_rate": 0.00016500742607678114, + "loss": 0.2834, + "step": 12713 + }, + { + "epoch": 1.0299740764744005, + "grad_norm": 0.036301035434007645, + "learning_rate": 0.0001650029254241865, + "loss": 0.3006, + "step": 12714 + }, + { + "epoch": 1.030055087491899, + "grad_norm": 0.03592952340841293, + "learning_rate": 0.0001649984247715919, + "loss": 0.307, + "step": 12715 + }, + { + "epoch": 1.0301360985093972, + "grad_norm": 0.04360431432723999, + "learning_rate": 0.00016499392411899725, + "loss": 0.3239, + "step": 12716 + }, + { + "epoch": 1.0302171095268957, + "grad_norm": 0.036142975091934204, + "learning_rate": 0.00016498942346640264, + "loss": 0.3437, + "step": 12717 + }, + { + "epoch": 1.030298120544394, + "grad_norm": 0.03672938793897629, + "learning_rate": 0.00016498492281380803, + "loss": 0.313, + "step": 12718 + }, + { + "epoch": 1.0303791315618924, + "grad_norm": 0.033051345497369766, + "learning_rate": 0.00016498042216121339, + "loss": 0.2892, + "step": 12719 + }, + { + "epoch": 1.030460142579391, + "grad_norm": 0.036963798105716705, + "learning_rate": 0.00016497592150861875, + "loss": 0.2768, + "step": 12720 + }, + { + "epoch": 1.0305411535968891, + "grad_norm": 0.03400468826293945, + "learning_rate": 0.00016497142085602413, + "loss": 0.2884, + "step": 12721 + }, + { + "epoch": 1.0306221646143876, + "grad_norm": 0.03542281687259674, + "learning_rate": 0.0001649669202034295, + "loss": 0.3374, + "step": 12722 + }, + { + "epoch": 1.0307031756318858, + "grad_norm": 0.03506822884082794, + "learning_rate": 0.00016496241955083488, + "loss": 0.3056, + "step": 12723 + }, + { + "epoch": 1.0307841866493843, + "grad_norm": 0.039111554622650146, + "learning_rate": 0.00016495791889824027, + "loss": 0.3104, + "step": 12724 + }, + { + "epoch": 1.0308651976668828, + "grad_norm": 0.03314598277211189, + "learning_rate": 0.00016495341824564563, + "loss": 0.296, + "step": 12725 + }, + { + "epoch": 1.030946208684381, + "grad_norm": 0.03986050933599472, + "learning_rate": 0.000164948917593051, + "loss": 0.3249, + "step": 12726 + }, + { + "epoch": 1.0310272197018795, + "grad_norm": 0.040087323635816574, + "learning_rate": 0.00016494441694045637, + "loss": 0.303, + "step": 12727 + }, + { + "epoch": 1.0311082307193777, + "grad_norm": 0.03449118509888649, + "learning_rate": 0.00016493991628786173, + "loss": 0.3083, + "step": 12728 + }, + { + "epoch": 1.0311892417368762, + "grad_norm": 0.04075778275728226, + "learning_rate": 0.00016493541563526712, + "loss": 0.3229, + "step": 12729 + }, + { + "epoch": 1.0312702527543747, + "grad_norm": 0.03875141218304634, + "learning_rate": 
0.0001649309149826725, + "loss": 0.342, + "step": 12730 + }, + { + "epoch": 1.031351263771873, + "grad_norm": 0.03614082932472229, + "learning_rate": 0.00016492641433007787, + "loss": 0.3056, + "step": 12731 + }, + { + "epoch": 1.0314322747893714, + "grad_norm": 0.03601795807480812, + "learning_rate": 0.00016492191367748323, + "loss": 0.3039, + "step": 12732 + }, + { + "epoch": 1.0315132858068696, + "grad_norm": 0.0373661145567894, + "learning_rate": 0.00016491741302488862, + "loss": 0.3006, + "step": 12733 + }, + { + "epoch": 1.031594296824368, + "grad_norm": 0.043441496789455414, + "learning_rate": 0.00016491291237229398, + "loss": 0.3699, + "step": 12734 + }, + { + "epoch": 1.0316753078418666, + "grad_norm": 0.04167543724179268, + "learning_rate": 0.00016490841171969936, + "loss": 0.3815, + "step": 12735 + }, + { + "epoch": 1.0317563188593648, + "grad_norm": 0.03903650864958763, + "learning_rate": 0.00016490391106710475, + "loss": 0.3061, + "step": 12736 + }, + { + "epoch": 1.0318373298768633, + "grad_norm": 0.03695125877857208, + "learning_rate": 0.0001648994104145101, + "loss": 0.3025, + "step": 12737 + }, + { + "epoch": 1.0319183408943617, + "grad_norm": 0.04857954382896423, + "learning_rate": 0.00016489490976191547, + "loss": 0.2771, + "step": 12738 + }, + { + "epoch": 1.03199935191186, + "grad_norm": 0.03504607826471329, + "learning_rate": 0.00016489040910932086, + "loss": 0.2675, + "step": 12739 + }, + { + "epoch": 1.0320803629293585, + "grad_norm": 0.03955131024122238, + "learning_rate": 0.00016488590845672622, + "loss": 0.2919, + "step": 12740 + }, + { + "epoch": 1.0321613739468567, + "grad_norm": 0.03247314691543579, + "learning_rate": 0.0001648814078041316, + "loss": 0.3099, + "step": 12741 + }, + { + "epoch": 1.0322423849643552, + "grad_norm": 0.03682674840092659, + "learning_rate": 0.000164876907151537, + "loss": 0.3116, + "step": 12742 + }, + { + "epoch": 1.0323233959818536, + "grad_norm": 0.036403726786375046, + "learning_rate": 0.00016487240649894235, + "loss": 0.2985, + "step": 12743 + }, + { + "epoch": 1.0324044069993519, + "grad_norm": 0.03299999237060547, + "learning_rate": 0.0001648679058463477, + "loss": 0.2904, + "step": 12744 + }, + { + "epoch": 1.0324854180168503, + "grad_norm": 0.04140744358301163, + "learning_rate": 0.0001648634051937531, + "loss": 0.3295, + "step": 12745 + }, + { + "epoch": 1.0325664290343486, + "grad_norm": 0.03611183539032936, + "learning_rate": 0.00016485890454115846, + "loss": 0.3071, + "step": 12746 + }, + { + "epoch": 1.032647440051847, + "grad_norm": 0.04135493189096451, + "learning_rate": 0.00016485440388856385, + "loss": 0.3406, + "step": 12747 + }, + { + "epoch": 1.0327284510693455, + "grad_norm": 0.03947634994983673, + "learning_rate": 0.00016484990323596923, + "loss": 0.3157, + "step": 12748 + }, + { + "epoch": 1.0328094620868438, + "grad_norm": 0.03282932937145233, + "learning_rate": 0.0001648454025833746, + "loss": 0.2609, + "step": 12749 + }, + { + "epoch": 1.0328904731043422, + "grad_norm": 0.04158890247344971, + "learning_rate": 0.00016484090193077995, + "loss": 0.3281, + "step": 12750 + }, + { + "epoch": 1.0329714841218405, + "grad_norm": 0.0459459125995636, + "learning_rate": 0.00016483640127818534, + "loss": 0.318, + "step": 12751 + }, + { + "epoch": 1.033052495139339, + "grad_norm": 0.04458964243531227, + "learning_rate": 0.00016483190062559073, + "loss": 0.2824, + "step": 12752 + }, + { + "epoch": 1.0331335061568374, + "grad_norm": 0.03444017842411995, + "learning_rate": 0.0001648273999729961, + "loss": 0.2451, + "step": 
12753 + }, + { + "epoch": 1.0332145171743357, + "grad_norm": 0.037729572504758835, + "learning_rate": 0.00016482289932040148, + "loss": 0.2816, + "step": 12754 + }, + { + "epoch": 1.0332955281918341, + "grad_norm": 0.03754454106092453, + "learning_rate": 0.00016481839866780684, + "loss": 0.325, + "step": 12755 + }, + { + "epoch": 1.0333765392093324, + "grad_norm": 0.035541508346796036, + "learning_rate": 0.00016481389801521222, + "loss": 0.3064, + "step": 12756 + }, + { + "epoch": 1.0334575502268308, + "grad_norm": 0.040059611201286316, + "learning_rate": 0.00016480939736261758, + "loss": 0.3018, + "step": 12757 + }, + { + "epoch": 1.0335385612443293, + "grad_norm": 0.034477706998586655, + "learning_rate": 0.00016480489671002297, + "loss": 0.2846, + "step": 12758 + }, + { + "epoch": 1.0336195722618275, + "grad_norm": 0.0396190881729126, + "learning_rate": 0.00016480039605742833, + "loss": 0.3092, + "step": 12759 + }, + { + "epoch": 1.033700583279326, + "grad_norm": 0.03748718649148941, + "learning_rate": 0.00016479589540483372, + "loss": 0.2901, + "step": 12760 + }, + { + "epoch": 1.0337815942968245, + "grad_norm": 0.03152185305953026, + "learning_rate": 0.00016479139475223908, + "loss": 0.2671, + "step": 12761 + }, + { + "epoch": 1.0338626053143227, + "grad_norm": 0.038291048258543015, + "learning_rate": 0.00016478689409964446, + "loss": 0.3398, + "step": 12762 + }, + { + "epoch": 1.0339436163318212, + "grad_norm": 0.037285178899765015, + "learning_rate": 0.00016478239344704982, + "loss": 0.3386, + "step": 12763 + }, + { + "epoch": 1.0340246273493194, + "grad_norm": 0.041180677711963654, + "learning_rate": 0.0001647778927944552, + "loss": 0.3311, + "step": 12764 + }, + { + "epoch": 1.034105638366818, + "grad_norm": 0.044005073606967926, + "learning_rate": 0.00016477339214186057, + "loss": 0.3778, + "step": 12765 + }, + { + "epoch": 1.0341866493843164, + "grad_norm": 0.036691803485155106, + "learning_rate": 0.00016476889148926596, + "loss": 0.2971, + "step": 12766 + }, + { + "epoch": 1.0342676604018146, + "grad_norm": 0.03426551818847656, + "learning_rate": 0.00016476439083667132, + "loss": 0.3008, + "step": 12767 + }, + { + "epoch": 1.034348671419313, + "grad_norm": 0.03679486736655235, + "learning_rate": 0.0001647598901840767, + "loss": 0.3064, + "step": 12768 + }, + { + "epoch": 1.0344296824368113, + "grad_norm": 0.04301043227314949, + "learning_rate": 0.00016475538953148207, + "loss": 0.3404, + "step": 12769 + }, + { + "epoch": 1.0345106934543098, + "grad_norm": 0.03880010172724724, + "learning_rate": 0.00016475088887888745, + "loss": 0.3673, + "step": 12770 + }, + { + "epoch": 1.0345917044718083, + "grad_norm": 0.04446203634142876, + "learning_rate": 0.0001647463882262928, + "loss": 0.3706, + "step": 12771 + }, + { + "epoch": 1.0346727154893065, + "grad_norm": 0.03888606280088425, + "learning_rate": 0.0001647418875736982, + "loss": 0.3157, + "step": 12772 + }, + { + "epoch": 1.034753726506805, + "grad_norm": 0.03600630164146423, + "learning_rate": 0.00016473738692110356, + "loss": 0.3125, + "step": 12773 + }, + { + "epoch": 1.0348347375243032, + "grad_norm": 0.035312626510858536, + "learning_rate": 0.00016473288626850895, + "loss": 0.2885, + "step": 12774 + }, + { + "epoch": 1.0349157485418017, + "grad_norm": 0.033152591437101364, + "learning_rate": 0.0001647283856159143, + "loss": 0.2872, + "step": 12775 + }, + { + "epoch": 1.0349967595593002, + "grad_norm": 0.040598925203084946, + "learning_rate": 0.0001647238849633197, + "loss": 0.359, + "step": 12776 + }, + { + "epoch": 
1.0350777705767984, + "grad_norm": 0.03144126012921333, + "learning_rate": 0.00016471938431072505, + "loss": 0.2809, + "step": 12777 + }, + { + "epoch": 1.0351587815942969, + "grad_norm": 0.0429706797003746, + "learning_rate": 0.00016471488365813044, + "loss": 0.3573, + "step": 12778 + }, + { + "epoch": 1.035239792611795, + "grad_norm": 0.04190703108906746, + "learning_rate": 0.0001647103830055358, + "loss": 0.3757, + "step": 12779 + }, + { + "epoch": 1.0353208036292936, + "grad_norm": 0.03837323933839798, + "learning_rate": 0.0001647058823529412, + "loss": 0.3048, + "step": 12780 + }, + { + "epoch": 1.035401814646792, + "grad_norm": 0.037326592952013016, + "learning_rate": 0.00016470138170034658, + "loss": 0.2965, + "step": 12781 + }, + { + "epoch": 1.0354828256642903, + "grad_norm": 0.04110584780573845, + "learning_rate": 0.00016469688104775194, + "loss": 0.2806, + "step": 12782 + }, + { + "epoch": 1.0355638366817888, + "grad_norm": 0.04354546591639519, + "learning_rate": 0.0001646923803951573, + "loss": 0.3752, + "step": 12783 + }, + { + "epoch": 1.035644847699287, + "grad_norm": 0.03725029155611992, + "learning_rate": 0.00016468787974256268, + "loss": 0.3183, + "step": 12784 + }, + { + "epoch": 1.0357258587167855, + "grad_norm": 0.03914825618267059, + "learning_rate": 0.00016468337908996804, + "loss": 0.3576, + "step": 12785 + }, + { + "epoch": 1.035806869734284, + "grad_norm": 0.03538013994693756, + "learning_rate": 0.00016467887843737343, + "loss": 0.3384, + "step": 12786 + }, + { + "epoch": 1.0358878807517822, + "grad_norm": 0.040092017501592636, + "learning_rate": 0.00016467437778477882, + "loss": 0.3358, + "step": 12787 + }, + { + "epoch": 1.0359688917692806, + "grad_norm": 0.03664885461330414, + "learning_rate": 0.00016466987713218418, + "loss": 0.3106, + "step": 12788 + }, + { + "epoch": 1.036049902786779, + "grad_norm": 0.03369804471731186, + "learning_rate": 0.00016466537647958954, + "loss": 0.3123, + "step": 12789 + }, + { + "epoch": 1.0361309138042774, + "grad_norm": 0.03672627732157707, + "learning_rate": 0.00016466087582699493, + "loss": 0.3299, + "step": 12790 + }, + { + "epoch": 1.0362119248217758, + "grad_norm": 0.03600330650806427, + "learning_rate": 0.00016465637517440029, + "loss": 0.3311, + "step": 12791 + }, + { + "epoch": 1.036292935839274, + "grad_norm": 0.03647133335471153, + "learning_rate": 0.00016465187452180567, + "loss": 0.3326, + "step": 12792 + }, + { + "epoch": 1.0363739468567725, + "grad_norm": 0.03292451798915863, + "learning_rate": 0.00016464737386921106, + "loss": 0.2895, + "step": 12793 + }, + { + "epoch": 1.036454957874271, + "grad_norm": 0.03374708443880081, + "learning_rate": 0.00016464287321661642, + "loss": 0.315, + "step": 12794 + }, + { + "epoch": 1.0365359688917692, + "grad_norm": 0.043175119906663895, + "learning_rate": 0.00016463837256402178, + "loss": 0.387, + "step": 12795 + }, + { + "epoch": 1.0366169799092677, + "grad_norm": 0.03675583750009537, + "learning_rate": 0.00016463387191142717, + "loss": 0.3378, + "step": 12796 + }, + { + "epoch": 1.036697990926766, + "grad_norm": 0.036697644740343094, + "learning_rate": 0.00016462937125883253, + "loss": 0.3323, + "step": 12797 + }, + { + "epoch": 1.0367790019442644, + "grad_norm": 0.036025747656822205, + "learning_rate": 0.00016462487060623791, + "loss": 0.3212, + "step": 12798 + }, + { + "epoch": 1.036860012961763, + "grad_norm": 0.04477591812610626, + "learning_rate": 0.0001646203699536433, + "loss": 0.3703, + "step": 12799 + }, + { + "epoch": 1.0369410239792611, + "grad_norm": 
0.03663002327084541, + "learning_rate": 0.00016461586930104866, + "loss": 0.2979, + "step": 12800 + }, + { + "epoch": 1.0370220349967596, + "grad_norm": 0.031259819865226746, + "learning_rate": 0.00016461136864845402, + "loss": 0.3268, + "step": 12801 + }, + { + "epoch": 1.0371030460142578, + "grad_norm": 0.03488311171531677, + "learning_rate": 0.0001646068679958594, + "loss": 0.3042, + "step": 12802 + }, + { + "epoch": 1.0371840570317563, + "grad_norm": 0.03731519356369972, + "learning_rate": 0.00016460236734326477, + "loss": 0.2905, + "step": 12803 + }, + { + "epoch": 1.0372650680492548, + "grad_norm": 0.034211575984954834, + "learning_rate": 0.00016459786669067016, + "loss": 0.2775, + "step": 12804 + }, + { + "epoch": 1.037346079066753, + "grad_norm": 0.03737207502126694, + "learning_rate": 0.00016459336603807554, + "loss": 0.3309, + "step": 12805 + }, + { + "epoch": 1.0374270900842515, + "grad_norm": 0.0373041145503521, + "learning_rate": 0.0001645888653854809, + "loss": 0.3127, + "step": 12806 + }, + { + "epoch": 1.0375081011017497, + "grad_norm": 0.03419501706957817, + "learning_rate": 0.00016458436473288626, + "loss": 0.3012, + "step": 12807 + }, + { + "epoch": 1.0375891121192482, + "grad_norm": 0.03152981400489807, + "learning_rate": 0.00016457986408029165, + "loss": 0.2665, + "step": 12808 + }, + { + "epoch": 1.0376701231367467, + "grad_norm": 0.03565287962555885, + "learning_rate": 0.000164575363427697, + "loss": 0.2807, + "step": 12809 + }, + { + "epoch": 1.037751134154245, + "grad_norm": 0.03869590908288956, + "learning_rate": 0.0001645708627751024, + "loss": 0.3173, + "step": 12810 + }, + { + "epoch": 1.0378321451717434, + "grad_norm": 0.03857014328241348, + "learning_rate": 0.00016456636212250778, + "loss": 0.332, + "step": 12811 + }, + { + "epoch": 1.0379131561892416, + "grad_norm": 0.035806186497211456, + "learning_rate": 0.00016456186146991314, + "loss": 0.3253, + "step": 12812 + }, + { + "epoch": 1.03799416720674, + "grad_norm": 0.043626558035612106, + "learning_rate": 0.0001645573608173185, + "loss": 0.3661, + "step": 12813 + }, + { + "epoch": 1.0380751782242386, + "grad_norm": 0.04251417517662048, + "learning_rate": 0.0001645528601647239, + "loss": 0.326, + "step": 12814 + }, + { + "epoch": 1.0381561892417368, + "grad_norm": 0.035238828510046005, + "learning_rate": 0.00016454835951212925, + "loss": 0.2567, + "step": 12815 + }, + { + "epoch": 1.0382372002592353, + "grad_norm": 0.03851184621453285, + "learning_rate": 0.00016454385885953464, + "loss": 0.276, + "step": 12816 + }, + { + "epoch": 1.0383182112767337, + "grad_norm": 0.03683389723300934, + "learning_rate": 0.00016453935820694003, + "loss": 0.3146, + "step": 12817 + }, + { + "epoch": 1.038399222294232, + "grad_norm": 0.0394960418343544, + "learning_rate": 0.00016453485755434539, + "loss": 0.3678, + "step": 12818 + }, + { + "epoch": 1.0384802333117304, + "grad_norm": 0.035948723554611206, + "learning_rate": 0.00016453035690175075, + "loss": 0.3143, + "step": 12819 + }, + { + "epoch": 1.0385612443292287, + "grad_norm": 0.03658129274845123, + "learning_rate": 0.00016452585624915613, + "loss": 0.3083, + "step": 12820 + }, + { + "epoch": 1.0386422553467272, + "grad_norm": 0.03749111294746399, + "learning_rate": 0.0001645213555965615, + "loss": 0.3199, + "step": 12821 + }, + { + "epoch": 1.0387232663642256, + "grad_norm": 0.036379821598529816, + "learning_rate": 0.00016451685494396688, + "loss": 0.3364, + "step": 12822 + }, + { + "epoch": 1.0388042773817239, + "grad_norm": 0.03274570032954216, + "learning_rate": 
0.00016451235429137227, + "loss": 0.2769, + "step": 12823 + }, + { + "epoch": 1.0388852883992223, + "grad_norm": 0.03964697942137718, + "learning_rate": 0.00016450785363877763, + "loss": 0.3666, + "step": 12824 + }, + { + "epoch": 1.0389662994167206, + "grad_norm": 0.037210613489151, + "learning_rate": 0.00016450335298618301, + "loss": 0.2968, + "step": 12825 + }, + { + "epoch": 1.039047310434219, + "grad_norm": 0.04473162814974785, + "learning_rate": 0.00016449885233358837, + "loss": 0.3418, + "step": 12826 + }, + { + "epoch": 1.0391283214517175, + "grad_norm": 0.03509940579533577, + "learning_rate": 0.00016449435168099373, + "loss": 0.2978, + "step": 12827 + }, + { + "epoch": 1.0392093324692158, + "grad_norm": 0.03791176155209541, + "learning_rate": 0.00016448985102839912, + "loss": 0.2901, + "step": 12828 + }, + { + "epoch": 1.0392903434867142, + "grad_norm": 0.03966226801276207, + "learning_rate": 0.0001644853503758045, + "loss": 0.32, + "step": 12829 + }, + { + "epoch": 1.0393713545042125, + "grad_norm": 0.03378706052899361, + "learning_rate": 0.00016448084972320987, + "loss": 0.2834, + "step": 12830 + }, + { + "epoch": 1.039452365521711, + "grad_norm": 0.044208187609910965, + "learning_rate": 0.00016447634907061526, + "loss": 0.3345, + "step": 12831 + }, + { + "epoch": 1.0395333765392094, + "grad_norm": 0.03585111349821091, + "learning_rate": 0.00016447184841802062, + "loss": 0.2822, + "step": 12832 + }, + { + "epoch": 1.0396143875567077, + "grad_norm": 0.03986233100295067, + "learning_rate": 0.000164467347765426, + "loss": 0.3364, + "step": 12833 + }, + { + "epoch": 1.0396953985742061, + "grad_norm": 0.048165518790483475, + "learning_rate": 0.00016446284711283136, + "loss": 0.3471, + "step": 12834 + }, + { + "epoch": 1.0397764095917044, + "grad_norm": 0.03753822296857834, + "learning_rate": 0.00016445834646023675, + "loss": 0.3161, + "step": 12835 + }, + { + "epoch": 1.0398574206092028, + "grad_norm": 0.040102481842041016, + "learning_rate": 0.0001644538458076421, + "loss": 0.3347, + "step": 12836 + }, + { + "epoch": 1.0399384316267013, + "grad_norm": 0.041427258402109146, + "learning_rate": 0.0001644493451550475, + "loss": 0.3632, + "step": 12837 + }, + { + "epoch": 1.0400194426441995, + "grad_norm": 0.041916172951459885, + "learning_rate": 0.00016444484450245286, + "loss": 0.3338, + "step": 12838 + }, + { + "epoch": 1.040100453661698, + "grad_norm": 0.03925507515668869, + "learning_rate": 0.00016444034384985825, + "loss": 0.3249, + "step": 12839 + }, + { + "epoch": 1.0401814646791965, + "grad_norm": 0.03893420845270157, + "learning_rate": 0.0001644358431972636, + "loss": 0.3433, + "step": 12840 + }, + { + "epoch": 1.0402624756966947, + "grad_norm": 0.04083637148141861, + "learning_rate": 0.000164431342544669, + "loss": 0.3355, + "step": 12841 + }, + { + "epoch": 1.0403434867141932, + "grad_norm": 0.04069959744811058, + "learning_rate": 0.00016442684189207435, + "loss": 0.3214, + "step": 12842 + }, + { + "epoch": 1.0404244977316914, + "grad_norm": 0.0337250754237175, + "learning_rate": 0.00016442234123947974, + "loss": 0.3004, + "step": 12843 + }, + { + "epoch": 1.04050550874919, + "grad_norm": 0.040874969214200974, + "learning_rate": 0.0001644178405868851, + "loss": 0.3274, + "step": 12844 + }, + { + "epoch": 1.0405865197666884, + "grad_norm": 0.038601577281951904, + "learning_rate": 0.0001644133399342905, + "loss": 0.3136, + "step": 12845 + }, + { + "epoch": 1.0406675307841866, + "grad_norm": 0.04093305021524429, + "learning_rate": 0.00016440883928169585, + "loss": 0.3371, + 
"step": 12846 + }, + { + "epoch": 1.040748541801685, + "grad_norm": 0.036659788340330124, + "learning_rate": 0.00016440433862910123, + "loss": 0.2797, + "step": 12847 + }, + { + "epoch": 1.0408295528191833, + "grad_norm": 0.04440654441714287, + "learning_rate": 0.0001643998379765066, + "loss": 0.3396, + "step": 12848 + }, + { + "epoch": 1.0409105638366818, + "grad_norm": 0.03522910550236702, + "learning_rate": 0.00016439533732391198, + "loss": 0.2922, + "step": 12849 + }, + { + "epoch": 1.0409915748541803, + "grad_norm": 0.03405408933758736, + "learning_rate": 0.00016439083667131734, + "loss": 0.288, + "step": 12850 + }, + { + "epoch": 1.0410725858716785, + "grad_norm": 0.03984960913658142, + "learning_rate": 0.00016438633601872273, + "loss": 0.3072, + "step": 12851 + }, + { + "epoch": 1.041153596889177, + "grad_norm": 0.03902706876397133, + "learning_rate": 0.0001643818353661281, + "loss": 0.3158, + "step": 12852 + }, + { + "epoch": 1.0412346079066752, + "grad_norm": 0.04116075485944748, + "learning_rate": 0.00016437733471353348, + "loss": 0.3211, + "step": 12853 + }, + { + "epoch": 1.0413156189241737, + "grad_norm": 0.042279284447431564, + "learning_rate": 0.00016437283406093884, + "loss": 0.3322, + "step": 12854 + }, + { + "epoch": 1.0413966299416721, + "grad_norm": 0.03590260446071625, + "learning_rate": 0.00016436833340834422, + "loss": 0.2918, + "step": 12855 + }, + { + "epoch": 1.0414776409591704, + "grad_norm": 0.034707460552453995, + "learning_rate": 0.00016436383275574958, + "loss": 0.2947, + "step": 12856 + }, + { + "epoch": 1.0415586519766689, + "grad_norm": 0.0405760295689106, + "learning_rate": 0.00016435933210315497, + "loss": 0.3064, + "step": 12857 + }, + { + "epoch": 1.041639662994167, + "grad_norm": 0.04545693099498749, + "learning_rate": 0.00016435483145056033, + "loss": 0.3494, + "step": 12858 + }, + { + "epoch": 1.0417206740116656, + "grad_norm": 0.03815295919775963, + "learning_rate": 0.00016435033079796572, + "loss": 0.2945, + "step": 12859 + }, + { + "epoch": 1.041801685029164, + "grad_norm": 0.03414066880941391, + "learning_rate": 0.00016434583014537108, + "loss": 0.299, + "step": 12860 + }, + { + "epoch": 1.0418826960466623, + "grad_norm": 0.03565811365842819, + "learning_rate": 0.00016434132949277646, + "loss": 0.2928, + "step": 12861 + }, + { + "epoch": 1.0419637070641607, + "grad_norm": 0.03718414530158043, + "learning_rate": 0.00016433682884018185, + "loss": 0.2813, + "step": 12862 + }, + { + "epoch": 1.0420447180816592, + "grad_norm": 0.04182311147451401, + "learning_rate": 0.0001643323281875872, + "loss": 0.3207, + "step": 12863 + }, + { + "epoch": 1.0421257290991575, + "grad_norm": 0.032909996807575226, + "learning_rate": 0.00016432782753499257, + "loss": 0.2821, + "step": 12864 + }, + { + "epoch": 1.042206740116656, + "grad_norm": 0.039710626006126404, + "learning_rate": 0.00016432332688239796, + "loss": 0.3407, + "step": 12865 + }, + { + "epoch": 1.0422877511341542, + "grad_norm": 0.0371675118803978, + "learning_rate": 0.00016431882622980332, + "loss": 0.3247, + "step": 12866 + }, + { + "epoch": 1.0423687621516526, + "grad_norm": 0.03714926168322563, + "learning_rate": 0.0001643143255772087, + "loss": 0.3256, + "step": 12867 + }, + { + "epoch": 1.042449773169151, + "grad_norm": 0.043942391872406006, + "learning_rate": 0.0001643098249246141, + "loss": 0.3158, + "step": 12868 + }, + { + "epoch": 1.0425307841866494, + "grad_norm": 0.04367447271943092, + "learning_rate": 0.00016430532427201945, + "loss": 0.326, + "step": 12869 + }, + { + "epoch": 
1.0426117952041478, + "grad_norm": 0.03442533686757088, + "learning_rate": 0.0001643008236194248, + "loss": 0.2983, + "step": 12870 + }, + { + "epoch": 1.042692806221646, + "grad_norm": 0.04258235916495323, + "learning_rate": 0.0001642963229668302, + "loss": 0.3203, + "step": 12871 + }, + { + "epoch": 1.0427738172391445, + "grad_norm": 0.035100020468235016, + "learning_rate": 0.00016429182231423556, + "loss": 0.3276, + "step": 12872 + }, + { + "epoch": 1.042854828256643, + "grad_norm": 0.03650961443781853, + "learning_rate": 0.00016428732166164095, + "loss": 0.3231, + "step": 12873 + }, + { + "epoch": 1.0429358392741412, + "grad_norm": 0.037116020917892456, + "learning_rate": 0.00016428282100904633, + "loss": 0.318, + "step": 12874 + }, + { + "epoch": 1.0430168502916397, + "grad_norm": 0.03844992816448212, + "learning_rate": 0.0001642783203564517, + "loss": 0.315, + "step": 12875 + }, + { + "epoch": 1.043097861309138, + "grad_norm": 0.036285724490880966, + "learning_rate": 0.00016427381970385706, + "loss": 0.2932, + "step": 12876 + }, + { + "epoch": 1.0431788723266364, + "grad_norm": 0.034318238496780396, + "learning_rate": 0.00016426931905126244, + "loss": 0.3243, + "step": 12877 + }, + { + "epoch": 1.0432598833441349, + "grad_norm": 0.03383239731192589, + "learning_rate": 0.0001642648183986678, + "loss": 0.314, + "step": 12878 + }, + { + "epoch": 1.0433408943616331, + "grad_norm": 0.034283652901649475, + "learning_rate": 0.0001642603177460732, + "loss": 0.3085, + "step": 12879 + }, + { + "epoch": 1.0434219053791316, + "grad_norm": 0.03602326661348343, + "learning_rate": 0.00016425581709347858, + "loss": 0.312, + "step": 12880 + }, + { + "epoch": 1.0435029163966298, + "grad_norm": 0.04088228940963745, + "learning_rate": 0.00016425131644088394, + "loss": 0.3097, + "step": 12881 + }, + { + "epoch": 1.0435839274141283, + "grad_norm": 0.03683295473456383, + "learning_rate": 0.0001642468157882893, + "loss": 0.3107, + "step": 12882 + }, + { + "epoch": 1.0436649384316268, + "grad_norm": 0.03878350928425789, + "learning_rate": 0.00016424231513569468, + "loss": 0.3135, + "step": 12883 + }, + { + "epoch": 1.043745949449125, + "grad_norm": 0.03578276187181473, + "learning_rate": 0.00016423781448310004, + "loss": 0.3065, + "step": 12884 + }, + { + "epoch": 1.0438269604666235, + "grad_norm": 0.034060198813676834, + "learning_rate": 0.00016423331383050543, + "loss": 0.3074, + "step": 12885 + }, + { + "epoch": 1.043907971484122, + "grad_norm": 0.03889356181025505, + "learning_rate": 0.00016422881317791082, + "loss": 0.3294, + "step": 12886 + }, + { + "epoch": 1.0439889825016202, + "grad_norm": 0.03599948436021805, + "learning_rate": 0.00016422431252531618, + "loss": 0.288, + "step": 12887 + }, + { + "epoch": 1.0440699935191187, + "grad_norm": 0.03581930696964264, + "learning_rate": 0.00016421981187272154, + "loss": 0.2796, + "step": 12888 + }, + { + "epoch": 1.044151004536617, + "grad_norm": 0.037626467645168304, + "learning_rate": 0.00016421531122012693, + "loss": 0.3272, + "step": 12889 + }, + { + "epoch": 1.0442320155541154, + "grad_norm": 0.04451899230480194, + "learning_rate": 0.00016421081056753229, + "loss": 0.3691, + "step": 12890 + }, + { + "epoch": 1.0443130265716138, + "grad_norm": 0.037104591727256775, + "learning_rate": 0.00016420630991493767, + "loss": 0.3217, + "step": 12891 + }, + { + "epoch": 1.044394037589112, + "grad_norm": 0.0360608771443367, + "learning_rate": 0.00016420180926234306, + "loss": 0.305, + "step": 12892 + }, + { + "epoch": 1.0444750486066106, + "grad_norm": 
0.03921990096569061, + "learning_rate": 0.00016419730860974842, + "loss": 0.3319, + "step": 12893 + }, + { + "epoch": 1.0445560596241088, + "grad_norm": 0.04052642732858658, + "learning_rate": 0.0001641928079571538, + "loss": 0.3399, + "step": 12894 + }, + { + "epoch": 1.0446370706416073, + "grad_norm": 0.0338423028588295, + "learning_rate": 0.00016418830730455917, + "loss": 0.2775, + "step": 12895 + }, + { + "epoch": 1.0447180816591057, + "grad_norm": 0.03587032109498978, + "learning_rate": 0.00016418380665196453, + "loss": 0.3293, + "step": 12896 + }, + { + "epoch": 1.044799092676604, + "grad_norm": 0.041994258761405945, + "learning_rate": 0.00016417930599936991, + "loss": 0.3898, + "step": 12897 + }, + { + "epoch": 1.0448801036941024, + "grad_norm": 0.04043351486325264, + "learning_rate": 0.0001641748053467753, + "loss": 0.3151, + "step": 12898 + }, + { + "epoch": 1.0449611147116007, + "grad_norm": 0.03635840490460396, + "learning_rate": 0.00016417030469418066, + "loss": 0.3521, + "step": 12899 + }, + { + "epoch": 1.0450421257290992, + "grad_norm": 0.034990232437849045, + "learning_rate": 0.00016416580404158605, + "loss": 0.2937, + "step": 12900 + }, + { + "epoch": 1.0451231367465976, + "grad_norm": 0.03458486497402191, + "learning_rate": 0.0001641613033889914, + "loss": 0.3201, + "step": 12901 + }, + { + "epoch": 1.0452041477640959, + "grad_norm": 0.04071643203496933, + "learning_rate": 0.00016415680273639677, + "loss": 0.323, + "step": 12902 + }, + { + "epoch": 1.0452851587815943, + "grad_norm": 0.03523261845111847, + "learning_rate": 0.00016415230208380216, + "loss": 0.285, + "step": 12903 + }, + { + "epoch": 1.0453661697990926, + "grad_norm": 0.035539254546165466, + "learning_rate": 0.00016414780143120754, + "loss": 0.321, + "step": 12904 + }, + { + "epoch": 1.045447180816591, + "grad_norm": 0.03952726349234581, + "learning_rate": 0.0001641433007786129, + "loss": 0.3593, + "step": 12905 + }, + { + "epoch": 1.0455281918340895, + "grad_norm": 0.03822580352425575, + "learning_rate": 0.0001641388001260183, + "loss": 0.3165, + "step": 12906 + }, + { + "epoch": 1.0456092028515878, + "grad_norm": 0.03817811980843544, + "learning_rate": 0.00016413429947342365, + "loss": 0.3337, + "step": 12907 + }, + { + "epoch": 1.0456902138690862, + "grad_norm": 0.0448293499648571, + "learning_rate": 0.000164129798820829, + "loss": 0.3813, + "step": 12908 + }, + { + "epoch": 1.0457712248865845, + "grad_norm": 0.03960398584604263, + "learning_rate": 0.0001641252981682344, + "loss": 0.3625, + "step": 12909 + }, + { + "epoch": 1.045852235904083, + "grad_norm": 0.041171953082084656, + "learning_rate": 0.00016412079751563978, + "loss": 0.3329, + "step": 12910 + }, + { + "epoch": 1.0459332469215814, + "grad_norm": 0.03784125670790672, + "learning_rate": 0.00016411629686304514, + "loss": 0.317, + "step": 12911 + }, + { + "epoch": 1.0460142579390797, + "grad_norm": 0.034470465034246445, + "learning_rate": 0.00016411179621045053, + "loss": 0.2741, + "step": 12912 + }, + { + "epoch": 1.0460952689565781, + "grad_norm": 0.04065170884132385, + "learning_rate": 0.0001641072955578559, + "loss": 0.3182, + "step": 12913 + }, + { + "epoch": 1.0461762799740764, + "grad_norm": 0.03109212964773178, + "learning_rate": 0.00016410279490526128, + "loss": 0.2838, + "step": 12914 + }, + { + "epoch": 1.0462572909915748, + "grad_norm": 0.03720557689666748, + "learning_rate": 0.00016409829425266664, + "loss": 0.311, + "step": 12915 + }, + { + "epoch": 1.0463383020090733, + "grad_norm": 0.03801567479968071, + "learning_rate": 
0.00016409379360007203, + "loss": 0.333, + "step": 12916 + }, + { + "epoch": 1.0464193130265715, + "grad_norm": 0.037921782582998276, + "learning_rate": 0.00016408929294747739, + "loss": 0.3161, + "step": 12917 + }, + { + "epoch": 1.04650032404407, + "grad_norm": 0.03623802587389946, + "learning_rate": 0.00016408479229488277, + "loss": 0.3286, + "step": 12918 + }, + { + "epoch": 1.0465813350615685, + "grad_norm": 0.03764597326517105, + "learning_rate": 0.00016408029164228813, + "loss": 0.3306, + "step": 12919 + }, + { + "epoch": 1.0466623460790667, + "grad_norm": 0.03784124553203583, + "learning_rate": 0.00016407579098969352, + "loss": 0.3103, + "step": 12920 + }, + { + "epoch": 1.0467433570965652, + "grad_norm": 0.038992203772068024, + "learning_rate": 0.00016407129033709888, + "loss": 0.3061, + "step": 12921 + }, + { + "epoch": 1.0468243681140634, + "grad_norm": 0.03431059420108795, + "learning_rate": 0.00016406678968450427, + "loss": 0.2677, + "step": 12922 + }, + { + "epoch": 1.046905379131562, + "grad_norm": 0.03752612695097923, + "learning_rate": 0.00016406228903190963, + "loss": 0.3261, + "step": 12923 + }, + { + "epoch": 1.0469863901490604, + "grad_norm": 0.031996533274650574, + "learning_rate": 0.00016405778837931502, + "loss": 0.2638, + "step": 12924 + }, + { + "epoch": 1.0470674011665586, + "grad_norm": 0.03687674179673195, + "learning_rate": 0.00016405328772672038, + "loss": 0.335, + "step": 12925 + }, + { + "epoch": 1.047148412184057, + "grad_norm": 0.039893630892038345, + "learning_rate": 0.00016404878707412576, + "loss": 0.3086, + "step": 12926 + }, + { + "epoch": 1.0472294232015553, + "grad_norm": 0.038995370268821716, + "learning_rate": 0.00016404428642153112, + "loss": 0.3669, + "step": 12927 + }, + { + "epoch": 1.0473104342190538, + "grad_norm": 0.04130131006240845, + "learning_rate": 0.0001640397857689365, + "loss": 0.3297, + "step": 12928 + }, + { + "epoch": 1.0473914452365523, + "grad_norm": 0.04454691708087921, + "learning_rate": 0.00016403528511634187, + "loss": 0.3558, + "step": 12929 + }, + { + "epoch": 1.0474724562540505, + "grad_norm": 0.03834373503923416, + "learning_rate": 0.00016403078446374726, + "loss": 0.3092, + "step": 12930 + }, + { + "epoch": 1.047553467271549, + "grad_norm": 0.037397079169750214, + "learning_rate": 0.00016402628381115262, + "loss": 0.2944, + "step": 12931 + }, + { + "epoch": 1.0476344782890472, + "grad_norm": 0.03549044206738472, + "learning_rate": 0.000164021783158558, + "loss": 0.2894, + "step": 12932 + }, + { + "epoch": 1.0477154893065457, + "grad_norm": 0.03430061787366867, + "learning_rate": 0.00016401728250596336, + "loss": 0.3142, + "step": 12933 + }, + { + "epoch": 1.0477965003240441, + "grad_norm": 0.0349055640399456, + "learning_rate": 0.00016401278185336875, + "loss": 0.2956, + "step": 12934 + }, + { + "epoch": 1.0478775113415424, + "grad_norm": 0.038699712604284286, + "learning_rate": 0.0001640082812007741, + "loss": 0.3227, + "step": 12935 + }, + { + "epoch": 1.0479585223590409, + "grad_norm": 0.03880271688103676, + "learning_rate": 0.0001640037805481795, + "loss": 0.293, + "step": 12936 + }, + { + "epoch": 1.048039533376539, + "grad_norm": 0.03803123161196709, + "learning_rate": 0.00016399927989558489, + "loss": 0.3085, + "step": 12937 + }, + { + "epoch": 1.0481205443940376, + "grad_norm": 0.03667737543582916, + "learning_rate": 0.00016399477924299025, + "loss": 0.2785, + "step": 12938 + }, + { + "epoch": 1.048201555411536, + "grad_norm": 0.038014575839042664, + "learning_rate": 0.0001639902785903956, + "loss": 0.3486, + 
"step": 12939 + }, + { + "epoch": 1.0482825664290343, + "grad_norm": 0.03516806662082672, + "learning_rate": 0.000163985777937801, + "loss": 0.3293, + "step": 12940 + }, + { + "epoch": 1.0483635774465327, + "grad_norm": 0.03823082149028778, + "learning_rate": 0.00016398127728520635, + "loss": 0.3442, + "step": 12941 + }, + { + "epoch": 1.0484445884640312, + "grad_norm": 0.03937181457877159, + "learning_rate": 0.00016397677663261174, + "loss": 0.3099, + "step": 12942 + }, + { + "epoch": 1.0485255994815295, + "grad_norm": 0.03163138031959534, + "learning_rate": 0.00016397227598001713, + "loss": 0.2934, + "step": 12943 + }, + { + "epoch": 1.048606610499028, + "grad_norm": 0.03477585315704346, + "learning_rate": 0.0001639677753274225, + "loss": 0.2887, + "step": 12944 + }, + { + "epoch": 1.0486876215165262, + "grad_norm": 0.03753078728914261, + "learning_rate": 0.00016396327467482785, + "loss": 0.3147, + "step": 12945 + }, + { + "epoch": 1.0487686325340246, + "grad_norm": 0.04368378594517708, + "learning_rate": 0.00016395877402223323, + "loss": 0.3497, + "step": 12946 + }, + { + "epoch": 1.048849643551523, + "grad_norm": 0.04576634243130684, + "learning_rate": 0.0001639542733696386, + "loss": 0.3426, + "step": 12947 + }, + { + "epoch": 1.0489306545690213, + "grad_norm": 0.037550900131464005, + "learning_rate": 0.00016394977271704398, + "loss": 0.3578, + "step": 12948 + }, + { + "epoch": 1.0490116655865198, + "grad_norm": 0.045421287417411804, + "learning_rate": 0.00016394527206444937, + "loss": 0.2868, + "step": 12949 + }, + { + "epoch": 1.049092676604018, + "grad_norm": 0.03243739530444145, + "learning_rate": 0.00016394077141185473, + "loss": 0.2742, + "step": 12950 + }, + { + "epoch": 1.0491736876215165, + "grad_norm": 0.039180856198072433, + "learning_rate": 0.0001639362707592601, + "loss": 0.3233, + "step": 12951 + }, + { + "epoch": 1.049254698639015, + "grad_norm": 0.04149232432246208, + "learning_rate": 0.00016393177010666548, + "loss": 0.335, + "step": 12952 + }, + { + "epoch": 1.0493357096565132, + "grad_norm": 0.04031761735677719, + "learning_rate": 0.00016392726945407084, + "loss": 0.3435, + "step": 12953 + }, + { + "epoch": 1.0494167206740117, + "grad_norm": 0.04236694052815437, + "learning_rate": 0.00016392276880147622, + "loss": 0.3478, + "step": 12954 + }, + { + "epoch": 1.04949773169151, + "grad_norm": 0.051949284970760345, + "learning_rate": 0.0001639182681488816, + "loss": 0.3584, + "step": 12955 + }, + { + "epoch": 1.0495787427090084, + "grad_norm": 0.046114757657051086, + "learning_rate": 0.00016391376749628697, + "loss": 0.3402, + "step": 12956 + }, + { + "epoch": 1.0496597537265069, + "grad_norm": 0.03253263607621193, + "learning_rate": 0.00016390926684369233, + "loss": 0.2886, + "step": 12957 + }, + { + "epoch": 1.0497407647440051, + "grad_norm": 0.04305479675531387, + "learning_rate": 0.00016390476619109772, + "loss": 0.2942, + "step": 12958 + }, + { + "epoch": 1.0498217757615036, + "grad_norm": 0.03444790840148926, + "learning_rate": 0.00016390026553850308, + "loss": 0.3051, + "step": 12959 + }, + { + "epoch": 1.0499027867790018, + "grad_norm": 0.03732207044959068, + "learning_rate": 0.00016389576488590846, + "loss": 0.3095, + "step": 12960 + }, + { + "epoch": 1.0499837977965003, + "grad_norm": 0.041557539254426956, + "learning_rate": 0.00016389126423331385, + "loss": 0.3039, + "step": 12961 + }, + { + "epoch": 1.0500648088139988, + "grad_norm": 0.03193178027868271, + "learning_rate": 0.0001638867635807192, + "loss": 0.2837, + "step": 12962 + }, + { + "epoch": 
1.050145819831497, + "grad_norm": 0.04071090370416641, + "learning_rate": 0.0001638822629281246, + "loss": 0.3618, + "step": 12963 + }, + { + "epoch": 1.0502268308489955, + "grad_norm": 0.036791760474443436, + "learning_rate": 0.00016387776227552996, + "loss": 0.3014, + "step": 12964 + }, + { + "epoch": 1.050307841866494, + "grad_norm": 0.04111775755882263, + "learning_rate": 0.00016387326162293532, + "loss": 0.3258, + "step": 12965 + }, + { + "epoch": 1.0503888528839922, + "grad_norm": 0.04430720582604408, + "learning_rate": 0.0001638687609703407, + "loss": 0.3106, + "step": 12966 + }, + { + "epoch": 1.0504698639014907, + "grad_norm": 0.039440203458070755, + "learning_rate": 0.0001638642603177461, + "loss": 0.2866, + "step": 12967 + }, + { + "epoch": 1.050550874918989, + "grad_norm": 0.042060382664203644, + "learning_rate": 0.00016385975966515145, + "loss": 0.3435, + "step": 12968 + }, + { + "epoch": 1.0506318859364874, + "grad_norm": 0.03567097708582878, + "learning_rate": 0.00016385525901255684, + "loss": 0.2904, + "step": 12969 + }, + { + "epoch": 1.0507128969539858, + "grad_norm": 0.038154810667037964, + "learning_rate": 0.0001638507583599622, + "loss": 0.3069, + "step": 12970 + }, + { + "epoch": 1.050793907971484, + "grad_norm": 0.043698426336050034, + "learning_rate": 0.00016384625770736756, + "loss": 0.3597, + "step": 12971 + }, + { + "epoch": 1.0508749189889826, + "grad_norm": 0.040363430976867676, + "learning_rate": 0.00016384175705477295, + "loss": 0.3008, + "step": 12972 + }, + { + "epoch": 1.0509559300064808, + "grad_norm": 0.03564104810357094, + "learning_rate": 0.00016383725640217834, + "loss": 0.3045, + "step": 12973 + }, + { + "epoch": 1.0510369410239793, + "grad_norm": 0.03445684164762497, + "learning_rate": 0.0001638327557495837, + "loss": 0.29, + "step": 12974 + }, + { + "epoch": 1.0511179520414777, + "grad_norm": 0.03905816003680229, + "learning_rate": 0.00016382825509698908, + "loss": 0.2679, + "step": 12975 + }, + { + "epoch": 1.051198963058976, + "grad_norm": 0.0405798964202404, + "learning_rate": 0.00016382375444439444, + "loss": 0.3466, + "step": 12976 + }, + { + "epoch": 1.0512799740764744, + "grad_norm": 0.03701363876461983, + "learning_rate": 0.0001638192537917998, + "loss": 0.2943, + "step": 12977 + }, + { + "epoch": 1.0513609850939727, + "grad_norm": 0.042401645332574844, + "learning_rate": 0.0001638147531392052, + "loss": 0.3147, + "step": 12978 + }, + { + "epoch": 1.0514419961114712, + "grad_norm": 0.036430634558200836, + "learning_rate": 0.00016381025248661058, + "loss": 0.3064, + "step": 12979 + }, + { + "epoch": 1.0515230071289696, + "grad_norm": 0.03717387095093727, + "learning_rate": 0.00016380575183401594, + "loss": 0.2709, + "step": 12980 + }, + { + "epoch": 1.0516040181464679, + "grad_norm": 0.04026516526937485, + "learning_rate": 0.00016380125118142132, + "loss": 0.2824, + "step": 12981 + }, + { + "epoch": 1.0516850291639663, + "grad_norm": 0.03760538995265961, + "learning_rate": 0.00016379675052882668, + "loss": 0.3229, + "step": 12982 + }, + { + "epoch": 1.0517660401814646, + "grad_norm": 0.03460705652832985, + "learning_rate": 0.00016379224987623204, + "loss": 0.2997, + "step": 12983 + }, + { + "epoch": 1.051847051198963, + "grad_norm": 0.04178851097822189, + "learning_rate": 0.00016378774922363743, + "loss": 0.3235, + "step": 12984 + }, + { + "epoch": 1.0519280622164615, + "grad_norm": 0.04115087911486626, + "learning_rate": 0.00016378324857104282, + "loss": 0.3357, + "step": 12985 + }, + { + "epoch": 1.0520090732339598, + "grad_norm": 
0.040281862020492554, + "learning_rate": 0.00016377874791844818, + "loss": 0.3274, + "step": 12986 + }, + { + "epoch": 1.0520900842514582, + "grad_norm": 0.04082822427153587, + "learning_rate": 0.00016377424726585357, + "loss": 0.3274, + "step": 12987 + }, + { + "epoch": 1.0521710952689567, + "grad_norm": 0.03982575237751007, + "learning_rate": 0.00016376974661325893, + "loss": 0.3402, + "step": 12988 + }, + { + "epoch": 1.052252106286455, + "grad_norm": 0.03970259800553322, + "learning_rate": 0.0001637652459606643, + "loss": 0.335, + "step": 12989 + }, + { + "epoch": 1.0523331173039534, + "grad_norm": 0.034251753240823746, + "learning_rate": 0.00016376074530806967, + "loss": 0.2573, + "step": 12990 + }, + { + "epoch": 1.0524141283214516, + "grad_norm": 0.03692222386598587, + "learning_rate": 0.00016375624465547506, + "loss": 0.2965, + "step": 12991 + }, + { + "epoch": 1.0524951393389501, + "grad_norm": 0.03736606240272522, + "learning_rate": 0.00016375174400288042, + "loss": 0.3059, + "step": 12992 + }, + { + "epoch": 1.0525761503564486, + "grad_norm": 0.043112173676490784, + "learning_rate": 0.0001637472433502858, + "loss": 0.3189, + "step": 12993 + }, + { + "epoch": 1.0526571613739468, + "grad_norm": 0.034782927483320236, + "learning_rate": 0.00016374274269769117, + "loss": 0.2956, + "step": 12994 + }, + { + "epoch": 1.0527381723914453, + "grad_norm": 0.04126705229282379, + "learning_rate": 0.00016373824204509655, + "loss": 0.3298, + "step": 12995 + }, + { + "epoch": 1.0528191834089435, + "grad_norm": 0.037063319236040115, + "learning_rate": 0.00016373374139250191, + "loss": 0.3403, + "step": 12996 + }, + { + "epoch": 1.052900194426442, + "grad_norm": 0.042787209153175354, + "learning_rate": 0.0001637292407399073, + "loss": 0.3404, + "step": 12997 + }, + { + "epoch": 1.0529812054439405, + "grad_norm": 0.04349652677774429, + "learning_rate": 0.00016372474008731266, + "loss": 0.3265, + "step": 12998 + }, + { + "epoch": 1.0530622164614387, + "grad_norm": 0.03636706620454788, + "learning_rate": 0.00016372023943471805, + "loss": 0.2942, + "step": 12999 + }, + { + "epoch": 1.0531432274789372, + "grad_norm": 0.05386228486895561, + "learning_rate": 0.0001637157387821234, + "loss": 0.3552, + "step": 13000 + }, + { + "epoch": 1.0532242384964354, + "grad_norm": 0.04019056633114815, + "learning_rate": 0.0001637112381295288, + "loss": 0.3149, + "step": 13001 + }, + { + "epoch": 1.053305249513934, + "grad_norm": 0.047883372753858566, + "learning_rate": 0.00016370673747693416, + "loss": 0.3082, + "step": 13002 + }, + { + "epoch": 1.0533862605314324, + "grad_norm": 0.03943159058690071, + "learning_rate": 0.00016370223682433954, + "loss": 0.3077, + "step": 13003 + }, + { + "epoch": 1.0534672715489306, + "grad_norm": 0.038364700973033905, + "learning_rate": 0.0001636977361717449, + "loss": 0.3107, + "step": 13004 + }, + { + "epoch": 1.053548282566429, + "grad_norm": 0.03844317048788071, + "learning_rate": 0.0001636932355191503, + "loss": 0.3027, + "step": 13005 + }, + { + "epoch": 1.0536292935839273, + "grad_norm": 0.04385514557361603, + "learning_rate": 0.00016368873486655565, + "loss": 0.3321, + "step": 13006 + }, + { + "epoch": 1.0537103046014258, + "grad_norm": 0.038291554898023605, + "learning_rate": 0.00016368423421396104, + "loss": 0.3239, + "step": 13007 + }, + { + "epoch": 1.0537913156189243, + "grad_norm": 0.03738854452967644, + "learning_rate": 0.0001636797335613664, + "loss": 0.3322, + "step": 13008 + }, + { + "epoch": 1.0538723266364225, + "grad_norm": 0.0458126999437809, + "learning_rate": 
0.00016367523290877178, + "loss": 0.3485, + "step": 13009 + }, + { + "epoch": 1.053953337653921, + "grad_norm": 0.03953162580728531, + "learning_rate": 0.00016367073225617715, + "loss": 0.2743, + "step": 13010 + }, + { + "epoch": 1.0540343486714192, + "grad_norm": 0.044255178421735764, + "learning_rate": 0.00016366623160358253, + "loss": 0.3115, + "step": 13011 + }, + { + "epoch": 1.0541153596889177, + "grad_norm": 0.03467179089784622, + "learning_rate": 0.0001636617309509879, + "loss": 0.281, + "step": 13012 + }, + { + "epoch": 1.0541963707064161, + "grad_norm": 0.038243770599365234, + "learning_rate": 0.00016365723029839328, + "loss": 0.3054, + "step": 13013 + }, + { + "epoch": 1.0542773817239144, + "grad_norm": 0.039358288049697876, + "learning_rate": 0.00016365272964579864, + "loss": 0.3009, + "step": 13014 + }, + { + "epoch": 1.0543583927414129, + "grad_norm": 0.03707633540034294, + "learning_rate": 0.00016364822899320403, + "loss": 0.312, + "step": 13015 + }, + { + "epoch": 1.054439403758911, + "grad_norm": 0.04041290655732155, + "learning_rate": 0.0001636437283406094, + "loss": 0.3237, + "step": 13016 + }, + { + "epoch": 1.0545204147764096, + "grad_norm": 0.05521214008331299, + "learning_rate": 0.00016363922768801477, + "loss": 0.3193, + "step": 13017 + }, + { + "epoch": 1.054601425793908, + "grad_norm": 0.03907698765397072, + "learning_rate": 0.00016363472703542016, + "loss": 0.3282, + "step": 13018 + }, + { + "epoch": 1.0546824368114063, + "grad_norm": 0.0368485264480114, + "learning_rate": 0.00016363022638282552, + "loss": 0.3078, + "step": 13019 + }, + { + "epoch": 1.0547634478289047, + "grad_norm": 0.036330461502075195, + "learning_rate": 0.00016362572573023088, + "loss": 0.2957, + "step": 13020 + }, + { + "epoch": 1.0548444588464032, + "grad_norm": 0.045567553490400314, + "learning_rate": 0.00016362122507763627, + "loss": 0.3387, + "step": 13021 + }, + { + "epoch": 1.0549254698639015, + "grad_norm": 0.035702306777238846, + "learning_rate": 0.00016361672442504163, + "loss": 0.2929, + "step": 13022 + }, + { + "epoch": 1.0550064808814, + "grad_norm": 0.042946431785821915, + "learning_rate": 0.00016361222377244702, + "loss": 0.3737, + "step": 13023 + }, + { + "epoch": 1.0550874918988982, + "grad_norm": 0.038137488067150116, + "learning_rate": 0.0001636077231198524, + "loss": 0.317, + "step": 13024 + }, + { + "epoch": 1.0551685029163966, + "grad_norm": 0.036011893302202225, + "learning_rate": 0.00016360322246725776, + "loss": 0.3223, + "step": 13025 + }, + { + "epoch": 1.055249513933895, + "grad_norm": 0.039589326828718185, + "learning_rate": 0.00016359872181466312, + "loss": 0.3017, + "step": 13026 + }, + { + "epoch": 1.0553305249513933, + "grad_norm": 0.04104938358068466, + "learning_rate": 0.0001635942211620685, + "loss": 0.2986, + "step": 13027 + }, + { + "epoch": 1.0554115359688918, + "grad_norm": 0.03723173215985298, + "learning_rate": 0.00016358972050947387, + "loss": 0.2994, + "step": 13028 + }, + { + "epoch": 1.05549254698639, + "grad_norm": 0.039599400013685226, + "learning_rate": 0.00016358521985687926, + "loss": 0.3278, + "step": 13029 + }, + { + "epoch": 1.0555735580038885, + "grad_norm": 0.034441448748111725, + "learning_rate": 0.00016358071920428464, + "loss": 0.3217, + "step": 13030 + }, + { + "epoch": 1.055654569021387, + "grad_norm": 0.037437498569488525, + "learning_rate": 0.00016357621855169, + "loss": 0.2911, + "step": 13031 + }, + { + "epoch": 1.0557355800388852, + "grad_norm": 0.0356329008936882, + "learning_rate": 0.0001635717178990954, + "loss": 0.3308, + 
"step": 13032 + }, + { + "epoch": 1.0558165910563837, + "grad_norm": 0.03364642709493637, + "learning_rate": 0.00016356721724650075, + "loss": 0.2906, + "step": 13033 + }, + { + "epoch": 1.055897602073882, + "grad_norm": 0.04249563440680504, + "learning_rate": 0.0001635627165939061, + "loss": 0.342, + "step": 13034 + }, + { + "epoch": 1.0559786130913804, + "grad_norm": 0.04023630544543266, + "learning_rate": 0.0001635582159413115, + "loss": 0.3249, + "step": 13035 + }, + { + "epoch": 1.0560596241088789, + "grad_norm": 0.03922013193368912, + "learning_rate": 0.00016355371528871689, + "loss": 0.3284, + "step": 13036 + }, + { + "epoch": 1.0561406351263771, + "grad_norm": 0.040603943169116974, + "learning_rate": 0.00016354921463612225, + "loss": 0.3261, + "step": 13037 + }, + { + "epoch": 1.0562216461438756, + "grad_norm": 0.03869624435901642, + "learning_rate": 0.00016354471398352763, + "loss": 0.275, + "step": 13038 + }, + { + "epoch": 1.0563026571613738, + "grad_norm": 0.04690798372030258, + "learning_rate": 0.000163540213330933, + "loss": 0.3152, + "step": 13039 + }, + { + "epoch": 1.0563836681788723, + "grad_norm": 0.0407758466899395, + "learning_rate": 0.00016353571267833835, + "loss": 0.3065, + "step": 13040 + }, + { + "epoch": 1.0564646791963708, + "grad_norm": 0.03870948776602745, + "learning_rate": 0.00016353121202574374, + "loss": 0.3174, + "step": 13041 + }, + { + "epoch": 1.056545690213869, + "grad_norm": 0.0395541675388813, + "learning_rate": 0.00016352671137314913, + "loss": 0.2982, + "step": 13042 + }, + { + "epoch": 1.0566267012313675, + "grad_norm": 0.03934469446539879, + "learning_rate": 0.0001635222107205545, + "loss": 0.3801, + "step": 13043 + }, + { + "epoch": 1.056707712248866, + "grad_norm": 0.035781171172857285, + "learning_rate": 0.00016351771006795987, + "loss": 0.2897, + "step": 13044 + }, + { + "epoch": 1.0567887232663642, + "grad_norm": 0.04096796363592148, + "learning_rate": 0.00016351320941536523, + "loss": 0.327, + "step": 13045 + }, + { + "epoch": 1.0568697342838627, + "grad_norm": 0.04043768346309662, + "learning_rate": 0.0001635087087627706, + "loss": 0.313, + "step": 13046 + }, + { + "epoch": 1.056950745301361, + "grad_norm": 0.03739682585000992, + "learning_rate": 0.00016350420811017598, + "loss": 0.2706, + "step": 13047 + }, + { + "epoch": 1.0570317563188594, + "grad_norm": 0.0397224985063076, + "learning_rate": 0.00016349970745758137, + "loss": 0.3215, + "step": 13048 + }, + { + "epoch": 1.0571127673363578, + "grad_norm": 0.04893937706947327, + "learning_rate": 0.00016349520680498673, + "loss": 0.3143, + "step": 13049 + }, + { + "epoch": 1.057193778353856, + "grad_norm": 0.03793314844369888, + "learning_rate": 0.00016349070615239212, + "loss": 0.3362, + "step": 13050 + }, + { + "epoch": 1.0572747893713546, + "grad_norm": 0.043163977563381195, + "learning_rate": 0.00016348620549979748, + "loss": 0.3462, + "step": 13051 + }, + { + "epoch": 1.0573558003888528, + "grad_norm": 0.04014117643237114, + "learning_rate": 0.00016348170484720284, + "loss": 0.3339, + "step": 13052 + }, + { + "epoch": 1.0574368114063513, + "grad_norm": 0.04330155998468399, + "learning_rate": 0.00016347720419460822, + "loss": 0.313, + "step": 13053 + }, + { + "epoch": 1.0575178224238497, + "grad_norm": 0.0403081513941288, + "learning_rate": 0.0001634727035420136, + "loss": 0.4026, + "step": 13054 + }, + { + "epoch": 1.057598833441348, + "grad_norm": 0.03789180889725685, + "learning_rate": 0.00016346820288941897, + "loss": 0.2955, + "step": 13055 + }, + { + "epoch": 1.0576798444588464, 
+ "grad_norm": 0.03631223365664482, + "learning_rate": 0.00016346370223682436, + "loss": 0.3115, + "step": 13056 + }, + { + "epoch": 1.0577608554763447, + "grad_norm": 0.03948785737156868, + "learning_rate": 0.00016345920158422972, + "loss": 0.2895, + "step": 13057 + }, + { + "epoch": 1.0578418664938432, + "grad_norm": 0.03748257830739021, + "learning_rate": 0.00016345470093163508, + "loss": 0.3001, + "step": 13058 + }, + { + "epoch": 1.0579228775113416, + "grad_norm": 0.03798984736204147, + "learning_rate": 0.00016345020027904047, + "loss": 0.3112, + "step": 13059 + }, + { + "epoch": 1.0580038885288399, + "grad_norm": 0.033647872507572174, + "learning_rate": 0.00016344569962644585, + "loss": 0.2851, + "step": 13060 + }, + { + "epoch": 1.0580848995463383, + "grad_norm": 0.036701709032058716, + "learning_rate": 0.0001634411989738512, + "loss": 0.3051, + "step": 13061 + }, + { + "epoch": 1.0581659105638366, + "grad_norm": 0.040191926062107086, + "learning_rate": 0.0001634366983212566, + "loss": 0.3661, + "step": 13062 + }, + { + "epoch": 1.058246921581335, + "grad_norm": 0.036935754120349884, + "learning_rate": 0.00016343219766866196, + "loss": 0.3035, + "step": 13063 + }, + { + "epoch": 1.0583279325988335, + "grad_norm": 0.037047579884529114, + "learning_rate": 0.00016342769701606732, + "loss": 0.2997, + "step": 13064 + }, + { + "epoch": 1.0584089436163318, + "grad_norm": 0.0359288714826107, + "learning_rate": 0.0001634231963634727, + "loss": 0.3031, + "step": 13065 + }, + { + "epoch": 1.0584899546338302, + "grad_norm": 0.037823259830474854, + "learning_rate": 0.0001634186957108781, + "loss": 0.3405, + "step": 13066 + }, + { + "epoch": 1.0585709656513287, + "grad_norm": 0.03662921488285065, + "learning_rate": 0.00016341419505828345, + "loss": 0.3159, + "step": 13067 + }, + { + "epoch": 1.058651976668827, + "grad_norm": 0.04636320844292641, + "learning_rate": 0.00016340969440568884, + "loss": 0.3256, + "step": 13068 + }, + { + "epoch": 1.0587329876863254, + "grad_norm": 0.04248104989528656, + "learning_rate": 0.0001634051937530942, + "loss": 0.354, + "step": 13069 + }, + { + "epoch": 1.0588139987038236, + "grad_norm": 0.03274742141366005, + "learning_rate": 0.0001634006931004996, + "loss": 0.2699, + "step": 13070 + }, + { + "epoch": 1.0588950097213221, + "grad_norm": 0.036615077406167984, + "learning_rate": 0.00016339619244790495, + "loss": 0.3416, + "step": 13071 + }, + { + "epoch": 1.0589760207388206, + "grad_norm": 0.03806189075112343, + "learning_rate": 0.00016339169179531034, + "loss": 0.3002, + "step": 13072 + }, + { + "epoch": 1.0590570317563188, + "grad_norm": 0.041345518082380295, + "learning_rate": 0.0001633871911427157, + "loss": 0.3322, + "step": 13073 + }, + { + "epoch": 1.0591380427738173, + "grad_norm": 0.037649959325790405, + "learning_rate": 0.00016338269049012108, + "loss": 0.3091, + "step": 13074 + }, + { + "epoch": 1.0592190537913155, + "grad_norm": 0.04146602004766464, + "learning_rate": 0.00016337818983752644, + "loss": 0.3423, + "step": 13075 + }, + { + "epoch": 1.059300064808814, + "grad_norm": 0.04146433621644974, + "learning_rate": 0.00016337368918493183, + "loss": 0.3235, + "step": 13076 + }, + { + "epoch": 1.0593810758263125, + "grad_norm": 0.04007774218916893, + "learning_rate": 0.0001633691885323372, + "loss": 0.3634, + "step": 13077 + }, + { + "epoch": 1.0594620868438107, + "grad_norm": 0.03304166719317436, + "learning_rate": 0.00016336468787974258, + "loss": 0.3014, + "step": 13078 + }, + { + "epoch": 1.0595430978613092, + "grad_norm": 0.04017319157719612, + 
"learning_rate": 0.00016336018722714794, + "loss": 0.3375, + "step": 13079 + }, + { + "epoch": 1.0596241088788074, + "grad_norm": 0.04752025380730629, + "learning_rate": 0.00016335568657455332, + "loss": 0.3722, + "step": 13080 + }, + { + "epoch": 1.059705119896306, + "grad_norm": 0.03203922510147095, + "learning_rate": 0.00016335118592195868, + "loss": 0.2696, + "step": 13081 + }, + { + "epoch": 1.0597861309138044, + "grad_norm": 0.03847889602184296, + "learning_rate": 0.00016334668526936407, + "loss": 0.3433, + "step": 13082 + }, + { + "epoch": 1.0598671419313026, + "grad_norm": 0.03569968417286873, + "learning_rate": 0.00016334218461676943, + "loss": 0.2844, + "step": 13083 + }, + { + "epoch": 1.059948152948801, + "grad_norm": 0.03317081928253174, + "learning_rate": 0.00016333768396417482, + "loss": 0.318, + "step": 13084 + }, + { + "epoch": 1.0600291639662993, + "grad_norm": 0.03200779855251312, + "learning_rate": 0.00016333318331158018, + "loss": 0.3009, + "step": 13085 + }, + { + "epoch": 1.0601101749837978, + "grad_norm": 0.04079586640000343, + "learning_rate": 0.00016332868265898557, + "loss": 0.3539, + "step": 13086 + }, + { + "epoch": 1.0601911860012962, + "grad_norm": 0.037388890981674194, + "learning_rate": 0.00016332418200639093, + "loss": 0.3013, + "step": 13087 + }, + { + "epoch": 1.0602721970187945, + "grad_norm": 0.04083319753408432, + "learning_rate": 0.0001633196813537963, + "loss": 0.3637, + "step": 13088 + }, + { + "epoch": 1.060353208036293, + "grad_norm": 0.03597578406333923, + "learning_rate": 0.00016331518070120167, + "loss": 0.3176, + "step": 13089 + }, + { + "epoch": 1.0604342190537914, + "grad_norm": 0.03463973477482796, + "learning_rate": 0.00016331068004860706, + "loss": 0.3161, + "step": 13090 + }, + { + "epoch": 1.0605152300712897, + "grad_norm": 0.03823839873075485, + "learning_rate": 0.00016330617939601242, + "loss": 0.3191, + "step": 13091 + }, + { + "epoch": 1.0605962410887881, + "grad_norm": 0.03756007179617882, + "learning_rate": 0.0001633016787434178, + "loss": 0.3219, + "step": 13092 + }, + { + "epoch": 1.0606772521062864, + "grad_norm": 0.0337604284286499, + "learning_rate": 0.00016329717809082317, + "loss": 0.3246, + "step": 13093 + }, + { + "epoch": 1.0607582631237849, + "grad_norm": 0.03267901390790939, + "learning_rate": 0.00016329267743822855, + "loss": 0.2514, + "step": 13094 + }, + { + "epoch": 1.0608392741412833, + "grad_norm": 0.03957577422261238, + "learning_rate": 0.00016328817678563391, + "loss": 0.3338, + "step": 13095 + }, + { + "epoch": 1.0609202851587816, + "grad_norm": 0.04130283370614052, + "learning_rate": 0.0001632836761330393, + "loss": 0.3544, + "step": 13096 + }, + { + "epoch": 1.06100129617628, + "grad_norm": 0.03718462213873863, + "learning_rate": 0.00016327917548044466, + "loss": 0.3132, + "step": 13097 + }, + { + "epoch": 1.0610823071937783, + "grad_norm": 0.03442040830850601, + "learning_rate": 0.00016327467482785005, + "loss": 0.3061, + "step": 13098 + }, + { + "epoch": 1.0611633182112767, + "grad_norm": 0.04067227989435196, + "learning_rate": 0.00016327017417525544, + "loss": 0.3235, + "step": 13099 + }, + { + "epoch": 1.0612443292287752, + "grad_norm": 0.035462141036987305, + "learning_rate": 0.0001632656735226608, + "loss": 0.301, + "step": 13100 + }, + { + "epoch": 1.0613253402462735, + "grad_norm": 0.040709950029850006, + "learning_rate": 0.00016326117287006618, + "loss": 0.3091, + "step": 13101 + }, + { + "epoch": 1.061406351263772, + "grad_norm": 0.040912844240665436, + "learning_rate": 0.00016325667221747154, + 
"loss": 0.3084, + "step": 13102 + }, + { + "epoch": 1.0614873622812702, + "grad_norm": 0.04343321919441223, + "learning_rate": 0.0001632521715648769, + "loss": 0.3367, + "step": 13103 + }, + { + "epoch": 1.0615683732987686, + "grad_norm": 0.03998672589659691, + "learning_rate": 0.0001632476709122823, + "loss": 0.3231, + "step": 13104 + }, + { + "epoch": 1.061649384316267, + "grad_norm": 0.03875920921564102, + "learning_rate": 0.00016324317025968768, + "loss": 0.3041, + "step": 13105 + }, + { + "epoch": 1.0617303953337653, + "grad_norm": 0.03709811344742775, + "learning_rate": 0.00016323866960709304, + "loss": 0.3094, + "step": 13106 + }, + { + "epoch": 1.0618114063512638, + "grad_norm": 0.05348275601863861, + "learning_rate": 0.00016323416895449843, + "loss": 0.2579, + "step": 13107 + }, + { + "epoch": 1.061892417368762, + "grad_norm": 0.042734403163194656, + "learning_rate": 0.00016322966830190379, + "loss": 0.3195, + "step": 13108 + }, + { + "epoch": 1.0619734283862605, + "grad_norm": 0.04072289541363716, + "learning_rate": 0.00016322516764930915, + "loss": 0.3008, + "step": 13109 + }, + { + "epoch": 1.062054439403759, + "grad_norm": 0.038316257297992706, + "learning_rate": 0.00016322066699671453, + "loss": 0.3334, + "step": 13110 + }, + { + "epoch": 1.0621354504212572, + "grad_norm": 0.031170308589935303, + "learning_rate": 0.00016321616634411992, + "loss": 0.2796, + "step": 13111 + }, + { + "epoch": 1.0622164614387557, + "grad_norm": 0.0355224534869194, + "learning_rate": 0.00016321166569152528, + "loss": 0.3164, + "step": 13112 + }, + { + "epoch": 1.062297472456254, + "grad_norm": 0.038635775446891785, + "learning_rate": 0.00016320716503893067, + "loss": 0.3264, + "step": 13113 + }, + { + "epoch": 1.0623784834737524, + "grad_norm": 0.03472799062728882, + "learning_rate": 0.00016320266438633603, + "loss": 0.3303, + "step": 13114 + }, + { + "epoch": 1.0624594944912509, + "grad_norm": 0.033231720328330994, + "learning_rate": 0.0001631981637337414, + "loss": 0.2996, + "step": 13115 + }, + { + "epoch": 1.0625405055087491, + "grad_norm": 0.03596806153655052, + "learning_rate": 0.00016319366308114677, + "loss": 0.3168, + "step": 13116 + }, + { + "epoch": 1.0626215165262476, + "grad_norm": 0.03601103276014328, + "learning_rate": 0.00016318916242855216, + "loss": 0.2643, + "step": 13117 + }, + { + "epoch": 1.0627025275437458, + "grad_norm": 0.03784925863146782, + "learning_rate": 0.00016318466177595752, + "loss": 0.2973, + "step": 13118 + }, + { + "epoch": 1.0627835385612443, + "grad_norm": 0.038176700472831726, + "learning_rate": 0.0001631801611233629, + "loss": 0.3395, + "step": 13119 + }, + { + "epoch": 1.0628645495787428, + "grad_norm": 0.03922443091869354, + "learning_rate": 0.00016317566047076827, + "loss": 0.3558, + "step": 13120 + }, + { + "epoch": 1.062945560596241, + "grad_norm": 0.03958688676357269, + "learning_rate": 0.00016317115981817363, + "loss": 0.2903, + "step": 13121 + }, + { + "epoch": 1.0630265716137395, + "grad_norm": 0.04546581208705902, + "learning_rate": 0.00016316665916557902, + "loss": 0.3679, + "step": 13122 + }, + { + "epoch": 1.063107582631238, + "grad_norm": 0.04076612740755081, + "learning_rate": 0.0001631621585129844, + "loss": 0.2901, + "step": 13123 + }, + { + "epoch": 1.0631885936487362, + "grad_norm": 0.030991926789283752, + "learning_rate": 0.00016315765786038976, + "loss": 0.2661, + "step": 13124 + }, + { + "epoch": 1.0632696046662347, + "grad_norm": 0.040161989629268646, + "learning_rate": 0.00016315315720779515, + "loss": 0.3112, + "step": 13125 + }, + 
{ + "epoch": 1.063350615683733, + "grad_norm": 0.03585921600461006, + "learning_rate": 0.0001631486565552005, + "loss": 0.2859, + "step": 13126 + }, + { + "epoch": 1.0634316267012314, + "grad_norm": 0.037038449198007584, + "learning_rate": 0.00016314415590260587, + "loss": 0.2949, + "step": 13127 + }, + { + "epoch": 1.0635126377187298, + "grad_norm": 0.03819497674703598, + "learning_rate": 0.00016313965525001126, + "loss": 0.355, + "step": 13128 + }, + { + "epoch": 1.063593648736228, + "grad_norm": 0.03675805404782295, + "learning_rate": 0.00016313515459741664, + "loss": 0.3155, + "step": 13129 + }, + { + "epoch": 1.0636746597537265, + "grad_norm": 0.04112779349088669, + "learning_rate": 0.000163130653944822, + "loss": 0.3144, + "step": 13130 + }, + { + "epoch": 1.0637556707712248, + "grad_norm": 0.04026306793093681, + "learning_rate": 0.0001631261532922274, + "loss": 0.3382, + "step": 13131 + }, + { + "epoch": 1.0638366817887233, + "grad_norm": 0.037816621363162994, + "learning_rate": 0.00016312165263963275, + "loss": 0.3044, + "step": 13132 + }, + { + "epoch": 1.0639176928062217, + "grad_norm": 0.04128459095954895, + "learning_rate": 0.0001631171519870381, + "loss": 0.3149, + "step": 13133 + }, + { + "epoch": 1.06399870382372, + "grad_norm": 0.036820001900196075, + "learning_rate": 0.0001631126513344435, + "loss": 0.295, + "step": 13134 + }, + { + "epoch": 1.0640797148412184, + "grad_norm": 0.040834393352270126, + "learning_rate": 0.00016310815068184889, + "loss": 0.3369, + "step": 13135 + }, + { + "epoch": 1.0641607258587167, + "grad_norm": 0.03920666500926018, + "learning_rate": 0.00016310365002925425, + "loss": 0.3184, + "step": 13136 + }, + { + "epoch": 1.0642417368762151, + "grad_norm": 0.04006076604127884, + "learning_rate": 0.00016309914937665963, + "loss": 0.3597, + "step": 13137 + }, + { + "epoch": 1.0643227478937136, + "grad_norm": 0.03771907463669777, + "learning_rate": 0.000163094648724065, + "loss": 0.3003, + "step": 13138 + }, + { + "epoch": 1.0644037589112119, + "grad_norm": 0.04066121578216553, + "learning_rate": 0.00016309014807147035, + "loss": 0.3441, + "step": 13139 + }, + { + "epoch": 1.0644847699287103, + "grad_norm": 0.033965013921260834, + "learning_rate": 0.00016308564741887574, + "loss": 0.3123, + "step": 13140 + }, + { + "epoch": 1.0645657809462086, + "grad_norm": 0.04005855694413185, + "learning_rate": 0.00016308114676628113, + "loss": 0.3413, + "step": 13141 + }, + { + "epoch": 1.064646791963707, + "grad_norm": 0.03749069198966026, + "learning_rate": 0.0001630766461136865, + "loss": 0.2959, + "step": 13142 + }, + { + "epoch": 1.0647278029812055, + "grad_norm": 0.04194750636816025, + "learning_rate": 0.00016307214546109187, + "loss": 0.3204, + "step": 13143 + }, + { + "epoch": 1.0648088139987038, + "grad_norm": 0.035881157964468, + "learning_rate": 0.00016306764480849723, + "loss": 0.297, + "step": 13144 + }, + { + "epoch": 1.0648898250162022, + "grad_norm": 0.03590167313814163, + "learning_rate": 0.0001630631441559026, + "loss": 0.2872, + "step": 13145 + }, + { + "epoch": 1.0649708360337007, + "grad_norm": 0.04288012161850929, + "learning_rate": 0.00016305864350330798, + "loss": 0.3507, + "step": 13146 + }, + { + "epoch": 1.065051847051199, + "grad_norm": 0.035566214472055435, + "learning_rate": 0.00016305414285071337, + "loss": 0.3073, + "step": 13147 + }, + { + "epoch": 1.0651328580686974, + "grad_norm": 0.03152133896946907, + "learning_rate": 0.00016304964219811873, + "loss": 0.2696, + "step": 13148 + }, + { + "epoch": 1.0652138690861956, + "grad_norm": 
0.04102449119091034, + "learning_rate": 0.00016304514154552412, + "loss": 0.3524, + "step": 13149 + }, + { + "epoch": 1.065294880103694, + "grad_norm": 0.03946152701973915, + "learning_rate": 0.00016304064089292948, + "loss": 0.3044, + "step": 13150 + }, + { + "epoch": 1.0653758911211926, + "grad_norm": 0.04190444201231003, + "learning_rate": 0.00016303614024033486, + "loss": 0.3705, + "step": 13151 + }, + { + "epoch": 1.0654569021386908, + "grad_norm": 0.03652665764093399, + "learning_rate": 0.00016303163958774022, + "loss": 0.3397, + "step": 13152 + }, + { + "epoch": 1.0655379131561893, + "grad_norm": 0.04137907177209854, + "learning_rate": 0.0001630271389351456, + "loss": 0.3439, + "step": 13153 + }, + { + "epoch": 1.0656189241736875, + "grad_norm": 0.03943207114934921, + "learning_rate": 0.00016302263828255097, + "loss": 0.2565, + "step": 13154 + }, + { + "epoch": 1.065699935191186, + "grad_norm": 0.04126992076635361, + "learning_rate": 0.00016301813762995636, + "loss": 0.3527, + "step": 13155 + }, + { + "epoch": 1.0657809462086845, + "grad_norm": 0.038512103259563446, + "learning_rate": 0.00016301363697736172, + "loss": 0.3271, + "step": 13156 + }, + { + "epoch": 1.0658619572261827, + "grad_norm": 0.03877699747681618, + "learning_rate": 0.0001630091363247671, + "loss": 0.3409, + "step": 13157 + }, + { + "epoch": 1.0659429682436812, + "grad_norm": 0.03787248954176903, + "learning_rate": 0.00016300463567217247, + "loss": 0.3627, + "step": 13158 + }, + { + "epoch": 1.0660239792611794, + "grad_norm": 0.0397280678153038, + "learning_rate": 0.00016300013501957785, + "loss": 0.2851, + "step": 13159 + }, + { + "epoch": 1.0661049902786779, + "grad_norm": 0.0416765958070755, + "learning_rate": 0.0001629956343669832, + "loss": 0.2748, + "step": 13160 + }, + { + "epoch": 1.0661860012961764, + "grad_norm": 0.03804435208439827, + "learning_rate": 0.0001629911337143886, + "loss": 0.3328, + "step": 13161 + }, + { + "epoch": 1.0662670123136746, + "grad_norm": 0.04016241803765297, + "learning_rate": 0.00016298663306179396, + "loss": 0.331, + "step": 13162 + }, + { + "epoch": 1.066348023331173, + "grad_norm": 0.042689867317676544, + "learning_rate": 0.00016298213240919935, + "loss": 0.344, + "step": 13163 + }, + { + "epoch": 1.0664290343486713, + "grad_norm": 0.03467041999101639, + "learning_rate": 0.00016297763175660473, + "loss": 0.2828, + "step": 13164 + }, + { + "epoch": 1.0665100453661698, + "grad_norm": 0.03881874307990074, + "learning_rate": 0.0001629731311040101, + "loss": 0.3323, + "step": 13165 + }, + { + "epoch": 1.0665910563836682, + "grad_norm": 0.04006616026163101, + "learning_rate": 0.00016296863045141545, + "loss": 0.3439, + "step": 13166 + }, + { + "epoch": 1.0666720674011665, + "grad_norm": 0.04216201975941658, + "learning_rate": 0.00016296412979882084, + "loss": 0.3469, + "step": 13167 + }, + { + "epoch": 1.066753078418665, + "grad_norm": 0.03559141233563423, + "learning_rate": 0.0001629596291462262, + "loss": 0.2801, + "step": 13168 + }, + { + "epoch": 1.0668340894361634, + "grad_norm": 0.03447365388274193, + "learning_rate": 0.0001629551284936316, + "loss": 0.3118, + "step": 13169 + }, + { + "epoch": 1.0669151004536617, + "grad_norm": 0.031914982944726944, + "learning_rate": 0.00016295062784103698, + "loss": 0.3054, + "step": 13170 + }, + { + "epoch": 1.0669961114711601, + "grad_norm": 0.04312901198863983, + "learning_rate": 0.00016294612718844234, + "loss": 0.3305, + "step": 13171 + }, + { + "epoch": 1.0670771224886584, + "grad_norm": 0.04013076052069664, + "learning_rate": 
0.0001629416265358477, + "loss": 0.331, + "step": 13172 + }, + { + "epoch": 1.0671581335061568, + "grad_norm": 0.04022202268242836, + "learning_rate": 0.00016293712588325308, + "loss": 0.3138, + "step": 13173 + }, + { + "epoch": 1.0672391445236553, + "grad_norm": 0.036136429756879807, + "learning_rate": 0.00016293262523065844, + "loss": 0.3198, + "step": 13174 + }, + { + "epoch": 1.0673201555411536, + "grad_norm": 0.0353563167154789, + "learning_rate": 0.00016292812457806383, + "loss": 0.2695, + "step": 13175 + }, + { + "epoch": 1.067401166558652, + "grad_norm": 0.03855662792921066, + "learning_rate": 0.00016292362392546922, + "loss": 0.2931, + "step": 13176 + }, + { + "epoch": 1.0674821775761503, + "grad_norm": 0.03579946234822273, + "learning_rate": 0.00016291912327287458, + "loss": 0.3143, + "step": 13177 + }, + { + "epoch": 1.0675631885936487, + "grad_norm": 0.038776103407144547, + "learning_rate": 0.00016291462262027994, + "loss": 0.3107, + "step": 13178 + }, + { + "epoch": 1.0676441996111472, + "grad_norm": 0.035354942083358765, + "learning_rate": 0.00016291012196768532, + "loss": 0.2964, + "step": 13179 + }, + { + "epoch": 1.0677252106286454, + "grad_norm": 0.03727215155959129, + "learning_rate": 0.0001629056213150907, + "loss": 0.3202, + "step": 13180 + }, + { + "epoch": 1.067806221646144, + "grad_norm": 0.041301924735307693, + "learning_rate": 0.00016290112066249607, + "loss": 0.3336, + "step": 13181 + }, + { + "epoch": 1.0678872326636422, + "grad_norm": 0.03613514080643654, + "learning_rate": 0.00016289662000990146, + "loss": 0.3225, + "step": 13182 + }, + { + "epoch": 1.0679682436811406, + "grad_norm": 0.040005408227443695, + "learning_rate": 0.00016289211935730682, + "loss": 0.3023, + "step": 13183 + }, + { + "epoch": 1.068049254698639, + "grad_norm": 0.03439958021044731, + "learning_rate": 0.00016288761870471218, + "loss": 0.2818, + "step": 13184 + }, + { + "epoch": 1.0681302657161373, + "grad_norm": 0.03871123865246773, + "learning_rate": 0.00016288311805211757, + "loss": 0.3091, + "step": 13185 + }, + { + "epoch": 1.0682112767336358, + "grad_norm": 0.04212125390768051, + "learning_rate": 0.00016287861739952295, + "loss": 0.2852, + "step": 13186 + }, + { + "epoch": 1.068292287751134, + "grad_norm": 0.03327667713165283, + "learning_rate": 0.0001628741167469283, + "loss": 0.2623, + "step": 13187 + }, + { + "epoch": 1.0683732987686325, + "grad_norm": 0.04048386961221695, + "learning_rate": 0.0001628696160943337, + "loss": 0.3345, + "step": 13188 + }, + { + "epoch": 1.068454309786131, + "grad_norm": 0.03479389473795891, + "learning_rate": 0.00016286511544173906, + "loss": 0.2986, + "step": 13189 + }, + { + "epoch": 1.0685353208036292, + "grad_norm": 0.03682103753089905, + "learning_rate": 0.00016286061478914442, + "loss": 0.3061, + "step": 13190 + }, + { + "epoch": 1.0686163318211277, + "grad_norm": 0.043244585394859314, + "learning_rate": 0.0001628561141365498, + "loss": 0.3194, + "step": 13191 + }, + { + "epoch": 1.0686973428386262, + "grad_norm": 0.04426548257470131, + "learning_rate": 0.0001628516134839552, + "loss": 0.3121, + "step": 13192 + }, + { + "epoch": 1.0687783538561244, + "grad_norm": 0.03656848147511482, + "learning_rate": 0.00016284711283136056, + "loss": 0.2951, + "step": 13193 + }, + { + "epoch": 1.0688593648736229, + "grad_norm": 0.039186250418424606, + "learning_rate": 0.00016284261217876594, + "loss": 0.328, + "step": 13194 + }, + { + "epoch": 1.0689403758911211, + "grad_norm": 0.041322581470012665, + "learning_rate": 0.0001628381115261713, + "loss": 0.3287, 
+ "step": 13195 + }, + { + "epoch": 1.0690213869086196, + "grad_norm": 0.03569335862994194, + "learning_rate": 0.00016283361087357666, + "loss": 0.2767, + "step": 13196 + }, + { + "epoch": 1.069102397926118, + "grad_norm": 0.03692762553691864, + "learning_rate": 0.00016282911022098205, + "loss": 0.3011, + "step": 13197 + }, + { + "epoch": 1.0691834089436163, + "grad_norm": 0.034691039472818375, + "learning_rate": 0.00016282460956838744, + "loss": 0.3094, + "step": 13198 + }, + { + "epoch": 1.0692644199611148, + "grad_norm": 0.037673961371183395, + "learning_rate": 0.0001628201089157928, + "loss": 0.3295, + "step": 13199 + }, + { + "epoch": 1.069345430978613, + "grad_norm": 0.03888675570487976, + "learning_rate": 0.00016281560826319818, + "loss": 0.3302, + "step": 13200 + }, + { + "epoch": 1.0694264419961115, + "grad_norm": 0.0368889719247818, + "learning_rate": 0.00016281110761060354, + "loss": 0.3081, + "step": 13201 + }, + { + "epoch": 1.06950745301361, + "grad_norm": 0.038887474685907364, + "learning_rate": 0.0001628066069580089, + "loss": 0.351, + "step": 13202 + }, + { + "epoch": 1.0695884640311082, + "grad_norm": 0.03788851201534271, + "learning_rate": 0.0001628021063054143, + "loss": 0.3169, + "step": 13203 + }, + { + "epoch": 1.0696694750486067, + "grad_norm": 0.04103012755513191, + "learning_rate": 0.00016279760565281968, + "loss": 0.3415, + "step": 13204 + }, + { + "epoch": 1.069750486066105, + "grad_norm": 0.03659411892294884, + "learning_rate": 0.00016279310500022504, + "loss": 0.3414, + "step": 13205 + }, + { + "epoch": 1.0698314970836034, + "grad_norm": 0.03779793530702591, + "learning_rate": 0.00016278860434763043, + "loss": 0.3365, + "step": 13206 + }, + { + "epoch": 1.0699125081011018, + "grad_norm": 0.03485560789704323, + "learning_rate": 0.00016278410369503579, + "loss": 0.279, + "step": 13207 + }, + { + "epoch": 1.0699935191186, + "grad_norm": 0.04025677591562271, + "learning_rate": 0.00016277960304244115, + "loss": 0.3753, + "step": 13208 + }, + { + "epoch": 1.0700745301360985, + "grad_norm": 0.03404794633388519, + "learning_rate": 0.00016277510238984653, + "loss": 0.2857, + "step": 13209 + }, + { + "epoch": 1.0701555411535968, + "grad_norm": 0.035707104951143265, + "learning_rate": 0.00016277060173725192, + "loss": 0.2874, + "step": 13210 + }, + { + "epoch": 1.0702365521710953, + "grad_norm": 0.04319519177079201, + "learning_rate": 0.00016276610108465728, + "loss": 0.2729, + "step": 13211 + }, + { + "epoch": 1.0703175631885937, + "grad_norm": 0.03902759775519371, + "learning_rate": 0.00016276160043206267, + "loss": 0.3145, + "step": 13212 + }, + { + "epoch": 1.070398574206092, + "grad_norm": 0.03968610614538193, + "learning_rate": 0.00016275709977946803, + "loss": 0.3467, + "step": 13213 + }, + { + "epoch": 1.0704795852235904, + "grad_norm": 0.035142820328474045, + "learning_rate": 0.0001627525991268734, + "loss": 0.3028, + "step": 13214 + }, + { + "epoch": 1.070560596241089, + "grad_norm": 0.04717862606048584, + "learning_rate": 0.00016274809847427877, + "loss": 0.3768, + "step": 13215 + }, + { + "epoch": 1.0706416072585871, + "grad_norm": 0.04234934598207474, + "learning_rate": 0.00016274359782168416, + "loss": 0.3106, + "step": 13216 + }, + { + "epoch": 1.0707226182760856, + "grad_norm": 0.04084145277738571, + "learning_rate": 0.00016273909716908952, + "loss": 0.3159, + "step": 13217 + }, + { + "epoch": 1.0708036292935839, + "grad_norm": 0.047838181257247925, + "learning_rate": 0.0001627345965164949, + "loss": 0.3782, + "step": 13218 + }, + { + "epoch": 
1.0708846403110823, + "grad_norm": 0.03364909067749977, + "learning_rate": 0.00016273009586390027, + "loss": 0.2973, + "step": 13219 + }, + { + "epoch": 1.0709656513285806, + "grad_norm": 0.03884413465857506, + "learning_rate": 0.00016272559521130563, + "loss": 0.3182, + "step": 13220 + }, + { + "epoch": 1.071046662346079, + "grad_norm": 0.04059368371963501, + "learning_rate": 0.00016272109455871102, + "loss": 0.3005, + "step": 13221 + }, + { + "epoch": 1.0711276733635775, + "grad_norm": 0.03587368503212929, + "learning_rate": 0.0001627165939061164, + "loss": 0.3038, + "step": 13222 + }, + { + "epoch": 1.0712086843810757, + "grad_norm": 0.04080083593726158, + "learning_rate": 0.00016271209325352176, + "loss": 0.336, + "step": 13223 + }, + { + "epoch": 1.0712896953985742, + "grad_norm": 0.03729918226599693, + "learning_rate": 0.00016270759260092715, + "loss": 0.3002, + "step": 13224 + }, + { + "epoch": 1.0713707064160727, + "grad_norm": 0.036567870527505875, + "learning_rate": 0.0001627030919483325, + "loss": 0.3065, + "step": 13225 + }, + { + "epoch": 1.071451717433571, + "grad_norm": 0.04116319119930267, + "learning_rate": 0.00016269859129573787, + "loss": 0.3131, + "step": 13226 + }, + { + "epoch": 1.0715327284510694, + "grad_norm": 0.03758919984102249, + "learning_rate": 0.00016269409064314326, + "loss": 0.3006, + "step": 13227 + }, + { + "epoch": 1.0716137394685676, + "grad_norm": 0.039732519537210464, + "learning_rate": 0.00016268958999054864, + "loss": 0.32, + "step": 13228 + }, + { + "epoch": 1.071694750486066, + "grad_norm": 0.04335511475801468, + "learning_rate": 0.000162685089337954, + "loss": 0.3034, + "step": 13229 + }, + { + "epoch": 1.0717757615035646, + "grad_norm": 0.034147460013628006, + "learning_rate": 0.0001626805886853594, + "loss": 0.2718, + "step": 13230 + }, + { + "epoch": 1.0718567725210628, + "grad_norm": 0.03723817691206932, + "learning_rate": 0.00016267608803276475, + "loss": 0.3082, + "step": 13231 + }, + { + "epoch": 1.0719377835385613, + "grad_norm": 0.03524041175842285, + "learning_rate": 0.00016267158738017014, + "loss": 0.2836, + "step": 13232 + }, + { + "epoch": 1.0720187945560595, + "grad_norm": 0.040226735174655914, + "learning_rate": 0.00016266708672757553, + "loss": 0.2881, + "step": 13233 + }, + { + "epoch": 1.072099805573558, + "grad_norm": 0.033649519085884094, + "learning_rate": 0.00016266258607498089, + "loss": 0.2919, + "step": 13234 + }, + { + "epoch": 1.0721808165910565, + "grad_norm": 0.03617114946246147, + "learning_rate": 0.00016265808542238625, + "loss": 0.2749, + "step": 13235 + }, + { + "epoch": 1.0722618276085547, + "grad_norm": 0.03616161271929741, + "learning_rate": 0.00016265358476979163, + "loss": 0.2836, + "step": 13236 + }, + { + "epoch": 1.0723428386260532, + "grad_norm": 0.0450529009103775, + "learning_rate": 0.000162649084117197, + "loss": 0.3569, + "step": 13237 + }, + { + "epoch": 1.0724238496435514, + "grad_norm": 0.03892510384321213, + "learning_rate": 0.00016264458346460238, + "loss": 0.3192, + "step": 13238 + }, + { + "epoch": 1.0725048606610499, + "grad_norm": 0.04008268192410469, + "learning_rate": 0.00016264008281200777, + "loss": 0.3062, + "step": 13239 + }, + { + "epoch": 1.0725858716785484, + "grad_norm": 0.03673470765352249, + "learning_rate": 0.00016263558215941313, + "loss": 0.3093, + "step": 13240 + }, + { + "epoch": 1.0726668826960466, + "grad_norm": 0.03691423684358597, + "learning_rate": 0.0001626310815068185, + "loss": 0.2929, + "step": 13241 + }, + { + "epoch": 1.072747893713545, + "grad_norm": 
0.03831551969051361, + "learning_rate": 0.00016262658085422388, + "loss": 0.2825, + "step": 13242 + }, + { + "epoch": 1.0728289047310433, + "grad_norm": 0.036802027374506, + "learning_rate": 0.00016262208020162924, + "loss": 0.3217, + "step": 13243 + }, + { + "epoch": 1.0729099157485418, + "grad_norm": 0.04076598584651947, + "learning_rate": 0.00016261757954903462, + "loss": 0.3348, + "step": 13244 + }, + { + "epoch": 1.0729909267660402, + "grad_norm": 0.03657490387558937, + "learning_rate": 0.00016261307889644, + "loss": 0.3152, + "step": 13245 + }, + { + "epoch": 1.0730719377835385, + "grad_norm": 0.03813399747014046, + "learning_rate": 0.00016260857824384537, + "loss": 0.307, + "step": 13246 + }, + { + "epoch": 1.073152948801037, + "grad_norm": 0.03411310538649559, + "learning_rate": 0.00016260407759125073, + "loss": 0.3038, + "step": 13247 + }, + { + "epoch": 1.0732339598185354, + "grad_norm": 0.032801553606987, + "learning_rate": 0.00016259957693865612, + "loss": 0.2617, + "step": 13248 + }, + { + "epoch": 1.0733149708360337, + "grad_norm": 0.03492745757102966, + "learning_rate": 0.00016259507628606148, + "loss": 0.2922, + "step": 13249 + }, + { + "epoch": 1.0733959818535321, + "grad_norm": 0.041702572256326675, + "learning_rate": 0.00016259057563346686, + "loss": 0.3483, + "step": 13250 + }, + { + "epoch": 1.0734769928710304, + "grad_norm": 0.03449796885251999, + "learning_rate": 0.00016258607498087225, + "loss": 0.2842, + "step": 13251 + }, + { + "epoch": 1.0735580038885288, + "grad_norm": 0.03720291703939438, + "learning_rate": 0.0001625815743282776, + "loss": 0.2815, + "step": 13252 + }, + { + "epoch": 1.0736390149060273, + "grad_norm": 0.03792194649577141, + "learning_rate": 0.00016257707367568297, + "loss": 0.3124, + "step": 13253 + }, + { + "epoch": 1.0737200259235256, + "grad_norm": 0.03389739990234375, + "learning_rate": 0.00016257257302308836, + "loss": 0.3081, + "step": 13254 + }, + { + "epoch": 1.073801036941024, + "grad_norm": 0.0402398556470871, + "learning_rate": 0.00016256807237049375, + "loss": 0.32, + "step": 13255 + }, + { + "epoch": 1.0738820479585223, + "grad_norm": 0.03836144134402275, + "learning_rate": 0.0001625635717178991, + "loss": 0.2938, + "step": 13256 + }, + { + "epoch": 1.0739630589760207, + "grad_norm": 0.040930040180683136, + "learning_rate": 0.0001625590710653045, + "loss": 0.2935, + "step": 13257 + }, + { + "epoch": 1.0740440699935192, + "grad_norm": 0.038079436868429184, + "learning_rate": 0.00016255457041270985, + "loss": 0.339, + "step": 13258 + }, + { + "epoch": 1.0741250810110174, + "grad_norm": 0.03508659079670906, + "learning_rate": 0.0001625500697601152, + "loss": 0.2712, + "step": 13259 + }, + { + "epoch": 1.074206092028516, + "grad_norm": 0.03754564747214317, + "learning_rate": 0.0001625455691075206, + "loss": 0.3105, + "step": 13260 + }, + { + "epoch": 1.0742871030460142, + "grad_norm": 0.03899519145488739, + "learning_rate": 0.000162541068454926, + "loss": 0.3361, + "step": 13261 + }, + { + "epoch": 1.0743681140635126, + "grad_norm": 0.043972063809633255, + "learning_rate": 0.00016253656780233135, + "loss": 0.346, + "step": 13262 + }, + { + "epoch": 1.074449125081011, + "grad_norm": 0.03968114033341408, + "learning_rate": 0.00016253206714973673, + "loss": 0.2943, + "step": 13263 + }, + { + "epoch": 1.0745301360985093, + "grad_norm": 0.042418111115694046, + "learning_rate": 0.0001625275664971421, + "loss": 0.3463, + "step": 13264 + }, + { + "epoch": 1.0746111471160078, + "grad_norm": 0.04304884001612663, + "learning_rate": 
0.00016252306584454745, + "loss": 0.3182, + "step": 13265 + }, + { + "epoch": 1.074692158133506, + "grad_norm": 0.03539090231060982, + "learning_rate": 0.00016251856519195284, + "loss": 0.2934, + "step": 13266 + }, + { + "epoch": 1.0747731691510045, + "grad_norm": 0.045006927102804184, + "learning_rate": 0.00016251406453935823, + "loss": 0.3262, + "step": 13267 + }, + { + "epoch": 1.074854180168503, + "grad_norm": 0.036517515778541565, + "learning_rate": 0.0001625095638867636, + "loss": 0.3148, + "step": 13268 + }, + { + "epoch": 1.0749351911860012, + "grad_norm": 0.04659212380647659, + "learning_rate": 0.00016250506323416898, + "loss": 0.3616, + "step": 13269 + }, + { + "epoch": 1.0750162022034997, + "grad_norm": 0.03476553037762642, + "learning_rate": 0.00016250056258157434, + "loss": 0.2881, + "step": 13270 + }, + { + "epoch": 1.0750972132209982, + "grad_norm": 0.035790711641311646, + "learning_rate": 0.0001624960619289797, + "loss": 0.3192, + "step": 13271 + }, + { + "epoch": 1.0751782242384964, + "grad_norm": 0.039025261998176575, + "learning_rate": 0.00016249156127638508, + "loss": 0.3133, + "step": 13272 + }, + { + "epoch": 1.0752592352559949, + "grad_norm": 0.032551951706409454, + "learning_rate": 0.00016248706062379047, + "loss": 0.2713, + "step": 13273 + }, + { + "epoch": 1.0753402462734931, + "grad_norm": 0.03767121210694313, + "learning_rate": 0.00016248255997119583, + "loss": 0.3512, + "step": 13274 + }, + { + "epoch": 1.0754212572909916, + "grad_norm": 0.03417384251952171, + "learning_rate": 0.00016247805931860122, + "loss": 0.3119, + "step": 13275 + }, + { + "epoch": 1.07550226830849, + "grad_norm": 0.0347181111574173, + "learning_rate": 0.00016247355866600658, + "loss": 0.3036, + "step": 13276 + }, + { + "epoch": 1.0755832793259883, + "grad_norm": 0.03645511344075203, + "learning_rate": 0.00016246905801341194, + "loss": 0.3242, + "step": 13277 + }, + { + "epoch": 1.0756642903434868, + "grad_norm": 0.043344851583242416, + "learning_rate": 0.00016246455736081732, + "loss": 0.3296, + "step": 13278 + }, + { + "epoch": 1.075745301360985, + "grad_norm": 0.03987930715084076, + "learning_rate": 0.0001624600567082227, + "loss": 0.3501, + "step": 13279 + }, + { + "epoch": 1.0758263123784835, + "grad_norm": 0.03823409229516983, + "learning_rate": 0.00016245555605562807, + "loss": 0.2972, + "step": 13280 + }, + { + "epoch": 1.075907323395982, + "grad_norm": 0.041413746774196625, + "learning_rate": 0.00016245105540303346, + "loss": 0.3113, + "step": 13281 + }, + { + "epoch": 1.0759883344134802, + "grad_norm": 0.04083913937211037, + "learning_rate": 0.00016244655475043882, + "loss": 0.3037, + "step": 13282 + }, + { + "epoch": 1.0760693454309787, + "grad_norm": 0.03827805444598198, + "learning_rate": 0.00016244205409784418, + "loss": 0.3334, + "step": 13283 + }, + { + "epoch": 1.076150356448477, + "grad_norm": 0.03913137689232826, + "learning_rate": 0.00016243755344524957, + "loss": 0.2937, + "step": 13284 + }, + { + "epoch": 1.0762313674659754, + "grad_norm": 0.03928875923156738, + "learning_rate": 0.00016243305279265495, + "loss": 0.3421, + "step": 13285 + }, + { + "epoch": 1.0763123784834738, + "grad_norm": 0.037695445120334625, + "learning_rate": 0.00016242855214006031, + "loss": 0.3135, + "step": 13286 + }, + { + "epoch": 1.076393389500972, + "grad_norm": 0.03602603077888489, + "learning_rate": 0.0001624240514874657, + "loss": 0.3054, + "step": 13287 + }, + { + "epoch": 1.0764744005184705, + "grad_norm": 0.038182325661182404, + "learning_rate": 0.00016241955083487106, + "loss": 
0.3499, + "step": 13288 + }, + { + "epoch": 1.0765554115359688, + "grad_norm": 0.0474996492266655, + "learning_rate": 0.00016241505018227642, + "loss": 0.3918, + "step": 13289 + }, + { + "epoch": 1.0766364225534673, + "grad_norm": 0.04044167697429657, + "learning_rate": 0.0001624105495296818, + "loss": 0.3046, + "step": 13290 + }, + { + "epoch": 1.0767174335709657, + "grad_norm": 0.03613236919045448, + "learning_rate": 0.0001624060488770872, + "loss": 0.2835, + "step": 13291 + }, + { + "epoch": 1.076798444588464, + "grad_norm": 0.03918663412332535, + "learning_rate": 0.00016240154822449256, + "loss": 0.2991, + "step": 13292 + }, + { + "epoch": 1.0768794556059624, + "grad_norm": 0.043039511889219284, + "learning_rate": 0.00016239704757189794, + "loss": 0.306, + "step": 13293 + }, + { + "epoch": 1.076960466623461, + "grad_norm": 0.041325464844703674, + "learning_rate": 0.0001623925469193033, + "loss": 0.3253, + "step": 13294 + }, + { + "epoch": 1.0770414776409591, + "grad_norm": 0.038608718663454056, + "learning_rate": 0.00016238804626670866, + "loss": 0.3157, + "step": 13295 + }, + { + "epoch": 1.0771224886584576, + "grad_norm": 0.04200710356235504, + "learning_rate": 0.00016238354561411405, + "loss": 0.3496, + "step": 13296 + }, + { + "epoch": 1.0772034996759559, + "grad_norm": 0.039992254227399826, + "learning_rate": 0.00016237904496151944, + "loss": 0.3241, + "step": 13297 + }, + { + "epoch": 1.0772845106934543, + "grad_norm": 0.03987909108400345, + "learning_rate": 0.0001623745443089248, + "loss": 0.3189, + "step": 13298 + }, + { + "epoch": 1.0773655217109528, + "grad_norm": 0.04262173920869827, + "learning_rate": 0.00016237004365633018, + "loss": 0.3386, + "step": 13299 + }, + { + "epoch": 1.077446532728451, + "grad_norm": 0.03864779695868492, + "learning_rate": 0.00016236554300373554, + "loss": 0.3103, + "step": 13300 + }, + { + "epoch": 1.0775275437459495, + "grad_norm": 0.03877343609929085, + "learning_rate": 0.0001623610423511409, + "loss": 0.3345, + "step": 13301 + }, + { + "epoch": 1.0776085547634477, + "grad_norm": 0.04017069935798645, + "learning_rate": 0.00016235654169854632, + "loss": 0.3255, + "step": 13302 + }, + { + "epoch": 1.0776895657809462, + "grad_norm": 0.0396341048181057, + "learning_rate": 0.00016235204104595168, + "loss": 0.3445, + "step": 13303 + }, + { + "epoch": 1.0777705767984447, + "grad_norm": 0.03865274041891098, + "learning_rate": 0.00016234754039335704, + "loss": 0.2811, + "step": 13304 + }, + { + "epoch": 1.077851587815943, + "grad_norm": 0.03512314707040787, + "learning_rate": 0.00016234303974076243, + "loss": 0.2475, + "step": 13305 + }, + { + "epoch": 1.0779325988334414, + "grad_norm": 0.04257293790578842, + "learning_rate": 0.00016233853908816779, + "loss": 0.3237, + "step": 13306 + }, + { + "epoch": 1.0780136098509396, + "grad_norm": 0.03783601522445679, + "learning_rate": 0.00016233403843557317, + "loss": 0.2938, + "step": 13307 + }, + { + "epoch": 1.078094620868438, + "grad_norm": 0.03904139623045921, + "learning_rate": 0.00016232953778297856, + "loss": 0.2985, + "step": 13308 + }, + { + "epoch": 1.0781756318859366, + "grad_norm": 0.037982627749443054, + "learning_rate": 0.00016232503713038392, + "loss": 0.2701, + "step": 13309 + }, + { + "epoch": 1.0782566429034348, + "grad_norm": 0.03868936002254486, + "learning_rate": 0.00016232053647778928, + "loss": 0.2822, + "step": 13310 + }, + { + "epoch": 1.0783376539209333, + "grad_norm": 0.044151484966278076, + "learning_rate": 0.00016231603582519467, + "loss": 0.3327, + "step": 13311 + }, + { + 
"epoch": 1.0784186649384315, + "grad_norm": 0.04237937554717064, + "learning_rate": 0.00016231153517260003, + "loss": 0.3746, + "step": 13312 + }, + { + "epoch": 1.07849967595593, + "grad_norm": 0.041088610887527466, + "learning_rate": 0.00016230703452000541, + "loss": 0.3689, + "step": 13313 + }, + { + "epoch": 1.0785806869734285, + "grad_norm": 0.039148733019828796, + "learning_rate": 0.0001623025338674108, + "loss": 0.3415, + "step": 13314 + }, + { + "epoch": 1.0786616979909267, + "grad_norm": 0.04008744657039642, + "learning_rate": 0.00016229803321481616, + "loss": 0.3418, + "step": 13315 + }, + { + "epoch": 1.0787427090084252, + "grad_norm": 0.03863966092467308, + "learning_rate": 0.00016229353256222152, + "loss": 0.3719, + "step": 13316 + }, + { + "epoch": 1.0788237200259236, + "grad_norm": 0.04270761087536812, + "learning_rate": 0.0001622890319096269, + "loss": 0.338, + "step": 13317 + }, + { + "epoch": 1.0789047310434219, + "grad_norm": 0.03537660092115402, + "learning_rate": 0.00016228453125703227, + "loss": 0.3262, + "step": 13318 + }, + { + "epoch": 1.0789857420609203, + "grad_norm": 0.04052428528666496, + "learning_rate": 0.00016228003060443766, + "loss": 0.3001, + "step": 13319 + }, + { + "epoch": 1.0790667530784186, + "grad_norm": 0.03691032528877258, + "learning_rate": 0.00016227552995184304, + "loss": 0.3051, + "step": 13320 + }, + { + "epoch": 1.079147764095917, + "grad_norm": 0.04182439669966698, + "learning_rate": 0.0001622710292992484, + "loss": 0.3253, + "step": 13321 + }, + { + "epoch": 1.0792287751134155, + "grad_norm": 0.04117942601442337, + "learning_rate": 0.00016226652864665376, + "loss": 0.3312, + "step": 13322 + }, + { + "epoch": 1.0793097861309138, + "grad_norm": 0.03158815950155258, + "learning_rate": 0.00016226202799405915, + "loss": 0.2854, + "step": 13323 + }, + { + "epoch": 1.0793907971484122, + "grad_norm": 0.04201388731598854, + "learning_rate": 0.0001622575273414645, + "loss": 0.341, + "step": 13324 + }, + { + "epoch": 1.0794718081659105, + "grad_norm": 0.033538252115249634, + "learning_rate": 0.0001622530266888699, + "loss": 0.2677, + "step": 13325 + }, + { + "epoch": 1.079552819183409, + "grad_norm": 0.03603677079081535, + "learning_rate": 0.00016224852603627528, + "loss": 0.2832, + "step": 13326 + }, + { + "epoch": 1.0796338302009074, + "grad_norm": 0.04016566649079323, + "learning_rate": 0.00016224402538368065, + "loss": 0.2885, + "step": 13327 + }, + { + "epoch": 1.0797148412184057, + "grad_norm": 0.031220566481351852, + "learning_rate": 0.000162239524731086, + "loss": 0.262, + "step": 13328 + }, + { + "epoch": 1.0797958522359041, + "grad_norm": 0.044721174985170364, + "learning_rate": 0.0001622350240784914, + "loss": 0.3727, + "step": 13329 + }, + { + "epoch": 1.0798768632534024, + "grad_norm": 0.034470535814762115, + "learning_rate": 0.00016223052342589675, + "loss": 0.291, + "step": 13330 + }, + { + "epoch": 1.0799578742709008, + "grad_norm": 0.04218203201889992, + "learning_rate": 0.00016222602277330214, + "loss": 0.3315, + "step": 13331 + }, + { + "epoch": 1.0800388852883993, + "grad_norm": 0.04010971263051033, + "learning_rate": 0.00016222152212070753, + "loss": 0.3192, + "step": 13332 + }, + { + "epoch": 1.0801198963058976, + "grad_norm": 0.03233395144343376, + "learning_rate": 0.0001622170214681129, + "loss": 0.3063, + "step": 13333 + }, + { + "epoch": 1.080200907323396, + "grad_norm": 0.037743669003248215, + "learning_rate": 0.00016221252081551825, + "loss": 0.3124, + "step": 13334 + }, + { + "epoch": 1.0802819183408943, + "grad_norm": 
0.03384163975715637, + "learning_rate": 0.00016220802016292363, + "loss": 0.2798, + "step": 13335 + }, + { + "epoch": 1.0803629293583927, + "grad_norm": 0.0341823548078537, + "learning_rate": 0.00016220351951032902, + "loss": 0.2978, + "step": 13336 + }, + { + "epoch": 1.0804439403758912, + "grad_norm": 0.03492417186498642, + "learning_rate": 0.00016219901885773438, + "loss": 0.2919, + "step": 13337 + }, + { + "epoch": 1.0805249513933894, + "grad_norm": 0.03947907313704491, + "learning_rate": 0.00016219451820513977, + "loss": 0.3078, + "step": 13338 + }, + { + "epoch": 1.080605962410888, + "grad_norm": 0.03812957927584648, + "learning_rate": 0.00016219001755254513, + "loss": 0.3157, + "step": 13339 + }, + { + "epoch": 1.0806869734283864, + "grad_norm": 0.04574564844369888, + "learning_rate": 0.0001621855168999505, + "loss": 0.313, + "step": 13340 + }, + { + "epoch": 1.0807679844458846, + "grad_norm": 0.043251559138298035, + "learning_rate": 0.00016218101624735588, + "loss": 0.2827, + "step": 13341 + }, + { + "epoch": 1.080848995463383, + "grad_norm": 0.04355183616280556, + "learning_rate": 0.00016217651559476126, + "loss": 0.3519, + "step": 13342 + }, + { + "epoch": 1.0809300064808813, + "grad_norm": 0.039647556841373444, + "learning_rate": 0.00016217201494216662, + "loss": 0.2988, + "step": 13343 + }, + { + "epoch": 1.0810110174983798, + "grad_norm": 0.03978102281689644, + "learning_rate": 0.000162167514289572, + "loss": 0.3303, + "step": 13344 + }, + { + "epoch": 1.081092028515878, + "grad_norm": 0.03419504687190056, + "learning_rate": 0.00016216301363697737, + "loss": 0.3003, + "step": 13345 + }, + { + "epoch": 1.0811730395333765, + "grad_norm": 0.03994619846343994, + "learning_rate": 0.00016215851298438273, + "loss": 0.3025, + "step": 13346 + }, + { + "epoch": 1.081254050550875, + "grad_norm": 0.04877599701285362, + "learning_rate": 0.00016215401233178812, + "loss": 0.3348, + "step": 13347 + }, + { + "epoch": 1.0813350615683732, + "grad_norm": 0.03815867006778717, + "learning_rate": 0.0001621495116791935, + "loss": 0.2887, + "step": 13348 + }, + { + "epoch": 1.0814160725858717, + "grad_norm": 0.04251018539071083, + "learning_rate": 0.00016214501102659886, + "loss": 0.2907, + "step": 13349 + }, + { + "epoch": 1.0814970836033702, + "grad_norm": 0.038885097950696945, + "learning_rate": 0.00016214051037400425, + "loss": 0.3316, + "step": 13350 + }, + { + "epoch": 1.0815780946208684, + "grad_norm": 0.039103202521800995, + "learning_rate": 0.0001621360097214096, + "loss": 0.3476, + "step": 13351 + }, + { + "epoch": 1.0816591056383669, + "grad_norm": 0.036649372428655624, + "learning_rate": 0.00016213150906881497, + "loss": 0.3336, + "step": 13352 + }, + { + "epoch": 1.0817401166558651, + "grad_norm": 0.03846638649702072, + "learning_rate": 0.00016212700841622036, + "loss": 0.2873, + "step": 13353 + }, + { + "epoch": 1.0818211276733636, + "grad_norm": 0.03785313665866852, + "learning_rate": 0.00016212250776362575, + "loss": 0.3242, + "step": 13354 + }, + { + "epoch": 1.081902138690862, + "grad_norm": 0.04096831753849983, + "learning_rate": 0.0001621180071110311, + "loss": 0.3688, + "step": 13355 + }, + { + "epoch": 1.0819831497083603, + "grad_norm": 0.03899220749735832, + "learning_rate": 0.0001621135064584365, + "loss": 0.3206, + "step": 13356 + }, + { + "epoch": 1.0820641607258588, + "grad_norm": 0.031790584325790405, + "learning_rate": 0.00016210900580584185, + "loss": 0.2658, + "step": 13357 + }, + { + "epoch": 1.082145171743357, + "grad_norm": 0.043712154030799866, + "learning_rate": 
0.0001621045051532472, + "loss": 0.3022, + "step": 13358 + }, + { + "epoch": 1.0822261827608555, + "grad_norm": 0.03479669988155365, + "learning_rate": 0.0001621000045006526, + "loss": 0.34, + "step": 13359 + }, + { + "epoch": 1.082307193778354, + "grad_norm": 0.04478292167186737, + "learning_rate": 0.000162095503848058, + "loss": 0.3535, + "step": 13360 + }, + { + "epoch": 1.0823882047958522, + "grad_norm": 0.03777686879038811, + "learning_rate": 0.00016209100319546335, + "loss": 0.3142, + "step": 13361 + }, + { + "epoch": 1.0824692158133506, + "grad_norm": 0.041430700570344925, + "learning_rate": 0.00016208650254286873, + "loss": 0.3208, + "step": 13362 + }, + { + "epoch": 1.082550226830849, + "grad_norm": 0.04191647469997406, + "learning_rate": 0.0001620820018902741, + "loss": 0.3726, + "step": 13363 + }, + { + "epoch": 1.0826312378483474, + "grad_norm": 0.03765102103352547, + "learning_rate": 0.00016207750123767945, + "loss": 0.2909, + "step": 13364 + }, + { + "epoch": 1.0827122488658458, + "grad_norm": 0.03670511394739151, + "learning_rate": 0.00016207300058508484, + "loss": 0.3226, + "step": 13365 + }, + { + "epoch": 1.082793259883344, + "grad_norm": 0.03823100030422211, + "learning_rate": 0.00016206849993249023, + "loss": 0.294, + "step": 13366 + }, + { + "epoch": 1.0828742709008425, + "grad_norm": 0.04215855523943901, + "learning_rate": 0.0001620639992798956, + "loss": 0.2575, + "step": 13367 + }, + { + "epoch": 1.0829552819183408, + "grad_norm": 0.037356503307819366, + "learning_rate": 0.00016205949862730098, + "loss": 0.3122, + "step": 13368 + }, + { + "epoch": 1.0830362929358393, + "grad_norm": 0.03582943230867386, + "learning_rate": 0.00016205499797470634, + "loss": 0.2976, + "step": 13369 + }, + { + "epoch": 1.0831173039533377, + "grad_norm": 0.03671623393893242, + "learning_rate": 0.0001620504973221117, + "loss": 0.3179, + "step": 13370 + }, + { + "epoch": 1.083198314970836, + "grad_norm": 0.04530152678489685, + "learning_rate": 0.0001620459966695171, + "loss": 0.3153, + "step": 13371 + }, + { + "epoch": 1.0832793259883344, + "grad_norm": 0.04155336320400238, + "learning_rate": 0.00016204149601692247, + "loss": 0.3517, + "step": 13372 + }, + { + "epoch": 1.083360337005833, + "grad_norm": 0.036439746618270874, + "learning_rate": 0.00016203699536432783, + "loss": 0.2897, + "step": 13373 + }, + { + "epoch": 1.0834413480233311, + "grad_norm": 0.03697358816862106, + "learning_rate": 0.00016203249471173322, + "loss": 0.2568, + "step": 13374 + }, + { + "epoch": 1.0835223590408296, + "grad_norm": 0.04164021089673042, + "learning_rate": 0.00016202799405913858, + "loss": 0.3273, + "step": 13375 + }, + { + "epoch": 1.0836033700583279, + "grad_norm": 0.03787509724497795, + "learning_rate": 0.00016202349340654394, + "loss": 0.3166, + "step": 13376 + }, + { + "epoch": 1.0836843810758263, + "grad_norm": 0.0382017083466053, + "learning_rate": 0.00016201899275394935, + "loss": 0.2994, + "step": 13377 + }, + { + "epoch": 1.0837653920933248, + "grad_norm": 0.03486346825957298, + "learning_rate": 0.0001620144921013547, + "loss": 0.2928, + "step": 13378 + }, + { + "epoch": 1.083846403110823, + "grad_norm": 0.041407354176044464, + "learning_rate": 0.00016200999144876007, + "loss": 0.3383, + "step": 13379 + }, + { + "epoch": 1.0839274141283215, + "grad_norm": 0.03394337743520737, + "learning_rate": 0.00016200549079616546, + "loss": 0.3089, + "step": 13380 + }, + { + "epoch": 1.0840084251458197, + "grad_norm": 0.030327385291457176, + "learning_rate": 0.00016200099014357082, + "loss": 0.2834, + 
"step": 13381 + }, + { + "epoch": 1.0840894361633182, + "grad_norm": 0.03304734453558922, + "learning_rate": 0.00016199648949097618, + "loss": 0.3094, + "step": 13382 + }, + { + "epoch": 1.0841704471808167, + "grad_norm": 0.03882576897740364, + "learning_rate": 0.0001619919888383816, + "loss": 0.2942, + "step": 13383 + }, + { + "epoch": 1.084251458198315, + "grad_norm": 0.03737260773777962, + "learning_rate": 0.00016198748818578695, + "loss": 0.301, + "step": 13384 + }, + { + "epoch": 1.0843324692158134, + "grad_norm": 0.036476656794548035, + "learning_rate": 0.00016198298753319231, + "loss": 0.3604, + "step": 13385 + }, + { + "epoch": 1.0844134802333116, + "grad_norm": 0.042435936629772186, + "learning_rate": 0.0001619784868805977, + "loss": 0.3552, + "step": 13386 + }, + { + "epoch": 1.08449449125081, + "grad_norm": 0.04005247727036476, + "learning_rate": 0.00016197398622800306, + "loss": 0.2998, + "step": 13387 + }, + { + "epoch": 1.0845755022683086, + "grad_norm": 0.035631150007247925, + "learning_rate": 0.00016196948557540845, + "loss": 0.2897, + "step": 13388 + }, + { + "epoch": 1.0846565132858068, + "grad_norm": 0.037405405193567276, + "learning_rate": 0.00016196498492281384, + "loss": 0.351, + "step": 13389 + }, + { + "epoch": 1.0847375243033053, + "grad_norm": 0.035022035241127014, + "learning_rate": 0.0001619604842702192, + "loss": 0.2817, + "step": 13390 + }, + { + "epoch": 1.0848185353208035, + "grad_norm": 0.03630216792225838, + "learning_rate": 0.00016195598361762456, + "loss": 0.2864, + "step": 13391 + }, + { + "epoch": 1.084899546338302, + "grad_norm": 0.03898398578166962, + "learning_rate": 0.00016195148296502994, + "loss": 0.3124, + "step": 13392 + }, + { + "epoch": 1.0849805573558005, + "grad_norm": 0.04213237389922142, + "learning_rate": 0.0001619469823124353, + "loss": 0.3074, + "step": 13393 + }, + { + "epoch": 1.0850615683732987, + "grad_norm": 0.04674211144447327, + "learning_rate": 0.0001619424816598407, + "loss": 0.3913, + "step": 13394 + }, + { + "epoch": 1.0851425793907972, + "grad_norm": 0.04056910052895546, + "learning_rate": 0.00016193798100724608, + "loss": 0.3173, + "step": 13395 + }, + { + "epoch": 1.0852235904082956, + "grad_norm": 0.038967233151197433, + "learning_rate": 0.00016193348035465144, + "loss": 0.3144, + "step": 13396 + }, + { + "epoch": 1.0853046014257939, + "grad_norm": 0.04120781272649765, + "learning_rate": 0.0001619289797020568, + "loss": 0.3032, + "step": 13397 + }, + { + "epoch": 1.0853856124432923, + "grad_norm": 0.04656161367893219, + "learning_rate": 0.00016192447904946218, + "loss": 0.3592, + "step": 13398 + }, + { + "epoch": 1.0854666234607906, + "grad_norm": 0.047699473798274994, + "learning_rate": 0.00016191997839686754, + "loss": 0.3621, + "step": 13399 + }, + { + "epoch": 1.085547634478289, + "grad_norm": 0.051550593227148056, + "learning_rate": 0.00016191547774427293, + "loss": 0.3737, + "step": 13400 + }, + { + "epoch": 1.0856286454957875, + "grad_norm": 0.0335693359375, + "learning_rate": 0.00016191097709167832, + "loss": 0.2983, + "step": 13401 + }, + { + "epoch": 1.0857096565132858, + "grad_norm": 0.03921520337462425, + "learning_rate": 0.00016190647643908368, + "loss": 0.312, + "step": 13402 + }, + { + "epoch": 1.0857906675307842, + "grad_norm": 0.040765002369880676, + "learning_rate": 0.00016190197578648904, + "loss": 0.3251, + "step": 13403 + }, + { + "epoch": 1.0858716785482825, + "grad_norm": 0.04020378366112709, + "learning_rate": 0.00016189747513389443, + "loss": 0.3174, + "step": 13404 + }, + { + "epoch": 
1.085952689565781, + "grad_norm": 0.035458121448755264, + "learning_rate": 0.00016189297448129979, + "loss": 0.2916, + "step": 13405 + }, + { + "epoch": 1.0860337005832794, + "grad_norm": 0.03757420554757118, + "learning_rate": 0.00016188847382870517, + "loss": 0.3191, + "step": 13406 + }, + { + "epoch": 1.0861147116007777, + "grad_norm": 0.037379421293735504, + "learning_rate": 0.00016188397317611056, + "loss": 0.3095, + "step": 13407 + }, + { + "epoch": 1.0861957226182761, + "grad_norm": 0.039359867572784424, + "learning_rate": 0.00016187947252351592, + "loss": 0.2978, + "step": 13408 + }, + { + "epoch": 1.0862767336357744, + "grad_norm": 0.034363653510808945, + "learning_rate": 0.00016187497187092128, + "loss": 0.2862, + "step": 13409 + }, + { + "epoch": 1.0863577446532728, + "grad_norm": 0.04078854247927666, + "learning_rate": 0.00016187047121832667, + "loss": 0.3244, + "step": 13410 + }, + { + "epoch": 1.0864387556707713, + "grad_norm": 0.04132116585969925, + "learning_rate": 0.00016186597056573203, + "loss": 0.3263, + "step": 13411 + }, + { + "epoch": 1.0865197666882696, + "grad_norm": 0.03711467981338501, + "learning_rate": 0.00016186146991313741, + "loss": 0.2796, + "step": 13412 + }, + { + "epoch": 1.086600777705768, + "grad_norm": 0.04074230045080185, + "learning_rate": 0.0001618569692605428, + "loss": 0.3382, + "step": 13413 + }, + { + "epoch": 1.0866817887232663, + "grad_norm": 0.03637443482875824, + "learning_rate": 0.00016185246860794816, + "loss": 0.2998, + "step": 13414 + }, + { + "epoch": 1.0867627997407647, + "grad_norm": 0.03684492036700249, + "learning_rate": 0.00016184796795535352, + "loss": 0.2826, + "step": 13415 + }, + { + "epoch": 1.0868438107582632, + "grad_norm": 0.035718128085136414, + "learning_rate": 0.0001618434673027589, + "loss": 0.3025, + "step": 13416 + }, + { + "epoch": 1.0869248217757614, + "grad_norm": 0.03895338252186775, + "learning_rate": 0.0001618389666501643, + "loss": 0.3109, + "step": 13417 + }, + { + "epoch": 1.08700583279326, + "grad_norm": 0.04166723042726517, + "learning_rate": 0.00016183446599756966, + "loss": 0.3203, + "step": 13418 + }, + { + "epoch": 1.0870868438107584, + "grad_norm": 0.04138009995222092, + "learning_rate": 0.00016182996534497504, + "loss": 0.316, + "step": 13419 + }, + { + "epoch": 1.0871678548282566, + "grad_norm": 0.04135255515575409, + "learning_rate": 0.0001618254646923804, + "loss": 0.3207, + "step": 13420 + }, + { + "epoch": 1.087248865845755, + "grad_norm": 0.03714308515191078, + "learning_rate": 0.00016182096403978576, + "loss": 0.2905, + "step": 13421 + }, + { + "epoch": 1.0873298768632533, + "grad_norm": 0.04605689272284508, + "learning_rate": 0.00016181646338719115, + "loss": 0.3506, + "step": 13422 + }, + { + "epoch": 1.0874108878807518, + "grad_norm": 0.03826843202114105, + "learning_rate": 0.00016181196273459654, + "loss": 0.3247, + "step": 13423 + }, + { + "epoch": 1.0874918988982503, + "grad_norm": 0.04529668018221855, + "learning_rate": 0.0001618074620820019, + "loss": 0.321, + "step": 13424 + }, + { + "epoch": 1.0875729099157485, + "grad_norm": 0.03837461769580841, + "learning_rate": 0.00016180296142940729, + "loss": 0.314, + "step": 13425 + }, + { + "epoch": 1.087653920933247, + "grad_norm": 0.04195813834667206, + "learning_rate": 0.00016179846077681265, + "loss": 0.3204, + "step": 13426 + }, + { + "epoch": 1.0877349319507452, + "grad_norm": 0.03744436800479889, + "learning_rate": 0.000161793960124218, + "loss": 0.3355, + "step": 13427 + }, + { + "epoch": 1.0878159429682437, + "grad_norm": 
0.04252445697784424, + "learning_rate": 0.0001617894594716234, + "loss": 0.3436, + "step": 13428 + }, + { + "epoch": 1.0878969539857422, + "grad_norm": 0.03918018937110901, + "learning_rate": 0.00016178495881902878, + "loss": 0.316, + "step": 13429 + }, + { + "epoch": 1.0879779650032404, + "grad_norm": 0.036307670176029205, + "learning_rate": 0.00016178045816643414, + "loss": 0.2882, + "step": 13430 + }, + { + "epoch": 1.0880589760207389, + "grad_norm": 0.03624377399682999, + "learning_rate": 0.00016177595751383953, + "loss": 0.2911, + "step": 13431 + }, + { + "epoch": 1.088139987038237, + "grad_norm": 0.044146664440631866, + "learning_rate": 0.0001617714568612449, + "loss": 0.3389, + "step": 13432 + }, + { + "epoch": 1.0882209980557356, + "grad_norm": 0.0352458581328392, + "learning_rate": 0.00016176695620865025, + "loss": 0.2871, + "step": 13433 + }, + { + "epoch": 1.088302009073234, + "grad_norm": 0.038899607956409454, + "learning_rate": 0.00016176245555605563, + "loss": 0.2902, + "step": 13434 + }, + { + "epoch": 1.0883830200907323, + "grad_norm": 0.037828993052244186, + "learning_rate": 0.00016175795490346102, + "loss": 0.3268, + "step": 13435 + }, + { + "epoch": 1.0884640311082308, + "grad_norm": 0.03922426328063011, + "learning_rate": 0.00016175345425086638, + "loss": 0.3154, + "step": 13436 + }, + { + "epoch": 1.088545042125729, + "grad_norm": 0.037640493363142014, + "learning_rate": 0.00016174895359827177, + "loss": 0.325, + "step": 13437 + }, + { + "epoch": 1.0886260531432275, + "grad_norm": 0.04123581945896149, + "learning_rate": 0.00016174445294567713, + "loss": 0.3023, + "step": 13438 + }, + { + "epoch": 1.088707064160726, + "grad_norm": 0.038537222892045975, + "learning_rate": 0.0001617399522930825, + "loss": 0.2977, + "step": 13439 + }, + { + "epoch": 1.0887880751782242, + "grad_norm": 0.037821464240550995, + "learning_rate": 0.0001617354516404879, + "loss": 0.3106, + "step": 13440 + }, + { + "epoch": 1.0888690861957226, + "grad_norm": 0.035032059997320175, + "learning_rate": 0.00016173095098789326, + "loss": 0.3369, + "step": 13441 + }, + { + "epoch": 1.0889500972132211, + "grad_norm": 0.037530556321144104, + "learning_rate": 0.00016172645033529862, + "loss": 0.3109, + "step": 13442 + }, + { + "epoch": 1.0890311082307194, + "grad_norm": 0.042946796864271164, + "learning_rate": 0.000161721949682704, + "loss": 0.3547, + "step": 13443 + }, + { + "epoch": 1.0891121192482178, + "grad_norm": 0.0371532179415226, + "learning_rate": 0.00016171744903010937, + "loss": 0.3108, + "step": 13444 + }, + { + "epoch": 1.089193130265716, + "grad_norm": 0.03632621839642525, + "learning_rate": 0.00016171294837751473, + "loss": 0.3438, + "step": 13445 + }, + { + "epoch": 1.0892741412832145, + "grad_norm": 0.039438121020793915, + "learning_rate": 0.00016170844772492014, + "loss": 0.3297, + "step": 13446 + }, + { + "epoch": 1.0893551523007128, + "grad_norm": 0.03756951540708542, + "learning_rate": 0.0001617039470723255, + "loss": 0.2914, + "step": 13447 + }, + { + "epoch": 1.0894361633182112, + "grad_norm": 0.0482478141784668, + "learning_rate": 0.00016169944641973086, + "loss": 0.3197, + "step": 13448 + }, + { + "epoch": 1.0895171743357097, + "grad_norm": 0.03478642553091049, + "learning_rate": 0.00016169494576713625, + "loss": 0.3024, + "step": 13449 + }, + { + "epoch": 1.089598185353208, + "grad_norm": 0.04219109192490578, + "learning_rate": 0.0001616904451145416, + "loss": 0.3742, + "step": 13450 + }, + { + "epoch": 1.0896791963707064, + "grad_norm": 0.04149060696363449, + "learning_rate": 
0.00016168594446194697, + "loss": 0.3444, + "step": 13451 + }, + { + "epoch": 1.089760207388205, + "grad_norm": 0.04618578031659126, + "learning_rate": 0.00016168144380935239, + "loss": 0.328, + "step": 13452 + }, + { + "epoch": 1.0898412184057031, + "grad_norm": 0.04148811101913452, + "learning_rate": 0.00016167694315675775, + "loss": 0.3284, + "step": 13453 + }, + { + "epoch": 1.0899222294232016, + "grad_norm": 0.03678377345204353, + "learning_rate": 0.0001616724425041631, + "loss": 0.3237, + "step": 13454 + }, + { + "epoch": 1.0900032404406998, + "grad_norm": 0.039635706692934036, + "learning_rate": 0.0001616679418515685, + "loss": 0.3288, + "step": 13455 + }, + { + "epoch": 1.0900842514581983, + "grad_norm": 0.03519574925303459, + "learning_rate": 0.00016166344119897385, + "loss": 0.2836, + "step": 13456 + }, + { + "epoch": 1.0901652624756968, + "grad_norm": 0.032641809433698654, + "learning_rate": 0.0001616589405463792, + "loss": 0.2589, + "step": 13457 + }, + { + "epoch": 1.090246273493195, + "grad_norm": 0.037794966250658035, + "learning_rate": 0.00016165443989378463, + "loss": 0.297, + "step": 13458 + }, + { + "epoch": 1.0903272845106935, + "grad_norm": 0.031619079411029816, + "learning_rate": 0.00016164993924119, + "loss": 0.2983, + "step": 13459 + }, + { + "epoch": 1.0904082955281917, + "grad_norm": 0.03748596832156181, + "learning_rate": 0.00016164543858859535, + "loss": 0.309, + "step": 13460 + }, + { + "epoch": 1.0904893065456902, + "grad_norm": 0.039078161120414734, + "learning_rate": 0.00016164093793600073, + "loss": 0.2891, + "step": 13461 + }, + { + "epoch": 1.0905703175631887, + "grad_norm": 0.03955300524830818, + "learning_rate": 0.0001616364372834061, + "loss": 0.3721, + "step": 13462 + }, + { + "epoch": 1.090651328580687, + "grad_norm": 0.044077832251787186, + "learning_rate": 0.00016163193663081146, + "loss": 0.3641, + "step": 13463 + }, + { + "epoch": 1.0907323395981854, + "grad_norm": 0.036638155579566956, + "learning_rate": 0.00016162743597821687, + "loss": 0.2976, + "step": 13464 + }, + { + "epoch": 1.0908133506156836, + "grad_norm": 0.040555648505687714, + "learning_rate": 0.00016162293532562223, + "loss": 0.3259, + "step": 13465 + }, + { + "epoch": 1.090894361633182, + "grad_norm": 0.03348960727453232, + "learning_rate": 0.0001616184346730276, + "loss": 0.2584, + "step": 13466 + }, + { + "epoch": 1.0909753726506806, + "grad_norm": 0.04036477208137512, + "learning_rate": 0.00016161393402043298, + "loss": 0.298, + "step": 13467 + }, + { + "epoch": 1.0910563836681788, + "grad_norm": 0.03749343752861023, + "learning_rate": 0.00016160943336783834, + "loss": 0.3002, + "step": 13468 + }, + { + "epoch": 1.0911373946856773, + "grad_norm": 0.036856718361377716, + "learning_rate": 0.00016160493271524372, + "loss": 0.2934, + "step": 13469 + }, + { + "epoch": 1.0912184057031755, + "grad_norm": 0.03889523446559906, + "learning_rate": 0.0001616004320626491, + "loss": 0.315, + "step": 13470 + }, + { + "epoch": 1.091299416720674, + "grad_norm": 0.04326172545552254, + "learning_rate": 0.00016159593141005447, + "loss": 0.324, + "step": 13471 + }, + { + "epoch": 1.0913804277381725, + "grad_norm": 0.037450067698955536, + "learning_rate": 0.00016159143075745983, + "loss": 0.2916, + "step": 13472 + }, + { + "epoch": 1.0914614387556707, + "grad_norm": 0.03876281529664993, + "learning_rate": 0.00016158693010486522, + "loss": 0.3034, + "step": 13473 + }, + { + "epoch": 1.0915424497731692, + "grad_norm": 0.04315292090177536, + "learning_rate": 0.00016158242945227058, + "loss": 0.3361, + 
"step": 13474 + }, + { + "epoch": 1.0916234607906676, + "grad_norm": 0.063300721347332, + "learning_rate": 0.00016157792879967597, + "loss": 0.3916, + "step": 13475 + }, + { + "epoch": 1.0917044718081659, + "grad_norm": 0.04203910008072853, + "learning_rate": 0.00016157342814708135, + "loss": 0.364, + "step": 13476 + }, + { + "epoch": 1.0917854828256643, + "grad_norm": 0.03829997405409813, + "learning_rate": 0.0001615689274944867, + "loss": 0.2792, + "step": 13477 + }, + { + "epoch": 1.0918664938431626, + "grad_norm": 0.035567138344049454, + "learning_rate": 0.00016156442684189207, + "loss": 0.302, + "step": 13478 + }, + { + "epoch": 1.091947504860661, + "grad_norm": 0.03505342826247215, + "learning_rate": 0.00016155992618929746, + "loss": 0.2906, + "step": 13479 + }, + { + "epoch": 1.0920285158781595, + "grad_norm": 0.03667743504047394, + "learning_rate": 0.00016155542553670282, + "loss": 0.2909, + "step": 13480 + }, + { + "epoch": 1.0921095268956578, + "grad_norm": 0.03935784846544266, + "learning_rate": 0.0001615509248841082, + "loss": 0.3509, + "step": 13481 + }, + { + "epoch": 1.0921905379131562, + "grad_norm": 0.039088696241378784, + "learning_rate": 0.0001615464242315136, + "loss": 0.2943, + "step": 13482 + }, + { + "epoch": 1.0922715489306545, + "grad_norm": 0.04239175096154213, + "learning_rate": 0.00016154192357891895, + "loss": 0.3225, + "step": 13483 + }, + { + "epoch": 1.092352559948153, + "grad_norm": 0.03504040092229843, + "learning_rate": 0.00016153742292632431, + "loss": 0.3069, + "step": 13484 + }, + { + "epoch": 1.0924335709656514, + "grad_norm": 0.03937983140349388, + "learning_rate": 0.0001615329222737297, + "loss": 0.3103, + "step": 13485 + }, + { + "epoch": 1.0925145819831497, + "grad_norm": 0.03920026496052742, + "learning_rate": 0.00016152842162113506, + "loss": 0.2984, + "step": 13486 + }, + { + "epoch": 1.0925955930006481, + "grad_norm": 0.038286179304122925, + "learning_rate": 0.00016152392096854045, + "loss": 0.3066, + "step": 13487 + }, + { + "epoch": 1.0926766040181464, + "grad_norm": 0.03593042120337486, + "learning_rate": 0.00016151942031594584, + "loss": 0.3042, + "step": 13488 + }, + { + "epoch": 1.0927576150356448, + "grad_norm": 0.03297584876418114, + "learning_rate": 0.0001615149196633512, + "loss": 0.2847, + "step": 13489 + }, + { + "epoch": 1.0928386260531433, + "grad_norm": 0.047829046845436096, + "learning_rate": 0.00016151041901075656, + "loss": 0.316, + "step": 13490 + }, + { + "epoch": 1.0929196370706415, + "grad_norm": 0.043388236314058304, + "learning_rate": 0.00016150591835816194, + "loss": 0.3403, + "step": 13491 + }, + { + "epoch": 1.09300064808814, + "grad_norm": 0.04012511670589447, + "learning_rate": 0.00016150141770556733, + "loss": 0.3023, + "step": 13492 + }, + { + "epoch": 1.0930816591056383, + "grad_norm": 0.04274914413690567, + "learning_rate": 0.0001614969170529727, + "loss": 0.3708, + "step": 13493 + }, + { + "epoch": 1.0931626701231367, + "grad_norm": 0.036317285150289536, + "learning_rate": 0.00016149241640037808, + "loss": 0.317, + "step": 13494 + }, + { + "epoch": 1.0932436811406352, + "grad_norm": 0.043395668268203735, + "learning_rate": 0.00016148791574778344, + "loss": 0.3627, + "step": 13495 + }, + { + "epoch": 1.0933246921581334, + "grad_norm": 0.038292981684207916, + "learning_rate": 0.0001614834150951888, + "loss": 0.3077, + "step": 13496 + }, + { + "epoch": 1.093405703175632, + "grad_norm": 0.040114227682352066, + "learning_rate": 0.00016147891444259418, + "loss": 0.3092, + "step": 13497 + }, + { + "epoch": 
1.0934867141931304, + "grad_norm": 0.03980846703052521, + "learning_rate": 0.00016147441378999957, + "loss": 0.3377, + "step": 13498 + }, + { + "epoch": 1.0935677252106286, + "grad_norm": 0.03343471139669418, + "learning_rate": 0.00016146991313740493, + "loss": 0.2697, + "step": 13499 + }, + { + "epoch": 1.093648736228127, + "grad_norm": 0.03758417069911957, + "learning_rate": 0.00016146541248481032, + "loss": 0.2826, + "step": 13500 + }, + { + "epoch": 1.0937297472456253, + "grad_norm": 0.03633441776037216, + "learning_rate": 0.00016146091183221568, + "loss": 0.2761, + "step": 13501 + }, + { + "epoch": 1.0938107582631238, + "grad_norm": 0.04054310545325279, + "learning_rate": 0.00016145641117962104, + "loss": 0.303, + "step": 13502 + }, + { + "epoch": 1.0938917692806223, + "grad_norm": 0.039192259311676025, + "learning_rate": 0.00016145191052702643, + "loss": 0.3088, + "step": 13503 + }, + { + "epoch": 1.0939727802981205, + "grad_norm": 0.036303408443927765, + "learning_rate": 0.0001614474098744318, + "loss": 0.2925, + "step": 13504 + }, + { + "epoch": 1.094053791315619, + "grad_norm": 0.04174082353711128, + "learning_rate": 0.00016144290922183717, + "loss": 0.3105, + "step": 13505 + }, + { + "epoch": 1.0941348023331172, + "grad_norm": 0.03811978921294212, + "learning_rate": 0.00016143840856924256, + "loss": 0.2958, + "step": 13506 + }, + { + "epoch": 1.0942158133506157, + "grad_norm": 0.03642289713025093, + "learning_rate": 0.00016143390791664792, + "loss": 0.3037, + "step": 13507 + }, + { + "epoch": 1.0942968243681142, + "grad_norm": 0.03437235206365585, + "learning_rate": 0.00016142940726405328, + "loss": 0.2908, + "step": 13508 + }, + { + "epoch": 1.0943778353856124, + "grad_norm": 0.03871536627411842, + "learning_rate": 0.00016142490661145867, + "loss": 0.3089, + "step": 13509 + }, + { + "epoch": 1.0944588464031109, + "grad_norm": 0.04082522913813591, + "learning_rate": 0.00016142040595886406, + "loss": 0.3303, + "step": 13510 + }, + { + "epoch": 1.094539857420609, + "grad_norm": 0.04203804209828377, + "learning_rate": 0.00016141590530626942, + "loss": 0.3556, + "step": 13511 + }, + { + "epoch": 1.0946208684381076, + "grad_norm": 0.03674378991127014, + "learning_rate": 0.0001614114046536748, + "loss": 0.2966, + "step": 13512 + }, + { + "epoch": 1.094701879455606, + "grad_norm": 0.03520248457789421, + "learning_rate": 0.00016140690400108016, + "loss": 0.3171, + "step": 13513 + }, + { + "epoch": 1.0947828904731043, + "grad_norm": 0.03618206828832626, + "learning_rate": 0.00016140240334848552, + "loss": 0.279, + "step": 13514 + }, + { + "epoch": 1.0948639014906028, + "grad_norm": 0.04096698388457298, + "learning_rate": 0.0001613979026958909, + "loss": 0.2968, + "step": 13515 + }, + { + "epoch": 1.094944912508101, + "grad_norm": 0.03836210444569588, + "learning_rate": 0.0001613934020432963, + "loss": 0.2983, + "step": 13516 + }, + { + "epoch": 1.0950259235255995, + "grad_norm": 0.04501728713512421, + "learning_rate": 0.00016138890139070166, + "loss": 0.3536, + "step": 13517 + }, + { + "epoch": 1.095106934543098, + "grad_norm": 0.04326443746685982, + "learning_rate": 0.00016138440073810704, + "loss": 0.3309, + "step": 13518 + }, + { + "epoch": 1.0951879455605962, + "grad_norm": 0.044201552867889404, + "learning_rate": 0.0001613799000855124, + "loss": 0.3766, + "step": 13519 + }, + { + "epoch": 1.0952689565780946, + "grad_norm": 0.037375301122665405, + "learning_rate": 0.00016137539943291776, + "loss": 0.2803, + "step": 13520 + }, + { + "epoch": 1.095349967595593, + "grad_norm": 
0.04308674857020378, + "learning_rate": 0.00016137089878032318, + "loss": 0.3287, + "step": 13521 + }, + { + "epoch": 1.0954309786130914, + "grad_norm": 0.03787890076637268, + "learning_rate": 0.00016136639812772854, + "loss": 0.2913, + "step": 13522 + }, + { + "epoch": 1.0955119896305898, + "grad_norm": 0.03910102695226669, + "learning_rate": 0.0001613618974751339, + "loss": 0.3162, + "step": 13523 + }, + { + "epoch": 1.095593000648088, + "grad_norm": 0.04268760234117508, + "learning_rate": 0.00016135739682253929, + "loss": 0.3078, + "step": 13524 + }, + { + "epoch": 1.0956740116655865, + "grad_norm": 0.03253033012151718, + "learning_rate": 0.00016135289616994465, + "loss": 0.2824, + "step": 13525 + }, + { + "epoch": 1.095755022683085, + "grad_norm": 0.04201733320951462, + "learning_rate": 0.00016134839551735, + "loss": 0.3268, + "step": 13526 + }, + { + "epoch": 1.0958360337005832, + "grad_norm": 0.03552337363362312, + "learning_rate": 0.00016134389486475542, + "loss": 0.2847, + "step": 13527 + }, + { + "epoch": 1.0959170447180817, + "grad_norm": 0.039983950555324554, + "learning_rate": 0.00016133939421216078, + "loss": 0.336, + "step": 13528 + }, + { + "epoch": 1.09599805573558, + "grad_norm": 0.0374993234872818, + "learning_rate": 0.00016133489355956614, + "loss": 0.3133, + "step": 13529 + }, + { + "epoch": 1.0960790667530784, + "grad_norm": 0.038941461592912674, + "learning_rate": 0.00016133039290697153, + "loss": 0.3335, + "step": 13530 + }, + { + "epoch": 1.096160077770577, + "grad_norm": 0.035729698836803436, + "learning_rate": 0.0001613258922543769, + "loss": 0.3002, + "step": 13531 + }, + { + "epoch": 1.0962410887880751, + "grad_norm": 0.04123429208993912, + "learning_rate": 0.00016132139160178225, + "loss": 0.2853, + "step": 13532 + }, + { + "epoch": 1.0963220998055736, + "grad_norm": 0.04294388368725777, + "learning_rate": 0.00016131689094918766, + "loss": 0.3547, + "step": 13533 + }, + { + "epoch": 1.0964031108230718, + "grad_norm": 0.0382094569504261, + "learning_rate": 0.00016131239029659302, + "loss": 0.3267, + "step": 13534 + }, + { + "epoch": 1.0964841218405703, + "grad_norm": 0.03548954427242279, + "learning_rate": 0.00016130788964399838, + "loss": 0.3127, + "step": 13535 + }, + { + "epoch": 1.0965651328580688, + "grad_norm": 0.03575649857521057, + "learning_rate": 0.00016130338899140377, + "loss": 0.2671, + "step": 13536 + }, + { + "epoch": 1.096646143875567, + "grad_norm": 0.04112618416547775, + "learning_rate": 0.00016129888833880913, + "loss": 0.2985, + "step": 13537 + }, + { + "epoch": 1.0967271548930655, + "grad_norm": 0.03346872702240944, + "learning_rate": 0.0001612943876862145, + "loss": 0.2774, + "step": 13538 + }, + { + "epoch": 1.0968081659105637, + "grad_norm": 0.04214170202612877, + "learning_rate": 0.0001612898870336199, + "loss": 0.3526, + "step": 13539 + }, + { + "epoch": 1.0968891769280622, + "grad_norm": 0.03760145977139473, + "learning_rate": 0.00016128538638102526, + "loss": 0.3031, + "step": 13540 + }, + { + "epoch": 1.0969701879455607, + "grad_norm": 0.045291200280189514, + "learning_rate": 0.00016128088572843062, + "loss": 0.3563, + "step": 13541 + }, + { + "epoch": 1.097051198963059, + "grad_norm": 0.04029682278633118, + "learning_rate": 0.000161276385075836, + "loss": 0.3036, + "step": 13542 + }, + { + "epoch": 1.0971322099805574, + "grad_norm": 0.039196692407131195, + "learning_rate": 0.00016127188442324137, + "loss": 0.3513, + "step": 13543 + }, + { + "epoch": 1.0972132209980558, + "grad_norm": 0.034414540976285934, + "learning_rate": 
0.00016126738377064673, + "loss": 0.2933, + "step": 13544 + }, + { + "epoch": 1.097294232015554, + "grad_norm": 0.03912563994526863, + "learning_rate": 0.00016126288311805214, + "loss": 0.3291, + "step": 13545 + }, + { + "epoch": 1.0973752430330526, + "grad_norm": 0.035514701157808304, + "learning_rate": 0.0001612583824654575, + "loss": 0.2644, + "step": 13546 + }, + { + "epoch": 1.0974562540505508, + "grad_norm": 0.03862490504980087, + "learning_rate": 0.00016125388181286286, + "loss": 0.2769, + "step": 13547 + }, + { + "epoch": 1.0975372650680493, + "grad_norm": 0.03533685579895973, + "learning_rate": 0.00016124938116026825, + "loss": 0.3076, + "step": 13548 + }, + { + "epoch": 1.0976182760855475, + "grad_norm": 0.034725695848464966, + "learning_rate": 0.0001612448805076736, + "loss": 0.2808, + "step": 13549 + }, + { + "epoch": 1.097699287103046, + "grad_norm": 0.03933245316147804, + "learning_rate": 0.000161240379855079, + "loss": 0.3208, + "step": 13550 + }, + { + "epoch": 1.0977802981205445, + "grad_norm": 0.03894717991352081, + "learning_rate": 0.00016123587920248439, + "loss": 0.2945, + "step": 13551 + }, + { + "epoch": 1.0978613091380427, + "grad_norm": 0.03713586926460266, + "learning_rate": 0.00016123137854988975, + "loss": 0.3056, + "step": 13552 + }, + { + "epoch": 1.0979423201555412, + "grad_norm": 0.03865823894739151, + "learning_rate": 0.0001612268778972951, + "loss": 0.3257, + "step": 13553 + }, + { + "epoch": 1.0980233311730396, + "grad_norm": 0.03470766171813011, + "learning_rate": 0.0001612223772447005, + "loss": 0.2865, + "step": 13554 + }, + { + "epoch": 1.0981043421905379, + "grad_norm": 0.041255801916122437, + "learning_rate": 0.00016121787659210585, + "loss": 0.3433, + "step": 13555 + }, + { + "epoch": 1.0981853532080363, + "grad_norm": 0.03621998056769371, + "learning_rate": 0.00016121337593951124, + "loss": 0.3256, + "step": 13556 + }, + { + "epoch": 1.0982663642255346, + "grad_norm": 0.039620328694581985, + "learning_rate": 0.00016120887528691663, + "loss": 0.3068, + "step": 13557 + }, + { + "epoch": 1.098347375243033, + "grad_norm": 0.04105069488286972, + "learning_rate": 0.000161204374634322, + "loss": 0.3051, + "step": 13558 + }, + { + "epoch": 1.0984283862605315, + "grad_norm": 0.039884231984615326, + "learning_rate": 0.00016119987398172735, + "loss": 0.3234, + "step": 13559 + }, + { + "epoch": 1.0985093972780298, + "grad_norm": 0.03688472881913185, + "learning_rate": 0.00016119537332913274, + "loss": 0.2954, + "step": 13560 + }, + { + "epoch": 1.0985904082955282, + "grad_norm": 0.04016055911779404, + "learning_rate": 0.0001611908726765381, + "loss": 0.3341, + "step": 13561 + }, + { + "epoch": 1.0986714193130265, + "grad_norm": 0.042149852961301804, + "learning_rate": 0.00016118637202394348, + "loss": 0.3075, + "step": 13562 + }, + { + "epoch": 1.098752430330525, + "grad_norm": 0.037303440272808075, + "learning_rate": 0.00016118187137134887, + "loss": 0.2749, + "step": 13563 + }, + { + "epoch": 1.0988334413480234, + "grad_norm": 0.03796634450554848, + "learning_rate": 0.00016117737071875423, + "loss": 0.2935, + "step": 13564 + }, + { + "epoch": 1.0989144523655217, + "grad_norm": 0.03543466329574585, + "learning_rate": 0.0001611728700661596, + "loss": 0.2904, + "step": 13565 + }, + { + "epoch": 1.0989954633830201, + "grad_norm": 0.04048040136694908, + "learning_rate": 0.00016116836941356498, + "loss": 0.3239, + "step": 13566 + }, + { + "epoch": 1.0990764744005186, + "grad_norm": 0.042633056640625, + "learning_rate": 0.00016116386876097034, + "loss": 0.3353, + 
"step": 13567 + }, + { + "epoch": 1.0991574854180168, + "grad_norm": 0.03895155340433121, + "learning_rate": 0.00016115936810837572, + "loss": 0.3197, + "step": 13568 + }, + { + "epoch": 1.0992384964355153, + "grad_norm": 0.03703373670578003, + "learning_rate": 0.0001611548674557811, + "loss": 0.3239, + "step": 13569 + }, + { + "epoch": 1.0993195074530135, + "grad_norm": 0.03645946457982063, + "learning_rate": 0.00016115036680318647, + "loss": 0.3097, + "step": 13570 + }, + { + "epoch": 1.099400518470512, + "grad_norm": 0.036779873073101044, + "learning_rate": 0.00016114586615059183, + "loss": 0.3182, + "step": 13571 + }, + { + "epoch": 1.0994815294880103, + "grad_norm": 0.037688739597797394, + "learning_rate": 0.00016114136549799722, + "loss": 0.2918, + "step": 13572 + }, + { + "epoch": 1.0995625405055087, + "grad_norm": 0.03933629021048546, + "learning_rate": 0.0001611368648454026, + "loss": 0.3546, + "step": 13573 + }, + { + "epoch": 1.0996435515230072, + "grad_norm": 0.041833870112895966, + "learning_rate": 0.00016113236419280797, + "loss": 0.317, + "step": 13574 + }, + { + "epoch": 1.0997245625405054, + "grad_norm": 0.036539576947689056, + "learning_rate": 0.00016112786354021335, + "loss": 0.2978, + "step": 13575 + }, + { + "epoch": 1.099805573558004, + "grad_norm": 0.03901558741927147, + "learning_rate": 0.0001611233628876187, + "loss": 0.3173, + "step": 13576 + }, + { + "epoch": 1.0998865845755024, + "grad_norm": 0.03997525945305824, + "learning_rate": 0.00016111886223502407, + "loss": 0.3291, + "step": 13577 + }, + { + "epoch": 1.0999675955930006, + "grad_norm": 0.03499395772814751, + "learning_rate": 0.00016111436158242946, + "loss": 0.3013, + "step": 13578 + }, + { + "epoch": 1.100048606610499, + "grad_norm": 0.03280987963080406, + "learning_rate": 0.00016110986092983485, + "loss": 0.2733, + "step": 13579 + }, + { + "epoch": 1.1001296176279973, + "grad_norm": 0.0375014953315258, + "learning_rate": 0.0001611053602772402, + "loss": 0.2918, + "step": 13580 + }, + { + "epoch": 1.1002106286454958, + "grad_norm": 0.040403205901384354, + "learning_rate": 0.0001611008596246456, + "loss": 0.3348, + "step": 13581 + }, + { + "epoch": 1.1002916396629943, + "grad_norm": 0.039181552827358246, + "learning_rate": 0.00016109635897205095, + "loss": 0.2923, + "step": 13582 + }, + { + "epoch": 1.1003726506804925, + "grad_norm": 0.03221115842461586, + "learning_rate": 0.00016109185831945631, + "loss": 0.2504, + "step": 13583 + }, + { + "epoch": 1.100453661697991, + "grad_norm": 0.042914848774671555, + "learning_rate": 0.0001610873576668617, + "loss": 0.3013, + "step": 13584 + }, + { + "epoch": 1.1005346727154892, + "grad_norm": 0.04518318921327591, + "learning_rate": 0.0001610828570142671, + "loss": 0.318, + "step": 13585 + }, + { + "epoch": 1.1006156837329877, + "grad_norm": 0.03647695481777191, + "learning_rate": 0.00016107835636167245, + "loss": 0.2799, + "step": 13586 + }, + { + "epoch": 1.1006966947504861, + "grad_norm": 0.04397977516055107, + "learning_rate": 0.00016107385570907784, + "loss": 0.3168, + "step": 13587 + }, + { + "epoch": 1.1007777057679844, + "grad_norm": 0.03797837346792221, + "learning_rate": 0.0001610693550564832, + "loss": 0.2605, + "step": 13588 + }, + { + "epoch": 1.1008587167854829, + "grad_norm": 0.037378404289484024, + "learning_rate": 0.00016106485440388856, + "loss": 0.2823, + "step": 13589 + }, + { + "epoch": 1.100939727802981, + "grad_norm": 0.042138807475566864, + "learning_rate": 0.00016106035375129394, + "loss": 0.3103, + "step": 13590 + }, + { + "epoch": 
1.1010207388204796, + "grad_norm": 0.03948663920164108, + "learning_rate": 0.00016105585309869933, + "loss": 0.3197, + "step": 13591 + }, + { + "epoch": 1.101101749837978, + "grad_norm": 0.03886083886027336, + "learning_rate": 0.0001610513524461047, + "loss": 0.2976, + "step": 13592 + }, + { + "epoch": 1.1011827608554763, + "grad_norm": 0.047055937349796295, + "learning_rate": 0.00016104685179351008, + "loss": 0.3242, + "step": 13593 + }, + { + "epoch": 1.1012637718729748, + "grad_norm": 0.04041250795125961, + "learning_rate": 0.00016104235114091544, + "loss": 0.3103, + "step": 13594 + }, + { + "epoch": 1.101344782890473, + "grad_norm": 0.04913514479994774, + "learning_rate": 0.0001610378504883208, + "loss": 0.3058, + "step": 13595 + }, + { + "epoch": 1.1014257939079715, + "grad_norm": 0.04520176723599434, + "learning_rate": 0.00016103334983572618, + "loss": 0.3102, + "step": 13596 + }, + { + "epoch": 1.10150680492547, + "grad_norm": 0.0359567254781723, + "learning_rate": 0.00016102884918313157, + "loss": 0.3043, + "step": 13597 + }, + { + "epoch": 1.1015878159429682, + "grad_norm": 0.034442391246557236, + "learning_rate": 0.00016102434853053693, + "loss": 0.2853, + "step": 13598 + }, + { + "epoch": 1.1016688269604666, + "grad_norm": 0.03527701646089554, + "learning_rate": 0.00016101984787794232, + "loss": 0.2869, + "step": 13599 + }, + { + "epoch": 1.101749837977965, + "grad_norm": 0.035869915038347244, + "learning_rate": 0.00016101534722534768, + "loss": 0.3078, + "step": 13600 + }, + { + "epoch": 1.1018308489954634, + "grad_norm": 0.032918624579906464, + "learning_rate": 0.00016101084657275304, + "loss": 0.2818, + "step": 13601 + }, + { + "epoch": 1.1019118600129618, + "grad_norm": 0.03698991239070892, + "learning_rate": 0.00016100634592015845, + "loss": 0.3051, + "step": 13602 + }, + { + "epoch": 1.10199287103046, + "grad_norm": 0.03834715113043785, + "learning_rate": 0.00016100184526756381, + "loss": 0.3272, + "step": 13603 + }, + { + "epoch": 1.1020738820479585, + "grad_norm": 0.03652876615524292, + "learning_rate": 0.00016099734461496917, + "loss": 0.3024, + "step": 13604 + }, + { + "epoch": 1.102154893065457, + "grad_norm": 0.03805585205554962, + "learning_rate": 0.00016099284396237456, + "loss": 0.27, + "step": 13605 + }, + { + "epoch": 1.1022359040829552, + "grad_norm": 0.045338697731494904, + "learning_rate": 0.00016098834330977992, + "loss": 0.3537, + "step": 13606 + }, + { + "epoch": 1.1023169151004537, + "grad_norm": 0.037921786308288574, + "learning_rate": 0.00016098384265718528, + "loss": 0.3129, + "step": 13607 + }, + { + "epoch": 1.102397926117952, + "grad_norm": 0.0399080328643322, + "learning_rate": 0.0001609793420045907, + "loss": 0.3162, + "step": 13608 + }, + { + "epoch": 1.1024789371354504, + "grad_norm": 0.04143821448087692, + "learning_rate": 0.00016097484135199606, + "loss": 0.3199, + "step": 13609 + }, + { + "epoch": 1.1025599481529489, + "grad_norm": 0.03901328518986702, + "learning_rate": 0.00016097034069940142, + "loss": 0.3591, + "step": 13610 + }, + { + "epoch": 1.1026409591704471, + "grad_norm": 0.039841070771217346, + "learning_rate": 0.0001609658400468068, + "loss": 0.3459, + "step": 13611 + }, + { + "epoch": 1.1027219701879456, + "grad_norm": 0.03591349720954895, + "learning_rate": 0.00016096133939421216, + "loss": 0.2803, + "step": 13612 + }, + { + "epoch": 1.1028029812054438, + "grad_norm": 0.041079454123973846, + "learning_rate": 0.00016095683874161752, + "loss": 0.3069, + "step": 13613 + }, + { + "epoch": 1.1028839922229423, + "grad_norm": 
0.03818468004465103, + "learning_rate": 0.00016095233808902294, + "loss": 0.2928, + "step": 13614 + }, + { + "epoch": 1.1029650032404408, + "grad_norm": 0.03975100442767143, + "learning_rate": 0.0001609478374364283, + "loss": 0.3156, + "step": 13615 + }, + { + "epoch": 1.103046014257939, + "grad_norm": 0.03451533615589142, + "learning_rate": 0.00016094333678383366, + "loss": 0.2956, + "step": 13616 + }, + { + "epoch": 1.1031270252754375, + "grad_norm": 0.04473431780934334, + "learning_rate": 0.00016093883613123904, + "loss": 0.2829, + "step": 13617 + }, + { + "epoch": 1.1032080362929357, + "grad_norm": 0.045738816261291504, + "learning_rate": 0.0001609343354786444, + "loss": 0.358, + "step": 13618 + }, + { + "epoch": 1.1032890473104342, + "grad_norm": 0.046089205890893936, + "learning_rate": 0.00016092983482604976, + "loss": 0.3782, + "step": 13619 + }, + { + "epoch": 1.1033700583279327, + "grad_norm": 0.03943808749318123, + "learning_rate": 0.00016092533417345518, + "loss": 0.2896, + "step": 13620 + }, + { + "epoch": 1.103451069345431, + "grad_norm": 0.04181476682424545, + "learning_rate": 0.00016092083352086054, + "loss": 0.3226, + "step": 13621 + }, + { + "epoch": 1.1035320803629294, + "grad_norm": 0.04341307654976845, + "learning_rate": 0.0001609163328682659, + "loss": 0.3435, + "step": 13622 + }, + { + "epoch": 1.1036130913804278, + "grad_norm": 0.03791226074099541, + "learning_rate": 0.00016091183221567129, + "loss": 0.3019, + "step": 13623 + }, + { + "epoch": 1.103694102397926, + "grad_norm": 0.03913251310586929, + "learning_rate": 0.00016090733156307665, + "loss": 0.3095, + "step": 13624 + }, + { + "epoch": 1.1037751134154246, + "grad_norm": 0.03419218584895134, + "learning_rate": 0.00016090283091048203, + "loss": 0.2964, + "step": 13625 + }, + { + "epoch": 1.1038561244329228, + "grad_norm": 0.03727216273546219, + "learning_rate": 0.00016089833025788742, + "loss": 0.281, + "step": 13626 + }, + { + "epoch": 1.1039371354504213, + "grad_norm": 0.040743958204984665, + "learning_rate": 0.00016089382960529278, + "loss": 0.3185, + "step": 13627 + }, + { + "epoch": 1.1040181464679197, + "grad_norm": 0.033087439835071564, + "learning_rate": 0.00016088932895269814, + "loss": 0.2941, + "step": 13628 + }, + { + "epoch": 1.104099157485418, + "grad_norm": 0.04077974334359169, + "learning_rate": 0.00016088482830010353, + "loss": 0.3073, + "step": 13629 + }, + { + "epoch": 1.1041801685029164, + "grad_norm": 0.043515462428331375, + "learning_rate": 0.0001608803276475089, + "loss": 0.3272, + "step": 13630 + }, + { + "epoch": 1.1042611795204147, + "grad_norm": 0.041284892708063126, + "learning_rate": 0.00016087582699491427, + "loss": 0.3217, + "step": 13631 + }, + { + "epoch": 1.1043421905379132, + "grad_norm": 0.03675724193453789, + "learning_rate": 0.00016087132634231966, + "loss": 0.3003, + "step": 13632 + }, + { + "epoch": 1.1044232015554116, + "grad_norm": 0.038290172815322876, + "learning_rate": 0.00016086682568972502, + "loss": 0.3122, + "step": 13633 + }, + { + "epoch": 1.1045042125729099, + "grad_norm": 0.040265701711177826, + "learning_rate": 0.00016086232503713038, + "loss": 0.3189, + "step": 13634 + }, + { + "epoch": 1.1045852235904083, + "grad_norm": 0.03757871687412262, + "learning_rate": 0.00016085782438453577, + "loss": 0.3035, + "step": 13635 + }, + { + "epoch": 1.1046662346079066, + "grad_norm": 0.03657103329896927, + "learning_rate": 0.00016085332373194113, + "loss": 0.3059, + "step": 13636 + }, + { + "epoch": 1.104747245625405, + "grad_norm": 0.03874574601650238, + 
"learning_rate": 0.00016084882307934652, + "loss": 0.2982, + "step": 13637 + }, + { + "epoch": 1.1048282566429035, + "grad_norm": 0.03787166625261307, + "learning_rate": 0.0001608443224267519, + "loss": 0.3235, + "step": 13638 + }, + { + "epoch": 1.1049092676604018, + "grad_norm": 0.03862585499882698, + "learning_rate": 0.00016083982177415726, + "loss": 0.353, + "step": 13639 + }, + { + "epoch": 1.1049902786779002, + "grad_norm": 0.03860185667872429, + "learning_rate": 0.00016083532112156262, + "loss": 0.3144, + "step": 13640 + }, + { + "epoch": 1.1050712896953985, + "grad_norm": 0.03666646033525467, + "learning_rate": 0.000160830820468968, + "loss": 0.3328, + "step": 13641 + }, + { + "epoch": 1.105152300712897, + "grad_norm": 0.036767471581697464, + "learning_rate": 0.00016082631981637337, + "loss": 0.2915, + "step": 13642 + }, + { + "epoch": 1.1052333117303954, + "grad_norm": 0.03964807093143463, + "learning_rate": 0.00016082181916377876, + "loss": 0.3609, + "step": 13643 + }, + { + "epoch": 1.1053143227478937, + "grad_norm": 0.04057459905743599, + "learning_rate": 0.00016081731851118415, + "loss": 0.3036, + "step": 13644 + }, + { + "epoch": 1.1053953337653921, + "grad_norm": 0.04356246814131737, + "learning_rate": 0.0001608128178585895, + "loss": 0.3394, + "step": 13645 + }, + { + "epoch": 1.1054763447828906, + "grad_norm": 0.03935835137963295, + "learning_rate": 0.00016080831720599487, + "loss": 0.3399, + "step": 13646 + }, + { + "epoch": 1.1055573558003888, + "grad_norm": 0.03753943741321564, + "learning_rate": 0.00016080381655340025, + "loss": 0.2971, + "step": 13647 + }, + { + "epoch": 1.1056383668178873, + "grad_norm": 0.03873160481452942, + "learning_rate": 0.0001607993159008056, + "loss": 0.3276, + "step": 13648 + }, + { + "epoch": 1.1057193778353855, + "grad_norm": 0.0393451452255249, + "learning_rate": 0.000160794815248211, + "loss": 0.3387, + "step": 13649 + }, + { + "epoch": 1.105800388852884, + "grad_norm": 0.038663193583488464, + "learning_rate": 0.0001607903145956164, + "loss": 0.3553, + "step": 13650 + }, + { + "epoch": 1.1058813998703825, + "grad_norm": 0.04521648958325386, + "learning_rate": 0.00016078581394302175, + "loss": 0.3386, + "step": 13651 + }, + { + "epoch": 1.1059624108878807, + "grad_norm": 0.044531892985105515, + "learning_rate": 0.0001607813132904271, + "loss": 0.3004, + "step": 13652 + }, + { + "epoch": 1.1060434219053792, + "grad_norm": 0.037462715059518814, + "learning_rate": 0.0001607768126378325, + "loss": 0.3165, + "step": 13653 + }, + { + "epoch": 1.1061244329228774, + "grad_norm": 0.041082963347435, + "learning_rate": 0.00016077231198523788, + "loss": 0.3294, + "step": 13654 + }, + { + "epoch": 1.106205443940376, + "grad_norm": 0.039622195065021515, + "learning_rate": 0.00016076781133264324, + "loss": 0.2801, + "step": 13655 + }, + { + "epoch": 1.1062864549578744, + "grad_norm": 0.04551248252391815, + "learning_rate": 0.00016076331068004863, + "loss": 0.2985, + "step": 13656 + }, + { + "epoch": 1.1063674659753726, + "grad_norm": 0.03860415145754814, + "learning_rate": 0.000160758810027454, + "loss": 0.3012, + "step": 13657 + }, + { + "epoch": 1.106448476992871, + "grad_norm": 0.03250998258590698, + "learning_rate": 0.00016075430937485935, + "loss": 0.2973, + "step": 13658 + }, + { + "epoch": 1.1065294880103693, + "grad_norm": 0.03704884275794029, + "learning_rate": 0.00016074980872226474, + "loss": 0.2773, + "step": 13659 + }, + { + "epoch": 1.1066104990278678, + "grad_norm": 0.0459744855761528, + "learning_rate": 0.00016074530806967012, + "loss": 
0.3019, + "step": 13660 + }, + { + "epoch": 1.1066915100453663, + "grad_norm": 0.03988320380449295, + "learning_rate": 0.00016074080741707548, + "loss": 0.324, + "step": 13661 + }, + { + "epoch": 1.1067725210628645, + "grad_norm": 0.03656921163201332, + "learning_rate": 0.00016073630676448087, + "loss": 0.2909, + "step": 13662 + }, + { + "epoch": 1.106853532080363, + "grad_norm": 0.03722751513123512, + "learning_rate": 0.00016073180611188623, + "loss": 0.329, + "step": 13663 + }, + { + "epoch": 1.1069345430978612, + "grad_norm": 0.03713303059339523, + "learning_rate": 0.0001607273054592916, + "loss": 0.3059, + "step": 13664 + }, + { + "epoch": 1.1070155541153597, + "grad_norm": 0.04212358966469765, + "learning_rate": 0.00016072280480669698, + "loss": 0.3593, + "step": 13665 + }, + { + "epoch": 1.1070965651328581, + "grad_norm": 0.04464373737573624, + "learning_rate": 0.00016071830415410236, + "loss": 0.3548, + "step": 13666 + }, + { + "epoch": 1.1071775761503564, + "grad_norm": 0.040497563779354095, + "learning_rate": 0.00016071380350150772, + "loss": 0.2827, + "step": 13667 + }, + { + "epoch": 1.1072585871678549, + "grad_norm": 0.0430973619222641, + "learning_rate": 0.0001607093028489131, + "loss": 0.2946, + "step": 13668 + }, + { + "epoch": 1.1073395981853533, + "grad_norm": 0.04000406712293625, + "learning_rate": 0.00016070480219631847, + "loss": 0.3266, + "step": 13669 + }, + { + "epoch": 1.1074206092028516, + "grad_norm": 0.04005291685461998, + "learning_rate": 0.00016070030154372383, + "loss": 0.3224, + "step": 13670 + }, + { + "epoch": 1.10750162022035, + "grad_norm": 0.039465539157390594, + "learning_rate": 0.00016069580089112922, + "loss": 0.3165, + "step": 13671 + }, + { + "epoch": 1.1075826312378483, + "grad_norm": 0.04171768203377724, + "learning_rate": 0.0001606913002385346, + "loss": 0.2878, + "step": 13672 + }, + { + "epoch": 1.1076636422553467, + "grad_norm": 0.03892872855067253, + "learning_rate": 0.00016068679958593997, + "loss": 0.3245, + "step": 13673 + }, + { + "epoch": 1.107744653272845, + "grad_norm": 0.03817931190133095, + "learning_rate": 0.00016068229893334535, + "loss": 0.3052, + "step": 13674 + }, + { + "epoch": 1.1078256642903435, + "grad_norm": 0.040611643344163895, + "learning_rate": 0.0001606777982807507, + "loss": 0.3325, + "step": 13675 + }, + { + "epoch": 1.107906675307842, + "grad_norm": 0.03535166755318642, + "learning_rate": 0.00016067329762815607, + "loss": 0.3035, + "step": 13676 + }, + { + "epoch": 1.1079876863253402, + "grad_norm": 0.035269174724817276, + "learning_rate": 0.00016066879697556146, + "loss": 0.2889, + "step": 13677 + }, + { + "epoch": 1.1080686973428386, + "grad_norm": 0.036208029836416245, + "learning_rate": 0.00016066429632296685, + "loss": 0.3215, + "step": 13678 + }, + { + "epoch": 1.108149708360337, + "grad_norm": 0.03432176262140274, + "learning_rate": 0.0001606597956703722, + "loss": 0.2806, + "step": 13679 + }, + { + "epoch": 1.1082307193778353, + "grad_norm": 0.048559654504060745, + "learning_rate": 0.0001606552950177776, + "loss": 0.3267, + "step": 13680 + }, + { + "epoch": 1.1083117303953338, + "grad_norm": 0.04100848361849785, + "learning_rate": 0.00016065079436518295, + "loss": 0.3196, + "step": 13681 + }, + { + "epoch": 1.108392741412832, + "grad_norm": 0.03949177637696266, + "learning_rate": 0.00016064629371258831, + "loss": 0.3001, + "step": 13682 + }, + { + "epoch": 1.1084737524303305, + "grad_norm": 0.036436039954423904, + "learning_rate": 0.00016064179305999373, + "loss": 0.3062, + "step": 13683 + }, + { + "epoch": 
1.108554763447829, + "grad_norm": 0.0427464097738266, + "learning_rate": 0.0001606372924073991, + "loss": 0.3726, + "step": 13684 + }, + { + "epoch": 1.1086357744653272, + "grad_norm": 0.050769273191690445, + "learning_rate": 0.00016063279175480445, + "loss": 0.3005, + "step": 13685 + }, + { + "epoch": 1.1087167854828257, + "grad_norm": 0.03931155800819397, + "learning_rate": 0.00016062829110220984, + "loss": 0.2743, + "step": 13686 + }, + { + "epoch": 1.108797796500324, + "grad_norm": 0.04116865247488022, + "learning_rate": 0.0001606237904496152, + "loss": 0.3373, + "step": 13687 + }, + { + "epoch": 1.1088788075178224, + "grad_norm": 0.04374229907989502, + "learning_rate": 0.00016061928979702056, + "loss": 0.3198, + "step": 13688 + }, + { + "epoch": 1.1089598185353209, + "grad_norm": 0.045593272894620895, + "learning_rate": 0.00016061478914442597, + "loss": 0.3158, + "step": 13689 + }, + { + "epoch": 1.1090408295528191, + "grad_norm": 0.043646588921546936, + "learning_rate": 0.00016061028849183133, + "loss": 0.3859, + "step": 13690 + }, + { + "epoch": 1.1091218405703176, + "grad_norm": 0.0321785993874073, + "learning_rate": 0.0001606057878392367, + "loss": 0.2515, + "step": 13691 + }, + { + "epoch": 1.1092028515878158, + "grad_norm": 0.04451699182391167, + "learning_rate": 0.00016060128718664208, + "loss": 0.333, + "step": 13692 + }, + { + "epoch": 1.1092838626053143, + "grad_norm": 0.04380195960402489, + "learning_rate": 0.00016059678653404744, + "loss": 0.3268, + "step": 13693 + }, + { + "epoch": 1.1093648736228128, + "grad_norm": 0.03785150125622749, + "learning_rate": 0.0001605922858814528, + "loss": 0.3239, + "step": 13694 + }, + { + "epoch": 1.109445884640311, + "grad_norm": 0.03865273296833038, + "learning_rate": 0.0001605877852288582, + "loss": 0.2987, + "step": 13695 + }, + { + "epoch": 1.1095268956578095, + "grad_norm": 0.034965209662914276, + "learning_rate": 0.00016058328457626357, + "loss": 0.2739, + "step": 13696 + }, + { + "epoch": 1.1096079066753077, + "grad_norm": 0.034332286566495895, + "learning_rate": 0.00016057878392366893, + "loss": 0.2636, + "step": 13697 + }, + { + "epoch": 1.1096889176928062, + "grad_norm": 0.03668433055281639, + "learning_rate": 0.00016057428327107432, + "loss": 0.3158, + "step": 13698 + }, + { + "epoch": 1.1097699287103047, + "grad_norm": 0.04302181676030159, + "learning_rate": 0.00016056978261847968, + "loss": 0.3359, + "step": 13699 + }, + { + "epoch": 1.109850939727803, + "grad_norm": 0.037046417593955994, + "learning_rate": 0.00016056528196588504, + "loss": 0.3147, + "step": 13700 + }, + { + "epoch": 1.1099319507453014, + "grad_norm": 0.037314411252737045, + "learning_rate": 0.00016056078131329045, + "loss": 0.3443, + "step": 13701 + }, + { + "epoch": 1.1100129617627998, + "grad_norm": 0.03947966545820236, + "learning_rate": 0.00016055628066069581, + "loss": 0.3145, + "step": 13702 + }, + { + "epoch": 1.110093972780298, + "grad_norm": 0.03757372498512268, + "learning_rate": 0.00016055178000810117, + "loss": 0.3245, + "step": 13703 + }, + { + "epoch": 1.1101749837977966, + "grad_norm": 0.041169408708810806, + "learning_rate": 0.00016054727935550656, + "loss": 0.3149, + "step": 13704 + }, + { + "epoch": 1.1102559948152948, + "grad_norm": 0.04169107601046562, + "learning_rate": 0.00016054277870291192, + "loss": 0.3571, + "step": 13705 + }, + { + "epoch": 1.1103370058327933, + "grad_norm": 0.03553405776619911, + "learning_rate": 0.0001605382780503173, + "loss": 0.2502, + "step": 13706 + }, + { + "epoch": 1.1104180168502917, + "grad_norm": 
0.042865827679634094, + "learning_rate": 0.0001605337773977227, + "loss": 0.3373, + "step": 13707 + }, + { + "epoch": 1.11049902786779, + "grad_norm": 0.03908531740307808, + "learning_rate": 0.00016052927674512806, + "loss": 0.3003, + "step": 13708 + }, + { + "epoch": 1.1105800388852884, + "grad_norm": 0.037927430123090744, + "learning_rate": 0.00016052477609253342, + "loss": 0.3063, + "step": 13709 + }, + { + "epoch": 1.1106610499027867, + "grad_norm": 0.04091384634375572, + "learning_rate": 0.0001605202754399388, + "loss": 0.3533, + "step": 13710 + }, + { + "epoch": 1.1107420609202852, + "grad_norm": 0.040006060153245926, + "learning_rate": 0.00016051577478734416, + "loss": 0.2912, + "step": 13711 + }, + { + "epoch": 1.1108230719377836, + "grad_norm": 0.041538480669260025, + "learning_rate": 0.00016051127413474955, + "loss": 0.3069, + "step": 13712 + }, + { + "epoch": 1.1109040829552819, + "grad_norm": 0.036893781274557114, + "learning_rate": 0.00016050677348215494, + "loss": 0.3215, + "step": 13713 + }, + { + "epoch": 1.1109850939727803, + "grad_norm": 0.03629325330257416, + "learning_rate": 0.0001605022728295603, + "loss": 0.2887, + "step": 13714 + }, + { + "epoch": 1.1110661049902786, + "grad_norm": 0.04308544471859932, + "learning_rate": 0.00016049777217696566, + "loss": 0.3463, + "step": 13715 + }, + { + "epoch": 1.111147116007777, + "grad_norm": 0.03460239991545677, + "learning_rate": 0.00016049327152437104, + "loss": 0.3038, + "step": 13716 + }, + { + "epoch": 1.1112281270252755, + "grad_norm": 0.03839297592639923, + "learning_rate": 0.0001604887708717764, + "loss": 0.3112, + "step": 13717 + }, + { + "epoch": 1.1113091380427738, + "grad_norm": 0.04267555847764015, + "learning_rate": 0.0001604842702191818, + "loss": 0.341, + "step": 13718 + }, + { + "epoch": 1.1113901490602722, + "grad_norm": 0.04148966446518898, + "learning_rate": 0.00016047976956658718, + "loss": 0.3254, + "step": 13719 + }, + { + "epoch": 1.1114711600777705, + "grad_norm": 0.0401795469224453, + "learning_rate": 0.00016047526891399254, + "loss": 0.2806, + "step": 13720 + }, + { + "epoch": 1.111552171095269, + "grad_norm": 0.0394086129963398, + "learning_rate": 0.0001604707682613979, + "loss": 0.3138, + "step": 13721 + }, + { + "epoch": 1.1116331821127674, + "grad_norm": 0.041196051985025406, + "learning_rate": 0.00016046626760880329, + "loss": 0.3121, + "step": 13722 + }, + { + "epoch": 1.1117141931302656, + "grad_norm": 0.03980787470936775, + "learning_rate": 0.00016046176695620865, + "loss": 0.2778, + "step": 13723 + }, + { + "epoch": 1.1117952041477641, + "grad_norm": 0.045564647763967514, + "learning_rate": 0.00016045726630361403, + "loss": 0.3361, + "step": 13724 + }, + { + "epoch": 1.1118762151652626, + "grad_norm": 0.04442529380321503, + "learning_rate": 0.00016045276565101942, + "loss": 0.3089, + "step": 13725 + }, + { + "epoch": 1.1119572261827608, + "grad_norm": 0.03888750821352005, + "learning_rate": 0.00016044826499842478, + "loss": 0.291, + "step": 13726 + }, + { + "epoch": 1.1120382372002593, + "grad_norm": 0.03487385809421539, + "learning_rate": 0.00016044376434583014, + "loss": 0.2892, + "step": 13727 + }, + { + "epoch": 1.1121192482177575, + "grad_norm": 0.046714432537555695, + "learning_rate": 0.00016043926369323553, + "loss": 0.3259, + "step": 13728 + }, + { + "epoch": 1.112200259235256, + "grad_norm": 0.04512115195393562, + "learning_rate": 0.0001604347630406409, + "loss": 0.3052, + "step": 13729 + }, + { + "epoch": 1.1122812702527545, + "grad_norm": 0.039746955037117004, + "learning_rate": 
0.00016043026238804627, + "loss": 0.3195, + "step": 13730 + }, + { + "epoch": 1.1123622812702527, + "grad_norm": 0.037748560309410095, + "learning_rate": 0.00016042576173545166, + "loss": 0.3181, + "step": 13731 + }, + { + "epoch": 1.1124432922877512, + "grad_norm": 0.04270657151937485, + "learning_rate": 0.00016042126108285702, + "loss": 0.3559, + "step": 13732 + }, + { + "epoch": 1.1125243033052494, + "grad_norm": 0.04279698058962822, + "learning_rate": 0.00016041676043026238, + "loss": 0.311, + "step": 13733 + }, + { + "epoch": 1.112605314322748, + "grad_norm": 0.04194524139165878, + "learning_rate": 0.00016041225977766777, + "loss": 0.2672, + "step": 13734 + }, + { + "epoch": 1.1126863253402464, + "grad_norm": 0.041094232350587845, + "learning_rate": 0.00016040775912507316, + "loss": 0.2919, + "step": 13735 + }, + { + "epoch": 1.1127673363577446, + "grad_norm": 0.03560243919491768, + "learning_rate": 0.00016040325847247852, + "loss": 0.2923, + "step": 13736 + }, + { + "epoch": 1.112848347375243, + "grad_norm": 0.0350363552570343, + "learning_rate": 0.0001603987578198839, + "loss": 0.2972, + "step": 13737 + }, + { + "epoch": 1.1129293583927413, + "grad_norm": 0.03738215193152428, + "learning_rate": 0.00016039425716728926, + "loss": 0.2931, + "step": 13738 + }, + { + "epoch": 1.1130103694102398, + "grad_norm": 0.03830656036734581, + "learning_rate": 0.00016038975651469462, + "loss": 0.3016, + "step": 13739 + }, + { + "epoch": 1.1130913804277383, + "grad_norm": 0.0368039608001709, + "learning_rate": 0.0001603852558621, + "loss": 0.2775, + "step": 13740 + }, + { + "epoch": 1.1131723914452365, + "grad_norm": 0.03418166935443878, + "learning_rate": 0.0001603807552095054, + "loss": 0.2883, + "step": 13741 + }, + { + "epoch": 1.113253402462735, + "grad_norm": 0.04323248937726021, + "learning_rate": 0.00016037625455691076, + "loss": 0.3058, + "step": 13742 + }, + { + "epoch": 1.1133344134802332, + "grad_norm": 0.03451283648610115, + "learning_rate": 0.00016037175390431615, + "loss": 0.2643, + "step": 13743 + }, + { + "epoch": 1.1134154244977317, + "grad_norm": 0.037699535489082336, + "learning_rate": 0.0001603672532517215, + "loss": 0.2845, + "step": 13744 + }, + { + "epoch": 1.1134964355152301, + "grad_norm": 0.04381730407476425, + "learning_rate": 0.00016036275259912687, + "loss": 0.2783, + "step": 13745 + }, + { + "epoch": 1.1135774465327284, + "grad_norm": 0.04685590788722038, + "learning_rate": 0.00016035825194653225, + "loss": 0.3031, + "step": 13746 + }, + { + "epoch": 1.1136584575502269, + "grad_norm": 0.044405192136764526, + "learning_rate": 0.00016035375129393764, + "loss": 0.3171, + "step": 13747 + }, + { + "epoch": 1.1137394685677253, + "grad_norm": 0.047562070190906525, + "learning_rate": 0.000160349250641343, + "loss": 0.3485, + "step": 13748 + }, + { + "epoch": 1.1138204795852236, + "grad_norm": 0.04554133117198944, + "learning_rate": 0.0001603447499887484, + "loss": 0.3508, + "step": 13749 + }, + { + "epoch": 1.113901490602722, + "grad_norm": 0.037589896470308304, + "learning_rate": 0.00016034024933615375, + "loss": 0.2722, + "step": 13750 + }, + { + "epoch": 1.1139825016202203, + "grad_norm": 0.04414276033639908, + "learning_rate": 0.0001603357486835591, + "loss": 0.3225, + "step": 13751 + }, + { + "epoch": 1.1140635126377187, + "grad_norm": 0.03974426910281181, + "learning_rate": 0.0001603312480309645, + "loss": 0.296, + "step": 13752 + }, + { + "epoch": 1.1141445236552172, + "grad_norm": 0.036167360842227936, + "learning_rate": 0.00016032674737836988, + "loss": 0.3036, + 
"step": 13753 + }, + { + "epoch": 1.1142255346727155, + "grad_norm": 0.035503290593624115, + "learning_rate": 0.00016032224672577524, + "loss": 0.3042, + "step": 13754 + }, + { + "epoch": 1.114306545690214, + "grad_norm": 0.041777096688747406, + "learning_rate": 0.00016031774607318063, + "loss": 0.3543, + "step": 13755 + }, + { + "epoch": 1.1143875567077122, + "grad_norm": 0.037984009832143784, + "learning_rate": 0.000160313245420586, + "loss": 0.2943, + "step": 13756 + }, + { + "epoch": 1.1144685677252106, + "grad_norm": 0.03160495311021805, + "learning_rate": 0.00016030874476799135, + "loss": 0.2878, + "step": 13757 + }, + { + "epoch": 1.114549578742709, + "grad_norm": 0.04195152595639229, + "learning_rate": 0.00016030424411539676, + "loss": 0.2704, + "step": 13758 + }, + { + "epoch": 1.1146305897602073, + "grad_norm": 0.038813620805740356, + "learning_rate": 0.00016029974346280212, + "loss": 0.3305, + "step": 13759 + }, + { + "epoch": 1.1147116007777058, + "grad_norm": 0.04088602215051651, + "learning_rate": 0.00016029524281020748, + "loss": 0.3159, + "step": 13760 + }, + { + "epoch": 1.114792611795204, + "grad_norm": 0.039398763328790665, + "learning_rate": 0.00016029074215761287, + "loss": 0.2996, + "step": 13761 + }, + { + "epoch": 1.1148736228127025, + "grad_norm": 0.03807293251156807, + "learning_rate": 0.00016028624150501823, + "loss": 0.3218, + "step": 13762 + }, + { + "epoch": 1.114954633830201, + "grad_norm": 0.04101305827498436, + "learning_rate": 0.0001602817408524236, + "loss": 0.2992, + "step": 13763 + }, + { + "epoch": 1.1150356448476992, + "grad_norm": 0.050497643649578094, + "learning_rate": 0.000160277240199829, + "loss": 0.3338, + "step": 13764 + }, + { + "epoch": 1.1151166558651977, + "grad_norm": 0.03950982540845871, + "learning_rate": 0.00016027273954723436, + "loss": 0.2737, + "step": 13765 + }, + { + "epoch": 1.115197666882696, + "grad_norm": 0.041152723133563995, + "learning_rate": 0.00016026823889463972, + "loss": 0.3497, + "step": 13766 + }, + { + "epoch": 1.1152786779001944, + "grad_norm": 0.045656152069568634, + "learning_rate": 0.0001602637382420451, + "loss": 0.3661, + "step": 13767 + }, + { + "epoch": 1.1153596889176929, + "grad_norm": 0.035172052681446075, + "learning_rate": 0.00016025923758945047, + "loss": 0.2839, + "step": 13768 + }, + { + "epoch": 1.1154406999351911, + "grad_norm": 0.04323038086295128, + "learning_rate": 0.00016025473693685583, + "loss": 0.3197, + "step": 13769 + }, + { + "epoch": 1.1155217109526896, + "grad_norm": 0.04258815944194794, + "learning_rate": 0.00016025023628426125, + "loss": 0.3421, + "step": 13770 + }, + { + "epoch": 1.115602721970188, + "grad_norm": 0.04450952261686325, + "learning_rate": 0.0001602457356316666, + "loss": 0.3218, + "step": 13771 + }, + { + "epoch": 1.1156837329876863, + "grad_norm": 0.03626592829823494, + "learning_rate": 0.00016024123497907197, + "loss": 0.2947, + "step": 13772 + }, + { + "epoch": 1.1157647440051848, + "grad_norm": 0.03189956769347191, + "learning_rate": 0.00016023673432647735, + "loss": 0.2906, + "step": 13773 + }, + { + "epoch": 1.115845755022683, + "grad_norm": 0.037484340369701385, + "learning_rate": 0.0001602322336738827, + "loss": 0.2983, + "step": 13774 + }, + { + "epoch": 1.1159267660401815, + "grad_norm": 0.03559768199920654, + "learning_rate": 0.00016022773302128807, + "loss": 0.2906, + "step": 13775 + }, + { + "epoch": 1.1160077770576797, + "grad_norm": 0.046286530792713165, + "learning_rate": 0.0001602232323686935, + "loss": 0.3647, + "step": 13776 + }, + { + "epoch": 
1.1160887880751782, + "grad_norm": 0.03982963413000107, + "learning_rate": 0.00016021873171609885, + "loss": 0.3175, + "step": 13777 + }, + { + "epoch": 1.1161697990926767, + "grad_norm": 0.03955851122736931, + "learning_rate": 0.0001602142310635042, + "loss": 0.3196, + "step": 13778 + }, + { + "epoch": 1.116250810110175, + "grad_norm": 0.03698073327541351, + "learning_rate": 0.0001602097304109096, + "loss": 0.2503, + "step": 13779 + }, + { + "epoch": 1.1163318211276734, + "grad_norm": 0.03260596841573715, + "learning_rate": 0.00016020522975831496, + "loss": 0.2944, + "step": 13780 + }, + { + "epoch": 1.1164128321451718, + "grad_norm": 0.03877520188689232, + "learning_rate": 0.00016020072910572032, + "loss": 0.3295, + "step": 13781 + }, + { + "epoch": 1.11649384316267, + "grad_norm": 0.045589517802000046, + "learning_rate": 0.00016019622845312573, + "loss": 0.3598, + "step": 13782 + }, + { + "epoch": 1.1165748541801686, + "grad_norm": 0.04367116466164589, + "learning_rate": 0.0001601917278005311, + "loss": 0.3007, + "step": 13783 + }, + { + "epoch": 1.1166558651976668, + "grad_norm": 0.03556925430893898, + "learning_rate": 0.00016018722714793645, + "loss": 0.3085, + "step": 13784 + }, + { + "epoch": 1.1167368762151653, + "grad_norm": 0.03693739324808121, + "learning_rate": 0.00016018272649534184, + "loss": 0.3143, + "step": 13785 + }, + { + "epoch": 1.1168178872326637, + "grad_norm": 0.03582121804356575, + "learning_rate": 0.0001601782258427472, + "loss": 0.267, + "step": 13786 + }, + { + "epoch": 1.116898898250162, + "grad_norm": 0.03946210816502571, + "learning_rate": 0.00016017372519015258, + "loss": 0.2989, + "step": 13787 + }, + { + "epoch": 1.1169799092676604, + "grad_norm": 0.045077886432409286, + "learning_rate": 0.00016016922453755797, + "loss": 0.3211, + "step": 13788 + }, + { + "epoch": 1.1170609202851587, + "grad_norm": 0.03933589905500412, + "learning_rate": 0.00016016472388496333, + "loss": 0.3248, + "step": 13789 + }, + { + "epoch": 1.1171419313026572, + "grad_norm": 0.03940128535032272, + "learning_rate": 0.0001601602232323687, + "loss": 0.3191, + "step": 13790 + }, + { + "epoch": 1.1172229423201556, + "grad_norm": 0.035947635769844055, + "learning_rate": 0.00016015572257977408, + "loss": 0.3125, + "step": 13791 + }, + { + "epoch": 1.1173039533376539, + "grad_norm": 0.038889285176992416, + "learning_rate": 0.00016015122192717944, + "loss": 0.3005, + "step": 13792 + }, + { + "epoch": 1.1173849643551523, + "grad_norm": 0.039641644805669785, + "learning_rate": 0.00016014672127458483, + "loss": 0.3274, + "step": 13793 + }, + { + "epoch": 1.1174659753726506, + "grad_norm": 0.0386282354593277, + "learning_rate": 0.0001601422206219902, + "loss": 0.3202, + "step": 13794 + }, + { + "epoch": 1.117546986390149, + "grad_norm": 0.03944721817970276, + "learning_rate": 0.00016013771996939557, + "loss": 0.3184, + "step": 13795 + }, + { + "epoch": 1.1176279974076475, + "grad_norm": 0.04022477939724922, + "learning_rate": 0.00016013321931680093, + "loss": 0.31, + "step": 13796 + }, + { + "epoch": 1.1177090084251458, + "grad_norm": 0.04214589670300484, + "learning_rate": 0.00016012871866420632, + "loss": 0.3351, + "step": 13797 + }, + { + "epoch": 1.1177900194426442, + "grad_norm": 0.036494381725788116, + "learning_rate": 0.00016012421801161168, + "loss": 0.2615, + "step": 13798 + }, + { + "epoch": 1.1178710304601425, + "grad_norm": 0.04439876228570938, + "learning_rate": 0.00016011971735901707, + "loss": 0.2859, + "step": 13799 + }, + { + "epoch": 1.117952041477641, + "grad_norm": 
0.0420045368373394, + "learning_rate": 0.00016011521670642245, + "loss": 0.3235, + "step": 13800 + }, + { + "epoch": 1.1180330524951394, + "grad_norm": 0.037003420293331146, + "learning_rate": 0.00016011071605382781, + "loss": 0.2992, + "step": 13801 + }, + { + "epoch": 1.1181140635126376, + "grad_norm": 0.04284486547112465, + "learning_rate": 0.00016010621540123317, + "loss": 0.3373, + "step": 13802 + }, + { + "epoch": 1.1181950745301361, + "grad_norm": 0.040728338062763214, + "learning_rate": 0.00016010171474863856, + "loss": 0.3062, + "step": 13803 + }, + { + "epoch": 1.1182760855476346, + "grad_norm": 0.039135854691267014, + "learning_rate": 0.00016009721409604392, + "loss": 0.3061, + "step": 13804 + }, + { + "epoch": 1.1183570965651328, + "grad_norm": 0.035060565918684006, + "learning_rate": 0.0001600927134434493, + "loss": 0.2931, + "step": 13805 + }, + { + "epoch": 1.1184381075826313, + "grad_norm": 0.043133411556482315, + "learning_rate": 0.0001600882127908547, + "loss": 0.3366, + "step": 13806 + }, + { + "epoch": 1.1185191186001295, + "grad_norm": 0.0373905710875988, + "learning_rate": 0.00016008371213826006, + "loss": 0.3236, + "step": 13807 + }, + { + "epoch": 1.118600129617628, + "grad_norm": 0.04570890590548515, + "learning_rate": 0.00016007921148566542, + "loss": 0.3283, + "step": 13808 + }, + { + "epoch": 1.1186811406351265, + "grad_norm": 0.042422425001859665, + "learning_rate": 0.0001600747108330708, + "loss": 0.345, + "step": 13809 + }, + { + "epoch": 1.1187621516526247, + "grad_norm": 0.03744294494390488, + "learning_rate": 0.0001600702101804762, + "loss": 0.2998, + "step": 13810 + }, + { + "epoch": 1.1188431626701232, + "grad_norm": 0.04061727970838547, + "learning_rate": 0.00016006570952788155, + "loss": 0.2984, + "step": 13811 + }, + { + "epoch": 1.1189241736876214, + "grad_norm": 0.041145216673612595, + "learning_rate": 0.00016006120887528694, + "loss": 0.3325, + "step": 13812 + }, + { + "epoch": 1.11900518470512, + "grad_norm": 0.03937564790248871, + "learning_rate": 0.0001600567082226923, + "loss": 0.2948, + "step": 13813 + }, + { + "epoch": 1.1190861957226184, + "grad_norm": 0.039172567427158356, + "learning_rate": 0.00016005220757009766, + "loss": 0.3151, + "step": 13814 + }, + { + "epoch": 1.1191672067401166, + "grad_norm": 0.03841695189476013, + "learning_rate": 0.00016004770691750304, + "loss": 0.3169, + "step": 13815 + }, + { + "epoch": 1.119248217757615, + "grad_norm": 0.04288164898753166, + "learning_rate": 0.00016004320626490843, + "loss": 0.3313, + "step": 13816 + }, + { + "epoch": 1.1193292287751133, + "grad_norm": 0.036649782210588455, + "learning_rate": 0.0001600387056123138, + "loss": 0.2685, + "step": 13817 + }, + { + "epoch": 1.1194102397926118, + "grad_norm": 0.03929361701011658, + "learning_rate": 0.00016003420495971918, + "loss": 0.359, + "step": 13818 + }, + { + "epoch": 1.1194912508101102, + "grad_norm": 0.041453681886196136, + "learning_rate": 0.00016002970430712454, + "loss": 0.3186, + "step": 13819 + }, + { + "epoch": 1.1195722618276085, + "grad_norm": 0.038998812437057495, + "learning_rate": 0.0001600252036545299, + "loss": 0.3488, + "step": 13820 + }, + { + "epoch": 1.119653272845107, + "grad_norm": 0.04024531692266464, + "learning_rate": 0.00016002070300193529, + "loss": 0.3356, + "step": 13821 + }, + { + "epoch": 1.1197342838626052, + "grad_norm": 0.037038642913103104, + "learning_rate": 0.00016001620234934067, + "loss": 0.2813, + "step": 13822 + }, + { + "epoch": 1.1198152948801037, + "grad_norm": 0.039935849606990814, + 
"learning_rate": 0.00016001170169674603, + "loss": 0.3184, + "step": 13823 + }, + { + "epoch": 1.1198963058976021, + "grad_norm": 0.039462849497795105, + "learning_rate": 0.00016000720104415142, + "loss": 0.3609, + "step": 13824 + }, + { + "epoch": 1.1199773169151004, + "grad_norm": 0.04245318844914436, + "learning_rate": 0.00016000270039155678, + "loss": 0.3234, + "step": 13825 + }, + { + "epoch": 1.1200583279325989, + "grad_norm": 0.04086868464946747, + "learning_rate": 0.00015999819973896214, + "loss": 0.3421, + "step": 13826 + }, + { + "epoch": 1.1201393389500973, + "grad_norm": 0.03381660208106041, + "learning_rate": 0.00015999369908636753, + "loss": 0.2728, + "step": 13827 + }, + { + "epoch": 1.1202203499675956, + "grad_norm": 0.0502673014998436, + "learning_rate": 0.00015998919843377292, + "loss": 0.3618, + "step": 13828 + }, + { + "epoch": 1.120301360985094, + "grad_norm": 0.04325926676392555, + "learning_rate": 0.00015998469778117828, + "loss": 0.3295, + "step": 13829 + }, + { + "epoch": 1.1203823720025923, + "grad_norm": 0.04531802982091904, + "learning_rate": 0.00015998019712858366, + "loss": 0.3748, + "step": 13830 + }, + { + "epoch": 1.1204633830200907, + "grad_norm": 0.03567253053188324, + "learning_rate": 0.00015997569647598902, + "loss": 0.307, + "step": 13831 + }, + { + "epoch": 1.1205443940375892, + "grad_norm": 0.040204357355833054, + "learning_rate": 0.00015997119582339438, + "loss": 0.3271, + "step": 13832 + }, + { + "epoch": 1.1206254050550875, + "grad_norm": 0.03894279897212982, + "learning_rate": 0.00015996669517079977, + "loss": 0.3309, + "step": 13833 + }, + { + "epoch": 1.120706416072586, + "grad_norm": 0.0587170273065567, + "learning_rate": 0.00015996219451820516, + "loss": 0.353, + "step": 13834 + }, + { + "epoch": 1.1207874270900842, + "grad_norm": 0.03796997666358948, + "learning_rate": 0.00015995769386561052, + "loss": 0.2953, + "step": 13835 + }, + { + "epoch": 1.1208684381075826, + "grad_norm": 0.03643454238772392, + "learning_rate": 0.0001599531932130159, + "loss": 0.2976, + "step": 13836 + }, + { + "epoch": 1.120949449125081, + "grad_norm": 0.038293469697237015, + "learning_rate": 0.00015994869256042126, + "loss": 0.3251, + "step": 13837 + }, + { + "epoch": 1.1210304601425793, + "grad_norm": 0.04205470159649849, + "learning_rate": 0.00015994419190782662, + "loss": 0.3321, + "step": 13838 + }, + { + "epoch": 1.1211114711600778, + "grad_norm": 0.04378296434879303, + "learning_rate": 0.00015993969125523204, + "loss": 0.3149, + "step": 13839 + }, + { + "epoch": 1.121192482177576, + "grad_norm": 0.03723248094320297, + "learning_rate": 0.0001599351906026374, + "loss": 0.3334, + "step": 13840 + }, + { + "epoch": 1.1212734931950745, + "grad_norm": 0.03505038842558861, + "learning_rate": 0.00015993068995004276, + "loss": 0.2801, + "step": 13841 + }, + { + "epoch": 1.121354504212573, + "grad_norm": 0.03849254548549652, + "learning_rate": 0.00015992618929744815, + "loss": 0.3196, + "step": 13842 + }, + { + "epoch": 1.1214355152300712, + "grad_norm": 0.04109663888812065, + "learning_rate": 0.0001599216886448535, + "loss": 0.3283, + "step": 13843 + }, + { + "epoch": 1.1215165262475697, + "grad_norm": 0.04248509556055069, + "learning_rate": 0.00015991718799225887, + "loss": 0.3455, + "step": 13844 + }, + { + "epoch": 1.121597537265068, + "grad_norm": 0.03689438849687576, + "learning_rate": 0.00015991268733966428, + "loss": 0.3069, + "step": 13845 + }, + { + "epoch": 1.1216785482825664, + "grad_norm": 0.04201400279998779, + "learning_rate": 0.00015990818668706964, + 
"loss": 0.3465, + "step": 13846 + }, + { + "epoch": 1.1217595593000649, + "grad_norm": 0.040428366512060165, + "learning_rate": 0.000159903686034475, + "loss": 0.3367, + "step": 13847 + }, + { + "epoch": 1.1218405703175631, + "grad_norm": 0.04223720356822014, + "learning_rate": 0.0001598991853818804, + "loss": 0.3644, + "step": 13848 + }, + { + "epoch": 1.1219215813350616, + "grad_norm": 0.0399593859910965, + "learning_rate": 0.00015989468472928575, + "loss": 0.2838, + "step": 13849 + }, + { + "epoch": 1.12200259235256, + "grad_norm": 0.038730867207050323, + "learning_rate": 0.0001598901840766911, + "loss": 0.3084, + "step": 13850 + }, + { + "epoch": 1.1220836033700583, + "grad_norm": 0.04075966402888298, + "learning_rate": 0.00015988568342409652, + "loss": 0.3212, + "step": 13851 + }, + { + "epoch": 1.1221646143875568, + "grad_norm": 0.04116553068161011, + "learning_rate": 0.00015988118277150188, + "loss": 0.3366, + "step": 13852 + }, + { + "epoch": 1.122245625405055, + "grad_norm": 0.04146783798933029, + "learning_rate": 0.00015987668211890724, + "loss": 0.3376, + "step": 13853 + }, + { + "epoch": 1.1223266364225535, + "grad_norm": 0.038811422884464264, + "learning_rate": 0.00015987218146631263, + "loss": 0.26, + "step": 13854 + }, + { + "epoch": 1.122407647440052, + "grad_norm": 0.035753101110458374, + "learning_rate": 0.000159867680813718, + "loss": 0.338, + "step": 13855 + }, + { + "epoch": 1.1224886584575502, + "grad_norm": 0.03897230327129364, + "learning_rate": 0.00015986318016112335, + "loss": 0.2695, + "step": 13856 + }, + { + "epoch": 1.1225696694750487, + "grad_norm": 0.044433873146772385, + "learning_rate": 0.00015985867950852876, + "loss": 0.3424, + "step": 13857 + }, + { + "epoch": 1.122650680492547, + "grad_norm": 0.04034457355737686, + "learning_rate": 0.00015985417885593412, + "loss": 0.3306, + "step": 13858 + }, + { + "epoch": 1.1227316915100454, + "grad_norm": 0.04004496708512306, + "learning_rate": 0.00015984967820333948, + "loss": 0.3126, + "step": 13859 + }, + { + "epoch": 1.1228127025275438, + "grad_norm": 0.041642673313617706, + "learning_rate": 0.00015984517755074487, + "loss": 0.3152, + "step": 13860 + }, + { + "epoch": 1.122893713545042, + "grad_norm": 0.04158201068639755, + "learning_rate": 0.00015984067689815023, + "loss": 0.3176, + "step": 13861 + }, + { + "epoch": 1.1229747245625405, + "grad_norm": 0.038879748433828354, + "learning_rate": 0.0001598361762455556, + "loss": 0.3179, + "step": 13862 + }, + { + "epoch": 1.1230557355800388, + "grad_norm": 0.038589294999837875, + "learning_rate": 0.000159831675592961, + "loss": 0.3396, + "step": 13863 + }, + { + "epoch": 1.1231367465975373, + "grad_norm": 0.042497556656599045, + "learning_rate": 0.00015982717494036636, + "loss": 0.3039, + "step": 13864 + }, + { + "epoch": 1.1232177576150357, + "grad_norm": 0.040472209453582764, + "learning_rate": 0.00015982267428777172, + "loss": 0.3498, + "step": 13865 + }, + { + "epoch": 1.123298768632534, + "grad_norm": 0.03559787943959236, + "learning_rate": 0.0001598181736351771, + "loss": 0.3019, + "step": 13866 + }, + { + "epoch": 1.1233797796500324, + "grad_norm": 0.039029110223054886, + "learning_rate": 0.00015981367298258247, + "loss": 0.3296, + "step": 13867 + }, + { + "epoch": 1.1234607906675307, + "grad_norm": 0.03882824629545212, + "learning_rate": 0.00015980917232998786, + "loss": 0.3076, + "step": 13868 + }, + { + "epoch": 1.1235418016850292, + "grad_norm": 0.03973846137523651, + "learning_rate": 0.00015980467167739325, + "loss": 0.2771, + "step": 13869 + }, + { + 
"epoch": 1.1236228127025276, + "grad_norm": 0.035050515085458755, + "learning_rate": 0.0001598001710247986, + "loss": 0.2669, + "step": 13870 + }, + { + "epoch": 1.1237038237200259, + "grad_norm": 0.04399830475449562, + "learning_rate": 0.00015979567037220397, + "loss": 0.3435, + "step": 13871 + }, + { + "epoch": 1.1237848347375243, + "grad_norm": 0.042005930095911026, + "learning_rate": 0.00015979116971960935, + "loss": 0.323, + "step": 13872 + }, + { + "epoch": 1.1238658457550228, + "grad_norm": 0.046611249446868896, + "learning_rate": 0.00015978666906701471, + "loss": 0.3635, + "step": 13873 + }, + { + "epoch": 1.123946856772521, + "grad_norm": 0.03784230723977089, + "learning_rate": 0.0001597821684144201, + "loss": 0.2741, + "step": 13874 + }, + { + "epoch": 1.1240278677900195, + "grad_norm": 0.037510622292757034, + "learning_rate": 0.0001597776677618255, + "loss": 0.3011, + "step": 13875 + }, + { + "epoch": 1.1241088788075178, + "grad_norm": 0.03867172822356224, + "learning_rate": 0.00015977316710923085, + "loss": 0.3401, + "step": 13876 + }, + { + "epoch": 1.1241898898250162, + "grad_norm": 0.0382206067442894, + "learning_rate": 0.0001597686664566362, + "loss": 0.3258, + "step": 13877 + }, + { + "epoch": 1.1242709008425145, + "grad_norm": 0.041581250727176666, + "learning_rate": 0.0001597641658040416, + "loss": 0.2976, + "step": 13878 + }, + { + "epoch": 1.124351911860013, + "grad_norm": 0.039854757487773895, + "learning_rate": 0.00015975966515144696, + "loss": 0.2854, + "step": 13879 + }, + { + "epoch": 1.1244329228775114, + "grad_norm": 0.04291190207004547, + "learning_rate": 0.00015975516449885234, + "loss": 0.2964, + "step": 13880 + }, + { + "epoch": 1.1245139338950096, + "grad_norm": 0.038031816482543945, + "learning_rate": 0.00015975066384625773, + "loss": 0.2957, + "step": 13881 + }, + { + "epoch": 1.124594944912508, + "grad_norm": 0.04880787804722786, + "learning_rate": 0.0001597461631936631, + "loss": 0.3277, + "step": 13882 + }, + { + "epoch": 1.1246759559300066, + "grad_norm": 0.0424063503742218, + "learning_rate": 0.00015974166254106845, + "loss": 0.321, + "step": 13883 + }, + { + "epoch": 1.1247569669475048, + "grad_norm": 0.0411037839949131, + "learning_rate": 0.00015973716188847384, + "loss": 0.3259, + "step": 13884 + }, + { + "epoch": 1.1248379779650033, + "grad_norm": 0.03793978691101074, + "learning_rate": 0.0001597326612358792, + "loss": 0.2871, + "step": 13885 + }, + { + "epoch": 1.1249189889825015, + "grad_norm": 0.040353622287511826, + "learning_rate": 0.00015972816058328458, + "loss": 0.2924, + "step": 13886 + }, + { + "epoch": 1.125, + "grad_norm": 0.042982496321201324, + "learning_rate": 0.00015972365993068997, + "loss": 0.3395, + "step": 13887 + }, + { + "epoch": 1.1250810110174985, + "grad_norm": 0.04520965367555618, + "learning_rate": 0.00015971915927809533, + "loss": 0.3761, + "step": 13888 + }, + { + "epoch": 1.1251620220349967, + "grad_norm": 0.036293741315603256, + "learning_rate": 0.0001597146586255007, + "loss": 0.2902, + "step": 13889 + }, + { + "epoch": 1.1252430330524952, + "grad_norm": 0.0378212071955204, + "learning_rate": 0.00015971015797290608, + "loss": 0.2862, + "step": 13890 + }, + { + "epoch": 1.1253240440699934, + "grad_norm": 0.036096226423978806, + "learning_rate": 0.00015970565732031147, + "loss": 0.3155, + "step": 13891 + }, + { + "epoch": 1.125405055087492, + "grad_norm": 0.03830007091164589, + "learning_rate": 0.00015970115666771683, + "loss": 0.3307, + "step": 13892 + }, + { + "epoch": 1.1254860661049904, + "grad_norm": 
0.03723623603582382, + "learning_rate": 0.0001596966560151222, + "loss": 0.2781, + "step": 13893 + }, + { + "epoch": 1.1255670771224886, + "grad_norm": 0.039892762899398804, + "learning_rate": 0.00015969215536252757, + "loss": 0.3453, + "step": 13894 + }, + { + "epoch": 1.125648088139987, + "grad_norm": 0.03982089087367058, + "learning_rate": 0.00015968765470993293, + "loss": 0.3028, + "step": 13895 + }, + { + "epoch": 1.1257290991574855, + "grad_norm": 0.03398760408163071, + "learning_rate": 0.00015968315405733832, + "loss": 0.2907, + "step": 13896 + }, + { + "epoch": 1.1258101101749838, + "grad_norm": 0.03256816416978836, + "learning_rate": 0.0001596786534047437, + "loss": 0.2921, + "step": 13897 + }, + { + "epoch": 1.1258911211924822, + "grad_norm": 0.03531118854880333, + "learning_rate": 0.00015967415275214907, + "loss": 0.2794, + "step": 13898 + }, + { + "epoch": 1.1259721322099805, + "grad_norm": 0.04041526839137077, + "learning_rate": 0.00015966965209955445, + "loss": 0.3278, + "step": 13899 + }, + { + "epoch": 1.126053143227479, + "grad_norm": 0.039820361882448196, + "learning_rate": 0.00015966515144695981, + "loss": 0.3472, + "step": 13900 + }, + { + "epoch": 1.1261341542449772, + "grad_norm": 0.03759269416332245, + "learning_rate": 0.00015966065079436517, + "loss": 0.3216, + "step": 13901 + }, + { + "epoch": 1.1262151652624757, + "grad_norm": 0.03688056394457817, + "learning_rate": 0.00015965615014177056, + "loss": 0.2979, + "step": 13902 + }, + { + "epoch": 1.1262961762799741, + "grad_norm": 0.03714337944984436, + "learning_rate": 0.00015965164948917595, + "loss": 0.3015, + "step": 13903 + }, + { + "epoch": 1.1263771872974724, + "grad_norm": 0.03759916499257088, + "learning_rate": 0.0001596471488365813, + "loss": 0.3035, + "step": 13904 + }, + { + "epoch": 1.1264581983149708, + "grad_norm": 0.04206295683979988, + "learning_rate": 0.0001596426481839867, + "loss": 0.2859, + "step": 13905 + }, + { + "epoch": 1.1265392093324693, + "grad_norm": 0.04519706219434738, + "learning_rate": 0.00015963814753139206, + "loss": 0.3358, + "step": 13906 + }, + { + "epoch": 1.1266202203499676, + "grad_norm": 0.038674045354127884, + "learning_rate": 0.00015963364687879742, + "loss": 0.3141, + "step": 13907 + }, + { + "epoch": 1.126701231367466, + "grad_norm": 0.04178118705749512, + "learning_rate": 0.0001596291462262028, + "loss": 0.319, + "step": 13908 + }, + { + "epoch": 1.1267822423849643, + "grad_norm": 0.03776085376739502, + "learning_rate": 0.0001596246455736082, + "loss": 0.2629, + "step": 13909 + }, + { + "epoch": 1.1268632534024627, + "grad_norm": 0.035705432295799255, + "learning_rate": 0.00015962014492101355, + "loss": 0.2704, + "step": 13910 + }, + { + "epoch": 1.1269442644199612, + "grad_norm": 0.03641466423869133, + "learning_rate": 0.00015961564426841894, + "loss": 0.2981, + "step": 13911 + }, + { + "epoch": 1.1270252754374595, + "grad_norm": 0.0384918749332428, + "learning_rate": 0.0001596111436158243, + "loss": 0.2709, + "step": 13912 + }, + { + "epoch": 1.127106286454958, + "grad_norm": 0.03596267104148865, + "learning_rate": 0.00015960664296322966, + "loss": 0.3003, + "step": 13913 + }, + { + "epoch": 1.1271872974724562, + "grad_norm": 0.0391799733042717, + "learning_rate": 0.00015960214231063505, + "loss": 0.3269, + "step": 13914 + }, + { + "epoch": 1.1272683084899546, + "grad_norm": 0.046616896986961365, + "learning_rate": 0.00015959764165804043, + "loss": 0.3749, + "step": 13915 + }, + { + "epoch": 1.127349319507453, + "grad_norm": 0.043165434151887894, + "learning_rate": 
0.0001595931410054458, + "loss": 0.3351, + "step": 13916 + }, + { + "epoch": 1.1274303305249513, + "grad_norm": 0.03316278010606766, + "learning_rate": 0.00015958864035285118, + "loss": 0.2853, + "step": 13917 + }, + { + "epoch": 1.1275113415424498, + "grad_norm": 0.0387754887342453, + "learning_rate": 0.00015958413970025654, + "loss": 0.3071, + "step": 13918 + }, + { + "epoch": 1.1275923525599483, + "grad_norm": 0.03791152685880661, + "learning_rate": 0.0001595796390476619, + "loss": 0.3022, + "step": 13919 + }, + { + "epoch": 1.1276733635774465, + "grad_norm": 0.044122401624917984, + "learning_rate": 0.00015957513839506731, + "loss": 0.3311, + "step": 13920 + }, + { + "epoch": 1.127754374594945, + "grad_norm": 0.03224045783281326, + "learning_rate": 0.00015957063774247267, + "loss": 0.2618, + "step": 13921 + }, + { + "epoch": 1.1278353856124432, + "grad_norm": 0.04090102016925812, + "learning_rate": 0.00015956613708987803, + "loss": 0.3309, + "step": 13922 + }, + { + "epoch": 1.1279163966299417, + "grad_norm": 0.04744292050600052, + "learning_rate": 0.00015956163643728342, + "loss": 0.3234, + "step": 13923 + }, + { + "epoch": 1.12799740764744, + "grad_norm": 0.040213972330093384, + "learning_rate": 0.00015955713578468878, + "loss": 0.3255, + "step": 13924 + }, + { + "epoch": 1.1280784186649384, + "grad_norm": 0.04174848645925522, + "learning_rate": 0.00015955263513209414, + "loss": 0.3219, + "step": 13925 + }, + { + "epoch": 1.1281594296824369, + "grad_norm": 0.04265562444925308, + "learning_rate": 0.00015954813447949956, + "loss": 0.3139, + "step": 13926 + }, + { + "epoch": 1.1282404406999351, + "grad_norm": 0.03909571096301079, + "learning_rate": 0.00015954363382690492, + "loss": 0.2832, + "step": 13927 + }, + { + "epoch": 1.1283214517174336, + "grad_norm": 0.03760486841201782, + "learning_rate": 0.00015953913317431028, + "loss": 0.3112, + "step": 13928 + }, + { + "epoch": 1.128402462734932, + "grad_norm": 0.03686079382896423, + "learning_rate": 0.00015953463252171566, + "loss": 0.2786, + "step": 13929 + }, + { + "epoch": 1.1284834737524303, + "grad_norm": 0.03789995238184929, + "learning_rate": 0.00015953013186912102, + "loss": 0.3289, + "step": 13930 + }, + { + "epoch": 1.1285644847699288, + "grad_norm": 0.03965533524751663, + "learning_rate": 0.00015952563121652638, + "loss": 0.3029, + "step": 13931 + }, + { + "epoch": 1.128645495787427, + "grad_norm": 0.04098164662718773, + "learning_rate": 0.0001595211305639318, + "loss": 0.2928, + "step": 13932 + }, + { + "epoch": 1.1287265068049255, + "grad_norm": 0.03985410928726196, + "learning_rate": 0.00015951662991133716, + "loss": 0.322, + "step": 13933 + }, + { + "epoch": 1.128807517822424, + "grad_norm": 0.038576241582632065, + "learning_rate": 0.00015951212925874252, + "loss": 0.3067, + "step": 13934 + }, + { + "epoch": 1.1288885288399222, + "grad_norm": 0.0459025502204895, + "learning_rate": 0.0001595076286061479, + "loss": 0.3236, + "step": 13935 + }, + { + "epoch": 1.1289695398574207, + "grad_norm": 0.044762130826711655, + "learning_rate": 0.00015950312795355326, + "loss": 0.3272, + "step": 13936 + }, + { + "epoch": 1.129050550874919, + "grad_norm": 0.035859379917383194, + "learning_rate": 0.00015949862730095862, + "loss": 0.2561, + "step": 13937 + }, + { + "epoch": 1.1291315618924174, + "grad_norm": 0.036720190197229385, + "learning_rate": 0.00015949412664836404, + "loss": 0.2986, + "step": 13938 + }, + { + "epoch": 1.1292125729099158, + "grad_norm": 0.04893483966588974, + "learning_rate": 0.0001594896259957694, + "loss": 0.3214, + 
"step": 13939 + }, + { + "epoch": 1.129293583927414, + "grad_norm": 0.03709689900279045, + "learning_rate": 0.00015948512534317476, + "loss": 0.312, + "step": 13940 + }, + { + "epoch": 1.1293745949449125, + "grad_norm": 0.040259964764118195, + "learning_rate": 0.00015948062469058015, + "loss": 0.2999, + "step": 13941 + }, + { + "epoch": 1.1294556059624108, + "grad_norm": 0.044082023203372955, + "learning_rate": 0.0001594761240379855, + "loss": 0.3319, + "step": 13942 + }, + { + "epoch": 1.1295366169799093, + "grad_norm": 0.037773314863443375, + "learning_rate": 0.0001594716233853909, + "loss": 0.3176, + "step": 13943 + }, + { + "epoch": 1.1296176279974077, + "grad_norm": 0.04064793884754181, + "learning_rate": 0.00015946712273279628, + "loss": 0.3004, + "step": 13944 + }, + { + "epoch": 1.129698639014906, + "grad_norm": 0.04040275514125824, + "learning_rate": 0.00015946262208020164, + "loss": 0.328, + "step": 13945 + }, + { + "epoch": 1.1297796500324044, + "grad_norm": 0.043435558676719666, + "learning_rate": 0.000159458121427607, + "loss": 0.3366, + "step": 13946 + }, + { + "epoch": 1.1298606610499027, + "grad_norm": 0.03831151872873306, + "learning_rate": 0.0001594536207750124, + "loss": 0.2747, + "step": 13947 + }, + { + "epoch": 1.1299416720674011, + "grad_norm": 0.037033118307590485, + "learning_rate": 0.00015944912012241775, + "loss": 0.3001, + "step": 13948 + }, + { + "epoch": 1.1300226830848996, + "grad_norm": 0.03587247431278229, + "learning_rate": 0.00015944461946982313, + "loss": 0.275, + "step": 13949 + }, + { + "epoch": 1.1301036941023979, + "grad_norm": 0.04261304438114166, + "learning_rate": 0.00015944011881722852, + "loss": 0.2938, + "step": 13950 + }, + { + "epoch": 1.1301847051198963, + "grad_norm": 0.040578443557024, + "learning_rate": 0.00015943561816463388, + "loss": 0.3367, + "step": 13951 + }, + { + "epoch": 1.1302657161373948, + "grad_norm": 0.044319093227386475, + "learning_rate": 0.00015943111751203924, + "loss": 0.3281, + "step": 13952 + }, + { + "epoch": 1.130346727154893, + "grad_norm": 0.046974748373031616, + "learning_rate": 0.00015942661685944463, + "loss": 0.3121, + "step": 13953 + }, + { + "epoch": 1.1304277381723915, + "grad_norm": 0.04281812161207199, + "learning_rate": 0.00015942211620685, + "loss": 0.3386, + "step": 13954 + }, + { + "epoch": 1.1305087491898898, + "grad_norm": 0.03786301240324974, + "learning_rate": 0.00015941761555425538, + "loss": 0.3311, + "step": 13955 + }, + { + "epoch": 1.1305897602073882, + "grad_norm": 0.04455956444144249, + "learning_rate": 0.00015941311490166076, + "loss": 0.3553, + "step": 13956 + }, + { + "epoch": 1.1306707712248865, + "grad_norm": 0.03868843615055084, + "learning_rate": 0.00015940861424906612, + "loss": 0.3131, + "step": 13957 + }, + { + "epoch": 1.130751782242385, + "grad_norm": 0.038352277129888535, + "learning_rate": 0.00015940411359647148, + "loss": 0.3174, + "step": 13958 + }, + { + "epoch": 1.1308327932598834, + "grad_norm": 0.047438718378543854, + "learning_rate": 0.00015939961294387687, + "loss": 0.3297, + "step": 13959 + }, + { + "epoch": 1.1309138042773816, + "grad_norm": 0.04253394529223442, + "learning_rate": 0.00015939511229128223, + "loss": 0.2715, + "step": 13960 + }, + { + "epoch": 1.13099481529488, + "grad_norm": 0.04169415682554245, + "learning_rate": 0.00015939061163868762, + "loss": 0.3359, + "step": 13961 + }, + { + "epoch": 1.1310758263123786, + "grad_norm": 0.036576855927705765, + "learning_rate": 0.000159386110986093, + "loss": 0.2846, + "step": 13962 + }, + { + "epoch": 
1.1311568373298768, + "grad_norm": 0.046189580112695694, + "learning_rate": 0.00015938161033349837, + "loss": 0.3306, + "step": 13963 + }, + { + "epoch": 1.1312378483473753, + "grad_norm": 0.03953096270561218, + "learning_rate": 0.00015937710968090373, + "loss": 0.3131, + "step": 13964 + }, + { + "epoch": 1.1313188593648735, + "grad_norm": 0.04358987510204315, + "learning_rate": 0.0001593726090283091, + "loss": 0.3084, + "step": 13965 + }, + { + "epoch": 1.131399870382372, + "grad_norm": 0.03750518709421158, + "learning_rate": 0.00015936810837571447, + "loss": 0.3118, + "step": 13966 + }, + { + "epoch": 1.1314808813998705, + "grad_norm": 0.04085781052708626, + "learning_rate": 0.00015936360772311986, + "loss": 0.3118, + "step": 13967 + }, + { + "epoch": 1.1315618924173687, + "grad_norm": 0.04128627851605415, + "learning_rate": 0.00015935910707052525, + "loss": 0.3291, + "step": 13968 + }, + { + "epoch": 1.1316429034348672, + "grad_norm": 0.04109803959727287, + "learning_rate": 0.0001593546064179306, + "loss": 0.3251, + "step": 13969 + }, + { + "epoch": 1.1317239144523654, + "grad_norm": 0.03611079975962639, + "learning_rate": 0.00015935010576533597, + "loss": 0.3247, + "step": 13970 + }, + { + "epoch": 1.1318049254698639, + "grad_norm": 0.04212360829114914, + "learning_rate": 0.00015934560511274135, + "loss": 0.3064, + "step": 13971 + }, + { + "epoch": 1.1318859364873624, + "grad_norm": 0.03634580597281456, + "learning_rate": 0.00015934110446014674, + "loss": 0.2657, + "step": 13972 + }, + { + "epoch": 1.1319669475048606, + "grad_norm": 0.04142121225595474, + "learning_rate": 0.0001593366038075521, + "loss": 0.3054, + "step": 13973 + }, + { + "epoch": 1.132047958522359, + "grad_norm": 0.043657053261995316, + "learning_rate": 0.0001593321031549575, + "loss": 0.3286, + "step": 13974 + }, + { + "epoch": 1.1321289695398575, + "grad_norm": 0.03691680356860161, + "learning_rate": 0.00015932760250236285, + "loss": 0.3089, + "step": 13975 + }, + { + "epoch": 1.1322099805573558, + "grad_norm": 0.038224101066589355, + "learning_rate": 0.0001593231018497682, + "loss": 0.2662, + "step": 13976 + }, + { + "epoch": 1.1322909915748542, + "grad_norm": 0.04033510386943817, + "learning_rate": 0.0001593186011971736, + "loss": 0.3308, + "step": 13977 + }, + { + "epoch": 1.1323720025923525, + "grad_norm": 0.037437643855810165, + "learning_rate": 0.00015931410054457898, + "loss": 0.3141, + "step": 13978 + }, + { + "epoch": 1.132453013609851, + "grad_norm": 0.03672100603580475, + "learning_rate": 0.00015930959989198434, + "loss": 0.2986, + "step": 13979 + }, + { + "epoch": 1.1325340246273492, + "grad_norm": 0.034205105155706406, + "learning_rate": 0.00015930509923938973, + "loss": 0.294, + "step": 13980 + }, + { + "epoch": 1.1326150356448477, + "grad_norm": 0.04216152802109718, + "learning_rate": 0.0001593005985867951, + "loss": 0.3082, + "step": 13981 + }, + { + "epoch": 1.1326960466623461, + "grad_norm": 0.03836638107895851, + "learning_rate": 0.00015929609793420045, + "loss": 0.3158, + "step": 13982 + }, + { + "epoch": 1.1327770576798444, + "grad_norm": 0.042772434651851654, + "learning_rate": 0.00015929159728160584, + "loss": 0.3331, + "step": 13983 + }, + { + "epoch": 1.1328580686973428, + "grad_norm": 0.03767096996307373, + "learning_rate": 0.00015928709662901122, + "loss": 0.3099, + "step": 13984 + }, + { + "epoch": 1.1329390797148413, + "grad_norm": 0.037372101098299026, + "learning_rate": 0.00015928259597641658, + "loss": 0.2968, + "step": 13985 + }, + { + "epoch": 1.1330200907323396, + "grad_norm": 
0.03995388001203537, + "learning_rate": 0.00015927809532382197, + "loss": 0.3296, + "step": 13986 + }, + { + "epoch": 1.133101101749838, + "grad_norm": 0.03512907400727272, + "learning_rate": 0.00015927359467122733, + "loss": 0.2998, + "step": 13987 + }, + { + "epoch": 1.1331821127673363, + "grad_norm": 0.038549166172742844, + "learning_rate": 0.0001592690940186327, + "loss": 0.343, + "step": 13988 + }, + { + "epoch": 1.1332631237848347, + "grad_norm": 0.03864383324980736, + "learning_rate": 0.00015926459336603808, + "loss": 0.3171, + "step": 13989 + }, + { + "epoch": 1.1333441348023332, + "grad_norm": 0.036088306456804276, + "learning_rate": 0.00015926009271344347, + "loss": 0.3048, + "step": 13990 + }, + { + "epoch": 1.1334251458198314, + "grad_norm": 0.042519714683294296, + "learning_rate": 0.00015925559206084883, + "loss": 0.3058, + "step": 13991 + }, + { + "epoch": 1.13350615683733, + "grad_norm": 0.036296386271715164, + "learning_rate": 0.0001592510914082542, + "loss": 0.2861, + "step": 13992 + }, + { + "epoch": 1.1335871678548282, + "grad_norm": 0.041319336742162704, + "learning_rate": 0.00015924659075565957, + "loss": 0.2783, + "step": 13993 + }, + { + "epoch": 1.1336681788723266, + "grad_norm": 0.03771928697824478, + "learning_rate": 0.00015924209010306493, + "loss": 0.2959, + "step": 13994 + }, + { + "epoch": 1.133749189889825, + "grad_norm": 0.03731353208422661, + "learning_rate": 0.00015923758945047032, + "loss": 0.2769, + "step": 13995 + }, + { + "epoch": 1.1338302009073233, + "grad_norm": 0.04432228207588196, + "learning_rate": 0.0001592330887978757, + "loss": 0.3052, + "step": 13996 + }, + { + "epoch": 1.1339112119248218, + "grad_norm": 0.042736418545246124, + "learning_rate": 0.00015922858814528107, + "loss": 0.3085, + "step": 13997 + }, + { + "epoch": 1.1339922229423203, + "grad_norm": 0.04248335212469101, + "learning_rate": 0.00015922408749268645, + "loss": 0.3064, + "step": 13998 + }, + { + "epoch": 1.1340732339598185, + "grad_norm": 0.04500725120306015, + "learning_rate": 0.00015921958684009181, + "loss": 0.3088, + "step": 13999 + }, + { + "epoch": 1.134154244977317, + "grad_norm": 0.050973884761333466, + "learning_rate": 0.00015921508618749717, + "loss": 0.3918, + "step": 14000 + }, + { + "epoch": 1.1342352559948152, + "grad_norm": 0.03775111958384514, + "learning_rate": 0.0001592105855349026, + "loss": 0.2903, + "step": 14001 + }, + { + "epoch": 1.1343162670123137, + "grad_norm": 0.04045481234788895, + "learning_rate": 0.00015920608488230795, + "loss": 0.268, + "step": 14002 + }, + { + "epoch": 1.134397278029812, + "grad_norm": 0.034963369369506836, + "learning_rate": 0.0001592015842297133, + "loss": 0.2996, + "step": 14003 + }, + { + "epoch": 1.1344782890473104, + "grad_norm": 0.03908831253647804, + "learning_rate": 0.0001591970835771187, + "loss": 0.3033, + "step": 14004 + }, + { + "epoch": 1.1345593000648089, + "grad_norm": 0.038431428372859955, + "learning_rate": 0.00015919258292452406, + "loss": 0.3054, + "step": 14005 + }, + { + "epoch": 1.1346403110823071, + "grad_norm": 0.03698623180389404, + "learning_rate": 0.00015918808227192942, + "loss": 0.2849, + "step": 14006 + }, + { + "epoch": 1.1347213220998056, + "grad_norm": 0.03957449644804001, + "learning_rate": 0.00015918358161933483, + "loss": 0.3003, + "step": 14007 + }, + { + "epoch": 1.134802333117304, + "grad_norm": 0.040528152137994766, + "learning_rate": 0.0001591790809667402, + "loss": 0.3377, + "step": 14008 + }, + { + "epoch": 1.1348833441348023, + "grad_norm": 0.03744984418153763, + "learning_rate": 
0.00015917458031414555, + "loss": 0.3261, + "step": 14009 + }, + { + "epoch": 1.1349643551523008, + "grad_norm": 0.037619199603796005, + "learning_rate": 0.00015917007966155094, + "loss": 0.2998, + "step": 14010 + }, + { + "epoch": 1.135045366169799, + "grad_norm": 0.04669422656297684, + "learning_rate": 0.0001591655790089563, + "loss": 0.3315, + "step": 14011 + }, + { + "epoch": 1.1351263771872975, + "grad_norm": 0.041102923452854156, + "learning_rate": 0.00015916107835636166, + "loss": 0.2808, + "step": 14012 + }, + { + "epoch": 1.135207388204796, + "grad_norm": 0.03753142058849335, + "learning_rate": 0.00015915657770376707, + "loss": 0.2902, + "step": 14013 + }, + { + "epoch": 1.1352883992222942, + "grad_norm": 0.03314594924449921, + "learning_rate": 0.00015915207705117243, + "loss": 0.2415, + "step": 14014 + }, + { + "epoch": 1.1353694102397927, + "grad_norm": 0.03576765954494476, + "learning_rate": 0.0001591475763985778, + "loss": 0.2806, + "step": 14015 + }, + { + "epoch": 1.135450421257291, + "grad_norm": 0.039405256509780884, + "learning_rate": 0.00015914307574598318, + "loss": 0.3163, + "step": 14016 + }, + { + "epoch": 1.1355314322747894, + "grad_norm": 0.047493211925029755, + "learning_rate": 0.00015913857509338854, + "loss": 0.3541, + "step": 14017 + }, + { + "epoch": 1.1356124432922878, + "grad_norm": 0.032639991492033005, + "learning_rate": 0.0001591340744407939, + "loss": 0.2691, + "step": 14018 + }, + { + "epoch": 1.135693454309786, + "grad_norm": 0.03855365887284279, + "learning_rate": 0.00015912957378819931, + "loss": 0.3026, + "step": 14019 + }, + { + "epoch": 1.1357744653272845, + "grad_norm": 0.04066522419452667, + "learning_rate": 0.00015912507313560467, + "loss": 0.326, + "step": 14020 + }, + { + "epoch": 1.135855476344783, + "grad_norm": 0.037609148770570755, + "learning_rate": 0.00015912057248301003, + "loss": 0.3111, + "step": 14021 + }, + { + "epoch": 1.1359364873622813, + "grad_norm": 0.0433291494846344, + "learning_rate": 0.00015911607183041542, + "loss": 0.3109, + "step": 14022 + }, + { + "epoch": 1.1360174983797797, + "grad_norm": 0.04275543987751007, + "learning_rate": 0.00015911157117782078, + "loss": 0.3141, + "step": 14023 + }, + { + "epoch": 1.136098509397278, + "grad_norm": 0.043383073061704636, + "learning_rate": 0.00015910707052522617, + "loss": 0.337, + "step": 14024 + }, + { + "epoch": 1.1361795204147764, + "grad_norm": 0.04801047593355179, + "learning_rate": 0.00015910256987263156, + "loss": 0.361, + "step": 14025 + }, + { + "epoch": 1.1362605314322747, + "grad_norm": 0.04143408313393593, + "learning_rate": 0.00015909806922003692, + "loss": 0.2957, + "step": 14026 + }, + { + "epoch": 1.1363415424497731, + "grad_norm": 0.039822012186050415, + "learning_rate": 0.00015909356856744228, + "loss": 0.3023, + "step": 14027 + }, + { + "epoch": 1.1364225534672716, + "grad_norm": 0.039689600467681885, + "learning_rate": 0.00015908906791484766, + "loss": 0.2697, + "step": 14028 + }, + { + "epoch": 1.1365035644847699, + "grad_norm": 0.044523414224386215, + "learning_rate": 0.00015908456726225302, + "loss": 0.3203, + "step": 14029 + }, + { + "epoch": 1.1365845755022683, + "grad_norm": 0.046490006148815155, + "learning_rate": 0.0001590800666096584, + "loss": 0.3288, + "step": 14030 + }, + { + "epoch": 1.1366655865197668, + "grad_norm": 0.04369664937257767, + "learning_rate": 0.0001590755659570638, + "loss": 0.3013, + "step": 14031 + }, + { + "epoch": 1.136746597537265, + "grad_norm": 0.04383647069334984, + "learning_rate": 0.00015907106530446916, + "loss": 
0.2924, + "step": 14032 + }, + { + "epoch": 1.1368276085547635, + "grad_norm": 0.041048433631658554, + "learning_rate": 0.00015906656465187452, + "loss": 0.3095, + "step": 14033 + }, + { + "epoch": 1.1369086195722617, + "grad_norm": 0.04076990857720375, + "learning_rate": 0.0001590620639992799, + "loss": 0.298, + "step": 14034 + }, + { + "epoch": 1.1369896305897602, + "grad_norm": 0.03735438734292984, + "learning_rate": 0.00015905756334668526, + "loss": 0.2903, + "step": 14035 + }, + { + "epoch": 1.1370706416072587, + "grad_norm": 0.03785363584756851, + "learning_rate": 0.00015905306269409065, + "loss": 0.2893, + "step": 14036 + }, + { + "epoch": 1.137151652624757, + "grad_norm": 0.04257530719041824, + "learning_rate": 0.00015904856204149604, + "loss": 0.2786, + "step": 14037 + }, + { + "epoch": 1.1372326636422554, + "grad_norm": 0.037066444754600525, + "learning_rate": 0.0001590440613889014, + "loss": 0.3019, + "step": 14038 + }, + { + "epoch": 1.1373136746597536, + "grad_norm": 0.03434824198484421, + "learning_rate": 0.00015903956073630676, + "loss": 0.2924, + "step": 14039 + }, + { + "epoch": 1.137394685677252, + "grad_norm": 0.03972025588154793, + "learning_rate": 0.00015903506008371215, + "loss": 0.2738, + "step": 14040 + }, + { + "epoch": 1.1374756966947506, + "grad_norm": 0.03562634810805321, + "learning_rate": 0.0001590305594311175, + "loss": 0.308, + "step": 14041 + }, + { + "epoch": 1.1375567077122488, + "grad_norm": 0.038715023547410965, + "learning_rate": 0.0001590260587785229, + "loss": 0.3383, + "step": 14042 + }, + { + "epoch": 1.1376377187297473, + "grad_norm": 0.03528916835784912, + "learning_rate": 0.00015902155812592828, + "loss": 0.2796, + "step": 14043 + }, + { + "epoch": 1.1377187297472457, + "grad_norm": 0.037791598588228226, + "learning_rate": 0.00015901705747333364, + "loss": 0.2826, + "step": 14044 + }, + { + "epoch": 1.137799740764744, + "grad_norm": 0.03439135104417801, + "learning_rate": 0.000159012556820739, + "loss": 0.3217, + "step": 14045 + }, + { + "epoch": 1.1378807517822425, + "grad_norm": 0.03895426541566849, + "learning_rate": 0.0001590080561681444, + "loss": 0.2836, + "step": 14046 + }, + { + "epoch": 1.1379617627997407, + "grad_norm": 0.03870061784982681, + "learning_rate": 0.00015900355551554975, + "loss": 0.3294, + "step": 14047 + }, + { + "epoch": 1.1380427738172392, + "grad_norm": 0.03975560888648033, + "learning_rate": 0.00015899905486295513, + "loss": 0.3192, + "step": 14048 + }, + { + "epoch": 1.1381237848347374, + "grad_norm": 0.034254979342222214, + "learning_rate": 0.00015899455421036052, + "loss": 0.2719, + "step": 14049 + }, + { + "epoch": 1.1382047958522359, + "grad_norm": 0.038271915167570114, + "learning_rate": 0.00015899005355776588, + "loss": 0.326, + "step": 14050 + }, + { + "epoch": 1.1382858068697344, + "grad_norm": 0.039710208773612976, + "learning_rate": 0.00015898555290517124, + "loss": 0.3065, + "step": 14051 + }, + { + "epoch": 1.1383668178872326, + "grad_norm": 0.035166531801223755, + "learning_rate": 0.00015898105225257663, + "loss": 0.2895, + "step": 14052 + }, + { + "epoch": 1.138447828904731, + "grad_norm": 0.03941623494029045, + "learning_rate": 0.00015897655159998202, + "loss": 0.2963, + "step": 14053 + }, + { + "epoch": 1.1385288399222295, + "grad_norm": 0.038826532661914825, + "learning_rate": 0.00015897205094738738, + "loss": 0.3084, + "step": 14054 + }, + { + "epoch": 1.1386098509397278, + "grad_norm": 0.03994929790496826, + "learning_rate": 0.00015896755029479276, + "loss": 0.3236, + "step": 14055 + }, + { + 
"epoch": 1.1386908619572262, + "grad_norm": 0.04244782030582428, + "learning_rate": 0.00015896304964219812, + "loss": 0.3336, + "step": 14056 + }, + { + "epoch": 1.1387718729747245, + "grad_norm": 0.042472947388887405, + "learning_rate": 0.00015895854898960348, + "loss": 0.3268, + "step": 14057 + }, + { + "epoch": 1.138852883992223, + "grad_norm": 0.03509005531668663, + "learning_rate": 0.00015895404833700887, + "loss": 0.2596, + "step": 14058 + }, + { + "epoch": 1.1389338950097212, + "grad_norm": 0.03898772597312927, + "learning_rate": 0.00015894954768441426, + "loss": 0.3302, + "step": 14059 + }, + { + "epoch": 1.1390149060272197, + "grad_norm": 0.04091065004467964, + "learning_rate": 0.00015894504703181962, + "loss": 0.2922, + "step": 14060 + }, + { + "epoch": 1.1390959170447181, + "grad_norm": 0.038985904306173325, + "learning_rate": 0.000158940546379225, + "loss": 0.2673, + "step": 14061 + }, + { + "epoch": 1.1391769280622164, + "grad_norm": 0.04241928085684776, + "learning_rate": 0.00015893604572663037, + "loss": 0.3198, + "step": 14062 + }, + { + "epoch": 1.1392579390797148, + "grad_norm": 0.03850384056568146, + "learning_rate": 0.00015893154507403573, + "loss": 0.2849, + "step": 14063 + }, + { + "epoch": 1.1393389500972133, + "grad_norm": 0.04147018864750862, + "learning_rate": 0.0001589270444214411, + "loss": 0.3157, + "step": 14064 + }, + { + "epoch": 1.1394199611147116, + "grad_norm": 0.04068557918071747, + "learning_rate": 0.0001589225437688465, + "loss": 0.3169, + "step": 14065 + }, + { + "epoch": 1.13950097213221, + "grad_norm": 0.03858393803238869, + "learning_rate": 0.00015891804311625186, + "loss": 0.3044, + "step": 14066 + }, + { + "epoch": 1.1395819831497083, + "grad_norm": 0.03682612255215645, + "learning_rate": 0.00015891354246365725, + "loss": 0.2826, + "step": 14067 + }, + { + "epoch": 1.1396629941672067, + "grad_norm": 0.037353694438934326, + "learning_rate": 0.0001589090418110626, + "loss": 0.315, + "step": 14068 + }, + { + "epoch": 1.1397440051847052, + "grad_norm": 0.03581754118204117, + "learning_rate": 0.00015890454115846797, + "loss": 0.2829, + "step": 14069 + }, + { + "epoch": 1.1398250162022034, + "grad_norm": 0.03504345193505287, + "learning_rate": 0.00015890004050587335, + "loss": 0.2922, + "step": 14070 + }, + { + "epoch": 1.139906027219702, + "grad_norm": 0.03751121088862419, + "learning_rate": 0.00015889553985327874, + "loss": 0.2826, + "step": 14071 + }, + { + "epoch": 1.1399870382372002, + "grad_norm": 0.03658273443579674, + "learning_rate": 0.0001588910392006841, + "loss": 0.3045, + "step": 14072 + }, + { + "epoch": 1.1400680492546986, + "grad_norm": 0.04107685387134552, + "learning_rate": 0.0001588865385480895, + "loss": 0.3275, + "step": 14073 + }, + { + "epoch": 1.140149060272197, + "grad_norm": 0.04228817671537399, + "learning_rate": 0.00015888203789549485, + "loss": 0.2982, + "step": 14074 + }, + { + "epoch": 1.1402300712896953, + "grad_norm": 0.0487077496945858, + "learning_rate": 0.0001588775372429002, + "loss": 0.3327, + "step": 14075 + }, + { + "epoch": 1.1403110823071938, + "grad_norm": 0.04449397325515747, + "learning_rate": 0.00015887303659030562, + "loss": 0.3315, + "step": 14076 + }, + { + "epoch": 1.1403920933246923, + "grad_norm": 0.038302674889564514, + "learning_rate": 0.00015886853593771098, + "loss": 0.2889, + "step": 14077 + }, + { + "epoch": 1.1404731043421905, + "grad_norm": 0.039194364100694656, + "learning_rate": 0.00015886403528511634, + "loss": 0.2758, + "step": 14078 + }, + { + "epoch": 1.140554115359689, + "grad_norm": 
0.04212159290909767, + "learning_rate": 0.00015885953463252173, + "loss": 0.3066, + "step": 14079 + }, + { + "epoch": 1.1406351263771872, + "grad_norm": 0.045051414519548416, + "learning_rate": 0.0001588550339799271, + "loss": 0.2842, + "step": 14080 + }, + { + "epoch": 1.1407161373946857, + "grad_norm": 0.042434271425008774, + "learning_rate": 0.00015885053332733245, + "loss": 0.3434, + "step": 14081 + }, + { + "epoch": 1.140797148412184, + "grad_norm": 0.05168015509843826, + "learning_rate": 0.00015884603267473786, + "loss": 0.3231, + "step": 14082 + }, + { + "epoch": 1.1408781594296824, + "grad_norm": 0.04804101586341858, + "learning_rate": 0.00015884153202214322, + "loss": 0.3347, + "step": 14083 + }, + { + "epoch": 1.1409591704471809, + "grad_norm": 0.042534634470939636, + "learning_rate": 0.00015883703136954858, + "loss": 0.3204, + "step": 14084 + }, + { + "epoch": 1.1410401814646791, + "grad_norm": 0.04687447473406792, + "learning_rate": 0.00015883253071695397, + "loss": 0.3502, + "step": 14085 + }, + { + "epoch": 1.1411211924821776, + "grad_norm": 0.04413938149809837, + "learning_rate": 0.00015882803006435933, + "loss": 0.302, + "step": 14086 + }, + { + "epoch": 1.141202203499676, + "grad_norm": 0.046513497829437256, + "learning_rate": 0.0001588235294117647, + "loss": 0.3513, + "step": 14087 + }, + { + "epoch": 1.1412832145171743, + "grad_norm": 0.04094574972987175, + "learning_rate": 0.0001588190287591701, + "loss": 0.3386, + "step": 14088 + }, + { + "epoch": 1.1413642255346728, + "grad_norm": 0.04302053898572922, + "learning_rate": 0.00015881452810657547, + "loss": 0.3285, + "step": 14089 + }, + { + "epoch": 1.141445236552171, + "grad_norm": 0.039479270577430725, + "learning_rate": 0.00015881002745398083, + "loss": 0.309, + "step": 14090 + }, + { + "epoch": 1.1415262475696695, + "grad_norm": 0.04067131131887436, + "learning_rate": 0.0001588055268013862, + "loss": 0.3072, + "step": 14091 + }, + { + "epoch": 1.141607258587168, + "grad_norm": 0.038594089448451996, + "learning_rate": 0.00015880102614879157, + "loss": 0.3116, + "step": 14092 + }, + { + "epoch": 1.1416882696046662, + "grad_norm": 0.035079196095466614, + "learning_rate": 0.00015879652549619693, + "loss": 0.3006, + "step": 14093 + }, + { + "epoch": 1.1417692806221647, + "grad_norm": 0.03509014472365379, + "learning_rate": 0.00015879202484360235, + "loss": 0.2823, + "step": 14094 + }, + { + "epoch": 1.141850291639663, + "grad_norm": 0.04012055695056915, + "learning_rate": 0.0001587875241910077, + "loss": 0.2946, + "step": 14095 + }, + { + "epoch": 1.1419313026571614, + "grad_norm": 0.04498868063092232, + "learning_rate": 0.00015878302353841307, + "loss": 0.3341, + "step": 14096 + }, + { + "epoch": 1.1420123136746598, + "grad_norm": 0.04122866317629814, + "learning_rate": 0.00015877852288581846, + "loss": 0.3181, + "step": 14097 + }, + { + "epoch": 1.142093324692158, + "grad_norm": 0.04573645442724228, + "learning_rate": 0.00015877402223322382, + "loss": 0.3106, + "step": 14098 + }, + { + "epoch": 1.1421743357096565, + "grad_norm": 0.03992963582277298, + "learning_rate": 0.00015876952158062918, + "loss": 0.3174, + "step": 14099 + }, + { + "epoch": 1.142255346727155, + "grad_norm": 0.03769395500421524, + "learning_rate": 0.0001587650209280346, + "loss": 0.2803, + "step": 14100 + }, + { + "epoch": 1.1423363577446533, + "grad_norm": 0.037454042583703995, + "learning_rate": 0.00015876052027543995, + "loss": 0.337, + "step": 14101 + }, + { + "epoch": 1.1424173687621517, + "grad_norm": 0.035413019359111786, + "learning_rate": 
0.0001587560196228453, + "loss": 0.2735, + "step": 14102 + }, + { + "epoch": 1.14249837977965, + "grad_norm": 0.04085841029882431, + "learning_rate": 0.0001587515189702507, + "loss": 0.2882, + "step": 14103 + }, + { + "epoch": 1.1425793907971484, + "grad_norm": 0.03542950376868248, + "learning_rate": 0.00015874701831765606, + "loss": 0.3079, + "step": 14104 + }, + { + "epoch": 1.1426604018146467, + "grad_norm": 0.0359371043741703, + "learning_rate": 0.00015874251766506144, + "loss": 0.313, + "step": 14105 + }, + { + "epoch": 1.1427414128321451, + "grad_norm": 0.04420680180191994, + "learning_rate": 0.00015873801701246683, + "loss": 0.3043, + "step": 14106 + }, + { + "epoch": 1.1428224238496436, + "grad_norm": 0.044378090649843216, + "learning_rate": 0.0001587335163598722, + "loss": 0.3589, + "step": 14107 + }, + { + "epoch": 1.1429034348671419, + "grad_norm": 0.0452517494559288, + "learning_rate": 0.00015872901570727755, + "loss": 0.3643, + "step": 14108 + }, + { + "epoch": 1.1429844458846403, + "grad_norm": 0.042304959148168564, + "learning_rate": 0.00015872451505468294, + "loss": 0.3275, + "step": 14109 + }, + { + "epoch": 1.1430654569021388, + "grad_norm": 0.04190251603722572, + "learning_rate": 0.0001587200144020883, + "loss": 0.3102, + "step": 14110 + }, + { + "epoch": 1.143146467919637, + "grad_norm": 0.036500733345746994, + "learning_rate": 0.00015871551374949369, + "loss": 0.3007, + "step": 14111 + }, + { + "epoch": 1.1432274789371355, + "grad_norm": 0.036681219935417175, + "learning_rate": 0.00015871101309689907, + "loss": 0.2906, + "step": 14112 + }, + { + "epoch": 1.1433084899546337, + "grad_norm": 0.049908604472875595, + "learning_rate": 0.00015870651244430443, + "loss": 0.3585, + "step": 14113 + }, + { + "epoch": 1.1433895009721322, + "grad_norm": 0.041187454015016556, + "learning_rate": 0.0001587020117917098, + "loss": 0.3177, + "step": 14114 + }, + { + "epoch": 1.1434705119896307, + "grad_norm": 0.03634697571396828, + "learning_rate": 0.00015869751113911518, + "loss": 0.2874, + "step": 14115 + }, + { + "epoch": 1.143551523007129, + "grad_norm": 0.045490801334381104, + "learning_rate": 0.00015869301048652054, + "loss": 0.3447, + "step": 14116 + }, + { + "epoch": 1.1436325340246274, + "grad_norm": 0.04303700476884842, + "learning_rate": 0.00015868850983392593, + "loss": 0.3525, + "step": 14117 + }, + { + "epoch": 1.1437135450421256, + "grad_norm": 0.03868240490555763, + "learning_rate": 0.00015868400918133131, + "loss": 0.3193, + "step": 14118 + }, + { + "epoch": 1.143794556059624, + "grad_norm": 0.04534424841403961, + "learning_rate": 0.00015867950852873667, + "loss": 0.3478, + "step": 14119 + }, + { + "epoch": 1.1438755670771226, + "grad_norm": 0.039549026638269424, + "learning_rate": 0.00015867500787614203, + "loss": 0.2657, + "step": 14120 + }, + { + "epoch": 1.1439565780946208, + "grad_norm": 0.04135221242904663, + "learning_rate": 0.00015867050722354742, + "loss": 0.3023, + "step": 14121 + }, + { + "epoch": 1.1440375891121193, + "grad_norm": 0.0366831049323082, + "learning_rate": 0.00015866600657095278, + "loss": 0.2896, + "step": 14122 + }, + { + "epoch": 1.1441186001296177, + "grad_norm": 0.044443391263484955, + "learning_rate": 0.00015866150591835817, + "loss": 0.316, + "step": 14123 + }, + { + "epoch": 1.144199611147116, + "grad_norm": 0.035869430750608444, + "learning_rate": 0.00015865700526576356, + "loss": 0.2828, + "step": 14124 + }, + { + "epoch": 1.1442806221646145, + "grad_norm": 0.039250753819942474, + "learning_rate": 0.00015865250461316892, + "loss": 0.299, 
+ "step": 14125 + }, + { + "epoch": 1.1443616331821127, + "grad_norm": 0.036866240203380585, + "learning_rate": 0.00015864800396057428, + "loss": 0.2866, + "step": 14126 + }, + { + "epoch": 1.1444426441996112, + "grad_norm": 0.0382998064160347, + "learning_rate": 0.00015864350330797966, + "loss": 0.2939, + "step": 14127 + }, + { + "epoch": 1.1445236552171094, + "grad_norm": 0.04512723162770271, + "learning_rate": 0.00015863900265538505, + "loss": 0.3071, + "step": 14128 + }, + { + "epoch": 1.1446046662346079, + "grad_norm": 0.0422217883169651, + "learning_rate": 0.0001586345020027904, + "loss": 0.282, + "step": 14129 + }, + { + "epoch": 1.1446856772521063, + "grad_norm": 0.03788993880152702, + "learning_rate": 0.0001586300013501958, + "loss": 0.2789, + "step": 14130 + }, + { + "epoch": 1.1447666882696046, + "grad_norm": 0.03889959305524826, + "learning_rate": 0.00015862550069760116, + "loss": 0.2998, + "step": 14131 + }, + { + "epoch": 1.144847699287103, + "grad_norm": 0.03628894314169884, + "learning_rate": 0.00015862100004500652, + "loss": 0.3067, + "step": 14132 + }, + { + "epoch": 1.1449287103046015, + "grad_norm": 0.03689340874552727, + "learning_rate": 0.0001586164993924119, + "loss": 0.2959, + "step": 14133 + }, + { + "epoch": 1.1450097213220998, + "grad_norm": 0.04336564615368843, + "learning_rate": 0.0001586119987398173, + "loss": 0.283, + "step": 14134 + }, + { + "epoch": 1.1450907323395982, + "grad_norm": 0.045776378363370895, + "learning_rate": 0.00015860749808722265, + "loss": 0.3326, + "step": 14135 + }, + { + "epoch": 1.1451717433570965, + "grad_norm": 0.03566877543926239, + "learning_rate": 0.00015860299743462804, + "loss": 0.2991, + "step": 14136 + }, + { + "epoch": 1.145252754374595, + "grad_norm": 0.04115091264247894, + "learning_rate": 0.0001585984967820334, + "loss": 0.3198, + "step": 14137 + }, + { + "epoch": 1.1453337653920934, + "grad_norm": 0.04076642915606499, + "learning_rate": 0.00015859399612943876, + "loss": 0.3067, + "step": 14138 + }, + { + "epoch": 1.1454147764095917, + "grad_norm": 0.041703108698129654, + "learning_rate": 0.00015858949547684415, + "loss": 0.3291, + "step": 14139 + }, + { + "epoch": 1.1454957874270901, + "grad_norm": 0.037575382739305496, + "learning_rate": 0.00015858499482424953, + "loss": 0.2899, + "step": 14140 + }, + { + "epoch": 1.1455767984445884, + "grad_norm": 0.04002415016293526, + "learning_rate": 0.0001585804941716549, + "loss": 0.3049, + "step": 14141 + }, + { + "epoch": 1.1456578094620868, + "grad_norm": 0.040226858109235764, + "learning_rate": 0.00015857599351906028, + "loss": 0.3073, + "step": 14142 + }, + { + "epoch": 1.1457388204795853, + "grad_norm": 0.03888882324099541, + "learning_rate": 0.00015857149286646564, + "loss": 0.3226, + "step": 14143 + }, + { + "epoch": 1.1458198314970836, + "grad_norm": 0.03859817981719971, + "learning_rate": 0.000158566992213871, + "loss": 0.3056, + "step": 14144 + }, + { + "epoch": 1.145900842514582, + "grad_norm": 0.04122031852602959, + "learning_rate": 0.0001585624915612764, + "loss": 0.3412, + "step": 14145 + }, + { + "epoch": 1.1459818535320805, + "grad_norm": 0.033064428716897964, + "learning_rate": 0.00015855799090868178, + "loss": 0.2723, + "step": 14146 + }, + { + "epoch": 1.1460628645495787, + "grad_norm": 0.04350362345576286, + "learning_rate": 0.00015855349025608714, + "loss": 0.3383, + "step": 14147 + }, + { + "epoch": 1.1461438755670772, + "grad_norm": 0.03967111557722092, + "learning_rate": 0.00015854898960349252, + "loss": 0.3021, + "step": 14148 + }, + { + "epoch": 
1.1462248865845754, + "grad_norm": 0.03711658716201782, + "learning_rate": 0.00015854448895089788, + "loss": 0.3245, + "step": 14149 + }, + { + "epoch": 1.146305897602074, + "grad_norm": 0.03623461723327637, + "learning_rate": 0.00015853998829830324, + "loss": 0.296, + "step": 14150 + }, + { + "epoch": 1.1463869086195722, + "grad_norm": 0.04315349832177162, + "learning_rate": 0.00015853548764570863, + "loss": 0.272, + "step": 14151 + }, + { + "epoch": 1.1464679196370706, + "grad_norm": 0.038551878184080124, + "learning_rate": 0.00015853098699311402, + "loss": 0.3242, + "step": 14152 + }, + { + "epoch": 1.146548930654569, + "grad_norm": 0.0382794514298439, + "learning_rate": 0.00015852648634051938, + "loss": 0.3034, + "step": 14153 + }, + { + "epoch": 1.1466299416720673, + "grad_norm": 0.03740396350622177, + "learning_rate": 0.00015852198568792476, + "loss": 0.2885, + "step": 14154 + }, + { + "epoch": 1.1467109526895658, + "grad_norm": 0.046128056943416595, + "learning_rate": 0.00015851748503533012, + "loss": 0.3699, + "step": 14155 + }, + { + "epoch": 1.1467919637070643, + "grad_norm": 0.040248434990644455, + "learning_rate": 0.00015851298438273548, + "loss": 0.2957, + "step": 14156 + }, + { + "epoch": 1.1468729747245625, + "grad_norm": 0.047767698764801025, + "learning_rate": 0.0001585084837301409, + "loss": 0.3637, + "step": 14157 + }, + { + "epoch": 1.146953985742061, + "grad_norm": 0.04790830239653587, + "learning_rate": 0.00015850398307754626, + "loss": 0.3506, + "step": 14158 + }, + { + "epoch": 1.1470349967595592, + "grad_norm": 0.03813810646533966, + "learning_rate": 0.00015849948242495162, + "loss": 0.2931, + "step": 14159 + }, + { + "epoch": 1.1471160077770577, + "grad_norm": 0.0405675433576107, + "learning_rate": 0.000158494981772357, + "loss": 0.3258, + "step": 14160 + }, + { + "epoch": 1.1471970187945562, + "grad_norm": 0.03747477009892464, + "learning_rate": 0.00015849048111976237, + "loss": 0.2754, + "step": 14161 + }, + { + "epoch": 1.1472780298120544, + "grad_norm": 0.04514666646718979, + "learning_rate": 0.00015848598046716773, + "loss": 0.3132, + "step": 14162 + }, + { + "epoch": 1.1473590408295529, + "grad_norm": 0.03907744958996773, + "learning_rate": 0.00015848147981457314, + "loss": 0.3237, + "step": 14163 + }, + { + "epoch": 1.1474400518470511, + "grad_norm": 0.044495511800050735, + "learning_rate": 0.0001584769791619785, + "loss": 0.3362, + "step": 14164 + }, + { + "epoch": 1.1475210628645496, + "grad_norm": 0.04200047254562378, + "learning_rate": 0.00015847247850938386, + "loss": 0.349, + "step": 14165 + }, + { + "epoch": 1.147602073882048, + "grad_norm": 0.03570564463734627, + "learning_rate": 0.00015846797785678925, + "loss": 0.2715, + "step": 14166 + }, + { + "epoch": 1.1476830848995463, + "grad_norm": 0.03856359422206879, + "learning_rate": 0.0001584634772041946, + "loss": 0.3037, + "step": 14167 + }, + { + "epoch": 1.1477640959170448, + "grad_norm": 0.03832743689417839, + "learning_rate": 0.00015845897655159997, + "loss": 0.3179, + "step": 14168 + }, + { + "epoch": 1.147845106934543, + "grad_norm": 0.044524624943733215, + "learning_rate": 0.00015845447589900538, + "loss": 0.3458, + "step": 14169 + }, + { + "epoch": 1.1479261179520415, + "grad_norm": 0.03662329539656639, + "learning_rate": 0.00015844997524641074, + "loss": 0.2949, + "step": 14170 + }, + { + "epoch": 1.14800712896954, + "grad_norm": 0.04072437062859535, + "learning_rate": 0.0001584454745938161, + "loss": 0.2882, + "step": 14171 + }, + { + "epoch": 1.1480881399870382, + "grad_norm": 
0.03684643656015396, + "learning_rate": 0.0001584409739412215, + "loss": 0.2946, + "step": 14172 + }, + { + "epoch": 1.1481691510045366, + "grad_norm": 0.03623619303107262, + "learning_rate": 0.00015843647328862685, + "loss": 0.2823, + "step": 14173 + }, + { + "epoch": 1.148250162022035, + "grad_norm": 0.04452592134475708, + "learning_rate": 0.0001584319726360322, + "loss": 0.3427, + "step": 14174 + }, + { + "epoch": 1.1483311730395334, + "grad_norm": 0.035655438899993896, + "learning_rate": 0.00015842747198343762, + "loss": 0.2673, + "step": 14175 + }, + { + "epoch": 1.1484121840570318, + "grad_norm": 0.032316237688064575, + "learning_rate": 0.00015842297133084298, + "loss": 0.2778, + "step": 14176 + }, + { + "epoch": 1.14849319507453, + "grad_norm": 0.035419922322034836, + "learning_rate": 0.00015841847067824834, + "loss": 0.2521, + "step": 14177 + }, + { + "epoch": 1.1485742060920285, + "grad_norm": 0.03664263337850571, + "learning_rate": 0.00015841397002565373, + "loss": 0.3023, + "step": 14178 + }, + { + "epoch": 1.148655217109527, + "grad_norm": 0.04097932577133179, + "learning_rate": 0.0001584094693730591, + "loss": 0.3242, + "step": 14179 + }, + { + "epoch": 1.1487362281270252, + "grad_norm": 0.04287213459610939, + "learning_rate": 0.00015840496872046445, + "loss": 0.3638, + "step": 14180 + }, + { + "epoch": 1.1488172391445237, + "grad_norm": 0.03930720314383507, + "learning_rate": 0.00015840046806786986, + "loss": 0.3225, + "step": 14181 + }, + { + "epoch": 1.148898250162022, + "grad_norm": 0.037118248641490936, + "learning_rate": 0.00015839596741527522, + "loss": 0.2974, + "step": 14182 + }, + { + "epoch": 1.1489792611795204, + "grad_norm": 0.04094469174742699, + "learning_rate": 0.00015839146676268058, + "loss": 0.3339, + "step": 14183 + }, + { + "epoch": 1.1490602721970187, + "grad_norm": 0.03562622144818306, + "learning_rate": 0.00015838696611008597, + "loss": 0.2681, + "step": 14184 + }, + { + "epoch": 1.1491412832145171, + "grad_norm": 0.03689657896757126, + "learning_rate": 0.00015838246545749133, + "loss": 0.2941, + "step": 14185 + }, + { + "epoch": 1.1492222942320156, + "grad_norm": 0.04406815394759178, + "learning_rate": 0.00015837796480489672, + "loss": 0.3157, + "step": 14186 + }, + { + "epoch": 1.1493033052495139, + "grad_norm": 0.03726741299033165, + "learning_rate": 0.0001583734641523021, + "loss": 0.3033, + "step": 14187 + }, + { + "epoch": 1.1493843162670123, + "grad_norm": 0.040153663605451584, + "learning_rate": 0.00015836896349970747, + "loss": 0.2765, + "step": 14188 + }, + { + "epoch": 1.1494653272845108, + "grad_norm": 0.04548710212111473, + "learning_rate": 0.00015836446284711283, + "loss": 0.3308, + "step": 14189 + }, + { + "epoch": 1.149546338302009, + "grad_norm": 0.03714310750365257, + "learning_rate": 0.00015835996219451821, + "loss": 0.3235, + "step": 14190 + }, + { + "epoch": 1.1496273493195075, + "grad_norm": 0.04844687879085541, + "learning_rate": 0.00015835546154192357, + "loss": 0.3682, + "step": 14191 + }, + { + "epoch": 1.1497083603370057, + "grad_norm": 0.04365730285644531, + "learning_rate": 0.00015835096088932896, + "loss": 0.3371, + "step": 14192 + }, + { + "epoch": 1.1497893713545042, + "grad_norm": 0.040620166808366776, + "learning_rate": 0.00015834646023673435, + "loss": 0.3111, + "step": 14193 + }, + { + "epoch": 1.1498703823720027, + "grad_norm": 0.04068024456501007, + "learning_rate": 0.0001583419595841397, + "loss": 0.2997, + "step": 14194 + }, + { + "epoch": 1.149951393389501, + "grad_norm": 0.04177763685584068, + "learning_rate": 
0.00015833745893154507, + "loss": 0.2964, + "step": 14195 + }, + { + "epoch": 1.1500324044069994, + "grad_norm": 0.05007220804691315, + "learning_rate": 0.00015833295827895046, + "loss": 0.3481, + "step": 14196 + }, + { + "epoch": 1.1501134154244976, + "grad_norm": 0.04170903190970421, + "learning_rate": 0.00015832845762635582, + "loss": 0.3339, + "step": 14197 + }, + { + "epoch": 1.150194426441996, + "grad_norm": 0.03846409544348717, + "learning_rate": 0.0001583239569737612, + "loss": 0.3054, + "step": 14198 + }, + { + "epoch": 1.1502754374594946, + "grad_norm": 0.04317399486899376, + "learning_rate": 0.0001583194563211666, + "loss": 0.3124, + "step": 14199 + }, + { + "epoch": 1.1503564484769928, + "grad_norm": 0.03954905644059181, + "learning_rate": 0.00015831495566857195, + "loss": 0.306, + "step": 14200 + }, + { + "epoch": 1.1504374594944913, + "grad_norm": 0.04700899496674538, + "learning_rate": 0.0001583104550159773, + "loss": 0.3149, + "step": 14201 + }, + { + "epoch": 1.1505184705119897, + "grad_norm": 0.04225290194153786, + "learning_rate": 0.0001583059543633827, + "loss": 0.2912, + "step": 14202 + }, + { + "epoch": 1.150599481529488, + "grad_norm": 0.039747338742017746, + "learning_rate": 0.00015830145371078806, + "loss": 0.3522, + "step": 14203 + }, + { + "epoch": 1.1506804925469865, + "grad_norm": 0.031397707760334015, + "learning_rate": 0.00015829695305819344, + "loss": 0.2765, + "step": 14204 + }, + { + "epoch": 1.1507615035644847, + "grad_norm": 0.04136206582188606, + "learning_rate": 0.00015829245240559883, + "loss": 0.3057, + "step": 14205 + }, + { + "epoch": 1.1508425145819832, + "grad_norm": 0.03835195675492287, + "learning_rate": 0.0001582879517530042, + "loss": 0.2947, + "step": 14206 + }, + { + "epoch": 1.1509235255994814, + "grad_norm": 0.042692024260759354, + "learning_rate": 0.00015828345110040955, + "loss": 0.3265, + "step": 14207 + }, + { + "epoch": 1.1510045366169799, + "grad_norm": 0.03989066928625107, + "learning_rate": 0.00015827895044781494, + "loss": 0.3259, + "step": 14208 + }, + { + "epoch": 1.1510855476344783, + "grad_norm": 0.036856938153505325, + "learning_rate": 0.00015827444979522033, + "loss": 0.3095, + "step": 14209 + }, + { + "epoch": 1.1511665586519766, + "grad_norm": 0.03925073519349098, + "learning_rate": 0.00015826994914262569, + "loss": 0.318, + "step": 14210 + }, + { + "epoch": 1.151247569669475, + "grad_norm": 0.03539036586880684, + "learning_rate": 0.00015826544849003107, + "loss": 0.3117, + "step": 14211 + }, + { + "epoch": 1.1513285806869735, + "grad_norm": 0.036580659449100494, + "learning_rate": 0.00015826094783743643, + "loss": 0.2845, + "step": 14212 + }, + { + "epoch": 1.1514095917044718, + "grad_norm": 0.047995951026678085, + "learning_rate": 0.0001582564471848418, + "loss": 0.3726, + "step": 14213 + }, + { + "epoch": 1.1514906027219702, + "grad_norm": 0.03786792606115341, + "learning_rate": 0.00015825194653224718, + "loss": 0.2974, + "step": 14214 + }, + { + "epoch": 1.1515716137394685, + "grad_norm": 0.04033001884818077, + "learning_rate": 0.00015824744587965257, + "loss": 0.3026, + "step": 14215 + }, + { + "epoch": 1.151652624756967, + "grad_norm": 0.035620976239442825, + "learning_rate": 0.00015824294522705793, + "loss": 0.3363, + "step": 14216 + }, + { + "epoch": 1.1517336357744654, + "grad_norm": 0.041697435081005096, + "learning_rate": 0.00015823844457446331, + "loss": 0.3125, + "step": 14217 + }, + { + "epoch": 1.1518146467919637, + "grad_norm": 0.044587064534425735, + "learning_rate": 0.00015823394392186867, + "loss": 
0.4001, + "step": 14218 + }, + { + "epoch": 1.1518956578094621, + "grad_norm": 0.03764927387237549, + "learning_rate": 0.00015822944326927403, + "loss": 0.2945, + "step": 14219 + }, + { + "epoch": 1.1519766688269604, + "grad_norm": 0.041600629687309265, + "learning_rate": 0.00015822494261667942, + "loss": 0.3169, + "step": 14220 + }, + { + "epoch": 1.1520576798444588, + "grad_norm": 0.040279630571603775, + "learning_rate": 0.0001582204419640848, + "loss": 0.3132, + "step": 14221 + }, + { + "epoch": 1.1521386908619573, + "grad_norm": 0.043576162308454514, + "learning_rate": 0.00015821594131149017, + "loss": 0.3182, + "step": 14222 + }, + { + "epoch": 1.1522197018794555, + "grad_norm": 0.04142558574676514, + "learning_rate": 0.00015821144065889556, + "loss": 0.2998, + "step": 14223 + }, + { + "epoch": 1.152300712896954, + "grad_norm": 0.039318881928920746, + "learning_rate": 0.00015820694000630092, + "loss": 0.314, + "step": 14224 + }, + { + "epoch": 1.1523817239144525, + "grad_norm": 0.03945578262209892, + "learning_rate": 0.00015820243935370628, + "loss": 0.2977, + "step": 14225 + }, + { + "epoch": 1.1524627349319507, + "grad_norm": 0.0380149707198143, + "learning_rate": 0.00015819793870111166, + "loss": 0.29, + "step": 14226 + }, + { + "epoch": 1.1525437459494492, + "grad_norm": 0.03710747882723808, + "learning_rate": 0.00015819343804851705, + "loss": 0.2713, + "step": 14227 + }, + { + "epoch": 1.1526247569669474, + "grad_norm": 0.04076594486832619, + "learning_rate": 0.0001581889373959224, + "loss": 0.305, + "step": 14228 + }, + { + "epoch": 1.152705767984446, + "grad_norm": 0.03769194334745407, + "learning_rate": 0.0001581844367433278, + "loss": 0.2675, + "step": 14229 + }, + { + "epoch": 1.1527867790019442, + "grad_norm": 0.038101229816675186, + "learning_rate": 0.00015817993609073316, + "loss": 0.2922, + "step": 14230 + }, + { + "epoch": 1.1528677900194426, + "grad_norm": 0.04154670238494873, + "learning_rate": 0.00015817543543813852, + "loss": 0.3006, + "step": 14231 + }, + { + "epoch": 1.152948801036941, + "grad_norm": 0.04486631602048874, + "learning_rate": 0.0001581709347855439, + "loss": 0.3604, + "step": 14232 + }, + { + "epoch": 1.1530298120544393, + "grad_norm": 0.04321269318461418, + "learning_rate": 0.0001581664341329493, + "loss": 0.3726, + "step": 14233 + }, + { + "epoch": 1.1531108230719378, + "grad_norm": 0.039845582097768784, + "learning_rate": 0.00015816193348035465, + "loss": 0.2857, + "step": 14234 + }, + { + "epoch": 1.1531918340894363, + "grad_norm": 0.03998946771025658, + "learning_rate": 0.00015815743282776004, + "loss": 0.2798, + "step": 14235 + }, + { + "epoch": 1.1532728451069345, + "grad_norm": 0.03772175684571266, + "learning_rate": 0.0001581529321751654, + "loss": 0.2998, + "step": 14236 + }, + { + "epoch": 1.153353856124433, + "grad_norm": 0.038299474865198135, + "learning_rate": 0.00015814843152257076, + "loss": 0.3004, + "step": 14237 + }, + { + "epoch": 1.1534348671419312, + "grad_norm": 0.035598307847976685, + "learning_rate": 0.00015814393086997617, + "loss": 0.2991, + "step": 14238 + }, + { + "epoch": 1.1535158781594297, + "grad_norm": 0.04178384318947792, + "learning_rate": 0.00015813943021738153, + "loss": 0.3, + "step": 14239 + }, + { + "epoch": 1.1535968891769282, + "grad_norm": 0.04785295948386192, + "learning_rate": 0.0001581349295647869, + "loss": 0.319, + "step": 14240 + }, + { + "epoch": 1.1536779001944264, + "grad_norm": 0.038838669657707214, + "learning_rate": 0.00015813042891219228, + "loss": 0.2968, + "step": 14241 + }, + { + "epoch": 
1.1537589112119249, + "grad_norm": 0.034681666642427444, + "learning_rate": 0.00015812592825959764, + "loss": 0.294, + "step": 14242 + }, + { + "epoch": 1.153839922229423, + "grad_norm": 0.03547977656126022, + "learning_rate": 0.000158121427607003, + "loss": 0.2661, + "step": 14243 + }, + { + "epoch": 1.1539209332469216, + "grad_norm": 0.049016062170267105, + "learning_rate": 0.00015811692695440842, + "loss": 0.3183, + "step": 14244 + }, + { + "epoch": 1.15400194426442, + "grad_norm": 0.04250559210777283, + "learning_rate": 0.00015811242630181378, + "loss": 0.2893, + "step": 14245 + }, + { + "epoch": 1.1540829552819183, + "grad_norm": 0.03848905861377716, + "learning_rate": 0.00015810792564921914, + "loss": 0.3089, + "step": 14246 + }, + { + "epoch": 1.1541639662994168, + "grad_norm": 0.033761803060770035, + "learning_rate": 0.00015810342499662452, + "loss": 0.2733, + "step": 14247 + }, + { + "epoch": 1.1542449773169152, + "grad_norm": 0.039992865175008774, + "learning_rate": 0.00015809892434402988, + "loss": 0.3329, + "step": 14248 + }, + { + "epoch": 1.1543259883344135, + "grad_norm": 0.04326672852039337, + "learning_rate": 0.00015809442369143524, + "loss": 0.3033, + "step": 14249 + }, + { + "epoch": 1.154406999351912, + "grad_norm": 0.04974553734064102, + "learning_rate": 0.00015808992303884066, + "loss": 0.3068, + "step": 14250 + }, + { + "epoch": 1.1544880103694102, + "grad_norm": 0.03991048410534859, + "learning_rate": 0.00015808542238624602, + "loss": 0.328, + "step": 14251 + }, + { + "epoch": 1.1545690213869086, + "grad_norm": 0.04298831522464752, + "learning_rate": 0.00015808092173365138, + "loss": 0.2872, + "step": 14252 + }, + { + "epoch": 1.154650032404407, + "grad_norm": 0.0435427762567997, + "learning_rate": 0.00015807642108105676, + "loss": 0.336, + "step": 14253 + }, + { + "epoch": 1.1547310434219054, + "grad_norm": 0.043134018778800964, + "learning_rate": 0.00015807192042846212, + "loss": 0.2864, + "step": 14254 + }, + { + "epoch": 1.1548120544394038, + "grad_norm": 0.04340551421046257, + "learning_rate": 0.00015806741977586748, + "loss": 0.339, + "step": 14255 + }, + { + "epoch": 1.154893065456902, + "grad_norm": 0.0379253514111042, + "learning_rate": 0.0001580629191232729, + "loss": 0.3031, + "step": 14256 + }, + { + "epoch": 1.1549740764744005, + "grad_norm": 0.0484885573387146, + "learning_rate": 0.00015805841847067826, + "loss": 0.3403, + "step": 14257 + }, + { + "epoch": 1.155055087491899, + "grad_norm": 0.04308895766735077, + "learning_rate": 0.00015805391781808362, + "loss": 0.3047, + "step": 14258 + }, + { + "epoch": 1.1551360985093972, + "grad_norm": 0.04077281430363655, + "learning_rate": 0.000158049417165489, + "loss": 0.3278, + "step": 14259 + }, + { + "epoch": 1.1552171095268957, + "grad_norm": 0.038421981036663055, + "learning_rate": 0.00015804491651289437, + "loss": 0.3179, + "step": 14260 + }, + { + "epoch": 1.155298120544394, + "grad_norm": 0.036804962903261185, + "learning_rate": 0.00015804041586029975, + "loss": 0.2898, + "step": 14261 + }, + { + "epoch": 1.1553791315618924, + "grad_norm": 0.04037683829665184, + "learning_rate": 0.00015803591520770514, + "loss": 0.3247, + "step": 14262 + }, + { + "epoch": 1.155460142579391, + "grad_norm": 0.04343143478035927, + "learning_rate": 0.0001580314145551105, + "loss": 0.3185, + "step": 14263 + }, + { + "epoch": 1.1555411535968891, + "grad_norm": 0.03817358240485191, + "learning_rate": 0.00015802691390251586, + "loss": 0.2801, + "step": 14264 + }, + { + "epoch": 1.1556221646143876, + "grad_norm": 
0.0375547930598259, + "learning_rate": 0.00015802241324992125, + "loss": 0.3564, + "step": 14265 + }, + { + "epoch": 1.1557031756318858, + "grad_norm": 0.0396089144051075, + "learning_rate": 0.0001580179125973266, + "loss": 0.307, + "step": 14266 + }, + { + "epoch": 1.1557841866493843, + "grad_norm": 0.039912160485982895, + "learning_rate": 0.000158013411944732, + "loss": 0.3099, + "step": 14267 + }, + { + "epoch": 1.1558651976668828, + "grad_norm": 0.039689574390649796, + "learning_rate": 0.00015800891129213738, + "loss": 0.2731, + "step": 14268 + }, + { + "epoch": 1.155946208684381, + "grad_norm": 0.03810999542474747, + "learning_rate": 0.00015800441063954274, + "loss": 0.3546, + "step": 14269 + }, + { + "epoch": 1.1560272197018795, + "grad_norm": 0.04423951730132103, + "learning_rate": 0.0001579999099869481, + "loss": 0.3234, + "step": 14270 + }, + { + "epoch": 1.1561082307193777, + "grad_norm": 0.047452397644519806, + "learning_rate": 0.0001579954093343535, + "loss": 0.3106, + "step": 14271 + }, + { + "epoch": 1.1561892417368762, + "grad_norm": 0.03986579552292824, + "learning_rate": 0.00015799090868175885, + "loss": 0.3345, + "step": 14272 + }, + { + "epoch": 1.1562702527543747, + "grad_norm": 0.045032236725091934, + "learning_rate": 0.00015798640802916424, + "loss": 0.3599, + "step": 14273 + }, + { + "epoch": 1.156351263771873, + "grad_norm": 0.04494628682732582, + "learning_rate": 0.00015798190737656962, + "loss": 0.3585, + "step": 14274 + }, + { + "epoch": 1.1564322747893714, + "grad_norm": 0.035816531628370285, + "learning_rate": 0.00015797740672397498, + "loss": 0.3082, + "step": 14275 + }, + { + "epoch": 1.1565132858068696, + "grad_norm": 0.04182044044137001, + "learning_rate": 0.00015797290607138034, + "loss": 0.3045, + "step": 14276 + }, + { + "epoch": 1.156594296824368, + "grad_norm": 0.0457390621304512, + "learning_rate": 0.00015796840541878573, + "loss": 0.3367, + "step": 14277 + }, + { + "epoch": 1.1566753078418666, + "grad_norm": 0.03820004686713219, + "learning_rate": 0.0001579639047661911, + "loss": 0.2954, + "step": 14278 + }, + { + "epoch": 1.1567563188593648, + "grad_norm": 0.03487598896026611, + "learning_rate": 0.00015795940411359648, + "loss": 0.2639, + "step": 14279 + }, + { + "epoch": 1.1568373298768633, + "grad_norm": 0.04177277535200119, + "learning_rate": 0.00015795490346100187, + "loss": 0.3396, + "step": 14280 + }, + { + "epoch": 1.1569183408943617, + "grad_norm": 0.03737100213766098, + "learning_rate": 0.00015795040280840723, + "loss": 0.3039, + "step": 14281 + }, + { + "epoch": 1.15699935191186, + "grad_norm": 0.03937702625989914, + "learning_rate": 0.00015794590215581259, + "loss": 0.3154, + "step": 14282 + }, + { + "epoch": 1.1570803629293585, + "grad_norm": 0.04279126599431038, + "learning_rate": 0.00015794140150321797, + "loss": 0.3449, + "step": 14283 + }, + { + "epoch": 1.1571613739468567, + "grad_norm": 0.04076807200908661, + "learning_rate": 0.00015793690085062333, + "loss": 0.3244, + "step": 14284 + }, + { + "epoch": 1.1572423849643552, + "grad_norm": 0.04064446687698364, + "learning_rate": 0.00015793240019802872, + "loss": 0.2993, + "step": 14285 + }, + { + "epoch": 1.1573233959818534, + "grad_norm": 0.03838472068309784, + "learning_rate": 0.0001579278995454341, + "loss": 0.2863, + "step": 14286 + }, + { + "epoch": 1.1574044069993519, + "grad_norm": 0.04467492550611496, + "learning_rate": 0.00015792339889283947, + "loss": 0.3393, + "step": 14287 + }, + { + "epoch": 1.1574854180168503, + "grad_norm": 0.042215894907712936, + "learning_rate": 
0.00015791889824024483, + "loss": 0.2828, + "step": 14288 + }, + { + "epoch": 1.1575664290343486, + "grad_norm": 0.048905014991760254, + "learning_rate": 0.00015791439758765021, + "loss": 0.3662, + "step": 14289 + }, + { + "epoch": 1.157647440051847, + "grad_norm": 0.033709824085235596, + "learning_rate": 0.0001579098969350556, + "loss": 0.2772, + "step": 14290 + }, + { + "epoch": 1.1577284510693455, + "grad_norm": 0.049707137048244476, + "learning_rate": 0.00015790539628246096, + "loss": 0.3643, + "step": 14291 + }, + { + "epoch": 1.1578094620868438, + "grad_norm": 0.04020284488797188, + "learning_rate": 0.00015790089562986635, + "loss": 0.2992, + "step": 14292 + }, + { + "epoch": 1.1578904731043422, + "grad_norm": 0.04444952309131622, + "learning_rate": 0.0001578963949772717, + "loss": 0.3155, + "step": 14293 + }, + { + "epoch": 1.1579714841218405, + "grad_norm": 0.040825799107551575, + "learning_rate": 0.00015789189432467707, + "loss": 0.2988, + "step": 14294 + }, + { + "epoch": 1.158052495139339, + "grad_norm": 0.040664974600076675, + "learning_rate": 0.00015788739367208246, + "loss": 0.3179, + "step": 14295 + }, + { + "epoch": 1.1581335061568374, + "grad_norm": 0.03809911012649536, + "learning_rate": 0.00015788289301948784, + "loss": 0.3052, + "step": 14296 + }, + { + "epoch": 1.1582145171743357, + "grad_norm": 0.03830299898982048, + "learning_rate": 0.0001578783923668932, + "loss": 0.2803, + "step": 14297 + }, + { + "epoch": 1.1582955281918341, + "grad_norm": 0.04559274762868881, + "learning_rate": 0.0001578738917142986, + "loss": 0.2901, + "step": 14298 + }, + { + "epoch": 1.1583765392093324, + "grad_norm": 0.03717794641852379, + "learning_rate": 0.00015786939106170395, + "loss": 0.3092, + "step": 14299 + }, + { + "epoch": 1.1584575502268308, + "grad_norm": 0.03662274405360222, + "learning_rate": 0.0001578648904091093, + "loss": 0.2942, + "step": 14300 + }, + { + "epoch": 1.1585385612443293, + "grad_norm": 0.045669075101614, + "learning_rate": 0.0001578603897565147, + "loss": 0.3138, + "step": 14301 + }, + { + "epoch": 1.1586195722618275, + "grad_norm": 0.042246490716934204, + "learning_rate": 0.00015785588910392008, + "loss": 0.3654, + "step": 14302 + }, + { + "epoch": 1.158700583279326, + "grad_norm": 0.04223867878317833, + "learning_rate": 0.00015785138845132544, + "loss": 0.3255, + "step": 14303 + }, + { + "epoch": 1.1587815942968245, + "grad_norm": 0.036577608436346054, + "learning_rate": 0.00015784688779873083, + "loss": 0.2949, + "step": 14304 + }, + { + "epoch": 1.1588626053143227, + "grad_norm": 0.04074166342616081, + "learning_rate": 0.0001578423871461362, + "loss": 0.3073, + "step": 14305 + }, + { + "epoch": 1.1589436163318212, + "grad_norm": 0.03720313683152199, + "learning_rate": 0.00015783788649354155, + "loss": 0.2975, + "step": 14306 + }, + { + "epoch": 1.1590246273493194, + "grad_norm": 0.043085984885692596, + "learning_rate": 0.00015783338584094694, + "loss": 0.3295, + "step": 14307 + }, + { + "epoch": 1.159105638366818, + "grad_norm": 0.04072624817490578, + "learning_rate": 0.00015782888518835233, + "loss": 0.3283, + "step": 14308 + }, + { + "epoch": 1.1591866493843161, + "grad_norm": 0.03722485899925232, + "learning_rate": 0.00015782438453575769, + "loss": 0.304, + "step": 14309 + }, + { + "epoch": 1.1592676604018146, + "grad_norm": 0.036215007305145264, + "learning_rate": 0.00015781988388316307, + "loss": 0.3029, + "step": 14310 + }, + { + "epoch": 1.159348671419313, + "grad_norm": 0.04115285351872444, + "learning_rate": 0.00015781538323056843, + "loss": 0.3074, 
+ "step": 14311 + }, + { + "epoch": 1.1594296824368113, + "grad_norm": 0.04612356796860695, + "learning_rate": 0.0001578108825779738, + "loss": 0.2944, + "step": 14312 + }, + { + "epoch": 1.1595106934543098, + "grad_norm": 0.04167279973626137, + "learning_rate": 0.0001578063819253792, + "loss": 0.3255, + "step": 14313 + }, + { + "epoch": 1.1595917044718083, + "grad_norm": 0.04015865549445152, + "learning_rate": 0.00015780188127278457, + "loss": 0.2837, + "step": 14314 + }, + { + "epoch": 1.1596727154893065, + "grad_norm": 0.04215192049741745, + "learning_rate": 0.00015779738062018993, + "loss": 0.3126, + "step": 14315 + }, + { + "epoch": 1.159753726506805, + "grad_norm": 0.04285452142357826, + "learning_rate": 0.00015779287996759531, + "loss": 0.3375, + "step": 14316 + }, + { + "epoch": 1.1598347375243032, + "grad_norm": 0.03980492800474167, + "learning_rate": 0.00015778837931500067, + "loss": 0.2828, + "step": 14317 + }, + { + "epoch": 1.1599157485418017, + "grad_norm": 0.03826780244708061, + "learning_rate": 0.00015778387866240603, + "loss": 0.2739, + "step": 14318 + }, + { + "epoch": 1.1599967595593002, + "grad_norm": 0.038279950618743896, + "learning_rate": 0.00015777937800981145, + "loss": 0.2948, + "step": 14319 + }, + { + "epoch": 1.1600777705767984, + "grad_norm": 0.037352073937654495, + "learning_rate": 0.0001577748773572168, + "loss": 0.2801, + "step": 14320 + }, + { + "epoch": 1.1601587815942969, + "grad_norm": 0.03508422523736954, + "learning_rate": 0.00015777037670462217, + "loss": 0.2843, + "step": 14321 + }, + { + "epoch": 1.160239792611795, + "grad_norm": 0.03702421486377716, + "learning_rate": 0.00015776587605202756, + "loss": 0.2824, + "step": 14322 + }, + { + "epoch": 1.1603208036292936, + "grad_norm": 0.04090304300189018, + "learning_rate": 0.00015776137539943292, + "loss": 0.2995, + "step": 14323 + }, + { + "epoch": 1.160401814646792, + "grad_norm": 0.049013279378414154, + "learning_rate": 0.00015775687474683828, + "loss": 0.2739, + "step": 14324 + }, + { + "epoch": 1.1604828256642903, + "grad_norm": 0.034894946962594986, + "learning_rate": 0.0001577523740942437, + "loss": 0.2742, + "step": 14325 + }, + { + "epoch": 1.1605638366817888, + "grad_norm": 0.04372987896203995, + "learning_rate": 0.00015774787344164905, + "loss": 0.3516, + "step": 14326 + }, + { + "epoch": 1.1606448476992872, + "grad_norm": 0.04152096062898636, + "learning_rate": 0.0001577433727890544, + "loss": 0.3299, + "step": 14327 + }, + { + "epoch": 1.1607258587167855, + "grad_norm": 0.032177865505218506, + "learning_rate": 0.0001577388721364598, + "loss": 0.252, + "step": 14328 + }, + { + "epoch": 1.160806869734284, + "grad_norm": 0.043868500739336014, + "learning_rate": 0.00015773437148386516, + "loss": 0.325, + "step": 14329 + }, + { + "epoch": 1.1608878807517822, + "grad_norm": 0.044172488152980804, + "learning_rate": 0.00015772987083127055, + "loss": 0.3035, + "step": 14330 + }, + { + "epoch": 1.1609688917692806, + "grad_norm": 0.03395839408040047, + "learning_rate": 0.00015772537017867593, + "loss": 0.2612, + "step": 14331 + }, + { + "epoch": 1.1610499027867789, + "grad_norm": 0.03549252077937126, + "learning_rate": 0.0001577208695260813, + "loss": 0.2882, + "step": 14332 + }, + { + "epoch": 1.1611309138042774, + "grad_norm": 0.04983199015259743, + "learning_rate": 0.00015771636887348665, + "loss": 0.3429, + "step": 14333 + }, + { + "epoch": 1.1612119248217758, + "grad_norm": 0.03520864620804787, + "learning_rate": 0.00015771186822089204, + "loss": 0.2724, + "step": 14334 + }, + { + "epoch": 
1.161292935839274, + "grad_norm": 0.04003923013806343, + "learning_rate": 0.0001577073675682974, + "loss": 0.3025, + "step": 14335 + }, + { + "epoch": 1.1613739468567725, + "grad_norm": 0.037281379103660583, + "learning_rate": 0.0001577028669157028, + "loss": 0.3092, + "step": 14336 + }, + { + "epoch": 1.161454957874271, + "grad_norm": 0.03576560318470001, + "learning_rate": 0.00015769836626310817, + "loss": 0.3162, + "step": 14337 + }, + { + "epoch": 1.1615359688917692, + "grad_norm": 0.039813894778490067, + "learning_rate": 0.00015769386561051353, + "loss": 0.3403, + "step": 14338 + }, + { + "epoch": 1.1616169799092677, + "grad_norm": 0.039217110723257065, + "learning_rate": 0.0001576893649579189, + "loss": 0.3053, + "step": 14339 + }, + { + "epoch": 1.161697990926766, + "grad_norm": 0.04114118218421936, + "learning_rate": 0.00015768486430532428, + "loss": 0.2588, + "step": 14340 + }, + { + "epoch": 1.1617790019442644, + "grad_norm": 0.04316568747162819, + "learning_rate": 0.00015768036365272964, + "loss": 0.336, + "step": 14341 + }, + { + "epoch": 1.161860012961763, + "grad_norm": 0.03854662552475929, + "learning_rate": 0.00015767586300013503, + "loss": 0.2715, + "step": 14342 + }, + { + "epoch": 1.1619410239792611, + "grad_norm": 0.041014164686203, + "learning_rate": 0.00015767136234754042, + "loss": 0.3493, + "step": 14343 + }, + { + "epoch": 1.1620220349967596, + "grad_norm": 0.04309874773025513, + "learning_rate": 0.00015766686169494578, + "loss": 0.3256, + "step": 14344 + }, + { + "epoch": 1.1621030460142578, + "grad_norm": 0.039331283420324326, + "learning_rate": 0.00015766236104235114, + "loss": 0.2745, + "step": 14345 + }, + { + "epoch": 1.1621840570317563, + "grad_norm": 0.04590243846178055, + "learning_rate": 0.00015765786038975652, + "loss": 0.2849, + "step": 14346 + }, + { + "epoch": 1.1622650680492548, + "grad_norm": 0.04023681581020355, + "learning_rate": 0.00015765335973716188, + "loss": 0.299, + "step": 14347 + }, + { + "epoch": 1.162346079066753, + "grad_norm": 0.03895244002342224, + "learning_rate": 0.00015764885908456727, + "loss": 0.2907, + "step": 14348 + }, + { + "epoch": 1.1624270900842515, + "grad_norm": 0.03700018674135208, + "learning_rate": 0.00015764435843197266, + "loss": 0.2937, + "step": 14349 + }, + { + "epoch": 1.16250810110175, + "grad_norm": 0.038235098123550415, + "learning_rate": 0.00015763985777937802, + "loss": 0.2813, + "step": 14350 + }, + { + "epoch": 1.1625891121192482, + "grad_norm": 0.04438399523496628, + "learning_rate": 0.00015763535712678338, + "loss": 0.329, + "step": 14351 + }, + { + "epoch": 1.1626701231367467, + "grad_norm": 0.04124677553772926, + "learning_rate": 0.00015763085647418876, + "loss": 0.3366, + "step": 14352 + }, + { + "epoch": 1.162751134154245, + "grad_norm": 0.04604765772819519, + "learning_rate": 0.00015762635582159412, + "loss": 0.3107, + "step": 14353 + }, + { + "epoch": 1.1628321451717434, + "grad_norm": 0.0400199219584465, + "learning_rate": 0.0001576218551689995, + "loss": 0.2957, + "step": 14354 + }, + { + "epoch": 1.1629131561892416, + "grad_norm": 0.03940681368112564, + "learning_rate": 0.0001576173545164049, + "loss": 0.3156, + "step": 14355 + }, + { + "epoch": 1.16299416720674, + "grad_norm": 0.043107450008392334, + "learning_rate": 0.00015761285386381026, + "loss": 0.3307, + "step": 14356 + }, + { + "epoch": 1.1630751782242386, + "grad_norm": 0.03884882107377052, + "learning_rate": 0.00015760835321121562, + "loss": 0.2736, + "step": 14357 + }, + { + "epoch": 1.1631561892417368, + "grad_norm": 
0.034172337502241135, + "learning_rate": 0.000157603852558621, + "loss": 0.2895, + "step": 14358 + }, + { + "epoch": 1.1632372002592353, + "grad_norm": 0.041414301842451096, + "learning_rate": 0.00015759935190602637, + "loss": 0.3168, + "step": 14359 + }, + { + "epoch": 1.1633182112767337, + "grad_norm": 0.03927793353796005, + "learning_rate": 0.00015759485125343175, + "loss": 0.303, + "step": 14360 + }, + { + "epoch": 1.163399222294232, + "grad_norm": 0.04739841818809509, + "learning_rate": 0.00015759035060083714, + "loss": 0.3275, + "step": 14361 + }, + { + "epoch": 1.1634802333117304, + "grad_norm": 0.0419585183262825, + "learning_rate": 0.0001575858499482425, + "loss": 0.3686, + "step": 14362 + }, + { + "epoch": 1.1635612443292287, + "grad_norm": 0.042601704597473145, + "learning_rate": 0.00015758134929564786, + "loss": 0.3097, + "step": 14363 + }, + { + "epoch": 1.1636422553467272, + "grad_norm": 0.0376935750246048, + "learning_rate": 0.00015757684864305325, + "loss": 0.3189, + "step": 14364 + }, + { + "epoch": 1.1637232663642256, + "grad_norm": 0.03754288703203201, + "learning_rate": 0.0001575723479904586, + "loss": 0.283, + "step": 14365 + }, + { + "epoch": 1.1638042773817239, + "grad_norm": 0.03639476001262665, + "learning_rate": 0.000157567847337864, + "loss": 0.2776, + "step": 14366 + }, + { + "epoch": 1.1638852883992223, + "grad_norm": 0.04828599467873573, + "learning_rate": 0.00015756334668526938, + "loss": 0.3138, + "step": 14367 + }, + { + "epoch": 1.1639662994167206, + "grad_norm": 0.04046766087412834, + "learning_rate": 0.00015755884603267474, + "loss": 0.305, + "step": 14368 + }, + { + "epoch": 1.164047310434219, + "grad_norm": 0.03723064810037613, + "learning_rate": 0.0001575543453800801, + "loss": 0.2456, + "step": 14369 + }, + { + "epoch": 1.1641283214517175, + "grad_norm": 0.03720134496688843, + "learning_rate": 0.0001575498447274855, + "loss": 0.2928, + "step": 14370 + }, + { + "epoch": 1.1642093324692158, + "grad_norm": 0.03799651190638542, + "learning_rate": 0.00015754534407489088, + "loss": 0.2884, + "step": 14371 + }, + { + "epoch": 1.1642903434867142, + "grad_norm": 0.04502912238240242, + "learning_rate": 0.00015754084342229624, + "loss": 0.3492, + "step": 14372 + }, + { + "epoch": 1.1643713545042127, + "grad_norm": 0.046434637159109116, + "learning_rate": 0.00015753634276970162, + "loss": 0.3515, + "step": 14373 + }, + { + "epoch": 1.164452365521711, + "grad_norm": 0.039329420775175095, + "learning_rate": 0.00015753184211710698, + "loss": 0.3143, + "step": 14374 + }, + { + "epoch": 1.1645333765392094, + "grad_norm": 0.03723298758268356, + "learning_rate": 0.00015752734146451234, + "loss": 0.2921, + "step": 14375 + }, + { + "epoch": 1.1646143875567077, + "grad_norm": 0.04591132700443268, + "learning_rate": 0.00015752284081191773, + "loss": 0.3352, + "step": 14376 + }, + { + "epoch": 1.1646953985742061, + "grad_norm": 0.03952774778008461, + "learning_rate": 0.00015751834015932312, + "loss": 0.2851, + "step": 14377 + }, + { + "epoch": 1.1647764095917044, + "grad_norm": 0.04761730879545212, + "learning_rate": 0.00015751383950672848, + "loss": 0.335, + "step": 14378 + }, + { + "epoch": 1.1648574206092028, + "grad_norm": 0.041464634239673615, + "learning_rate": 0.00015750933885413387, + "loss": 0.3514, + "step": 14379 + }, + { + "epoch": 1.1649384316267013, + "grad_norm": 0.03948846831917763, + "learning_rate": 0.00015750483820153923, + "loss": 0.3129, + "step": 14380 + }, + { + "epoch": 1.1650194426441995, + "grad_norm": 0.03981523960828781, + "learning_rate": 
0.00015750033754894459, + "loss": 0.3009, + "step": 14381 + }, + { + "epoch": 1.165100453661698, + "grad_norm": 0.03930754214525223, + "learning_rate": 0.00015749583689634997, + "loss": 0.2811, + "step": 14382 + }, + { + "epoch": 1.1651814646791965, + "grad_norm": 0.03473753482103348, + "learning_rate": 0.00015749133624375536, + "loss": 0.2982, + "step": 14383 + }, + { + "epoch": 1.1652624756966947, + "grad_norm": 0.040980495512485504, + "learning_rate": 0.00015748683559116072, + "loss": 0.3249, + "step": 14384 + }, + { + "epoch": 1.1653434867141932, + "grad_norm": 0.04522862285375595, + "learning_rate": 0.0001574823349385661, + "loss": 0.3463, + "step": 14385 + }, + { + "epoch": 1.1654244977316914, + "grad_norm": 0.040367692708969116, + "learning_rate": 0.00015747783428597147, + "loss": 0.325, + "step": 14386 + }, + { + "epoch": 1.16550550874919, + "grad_norm": 0.03818906843662262, + "learning_rate": 0.00015747333363337683, + "loss": 0.3203, + "step": 14387 + }, + { + "epoch": 1.1655865197666881, + "grad_norm": 0.03720659017562866, + "learning_rate": 0.00015746883298078221, + "loss": 0.2866, + "step": 14388 + }, + { + "epoch": 1.1656675307841866, + "grad_norm": 0.0384877473115921, + "learning_rate": 0.0001574643323281876, + "loss": 0.2803, + "step": 14389 + }, + { + "epoch": 1.165748541801685, + "grad_norm": 0.04146861657500267, + "learning_rate": 0.00015745983167559296, + "loss": 0.3333, + "step": 14390 + }, + { + "epoch": 1.1658295528191833, + "grad_norm": 0.04111800342798233, + "learning_rate": 0.00015745533102299835, + "loss": 0.2812, + "step": 14391 + }, + { + "epoch": 1.1659105638366818, + "grad_norm": 0.04299687221646309, + "learning_rate": 0.0001574508303704037, + "loss": 0.382, + "step": 14392 + }, + { + "epoch": 1.1659915748541803, + "grad_norm": 0.03818053752183914, + "learning_rate": 0.0001574463297178091, + "loss": 0.2905, + "step": 14393 + }, + { + "epoch": 1.1660725858716785, + "grad_norm": 0.043594978749752045, + "learning_rate": 0.00015744182906521448, + "loss": 0.2981, + "step": 14394 + }, + { + "epoch": 1.166153596889177, + "grad_norm": 0.04402077943086624, + "learning_rate": 0.00015743732841261984, + "loss": 0.2602, + "step": 14395 + }, + { + "epoch": 1.1662346079066752, + "grad_norm": 0.03786975145339966, + "learning_rate": 0.0001574328277600252, + "loss": 0.2975, + "step": 14396 + }, + { + "epoch": 1.1663156189241737, + "grad_norm": 0.04601101204752922, + "learning_rate": 0.0001574283271074306, + "loss": 0.3394, + "step": 14397 + }, + { + "epoch": 1.1663966299416721, + "grad_norm": 0.04181542620062828, + "learning_rate": 0.00015742382645483595, + "loss": 0.3478, + "step": 14398 + }, + { + "epoch": 1.1664776409591704, + "grad_norm": 0.05040718615055084, + "learning_rate": 0.00015741932580224134, + "loss": 0.313, + "step": 14399 + }, + { + "epoch": 1.1665586519766689, + "grad_norm": 0.04258178547024727, + "learning_rate": 0.00015741482514964672, + "loss": 0.3442, + "step": 14400 + }, + { + "epoch": 1.166639662994167, + "grad_norm": 0.038349322974681854, + "learning_rate": 0.00015741032449705208, + "loss": 0.3307, + "step": 14401 + }, + { + "epoch": 1.1667206740116656, + "grad_norm": 0.04305305331945419, + "learning_rate": 0.00015740582384445744, + "loss": 0.3368, + "step": 14402 + }, + { + "epoch": 1.166801685029164, + "grad_norm": 0.04231201857328415, + "learning_rate": 0.00015740132319186283, + "loss": 0.3361, + "step": 14403 + }, + { + "epoch": 1.1668826960466623, + "grad_norm": 0.04161004349589348, + "learning_rate": 0.0001573968225392682, + "loss": 0.3189, + 
"step": 14404 + }, + { + "epoch": 1.1669637070641607, + "grad_norm": 0.03625449910759926, + "learning_rate": 0.00015739232188667358, + "loss": 0.3041, + "step": 14405 + }, + { + "epoch": 1.1670447180816592, + "grad_norm": 0.03723830729722977, + "learning_rate": 0.00015738782123407897, + "loss": 0.2839, + "step": 14406 + }, + { + "epoch": 1.1671257290991575, + "grad_norm": 0.04593285173177719, + "learning_rate": 0.00015738332058148433, + "loss": 0.3139, + "step": 14407 + }, + { + "epoch": 1.167206740116656, + "grad_norm": 0.044112276285886765, + "learning_rate": 0.00015737881992888969, + "loss": 0.3136, + "step": 14408 + }, + { + "epoch": 1.1672877511341542, + "grad_norm": 0.04640216380357742, + "learning_rate": 0.00015737431927629507, + "loss": 0.3331, + "step": 14409 + }, + { + "epoch": 1.1673687621516526, + "grad_norm": 0.03842916712164879, + "learning_rate": 0.00015736981862370043, + "loss": 0.3166, + "step": 14410 + }, + { + "epoch": 1.1674497731691509, + "grad_norm": 0.03956562653183937, + "learning_rate": 0.00015736531797110582, + "loss": 0.3116, + "step": 14411 + }, + { + "epoch": 1.1675307841866494, + "grad_norm": 0.03777385875582695, + "learning_rate": 0.0001573608173185112, + "loss": 0.2882, + "step": 14412 + }, + { + "epoch": 1.1676117952041478, + "grad_norm": 0.03600339964032173, + "learning_rate": 0.00015735631666591657, + "loss": 0.2897, + "step": 14413 + }, + { + "epoch": 1.167692806221646, + "grad_norm": 0.042145904153585434, + "learning_rate": 0.00015735181601332193, + "loss": 0.3072, + "step": 14414 + }, + { + "epoch": 1.1677738172391445, + "grad_norm": 0.04632596671581268, + "learning_rate": 0.00015734731536072732, + "loss": 0.3458, + "step": 14415 + }, + { + "epoch": 1.167854828256643, + "grad_norm": 0.04889935255050659, + "learning_rate": 0.00015734281470813268, + "loss": 0.2927, + "step": 14416 + }, + { + "epoch": 1.1679358392741412, + "grad_norm": 0.04121779277920723, + "learning_rate": 0.00015733831405553806, + "loss": 0.3133, + "step": 14417 + }, + { + "epoch": 1.1680168502916397, + "grad_norm": 0.04164166748523712, + "learning_rate": 0.00015733381340294345, + "loss": 0.3279, + "step": 14418 + }, + { + "epoch": 1.168097861309138, + "grad_norm": 0.03872714564204216, + "learning_rate": 0.0001573293127503488, + "loss": 0.2808, + "step": 14419 + }, + { + "epoch": 1.1681788723266364, + "grad_norm": 0.04460643604397774, + "learning_rate": 0.00015732481209775417, + "loss": 0.3182, + "step": 14420 + }, + { + "epoch": 1.1682598833441349, + "grad_norm": 0.045483093708753586, + "learning_rate": 0.00015732031144515956, + "loss": 0.3827, + "step": 14421 + }, + { + "epoch": 1.1683408943616331, + "grad_norm": 0.03746965900063515, + "learning_rate": 0.00015731581079256492, + "loss": 0.3131, + "step": 14422 + }, + { + "epoch": 1.1684219053791316, + "grad_norm": 0.03765581175684929, + "learning_rate": 0.0001573113101399703, + "loss": 0.3005, + "step": 14423 + }, + { + "epoch": 1.1685029163966298, + "grad_norm": 0.04205656424164772, + "learning_rate": 0.0001573068094873757, + "loss": 0.2785, + "step": 14424 + }, + { + "epoch": 1.1685839274141283, + "grad_norm": 0.042121246457099915, + "learning_rate": 0.00015730230883478105, + "loss": 0.3031, + "step": 14425 + }, + { + "epoch": 1.1686649384316268, + "grad_norm": 0.03836040943861008, + "learning_rate": 0.0001572978081821864, + "loss": 0.2686, + "step": 14426 + }, + { + "epoch": 1.168745949449125, + "grad_norm": 0.03815663978457451, + "learning_rate": 0.0001572933075295918, + "loss": 0.3018, + "step": 14427 + }, + { + "epoch": 
1.1688269604666235, + "grad_norm": 0.04434993490576744, + "learning_rate": 0.00015728880687699716, + "loss": 0.3274, + "step": 14428 + }, + { + "epoch": 1.168907971484122, + "grad_norm": 0.03531665354967117, + "learning_rate": 0.00015728430622440255, + "loss": 0.2539, + "step": 14429 + }, + { + "epoch": 1.1689889825016202, + "grad_norm": 0.039583683013916016, + "learning_rate": 0.00015727980557180793, + "loss": 0.2733, + "step": 14430 + }, + { + "epoch": 1.1690699935191187, + "grad_norm": 0.04048553854227066, + "learning_rate": 0.0001572753049192133, + "loss": 0.3116, + "step": 14431 + }, + { + "epoch": 1.169151004536617, + "grad_norm": 0.04091466963291168, + "learning_rate": 0.00015727080426661865, + "loss": 0.3222, + "step": 14432 + }, + { + "epoch": 1.1692320155541154, + "grad_norm": 0.044963400810956955, + "learning_rate": 0.00015726630361402404, + "loss": 0.3188, + "step": 14433 + }, + { + "epoch": 1.1693130265716136, + "grad_norm": 0.03848746791481972, + "learning_rate": 0.0001572618029614294, + "loss": 0.2883, + "step": 14434 + }, + { + "epoch": 1.169394037589112, + "grad_norm": 0.045976340770721436, + "learning_rate": 0.0001572573023088348, + "loss": 0.2802, + "step": 14435 + }, + { + "epoch": 1.1694750486066106, + "grad_norm": 0.04464418813586235, + "learning_rate": 0.00015725280165624017, + "loss": 0.3392, + "step": 14436 + }, + { + "epoch": 1.1695560596241088, + "grad_norm": 0.040776655077934265, + "learning_rate": 0.00015724830100364553, + "loss": 0.2888, + "step": 14437 + }, + { + "epoch": 1.1696370706416073, + "grad_norm": 0.04193189740180969, + "learning_rate": 0.0001572438003510509, + "loss": 0.32, + "step": 14438 + }, + { + "epoch": 1.1697180816591057, + "grad_norm": 0.038984332233667374, + "learning_rate": 0.00015723929969845628, + "loss": 0.299, + "step": 14439 + }, + { + "epoch": 1.169799092676604, + "grad_norm": 0.04495490342378616, + "learning_rate": 0.00015723479904586164, + "loss": 0.2879, + "step": 14440 + }, + { + "epoch": 1.1698801036941024, + "grad_norm": 0.03746641427278519, + "learning_rate": 0.00015723029839326703, + "loss": 0.3071, + "step": 14441 + }, + { + "epoch": 1.1699611147116007, + "grad_norm": 0.04167228192090988, + "learning_rate": 0.00015722579774067242, + "loss": 0.2975, + "step": 14442 + }, + { + "epoch": 1.1700421257290992, + "grad_norm": 0.041208457201719284, + "learning_rate": 0.00015722129708807778, + "loss": 0.3123, + "step": 14443 + }, + { + "epoch": 1.1701231367465976, + "grad_norm": 0.03645787760615349, + "learning_rate": 0.00015721679643548314, + "loss": 0.298, + "step": 14444 + }, + { + "epoch": 1.1702041477640959, + "grad_norm": 0.035555098205804825, + "learning_rate": 0.00015721229578288852, + "loss": 0.2618, + "step": 14445 + }, + { + "epoch": 1.1702851587815943, + "grad_norm": 0.04322396591305733, + "learning_rate": 0.0001572077951302939, + "loss": 0.2824, + "step": 14446 + }, + { + "epoch": 1.1703661697990926, + "grad_norm": 0.04139561951160431, + "learning_rate": 0.00015720329447769927, + "loss": 0.3275, + "step": 14447 + }, + { + "epoch": 1.170447180816591, + "grad_norm": 0.049311134964227676, + "learning_rate": 0.00015719879382510466, + "loss": 0.3481, + "step": 14448 + }, + { + "epoch": 1.1705281918340895, + "grad_norm": 0.04070025682449341, + "learning_rate": 0.00015719429317251002, + "loss": 0.3049, + "step": 14449 + }, + { + "epoch": 1.1706092028515878, + "grad_norm": 0.04206862300634384, + "learning_rate": 0.00015718979251991538, + "loss": 0.2965, + "step": 14450 + }, + { + "epoch": 1.1706902138690862, + "grad_norm": 
0.047473225742578506, + "learning_rate": 0.00015718529186732076, + "loss": 0.3326, + "step": 14451 + }, + { + "epoch": 1.1707712248865847, + "grad_norm": 0.0422075018286705, + "learning_rate": 0.00015718079121472615, + "loss": 0.3224, + "step": 14452 + }, + { + "epoch": 1.170852235904083, + "grad_norm": 0.04035336151719093, + "learning_rate": 0.0001571762905621315, + "loss": 0.3277, + "step": 14453 + }, + { + "epoch": 1.1709332469215814, + "grad_norm": 0.041684288531541824, + "learning_rate": 0.0001571717899095369, + "loss": 0.2951, + "step": 14454 + }, + { + "epoch": 1.1710142579390797, + "grad_norm": 0.04003993794322014, + "learning_rate": 0.00015716728925694226, + "loss": 0.2992, + "step": 14455 + }, + { + "epoch": 1.1710952689565781, + "grad_norm": 0.03539128601551056, + "learning_rate": 0.00015716278860434762, + "loss": 0.2498, + "step": 14456 + }, + { + "epoch": 1.1711762799740764, + "grad_norm": 0.03944115713238716, + "learning_rate": 0.000157158287951753, + "loss": 0.307, + "step": 14457 + }, + { + "epoch": 1.1712572909915748, + "grad_norm": 0.04354918375611305, + "learning_rate": 0.0001571537872991584, + "loss": 0.3214, + "step": 14458 + }, + { + "epoch": 1.1713383020090733, + "grad_norm": 0.03932163119316101, + "learning_rate": 0.00015714928664656375, + "loss": 0.3064, + "step": 14459 + }, + { + "epoch": 1.1714193130265715, + "grad_norm": 0.03566412627696991, + "learning_rate": 0.00015714478599396914, + "loss": 0.2781, + "step": 14460 + }, + { + "epoch": 1.17150032404407, + "grad_norm": 0.03720712289214134, + "learning_rate": 0.0001571402853413745, + "loss": 0.302, + "step": 14461 + }, + { + "epoch": 1.1715813350615685, + "grad_norm": 0.03921864926815033, + "learning_rate": 0.0001571357846887799, + "loss": 0.2906, + "step": 14462 + }, + { + "epoch": 1.1716623460790667, + "grad_norm": 0.037216588854789734, + "learning_rate": 0.00015713128403618525, + "loss": 0.295, + "step": 14463 + }, + { + "epoch": 1.1717433570965652, + "grad_norm": 0.03720587491989136, + "learning_rate": 0.00015712678338359064, + "loss": 0.3003, + "step": 14464 + }, + { + "epoch": 1.1718243681140634, + "grad_norm": 0.03727641701698303, + "learning_rate": 0.000157122282730996, + "loss": 0.2664, + "step": 14465 + }, + { + "epoch": 1.171905379131562, + "grad_norm": 0.04216910898685455, + "learning_rate": 0.00015711778207840138, + "loss": 0.3153, + "step": 14466 + }, + { + "epoch": 1.1719863901490604, + "grad_norm": 0.034736037254333496, + "learning_rate": 0.00015711328142580674, + "loss": 0.2879, + "step": 14467 + }, + { + "epoch": 1.1720674011665586, + "grad_norm": 0.04738327115774155, + "learning_rate": 0.00015710878077321213, + "loss": 0.316, + "step": 14468 + }, + { + "epoch": 1.172148412184057, + "grad_norm": 0.05080415681004524, + "learning_rate": 0.0001571042801206175, + "loss": 0.3563, + "step": 14469 + }, + { + "epoch": 1.1722294232015553, + "grad_norm": 0.038124267011880875, + "learning_rate": 0.00015709977946802288, + "loss": 0.3089, + "step": 14470 + }, + { + "epoch": 1.1723104342190538, + "grad_norm": 0.03749233856797218, + "learning_rate": 0.00015709527881542824, + "loss": 0.2939, + "step": 14471 + }, + { + "epoch": 1.1723914452365523, + "grad_norm": 0.0384153351187706, + "learning_rate": 0.00015709077816283362, + "loss": 0.2987, + "step": 14472 + }, + { + "epoch": 1.1724724562540505, + "grad_norm": 0.04736333712935448, + "learning_rate": 0.00015708627751023898, + "loss": 0.3691, + "step": 14473 + }, + { + "epoch": 1.172553467271549, + "grad_norm": 0.0412851981818676, + "learning_rate": 
0.00015708177685764437, + "loss": 0.3414, + "step": 14474 + }, + { + "epoch": 1.1726344782890474, + "grad_norm": 0.042260248214006424, + "learning_rate": 0.00015707727620504976, + "loss": 0.3084, + "step": 14475 + }, + { + "epoch": 1.1727154893065457, + "grad_norm": 0.03734608739614487, + "learning_rate": 0.00015707277555245512, + "loss": 0.2988, + "step": 14476 + }, + { + "epoch": 1.1727965003240441, + "grad_norm": 0.0376180000603199, + "learning_rate": 0.00015706827489986048, + "loss": 0.2771, + "step": 14477 + }, + { + "epoch": 1.1728775113415424, + "grad_norm": 0.05453381687402725, + "learning_rate": 0.00015706377424726587, + "loss": 0.3332, + "step": 14478 + }, + { + "epoch": 1.1729585223590409, + "grad_norm": 0.04309339076280594, + "learning_rate": 0.00015705927359467123, + "loss": 0.2636, + "step": 14479 + }, + { + "epoch": 1.173039533376539, + "grad_norm": 0.04677773267030716, + "learning_rate": 0.0001570547729420766, + "loss": 0.3339, + "step": 14480 + }, + { + "epoch": 1.1731205443940376, + "grad_norm": 0.04022238403558731, + "learning_rate": 0.000157050272289482, + "loss": 0.2977, + "step": 14481 + }, + { + "epoch": 1.173201555411536, + "grad_norm": 0.04072597250342369, + "learning_rate": 0.00015704577163688736, + "loss": 0.2985, + "step": 14482 + }, + { + "epoch": 1.1732825664290343, + "grad_norm": 0.04243917018175125, + "learning_rate": 0.00015704127098429272, + "loss": 0.3302, + "step": 14483 + }, + { + "epoch": 1.1733635774465327, + "grad_norm": 0.037157706916332245, + "learning_rate": 0.0001570367703316981, + "loss": 0.292, + "step": 14484 + }, + { + "epoch": 1.1734445884640312, + "grad_norm": 0.042577095329761505, + "learning_rate": 0.00015703226967910347, + "loss": 0.3118, + "step": 14485 + }, + { + "epoch": 1.1735255994815295, + "grad_norm": 0.040878184139728546, + "learning_rate": 0.00015702776902650885, + "loss": 0.2686, + "step": 14486 + }, + { + "epoch": 1.173606610499028, + "grad_norm": 0.042164161801338196, + "learning_rate": 0.00015702326837391424, + "loss": 0.3169, + "step": 14487 + }, + { + "epoch": 1.1736876215165262, + "grad_norm": 0.04308300465345383, + "learning_rate": 0.0001570187677213196, + "loss": 0.2903, + "step": 14488 + }, + { + "epoch": 1.1737686325340246, + "grad_norm": 0.04266469553112984, + "learning_rate": 0.00015701426706872496, + "loss": 0.3199, + "step": 14489 + }, + { + "epoch": 1.173849643551523, + "grad_norm": 0.03647917881608009, + "learning_rate": 0.00015700976641613035, + "loss": 0.29, + "step": 14490 + }, + { + "epoch": 1.1739306545690213, + "grad_norm": 0.04074126482009888, + "learning_rate": 0.0001570052657635357, + "loss": 0.3361, + "step": 14491 + }, + { + "epoch": 1.1740116655865198, + "grad_norm": 0.03795121610164642, + "learning_rate": 0.0001570007651109411, + "loss": 0.2605, + "step": 14492 + }, + { + "epoch": 1.174092676604018, + "grad_norm": 0.0438787043094635, + "learning_rate": 0.00015699626445834648, + "loss": 0.3447, + "step": 14493 + }, + { + "epoch": 1.1741736876215165, + "grad_norm": 0.037022512406110764, + "learning_rate": 0.00015699176380575184, + "loss": 0.2762, + "step": 14494 + }, + { + "epoch": 1.174254698639015, + "grad_norm": 0.03483128175139427, + "learning_rate": 0.0001569872631531572, + "loss": 0.268, + "step": 14495 + }, + { + "epoch": 1.1743357096565132, + "grad_norm": 0.03925696015357971, + "learning_rate": 0.0001569827625005626, + "loss": 0.3382, + "step": 14496 + }, + { + "epoch": 1.1744167206740117, + "grad_norm": 0.03809134289622307, + "learning_rate": 0.00015697826184796795, + "loss": 0.2947, + 
"step": 14497 + }, + { + "epoch": 1.17449773169151, + "grad_norm": 0.04280867800116539, + "learning_rate": 0.00015697376119537334, + "loss": 0.3184, + "step": 14498 + }, + { + "epoch": 1.1745787427090084, + "grad_norm": 0.039980579167604446, + "learning_rate": 0.00015696926054277872, + "loss": 0.308, + "step": 14499 + }, + { + "epoch": 1.1746597537265069, + "grad_norm": 0.03806230053305626, + "learning_rate": 0.00015696475989018409, + "loss": 0.2924, + "step": 14500 + }, + { + "epoch": 1.1747407647440051, + "grad_norm": 0.040367115288972855, + "learning_rate": 0.00015696025923758945, + "loss": 0.3222, + "step": 14501 + }, + { + "epoch": 1.1748217757615036, + "grad_norm": 0.04814135283231735, + "learning_rate": 0.00015695575858499483, + "loss": 0.3295, + "step": 14502 + }, + { + "epoch": 1.1749027867790018, + "grad_norm": 0.03762340918183327, + "learning_rate": 0.0001569512579324002, + "loss": 0.3057, + "step": 14503 + }, + { + "epoch": 1.1749837977965003, + "grad_norm": 0.03948173671960831, + "learning_rate": 0.00015694675727980558, + "loss": 0.2905, + "step": 14504 + }, + { + "epoch": 1.1750648088139988, + "grad_norm": 0.03911614045500755, + "learning_rate": 0.00015694225662721097, + "loss": 0.3082, + "step": 14505 + }, + { + "epoch": 1.175145819831497, + "grad_norm": 0.040515441447496414, + "learning_rate": 0.00015693775597461633, + "loss": 0.3392, + "step": 14506 + }, + { + "epoch": 1.1752268308489955, + "grad_norm": 0.037460993975400925, + "learning_rate": 0.0001569332553220217, + "loss": 0.2566, + "step": 14507 + }, + { + "epoch": 1.175307841866494, + "grad_norm": 0.04020340368151665, + "learning_rate": 0.00015692875466942707, + "loss": 0.3127, + "step": 14508 + }, + { + "epoch": 1.1753888528839922, + "grad_norm": 0.037423986941576004, + "learning_rate": 0.00015692425401683243, + "loss": 0.3021, + "step": 14509 + }, + { + "epoch": 1.1754698639014907, + "grad_norm": 0.041398096829652786, + "learning_rate": 0.00015691975336423782, + "loss": 0.3459, + "step": 14510 + }, + { + "epoch": 1.175550874918989, + "grad_norm": 0.034351762384176254, + "learning_rate": 0.0001569152527116432, + "loss": 0.2458, + "step": 14511 + }, + { + "epoch": 1.1756318859364874, + "grad_norm": 0.04114198684692383, + "learning_rate": 0.00015691075205904857, + "loss": 0.3298, + "step": 14512 + }, + { + "epoch": 1.1757128969539856, + "grad_norm": 0.0437115840613842, + "learning_rate": 0.00015690625140645393, + "loss": 0.3463, + "step": 14513 + }, + { + "epoch": 1.175793907971484, + "grad_norm": 0.03909998759627342, + "learning_rate": 0.00015690175075385932, + "loss": 0.3115, + "step": 14514 + }, + { + "epoch": 1.1758749189889826, + "grad_norm": 0.04081261530518532, + "learning_rate": 0.00015689725010126468, + "loss": 0.2969, + "step": 14515 + }, + { + "epoch": 1.1759559300064808, + "grad_norm": 0.039999548345804214, + "learning_rate": 0.00015689274944867006, + "loss": 0.3258, + "step": 14516 + }, + { + "epoch": 1.1760369410239793, + "grad_norm": 0.04130193963646889, + "learning_rate": 0.00015688824879607545, + "loss": 0.3124, + "step": 14517 + }, + { + "epoch": 1.1761179520414777, + "grad_norm": 0.04370003193616867, + "learning_rate": 0.0001568837481434808, + "loss": 0.3379, + "step": 14518 + }, + { + "epoch": 1.176198963058976, + "grad_norm": 0.03728923201560974, + "learning_rate": 0.00015687924749088617, + "loss": 0.2934, + "step": 14519 + }, + { + "epoch": 1.1762799740764744, + "grad_norm": 0.043891359120607376, + "learning_rate": 0.00015687474683829156, + "loss": 0.3174, + "step": 14520 + }, + { + "epoch": 
1.1763609850939727, + "grad_norm": 0.03610162436962128, + "learning_rate": 0.00015687024618569692, + "loss": 0.3094, + "step": 14521 + }, + { + "epoch": 1.1764419961114712, + "grad_norm": 0.04091033712029457, + "learning_rate": 0.0001568657455331023, + "loss": 0.3072, + "step": 14522 + }, + { + "epoch": 1.1765230071289696, + "grad_norm": 0.038009703159332275, + "learning_rate": 0.0001568612448805077, + "loss": 0.2712, + "step": 14523 + }, + { + "epoch": 1.1766040181464679, + "grad_norm": 0.041291069239377975, + "learning_rate": 0.00015685674422791305, + "loss": 0.3014, + "step": 14524 + }, + { + "epoch": 1.1766850291639663, + "grad_norm": 0.04737641662359238, + "learning_rate": 0.0001568522435753184, + "loss": 0.3081, + "step": 14525 + }, + { + "epoch": 1.1767660401814646, + "grad_norm": 0.038802701979875565, + "learning_rate": 0.0001568477429227238, + "loss": 0.298, + "step": 14526 + }, + { + "epoch": 1.176847051198963, + "grad_norm": 0.03882235661149025, + "learning_rate": 0.00015684324227012919, + "loss": 0.3296, + "step": 14527 + }, + { + "epoch": 1.1769280622164615, + "grad_norm": 0.041273076087236404, + "learning_rate": 0.00015683874161753455, + "loss": 0.298, + "step": 14528 + }, + { + "epoch": 1.1770090732339598, + "grad_norm": 0.03610391914844513, + "learning_rate": 0.00015683424096493993, + "loss": 0.2757, + "step": 14529 + }, + { + "epoch": 1.1770900842514582, + "grad_norm": 0.03763338550925255, + "learning_rate": 0.0001568297403123453, + "loss": 0.2966, + "step": 14530 + }, + { + "epoch": 1.1771710952689567, + "grad_norm": 0.04555139318108559, + "learning_rate": 0.00015682523965975068, + "loss": 0.2791, + "step": 14531 + }, + { + "epoch": 1.177252106286455, + "grad_norm": 0.03985441476106644, + "learning_rate": 0.00015682073900715604, + "loss": 0.2839, + "step": 14532 + }, + { + "epoch": 1.1773331173039534, + "grad_norm": 0.03928079083561897, + "learning_rate": 0.00015681623835456143, + "loss": 0.3061, + "step": 14533 + }, + { + "epoch": 1.1774141283214516, + "grad_norm": 0.03562358021736145, + "learning_rate": 0.0001568117377019668, + "loss": 0.2928, + "step": 14534 + }, + { + "epoch": 1.1774951393389501, + "grad_norm": 0.040488019585609436, + "learning_rate": 0.00015680723704937217, + "loss": 0.3247, + "step": 14535 + }, + { + "epoch": 1.1775761503564484, + "grad_norm": 0.04520033299922943, + "learning_rate": 0.00015680273639677753, + "loss": 0.3615, + "step": 14536 + }, + { + "epoch": 1.1776571613739468, + "grad_norm": 0.04714178293943405, + "learning_rate": 0.00015679823574418292, + "loss": 0.2838, + "step": 14537 + }, + { + "epoch": 1.1777381723914453, + "grad_norm": 0.041020654141902924, + "learning_rate": 0.00015679373509158828, + "loss": 0.3136, + "step": 14538 + }, + { + "epoch": 1.1778191834089435, + "grad_norm": 0.044311195611953735, + "learning_rate": 0.00015678923443899367, + "loss": 0.3027, + "step": 14539 + }, + { + "epoch": 1.177900194426442, + "grad_norm": 0.04358860105276108, + "learning_rate": 0.00015678473378639903, + "loss": 0.2881, + "step": 14540 + }, + { + "epoch": 1.1779812054439405, + "grad_norm": 0.038203202188014984, + "learning_rate": 0.00015678023313380442, + "loss": 0.2989, + "step": 14541 + }, + { + "epoch": 1.1780622164614387, + "grad_norm": 0.04246249049901962, + "learning_rate": 0.00015677573248120978, + "loss": 0.2721, + "step": 14542 + }, + { + "epoch": 1.1781432274789372, + "grad_norm": 0.04331636056303978, + "learning_rate": 0.00015677123182861516, + "loss": 0.2921, + "step": 14543 + }, + { + "epoch": 1.1782242384964354, + "grad_norm": 
0.03967595845460892, + "learning_rate": 0.00015676673117602052, + "loss": 0.2995, + "step": 14544 + }, + { + "epoch": 1.178305249513934, + "grad_norm": 0.04198235273361206, + "learning_rate": 0.0001567622305234259, + "loss": 0.3049, + "step": 14545 + }, + { + "epoch": 1.1783862605314324, + "grad_norm": 0.03865557909011841, + "learning_rate": 0.00015675772987083127, + "loss": 0.2707, + "step": 14546 + }, + { + "epoch": 1.1784672715489306, + "grad_norm": 0.03893511742353439, + "learning_rate": 0.00015675322921823666, + "loss": 0.2957, + "step": 14547 + }, + { + "epoch": 1.178548282566429, + "grad_norm": 0.045068565756082535, + "learning_rate": 0.00015674872856564202, + "loss": 0.3531, + "step": 14548 + }, + { + "epoch": 1.1786292935839273, + "grad_norm": 0.043633587658405304, + "learning_rate": 0.0001567442279130474, + "loss": 0.2947, + "step": 14549 + }, + { + "epoch": 1.1787103046014258, + "grad_norm": 0.03821111097931862, + "learning_rate": 0.00015673972726045277, + "loss": 0.277, + "step": 14550 + }, + { + "epoch": 1.1787913156189243, + "grad_norm": 0.038753848522901535, + "learning_rate": 0.00015673522660785815, + "loss": 0.2961, + "step": 14551 + }, + { + "epoch": 1.1788723266364225, + "grad_norm": 0.0451669916510582, + "learning_rate": 0.0001567307259552635, + "loss": 0.2882, + "step": 14552 + }, + { + "epoch": 1.178953337653921, + "grad_norm": 0.050977639853954315, + "learning_rate": 0.0001567262253026689, + "loss": 0.3457, + "step": 14553 + }, + { + "epoch": 1.1790343486714194, + "grad_norm": 0.0382966622710228, + "learning_rate": 0.00015672172465007426, + "loss": 0.3034, + "step": 14554 + }, + { + "epoch": 1.1791153596889177, + "grad_norm": 0.05075497925281525, + "learning_rate": 0.00015671722399747965, + "loss": 0.3421, + "step": 14555 + }, + { + "epoch": 1.1791963707064161, + "grad_norm": 0.0408305898308754, + "learning_rate": 0.00015671272334488503, + "loss": 0.3074, + "step": 14556 + }, + { + "epoch": 1.1792773817239144, + "grad_norm": 0.04049469158053398, + "learning_rate": 0.0001567082226922904, + "loss": 0.2931, + "step": 14557 + }, + { + "epoch": 1.1793583927414129, + "grad_norm": 0.03187038004398346, + "learning_rate": 0.00015670372203969575, + "loss": 0.2607, + "step": 14558 + }, + { + "epoch": 1.179439403758911, + "grad_norm": 0.04170691594481468, + "learning_rate": 0.00015669922138710114, + "loss": 0.2891, + "step": 14559 + }, + { + "epoch": 1.1795204147764096, + "grad_norm": 0.04311882704496384, + "learning_rate": 0.0001566947207345065, + "loss": 0.3076, + "step": 14560 + }, + { + "epoch": 1.179601425793908, + "grad_norm": 0.04691338539123535, + "learning_rate": 0.0001566902200819119, + "loss": 0.3463, + "step": 14561 + }, + { + "epoch": 1.1796824368114063, + "grad_norm": 0.04103444144129753, + "learning_rate": 0.00015668571942931728, + "loss": 0.3452, + "step": 14562 + }, + { + "epoch": 1.1797634478289047, + "grad_norm": 0.038828045129776, + "learning_rate": 0.00015668121877672264, + "loss": 0.2922, + "step": 14563 + }, + { + "epoch": 1.1798444588464032, + "grad_norm": 0.04108811914920807, + "learning_rate": 0.000156676718124128, + "loss": 0.3192, + "step": 14564 + }, + { + "epoch": 1.1799254698639015, + "grad_norm": 0.03875766694545746, + "learning_rate": 0.00015667221747153338, + "loss": 0.2748, + "step": 14565 + }, + { + "epoch": 1.1800064808814, + "grad_norm": 0.044968828558921814, + "learning_rate": 0.00015666771681893874, + "loss": 0.3084, + "step": 14566 + }, + { + "epoch": 1.1800874918988982, + "grad_norm": 0.03477316349744797, + "learning_rate": 
0.00015666321616634413, + "loss": 0.2727, + "step": 14567 + }, + { + "epoch": 1.1801685029163966, + "grad_norm": 0.03975457698106766, + "learning_rate": 0.00015665871551374952, + "loss": 0.3022, + "step": 14568 + }, + { + "epoch": 1.180249513933895, + "grad_norm": 0.04314548522233963, + "learning_rate": 0.00015665421486115488, + "loss": 0.3261, + "step": 14569 + }, + { + "epoch": 1.1803305249513933, + "grad_norm": 0.04482617601752281, + "learning_rate": 0.00015664971420856024, + "loss": 0.3477, + "step": 14570 + }, + { + "epoch": 1.1804115359688918, + "grad_norm": 0.037931445986032486, + "learning_rate": 0.00015664521355596562, + "loss": 0.3109, + "step": 14571 + }, + { + "epoch": 1.18049254698639, + "grad_norm": 0.03743727505207062, + "learning_rate": 0.00015664071290337098, + "loss": 0.3048, + "step": 14572 + }, + { + "epoch": 1.1805735580038885, + "grad_norm": 0.039792127907276154, + "learning_rate": 0.00015663621225077637, + "loss": 0.2796, + "step": 14573 + }, + { + "epoch": 1.180654569021387, + "grad_norm": 0.040228910744190216, + "learning_rate": 0.00015663171159818176, + "loss": 0.3057, + "step": 14574 + }, + { + "epoch": 1.1807355800388852, + "grad_norm": 0.042243167757987976, + "learning_rate": 0.00015662721094558712, + "loss": 0.2978, + "step": 14575 + }, + { + "epoch": 1.1808165910563837, + "grad_norm": 0.041131455451250076, + "learning_rate": 0.00015662271029299248, + "loss": 0.293, + "step": 14576 + }, + { + "epoch": 1.1808976020738822, + "grad_norm": 0.04173069819808006, + "learning_rate": 0.00015661820964039787, + "loss": 0.3145, + "step": 14577 + }, + { + "epoch": 1.1809786130913804, + "grad_norm": 0.0336281880736351, + "learning_rate": 0.00015661370898780323, + "loss": 0.2806, + "step": 14578 + }, + { + "epoch": 1.1810596241088789, + "grad_norm": 0.04197990521788597, + "learning_rate": 0.0001566092083352086, + "loss": 0.3016, + "step": 14579 + }, + { + "epoch": 1.1811406351263771, + "grad_norm": 0.04100755229592323, + "learning_rate": 0.000156604707682614, + "loss": 0.3097, + "step": 14580 + }, + { + "epoch": 1.1812216461438756, + "grad_norm": 0.040469493716955185, + "learning_rate": 0.00015660020703001936, + "loss": 0.2729, + "step": 14581 + }, + { + "epoch": 1.1813026571613738, + "grad_norm": 0.04121621325612068, + "learning_rate": 0.00015659570637742472, + "loss": 0.3219, + "step": 14582 + }, + { + "epoch": 1.1813836681788723, + "grad_norm": 0.04582282528281212, + "learning_rate": 0.0001565912057248301, + "loss": 0.3659, + "step": 14583 + }, + { + "epoch": 1.1814646791963708, + "grad_norm": 0.04043503850698471, + "learning_rate": 0.00015658670507223547, + "loss": 0.3278, + "step": 14584 + }, + { + "epoch": 1.181545690213869, + "grad_norm": 0.04137546941637993, + "learning_rate": 0.00015658220441964085, + "loss": 0.2734, + "step": 14585 + }, + { + "epoch": 1.1816267012313675, + "grad_norm": 0.043692342936992645, + "learning_rate": 0.00015657770376704624, + "loss": 0.2952, + "step": 14586 + }, + { + "epoch": 1.181707712248866, + "grad_norm": 0.04300142824649811, + "learning_rate": 0.0001565732031144516, + "loss": 0.3212, + "step": 14587 + }, + { + "epoch": 1.1817887232663642, + "grad_norm": 0.050188302993774414, + "learning_rate": 0.00015656870246185696, + "loss": 0.3537, + "step": 14588 + }, + { + "epoch": 1.1818697342838627, + "grad_norm": 0.04783901944756508, + "learning_rate": 0.00015656420180926235, + "loss": 0.318, + "step": 14589 + }, + { + "epoch": 1.181950745301361, + "grad_norm": 0.042725108563899994, + "learning_rate": 0.0001565597011566677, + "loss": 0.3274, 
+ "step": 14590 + }, + { + "epoch": 1.1820317563188594, + "grad_norm": 0.03940672427415848, + "learning_rate": 0.0001565552005040731, + "loss": 0.2916, + "step": 14591 + }, + { + "epoch": 1.1821127673363578, + "grad_norm": 0.03988233208656311, + "learning_rate": 0.00015655069985147848, + "loss": 0.2568, + "step": 14592 + }, + { + "epoch": 1.182193778353856, + "grad_norm": 0.04097625985741615, + "learning_rate": 0.00015654619919888384, + "loss": 0.3416, + "step": 14593 + }, + { + "epoch": 1.1822747893713546, + "grad_norm": 0.04196294769644737, + "learning_rate": 0.0001565416985462892, + "loss": 0.3506, + "step": 14594 + }, + { + "epoch": 1.1823558003888528, + "grad_norm": 0.04109298437833786, + "learning_rate": 0.0001565371978936946, + "loss": 0.3081, + "step": 14595 + }, + { + "epoch": 1.1824368114063513, + "grad_norm": 0.04117002338171005, + "learning_rate": 0.00015653269724109995, + "loss": 0.3119, + "step": 14596 + }, + { + "epoch": 1.1825178224238497, + "grad_norm": 0.04104756563901901, + "learning_rate": 0.00015652819658850534, + "loss": 0.3054, + "step": 14597 + }, + { + "epoch": 1.182598833441348, + "grad_norm": 0.03538989648222923, + "learning_rate": 0.00015652369593591073, + "loss": 0.3011, + "step": 14598 + }, + { + "epoch": 1.1826798444588464, + "grad_norm": 0.04043510556221008, + "learning_rate": 0.00015651919528331609, + "loss": 0.2906, + "step": 14599 + }, + { + "epoch": 1.1827608554763447, + "grad_norm": 0.040407996624708176, + "learning_rate": 0.00015651469463072147, + "loss": 0.3063, + "step": 14600 + }, + { + "epoch": 1.1828418664938432, + "grad_norm": 0.04454110935330391, + "learning_rate": 0.00015651019397812683, + "loss": 0.3811, + "step": 14601 + }, + { + "epoch": 1.1829228775113416, + "grad_norm": 0.04084227234125137, + "learning_rate": 0.0001565056933255322, + "loss": 0.3268, + "step": 14602 + }, + { + "epoch": 1.1830038885288399, + "grad_norm": 0.04479276016354561, + "learning_rate": 0.00015650119267293758, + "loss": 0.344, + "step": 14603 + }, + { + "epoch": 1.1830848995463383, + "grad_norm": 0.04356138035655022, + "learning_rate": 0.00015649669202034297, + "loss": 0.3259, + "step": 14604 + }, + { + "epoch": 1.1831659105638366, + "grad_norm": 0.03863155096769333, + "learning_rate": 0.00015649219136774833, + "loss": 0.2627, + "step": 14605 + }, + { + "epoch": 1.183246921581335, + "grad_norm": 0.043492794036865234, + "learning_rate": 0.00015648769071515371, + "loss": 0.3267, + "step": 14606 + }, + { + "epoch": 1.1833279325988335, + "grad_norm": 0.03926840052008629, + "learning_rate": 0.00015648319006255907, + "loss": 0.3278, + "step": 14607 + }, + { + "epoch": 1.1834089436163318, + "grad_norm": 0.0472257025539875, + "learning_rate": 0.00015647868940996446, + "loss": 0.3595, + "step": 14608 + }, + { + "epoch": 1.1834899546338302, + "grad_norm": 0.036868367344141006, + "learning_rate": 0.00015647418875736982, + "loss": 0.2795, + "step": 14609 + }, + { + "epoch": 1.1835709656513287, + "grad_norm": 0.04061976820230484, + "learning_rate": 0.0001564696881047752, + "loss": 0.3147, + "step": 14610 + }, + { + "epoch": 1.183651976668827, + "grad_norm": 0.048738934099674225, + "learning_rate": 0.00015646518745218057, + "loss": 0.3379, + "step": 14611 + }, + { + "epoch": 1.1837329876863254, + "grad_norm": 0.041473086923360825, + "learning_rate": 0.00015646068679958596, + "loss": 0.3316, + "step": 14612 + }, + { + "epoch": 1.1838139987038236, + "grad_norm": 0.04210762307047844, + "learning_rate": 0.00015645618614699132, + "loss": 0.3278, + "step": 14613 + }, + { + "epoch": 
1.1838950097213221, + "grad_norm": 0.04672138765454292, + "learning_rate": 0.0001564516854943967, + "loss": 0.3275, + "step": 14614 + }, + { + "epoch": 1.1839760207388204, + "grad_norm": 0.03847205638885498, + "learning_rate": 0.00015644718484180206, + "loss": 0.2786, + "step": 14615 + }, + { + "epoch": 1.1840570317563188, + "grad_norm": 0.04784049838781357, + "learning_rate": 0.00015644268418920745, + "loss": 0.3308, + "step": 14616 + }, + { + "epoch": 1.1841380427738173, + "grad_norm": 0.046784672886133194, + "learning_rate": 0.0001564381835366128, + "loss": 0.3287, + "step": 14617 + }, + { + "epoch": 1.1842190537913155, + "grad_norm": 0.044036705046892166, + "learning_rate": 0.0001564336828840182, + "loss": 0.3216, + "step": 14618 + }, + { + "epoch": 1.184300064808814, + "grad_norm": 0.04180029034614563, + "learning_rate": 0.00015642918223142356, + "loss": 0.2978, + "step": 14619 + }, + { + "epoch": 1.1843810758263125, + "grad_norm": 0.039441563189029694, + "learning_rate": 0.00015642468157882894, + "loss": 0.3131, + "step": 14620 + }, + { + "epoch": 1.1844620868438107, + "grad_norm": 0.037397947162389755, + "learning_rate": 0.0001564201809262343, + "loss": 0.2865, + "step": 14621 + }, + { + "epoch": 1.1845430978613092, + "grad_norm": 0.044736817479133606, + "learning_rate": 0.0001564156802736397, + "loss": 0.3352, + "step": 14622 + }, + { + "epoch": 1.1846241088788074, + "grad_norm": 0.04054537042975426, + "learning_rate": 0.00015641117962104505, + "loss": 0.2892, + "step": 14623 + }, + { + "epoch": 1.184705119896306, + "grad_norm": 0.03535526618361473, + "learning_rate": 0.00015640667896845044, + "loss": 0.2897, + "step": 14624 + }, + { + "epoch": 1.1847861309138044, + "grad_norm": 0.03459848091006279, + "learning_rate": 0.0001564021783158558, + "loss": 0.287, + "step": 14625 + }, + { + "epoch": 1.1848671419313026, + "grad_norm": 0.044806841760873795, + "learning_rate": 0.00015639767766326119, + "loss": 0.3095, + "step": 14626 + }, + { + "epoch": 1.184948152948801, + "grad_norm": 0.040577251464128494, + "learning_rate": 0.00015639317701066655, + "loss": 0.3046, + "step": 14627 + }, + { + "epoch": 1.1850291639662993, + "grad_norm": 0.03882734477519989, + "learning_rate": 0.00015638867635807193, + "loss": 0.281, + "step": 14628 + }, + { + "epoch": 1.1851101749837978, + "grad_norm": 0.040150970220565796, + "learning_rate": 0.0001563841757054773, + "loss": 0.3207, + "step": 14629 + }, + { + "epoch": 1.1851911860012962, + "grad_norm": 0.037906549870967865, + "learning_rate": 0.00015637967505288268, + "loss": 0.2897, + "step": 14630 + }, + { + "epoch": 1.1852721970187945, + "grad_norm": 0.042285289615392685, + "learning_rate": 0.00015637517440028807, + "loss": 0.342, + "step": 14631 + }, + { + "epoch": 1.185353208036293, + "grad_norm": 0.04037885740399361, + "learning_rate": 0.00015637067374769343, + "loss": 0.3101, + "step": 14632 + }, + { + "epoch": 1.1854342190537914, + "grad_norm": 0.03909390792250633, + "learning_rate": 0.0001563661730950988, + "loss": 0.2753, + "step": 14633 + }, + { + "epoch": 1.1855152300712897, + "grad_norm": 0.03916388377547264, + "learning_rate": 0.00015636167244250417, + "loss": 0.2996, + "step": 14634 + }, + { + "epoch": 1.1855962410887881, + "grad_norm": 0.03810279443860054, + "learning_rate": 0.00015635717178990954, + "loss": 0.3103, + "step": 14635 + }, + { + "epoch": 1.1856772521062864, + "grad_norm": 0.04713892191648483, + "learning_rate": 0.00015635267113731492, + "loss": 0.331, + "step": 14636 + }, + { + "epoch": 1.1857582631237849, + "grad_norm": 
0.041569601744413376, + "learning_rate": 0.0001563481704847203, + "loss": 0.2999, + "step": 14637 + }, + { + "epoch": 1.185839274141283, + "grad_norm": 0.03498614951968193, + "learning_rate": 0.00015634366983212567, + "loss": 0.291, + "step": 14638 + }, + { + "epoch": 1.1859202851587816, + "grad_norm": 0.038540758192539215, + "learning_rate": 0.00015633916917953103, + "loss": 0.2772, + "step": 14639 + }, + { + "epoch": 1.18600129617628, + "grad_norm": 0.04108189418911934, + "learning_rate": 0.00015633466852693642, + "loss": 0.3065, + "step": 14640 + }, + { + "epoch": 1.1860823071937783, + "grad_norm": 0.039533913135528564, + "learning_rate": 0.00015633016787434178, + "loss": 0.3456, + "step": 14641 + }, + { + "epoch": 1.1861633182112767, + "grad_norm": 0.040663499385118484, + "learning_rate": 0.00015632566722174716, + "loss": 0.2879, + "step": 14642 + }, + { + "epoch": 1.1862443292287752, + "grad_norm": 0.04121604189276695, + "learning_rate": 0.00015632116656915255, + "loss": 0.3075, + "step": 14643 + }, + { + "epoch": 1.1863253402462735, + "grad_norm": 0.03926914185285568, + "learning_rate": 0.0001563166659165579, + "loss": 0.3049, + "step": 14644 + }, + { + "epoch": 1.186406351263772, + "grad_norm": 0.04522679001092911, + "learning_rate": 0.00015631216526396327, + "loss": 0.3001, + "step": 14645 + }, + { + "epoch": 1.1864873622812702, + "grad_norm": 0.03877709433436394, + "learning_rate": 0.00015630766461136866, + "loss": 0.2935, + "step": 14646 + }, + { + "epoch": 1.1865683732987686, + "grad_norm": 0.03700802102684975, + "learning_rate": 0.00015630316395877402, + "loss": 0.3181, + "step": 14647 + }, + { + "epoch": 1.186649384316267, + "grad_norm": 0.05110809579491615, + "learning_rate": 0.0001562986633061794, + "loss": 0.3649, + "step": 14648 + }, + { + "epoch": 1.1867303953337653, + "grad_norm": 0.03704257681965828, + "learning_rate": 0.0001562941626535848, + "loss": 0.2711, + "step": 14649 + }, + { + "epoch": 1.1868114063512638, + "grad_norm": 0.04234371334314346, + "learning_rate": 0.00015628966200099015, + "loss": 0.3677, + "step": 14650 + }, + { + "epoch": 1.186892417368762, + "grad_norm": 0.04498139023780823, + "learning_rate": 0.0001562851613483955, + "loss": 0.3933, + "step": 14651 + }, + { + "epoch": 1.1869734283862605, + "grad_norm": 0.041128672659397125, + "learning_rate": 0.0001562806606958009, + "loss": 0.3123, + "step": 14652 + }, + { + "epoch": 1.187054439403759, + "grad_norm": 0.04224330186843872, + "learning_rate": 0.00015627616004320626, + "loss": 0.2788, + "step": 14653 + }, + { + "epoch": 1.1871354504212572, + "grad_norm": 0.0469592809677124, + "learning_rate": 0.00015627165939061165, + "loss": 0.3154, + "step": 14654 + }, + { + "epoch": 1.1872164614387557, + "grad_norm": 0.04049103334546089, + "learning_rate": 0.00015626715873801703, + "loss": 0.3137, + "step": 14655 + }, + { + "epoch": 1.1872974724562542, + "grad_norm": 0.04263480752706528, + "learning_rate": 0.0001562626580854224, + "loss": 0.3052, + "step": 14656 + }, + { + "epoch": 1.1873784834737524, + "grad_norm": 0.040018241852521896, + "learning_rate": 0.00015625815743282775, + "loss": 0.273, + "step": 14657 + }, + { + "epoch": 1.1874594944912509, + "grad_norm": 0.03727559745311737, + "learning_rate": 0.00015625365678023314, + "loss": 0.3177, + "step": 14658 + }, + { + "epoch": 1.1875405055087491, + "grad_norm": 0.03743201121687889, + "learning_rate": 0.0001562491561276385, + "loss": 0.3113, + "step": 14659 + }, + { + "epoch": 1.1876215165262476, + "grad_norm": 0.0455901138484478, + "learning_rate": 
0.0001562446554750439, + "loss": 0.2975, + "step": 14660 + }, + { + "epoch": 1.1877025275437458, + "grad_norm": 0.03635476902127266, + "learning_rate": 0.00015624015482244928, + "loss": 0.2866, + "step": 14661 + }, + { + "epoch": 1.1877835385612443, + "grad_norm": 0.045002736151218414, + "learning_rate": 0.00015623565416985464, + "loss": 0.358, + "step": 14662 + }, + { + "epoch": 1.1878645495787428, + "grad_norm": 0.0431043915450573, + "learning_rate": 0.00015623115351726, + "loss": 0.3263, + "step": 14663 + }, + { + "epoch": 1.187945560596241, + "grad_norm": 0.037004776298999786, + "learning_rate": 0.00015622665286466538, + "loss": 0.2974, + "step": 14664 + }, + { + "epoch": 1.1880265716137395, + "grad_norm": 0.03819013759493828, + "learning_rate": 0.00015622215221207074, + "loss": 0.2786, + "step": 14665 + }, + { + "epoch": 1.188107582631238, + "grad_norm": 0.042751818895339966, + "learning_rate": 0.00015621765155947613, + "loss": 0.3182, + "step": 14666 + }, + { + "epoch": 1.1881885936487362, + "grad_norm": 0.04809313639998436, + "learning_rate": 0.00015621315090688152, + "loss": 0.3312, + "step": 14667 + }, + { + "epoch": 1.1882696046662347, + "grad_norm": 0.04383840411901474, + "learning_rate": 0.00015620865025428688, + "loss": 0.321, + "step": 14668 + }, + { + "epoch": 1.188350615683733, + "grad_norm": 0.05272422730922699, + "learning_rate": 0.00015620414960169226, + "loss": 0.3385, + "step": 14669 + }, + { + "epoch": 1.1884316267012314, + "grad_norm": 0.049258504062891006, + "learning_rate": 0.00015619964894909762, + "loss": 0.3773, + "step": 14670 + }, + { + "epoch": 1.1885126377187298, + "grad_norm": 0.043628692626953125, + "learning_rate": 0.00015619514829650298, + "loss": 0.309, + "step": 14671 + }, + { + "epoch": 1.188593648736228, + "grad_norm": 0.03882945701479912, + "learning_rate": 0.00015619064764390837, + "loss": 0.268, + "step": 14672 + }, + { + "epoch": 1.1886746597537265, + "grad_norm": 0.0409681610763073, + "learning_rate": 0.00015618614699131376, + "loss": 0.3279, + "step": 14673 + }, + { + "epoch": 1.1887556707712248, + "grad_norm": 0.042711541056632996, + "learning_rate": 0.00015618164633871912, + "loss": 0.3125, + "step": 14674 + }, + { + "epoch": 1.1888366817887233, + "grad_norm": 0.046427272260189056, + "learning_rate": 0.0001561771456861245, + "loss": 0.3627, + "step": 14675 + }, + { + "epoch": 1.1889176928062217, + "grad_norm": 0.03923366591334343, + "learning_rate": 0.00015617264503352987, + "loss": 0.2783, + "step": 14676 + }, + { + "epoch": 1.18899870382372, + "grad_norm": 0.042100224643945694, + "learning_rate": 0.00015616814438093523, + "loss": 0.3132, + "step": 14677 + }, + { + "epoch": 1.1890797148412184, + "grad_norm": 0.05138513818383217, + "learning_rate": 0.0001561636437283406, + "loss": 0.3283, + "step": 14678 + }, + { + "epoch": 1.189160725858717, + "grad_norm": 0.038992591202259064, + "learning_rate": 0.000156159143075746, + "loss": 0.2974, + "step": 14679 + }, + { + "epoch": 1.1892417368762151, + "grad_norm": 0.03872741758823395, + "learning_rate": 0.00015615464242315136, + "loss": 0.2748, + "step": 14680 + }, + { + "epoch": 1.1893227478937136, + "grad_norm": 0.03662829101085663, + "learning_rate": 0.00015615014177055675, + "loss": 0.2969, + "step": 14681 + }, + { + "epoch": 1.1894037589112119, + "grad_norm": 0.044510021805763245, + "learning_rate": 0.0001561456411179621, + "loss": 0.3523, + "step": 14682 + }, + { + "epoch": 1.1894847699287103, + "grad_norm": 0.03760409727692604, + "learning_rate": 0.00015614114046536747, + "loss": 0.2595, + 
"step": 14683 + }, + { + "epoch": 1.1895657809462086, + "grad_norm": 0.03967760503292084, + "learning_rate": 0.00015613663981277286, + "loss": 0.2953, + "step": 14684 + }, + { + "epoch": 1.189646791963707, + "grad_norm": 0.043346818536520004, + "learning_rate": 0.00015613213916017824, + "loss": 0.3033, + "step": 14685 + }, + { + "epoch": 1.1897278029812055, + "grad_norm": 0.04163350909948349, + "learning_rate": 0.0001561276385075836, + "loss": 0.3169, + "step": 14686 + }, + { + "epoch": 1.1898088139987038, + "grad_norm": 0.04359233006834984, + "learning_rate": 0.000156123137854989, + "loss": 0.3216, + "step": 14687 + }, + { + "epoch": 1.1898898250162022, + "grad_norm": 0.04172962158918381, + "learning_rate": 0.00015611863720239435, + "loss": 0.2949, + "step": 14688 + }, + { + "epoch": 1.1899708360337007, + "grad_norm": 0.0403163880109787, + "learning_rate": 0.00015611413654979974, + "loss": 0.3082, + "step": 14689 + }, + { + "epoch": 1.190051847051199, + "grad_norm": 0.03849377855658531, + "learning_rate": 0.0001561096358972051, + "loss": 0.3316, + "step": 14690 + }, + { + "epoch": 1.1901328580686974, + "grad_norm": 0.041115447878837585, + "learning_rate": 0.00015610513524461048, + "loss": 0.2796, + "step": 14691 + }, + { + "epoch": 1.1902138690861956, + "grad_norm": 0.0451335534453392, + "learning_rate": 0.00015610063459201584, + "loss": 0.3169, + "step": 14692 + }, + { + "epoch": 1.190294880103694, + "grad_norm": 0.04398505389690399, + "learning_rate": 0.00015609613393942123, + "loss": 0.3318, + "step": 14693 + }, + { + "epoch": 1.1903758911211926, + "grad_norm": 0.034957144409418106, + "learning_rate": 0.0001560916332868266, + "loss": 0.2525, + "step": 14694 + }, + { + "epoch": 1.1904569021386908, + "grad_norm": 0.040896087884902954, + "learning_rate": 0.00015608713263423198, + "loss": 0.3242, + "step": 14695 + }, + { + "epoch": 1.1905379131561893, + "grad_norm": 0.03556426241993904, + "learning_rate": 0.00015608263198163734, + "loss": 0.2734, + "step": 14696 + }, + { + "epoch": 1.1906189241736875, + "grad_norm": 0.04682194069027901, + "learning_rate": 0.00015607813132904273, + "loss": 0.3236, + "step": 14697 + }, + { + "epoch": 1.190699935191186, + "grad_norm": 0.044954705983400345, + "learning_rate": 0.00015607363067644809, + "loss": 0.348, + "step": 14698 + }, + { + "epoch": 1.1907809462086845, + "grad_norm": 0.04162221774458885, + "learning_rate": 0.00015606913002385347, + "loss": 0.3005, + "step": 14699 + }, + { + "epoch": 1.1908619572261827, + "grad_norm": 0.045600537210702896, + "learning_rate": 0.00015606462937125883, + "loss": 0.3018, + "step": 14700 + }, + { + "epoch": 1.1909429682436812, + "grad_norm": 0.04428591579198837, + "learning_rate": 0.00015606012871866422, + "loss": 0.2874, + "step": 14701 + }, + { + "epoch": 1.1910239792611796, + "grad_norm": 0.04231373220682144, + "learning_rate": 0.00015605562806606958, + "loss": 0.3303, + "step": 14702 + }, + { + "epoch": 1.1911049902786779, + "grad_norm": 0.04606911540031433, + "learning_rate": 0.00015605112741347497, + "loss": 0.3176, + "step": 14703 + }, + { + "epoch": 1.1911860012961764, + "grad_norm": 0.04126888886094093, + "learning_rate": 0.00015604662676088033, + "loss": 0.2939, + "step": 14704 + }, + { + "epoch": 1.1912670123136746, + "grad_norm": 0.050967976450920105, + "learning_rate": 0.00015604212610828571, + "loss": 0.3263, + "step": 14705 + }, + { + "epoch": 1.191348023331173, + "grad_norm": 0.04140590876340866, + "learning_rate": 0.00015603762545569107, + "loss": 0.3294, + "step": 14706 + }, + { + "epoch": 
1.1914290343486713, + "grad_norm": 0.03835592046380043, + "learning_rate": 0.00015603312480309646, + "loss": 0.2925, + "step": 14707 + }, + { + "epoch": 1.1915100453661698, + "grad_norm": 0.04171990230679512, + "learning_rate": 0.00015602862415050182, + "loss": 0.3168, + "step": 14708 + }, + { + "epoch": 1.1915910563836682, + "grad_norm": 0.04122872278094292, + "learning_rate": 0.0001560241234979072, + "loss": 0.3165, + "step": 14709 + }, + { + "epoch": 1.1916720674011665, + "grad_norm": 0.04952973499894142, + "learning_rate": 0.00015601962284531257, + "loss": 0.3228, + "step": 14710 + }, + { + "epoch": 1.191753078418665, + "grad_norm": 0.03939840570092201, + "learning_rate": 0.00015601512219271796, + "loss": 0.3099, + "step": 14711 + }, + { + "epoch": 1.1918340894361634, + "grad_norm": 0.04720155522227287, + "learning_rate": 0.00015601062154012334, + "loss": 0.3551, + "step": 14712 + }, + { + "epoch": 1.1919151004536617, + "grad_norm": 0.04138209670782089, + "learning_rate": 0.0001560061208875287, + "loss": 0.308, + "step": 14713 + }, + { + "epoch": 1.1919961114711601, + "grad_norm": 0.042646877467632294, + "learning_rate": 0.00015600162023493406, + "loss": 0.3357, + "step": 14714 + }, + { + "epoch": 1.1920771224886584, + "grad_norm": 0.03877168893814087, + "learning_rate": 0.00015599711958233945, + "loss": 0.3161, + "step": 14715 + }, + { + "epoch": 1.1921581335061568, + "grad_norm": 0.03586186096072197, + "learning_rate": 0.0001559926189297448, + "loss": 0.2873, + "step": 14716 + }, + { + "epoch": 1.1922391445236553, + "grad_norm": 0.039949409663677216, + "learning_rate": 0.0001559881182771502, + "loss": 0.2944, + "step": 14717 + }, + { + "epoch": 1.1923201555411536, + "grad_norm": 0.035949863493442535, + "learning_rate": 0.00015598361762455558, + "loss": 0.2724, + "step": 14718 + }, + { + "epoch": 1.192401166558652, + "grad_norm": 0.047032542526721954, + "learning_rate": 0.00015597911697196094, + "loss": 0.2925, + "step": 14719 + }, + { + "epoch": 1.1924821775761503, + "grad_norm": 0.042840149253606796, + "learning_rate": 0.0001559746163193663, + "loss": 0.3464, + "step": 14720 + }, + { + "epoch": 1.1925631885936487, + "grad_norm": 0.03936777636408806, + "learning_rate": 0.0001559701156667717, + "loss": 0.2892, + "step": 14721 + }, + { + "epoch": 1.1926441996111472, + "grad_norm": 0.03563765808939934, + "learning_rate": 0.00015596561501417705, + "loss": 0.271, + "step": 14722 + }, + { + "epoch": 1.1927252106286454, + "grad_norm": 0.04076451063156128, + "learning_rate": 0.00015596111436158244, + "loss": 0.3216, + "step": 14723 + }, + { + "epoch": 1.192806221646144, + "grad_norm": 0.04466822370886803, + "learning_rate": 0.00015595661370898783, + "loss": 0.3353, + "step": 14724 + }, + { + "epoch": 1.1928872326636422, + "grad_norm": 0.04330145940184593, + "learning_rate": 0.00015595211305639319, + "loss": 0.2819, + "step": 14725 + }, + { + "epoch": 1.1929682436811406, + "grad_norm": 0.04840012267231941, + "learning_rate": 0.00015594761240379855, + "loss": 0.3132, + "step": 14726 + }, + { + "epoch": 1.193049254698639, + "grad_norm": 0.052081044763326645, + "learning_rate": 0.00015594311175120393, + "loss": 0.3, + "step": 14727 + }, + { + "epoch": 1.1931302657161373, + "grad_norm": 0.04210412874817848, + "learning_rate": 0.0001559386110986093, + "loss": 0.3137, + "step": 14728 + }, + { + "epoch": 1.1932112767336358, + "grad_norm": 0.04098066687583923, + "learning_rate": 0.00015593411044601468, + "loss": 0.301, + "step": 14729 + }, + { + "epoch": 1.193292287751134, + "grad_norm": 
0.043190643191337585, + "learning_rate": 0.00015592960979342007, + "loss": 0.299, + "step": 14730 + }, + { + "epoch": 1.1933732987686325, + "grad_norm": 0.041205570101737976, + "learning_rate": 0.00015592510914082543, + "loss": 0.2928, + "step": 14731 + }, + { + "epoch": 1.193454309786131, + "grad_norm": 0.04265919327735901, + "learning_rate": 0.0001559206084882308, + "loss": 0.3183, + "step": 14732 + }, + { + "epoch": 1.1935353208036292, + "grad_norm": 0.041771385818719864, + "learning_rate": 0.00015591610783563618, + "loss": 0.2907, + "step": 14733 + }, + { + "epoch": 1.1936163318211277, + "grad_norm": 0.04530198499560356, + "learning_rate": 0.00015591160718304154, + "loss": 0.2799, + "step": 14734 + }, + { + "epoch": 1.1936973428386262, + "grad_norm": 0.03936564549803734, + "learning_rate": 0.00015590710653044692, + "loss": 0.2737, + "step": 14735 + }, + { + "epoch": 1.1937783538561244, + "grad_norm": 0.051649149507284164, + "learning_rate": 0.0001559026058778523, + "loss": 0.3649, + "step": 14736 + }, + { + "epoch": 1.1938593648736229, + "grad_norm": 0.0410127229988575, + "learning_rate": 0.00015589810522525767, + "loss": 0.3007, + "step": 14737 + }, + { + "epoch": 1.1939403758911211, + "grad_norm": 0.03969676047563553, + "learning_rate": 0.00015589360457266306, + "loss": 0.2863, + "step": 14738 + }, + { + "epoch": 1.1940213869086196, + "grad_norm": 0.043512873351573944, + "learning_rate": 0.00015588910392006842, + "loss": 0.3205, + "step": 14739 + }, + { + "epoch": 1.1941023979261178, + "grad_norm": 0.04225597903132439, + "learning_rate": 0.00015588460326747378, + "loss": 0.2796, + "step": 14740 + }, + { + "epoch": 1.1941834089436163, + "grad_norm": 0.04662526398897171, + "learning_rate": 0.00015588010261487916, + "loss": 0.3338, + "step": 14741 + }, + { + "epoch": 1.1942644199611148, + "grad_norm": 0.03814233839511871, + "learning_rate": 0.00015587560196228455, + "loss": 0.3217, + "step": 14742 + }, + { + "epoch": 1.194345430978613, + "grad_norm": 0.04339953511953354, + "learning_rate": 0.0001558711013096899, + "loss": 0.3292, + "step": 14743 + }, + { + "epoch": 1.1944264419961115, + "grad_norm": 0.038826216012239456, + "learning_rate": 0.0001558666006570953, + "loss": 0.3074, + "step": 14744 + }, + { + "epoch": 1.19450745301361, + "grad_norm": 0.044687457382678986, + "learning_rate": 0.00015586210000450066, + "loss": 0.3296, + "step": 14745 + }, + { + "epoch": 1.1945884640311082, + "grad_norm": 0.04153241962194443, + "learning_rate": 0.00015585759935190602, + "loss": 0.3272, + "step": 14746 + }, + { + "epoch": 1.1946694750486067, + "grad_norm": 0.039639074355363846, + "learning_rate": 0.0001558530986993114, + "loss": 0.2531, + "step": 14747 + }, + { + "epoch": 1.194750486066105, + "grad_norm": 0.04232201352715492, + "learning_rate": 0.0001558485980467168, + "loss": 0.3128, + "step": 14748 + }, + { + "epoch": 1.1948314970836034, + "grad_norm": 0.03908455744385719, + "learning_rate": 0.00015584409739412215, + "loss": 0.2594, + "step": 14749 + }, + { + "epoch": 1.1949125081011018, + "grad_norm": 0.04053455963730812, + "learning_rate": 0.00015583959674152754, + "loss": 0.3064, + "step": 14750 + }, + { + "epoch": 1.1949935191186, + "grad_norm": 0.04426248371601105, + "learning_rate": 0.0001558350960889329, + "loss": 0.3536, + "step": 14751 + }, + { + "epoch": 1.1950745301360985, + "grad_norm": 0.04585438594222069, + "learning_rate": 0.00015583059543633826, + "loss": 0.2993, + "step": 14752 + }, + { + "epoch": 1.1951555411535968, + "grad_norm": 0.056634701788425446, + "learning_rate": 
0.00015582609478374365, + "loss": 0.3503, + "step": 14753 + }, + { + "epoch": 1.1952365521710953, + "grad_norm": 0.0459580272436142, + "learning_rate": 0.00015582159413114903, + "loss": 0.2628, + "step": 14754 + }, + { + "epoch": 1.1953175631885937, + "grad_norm": 0.04491327330470085, + "learning_rate": 0.0001558170934785544, + "loss": 0.3054, + "step": 14755 + }, + { + "epoch": 1.195398574206092, + "grad_norm": 0.044432204216718674, + "learning_rate": 0.00015581259282595978, + "loss": 0.3375, + "step": 14756 + }, + { + "epoch": 1.1954795852235904, + "grad_norm": 0.03885927423834801, + "learning_rate": 0.00015580809217336514, + "loss": 0.2913, + "step": 14757 + }, + { + "epoch": 1.195560596241089, + "grad_norm": 0.04088282212615013, + "learning_rate": 0.0001558035915207705, + "loss": 0.2791, + "step": 14758 + }, + { + "epoch": 1.1956416072585871, + "grad_norm": 0.041324764490127563, + "learning_rate": 0.0001557990908681759, + "loss": 0.2867, + "step": 14759 + }, + { + "epoch": 1.1957226182760856, + "grad_norm": 0.049072910100221634, + "learning_rate": 0.00015579459021558128, + "loss": 0.3258, + "step": 14760 + }, + { + "epoch": 1.1958036292935839, + "grad_norm": 0.03818585351109505, + "learning_rate": 0.00015579008956298664, + "loss": 0.3228, + "step": 14761 + }, + { + "epoch": 1.1958846403110823, + "grad_norm": 0.051422007381916046, + "learning_rate": 0.00015578558891039202, + "loss": 0.3233, + "step": 14762 + }, + { + "epoch": 1.1959656513285806, + "grad_norm": 0.04118951037526131, + "learning_rate": 0.00015578108825779738, + "loss": 0.3029, + "step": 14763 + }, + { + "epoch": 1.196046662346079, + "grad_norm": 0.03960846737027168, + "learning_rate": 0.00015577658760520277, + "loss": 0.2672, + "step": 14764 + }, + { + "epoch": 1.1961276733635775, + "grad_norm": 0.039535727351903915, + "learning_rate": 0.00015577208695260813, + "loss": 0.3157, + "step": 14765 + }, + { + "epoch": 1.1962086843810757, + "grad_norm": 0.042673733085393906, + "learning_rate": 0.00015576758630001352, + "loss": 0.2838, + "step": 14766 + }, + { + "epoch": 1.1962896953985742, + "grad_norm": 0.04212876781821251, + "learning_rate": 0.00015576308564741888, + "loss": 0.2957, + "step": 14767 + }, + { + "epoch": 1.1963707064160727, + "grad_norm": 0.04549403116106987, + "learning_rate": 0.00015575858499482426, + "loss": 0.3269, + "step": 14768 + }, + { + "epoch": 1.196451717433571, + "grad_norm": 0.04399016126990318, + "learning_rate": 0.00015575408434222962, + "loss": 0.2969, + "step": 14769 + }, + { + "epoch": 1.1965327284510694, + "grad_norm": 0.044396717101335526, + "learning_rate": 0.000155749583689635, + "loss": 0.3453, + "step": 14770 + }, + { + "epoch": 1.1966137394685676, + "grad_norm": 0.03851454332470894, + "learning_rate": 0.00015574508303704037, + "loss": 0.2775, + "step": 14771 + }, + { + "epoch": 1.196694750486066, + "grad_norm": 0.043037425726652145, + "learning_rate": 0.00015574058238444576, + "loss": 0.3044, + "step": 14772 + }, + { + "epoch": 1.1967757615035646, + "grad_norm": 0.047682128846645355, + "learning_rate": 0.00015573608173185112, + "loss": 0.3349, + "step": 14773 + }, + { + "epoch": 1.1968567725210628, + "grad_norm": 0.04036872088909149, + "learning_rate": 0.0001557315810792565, + "loss": 0.2752, + "step": 14774 + }, + { + "epoch": 1.1969377835385613, + "grad_norm": 0.041800472885370255, + "learning_rate": 0.00015572708042666187, + "loss": 0.3147, + "step": 14775 + }, + { + "epoch": 1.1970187945560595, + "grad_norm": 0.042227502912282944, + "learning_rate": 0.00015572257977406725, + "loss": 
0.318, + "step": 14776 + }, + { + "epoch": 1.197099805573558, + "grad_norm": 0.03940373286604881, + "learning_rate": 0.00015571807912147261, + "loss": 0.3208, + "step": 14777 + }, + { + "epoch": 1.1971808165910565, + "grad_norm": 0.03364204987883568, + "learning_rate": 0.000155713578468878, + "loss": 0.294, + "step": 14778 + }, + { + "epoch": 1.1972618276085547, + "grad_norm": 0.03734065592288971, + "learning_rate": 0.00015570907781628336, + "loss": 0.2902, + "step": 14779 + }, + { + "epoch": 1.1973428386260532, + "grad_norm": 0.03665858134627342, + "learning_rate": 0.00015570457716368875, + "loss": 0.2887, + "step": 14780 + }, + { + "epoch": 1.1974238496435516, + "grad_norm": 0.043738704174757004, + "learning_rate": 0.0001557000765110941, + "loss": 0.3203, + "step": 14781 + }, + { + "epoch": 1.1975048606610499, + "grad_norm": 0.04412857070565224, + "learning_rate": 0.0001556955758584995, + "loss": 0.316, + "step": 14782 + }, + { + "epoch": 1.1975858716785484, + "grad_norm": 0.03947075083851814, + "learning_rate": 0.00015569107520590486, + "loss": 0.2555, + "step": 14783 + }, + { + "epoch": 1.1976668826960466, + "grad_norm": 0.04453642666339874, + "learning_rate": 0.00015568657455331024, + "loss": 0.3004, + "step": 14784 + }, + { + "epoch": 1.197747893713545, + "grad_norm": 0.0402337908744812, + "learning_rate": 0.0001556820739007156, + "loss": 0.2874, + "step": 14785 + }, + { + "epoch": 1.1978289047310433, + "grad_norm": 0.042971957474946976, + "learning_rate": 0.000155677573248121, + "loss": 0.3229, + "step": 14786 + }, + { + "epoch": 1.1979099157485418, + "grad_norm": 0.03841191530227661, + "learning_rate": 0.00015567307259552635, + "loss": 0.2986, + "step": 14787 + }, + { + "epoch": 1.1979909267660402, + "grad_norm": 0.04925091192126274, + "learning_rate": 0.00015566857194293174, + "loss": 0.3171, + "step": 14788 + }, + { + "epoch": 1.1980719377835385, + "grad_norm": 0.04737703502178192, + "learning_rate": 0.0001556640712903371, + "loss": 0.299, + "step": 14789 + }, + { + "epoch": 1.198152948801037, + "grad_norm": 0.03469955176115036, + "learning_rate": 0.00015565957063774248, + "loss": 0.258, + "step": 14790 + }, + { + "epoch": 1.1982339598185354, + "grad_norm": 0.03793976455926895, + "learning_rate": 0.00015565506998514784, + "loss": 0.2769, + "step": 14791 + }, + { + "epoch": 1.1983149708360337, + "grad_norm": 0.04765570908784866, + "learning_rate": 0.00015565056933255323, + "loss": 0.3252, + "step": 14792 + }, + { + "epoch": 1.1983959818535321, + "grad_norm": 0.03665093332529068, + "learning_rate": 0.00015564606867995862, + "loss": 0.2515, + "step": 14793 + }, + { + "epoch": 1.1984769928710304, + "grad_norm": 0.04346012324094772, + "learning_rate": 0.00015564156802736398, + "loss": 0.3131, + "step": 14794 + }, + { + "epoch": 1.1985580038885288, + "grad_norm": 0.03895379975438118, + "learning_rate": 0.00015563706737476934, + "loss": 0.2658, + "step": 14795 + }, + { + "epoch": 1.1986390149060273, + "grad_norm": 0.05214675888419151, + "learning_rate": 0.00015563256672217473, + "loss": 0.3183, + "step": 14796 + }, + { + "epoch": 1.1987200259235256, + "grad_norm": 0.043334607034921646, + "learning_rate": 0.00015562806606958009, + "loss": 0.3181, + "step": 14797 + }, + { + "epoch": 1.198801036941024, + "grad_norm": 0.04547828808426857, + "learning_rate": 0.00015562356541698547, + "loss": 0.3412, + "step": 14798 + }, + { + "epoch": 1.1988820479585223, + "grad_norm": 0.037553027272224426, + "learning_rate": 0.00015561906476439086, + "loss": 0.3114, + "step": 14799 + }, + { + "epoch": 
1.1989630589760207, + "grad_norm": 0.03953569009900093, + "learning_rate": 0.00015561456411179622, + "loss": 0.3184, + "step": 14800 + }, + { + "epoch": 1.1990440699935192, + "grad_norm": 0.03854568675160408, + "learning_rate": 0.00015561006345920158, + "loss": 0.27, + "step": 14801 + }, + { + "epoch": 1.1991250810110174, + "grad_norm": 0.03653649240732193, + "learning_rate": 0.00015560556280660697, + "loss": 0.2947, + "step": 14802 + }, + { + "epoch": 1.199206092028516, + "grad_norm": 0.0445239320397377, + "learning_rate": 0.00015560106215401233, + "loss": 0.3404, + "step": 14803 + }, + { + "epoch": 1.1992871030460144, + "grad_norm": 0.041224073618650436, + "learning_rate": 0.00015559656150141771, + "loss": 0.3194, + "step": 14804 + }, + { + "epoch": 1.1993681140635126, + "grad_norm": 0.04126469045877457, + "learning_rate": 0.0001555920608488231, + "loss": 0.2901, + "step": 14805 + }, + { + "epoch": 1.199449125081011, + "grad_norm": 0.04383617639541626, + "learning_rate": 0.00015558756019622846, + "loss": 0.3178, + "step": 14806 + }, + { + "epoch": 1.1995301360985093, + "grad_norm": 0.040692463517189026, + "learning_rate": 0.00015558305954363385, + "loss": 0.323, + "step": 14807 + }, + { + "epoch": 1.1996111471160078, + "grad_norm": 0.04072236642241478, + "learning_rate": 0.0001555785588910392, + "loss": 0.3049, + "step": 14808 + }, + { + "epoch": 1.199692158133506, + "grad_norm": 0.04335068538784981, + "learning_rate": 0.00015557405823844457, + "loss": 0.3169, + "step": 14809 + }, + { + "epoch": 1.1997731691510045, + "grad_norm": 0.037739019840955734, + "learning_rate": 0.00015556955758584996, + "loss": 0.2959, + "step": 14810 + }, + { + "epoch": 1.199854180168503, + "grad_norm": 0.038804128766059875, + "learning_rate": 0.00015556505693325534, + "loss": 0.3127, + "step": 14811 + }, + { + "epoch": 1.1999351911860012, + "grad_norm": 0.045990537852048874, + "learning_rate": 0.0001555605562806607, + "loss": 0.3228, + "step": 14812 + }, + { + "epoch": 1.2000162022034997, + "grad_norm": 0.040363702923059464, + "learning_rate": 0.0001555560556280661, + "loss": 0.3352, + "step": 14813 + }, + { + "epoch": 1.2000972132209982, + "grad_norm": 0.03828015923500061, + "learning_rate": 0.00015555155497547145, + "loss": 0.3149, + "step": 14814 + }, + { + "epoch": 1.2001782242384964, + "grad_norm": 0.040625084191560745, + "learning_rate": 0.0001555470543228768, + "loss": 0.3105, + "step": 14815 + }, + { + "epoch": 1.2002592352559949, + "grad_norm": 0.03519357740879059, + "learning_rate": 0.0001555425536702822, + "loss": 0.2812, + "step": 14816 + }, + { + "epoch": 1.2003402462734931, + "grad_norm": 0.03987620398402214, + "learning_rate": 0.00015553805301768759, + "loss": 0.3012, + "step": 14817 + }, + { + "epoch": 1.2004212572909916, + "grad_norm": 0.0429069809615612, + "learning_rate": 0.00015553355236509295, + "loss": 0.2906, + "step": 14818 + }, + { + "epoch": 1.20050226830849, + "grad_norm": 0.0450630784034729, + "learning_rate": 0.00015552905171249833, + "loss": 0.3188, + "step": 14819 + }, + { + "epoch": 1.2005832793259883, + "grad_norm": 0.03968661651015282, + "learning_rate": 0.0001555245510599037, + "loss": 0.2709, + "step": 14820 + }, + { + "epoch": 1.2006642903434868, + "grad_norm": 0.03986947983503342, + "learning_rate": 0.00015552005040730905, + "loss": 0.2897, + "step": 14821 + }, + { + "epoch": 1.200745301360985, + "grad_norm": 0.03897624462842941, + "learning_rate": 0.00015551554975471444, + "loss": 0.2697, + "step": 14822 + }, + { + "epoch": 1.2008263123784835, + "grad_norm": 
0.04822707921266556, + "learning_rate": 0.00015551104910211983, + "loss": 0.3644, + "step": 14823 + }, + { + "epoch": 1.200907323395982, + "grad_norm": 0.03778839111328125, + "learning_rate": 0.0001555065484495252, + "loss": 0.2905, + "step": 14824 + }, + { + "epoch": 1.2009883344134802, + "grad_norm": 0.047402072697877884, + "learning_rate": 0.00015550204779693057, + "loss": 0.3363, + "step": 14825 + }, + { + "epoch": 1.2010693454309787, + "grad_norm": 0.04571527615189552, + "learning_rate": 0.00015549754714433593, + "loss": 0.3272, + "step": 14826 + }, + { + "epoch": 1.201150356448477, + "grad_norm": 0.0438561737537384, + "learning_rate": 0.0001554930464917413, + "loss": 0.2901, + "step": 14827 + }, + { + "epoch": 1.2012313674659754, + "grad_norm": 0.0397321879863739, + "learning_rate": 0.00015548854583914668, + "loss": 0.2673, + "step": 14828 + }, + { + "epoch": 1.2013123784834738, + "grad_norm": 0.04492023214697838, + "learning_rate": 0.00015548404518655207, + "loss": 0.3245, + "step": 14829 + }, + { + "epoch": 1.201393389500972, + "grad_norm": 0.04138173535466194, + "learning_rate": 0.00015547954453395743, + "loss": 0.3149, + "step": 14830 + }, + { + "epoch": 1.2014744005184705, + "grad_norm": 0.041036274284124374, + "learning_rate": 0.00015547504388136282, + "loss": 0.2777, + "step": 14831 + }, + { + "epoch": 1.2015554115359688, + "grad_norm": 0.03809208422899246, + "learning_rate": 0.00015547054322876818, + "loss": 0.2816, + "step": 14832 + }, + { + "epoch": 1.2016364225534673, + "grad_norm": 0.047296952456235886, + "learning_rate": 0.00015546604257617354, + "loss": 0.3102, + "step": 14833 + }, + { + "epoch": 1.2017174335709657, + "grad_norm": 0.041915081441402435, + "learning_rate": 0.00015546154192357892, + "loss": 0.3149, + "step": 14834 + }, + { + "epoch": 1.201798444588464, + "grad_norm": 0.038084451109170914, + "learning_rate": 0.0001554570412709843, + "loss": 0.2709, + "step": 14835 + }, + { + "epoch": 1.2018794556059624, + "grad_norm": 0.04211721941828728, + "learning_rate": 0.00015545254061838967, + "loss": 0.312, + "step": 14836 + }, + { + "epoch": 1.201960466623461, + "grad_norm": 0.044992879033088684, + "learning_rate": 0.00015544803996579506, + "loss": 0.3102, + "step": 14837 + }, + { + "epoch": 1.2020414776409591, + "grad_norm": 0.03669695556163788, + "learning_rate": 0.00015544353931320042, + "loss": 0.3174, + "step": 14838 + }, + { + "epoch": 1.2021224886584576, + "grad_norm": 0.036718446761369705, + "learning_rate": 0.00015543903866060578, + "loss": 0.2909, + "step": 14839 + }, + { + "epoch": 1.2022034996759559, + "grad_norm": 0.042346637696027756, + "learning_rate": 0.00015543453800801116, + "loss": 0.3211, + "step": 14840 + }, + { + "epoch": 1.2022845106934543, + "grad_norm": 0.038561925292015076, + "learning_rate": 0.00015543003735541655, + "loss": 0.2951, + "step": 14841 + }, + { + "epoch": 1.2023655217109526, + "grad_norm": 0.039432037621736526, + "learning_rate": 0.0001554255367028219, + "loss": 0.2802, + "step": 14842 + }, + { + "epoch": 1.202446532728451, + "grad_norm": 0.04120011627674103, + "learning_rate": 0.0001554210360502273, + "loss": 0.3294, + "step": 14843 + }, + { + "epoch": 1.2025275437459495, + "grad_norm": 0.044444065541028976, + "learning_rate": 0.00015541653539763266, + "loss": 0.3118, + "step": 14844 + }, + { + "epoch": 1.2026085547634477, + "grad_norm": 0.04563391953706741, + "learning_rate": 0.00015541203474503805, + "loss": 0.2899, + "step": 14845 + }, + { + "epoch": 1.2026895657809462, + "grad_norm": 0.03690198436379433, + 
"learning_rate": 0.0001554075340924434, + "loss": 0.3212, + "step": 14846 + }, + { + "epoch": 1.2027705767984447, + "grad_norm": 0.03772515431046486, + "learning_rate": 0.0001554030334398488, + "loss": 0.2357, + "step": 14847 + }, + { + "epoch": 1.202851587815943, + "grad_norm": 0.0380142480134964, + "learning_rate": 0.00015539853278725415, + "loss": 0.2648, + "step": 14848 + }, + { + "epoch": 1.2029325988334414, + "grad_norm": 0.042475104331970215, + "learning_rate": 0.00015539403213465954, + "loss": 0.3387, + "step": 14849 + }, + { + "epoch": 1.2030136098509396, + "grad_norm": 0.0449688620865345, + "learning_rate": 0.0001553895314820649, + "loss": 0.3302, + "step": 14850 + }, + { + "epoch": 1.203094620868438, + "grad_norm": 0.03996169567108154, + "learning_rate": 0.0001553850308294703, + "loss": 0.2959, + "step": 14851 + }, + { + "epoch": 1.2031756318859366, + "grad_norm": 0.04284268245100975, + "learning_rate": 0.00015538053017687565, + "loss": 0.3064, + "step": 14852 + }, + { + "epoch": 1.2032566429034348, + "grad_norm": 0.04294389858841896, + "learning_rate": 0.00015537602952428103, + "loss": 0.2904, + "step": 14853 + }, + { + "epoch": 1.2033376539209333, + "grad_norm": 0.04475172236561775, + "learning_rate": 0.0001553715288716864, + "loss": 0.3193, + "step": 14854 + }, + { + "epoch": 1.2034186649384315, + "grad_norm": 0.042766768485307693, + "learning_rate": 0.00015536702821909178, + "loss": 0.3168, + "step": 14855 + }, + { + "epoch": 1.20349967595593, + "grad_norm": 0.04142007604241371, + "learning_rate": 0.00015536252756649714, + "loss": 0.2939, + "step": 14856 + }, + { + "epoch": 1.2035806869734285, + "grad_norm": 0.04174195975065231, + "learning_rate": 0.00015535802691390253, + "loss": 0.2897, + "step": 14857 + }, + { + "epoch": 1.2036616979909267, + "grad_norm": 0.04544617980718613, + "learning_rate": 0.0001553535262613079, + "loss": 0.2798, + "step": 14858 + }, + { + "epoch": 1.2037427090084252, + "grad_norm": 0.03798126056790352, + "learning_rate": 0.00015534902560871328, + "loss": 0.2868, + "step": 14859 + }, + { + "epoch": 1.2038237200259236, + "grad_norm": 0.045044515281915665, + "learning_rate": 0.00015534452495611864, + "loss": 0.3122, + "step": 14860 + }, + { + "epoch": 1.2039047310434219, + "grad_norm": 0.044914714992046356, + "learning_rate": 0.00015534002430352402, + "loss": 0.3229, + "step": 14861 + }, + { + "epoch": 1.2039857420609203, + "grad_norm": 0.044051643460989, + "learning_rate": 0.00015533552365092938, + "loss": 0.2951, + "step": 14862 + }, + { + "epoch": 1.2040667530784186, + "grad_norm": 0.04305178299546242, + "learning_rate": 0.00015533102299833477, + "loss": 0.3235, + "step": 14863 + }, + { + "epoch": 1.204147764095917, + "grad_norm": 0.04135410860180855, + "learning_rate": 0.00015532652234574013, + "loss": 0.3039, + "step": 14864 + }, + { + "epoch": 1.2042287751134153, + "grad_norm": 0.04174239560961723, + "learning_rate": 0.00015532202169314552, + "loss": 0.3394, + "step": 14865 + }, + { + "epoch": 1.2043097861309138, + "grad_norm": 0.041874948889017105, + "learning_rate": 0.00015531752104055088, + "loss": 0.3091, + "step": 14866 + }, + { + "epoch": 1.2043907971484122, + "grad_norm": 0.039929281920194626, + "learning_rate": 0.00015531302038795627, + "loss": 0.3189, + "step": 14867 + }, + { + "epoch": 1.2044718081659105, + "grad_norm": 0.040334559977054596, + "learning_rate": 0.00015530851973536163, + "loss": 0.2749, + "step": 14868 + }, + { + "epoch": 1.204552819183409, + "grad_norm": 0.04389198124408722, + "learning_rate": 0.000155304019082767, + 
"loss": 0.3207, + "step": 14869 + }, + { + "epoch": 1.2046338302009074, + "grad_norm": 0.04084376245737076, + "learning_rate": 0.0001552995184301724, + "loss": 0.2819, + "step": 14870 + }, + { + "epoch": 1.2047148412184057, + "grad_norm": 0.03534472733736038, + "learning_rate": 0.00015529501777757776, + "loss": 0.2908, + "step": 14871 + }, + { + "epoch": 1.2047958522359041, + "grad_norm": 0.03947620093822479, + "learning_rate": 0.00015529051712498312, + "loss": 0.2971, + "step": 14872 + }, + { + "epoch": 1.2048768632534024, + "grad_norm": 0.042341433465480804, + "learning_rate": 0.0001552860164723885, + "loss": 0.3296, + "step": 14873 + }, + { + "epoch": 1.2049578742709008, + "grad_norm": 0.04293164238333702, + "learning_rate": 0.0001552815158197939, + "loss": 0.2891, + "step": 14874 + }, + { + "epoch": 1.2050388852883993, + "grad_norm": 0.041425734758377075, + "learning_rate": 0.00015527701516719925, + "loss": 0.3247, + "step": 14875 + }, + { + "epoch": 1.2051198963058976, + "grad_norm": 0.0458022877573967, + "learning_rate": 0.00015527251451460464, + "loss": 0.3297, + "step": 14876 + }, + { + "epoch": 1.205200907323396, + "grad_norm": 0.040675088763237, + "learning_rate": 0.00015526801386201, + "loss": 0.2815, + "step": 14877 + }, + { + "epoch": 1.2052819183408943, + "grad_norm": 0.0410967692732811, + "learning_rate": 0.00015526351320941536, + "loss": 0.2938, + "step": 14878 + }, + { + "epoch": 1.2053629293583927, + "grad_norm": 0.038196299225091934, + "learning_rate": 0.00015525901255682075, + "loss": 0.3281, + "step": 14879 + }, + { + "epoch": 1.2054439403758912, + "grad_norm": 0.04951519891619682, + "learning_rate": 0.00015525451190422614, + "loss": 0.3125, + "step": 14880 + }, + { + "epoch": 1.2055249513933894, + "grad_norm": 0.05080864578485489, + "learning_rate": 0.0001552500112516315, + "loss": 0.3106, + "step": 14881 + }, + { + "epoch": 1.205605962410888, + "grad_norm": 0.05040706321597099, + "learning_rate": 0.00015524551059903688, + "loss": 0.3864, + "step": 14882 + }, + { + "epoch": 1.2056869734283864, + "grad_norm": 0.03856421634554863, + "learning_rate": 0.00015524100994644224, + "loss": 0.2875, + "step": 14883 + }, + { + "epoch": 1.2057679844458846, + "grad_norm": 0.041090596467256546, + "learning_rate": 0.0001552365092938476, + "loss": 0.2899, + "step": 14884 + }, + { + "epoch": 1.205848995463383, + "grad_norm": 0.05047939345240593, + "learning_rate": 0.000155232008641253, + "loss": 0.3544, + "step": 14885 + }, + { + "epoch": 1.2059300064808813, + "grad_norm": 0.04525664448738098, + "learning_rate": 0.00015522750798865838, + "loss": 0.3289, + "step": 14886 + }, + { + "epoch": 1.2060110174983798, + "grad_norm": 0.04346325248479843, + "learning_rate": 0.00015522300733606374, + "loss": 0.3322, + "step": 14887 + }, + { + "epoch": 1.206092028515878, + "grad_norm": 0.044985491782426834, + "learning_rate": 0.00015521850668346912, + "loss": 0.3022, + "step": 14888 + }, + { + "epoch": 1.2061730395333765, + "grad_norm": 0.03676772490143776, + "learning_rate": 0.00015521400603087448, + "loss": 0.2682, + "step": 14889 + }, + { + "epoch": 1.206254050550875, + "grad_norm": 0.035457346588373184, + "learning_rate": 0.00015520950537827984, + "loss": 0.2738, + "step": 14890 + }, + { + "epoch": 1.2063350615683732, + "grad_norm": 0.04279814288020134, + "learning_rate": 0.00015520500472568523, + "loss": 0.3924, + "step": 14891 + }, + { + "epoch": 1.2064160725858717, + "grad_norm": 0.04287739098072052, + "learning_rate": 0.00015520050407309062, + "loss": 0.3226, + "step": 14892 + }, + { + 
"epoch": 1.2064970836033702, + "grad_norm": 0.04235265776515007, + "learning_rate": 0.00015519600342049598, + "loss": 0.335, + "step": 14893 + }, + { + "epoch": 1.2065780946208684, + "grad_norm": 0.04114942252635956, + "learning_rate": 0.00015519150276790137, + "loss": 0.3094, + "step": 14894 + }, + { + "epoch": 1.2066591056383669, + "grad_norm": 0.04386473447084427, + "learning_rate": 0.00015518700211530673, + "loss": 0.309, + "step": 14895 + }, + { + "epoch": 1.2067401166558651, + "grad_norm": 0.04421220347285271, + "learning_rate": 0.00015518250146271209, + "loss": 0.3256, + "step": 14896 + }, + { + "epoch": 1.2068211276733636, + "grad_norm": 0.041027799248695374, + "learning_rate": 0.00015517800081011747, + "loss": 0.3279, + "step": 14897 + }, + { + "epoch": 1.206902138690862, + "grad_norm": 0.038625966757535934, + "learning_rate": 0.00015517350015752286, + "loss": 0.2986, + "step": 14898 + }, + { + "epoch": 1.2069831497083603, + "grad_norm": 0.05030160769820213, + "learning_rate": 0.00015516899950492822, + "loss": 0.3738, + "step": 14899 + }, + { + "epoch": 1.2070641607258588, + "grad_norm": 0.03595034033060074, + "learning_rate": 0.0001551644988523336, + "loss": 0.2843, + "step": 14900 + }, + { + "epoch": 1.207145171743357, + "grad_norm": 0.04498247429728508, + "learning_rate": 0.00015515999819973897, + "loss": 0.3449, + "step": 14901 + }, + { + "epoch": 1.2072261827608555, + "grad_norm": 0.042141638696193695, + "learning_rate": 0.00015515549754714433, + "loss": 0.321, + "step": 14902 + }, + { + "epoch": 1.207307193778354, + "grad_norm": 0.04549839720129967, + "learning_rate": 0.00015515099689454971, + "loss": 0.262, + "step": 14903 + }, + { + "epoch": 1.2073882047958522, + "grad_norm": 0.047006864100694656, + "learning_rate": 0.0001551464962419551, + "loss": 0.2865, + "step": 14904 + }, + { + "epoch": 1.2074692158133506, + "grad_norm": 0.041136160492897034, + "learning_rate": 0.00015514199558936046, + "loss": 0.3178, + "step": 14905 + }, + { + "epoch": 1.2075502268308491, + "grad_norm": 0.03632412478327751, + "learning_rate": 0.00015513749493676585, + "loss": 0.2786, + "step": 14906 + }, + { + "epoch": 1.2076312378483474, + "grad_norm": 0.03551272302865982, + "learning_rate": 0.0001551329942841712, + "loss": 0.2647, + "step": 14907 + }, + { + "epoch": 1.2077122488658458, + "grad_norm": 0.0367630310356617, + "learning_rate": 0.00015512849363157657, + "loss": 0.292, + "step": 14908 + }, + { + "epoch": 1.207793259883344, + "grad_norm": 0.03985973820090294, + "learning_rate": 0.00015512399297898196, + "loss": 0.2996, + "step": 14909 + }, + { + "epoch": 1.2078742709008425, + "grad_norm": 0.03747883066534996, + "learning_rate": 0.00015511949232638734, + "loss": 0.2588, + "step": 14910 + }, + { + "epoch": 1.2079552819183408, + "grad_norm": 0.037703000009059906, + "learning_rate": 0.0001551149916737927, + "loss": 0.2753, + "step": 14911 + }, + { + "epoch": 1.2080362929358393, + "grad_norm": 0.046212971210479736, + "learning_rate": 0.0001551104910211981, + "loss": 0.3252, + "step": 14912 + }, + { + "epoch": 1.2081173039533377, + "grad_norm": 0.03943370282649994, + "learning_rate": 0.00015510599036860345, + "loss": 0.2965, + "step": 14913 + }, + { + "epoch": 1.208198314970836, + "grad_norm": 0.04212084040045738, + "learning_rate": 0.0001551014897160088, + "loss": 0.3352, + "step": 14914 + }, + { + "epoch": 1.2082793259883344, + "grad_norm": 0.0414251834154129, + "learning_rate": 0.0001550969890634142, + "loss": 0.3091, + "step": 14915 + }, + { + "epoch": 1.208360337005833, + "grad_norm": 
0.040891196578741074, + "learning_rate": 0.00015509248841081959, + "loss": 0.2814, + "step": 14916 + }, + { + "epoch": 1.2084413480233311, + "grad_norm": 0.03951037675142288, + "learning_rate": 0.00015508798775822495, + "loss": 0.2863, + "step": 14917 + }, + { + "epoch": 1.2085223590408296, + "grad_norm": 0.039379850029945374, + "learning_rate": 0.00015508348710563033, + "loss": 0.3006, + "step": 14918 + }, + { + "epoch": 1.2086033700583279, + "grad_norm": 0.04138374701142311, + "learning_rate": 0.0001550789864530357, + "loss": 0.2759, + "step": 14919 + }, + { + "epoch": 1.2086843810758263, + "grad_norm": 0.059719931334257126, + "learning_rate": 0.00015507448580044105, + "loss": 0.3148, + "step": 14920 + }, + { + "epoch": 1.2087653920933248, + "grad_norm": 0.04690566286444664, + "learning_rate": 0.00015506998514784644, + "loss": 0.3427, + "step": 14921 + }, + { + "epoch": 1.208846403110823, + "grad_norm": 0.05078961327672005, + "learning_rate": 0.00015506548449525183, + "loss": 0.3673, + "step": 14922 + }, + { + "epoch": 1.2089274141283215, + "grad_norm": 0.039885710924863815, + "learning_rate": 0.0001550609838426572, + "loss": 0.2906, + "step": 14923 + }, + { + "epoch": 1.2090084251458197, + "grad_norm": 0.03954209014773369, + "learning_rate": 0.00015505648319006257, + "loss": 0.3098, + "step": 14924 + }, + { + "epoch": 1.2090894361633182, + "grad_norm": 0.0377873033285141, + "learning_rate": 0.00015505198253746793, + "loss": 0.2479, + "step": 14925 + }, + { + "epoch": 1.2091704471808167, + "grad_norm": 0.04104090481996536, + "learning_rate": 0.00015504748188487332, + "loss": 0.3105, + "step": 14926 + }, + { + "epoch": 1.209251458198315, + "grad_norm": 0.0429408960044384, + "learning_rate": 0.00015504298123227868, + "loss": 0.3034, + "step": 14927 + }, + { + "epoch": 1.2093324692158134, + "grad_norm": 0.03847665339708328, + "learning_rate": 0.00015503848057968407, + "loss": 0.3016, + "step": 14928 + }, + { + "epoch": 1.2094134802333119, + "grad_norm": 0.03994767740368843, + "learning_rate": 0.00015503397992708943, + "loss": 0.2945, + "step": 14929 + }, + { + "epoch": 1.20949449125081, + "grad_norm": 0.04051990061998367, + "learning_rate": 0.00015502947927449482, + "loss": 0.3454, + "step": 14930 + }, + { + "epoch": 1.2095755022683086, + "grad_norm": 0.04037369042634964, + "learning_rate": 0.00015502497862190018, + "loss": 0.3038, + "step": 14931 + }, + { + "epoch": 1.2096565132858068, + "grad_norm": 0.0358559787273407, + "learning_rate": 0.00015502047796930556, + "loss": 0.267, + "step": 14932 + }, + { + "epoch": 1.2097375243033053, + "grad_norm": 0.039637815207242966, + "learning_rate": 0.00015501597731671092, + "loss": 0.2757, + "step": 14933 + }, + { + "epoch": 1.2098185353208035, + "grad_norm": 0.042503852397203445, + "learning_rate": 0.0001550114766641163, + "loss": 0.3243, + "step": 14934 + }, + { + "epoch": 1.209899546338302, + "grad_norm": 0.04102681949734688, + "learning_rate": 0.00015500697601152167, + "loss": 0.3234, + "step": 14935 + }, + { + "epoch": 1.2099805573558005, + "grad_norm": 0.04349510744214058, + "learning_rate": 0.00015500247535892706, + "loss": 0.3462, + "step": 14936 + }, + { + "epoch": 1.2100615683732987, + "grad_norm": 0.037747252732515335, + "learning_rate": 0.00015499797470633242, + "loss": 0.2838, + "step": 14937 + }, + { + "epoch": 1.2101425793907972, + "grad_norm": 0.04947254806756973, + "learning_rate": 0.0001549934740537378, + "loss": 0.2977, + "step": 14938 + }, + { + "epoch": 1.2102235904082956, + "grad_norm": 0.0345742292702198, + "learning_rate": 
0.0001549889734011432, + "loss": 0.2769, + "step": 14939 + }, + { + "epoch": 1.2103046014257939, + "grad_norm": 0.04381577670574188, + "learning_rate": 0.00015498447274854855, + "loss": 0.3068, + "step": 14940 + }, + { + "epoch": 1.2103856124432923, + "grad_norm": 0.04694143310189247, + "learning_rate": 0.0001549799720959539, + "loss": 0.2989, + "step": 14941 + }, + { + "epoch": 1.2104666234607906, + "grad_norm": 0.04614692181348801, + "learning_rate": 0.0001549754714433593, + "loss": 0.3008, + "step": 14942 + }, + { + "epoch": 1.210547634478289, + "grad_norm": 0.04504687711596489, + "learning_rate": 0.00015497097079076466, + "loss": 0.2721, + "step": 14943 + }, + { + "epoch": 1.2106286454957873, + "grad_norm": 0.04158374294638634, + "learning_rate": 0.00015496647013817005, + "loss": 0.3073, + "step": 14944 + }, + { + "epoch": 1.2107096565132858, + "grad_norm": 0.06658129394054413, + "learning_rate": 0.00015496196948557543, + "loss": 0.3523, + "step": 14945 + }, + { + "epoch": 1.2107906675307842, + "grad_norm": 0.03513149544596672, + "learning_rate": 0.0001549574688329808, + "loss": 0.265, + "step": 14946 + }, + { + "epoch": 1.2108716785482825, + "grad_norm": 0.04534105584025383, + "learning_rate": 0.00015495296818038615, + "loss": 0.2945, + "step": 14947 + }, + { + "epoch": 1.210952689565781, + "grad_norm": 0.04596344754099846, + "learning_rate": 0.00015494846752779154, + "loss": 0.3335, + "step": 14948 + }, + { + "epoch": 1.2110337005832794, + "grad_norm": 0.042053647339344025, + "learning_rate": 0.00015494396687519693, + "loss": 0.3075, + "step": 14949 + }, + { + "epoch": 1.2111147116007777, + "grad_norm": 0.0502544566988945, + "learning_rate": 0.0001549394662226023, + "loss": 0.3104, + "step": 14950 + }, + { + "epoch": 1.2111957226182761, + "grad_norm": 0.04380401596426964, + "learning_rate": 0.00015493496557000767, + "loss": 0.3072, + "step": 14951 + }, + { + "epoch": 1.2112767336357744, + "grad_norm": 0.043562911450862885, + "learning_rate": 0.00015493046491741304, + "loss": 0.3615, + "step": 14952 + }, + { + "epoch": 1.2113577446532728, + "grad_norm": 0.04325402155518532, + "learning_rate": 0.0001549259642648184, + "loss": 0.3349, + "step": 14953 + }, + { + "epoch": 1.2114387556707713, + "grad_norm": 0.04529077187180519, + "learning_rate": 0.00015492146361222378, + "loss": 0.337, + "step": 14954 + }, + { + "epoch": 1.2115197666882696, + "grad_norm": 0.04269943758845329, + "learning_rate": 0.00015491696295962917, + "loss": 0.3169, + "step": 14955 + }, + { + "epoch": 1.211600777705768, + "grad_norm": 0.03706458956003189, + "learning_rate": 0.00015491246230703453, + "loss": 0.2868, + "step": 14956 + }, + { + "epoch": 1.2116817887232663, + "grad_norm": 0.04907451570034027, + "learning_rate": 0.00015490796165443992, + "loss": 0.3273, + "step": 14957 + }, + { + "epoch": 1.2117627997407647, + "grad_norm": 0.035258274525403976, + "learning_rate": 0.00015490346100184528, + "loss": 0.2706, + "step": 14958 + }, + { + "epoch": 1.2118438107582632, + "grad_norm": 0.05509074032306671, + "learning_rate": 0.00015489896034925064, + "loss": 0.3811, + "step": 14959 + }, + { + "epoch": 1.2119248217757614, + "grad_norm": 0.038064759224653244, + "learning_rate": 0.00015489445969665602, + "loss": 0.3257, + "step": 14960 + }, + { + "epoch": 1.21200583279326, + "grad_norm": 0.038369227200746536, + "learning_rate": 0.0001548899590440614, + "loss": 0.2874, + "step": 14961 + }, + { + "epoch": 1.2120868438107584, + "grad_norm": 0.04148680716753006, + "learning_rate": 0.00015488545839146677, + "loss": 0.3028, + 
"step": 14962 + }, + { + "epoch": 1.2121678548282566, + "grad_norm": 0.038149863481521606, + "learning_rate": 0.00015488095773887216, + "loss": 0.3129, + "step": 14963 + }, + { + "epoch": 1.212248865845755, + "grad_norm": 0.04594748839735985, + "learning_rate": 0.00015487645708627752, + "loss": 0.3335, + "step": 14964 + }, + { + "epoch": 1.2123298768632533, + "grad_norm": 0.042934972792863846, + "learning_rate": 0.00015487195643368288, + "loss": 0.3348, + "step": 14965 + }, + { + "epoch": 1.2124108878807518, + "grad_norm": 0.040971312671899796, + "learning_rate": 0.00015486745578108827, + "loss": 0.3171, + "step": 14966 + }, + { + "epoch": 1.21249189889825, + "grad_norm": 0.041622862219810486, + "learning_rate": 0.00015486295512849365, + "loss": 0.2778, + "step": 14967 + }, + { + "epoch": 1.2125729099157485, + "grad_norm": 0.041777707636356354, + "learning_rate": 0.000154858454475899, + "loss": 0.2911, + "step": 14968 + }, + { + "epoch": 1.212653920933247, + "grad_norm": 0.04000363126397133, + "learning_rate": 0.0001548539538233044, + "loss": 0.3156, + "step": 14969 + }, + { + "epoch": 1.2127349319507452, + "grad_norm": 0.048961807042360306, + "learning_rate": 0.00015484945317070976, + "loss": 0.3202, + "step": 14970 + }, + { + "epoch": 1.2128159429682437, + "grad_norm": 0.03984816372394562, + "learning_rate": 0.00015484495251811512, + "loss": 0.2502, + "step": 14971 + }, + { + "epoch": 1.2128969539857422, + "grad_norm": 0.045237038284540176, + "learning_rate": 0.0001548404518655205, + "loss": 0.3387, + "step": 14972 + }, + { + "epoch": 1.2129779650032404, + "grad_norm": 0.039658062160015106, + "learning_rate": 0.0001548359512129259, + "loss": 0.2838, + "step": 14973 + }, + { + "epoch": 1.2130589760207389, + "grad_norm": 0.03694017976522446, + "learning_rate": 0.00015483145056033125, + "loss": 0.2642, + "step": 14974 + }, + { + "epoch": 1.213139987038237, + "grad_norm": 0.03846210986375809, + "learning_rate": 0.00015482694990773664, + "loss": 0.2934, + "step": 14975 + }, + { + "epoch": 1.2132209980557356, + "grad_norm": 0.04987949877977371, + "learning_rate": 0.000154822449255142, + "loss": 0.3406, + "step": 14976 + }, + { + "epoch": 1.213302009073234, + "grad_norm": 0.03913595527410507, + "learning_rate": 0.00015481794860254736, + "loss": 0.2639, + "step": 14977 + }, + { + "epoch": 1.2133830200907323, + "grad_norm": 0.04530547186732292, + "learning_rate": 0.00015481344794995275, + "loss": 0.3033, + "step": 14978 + }, + { + "epoch": 1.2134640311082308, + "grad_norm": 0.04652739688754082, + "learning_rate": 0.00015480894729735814, + "loss": 0.3455, + "step": 14979 + }, + { + "epoch": 1.213545042125729, + "grad_norm": 0.038609545677900314, + "learning_rate": 0.0001548044466447635, + "loss": 0.2893, + "step": 14980 + }, + { + "epoch": 1.2136260531432275, + "grad_norm": 0.03681579604744911, + "learning_rate": 0.00015479994599216888, + "loss": 0.2949, + "step": 14981 + }, + { + "epoch": 1.213707064160726, + "grad_norm": 0.035748064517974854, + "learning_rate": 0.00015479544533957424, + "loss": 0.2806, + "step": 14982 + }, + { + "epoch": 1.2137880751782242, + "grad_norm": 0.04264812543988228, + "learning_rate": 0.0001547909446869796, + "loss": 0.3046, + "step": 14983 + }, + { + "epoch": 1.2138690861957226, + "grad_norm": 0.041751082986593246, + "learning_rate": 0.000154786444034385, + "loss": 0.3084, + "step": 14984 + }, + { + "epoch": 1.2139500972132211, + "grad_norm": 0.04104901850223541, + "learning_rate": 0.00015478194338179038, + "loss": 0.3208, + "step": 14985 + }, + { + "epoch": 
1.2140311082307194, + "grad_norm": 0.05218047276139259, + "learning_rate": 0.00015477744272919574, + "loss": 0.3657, + "step": 14986 + }, + { + "epoch": 1.2141121192482178, + "grad_norm": 0.04347262904047966, + "learning_rate": 0.00015477294207660112, + "loss": 0.3151, + "step": 14987 + }, + { + "epoch": 1.214193130265716, + "grad_norm": 0.03328379616141319, + "learning_rate": 0.00015476844142400648, + "loss": 0.2409, + "step": 14988 + }, + { + "epoch": 1.2142741412832145, + "grad_norm": 0.0384984090924263, + "learning_rate": 0.00015476394077141184, + "loss": 0.2918, + "step": 14989 + }, + { + "epoch": 1.2143551523007128, + "grad_norm": 0.03588297218084335, + "learning_rate": 0.00015475944011881723, + "loss": 0.2647, + "step": 14990 + }, + { + "epoch": 1.2144361633182112, + "grad_norm": 0.041738592088222504, + "learning_rate": 0.00015475493946622262, + "loss": 0.3172, + "step": 14991 + }, + { + "epoch": 1.2145171743357097, + "grad_norm": 0.04426318779587746, + "learning_rate": 0.00015475043881362798, + "loss": 0.3277, + "step": 14992 + }, + { + "epoch": 1.214598185353208, + "grad_norm": 0.04357115179300308, + "learning_rate": 0.00015474593816103337, + "loss": 0.2669, + "step": 14993 + }, + { + "epoch": 1.2146791963707064, + "grad_norm": 0.03911040350794792, + "learning_rate": 0.00015474143750843873, + "loss": 0.2868, + "step": 14994 + }, + { + "epoch": 1.214760207388205, + "grad_norm": 0.04531647264957428, + "learning_rate": 0.00015473693685584409, + "loss": 0.3478, + "step": 14995 + }, + { + "epoch": 1.2148412184057031, + "grad_norm": 0.04873902350664139, + "learning_rate": 0.00015473243620324947, + "loss": 0.3047, + "step": 14996 + }, + { + "epoch": 1.2149222294232016, + "grad_norm": 0.040747951716184616, + "learning_rate": 0.00015472793555065486, + "loss": 0.3153, + "step": 14997 + }, + { + "epoch": 1.2150032404406998, + "grad_norm": 0.04346758872270584, + "learning_rate": 0.00015472343489806022, + "loss": 0.261, + "step": 14998 + }, + { + "epoch": 1.2150842514581983, + "grad_norm": 0.045580729842185974, + "learning_rate": 0.0001547189342454656, + "loss": 0.3188, + "step": 14999 + }, + { + "epoch": 1.2151652624756968, + "grad_norm": 0.041919250041246414, + "learning_rate": 0.00015471443359287097, + "loss": 0.2996, + "step": 15000 + }, + { + "epoch": 1.215246273493195, + "grad_norm": 0.04074418172240257, + "learning_rate": 0.00015470993294027633, + "loss": 0.2698, + "step": 15001 + }, + { + "epoch": 1.2153272845106935, + "grad_norm": 0.03675012290477753, + "learning_rate": 0.00015470543228768172, + "loss": 0.2639, + "step": 15002 + }, + { + "epoch": 1.2154082955281917, + "grad_norm": 0.0448453314602375, + "learning_rate": 0.0001547009316350871, + "loss": 0.3048, + "step": 15003 + }, + { + "epoch": 1.2154893065456902, + "grad_norm": 0.04736652597784996, + "learning_rate": 0.00015469643098249246, + "loss": 0.3033, + "step": 15004 + }, + { + "epoch": 1.2155703175631887, + "grad_norm": 0.03896206244826317, + "learning_rate": 0.00015469193032989785, + "loss": 0.2953, + "step": 15005 + }, + { + "epoch": 1.215651328580687, + "grad_norm": 0.03998485207557678, + "learning_rate": 0.0001546874296773032, + "loss": 0.2969, + "step": 15006 + }, + { + "epoch": 1.2157323395981854, + "grad_norm": 0.040892038494348526, + "learning_rate": 0.0001546829290247086, + "loss": 0.3103, + "step": 15007 + }, + { + "epoch": 1.2158133506156839, + "grad_norm": 0.04300196096301079, + "learning_rate": 0.00015467842837211398, + "loss": 0.2877, + "step": 15008 + }, + { + "epoch": 1.215894361633182, + "grad_norm": 
0.04934476315975189, + "learning_rate": 0.00015467392771951934, + "loss": 0.3347, + "step": 15009 + }, + { + "epoch": 1.2159753726506806, + "grad_norm": 0.04549160599708557, + "learning_rate": 0.0001546694270669247, + "loss": 0.2961, + "step": 15010 + }, + { + "epoch": 1.2160563836681788, + "grad_norm": 0.03903596103191376, + "learning_rate": 0.0001546649264143301, + "loss": 0.309, + "step": 15011 + }, + { + "epoch": 1.2161373946856773, + "grad_norm": 0.0424087792634964, + "learning_rate": 0.00015466042576173545, + "loss": 0.3197, + "step": 15012 + }, + { + "epoch": 1.2162184057031755, + "grad_norm": 0.037096478044986725, + "learning_rate": 0.00015465592510914084, + "loss": 0.29, + "step": 15013 + }, + { + "epoch": 1.216299416720674, + "grad_norm": 0.034570056945085526, + "learning_rate": 0.00015465142445654623, + "loss": 0.2682, + "step": 15014 + }, + { + "epoch": 1.2163804277381725, + "grad_norm": 0.038456372916698456, + "learning_rate": 0.00015464692380395159, + "loss": 0.3088, + "step": 15015 + }, + { + "epoch": 1.2164614387556707, + "grad_norm": 0.044545628130435944, + "learning_rate": 0.00015464242315135695, + "loss": 0.3117, + "step": 15016 + }, + { + "epoch": 1.2165424497731692, + "grad_norm": 0.043802566826343536, + "learning_rate": 0.00015463792249876233, + "loss": 0.3331, + "step": 15017 + }, + { + "epoch": 1.2166234607906676, + "grad_norm": 0.04616328328847885, + "learning_rate": 0.0001546334218461677, + "loss": 0.3316, + "step": 15018 + }, + { + "epoch": 1.2167044718081659, + "grad_norm": 0.040806081146001816, + "learning_rate": 0.00015462892119357308, + "loss": 0.2668, + "step": 15019 + }, + { + "epoch": 1.2167854828256643, + "grad_norm": 0.05050874873995781, + "learning_rate": 0.00015462442054097847, + "loss": 0.3035, + "step": 15020 + }, + { + "epoch": 1.2168664938431626, + "grad_norm": 0.04004824161529541, + "learning_rate": 0.00015461991988838383, + "loss": 0.3014, + "step": 15021 + }, + { + "epoch": 1.216947504860661, + "grad_norm": 0.03522452712059021, + "learning_rate": 0.0001546154192357892, + "loss": 0.2926, + "step": 15022 + }, + { + "epoch": 1.2170285158781595, + "grad_norm": 0.040806613862514496, + "learning_rate": 0.00015461091858319457, + "loss": 0.3207, + "step": 15023 + }, + { + "epoch": 1.2171095268956578, + "grad_norm": 0.04380201920866966, + "learning_rate": 0.00015460641793059993, + "loss": 0.3041, + "step": 15024 + }, + { + "epoch": 1.2171905379131562, + "grad_norm": 0.04264390841126442, + "learning_rate": 0.00015460191727800532, + "loss": 0.3032, + "step": 15025 + }, + { + "epoch": 1.2172715489306545, + "grad_norm": 0.04169783741235733, + "learning_rate": 0.0001545974166254107, + "loss": 0.3072, + "step": 15026 + }, + { + "epoch": 1.217352559948153, + "grad_norm": 0.0487448051571846, + "learning_rate": 0.00015459291597281607, + "loss": 0.2996, + "step": 15027 + }, + { + "epoch": 1.2174335709656514, + "grad_norm": 0.0421229712665081, + "learning_rate": 0.00015458841532022143, + "loss": 0.2939, + "step": 15028 + }, + { + "epoch": 1.2175145819831497, + "grad_norm": 0.041163280606269836, + "learning_rate": 0.00015458391466762682, + "loss": 0.3402, + "step": 15029 + }, + { + "epoch": 1.2175955930006481, + "grad_norm": 0.03774140030145645, + "learning_rate": 0.0001545794140150322, + "loss": 0.2603, + "step": 15030 + }, + { + "epoch": 1.2176766040181466, + "grad_norm": 0.03964897617697716, + "learning_rate": 0.00015457491336243756, + "loss": 0.2829, + "step": 15031 + }, + { + "epoch": 1.2177576150356448, + "grad_norm": 0.05228666588664055, + "learning_rate": 
0.00015457041270984295, + "loss": 0.3825, + "step": 15032 + }, + { + "epoch": 1.2178386260531433, + "grad_norm": 0.03329094871878624, + "learning_rate": 0.0001545659120572483, + "loss": 0.2762, + "step": 15033 + }, + { + "epoch": 1.2179196370706415, + "grad_norm": 0.041959576308727264, + "learning_rate": 0.00015456141140465367, + "loss": 0.3409, + "step": 15034 + }, + { + "epoch": 1.21800064808814, + "grad_norm": 0.04155031964182854, + "learning_rate": 0.00015455691075205906, + "loss": 0.3188, + "step": 15035 + }, + { + "epoch": 1.2180816591056383, + "grad_norm": 0.039347093552351, + "learning_rate": 0.00015455241009946444, + "loss": 0.3165, + "step": 15036 + }, + { + "epoch": 1.2181626701231367, + "grad_norm": 0.039845991879701614, + "learning_rate": 0.0001545479094468698, + "loss": 0.2788, + "step": 15037 + }, + { + "epoch": 1.2182436811406352, + "grad_norm": 0.04731302335858345, + "learning_rate": 0.0001545434087942752, + "loss": 0.2945, + "step": 15038 + }, + { + "epoch": 1.2183246921581334, + "grad_norm": 0.04452233016490936, + "learning_rate": 0.00015453890814168055, + "loss": 0.314, + "step": 15039 + }, + { + "epoch": 1.218405703175632, + "grad_norm": 0.04520030319690704, + "learning_rate": 0.0001545344074890859, + "loss": 0.3582, + "step": 15040 + }, + { + "epoch": 1.2184867141931304, + "grad_norm": 0.04778093472123146, + "learning_rate": 0.0001545299068364913, + "loss": 0.3604, + "step": 15041 + }, + { + "epoch": 1.2185677252106286, + "grad_norm": 0.04183390736579895, + "learning_rate": 0.00015452540618389669, + "loss": 0.3324, + "step": 15042 + }, + { + "epoch": 1.218648736228127, + "grad_norm": 0.0479646660387516, + "learning_rate": 0.00015452090553130205, + "loss": 0.3547, + "step": 15043 + }, + { + "epoch": 1.2187297472456253, + "grad_norm": 0.049034297466278076, + "learning_rate": 0.00015451640487870743, + "loss": 0.3318, + "step": 15044 + }, + { + "epoch": 1.2188107582631238, + "grad_norm": 0.034791890531778336, + "learning_rate": 0.0001545119042261128, + "loss": 0.283, + "step": 15045 + }, + { + "epoch": 1.2188917692806223, + "grad_norm": 0.04443153366446495, + "learning_rate": 0.00015450740357351815, + "loss": 0.2866, + "step": 15046 + }, + { + "epoch": 1.2189727802981205, + "grad_norm": 0.04115704074501991, + "learning_rate": 0.00015450290292092354, + "loss": 0.2974, + "step": 15047 + }, + { + "epoch": 1.219053791315619, + "grad_norm": 0.04250843822956085, + "learning_rate": 0.00015449840226832893, + "loss": 0.2979, + "step": 15048 + }, + { + "epoch": 1.2191348023331172, + "grad_norm": 0.04619358107447624, + "learning_rate": 0.0001544939016157343, + "loss": 0.3257, + "step": 15049 + }, + { + "epoch": 1.2192158133506157, + "grad_norm": 0.04598325490951538, + "learning_rate": 0.00015448940096313968, + "loss": 0.3024, + "step": 15050 + }, + { + "epoch": 1.2192968243681142, + "grad_norm": 0.046086326241493225, + "learning_rate": 0.00015448490031054504, + "loss": 0.3025, + "step": 15051 + }, + { + "epoch": 1.2193778353856124, + "grad_norm": 0.04900304973125458, + "learning_rate": 0.0001544803996579504, + "loss": 0.2745, + "step": 15052 + }, + { + "epoch": 1.2194588464031109, + "grad_norm": 0.048376649618148804, + "learning_rate": 0.00015447589900535578, + "loss": 0.2921, + "step": 15053 + }, + { + "epoch": 1.219539857420609, + "grad_norm": 0.04333197697997093, + "learning_rate": 0.00015447139835276117, + "loss": 0.3155, + "step": 15054 + }, + { + "epoch": 1.2196208684381076, + "grad_norm": 0.042843107134103775, + "learning_rate": 0.00015446689770016653, + "loss": 0.3148, + 
"step": 15055 + }, + { + "epoch": 1.219701879455606, + "grad_norm": 0.044335655868053436, + "learning_rate": 0.00015446239704757192, + "loss": 0.3293, + "step": 15056 + }, + { + "epoch": 1.2197828904731043, + "grad_norm": 0.0423131063580513, + "learning_rate": 0.00015445789639497728, + "loss": 0.3143, + "step": 15057 + }, + { + "epoch": 1.2198639014906028, + "grad_norm": 0.0378672294318676, + "learning_rate": 0.00015445339574238264, + "loss": 0.2685, + "step": 15058 + }, + { + "epoch": 1.219944912508101, + "grad_norm": 0.04331289604306221, + "learning_rate": 0.00015444889508978802, + "loss": 0.3153, + "step": 15059 + }, + { + "epoch": 1.2200259235255995, + "grad_norm": 0.03961778059601784, + "learning_rate": 0.0001544443944371934, + "loss": 0.2746, + "step": 15060 + }, + { + "epoch": 1.220106934543098, + "grad_norm": 0.04394698515534401, + "learning_rate": 0.00015443989378459877, + "loss": 0.3073, + "step": 15061 + }, + { + "epoch": 1.2201879455605962, + "grad_norm": 0.03818221762776375, + "learning_rate": 0.00015443539313200416, + "loss": 0.2914, + "step": 15062 + }, + { + "epoch": 1.2202689565780946, + "grad_norm": 0.04286661371588707, + "learning_rate": 0.00015443089247940952, + "loss": 0.2921, + "step": 15063 + }, + { + "epoch": 1.220349967595593, + "grad_norm": 0.04242391511797905, + "learning_rate": 0.00015442639182681488, + "loss": 0.2607, + "step": 15064 + }, + { + "epoch": 1.2204309786130914, + "grad_norm": 0.04057682305574417, + "learning_rate": 0.00015442189117422027, + "loss": 0.3318, + "step": 15065 + }, + { + "epoch": 1.2205119896305898, + "grad_norm": 0.0383298434317112, + "learning_rate": 0.00015441739052162565, + "loss": 0.2709, + "step": 15066 + }, + { + "epoch": 1.220593000648088, + "grad_norm": 0.03954831883311272, + "learning_rate": 0.000154412889869031, + "loss": 0.2917, + "step": 15067 + }, + { + "epoch": 1.2206740116655865, + "grad_norm": 0.03849297761917114, + "learning_rate": 0.0001544083892164364, + "loss": 0.2938, + "step": 15068 + }, + { + "epoch": 1.2207550226830848, + "grad_norm": 0.04343443736433983, + "learning_rate": 0.00015440388856384176, + "loss": 0.2971, + "step": 15069 + }, + { + "epoch": 1.2208360337005832, + "grad_norm": 0.04036508873105049, + "learning_rate": 0.00015439938791124712, + "loss": 0.3394, + "step": 15070 + }, + { + "epoch": 1.2209170447180817, + "grad_norm": 0.03749840706586838, + "learning_rate": 0.0001543948872586525, + "loss": 0.2866, + "step": 15071 + }, + { + "epoch": 1.22099805573558, + "grad_norm": 0.03948388993740082, + "learning_rate": 0.0001543903866060579, + "loss": 0.3344, + "step": 15072 + }, + { + "epoch": 1.2210790667530784, + "grad_norm": 0.04332873970270157, + "learning_rate": 0.00015438588595346325, + "loss": 0.3372, + "step": 15073 + }, + { + "epoch": 1.221160077770577, + "grad_norm": 0.032650191336870193, + "learning_rate": 0.00015438138530086864, + "loss": 0.2635, + "step": 15074 + }, + { + "epoch": 1.2212410887880751, + "grad_norm": 0.04500601440668106, + "learning_rate": 0.000154376884648274, + "loss": 0.3153, + "step": 15075 + }, + { + "epoch": 1.2213220998055736, + "grad_norm": 0.03901544585824013, + "learning_rate": 0.00015437238399567936, + "loss": 0.3147, + "step": 15076 + }, + { + "epoch": 1.2214031108230718, + "grad_norm": 0.04170721769332886, + "learning_rate": 0.00015436788334308478, + "loss": 0.2852, + "step": 15077 + }, + { + "epoch": 1.2214841218405703, + "grad_norm": 0.04137682914733887, + "learning_rate": 0.00015436338269049014, + "loss": 0.2999, + "step": 15078 + }, + { + "epoch": 
1.2215651328580688, + "grad_norm": 0.045757956802845, + "learning_rate": 0.0001543588820378955, + "loss": 0.3354, + "step": 15079 + }, + { + "epoch": 1.221646143875567, + "grad_norm": 0.04264063015580177, + "learning_rate": 0.00015435438138530088, + "loss": 0.2822, + "step": 15080 + }, + { + "epoch": 1.2217271548930655, + "grad_norm": 0.04429285600781441, + "learning_rate": 0.00015434988073270624, + "loss": 0.3438, + "step": 15081 + }, + { + "epoch": 1.2218081659105637, + "grad_norm": 0.038446854799985886, + "learning_rate": 0.00015434538008011163, + "loss": 0.2803, + "step": 15082 + }, + { + "epoch": 1.2218891769280622, + "grad_norm": 0.0383138470351696, + "learning_rate": 0.00015434087942751702, + "loss": 0.2487, + "step": 15083 + }, + { + "epoch": 1.2219701879455607, + "grad_norm": 0.04340263456106186, + "learning_rate": 0.00015433637877492238, + "loss": 0.3308, + "step": 15084 + }, + { + "epoch": 1.222051198963059, + "grad_norm": 0.04444803297519684, + "learning_rate": 0.00015433187812232774, + "loss": 0.3341, + "step": 15085 + }, + { + "epoch": 1.2221322099805574, + "grad_norm": 0.04100262001156807, + "learning_rate": 0.00015432737746973312, + "loss": 0.2889, + "step": 15086 + }, + { + "epoch": 1.2222132209980558, + "grad_norm": 0.04645245522260666, + "learning_rate": 0.00015432287681713849, + "loss": 0.3037, + "step": 15087 + }, + { + "epoch": 1.222294232015554, + "grad_norm": 0.0426318496465683, + "learning_rate": 0.00015431837616454387, + "loss": 0.3033, + "step": 15088 + }, + { + "epoch": 1.2223752430330526, + "grad_norm": 0.0452832467854023, + "learning_rate": 0.00015431387551194926, + "loss": 0.3108, + "step": 15089 + }, + { + "epoch": 1.2224562540505508, + "grad_norm": 0.038874559104442596, + "learning_rate": 0.00015430937485935462, + "loss": 0.3092, + "step": 15090 + }, + { + "epoch": 1.2225372650680493, + "grad_norm": 0.04850398376584053, + "learning_rate": 0.00015430487420675998, + "loss": 0.3435, + "step": 15091 + }, + { + "epoch": 1.2226182760855475, + "grad_norm": 0.04427666589617729, + "learning_rate": 0.00015430037355416537, + "loss": 0.3104, + "step": 15092 + }, + { + "epoch": 1.222699287103046, + "grad_norm": 0.045927468687295914, + "learning_rate": 0.00015429587290157073, + "loss": 0.3275, + "step": 15093 + }, + { + "epoch": 1.2227802981205445, + "grad_norm": 0.04388774186372757, + "learning_rate": 0.00015429137224897611, + "loss": 0.2877, + "step": 15094 + }, + { + "epoch": 1.2228613091380427, + "grad_norm": 0.040757108479738235, + "learning_rate": 0.0001542868715963815, + "loss": 0.2963, + "step": 15095 + }, + { + "epoch": 1.2229423201555412, + "grad_norm": 0.041438765823841095, + "learning_rate": 0.00015428237094378686, + "loss": 0.2913, + "step": 15096 + }, + { + "epoch": 1.2230233311730396, + "grad_norm": 0.03743297979235649, + "learning_rate": 0.00015427787029119222, + "loss": 0.3281, + "step": 15097 + }, + { + "epoch": 1.2231043421905379, + "grad_norm": 0.048839520663022995, + "learning_rate": 0.0001542733696385976, + "loss": 0.2851, + "step": 15098 + }, + { + "epoch": 1.2231853532080363, + "grad_norm": 0.04056665673851967, + "learning_rate": 0.00015426886898600297, + "loss": 0.2745, + "step": 15099 + }, + { + "epoch": 1.2232663642255346, + "grad_norm": 0.04060543701052666, + "learning_rate": 0.00015426436833340836, + "loss": 0.3062, + "step": 15100 + }, + { + "epoch": 1.223347375243033, + "grad_norm": 0.04008857533335686, + "learning_rate": 0.00015425986768081374, + "loss": 0.3009, + "step": 15101 + }, + { + "epoch": 1.2234283862605315, + "grad_norm": 
0.04103999212384224, + "learning_rate": 0.0001542553670282191, + "loss": 0.3311, + "step": 15102 + }, + { + "epoch": 1.2235093972780298, + "grad_norm": 0.047354403883218765, + "learning_rate": 0.00015425086637562446, + "loss": 0.3822, + "step": 15103 + }, + { + "epoch": 1.2235904082955282, + "grad_norm": 0.045010678470134735, + "learning_rate": 0.00015424636572302985, + "loss": 0.3392, + "step": 15104 + }, + { + "epoch": 1.2236714193130265, + "grad_norm": 0.04420321062207222, + "learning_rate": 0.0001542418650704352, + "loss": 0.3222, + "step": 15105 + }, + { + "epoch": 1.223752430330525, + "grad_norm": 0.04299933463335037, + "learning_rate": 0.0001542373644178406, + "loss": 0.3307, + "step": 15106 + }, + { + "epoch": 1.2238334413480234, + "grad_norm": 0.0378875657916069, + "learning_rate": 0.00015423286376524598, + "loss": 0.2635, + "step": 15107 + }, + { + "epoch": 1.2239144523655217, + "grad_norm": 0.04010816290974617, + "learning_rate": 0.00015422836311265134, + "loss": 0.2729, + "step": 15108 + }, + { + "epoch": 1.2239954633830201, + "grad_norm": 0.04282679781317711, + "learning_rate": 0.0001542238624600567, + "loss": 0.3142, + "step": 15109 + }, + { + "epoch": 1.2240764744005186, + "grad_norm": 0.04327215999364853, + "learning_rate": 0.0001542193618074621, + "loss": 0.2783, + "step": 15110 + }, + { + "epoch": 1.2241574854180168, + "grad_norm": 0.04052073508501053, + "learning_rate": 0.00015421486115486748, + "loss": 0.2601, + "step": 15111 + }, + { + "epoch": 1.2242384964355153, + "grad_norm": 0.049192775040864944, + "learning_rate": 0.00015421036050227284, + "loss": 0.3162, + "step": 15112 + }, + { + "epoch": 1.2243195074530135, + "grad_norm": 0.041703008115291595, + "learning_rate": 0.00015420585984967823, + "loss": 0.3364, + "step": 15113 + }, + { + "epoch": 1.224400518470512, + "grad_norm": 0.04128837585449219, + "learning_rate": 0.00015420135919708359, + "loss": 0.3077, + "step": 15114 + }, + { + "epoch": 1.2244815294880103, + "grad_norm": 0.0411527045071125, + "learning_rate": 0.00015419685854448895, + "loss": 0.2942, + "step": 15115 + }, + { + "epoch": 1.2245625405055087, + "grad_norm": 0.03827161341905594, + "learning_rate": 0.00015419235789189433, + "loss": 0.2545, + "step": 15116 + }, + { + "epoch": 1.2246435515230072, + "grad_norm": 0.04548992961645126, + "learning_rate": 0.00015418785723929972, + "loss": 0.3359, + "step": 15117 + }, + { + "epoch": 1.2247245625405054, + "grad_norm": 0.043934501707553864, + "learning_rate": 0.00015418335658670508, + "loss": 0.295, + "step": 15118 + }, + { + "epoch": 1.224805573558004, + "grad_norm": 0.04573789983987808, + "learning_rate": 0.00015417885593411047, + "loss": 0.3106, + "step": 15119 + }, + { + "epoch": 1.2248865845755024, + "grad_norm": 0.040008544921875, + "learning_rate": 0.00015417435528151583, + "loss": 0.3301, + "step": 15120 + }, + { + "epoch": 1.2249675955930006, + "grad_norm": 0.04708806052803993, + "learning_rate": 0.0001541698546289212, + "loss": 0.3116, + "step": 15121 + }, + { + "epoch": 1.225048606610499, + "grad_norm": 0.044105127453804016, + "learning_rate": 0.00015416535397632657, + "loss": 0.3214, + "step": 15122 + }, + { + "epoch": 1.2251296176279973, + "grad_norm": 0.04013490676879883, + "learning_rate": 0.00015416085332373196, + "loss": 0.268, + "step": 15123 + }, + { + "epoch": 1.2252106286454958, + "grad_norm": 0.0509316511452198, + "learning_rate": 0.00015415635267113732, + "loss": 0.3329, + "step": 15124 + }, + { + "epoch": 1.2252916396629943, + "grad_norm": 0.04100751504302025, + "learning_rate": 
0.0001541518520185427, + "loss": 0.3061, + "step": 15125 + }, + { + "epoch": 1.2253726506804925, + "grad_norm": 0.03576541692018509, + "learning_rate": 0.00015414735136594807, + "loss": 0.2857, + "step": 15126 + }, + { + "epoch": 1.225453661697991, + "grad_norm": 0.03365354984998703, + "learning_rate": 0.00015414285071335343, + "loss": 0.2441, + "step": 15127 + }, + { + "epoch": 1.2255346727154892, + "grad_norm": 0.04429009184241295, + "learning_rate": 0.00015413835006075882, + "loss": 0.289, + "step": 15128 + }, + { + "epoch": 1.2256156837329877, + "grad_norm": 0.04027354344725609, + "learning_rate": 0.0001541338494081642, + "loss": 0.2798, + "step": 15129 + }, + { + "epoch": 1.2256966947504861, + "grad_norm": 0.046282071620225906, + "learning_rate": 0.00015412934875556956, + "loss": 0.3011, + "step": 15130 + }, + { + "epoch": 1.2257777057679844, + "grad_norm": 0.04794919118285179, + "learning_rate": 0.00015412484810297495, + "loss": 0.3395, + "step": 15131 + }, + { + "epoch": 1.2258587167854829, + "grad_norm": 0.04859710484743118, + "learning_rate": 0.0001541203474503803, + "loss": 0.3479, + "step": 15132 + }, + { + "epoch": 1.2259397278029813, + "grad_norm": 0.043037887662649155, + "learning_rate": 0.00015411584679778567, + "loss": 0.2922, + "step": 15133 + }, + { + "epoch": 1.2260207388204796, + "grad_norm": 0.039615124464035034, + "learning_rate": 0.00015411134614519106, + "loss": 0.3031, + "step": 15134 + }, + { + "epoch": 1.226101749837978, + "grad_norm": 0.04278057813644409, + "learning_rate": 0.00015410684549259645, + "loss": 0.3131, + "step": 15135 + }, + { + "epoch": 1.2261827608554763, + "grad_norm": 0.0393165685236454, + "learning_rate": 0.0001541023448400018, + "loss": 0.3058, + "step": 15136 + }, + { + "epoch": 1.2262637718729748, + "grad_norm": 0.044020235538482666, + "learning_rate": 0.0001540978441874072, + "loss": 0.3091, + "step": 15137 + }, + { + "epoch": 1.226344782890473, + "grad_norm": 0.042965419590473175, + "learning_rate": 0.00015409334353481255, + "loss": 0.3214, + "step": 15138 + }, + { + "epoch": 1.2264257939079715, + "grad_norm": 0.03816758468747139, + "learning_rate": 0.0001540888428822179, + "loss": 0.2711, + "step": 15139 + }, + { + "epoch": 1.22650680492547, + "grad_norm": 0.03608936816453934, + "learning_rate": 0.0001540843422296233, + "loss": 0.3091, + "step": 15140 + }, + { + "epoch": 1.2265878159429682, + "grad_norm": 0.0426860935986042, + "learning_rate": 0.0001540798415770287, + "loss": 0.3157, + "step": 15141 + }, + { + "epoch": 1.2266688269604666, + "grad_norm": 0.04165487736463547, + "learning_rate": 0.00015407534092443405, + "loss": 0.3022, + "step": 15142 + }, + { + "epoch": 1.226749837977965, + "grad_norm": 0.04956432059407234, + "learning_rate": 0.00015407084027183943, + "loss": 0.2851, + "step": 15143 + }, + { + "epoch": 1.2268308489954634, + "grad_norm": 0.03930883854627609, + "learning_rate": 0.0001540663396192448, + "loss": 0.304, + "step": 15144 + }, + { + "epoch": 1.2269118600129618, + "grad_norm": 0.0448654443025589, + "learning_rate": 0.00015406183896665015, + "loss": 0.3332, + "step": 15145 + }, + { + "epoch": 1.22699287103046, + "grad_norm": 0.03653280436992645, + "learning_rate": 0.00015405733831405557, + "loss": 0.3023, + "step": 15146 + }, + { + "epoch": 1.2270738820479585, + "grad_norm": 0.04919267073273659, + "learning_rate": 0.00015405283766146093, + "loss": 0.3538, + "step": 15147 + }, + { + "epoch": 1.227154893065457, + "grad_norm": 0.04806479811668396, + "learning_rate": 0.0001540483370088663, + "loss": 0.3123, + "step": 
15148 + }, + { + "epoch": 1.2272359040829552, + "grad_norm": 0.048344314098358154, + "learning_rate": 0.00015404383635627168, + "loss": 0.3102, + "step": 15149 + }, + { + "epoch": 1.2273169151004537, + "grad_norm": 0.04515837877988815, + "learning_rate": 0.00015403933570367704, + "loss": 0.312, + "step": 15150 + }, + { + "epoch": 1.227397926117952, + "grad_norm": 0.04120798036456108, + "learning_rate": 0.0001540348350510824, + "loss": 0.3222, + "step": 15151 + }, + { + "epoch": 1.2274789371354504, + "grad_norm": 0.0508146807551384, + "learning_rate": 0.0001540303343984878, + "loss": 0.3242, + "step": 15152 + }, + { + "epoch": 1.2275599481529489, + "grad_norm": 0.04866446927189827, + "learning_rate": 0.00015402583374589317, + "loss": 0.3112, + "step": 15153 + }, + { + "epoch": 1.2276409591704471, + "grad_norm": 0.04131367430090904, + "learning_rate": 0.00015402133309329853, + "loss": 0.3098, + "step": 15154 + }, + { + "epoch": 1.2277219701879456, + "grad_norm": 0.04007503390312195, + "learning_rate": 0.00015401683244070392, + "loss": 0.3208, + "step": 15155 + }, + { + "epoch": 1.2278029812054438, + "grad_norm": 0.04036682844161987, + "learning_rate": 0.00015401233178810928, + "loss": 0.2761, + "step": 15156 + }, + { + "epoch": 1.2278839922229423, + "grad_norm": 0.04943064972758293, + "learning_rate": 0.00015400783113551464, + "loss": 0.3039, + "step": 15157 + }, + { + "epoch": 1.2279650032404408, + "grad_norm": 0.03702479973435402, + "learning_rate": 0.00015400333048292005, + "loss": 0.3129, + "step": 15158 + }, + { + "epoch": 1.228046014257939, + "grad_norm": 0.04905194044113159, + "learning_rate": 0.0001539988298303254, + "loss": 0.2974, + "step": 15159 + }, + { + "epoch": 1.2281270252754375, + "grad_norm": 0.04597773775458336, + "learning_rate": 0.00015399432917773077, + "loss": 0.3339, + "step": 15160 + }, + { + "epoch": 1.2282080362929357, + "grad_norm": 0.03559258207678795, + "learning_rate": 0.00015398982852513616, + "loss": 0.2707, + "step": 15161 + }, + { + "epoch": 1.2282890473104342, + "grad_norm": 0.039929188787937164, + "learning_rate": 0.00015398532787254152, + "loss": 0.288, + "step": 15162 + }, + { + "epoch": 1.2283700583279327, + "grad_norm": 0.04652782529592514, + "learning_rate": 0.0001539808272199469, + "loss": 0.3275, + "step": 15163 + }, + { + "epoch": 1.228451069345431, + "grad_norm": 0.03702322393655777, + "learning_rate": 0.0001539763265673523, + "loss": 0.2802, + "step": 15164 + }, + { + "epoch": 1.2285320803629294, + "grad_norm": 0.04209384322166443, + "learning_rate": 0.00015397182591475765, + "loss": 0.3511, + "step": 15165 + }, + { + "epoch": 1.2286130913804278, + "grad_norm": 0.04149395599961281, + "learning_rate": 0.000153967325262163, + "loss": 0.3299, + "step": 15166 + }, + { + "epoch": 1.228694102397926, + "grad_norm": 0.046072304248809814, + "learning_rate": 0.0001539628246095684, + "loss": 0.3295, + "step": 15167 + }, + { + "epoch": 1.2287751134154246, + "grad_norm": 0.045435693114995956, + "learning_rate": 0.00015395832395697376, + "loss": 0.3289, + "step": 15168 + }, + { + "epoch": 1.2288561244329228, + "grad_norm": 0.039649687707424164, + "learning_rate": 0.00015395382330437915, + "loss": 0.2727, + "step": 15169 + }, + { + "epoch": 1.2289371354504213, + "grad_norm": 0.049699172377586365, + "learning_rate": 0.00015394932265178453, + "loss": 0.3213, + "step": 15170 + }, + { + "epoch": 1.2290181464679195, + "grad_norm": 0.04703816771507263, + "learning_rate": 0.0001539448219991899, + "loss": 0.3281, + "step": 15171 + }, + { + "epoch": 1.229099157485418, 
+ "grad_norm": 0.04082157090306282, + "learning_rate": 0.00015394032134659525, + "loss": 0.2701, + "step": 15172 + }, + { + "epoch": 1.2291801685029164, + "grad_norm": 0.03887148201465607, + "learning_rate": 0.00015393582069400064, + "loss": 0.308, + "step": 15173 + }, + { + "epoch": 1.2292611795204147, + "grad_norm": 0.041878700256347656, + "learning_rate": 0.000153931320041406, + "loss": 0.3172, + "step": 15174 + }, + { + "epoch": 1.2293421905379132, + "grad_norm": 0.03661678358912468, + "learning_rate": 0.0001539268193888114, + "loss": 0.2712, + "step": 15175 + }, + { + "epoch": 1.2294232015554116, + "grad_norm": 0.05468657985329628, + "learning_rate": 0.00015392231873621678, + "loss": 0.3144, + "step": 15176 + }, + { + "epoch": 1.2295042125729099, + "grad_norm": 0.03974331542849541, + "learning_rate": 0.00015391781808362214, + "loss": 0.3124, + "step": 15177 + }, + { + "epoch": 1.2295852235904083, + "grad_norm": 0.04192359372973442, + "learning_rate": 0.0001539133174310275, + "loss": 0.3183, + "step": 15178 + }, + { + "epoch": 1.2296662346079066, + "grad_norm": 0.04826575890183449, + "learning_rate": 0.00015390881677843288, + "loss": 0.3606, + "step": 15179 + }, + { + "epoch": 1.229747245625405, + "grad_norm": 0.04235263168811798, + "learning_rate": 0.00015390431612583824, + "loss": 0.3161, + "step": 15180 + }, + { + "epoch": 1.2298282566429035, + "grad_norm": 0.04713137820363045, + "learning_rate": 0.00015389981547324363, + "loss": 0.3756, + "step": 15181 + }, + { + "epoch": 1.2299092676604018, + "grad_norm": 0.037279751151800156, + "learning_rate": 0.00015389531482064902, + "loss": 0.294, + "step": 15182 + }, + { + "epoch": 1.2299902786779002, + "grad_norm": 0.04957104101777077, + "learning_rate": 0.00015389081416805438, + "loss": 0.2958, + "step": 15183 + }, + { + "epoch": 1.2300712896953985, + "grad_norm": 0.04053379222750664, + "learning_rate": 0.00015388631351545974, + "loss": 0.3146, + "step": 15184 + }, + { + "epoch": 1.230152300712897, + "grad_norm": 0.04531674087047577, + "learning_rate": 0.00015388181286286513, + "loss": 0.3101, + "step": 15185 + }, + { + "epoch": 1.2302333117303954, + "grad_norm": 0.03993414342403412, + "learning_rate": 0.00015387731221027049, + "loss": 0.2923, + "step": 15186 + }, + { + "epoch": 1.2303143227478937, + "grad_norm": 0.04393785446882248, + "learning_rate": 0.00015387281155767587, + "loss": 0.3202, + "step": 15187 + }, + { + "epoch": 1.2303953337653921, + "grad_norm": 0.03797182813286781, + "learning_rate": 0.00015386831090508126, + "loss": 0.2721, + "step": 15188 + }, + { + "epoch": 1.2304763447828906, + "grad_norm": 0.040326736867427826, + "learning_rate": 0.00015386381025248662, + "loss": 0.3051, + "step": 15189 + }, + { + "epoch": 1.2305573558003888, + "grad_norm": 0.033600758761167526, + "learning_rate": 0.00015385930959989198, + "loss": 0.2619, + "step": 15190 + }, + { + "epoch": 1.2306383668178873, + "grad_norm": 0.04964074492454529, + "learning_rate": 0.00015385480894729737, + "loss": 0.3344, + "step": 15191 + }, + { + "epoch": 1.2307193778353855, + "grad_norm": 0.0473080575466156, + "learning_rate": 0.00015385030829470275, + "loss": 0.3022, + "step": 15192 + }, + { + "epoch": 1.230800388852884, + "grad_norm": 0.0415453277528286, + "learning_rate": 0.00015384580764210811, + "loss": 0.3057, + "step": 15193 + }, + { + "epoch": 1.2308813998703823, + "grad_norm": 0.040875665843486786, + "learning_rate": 0.0001538413069895135, + "loss": 0.307, + "step": 15194 + }, + { + "epoch": 1.2309624108878807, + "grad_norm": 0.04135308414697647, + 
"learning_rate": 0.00015383680633691886, + "loss": 0.3161, + "step": 15195 + }, + { + "epoch": 1.2310434219053792, + "grad_norm": 0.04142339527606964, + "learning_rate": 0.00015383230568432422, + "loss": 0.2891, + "step": 15196 + }, + { + "epoch": 1.2311244329228774, + "grad_norm": 0.04539525508880615, + "learning_rate": 0.0001538278050317296, + "loss": 0.3082, + "step": 15197 + }, + { + "epoch": 1.231205443940376, + "grad_norm": 0.044109832495450974, + "learning_rate": 0.000153823304379135, + "loss": 0.3335, + "step": 15198 + }, + { + "epoch": 1.2312864549578744, + "grad_norm": 0.046666741371154785, + "learning_rate": 0.00015381880372654036, + "loss": 0.3194, + "step": 15199 + }, + { + "epoch": 1.2313674659753726, + "grad_norm": 0.0451471321284771, + "learning_rate": 0.00015381430307394574, + "loss": 0.3056, + "step": 15200 + }, + { + "epoch": 1.231448476992871, + "grad_norm": 0.042114224284887314, + "learning_rate": 0.0001538098024213511, + "loss": 0.3196, + "step": 15201 + }, + { + "epoch": 1.2315294880103693, + "grad_norm": 0.044611066579818726, + "learning_rate": 0.00015380530176875646, + "loss": 0.2855, + "step": 15202 + }, + { + "epoch": 1.2316104990278678, + "grad_norm": 0.04717979580163956, + "learning_rate": 0.00015380080111616185, + "loss": 0.3026, + "step": 15203 + }, + { + "epoch": 1.2316915100453663, + "grad_norm": 0.04977048188447952, + "learning_rate": 0.00015379630046356724, + "loss": 0.3032, + "step": 15204 + }, + { + "epoch": 1.2317725210628645, + "grad_norm": 0.0431169793009758, + "learning_rate": 0.0001537917998109726, + "loss": 0.2813, + "step": 15205 + }, + { + "epoch": 1.231853532080363, + "grad_norm": 0.04329026862978935, + "learning_rate": 0.00015378729915837798, + "loss": 0.3119, + "step": 15206 + }, + { + "epoch": 1.2319345430978612, + "grad_norm": 0.046638138592243195, + "learning_rate": 0.00015378279850578334, + "loss": 0.3054, + "step": 15207 + }, + { + "epoch": 1.2320155541153597, + "grad_norm": 0.03704584017395973, + "learning_rate": 0.0001537782978531887, + "loss": 0.2952, + "step": 15208 + }, + { + "epoch": 1.2320965651328581, + "grad_norm": 0.04100892320275307, + "learning_rate": 0.0001537737972005941, + "loss": 0.2741, + "step": 15209 + }, + { + "epoch": 1.2321775761503564, + "grad_norm": 0.04119610786437988, + "learning_rate": 0.00015376929654799948, + "loss": 0.2946, + "step": 15210 + }, + { + "epoch": 1.2322585871678549, + "grad_norm": 0.041077807545661926, + "learning_rate": 0.00015376479589540484, + "loss": 0.3135, + "step": 15211 + }, + { + "epoch": 1.2323395981853533, + "grad_norm": 0.043649882078170776, + "learning_rate": 0.00015376029524281023, + "loss": 0.3032, + "step": 15212 + }, + { + "epoch": 1.2324206092028516, + "grad_norm": 0.04387475550174713, + "learning_rate": 0.00015375579459021559, + "loss": 0.2948, + "step": 15213 + }, + { + "epoch": 1.23250162022035, + "grad_norm": 0.04019913822412491, + "learning_rate": 0.00015375129393762095, + "loss": 0.305, + "step": 15214 + }, + { + "epoch": 1.2325826312378483, + "grad_norm": 0.05033343657851219, + "learning_rate": 0.00015374679328502636, + "loss": 0.3652, + "step": 15215 + }, + { + "epoch": 1.2326636422553467, + "grad_norm": 0.0421714186668396, + "learning_rate": 0.00015374229263243172, + "loss": 0.3317, + "step": 15216 + }, + { + "epoch": 1.232744653272845, + "grad_norm": 0.043096888810396194, + "learning_rate": 0.00015373779197983708, + "loss": 0.3201, + "step": 15217 + }, + { + "epoch": 1.2328256642903435, + "grad_norm": 0.04364513233304024, + "learning_rate": 0.00015373329132724247, + 
"loss": 0.2738, + "step": 15218 + }, + { + "epoch": 1.232906675307842, + "grad_norm": 0.04265977814793587, + "learning_rate": 0.00015372879067464783, + "loss": 0.3075, + "step": 15219 + }, + { + "epoch": 1.2329876863253402, + "grad_norm": 0.041783396154642105, + "learning_rate": 0.0001537242900220532, + "loss": 0.3063, + "step": 15220 + }, + { + "epoch": 1.2330686973428386, + "grad_norm": 0.04352150484919548, + "learning_rate": 0.0001537197893694586, + "loss": 0.3298, + "step": 15221 + }, + { + "epoch": 1.233149708360337, + "grad_norm": 0.039487238973379135, + "learning_rate": 0.00015371528871686396, + "loss": 0.2893, + "step": 15222 + }, + { + "epoch": 1.2332307193778353, + "grad_norm": 0.0369989238679409, + "learning_rate": 0.00015371078806426932, + "loss": 0.2741, + "step": 15223 + }, + { + "epoch": 1.2333117303953338, + "grad_norm": 0.04650986194610596, + "learning_rate": 0.0001537062874116747, + "loss": 0.3306, + "step": 15224 + }, + { + "epoch": 1.233392741412832, + "grad_norm": 0.03859679773449898, + "learning_rate": 0.00015370178675908007, + "loss": 0.2607, + "step": 15225 + }, + { + "epoch": 1.2334737524303305, + "grad_norm": 0.0406336709856987, + "learning_rate": 0.00015369728610648543, + "loss": 0.3277, + "step": 15226 + }, + { + "epoch": 1.233554763447829, + "grad_norm": 0.046244632452726364, + "learning_rate": 0.00015369278545389084, + "loss": 0.3064, + "step": 15227 + }, + { + "epoch": 1.2336357744653272, + "grad_norm": 0.04249390587210655, + "learning_rate": 0.0001536882848012962, + "loss": 0.3184, + "step": 15228 + }, + { + "epoch": 1.2337167854828257, + "grad_norm": 0.04118916392326355, + "learning_rate": 0.00015368378414870156, + "loss": 0.2811, + "step": 15229 + }, + { + "epoch": 1.233797796500324, + "grad_norm": 0.05073089152574539, + "learning_rate": 0.00015367928349610695, + "loss": 0.3123, + "step": 15230 + }, + { + "epoch": 1.2338788075178224, + "grad_norm": 0.039565760642290115, + "learning_rate": 0.0001536747828435123, + "loss": 0.317, + "step": 15231 + }, + { + "epoch": 1.2339598185353209, + "grad_norm": 0.043358445167541504, + "learning_rate": 0.00015367028219091767, + "loss": 0.3027, + "step": 15232 + }, + { + "epoch": 1.2340408295528191, + "grad_norm": 0.041212573647499084, + "learning_rate": 0.00015366578153832309, + "loss": 0.3226, + "step": 15233 + }, + { + "epoch": 1.2341218405703176, + "grad_norm": 0.04205805063247681, + "learning_rate": 0.00015366128088572845, + "loss": 0.3027, + "step": 15234 + }, + { + "epoch": 1.234202851587816, + "grad_norm": 0.04577530920505524, + "learning_rate": 0.0001536567802331338, + "loss": 0.3123, + "step": 15235 + }, + { + "epoch": 1.2342838626053143, + "grad_norm": 0.04439268633723259, + "learning_rate": 0.0001536522795805392, + "loss": 0.3067, + "step": 15236 + }, + { + "epoch": 1.2343648736228128, + "grad_norm": 0.04194721207022667, + "learning_rate": 0.00015364777892794455, + "loss": 0.3329, + "step": 15237 + }, + { + "epoch": 1.234445884640311, + "grad_norm": 0.040099386125802994, + "learning_rate": 0.0001536432782753499, + "loss": 0.3056, + "step": 15238 + }, + { + "epoch": 1.2345268956578095, + "grad_norm": 0.04326711222529411, + "learning_rate": 0.00015363877762275533, + "loss": 0.3409, + "step": 15239 + }, + { + "epoch": 1.2346079066753077, + "grad_norm": 0.03979482874274254, + "learning_rate": 0.0001536342769701607, + "loss": 0.3006, + "step": 15240 + }, + { + "epoch": 1.2346889176928062, + "grad_norm": 0.05187808722257614, + "learning_rate": 0.00015362977631756605, + "loss": 0.3089, + "step": 15241 + }, + { + 
"epoch": 1.2347699287103047, + "grad_norm": 0.05291687324643135, + "learning_rate": 0.00015362527566497143, + "loss": 0.3692, + "step": 15242 + }, + { + "epoch": 1.234850939727803, + "grad_norm": 0.03968328982591629, + "learning_rate": 0.0001536207750123768, + "loss": 0.2979, + "step": 15243 + }, + { + "epoch": 1.2349319507453014, + "grad_norm": 0.03818349540233612, + "learning_rate": 0.00015361627435978218, + "loss": 0.3323, + "step": 15244 + }, + { + "epoch": 1.2350129617627998, + "grad_norm": 0.04607069119811058, + "learning_rate": 0.00015361177370718757, + "loss": 0.3302, + "step": 15245 + }, + { + "epoch": 1.235093972780298, + "grad_norm": 0.04158300533890724, + "learning_rate": 0.00015360727305459293, + "loss": 0.2774, + "step": 15246 + }, + { + "epoch": 1.2351749837977966, + "grad_norm": 0.03845590353012085, + "learning_rate": 0.0001536027724019983, + "loss": 0.3039, + "step": 15247 + }, + { + "epoch": 1.2352559948152948, + "grad_norm": 0.05031782016158104, + "learning_rate": 0.00015359827174940368, + "loss": 0.2619, + "step": 15248 + }, + { + "epoch": 1.2353370058327933, + "grad_norm": 0.036526355892419815, + "learning_rate": 0.00015359377109680904, + "loss": 0.2565, + "step": 15249 + }, + { + "epoch": 1.2354180168502917, + "grad_norm": 0.04311860725283623, + "learning_rate": 0.00015358927044421442, + "loss": 0.2894, + "step": 15250 + }, + { + "epoch": 1.23549902786779, + "grad_norm": 0.04755273088812828, + "learning_rate": 0.0001535847697916198, + "loss": 0.3003, + "step": 15251 + }, + { + "epoch": 1.2355800388852884, + "grad_norm": 0.044981442391872406, + "learning_rate": 0.00015358026913902517, + "loss": 0.3073, + "step": 15252 + }, + { + "epoch": 1.2356610499027867, + "grad_norm": 0.051071830093860626, + "learning_rate": 0.00015357576848643053, + "loss": 0.3178, + "step": 15253 + }, + { + "epoch": 1.2357420609202852, + "grad_norm": 0.040591999888420105, + "learning_rate": 0.00015357126783383592, + "loss": 0.2578, + "step": 15254 + }, + { + "epoch": 1.2358230719377836, + "grad_norm": 0.03376752510666847, + "learning_rate": 0.00015356676718124128, + "loss": 0.2572, + "step": 15255 + }, + { + "epoch": 1.2359040829552819, + "grad_norm": 0.04118896648287773, + "learning_rate": 0.00015356226652864666, + "loss": 0.2724, + "step": 15256 + }, + { + "epoch": 1.2359850939727803, + "grad_norm": 0.04479838162660599, + "learning_rate": 0.00015355776587605205, + "loss": 0.2559, + "step": 15257 + }, + { + "epoch": 1.2360661049902788, + "grad_norm": 0.047844868153333664, + "learning_rate": 0.0001535532652234574, + "loss": 0.3781, + "step": 15258 + }, + { + "epoch": 1.236147116007777, + "grad_norm": 0.043366435915231705, + "learning_rate": 0.00015354876457086277, + "loss": 0.294, + "step": 15259 + }, + { + "epoch": 1.2362281270252755, + "grad_norm": 0.0474378801882267, + "learning_rate": 0.00015354426391826816, + "loss": 0.2816, + "step": 15260 + }, + { + "epoch": 1.2363091380427738, + "grad_norm": 0.05177191272377968, + "learning_rate": 0.00015353976326567352, + "loss": 0.2862, + "step": 15261 + }, + { + "epoch": 1.2363901490602722, + "grad_norm": 0.04230916500091553, + "learning_rate": 0.0001535352626130789, + "loss": 0.3047, + "step": 15262 + }, + { + "epoch": 1.2364711600777705, + "grad_norm": 0.0430777408182621, + "learning_rate": 0.0001535307619604843, + "loss": 0.2985, + "step": 15263 + }, + { + "epoch": 1.236552171095269, + "grad_norm": 0.04171663895249367, + "learning_rate": 0.00015352626130788965, + "loss": 0.3184, + "step": 15264 + }, + { + "epoch": 1.2366331821127674, + "grad_norm": 
0.040578003972768784, + "learning_rate": 0.000153521760655295, + "loss": 0.2826, + "step": 15265 + }, + { + "epoch": 1.2367141931302656, + "grad_norm": 0.0430280901491642, + "learning_rate": 0.0001535172600027004, + "loss": 0.3106, + "step": 15266 + }, + { + "epoch": 1.2367952041477641, + "grad_norm": 0.04540487751364708, + "learning_rate": 0.0001535127593501058, + "loss": 0.3408, + "step": 15267 + }, + { + "epoch": 1.2368762151652626, + "grad_norm": 0.03872782737016678, + "learning_rate": 0.00015350825869751115, + "loss": 0.2879, + "step": 15268 + }, + { + "epoch": 1.2369572261827608, + "grad_norm": 0.041879042983055115, + "learning_rate": 0.00015350375804491654, + "loss": 0.3497, + "step": 15269 + }, + { + "epoch": 1.2370382372002593, + "grad_norm": 0.04300076141953468, + "learning_rate": 0.0001534992573923219, + "loss": 0.3395, + "step": 15270 + }, + { + "epoch": 1.2371192482177575, + "grad_norm": 0.03565739467740059, + "learning_rate": 0.00015349475673972726, + "loss": 0.2584, + "step": 15271 + }, + { + "epoch": 1.237200259235256, + "grad_norm": 0.048258304595947266, + "learning_rate": 0.00015349025608713264, + "loss": 0.3012, + "step": 15272 + }, + { + "epoch": 1.2372812702527543, + "grad_norm": 0.03877298906445503, + "learning_rate": 0.00015348575543453803, + "loss": 0.2592, + "step": 15273 + }, + { + "epoch": 1.2373622812702527, + "grad_norm": 0.04685317352414131, + "learning_rate": 0.0001534812547819434, + "loss": 0.3059, + "step": 15274 + }, + { + "epoch": 1.2374432922877512, + "grad_norm": 0.038681793957948685, + "learning_rate": 0.00015347675412934878, + "loss": 0.2573, + "step": 15275 + }, + { + "epoch": 1.2375243033052494, + "grad_norm": 0.04133173078298569, + "learning_rate": 0.00015347225347675414, + "loss": 0.2834, + "step": 15276 + }, + { + "epoch": 1.237605314322748, + "grad_norm": 0.04690474644303322, + "learning_rate": 0.0001534677528241595, + "loss": 0.3354, + "step": 15277 + }, + { + "epoch": 1.2376863253402464, + "grad_norm": 0.04486876353621483, + "learning_rate": 0.00015346325217156488, + "loss": 0.3612, + "step": 15278 + }, + { + "epoch": 1.2377673363577446, + "grad_norm": 0.04361611232161522, + "learning_rate": 0.00015345875151897027, + "loss": 0.3043, + "step": 15279 + }, + { + "epoch": 1.237848347375243, + "grad_norm": 0.041666265577077866, + "learning_rate": 0.00015345425086637563, + "loss": 0.2907, + "step": 15280 + }, + { + "epoch": 1.2379293583927413, + "grad_norm": 0.04613722488284111, + "learning_rate": 0.00015344975021378102, + "loss": 0.3166, + "step": 15281 + }, + { + "epoch": 1.2380103694102398, + "grad_norm": 0.04196156933903694, + "learning_rate": 0.00015344524956118638, + "loss": 0.2955, + "step": 15282 + }, + { + "epoch": 1.2380913804277383, + "grad_norm": 0.045780882239341736, + "learning_rate": 0.00015344074890859174, + "loss": 0.3262, + "step": 15283 + }, + { + "epoch": 1.2381723914452365, + "grad_norm": 0.049061235040426254, + "learning_rate": 0.00015343624825599713, + "loss": 0.3049, + "step": 15284 + }, + { + "epoch": 1.238253402462735, + "grad_norm": 0.044471532106399536, + "learning_rate": 0.0001534317476034025, + "loss": 0.2768, + "step": 15285 + }, + { + "epoch": 1.2383344134802332, + "grad_norm": 0.04229315370321274, + "learning_rate": 0.00015342724695080787, + "loss": 0.299, + "step": 15286 + }, + { + "epoch": 1.2384154244977317, + "grad_norm": 0.03775394335389137, + "learning_rate": 0.00015342274629821326, + "loss": 0.3116, + "step": 15287 + }, + { + "epoch": 1.2384964355152301, + "grad_norm": 0.04446544870734215, + "learning_rate": 
0.00015341824564561862, + "loss": 0.2938, + "step": 15288 + }, + { + "epoch": 1.2385774465327284, + "grad_norm": 0.0390334390103817, + "learning_rate": 0.00015341374499302398, + "loss": 0.2985, + "step": 15289 + }, + { + "epoch": 1.2386584575502269, + "grad_norm": 0.046083953231573105, + "learning_rate": 0.00015340924434042937, + "loss": 0.3355, + "step": 15290 + }, + { + "epoch": 1.2387394685677253, + "grad_norm": 0.04037284478545189, + "learning_rate": 0.00015340474368783475, + "loss": 0.3072, + "step": 15291 + }, + { + "epoch": 1.2388204795852236, + "grad_norm": 0.04536395147442818, + "learning_rate": 0.00015340024303524011, + "loss": 0.319, + "step": 15292 + }, + { + "epoch": 1.238901490602722, + "grad_norm": 0.043972741812467575, + "learning_rate": 0.0001533957423826455, + "loss": 0.3173, + "step": 15293 + }, + { + "epoch": 1.2389825016202203, + "grad_norm": 0.04083540290594101, + "learning_rate": 0.00015339124173005086, + "loss": 0.3294, + "step": 15294 + }, + { + "epoch": 1.2390635126377187, + "grad_norm": 0.04482501000165939, + "learning_rate": 0.00015338674107745622, + "loss": 0.3093, + "step": 15295 + }, + { + "epoch": 1.239144523655217, + "grad_norm": 0.0340910442173481, + "learning_rate": 0.00015338224042486164, + "loss": 0.2693, + "step": 15296 + }, + { + "epoch": 1.2392255346727155, + "grad_norm": 0.05177828669548035, + "learning_rate": 0.000153377739772267, + "loss": 0.3094, + "step": 15297 + }, + { + "epoch": 1.239306545690214, + "grad_norm": 0.05645804852247238, + "learning_rate": 0.00015337323911967236, + "loss": 0.3089, + "step": 15298 + }, + { + "epoch": 1.2393875567077122, + "grad_norm": 0.04944702982902527, + "learning_rate": 0.00015336873846707774, + "loss": 0.3414, + "step": 15299 + }, + { + "epoch": 1.2394685677252106, + "grad_norm": 0.04450514167547226, + "learning_rate": 0.0001533642378144831, + "loss": 0.3182, + "step": 15300 + }, + { + "epoch": 1.239549578742709, + "grad_norm": 0.04502875730395317, + "learning_rate": 0.00015335973716188846, + "loss": 0.3041, + "step": 15301 + }, + { + "epoch": 1.2396305897602073, + "grad_norm": 0.042548295110464096, + "learning_rate": 0.00015335523650929388, + "loss": 0.2947, + "step": 15302 + }, + { + "epoch": 1.2397116007777058, + "grad_norm": 0.0360729917883873, + "learning_rate": 0.00015335073585669924, + "loss": 0.2631, + "step": 15303 + }, + { + "epoch": 1.239792611795204, + "grad_norm": 0.04980987682938576, + "learning_rate": 0.0001533462352041046, + "loss": 0.3343, + "step": 15304 + }, + { + "epoch": 1.2398736228127025, + "grad_norm": 0.041066061705350876, + "learning_rate": 0.00015334173455150998, + "loss": 0.3111, + "step": 15305 + }, + { + "epoch": 1.239954633830201, + "grad_norm": 0.03820958361029625, + "learning_rate": 0.00015333723389891534, + "loss": 0.2949, + "step": 15306 + }, + { + "epoch": 1.2400356448476992, + "grad_norm": 0.037509698420763016, + "learning_rate": 0.0001533327332463207, + "loss": 0.3029, + "step": 15307 + }, + { + "epoch": 1.2401166558651977, + "grad_norm": 0.04517628997564316, + "learning_rate": 0.00015332823259372612, + "loss": 0.313, + "step": 15308 + }, + { + "epoch": 1.240197666882696, + "grad_norm": 0.053675659000873566, + "learning_rate": 0.00015332373194113148, + "loss": 0.2906, + "step": 15309 + }, + { + "epoch": 1.2402786779001944, + "grad_norm": 0.041454900056123734, + "learning_rate": 0.00015331923128853684, + "loss": 0.3135, + "step": 15310 + }, + { + "epoch": 1.2403596889176929, + "grad_norm": 0.03922869265079498, + "learning_rate": 0.00015331473063594223, + "loss": 0.2857, + 
"step": 15311 + }, + { + "epoch": 1.2404406999351911, + "grad_norm": 0.04632218927145004, + "learning_rate": 0.00015331022998334759, + "loss": 0.296, + "step": 15312 + }, + { + "epoch": 1.2405217109526896, + "grad_norm": 0.036529503762722015, + "learning_rate": 0.00015330572933075295, + "loss": 0.2847, + "step": 15313 + }, + { + "epoch": 1.240602721970188, + "grad_norm": 0.04689698666334152, + "learning_rate": 0.00015330122867815836, + "loss": 0.3178, + "step": 15314 + }, + { + "epoch": 1.2406837329876863, + "grad_norm": 0.04123736917972565, + "learning_rate": 0.00015329672802556372, + "loss": 0.2981, + "step": 15315 + }, + { + "epoch": 1.2407647440051848, + "grad_norm": 0.041495393961668015, + "learning_rate": 0.00015329222737296908, + "loss": 0.3112, + "step": 15316 + }, + { + "epoch": 1.240845755022683, + "grad_norm": 0.04156705364584923, + "learning_rate": 0.00015328772672037447, + "loss": 0.3111, + "step": 15317 + }, + { + "epoch": 1.2409267660401815, + "grad_norm": 0.04301241785287857, + "learning_rate": 0.00015328322606777983, + "loss": 0.3103, + "step": 15318 + }, + { + "epoch": 1.2410077770576797, + "grad_norm": 0.041196610778570175, + "learning_rate": 0.0001532787254151852, + "loss": 0.3036, + "step": 15319 + }, + { + "epoch": 1.2410887880751782, + "grad_norm": 0.04731073975563049, + "learning_rate": 0.0001532742247625906, + "loss": 0.3079, + "step": 15320 + }, + { + "epoch": 1.2411697990926767, + "grad_norm": 0.03111322596669197, + "learning_rate": 0.00015326972410999596, + "loss": 0.235, + "step": 15321 + }, + { + "epoch": 1.241250810110175, + "grad_norm": 0.04209109768271446, + "learning_rate": 0.00015326522345740132, + "loss": 0.3478, + "step": 15322 + }, + { + "epoch": 1.2413318211276734, + "grad_norm": 0.041773680597543716, + "learning_rate": 0.0001532607228048067, + "loss": 0.2853, + "step": 15323 + }, + { + "epoch": 1.2414128321451718, + "grad_norm": 0.048851095139980316, + "learning_rate": 0.00015325622215221207, + "loss": 0.3461, + "step": 15324 + }, + { + "epoch": 1.24149384316267, + "grad_norm": 0.037057485431432724, + "learning_rate": 0.00015325172149961746, + "loss": 0.2731, + "step": 15325 + }, + { + "epoch": 1.2415748541801686, + "grad_norm": 0.046550452709198, + "learning_rate": 0.00015324722084702284, + "loss": 0.3483, + "step": 15326 + }, + { + "epoch": 1.2416558651976668, + "grad_norm": 0.05344468355178833, + "learning_rate": 0.0001532427201944282, + "loss": 0.319, + "step": 15327 + }, + { + "epoch": 1.2417368762151653, + "grad_norm": 0.05149804428219795, + "learning_rate": 0.00015323821954183356, + "loss": 0.3447, + "step": 15328 + }, + { + "epoch": 1.2418178872326637, + "grad_norm": 0.0394924022257328, + "learning_rate": 0.00015323371888923895, + "loss": 0.2996, + "step": 15329 + }, + { + "epoch": 1.241898898250162, + "grad_norm": 0.039302125573158264, + "learning_rate": 0.0001532292182366443, + "loss": 0.2984, + "step": 15330 + }, + { + "epoch": 1.2419799092676604, + "grad_norm": 0.03921142965555191, + "learning_rate": 0.0001532247175840497, + "loss": 0.2767, + "step": 15331 + }, + { + "epoch": 1.2420609202851587, + "grad_norm": 0.04495244100689888, + "learning_rate": 0.00015322021693145509, + "loss": 0.3193, + "step": 15332 + }, + { + "epoch": 1.2421419313026572, + "grad_norm": 0.04314647987484932, + "learning_rate": 0.00015321571627886045, + "loss": 0.2974, + "step": 15333 + }, + { + "epoch": 1.2422229423201556, + "grad_norm": 0.04043354466557503, + "learning_rate": 0.0001532112156262658, + "loss": 0.2751, + "step": 15334 + }, + { + "epoch": 
1.2423039533376539, + "grad_norm": 0.03722742944955826, + "learning_rate": 0.0001532067149736712, + "loss": 0.2962, + "step": 15335 + }, + { + "epoch": 1.2423849643551523, + "grad_norm": 0.04360896348953247, + "learning_rate": 0.00015320221432107655, + "loss": 0.3269, + "step": 15336 + }, + { + "epoch": 1.2424659753726508, + "grad_norm": 0.055030226707458496, + "learning_rate": 0.00015319771366848194, + "loss": 0.3611, + "step": 15337 + }, + { + "epoch": 1.242546986390149, + "grad_norm": 0.04381483793258667, + "learning_rate": 0.00015319321301588733, + "loss": 0.2907, + "step": 15338 + }, + { + "epoch": 1.2426279974076475, + "grad_norm": 0.036216773092746735, + "learning_rate": 0.0001531887123632927, + "loss": 0.321, + "step": 15339 + }, + { + "epoch": 1.2427090084251458, + "grad_norm": 0.03686912730336189, + "learning_rate": 0.00015318421171069805, + "loss": 0.2704, + "step": 15340 + }, + { + "epoch": 1.2427900194426442, + "grad_norm": 0.04307206720113754, + "learning_rate": 0.00015317971105810343, + "loss": 0.3245, + "step": 15341 + }, + { + "epoch": 1.2428710304601425, + "grad_norm": 0.04554265737533569, + "learning_rate": 0.0001531752104055088, + "loss": 0.3081, + "step": 15342 + }, + { + "epoch": 1.242952041477641, + "grad_norm": 0.0367271825671196, + "learning_rate": 0.00015317070975291418, + "loss": 0.3246, + "step": 15343 + }, + { + "epoch": 1.2430330524951394, + "grad_norm": 0.037328340113162994, + "learning_rate": 0.00015316620910031957, + "loss": 0.2989, + "step": 15344 + }, + { + "epoch": 1.2431140635126376, + "grad_norm": 0.037408363074064255, + "learning_rate": 0.00015316170844772493, + "loss": 0.3109, + "step": 15345 + }, + { + "epoch": 1.2431950745301361, + "grad_norm": 0.04492205008864403, + "learning_rate": 0.0001531572077951303, + "loss": 0.2851, + "step": 15346 + }, + { + "epoch": 1.2432760855476346, + "grad_norm": 0.052792858332395554, + "learning_rate": 0.00015315270714253568, + "loss": 0.3778, + "step": 15347 + }, + { + "epoch": 1.2433570965651328, + "grad_norm": 0.03828036040067673, + "learning_rate": 0.00015314820648994106, + "loss": 0.2956, + "step": 15348 + }, + { + "epoch": 1.2434381075826313, + "grad_norm": 0.046718232333660126, + "learning_rate": 0.00015314370583734642, + "loss": 0.3328, + "step": 15349 + }, + { + "epoch": 1.2435191186001295, + "grad_norm": 0.04345204308629036, + "learning_rate": 0.0001531392051847518, + "loss": 0.3214, + "step": 15350 + }, + { + "epoch": 1.243600129617628, + "grad_norm": 0.03833979368209839, + "learning_rate": 0.00015313470453215717, + "loss": 0.3186, + "step": 15351 + }, + { + "epoch": 1.2436811406351265, + "grad_norm": 0.04523218795657158, + "learning_rate": 0.00015313020387956253, + "loss": 0.3308, + "step": 15352 + }, + { + "epoch": 1.2437621516526247, + "grad_norm": 0.043142274022102356, + "learning_rate": 0.00015312570322696792, + "loss": 0.3069, + "step": 15353 + }, + { + "epoch": 1.2438431626701232, + "grad_norm": 0.043952710926532745, + "learning_rate": 0.0001531212025743733, + "loss": 0.2971, + "step": 15354 + }, + { + "epoch": 1.2439241736876214, + "grad_norm": 0.04158525541424751, + "learning_rate": 0.00015311670192177866, + "loss": 0.3382, + "step": 15355 + }, + { + "epoch": 1.24400518470512, + "grad_norm": 0.040137652307748795, + "learning_rate": 0.00015311220126918405, + "loss": 0.317, + "step": 15356 + }, + { + "epoch": 1.2440861957226184, + "grad_norm": 0.04606618359684944, + "learning_rate": 0.0001531077006165894, + "loss": 0.3215, + "step": 15357 + }, + { + "epoch": 1.2441672067401166, + "grad_norm": 
0.041263725608587265, + "learning_rate": 0.00015310319996399477, + "loss": 0.311, + "step": 15358 + }, + { + "epoch": 1.244248217757615, + "grad_norm": 0.044039517641067505, + "learning_rate": 0.00015309869931140016, + "loss": 0.3153, + "step": 15359 + }, + { + "epoch": 1.2443292287751135, + "grad_norm": 0.04090086743235588, + "learning_rate": 0.00015309419865880555, + "loss": 0.3142, + "step": 15360 + }, + { + "epoch": 1.2444102397926118, + "grad_norm": 0.04677727445960045, + "learning_rate": 0.0001530896980062109, + "loss": 0.2843, + "step": 15361 + }, + { + "epoch": 1.2444912508101102, + "grad_norm": 0.04350362718105316, + "learning_rate": 0.0001530851973536163, + "loss": 0.3047, + "step": 15362 + }, + { + "epoch": 1.2445722618276085, + "grad_norm": 0.0443713404238224, + "learning_rate": 0.00015308069670102165, + "loss": 0.3237, + "step": 15363 + }, + { + "epoch": 1.244653272845107, + "grad_norm": 0.04677911847829819, + "learning_rate": 0.00015307619604842701, + "loss": 0.3249, + "step": 15364 + }, + { + "epoch": 1.2447342838626052, + "grad_norm": 0.04166271165013313, + "learning_rate": 0.0001530716953958324, + "loss": 0.3289, + "step": 15365 + }, + { + "epoch": 1.2448152948801037, + "grad_norm": 0.03945012018084526, + "learning_rate": 0.0001530671947432378, + "loss": 0.2753, + "step": 15366 + }, + { + "epoch": 1.2448963058976021, + "grad_norm": 0.04785705730319023, + "learning_rate": 0.00015306269409064315, + "loss": 0.3095, + "step": 15367 + }, + { + "epoch": 1.2449773169151004, + "grad_norm": 0.04102540388703346, + "learning_rate": 0.00015305819343804854, + "loss": 0.3083, + "step": 15368 + }, + { + "epoch": 1.2450583279325989, + "grad_norm": 0.04111190885305405, + "learning_rate": 0.0001530536927854539, + "loss": 0.3204, + "step": 15369 + }, + { + "epoch": 1.2451393389500973, + "grad_norm": 0.054699547588825226, + "learning_rate": 0.00015304919213285926, + "loss": 0.326, + "step": 15370 + }, + { + "epoch": 1.2452203499675956, + "grad_norm": 0.03741315379738808, + "learning_rate": 0.00015304469148026464, + "loss": 0.3103, + "step": 15371 + }, + { + "epoch": 1.245301360985094, + "grad_norm": 0.03876160457730293, + "learning_rate": 0.00015304019082767003, + "loss": 0.3016, + "step": 15372 + }, + { + "epoch": 1.2453823720025923, + "grad_norm": 0.03826634958386421, + "learning_rate": 0.0001530356901750754, + "loss": 0.3104, + "step": 15373 + }, + { + "epoch": 1.2454633830200907, + "grad_norm": 0.03665310889482498, + "learning_rate": 0.00015303118952248078, + "loss": 0.2841, + "step": 15374 + }, + { + "epoch": 1.2455443940375892, + "grad_norm": 0.03642250970005989, + "learning_rate": 0.00015302668886988614, + "loss": 0.3106, + "step": 15375 + }, + { + "epoch": 1.2456254050550875, + "grad_norm": 0.042411211878061295, + "learning_rate": 0.0001530221882172915, + "loss": 0.2864, + "step": 15376 + }, + { + "epoch": 1.245706416072586, + "grad_norm": 0.03778219595551491, + "learning_rate": 0.0001530176875646969, + "loss": 0.2864, + "step": 15377 + }, + { + "epoch": 1.2457874270900842, + "grad_norm": 0.050751928240060806, + "learning_rate": 0.00015301318691210227, + "loss": 0.3848, + "step": 15378 + }, + { + "epoch": 1.2458684381075826, + "grad_norm": 0.042749639600515366, + "learning_rate": 0.00015300868625950763, + "loss": 0.3099, + "step": 15379 + }, + { + "epoch": 1.245949449125081, + "grad_norm": 0.039685070514678955, + "learning_rate": 0.00015300418560691302, + "loss": 0.3135, + "step": 15380 + }, + { + "epoch": 1.2460304601425793, + "grad_norm": 0.04631935432553291, + "learning_rate": 
0.00015299968495431838, + "loss": 0.3155, + "step": 15381 + }, + { + "epoch": 1.2461114711600778, + "grad_norm": 0.048279713839292526, + "learning_rate": 0.00015299518430172374, + "loss": 0.3345, + "step": 15382 + }, + { + "epoch": 1.246192482177576, + "grad_norm": 0.04498374089598656, + "learning_rate": 0.00015299068364912915, + "loss": 0.3359, + "step": 15383 + }, + { + "epoch": 1.2462734931950745, + "grad_norm": 0.04783787950873375, + "learning_rate": 0.0001529861829965345, + "loss": 0.3339, + "step": 15384 + }, + { + "epoch": 1.246354504212573, + "grad_norm": 0.042057670652866364, + "learning_rate": 0.00015298168234393987, + "loss": 0.2938, + "step": 15385 + }, + { + "epoch": 1.2464355152300712, + "grad_norm": 0.03839834779500961, + "learning_rate": 0.00015297718169134526, + "loss": 0.3028, + "step": 15386 + }, + { + "epoch": 1.2465165262475697, + "grad_norm": 0.04023413360118866, + "learning_rate": 0.00015297268103875062, + "loss": 0.2902, + "step": 15387 + }, + { + "epoch": 1.246597537265068, + "grad_norm": 0.040760256350040436, + "learning_rate": 0.00015296818038615598, + "loss": 0.3011, + "step": 15388 + }, + { + "epoch": 1.2466785482825664, + "grad_norm": 0.050573479384183884, + "learning_rate": 0.0001529636797335614, + "loss": 0.2941, + "step": 15389 + }, + { + "epoch": 1.2467595593000649, + "grad_norm": 0.04576423019170761, + "learning_rate": 0.00015295917908096675, + "loss": 0.3508, + "step": 15390 + }, + { + "epoch": 1.2468405703175631, + "grad_norm": 0.04411442205309868, + "learning_rate": 0.00015295467842837211, + "loss": 0.2608, + "step": 15391 + }, + { + "epoch": 1.2469215813350616, + "grad_norm": 0.04792153090238571, + "learning_rate": 0.0001529501777757775, + "loss": 0.3503, + "step": 15392 + }, + { + "epoch": 1.24700259235256, + "grad_norm": 0.04913616180419922, + "learning_rate": 0.00015294567712318286, + "loss": 0.2908, + "step": 15393 + }, + { + "epoch": 1.2470836033700583, + "grad_norm": 0.038902029395103455, + "learning_rate": 0.00015294117647058822, + "loss": 0.2665, + "step": 15394 + }, + { + "epoch": 1.2471646143875568, + "grad_norm": 0.03957482799887657, + "learning_rate": 0.00015293667581799364, + "loss": 0.2804, + "step": 15395 + }, + { + "epoch": 1.247245625405055, + "grad_norm": 0.062009721994400024, + "learning_rate": 0.000152932175165399, + "loss": 0.3534, + "step": 15396 + }, + { + "epoch": 1.2473266364225535, + "grad_norm": 0.0566871352493763, + "learning_rate": 0.00015292767451280436, + "loss": 0.3503, + "step": 15397 + }, + { + "epoch": 1.2474076474400517, + "grad_norm": 0.040789470076560974, + "learning_rate": 0.00015292317386020974, + "loss": 0.2838, + "step": 15398 + }, + { + "epoch": 1.2474886584575502, + "grad_norm": 0.038834284991025925, + "learning_rate": 0.0001529186732076151, + "loss": 0.265, + "step": 15399 + }, + { + "epoch": 1.2475696694750487, + "grad_norm": 0.044831741601228714, + "learning_rate": 0.0001529141725550205, + "loss": 0.2864, + "step": 15400 + }, + { + "epoch": 1.247650680492547, + "grad_norm": 0.05505714565515518, + "learning_rate": 0.00015290967190242588, + "loss": 0.3743, + "step": 15401 + }, + { + "epoch": 1.2477316915100454, + "grad_norm": 0.04826575517654419, + "learning_rate": 0.00015290517124983124, + "loss": 0.3201, + "step": 15402 + }, + { + "epoch": 1.2478127025275438, + "grad_norm": 0.03487708047032356, + "learning_rate": 0.0001529006705972366, + "loss": 0.262, + "step": 15403 + }, + { + "epoch": 1.247893713545042, + "grad_norm": 0.053101979196071625, + "learning_rate": 0.00015289616994464199, + "loss": 0.3263, + 
"step": 15404 + }, + { + "epoch": 1.2479747245625405, + "grad_norm": 0.04719730094075203, + "learning_rate": 0.00015289166929204735, + "loss": 0.3098, + "step": 15405 + }, + { + "epoch": 1.2480557355800388, + "grad_norm": 0.049038488417863846, + "learning_rate": 0.00015288716863945273, + "loss": 0.3135, + "step": 15406 + }, + { + "epoch": 1.2481367465975373, + "grad_norm": 0.04459191858768463, + "learning_rate": 0.00015288266798685812, + "loss": 0.3194, + "step": 15407 + }, + { + "epoch": 1.2482177576150357, + "grad_norm": 0.04248789697885513, + "learning_rate": 0.00015287816733426348, + "loss": 0.3279, + "step": 15408 + }, + { + "epoch": 1.248298768632534, + "grad_norm": 0.03661114349961281, + "learning_rate": 0.00015287366668166884, + "loss": 0.2861, + "step": 15409 + }, + { + "epoch": 1.2483797796500324, + "grad_norm": 0.03910385072231293, + "learning_rate": 0.00015286916602907423, + "loss": 0.3059, + "step": 15410 + }, + { + "epoch": 1.2484607906675307, + "grad_norm": 0.043062131851911545, + "learning_rate": 0.0001528646653764796, + "loss": 0.2868, + "step": 15411 + }, + { + "epoch": 1.2485418016850292, + "grad_norm": 0.04848279803991318, + "learning_rate": 0.00015286016472388497, + "loss": 0.3387, + "step": 15412 + }, + { + "epoch": 1.2486228127025276, + "grad_norm": 0.045629847794771194, + "learning_rate": 0.00015285566407129036, + "loss": 0.3147, + "step": 15413 + }, + { + "epoch": 1.2487038237200259, + "grad_norm": 0.044876888394355774, + "learning_rate": 0.00015285116341869572, + "loss": 0.309, + "step": 15414 + }, + { + "epoch": 1.2487848347375243, + "grad_norm": 0.04068077355623245, + "learning_rate": 0.00015284666276610108, + "loss": 0.2813, + "step": 15415 + }, + { + "epoch": 1.2488658457550228, + "grad_norm": 0.043438006192445755, + "learning_rate": 0.00015284216211350647, + "loss": 0.3236, + "step": 15416 + }, + { + "epoch": 1.248946856772521, + "grad_norm": 0.04731074348092079, + "learning_rate": 0.00015283766146091183, + "loss": 0.3089, + "step": 15417 + }, + { + "epoch": 1.2490278677900195, + "grad_norm": 0.04230158403515816, + "learning_rate": 0.00015283316080831722, + "loss": 0.3025, + "step": 15418 + }, + { + "epoch": 1.2491088788075178, + "grad_norm": 0.04563254117965698, + "learning_rate": 0.0001528286601557226, + "loss": 0.3188, + "step": 15419 + }, + { + "epoch": 1.2491898898250162, + "grad_norm": 0.04034588858485222, + "learning_rate": 0.00015282415950312796, + "loss": 0.2977, + "step": 15420 + }, + { + "epoch": 1.2492709008425145, + "grad_norm": 0.045665670186281204, + "learning_rate": 0.00015281965885053332, + "loss": 0.2815, + "step": 15421 + }, + { + "epoch": 1.249351911860013, + "grad_norm": 0.04323578253388405, + "learning_rate": 0.0001528151581979387, + "loss": 0.291, + "step": 15422 + }, + { + "epoch": 1.2494329228775114, + "grad_norm": 0.03672725334763527, + "learning_rate": 0.00015281065754534407, + "loss": 0.2454, + "step": 15423 + }, + { + "epoch": 1.2495139338950096, + "grad_norm": 0.03858471289277077, + "learning_rate": 0.00015280615689274946, + "loss": 0.3145, + "step": 15424 + }, + { + "epoch": 1.249594944912508, + "grad_norm": 0.03955140709877014, + "learning_rate": 0.00015280165624015484, + "loss": 0.3142, + "step": 15425 + }, + { + "epoch": 1.2496759559300066, + "grad_norm": 0.03527604788541794, + "learning_rate": 0.0001527971555875602, + "loss": 0.2805, + "step": 15426 + }, + { + "epoch": 1.2497569669475048, + "grad_norm": 0.04132481664419174, + "learning_rate": 0.00015279265493496556, + "loss": 0.2937, + "step": 15427 + }, + { + "epoch": 
1.2498379779650033, + "grad_norm": 0.045345671474933624, + "learning_rate": 0.00015278815428237095, + "loss": 0.3325, + "step": 15428 + }, + { + "epoch": 1.2499189889825015, + "grad_norm": 0.047520022839307785, + "learning_rate": 0.00015278365362977634, + "loss": 0.3387, + "step": 15429 + }, + { + "epoch": 1.25, + "grad_norm": 0.03609742969274521, + "learning_rate": 0.0001527791529771817, + "loss": 0.297, + "step": 15430 + }, + { + "epoch": 1.2500810110174982, + "grad_norm": 0.0441846139729023, + "learning_rate": 0.00015277465232458709, + "loss": 0.324, + "step": 15431 + }, + { + "epoch": 1.2501620220349967, + "grad_norm": 0.04409998655319214, + "learning_rate": 0.00015277015167199245, + "loss": 0.3145, + "step": 15432 + }, + { + "epoch": 1.2502430330524952, + "grad_norm": 0.045291196554899216, + "learning_rate": 0.0001527656510193978, + "loss": 0.3083, + "step": 15433 + }, + { + "epoch": 1.2503240440699934, + "grad_norm": 0.043974634259939194, + "learning_rate": 0.0001527611503668032, + "loss": 0.3109, + "step": 15434 + }, + { + "epoch": 1.250405055087492, + "grad_norm": 0.04255552589893341, + "learning_rate": 0.00015275664971420858, + "loss": 0.2912, + "step": 15435 + }, + { + "epoch": 1.2504860661049904, + "grad_norm": 0.04650450497865677, + "learning_rate": 0.00015275214906161394, + "loss": 0.3228, + "step": 15436 + }, + { + "epoch": 1.2505670771224886, + "grad_norm": 0.05076903477311134, + "learning_rate": 0.00015274764840901933, + "loss": 0.3413, + "step": 15437 + }, + { + "epoch": 1.250648088139987, + "grad_norm": 0.05367875471711159, + "learning_rate": 0.0001527431477564247, + "loss": 0.3418, + "step": 15438 + }, + { + "epoch": 1.2507290991574855, + "grad_norm": 0.04548540711402893, + "learning_rate": 0.00015273864710383005, + "loss": 0.3201, + "step": 15439 + }, + { + "epoch": 1.2508101101749838, + "grad_norm": 0.041880495846271515, + "learning_rate": 0.00015273414645123543, + "loss": 0.3281, + "step": 15440 + }, + { + "epoch": 1.2508911211924822, + "grad_norm": 0.04110018536448479, + "learning_rate": 0.00015272964579864082, + "loss": 0.2631, + "step": 15441 + }, + { + "epoch": 1.2509721322099805, + "grad_norm": 0.04534178227186203, + "learning_rate": 0.00015272514514604618, + "loss": 0.317, + "step": 15442 + }, + { + "epoch": 1.251053143227479, + "grad_norm": 0.038438715040683746, + "learning_rate": 0.00015272064449345157, + "loss": 0.2932, + "step": 15443 + }, + { + "epoch": 1.2511341542449772, + "grad_norm": 0.04278789460659027, + "learning_rate": 0.00015271614384085693, + "loss": 0.3121, + "step": 15444 + }, + { + "epoch": 1.2512151652624757, + "grad_norm": 0.04125070944428444, + "learning_rate": 0.0001527116431882623, + "loss": 0.3168, + "step": 15445 + }, + { + "epoch": 1.2512961762799741, + "grad_norm": 0.03827229142189026, + "learning_rate": 0.00015270714253566768, + "loss": 0.2752, + "step": 15446 + }, + { + "epoch": 1.2513771872974724, + "grad_norm": 0.0392126627266407, + "learning_rate": 0.00015270264188307306, + "loss": 0.2835, + "step": 15447 + }, + { + "epoch": 1.2514581983149708, + "grad_norm": 0.0405149906873703, + "learning_rate": 0.00015269814123047842, + "loss": 0.2923, + "step": 15448 + }, + { + "epoch": 1.2515392093324693, + "grad_norm": 0.03790876641869545, + "learning_rate": 0.0001526936405778838, + "loss": 0.3077, + "step": 15449 + }, + { + "epoch": 1.2516202203499676, + "grad_norm": 0.05015541613101959, + "learning_rate": 0.00015268913992528917, + "loss": 0.338, + "step": 15450 + }, + { + "epoch": 1.251701231367466, + "grad_norm": 0.04771968349814415, + 
"learning_rate": 0.00015268463927269453, + "loss": 0.3261, + "step": 15451 + }, + { + "epoch": 1.2517822423849643, + "grad_norm": 0.042020346969366074, + "learning_rate": 0.00015268013862009995, + "loss": 0.3063, + "step": 15452 + }, + { + "epoch": 1.2518632534024627, + "grad_norm": 0.044841647148132324, + "learning_rate": 0.0001526756379675053, + "loss": 0.3062, + "step": 15453 + }, + { + "epoch": 1.251944264419961, + "grad_norm": 0.035896364599466324, + "learning_rate": 0.00015267113731491067, + "loss": 0.2622, + "step": 15454 + }, + { + "epoch": 1.2520252754374595, + "grad_norm": 0.05748176574707031, + "learning_rate": 0.00015266663666231605, + "loss": 0.3526, + "step": 15455 + }, + { + "epoch": 1.252106286454958, + "grad_norm": 0.039488907903432846, + "learning_rate": 0.0001526621360097214, + "loss": 0.3131, + "step": 15456 + }, + { + "epoch": 1.2521872974724562, + "grad_norm": 0.04794564098119736, + "learning_rate": 0.00015265763535712677, + "loss": 0.2788, + "step": 15457 + }, + { + "epoch": 1.2522683084899546, + "grad_norm": 0.0368480309844017, + "learning_rate": 0.0001526531347045322, + "loss": 0.2824, + "step": 15458 + }, + { + "epoch": 1.252349319507453, + "grad_norm": 0.04272850602865219, + "learning_rate": 0.00015264863405193755, + "loss": 0.2885, + "step": 15459 + }, + { + "epoch": 1.2524303305249513, + "grad_norm": 0.04270879179239273, + "learning_rate": 0.0001526441333993429, + "loss": 0.2925, + "step": 15460 + }, + { + "epoch": 1.2525113415424498, + "grad_norm": 0.04070347175002098, + "learning_rate": 0.0001526396327467483, + "loss": 0.3141, + "step": 15461 + }, + { + "epoch": 1.2525923525599483, + "grad_norm": 0.047418802976608276, + "learning_rate": 0.00015263513209415365, + "loss": 0.3224, + "step": 15462 + }, + { + "epoch": 1.2526733635774465, + "grad_norm": 0.040149953216314316, + "learning_rate": 0.00015263063144155901, + "loss": 0.2434, + "step": 15463 + }, + { + "epoch": 1.252754374594945, + "grad_norm": 0.03160017356276512, + "learning_rate": 0.00015262613078896443, + "loss": 0.2669, + "step": 15464 + }, + { + "epoch": 1.2528353856124432, + "grad_norm": 0.043901924043893814, + "learning_rate": 0.0001526216301363698, + "loss": 0.3326, + "step": 15465 + }, + { + "epoch": 1.2529163966299417, + "grad_norm": 0.03790322691202164, + "learning_rate": 0.00015261712948377515, + "loss": 0.2761, + "step": 15466 + }, + { + "epoch": 1.25299740764744, + "grad_norm": 0.043482448905706406, + "learning_rate": 0.00015261262883118054, + "loss": 0.3094, + "step": 15467 + }, + { + "epoch": 1.2530784186649384, + "grad_norm": 0.039540037512779236, + "learning_rate": 0.0001526081281785859, + "loss": 0.2762, + "step": 15468 + }, + { + "epoch": 1.2531594296824369, + "grad_norm": 0.03756888955831528, + "learning_rate": 0.00015260362752599126, + "loss": 0.2782, + "step": 15469 + }, + { + "epoch": 1.2532404406999351, + "grad_norm": 0.045715007930994034, + "learning_rate": 0.00015259912687339667, + "loss": 0.2472, + "step": 15470 + }, + { + "epoch": 1.2533214517174336, + "grad_norm": 0.04127716273069382, + "learning_rate": 0.00015259462622080203, + "loss": 0.2787, + "step": 15471 + }, + { + "epoch": 1.253402462734932, + "grad_norm": 0.042705681174993515, + "learning_rate": 0.0001525901255682074, + "loss": 0.3274, + "step": 15472 + }, + { + "epoch": 1.2534834737524303, + "grad_norm": 0.04192306101322174, + "learning_rate": 0.00015258562491561278, + "loss": 0.3108, + "step": 15473 + }, + { + "epoch": 1.2535644847699288, + "grad_norm": 0.04167857766151428, + "learning_rate": 0.00015258112426301814, 
+ "loss": 0.2934, + "step": 15474 + }, + { + "epoch": 1.253645495787427, + "grad_norm": 0.044703904539346695, + "learning_rate": 0.0001525766236104235, + "loss": 0.3106, + "step": 15475 + }, + { + "epoch": 1.2537265068049255, + "grad_norm": 0.03742494434118271, + "learning_rate": 0.0001525721229578289, + "loss": 0.2525, + "step": 15476 + }, + { + "epoch": 1.2538075178224237, + "grad_norm": 0.04481251910328865, + "learning_rate": 0.00015256762230523427, + "loss": 0.3179, + "step": 15477 + }, + { + "epoch": 1.2538885288399222, + "grad_norm": 0.04333583638072014, + "learning_rate": 0.00015256312165263963, + "loss": 0.3089, + "step": 15478 + }, + { + "epoch": 1.2539695398574207, + "grad_norm": 0.052331723272800446, + "learning_rate": 0.00015255862100004502, + "loss": 0.343, + "step": 15479 + }, + { + "epoch": 1.254050550874919, + "grad_norm": 0.045910757035017014, + "learning_rate": 0.00015255412034745038, + "loss": 0.3119, + "step": 15480 + }, + { + "epoch": 1.2541315618924174, + "grad_norm": 0.04972159489989281, + "learning_rate": 0.00015254961969485577, + "loss": 0.3247, + "step": 15481 + }, + { + "epoch": 1.2542125729099158, + "grad_norm": 0.045085709542036057, + "learning_rate": 0.00015254511904226115, + "loss": 0.3349, + "step": 15482 + }, + { + "epoch": 1.254293583927414, + "grad_norm": 0.13385389745235443, + "learning_rate": 0.0001525406183896665, + "loss": 0.2876, + "step": 15483 + }, + { + "epoch": 1.2543745949449125, + "grad_norm": 0.04315098002552986, + "learning_rate": 0.00015253611773707187, + "loss": 0.3223, + "step": 15484 + }, + { + "epoch": 1.254455605962411, + "grad_norm": 0.045498088002204895, + "learning_rate": 0.00015253161708447726, + "loss": 0.3079, + "step": 15485 + }, + { + "epoch": 1.2545366169799093, + "grad_norm": 0.041511572897434235, + "learning_rate": 0.00015252711643188262, + "loss": 0.3216, + "step": 15486 + }, + { + "epoch": 1.2546176279974077, + "grad_norm": 0.0475231409072876, + "learning_rate": 0.000152522615779288, + "loss": 0.3182, + "step": 15487 + }, + { + "epoch": 1.254698639014906, + "grad_norm": 0.03755148500204086, + "learning_rate": 0.0001525181151266934, + "loss": 0.3006, + "step": 15488 + }, + { + "epoch": 1.2547796500324044, + "grad_norm": 0.045942701399326324, + "learning_rate": 0.00015251361447409875, + "loss": 0.3557, + "step": 15489 + }, + { + "epoch": 1.2548606610499027, + "grad_norm": 0.04447175934910774, + "learning_rate": 0.00015250911382150411, + "loss": 0.2819, + "step": 15490 + }, + { + "epoch": 1.2549416720674011, + "grad_norm": 0.04260551556944847, + "learning_rate": 0.0001525046131689095, + "loss": 0.3191, + "step": 15491 + }, + { + "epoch": 1.2550226830848996, + "grad_norm": 0.04521317034959793, + "learning_rate": 0.00015250011251631486, + "loss": 0.3028, + "step": 15492 + }, + { + "epoch": 1.2551036941023979, + "grad_norm": 0.04569809138774872, + "learning_rate": 0.00015249561186372025, + "loss": 0.3328, + "step": 15493 + }, + { + "epoch": 1.2551847051198963, + "grad_norm": 0.045397352427244186, + "learning_rate": 0.00015249111121112564, + "loss": 0.2722, + "step": 15494 + }, + { + "epoch": 1.2552657161373948, + "grad_norm": 0.04033495858311653, + "learning_rate": 0.000152486610558531, + "loss": 0.2855, + "step": 15495 + }, + { + "epoch": 1.255346727154893, + "grad_norm": 0.041850823909044266, + "learning_rate": 0.00015248210990593636, + "loss": 0.2908, + "step": 15496 + }, + { + "epoch": 1.2554277381723915, + "grad_norm": 0.042379043996334076, + "learning_rate": 0.00015247760925334174, + "loss": 0.2709, + "step": 15497 + }, + { 
+ "epoch": 1.2555087491898898, + "grad_norm": 0.04308145493268967, + "learning_rate": 0.0001524731086007471, + "loss": 0.3076, + "step": 15498 + }, + { + "epoch": 1.2555897602073882, + "grad_norm": 0.036246269941329956, + "learning_rate": 0.0001524686079481525, + "loss": 0.2688, + "step": 15499 + }, + { + "epoch": 1.2556707712248865, + "grad_norm": 0.04566572234034538, + "learning_rate": 0.00015246410729555788, + "loss": 0.2818, + "step": 15500 + }, + { + "epoch": 1.255751782242385, + "grad_norm": 0.04354838281869888, + "learning_rate": 0.00015245960664296324, + "loss": 0.3119, + "step": 15501 + }, + { + "epoch": 1.2558327932598834, + "grad_norm": 0.03831060230731964, + "learning_rate": 0.0001524551059903686, + "loss": 0.2723, + "step": 15502 + }, + { + "epoch": 1.2559138042773816, + "grad_norm": 0.04013657569885254, + "learning_rate": 0.00015245060533777399, + "loss": 0.2976, + "step": 15503 + }, + { + "epoch": 1.25599481529488, + "grad_norm": 0.04500504210591316, + "learning_rate": 0.00015244610468517935, + "loss": 0.3199, + "step": 15504 + }, + { + "epoch": 1.2560758263123786, + "grad_norm": 0.04480867460370064, + "learning_rate": 0.00015244160403258473, + "loss": 0.3475, + "step": 15505 + }, + { + "epoch": 1.2561568373298768, + "grad_norm": 0.042447056621313095, + "learning_rate": 0.00015243710337999012, + "loss": 0.2794, + "step": 15506 + }, + { + "epoch": 1.2562378483473753, + "grad_norm": 0.040769752115011215, + "learning_rate": 0.00015243260272739548, + "loss": 0.3123, + "step": 15507 + }, + { + "epoch": 1.2563188593648738, + "grad_norm": 0.04607153683900833, + "learning_rate": 0.00015242810207480084, + "loss": 0.306, + "step": 15508 + }, + { + "epoch": 1.256399870382372, + "grad_norm": 0.0423283614218235, + "learning_rate": 0.00015242360142220623, + "loss": 0.2972, + "step": 15509 + }, + { + "epoch": 1.2564808813998705, + "grad_norm": 0.04975079372525215, + "learning_rate": 0.00015241910076961161, + "loss": 0.3243, + "step": 15510 + }, + { + "epoch": 1.2565618924173687, + "grad_norm": 0.042507603764534, + "learning_rate": 0.00015241460011701697, + "loss": 0.2612, + "step": 15511 + }, + { + "epoch": 1.2566429034348672, + "grad_norm": 0.04295576363801956, + "learning_rate": 0.00015241009946442236, + "loss": 0.3121, + "step": 15512 + }, + { + "epoch": 1.2567239144523654, + "grad_norm": 0.054205574095249176, + "learning_rate": 0.00015240559881182772, + "loss": 0.3556, + "step": 15513 + }, + { + "epoch": 1.2568049254698639, + "grad_norm": 0.041493676602840424, + "learning_rate": 0.00015240109815923308, + "loss": 0.3016, + "step": 15514 + }, + { + "epoch": 1.2568859364873624, + "grad_norm": 0.0442705899477005, + "learning_rate": 0.00015239659750663847, + "loss": 0.2703, + "step": 15515 + }, + { + "epoch": 1.2569669475048606, + "grad_norm": 0.043185483664274216, + "learning_rate": 0.00015239209685404386, + "loss": 0.2641, + "step": 15516 + }, + { + "epoch": 1.257047958522359, + "grad_norm": 0.04261036589741707, + "learning_rate": 0.00015238759620144922, + "loss": 0.2885, + "step": 15517 + }, + { + "epoch": 1.2571289695398575, + "grad_norm": 0.03761589154601097, + "learning_rate": 0.0001523830955488546, + "loss": 0.2964, + "step": 15518 + }, + { + "epoch": 1.2572099805573558, + "grad_norm": 0.039735615253448486, + "learning_rate": 0.00015237859489625996, + "loss": 0.3292, + "step": 15519 + }, + { + "epoch": 1.2572909915748542, + "grad_norm": 0.043260227888822556, + "learning_rate": 0.00015237409424366532, + "loss": 0.3044, + "step": 15520 + }, + { + "epoch": 1.2573720025923525, + 
"grad_norm": 0.046115051954984665, + "learning_rate": 0.0001523695935910707, + "loss": 0.2867, + "step": 15521 + }, + { + "epoch": 1.257453013609851, + "grad_norm": 0.05022734776139259, + "learning_rate": 0.0001523650929384761, + "loss": 0.3176, + "step": 15522 + }, + { + "epoch": 1.2575340246273492, + "grad_norm": 0.047274477779865265, + "learning_rate": 0.00015236059228588146, + "loss": 0.3044, + "step": 15523 + }, + { + "epoch": 1.2576150356448477, + "grad_norm": 0.04098426550626755, + "learning_rate": 0.00015235609163328684, + "loss": 0.2952, + "step": 15524 + }, + { + "epoch": 1.2576960466623461, + "grad_norm": 0.04345531016588211, + "learning_rate": 0.0001523515909806922, + "loss": 0.2888, + "step": 15525 + }, + { + "epoch": 1.2577770576798444, + "grad_norm": 0.044809646904468536, + "learning_rate": 0.00015234709032809756, + "loss": 0.287, + "step": 15526 + }, + { + "epoch": 1.2578580686973428, + "grad_norm": 0.045098863542079926, + "learning_rate": 0.00015234258967550295, + "loss": 0.3268, + "step": 15527 + }, + { + "epoch": 1.2579390797148413, + "grad_norm": 0.04287554696202278, + "learning_rate": 0.00015233808902290834, + "loss": 0.2836, + "step": 15528 + }, + { + "epoch": 1.2580200907323396, + "grad_norm": 0.04294641315937042, + "learning_rate": 0.0001523335883703137, + "loss": 0.3327, + "step": 15529 + }, + { + "epoch": 1.258101101749838, + "grad_norm": 0.04521435499191284, + "learning_rate": 0.00015232908771771909, + "loss": 0.3041, + "step": 15530 + }, + { + "epoch": 1.2581821127673365, + "grad_norm": 0.046372584998607635, + "learning_rate": 0.00015232458706512445, + "loss": 0.315, + "step": 15531 + }, + { + "epoch": 1.2582631237848347, + "grad_norm": 0.050368066877126694, + "learning_rate": 0.0001523200864125298, + "loss": 0.3145, + "step": 15532 + }, + { + "epoch": 1.258344134802333, + "grad_norm": 0.04091639816761017, + "learning_rate": 0.00015231558575993522, + "loss": 0.2912, + "step": 15533 + }, + { + "epoch": 1.2584251458198314, + "grad_norm": 0.03936908766627312, + "learning_rate": 0.00015231108510734058, + "loss": 0.2832, + "step": 15534 + }, + { + "epoch": 1.25850615683733, + "grad_norm": 0.0416514128446579, + "learning_rate": 0.00015230658445474594, + "loss": 0.2746, + "step": 15535 + }, + { + "epoch": 1.2585871678548282, + "grad_norm": 0.04907258227467537, + "learning_rate": 0.00015230208380215133, + "loss": 0.357, + "step": 15536 + }, + { + "epoch": 1.2586681788723266, + "grad_norm": 0.04133478179574013, + "learning_rate": 0.0001522975831495567, + "loss": 0.3154, + "step": 15537 + }, + { + "epoch": 1.258749189889825, + "grad_norm": 0.03562864661216736, + "learning_rate": 0.00015229308249696205, + "loss": 0.2747, + "step": 15538 + }, + { + "epoch": 1.2588302009073233, + "grad_norm": 0.045105401426553726, + "learning_rate": 0.00015228858184436746, + "loss": 0.3255, + "step": 15539 + }, + { + "epoch": 1.2589112119248218, + "grad_norm": 0.04034801945090294, + "learning_rate": 0.00015228408119177282, + "loss": 0.2869, + "step": 15540 + }, + { + "epoch": 1.2589922229423203, + "grad_norm": 0.0402255542576313, + "learning_rate": 0.00015227958053917818, + "loss": 0.2629, + "step": 15541 + }, + { + "epoch": 1.2590732339598185, + "grad_norm": 0.042664676904678345, + "learning_rate": 0.00015227507988658357, + "loss": 0.3399, + "step": 15542 + }, + { + "epoch": 1.259154244977317, + "grad_norm": 0.04219324514269829, + "learning_rate": 0.00015227057923398893, + "loss": 0.2668, + "step": 15543 + }, + { + "epoch": 1.2592352559948152, + "grad_norm": 0.047899648547172546, + 
"learning_rate": 0.0001522660785813943, + "loss": 0.3185, + "step": 15544 + }, + { + "epoch": 1.2593162670123137, + "grad_norm": 0.04278462007641792, + "learning_rate": 0.0001522615779287997, + "loss": 0.3015, + "step": 15545 + }, + { + "epoch": 1.259397278029812, + "grad_norm": 0.04308171570301056, + "learning_rate": 0.00015225707727620506, + "loss": 0.3054, + "step": 15546 + }, + { + "epoch": 1.2594782890473104, + "grad_norm": 0.0512288436293602, + "learning_rate": 0.00015225257662361042, + "loss": 0.3416, + "step": 15547 + }, + { + "epoch": 1.2595593000648089, + "grad_norm": 0.04132172837853432, + "learning_rate": 0.0001522480759710158, + "loss": 0.2973, + "step": 15548 + }, + { + "epoch": 1.2596403110823071, + "grad_norm": 0.046558722853660583, + "learning_rate": 0.00015224357531842117, + "loss": 0.3055, + "step": 15549 + }, + { + "epoch": 1.2597213220998056, + "grad_norm": 0.036320433020591736, + "learning_rate": 0.00015223907466582653, + "loss": 0.3049, + "step": 15550 + }, + { + "epoch": 1.259802333117304, + "grad_norm": 0.048258814960718155, + "learning_rate": 0.00015223457401323195, + "loss": 0.3205, + "step": 15551 + }, + { + "epoch": 1.2598833441348023, + "grad_norm": 0.045856986194849014, + "learning_rate": 0.0001522300733606373, + "loss": 0.3342, + "step": 15552 + }, + { + "epoch": 1.2599643551523008, + "grad_norm": 0.04905549809336662, + "learning_rate": 0.00015222557270804267, + "loss": 0.3411, + "step": 15553 + }, + { + "epoch": 1.2600453661697992, + "grad_norm": 0.043548863381147385, + "learning_rate": 0.00015222107205544805, + "loss": 0.2846, + "step": 15554 + }, + { + "epoch": 1.2601263771872975, + "grad_norm": 0.043483052402734756, + "learning_rate": 0.0001522165714028534, + "loss": 0.2673, + "step": 15555 + }, + { + "epoch": 1.2602073882047957, + "grad_norm": 0.043352123349905014, + "learning_rate": 0.00015221207075025877, + "loss": 0.286, + "step": 15556 + }, + { + "epoch": 1.2602883992222942, + "grad_norm": 0.04295540973544121, + "learning_rate": 0.0001522075700976642, + "loss": 0.2844, + "step": 15557 + }, + { + "epoch": 1.2603694102397927, + "grad_norm": 0.04903509467840195, + "learning_rate": 0.00015220306944506955, + "loss": 0.3473, + "step": 15558 + }, + { + "epoch": 1.260450421257291, + "grad_norm": 0.04419170320034027, + "learning_rate": 0.0001521985687924749, + "loss": 0.2864, + "step": 15559 + }, + { + "epoch": 1.2605314322747894, + "grad_norm": 0.04279334098100662, + "learning_rate": 0.0001521940681398803, + "loss": 0.293, + "step": 15560 + }, + { + "epoch": 1.2606124432922878, + "grad_norm": 0.03789504989981651, + "learning_rate": 0.00015218956748728565, + "loss": 0.261, + "step": 15561 + }, + { + "epoch": 1.260693454309786, + "grad_norm": 0.0386795774102211, + "learning_rate": 0.00015218506683469104, + "loss": 0.2895, + "step": 15562 + }, + { + "epoch": 1.2607744653272845, + "grad_norm": 0.0440162718296051, + "learning_rate": 0.00015218056618209643, + "loss": 0.3101, + "step": 15563 + }, + { + "epoch": 1.260855476344783, + "grad_norm": 0.04073643311858177, + "learning_rate": 0.0001521760655295018, + "loss": 0.3154, + "step": 15564 + }, + { + "epoch": 1.2609364873622813, + "grad_norm": 0.036714520305395126, + "learning_rate": 0.00015217156487690715, + "loss": 0.3254, + "step": 15565 + }, + { + "epoch": 1.2610174983797797, + "grad_norm": 0.039863135665655136, + "learning_rate": 0.00015216706422431254, + "loss": 0.2819, + "step": 15566 + }, + { + "epoch": 1.261098509397278, + "grad_norm": 0.04084132984280586, + "learning_rate": 0.0001521625635717179, + 
"loss": 0.2923, + "step": 15567 + }, + { + "epoch": 1.2611795204147764, + "grad_norm": 0.03947173058986664, + "learning_rate": 0.00015215806291912328, + "loss": 0.2807, + "step": 15568 + }, + { + "epoch": 1.2612605314322747, + "grad_norm": 0.04105982184410095, + "learning_rate": 0.00015215356226652867, + "loss": 0.2919, + "step": 15569 + }, + { + "epoch": 1.2613415424497731, + "grad_norm": 0.049376294016838074, + "learning_rate": 0.00015214906161393403, + "loss": 0.3503, + "step": 15570 + }, + { + "epoch": 1.2614225534672716, + "grad_norm": 0.039907824248075485, + "learning_rate": 0.0001521445609613394, + "loss": 0.2742, + "step": 15571 + }, + { + "epoch": 1.2615035644847699, + "grad_norm": 0.03646915778517723, + "learning_rate": 0.00015214006030874478, + "loss": 0.2981, + "step": 15572 + }, + { + "epoch": 1.2615845755022683, + "grad_norm": 0.04895637184381485, + "learning_rate": 0.00015213555965615014, + "loss": 0.3376, + "step": 15573 + }, + { + "epoch": 1.2616655865197668, + "grad_norm": 0.043007057160139084, + "learning_rate": 0.00015213105900355552, + "loss": 0.2945, + "step": 15574 + }, + { + "epoch": 1.261746597537265, + "grad_norm": 0.03746945038437843, + "learning_rate": 0.0001521265583509609, + "loss": 0.3082, + "step": 15575 + }, + { + "epoch": 1.2618276085547635, + "grad_norm": 0.0426875576376915, + "learning_rate": 0.00015212205769836627, + "loss": 0.2913, + "step": 15576 + }, + { + "epoch": 1.2619086195722617, + "grad_norm": 0.04929223656654358, + "learning_rate": 0.00015211755704577163, + "loss": 0.3079, + "step": 15577 + }, + { + "epoch": 1.2619896305897602, + "grad_norm": 0.051895756274461746, + "learning_rate": 0.00015211305639317702, + "loss": 0.3234, + "step": 15578 + }, + { + "epoch": 1.2620706416072585, + "grad_norm": 0.036069467663764954, + "learning_rate": 0.00015210855574058238, + "loss": 0.2533, + "step": 15579 + }, + { + "epoch": 1.262151652624757, + "grad_norm": 0.04682864993810654, + "learning_rate": 0.00015210405508798777, + "loss": 0.3029, + "step": 15580 + }, + { + "epoch": 1.2622326636422554, + "grad_norm": 0.05245593190193176, + "learning_rate": 0.00015209955443539315, + "loss": 0.2854, + "step": 15581 + }, + { + "epoch": 1.2623136746597536, + "grad_norm": 0.04711830988526344, + "learning_rate": 0.0001520950537827985, + "loss": 0.2838, + "step": 15582 + }, + { + "epoch": 1.262394685677252, + "grad_norm": 0.03892109543085098, + "learning_rate": 0.00015209055313020387, + "loss": 0.3, + "step": 15583 + }, + { + "epoch": 1.2624756966947506, + "grad_norm": 0.040280282497406006, + "learning_rate": 0.00015208605247760926, + "loss": 0.2914, + "step": 15584 + }, + { + "epoch": 1.2625567077122488, + "grad_norm": 0.04260111600160599, + "learning_rate": 0.00015208155182501465, + "loss": 0.3323, + "step": 15585 + }, + { + "epoch": 1.2626377187297473, + "grad_norm": 0.03925931453704834, + "learning_rate": 0.00015207705117242, + "loss": 0.2601, + "step": 15586 + }, + { + "epoch": 1.2627187297472457, + "grad_norm": 0.03990313410758972, + "learning_rate": 0.0001520725505198254, + "loss": 0.3163, + "step": 15587 + }, + { + "epoch": 1.262799740764744, + "grad_norm": 0.036611080169677734, + "learning_rate": 0.00015206804986723076, + "loss": 0.3046, + "step": 15588 + }, + { + "epoch": 1.2628807517822425, + "grad_norm": 0.04936470463871956, + "learning_rate": 0.00015206354921463612, + "loss": 0.3541, + "step": 15589 + }, + { + "epoch": 1.2629617627997407, + "grad_norm": 0.04631369933485985, + "learning_rate": 0.0001520590485620415, + "loss": 0.3364, + "step": 15590 + }, + { + 
"epoch": 1.2630427738172392, + "grad_norm": 0.03967304900288582, + "learning_rate": 0.0001520545479094469, + "loss": 0.2897, + "step": 15591 + }, + { + "epoch": 1.2631237848347374, + "grad_norm": 0.03701883181929588, + "learning_rate": 0.00015205004725685225, + "loss": 0.2763, + "step": 15592 + }, + { + "epoch": 1.2632047958522359, + "grad_norm": 0.04268396273255348, + "learning_rate": 0.00015204554660425764, + "loss": 0.2927, + "step": 15593 + }, + { + "epoch": 1.2632858068697344, + "grad_norm": 0.0401676669716835, + "learning_rate": 0.000152041045951663, + "loss": 0.2805, + "step": 15594 + }, + { + "epoch": 1.2633668178872326, + "grad_norm": 0.04197492450475693, + "learning_rate": 0.00015203654529906836, + "loss": 0.2643, + "step": 15595 + }, + { + "epoch": 1.263447828904731, + "grad_norm": 0.0464635044336319, + "learning_rate": 0.00015203204464647374, + "loss": 0.3178, + "step": 15596 + }, + { + "epoch": 1.2635288399222295, + "grad_norm": 0.039722513407468796, + "learning_rate": 0.00015202754399387913, + "loss": 0.2892, + "step": 15597 + }, + { + "epoch": 1.2636098509397278, + "grad_norm": 0.049931466579437256, + "learning_rate": 0.0001520230433412845, + "loss": 0.3061, + "step": 15598 + }, + { + "epoch": 1.2636908619572262, + "grad_norm": 0.04221179708838463, + "learning_rate": 0.00015201854268868988, + "loss": 0.3012, + "step": 15599 + }, + { + "epoch": 1.2637718729747245, + "grad_norm": 0.045019712299108505, + "learning_rate": 0.00015201404203609524, + "loss": 0.2959, + "step": 15600 + }, + { + "epoch": 1.263852883992223, + "grad_norm": 0.04281473159790039, + "learning_rate": 0.0001520095413835006, + "loss": 0.2961, + "step": 15601 + }, + { + "epoch": 1.2639338950097212, + "grad_norm": 0.04097730293869972, + "learning_rate": 0.00015200504073090599, + "loss": 0.3082, + "step": 15602 + }, + { + "epoch": 1.2640149060272197, + "grad_norm": 0.0397987924516201, + "learning_rate": 0.00015200054007831137, + "loss": 0.2656, + "step": 15603 + }, + { + "epoch": 1.2640959170447181, + "grad_norm": 0.04969324544072151, + "learning_rate": 0.00015199603942571673, + "loss": 0.329, + "step": 15604 + }, + { + "epoch": 1.2641769280622164, + "grad_norm": 0.04034838080406189, + "learning_rate": 0.00015199153877312212, + "loss": 0.2902, + "step": 15605 + }, + { + "epoch": 1.2642579390797148, + "grad_norm": 0.0383288599550724, + "learning_rate": 0.00015198703812052748, + "loss": 0.2864, + "step": 15606 + }, + { + "epoch": 1.2643389500972133, + "grad_norm": 0.044115833938121796, + "learning_rate": 0.00015198253746793284, + "loss": 0.2675, + "step": 15607 + }, + { + "epoch": 1.2644199611147116, + "grad_norm": 0.04049624130129814, + "learning_rate": 0.00015197803681533823, + "loss": 0.3262, + "step": 15608 + }, + { + "epoch": 1.26450097213221, + "grad_norm": 0.04395872727036476, + "learning_rate": 0.00015197353616274361, + "loss": 0.2952, + "step": 15609 + }, + { + "epoch": 1.2645819831497085, + "grad_norm": 0.04466897249221802, + "learning_rate": 0.00015196903551014897, + "loss": 0.3386, + "step": 15610 + }, + { + "epoch": 1.2646629941672067, + "grad_norm": 0.040849585086107254, + "learning_rate": 0.00015196453485755436, + "loss": 0.2914, + "step": 15611 + }, + { + "epoch": 1.2647440051847052, + "grad_norm": 0.04917623847723007, + "learning_rate": 0.00015196003420495972, + "loss": 0.3178, + "step": 15612 + }, + { + "epoch": 1.2648250162022034, + "grad_norm": 0.04036034271121025, + "learning_rate": 0.00015195553355236508, + "loss": 0.2835, + "step": 15613 + }, + { + "epoch": 1.264906027219702, + "grad_norm": 
0.044372837990522385, + "learning_rate": 0.0001519510328997705, + "loss": 0.326, + "step": 15614 + }, + { + "epoch": 1.2649870382372002, + "grad_norm": 0.048746511340141296, + "learning_rate": 0.00015194653224717586, + "loss": 0.3475, + "step": 15615 + }, + { + "epoch": 1.2650680492546986, + "grad_norm": 0.03705955669283867, + "learning_rate": 0.00015194203159458122, + "loss": 0.2772, + "step": 15616 + }, + { + "epoch": 1.265149060272197, + "grad_norm": 0.040319714695215225, + "learning_rate": 0.0001519375309419866, + "loss": 0.3251, + "step": 15617 + }, + { + "epoch": 1.2652300712896953, + "grad_norm": 0.04241141304373741, + "learning_rate": 0.00015193303028939196, + "loss": 0.2897, + "step": 15618 + }, + { + "epoch": 1.2653110823071938, + "grad_norm": 0.03959643840789795, + "learning_rate": 0.00015192852963679732, + "loss": 0.2967, + "step": 15619 + }, + { + "epoch": 1.2653920933246923, + "grad_norm": 0.03806019946932793, + "learning_rate": 0.00015192402898420274, + "loss": 0.2612, + "step": 15620 + }, + { + "epoch": 1.2654731043421905, + "grad_norm": 0.03872664272785187, + "learning_rate": 0.0001519195283316081, + "loss": 0.2668, + "step": 15621 + }, + { + "epoch": 1.265554115359689, + "grad_norm": 0.04202438145875931, + "learning_rate": 0.00015191502767901346, + "loss": 0.3002, + "step": 15622 + }, + { + "epoch": 1.2656351263771872, + "grad_norm": 0.03957134857773781, + "learning_rate": 0.00015191052702641884, + "loss": 0.3285, + "step": 15623 + }, + { + "epoch": 1.2657161373946857, + "grad_norm": 0.04093240946531296, + "learning_rate": 0.0001519060263738242, + "loss": 0.3002, + "step": 15624 + }, + { + "epoch": 1.265797148412184, + "grad_norm": 0.043608762323856354, + "learning_rate": 0.00015190152572122956, + "loss": 0.2952, + "step": 15625 + }, + { + "epoch": 1.2658781594296824, + "grad_norm": 0.045039575546979904, + "learning_rate": 0.00015189702506863498, + "loss": 0.3193, + "step": 15626 + }, + { + "epoch": 1.2659591704471809, + "grad_norm": 0.04398264363408089, + "learning_rate": 0.00015189252441604034, + "loss": 0.3487, + "step": 15627 + }, + { + "epoch": 1.2660401814646791, + "grad_norm": 0.04162249714136124, + "learning_rate": 0.0001518880237634457, + "loss": 0.3025, + "step": 15628 + }, + { + "epoch": 1.2661211924821776, + "grad_norm": 0.03937450423836708, + "learning_rate": 0.00015188352311085109, + "loss": 0.3137, + "step": 15629 + }, + { + "epoch": 1.266202203499676, + "grad_norm": 0.046762723475694656, + "learning_rate": 0.00015187902245825645, + "loss": 0.2971, + "step": 15630 + }, + { + "epoch": 1.2662832145171743, + "grad_norm": 0.03874950483441353, + "learning_rate": 0.0001518745218056618, + "loss": 0.3116, + "step": 15631 + }, + { + "epoch": 1.2663642255346728, + "grad_norm": 0.04077541083097458, + "learning_rate": 0.00015187002115306722, + "loss": 0.2905, + "step": 15632 + }, + { + "epoch": 1.2664452365521712, + "grad_norm": 0.04175429791212082, + "learning_rate": 0.00015186552050047258, + "loss": 0.2429, + "step": 15633 + }, + { + "epoch": 1.2665262475696695, + "grad_norm": 0.04630570858716965, + "learning_rate": 0.00015186101984787794, + "loss": 0.3291, + "step": 15634 + }, + { + "epoch": 1.2666072585871677, + "grad_norm": 0.04232180118560791, + "learning_rate": 0.00015185651919528333, + "loss": 0.2794, + "step": 15635 + }, + { + "epoch": 1.2666882696046662, + "grad_norm": 0.043028462678194046, + "learning_rate": 0.0001518520185426887, + "loss": 0.324, + "step": 15636 + }, + { + "epoch": 1.2667692806221647, + "grad_norm": 0.04590842127799988, + "learning_rate": 
0.00015184751789009405, + "loss": 0.319, + "step": 15637 + }, + { + "epoch": 1.266850291639663, + "grad_norm": 0.040628597140312195, + "learning_rate": 0.00015184301723749946, + "loss": 0.2947, + "step": 15638 + }, + { + "epoch": 1.2669313026571614, + "grad_norm": 0.03495374694466591, + "learning_rate": 0.00015183851658490482, + "loss": 0.2917, + "step": 15639 + }, + { + "epoch": 1.2670123136746598, + "grad_norm": 0.04080813005566597, + "learning_rate": 0.00015183401593231018, + "loss": 0.2997, + "step": 15640 + }, + { + "epoch": 1.267093324692158, + "grad_norm": 0.038417864590883255, + "learning_rate": 0.00015182951527971557, + "loss": 0.2355, + "step": 15641 + }, + { + "epoch": 1.2671743357096565, + "grad_norm": 0.04798012971878052, + "learning_rate": 0.00015182501462712093, + "loss": 0.2792, + "step": 15642 + }, + { + "epoch": 1.267255346727155, + "grad_norm": 0.045979537069797516, + "learning_rate": 0.00015182051397452632, + "loss": 0.2972, + "step": 15643 + }, + { + "epoch": 1.2673363577446533, + "grad_norm": 0.05009061470627785, + "learning_rate": 0.0001518160133219317, + "loss": 0.3475, + "step": 15644 + }, + { + "epoch": 1.2674173687621517, + "grad_norm": 0.046117499470710754, + "learning_rate": 0.00015181151266933706, + "loss": 0.289, + "step": 15645 + }, + { + "epoch": 1.26749837977965, + "grad_norm": 0.04126918315887451, + "learning_rate": 0.00015180701201674242, + "loss": 0.2819, + "step": 15646 + }, + { + "epoch": 1.2675793907971484, + "grad_norm": 0.041201528161764145, + "learning_rate": 0.0001518025113641478, + "loss": 0.3306, + "step": 15647 + }, + { + "epoch": 1.2676604018146467, + "grad_norm": 0.04013405367732048, + "learning_rate": 0.00015179801071155317, + "loss": 0.2826, + "step": 15648 + }, + { + "epoch": 1.2677414128321451, + "grad_norm": 0.03912288695573807, + "learning_rate": 0.00015179351005895856, + "loss": 0.3013, + "step": 15649 + }, + { + "epoch": 1.2678224238496436, + "grad_norm": 0.04670781269669533, + "learning_rate": 0.00015178900940636395, + "loss": 0.2704, + "step": 15650 + }, + { + "epoch": 1.2679034348671419, + "grad_norm": 0.038590800017118454, + "learning_rate": 0.0001517845087537693, + "loss": 0.2888, + "step": 15651 + }, + { + "epoch": 1.2679844458846403, + "grad_norm": 0.04906099662184715, + "learning_rate": 0.00015178000810117467, + "loss": 0.3401, + "step": 15652 + }, + { + "epoch": 1.2680654569021388, + "grad_norm": 0.04225248098373413, + "learning_rate": 0.00015177550744858005, + "loss": 0.2836, + "step": 15653 + }, + { + "epoch": 1.268146467919637, + "grad_norm": 0.038599442690610886, + "learning_rate": 0.0001517710067959854, + "loss": 0.2802, + "step": 15654 + }, + { + "epoch": 1.2682274789371355, + "grad_norm": 0.038595061749219894, + "learning_rate": 0.0001517665061433908, + "loss": 0.2787, + "step": 15655 + }, + { + "epoch": 1.268308489954634, + "grad_norm": 0.038777854293584824, + "learning_rate": 0.0001517620054907962, + "loss": 0.3059, + "step": 15656 + }, + { + "epoch": 1.2683895009721322, + "grad_norm": 0.04319249466061592, + "learning_rate": 0.00015175750483820155, + "loss": 0.3212, + "step": 15657 + }, + { + "epoch": 1.2684705119896305, + "grad_norm": 0.04285728558897972, + "learning_rate": 0.0001517530041856069, + "loss": 0.3142, + "step": 15658 + }, + { + "epoch": 1.268551523007129, + "grad_norm": 0.045756593346595764, + "learning_rate": 0.0001517485035330123, + "loss": 0.3022, + "step": 15659 + }, + { + "epoch": 1.2686325340246274, + "grad_norm": 0.03946535289287567, + "learning_rate": 0.00015174400288041765, + "loss": 0.28, + 
"step": 15660 + }, + { + "epoch": 1.2687135450421256, + "grad_norm": 0.04428525269031525, + "learning_rate": 0.00015173950222782304, + "loss": 0.2901, + "step": 15661 + }, + { + "epoch": 1.268794556059624, + "grad_norm": 0.04757576063275337, + "learning_rate": 0.00015173500157522843, + "loss": 0.3419, + "step": 15662 + }, + { + "epoch": 1.2688755670771226, + "grad_norm": 0.043652888387441635, + "learning_rate": 0.0001517305009226338, + "loss": 0.3208, + "step": 15663 + }, + { + "epoch": 1.2689565780946208, + "grad_norm": 0.044308267533779144, + "learning_rate": 0.00015172600027003915, + "loss": 0.29, + "step": 15664 + }, + { + "epoch": 1.2690375891121193, + "grad_norm": 0.04619684815406799, + "learning_rate": 0.00015172149961744454, + "loss": 0.3121, + "step": 15665 + }, + { + "epoch": 1.2691186001296177, + "grad_norm": 0.04882878437638283, + "learning_rate": 0.00015171699896484992, + "loss": 0.3082, + "step": 15666 + }, + { + "epoch": 1.269199611147116, + "grad_norm": 0.04964202269911766, + "learning_rate": 0.00015171249831225528, + "loss": 0.3177, + "step": 15667 + }, + { + "epoch": 1.2692806221646145, + "grad_norm": 0.05015742406249046, + "learning_rate": 0.00015170799765966067, + "loss": 0.3823, + "step": 15668 + }, + { + "epoch": 1.2693616331821127, + "grad_norm": 0.047566697001457214, + "learning_rate": 0.00015170349700706603, + "loss": 0.3273, + "step": 15669 + }, + { + "epoch": 1.2694426441996112, + "grad_norm": 0.04747958853840828, + "learning_rate": 0.0001516989963544714, + "loss": 0.2991, + "step": 15670 + }, + { + "epoch": 1.2695236552171094, + "grad_norm": 0.04064423590898514, + "learning_rate": 0.00015169449570187678, + "loss": 0.2945, + "step": 15671 + }, + { + "epoch": 1.2696046662346079, + "grad_norm": 0.04318852722644806, + "learning_rate": 0.00015168999504928216, + "loss": 0.3127, + "step": 15672 + }, + { + "epoch": 1.2696856772521063, + "grad_norm": 0.040038831532001495, + "learning_rate": 0.00015168549439668752, + "loss": 0.2837, + "step": 15673 + }, + { + "epoch": 1.2697666882696046, + "grad_norm": 0.04599372297525406, + "learning_rate": 0.0001516809937440929, + "loss": 0.3082, + "step": 15674 + }, + { + "epoch": 1.269847699287103, + "grad_norm": 0.047460220754146576, + "learning_rate": 0.00015167649309149827, + "loss": 0.3208, + "step": 15675 + }, + { + "epoch": 1.2699287103046015, + "grad_norm": 0.03879735246300697, + "learning_rate": 0.00015167199243890363, + "loss": 0.2752, + "step": 15676 + }, + { + "epoch": 1.2700097213220998, + "grad_norm": 0.04079611599445343, + "learning_rate": 0.00015166749178630902, + "loss": 0.2928, + "step": 15677 + }, + { + "epoch": 1.2700907323395982, + "grad_norm": 0.049497924745082855, + "learning_rate": 0.0001516629911337144, + "loss": 0.3384, + "step": 15678 + }, + { + "epoch": 1.2701717433570965, + "grad_norm": 0.03955889493227005, + "learning_rate": 0.00015165849048111977, + "loss": 0.285, + "step": 15679 + }, + { + "epoch": 1.270252754374595, + "grad_norm": 0.040690623223781586, + "learning_rate": 0.00015165398982852515, + "loss": 0.3007, + "step": 15680 + }, + { + "epoch": 1.2703337653920932, + "grad_norm": 0.047002773731946945, + "learning_rate": 0.00015164948917593051, + "loss": 0.3075, + "step": 15681 + }, + { + "epoch": 1.2704147764095917, + "grad_norm": 0.04874710366129875, + "learning_rate": 0.00015164498852333587, + "loss": 0.324, + "step": 15682 + }, + { + "epoch": 1.2704957874270901, + "grad_norm": 0.04769693315029144, + "learning_rate": 0.00015164048787074126, + "loss": 0.3241, + "step": 15683 + }, + { + "epoch": 
1.2705767984445884, + "grad_norm": 0.043867453932762146, + "learning_rate": 0.00015163598721814665, + "loss": 0.3269, + "step": 15684 + }, + { + "epoch": 1.2706578094620868, + "grad_norm": 0.04034234583377838, + "learning_rate": 0.000151631486565552, + "loss": 0.2962, + "step": 15685 + }, + { + "epoch": 1.2707388204795853, + "grad_norm": 0.04986928403377533, + "learning_rate": 0.0001516269859129574, + "loss": 0.3145, + "step": 15686 + }, + { + "epoch": 1.2708198314970836, + "grad_norm": 0.04775730147957802, + "learning_rate": 0.00015162248526036276, + "loss": 0.3141, + "step": 15687 + }, + { + "epoch": 1.270900842514582, + "grad_norm": 0.04053473100066185, + "learning_rate": 0.00015161798460776812, + "loss": 0.2885, + "step": 15688 + }, + { + "epoch": 1.2709818535320805, + "grad_norm": 0.042710866779088974, + "learning_rate": 0.0001516134839551735, + "loss": 0.3097, + "step": 15689 + }, + { + "epoch": 1.2710628645495787, + "grad_norm": 0.03983086347579956, + "learning_rate": 0.0001516089833025789, + "loss": 0.3149, + "step": 15690 + }, + { + "epoch": 1.2711438755670772, + "grad_norm": 0.04158749803900719, + "learning_rate": 0.00015160448264998425, + "loss": 0.2694, + "step": 15691 + }, + { + "epoch": 1.2712248865845754, + "grad_norm": 0.04513084515929222, + "learning_rate": 0.00015159998199738964, + "loss": 0.2985, + "step": 15692 + }, + { + "epoch": 1.271305897602074, + "grad_norm": 0.053826138377189636, + "learning_rate": 0.000151595481344795, + "loss": 0.3506, + "step": 15693 + }, + { + "epoch": 1.2713869086195722, + "grad_norm": 0.04571051150560379, + "learning_rate": 0.00015159098069220036, + "loss": 0.3144, + "step": 15694 + }, + { + "epoch": 1.2714679196370706, + "grad_norm": 0.04284343495965004, + "learning_rate": 0.00015158648003960577, + "loss": 0.3118, + "step": 15695 + }, + { + "epoch": 1.271548930654569, + "grad_norm": 0.04464377835392952, + "learning_rate": 0.00015158197938701113, + "loss": 0.3267, + "step": 15696 + }, + { + "epoch": 1.2716299416720673, + "grad_norm": 0.03773884475231171, + "learning_rate": 0.0001515774787344165, + "loss": 0.2651, + "step": 15697 + }, + { + "epoch": 1.2717109526895658, + "grad_norm": 0.051023904234170914, + "learning_rate": 0.00015157297808182188, + "loss": 0.3329, + "step": 15698 + }, + { + "epoch": 1.2717919637070643, + "grad_norm": 0.040893442928791046, + "learning_rate": 0.00015156847742922724, + "loss": 0.2699, + "step": 15699 + }, + { + "epoch": 1.2718729747245625, + "grad_norm": 0.06270833313465118, + "learning_rate": 0.0001515639767766326, + "loss": 0.3397, + "step": 15700 + }, + { + "epoch": 1.271953985742061, + "grad_norm": 0.043569017201662064, + "learning_rate": 0.000151559476124038, + "loss": 0.3259, + "step": 15701 + }, + { + "epoch": 1.2720349967595592, + "grad_norm": 0.04084454104304314, + "learning_rate": 0.00015155497547144337, + "loss": 0.3232, + "step": 15702 + }, + { + "epoch": 1.2721160077770577, + "grad_norm": 0.04653387516736984, + "learning_rate": 0.00015155047481884873, + "loss": 0.3565, + "step": 15703 + }, + { + "epoch": 1.272197018794556, + "grad_norm": 0.03843921422958374, + "learning_rate": 0.00015154597416625412, + "loss": 0.3202, + "step": 15704 + }, + { + "epoch": 1.2722780298120544, + "grad_norm": 0.04410295560956001, + "learning_rate": 0.00015154147351365948, + "loss": 0.2891, + "step": 15705 + }, + { + "epoch": 1.2723590408295529, + "grad_norm": 0.04347636550664902, + "learning_rate": 0.00015153697286106484, + "loss": 0.3228, + "step": 15706 + }, + { + "epoch": 1.2724400518470511, + "grad_norm": 
0.043565716594457626, + "learning_rate": 0.00015153247220847025, + "loss": 0.3351, + "step": 15707 + }, + { + "epoch": 1.2725210628645496, + "grad_norm": 0.040979839861392975, + "learning_rate": 0.00015152797155587561, + "loss": 0.3374, + "step": 15708 + }, + { + "epoch": 1.272602073882048, + "grad_norm": 0.04642684757709503, + "learning_rate": 0.00015152347090328097, + "loss": 0.3309, + "step": 15709 + }, + { + "epoch": 1.2726830848995463, + "grad_norm": 0.04228156432509422, + "learning_rate": 0.00015151897025068636, + "loss": 0.3325, + "step": 15710 + }, + { + "epoch": 1.2727640959170448, + "grad_norm": 0.038930002599954605, + "learning_rate": 0.00015151446959809172, + "loss": 0.3278, + "step": 15711 + }, + { + "epoch": 1.2728451069345432, + "grad_norm": 0.04951472580432892, + "learning_rate": 0.00015150996894549708, + "loss": 0.3123, + "step": 15712 + }, + { + "epoch": 1.2729261179520415, + "grad_norm": 0.043662507086992264, + "learning_rate": 0.0001515054682929025, + "loss": 0.2975, + "step": 15713 + }, + { + "epoch": 1.27300712896954, + "grad_norm": 0.04310872405767441, + "learning_rate": 0.00015150096764030786, + "loss": 0.2943, + "step": 15714 + }, + { + "epoch": 1.2730881399870382, + "grad_norm": 0.04810471460223198, + "learning_rate": 0.00015149646698771322, + "loss": 0.2975, + "step": 15715 + }, + { + "epoch": 1.2731691510045366, + "grad_norm": 0.043445102870464325, + "learning_rate": 0.0001514919663351186, + "loss": 0.2829, + "step": 15716 + }, + { + "epoch": 1.273250162022035, + "grad_norm": 0.03980591893196106, + "learning_rate": 0.00015148746568252396, + "loss": 0.2977, + "step": 15717 + }, + { + "epoch": 1.2733311730395334, + "grad_norm": 0.04535724222660065, + "learning_rate": 0.00015148296502992935, + "loss": 0.332, + "step": 15718 + }, + { + "epoch": 1.2734121840570318, + "grad_norm": 0.04394984245300293, + "learning_rate": 0.00015147846437733474, + "loss": 0.2866, + "step": 15719 + }, + { + "epoch": 1.27349319507453, + "grad_norm": 0.04253106936812401, + "learning_rate": 0.0001514739637247401, + "loss": 0.3218, + "step": 15720 + }, + { + "epoch": 1.2735742060920285, + "grad_norm": 0.04656419903039932, + "learning_rate": 0.00015146946307214546, + "loss": 0.28, + "step": 15721 + }, + { + "epoch": 1.273655217109527, + "grad_norm": 0.038466498255729675, + "learning_rate": 0.00015146496241955085, + "loss": 0.2783, + "step": 15722 + }, + { + "epoch": 1.2737362281270252, + "grad_norm": 0.03624296188354492, + "learning_rate": 0.0001514604617669562, + "loss": 0.2554, + "step": 15723 + }, + { + "epoch": 1.2738172391445237, + "grad_norm": 0.048857830464839935, + "learning_rate": 0.0001514559611143616, + "loss": 0.3126, + "step": 15724 + }, + { + "epoch": 1.273898250162022, + "grad_norm": 0.04408116638660431, + "learning_rate": 0.00015145146046176698, + "loss": 0.2878, + "step": 15725 + }, + { + "epoch": 1.2739792611795204, + "grad_norm": 0.04815554991364479, + "learning_rate": 0.00015144695980917234, + "loss": 0.3494, + "step": 15726 + }, + { + "epoch": 1.2740602721970187, + "grad_norm": 0.04634399339556694, + "learning_rate": 0.0001514424591565777, + "loss": 0.3288, + "step": 15727 + }, + { + "epoch": 1.2741412832145171, + "grad_norm": 0.044304654002189636, + "learning_rate": 0.0001514379585039831, + "loss": 0.2857, + "step": 15728 + }, + { + "epoch": 1.2742222942320156, + "grad_norm": 0.04345221072435379, + "learning_rate": 0.00015143345785138845, + "loss": 0.3198, + "step": 15729 + }, + { + "epoch": 1.2743033052495139, + "grad_norm": 0.038415733724832535, + "learning_rate": 
0.00015142895719879383, + "loss": 0.2673, + "step": 15730 + }, + { + "epoch": 1.2743843162670123, + "grad_norm": 0.047189049422740936, + "learning_rate": 0.00015142445654619922, + "loss": 0.3257, + "step": 15731 + }, + { + "epoch": 1.2744653272845108, + "grad_norm": 0.053995609283447266, + "learning_rate": 0.00015141995589360458, + "loss": 0.3678, + "step": 15732 + }, + { + "epoch": 1.274546338302009, + "grad_norm": 0.04572221264243126, + "learning_rate": 0.00015141545524100994, + "loss": 0.2847, + "step": 15733 + }, + { + "epoch": 1.2746273493195075, + "grad_norm": 0.03478986769914627, + "learning_rate": 0.00015141095458841533, + "loss": 0.248, + "step": 15734 + }, + { + "epoch": 1.274708360337006, + "grad_norm": 0.04654289036989212, + "learning_rate": 0.0001514064539358207, + "loss": 0.3294, + "step": 15735 + }, + { + "epoch": 1.2747893713545042, + "grad_norm": 0.05372928828001022, + "learning_rate": 0.00015140195328322608, + "loss": 0.3601, + "step": 15736 + }, + { + "epoch": 1.2748703823720027, + "grad_norm": 0.044612761586904526, + "learning_rate": 0.00015139745263063146, + "loss": 0.2979, + "step": 15737 + }, + { + "epoch": 1.274951393389501, + "grad_norm": 0.04504215344786644, + "learning_rate": 0.00015139295197803682, + "loss": 0.2803, + "step": 15738 + }, + { + "epoch": 1.2750324044069994, + "grad_norm": 0.037995193153619766, + "learning_rate": 0.00015138845132544218, + "loss": 0.296, + "step": 15739 + }, + { + "epoch": 1.2751134154244976, + "grad_norm": 0.042726241052150726, + "learning_rate": 0.00015138395067284757, + "loss": 0.2689, + "step": 15740 + }, + { + "epoch": 1.275194426441996, + "grad_norm": 0.04570639133453369, + "learning_rate": 0.00015137945002025293, + "loss": 0.2965, + "step": 15741 + }, + { + "epoch": 1.2752754374594946, + "grad_norm": 0.04122673347592354, + "learning_rate": 0.00015137494936765832, + "loss": 0.305, + "step": 15742 + }, + { + "epoch": 1.2753564484769928, + "grad_norm": 0.058693546801805496, + "learning_rate": 0.0001513704487150637, + "loss": 0.3323, + "step": 15743 + }, + { + "epoch": 1.2754374594944913, + "grad_norm": 0.03962014988064766, + "learning_rate": 0.00015136594806246906, + "loss": 0.2568, + "step": 15744 + }, + { + "epoch": 1.2755184705119897, + "grad_norm": 0.04162263125181198, + "learning_rate": 0.00015136144740987442, + "loss": 0.2972, + "step": 15745 + }, + { + "epoch": 1.275599481529488, + "grad_norm": 0.0414293073117733, + "learning_rate": 0.0001513569467572798, + "loss": 0.2943, + "step": 15746 + }, + { + "epoch": 1.2756804925469865, + "grad_norm": 0.036952096968889236, + "learning_rate": 0.0001513524461046852, + "loss": 0.2626, + "step": 15747 + }, + { + "epoch": 1.2757615035644847, + "grad_norm": 0.03710842505097389, + "learning_rate": 0.00015134794545209056, + "loss": 0.286, + "step": 15748 + }, + { + "epoch": 1.2758425145819832, + "grad_norm": 0.04875032976269722, + "learning_rate": 0.00015134344479949595, + "loss": 0.3319, + "step": 15749 + }, + { + "epoch": 1.2759235255994814, + "grad_norm": 0.03967723250389099, + "learning_rate": 0.0001513389441469013, + "loss": 0.2607, + "step": 15750 + }, + { + "epoch": 1.2760045366169799, + "grad_norm": 0.04703553766012192, + "learning_rate": 0.00015133444349430667, + "loss": 0.3261, + "step": 15751 + }, + { + "epoch": 1.2760855476344783, + "grad_norm": 0.04855358600616455, + "learning_rate": 0.00015132994284171205, + "loss": 0.3398, + "step": 15752 + }, + { + "epoch": 1.2761665586519766, + "grad_norm": 0.038160208612680435, + "learning_rate": 0.00015132544218911744, + "loss": 0.284, + 
"step": 15753 + }, + { + "epoch": 1.276247569669475, + "grad_norm": 0.05315076559782028, + "learning_rate": 0.0001513209415365228, + "loss": 0.2767, + "step": 15754 + }, + { + "epoch": 1.2763285806869735, + "grad_norm": 0.05116632208228111, + "learning_rate": 0.0001513164408839282, + "loss": 0.319, + "step": 15755 + }, + { + "epoch": 1.2764095917044718, + "grad_norm": 0.041848476976156235, + "learning_rate": 0.00015131194023133355, + "loss": 0.2921, + "step": 15756 + }, + { + "epoch": 1.2764906027219702, + "grad_norm": 0.03994974493980408, + "learning_rate": 0.0001513074395787389, + "loss": 0.2873, + "step": 15757 + }, + { + "epoch": 1.2765716137394687, + "grad_norm": 0.04145783931016922, + "learning_rate": 0.0001513029389261443, + "loss": 0.263, + "step": 15758 + }, + { + "epoch": 1.276652624756967, + "grad_norm": 0.045399490743875504, + "learning_rate": 0.00015129843827354968, + "loss": 0.2765, + "step": 15759 + }, + { + "epoch": 1.2767336357744652, + "grad_norm": 0.041569847613573074, + "learning_rate": 0.00015129393762095504, + "loss": 0.3211, + "step": 15760 + }, + { + "epoch": 1.2768146467919637, + "grad_norm": 0.04491395503282547, + "learning_rate": 0.00015128943696836043, + "loss": 0.2978, + "step": 15761 + }, + { + "epoch": 1.2768956578094621, + "grad_norm": 0.04015737026929855, + "learning_rate": 0.0001512849363157658, + "loss": 0.2363, + "step": 15762 + }, + { + "epoch": 1.2769766688269604, + "grad_norm": 0.05360627546906471, + "learning_rate": 0.00015128043566317115, + "loss": 0.3579, + "step": 15763 + }, + { + "epoch": 1.2770576798444588, + "grad_norm": 0.04414328187704086, + "learning_rate": 0.00015127593501057654, + "loss": 0.3453, + "step": 15764 + }, + { + "epoch": 1.2771386908619573, + "grad_norm": 0.04586583375930786, + "learning_rate": 0.00015127143435798192, + "loss": 0.3126, + "step": 15765 + }, + { + "epoch": 1.2772197018794555, + "grad_norm": 0.04022606834769249, + "learning_rate": 0.00015126693370538728, + "loss": 0.3004, + "step": 15766 + }, + { + "epoch": 1.277300712896954, + "grad_norm": 0.04480239376425743, + "learning_rate": 0.00015126243305279267, + "loss": 0.2437, + "step": 15767 + }, + { + "epoch": 1.2773817239144525, + "grad_norm": 0.048118509352207184, + "learning_rate": 0.00015125793240019803, + "loss": 0.358, + "step": 15768 + }, + { + "epoch": 1.2774627349319507, + "grad_norm": 0.043030571192502975, + "learning_rate": 0.0001512534317476034, + "loss": 0.2872, + "step": 15769 + }, + { + "epoch": 1.2775437459494492, + "grad_norm": 0.04102887958288193, + "learning_rate": 0.0001512489310950088, + "loss": 0.3027, + "step": 15770 + }, + { + "epoch": 1.2776247569669474, + "grad_norm": 0.04139826446771622, + "learning_rate": 0.00015124443044241417, + "loss": 0.2893, + "step": 15771 + }, + { + "epoch": 1.277705767984446, + "grad_norm": 0.045518554747104645, + "learning_rate": 0.00015123992978981953, + "loss": 0.29, + "step": 15772 + }, + { + "epoch": 1.2777867790019442, + "grad_norm": 0.04312366247177124, + "learning_rate": 0.0001512354291372249, + "loss": 0.3286, + "step": 15773 + }, + { + "epoch": 1.2778677900194426, + "grad_norm": 0.05005163326859474, + "learning_rate": 0.00015123092848463027, + "loss": 0.2689, + "step": 15774 + }, + { + "epoch": 1.277948801036941, + "grad_norm": 0.04603840783238411, + "learning_rate": 0.00015122642783203563, + "loss": 0.3112, + "step": 15775 + }, + { + "epoch": 1.2780298120544393, + "grad_norm": 0.03954975679516792, + "learning_rate": 0.00015122192717944105, + "loss": 0.2838, + "step": 15776 + }, + { + "epoch": 
1.2781108230719378, + "grad_norm": 0.04375135526061058, + "learning_rate": 0.0001512174265268464, + "loss": 0.3219, + "step": 15777 + }, + { + "epoch": 1.2781918340894363, + "grad_norm": 0.048291079699993134, + "learning_rate": 0.00015121292587425177, + "loss": 0.3121, + "step": 15778 + }, + { + "epoch": 1.2782728451069345, + "grad_norm": 0.043258678168058395, + "learning_rate": 0.00015120842522165715, + "loss": 0.2719, + "step": 15779 + }, + { + "epoch": 1.278353856124433, + "grad_norm": 0.0422712042927742, + "learning_rate": 0.00015120392456906251, + "loss": 0.3121, + "step": 15780 + }, + { + "epoch": 1.2784348671419314, + "grad_norm": 0.04089551419019699, + "learning_rate": 0.00015119942391646787, + "loss": 0.2963, + "step": 15781 + }, + { + "epoch": 1.2785158781594297, + "grad_norm": 0.051967184990644455, + "learning_rate": 0.0001511949232638733, + "loss": 0.3152, + "step": 15782 + }, + { + "epoch": 1.278596889176928, + "grad_norm": 0.038720134645700455, + "learning_rate": 0.00015119042261127865, + "loss": 0.2836, + "step": 15783 + }, + { + "epoch": 1.2786779001944264, + "grad_norm": 0.04551508277654648, + "learning_rate": 0.000151185921958684, + "loss": 0.3157, + "step": 15784 + }, + { + "epoch": 1.2787589112119249, + "grad_norm": 0.045961856842041016, + "learning_rate": 0.0001511814213060894, + "loss": 0.3197, + "step": 15785 + }, + { + "epoch": 1.278839922229423, + "grad_norm": 0.04323948547244072, + "learning_rate": 0.00015117692065349476, + "loss": 0.3239, + "step": 15786 + }, + { + "epoch": 1.2789209332469216, + "grad_norm": 0.04054180532693863, + "learning_rate": 0.00015117242000090012, + "loss": 0.2835, + "step": 15787 + }, + { + "epoch": 1.27900194426442, + "grad_norm": 0.04344893619418144, + "learning_rate": 0.00015116791934830553, + "loss": 0.3076, + "step": 15788 + }, + { + "epoch": 1.2790829552819183, + "grad_norm": 0.042704347521066666, + "learning_rate": 0.0001511634186957109, + "loss": 0.3257, + "step": 15789 + }, + { + "epoch": 1.2791639662994168, + "grad_norm": 0.03959158435463905, + "learning_rate": 0.00015115891804311625, + "loss": 0.2839, + "step": 15790 + }, + { + "epoch": 1.2792449773169152, + "grad_norm": 0.04714252054691315, + "learning_rate": 0.00015115441739052164, + "loss": 0.2966, + "step": 15791 + }, + { + "epoch": 1.2793259883344135, + "grad_norm": 0.04205985367298126, + "learning_rate": 0.000151149916737927, + "loss": 0.2981, + "step": 15792 + }, + { + "epoch": 1.279406999351912, + "grad_norm": 0.05052034929394722, + "learning_rate": 0.00015114541608533236, + "loss": 0.3106, + "step": 15793 + }, + { + "epoch": 1.2794880103694102, + "grad_norm": 0.04639451950788498, + "learning_rate": 0.00015114091543273777, + "loss": 0.3283, + "step": 15794 + }, + { + "epoch": 1.2795690213869086, + "grad_norm": 0.04097670689225197, + "learning_rate": 0.00015113641478014313, + "loss": 0.2752, + "step": 15795 + }, + { + "epoch": 1.279650032404407, + "grad_norm": 0.0460551492869854, + "learning_rate": 0.0001511319141275485, + "loss": 0.2972, + "step": 15796 + }, + { + "epoch": 1.2797310434219054, + "grad_norm": 0.03947293758392334, + "learning_rate": 0.00015112741347495388, + "loss": 0.331, + "step": 15797 + }, + { + "epoch": 1.2798120544394038, + "grad_norm": 0.039245136082172394, + "learning_rate": 0.00015112291282235924, + "loss": 0.2845, + "step": 15798 + }, + { + "epoch": 1.279893065456902, + "grad_norm": 0.058283090591430664, + "learning_rate": 0.00015111841216976463, + "loss": 0.3352, + "step": 15799 + }, + { + "epoch": 1.2799740764744005, + "grad_norm": 
0.04156630113720894, + "learning_rate": 0.00015111391151717, + "loss": 0.3177, + "step": 15800 + }, + { + "epoch": 1.280055087491899, + "grad_norm": 0.04782935976982117, + "learning_rate": 0.00015110941086457537, + "loss": 0.3109, + "step": 15801 + }, + { + "epoch": 1.2801360985093972, + "grad_norm": 0.047765400260686874, + "learning_rate": 0.00015110491021198073, + "loss": 0.3176, + "step": 15802 + }, + { + "epoch": 1.2802171095268957, + "grad_norm": 0.045515209436416626, + "learning_rate": 0.00015110040955938612, + "loss": 0.3055, + "step": 15803 + }, + { + "epoch": 1.280298120544394, + "grad_norm": 0.041018128395080566, + "learning_rate": 0.00015109590890679148, + "loss": 0.311, + "step": 15804 + }, + { + "epoch": 1.2803791315618924, + "grad_norm": 0.037294745445251465, + "learning_rate": 0.00015109140825419687, + "loss": 0.2812, + "step": 15805 + }, + { + "epoch": 1.2804601425793907, + "grad_norm": 0.0468532033264637, + "learning_rate": 0.00015108690760160225, + "loss": 0.3211, + "step": 15806 + }, + { + "epoch": 1.2805411535968891, + "grad_norm": 0.039705317467451096, + "learning_rate": 0.00015108240694900761, + "loss": 0.2898, + "step": 15807 + }, + { + "epoch": 1.2806221646143876, + "grad_norm": 0.03788604214787483, + "learning_rate": 0.00015107790629641297, + "loss": 0.2577, + "step": 15808 + }, + { + "epoch": 1.2807031756318858, + "grad_norm": 0.04811332747340202, + "learning_rate": 0.00015107340564381836, + "loss": 0.3519, + "step": 15809 + }, + { + "epoch": 1.2807841866493843, + "grad_norm": 0.03987358510494232, + "learning_rate": 0.00015106890499122372, + "loss": 0.2791, + "step": 15810 + }, + { + "epoch": 1.2808651976668828, + "grad_norm": 0.050667937844991684, + "learning_rate": 0.0001510644043386291, + "loss": 0.3111, + "step": 15811 + }, + { + "epoch": 1.280946208684381, + "grad_norm": 0.04332325607538223, + "learning_rate": 0.0001510599036860345, + "loss": 0.2667, + "step": 15812 + }, + { + "epoch": 1.2810272197018795, + "grad_norm": 0.041449159383773804, + "learning_rate": 0.00015105540303343986, + "loss": 0.2847, + "step": 15813 + }, + { + "epoch": 1.281108230719378, + "grad_norm": 0.03840943053364754, + "learning_rate": 0.00015105090238084522, + "loss": 0.2873, + "step": 15814 + }, + { + "epoch": 1.2811892417368762, + "grad_norm": 0.043712034821510315, + "learning_rate": 0.0001510464017282506, + "loss": 0.3275, + "step": 15815 + }, + { + "epoch": 1.2812702527543747, + "grad_norm": 0.062320683151483536, + "learning_rate": 0.00015104190107565596, + "loss": 0.3337, + "step": 15816 + }, + { + "epoch": 1.281351263771873, + "grad_norm": 0.040366072207689285, + "learning_rate": 0.00015103740042306135, + "loss": 0.2903, + "step": 15817 + }, + { + "epoch": 1.2814322747893714, + "grad_norm": 0.05467378348112106, + "learning_rate": 0.00015103289977046674, + "loss": 0.3829, + "step": 15818 + }, + { + "epoch": 1.2815132858068696, + "grad_norm": 0.04404067620635033, + "learning_rate": 0.0001510283991178721, + "loss": 0.3029, + "step": 15819 + }, + { + "epoch": 1.281594296824368, + "grad_norm": 0.0416707769036293, + "learning_rate": 0.00015102389846527746, + "loss": 0.2667, + "step": 15820 + }, + { + "epoch": 1.2816753078418666, + "grad_norm": 0.051012761890888214, + "learning_rate": 0.00015101939781268285, + "loss": 0.3023, + "step": 15821 + }, + { + "epoch": 1.2817563188593648, + "grad_norm": 0.04759092628955841, + "learning_rate": 0.0001510148971600882, + "loss": 0.2905, + "step": 15822 + }, + { + "epoch": 1.2818373298768633, + "grad_norm": 0.04226645827293396, + "learning_rate": 
0.0001510103965074936, + "loss": 0.3202, + "step": 15823 + }, + { + "epoch": 1.2819183408943617, + "grad_norm": 0.04447139799594879, + "learning_rate": 0.00015100589585489898, + "loss": 0.349, + "step": 15824 + }, + { + "epoch": 1.28199935191186, + "grad_norm": 0.04605408385396004, + "learning_rate": 0.00015100139520230434, + "loss": 0.311, + "step": 15825 + }, + { + "epoch": 1.2820803629293585, + "grad_norm": 0.05514140799641609, + "learning_rate": 0.0001509968945497097, + "loss": 0.3447, + "step": 15826 + }, + { + "epoch": 1.2821613739468567, + "grad_norm": 0.04618806391954422, + "learning_rate": 0.0001509923938971151, + "loss": 0.3194, + "step": 15827 + }, + { + "epoch": 1.2822423849643552, + "grad_norm": 0.04209521412849426, + "learning_rate": 0.00015098789324452047, + "loss": 0.2996, + "step": 15828 + }, + { + "epoch": 1.2823233959818534, + "grad_norm": 0.04071464017033577, + "learning_rate": 0.00015098339259192583, + "loss": 0.291, + "step": 15829 + }, + { + "epoch": 1.2824044069993519, + "grad_norm": 0.048178721219301224, + "learning_rate": 0.00015097889193933122, + "loss": 0.3175, + "step": 15830 + }, + { + "epoch": 1.2824854180168503, + "grad_norm": 0.04599504545331001, + "learning_rate": 0.00015097439128673658, + "loss": 0.2919, + "step": 15831 + }, + { + "epoch": 1.2825664290343486, + "grad_norm": 0.04587290808558464, + "learning_rate": 0.00015096989063414194, + "loss": 0.3469, + "step": 15832 + }, + { + "epoch": 1.282647440051847, + "grad_norm": 0.04044627398252487, + "learning_rate": 0.00015096538998154733, + "loss": 0.2704, + "step": 15833 + }, + { + "epoch": 1.2827284510693455, + "grad_norm": 0.0455966591835022, + "learning_rate": 0.00015096088932895272, + "loss": 0.3059, + "step": 15834 + }, + { + "epoch": 1.2828094620868438, + "grad_norm": 0.04464682191610336, + "learning_rate": 0.00015095638867635808, + "loss": 0.2863, + "step": 15835 + }, + { + "epoch": 1.2828904731043422, + "grad_norm": 0.04729429632425308, + "learning_rate": 0.00015095188802376346, + "loss": 0.3079, + "step": 15836 + }, + { + "epoch": 1.2829714841218407, + "grad_norm": 0.04429822415113449, + "learning_rate": 0.00015094738737116882, + "loss": 0.2738, + "step": 15837 + }, + { + "epoch": 1.283052495139339, + "grad_norm": 0.04838932305574417, + "learning_rate": 0.00015094288671857418, + "loss": 0.3454, + "step": 15838 + }, + { + "epoch": 1.2831335061568374, + "grad_norm": 0.0450855977833271, + "learning_rate": 0.00015093838606597957, + "loss": 0.2738, + "step": 15839 + }, + { + "epoch": 1.2832145171743357, + "grad_norm": 0.044912971556186676, + "learning_rate": 0.00015093388541338496, + "loss": 0.3287, + "step": 15840 + }, + { + "epoch": 1.2832955281918341, + "grad_norm": 0.05100622400641441, + "learning_rate": 0.00015092938476079032, + "loss": 0.337, + "step": 15841 + }, + { + "epoch": 1.2833765392093324, + "grad_norm": 0.043419577181339264, + "learning_rate": 0.0001509248841081957, + "loss": 0.2982, + "step": 15842 + }, + { + "epoch": 1.2834575502268308, + "grad_norm": 0.04492282122373581, + "learning_rate": 0.00015092038345560106, + "loss": 0.2882, + "step": 15843 + }, + { + "epoch": 1.2835385612443293, + "grad_norm": 0.04124194756150246, + "learning_rate": 0.00015091588280300642, + "loss": 0.2766, + "step": 15844 + }, + { + "epoch": 1.2836195722618275, + "grad_norm": 0.04995120316743851, + "learning_rate": 0.0001509113821504118, + "loss": 0.3657, + "step": 15845 + }, + { + "epoch": 1.283700583279326, + "grad_norm": 0.03788880258798599, + "learning_rate": 0.0001509068814978172, + "loss": 0.2765, + 
"step": 15846 + }, + { + "epoch": 1.2837815942968245, + "grad_norm": 0.04601196572184563, + "learning_rate": 0.00015090238084522256, + "loss": 0.295, + "step": 15847 + }, + { + "epoch": 1.2838626053143227, + "grad_norm": 0.03844109922647476, + "learning_rate": 0.00015089788019262795, + "loss": 0.2282, + "step": 15848 + }, + { + "epoch": 1.2839436163318212, + "grad_norm": 0.03849412500858307, + "learning_rate": 0.0001508933795400333, + "loss": 0.2936, + "step": 15849 + }, + { + "epoch": 1.2840246273493194, + "grad_norm": 0.03941682353615761, + "learning_rate": 0.00015088887888743867, + "loss": 0.2896, + "step": 15850 + }, + { + "epoch": 1.284105638366818, + "grad_norm": 0.038460344076156616, + "learning_rate": 0.00015088437823484408, + "loss": 0.3021, + "step": 15851 + }, + { + "epoch": 1.2841866493843161, + "grad_norm": 0.0497819148004055, + "learning_rate": 0.00015087987758224944, + "loss": 0.3235, + "step": 15852 + }, + { + "epoch": 1.2842676604018146, + "grad_norm": 0.048957642167806625, + "learning_rate": 0.0001508753769296548, + "loss": 0.3431, + "step": 15853 + }, + { + "epoch": 1.284348671419313, + "grad_norm": 0.051105812191963196, + "learning_rate": 0.0001508708762770602, + "loss": 0.3352, + "step": 15854 + }, + { + "epoch": 1.2844296824368113, + "grad_norm": 0.04054543375968933, + "learning_rate": 0.00015086637562446555, + "loss": 0.2647, + "step": 15855 + }, + { + "epoch": 1.2845106934543098, + "grad_norm": 0.03664393350481987, + "learning_rate": 0.0001508618749718709, + "loss": 0.2576, + "step": 15856 + }, + { + "epoch": 1.2845917044718083, + "grad_norm": 0.04197325557470322, + "learning_rate": 0.00015085737431927632, + "loss": 0.2832, + "step": 15857 + }, + { + "epoch": 1.2846727154893065, + "grad_norm": 0.04285228252410889, + "learning_rate": 0.00015085287366668168, + "loss": 0.2866, + "step": 15858 + }, + { + "epoch": 1.284753726506805, + "grad_norm": 0.04115356504917145, + "learning_rate": 0.00015084837301408704, + "loss": 0.3039, + "step": 15859 + }, + { + "epoch": 1.2848347375243034, + "grad_norm": 0.04148733615875244, + "learning_rate": 0.00015084387236149243, + "loss": 0.2662, + "step": 15860 + }, + { + "epoch": 1.2849157485418017, + "grad_norm": 0.04504326730966568, + "learning_rate": 0.0001508393717088978, + "loss": 0.303, + "step": 15861 + }, + { + "epoch": 1.2849967595593, + "grad_norm": 0.04083064943552017, + "learning_rate": 0.00015083487105630315, + "loss": 0.2733, + "step": 15862 + }, + { + "epoch": 1.2850777705767984, + "grad_norm": 0.049322277307510376, + "learning_rate": 0.00015083037040370856, + "loss": 0.3014, + "step": 15863 + }, + { + "epoch": 1.2851587815942969, + "grad_norm": 0.049569688737392426, + "learning_rate": 0.00015082586975111392, + "loss": 0.2967, + "step": 15864 + }, + { + "epoch": 1.285239792611795, + "grad_norm": 0.0467229038476944, + "learning_rate": 0.00015082136909851928, + "loss": 0.3244, + "step": 15865 + }, + { + "epoch": 1.2853208036292936, + "grad_norm": 0.04364887624979019, + "learning_rate": 0.00015081686844592467, + "loss": 0.2963, + "step": 15866 + }, + { + "epoch": 1.285401814646792, + "grad_norm": 0.04608934745192528, + "learning_rate": 0.00015081236779333003, + "loss": 0.3102, + "step": 15867 + }, + { + "epoch": 1.2854828256642903, + "grad_norm": 0.055260661989450455, + "learning_rate": 0.0001508078671407354, + "loss": 0.3336, + "step": 15868 + }, + { + "epoch": 1.2855638366817888, + "grad_norm": 0.046738460659980774, + "learning_rate": 0.0001508033664881408, + "loss": 0.3034, + "step": 15869 + }, + { + "epoch": 
1.2856448476992872, + "grad_norm": 0.03964695334434509, + "learning_rate": 0.00015079886583554617, + "loss": 0.2938, + "step": 15870 + }, + { + "epoch": 1.2857258587167855, + "grad_norm": 0.041268426924943924, + "learning_rate": 0.00015079436518295153, + "loss": 0.2884, + "step": 15871 + }, + { + "epoch": 1.285806869734284, + "grad_norm": 0.04362771660089493, + "learning_rate": 0.0001507898645303569, + "loss": 0.2964, + "step": 15872 + }, + { + "epoch": 1.2858878807517822, + "grad_norm": 0.04615508392453194, + "learning_rate": 0.00015078536387776227, + "loss": 0.2959, + "step": 15873 + }, + { + "epoch": 1.2859688917692806, + "grad_norm": 0.040400903671979904, + "learning_rate": 0.00015078086322516763, + "loss": 0.2843, + "step": 15874 + }, + { + "epoch": 1.2860499027867789, + "grad_norm": 0.05180887505412102, + "learning_rate": 0.00015077636257257305, + "loss": 0.3766, + "step": 15875 + }, + { + "epoch": 1.2861309138042774, + "grad_norm": 0.049692705273628235, + "learning_rate": 0.0001507718619199784, + "loss": 0.3236, + "step": 15876 + }, + { + "epoch": 1.2862119248217758, + "grad_norm": 0.044623278081417084, + "learning_rate": 0.00015076736126738377, + "loss": 0.3004, + "step": 15877 + }, + { + "epoch": 1.286292935839274, + "grad_norm": 0.04762900993227959, + "learning_rate": 0.00015076286061478915, + "loss": 0.3315, + "step": 15878 + }, + { + "epoch": 1.2863739468567725, + "grad_norm": 0.054607536643743515, + "learning_rate": 0.00015075835996219451, + "loss": 0.3493, + "step": 15879 + }, + { + "epoch": 1.286454957874271, + "grad_norm": 0.044743411242961884, + "learning_rate": 0.0001507538593095999, + "loss": 0.2617, + "step": 15880 + }, + { + "epoch": 1.2865359688917692, + "grad_norm": 0.046804748475551605, + "learning_rate": 0.0001507493586570053, + "loss": 0.3197, + "step": 15881 + }, + { + "epoch": 1.2866169799092677, + "grad_norm": 0.03905186429619789, + "learning_rate": 0.00015074485800441065, + "loss": 0.2653, + "step": 15882 + }, + { + "epoch": 1.2866979909267662, + "grad_norm": 0.04008020833134651, + "learning_rate": 0.000150740357351816, + "loss": 0.3009, + "step": 15883 + }, + { + "epoch": 1.2867790019442644, + "grad_norm": 0.039590250700712204, + "learning_rate": 0.0001507358566992214, + "loss": 0.2968, + "step": 15884 + }, + { + "epoch": 1.2868600129617627, + "grad_norm": 0.0442819744348526, + "learning_rate": 0.00015073135604662676, + "loss": 0.3143, + "step": 15885 + }, + { + "epoch": 1.2869410239792611, + "grad_norm": 0.04467121511697769, + "learning_rate": 0.00015072685539403214, + "loss": 0.289, + "step": 15886 + }, + { + "epoch": 1.2870220349967596, + "grad_norm": 0.045479509979486465, + "learning_rate": 0.00015072235474143753, + "loss": 0.3054, + "step": 15887 + }, + { + "epoch": 1.2871030460142578, + "grad_norm": 0.043260522186756134, + "learning_rate": 0.0001507178540888429, + "loss": 0.3363, + "step": 15888 + }, + { + "epoch": 1.2871840570317563, + "grad_norm": 0.0441555455327034, + "learning_rate": 0.00015071335343624825, + "loss": 0.295, + "step": 15889 + }, + { + "epoch": 1.2872650680492548, + "grad_norm": 0.04531952738761902, + "learning_rate": 0.00015070885278365364, + "loss": 0.3141, + "step": 15890 + }, + { + "epoch": 1.287346079066753, + "grad_norm": 0.03444262593984604, + "learning_rate": 0.000150704352131059, + "loss": 0.2738, + "step": 15891 + }, + { + "epoch": 1.2874270900842515, + "grad_norm": 0.04534309729933739, + "learning_rate": 0.00015069985147846438, + "loss": 0.2784, + "step": 15892 + }, + { + "epoch": 1.28750810110175, + "grad_norm": 
0.042812563478946686, + "learning_rate": 0.00015069535082586977, + "loss": 0.2636, + "step": 15893 + }, + { + "epoch": 1.2875891121192482, + "grad_norm": 0.05558900535106659, + "learning_rate": 0.00015069085017327513, + "loss": 0.3311, + "step": 15894 + }, + { + "epoch": 1.2876701231367467, + "grad_norm": 0.046561017632484436, + "learning_rate": 0.0001506863495206805, + "loss": 0.3049, + "step": 15895 + }, + { + "epoch": 1.287751134154245, + "grad_norm": 0.04676572233438492, + "learning_rate": 0.00015068184886808588, + "loss": 0.317, + "step": 15896 + }, + { + "epoch": 1.2878321451717434, + "grad_norm": 0.04445261508226395, + "learning_rate": 0.00015067734821549124, + "loss": 0.2855, + "step": 15897 + }, + { + "epoch": 1.2879131561892416, + "grad_norm": 0.039349205791950226, + "learning_rate": 0.00015067284756289663, + "loss": 0.2909, + "step": 15898 + }, + { + "epoch": 1.28799416720674, + "grad_norm": 0.041501231491565704, + "learning_rate": 0.000150668346910302, + "loss": 0.2833, + "step": 15899 + }, + { + "epoch": 1.2880751782242386, + "grad_norm": 0.04623178020119667, + "learning_rate": 0.00015066384625770737, + "loss": 0.2798, + "step": 15900 + }, + { + "epoch": 1.2881561892417368, + "grad_norm": 0.03733741492033005, + "learning_rate": 0.00015065934560511273, + "loss": 0.2776, + "step": 15901 + }, + { + "epoch": 1.2882372002592353, + "grad_norm": 0.0544278547167778, + "learning_rate": 0.00015065484495251812, + "loss": 0.289, + "step": 15902 + }, + { + "epoch": 1.2883182112767337, + "grad_norm": 0.0542629212141037, + "learning_rate": 0.0001506503442999235, + "loss": 0.2831, + "step": 15903 + }, + { + "epoch": 1.288399222294232, + "grad_norm": 0.04090864211320877, + "learning_rate": 0.00015064584364732887, + "loss": 0.2856, + "step": 15904 + }, + { + "epoch": 1.2884802333117304, + "grad_norm": 0.04039837047457695, + "learning_rate": 0.00015064134299473426, + "loss": 0.2829, + "step": 15905 + }, + { + "epoch": 1.2885612443292287, + "grad_norm": 0.039326541125774384, + "learning_rate": 0.00015063684234213962, + "loss": 0.3147, + "step": 15906 + }, + { + "epoch": 1.2886422553467272, + "grad_norm": 0.040783412754535675, + "learning_rate": 0.00015063234168954498, + "loss": 0.2949, + "step": 15907 + }, + { + "epoch": 1.2887232663642254, + "grad_norm": 0.04851400852203369, + "learning_rate": 0.00015062784103695036, + "loss": 0.3431, + "step": 15908 + }, + { + "epoch": 1.2888042773817239, + "grad_norm": 0.05781017243862152, + "learning_rate": 0.00015062334038435575, + "loss": 0.3282, + "step": 15909 + }, + { + "epoch": 1.2888852883992223, + "grad_norm": 0.047731805592775345, + "learning_rate": 0.0001506188397317611, + "loss": 0.3127, + "step": 15910 + }, + { + "epoch": 1.2889662994167206, + "grad_norm": 0.04410161077976227, + "learning_rate": 0.0001506143390791665, + "loss": 0.3221, + "step": 15911 + }, + { + "epoch": 1.289047310434219, + "grad_norm": 0.03798040747642517, + "learning_rate": 0.00015060983842657186, + "loss": 0.275, + "step": 15912 + }, + { + "epoch": 1.2891283214517175, + "grad_norm": 0.042938366532325745, + "learning_rate": 0.00015060533777397722, + "loss": 0.3042, + "step": 15913 + }, + { + "epoch": 1.2892093324692158, + "grad_norm": 0.04302908107638359, + "learning_rate": 0.0001506008371213826, + "loss": 0.2968, + "step": 15914 + }, + { + "epoch": 1.2892903434867142, + "grad_norm": 0.04758167266845703, + "learning_rate": 0.000150596336468788, + "loss": 0.3327, + "step": 15915 + }, + { + "epoch": 1.2893713545042127, + "grad_norm": 0.04208313673734665, + "learning_rate": 
0.00015059183581619335, + "loss": 0.2942, + "step": 15916 + }, + { + "epoch": 1.289452365521711, + "grad_norm": 0.039246536791324615, + "learning_rate": 0.00015058733516359874, + "loss": 0.2958, + "step": 15917 + }, + { + "epoch": 1.2895333765392094, + "grad_norm": 0.0409688763320446, + "learning_rate": 0.0001505828345110041, + "loss": 0.3468, + "step": 15918 + }, + { + "epoch": 1.2896143875567077, + "grad_norm": 0.04577702283859253, + "learning_rate": 0.00015057833385840946, + "loss": 0.294, + "step": 15919 + }, + { + "epoch": 1.2896953985742061, + "grad_norm": 0.04426812753081322, + "learning_rate": 0.00015057383320581485, + "loss": 0.3375, + "step": 15920 + }, + { + "epoch": 1.2897764095917044, + "grad_norm": 0.03992077335715294, + "learning_rate": 0.00015056933255322023, + "loss": 0.3062, + "step": 15921 + }, + { + "epoch": 1.2898574206092028, + "grad_norm": 0.04026995599269867, + "learning_rate": 0.0001505648319006256, + "loss": 0.2851, + "step": 15922 + }, + { + "epoch": 1.2899384316267013, + "grad_norm": 0.04487210139632225, + "learning_rate": 0.00015056033124803098, + "loss": 0.3218, + "step": 15923 + }, + { + "epoch": 1.2900194426441995, + "grad_norm": 0.03883304074406624, + "learning_rate": 0.00015055583059543634, + "loss": 0.2588, + "step": 15924 + }, + { + "epoch": 1.290100453661698, + "grad_norm": 0.04050496220588684, + "learning_rate": 0.0001505513299428417, + "loss": 0.2928, + "step": 15925 + }, + { + "epoch": 1.2901814646791965, + "grad_norm": 0.046088963747024536, + "learning_rate": 0.0001505468292902471, + "loss": 0.3024, + "step": 15926 + }, + { + "epoch": 1.2902624756966947, + "grad_norm": 0.0416017584502697, + "learning_rate": 0.00015054232863765247, + "loss": 0.3235, + "step": 15927 + }, + { + "epoch": 1.2903434867141932, + "grad_norm": 0.04543936997652054, + "learning_rate": 0.00015053782798505783, + "loss": 0.3001, + "step": 15928 + }, + { + "epoch": 1.2904244977316914, + "grad_norm": 0.046383414417505264, + "learning_rate": 0.00015053332733246322, + "loss": 0.3005, + "step": 15929 + }, + { + "epoch": 1.29050550874919, + "grad_norm": 0.03874411806464195, + "learning_rate": 0.00015052882667986858, + "loss": 0.2843, + "step": 15930 + }, + { + "epoch": 1.2905865197666881, + "grad_norm": 0.040499668568372726, + "learning_rate": 0.00015052432602727394, + "loss": 0.29, + "step": 15931 + }, + { + "epoch": 1.2906675307841866, + "grad_norm": 0.040241751819849014, + "learning_rate": 0.00015051982537467936, + "loss": 0.3069, + "step": 15932 + }, + { + "epoch": 1.290748541801685, + "grad_norm": 0.039121709764003754, + "learning_rate": 0.00015051532472208472, + "loss": 0.2483, + "step": 15933 + }, + { + "epoch": 1.2908295528191833, + "grad_norm": 0.04153452441096306, + "learning_rate": 0.00015051082406949008, + "loss": 0.2819, + "step": 15934 + }, + { + "epoch": 1.2909105638366818, + "grad_norm": 0.041106656193733215, + "learning_rate": 0.00015050632341689546, + "loss": 0.317, + "step": 15935 + }, + { + "epoch": 1.2909915748541803, + "grad_norm": 0.04836004227399826, + "learning_rate": 0.00015050182276430082, + "loss": 0.2961, + "step": 15936 + }, + { + "epoch": 1.2910725858716785, + "grad_norm": 0.04820879176259041, + "learning_rate": 0.00015049732211170618, + "loss": 0.3212, + "step": 15937 + }, + { + "epoch": 1.291153596889177, + "grad_norm": 0.053317271173000336, + "learning_rate": 0.0001504928214591116, + "loss": 0.3056, + "step": 15938 + }, + { + "epoch": 1.2912346079066754, + "grad_norm": 0.04657156392931938, + "learning_rate": 0.00015048832080651696, + "loss": 0.3214, + 
"step": 15939 + }, + { + "epoch": 1.2913156189241737, + "grad_norm": 0.050732627511024475, + "learning_rate": 0.00015048382015392232, + "loss": 0.2788, + "step": 15940 + }, + { + "epoch": 1.2913966299416721, + "grad_norm": 0.04055896773934364, + "learning_rate": 0.0001504793195013277, + "loss": 0.2733, + "step": 15941 + }, + { + "epoch": 1.2914776409591704, + "grad_norm": 0.04172273725271225, + "learning_rate": 0.00015047481884873306, + "loss": 0.2872, + "step": 15942 + }, + { + "epoch": 1.2915586519766689, + "grad_norm": 0.055269479751586914, + "learning_rate": 0.00015047031819613842, + "loss": 0.3522, + "step": 15943 + }, + { + "epoch": 1.291639662994167, + "grad_norm": 0.035411085933446884, + "learning_rate": 0.00015046581754354384, + "loss": 0.2478, + "step": 15944 + }, + { + "epoch": 1.2917206740116656, + "grad_norm": 0.044676654040813446, + "learning_rate": 0.0001504613168909492, + "loss": 0.3349, + "step": 15945 + }, + { + "epoch": 1.291801685029164, + "grad_norm": 0.044342026114463806, + "learning_rate": 0.00015045681623835456, + "loss": 0.326, + "step": 15946 + }, + { + "epoch": 1.2918826960466623, + "grad_norm": 0.03670578449964523, + "learning_rate": 0.00015045231558575995, + "loss": 0.2899, + "step": 15947 + }, + { + "epoch": 1.2919637070641607, + "grad_norm": 0.04561435803771019, + "learning_rate": 0.0001504478149331653, + "loss": 0.2935, + "step": 15948 + }, + { + "epoch": 1.2920447180816592, + "grad_norm": 0.04605408012866974, + "learning_rate": 0.00015044331428057067, + "loss": 0.2711, + "step": 15949 + }, + { + "epoch": 1.2921257290991575, + "grad_norm": 0.049196407198905945, + "learning_rate": 0.00015043881362797608, + "loss": 0.3535, + "step": 15950 + }, + { + "epoch": 1.292206740116656, + "grad_norm": 0.041564781218767166, + "learning_rate": 0.00015043431297538144, + "loss": 0.2731, + "step": 15951 + }, + { + "epoch": 1.2922877511341542, + "grad_norm": 0.04630525782704353, + "learning_rate": 0.0001504298123227868, + "loss": 0.3286, + "step": 15952 + }, + { + "epoch": 1.2923687621516526, + "grad_norm": 0.040204476565122604, + "learning_rate": 0.0001504253116701922, + "loss": 0.2874, + "step": 15953 + }, + { + "epoch": 1.2924497731691509, + "grad_norm": 0.041800253093242645, + "learning_rate": 0.00015042081101759755, + "loss": 0.3494, + "step": 15954 + }, + { + "epoch": 1.2925307841866494, + "grad_norm": 0.04281659051775932, + "learning_rate": 0.00015041631036500294, + "loss": 0.2867, + "step": 15955 + }, + { + "epoch": 1.2926117952041478, + "grad_norm": 0.03701747581362724, + "learning_rate": 0.00015041180971240832, + "loss": 0.3058, + "step": 15956 + }, + { + "epoch": 1.292692806221646, + "grad_norm": 0.04389016330242157, + "learning_rate": 0.00015040730905981368, + "loss": 0.3048, + "step": 15957 + }, + { + "epoch": 1.2927738172391445, + "grad_norm": 0.04472460597753525, + "learning_rate": 0.00015040280840721904, + "loss": 0.2887, + "step": 15958 + }, + { + "epoch": 1.292854828256643, + "grad_norm": 0.04361078143119812, + "learning_rate": 0.00015039830775462443, + "loss": 0.3126, + "step": 15959 + }, + { + "epoch": 1.2929358392741412, + "grad_norm": 0.050942130386829376, + "learning_rate": 0.0001503938071020298, + "loss": 0.3615, + "step": 15960 + }, + { + "epoch": 1.2930168502916397, + "grad_norm": 0.04138937592506409, + "learning_rate": 0.00015038930644943518, + "loss": 0.2942, + "step": 15961 + }, + { + "epoch": 1.2930978613091382, + "grad_norm": 0.045624807476997375, + "learning_rate": 0.00015038480579684056, + "loss": 0.3094, + "step": 15962 + }, + { + "epoch": 
1.2931788723266364, + "grad_norm": 0.0472959503531456, + "learning_rate": 0.00015038030514424592, + "loss": 0.33, + "step": 15963 + }, + { + "epoch": 1.2932598833441347, + "grad_norm": 0.043403808027505875, + "learning_rate": 0.00015037580449165128, + "loss": 0.2962, + "step": 15964 + }, + { + "epoch": 1.2933408943616331, + "grad_norm": 0.045694395899772644, + "learning_rate": 0.00015037130383905667, + "loss": 0.3109, + "step": 15965 + }, + { + "epoch": 1.2934219053791316, + "grad_norm": 0.03987668454647064, + "learning_rate": 0.00015036680318646203, + "loss": 0.2422, + "step": 15966 + }, + { + "epoch": 1.2935029163966298, + "grad_norm": 0.04315539449453354, + "learning_rate": 0.00015036230253386742, + "loss": 0.3088, + "step": 15967 + }, + { + "epoch": 1.2935839274141283, + "grad_norm": 0.04949916899204254, + "learning_rate": 0.0001503578018812728, + "loss": 0.3267, + "step": 15968 + }, + { + "epoch": 1.2936649384316268, + "grad_norm": 0.040975723415613174, + "learning_rate": 0.00015035330122867817, + "loss": 0.3058, + "step": 15969 + }, + { + "epoch": 1.293745949449125, + "grad_norm": 0.047277871519327164, + "learning_rate": 0.00015034880057608353, + "loss": 0.3061, + "step": 15970 + }, + { + "epoch": 1.2938269604666235, + "grad_norm": 0.04787454754114151, + "learning_rate": 0.0001503442999234889, + "loss": 0.3157, + "step": 15971 + }, + { + "epoch": 1.293907971484122, + "grad_norm": 0.0453806146979332, + "learning_rate": 0.00015033979927089427, + "loss": 0.2985, + "step": 15972 + }, + { + "epoch": 1.2939889825016202, + "grad_norm": 0.046817343682050705, + "learning_rate": 0.00015033529861829966, + "loss": 0.3108, + "step": 15973 + }, + { + "epoch": 1.2940699935191187, + "grad_norm": 0.05633321404457092, + "learning_rate": 0.00015033079796570505, + "loss": 0.3269, + "step": 15974 + }, + { + "epoch": 1.294151004536617, + "grad_norm": 0.043721895664930344, + "learning_rate": 0.0001503262973131104, + "loss": 0.256, + "step": 15975 + }, + { + "epoch": 1.2942320155541154, + "grad_norm": 0.04535561054944992, + "learning_rate": 0.00015032179666051577, + "loss": 0.276, + "step": 15976 + }, + { + "epoch": 1.2943130265716136, + "grad_norm": 0.049087490886449814, + "learning_rate": 0.00015031729600792115, + "loss": 0.3295, + "step": 15977 + }, + { + "epoch": 1.294394037589112, + "grad_norm": 0.043317973613739014, + "learning_rate": 0.00015031279535532651, + "loss": 0.3067, + "step": 15978 + }, + { + "epoch": 1.2944750486066106, + "grad_norm": 0.04272015765309334, + "learning_rate": 0.0001503082947027319, + "loss": 0.2662, + "step": 15979 + }, + { + "epoch": 1.2945560596241088, + "grad_norm": 0.04230866953730583, + "learning_rate": 0.0001503037940501373, + "loss": 0.2777, + "step": 15980 + }, + { + "epoch": 1.2946370706416073, + "grad_norm": 0.04598616436123848, + "learning_rate": 0.00015029929339754265, + "loss": 0.2968, + "step": 15981 + }, + { + "epoch": 1.2947180816591057, + "grad_norm": 0.042806848883628845, + "learning_rate": 0.000150294792744948, + "loss": 0.2666, + "step": 15982 + }, + { + "epoch": 1.294799092676604, + "grad_norm": 0.04042374715209007, + "learning_rate": 0.0001502902920923534, + "loss": 0.2923, + "step": 15983 + }, + { + "epoch": 1.2948801036941024, + "grad_norm": 0.05210564658045769, + "learning_rate": 0.00015028579143975878, + "loss": 0.2913, + "step": 15984 + }, + { + "epoch": 1.294961114711601, + "grad_norm": 0.0484447255730629, + "learning_rate": 0.00015028129078716414, + "loss": 0.3355, + "step": 15985 + }, + { + "epoch": 1.2950421257290992, + "grad_norm": 
0.03525340557098389, + "learning_rate": 0.00015027679013456953, + "loss": 0.2806, + "step": 15986 + }, + { + "epoch": 1.2951231367465974, + "grad_norm": 0.05046231672167778, + "learning_rate": 0.0001502722894819749, + "loss": 0.3127, + "step": 15987 + }, + { + "epoch": 1.2952041477640959, + "grad_norm": 0.03851288929581642, + "learning_rate": 0.00015026778882938025, + "loss": 0.2616, + "step": 15988 + }, + { + "epoch": 1.2952851587815943, + "grad_norm": 0.04725256562232971, + "learning_rate": 0.00015026328817678564, + "loss": 0.3236, + "step": 15989 + }, + { + "epoch": 1.2953661697990926, + "grad_norm": 0.041266750544309616, + "learning_rate": 0.00015025878752419102, + "loss": 0.3075, + "step": 15990 + }, + { + "epoch": 1.295447180816591, + "grad_norm": 0.0488935261964798, + "learning_rate": 0.00015025428687159639, + "loss": 0.3391, + "step": 15991 + }, + { + "epoch": 1.2955281918340895, + "grad_norm": 0.04301506653428078, + "learning_rate": 0.00015024978621900177, + "loss": 0.2989, + "step": 15992 + }, + { + "epoch": 1.2956092028515878, + "grad_norm": 0.04415897652506828, + "learning_rate": 0.00015024528556640713, + "loss": 0.3148, + "step": 15993 + }, + { + "epoch": 1.2956902138690862, + "grad_norm": 0.040060512721538544, + "learning_rate": 0.0001502407849138125, + "loss": 0.2771, + "step": 15994 + }, + { + "epoch": 1.2957712248865847, + "grad_norm": 0.04739377275109291, + "learning_rate": 0.00015023628426121788, + "loss": 0.3172, + "step": 15995 + }, + { + "epoch": 1.295852235904083, + "grad_norm": 0.045232877135276794, + "learning_rate": 0.00015023178360862327, + "loss": 0.279, + "step": 15996 + }, + { + "epoch": 1.2959332469215814, + "grad_norm": 0.04951009899377823, + "learning_rate": 0.00015022728295602863, + "loss": 0.3251, + "step": 15997 + }, + { + "epoch": 1.2960142579390797, + "grad_norm": 0.04696754366159439, + "learning_rate": 0.00015022278230343401, + "loss": 0.2981, + "step": 15998 + }, + { + "epoch": 1.2960952689565781, + "grad_norm": 0.045662686228752136, + "learning_rate": 0.00015021828165083937, + "loss": 0.2965, + "step": 15999 + }, + { + "epoch": 1.2961762799740764, + "grad_norm": 0.04374690353870392, + "learning_rate": 0.00015021378099824473, + "loss": 0.3251, + "step": 16000 + }, + { + "epoch": 1.2962572909915748, + "grad_norm": 0.04445065185427666, + "learning_rate": 0.00015020928034565012, + "loss": 0.2917, + "step": 16001 + }, + { + "epoch": 1.2963383020090733, + "grad_norm": 0.04260138422250748, + "learning_rate": 0.0001502047796930555, + "loss": 0.3029, + "step": 16002 + }, + { + "epoch": 1.2964193130265715, + "grad_norm": 0.037433069199323654, + "learning_rate": 0.00015020027904046087, + "loss": 0.2683, + "step": 16003 + }, + { + "epoch": 1.29650032404407, + "grad_norm": 0.04841666668653488, + "learning_rate": 0.00015019577838786626, + "loss": 0.2876, + "step": 16004 + }, + { + "epoch": 1.2965813350615685, + "grad_norm": 0.03761151432991028, + "learning_rate": 0.00015019127773527162, + "loss": 0.2823, + "step": 16005 + }, + { + "epoch": 1.2966623460790667, + "grad_norm": 0.04309995472431183, + "learning_rate": 0.00015018677708267698, + "loss": 0.302, + "step": 16006 + }, + { + "epoch": 1.2967433570965652, + "grad_norm": 0.045569028705358505, + "learning_rate": 0.00015018227643008236, + "loss": 0.3338, + "step": 16007 + }, + { + "epoch": 1.2968243681140634, + "grad_norm": 0.04372565820813179, + "learning_rate": 0.00015017777577748775, + "loss": 0.3042, + "step": 16008 + }, + { + "epoch": 1.296905379131562, + "grad_norm": 0.043127454817295074, + "learning_rate": 
0.0001501732751248931, + "loss": 0.2757, + "step": 16009 + }, + { + "epoch": 1.2969863901490601, + "grad_norm": 0.04809211939573288, + "learning_rate": 0.0001501687744722985, + "loss": 0.3057, + "step": 16010 + }, + { + "epoch": 1.2970674011665586, + "grad_norm": 0.04196740314364433, + "learning_rate": 0.00015016427381970386, + "loss": 0.2729, + "step": 16011 + }, + { + "epoch": 1.297148412184057, + "grad_norm": 0.04643071070313454, + "learning_rate": 0.00015015977316710922, + "loss": 0.2972, + "step": 16012 + }, + { + "epoch": 1.2972294232015553, + "grad_norm": 0.045497287064790726, + "learning_rate": 0.00015015527251451463, + "loss": 0.3334, + "step": 16013 + }, + { + "epoch": 1.2973104342190538, + "grad_norm": 0.04679165408015251, + "learning_rate": 0.00015015077186192, + "loss": 0.3005, + "step": 16014 + }, + { + "epoch": 1.2973914452365523, + "grad_norm": 0.045765623450279236, + "learning_rate": 0.00015014627120932535, + "loss": 0.3369, + "step": 16015 + }, + { + "epoch": 1.2974724562540505, + "grad_norm": 0.04222280532121658, + "learning_rate": 0.00015014177055673074, + "loss": 0.2684, + "step": 16016 + }, + { + "epoch": 1.297553467271549, + "grad_norm": 0.04675154760479927, + "learning_rate": 0.0001501372699041361, + "loss": 0.3181, + "step": 16017 + }, + { + "epoch": 1.2976344782890474, + "grad_norm": 0.04510919377207756, + "learning_rate": 0.00015013276925154146, + "loss": 0.3513, + "step": 16018 + }, + { + "epoch": 1.2977154893065457, + "grad_norm": 0.05718798562884331, + "learning_rate": 0.00015012826859894687, + "loss": 0.3105, + "step": 16019 + }, + { + "epoch": 1.2977965003240441, + "grad_norm": 0.03958940505981445, + "learning_rate": 0.00015012376794635223, + "loss": 0.2973, + "step": 16020 + }, + { + "epoch": 1.2978775113415424, + "grad_norm": 0.036500755697488785, + "learning_rate": 0.0001501192672937576, + "loss": 0.241, + "step": 16021 + }, + { + "epoch": 1.2979585223590409, + "grad_norm": 0.03581966832280159, + "learning_rate": 0.00015011476664116298, + "loss": 0.2634, + "step": 16022 + }, + { + "epoch": 1.298039533376539, + "grad_norm": 0.04557995870709419, + "learning_rate": 0.00015011026598856834, + "loss": 0.304, + "step": 16023 + }, + { + "epoch": 1.2981205443940376, + "grad_norm": 0.04019990190863609, + "learning_rate": 0.0001501057653359737, + "loss": 0.2557, + "step": 16024 + }, + { + "epoch": 1.298201555411536, + "grad_norm": 0.050131477415561676, + "learning_rate": 0.00015010126468337911, + "loss": 0.2861, + "step": 16025 + }, + { + "epoch": 1.2982825664290343, + "grad_norm": 0.046285029500722885, + "learning_rate": 0.00015009676403078447, + "loss": 0.2933, + "step": 16026 + }, + { + "epoch": 1.2983635774465327, + "grad_norm": 0.04279644414782524, + "learning_rate": 0.00015009226337818983, + "loss": 0.2914, + "step": 16027 + }, + { + "epoch": 1.2984445884640312, + "grad_norm": 0.049533769488334656, + "learning_rate": 0.00015008776272559522, + "loss": 0.3532, + "step": 16028 + }, + { + "epoch": 1.2985255994815295, + "grad_norm": 0.04774849861860275, + "learning_rate": 0.00015008326207300058, + "loss": 0.2548, + "step": 16029 + }, + { + "epoch": 1.298606610499028, + "grad_norm": 0.04950843006372452, + "learning_rate": 0.00015007876142040594, + "loss": 0.3234, + "step": 16030 + }, + { + "epoch": 1.2986876215165262, + "grad_norm": 0.040186185389757156, + "learning_rate": 0.00015007426076781136, + "loss": 0.2999, + "step": 16031 + }, + { + "epoch": 1.2987686325340246, + "grad_norm": 0.04521823674440384, + "learning_rate": 0.00015006976011521672, + "loss": 0.3095, + 
"step": 16032 + }, + { + "epoch": 1.2988496435515229, + "grad_norm": 0.04393572732806206, + "learning_rate": 0.00015006525946262208, + "loss": 0.2888, + "step": 16033 + }, + { + "epoch": 1.2989306545690213, + "grad_norm": 0.037971485406160355, + "learning_rate": 0.00015006075881002746, + "loss": 0.2801, + "step": 16034 + }, + { + "epoch": 1.2990116655865198, + "grad_norm": 0.04628375545144081, + "learning_rate": 0.00015005625815743282, + "loss": 0.2941, + "step": 16035 + }, + { + "epoch": 1.299092676604018, + "grad_norm": 0.04361240938305855, + "learning_rate": 0.0001500517575048382, + "loss": 0.2815, + "step": 16036 + }, + { + "epoch": 1.2991736876215165, + "grad_norm": 0.044115811586380005, + "learning_rate": 0.0001500472568522436, + "loss": 0.3492, + "step": 16037 + }, + { + "epoch": 1.299254698639015, + "grad_norm": 0.04842967540025711, + "learning_rate": 0.00015004275619964896, + "loss": 0.326, + "step": 16038 + }, + { + "epoch": 1.2993357096565132, + "grad_norm": 0.045519985258579254, + "learning_rate": 0.00015003825554705432, + "loss": 0.3042, + "step": 16039 + }, + { + "epoch": 1.2994167206740117, + "grad_norm": 0.04397178441286087, + "learning_rate": 0.0001500337548944597, + "loss": 0.2878, + "step": 16040 + }, + { + "epoch": 1.2994977316915102, + "grad_norm": 0.040161408483982086, + "learning_rate": 0.00015002925424186507, + "loss": 0.2637, + "step": 16041 + }, + { + "epoch": 1.2995787427090084, + "grad_norm": 0.04626128077507019, + "learning_rate": 0.00015002475358927045, + "loss": 0.2902, + "step": 16042 + }, + { + "epoch": 1.2996597537265069, + "grad_norm": 0.050964128226041794, + "learning_rate": 0.00015002025293667584, + "loss": 0.3491, + "step": 16043 + }, + { + "epoch": 1.2997407647440051, + "grad_norm": 0.04593636468052864, + "learning_rate": 0.0001500157522840812, + "loss": 0.3294, + "step": 16044 + }, + { + "epoch": 1.2998217757615036, + "grad_norm": 0.05174465477466583, + "learning_rate": 0.00015001125163148656, + "loss": 0.3259, + "step": 16045 + }, + { + "epoch": 1.2999027867790018, + "grad_norm": 0.043847907334566116, + "learning_rate": 0.00015000675097889195, + "loss": 0.3159, + "step": 16046 + }, + { + "epoch": 1.2999837977965003, + "grad_norm": 0.047936998307704926, + "learning_rate": 0.0001500022503262973, + "loss": 0.3451, + "step": 16047 + }, + { + "epoch": 1.3000648088139988, + "grad_norm": 0.04735151678323746, + "learning_rate": 0.0001499977496737027, + "loss": 0.3138, + "step": 16048 + }, + { + "epoch": 1.300145819831497, + "grad_norm": 0.038559772074222565, + "learning_rate": 0.00014999324902110808, + "loss": 0.2651, + "step": 16049 + }, + { + "epoch": 1.3002268308489955, + "grad_norm": 0.04015229269862175, + "learning_rate": 0.00014998874836851344, + "loss": 0.2682, + "step": 16050 + }, + { + "epoch": 1.300307841866494, + "grad_norm": 0.042013492435216904, + "learning_rate": 0.0001499842477159188, + "loss": 0.2803, + "step": 16051 + }, + { + "epoch": 1.3003888528839922, + "grad_norm": 0.043495483696460724, + "learning_rate": 0.0001499797470633242, + "loss": 0.2651, + "step": 16052 + }, + { + "epoch": 1.3004698639014907, + "grad_norm": 0.03844980522990227, + "learning_rate": 0.00014997524641072955, + "loss": 0.2618, + "step": 16053 + }, + { + "epoch": 1.300550874918989, + "grad_norm": 0.04398927837610245, + "learning_rate": 0.00014997074575813494, + "loss": 0.2874, + "step": 16054 + }, + { + "epoch": 1.3006318859364874, + "grad_norm": 0.04379849135875702, + "learning_rate": 0.00014996624510554032, + "loss": 0.3122, + "step": 16055 + }, + { + "epoch": 
1.3007128969539856, + "grad_norm": 0.044488757848739624, + "learning_rate": 0.00014996174445294568, + "loss": 0.3078, + "step": 16056 + }, + { + "epoch": 1.300793907971484, + "grad_norm": 0.04269413650035858, + "learning_rate": 0.00014995724380035104, + "loss": 0.2786, + "step": 16057 + }, + { + "epoch": 1.3008749189889826, + "grad_norm": 0.04010821878910065, + "learning_rate": 0.00014995274314775643, + "loss": 0.252, + "step": 16058 + }, + { + "epoch": 1.3009559300064808, + "grad_norm": 0.042552195489406586, + "learning_rate": 0.0001499482424951618, + "loss": 0.3149, + "step": 16059 + }, + { + "epoch": 1.3010369410239793, + "grad_norm": 0.043433062732219696, + "learning_rate": 0.00014994374184256718, + "loss": 0.3159, + "step": 16060 + }, + { + "epoch": 1.3011179520414777, + "grad_norm": 0.0455293282866478, + "learning_rate": 0.00014993924118997256, + "loss": 0.3078, + "step": 16061 + }, + { + "epoch": 1.301198963058976, + "grad_norm": 0.04365817457437515, + "learning_rate": 0.00014993474053737792, + "loss": 0.2855, + "step": 16062 + }, + { + "epoch": 1.3012799740764744, + "grad_norm": 0.04493528977036476, + "learning_rate": 0.00014993023988478328, + "loss": 0.3133, + "step": 16063 + }, + { + "epoch": 1.301360985093973, + "grad_norm": 0.04929269477725029, + "learning_rate": 0.00014992573923218867, + "loss": 0.3045, + "step": 16064 + }, + { + "epoch": 1.3014419961114712, + "grad_norm": 0.047745268791913986, + "learning_rate": 0.00014992123857959406, + "loss": 0.3086, + "step": 16065 + }, + { + "epoch": 1.3015230071289696, + "grad_norm": 0.0507318489253521, + "learning_rate": 0.00014991673792699942, + "loss": 0.3284, + "step": 16066 + }, + { + "epoch": 1.3016040181464679, + "grad_norm": 0.04578344523906708, + "learning_rate": 0.0001499122372744048, + "loss": 0.289, + "step": 16067 + }, + { + "epoch": 1.3016850291639663, + "grad_norm": 0.04364114999771118, + "learning_rate": 0.00014990773662181017, + "loss": 0.3165, + "step": 16068 + }, + { + "epoch": 1.3017660401814646, + "grad_norm": 0.052346739917993546, + "learning_rate": 0.00014990323596921553, + "loss": 0.3213, + "step": 16069 + }, + { + "epoch": 1.301847051198963, + "grad_norm": 0.047455038875341415, + "learning_rate": 0.0001498987353166209, + "loss": 0.2998, + "step": 16070 + }, + { + "epoch": 1.3019280622164615, + "grad_norm": 0.046549778431653976, + "learning_rate": 0.0001498942346640263, + "loss": 0.3323, + "step": 16071 + }, + { + "epoch": 1.3020090732339598, + "grad_norm": 0.04644530639052391, + "learning_rate": 0.00014988973401143166, + "loss": 0.2931, + "step": 16072 + }, + { + "epoch": 1.3020900842514582, + "grad_norm": 0.044147513806819916, + "learning_rate": 0.00014988523335883705, + "loss": 0.2885, + "step": 16073 + }, + { + "epoch": 1.3021710952689567, + "grad_norm": 0.05160215497016907, + "learning_rate": 0.0001498807327062424, + "loss": 0.3206, + "step": 16074 + }, + { + "epoch": 1.302252106286455, + "grad_norm": 0.04298309236764908, + "learning_rate": 0.00014987623205364777, + "loss": 0.3082, + "step": 16075 + }, + { + "epoch": 1.3023331173039534, + "grad_norm": 0.03974214941263199, + "learning_rate": 0.00014987173140105315, + "loss": 0.3057, + "step": 16076 + }, + { + "epoch": 1.3024141283214516, + "grad_norm": 0.04440717026591301, + "learning_rate": 0.00014986723074845854, + "loss": 0.2749, + "step": 16077 + }, + { + "epoch": 1.3024951393389501, + "grad_norm": 0.043637245893478394, + "learning_rate": 0.0001498627300958639, + "loss": 0.3224, + "step": 16078 + }, + { + "epoch": 1.3025761503564484, + "grad_norm": 
0.05086296796798706, + "learning_rate": 0.0001498582294432693, + "loss": 0.3007, + "step": 16079 + }, + { + "epoch": 1.3026571613739468, + "grad_norm": 0.044451914727687836, + "learning_rate": 0.00014985372879067465, + "loss": 0.314, + "step": 16080 + }, + { + "epoch": 1.3027381723914453, + "grad_norm": 0.04223513975739479, + "learning_rate": 0.00014984922813808, + "loss": 0.2982, + "step": 16081 + }, + { + "epoch": 1.3028191834089435, + "grad_norm": 0.035806506872177124, + "learning_rate": 0.0001498447274854854, + "loss": 0.2795, + "step": 16082 + }, + { + "epoch": 1.302900194426442, + "grad_norm": 0.03943643346428871, + "learning_rate": 0.00014984022683289078, + "loss": 0.2938, + "step": 16083 + }, + { + "epoch": 1.3029812054439405, + "grad_norm": 0.0411057323217392, + "learning_rate": 0.00014983572618029614, + "loss": 0.3254, + "step": 16084 + }, + { + "epoch": 1.3030622164614387, + "grad_norm": 0.0396103709936142, + "learning_rate": 0.00014983122552770153, + "loss": 0.2992, + "step": 16085 + }, + { + "epoch": 1.3031432274789372, + "grad_norm": 0.03847880661487579, + "learning_rate": 0.0001498267248751069, + "loss": 0.2803, + "step": 16086 + }, + { + "epoch": 1.3032242384964356, + "grad_norm": 0.044615842401981354, + "learning_rate": 0.00014982222422251225, + "loss": 0.2959, + "step": 16087 + }, + { + "epoch": 1.303305249513934, + "grad_norm": 0.046743474900722504, + "learning_rate": 0.00014981772356991767, + "loss": 0.3133, + "step": 16088 + }, + { + "epoch": 1.3033862605314321, + "grad_norm": 0.051682259887456894, + "learning_rate": 0.00014981322291732303, + "loss": 0.2637, + "step": 16089 + }, + { + "epoch": 1.3034672715489306, + "grad_norm": 0.045838940888643265, + "learning_rate": 0.00014980872226472839, + "loss": 0.3488, + "step": 16090 + }, + { + "epoch": 1.303548282566429, + "grad_norm": 0.039805248379707336, + "learning_rate": 0.00014980422161213377, + "loss": 0.2865, + "step": 16091 + }, + { + "epoch": 1.3036292935839273, + "grad_norm": 0.04791427031159401, + "learning_rate": 0.00014979972095953913, + "loss": 0.325, + "step": 16092 + }, + { + "epoch": 1.3037103046014258, + "grad_norm": 0.04406188055872917, + "learning_rate": 0.0001497952203069445, + "loss": 0.3095, + "step": 16093 + }, + { + "epoch": 1.3037913156189243, + "grad_norm": 0.04150623828172684, + "learning_rate": 0.0001497907196543499, + "loss": 0.2898, + "step": 16094 + }, + { + "epoch": 1.3038723266364225, + "grad_norm": 0.04663126915693283, + "learning_rate": 0.00014978621900175527, + "loss": 0.2939, + "step": 16095 + }, + { + "epoch": 1.303953337653921, + "grad_norm": 0.04955912381410599, + "learning_rate": 0.00014978171834916063, + "loss": 0.2944, + "step": 16096 + }, + { + "epoch": 1.3040343486714194, + "grad_norm": 0.044626910239458084, + "learning_rate": 0.00014977721769656601, + "loss": 0.294, + "step": 16097 + }, + { + "epoch": 1.3041153596889177, + "grad_norm": 0.047228164970874786, + "learning_rate": 0.00014977271704397137, + "loss": 0.3016, + "step": 16098 + }, + { + "epoch": 1.3041963707064161, + "grad_norm": 0.052044130861759186, + "learning_rate": 0.00014976821639137673, + "loss": 0.2996, + "step": 16099 + }, + { + "epoch": 1.3042773817239144, + "grad_norm": 0.0400003157556057, + "learning_rate": 0.00014976371573878215, + "loss": 0.2809, + "step": 16100 + }, + { + "epoch": 1.3043583927414129, + "grad_norm": 0.042021676898002625, + "learning_rate": 0.0001497592150861875, + "loss": 0.3078, + "step": 16101 + }, + { + "epoch": 1.304439403758911, + "grad_norm": 0.04687660560011864, + "learning_rate": 
0.00014975471443359287, + "loss": 0.29, + "step": 16102 + }, + { + "epoch": 1.3045204147764096, + "grad_norm": 0.04040345549583435, + "learning_rate": 0.00014975021378099826, + "loss": 0.3057, + "step": 16103 + }, + { + "epoch": 1.304601425793908, + "grad_norm": 0.037515588104724884, + "learning_rate": 0.00014974571312840362, + "loss": 0.2705, + "step": 16104 + }, + { + "epoch": 1.3046824368114063, + "grad_norm": 0.039864424616098404, + "learning_rate": 0.000149741212475809, + "loss": 0.2901, + "step": 16105 + }, + { + "epoch": 1.3047634478289047, + "grad_norm": 0.046790868043899536, + "learning_rate": 0.0001497367118232144, + "loss": 0.3593, + "step": 16106 + }, + { + "epoch": 1.3048444588464032, + "grad_norm": 0.044947970658540726, + "learning_rate": 0.00014973221117061975, + "loss": 0.3466, + "step": 16107 + }, + { + "epoch": 1.3049254698639015, + "grad_norm": 0.04707445576786995, + "learning_rate": 0.0001497277105180251, + "loss": 0.3203, + "step": 16108 + }, + { + "epoch": 1.3050064808814, + "grad_norm": 0.04465334489941597, + "learning_rate": 0.0001497232098654305, + "loss": 0.3207, + "step": 16109 + }, + { + "epoch": 1.3050874918988984, + "grad_norm": 0.03804957494139671, + "learning_rate": 0.00014971870921283586, + "loss": 0.2731, + "step": 16110 + }, + { + "epoch": 1.3051685029163966, + "grad_norm": 0.04358971491456032, + "learning_rate": 0.00014971420856024124, + "loss": 0.3086, + "step": 16111 + }, + { + "epoch": 1.3052495139338949, + "grad_norm": 0.040443554520606995, + "learning_rate": 0.00014970970790764663, + "loss": 0.2703, + "step": 16112 + }, + { + "epoch": 1.3053305249513933, + "grad_norm": 0.043810319155454636, + "learning_rate": 0.000149705207255052, + "loss": 0.3033, + "step": 16113 + }, + { + "epoch": 1.3054115359688918, + "grad_norm": 0.04541864991188049, + "learning_rate": 0.00014970070660245735, + "loss": 0.2834, + "step": 16114 + }, + { + "epoch": 1.30549254698639, + "grad_norm": 0.04730004817247391, + "learning_rate": 0.00014969620594986274, + "loss": 0.357, + "step": 16115 + }, + { + "epoch": 1.3055735580038885, + "grad_norm": 0.04252275452017784, + "learning_rate": 0.0001496917052972681, + "loss": 0.3275, + "step": 16116 + }, + { + "epoch": 1.305654569021387, + "grad_norm": 0.04537322744727135, + "learning_rate": 0.00014968720464467349, + "loss": 0.318, + "step": 16117 + }, + { + "epoch": 1.3057355800388852, + "grad_norm": 0.03675117343664169, + "learning_rate": 0.00014968270399207887, + "loss": 0.2532, + "step": 16118 + }, + { + "epoch": 1.3058165910563837, + "grad_norm": 0.04434943199157715, + "learning_rate": 0.00014967820333948423, + "loss": 0.2633, + "step": 16119 + }, + { + "epoch": 1.3058976020738822, + "grad_norm": 0.04275086894631386, + "learning_rate": 0.0001496737026868896, + "loss": 0.2996, + "step": 16120 + }, + { + "epoch": 1.3059786130913804, + "grad_norm": 0.044664327055215836, + "learning_rate": 0.00014966920203429498, + "loss": 0.2699, + "step": 16121 + }, + { + "epoch": 1.3060596241088789, + "grad_norm": 0.041324496269226074, + "learning_rate": 0.00014966470138170034, + "loss": 0.3208, + "step": 16122 + }, + { + "epoch": 1.3061406351263771, + "grad_norm": 0.052138637751340866, + "learning_rate": 0.00014966020072910573, + "loss": 0.3199, + "step": 16123 + }, + { + "epoch": 1.3062216461438756, + "grad_norm": 0.04451883211731911, + "learning_rate": 0.00014965570007651111, + "loss": 0.2913, + "step": 16124 + }, + { + "epoch": 1.3063026571613738, + "grad_norm": 0.04054604098200798, + "learning_rate": 0.00014965119942391647, + "loss": 0.2798, + 
"step": 16125 + }, + { + "epoch": 1.3063836681788723, + "grad_norm": 0.04771227017045021, + "learning_rate": 0.00014964669877132184, + "loss": 0.2989, + "step": 16126 + }, + { + "epoch": 1.3064646791963708, + "grad_norm": 0.0469462089240551, + "learning_rate": 0.00014964219811872722, + "loss": 0.3066, + "step": 16127 + }, + { + "epoch": 1.306545690213869, + "grad_norm": 0.03986061364412308, + "learning_rate": 0.00014963769746613258, + "loss": 0.2905, + "step": 16128 + }, + { + "epoch": 1.3066267012313675, + "grad_norm": 0.04885406419634819, + "learning_rate": 0.00014963319681353797, + "loss": 0.257, + "step": 16129 + }, + { + "epoch": 1.306707712248866, + "grad_norm": 0.047162119299173355, + "learning_rate": 0.00014962869616094336, + "loss": 0.3222, + "step": 16130 + }, + { + "epoch": 1.3067887232663642, + "grad_norm": 0.04074522852897644, + "learning_rate": 0.00014962419550834872, + "loss": 0.2669, + "step": 16131 + }, + { + "epoch": 1.3068697342838627, + "grad_norm": 0.03833572939038277, + "learning_rate": 0.00014961969485575408, + "loss": 0.2841, + "step": 16132 + }, + { + "epoch": 1.306950745301361, + "grad_norm": 0.04301103204488754, + "learning_rate": 0.00014961519420315946, + "loss": 0.2464, + "step": 16133 + }, + { + "epoch": 1.3070317563188594, + "grad_norm": 0.03820543736219406, + "learning_rate": 0.00014961069355056482, + "loss": 0.2788, + "step": 16134 + }, + { + "epoch": 1.3071127673363576, + "grad_norm": 0.040909651666879654, + "learning_rate": 0.0001496061928979702, + "loss": 0.2877, + "step": 16135 + }, + { + "epoch": 1.307193778353856, + "grad_norm": 0.04465370252728462, + "learning_rate": 0.0001496016922453756, + "loss": 0.296, + "step": 16136 + }, + { + "epoch": 1.3072747893713546, + "grad_norm": 0.043238457292318344, + "learning_rate": 0.00014959719159278096, + "loss": 0.3099, + "step": 16137 + }, + { + "epoch": 1.3073558003888528, + "grad_norm": 0.05324762687087059, + "learning_rate": 0.00014959269094018632, + "loss": 0.3171, + "step": 16138 + }, + { + "epoch": 1.3074368114063513, + "grad_norm": 0.04730679839849472, + "learning_rate": 0.0001495881902875917, + "loss": 0.2649, + "step": 16139 + }, + { + "epoch": 1.3075178224238497, + "grad_norm": 0.04472094029188156, + "learning_rate": 0.00014958368963499707, + "loss": 0.3214, + "step": 16140 + }, + { + "epoch": 1.307598833441348, + "grad_norm": 0.04622946307063103, + "learning_rate": 0.00014957918898240245, + "loss": 0.3102, + "step": 16141 + }, + { + "epoch": 1.3076798444588464, + "grad_norm": 0.04655399173498154, + "learning_rate": 0.00014957468832980784, + "loss": 0.3093, + "step": 16142 + }, + { + "epoch": 1.307760855476345, + "grad_norm": 0.05537700280547142, + "learning_rate": 0.0001495701876772132, + "loss": 0.304, + "step": 16143 + }, + { + "epoch": 1.3078418664938432, + "grad_norm": 0.04523095861077309, + "learning_rate": 0.00014956568702461856, + "loss": 0.3066, + "step": 16144 + }, + { + "epoch": 1.3079228775113416, + "grad_norm": 0.04799005389213562, + "learning_rate": 0.00014956118637202395, + "loss": 0.3204, + "step": 16145 + }, + { + "epoch": 1.3080038885288399, + "grad_norm": 0.04450956732034683, + "learning_rate": 0.00014955668571942933, + "loss": 0.3182, + "step": 16146 + }, + { + "epoch": 1.3080848995463383, + "grad_norm": 0.045555002987384796, + "learning_rate": 0.0001495521850668347, + "loss": 0.3486, + "step": 16147 + }, + { + "epoch": 1.3081659105638366, + "grad_norm": 0.04731540009379387, + "learning_rate": 0.00014954768441424008, + "loss": 0.3025, + "step": 16148 + }, + { + "epoch": 
1.308246921581335, + "grad_norm": 0.04336179420351982, + "learning_rate": 0.00014954318376164544, + "loss": 0.2982, + "step": 16149 + }, + { + "epoch": 1.3083279325988335, + "grad_norm": 0.04941492900252342, + "learning_rate": 0.0001495386831090508, + "loss": 0.3448, + "step": 16150 + }, + { + "epoch": 1.3084089436163318, + "grad_norm": 0.04972584545612335, + "learning_rate": 0.0001495341824564562, + "loss": 0.3649, + "step": 16151 + }, + { + "epoch": 1.3084899546338302, + "grad_norm": 0.04234988987445831, + "learning_rate": 0.00014952968180386158, + "loss": 0.3235, + "step": 16152 + }, + { + "epoch": 1.3085709656513287, + "grad_norm": 0.03894378989934921, + "learning_rate": 0.00014952518115126694, + "loss": 0.2583, + "step": 16153 + }, + { + "epoch": 1.308651976668827, + "grad_norm": 0.043541986495256424, + "learning_rate": 0.00014952068049867232, + "loss": 0.257, + "step": 16154 + }, + { + "epoch": 1.3087329876863254, + "grad_norm": 0.049867913126945496, + "learning_rate": 0.00014951617984607768, + "loss": 0.3288, + "step": 16155 + }, + { + "epoch": 1.3088139987038236, + "grad_norm": 0.044467225670814514, + "learning_rate": 0.00014951167919348304, + "loss": 0.3199, + "step": 16156 + }, + { + "epoch": 1.3088950097213221, + "grad_norm": 0.0463348850607872, + "learning_rate": 0.00014950717854088843, + "loss": 0.3653, + "step": 16157 + }, + { + "epoch": 1.3089760207388204, + "grad_norm": 0.043573103845119476, + "learning_rate": 0.00014950267788829382, + "loss": 0.2814, + "step": 16158 + }, + { + "epoch": 1.3090570317563188, + "grad_norm": 0.04555029794573784, + "learning_rate": 0.00014949817723569918, + "loss": 0.3303, + "step": 16159 + }, + { + "epoch": 1.3091380427738173, + "grad_norm": 0.04604102298617363, + "learning_rate": 0.00014949367658310456, + "loss": 0.2835, + "step": 16160 + }, + { + "epoch": 1.3092190537913155, + "grad_norm": 0.049761392176151276, + "learning_rate": 0.00014948917593050992, + "loss": 0.3303, + "step": 16161 + }, + { + "epoch": 1.309300064808814, + "grad_norm": 0.044333383440971375, + "learning_rate": 0.00014948467527791528, + "loss": 0.2645, + "step": 16162 + }, + { + "epoch": 1.3093810758263125, + "grad_norm": 0.04771001636981964, + "learning_rate": 0.00014948017462532067, + "loss": 0.2924, + "step": 16163 + }, + { + "epoch": 1.3094620868438107, + "grad_norm": 0.050036001950502396, + "learning_rate": 0.00014947567397272606, + "loss": 0.307, + "step": 16164 + }, + { + "epoch": 1.3095430978613092, + "grad_norm": 0.037328965961933136, + "learning_rate": 0.00014947117332013142, + "loss": 0.2602, + "step": 16165 + }, + { + "epoch": 1.3096241088788076, + "grad_norm": 0.04898840934038162, + "learning_rate": 0.0001494666726675368, + "loss": 0.3122, + "step": 16166 + }, + { + "epoch": 1.309705119896306, + "grad_norm": 0.04634608328342438, + "learning_rate": 0.00014946217201494217, + "loss": 0.2927, + "step": 16167 + }, + { + "epoch": 1.3097861309138044, + "grad_norm": 0.040241241455078125, + "learning_rate": 0.00014945767136234755, + "loss": 0.2361, + "step": 16168 + }, + { + "epoch": 1.3098671419313026, + "grad_norm": 0.03884800523519516, + "learning_rate": 0.00014945317070975294, + "loss": 0.2764, + "step": 16169 + }, + { + "epoch": 1.309948152948801, + "grad_norm": 0.04845311865210533, + "learning_rate": 0.0001494486700571583, + "loss": 0.2989, + "step": 16170 + }, + { + "epoch": 1.3100291639662993, + "grad_norm": 0.04241282120347023, + "learning_rate": 0.00014944416940456366, + "loss": 0.2814, + "step": 16171 + }, + { + "epoch": 1.3101101749837978, + "grad_norm": 
0.045538969337940216, + "learning_rate": 0.00014943966875196905, + "loss": 0.3064, + "step": 16172 + }, + { + "epoch": 1.3101911860012962, + "grad_norm": 0.045505356043577194, + "learning_rate": 0.0001494351680993744, + "loss": 0.3011, + "step": 16173 + }, + { + "epoch": 1.3102721970187945, + "grad_norm": 0.05274083465337753, + "learning_rate": 0.0001494306674467798, + "loss": 0.3548, + "step": 16174 + }, + { + "epoch": 1.310353208036293, + "grad_norm": 0.04924383759498596, + "learning_rate": 0.00014942616679418518, + "loss": 0.3182, + "step": 16175 + }, + { + "epoch": 1.3104342190537914, + "grad_norm": 0.04586834833025932, + "learning_rate": 0.00014942166614159054, + "loss": 0.2625, + "step": 16176 + }, + { + "epoch": 1.3105152300712897, + "grad_norm": 0.036821745336055756, + "learning_rate": 0.0001494171654889959, + "loss": 0.2864, + "step": 16177 + }, + { + "epoch": 1.3105962410887881, + "grad_norm": 0.0499521940946579, + "learning_rate": 0.0001494126648364013, + "loss": 0.3059, + "step": 16178 + }, + { + "epoch": 1.3106772521062864, + "grad_norm": 0.046249426901340485, + "learning_rate": 0.00014940816418380665, + "loss": 0.3361, + "step": 16179 + }, + { + "epoch": 1.3107582631237849, + "grad_norm": 0.04398550093173981, + "learning_rate": 0.00014940366353121204, + "loss": 0.2944, + "step": 16180 + }, + { + "epoch": 1.310839274141283, + "grad_norm": 0.04730512201786041, + "learning_rate": 0.00014939916287861742, + "loss": 0.2812, + "step": 16181 + }, + { + "epoch": 1.3109202851587816, + "grad_norm": 0.045634277164936066, + "learning_rate": 0.00014939466222602278, + "loss": 0.3237, + "step": 16182 + }, + { + "epoch": 1.31100129617628, + "grad_norm": 0.0385470911860466, + "learning_rate": 0.00014939016157342814, + "loss": 0.2575, + "step": 16183 + }, + { + "epoch": 1.3110823071937783, + "grad_norm": 0.0583970732986927, + "learning_rate": 0.00014938566092083353, + "loss": 0.326, + "step": 16184 + }, + { + "epoch": 1.3111633182112767, + "grad_norm": 0.0451340489089489, + "learning_rate": 0.0001493811602682389, + "loss": 0.3356, + "step": 16185 + }, + { + "epoch": 1.3112443292287752, + "grad_norm": 0.04714914783835411, + "learning_rate": 0.00014937665961564428, + "loss": 0.3159, + "step": 16186 + }, + { + "epoch": 1.3113253402462735, + "grad_norm": 0.04920654743909836, + "learning_rate": 0.00014937215896304967, + "loss": 0.3433, + "step": 16187 + }, + { + "epoch": 1.311406351263772, + "grad_norm": 0.047340042889118195, + "learning_rate": 0.00014936765831045503, + "loss": 0.3036, + "step": 16188 + }, + { + "epoch": 1.3114873622812704, + "grad_norm": 0.04961530491709709, + "learning_rate": 0.00014936315765786039, + "loss": 0.3524, + "step": 16189 + }, + { + "epoch": 1.3115683732987686, + "grad_norm": 0.047209519892930984, + "learning_rate": 0.00014935865700526577, + "loss": 0.283, + "step": 16190 + }, + { + "epoch": 1.3116493843162669, + "grad_norm": 0.04718087241053581, + "learning_rate": 0.00014935415635267113, + "loss": 0.3011, + "step": 16191 + }, + { + "epoch": 1.3117303953337653, + "grad_norm": 0.03914792090654373, + "learning_rate": 0.00014934965570007652, + "loss": 0.2609, + "step": 16192 + }, + { + "epoch": 1.3118114063512638, + "grad_norm": 0.0349646620452404, + "learning_rate": 0.0001493451550474819, + "loss": 0.2508, + "step": 16193 + }, + { + "epoch": 1.311892417368762, + "grad_norm": 0.04413396492600441, + "learning_rate": 0.00014934065439488727, + "loss": 0.3095, + "step": 16194 + }, + { + "epoch": 1.3119734283862605, + "grad_norm": 0.04602505639195442, + "learning_rate": 
0.00014933615374229263, + "loss": 0.3345, + "step": 16195 + }, + { + "epoch": 1.312054439403759, + "grad_norm": 0.046725794672966, + "learning_rate": 0.00014933165308969801, + "loss": 0.284, + "step": 16196 + }, + { + "epoch": 1.3121354504212572, + "grad_norm": 0.04499703273177147, + "learning_rate": 0.00014932715243710337, + "loss": 0.3166, + "step": 16197 + }, + { + "epoch": 1.3122164614387557, + "grad_norm": 0.04107973352074623, + "learning_rate": 0.00014932265178450876, + "loss": 0.2955, + "step": 16198 + }, + { + "epoch": 1.3122974724562542, + "grad_norm": 0.047366026788949966, + "learning_rate": 0.00014931815113191415, + "loss": 0.3459, + "step": 16199 + }, + { + "epoch": 1.3123784834737524, + "grad_norm": 0.046791817992925644, + "learning_rate": 0.0001493136504793195, + "loss": 0.3169, + "step": 16200 + }, + { + "epoch": 1.3124594944912509, + "grad_norm": 0.04345640167593956, + "learning_rate": 0.00014930914982672487, + "loss": 0.3244, + "step": 16201 + }, + { + "epoch": 1.3125405055087491, + "grad_norm": 0.049602605402469635, + "learning_rate": 0.00014930464917413026, + "loss": 0.3081, + "step": 16202 + }, + { + "epoch": 1.3126215165262476, + "grad_norm": 0.038922298699617386, + "learning_rate": 0.00014930014852153562, + "loss": 0.3138, + "step": 16203 + }, + { + "epoch": 1.3127025275437458, + "grad_norm": 0.038196027278900146, + "learning_rate": 0.000149295647868941, + "loss": 0.2738, + "step": 16204 + }, + { + "epoch": 1.3127835385612443, + "grad_norm": 0.04386325925588608, + "learning_rate": 0.0001492911472163464, + "loss": 0.2777, + "step": 16205 + }, + { + "epoch": 1.3128645495787428, + "grad_norm": 0.04231401905417442, + "learning_rate": 0.00014928664656375175, + "loss": 0.2798, + "step": 16206 + }, + { + "epoch": 1.312945560596241, + "grad_norm": 0.04036155715584755, + "learning_rate": 0.0001492821459111571, + "loss": 0.2626, + "step": 16207 + }, + { + "epoch": 1.3130265716137395, + "grad_norm": 0.044200096279382706, + "learning_rate": 0.0001492776452585625, + "loss": 0.2679, + "step": 16208 + }, + { + "epoch": 1.313107582631238, + "grad_norm": 0.050400812178850174, + "learning_rate": 0.00014927314460596786, + "loss": 0.2955, + "step": 16209 + }, + { + "epoch": 1.3131885936487362, + "grad_norm": 0.05265969783067703, + "learning_rate": 0.00014926864395337324, + "loss": 0.3352, + "step": 16210 + }, + { + "epoch": 1.3132696046662347, + "grad_norm": 0.05643681436777115, + "learning_rate": 0.00014926414330077863, + "loss": 0.3202, + "step": 16211 + }, + { + "epoch": 1.3133506156837331, + "grad_norm": 0.044375356286764145, + "learning_rate": 0.000149259642648184, + "loss": 0.2982, + "step": 16212 + }, + { + "epoch": 1.3134316267012314, + "grad_norm": 0.04609943926334381, + "learning_rate": 0.00014925514199558935, + "loss": 0.2706, + "step": 16213 + }, + { + "epoch": 1.3135126377187296, + "grad_norm": 0.04458443075418472, + "learning_rate": 0.00014925064134299474, + "loss": 0.3428, + "step": 16214 + }, + { + "epoch": 1.313593648736228, + "grad_norm": 0.043120864778757095, + "learning_rate": 0.0001492461406904001, + "loss": 0.314, + "step": 16215 + }, + { + "epoch": 1.3136746597537265, + "grad_norm": 0.04164094477891922, + "learning_rate": 0.0001492416400378055, + "loss": 0.2947, + "step": 16216 + }, + { + "epoch": 1.3137556707712248, + "grad_norm": 0.04162612557411194, + "learning_rate": 0.00014923713938521087, + "loss": 0.2416, + "step": 16217 + }, + { + "epoch": 1.3138366817887233, + "grad_norm": 0.04056188836693764, + "learning_rate": 0.00014923263873261623, + "loss": 0.295, + 
"step": 16218 + }, + { + "epoch": 1.3139176928062217, + "grad_norm": 0.04554165154695511, + "learning_rate": 0.0001492281380800216, + "loss": 0.2958, + "step": 16219 + }, + { + "epoch": 1.31399870382372, + "grad_norm": 0.03735050931572914, + "learning_rate": 0.00014922363742742698, + "loss": 0.2633, + "step": 16220 + }, + { + "epoch": 1.3140797148412184, + "grad_norm": 0.04008159786462784, + "learning_rate": 0.00014921913677483237, + "loss": 0.2751, + "step": 16221 + }, + { + "epoch": 1.314160725858717, + "grad_norm": 0.044225722551345825, + "learning_rate": 0.00014921463612223773, + "loss": 0.3397, + "step": 16222 + }, + { + "epoch": 1.3142417368762151, + "grad_norm": 0.043361153453588486, + "learning_rate": 0.00014921013546964312, + "loss": 0.2926, + "step": 16223 + }, + { + "epoch": 1.3143227478937136, + "grad_norm": 0.052007514983415604, + "learning_rate": 0.00014920563481704848, + "loss": 0.3254, + "step": 16224 + }, + { + "epoch": 1.3144037589112119, + "grad_norm": 0.04618023708462715, + "learning_rate": 0.00014920113416445384, + "loss": 0.297, + "step": 16225 + }, + { + "epoch": 1.3144847699287103, + "grad_norm": 0.036869119852781296, + "learning_rate": 0.00014919663351185922, + "loss": 0.2568, + "step": 16226 + }, + { + "epoch": 1.3145657809462086, + "grad_norm": 0.041353028267621994, + "learning_rate": 0.0001491921328592646, + "loss": 0.3196, + "step": 16227 + }, + { + "epoch": 1.314646791963707, + "grad_norm": 0.04122963547706604, + "learning_rate": 0.00014918763220666997, + "loss": 0.2587, + "step": 16228 + }, + { + "epoch": 1.3147278029812055, + "grad_norm": 0.04404785856604576, + "learning_rate": 0.00014918313155407536, + "loss": 0.292, + "step": 16229 + }, + { + "epoch": 1.3148088139987038, + "grad_norm": 0.04821160435676575, + "learning_rate": 0.00014917863090148072, + "loss": 0.3102, + "step": 16230 + }, + { + "epoch": 1.3148898250162022, + "grad_norm": 0.04912686347961426, + "learning_rate": 0.00014917413024888608, + "loss": 0.3306, + "step": 16231 + }, + { + "epoch": 1.3149708360337007, + "grad_norm": 0.03853919729590416, + "learning_rate": 0.00014916962959629146, + "loss": 0.2546, + "step": 16232 + }, + { + "epoch": 1.315051847051199, + "grad_norm": 0.05155406892299652, + "learning_rate": 0.00014916512894369685, + "loss": 0.3081, + "step": 16233 + }, + { + "epoch": 1.3151328580686974, + "grad_norm": 0.04806877300143242, + "learning_rate": 0.0001491606282911022, + "loss": 0.2985, + "step": 16234 + }, + { + "epoch": 1.3152138690861956, + "grad_norm": 0.04302647337317467, + "learning_rate": 0.0001491561276385076, + "loss": 0.2841, + "step": 16235 + }, + { + "epoch": 1.315294880103694, + "grad_norm": 0.05157895386219025, + "learning_rate": 0.00014915162698591296, + "loss": 0.3008, + "step": 16236 + }, + { + "epoch": 1.3153758911211924, + "grad_norm": 0.04015476256608963, + "learning_rate": 0.00014914712633331835, + "loss": 0.2475, + "step": 16237 + }, + { + "epoch": 1.3154569021386908, + "grad_norm": 0.04195760563015938, + "learning_rate": 0.0001491426256807237, + "loss": 0.2656, + "step": 16238 + }, + { + "epoch": 1.3155379131561893, + "grad_norm": 0.04301521182060242, + "learning_rate": 0.0001491381250281291, + "loss": 0.2797, + "step": 16239 + }, + { + "epoch": 1.3156189241736875, + "grad_norm": 0.03678332269191742, + "learning_rate": 0.00014913362437553445, + "loss": 0.2708, + "step": 16240 + }, + { + "epoch": 1.315699935191186, + "grad_norm": 0.04338261857628822, + "learning_rate": 0.00014912912372293984, + "loss": 0.284, + "step": 16241 + }, + { + "epoch": 
1.3157809462086845, + "grad_norm": 0.04346082732081413, + "learning_rate": 0.0001491246230703452, + "loss": 0.3124, + "step": 16242 + }, + { + "epoch": 1.3158619572261827, + "grad_norm": 0.046911466866731644, + "learning_rate": 0.0001491201224177506, + "loss": 0.3336, + "step": 16243 + }, + { + "epoch": 1.3159429682436812, + "grad_norm": 0.045720525085926056, + "learning_rate": 0.00014911562176515595, + "loss": 0.3069, + "step": 16244 + }, + { + "epoch": 1.3160239792611796, + "grad_norm": 0.04304931312799454, + "learning_rate": 0.00014911112111256133, + "loss": 0.325, + "step": 16245 + }, + { + "epoch": 1.3161049902786779, + "grad_norm": 0.04115717113018036, + "learning_rate": 0.0001491066204599667, + "loss": 0.2686, + "step": 16246 + }, + { + "epoch": 1.3161860012961764, + "grad_norm": 0.04282047972083092, + "learning_rate": 0.00014910211980737208, + "loss": 0.2918, + "step": 16247 + }, + { + "epoch": 1.3162670123136746, + "grad_norm": 0.041621021926403046, + "learning_rate": 0.00014909761915477744, + "loss": 0.2957, + "step": 16248 + }, + { + "epoch": 1.316348023331173, + "grad_norm": 0.052234623581171036, + "learning_rate": 0.00014909311850218283, + "loss": 0.3141, + "step": 16249 + }, + { + "epoch": 1.3164290343486713, + "grad_norm": 0.04725967347621918, + "learning_rate": 0.00014908861784958822, + "loss": 0.3002, + "step": 16250 + }, + { + "epoch": 1.3165100453661698, + "grad_norm": 0.04024965688586235, + "learning_rate": 0.00014908411719699358, + "loss": 0.2501, + "step": 16251 + }, + { + "epoch": 1.3165910563836682, + "grad_norm": 0.04805604740977287, + "learning_rate": 0.00014907961654439894, + "loss": 0.3048, + "step": 16252 + }, + { + "epoch": 1.3166720674011665, + "grad_norm": 0.03625888749957085, + "learning_rate": 0.00014907511589180432, + "loss": 0.257, + "step": 16253 + }, + { + "epoch": 1.316753078418665, + "grad_norm": 0.041517432779073715, + "learning_rate": 0.00014907061523920968, + "loss": 0.2926, + "step": 16254 + }, + { + "epoch": 1.3168340894361634, + "grad_norm": 0.03820646181702614, + "learning_rate": 0.00014906611458661507, + "loss": 0.2643, + "step": 16255 + }, + { + "epoch": 1.3169151004536617, + "grad_norm": 0.04265735298395157, + "learning_rate": 0.00014906161393402046, + "loss": 0.2882, + "step": 16256 + }, + { + "epoch": 1.3169961114711601, + "grad_norm": 0.04392039030790329, + "learning_rate": 0.00014905711328142582, + "loss": 0.2883, + "step": 16257 + }, + { + "epoch": 1.3170771224886584, + "grad_norm": 0.05036157742142677, + "learning_rate": 0.00014905261262883118, + "loss": 0.2946, + "step": 16258 + }, + { + "epoch": 1.3171581335061568, + "grad_norm": 0.042300909757614136, + "learning_rate": 0.00014904811197623656, + "loss": 0.2485, + "step": 16259 + }, + { + "epoch": 1.317239144523655, + "grad_norm": 0.04726019129157066, + "learning_rate": 0.00014904361132364192, + "loss": 0.2971, + "step": 16260 + }, + { + "epoch": 1.3173201555411536, + "grad_norm": 0.04520502686500549, + "learning_rate": 0.0001490391106710473, + "loss": 0.3104, + "step": 16261 + }, + { + "epoch": 1.317401166558652, + "grad_norm": 0.04767908155918121, + "learning_rate": 0.0001490346100184527, + "loss": 0.3258, + "step": 16262 + }, + { + "epoch": 1.3174821775761503, + "grad_norm": 0.04345300421118736, + "learning_rate": 0.00014903010936585806, + "loss": 0.2666, + "step": 16263 + }, + { + "epoch": 1.3175631885936487, + "grad_norm": 0.0426764152944088, + "learning_rate": 0.00014902560871326342, + "loss": 0.2775, + "step": 16264 + }, + { + "epoch": 1.3176441996111472, + "grad_norm": 
0.045147597789764404, + "learning_rate": 0.0001490211080606688, + "loss": 0.3101, + "step": 16265 + }, + { + "epoch": 1.3177252106286454, + "grad_norm": 0.04169589281082153, + "learning_rate": 0.00014901660740807417, + "loss": 0.3093, + "step": 16266 + }, + { + "epoch": 1.317806221646144, + "grad_norm": 0.04596323519945145, + "learning_rate": 0.00014901210675547955, + "loss": 0.3083, + "step": 16267 + }, + { + "epoch": 1.3178872326636424, + "grad_norm": 0.040057264268398285, + "learning_rate": 0.00014900760610288494, + "loss": 0.2925, + "step": 16268 + }, + { + "epoch": 1.3179682436811406, + "grad_norm": 0.04390899837017059, + "learning_rate": 0.0001490031054502903, + "loss": 0.3158, + "step": 16269 + }, + { + "epoch": 1.318049254698639, + "grad_norm": 0.04600910469889641, + "learning_rate": 0.00014899860479769566, + "loss": 0.2962, + "step": 16270 + }, + { + "epoch": 1.3181302657161373, + "grad_norm": 0.04342671483755112, + "learning_rate": 0.00014899410414510105, + "loss": 0.2998, + "step": 16271 + }, + { + "epoch": 1.3182112767336358, + "grad_norm": 0.0377708300948143, + "learning_rate": 0.0001489896034925064, + "loss": 0.2635, + "step": 16272 + }, + { + "epoch": 1.318292287751134, + "grad_norm": 0.04475024342536926, + "learning_rate": 0.0001489851028399118, + "loss": 0.3128, + "step": 16273 + }, + { + "epoch": 1.3183732987686325, + "grad_norm": 0.045971859246492386, + "learning_rate": 0.00014898060218731718, + "loss": 0.2968, + "step": 16274 + }, + { + "epoch": 1.318454309786131, + "grad_norm": 0.046566497534513474, + "learning_rate": 0.00014897610153472254, + "loss": 0.3358, + "step": 16275 + }, + { + "epoch": 1.3185353208036292, + "grad_norm": 0.04151848703622818, + "learning_rate": 0.0001489716008821279, + "loss": 0.2838, + "step": 16276 + }, + { + "epoch": 1.3186163318211277, + "grad_norm": 0.050141703337430954, + "learning_rate": 0.0001489671002295333, + "loss": 0.3329, + "step": 16277 + }, + { + "epoch": 1.3186973428386262, + "grad_norm": 0.04717142507433891, + "learning_rate": 0.00014896259957693865, + "loss": 0.2995, + "step": 16278 + }, + { + "epoch": 1.3187783538561244, + "grad_norm": 0.036442629992961884, + "learning_rate": 0.00014895809892434404, + "loss": 0.2636, + "step": 16279 + }, + { + "epoch": 1.3188593648736229, + "grad_norm": 0.04248730465769768, + "learning_rate": 0.00014895359827174942, + "loss": 0.2475, + "step": 16280 + }, + { + "epoch": 1.3189403758911211, + "grad_norm": 0.04148675128817558, + "learning_rate": 0.00014894909761915478, + "loss": 0.3094, + "step": 16281 + }, + { + "epoch": 1.3190213869086196, + "grad_norm": 0.041609227657318115, + "learning_rate": 0.00014894459696656014, + "loss": 0.2662, + "step": 16282 + }, + { + "epoch": 1.3191023979261178, + "grad_norm": 0.037535324692726135, + "learning_rate": 0.00014894009631396553, + "loss": 0.2697, + "step": 16283 + }, + { + "epoch": 1.3191834089436163, + "grad_norm": 0.04177939519286156, + "learning_rate": 0.0001489355956613709, + "loss": 0.2829, + "step": 16284 + }, + { + "epoch": 1.3192644199611148, + "grad_norm": 0.04366298019886017, + "learning_rate": 0.00014893109500877628, + "loss": 0.2721, + "step": 16285 + }, + { + "epoch": 1.319345430978613, + "grad_norm": 0.05289445072412491, + "learning_rate": 0.00014892659435618167, + "loss": 0.3097, + "step": 16286 + }, + { + "epoch": 1.3194264419961115, + "grad_norm": 0.05009431391954422, + "learning_rate": 0.00014892209370358703, + "loss": 0.3314, + "step": 16287 + }, + { + "epoch": 1.31950745301361, + "grad_norm": 0.044810205698013306, + "learning_rate": 
0.00014891759305099239, + "loss": 0.2782, + "step": 16288 + }, + { + "epoch": 1.3195884640311082, + "grad_norm": 0.04492989182472229, + "learning_rate": 0.00014891309239839777, + "loss": 0.2969, + "step": 16289 + }, + { + "epoch": 1.3196694750486067, + "grad_norm": 0.043226175010204315, + "learning_rate": 0.00014890859174580313, + "loss": 0.2892, + "step": 16290 + }, + { + "epoch": 1.3197504860661051, + "grad_norm": 0.043849408626556396, + "learning_rate": 0.00014890409109320852, + "loss": 0.2898, + "step": 16291 + }, + { + "epoch": 1.3198314970836034, + "grad_norm": 0.04191071540117264, + "learning_rate": 0.0001488995904406139, + "loss": 0.2868, + "step": 16292 + }, + { + "epoch": 1.3199125081011016, + "grad_norm": 0.046212535351514816, + "learning_rate": 0.00014889508978801927, + "loss": 0.3105, + "step": 16293 + }, + { + "epoch": 1.3199935191186, + "grad_norm": 0.04912080615758896, + "learning_rate": 0.00014889058913542463, + "loss": 0.3092, + "step": 16294 + }, + { + "epoch": 1.3200745301360985, + "grad_norm": 0.039494119584560394, + "learning_rate": 0.00014888608848283001, + "loss": 0.301, + "step": 16295 + }, + { + "epoch": 1.3201555411535968, + "grad_norm": 0.04608963057398796, + "learning_rate": 0.00014888158783023537, + "loss": 0.3144, + "step": 16296 + }, + { + "epoch": 1.3202365521710953, + "grad_norm": 0.04640983045101166, + "learning_rate": 0.00014887708717764076, + "loss": 0.319, + "step": 16297 + }, + { + "epoch": 1.3203175631885937, + "grad_norm": 0.05373993143439293, + "learning_rate": 0.00014887258652504615, + "loss": 0.2999, + "step": 16298 + }, + { + "epoch": 1.320398574206092, + "grad_norm": 0.04362735524773598, + "learning_rate": 0.0001488680858724515, + "loss": 0.2942, + "step": 16299 + }, + { + "epoch": 1.3204795852235904, + "grad_norm": 0.04731497913599014, + "learning_rate": 0.00014886358521985687, + "loss": 0.3022, + "step": 16300 + }, + { + "epoch": 1.320560596241089, + "grad_norm": 0.052373286336660385, + "learning_rate": 0.00014885908456726226, + "loss": 0.2766, + "step": 16301 + }, + { + "epoch": 1.3206416072585871, + "grad_norm": 0.04618718847632408, + "learning_rate": 0.00014885458391466764, + "loss": 0.2711, + "step": 16302 + }, + { + "epoch": 1.3207226182760856, + "grad_norm": 0.03901274874806404, + "learning_rate": 0.000148850083262073, + "loss": 0.2865, + "step": 16303 + }, + { + "epoch": 1.3208036292935839, + "grad_norm": 0.04215940088033676, + "learning_rate": 0.0001488455826094784, + "loss": 0.2837, + "step": 16304 + }, + { + "epoch": 1.3208846403110823, + "grad_norm": 0.04067666083574295, + "learning_rate": 0.00014884108195688375, + "loss": 0.2676, + "step": 16305 + }, + { + "epoch": 1.3209656513285806, + "grad_norm": 0.042485661804676056, + "learning_rate": 0.00014883658130428914, + "loss": 0.3269, + "step": 16306 + }, + { + "epoch": 1.321046662346079, + "grad_norm": 0.042939670383930206, + "learning_rate": 0.0001488320806516945, + "loss": 0.2639, + "step": 16307 + }, + { + "epoch": 1.3211276733635775, + "grad_norm": 0.04631086438894272, + "learning_rate": 0.00014882757999909989, + "loss": 0.3038, + "step": 16308 + }, + { + "epoch": 1.3212086843810757, + "grad_norm": 0.05879298225045204, + "learning_rate": 0.00014882307934650525, + "loss": 0.3235, + "step": 16309 + }, + { + "epoch": 1.3212896953985742, + "grad_norm": 0.04729129374027252, + "learning_rate": 0.00014881857869391063, + "loss": 0.3623, + "step": 16310 + }, + { + "epoch": 1.3213707064160727, + "grad_norm": 0.04663609713315964, + "learning_rate": 0.000148814078041316, + "loss": 0.3376, + 
"step": 16311 + }, + { + "epoch": 1.321451717433571, + "grad_norm": 0.03580520674586296, + "learning_rate": 0.00014880957738872138, + "loss": 0.278, + "step": 16312 + }, + { + "epoch": 1.3215327284510694, + "grad_norm": 0.04505905508995056, + "learning_rate": 0.00014880507673612674, + "loss": 0.2813, + "step": 16313 + }, + { + "epoch": 1.3216137394685679, + "grad_norm": 0.04475625604391098, + "learning_rate": 0.00014880057608353213, + "loss": 0.2851, + "step": 16314 + }, + { + "epoch": 1.321694750486066, + "grad_norm": 0.04107343405485153, + "learning_rate": 0.0001487960754309375, + "loss": 0.2649, + "step": 16315 + }, + { + "epoch": 1.3217757615035644, + "grad_norm": 0.04097530618309975, + "learning_rate": 0.00014879157477834287, + "loss": 0.2806, + "step": 16316 + }, + { + "epoch": 1.3218567725210628, + "grad_norm": 0.04211768880486488, + "learning_rate": 0.00014878707412574823, + "loss": 0.2702, + "step": 16317 + }, + { + "epoch": 1.3219377835385613, + "grad_norm": 0.043108146637678146, + "learning_rate": 0.00014878257347315362, + "loss": 0.2729, + "step": 16318 + }, + { + "epoch": 1.3220187945560595, + "grad_norm": 0.04378829896450043, + "learning_rate": 0.00014877807282055898, + "loss": 0.2976, + "step": 16319 + }, + { + "epoch": 1.322099805573558, + "grad_norm": 0.04426180198788643, + "learning_rate": 0.00014877357216796437, + "loss": 0.3138, + "step": 16320 + }, + { + "epoch": 1.3221808165910565, + "grad_norm": 0.05119778588414192, + "learning_rate": 0.00014876907151536973, + "loss": 0.3341, + "step": 16321 + }, + { + "epoch": 1.3222618276085547, + "grad_norm": 0.04219551756978035, + "learning_rate": 0.00014876457086277512, + "loss": 0.304, + "step": 16322 + }, + { + "epoch": 1.3223428386260532, + "grad_norm": 0.04721548780798912, + "learning_rate": 0.00014876007021018048, + "loss": 0.3352, + "step": 16323 + }, + { + "epoch": 1.3224238496435516, + "grad_norm": 0.046636637300252914, + "learning_rate": 0.00014875556955758586, + "loss": 0.3165, + "step": 16324 + }, + { + "epoch": 1.3225048606610499, + "grad_norm": 0.04913676157593727, + "learning_rate": 0.00014875106890499122, + "loss": 0.3119, + "step": 16325 + }, + { + "epoch": 1.3225858716785484, + "grad_norm": 0.043943922966718674, + "learning_rate": 0.0001487465682523966, + "loss": 0.2866, + "step": 16326 + }, + { + "epoch": 1.3226668826960466, + "grad_norm": 0.04472362995147705, + "learning_rate": 0.00014874206759980197, + "loss": 0.2931, + "step": 16327 + }, + { + "epoch": 1.322747893713545, + "grad_norm": 0.035618189722299576, + "learning_rate": 0.00014873756694720736, + "loss": 0.2295, + "step": 16328 + }, + { + "epoch": 1.3228289047310433, + "grad_norm": 0.04164008051156998, + "learning_rate": 0.00014873306629461272, + "loss": 0.2838, + "step": 16329 + }, + { + "epoch": 1.3229099157485418, + "grad_norm": 0.05346325784921646, + "learning_rate": 0.0001487285656420181, + "loss": 0.3446, + "step": 16330 + }, + { + "epoch": 1.3229909267660402, + "grad_norm": 0.04301166161894798, + "learning_rate": 0.0001487240649894235, + "loss": 0.2922, + "step": 16331 + }, + { + "epoch": 1.3230719377835385, + "grad_norm": 0.04445755109190941, + "learning_rate": 0.00014871956433682885, + "loss": 0.2521, + "step": 16332 + }, + { + "epoch": 1.323152948801037, + "grad_norm": 0.04274790361523628, + "learning_rate": 0.0001487150636842342, + "loss": 0.2775, + "step": 16333 + }, + { + "epoch": 1.3232339598185354, + "grad_norm": 0.047810930758714676, + "learning_rate": 0.0001487105630316396, + "loss": 0.3125, + "step": 16334 + }, + { + "epoch": 
1.3233149708360337, + "grad_norm": 0.04470434412360191, + "learning_rate": 0.00014870606237904496, + "loss": 0.2963, + "step": 16335 + }, + { + "epoch": 1.3233959818535321, + "grad_norm": 0.05826588347554207, + "learning_rate": 0.00014870156172645035, + "loss": 0.3744, + "step": 16336 + }, + { + "epoch": 1.3234769928710304, + "grad_norm": 0.04465784505009651, + "learning_rate": 0.00014869706107385573, + "loss": 0.2895, + "step": 16337 + }, + { + "epoch": 1.3235580038885288, + "grad_norm": 0.04440483823418617, + "learning_rate": 0.0001486925604212611, + "loss": 0.2829, + "step": 16338 + }, + { + "epoch": 1.323639014906027, + "grad_norm": 0.04333437606692314, + "learning_rate": 0.00014868805976866645, + "loss": 0.29, + "step": 16339 + }, + { + "epoch": 1.3237200259235256, + "grad_norm": 0.04430120065808296, + "learning_rate": 0.00014868355911607184, + "loss": 0.2673, + "step": 16340 + }, + { + "epoch": 1.323801036941024, + "grad_norm": 0.04287057742476463, + "learning_rate": 0.0001486790584634772, + "loss": 0.3074, + "step": 16341 + }, + { + "epoch": 1.3238820479585223, + "grad_norm": 0.045032232999801636, + "learning_rate": 0.0001486745578108826, + "loss": 0.3032, + "step": 16342 + }, + { + "epoch": 1.3239630589760207, + "grad_norm": 0.046202123165130615, + "learning_rate": 0.00014867005715828797, + "loss": 0.2965, + "step": 16343 + }, + { + "epoch": 1.3240440699935192, + "grad_norm": 0.042404867708683014, + "learning_rate": 0.00014866555650569333, + "loss": 0.3006, + "step": 16344 + }, + { + "epoch": 1.3241250810110174, + "grad_norm": 0.045439064502716064, + "learning_rate": 0.0001486610558530987, + "loss": 0.2766, + "step": 16345 + }, + { + "epoch": 1.324206092028516, + "grad_norm": 0.045394547283649445, + "learning_rate": 0.00014865655520050408, + "loss": 0.271, + "step": 16346 + }, + { + "epoch": 1.3242871030460144, + "grad_norm": 0.04254448786377907, + "learning_rate": 0.00014865205454790944, + "loss": 0.3006, + "step": 16347 + }, + { + "epoch": 1.3243681140635126, + "grad_norm": 0.05081197991967201, + "learning_rate": 0.00014864755389531483, + "loss": 0.3272, + "step": 16348 + }, + { + "epoch": 1.324449125081011, + "grad_norm": 0.05010443925857544, + "learning_rate": 0.00014864305324272022, + "loss": 0.3003, + "step": 16349 + }, + { + "epoch": 1.3245301360985093, + "grad_norm": 0.04016037657856941, + "learning_rate": 0.00014863855259012558, + "loss": 0.285, + "step": 16350 + }, + { + "epoch": 1.3246111471160078, + "grad_norm": 0.04254277050495148, + "learning_rate": 0.00014863405193753094, + "loss": 0.2802, + "step": 16351 + }, + { + "epoch": 1.324692158133506, + "grad_norm": 0.048468347638845444, + "learning_rate": 0.00014862955128493632, + "loss": 0.2796, + "step": 16352 + }, + { + "epoch": 1.3247731691510045, + "grad_norm": 0.03761716187000275, + "learning_rate": 0.00014862505063234168, + "loss": 0.2681, + "step": 16353 + }, + { + "epoch": 1.324854180168503, + "grad_norm": 0.04276173934340477, + "learning_rate": 0.00014862054997974707, + "loss": 0.2996, + "step": 16354 + }, + { + "epoch": 1.3249351911860012, + "grad_norm": 0.044217273592948914, + "learning_rate": 0.00014861604932715246, + "loss": 0.2882, + "step": 16355 + }, + { + "epoch": 1.3250162022034997, + "grad_norm": 0.0482565313577652, + "learning_rate": 0.00014861154867455782, + "loss": 0.3273, + "step": 16356 + }, + { + "epoch": 1.3250972132209982, + "grad_norm": 0.04667958989739418, + "learning_rate": 0.00014860704802196318, + "loss": 0.3081, + "step": 16357 + }, + { + "epoch": 1.3251782242384964, + "grad_norm": 
0.04477810487151146, + "learning_rate": 0.00014860254736936857, + "loss": 0.2585, + "step": 16358 + }, + { + "epoch": 1.3252592352559949, + "grad_norm": 0.04654529318213463, + "learning_rate": 0.00014859804671677393, + "loss": 0.3354, + "step": 16359 + }, + { + "epoch": 1.3253402462734931, + "grad_norm": 0.04797455668449402, + "learning_rate": 0.0001485935460641793, + "loss": 0.338, + "step": 16360 + }, + { + "epoch": 1.3254212572909916, + "grad_norm": 0.039246611297130585, + "learning_rate": 0.0001485890454115847, + "loss": 0.2936, + "step": 16361 + }, + { + "epoch": 1.3255022683084898, + "grad_norm": 0.04272114485502243, + "learning_rate": 0.00014858454475899006, + "loss": 0.2724, + "step": 16362 + }, + { + "epoch": 1.3255832793259883, + "grad_norm": 0.0355217345058918, + "learning_rate": 0.00014858004410639542, + "loss": 0.2761, + "step": 16363 + }, + { + "epoch": 1.3256642903434868, + "grad_norm": 0.040763113647699356, + "learning_rate": 0.0001485755434538008, + "loss": 0.2998, + "step": 16364 + }, + { + "epoch": 1.325745301360985, + "grad_norm": 0.04279746860265732, + "learning_rate": 0.00014857104280120617, + "loss": 0.2933, + "step": 16365 + }, + { + "epoch": 1.3258263123784835, + "grad_norm": 0.04600545018911362, + "learning_rate": 0.00014856654214861155, + "loss": 0.3289, + "step": 16366 + }, + { + "epoch": 1.325907323395982, + "grad_norm": 0.04490230605006218, + "learning_rate": 0.00014856204149601694, + "loss": 0.3455, + "step": 16367 + }, + { + "epoch": 1.3259883344134802, + "grad_norm": 0.047660212963819504, + "learning_rate": 0.0001485575408434223, + "loss": 0.31, + "step": 16368 + }, + { + "epoch": 1.3260693454309787, + "grad_norm": 0.05434110388159752, + "learning_rate": 0.00014855304019082766, + "loss": 0.357, + "step": 16369 + }, + { + "epoch": 1.3261503564484771, + "grad_norm": 0.04194248095154762, + "learning_rate": 0.00014854853953823305, + "loss": 0.2769, + "step": 16370 + }, + { + "epoch": 1.3262313674659754, + "grad_norm": 0.04256175085902214, + "learning_rate": 0.0001485440388856384, + "loss": 0.3126, + "step": 16371 + }, + { + "epoch": 1.3263123784834738, + "grad_norm": 0.04505886137485504, + "learning_rate": 0.0001485395382330438, + "loss": 0.3247, + "step": 16372 + }, + { + "epoch": 1.326393389500972, + "grad_norm": 0.05674450099468231, + "learning_rate": 0.00014853503758044918, + "loss": 0.3143, + "step": 16373 + }, + { + "epoch": 1.3264744005184705, + "grad_norm": 0.055465519428253174, + "learning_rate": 0.00014853053692785454, + "loss": 0.3387, + "step": 16374 + }, + { + "epoch": 1.3265554115359688, + "grad_norm": 0.04465511441230774, + "learning_rate": 0.00014852603627525993, + "loss": 0.29, + "step": 16375 + }, + { + "epoch": 1.3266364225534673, + "grad_norm": 0.04995818808674812, + "learning_rate": 0.0001485215356226653, + "loss": 0.3619, + "step": 16376 + }, + { + "epoch": 1.3267174335709657, + "grad_norm": 0.04046409949660301, + "learning_rate": 0.00014851703497007065, + "loss": 0.3127, + "step": 16377 + }, + { + "epoch": 1.326798444588464, + "grad_norm": 0.04179410636425018, + "learning_rate": 0.00014851253431747604, + "loss": 0.2956, + "step": 16378 + }, + { + "epoch": 1.3268794556059624, + "grad_norm": 0.04889443516731262, + "learning_rate": 0.00014850803366488142, + "loss": 0.3252, + "step": 16379 + }, + { + "epoch": 1.326960466623461, + "grad_norm": 0.04586949199438095, + "learning_rate": 0.00014850353301228678, + "loss": 0.2492, + "step": 16380 + }, + { + "epoch": 1.3270414776409591, + "grad_norm": 0.046670492738485336, + "learning_rate": 
0.00014849903235969217, + "loss": 0.3469, + "step": 16381 + }, + { + "epoch": 1.3271224886584576, + "grad_norm": 0.04176964983344078, + "learning_rate": 0.00014849453170709753, + "loss": 0.2757, + "step": 16382 + }, + { + "epoch": 1.3272034996759559, + "grad_norm": 0.04027750715613365, + "learning_rate": 0.00014849003105450292, + "loss": 0.2785, + "step": 16383 + }, + { + "epoch": 1.3272845106934543, + "grad_norm": 0.04773688316345215, + "learning_rate": 0.00014848553040190828, + "loss": 0.2931, + "step": 16384 + }, + { + "epoch": 1.3273655217109526, + "grad_norm": 0.045987844467163086, + "learning_rate": 0.00014848102974931367, + "loss": 0.2867, + "step": 16385 + }, + { + "epoch": 1.327446532728451, + "grad_norm": 0.03335455805063248, + "learning_rate": 0.00014847652909671903, + "loss": 0.2578, + "step": 16386 + }, + { + "epoch": 1.3275275437459495, + "grad_norm": 0.048070844262838364, + "learning_rate": 0.0001484720284441244, + "loss": 0.316, + "step": 16387 + }, + { + "epoch": 1.3276085547634477, + "grad_norm": 0.05480222404003143, + "learning_rate": 0.00014846752779152977, + "loss": 0.3444, + "step": 16388 + }, + { + "epoch": 1.3276895657809462, + "grad_norm": 0.05221012607216835, + "learning_rate": 0.00014846302713893516, + "loss": 0.3302, + "step": 16389 + }, + { + "epoch": 1.3277705767984447, + "grad_norm": 0.04818179830908775, + "learning_rate": 0.00014845852648634052, + "loss": 0.305, + "step": 16390 + }, + { + "epoch": 1.327851587815943, + "grad_norm": 0.04937390983104706, + "learning_rate": 0.0001484540258337459, + "loss": 0.3346, + "step": 16391 + }, + { + "epoch": 1.3279325988334414, + "grad_norm": 0.041629184037446976, + "learning_rate": 0.00014844952518115127, + "loss": 0.3057, + "step": 16392 + }, + { + "epoch": 1.3280136098509399, + "grad_norm": 0.04728194326162338, + "learning_rate": 0.00014844502452855665, + "loss": 0.2956, + "step": 16393 + }, + { + "epoch": 1.328094620868438, + "grad_norm": 0.0420515350997448, + "learning_rate": 0.00014844052387596201, + "loss": 0.2648, + "step": 16394 + }, + { + "epoch": 1.3281756318859366, + "grad_norm": 0.045087482780218124, + "learning_rate": 0.0001484360232233674, + "loss": 0.3079, + "step": 16395 + }, + { + "epoch": 1.3282566429034348, + "grad_norm": 0.04409553483128548, + "learning_rate": 0.00014843152257077276, + "loss": 0.3124, + "step": 16396 + }, + { + "epoch": 1.3283376539209333, + "grad_norm": 0.043479710817337036, + "learning_rate": 0.00014842702191817815, + "loss": 0.2951, + "step": 16397 + }, + { + "epoch": 1.3284186649384315, + "grad_norm": 0.049205128103494644, + "learning_rate": 0.0001484225212655835, + "loss": 0.297, + "step": 16398 + }, + { + "epoch": 1.32849967595593, + "grad_norm": 0.05884668231010437, + "learning_rate": 0.0001484180206129889, + "loss": 0.3413, + "step": 16399 + }, + { + "epoch": 1.3285806869734285, + "grad_norm": 0.04603936895728111, + "learning_rate": 0.00014841351996039426, + "loss": 0.2662, + "step": 16400 + }, + { + "epoch": 1.3286616979909267, + "grad_norm": 0.04407604783773422, + "learning_rate": 0.00014840901930779964, + "loss": 0.3168, + "step": 16401 + }, + { + "epoch": 1.3287427090084252, + "grad_norm": 0.051175616681575775, + "learning_rate": 0.000148404518655205, + "loss": 0.3223, + "step": 16402 + }, + { + "epoch": 1.3288237200259236, + "grad_norm": 0.053574636578559875, + "learning_rate": 0.0001484000180026104, + "loss": 0.3955, + "step": 16403 + }, + { + "epoch": 1.3289047310434219, + "grad_norm": 0.0408712662756443, + "learning_rate": 0.00014839551735001575, + "loss": 0.3262, + 
"step": 16404 + }, + { + "epoch": 1.3289857420609203, + "grad_norm": 0.038339763879776, + "learning_rate": 0.00014839101669742114, + "loss": 0.2753, + "step": 16405 + }, + { + "epoch": 1.3290667530784186, + "grad_norm": 0.045235779136419296, + "learning_rate": 0.00014838651604482653, + "loss": 0.3101, + "step": 16406 + }, + { + "epoch": 1.329147764095917, + "grad_norm": 0.049895305186510086, + "learning_rate": 0.00014838201539223189, + "loss": 0.3351, + "step": 16407 + }, + { + "epoch": 1.3292287751134153, + "grad_norm": 0.0396096333861351, + "learning_rate": 0.00014837751473963725, + "loss": 0.2719, + "step": 16408 + }, + { + "epoch": 1.3293097861309138, + "grad_norm": 0.041300006210803986, + "learning_rate": 0.00014837301408704263, + "loss": 0.2878, + "step": 16409 + }, + { + "epoch": 1.3293907971484122, + "grad_norm": 0.04082218557596207, + "learning_rate": 0.000148368513434448, + "loss": 0.3059, + "step": 16410 + }, + { + "epoch": 1.3294718081659105, + "grad_norm": 0.04691319167613983, + "learning_rate": 0.00014836401278185338, + "loss": 0.3249, + "step": 16411 + }, + { + "epoch": 1.329552819183409, + "grad_norm": 0.037029873579740524, + "learning_rate": 0.00014835951212925877, + "loss": 0.2554, + "step": 16412 + }, + { + "epoch": 1.3296338302009074, + "grad_norm": 0.048528771847486496, + "learning_rate": 0.00014835501147666413, + "loss": 0.2937, + "step": 16413 + }, + { + "epoch": 1.3297148412184057, + "grad_norm": 0.04355372115969658, + "learning_rate": 0.0001483505108240695, + "loss": 0.2976, + "step": 16414 + }, + { + "epoch": 1.3297958522359041, + "grad_norm": 0.045403700321912766, + "learning_rate": 0.00014834601017147487, + "loss": 0.3183, + "step": 16415 + }, + { + "epoch": 1.3298768632534026, + "grad_norm": 0.04810098558664322, + "learning_rate": 0.00014834150951888023, + "loss": 0.2951, + "step": 16416 + }, + { + "epoch": 1.3299578742709008, + "grad_norm": 0.04955977946519852, + "learning_rate": 0.00014833700886628562, + "loss": 0.3336, + "step": 16417 + }, + { + "epoch": 1.330038885288399, + "grad_norm": 0.047212012112140656, + "learning_rate": 0.000148332508213691, + "loss": 0.3043, + "step": 16418 + }, + { + "epoch": 1.3301198963058976, + "grad_norm": 0.05372791364789009, + "learning_rate": 0.00014832800756109637, + "loss": 0.3067, + "step": 16419 + }, + { + "epoch": 1.330200907323396, + "grad_norm": 0.03757733106613159, + "learning_rate": 0.00014832350690850173, + "loss": 0.2678, + "step": 16420 + }, + { + "epoch": 1.3302819183408943, + "grad_norm": 0.044789623469114304, + "learning_rate": 0.00014831900625590712, + "loss": 0.3491, + "step": 16421 + }, + { + "epoch": 1.3303629293583927, + "grad_norm": 0.04268178716301918, + "learning_rate": 0.00014831450560331248, + "loss": 0.2949, + "step": 16422 + }, + { + "epoch": 1.3304439403758912, + "grad_norm": 0.041059188544750214, + "learning_rate": 0.00014831000495071786, + "loss": 0.2526, + "step": 16423 + }, + { + "epoch": 1.3305249513933894, + "grad_norm": 0.052552729845047, + "learning_rate": 0.00014830550429812325, + "loss": 0.3152, + "step": 16424 + }, + { + "epoch": 1.330605962410888, + "grad_norm": 0.04184343293309212, + "learning_rate": 0.0001483010036455286, + "loss": 0.3075, + "step": 16425 + }, + { + "epoch": 1.3306869734283864, + "grad_norm": 0.04641185328364372, + "learning_rate": 0.00014829650299293397, + "loss": 0.3, + "step": 16426 + }, + { + "epoch": 1.3307679844458846, + "grad_norm": 0.04540396109223366, + "learning_rate": 0.00014829200234033936, + "loss": 0.2751, + "step": 16427 + }, + { + "epoch": 
1.330848995463383, + "grad_norm": 0.04766688868403435, + "learning_rate": 0.00014828750168774472, + "loss": 0.2932, + "step": 16428 + }, + { + "epoch": 1.3309300064808813, + "grad_norm": 0.050059616565704346, + "learning_rate": 0.0001482830010351501, + "loss": 0.2732, + "step": 16429 + }, + { + "epoch": 1.3310110174983798, + "grad_norm": 0.055511631071567535, + "learning_rate": 0.0001482785003825555, + "loss": 0.3117, + "step": 16430 + }, + { + "epoch": 1.331092028515878, + "grad_norm": 0.04264995455741882, + "learning_rate": 0.00014827399972996085, + "loss": 0.2903, + "step": 16431 + }, + { + "epoch": 1.3311730395333765, + "grad_norm": 0.052487634122371674, + "learning_rate": 0.0001482694990773662, + "loss": 0.3735, + "step": 16432 + }, + { + "epoch": 1.331254050550875, + "grad_norm": 0.051421862095594406, + "learning_rate": 0.0001482649984247716, + "loss": 0.332, + "step": 16433 + }, + { + "epoch": 1.3313350615683732, + "grad_norm": 0.04284625127911568, + "learning_rate": 0.00014826049777217696, + "loss": 0.266, + "step": 16434 + }, + { + "epoch": 1.3314160725858717, + "grad_norm": 0.046187713742256165, + "learning_rate": 0.00014825599711958235, + "loss": 0.2859, + "step": 16435 + }, + { + "epoch": 1.3314970836033702, + "grad_norm": 0.04239524528384209, + "learning_rate": 0.00014825149646698773, + "loss": 0.2705, + "step": 16436 + }, + { + "epoch": 1.3315780946208684, + "grad_norm": 0.04885365813970566, + "learning_rate": 0.0001482469958143931, + "loss": 0.3089, + "step": 16437 + }, + { + "epoch": 1.3316591056383669, + "grad_norm": 0.046007949858903885, + "learning_rate": 0.00014824249516179845, + "loss": 0.3065, + "step": 16438 + }, + { + "epoch": 1.3317401166558653, + "grad_norm": 0.05193227529525757, + "learning_rate": 0.00014823799450920384, + "loss": 0.3323, + "step": 16439 + }, + { + "epoch": 1.3318211276733636, + "grad_norm": 0.04352136328816414, + "learning_rate": 0.0001482334938566092, + "loss": 0.323, + "step": 16440 + }, + { + "epoch": 1.3319021386908618, + "grad_norm": 0.04223669320344925, + "learning_rate": 0.0001482289932040146, + "loss": 0.2823, + "step": 16441 + }, + { + "epoch": 1.3319831497083603, + "grad_norm": 0.04365150257945061, + "learning_rate": 0.00014822449255141998, + "loss": 0.2984, + "step": 16442 + }, + { + "epoch": 1.3320641607258588, + "grad_norm": 0.03911641985177994, + "learning_rate": 0.00014821999189882534, + "loss": 0.2908, + "step": 16443 + }, + { + "epoch": 1.332145171743357, + "grad_norm": 0.03676704689860344, + "learning_rate": 0.00014821549124623072, + "loss": 0.246, + "step": 16444 + }, + { + "epoch": 1.3322261827608555, + "grad_norm": 0.051078058779239655, + "learning_rate": 0.00014821099059363608, + "loss": 0.2601, + "step": 16445 + }, + { + "epoch": 1.332307193778354, + "grad_norm": 0.04659094661474228, + "learning_rate": 0.00014820648994104144, + "loss": 0.3044, + "step": 16446 + }, + { + "epoch": 1.3323882047958522, + "grad_norm": 0.04543064907193184, + "learning_rate": 0.00014820198928844683, + "loss": 0.2749, + "step": 16447 + }, + { + "epoch": 1.3324692158133506, + "grad_norm": 0.053839847445487976, + "learning_rate": 0.00014819748863585222, + "loss": 0.3291, + "step": 16448 + }, + { + "epoch": 1.3325502268308491, + "grad_norm": 0.04643457382917404, + "learning_rate": 0.00014819298798325758, + "loss": 0.2965, + "step": 16449 + }, + { + "epoch": 1.3326312378483474, + "grad_norm": 0.05472167581319809, + "learning_rate": 0.00014818848733066296, + "loss": 0.3555, + "step": 16450 + }, + { + "epoch": 1.3327122488658458, + "grad_norm": 
0.05177174508571625, + "learning_rate": 0.00014818398667806832, + "loss": 0.2944, + "step": 16451 + }, + { + "epoch": 1.332793259883344, + "grad_norm": 0.04957205057144165, + "learning_rate": 0.00014817948602547368, + "loss": 0.2958, + "step": 16452 + }, + { + "epoch": 1.3328742709008425, + "grad_norm": 0.049657341092824936, + "learning_rate": 0.00014817498537287907, + "loss": 0.288, + "step": 16453 + }, + { + "epoch": 1.3329552819183408, + "grad_norm": 0.045000869780778885, + "learning_rate": 0.00014817048472028446, + "loss": 0.278, + "step": 16454 + }, + { + "epoch": 1.3330362929358393, + "grad_norm": 0.04302288219332695, + "learning_rate": 0.00014816598406768982, + "loss": 0.3046, + "step": 16455 + }, + { + "epoch": 1.3331173039533377, + "grad_norm": 0.04679296538233757, + "learning_rate": 0.0001481614834150952, + "loss": 0.2865, + "step": 16456 + }, + { + "epoch": 1.333198314970836, + "grad_norm": 0.04690950736403465, + "learning_rate": 0.00014815698276250057, + "loss": 0.3157, + "step": 16457 + }, + { + "epoch": 1.3332793259883344, + "grad_norm": 0.044018328189849854, + "learning_rate": 0.00014815248210990595, + "loss": 0.3128, + "step": 16458 + }, + { + "epoch": 1.333360337005833, + "grad_norm": 0.0467769019305706, + "learning_rate": 0.0001481479814573113, + "loss": 0.3097, + "step": 16459 + }, + { + "epoch": 1.3334413480233311, + "grad_norm": 0.0482206828892231, + "learning_rate": 0.0001481434808047167, + "loss": 0.2782, + "step": 16460 + }, + { + "epoch": 1.3335223590408296, + "grad_norm": 0.04488348215818405, + "learning_rate": 0.00014813898015212206, + "loss": 0.2825, + "step": 16461 + }, + { + "epoch": 1.3336033700583279, + "grad_norm": 0.045889075845479965, + "learning_rate": 0.00014813447949952745, + "loss": 0.3228, + "step": 16462 + }, + { + "epoch": 1.3336843810758263, + "grad_norm": 0.0443207211792469, + "learning_rate": 0.0001481299788469328, + "loss": 0.3034, + "step": 16463 + }, + { + "epoch": 1.3337653920933246, + "grad_norm": 0.035741958767175674, + "learning_rate": 0.0001481254781943382, + "loss": 0.2797, + "step": 16464 + }, + { + "epoch": 1.333846403110823, + "grad_norm": 0.04170793667435646, + "learning_rate": 0.00014812097754174355, + "loss": 0.2595, + "step": 16465 + }, + { + "epoch": 1.3339274141283215, + "grad_norm": 0.05149725452065468, + "learning_rate": 0.00014811647688914894, + "loss": 0.3146, + "step": 16466 + }, + { + "epoch": 1.3340084251458197, + "grad_norm": 0.04214129596948624, + "learning_rate": 0.0001481119762365543, + "loss": 0.2842, + "step": 16467 + }, + { + "epoch": 1.3340894361633182, + "grad_norm": 0.04943012818694115, + "learning_rate": 0.0001481074755839597, + "loss": 0.3227, + "step": 16468 + }, + { + "epoch": 1.3341704471808167, + "grad_norm": 0.04872075840830803, + "learning_rate": 0.00014810297493136505, + "loss": 0.3225, + "step": 16469 + }, + { + "epoch": 1.334251458198315, + "grad_norm": 0.0405779629945755, + "learning_rate": 0.00014809847427877044, + "loss": 0.2914, + "step": 16470 + }, + { + "epoch": 1.3343324692158134, + "grad_norm": 0.05061832815408707, + "learning_rate": 0.0001480939736261758, + "loss": 0.3559, + "step": 16471 + }, + { + "epoch": 1.3344134802333119, + "grad_norm": 0.04265550896525383, + "learning_rate": 0.00014808947297358118, + "loss": 0.3212, + "step": 16472 + }, + { + "epoch": 1.33449449125081, + "grad_norm": 0.04414314031600952, + "learning_rate": 0.00014808497232098654, + "loss": 0.3507, + "step": 16473 + }, + { + "epoch": 1.3345755022683086, + "grad_norm": 0.04216546565294266, + "learning_rate": 
0.00014808047166839193, + "loss": 0.276, + "step": 16474 + }, + { + "epoch": 1.3346565132858068, + "grad_norm": 0.04311543330550194, + "learning_rate": 0.0001480759710157973, + "loss": 0.2618, + "step": 16475 + }, + { + "epoch": 1.3347375243033053, + "grad_norm": 0.04496273025870323, + "learning_rate": 0.00014807147036320268, + "loss": 0.3317, + "step": 16476 + }, + { + "epoch": 1.3348185353208035, + "grad_norm": 0.04574717953801155, + "learning_rate": 0.00014806696971060804, + "loss": 0.3539, + "step": 16477 + }, + { + "epoch": 1.334899546338302, + "grad_norm": 0.04789029061794281, + "learning_rate": 0.00014806246905801342, + "loss": 0.3045, + "step": 16478 + }, + { + "epoch": 1.3349805573558005, + "grad_norm": 0.03678739443421364, + "learning_rate": 0.00014805796840541878, + "loss": 0.2564, + "step": 16479 + }, + { + "epoch": 1.3350615683732987, + "grad_norm": 0.047425612807273865, + "learning_rate": 0.00014805346775282417, + "loss": 0.2778, + "step": 16480 + }, + { + "epoch": 1.3351425793907972, + "grad_norm": 0.03888651356101036, + "learning_rate": 0.00014804896710022953, + "loss": 0.2499, + "step": 16481 + }, + { + "epoch": 1.3352235904082956, + "grad_norm": 0.05751119554042816, + "learning_rate": 0.00014804446644763492, + "loss": 0.3156, + "step": 16482 + }, + { + "epoch": 1.3353046014257939, + "grad_norm": 0.043744124472141266, + "learning_rate": 0.00014803996579504028, + "loss": 0.2931, + "step": 16483 + }, + { + "epoch": 1.3353856124432923, + "grad_norm": 0.04385169968008995, + "learning_rate": 0.00014803546514244567, + "loss": 0.258, + "step": 16484 + }, + { + "epoch": 1.3354666234607906, + "grad_norm": 0.04109294340014458, + "learning_rate": 0.00014803096448985103, + "loss": 0.2515, + "step": 16485 + }, + { + "epoch": 1.335547634478289, + "grad_norm": 0.05116341635584831, + "learning_rate": 0.00014802646383725641, + "loss": 0.2932, + "step": 16486 + }, + { + "epoch": 1.3356286454957873, + "grad_norm": 0.04903048649430275, + "learning_rate": 0.0001480219631846618, + "loss": 0.3168, + "step": 16487 + }, + { + "epoch": 1.3357096565132858, + "grad_norm": 0.05178588256239891, + "learning_rate": 0.00014801746253206716, + "loss": 0.3728, + "step": 16488 + }, + { + "epoch": 1.3357906675307842, + "grad_norm": 0.04729301482439041, + "learning_rate": 0.00014801296187947252, + "loss": 0.3108, + "step": 16489 + }, + { + "epoch": 1.3358716785482825, + "grad_norm": 0.038130391389131546, + "learning_rate": 0.0001480084612268779, + "loss": 0.2694, + "step": 16490 + }, + { + "epoch": 1.335952689565781, + "grad_norm": 0.05216986685991287, + "learning_rate": 0.00014800396057428327, + "loss": 0.3211, + "step": 16491 + }, + { + "epoch": 1.3360337005832794, + "grad_norm": 0.05172670632600784, + "learning_rate": 0.00014799945992168866, + "loss": 0.329, + "step": 16492 + }, + { + "epoch": 1.3361147116007777, + "grad_norm": 0.045888472348451614, + "learning_rate": 0.00014799495926909404, + "loss": 0.2929, + "step": 16493 + }, + { + "epoch": 1.3361957226182761, + "grad_norm": 0.0423261895775795, + "learning_rate": 0.0001479904586164994, + "loss": 0.2788, + "step": 16494 + }, + { + "epoch": 1.3362767336357746, + "grad_norm": 0.04532453417778015, + "learning_rate": 0.00014798595796390476, + "loss": 0.2841, + "step": 16495 + }, + { + "epoch": 1.3363577446532728, + "grad_norm": 0.0453985370695591, + "learning_rate": 0.00014798145731131015, + "loss": 0.2901, + "step": 16496 + }, + { + "epoch": 1.3364387556707713, + "grad_norm": 0.049831438809633255, + "learning_rate": 0.0001479769566587155, + "loss": 0.3611, + 
"step": 16497 + }, + { + "epoch": 1.3365197666882696, + "grad_norm": 0.050108373165130615, + "learning_rate": 0.0001479724560061209, + "loss": 0.2991, + "step": 16498 + }, + { + "epoch": 1.336600777705768, + "grad_norm": 0.04659736528992653, + "learning_rate": 0.00014796795535352628, + "loss": 0.2765, + "step": 16499 + }, + { + "epoch": 1.3366817887232663, + "grad_norm": 0.04281048849225044, + "learning_rate": 0.00014796345470093164, + "loss": 0.2732, + "step": 16500 + }, + { + "epoch": 1.3367627997407647, + "grad_norm": 0.04346665367484093, + "learning_rate": 0.000147958954048337, + "loss": 0.3203, + "step": 16501 + }, + { + "epoch": 1.3368438107582632, + "grad_norm": 0.04083739593625069, + "learning_rate": 0.0001479544533957424, + "loss": 0.2664, + "step": 16502 + }, + { + "epoch": 1.3369248217757614, + "grad_norm": 0.04874040186405182, + "learning_rate": 0.00014794995274314775, + "loss": 0.3213, + "step": 16503 + }, + { + "epoch": 1.33700583279326, + "grad_norm": 0.042423397302627563, + "learning_rate": 0.00014794545209055314, + "loss": 0.2827, + "step": 16504 + }, + { + "epoch": 1.3370868438107584, + "grad_norm": 0.0455317497253418, + "learning_rate": 0.00014794095143795853, + "loss": 0.3029, + "step": 16505 + }, + { + "epoch": 1.3371678548282566, + "grad_norm": 0.05330074205994606, + "learning_rate": 0.00014793645078536389, + "loss": 0.2914, + "step": 16506 + }, + { + "epoch": 1.337248865845755, + "grad_norm": 0.04478881135582924, + "learning_rate": 0.00014793195013276925, + "loss": 0.2997, + "step": 16507 + }, + { + "epoch": 1.3373298768632533, + "grad_norm": 0.05366770923137665, + "learning_rate": 0.00014792744948017463, + "loss": 0.2951, + "step": 16508 + }, + { + "epoch": 1.3374108878807518, + "grad_norm": 0.053794004023075104, + "learning_rate": 0.00014792294882758, + "loss": 0.2707, + "step": 16509 + }, + { + "epoch": 1.33749189889825, + "grad_norm": 0.04974134638905525, + "learning_rate": 0.00014791844817498538, + "loss": 0.3038, + "step": 16510 + }, + { + "epoch": 1.3375729099157485, + "grad_norm": 0.05007876455783844, + "learning_rate": 0.00014791394752239077, + "loss": 0.2686, + "step": 16511 + }, + { + "epoch": 1.337653920933247, + "grad_norm": 0.05567089840769768, + "learning_rate": 0.00014790944686979613, + "loss": 0.298, + "step": 16512 + }, + { + "epoch": 1.3377349319507452, + "grad_norm": 0.05598515644669533, + "learning_rate": 0.00014790494621720151, + "loss": 0.339, + "step": 16513 + }, + { + "epoch": 1.3378159429682437, + "grad_norm": 0.04248699173331261, + "learning_rate": 0.00014790044556460687, + "loss": 0.3005, + "step": 16514 + }, + { + "epoch": 1.3378969539857422, + "grad_norm": 0.039727192372083664, + "learning_rate": 0.00014789594491201223, + "loss": 0.2775, + "step": 16515 + }, + { + "epoch": 1.3379779650032404, + "grad_norm": 0.04616788029670715, + "learning_rate": 0.00014789144425941762, + "loss": 0.305, + "step": 16516 + }, + { + "epoch": 1.3380589760207389, + "grad_norm": 0.04732425510883331, + "learning_rate": 0.000147886943606823, + "loss": 0.3015, + "step": 16517 + }, + { + "epoch": 1.3381399870382373, + "grad_norm": 0.04137314856052399, + "learning_rate": 0.00014788244295422837, + "loss": 0.2626, + "step": 16518 + }, + { + "epoch": 1.3382209980557356, + "grad_norm": 0.04044054448604584, + "learning_rate": 0.00014787794230163376, + "loss": 0.2897, + "step": 16519 + }, + { + "epoch": 1.3383020090732338, + "grad_norm": 0.05046490207314491, + "learning_rate": 0.00014787344164903912, + "loss": 0.2944, + "step": 16520 + }, + { + "epoch": 
1.3383830200907323, + "grad_norm": 0.05168326944112778, + "learning_rate": 0.00014786894099644448, + "loss": 0.2951, + "step": 16521 + }, + { + "epoch": 1.3384640311082308, + "grad_norm": 0.04427158087491989, + "learning_rate": 0.00014786444034384986, + "loss": 0.3418, + "step": 16522 + }, + { + "epoch": 1.338545042125729, + "grad_norm": 0.04621238633990288, + "learning_rate": 0.00014785993969125525, + "loss": 0.3099, + "step": 16523 + }, + { + "epoch": 1.3386260531432275, + "grad_norm": 0.04961875081062317, + "learning_rate": 0.0001478554390386606, + "loss": 0.3417, + "step": 16524 + }, + { + "epoch": 1.338707064160726, + "grad_norm": 0.041570305824279785, + "learning_rate": 0.000147850938386066, + "loss": 0.2956, + "step": 16525 + }, + { + "epoch": 1.3387880751782242, + "grad_norm": 0.04750753566622734, + "learning_rate": 0.00014784643773347136, + "loss": 0.3531, + "step": 16526 + }, + { + "epoch": 1.3388690861957226, + "grad_norm": 0.041701339185237885, + "learning_rate": 0.00014784193708087672, + "loss": 0.2812, + "step": 16527 + }, + { + "epoch": 1.3389500972132211, + "grad_norm": 0.04579732567071915, + "learning_rate": 0.0001478374364282821, + "loss": 0.2792, + "step": 16528 + }, + { + "epoch": 1.3390311082307194, + "grad_norm": 0.048295967280864716, + "learning_rate": 0.0001478329357756875, + "loss": 0.3072, + "step": 16529 + }, + { + "epoch": 1.3391121192482178, + "grad_norm": 0.053043704479932785, + "learning_rate": 0.00014782843512309285, + "loss": 0.3387, + "step": 16530 + }, + { + "epoch": 1.339193130265716, + "grad_norm": 0.04526020213961601, + "learning_rate": 0.00014782393447049824, + "loss": 0.2962, + "step": 16531 + }, + { + "epoch": 1.3392741412832145, + "grad_norm": 0.04586252570152283, + "learning_rate": 0.0001478194338179036, + "loss": 0.2838, + "step": 16532 + }, + { + "epoch": 1.3393551523007128, + "grad_norm": 0.043184127658605576, + "learning_rate": 0.00014781493316530896, + "loss": 0.2821, + "step": 16533 + }, + { + "epoch": 1.3394361633182112, + "grad_norm": 0.04876738041639328, + "learning_rate": 0.00014781043251271435, + "loss": 0.3037, + "step": 16534 + }, + { + "epoch": 1.3395171743357097, + "grad_norm": 0.04843151569366455, + "learning_rate": 0.00014780593186011973, + "loss": 0.2878, + "step": 16535 + }, + { + "epoch": 1.339598185353208, + "grad_norm": 0.04222843796014786, + "learning_rate": 0.0001478014312075251, + "loss": 0.2696, + "step": 16536 + }, + { + "epoch": 1.3396791963707064, + "grad_norm": 0.04102891683578491, + "learning_rate": 0.00014779693055493048, + "loss": 0.2983, + "step": 16537 + }, + { + "epoch": 1.339760207388205, + "grad_norm": 0.04077741503715515, + "learning_rate": 0.00014779242990233584, + "loss": 0.2883, + "step": 16538 + }, + { + "epoch": 1.3398412184057031, + "grad_norm": 0.0449368953704834, + "learning_rate": 0.00014778792924974123, + "loss": 0.2648, + "step": 16539 + }, + { + "epoch": 1.3399222294232016, + "grad_norm": 0.04294194281101227, + "learning_rate": 0.0001477834285971466, + "loss": 0.2885, + "step": 16540 + }, + { + "epoch": 1.3400032404407, + "grad_norm": 0.048157889395952225, + "learning_rate": 0.00014777892794455198, + "loss": 0.3053, + "step": 16541 + }, + { + "epoch": 1.3400842514581983, + "grad_norm": 0.043538760393857956, + "learning_rate": 0.00014777442729195734, + "loss": 0.283, + "step": 16542 + }, + { + "epoch": 1.3401652624756966, + "grad_norm": 0.04985831677913666, + "learning_rate": 0.00014776992663936272, + "loss": 0.339, + "step": 16543 + }, + { + "epoch": 1.340246273493195, + "grad_norm": 
0.04058707505464554, + "learning_rate": 0.00014776542598676808, + "loss": 0.3098, + "step": 16544 + }, + { + "epoch": 1.3403272845106935, + "grad_norm": 0.04834947735071182, + "learning_rate": 0.00014776092533417347, + "loss": 0.3083, + "step": 16545 + }, + { + "epoch": 1.3404082955281917, + "grad_norm": 0.04549410566687584, + "learning_rate": 0.00014775642468157883, + "loss": 0.2795, + "step": 16546 + }, + { + "epoch": 1.3404893065456902, + "grad_norm": 0.05225663259625435, + "learning_rate": 0.00014775192402898422, + "loss": 0.3025, + "step": 16547 + }, + { + "epoch": 1.3405703175631887, + "grad_norm": 0.04144188389182091, + "learning_rate": 0.00014774742337638958, + "loss": 0.2712, + "step": 16548 + }, + { + "epoch": 1.340651328580687, + "grad_norm": 0.04098622500896454, + "learning_rate": 0.00014774292272379496, + "loss": 0.282, + "step": 16549 + }, + { + "epoch": 1.3407323395981854, + "grad_norm": 0.0445941723883152, + "learning_rate": 0.00014773842207120032, + "loss": 0.3265, + "step": 16550 + }, + { + "epoch": 1.3408133506156839, + "grad_norm": 0.04298444464802742, + "learning_rate": 0.0001477339214186057, + "loss": 0.2892, + "step": 16551 + }, + { + "epoch": 1.340894361633182, + "grad_norm": 0.04422687739133835, + "learning_rate": 0.00014772942076601107, + "loss": 0.289, + "step": 16552 + }, + { + "epoch": 1.3409753726506806, + "grad_norm": 0.04136590287089348, + "learning_rate": 0.00014772492011341646, + "loss": 0.2734, + "step": 16553 + }, + { + "epoch": 1.3410563836681788, + "grad_norm": 0.03996701166033745, + "learning_rate": 0.00014772041946082182, + "loss": 0.2978, + "step": 16554 + }, + { + "epoch": 1.3411373946856773, + "grad_norm": 0.03591948375105858, + "learning_rate": 0.0001477159188082272, + "loss": 0.2585, + "step": 16555 + }, + { + "epoch": 1.3412184057031755, + "grad_norm": 0.045594893395900726, + "learning_rate": 0.00014771141815563257, + "loss": 0.3528, + "step": 16556 + }, + { + "epoch": 1.341299416720674, + "grad_norm": 0.052428532391786575, + "learning_rate": 0.00014770691750303795, + "loss": 0.3203, + "step": 16557 + }, + { + "epoch": 1.3413804277381725, + "grad_norm": 0.04628319665789604, + "learning_rate": 0.0001477024168504433, + "loss": 0.2687, + "step": 16558 + }, + { + "epoch": 1.3414614387556707, + "grad_norm": 0.04328621178865433, + "learning_rate": 0.0001476979161978487, + "loss": 0.2834, + "step": 16559 + }, + { + "epoch": 1.3415424497731692, + "grad_norm": 0.044492077082395554, + "learning_rate": 0.00014769341554525406, + "loss": 0.317, + "step": 16560 + }, + { + "epoch": 1.3416234607906676, + "grad_norm": 0.039298199117183685, + "learning_rate": 0.00014768891489265945, + "loss": 0.2663, + "step": 16561 + }, + { + "epoch": 1.3417044718081659, + "grad_norm": 0.04610498994588852, + "learning_rate": 0.0001476844142400648, + "loss": 0.2967, + "step": 16562 + }, + { + "epoch": 1.3417854828256643, + "grad_norm": 0.04996765777468681, + "learning_rate": 0.0001476799135874702, + "loss": 0.3204, + "step": 16563 + }, + { + "epoch": 1.3418664938431626, + "grad_norm": 0.04526286572217941, + "learning_rate": 0.00014767541293487555, + "loss": 0.3328, + "step": 16564 + }, + { + "epoch": 1.341947504860661, + "grad_norm": 0.04287734255194664, + "learning_rate": 0.00014767091228228094, + "loss": 0.2392, + "step": 16565 + }, + { + "epoch": 1.3420285158781593, + "grad_norm": 0.049837589263916016, + "learning_rate": 0.0001476664116296863, + "loss": 0.2849, + "step": 16566 + }, + { + "epoch": 1.3421095268956578, + "grad_norm": 0.04827631637454033, + "learning_rate": 
0.0001476619109770917, + "loss": 0.3377, + "step": 16567 + }, + { + "epoch": 1.3421905379131562, + "grad_norm": 0.048483602702617645, + "learning_rate": 0.00014765741032449708, + "loss": 0.3081, + "step": 16568 + }, + { + "epoch": 1.3422715489306545, + "grad_norm": 0.04682917147874832, + "learning_rate": 0.00014765290967190244, + "loss": 0.3185, + "step": 16569 + }, + { + "epoch": 1.342352559948153, + "grad_norm": 0.04458910971879959, + "learning_rate": 0.0001476484090193078, + "loss": 0.3021, + "step": 16570 + }, + { + "epoch": 1.3424335709656514, + "grad_norm": 0.04497900605201721, + "learning_rate": 0.00014764390836671318, + "loss": 0.3189, + "step": 16571 + }, + { + "epoch": 1.3425145819831497, + "grad_norm": 0.039753060787916183, + "learning_rate": 0.00014763940771411854, + "loss": 0.2651, + "step": 16572 + }, + { + "epoch": 1.3425955930006481, + "grad_norm": 0.05053701251745224, + "learning_rate": 0.00014763490706152393, + "loss": 0.3575, + "step": 16573 + }, + { + "epoch": 1.3426766040181466, + "grad_norm": 0.04674624651670456, + "learning_rate": 0.00014763040640892932, + "loss": 0.2702, + "step": 16574 + }, + { + "epoch": 1.3427576150356448, + "grad_norm": 0.0462510772049427, + "learning_rate": 0.00014762590575633468, + "loss": 0.2925, + "step": 16575 + }, + { + "epoch": 1.3428386260531433, + "grad_norm": 0.04673721268773079, + "learning_rate": 0.00014762140510374004, + "loss": 0.2746, + "step": 16576 + }, + { + "epoch": 1.3429196370706415, + "grad_norm": 0.043024204671382904, + "learning_rate": 0.00014761690445114543, + "loss": 0.3132, + "step": 16577 + }, + { + "epoch": 1.34300064808814, + "grad_norm": 0.040180426090955734, + "learning_rate": 0.00014761240379855079, + "loss": 0.3054, + "step": 16578 + }, + { + "epoch": 1.3430816591056383, + "grad_norm": 0.05153704062104225, + "learning_rate": 0.00014760790314595617, + "loss": 0.3107, + "step": 16579 + }, + { + "epoch": 1.3431626701231367, + "grad_norm": 0.04208536073565483, + "learning_rate": 0.00014760340249336156, + "loss": 0.2981, + "step": 16580 + }, + { + "epoch": 1.3432436811406352, + "grad_norm": 0.04051988944411278, + "learning_rate": 0.00014759890184076692, + "loss": 0.3113, + "step": 16581 + }, + { + "epoch": 1.3433246921581334, + "grad_norm": 0.0628843903541565, + "learning_rate": 0.0001475944011881723, + "loss": 0.2698, + "step": 16582 + }, + { + "epoch": 1.343405703175632, + "grad_norm": 0.04788912460207939, + "learning_rate": 0.00014758990053557767, + "loss": 0.3223, + "step": 16583 + }, + { + "epoch": 1.3434867141931304, + "grad_norm": 0.042176347225904465, + "learning_rate": 0.00014758539988298303, + "loss": 0.2468, + "step": 16584 + }, + { + "epoch": 1.3435677252106286, + "grad_norm": 0.03956156224012375, + "learning_rate": 0.00014758089923038841, + "loss": 0.3001, + "step": 16585 + }, + { + "epoch": 1.343648736228127, + "grad_norm": 0.04439728707075119, + "learning_rate": 0.0001475763985777938, + "loss": 0.3086, + "step": 16586 + }, + { + "epoch": 1.3437297472456253, + "grad_norm": 0.04363232105970383, + "learning_rate": 0.00014757189792519916, + "loss": 0.2737, + "step": 16587 + }, + { + "epoch": 1.3438107582631238, + "grad_norm": 0.05417967215180397, + "learning_rate": 0.00014756739727260455, + "loss": 0.3214, + "step": 16588 + }, + { + "epoch": 1.343891769280622, + "grad_norm": 0.03835189715027809, + "learning_rate": 0.0001475628966200099, + "loss": 0.265, + "step": 16589 + }, + { + "epoch": 1.3439727802981205, + "grad_norm": 0.05207205191254616, + "learning_rate": 0.00014755839596741527, + "loss": 0.3273, + 
"step": 16590 + }, + { + "epoch": 1.344053791315619, + "grad_norm": 0.04198315367102623, + "learning_rate": 0.00014755389531482066, + "loss": 0.2906, + "step": 16591 + }, + { + "epoch": 1.3441348023331172, + "grad_norm": 0.04492257907986641, + "learning_rate": 0.00014754939466222604, + "loss": 0.2879, + "step": 16592 + }, + { + "epoch": 1.3442158133506157, + "grad_norm": 0.04414338245987892, + "learning_rate": 0.0001475448940096314, + "loss": 0.2829, + "step": 16593 + }, + { + "epoch": 1.3442968243681142, + "grad_norm": 0.036125894635915756, + "learning_rate": 0.0001475403933570368, + "loss": 0.2216, + "step": 16594 + }, + { + "epoch": 1.3443778353856124, + "grad_norm": 0.04091149568557739, + "learning_rate": 0.00014753589270444215, + "loss": 0.2783, + "step": 16595 + }, + { + "epoch": 1.3444588464031109, + "grad_norm": 0.04277289658784866, + "learning_rate": 0.0001475313920518475, + "loss": 0.279, + "step": 16596 + }, + { + "epoch": 1.3445398574206093, + "grad_norm": 0.04748170077800751, + "learning_rate": 0.0001475268913992529, + "loss": 0.2978, + "step": 16597 + }, + { + "epoch": 1.3446208684381076, + "grad_norm": 0.049579180777072906, + "learning_rate": 0.00014752239074665828, + "loss": 0.2737, + "step": 16598 + }, + { + "epoch": 1.344701879455606, + "grad_norm": 0.055698081851005554, + "learning_rate": 0.00014751789009406364, + "loss": 0.3412, + "step": 16599 + }, + { + "epoch": 1.3447828904731043, + "grad_norm": 0.04331175982952118, + "learning_rate": 0.00014751338944146903, + "loss": 0.3004, + "step": 16600 + }, + { + "epoch": 1.3448639014906028, + "grad_norm": 0.04292542114853859, + "learning_rate": 0.0001475088887888744, + "loss": 0.2767, + "step": 16601 + }, + { + "epoch": 1.344944912508101, + "grad_norm": 0.04305851086974144, + "learning_rate": 0.00014750438813627975, + "loss": 0.2672, + "step": 16602 + }, + { + "epoch": 1.3450259235255995, + "grad_norm": 0.05051714926958084, + "learning_rate": 0.00014749988748368514, + "loss": 0.302, + "step": 16603 + }, + { + "epoch": 1.345106934543098, + "grad_norm": 0.05231226980686188, + "learning_rate": 0.00014749538683109053, + "loss": 0.3682, + "step": 16604 + }, + { + "epoch": 1.3451879455605962, + "grad_norm": 0.052741702646017075, + "learning_rate": 0.00014749088617849589, + "loss": 0.3041, + "step": 16605 + }, + { + "epoch": 1.3452689565780946, + "grad_norm": 0.041467368602752686, + "learning_rate": 0.00014748638552590127, + "loss": 0.3073, + "step": 16606 + }, + { + "epoch": 1.345349967595593, + "grad_norm": 0.048310671001672745, + "learning_rate": 0.00014748188487330663, + "loss": 0.2952, + "step": 16607 + }, + { + "epoch": 1.3454309786130914, + "grad_norm": 0.043791867792606354, + "learning_rate": 0.000147477384220712, + "loss": 0.2843, + "step": 16608 + }, + { + "epoch": 1.3455119896305898, + "grad_norm": 0.04615228623151779, + "learning_rate": 0.00014747288356811738, + "loss": 0.3369, + "step": 16609 + }, + { + "epoch": 1.345593000648088, + "grad_norm": 0.04794533923268318, + "learning_rate": 0.00014746838291552277, + "loss": 0.2995, + "step": 16610 + }, + { + "epoch": 1.3456740116655865, + "grad_norm": 0.03745264932513237, + "learning_rate": 0.00014746388226292813, + "loss": 0.2865, + "step": 16611 + }, + { + "epoch": 1.3457550226830848, + "grad_norm": 0.04993395879864693, + "learning_rate": 0.00014745938161033351, + "loss": 0.341, + "step": 16612 + }, + { + "epoch": 1.3458360337005832, + "grad_norm": 0.053492747247219086, + "learning_rate": 0.00014745488095773887, + "loss": 0.3089, + "step": 16613 + }, + { + "epoch": 
1.3459170447180817, + "grad_norm": 0.04382877051830292, + "learning_rate": 0.00014745038030514423, + "loss": 0.2897, + "step": 16614 + }, + { + "epoch": 1.34599805573558, + "grad_norm": 0.039354801177978516, + "learning_rate": 0.00014744587965254962, + "loss": 0.2673, + "step": 16615 + }, + { + "epoch": 1.3460790667530784, + "grad_norm": 0.03777182474732399, + "learning_rate": 0.000147441378999955, + "loss": 0.2716, + "step": 16616 + }, + { + "epoch": 1.346160077770577, + "grad_norm": 0.043617118149995804, + "learning_rate": 0.00014743687834736037, + "loss": 0.2819, + "step": 16617 + }, + { + "epoch": 1.3462410887880751, + "grad_norm": 0.0481729619204998, + "learning_rate": 0.00014743237769476576, + "loss": 0.2864, + "step": 16618 + }, + { + "epoch": 1.3463220998055736, + "grad_norm": 0.048706311732530594, + "learning_rate": 0.00014742787704217112, + "loss": 0.3245, + "step": 16619 + }, + { + "epoch": 1.346403110823072, + "grad_norm": 0.046131476759910583, + "learning_rate": 0.0001474233763895765, + "loss": 0.2945, + "step": 16620 + }, + { + "epoch": 1.3464841218405703, + "grad_norm": 0.049588076770305634, + "learning_rate": 0.00014741887573698186, + "loss": 0.2744, + "step": 16621 + }, + { + "epoch": 1.3465651328580686, + "grad_norm": 0.045150045305490494, + "learning_rate": 0.00014741437508438725, + "loss": 0.3076, + "step": 16622 + }, + { + "epoch": 1.346646143875567, + "grad_norm": 0.03973205387592316, + "learning_rate": 0.0001474098744317926, + "loss": 0.2607, + "step": 16623 + }, + { + "epoch": 1.3467271548930655, + "grad_norm": 0.042886774986982346, + "learning_rate": 0.000147405373779198, + "loss": 0.2928, + "step": 16624 + }, + { + "epoch": 1.3468081659105637, + "grad_norm": 0.046221207827329636, + "learning_rate": 0.00014740087312660336, + "loss": 0.3033, + "step": 16625 + }, + { + "epoch": 1.3468891769280622, + "grad_norm": 0.04569321125745773, + "learning_rate": 0.00014739637247400875, + "loss": 0.3301, + "step": 16626 + }, + { + "epoch": 1.3469701879455607, + "grad_norm": 0.049562666565179825, + "learning_rate": 0.0001473918718214141, + "loss": 0.3283, + "step": 16627 + }, + { + "epoch": 1.347051198963059, + "grad_norm": 0.042315803468227386, + "learning_rate": 0.0001473873711688195, + "loss": 0.2909, + "step": 16628 + }, + { + "epoch": 1.3471322099805574, + "grad_norm": 0.04370975121855736, + "learning_rate": 0.00014738287051622485, + "loss": 0.3323, + "step": 16629 + }, + { + "epoch": 1.3472132209980558, + "grad_norm": 0.05320056900382042, + "learning_rate": 0.00014737836986363024, + "loss": 0.3371, + "step": 16630 + }, + { + "epoch": 1.347294232015554, + "grad_norm": 0.05056382715702057, + "learning_rate": 0.0001473738692110356, + "loss": 0.3028, + "step": 16631 + }, + { + "epoch": 1.3473752430330526, + "grad_norm": 0.04404057189822197, + "learning_rate": 0.000147369368558441, + "loss": 0.3013, + "step": 16632 + }, + { + "epoch": 1.3474562540505508, + "grad_norm": 0.04282480478286743, + "learning_rate": 0.00014736486790584635, + "loss": 0.2595, + "step": 16633 + }, + { + "epoch": 1.3475372650680493, + "grad_norm": 0.04623326659202576, + "learning_rate": 0.00014736036725325173, + "loss": 0.294, + "step": 16634 + }, + { + "epoch": 1.3476182760855475, + "grad_norm": 0.050502434372901917, + "learning_rate": 0.0001473558666006571, + "loss": 0.3359, + "step": 16635 + }, + { + "epoch": 1.347699287103046, + "grad_norm": 0.04713571071624756, + "learning_rate": 0.00014735136594806248, + "loss": 0.3093, + "step": 16636 + }, + { + "epoch": 1.3477802981205445, + "grad_norm": 
0.04389585182070732, + "learning_rate": 0.00014734686529546784, + "loss": 0.3002, + "step": 16637 + }, + { + "epoch": 1.3478613091380427, + "grad_norm": 0.04367394745349884, + "learning_rate": 0.00014734236464287323, + "loss": 0.2863, + "step": 16638 + }, + { + "epoch": 1.3479423201555412, + "grad_norm": 0.04276891425251961, + "learning_rate": 0.0001473378639902786, + "loss": 0.3023, + "step": 16639 + }, + { + "epoch": 1.3480233311730396, + "grad_norm": 0.06252908706665039, + "learning_rate": 0.00014733336333768398, + "loss": 0.353, + "step": 16640 + }, + { + "epoch": 1.3481043421905379, + "grad_norm": 0.04885753616690636, + "learning_rate": 0.00014732886268508934, + "loss": 0.2753, + "step": 16641 + }, + { + "epoch": 1.3481853532080363, + "grad_norm": 0.04044380784034729, + "learning_rate": 0.00014732436203249472, + "loss": 0.2673, + "step": 16642 + }, + { + "epoch": 1.3482663642255348, + "grad_norm": 0.04267757385969162, + "learning_rate": 0.00014731986137990008, + "loss": 0.2644, + "step": 16643 + }, + { + "epoch": 1.348347375243033, + "grad_norm": 0.04965035989880562, + "learning_rate": 0.00014731536072730547, + "loss": 0.3132, + "step": 16644 + }, + { + "epoch": 1.3484283862605313, + "grad_norm": 0.04539056494832039, + "learning_rate": 0.00014731086007471086, + "loss": 0.2785, + "step": 16645 + }, + { + "epoch": 1.3485093972780298, + "grad_norm": 0.04558189958333969, + "learning_rate": 0.00014730635942211622, + "loss": 0.3345, + "step": 16646 + }, + { + "epoch": 1.3485904082955282, + "grad_norm": 0.048003729432821274, + "learning_rate": 0.00014730185876952158, + "loss": 0.2813, + "step": 16647 + }, + { + "epoch": 1.3486714193130265, + "grad_norm": 0.04879504814743996, + "learning_rate": 0.00014729735811692696, + "loss": 0.3074, + "step": 16648 + }, + { + "epoch": 1.348752430330525, + "grad_norm": 0.0417785607278347, + "learning_rate": 0.00014729285746433235, + "loss": 0.2859, + "step": 16649 + }, + { + "epoch": 1.3488334413480234, + "grad_norm": 0.042382605373859406, + "learning_rate": 0.0001472883568117377, + "loss": 0.3068, + "step": 16650 + }, + { + "epoch": 1.3489144523655217, + "grad_norm": 0.05004865303635597, + "learning_rate": 0.0001472838561591431, + "loss": 0.3222, + "step": 16651 + }, + { + "epoch": 1.3489954633830201, + "grad_norm": 0.0468045249581337, + "learning_rate": 0.00014727935550654846, + "loss": 0.315, + "step": 16652 + }, + { + "epoch": 1.3490764744005186, + "grad_norm": 0.040945470333099365, + "learning_rate": 0.00014727485485395382, + "loss": 0.3163, + "step": 16653 + }, + { + "epoch": 1.3491574854180168, + "grad_norm": 0.03802323713898659, + "learning_rate": 0.0001472703542013592, + "loss": 0.2759, + "step": 16654 + }, + { + "epoch": 1.3492384964355153, + "grad_norm": 0.04716533049941063, + "learning_rate": 0.0001472658535487646, + "loss": 0.3159, + "step": 16655 + }, + { + "epoch": 1.3493195074530135, + "grad_norm": 0.056089192628860474, + "learning_rate": 0.00014726135289616995, + "loss": 0.3586, + "step": 16656 + }, + { + "epoch": 1.349400518470512, + "grad_norm": 0.042343322187662125, + "learning_rate": 0.00014725685224357534, + "loss": 0.3355, + "step": 16657 + }, + { + "epoch": 1.3494815294880103, + "grad_norm": 0.045325689017772675, + "learning_rate": 0.0001472523515909807, + "loss": 0.2954, + "step": 16658 + }, + { + "epoch": 1.3495625405055087, + "grad_norm": 0.04668663069605827, + "learning_rate": 0.00014724785093838606, + "loss": 0.2666, + "step": 16659 + }, + { + "epoch": 1.3496435515230072, + "grad_norm": 0.044138941913843155, + "learning_rate": 
0.00014724335028579145, + "loss": 0.2783, + "step": 16660 + }, + { + "epoch": 1.3497245625405054, + "grad_norm": 0.04154333844780922, + "learning_rate": 0.00014723884963319683, + "loss": 0.3154, + "step": 16661 + }, + { + "epoch": 1.349805573558004, + "grad_norm": 0.04226982966065407, + "learning_rate": 0.0001472343489806022, + "loss": 0.2385, + "step": 16662 + }, + { + "epoch": 1.3498865845755024, + "grad_norm": 0.040852706879377365, + "learning_rate": 0.00014722984832800758, + "loss": 0.2797, + "step": 16663 + }, + { + "epoch": 1.3499675955930006, + "grad_norm": 0.04127061739563942, + "learning_rate": 0.00014722534767541294, + "loss": 0.3099, + "step": 16664 + }, + { + "epoch": 1.350048606610499, + "grad_norm": 0.0487934872508049, + "learning_rate": 0.0001472208470228183, + "loss": 0.2807, + "step": 16665 + }, + { + "epoch": 1.3501296176279973, + "grad_norm": 0.04939623177051544, + "learning_rate": 0.0001472163463702237, + "loss": 0.3383, + "step": 16666 + }, + { + "epoch": 1.3502106286454958, + "grad_norm": 0.05318183824419975, + "learning_rate": 0.00014721184571762908, + "loss": 0.337, + "step": 16667 + }, + { + "epoch": 1.350291639662994, + "grad_norm": 0.03577316924929619, + "learning_rate": 0.00014720734506503444, + "loss": 0.2556, + "step": 16668 + }, + { + "epoch": 1.3503726506804925, + "grad_norm": 0.04493138939142227, + "learning_rate": 0.00014720284441243982, + "loss": 0.2872, + "step": 16669 + }, + { + "epoch": 1.350453661697991, + "grad_norm": 0.03896234929561615, + "learning_rate": 0.00014719834375984518, + "loss": 0.284, + "step": 16670 + }, + { + "epoch": 1.3505346727154892, + "grad_norm": 0.04790037125349045, + "learning_rate": 0.00014719384310725054, + "loss": 0.3177, + "step": 16671 + }, + { + "epoch": 1.3506156837329877, + "grad_norm": 0.049241382628679276, + "learning_rate": 0.00014718934245465593, + "loss": 0.2927, + "step": 16672 + }, + { + "epoch": 1.3506966947504861, + "grad_norm": 0.046655260026454926, + "learning_rate": 0.00014718484180206132, + "loss": 0.2792, + "step": 16673 + }, + { + "epoch": 1.3507777057679844, + "grad_norm": 0.04560606926679611, + "learning_rate": 0.00014718034114946668, + "loss": 0.2893, + "step": 16674 + }, + { + "epoch": 1.3508587167854829, + "grad_norm": 0.059893637895584106, + "learning_rate": 0.00014717584049687207, + "loss": 0.3453, + "step": 16675 + }, + { + "epoch": 1.3509397278029813, + "grad_norm": 0.051331330090761185, + "learning_rate": 0.00014717133984427743, + "loss": 0.2729, + "step": 16676 + }, + { + "epoch": 1.3510207388204796, + "grad_norm": 0.04739075154066086, + "learning_rate": 0.00014716683919168279, + "loss": 0.2542, + "step": 16677 + }, + { + "epoch": 1.351101749837978, + "grad_norm": 0.05445697903633118, + "learning_rate": 0.00014716233853908817, + "loss": 0.2887, + "step": 16678 + }, + { + "epoch": 1.3511827608554763, + "grad_norm": 0.04767782613635063, + "learning_rate": 0.00014715783788649356, + "loss": 0.3112, + "step": 16679 + }, + { + "epoch": 1.3512637718729748, + "grad_norm": 0.04585118964314461, + "learning_rate": 0.00014715333723389892, + "loss": 0.3262, + "step": 16680 + }, + { + "epoch": 1.351344782890473, + "grad_norm": 0.04599033296108246, + "learning_rate": 0.0001471488365813043, + "loss": 0.2791, + "step": 16681 + }, + { + "epoch": 1.3514257939079715, + "grad_norm": 0.041311413049697876, + "learning_rate": 0.00014714433592870967, + "loss": 0.2836, + "step": 16682 + }, + { + "epoch": 1.35150680492547, + "grad_norm": 0.04801380634307861, + "learning_rate": 0.00014713983527611503, + "loss": 0.2978, + 
"step": 16683 + }, + { + "epoch": 1.3515878159429682, + "grad_norm": 0.04437221586704254, + "learning_rate": 0.00014713533462352041, + "loss": 0.3324, + "step": 16684 + }, + { + "epoch": 1.3516688269604666, + "grad_norm": 0.042100001126527786, + "learning_rate": 0.0001471308339709258, + "loss": 0.2648, + "step": 16685 + }, + { + "epoch": 1.351749837977965, + "grad_norm": 0.04318871721625328, + "learning_rate": 0.00014712633331833116, + "loss": 0.311, + "step": 16686 + }, + { + "epoch": 1.3518308489954634, + "grad_norm": 0.037165261805057526, + "learning_rate": 0.00014712183266573655, + "loss": 0.2695, + "step": 16687 + }, + { + "epoch": 1.3519118600129618, + "grad_norm": 0.05104199796915054, + "learning_rate": 0.0001471173320131419, + "loss": 0.3237, + "step": 16688 + }, + { + "epoch": 1.35199287103046, + "grad_norm": 0.05252790078520775, + "learning_rate": 0.00014711283136054727, + "loss": 0.3422, + "step": 16689 + }, + { + "epoch": 1.3520738820479585, + "grad_norm": 0.04297966510057449, + "learning_rate": 0.00014710833070795266, + "loss": 0.3139, + "step": 16690 + }, + { + "epoch": 1.3521548930654568, + "grad_norm": 0.04057961329817772, + "learning_rate": 0.00014710383005535804, + "loss": 0.2905, + "step": 16691 + }, + { + "epoch": 1.3522359040829552, + "grad_norm": 0.043488454073667526, + "learning_rate": 0.0001470993294027634, + "loss": 0.2831, + "step": 16692 + }, + { + "epoch": 1.3523169151004537, + "grad_norm": 0.04362538829445839, + "learning_rate": 0.0001470948287501688, + "loss": 0.31, + "step": 16693 + }, + { + "epoch": 1.352397926117952, + "grad_norm": 0.05452515929937363, + "learning_rate": 0.00014709032809757415, + "loss": 0.3327, + "step": 16694 + }, + { + "epoch": 1.3524789371354504, + "grad_norm": 0.0442318469285965, + "learning_rate": 0.0001470858274449795, + "loss": 0.2992, + "step": 16695 + }, + { + "epoch": 1.3525599481529489, + "grad_norm": 0.04387791454792023, + "learning_rate": 0.0001470813267923849, + "loss": 0.315, + "step": 16696 + }, + { + "epoch": 1.3526409591704471, + "grad_norm": 0.03960513323545456, + "learning_rate": 0.00014707682613979028, + "loss": 0.277, + "step": 16697 + }, + { + "epoch": 1.3527219701879456, + "grad_norm": 0.05234871059656143, + "learning_rate": 0.00014707232548719564, + "loss": 0.3153, + "step": 16698 + }, + { + "epoch": 1.352802981205444, + "grad_norm": 0.04601548984646797, + "learning_rate": 0.00014706782483460103, + "loss": 0.2602, + "step": 16699 + }, + { + "epoch": 1.3528839922229423, + "grad_norm": 0.0502309650182724, + "learning_rate": 0.0001470633241820064, + "loss": 0.3246, + "step": 16700 + }, + { + "epoch": 1.3529650032404408, + "grad_norm": 0.04587402939796448, + "learning_rate": 0.00014705882352941178, + "loss": 0.2925, + "step": 16701 + }, + { + "epoch": 1.353046014257939, + "grad_norm": 0.04139650985598564, + "learning_rate": 0.00014705432287681714, + "loss": 0.305, + "step": 16702 + }, + { + "epoch": 1.3531270252754375, + "grad_norm": 0.04805470257997513, + "learning_rate": 0.00014704982222422253, + "loss": 0.3126, + "step": 16703 + }, + { + "epoch": 1.3532080362929357, + "grad_norm": 0.04224188253283501, + "learning_rate": 0.00014704532157162789, + "loss": 0.273, + "step": 16704 + }, + { + "epoch": 1.3532890473104342, + "grad_norm": 0.04937662184238434, + "learning_rate": 0.00014704082091903327, + "loss": 0.3476, + "step": 16705 + }, + { + "epoch": 1.3533700583279327, + "grad_norm": 0.0471457913517952, + "learning_rate": 0.00014703632026643863, + "loss": 0.3152, + "step": 16706 + }, + { + "epoch": 1.353451069345431, + 
"grad_norm": 0.04233188554644585, + "learning_rate": 0.00014703181961384402, + "loss": 0.2936, + "step": 16707 + }, + { + "epoch": 1.3535320803629294, + "grad_norm": 0.04140879586338997, + "learning_rate": 0.00014702731896124938, + "loss": 0.2541, + "step": 16708 + }, + { + "epoch": 1.3536130913804278, + "grad_norm": 0.050857096910476685, + "learning_rate": 0.00014702281830865477, + "loss": 0.3222, + "step": 16709 + }, + { + "epoch": 1.353694102397926, + "grad_norm": 0.04600680246949196, + "learning_rate": 0.00014701831765606013, + "loss": 0.2904, + "step": 16710 + }, + { + "epoch": 1.3537751134154246, + "grad_norm": 0.04443364217877388, + "learning_rate": 0.00014701381700346551, + "loss": 0.3104, + "step": 16711 + }, + { + "epoch": 1.3538561244329228, + "grad_norm": 0.08359038829803467, + "learning_rate": 0.00014700931635087088, + "loss": 0.3043, + "step": 16712 + }, + { + "epoch": 1.3539371354504213, + "grad_norm": 0.04045131430029869, + "learning_rate": 0.00014700481569827626, + "loss": 0.3045, + "step": 16713 + }, + { + "epoch": 1.3540181464679195, + "grad_norm": 0.04331865534186363, + "learning_rate": 0.00014700031504568165, + "loss": 0.3371, + "step": 16714 + }, + { + "epoch": 1.354099157485418, + "grad_norm": 0.039291027933359146, + "learning_rate": 0.000146995814393087, + "loss": 0.2393, + "step": 16715 + }, + { + "epoch": 1.3541801685029164, + "grad_norm": 0.043209612369537354, + "learning_rate": 0.00014699131374049237, + "loss": 0.3281, + "step": 16716 + }, + { + "epoch": 1.3542611795204147, + "grad_norm": 0.04783690720796585, + "learning_rate": 0.00014698681308789776, + "loss": 0.3078, + "step": 16717 + }, + { + "epoch": 1.3543421905379132, + "grad_norm": 0.04343714192509651, + "learning_rate": 0.00014698231243530312, + "loss": 0.2993, + "step": 16718 + }, + { + "epoch": 1.3544232015554116, + "grad_norm": 0.04262785241007805, + "learning_rate": 0.0001469778117827085, + "loss": 0.2941, + "step": 16719 + }, + { + "epoch": 1.3545042125729099, + "grad_norm": 0.045166414231061935, + "learning_rate": 0.0001469733111301139, + "loss": 0.2887, + "step": 16720 + }, + { + "epoch": 1.3545852235904083, + "grad_norm": 0.04084278643131256, + "learning_rate": 0.00014696881047751925, + "loss": 0.268, + "step": 16721 + }, + { + "epoch": 1.3546662346079068, + "grad_norm": 0.042416613548994064, + "learning_rate": 0.0001469643098249246, + "loss": 0.2828, + "step": 16722 + }, + { + "epoch": 1.354747245625405, + "grad_norm": 0.05023491382598877, + "learning_rate": 0.00014695980917233, + "loss": 0.2986, + "step": 16723 + }, + { + "epoch": 1.3548282566429035, + "grad_norm": 0.04192998260259628, + "learning_rate": 0.00014695530851973539, + "loss": 0.2799, + "step": 16724 + }, + { + "epoch": 1.3549092676604018, + "grad_norm": 0.04842350259423256, + "learning_rate": 0.00014695080786714075, + "loss": 0.3371, + "step": 16725 + }, + { + "epoch": 1.3549902786779002, + "grad_norm": 0.0444394126534462, + "learning_rate": 0.00014694630721454613, + "loss": 0.3015, + "step": 16726 + }, + { + "epoch": 1.3550712896953985, + "grad_norm": 0.04761913791298866, + "learning_rate": 0.0001469418065619515, + "loss": 0.3308, + "step": 16727 + }, + { + "epoch": 1.355152300712897, + "grad_norm": 0.04710664227604866, + "learning_rate": 0.00014693730590935685, + "loss": 0.2814, + "step": 16728 + }, + { + "epoch": 1.3552333117303954, + "grad_norm": 0.04607471823692322, + "learning_rate": 0.00014693280525676224, + "loss": 0.3356, + "step": 16729 + }, + { + "epoch": 1.3553143227478937, + "grad_norm": 0.0481552891433239, + 
"learning_rate": 0.00014692830460416763, + "loss": 0.291, + "step": 16730 + }, + { + "epoch": 1.3553953337653921, + "grad_norm": 0.045392464846372604, + "learning_rate": 0.000146923803951573, + "loss": 0.2821, + "step": 16731 + }, + { + "epoch": 1.3554763447828906, + "grad_norm": 0.045290619134902954, + "learning_rate": 0.00014691930329897837, + "loss": 0.3186, + "step": 16732 + }, + { + "epoch": 1.3555573558003888, + "grad_norm": 0.045965004712343216, + "learning_rate": 0.00014691480264638373, + "loss": 0.3045, + "step": 16733 + }, + { + "epoch": 1.3556383668178873, + "grad_norm": 0.054191283881664276, + "learning_rate": 0.0001469103019937891, + "loss": 0.3299, + "step": 16734 + }, + { + "epoch": 1.3557193778353855, + "grad_norm": 0.04757121577858925, + "learning_rate": 0.00014690580134119448, + "loss": 0.3076, + "step": 16735 + }, + { + "epoch": 1.355800388852884, + "grad_norm": 0.05400211736559868, + "learning_rate": 0.00014690130068859987, + "loss": 0.3373, + "step": 16736 + }, + { + "epoch": 1.3558813998703823, + "grad_norm": 0.05310973897576332, + "learning_rate": 0.00014689680003600523, + "loss": 0.3002, + "step": 16737 + }, + { + "epoch": 1.3559624108878807, + "grad_norm": 0.03847629949450493, + "learning_rate": 0.00014689229938341062, + "loss": 0.2417, + "step": 16738 + }, + { + "epoch": 1.3560434219053792, + "grad_norm": 0.04777615889906883, + "learning_rate": 0.00014688779873081598, + "loss": 0.2763, + "step": 16739 + }, + { + "epoch": 1.3561244329228774, + "grad_norm": 0.04702061042189598, + "learning_rate": 0.00014688329807822134, + "loss": 0.3106, + "step": 16740 + }, + { + "epoch": 1.356205443940376, + "grad_norm": 0.044273655861616135, + "learning_rate": 0.00014687879742562672, + "loss": 0.27, + "step": 16741 + }, + { + "epoch": 1.3562864549578744, + "grad_norm": 0.04390040040016174, + "learning_rate": 0.0001468742967730321, + "loss": 0.3069, + "step": 16742 + }, + { + "epoch": 1.3563674659753726, + "grad_norm": 0.05370044335722923, + "learning_rate": 0.00014686979612043747, + "loss": 0.3169, + "step": 16743 + }, + { + "epoch": 1.356448476992871, + "grad_norm": 0.05036519095301628, + "learning_rate": 0.00014686529546784286, + "loss": 0.315, + "step": 16744 + }, + { + "epoch": 1.3565294880103695, + "grad_norm": 0.04587824270129204, + "learning_rate": 0.00014686079481524822, + "loss": 0.3091, + "step": 16745 + }, + { + "epoch": 1.3566104990278678, + "grad_norm": 0.04910546541213989, + "learning_rate": 0.00014685629416265358, + "loss": 0.281, + "step": 16746 + }, + { + "epoch": 1.356691510045366, + "grad_norm": 0.040887534618377686, + "learning_rate": 0.00014685179351005896, + "loss": 0.2888, + "step": 16747 + }, + { + "epoch": 1.3567725210628645, + "grad_norm": 0.03990871086716652, + "learning_rate": 0.00014684729285746435, + "loss": 0.2821, + "step": 16748 + }, + { + "epoch": 1.356853532080363, + "grad_norm": 0.04099421203136444, + "learning_rate": 0.0001468427922048697, + "loss": 0.2888, + "step": 16749 + }, + { + "epoch": 1.3569345430978612, + "grad_norm": 0.04588409140706062, + "learning_rate": 0.0001468382915522751, + "loss": 0.329, + "step": 16750 + }, + { + "epoch": 1.3570155541153597, + "grad_norm": 0.052914079278707504, + "learning_rate": 0.00014683379089968046, + "loss": 0.3042, + "step": 16751 + }, + { + "epoch": 1.3570965651328581, + "grad_norm": 0.04542308300733566, + "learning_rate": 0.00014682929024708582, + "loss": 0.3289, + "step": 16752 + }, + { + "epoch": 1.3571775761503564, + "grad_norm": 0.048569608479738235, + "learning_rate": 0.0001468247895944912, + 
"loss": 0.3064, + "step": 16753 + }, + { + "epoch": 1.3572585871678549, + "grad_norm": 0.04433464631438255, + "learning_rate": 0.0001468202889418966, + "loss": 0.2435, + "step": 16754 + }, + { + "epoch": 1.3573395981853533, + "grad_norm": 0.052565090358257294, + "learning_rate": 0.00014681578828930195, + "loss": 0.3285, + "step": 16755 + }, + { + "epoch": 1.3574206092028516, + "grad_norm": 0.04711638391017914, + "learning_rate": 0.00014681128763670734, + "loss": 0.2904, + "step": 16756 + }, + { + "epoch": 1.35750162022035, + "grad_norm": 0.05769209563732147, + "learning_rate": 0.0001468067869841127, + "loss": 0.3118, + "step": 16757 + }, + { + "epoch": 1.3575826312378483, + "grad_norm": 0.04438735172152519, + "learning_rate": 0.00014680228633151806, + "loss": 0.2729, + "step": 16758 + }, + { + "epoch": 1.3576636422553467, + "grad_norm": 0.053901974111795425, + "learning_rate": 0.00014679778567892345, + "loss": 0.3609, + "step": 16759 + }, + { + "epoch": 1.357744653272845, + "grad_norm": 0.04904919117689133, + "learning_rate": 0.00014679328502632884, + "loss": 0.2664, + "step": 16760 + }, + { + "epoch": 1.3578256642903435, + "grad_norm": 0.038649268448352814, + "learning_rate": 0.0001467887843737342, + "loss": 0.2512, + "step": 16761 + }, + { + "epoch": 1.357906675307842, + "grad_norm": 0.05013785883784294, + "learning_rate": 0.00014678428372113958, + "loss": 0.275, + "step": 16762 + }, + { + "epoch": 1.3579876863253402, + "grad_norm": 0.05385733023285866, + "learning_rate": 0.00014677978306854494, + "loss": 0.2983, + "step": 16763 + }, + { + "epoch": 1.3580686973428386, + "grad_norm": 0.05296805500984192, + "learning_rate": 0.0001467752824159503, + "loss": 0.3009, + "step": 16764 + }, + { + "epoch": 1.358149708360337, + "grad_norm": 0.048822782933712006, + "learning_rate": 0.0001467707817633557, + "loss": 0.2802, + "step": 16765 + }, + { + "epoch": 1.3582307193778353, + "grad_norm": 0.04486341029405594, + "learning_rate": 0.00014676628111076108, + "loss": 0.2856, + "step": 16766 + }, + { + "epoch": 1.3583117303953338, + "grad_norm": 0.04700850695371628, + "learning_rate": 0.00014676178045816644, + "loss": 0.2815, + "step": 16767 + }, + { + "epoch": 1.3583927414128323, + "grad_norm": 0.04619745537638664, + "learning_rate": 0.00014675727980557182, + "loss": 0.3142, + "step": 16768 + }, + { + "epoch": 1.3584737524303305, + "grad_norm": 0.045293714851140976, + "learning_rate": 0.00014675277915297718, + "loss": 0.2886, + "step": 16769 + }, + { + "epoch": 1.3585547634478288, + "grad_norm": 0.04872259870171547, + "learning_rate": 0.00014674827850038254, + "loss": 0.3158, + "step": 16770 + }, + { + "epoch": 1.3586357744653272, + "grad_norm": 0.05652736499905586, + "learning_rate": 0.00014674377784778793, + "loss": 0.3662, + "step": 16771 + }, + { + "epoch": 1.3587167854828257, + "grad_norm": 0.04640227183699608, + "learning_rate": 0.00014673927719519332, + "loss": 0.3093, + "step": 16772 + }, + { + "epoch": 1.358797796500324, + "grad_norm": 0.04424037039279938, + "learning_rate": 0.00014673477654259868, + "loss": 0.3359, + "step": 16773 + }, + { + "epoch": 1.3588788075178224, + "grad_norm": 0.048362359404563904, + "learning_rate": 0.00014673027589000407, + "loss": 0.2752, + "step": 16774 + }, + { + "epoch": 1.3589598185353209, + "grad_norm": 0.04164674133062363, + "learning_rate": 0.00014672577523740943, + "loss": 0.2781, + "step": 16775 + }, + { + "epoch": 1.3590408295528191, + "grad_norm": 0.04155677929520607, + "learning_rate": 0.0001467212745848148, + "loss": 0.2752, + "step": 16776 + }, + { + 
"epoch": 1.3591218405703176, + "grad_norm": 0.05100404471158981, + "learning_rate": 0.00014671677393222017, + "loss": 0.3171, + "step": 16777 + }, + { + "epoch": 1.359202851587816, + "grad_norm": 0.04038337990641594, + "learning_rate": 0.00014671227327962556, + "loss": 0.2684, + "step": 16778 + }, + { + "epoch": 1.3592838626053143, + "grad_norm": 0.05007191002368927, + "learning_rate": 0.00014670777262703092, + "loss": 0.2847, + "step": 16779 + }, + { + "epoch": 1.3593648736228128, + "grad_norm": 0.04454847052693367, + "learning_rate": 0.0001467032719744363, + "loss": 0.278, + "step": 16780 + }, + { + "epoch": 1.359445884640311, + "grad_norm": 0.040483422577381134, + "learning_rate": 0.00014669877132184167, + "loss": 0.2605, + "step": 16781 + }, + { + "epoch": 1.3595268956578095, + "grad_norm": 0.04559832438826561, + "learning_rate": 0.00014669427066924705, + "loss": 0.3204, + "step": 16782 + }, + { + "epoch": 1.3596079066753077, + "grad_norm": 0.04614581912755966, + "learning_rate": 0.00014668977001665244, + "loss": 0.3391, + "step": 16783 + }, + { + "epoch": 1.3596889176928062, + "grad_norm": 0.047624580562114716, + "learning_rate": 0.0001466852693640578, + "loss": 0.302, + "step": 16784 + }, + { + "epoch": 1.3597699287103047, + "grad_norm": 0.04650955647230148, + "learning_rate": 0.00014668076871146316, + "loss": 0.2762, + "step": 16785 + }, + { + "epoch": 1.359850939727803, + "grad_norm": 0.04590458795428276, + "learning_rate": 0.00014667626805886855, + "loss": 0.2966, + "step": 16786 + }, + { + "epoch": 1.3599319507453014, + "grad_norm": 0.04254709929227829, + "learning_rate": 0.0001466717674062739, + "loss": 0.2759, + "step": 16787 + }, + { + "epoch": 1.3600129617627998, + "grad_norm": 0.04450596868991852, + "learning_rate": 0.0001466672667536793, + "loss": 0.2721, + "step": 16788 + }, + { + "epoch": 1.360093972780298, + "grad_norm": 0.04304325953125954, + "learning_rate": 0.00014666276610108468, + "loss": 0.2606, + "step": 16789 + }, + { + "epoch": 1.3601749837977966, + "grad_norm": 0.046191249042749405, + "learning_rate": 0.00014665826544849004, + "loss": 0.3214, + "step": 16790 + }, + { + "epoch": 1.3602559948152948, + "grad_norm": 0.04645387455821037, + "learning_rate": 0.0001466537647958954, + "loss": 0.3186, + "step": 16791 + }, + { + "epoch": 1.3603370058327933, + "grad_norm": 0.04364481940865517, + "learning_rate": 0.0001466492641433008, + "loss": 0.314, + "step": 16792 + }, + { + "epoch": 1.3604180168502915, + "grad_norm": 0.04091165214776993, + "learning_rate": 0.00014664476349070615, + "loss": 0.2958, + "step": 16793 + }, + { + "epoch": 1.36049902786779, + "grad_norm": 0.04911810904741287, + "learning_rate": 0.00014664026283811154, + "loss": 0.322, + "step": 16794 + }, + { + "epoch": 1.3605800388852884, + "grad_norm": 0.04907684773206711, + "learning_rate": 0.00014663576218551692, + "loss": 0.2986, + "step": 16795 + }, + { + "epoch": 1.3606610499027867, + "grad_norm": 0.04427606239914894, + "learning_rate": 0.00014663126153292228, + "loss": 0.288, + "step": 16796 + }, + { + "epoch": 1.3607420609202852, + "grad_norm": 0.048113491386175156, + "learning_rate": 0.00014662676088032764, + "loss": 0.2997, + "step": 16797 + }, + { + "epoch": 1.3608230719377836, + "grad_norm": 0.04680365324020386, + "learning_rate": 0.00014662226022773303, + "loss": 0.3007, + "step": 16798 + }, + { + "epoch": 1.3609040829552819, + "grad_norm": 0.0571950301527977, + "learning_rate": 0.0001466177595751384, + "loss": 0.2874, + "step": 16799 + }, + { + "epoch": 1.3609850939727803, + "grad_norm": 
0.03910844773054123, + "learning_rate": 0.00014661325892254378, + "loss": 0.2678, + "step": 16800 + }, + { + "epoch": 1.3610661049902788, + "grad_norm": 0.04900038614869118, + "learning_rate": 0.00014660875826994917, + "loss": 0.3503, + "step": 16801 + }, + { + "epoch": 1.361147116007777, + "grad_norm": 0.047941070050001144, + "learning_rate": 0.00014660425761735453, + "loss": 0.3336, + "step": 16802 + }, + { + "epoch": 1.3612281270252755, + "grad_norm": 0.049411624670028687, + "learning_rate": 0.0001465997569647599, + "loss": 0.2802, + "step": 16803 + }, + { + "epoch": 1.3613091380427738, + "grad_norm": 0.04363062605261803, + "learning_rate": 0.00014659525631216527, + "loss": 0.2786, + "step": 16804 + }, + { + "epoch": 1.3613901490602722, + "grad_norm": 0.0482541024684906, + "learning_rate": 0.00014659075565957066, + "loss": 0.2847, + "step": 16805 + }, + { + "epoch": 1.3614711600777705, + "grad_norm": 0.04462805762887001, + "learning_rate": 0.00014658625500697602, + "loss": 0.3073, + "step": 16806 + }, + { + "epoch": 1.361552171095269, + "grad_norm": 0.047291915863752365, + "learning_rate": 0.0001465817543543814, + "loss": 0.2919, + "step": 16807 + }, + { + "epoch": 1.3616331821127674, + "grad_norm": 0.044917941093444824, + "learning_rate": 0.00014657725370178677, + "loss": 0.3, + "step": 16808 + }, + { + "epoch": 1.3617141931302656, + "grad_norm": 0.045333001762628555, + "learning_rate": 0.00014657275304919213, + "loss": 0.2598, + "step": 16809 + }, + { + "epoch": 1.3617952041477641, + "grad_norm": 0.052305012941360474, + "learning_rate": 0.00014656825239659752, + "loss": 0.3113, + "step": 16810 + }, + { + "epoch": 1.3618762151652626, + "grad_norm": 0.046395689249038696, + "learning_rate": 0.0001465637517440029, + "loss": 0.3137, + "step": 16811 + }, + { + "epoch": 1.3619572261827608, + "grad_norm": 0.038306351751089096, + "learning_rate": 0.00014655925109140826, + "loss": 0.2799, + "step": 16812 + }, + { + "epoch": 1.3620382372002593, + "grad_norm": 0.04151049628853798, + "learning_rate": 0.00014655475043881365, + "loss": 0.247, + "step": 16813 + }, + { + "epoch": 1.3621192482177575, + "grad_norm": 0.05814644321799278, + "learning_rate": 0.000146550249786219, + "loss": 0.3111, + "step": 16814 + }, + { + "epoch": 1.362200259235256, + "grad_norm": 0.03642072528600693, + "learning_rate": 0.00014654574913362437, + "loss": 0.2466, + "step": 16815 + }, + { + "epoch": 1.3622812702527543, + "grad_norm": 0.05054713413119316, + "learning_rate": 0.00014654124848102976, + "loss": 0.3172, + "step": 16816 + }, + { + "epoch": 1.3623622812702527, + "grad_norm": 0.04339751601219177, + "learning_rate": 0.00014653674782843514, + "loss": 0.2787, + "step": 16817 + }, + { + "epoch": 1.3624432922877512, + "grad_norm": 0.05909118056297302, + "learning_rate": 0.0001465322471758405, + "loss": 0.3593, + "step": 16818 + }, + { + "epoch": 1.3625243033052494, + "grad_norm": 0.04289279133081436, + "learning_rate": 0.0001465277465232459, + "loss": 0.2902, + "step": 16819 + }, + { + "epoch": 1.362605314322748, + "grad_norm": 0.04428792744874954, + "learning_rate": 0.00014652324587065125, + "loss": 0.2842, + "step": 16820 + }, + { + "epoch": 1.3626863253402464, + "grad_norm": 0.050207529217004776, + "learning_rate": 0.0001465187452180566, + "loss": 0.3221, + "step": 16821 + }, + { + "epoch": 1.3627673363577446, + "grad_norm": 0.04290533438324928, + "learning_rate": 0.000146514244565462, + "loss": 0.2907, + "step": 16822 + }, + { + "epoch": 1.362848347375243, + "grad_norm": 0.04496665298938751, + "learning_rate": 
0.00014650974391286739, + "loss": 0.3309, + "step": 16823 + }, + { + "epoch": 1.3629293583927415, + "grad_norm": 0.04096877574920654, + "learning_rate": 0.00014650524326027275, + "loss": 0.2958, + "step": 16824 + }, + { + "epoch": 1.3630103694102398, + "grad_norm": 0.05002980679273605, + "learning_rate": 0.00014650074260767813, + "loss": 0.3151, + "step": 16825 + }, + { + "epoch": 1.3630913804277383, + "grad_norm": 0.04218354821205139, + "learning_rate": 0.0001464962419550835, + "loss": 0.3038, + "step": 16826 + }, + { + "epoch": 1.3631723914452365, + "grad_norm": 0.047836434096097946, + "learning_rate": 0.00014649174130248885, + "loss": 0.313, + "step": 16827 + }, + { + "epoch": 1.363253402462735, + "grad_norm": 0.04476723447442055, + "learning_rate": 0.00014648724064989424, + "loss": 0.3203, + "step": 16828 + }, + { + "epoch": 1.3633344134802332, + "grad_norm": 0.05513838678598404, + "learning_rate": 0.00014648273999729963, + "loss": 0.3342, + "step": 16829 + }, + { + "epoch": 1.3634154244977317, + "grad_norm": 0.05754096433520317, + "learning_rate": 0.000146478239344705, + "loss": 0.3432, + "step": 16830 + }, + { + "epoch": 1.3634964355152301, + "grad_norm": 0.04611041024327278, + "learning_rate": 0.00014647373869211037, + "loss": 0.2919, + "step": 16831 + }, + { + "epoch": 1.3635774465327284, + "grad_norm": 0.04048772528767586, + "learning_rate": 0.00014646923803951573, + "loss": 0.272, + "step": 16832 + }, + { + "epoch": 1.3636584575502269, + "grad_norm": 0.03679872676730156, + "learning_rate": 0.0001464647373869211, + "loss": 0.2405, + "step": 16833 + }, + { + "epoch": 1.3637394685677253, + "grad_norm": 0.046283673495054245, + "learning_rate": 0.00014646023673432648, + "loss": 0.3303, + "step": 16834 + }, + { + "epoch": 1.3638204795852236, + "grad_norm": 0.047594837844371796, + "learning_rate": 0.00014645573608173187, + "loss": 0.3108, + "step": 16835 + }, + { + "epoch": 1.363901490602722, + "grad_norm": 0.04769527167081833, + "learning_rate": 0.00014645123542913723, + "loss": 0.3136, + "step": 16836 + }, + { + "epoch": 1.3639825016202203, + "grad_norm": 0.04665999859571457, + "learning_rate": 0.00014644673477654262, + "loss": 0.2843, + "step": 16837 + }, + { + "epoch": 1.3640635126377187, + "grad_norm": 0.04001079127192497, + "learning_rate": 0.00014644223412394798, + "loss": 0.2717, + "step": 16838 + }, + { + "epoch": 1.364144523655217, + "grad_norm": 0.04948032647371292, + "learning_rate": 0.00014643773347135334, + "loss": 0.2933, + "step": 16839 + }, + { + "epoch": 1.3642255346727155, + "grad_norm": 0.04881342127919197, + "learning_rate": 0.00014643323281875872, + "loss": 0.3302, + "step": 16840 + }, + { + "epoch": 1.364306545690214, + "grad_norm": 0.04329438880085945, + "learning_rate": 0.0001464287321661641, + "loss": 0.2974, + "step": 16841 + }, + { + "epoch": 1.3643875567077122, + "grad_norm": 0.0506560280919075, + "learning_rate": 0.00014642423151356947, + "loss": 0.3261, + "step": 16842 + }, + { + "epoch": 1.3644685677252106, + "grad_norm": 0.046005334705114365, + "learning_rate": 0.00014641973086097486, + "loss": 0.283, + "step": 16843 + }, + { + "epoch": 1.364549578742709, + "grad_norm": 0.05078957602381706, + "learning_rate": 0.00014641523020838022, + "loss": 0.3248, + "step": 16844 + }, + { + "epoch": 1.3646305897602073, + "grad_norm": 0.04549172893166542, + "learning_rate": 0.00014641072955578558, + "loss": 0.2666, + "step": 16845 + }, + { + "epoch": 1.3647116007777058, + "grad_norm": 0.04811951890587807, + "learning_rate": 0.00014640622890319096, + "loss": 0.2956, + 
"step": 16846 + }, + { + "epoch": 1.3647926117952043, + "grad_norm": 0.05006801337003708, + "learning_rate": 0.00014640172825059635, + "loss": 0.314, + "step": 16847 + }, + { + "epoch": 1.3648736228127025, + "grad_norm": 0.04566046968102455, + "learning_rate": 0.0001463972275980017, + "loss": 0.2905, + "step": 16848 + }, + { + "epoch": 1.3649546338302008, + "grad_norm": 0.049143534153699875, + "learning_rate": 0.0001463927269454071, + "loss": 0.2689, + "step": 16849 + }, + { + "epoch": 1.3650356448476992, + "grad_norm": 0.050470754504203796, + "learning_rate": 0.00014638822629281246, + "loss": 0.3447, + "step": 16850 + }, + { + "epoch": 1.3651166558651977, + "grad_norm": 0.04866886883974075, + "learning_rate": 0.00014638372564021782, + "loss": 0.2903, + "step": 16851 + }, + { + "epoch": 1.365197666882696, + "grad_norm": 0.053343262523412704, + "learning_rate": 0.00014637922498762323, + "loss": 0.2902, + "step": 16852 + }, + { + "epoch": 1.3652786779001944, + "grad_norm": 0.04238571971654892, + "learning_rate": 0.0001463747243350286, + "loss": 0.2767, + "step": 16853 + }, + { + "epoch": 1.3653596889176929, + "grad_norm": 0.054251790046691895, + "learning_rate": 0.00014637022368243395, + "loss": 0.3201, + "step": 16854 + }, + { + "epoch": 1.3654406999351911, + "grad_norm": 0.046247709542512894, + "learning_rate": 0.00014636572302983934, + "loss": 0.3041, + "step": 16855 + }, + { + "epoch": 1.3655217109526896, + "grad_norm": 0.05009758844971657, + "learning_rate": 0.0001463612223772447, + "loss": 0.2653, + "step": 16856 + }, + { + "epoch": 1.365602721970188, + "grad_norm": 0.055453184992074966, + "learning_rate": 0.0001463567217246501, + "loss": 0.3334, + "step": 16857 + }, + { + "epoch": 1.3656837329876863, + "grad_norm": 0.04012593999505043, + "learning_rate": 0.00014635222107205548, + "loss": 0.2727, + "step": 16858 + }, + { + "epoch": 1.3657647440051848, + "grad_norm": 0.04042156785726547, + "learning_rate": 0.00014634772041946084, + "loss": 0.2738, + "step": 16859 + }, + { + "epoch": 1.365845755022683, + "grad_norm": 0.04087120667099953, + "learning_rate": 0.0001463432197668662, + "loss": 0.2631, + "step": 16860 + }, + { + "epoch": 1.3659267660401815, + "grad_norm": 0.04135334491729736, + "learning_rate": 0.00014633871911427158, + "loss": 0.247, + "step": 16861 + }, + { + "epoch": 1.3660077770576797, + "grad_norm": 0.04643384367227554, + "learning_rate": 0.00014633421846167694, + "loss": 0.2988, + "step": 16862 + }, + { + "epoch": 1.3660887880751782, + "grad_norm": 0.04361230880022049, + "learning_rate": 0.00014632971780908233, + "loss": 0.295, + "step": 16863 + }, + { + "epoch": 1.3661697990926767, + "grad_norm": 0.053531769663095474, + "learning_rate": 0.00014632521715648772, + "loss": 0.2676, + "step": 16864 + }, + { + "epoch": 1.366250810110175, + "grad_norm": 0.04573979601264, + "learning_rate": 0.00014632071650389308, + "loss": 0.2732, + "step": 16865 + }, + { + "epoch": 1.3663318211276734, + "grad_norm": 0.05206222087144852, + "learning_rate": 0.00014631621585129844, + "loss": 0.3256, + "step": 16866 + }, + { + "epoch": 1.3664128321451718, + "grad_norm": 0.046878959983587265, + "learning_rate": 0.00014631171519870382, + "loss": 0.3083, + "step": 16867 + }, + { + "epoch": 1.36649384316267, + "grad_norm": 0.046941112726926804, + "learning_rate": 0.00014630721454610918, + "loss": 0.2547, + "step": 16868 + }, + { + "epoch": 1.3665748541801686, + "grad_norm": 0.04700201749801636, + "learning_rate": 0.00014630271389351457, + "loss": 0.3062, + "step": 16869 + }, + { + "epoch": 
1.366655865197667, + "grad_norm": 0.042983245104551315, + "learning_rate": 0.00014629821324091996, + "loss": 0.294, + "step": 16870 + }, + { + "epoch": 1.3667368762151653, + "grad_norm": 0.0427616648375988, + "learning_rate": 0.00014629371258832532, + "loss": 0.2736, + "step": 16871 + }, + { + "epoch": 1.3668178872326635, + "grad_norm": 0.04252465441823006, + "learning_rate": 0.00014628921193573068, + "loss": 0.2706, + "step": 16872 + }, + { + "epoch": 1.366898898250162, + "grad_norm": 0.04775184020400047, + "learning_rate": 0.00014628471128313607, + "loss": 0.3251, + "step": 16873 + }, + { + "epoch": 1.3669799092676604, + "grad_norm": 0.051210030913352966, + "learning_rate": 0.00014628021063054143, + "loss": 0.301, + "step": 16874 + }, + { + "epoch": 1.3670609202851587, + "grad_norm": 0.050241678953170776, + "learning_rate": 0.0001462757099779468, + "loss": 0.3119, + "step": 16875 + }, + { + "epoch": 1.3671419313026572, + "grad_norm": 0.042821403592824936, + "learning_rate": 0.0001462712093253522, + "loss": 0.2613, + "step": 16876 + }, + { + "epoch": 1.3672229423201556, + "grad_norm": 0.04285527765750885, + "learning_rate": 0.00014626670867275756, + "loss": 0.3185, + "step": 16877 + }, + { + "epoch": 1.3673039533376539, + "grad_norm": 0.04558612033724785, + "learning_rate": 0.00014626220802016292, + "loss": 0.2942, + "step": 16878 + }, + { + "epoch": 1.3673849643551523, + "grad_norm": 0.04869738593697548, + "learning_rate": 0.0001462577073675683, + "loss": 0.3108, + "step": 16879 + }, + { + "epoch": 1.3674659753726508, + "grad_norm": 0.04682771861553192, + "learning_rate": 0.00014625320671497367, + "loss": 0.3039, + "step": 16880 + }, + { + "epoch": 1.367546986390149, + "grad_norm": 0.04650042578577995, + "learning_rate": 0.00014624870606237905, + "loss": 0.2919, + "step": 16881 + }, + { + "epoch": 1.3676279974076475, + "grad_norm": 0.042950332164764404, + "learning_rate": 0.00014624420540978444, + "loss": 0.2497, + "step": 16882 + }, + { + "epoch": 1.3677090084251458, + "grad_norm": 0.04653036221861839, + "learning_rate": 0.0001462397047571898, + "loss": 0.2843, + "step": 16883 + }, + { + "epoch": 1.3677900194426442, + "grad_norm": 0.049992140382528305, + "learning_rate": 0.00014623520410459516, + "loss": 0.3237, + "step": 16884 + }, + { + "epoch": 1.3678710304601425, + "grad_norm": 0.0425015352666378, + "learning_rate": 0.00014623070345200055, + "loss": 0.3143, + "step": 16885 + }, + { + "epoch": 1.367952041477641, + "grad_norm": 0.05130867660045624, + "learning_rate": 0.00014622620279940594, + "loss": 0.3054, + "step": 16886 + }, + { + "epoch": 1.3680330524951394, + "grad_norm": 0.045671138912439346, + "learning_rate": 0.0001462217021468113, + "loss": 0.2901, + "step": 16887 + }, + { + "epoch": 1.3681140635126376, + "grad_norm": 0.04083164036273956, + "learning_rate": 0.00014621720149421668, + "loss": 0.2918, + "step": 16888 + }, + { + "epoch": 1.3681950745301361, + "grad_norm": 0.041421759873628616, + "learning_rate": 0.00014621270084162204, + "loss": 0.2704, + "step": 16889 + }, + { + "epoch": 1.3682760855476346, + "grad_norm": 0.053979452699422836, + "learning_rate": 0.0001462082001890274, + "loss": 0.3252, + "step": 16890 + }, + { + "epoch": 1.3683570965651328, + "grad_norm": 0.0409124456346035, + "learning_rate": 0.0001462036995364328, + "loss": 0.2791, + "step": 16891 + }, + { + "epoch": 1.3684381075826313, + "grad_norm": 0.04741591587662697, + "learning_rate": 0.00014619919888383818, + "loss": 0.288, + "step": 16892 + }, + { + "epoch": 1.3685191186001295, + "grad_norm": 
0.054248884320259094, + "learning_rate": 0.00014619469823124354, + "loss": 0.3152, + "step": 16893 + }, + { + "epoch": 1.368600129617628, + "grad_norm": 0.047506216913461685, + "learning_rate": 0.00014619019757864893, + "loss": 0.2871, + "step": 16894 + }, + { + "epoch": 1.3686811406351262, + "grad_norm": 0.04755987972021103, + "learning_rate": 0.00014618569692605429, + "loss": 0.3085, + "step": 16895 + }, + { + "epoch": 1.3687621516526247, + "grad_norm": 0.04973354935646057, + "learning_rate": 0.00014618119627345965, + "loss": 0.3267, + "step": 16896 + }, + { + "epoch": 1.3688431626701232, + "grad_norm": 0.04893682152032852, + "learning_rate": 0.00014617669562086503, + "loss": 0.3173, + "step": 16897 + }, + { + "epoch": 1.3689241736876214, + "grad_norm": 0.044737182557582855, + "learning_rate": 0.00014617219496827042, + "loss": 0.2754, + "step": 16898 + }, + { + "epoch": 1.36900518470512, + "grad_norm": 0.05379049479961395, + "learning_rate": 0.00014616769431567578, + "loss": 0.3503, + "step": 16899 + }, + { + "epoch": 1.3690861957226184, + "grad_norm": 0.04396909475326538, + "learning_rate": 0.00014616319366308117, + "loss": 0.259, + "step": 16900 + }, + { + "epoch": 1.3691672067401166, + "grad_norm": 0.04770589992403984, + "learning_rate": 0.00014615869301048653, + "loss": 0.285, + "step": 16901 + }, + { + "epoch": 1.369248217757615, + "grad_norm": 0.0434144102036953, + "learning_rate": 0.0001461541923578919, + "loss": 0.305, + "step": 16902 + }, + { + "epoch": 1.3693292287751135, + "grad_norm": 0.04375230893492699, + "learning_rate": 0.00014614969170529727, + "loss": 0.2663, + "step": 16903 + }, + { + "epoch": 1.3694102397926118, + "grad_norm": 0.05147865414619446, + "learning_rate": 0.00014614519105270266, + "loss": 0.295, + "step": 16904 + }, + { + "epoch": 1.3694912508101102, + "grad_norm": 0.047179412096738815, + "learning_rate": 0.00014614069040010802, + "loss": 0.3113, + "step": 16905 + }, + { + "epoch": 1.3695722618276085, + "grad_norm": 0.04674246534705162, + "learning_rate": 0.0001461361897475134, + "loss": 0.2754, + "step": 16906 + }, + { + "epoch": 1.369653272845107, + "grad_norm": 0.04732316732406616, + "learning_rate": 0.00014613168909491877, + "loss": 0.3144, + "step": 16907 + }, + { + "epoch": 1.3697342838626052, + "grad_norm": 0.0458158478140831, + "learning_rate": 0.00014612718844232413, + "loss": 0.3086, + "step": 16908 + }, + { + "epoch": 1.3698152948801037, + "grad_norm": 0.049340032041072845, + "learning_rate": 0.00014612268778972952, + "loss": 0.3215, + "step": 16909 + }, + { + "epoch": 1.3698963058976021, + "grad_norm": 0.04514643922448158, + "learning_rate": 0.0001461181871371349, + "loss": 0.3146, + "step": 16910 + }, + { + "epoch": 1.3699773169151004, + "grad_norm": 0.04190957173705101, + "learning_rate": 0.00014611368648454026, + "loss": 0.2956, + "step": 16911 + }, + { + "epoch": 1.3700583279325989, + "grad_norm": 0.04475057125091553, + "learning_rate": 0.00014610918583194565, + "loss": 0.2908, + "step": 16912 + }, + { + "epoch": 1.3701393389500973, + "grad_norm": 0.04853258281946182, + "learning_rate": 0.000146104685179351, + "loss": 0.3459, + "step": 16913 + }, + { + "epoch": 1.3702203499675956, + "grad_norm": 0.04533115401864052, + "learning_rate": 0.00014610018452675637, + "loss": 0.3097, + "step": 16914 + }, + { + "epoch": 1.370301360985094, + "grad_norm": 0.03996393084526062, + "learning_rate": 0.00014609568387416176, + "loss": 0.2828, + "step": 16915 + }, + { + "epoch": 1.3703823720025923, + "grad_norm": 0.05056044086813927, + "learning_rate": 
0.00014609118322156714, + "loss": 0.2908, + "step": 16916 + }, + { + "epoch": 1.3704633830200907, + "grad_norm": 0.04065033793449402, + "learning_rate": 0.0001460866825689725, + "loss": 0.317, + "step": 16917 + }, + { + "epoch": 1.370544394037589, + "grad_norm": 0.04022886976599693, + "learning_rate": 0.0001460821819163779, + "loss": 0.2535, + "step": 16918 + }, + { + "epoch": 1.3706254050550875, + "grad_norm": 0.04769391939043999, + "learning_rate": 0.00014607768126378325, + "loss": 0.3187, + "step": 16919 + }, + { + "epoch": 1.370706416072586, + "grad_norm": 0.043292880058288574, + "learning_rate": 0.0001460731806111886, + "loss": 0.2666, + "step": 16920 + }, + { + "epoch": 1.3707874270900842, + "grad_norm": 0.04575731232762337, + "learning_rate": 0.00014606867995859403, + "loss": 0.3144, + "step": 16921 + }, + { + "epoch": 1.3708684381075826, + "grad_norm": 0.04306940734386444, + "learning_rate": 0.00014606417930599939, + "loss": 0.3005, + "step": 16922 + }, + { + "epoch": 1.370949449125081, + "grad_norm": 0.04159868508577347, + "learning_rate": 0.00014605967865340475, + "loss": 0.2348, + "step": 16923 + }, + { + "epoch": 1.3710304601425793, + "grad_norm": 0.04979290813207626, + "learning_rate": 0.00014605517800081013, + "loss": 0.3427, + "step": 16924 + }, + { + "epoch": 1.3711114711600778, + "grad_norm": 0.05398887023329735, + "learning_rate": 0.0001460506773482155, + "loss": 0.3358, + "step": 16925 + }, + { + "epoch": 1.3711924821775763, + "grad_norm": 0.04066229984164238, + "learning_rate": 0.00014604617669562085, + "loss": 0.2587, + "step": 16926 + }, + { + "epoch": 1.3712734931950745, + "grad_norm": 0.046182893216609955, + "learning_rate": 0.00014604167604302627, + "loss": 0.3177, + "step": 16927 + }, + { + "epoch": 1.371354504212573, + "grad_norm": 0.03884848207235336, + "learning_rate": 0.00014603717539043163, + "loss": 0.2703, + "step": 16928 + }, + { + "epoch": 1.3714355152300712, + "grad_norm": 0.04299648851156235, + "learning_rate": 0.000146032674737837, + "loss": 0.3059, + "step": 16929 + }, + { + "epoch": 1.3715165262475697, + "grad_norm": 0.04192721098661423, + "learning_rate": 0.00014602817408524237, + "loss": 0.2758, + "step": 16930 + }, + { + "epoch": 1.371597537265068, + "grad_norm": 0.055731527507305145, + "learning_rate": 0.00014602367343264773, + "loss": 0.3202, + "step": 16931 + }, + { + "epoch": 1.3716785482825664, + "grad_norm": 0.04261324554681778, + "learning_rate": 0.0001460191727800531, + "loss": 0.2719, + "step": 16932 + }, + { + "epoch": 1.3717595593000649, + "grad_norm": 0.04510960727930069, + "learning_rate": 0.0001460146721274585, + "loss": 0.305, + "step": 16933 + }, + { + "epoch": 1.3718405703175631, + "grad_norm": 0.04371994361281395, + "learning_rate": 0.00014601017147486387, + "loss": 0.2983, + "step": 16934 + }, + { + "epoch": 1.3719215813350616, + "grad_norm": 0.048851270228624344, + "learning_rate": 0.00014600567082226923, + "loss": 0.3076, + "step": 16935 + }, + { + "epoch": 1.37200259235256, + "grad_norm": 0.0492849163711071, + "learning_rate": 0.00014600117016967462, + "loss": 0.3139, + "step": 16936 + }, + { + "epoch": 1.3720836033700583, + "grad_norm": 0.04994687810540199, + "learning_rate": 0.00014599666951707998, + "loss": 0.2965, + "step": 16937 + }, + { + "epoch": 1.3721646143875568, + "grad_norm": 0.044168904423713684, + "learning_rate": 0.00014599216886448536, + "loss": 0.2867, + "step": 16938 + }, + { + "epoch": 1.372245625405055, + "grad_norm": 0.047688718885183334, + "learning_rate": 0.00014598766821189075, + "loss": 0.3327, + 
"step": 16939 + }, + { + "epoch": 1.3723266364225535, + "grad_norm": 0.04277990758419037, + "learning_rate": 0.0001459831675592961, + "loss": 0.2811, + "step": 16940 + }, + { + "epoch": 1.3724076474400517, + "grad_norm": 0.044236935675144196, + "learning_rate": 0.00014597866690670147, + "loss": 0.3336, + "step": 16941 + }, + { + "epoch": 1.3724886584575502, + "grad_norm": 0.042176589369773865, + "learning_rate": 0.00014597416625410686, + "loss": 0.3254, + "step": 16942 + }, + { + "epoch": 1.3725696694750487, + "grad_norm": 0.05188947916030884, + "learning_rate": 0.00014596966560151222, + "loss": 0.3014, + "step": 16943 + }, + { + "epoch": 1.372650680492547, + "grad_norm": 0.04572753980755806, + "learning_rate": 0.0001459651649489176, + "loss": 0.3422, + "step": 16944 + }, + { + "epoch": 1.3727316915100454, + "grad_norm": 0.0529056079685688, + "learning_rate": 0.000145960664296323, + "loss": 0.3064, + "step": 16945 + }, + { + "epoch": 1.3728127025275438, + "grad_norm": 0.05411520600318909, + "learning_rate": 0.00014595616364372835, + "loss": 0.2982, + "step": 16946 + }, + { + "epoch": 1.372893713545042, + "grad_norm": 0.04950553923845291, + "learning_rate": 0.0001459516629911337, + "loss": 0.3289, + "step": 16947 + }, + { + "epoch": 1.3729747245625405, + "grad_norm": 0.04515305906534195, + "learning_rate": 0.0001459471623385391, + "loss": 0.3142, + "step": 16948 + }, + { + "epoch": 1.373055735580039, + "grad_norm": 0.05544627085328102, + "learning_rate": 0.00014594266168594446, + "loss": 0.325, + "step": 16949 + }, + { + "epoch": 1.3731367465975373, + "grad_norm": 0.04322170093655586, + "learning_rate": 0.00014593816103334985, + "loss": 0.2928, + "step": 16950 + }, + { + "epoch": 1.3732177576150357, + "grad_norm": 0.04435882344841957, + "learning_rate": 0.00014593366038075523, + "loss": 0.279, + "step": 16951 + }, + { + "epoch": 1.373298768632534, + "grad_norm": 0.04589640349149704, + "learning_rate": 0.0001459291597281606, + "loss": 0.3156, + "step": 16952 + }, + { + "epoch": 1.3733797796500324, + "grad_norm": 0.04302145168185234, + "learning_rate": 0.00014592465907556595, + "loss": 0.3197, + "step": 16953 + }, + { + "epoch": 1.3734607906675307, + "grad_norm": 0.04616771265864372, + "learning_rate": 0.00014592015842297134, + "loss": 0.2785, + "step": 16954 + }, + { + "epoch": 1.3735418016850292, + "grad_norm": 0.04837910830974579, + "learning_rate": 0.0001459156577703767, + "loss": 0.3168, + "step": 16955 + }, + { + "epoch": 1.3736228127025276, + "grad_norm": 0.056114088743925095, + "learning_rate": 0.0001459111571177821, + "loss": 0.2685, + "step": 16956 + }, + { + "epoch": 1.3737038237200259, + "grad_norm": 0.04780881479382515, + "learning_rate": 0.00014590665646518748, + "loss": 0.295, + "step": 16957 + }, + { + "epoch": 1.3737848347375243, + "grad_norm": 0.047661397606134415, + "learning_rate": 0.00014590215581259284, + "loss": 0.3353, + "step": 16958 + }, + { + "epoch": 1.3738658457550228, + "grad_norm": 0.052063945680856705, + "learning_rate": 0.0001458976551599982, + "loss": 0.298, + "step": 16959 + }, + { + "epoch": 1.373946856772521, + "grad_norm": 0.049559805542230606, + "learning_rate": 0.00014589315450740358, + "loss": 0.3432, + "step": 16960 + }, + { + "epoch": 1.3740278677900195, + "grad_norm": 0.04893847927451134, + "learning_rate": 0.00014588865385480894, + "loss": 0.3224, + "step": 16961 + }, + { + "epoch": 1.3741088788075178, + "grad_norm": 0.04671037942171097, + "learning_rate": 0.00014588415320221433, + "loss": 0.3191, + "step": 16962 + }, + { + "epoch": 
1.3741898898250162, + "grad_norm": 0.04712628945708275, + "learning_rate": 0.00014587965254961972, + "loss": 0.3082, + "step": 16963 + }, + { + "epoch": 1.3742709008425145, + "grad_norm": 0.037124503403902054, + "learning_rate": 0.00014587515189702508, + "loss": 0.2676, + "step": 16964 + }, + { + "epoch": 1.374351911860013, + "grad_norm": 0.04226502776145935, + "learning_rate": 0.00014587065124443044, + "loss": 0.3052, + "step": 16965 + }, + { + "epoch": 1.3744329228775114, + "grad_norm": 0.0540158748626709, + "learning_rate": 0.00014586615059183582, + "loss": 0.2956, + "step": 16966 + }, + { + "epoch": 1.3745139338950096, + "grad_norm": 0.04997124522924423, + "learning_rate": 0.0001458616499392412, + "loss": 0.2883, + "step": 16967 + }, + { + "epoch": 1.374594944912508, + "grad_norm": 0.044050950556993484, + "learning_rate": 0.00014585714928664657, + "loss": 0.2909, + "step": 16968 + }, + { + "epoch": 1.3746759559300066, + "grad_norm": 0.04482242837548256, + "learning_rate": 0.00014585264863405196, + "loss": 0.3312, + "step": 16969 + }, + { + "epoch": 1.3747569669475048, + "grad_norm": 0.049184177070856094, + "learning_rate": 0.00014584814798145732, + "loss": 0.2984, + "step": 16970 + }, + { + "epoch": 1.3748379779650033, + "grad_norm": 0.040518440306186676, + "learning_rate": 0.00014584364732886268, + "loss": 0.278, + "step": 16971 + }, + { + "epoch": 1.3749189889825018, + "grad_norm": 0.0521448478102684, + "learning_rate": 0.00014583914667626807, + "loss": 0.306, + "step": 16972 + }, + { + "epoch": 1.375, + "grad_norm": 0.04701065644621849, + "learning_rate": 0.00014583464602367345, + "loss": 0.3253, + "step": 16973 + }, + { + "epoch": 1.3750810110174982, + "grad_norm": 0.04387321695685387, + "learning_rate": 0.0001458301453710788, + "loss": 0.2966, + "step": 16974 + }, + { + "epoch": 1.3751620220349967, + "grad_norm": 0.04575170576572418, + "learning_rate": 0.0001458256447184842, + "loss": 0.3201, + "step": 16975 + }, + { + "epoch": 1.3752430330524952, + "grad_norm": 0.04442807659506798, + "learning_rate": 0.00014582114406588956, + "loss": 0.2826, + "step": 16976 + }, + { + "epoch": 1.3753240440699934, + "grad_norm": 0.04788399487733841, + "learning_rate": 0.00014581664341329492, + "loss": 0.2797, + "step": 16977 + }, + { + "epoch": 1.375405055087492, + "grad_norm": 0.0455004945397377, + "learning_rate": 0.0001458121427607003, + "loss": 0.3181, + "step": 16978 + }, + { + "epoch": 1.3754860661049904, + "grad_norm": 0.05180566757917404, + "learning_rate": 0.0001458076421081057, + "loss": 0.293, + "step": 16979 + }, + { + "epoch": 1.3755670771224886, + "grad_norm": 0.040193237364292145, + "learning_rate": 0.00014580314145551105, + "loss": 0.2766, + "step": 16980 + }, + { + "epoch": 1.375648088139987, + "grad_norm": 0.05090703070163727, + "learning_rate": 0.00014579864080291644, + "loss": 0.3479, + "step": 16981 + }, + { + "epoch": 1.3757290991574855, + "grad_norm": 0.04570503905415535, + "learning_rate": 0.0001457941401503218, + "loss": 0.3072, + "step": 16982 + }, + { + "epoch": 1.3758101101749838, + "grad_norm": 0.04995448887348175, + "learning_rate": 0.00014578963949772716, + "loss": 0.3209, + "step": 16983 + }, + { + "epoch": 1.3758911211924822, + "grad_norm": 0.048252664506435394, + "learning_rate": 0.00014578513884513255, + "loss": 0.2397, + "step": 16984 + }, + { + "epoch": 1.3759721322099805, + "grad_norm": 0.04591905698180199, + "learning_rate": 0.00014578063819253794, + "loss": 0.3117, + "step": 16985 + }, + { + "epoch": 1.376053143227479, + "grad_norm": 0.04259791970252991, + 
"learning_rate": 0.0001457761375399433, + "loss": 0.2689, + "step": 16986 + }, + { + "epoch": 1.3761341542449772, + "grad_norm": 0.04957650601863861, + "learning_rate": 0.00014577163688734868, + "loss": 0.3, + "step": 16987 + }, + { + "epoch": 1.3762151652624757, + "grad_norm": 0.037725962698459625, + "learning_rate": 0.00014576713623475404, + "loss": 0.2581, + "step": 16988 + }, + { + "epoch": 1.3762961762799741, + "grad_norm": 0.04181136563420296, + "learning_rate": 0.0001457626355821594, + "loss": 0.271, + "step": 16989 + }, + { + "epoch": 1.3763771872974724, + "grad_norm": 0.043500158935785294, + "learning_rate": 0.00014575813492956482, + "loss": 0.2849, + "step": 16990 + }, + { + "epoch": 1.3764581983149708, + "grad_norm": 0.04969172552227974, + "learning_rate": 0.00014575363427697018, + "loss": 0.3, + "step": 16991 + }, + { + "epoch": 1.3765392093324693, + "grad_norm": 0.04714135825634003, + "learning_rate": 0.00014574913362437554, + "loss": 0.3405, + "step": 16992 + }, + { + "epoch": 1.3766202203499676, + "grad_norm": 0.04822403937578201, + "learning_rate": 0.00014574463297178093, + "loss": 0.2928, + "step": 16993 + }, + { + "epoch": 1.376701231367466, + "grad_norm": 0.05032119154930115, + "learning_rate": 0.00014574013231918629, + "loss": 0.3185, + "step": 16994 + }, + { + "epoch": 1.3767822423849643, + "grad_norm": 0.05615558847784996, + "learning_rate": 0.00014573563166659165, + "loss": 0.3301, + "step": 16995 + }, + { + "epoch": 1.3768632534024627, + "grad_norm": 0.06276535987854004, + "learning_rate": 0.00014573113101399706, + "loss": 0.2663, + "step": 16996 + }, + { + "epoch": 1.376944264419961, + "grad_norm": 0.05143308266997337, + "learning_rate": 0.00014572663036140242, + "loss": 0.358, + "step": 16997 + }, + { + "epoch": 1.3770252754374595, + "grad_norm": 0.04123299568891525, + "learning_rate": 0.00014572212970880778, + "loss": 0.2668, + "step": 16998 + }, + { + "epoch": 1.377106286454958, + "grad_norm": 0.047784462571144104, + "learning_rate": 0.00014571762905621317, + "loss": 0.3215, + "step": 16999 + }, + { + "epoch": 1.3771872974724562, + "grad_norm": 0.041549235582351685, + "learning_rate": 0.00014571312840361853, + "loss": 0.2888, + "step": 17000 + }, + { + "epoch": 1.3772683084899546, + "grad_norm": 0.046233825385570526, + "learning_rate": 0.0001457086277510239, + "loss": 0.3146, + "step": 17001 + }, + { + "epoch": 1.377349319507453, + "grad_norm": 0.04736294224858284, + "learning_rate": 0.0001457041270984293, + "loss": 0.289, + "step": 17002 + }, + { + "epoch": 1.3774303305249513, + "grad_norm": 0.03976118192076683, + "learning_rate": 0.00014569962644583466, + "loss": 0.2903, + "step": 17003 + }, + { + "epoch": 1.3775113415424498, + "grad_norm": 0.04543759301304817, + "learning_rate": 0.00014569512579324002, + "loss": 0.2915, + "step": 17004 + }, + { + "epoch": 1.3775923525599483, + "grad_norm": 0.05306227132678032, + "learning_rate": 0.0001456906251406454, + "loss": 0.3543, + "step": 17005 + }, + { + "epoch": 1.3776733635774465, + "grad_norm": 0.045866914093494415, + "learning_rate": 0.00014568612448805077, + "loss": 0.2865, + "step": 17006 + }, + { + "epoch": 1.377754374594945, + "grad_norm": 0.04696584492921829, + "learning_rate": 0.00014568162383545613, + "loss": 0.3061, + "step": 17007 + }, + { + "epoch": 1.3778353856124432, + "grad_norm": 0.04269943758845329, + "learning_rate": 0.00014567712318286154, + "loss": 0.289, + "step": 17008 + }, + { + "epoch": 1.3779163966299417, + "grad_norm": 0.04807358980178833, + "learning_rate": 0.0001456726225302669, + "loss": 
0.3005, + "step": 17009 + }, + { + "epoch": 1.37799740764744, + "grad_norm": 0.04203903675079346, + "learning_rate": 0.00014566812187767226, + "loss": 0.2712, + "step": 17010 + }, + { + "epoch": 1.3780784186649384, + "grad_norm": 0.044012703001499176, + "learning_rate": 0.00014566362122507765, + "loss": 0.2589, + "step": 17011 + }, + { + "epoch": 1.3781594296824369, + "grad_norm": 0.05308305844664574, + "learning_rate": 0.000145659120572483, + "loss": 0.2844, + "step": 17012 + }, + { + "epoch": 1.3782404406999351, + "grad_norm": 0.06335795670747757, + "learning_rate": 0.00014565461991988837, + "loss": 0.2811, + "step": 17013 + }, + { + "epoch": 1.3783214517174336, + "grad_norm": 0.049153171479701996, + "learning_rate": 0.00014565011926729378, + "loss": 0.2922, + "step": 17014 + }, + { + "epoch": 1.378402462734932, + "grad_norm": 0.04896317794919014, + "learning_rate": 0.00014564561861469914, + "loss": 0.2884, + "step": 17015 + }, + { + "epoch": 1.3784834737524303, + "grad_norm": 0.04377609118819237, + "learning_rate": 0.0001456411179621045, + "loss": 0.3007, + "step": 17016 + }, + { + "epoch": 1.3785644847699288, + "grad_norm": 0.042761363089084625, + "learning_rate": 0.0001456366173095099, + "loss": 0.281, + "step": 17017 + }, + { + "epoch": 1.378645495787427, + "grad_norm": 0.044017307460308075, + "learning_rate": 0.00014563211665691525, + "loss": 0.2993, + "step": 17018 + }, + { + "epoch": 1.3787265068049255, + "grad_norm": 0.046515289694070816, + "learning_rate": 0.00014562761600432064, + "loss": 0.2952, + "step": 17019 + }, + { + "epoch": 1.3788075178224237, + "grad_norm": 0.050369229167699814, + "learning_rate": 0.00014562311535172603, + "loss": 0.297, + "step": 17020 + }, + { + "epoch": 1.3788885288399222, + "grad_norm": 0.045705631375312805, + "learning_rate": 0.00014561861469913139, + "loss": 0.2727, + "step": 17021 + }, + { + "epoch": 1.3789695398574207, + "grad_norm": 0.039376430213451385, + "learning_rate": 0.00014561411404653675, + "loss": 0.2973, + "step": 17022 + }, + { + "epoch": 1.379050550874919, + "grad_norm": 0.04685184359550476, + "learning_rate": 0.00014560961339394213, + "loss": 0.2878, + "step": 17023 + }, + { + "epoch": 1.3791315618924174, + "grad_norm": 0.04315702244639397, + "learning_rate": 0.0001456051127413475, + "loss": 0.2918, + "step": 17024 + }, + { + "epoch": 1.3792125729099158, + "grad_norm": 0.051262155175209045, + "learning_rate": 0.00014560061208875288, + "loss": 0.3369, + "step": 17025 + }, + { + "epoch": 1.379293583927414, + "grad_norm": 0.041663069278001785, + "learning_rate": 0.00014559611143615827, + "loss": 0.2638, + "step": 17026 + }, + { + "epoch": 1.3793745949449125, + "grad_norm": 0.04210826754570007, + "learning_rate": 0.00014559161078356363, + "loss": 0.2637, + "step": 17027 + }, + { + "epoch": 1.379455605962411, + "grad_norm": 0.0514579713344574, + "learning_rate": 0.000145587110130969, + "loss": 0.3166, + "step": 17028 + }, + { + "epoch": 1.3795366169799093, + "grad_norm": 0.04053298756480217, + "learning_rate": 0.00014558260947837438, + "loss": 0.2989, + "step": 17029 + }, + { + "epoch": 1.3796176279974077, + "grad_norm": 0.041419435292482376, + "learning_rate": 0.00014557810882577974, + "loss": 0.2582, + "step": 17030 + }, + { + "epoch": 1.379698639014906, + "grad_norm": 0.050826363265514374, + "learning_rate": 0.00014557360817318512, + "loss": 0.2929, + "step": 17031 + }, + { + "epoch": 1.3797796500324044, + "grad_norm": 0.0434403158724308, + "learning_rate": 0.0001455691075205905, + "loss": 0.2849, + "step": 17032 + }, + { + "epoch": 
1.3798606610499027, + "grad_norm": 0.056152284145355225, + "learning_rate": 0.00014556460686799587, + "loss": 0.2936, + "step": 17033 + }, + { + "epoch": 1.3799416720674011, + "grad_norm": 0.04322462156414986, + "learning_rate": 0.00014556010621540123, + "loss": 0.2972, + "step": 17034 + }, + { + "epoch": 1.3800226830848996, + "grad_norm": 0.04615986347198486, + "learning_rate": 0.00014555560556280662, + "loss": 0.2901, + "step": 17035 + }, + { + "epoch": 1.3801036941023979, + "grad_norm": 0.043903838843107224, + "learning_rate": 0.00014555110491021198, + "loss": 0.2885, + "step": 17036 + }, + { + "epoch": 1.3801847051198963, + "grad_norm": 0.04112345725297928, + "learning_rate": 0.00014554660425761736, + "loss": 0.2867, + "step": 17037 + }, + { + "epoch": 1.3802657161373948, + "grad_norm": 0.04830494895577431, + "learning_rate": 0.00014554210360502275, + "loss": 0.3119, + "step": 17038 + }, + { + "epoch": 1.380346727154893, + "grad_norm": 0.053838301450014114, + "learning_rate": 0.0001455376029524281, + "loss": 0.294, + "step": 17039 + }, + { + "epoch": 1.3804277381723915, + "grad_norm": 0.041832443326711655, + "learning_rate": 0.00014553310229983347, + "loss": 0.27, + "step": 17040 + }, + { + "epoch": 1.3805087491898898, + "grad_norm": 0.04943208023905754, + "learning_rate": 0.00014552860164723886, + "loss": 0.3196, + "step": 17041 + }, + { + "epoch": 1.3805897602073882, + "grad_norm": 0.05328208953142166, + "learning_rate": 0.00014552410099464425, + "loss": 0.3522, + "step": 17042 + }, + { + "epoch": 1.3806707712248865, + "grad_norm": 0.052018262445926666, + "learning_rate": 0.0001455196003420496, + "loss": 0.2947, + "step": 17043 + }, + { + "epoch": 1.380751782242385, + "grad_norm": 0.054234579205513, + "learning_rate": 0.000145515099689455, + "loss": 0.3148, + "step": 17044 + }, + { + "epoch": 1.3808327932598834, + "grad_norm": 0.044175803661346436, + "learning_rate": 0.00014551059903686035, + "loss": 0.2758, + "step": 17045 + }, + { + "epoch": 1.3809138042773816, + "grad_norm": 0.046540990471839905, + "learning_rate": 0.0001455060983842657, + "loss": 0.3532, + "step": 17046 + }, + { + "epoch": 1.38099481529488, + "grad_norm": 0.04491589963436127, + "learning_rate": 0.0001455015977316711, + "loss": 0.2979, + "step": 17047 + }, + { + "epoch": 1.3810758263123786, + "grad_norm": 0.044613197445869446, + "learning_rate": 0.0001454970970790765, + "loss": 0.281, + "step": 17048 + }, + { + "epoch": 1.3811568373298768, + "grad_norm": 0.043135564774274826, + "learning_rate": 0.00014549259642648185, + "loss": 0.298, + "step": 17049 + }, + { + "epoch": 1.3812378483473753, + "grad_norm": 0.046049654483795166, + "learning_rate": 0.00014548809577388723, + "loss": 0.312, + "step": 17050 + }, + { + "epoch": 1.3813188593648738, + "grad_norm": 0.05278931185603142, + "learning_rate": 0.0001454835951212926, + "loss": 0.3036, + "step": 17051 + }, + { + "epoch": 1.381399870382372, + "grad_norm": 0.03843000903725624, + "learning_rate": 0.00014547909446869795, + "loss": 0.2619, + "step": 17052 + }, + { + "epoch": 1.3814808813998705, + "grad_norm": 0.043770916759967804, + "learning_rate": 0.00014547459381610334, + "loss": 0.284, + "step": 17053 + }, + { + "epoch": 1.3815618924173687, + "grad_norm": 0.045125383883714676, + "learning_rate": 0.00014547009316350873, + "loss": 0.2819, + "step": 17054 + }, + { + "epoch": 1.3816429034348672, + "grad_norm": 0.043314822018146515, + "learning_rate": 0.0001454655925109141, + "loss": 0.2853, + "step": 17055 + }, + { + "epoch": 1.3817239144523654, + "grad_norm": 
0.05009135231375694, + "learning_rate": 0.00014546109185831948, + "loss": 0.3329, + "step": 17056 + }, + { + "epoch": 1.3818049254698639, + "grad_norm": 0.04217385873198509, + "learning_rate": 0.00014545659120572484, + "loss": 0.3421, + "step": 17057 + }, + { + "epoch": 1.3818859364873624, + "grad_norm": 0.04606221616268158, + "learning_rate": 0.0001454520905531302, + "loss": 0.2794, + "step": 17058 + }, + { + "epoch": 1.3819669475048606, + "grad_norm": 0.04918968677520752, + "learning_rate": 0.00014544758990053558, + "loss": 0.3144, + "step": 17059 + }, + { + "epoch": 1.382047958522359, + "grad_norm": 0.04076380655169487, + "learning_rate": 0.00014544308924794097, + "loss": 0.2515, + "step": 17060 + }, + { + "epoch": 1.3821289695398575, + "grad_norm": 0.045574650168418884, + "learning_rate": 0.00014543858859534633, + "loss": 0.2567, + "step": 17061 + }, + { + "epoch": 1.3822099805573558, + "grad_norm": 0.05103718861937523, + "learning_rate": 0.00014543408794275172, + "loss": 0.2954, + "step": 17062 + }, + { + "epoch": 1.3822909915748542, + "grad_norm": 0.048355668783187866, + "learning_rate": 0.00014542958729015708, + "loss": 0.3115, + "step": 17063 + }, + { + "epoch": 1.3823720025923525, + "grad_norm": 0.04657682776451111, + "learning_rate": 0.00014542508663756244, + "loss": 0.2857, + "step": 17064 + }, + { + "epoch": 1.382453013609851, + "grad_norm": 0.04876073822379112, + "learning_rate": 0.00014542058598496782, + "loss": 0.2728, + "step": 17065 + }, + { + "epoch": 1.3825340246273492, + "grad_norm": 0.04318344220519066, + "learning_rate": 0.0001454160853323732, + "loss": 0.2725, + "step": 17066 + }, + { + "epoch": 1.3826150356448477, + "grad_norm": 0.04278486222028732, + "learning_rate": 0.00014541158467977857, + "loss": 0.2927, + "step": 17067 + }, + { + "epoch": 1.3826960466623461, + "grad_norm": 0.039600007236003876, + "learning_rate": 0.00014540708402718396, + "loss": 0.2698, + "step": 17068 + }, + { + "epoch": 1.3827770576798444, + "grad_norm": 0.03894643113017082, + "learning_rate": 0.00014540258337458932, + "loss": 0.3127, + "step": 17069 + }, + { + "epoch": 1.3828580686973428, + "grad_norm": 0.05142957344651222, + "learning_rate": 0.00014539808272199468, + "loss": 0.3259, + "step": 17070 + }, + { + "epoch": 1.3829390797148413, + "grad_norm": 0.049879662692546844, + "learning_rate": 0.0001453935820694001, + "loss": 0.37, + "step": 17071 + }, + { + "epoch": 1.3830200907323396, + "grad_norm": 0.043519534170627594, + "learning_rate": 0.00014538908141680545, + "loss": 0.2947, + "step": 17072 + }, + { + "epoch": 1.383101101749838, + "grad_norm": 0.054054081439971924, + "learning_rate": 0.00014538458076421081, + "loss": 0.3453, + "step": 17073 + }, + { + "epoch": 1.3831821127673365, + "grad_norm": 0.04072052985429764, + "learning_rate": 0.0001453800801116162, + "loss": 0.2949, + "step": 17074 + }, + { + "epoch": 1.3832631237848347, + "grad_norm": 0.052756231278181076, + "learning_rate": 0.00014537557945902156, + "loss": 0.3266, + "step": 17075 + }, + { + "epoch": 1.383344134802333, + "grad_norm": 0.05641273781657219, + "learning_rate": 0.00014537107880642692, + "loss": 0.3461, + "step": 17076 + }, + { + "epoch": 1.3834251458198314, + "grad_norm": 0.04871641844511032, + "learning_rate": 0.00014536657815383234, + "loss": 0.2565, + "step": 17077 + }, + { + "epoch": 1.38350615683733, + "grad_norm": 0.052028074860572815, + "learning_rate": 0.0001453620775012377, + "loss": 0.3101, + "step": 17078 + }, + { + "epoch": 1.3835871678548282, + "grad_norm": 0.044042039662599564, + "learning_rate": 
0.00014535757684864306, + "loss": 0.3034, + "step": 17079 + }, + { + "epoch": 1.3836681788723266, + "grad_norm": 0.040498316287994385, + "learning_rate": 0.00014535307619604844, + "loss": 0.2616, + "step": 17080 + }, + { + "epoch": 1.383749189889825, + "grad_norm": 0.057534705847501755, + "learning_rate": 0.0001453485755434538, + "loss": 0.2903, + "step": 17081 + }, + { + "epoch": 1.3838302009073233, + "grad_norm": 0.045549727976322174, + "learning_rate": 0.00014534407489085916, + "loss": 0.348, + "step": 17082 + }, + { + "epoch": 1.3839112119248218, + "grad_norm": 0.045121192932128906, + "learning_rate": 0.00014533957423826458, + "loss": 0.299, + "step": 17083 + }, + { + "epoch": 1.3839922229423203, + "grad_norm": 0.04222647100687027, + "learning_rate": 0.00014533507358566994, + "loss": 0.3126, + "step": 17084 + }, + { + "epoch": 1.3840732339598185, + "grad_norm": 0.04713848978281021, + "learning_rate": 0.0001453305729330753, + "loss": 0.268, + "step": 17085 + }, + { + "epoch": 1.384154244977317, + "grad_norm": 0.04560885950922966, + "learning_rate": 0.00014532607228048068, + "loss": 0.2943, + "step": 17086 + }, + { + "epoch": 1.3842352559948152, + "grad_norm": 0.04805802181363106, + "learning_rate": 0.00014532157162788604, + "loss": 0.2794, + "step": 17087 + }, + { + "epoch": 1.3843162670123137, + "grad_norm": 0.052336081862449646, + "learning_rate": 0.0001453170709752914, + "loss": 0.32, + "step": 17088 + }, + { + "epoch": 1.384397278029812, + "grad_norm": 0.040716711431741714, + "learning_rate": 0.00014531257032269682, + "loss": 0.2258, + "step": 17089 + }, + { + "epoch": 1.3844782890473104, + "grad_norm": 0.04950367286801338, + "learning_rate": 0.00014530806967010218, + "loss": 0.309, + "step": 17090 + }, + { + "epoch": 1.3845593000648089, + "grad_norm": 0.045787155628204346, + "learning_rate": 0.00014530356901750754, + "loss": 0.279, + "step": 17091 + }, + { + "epoch": 1.3846403110823071, + "grad_norm": 0.04720460996031761, + "learning_rate": 0.00014529906836491293, + "loss": 0.3106, + "step": 17092 + }, + { + "epoch": 1.3847213220998056, + "grad_norm": 0.048547353595495224, + "learning_rate": 0.00014529456771231829, + "loss": 0.2946, + "step": 17093 + }, + { + "epoch": 1.384802333117304, + "grad_norm": 0.057024236768484116, + "learning_rate": 0.00014529006705972367, + "loss": 0.2819, + "step": 17094 + }, + { + "epoch": 1.3848833441348023, + "grad_norm": 0.04217173159122467, + "learning_rate": 0.00014528556640712906, + "loss": 0.3118, + "step": 17095 + }, + { + "epoch": 1.3849643551523008, + "grad_norm": 0.04492117837071419, + "learning_rate": 0.00014528106575453442, + "loss": 0.2932, + "step": 17096 + }, + { + "epoch": 1.3850453661697992, + "grad_norm": 0.04013582691550255, + "learning_rate": 0.00014527656510193978, + "loss": 0.2452, + "step": 17097 + }, + { + "epoch": 1.3851263771872975, + "grad_norm": 0.04117108881473541, + "learning_rate": 0.00014527206444934517, + "loss": 0.2858, + "step": 17098 + }, + { + "epoch": 1.3852073882047957, + "grad_norm": 0.05159113183617592, + "learning_rate": 0.00014526756379675053, + "loss": 0.3098, + "step": 17099 + }, + { + "epoch": 1.3852883992222942, + "grad_norm": 0.047474876046180725, + "learning_rate": 0.00014526306314415591, + "loss": 0.3164, + "step": 17100 + }, + { + "epoch": 1.3853694102397927, + "grad_norm": 0.05002215877175331, + "learning_rate": 0.0001452585624915613, + "loss": 0.3241, + "step": 17101 + }, + { + "epoch": 1.385450421257291, + "grad_norm": 0.0520719438791275, + "learning_rate": 0.00014525406183896666, + "loss": 0.3029, 
+ "step": 17102 + }, + { + "epoch": 1.3855314322747894, + "grad_norm": 0.05380908399820328, + "learning_rate": 0.00014524956118637202, + "loss": 0.3065, + "step": 17103 + }, + { + "epoch": 1.3856124432922878, + "grad_norm": 0.044111158698797226, + "learning_rate": 0.0001452450605337774, + "loss": 0.2745, + "step": 17104 + }, + { + "epoch": 1.385693454309786, + "grad_norm": 0.04867509752511978, + "learning_rate": 0.00014524055988118277, + "loss": 0.2873, + "step": 17105 + }, + { + "epoch": 1.3857744653272845, + "grad_norm": 0.042329221963882446, + "learning_rate": 0.00014523605922858816, + "loss": 0.2689, + "step": 17106 + }, + { + "epoch": 1.385855476344783, + "grad_norm": 0.048590246587991714, + "learning_rate": 0.00014523155857599354, + "loss": 0.3431, + "step": 17107 + }, + { + "epoch": 1.3859364873622813, + "grad_norm": 0.05173628404736519, + "learning_rate": 0.0001452270579233989, + "loss": 0.3024, + "step": 17108 + }, + { + "epoch": 1.3860174983797797, + "grad_norm": 0.04990382865071297, + "learning_rate": 0.00014522255727080426, + "loss": 0.3145, + "step": 17109 + }, + { + "epoch": 1.386098509397278, + "grad_norm": 0.04661140218377113, + "learning_rate": 0.00014521805661820965, + "loss": 0.3038, + "step": 17110 + }, + { + "epoch": 1.3861795204147764, + "grad_norm": 0.050237026065588, + "learning_rate": 0.000145213555965615, + "loss": 0.3157, + "step": 17111 + }, + { + "epoch": 1.3862605314322747, + "grad_norm": 0.0511772558093071, + "learning_rate": 0.0001452090553130204, + "loss": 0.2904, + "step": 17112 + }, + { + "epoch": 1.3863415424497731, + "grad_norm": 0.05320873484015465, + "learning_rate": 0.00014520455466042578, + "loss": 0.2833, + "step": 17113 + }, + { + "epoch": 1.3864225534672716, + "grad_norm": 0.05491953343153, + "learning_rate": 0.00014520005400783114, + "loss": 0.3432, + "step": 17114 + }, + { + "epoch": 1.3865035644847699, + "grad_norm": 0.0481005497276783, + "learning_rate": 0.0001451955533552365, + "loss": 0.2956, + "step": 17115 + }, + { + "epoch": 1.3865845755022683, + "grad_norm": 0.04878426343202591, + "learning_rate": 0.0001451910527026419, + "loss": 0.3071, + "step": 17116 + }, + { + "epoch": 1.3866655865197668, + "grad_norm": 0.04745024815201759, + "learning_rate": 0.00014518655205004725, + "loss": 0.3111, + "step": 17117 + }, + { + "epoch": 1.386746597537265, + "grad_norm": 0.044038161635398865, + "learning_rate": 0.00014518205139745264, + "loss": 0.2737, + "step": 17118 + }, + { + "epoch": 1.3868276085547635, + "grad_norm": 0.04577605426311493, + "learning_rate": 0.00014517755074485803, + "loss": 0.2782, + "step": 17119 + }, + { + "epoch": 1.3869086195722617, + "grad_norm": 0.052599694579839706, + "learning_rate": 0.0001451730500922634, + "loss": 0.3343, + "step": 17120 + }, + { + "epoch": 1.3869896305897602, + "grad_norm": 0.045472402125597, + "learning_rate": 0.00014516854943966875, + "loss": 0.2855, + "step": 17121 + }, + { + "epoch": 1.3870706416072585, + "grad_norm": 0.04352157935500145, + "learning_rate": 0.00014516404878707413, + "loss": 0.2857, + "step": 17122 + }, + { + "epoch": 1.387151652624757, + "grad_norm": 0.04844208061695099, + "learning_rate": 0.00014515954813447952, + "loss": 0.2909, + "step": 17123 + }, + { + "epoch": 1.3872326636422554, + "grad_norm": 0.041017644107341766, + "learning_rate": 0.00014515504748188488, + "loss": 0.301, + "step": 17124 + }, + { + "epoch": 1.3873136746597536, + "grad_norm": 0.0501578226685524, + "learning_rate": 0.00014515054682929027, + "loss": 0.326, + "step": 17125 + }, + { + "epoch": 1.387394685677252, 
+ "grad_norm": 0.04330161586403847, + "learning_rate": 0.00014514604617669563, + "loss": 0.2822, + "step": 17126 + }, + { + "epoch": 1.3874756966947506, + "grad_norm": 0.046240877360105515, + "learning_rate": 0.000145141545524101, + "loss": 0.2819, + "step": 17127 + }, + { + "epoch": 1.3875567077122488, + "grad_norm": 0.04883384704589844, + "learning_rate": 0.00014513704487150638, + "loss": 0.3387, + "step": 17128 + }, + { + "epoch": 1.3876377187297473, + "grad_norm": 0.04287239536643028, + "learning_rate": 0.00014513254421891176, + "loss": 0.2792, + "step": 17129 + }, + { + "epoch": 1.3877187297472457, + "grad_norm": 0.04883171617984772, + "learning_rate": 0.00014512804356631712, + "loss": 0.3086, + "step": 17130 + }, + { + "epoch": 1.387799740764744, + "grad_norm": 0.04150502011179924, + "learning_rate": 0.0001451235429137225, + "loss": 0.2497, + "step": 17131 + }, + { + "epoch": 1.3878807517822425, + "grad_norm": 0.048590175807476044, + "learning_rate": 0.00014511904226112787, + "loss": 0.3063, + "step": 17132 + }, + { + "epoch": 1.3879617627997407, + "grad_norm": 0.044636547565460205, + "learning_rate": 0.00014511454160853323, + "loss": 0.2559, + "step": 17133 + }, + { + "epoch": 1.3880427738172392, + "grad_norm": 0.047890182584524155, + "learning_rate": 0.00014511004095593862, + "loss": 0.2809, + "step": 17134 + }, + { + "epoch": 1.3881237848347374, + "grad_norm": 0.04968463256955147, + "learning_rate": 0.000145105540303344, + "loss": 0.2887, + "step": 17135 + }, + { + "epoch": 1.3882047958522359, + "grad_norm": 0.04098277539014816, + "learning_rate": 0.00014510103965074936, + "loss": 0.2905, + "step": 17136 + }, + { + "epoch": 1.3882858068697344, + "grad_norm": 0.0413254052400589, + "learning_rate": 0.00014509653899815475, + "loss": 0.2363, + "step": 17137 + }, + { + "epoch": 1.3883668178872326, + "grad_norm": 0.047378312796354294, + "learning_rate": 0.0001450920383455601, + "loss": 0.2808, + "step": 17138 + }, + { + "epoch": 1.388447828904731, + "grad_norm": 0.04449395462870598, + "learning_rate": 0.00014508753769296547, + "loss": 0.2763, + "step": 17139 + }, + { + "epoch": 1.3885288399222295, + "grad_norm": 0.04626189172267914, + "learning_rate": 0.00014508303704037086, + "loss": 0.2849, + "step": 17140 + }, + { + "epoch": 1.3886098509397278, + "grad_norm": 0.03956826031208038, + "learning_rate": 0.00014507853638777625, + "loss": 0.2612, + "step": 17141 + }, + { + "epoch": 1.3886908619572262, + "grad_norm": 0.04964509606361389, + "learning_rate": 0.0001450740357351816, + "loss": 0.3347, + "step": 17142 + }, + { + "epoch": 1.3887718729747245, + "grad_norm": 0.04381033033132553, + "learning_rate": 0.000145069535082587, + "loss": 0.2745, + "step": 17143 + }, + { + "epoch": 1.388852883992223, + "grad_norm": 0.05008375644683838, + "learning_rate": 0.00014506503442999235, + "loss": 0.3153, + "step": 17144 + }, + { + "epoch": 1.3889338950097212, + "grad_norm": 0.053938351571559906, + "learning_rate": 0.0001450605337773977, + "loss": 0.3015, + "step": 17145 + }, + { + "epoch": 1.3890149060272197, + "grad_norm": 0.04867379739880562, + "learning_rate": 0.0001450560331248031, + "loss": 0.3107, + "step": 17146 + }, + { + "epoch": 1.3890959170447181, + "grad_norm": 0.036837805062532425, + "learning_rate": 0.0001450515324722085, + "loss": 0.2407, + "step": 17147 + }, + { + "epoch": 1.3891769280622164, + "grad_norm": 0.04842856153845787, + "learning_rate": 0.00014504703181961385, + "loss": 0.335, + "step": 17148 + }, + { + "epoch": 1.3892579390797148, + "grad_norm": 0.050567056983709335, + 
"learning_rate": 0.00014504253116701923, + "loss": 0.3484, + "step": 17149 + }, + { + "epoch": 1.3893389500972133, + "grad_norm": 0.04667452722787857, + "learning_rate": 0.0001450380305144246, + "loss": 0.3453, + "step": 17150 + }, + { + "epoch": 1.3894199611147116, + "grad_norm": 0.04436580464243889, + "learning_rate": 0.00014503352986182995, + "loss": 0.3176, + "step": 17151 + }, + { + "epoch": 1.38950097213221, + "grad_norm": 0.04287957027554512, + "learning_rate": 0.00014502902920923537, + "loss": 0.2645, + "step": 17152 + }, + { + "epoch": 1.3895819831497085, + "grad_norm": 0.04573435336351395, + "learning_rate": 0.00014502452855664073, + "loss": 0.3097, + "step": 17153 + }, + { + "epoch": 1.3896629941672067, + "grad_norm": 0.052313655614852905, + "learning_rate": 0.0001450200279040461, + "loss": 0.3208, + "step": 17154 + }, + { + "epoch": 1.3897440051847052, + "grad_norm": 0.04121054708957672, + "learning_rate": 0.00014501552725145148, + "loss": 0.2768, + "step": 17155 + }, + { + "epoch": 1.3898250162022034, + "grad_norm": 0.03237319737672806, + "learning_rate": 0.00014501102659885684, + "loss": 0.2415, + "step": 17156 + }, + { + "epoch": 1.389906027219702, + "grad_norm": 0.04825165122747421, + "learning_rate": 0.0001450065259462622, + "loss": 0.3146, + "step": 17157 + }, + { + "epoch": 1.3899870382372002, + "grad_norm": 0.0393325574696064, + "learning_rate": 0.0001450020252936676, + "loss": 0.3009, + "step": 17158 + }, + { + "epoch": 1.3900680492546986, + "grad_norm": 0.04445468261837959, + "learning_rate": 0.00014499752464107297, + "loss": 0.2746, + "step": 17159 + }, + { + "epoch": 1.390149060272197, + "grad_norm": 0.050849344581365585, + "learning_rate": 0.00014499302398847833, + "loss": 0.2809, + "step": 17160 + }, + { + "epoch": 1.3902300712896953, + "grad_norm": 0.04268684610724449, + "learning_rate": 0.00014498852333588372, + "loss": 0.2856, + "step": 17161 + }, + { + "epoch": 1.3903110823071938, + "grad_norm": 0.040725164115428925, + "learning_rate": 0.00014498402268328908, + "loss": 0.2781, + "step": 17162 + }, + { + "epoch": 1.3903920933246923, + "grad_norm": 0.05028020963072777, + "learning_rate": 0.00014497952203069444, + "loss": 0.3093, + "step": 17163 + }, + { + "epoch": 1.3904731043421905, + "grad_norm": 0.04498764127492905, + "learning_rate": 0.00014497502137809985, + "loss": 0.3043, + "step": 17164 + }, + { + "epoch": 1.390554115359689, + "grad_norm": 0.0481903962790966, + "learning_rate": 0.0001449705207255052, + "loss": 0.2977, + "step": 17165 + }, + { + "epoch": 1.3906351263771872, + "grad_norm": 0.055557865649461746, + "learning_rate": 0.00014496602007291057, + "loss": 0.3173, + "step": 17166 + }, + { + "epoch": 1.3907161373946857, + "grad_norm": 0.04860248789191246, + "learning_rate": 0.00014496151942031596, + "loss": 0.3222, + "step": 17167 + }, + { + "epoch": 1.390797148412184, + "grad_norm": 0.048381075263023376, + "learning_rate": 0.00014495701876772132, + "loss": 0.2739, + "step": 17168 + }, + { + "epoch": 1.3908781594296824, + "grad_norm": 0.0555526427924633, + "learning_rate": 0.00014495251811512668, + "loss": 0.3217, + "step": 17169 + }, + { + "epoch": 1.3909591704471809, + "grad_norm": 0.04361342266201973, + "learning_rate": 0.0001449480174625321, + "loss": 0.3157, + "step": 17170 + }, + { + "epoch": 1.3910401814646791, + "grad_norm": 0.05354481190443039, + "learning_rate": 0.00014494351680993745, + "loss": 0.3008, + "step": 17171 + }, + { + "epoch": 1.3911211924821776, + "grad_norm": 0.046501703560352325, + "learning_rate": 0.00014493901615734281, + 
"loss": 0.2904, + "step": 17172 + }, + { + "epoch": 1.391202203499676, + "grad_norm": 0.0433221235871315, + "learning_rate": 0.0001449345155047482, + "loss": 0.2671, + "step": 17173 + }, + { + "epoch": 1.3912832145171743, + "grad_norm": 0.04641888290643692, + "learning_rate": 0.00014493001485215356, + "loss": 0.2833, + "step": 17174 + }, + { + "epoch": 1.3913642255346728, + "grad_norm": 0.047434285283088684, + "learning_rate": 0.00014492551419955895, + "loss": 0.2848, + "step": 17175 + }, + { + "epoch": 1.3914452365521712, + "grad_norm": 0.04717138037085533, + "learning_rate": 0.00014492101354696434, + "loss": 0.3092, + "step": 17176 + }, + { + "epoch": 1.3915262475696695, + "grad_norm": 0.050207510590553284, + "learning_rate": 0.0001449165128943697, + "loss": 0.2864, + "step": 17177 + }, + { + "epoch": 1.3916072585871677, + "grad_norm": 0.047605060040950775, + "learning_rate": 0.00014491201224177506, + "loss": 0.3189, + "step": 17178 + }, + { + "epoch": 1.3916882696046662, + "grad_norm": 0.04513358324766159, + "learning_rate": 0.00014490751158918044, + "loss": 0.3065, + "step": 17179 + }, + { + "epoch": 1.3917692806221647, + "grad_norm": 0.04292263090610504, + "learning_rate": 0.0001449030109365858, + "loss": 0.3054, + "step": 17180 + }, + { + "epoch": 1.391850291639663, + "grad_norm": 0.04038042575120926, + "learning_rate": 0.0001448985102839912, + "loss": 0.2982, + "step": 17181 + }, + { + "epoch": 1.3919313026571614, + "grad_norm": 0.038698893040418625, + "learning_rate": 0.00014489400963139658, + "loss": 0.2906, + "step": 17182 + }, + { + "epoch": 1.3920123136746598, + "grad_norm": 0.04013046249747276, + "learning_rate": 0.00014488950897880194, + "loss": 0.2854, + "step": 17183 + }, + { + "epoch": 1.392093324692158, + "grad_norm": 0.0430646613240242, + "learning_rate": 0.0001448850083262073, + "loss": 0.3223, + "step": 17184 + }, + { + "epoch": 1.3921743357096565, + "grad_norm": 0.04854704812169075, + "learning_rate": 0.00014488050767361268, + "loss": 0.3339, + "step": 17185 + }, + { + "epoch": 1.392255346727155, + "grad_norm": 0.04367974027991295, + "learning_rate": 0.00014487600702101804, + "loss": 0.3059, + "step": 17186 + }, + { + "epoch": 1.3923363577446533, + "grad_norm": 0.044980958104133606, + "learning_rate": 0.00014487150636842343, + "loss": 0.3084, + "step": 17187 + }, + { + "epoch": 1.3924173687621517, + "grad_norm": 0.04090898111462593, + "learning_rate": 0.00014486700571582882, + "loss": 0.2755, + "step": 17188 + }, + { + "epoch": 1.39249837977965, + "grad_norm": 0.04232384264469147, + "learning_rate": 0.00014486250506323418, + "loss": 0.3006, + "step": 17189 + }, + { + "epoch": 1.3925793907971484, + "grad_norm": 0.051384977996349335, + "learning_rate": 0.00014485800441063954, + "loss": 0.342, + "step": 17190 + }, + { + "epoch": 1.3926604018146467, + "grad_norm": 0.04421504586935043, + "learning_rate": 0.00014485350375804493, + "loss": 0.2529, + "step": 17191 + }, + { + "epoch": 1.3927414128321451, + "grad_norm": 0.05006080120801926, + "learning_rate": 0.00014484900310545029, + "loss": 0.2788, + "step": 17192 + }, + { + "epoch": 1.3928224238496436, + "grad_norm": 0.04934096336364746, + "learning_rate": 0.00014484450245285567, + "loss": 0.3027, + "step": 17193 + }, + { + "epoch": 1.3929034348671419, + "grad_norm": 0.04648890718817711, + "learning_rate": 0.00014484000180026106, + "loss": 0.2849, + "step": 17194 + }, + { + "epoch": 1.3929844458846403, + "grad_norm": 0.04589102044701576, + "learning_rate": 0.00014483550114766642, + "loss": 0.3191, + "step": 17195 + }, + { + 
"epoch": 1.3930654569021388, + "grad_norm": 0.05279819667339325, + "learning_rate": 0.00014483100049507178, + "loss": 0.3106, + "step": 17196 + }, + { + "epoch": 1.393146467919637, + "grad_norm": 0.04485386237502098, + "learning_rate": 0.00014482649984247717, + "loss": 0.2625, + "step": 17197 + }, + { + "epoch": 1.3932274789371355, + "grad_norm": 0.05579359084367752, + "learning_rate": 0.00014482199918988253, + "loss": 0.3536, + "step": 17198 + }, + { + "epoch": 1.393308489954634, + "grad_norm": 0.052236780524253845, + "learning_rate": 0.00014481749853728791, + "loss": 0.302, + "step": 17199 + }, + { + "epoch": 1.3933895009721322, + "grad_norm": 0.04790930077433586, + "learning_rate": 0.0001448129978846933, + "loss": 0.2871, + "step": 17200 + }, + { + "epoch": 1.3934705119896305, + "grad_norm": 0.05095363408327103, + "learning_rate": 0.00014480849723209866, + "loss": 0.2967, + "step": 17201 + }, + { + "epoch": 1.393551523007129, + "grad_norm": 0.04160373657941818, + "learning_rate": 0.00014480399657950402, + "loss": 0.2765, + "step": 17202 + }, + { + "epoch": 1.3936325340246274, + "grad_norm": 0.058393362909555435, + "learning_rate": 0.0001447994959269094, + "loss": 0.3585, + "step": 17203 + }, + { + "epoch": 1.3937135450421256, + "grad_norm": 0.049784865230321884, + "learning_rate": 0.0001447949952743148, + "loss": 0.2966, + "step": 17204 + }, + { + "epoch": 1.393794556059624, + "grad_norm": 0.04413321241736412, + "learning_rate": 0.00014479049462172016, + "loss": 0.2658, + "step": 17205 + }, + { + "epoch": 1.3938755670771226, + "grad_norm": 0.05727369338274002, + "learning_rate": 0.00014478599396912554, + "loss": 0.2876, + "step": 17206 + }, + { + "epoch": 1.3939565780946208, + "grad_norm": 0.05408977344632149, + "learning_rate": 0.0001447814933165309, + "loss": 0.294, + "step": 17207 + }, + { + "epoch": 1.3940375891121193, + "grad_norm": 0.056550346314907074, + "learning_rate": 0.00014477699266393626, + "loss": 0.3085, + "step": 17208 + }, + { + "epoch": 1.3941186001296177, + "grad_norm": 0.05060124769806862, + "learning_rate": 0.00014477249201134165, + "loss": 0.297, + "step": 17209 + }, + { + "epoch": 1.394199611147116, + "grad_norm": 0.04698517918586731, + "learning_rate": 0.00014476799135874704, + "loss": 0.3272, + "step": 17210 + }, + { + "epoch": 1.3942806221646145, + "grad_norm": 0.0382196418941021, + "learning_rate": 0.0001447634907061524, + "loss": 0.2702, + "step": 17211 + }, + { + "epoch": 1.3943616331821127, + "grad_norm": 0.05150233581662178, + "learning_rate": 0.00014475899005355779, + "loss": 0.3431, + "step": 17212 + }, + { + "epoch": 1.3944426441996112, + "grad_norm": 0.05292709171772003, + "learning_rate": 0.00014475448940096315, + "loss": 0.3246, + "step": 17213 + }, + { + "epoch": 1.3945236552171094, + "grad_norm": 0.04780266806483269, + "learning_rate": 0.0001447499887483685, + "loss": 0.3281, + "step": 17214 + }, + { + "epoch": 1.3946046662346079, + "grad_norm": 0.04817802086472511, + "learning_rate": 0.0001447454880957739, + "loss": 0.3149, + "step": 17215 + }, + { + "epoch": 1.3946856772521063, + "grad_norm": 0.04268627613782883, + "learning_rate": 0.00014474098744317928, + "loss": 0.2797, + "step": 17216 + }, + { + "epoch": 1.3947666882696046, + "grad_norm": 0.05220309644937515, + "learning_rate": 0.00014473648679058464, + "loss": 0.3197, + "step": 17217 + }, + { + "epoch": 1.394847699287103, + "grad_norm": 0.0412866473197937, + "learning_rate": 0.00014473198613799003, + "loss": 0.2593, + "step": 17218 + }, + { + "epoch": 1.3949287103046015, + "grad_norm": 
0.042804330587387085, + "learning_rate": 0.0001447274854853954, + "loss": 0.3024, + "step": 17219 + }, + { + "epoch": 1.3950097213220998, + "grad_norm": 0.049674104899168015, + "learning_rate": 0.00014472298483280075, + "loss": 0.2912, + "step": 17220 + }, + { + "epoch": 1.3950907323395982, + "grad_norm": 0.04464632272720337, + "learning_rate": 0.00014471848418020613, + "loss": 0.29, + "step": 17221 + }, + { + "epoch": 1.3951717433570965, + "grad_norm": 0.04729987680912018, + "learning_rate": 0.00014471398352761152, + "loss": 0.3131, + "step": 17222 + }, + { + "epoch": 1.395252754374595, + "grad_norm": 0.04054408147931099, + "learning_rate": 0.00014470948287501688, + "loss": 0.2598, + "step": 17223 + }, + { + "epoch": 1.3953337653920932, + "grad_norm": 0.04292924329638481, + "learning_rate": 0.00014470498222242227, + "loss": 0.2883, + "step": 17224 + }, + { + "epoch": 1.3954147764095917, + "grad_norm": 0.05302182585000992, + "learning_rate": 0.00014470048156982763, + "loss": 0.3552, + "step": 17225 + }, + { + "epoch": 1.3954957874270901, + "grad_norm": 0.05824055150151253, + "learning_rate": 0.000144695980917233, + "loss": 0.3049, + "step": 17226 + }, + { + "epoch": 1.3955767984445884, + "grad_norm": 0.04591545835137367, + "learning_rate": 0.0001446914802646384, + "loss": 0.3713, + "step": 17227 + }, + { + "epoch": 1.3956578094620868, + "grad_norm": 0.04642440751194954, + "learning_rate": 0.00014468697961204376, + "loss": 0.2973, + "step": 17228 + }, + { + "epoch": 1.3957388204795853, + "grad_norm": 0.050178322941064835, + "learning_rate": 0.00014468247895944912, + "loss": 0.3178, + "step": 17229 + }, + { + "epoch": 1.3958198314970836, + "grad_norm": 0.04168196767568588, + "learning_rate": 0.0001446779783068545, + "loss": 0.2828, + "step": 17230 + }, + { + "epoch": 1.395900842514582, + "grad_norm": 0.044799406081438065, + "learning_rate": 0.00014467347765425987, + "loss": 0.2779, + "step": 17231 + }, + { + "epoch": 1.3959818535320805, + "grad_norm": 0.04037578031420708, + "learning_rate": 0.00014466897700166523, + "loss": 0.27, + "step": 17232 + }, + { + "epoch": 1.3960628645495787, + "grad_norm": 0.0469634085893631, + "learning_rate": 0.00014466447634907064, + "loss": 0.3166, + "step": 17233 + }, + { + "epoch": 1.3961438755670772, + "grad_norm": 0.04802028089761734, + "learning_rate": 0.000144659975696476, + "loss": 0.2894, + "step": 17234 + }, + { + "epoch": 1.3962248865845754, + "grad_norm": 0.05452967435121536, + "learning_rate": 0.00014465547504388136, + "loss": 0.2915, + "step": 17235 + }, + { + "epoch": 1.396305897602074, + "grad_norm": 0.05245371535420418, + "learning_rate": 0.00014465097439128675, + "loss": 0.3184, + "step": 17236 + }, + { + "epoch": 1.3963869086195722, + "grad_norm": 0.04539962857961655, + "learning_rate": 0.0001446464737386921, + "loss": 0.2804, + "step": 17237 + }, + { + "epoch": 1.3964679196370706, + "grad_norm": 0.04864729195833206, + "learning_rate": 0.00014464197308609747, + "loss": 0.3261, + "step": 17238 + }, + { + "epoch": 1.396548930654569, + "grad_norm": 0.042114321142435074, + "learning_rate": 0.00014463747243350289, + "loss": 0.2805, + "step": 17239 + }, + { + "epoch": 1.3966299416720673, + "grad_norm": 0.045672401785850525, + "learning_rate": 0.00014463297178090825, + "loss": 0.2741, + "step": 17240 + }, + { + "epoch": 1.3967109526895658, + "grad_norm": 0.048088978976011276, + "learning_rate": 0.0001446284711283136, + "loss": 0.3332, + "step": 17241 + }, + { + "epoch": 1.3967919637070643, + "grad_norm": 0.04908011108636856, + "learning_rate": 
0.000144623970475719, + "loss": 0.3323, + "step": 17242 + }, + { + "epoch": 1.3968729747245625, + "grad_norm": 0.041393984109163284, + "learning_rate": 0.00014461946982312435, + "loss": 0.304, + "step": 17243 + }, + { + "epoch": 1.396953985742061, + "grad_norm": 0.041824113577604294, + "learning_rate": 0.0001446149691705297, + "loss": 0.2911, + "step": 17244 + }, + { + "epoch": 1.3970349967595592, + "grad_norm": 0.04793976619839668, + "learning_rate": 0.00014461046851793513, + "loss": 0.3734, + "step": 17245 + }, + { + "epoch": 1.3971160077770577, + "grad_norm": 0.04303743317723274, + "learning_rate": 0.0001446059678653405, + "loss": 0.2384, + "step": 17246 + }, + { + "epoch": 1.397197018794556, + "grad_norm": 0.04717111960053444, + "learning_rate": 0.00014460146721274585, + "loss": 0.3152, + "step": 17247 + }, + { + "epoch": 1.3972780298120544, + "grad_norm": 0.040281910449266434, + "learning_rate": 0.00014459696656015123, + "loss": 0.2517, + "step": 17248 + }, + { + "epoch": 1.3973590408295529, + "grad_norm": 0.056516893208026886, + "learning_rate": 0.0001445924659075566, + "loss": 0.3499, + "step": 17249 + }, + { + "epoch": 1.3974400518470511, + "grad_norm": 0.04654380679130554, + "learning_rate": 0.00014458796525496195, + "loss": 0.3169, + "step": 17250 + }, + { + "epoch": 1.3975210628645496, + "grad_norm": 0.03778151795268059, + "learning_rate": 0.00014458346460236737, + "loss": 0.248, + "step": 17251 + }, + { + "epoch": 1.397602073882048, + "grad_norm": 0.040179815143346786, + "learning_rate": 0.00014457896394977273, + "loss": 0.2426, + "step": 17252 + }, + { + "epoch": 1.3976830848995463, + "grad_norm": 0.0417088121175766, + "learning_rate": 0.0001445744632971781, + "loss": 0.2871, + "step": 17253 + }, + { + "epoch": 1.3977640959170448, + "grad_norm": 0.04187750443816185, + "learning_rate": 0.00014456996264458348, + "loss": 0.2724, + "step": 17254 + }, + { + "epoch": 1.3978451069345432, + "grad_norm": 0.04558548703789711, + "learning_rate": 0.00014456546199198884, + "loss": 0.2547, + "step": 17255 + }, + { + "epoch": 1.3979261179520415, + "grad_norm": 0.04336896911263466, + "learning_rate": 0.00014456096133939422, + "loss": 0.2819, + "step": 17256 + }, + { + "epoch": 1.39800712896954, + "grad_norm": 0.04509134590625763, + "learning_rate": 0.0001445564606867996, + "loss": 0.2874, + "step": 17257 + }, + { + "epoch": 1.3980881399870382, + "grad_norm": 0.05202138051390648, + "learning_rate": 0.00014455196003420497, + "loss": 0.3186, + "step": 17258 + }, + { + "epoch": 1.3981691510045366, + "grad_norm": 0.04768068343400955, + "learning_rate": 0.00014454745938161033, + "loss": 0.2865, + "step": 17259 + }, + { + "epoch": 1.398250162022035, + "grad_norm": 0.050039276480674744, + "learning_rate": 0.00014454295872901572, + "loss": 0.3219, + "step": 17260 + }, + { + "epoch": 1.3983311730395334, + "grad_norm": 0.04390815645456314, + "learning_rate": 0.00014453845807642108, + "loss": 0.2912, + "step": 17261 + }, + { + "epoch": 1.3984121840570318, + "grad_norm": 0.047734200954437256, + "learning_rate": 0.00014453395742382647, + "loss": 0.2992, + "step": 17262 + }, + { + "epoch": 1.39849319507453, + "grad_norm": 0.043062061071395874, + "learning_rate": 0.00014452945677123185, + "loss": 0.2591, + "step": 17263 + }, + { + "epoch": 1.3985742060920285, + "grad_norm": 0.057991400361061096, + "learning_rate": 0.0001445249561186372, + "loss": 0.3354, + "step": 17264 + }, + { + "epoch": 1.398655217109527, + "grad_norm": 0.05490287020802498, + "learning_rate": 0.00014452045546604257, + "loss": 0.2914, + 
"step": 17265 + }, + { + "epoch": 1.3987362281270252, + "grad_norm": 0.04641677439212799, + "learning_rate": 0.00014451595481344796, + "loss": 0.2985, + "step": 17266 + }, + { + "epoch": 1.3988172391445237, + "grad_norm": 0.043556440621614456, + "learning_rate": 0.00014451145416085332, + "loss": 0.2545, + "step": 17267 + }, + { + "epoch": 1.398898250162022, + "grad_norm": 0.050701990723609924, + "learning_rate": 0.0001445069535082587, + "loss": 0.3046, + "step": 17268 + }, + { + "epoch": 1.3989792611795204, + "grad_norm": 0.04276740923523903, + "learning_rate": 0.0001445024528556641, + "loss": 0.2597, + "step": 17269 + }, + { + "epoch": 1.3990602721970187, + "grad_norm": 0.050360411405563354, + "learning_rate": 0.00014449795220306945, + "loss": 0.298, + "step": 17270 + }, + { + "epoch": 1.3991412832145171, + "grad_norm": 0.050647344440221786, + "learning_rate": 0.00014449345155047481, + "loss": 0.29, + "step": 17271 + }, + { + "epoch": 1.3992222942320156, + "grad_norm": 0.04819323495030403, + "learning_rate": 0.0001444889508978802, + "loss": 0.2892, + "step": 17272 + }, + { + "epoch": 1.3993033052495139, + "grad_norm": 0.046121057122945786, + "learning_rate": 0.00014448445024528556, + "loss": 0.2957, + "step": 17273 + }, + { + "epoch": 1.3993843162670123, + "grad_norm": 0.042291220277547836, + "learning_rate": 0.00014447994959269095, + "loss": 0.2936, + "step": 17274 + }, + { + "epoch": 1.3994653272845108, + "grad_norm": 0.04065663740038872, + "learning_rate": 0.00014447544894009634, + "loss": 0.2683, + "step": 17275 + }, + { + "epoch": 1.399546338302009, + "grad_norm": 0.04960739612579346, + "learning_rate": 0.0001444709482875017, + "loss": 0.3218, + "step": 17276 + }, + { + "epoch": 1.3996273493195075, + "grad_norm": 0.0449470616877079, + "learning_rate": 0.00014446644763490706, + "loss": 0.3225, + "step": 17277 + }, + { + "epoch": 1.399708360337006, + "grad_norm": 0.038688644766807556, + "learning_rate": 0.00014446194698231244, + "loss": 0.2883, + "step": 17278 + }, + { + "epoch": 1.3997893713545042, + "grad_norm": 0.04773914813995361, + "learning_rate": 0.00014445744632971783, + "loss": 0.2974, + "step": 17279 + }, + { + "epoch": 1.3998703823720027, + "grad_norm": 0.04623384773731232, + "learning_rate": 0.0001444529456771232, + "loss": 0.2791, + "step": 17280 + }, + { + "epoch": 1.399951393389501, + "grad_norm": 0.05427708104252815, + "learning_rate": 0.00014444844502452858, + "loss": 0.3503, + "step": 17281 + }, + { + "epoch": 1.4000324044069994, + "grad_norm": 0.04826388135552406, + "learning_rate": 0.00014444394437193394, + "loss": 0.2907, + "step": 17282 + }, + { + "epoch": 1.4001134154244976, + "grad_norm": 0.04934573173522949, + "learning_rate": 0.0001444394437193393, + "loss": 0.3046, + "step": 17283 + }, + { + "epoch": 1.400194426441996, + "grad_norm": 0.053253334015607834, + "learning_rate": 0.00014443494306674468, + "loss": 0.2954, + "step": 17284 + }, + { + "epoch": 1.4002754374594946, + "grad_norm": 0.04720380902290344, + "learning_rate": 0.00014443044241415007, + "loss": 0.2678, + "step": 17285 + }, + { + "epoch": 1.4003564484769928, + "grad_norm": 0.05413306504487991, + "learning_rate": 0.00014442594176155543, + "loss": 0.3269, + "step": 17286 + }, + { + "epoch": 1.4004374594944913, + "grad_norm": 0.04777345061302185, + "learning_rate": 0.00014442144110896082, + "loss": 0.2964, + "step": 17287 + }, + { + "epoch": 1.4005184705119897, + "grad_norm": 0.04929986596107483, + "learning_rate": 0.00014441694045636618, + "loss": 0.3227, + "step": 17288 + }, + { + "epoch": 
1.400599481529488, + "grad_norm": 0.05416525527834892, + "learning_rate": 0.00014441243980377154, + "loss": 0.342, + "step": 17289 + }, + { + "epoch": 1.4006804925469865, + "grad_norm": 0.04756501317024231, + "learning_rate": 0.00014440793915117693, + "loss": 0.2701, + "step": 17290 + }, + { + "epoch": 1.4007615035644847, + "grad_norm": 0.05230550467967987, + "learning_rate": 0.0001444034384985823, + "loss": 0.3122, + "step": 17291 + }, + { + "epoch": 1.4008425145819832, + "grad_norm": 0.04841061308979988, + "learning_rate": 0.00014439893784598767, + "loss": 0.283, + "step": 17292 + }, + { + "epoch": 1.4009235255994814, + "grad_norm": 0.054997723549604416, + "learning_rate": 0.00014439443719339306, + "loss": 0.2914, + "step": 17293 + }, + { + "epoch": 1.4010045366169799, + "grad_norm": 0.04789207503199577, + "learning_rate": 0.00014438993654079842, + "loss": 0.3314, + "step": 17294 + }, + { + "epoch": 1.4010855476344783, + "grad_norm": 0.04890673980116844, + "learning_rate": 0.00014438543588820378, + "loss": 0.2881, + "step": 17295 + }, + { + "epoch": 1.4011665586519766, + "grad_norm": 0.052223674952983856, + "learning_rate": 0.00014438093523560917, + "loss": 0.288, + "step": 17296 + }, + { + "epoch": 1.401247569669475, + "grad_norm": 0.04447759687900543, + "learning_rate": 0.00014437643458301455, + "loss": 0.3056, + "step": 17297 + }, + { + "epoch": 1.4013285806869735, + "grad_norm": 0.05134432390332222, + "learning_rate": 0.00014437193393041991, + "loss": 0.3211, + "step": 17298 + }, + { + "epoch": 1.4014095917044718, + "grad_norm": 0.04146159812808037, + "learning_rate": 0.0001443674332778253, + "loss": 0.2842, + "step": 17299 + }, + { + "epoch": 1.4014906027219702, + "grad_norm": 0.049241818487644196, + "learning_rate": 0.00014436293262523066, + "loss": 0.2775, + "step": 17300 + }, + { + "epoch": 1.4015716137394687, + "grad_norm": 0.044086962938308716, + "learning_rate": 0.00014435843197263602, + "loss": 0.2868, + "step": 17301 + }, + { + "epoch": 1.401652624756967, + "grad_norm": 0.04333024471998215, + "learning_rate": 0.0001443539313200414, + "loss": 0.2454, + "step": 17302 + }, + { + "epoch": 1.4017336357744652, + "grad_norm": 0.04945259913802147, + "learning_rate": 0.0001443494306674468, + "loss": 0.3419, + "step": 17303 + }, + { + "epoch": 1.4018146467919637, + "grad_norm": 0.04579545557498932, + "learning_rate": 0.00014434493001485216, + "loss": 0.2969, + "step": 17304 + }, + { + "epoch": 1.4018956578094621, + "grad_norm": 0.04276122525334358, + "learning_rate": 0.00014434042936225754, + "loss": 0.2742, + "step": 17305 + }, + { + "epoch": 1.4019766688269604, + "grad_norm": 0.04025482386350632, + "learning_rate": 0.0001443359287096629, + "loss": 0.2863, + "step": 17306 + }, + { + "epoch": 1.4020576798444588, + "grad_norm": 0.04628659412264824, + "learning_rate": 0.00014433142805706826, + "loss": 0.3348, + "step": 17307 + }, + { + "epoch": 1.4021386908619573, + "grad_norm": 0.04473595693707466, + "learning_rate": 0.00014432692740447368, + "loss": 0.334, + "step": 17308 + }, + { + "epoch": 1.4022197018794555, + "grad_norm": 0.04520515352487564, + "learning_rate": 0.00014432242675187904, + "loss": 0.2613, + "step": 17309 + }, + { + "epoch": 1.402300712896954, + "grad_norm": 0.04926326498389244, + "learning_rate": 0.0001443179260992844, + "loss": 0.2874, + "step": 17310 + }, + { + "epoch": 1.4023817239144525, + "grad_norm": 0.050219591706991196, + "learning_rate": 0.00014431342544668979, + "loss": 0.3218, + "step": 17311 + }, + { + "epoch": 1.4024627349319507, + "grad_norm": 
0.04277326911687851, + "learning_rate": 0.00014430892479409515, + "loss": 0.2502, + "step": 17312 + }, + { + "epoch": 1.4025437459494492, + "grad_norm": 0.04090512916445732, + "learning_rate": 0.0001443044241415005, + "loss": 0.2855, + "step": 17313 + }, + { + "epoch": 1.4026247569669474, + "grad_norm": 0.03912261128425598, + "learning_rate": 0.00014429992348890592, + "loss": 0.2717, + "step": 17314 + }, + { + "epoch": 1.402705767984446, + "grad_norm": 0.04668138921260834, + "learning_rate": 0.00014429542283631128, + "loss": 0.2876, + "step": 17315 + }, + { + "epoch": 1.4027867790019442, + "grad_norm": 0.04623110964894295, + "learning_rate": 0.00014429092218371664, + "loss": 0.3286, + "step": 17316 + }, + { + "epoch": 1.4028677900194426, + "grad_norm": 0.052278418093919754, + "learning_rate": 0.00014428642153112203, + "loss": 0.3155, + "step": 17317 + }, + { + "epoch": 1.402948801036941, + "grad_norm": 0.04533730074763298, + "learning_rate": 0.0001442819208785274, + "loss": 0.279, + "step": 17318 + }, + { + "epoch": 1.4030298120544393, + "grad_norm": 0.047513846307992935, + "learning_rate": 0.00014427742022593275, + "loss": 0.2908, + "step": 17319 + }, + { + "epoch": 1.4031108230719378, + "grad_norm": 0.04029323533177376, + "learning_rate": 0.00014427291957333816, + "loss": 0.2659, + "step": 17320 + }, + { + "epoch": 1.4031918340894363, + "grad_norm": 0.041442520916461945, + "learning_rate": 0.00014426841892074352, + "loss": 0.2502, + "step": 17321 + }, + { + "epoch": 1.4032728451069345, + "grad_norm": 0.039636142551898956, + "learning_rate": 0.00014426391826814888, + "loss": 0.2842, + "step": 17322 + }, + { + "epoch": 1.403353856124433, + "grad_norm": 0.04423043504357338, + "learning_rate": 0.00014425941761555427, + "loss": 0.2613, + "step": 17323 + }, + { + "epoch": 1.4034348671419314, + "grad_norm": 0.04182843491435051, + "learning_rate": 0.00014425491696295963, + "loss": 0.2998, + "step": 17324 + }, + { + "epoch": 1.4035158781594297, + "grad_norm": 0.043021295219659805, + "learning_rate": 0.000144250416310365, + "loss": 0.2717, + "step": 17325 + }, + { + "epoch": 1.403596889176928, + "grad_norm": 0.060046643018722534, + "learning_rate": 0.0001442459156577704, + "loss": 0.313, + "step": 17326 + }, + { + "epoch": 1.4036779001944264, + "grad_norm": 0.04655013605952263, + "learning_rate": 0.00014424141500517576, + "loss": 0.2629, + "step": 17327 + }, + { + "epoch": 1.4037589112119249, + "grad_norm": 0.04855664446949959, + "learning_rate": 0.00014423691435258112, + "loss": 0.2973, + "step": 17328 + }, + { + "epoch": 1.403839922229423, + "grad_norm": 0.044373802840709686, + "learning_rate": 0.0001442324136999865, + "loss": 0.2731, + "step": 17329 + }, + { + "epoch": 1.4039209332469216, + "grad_norm": 0.045156631618738174, + "learning_rate": 0.00014422791304739187, + "loss": 0.3102, + "step": 17330 + }, + { + "epoch": 1.40400194426442, + "grad_norm": 0.048915185034275055, + "learning_rate": 0.00014422341239479723, + "loss": 0.2817, + "step": 17331 + }, + { + "epoch": 1.4040829552819183, + "grad_norm": 0.05194459110498428, + "learning_rate": 0.00014421891174220264, + "loss": 0.2527, + "step": 17332 + }, + { + "epoch": 1.4041639662994168, + "grad_norm": 0.05551757663488388, + "learning_rate": 0.000144214411089608, + "loss": 0.3105, + "step": 17333 + }, + { + "epoch": 1.4042449773169152, + "grad_norm": 0.05318770557641983, + "learning_rate": 0.00014420991043701336, + "loss": 0.282, + "step": 17334 + }, + { + "epoch": 1.4043259883344135, + "grad_norm": 0.05146007612347603, + "learning_rate": 
0.00014420540978441875, + "loss": 0.3012, + "step": 17335 + }, + { + "epoch": 1.404406999351912, + "grad_norm": 0.04602733254432678, + "learning_rate": 0.0001442009091318241, + "loss": 0.2793, + "step": 17336 + }, + { + "epoch": 1.4044880103694102, + "grad_norm": 0.048853326588869095, + "learning_rate": 0.0001441964084792295, + "loss": 0.274, + "step": 17337 + }, + { + "epoch": 1.4045690213869086, + "grad_norm": 0.0575927197933197, + "learning_rate": 0.00014419190782663489, + "loss": 0.3131, + "step": 17338 + }, + { + "epoch": 1.404650032404407, + "grad_norm": 0.04368162155151367, + "learning_rate": 0.00014418740717404025, + "loss": 0.2708, + "step": 17339 + }, + { + "epoch": 1.4047310434219054, + "grad_norm": 0.050481297075748444, + "learning_rate": 0.0001441829065214456, + "loss": 0.3178, + "step": 17340 + }, + { + "epoch": 1.4048120544394038, + "grad_norm": 0.05853893235325813, + "learning_rate": 0.000144178405868851, + "loss": 0.2754, + "step": 17341 + }, + { + "epoch": 1.404893065456902, + "grad_norm": 0.04734257981181145, + "learning_rate": 0.00014417390521625635, + "loss": 0.2655, + "step": 17342 + }, + { + "epoch": 1.4049740764744005, + "grad_norm": 0.04163322597742081, + "learning_rate": 0.00014416940456366174, + "loss": 0.2701, + "step": 17343 + }, + { + "epoch": 1.405055087491899, + "grad_norm": 0.049195900559425354, + "learning_rate": 0.00014416490391106713, + "loss": 0.2844, + "step": 17344 + }, + { + "epoch": 1.4051360985093972, + "grad_norm": 0.04943544417619705, + "learning_rate": 0.0001441604032584725, + "loss": 0.3052, + "step": 17345 + }, + { + "epoch": 1.4052171095268957, + "grad_norm": 0.03431745246052742, + "learning_rate": 0.00014415590260587785, + "loss": 0.2339, + "step": 17346 + }, + { + "epoch": 1.405298120544394, + "grad_norm": 0.0482507050037384, + "learning_rate": 0.00014415140195328324, + "loss": 0.3311, + "step": 17347 + }, + { + "epoch": 1.4053791315618924, + "grad_norm": 0.05462920293211937, + "learning_rate": 0.0001441469013006886, + "loss": 0.2867, + "step": 17348 + }, + { + "epoch": 1.4054601425793907, + "grad_norm": 0.04644763469696045, + "learning_rate": 0.00014414240064809398, + "loss": 0.2695, + "step": 17349 + }, + { + "epoch": 1.4055411535968891, + "grad_norm": 0.046674180775880814, + "learning_rate": 0.00014413789999549937, + "loss": 0.2702, + "step": 17350 + }, + { + "epoch": 1.4056221646143876, + "grad_norm": 0.04269137978553772, + "learning_rate": 0.00014413339934290473, + "loss": 0.2963, + "step": 17351 + }, + { + "epoch": 1.4057031756318858, + "grad_norm": 0.05493892356753349, + "learning_rate": 0.0001441288986903101, + "loss": 0.2937, + "step": 17352 + }, + { + "epoch": 1.4057841866493843, + "grad_norm": 0.04131404310464859, + "learning_rate": 0.00014412439803771548, + "loss": 0.2734, + "step": 17353 + }, + { + "epoch": 1.4058651976668828, + "grad_norm": 0.049556102603673935, + "learning_rate": 0.00014411989738512084, + "loss": 0.2928, + "step": 17354 + }, + { + "epoch": 1.405946208684381, + "grad_norm": 0.05011004954576492, + "learning_rate": 0.00014411539673252622, + "loss": 0.3133, + "step": 17355 + }, + { + "epoch": 1.4060272197018795, + "grad_norm": 0.044947218149900436, + "learning_rate": 0.0001441108960799316, + "loss": 0.279, + "step": 17356 + }, + { + "epoch": 1.406108230719378, + "grad_norm": 0.04014430567622185, + "learning_rate": 0.00014410639542733697, + "loss": 0.2732, + "step": 17357 + }, + { + "epoch": 1.4061892417368762, + "grad_norm": 0.044052235782146454, + "learning_rate": 0.00014410189477474233, + "loss": 0.2701, + 
"step": 17358 + }, + { + "epoch": 1.4062702527543747, + "grad_norm": 0.05131961405277252, + "learning_rate": 0.00014409739412214772, + "loss": 0.2827, + "step": 17359 + }, + { + "epoch": 1.406351263771873, + "grad_norm": 0.048617325723171234, + "learning_rate": 0.0001440928934695531, + "loss": 0.2951, + "step": 17360 + }, + { + "epoch": 1.4064322747893714, + "grad_norm": 0.04641023278236389, + "learning_rate": 0.00014408839281695847, + "loss": 0.2634, + "step": 17361 + }, + { + "epoch": 1.4065132858068696, + "grad_norm": 0.04850432276725769, + "learning_rate": 0.00014408389216436385, + "loss": 0.3197, + "step": 17362 + }, + { + "epoch": 1.406594296824368, + "grad_norm": 0.052125390619039536, + "learning_rate": 0.0001440793915117692, + "loss": 0.318, + "step": 17363 + }, + { + "epoch": 1.4066753078418666, + "grad_norm": 0.05008331313729286, + "learning_rate": 0.00014407489085917457, + "loss": 0.2845, + "step": 17364 + }, + { + "epoch": 1.4067563188593648, + "grad_norm": 0.043981775641441345, + "learning_rate": 0.00014407039020657996, + "loss": 0.2716, + "step": 17365 + }, + { + "epoch": 1.4068373298768633, + "grad_norm": 0.04916222020983696, + "learning_rate": 0.00014406588955398535, + "loss": 0.3511, + "step": 17366 + }, + { + "epoch": 1.4069183408943617, + "grad_norm": 0.05921647697687149, + "learning_rate": 0.0001440613889013907, + "loss": 0.3094, + "step": 17367 + }, + { + "epoch": 1.40699935191186, + "grad_norm": 0.059582922607660294, + "learning_rate": 0.0001440568882487961, + "loss": 0.3039, + "step": 17368 + }, + { + "epoch": 1.4070803629293585, + "grad_norm": 0.046799056231975555, + "learning_rate": 0.00014405238759620145, + "loss": 0.2956, + "step": 17369 + }, + { + "epoch": 1.4071613739468567, + "grad_norm": 0.04210817441344261, + "learning_rate": 0.00014404788694360681, + "loss": 0.282, + "step": 17370 + }, + { + "epoch": 1.4072423849643552, + "grad_norm": 0.047428447753190994, + "learning_rate": 0.0001440433862910122, + "loss": 0.3341, + "step": 17371 + }, + { + "epoch": 1.4073233959818534, + "grad_norm": 0.04631941020488739, + "learning_rate": 0.0001440388856384176, + "loss": 0.2893, + "step": 17372 + }, + { + "epoch": 1.4074044069993519, + "grad_norm": 0.04561125487089157, + "learning_rate": 0.00014403438498582295, + "loss": 0.2995, + "step": 17373 + }, + { + "epoch": 1.4074854180168503, + "grad_norm": 0.041558220982551575, + "learning_rate": 0.00014402988433322834, + "loss": 0.2435, + "step": 17374 + }, + { + "epoch": 1.4075664290343486, + "grad_norm": 0.04296829551458359, + "learning_rate": 0.0001440253836806337, + "loss": 0.2909, + "step": 17375 + }, + { + "epoch": 1.407647440051847, + "grad_norm": 0.04158598557114601, + "learning_rate": 0.00014402088302803906, + "loss": 0.2772, + "step": 17376 + }, + { + "epoch": 1.4077284510693455, + "grad_norm": 0.048123203217983246, + "learning_rate": 0.00014401638237544444, + "loss": 0.317, + "step": 17377 + }, + { + "epoch": 1.4078094620868438, + "grad_norm": 0.046641673892736435, + "learning_rate": 0.00014401188172284983, + "loss": 0.2621, + "step": 17378 + }, + { + "epoch": 1.4078904731043422, + "grad_norm": 0.04805005341768265, + "learning_rate": 0.0001440073810702552, + "loss": 0.2927, + "step": 17379 + }, + { + "epoch": 1.4079714841218407, + "grad_norm": 0.042665332555770874, + "learning_rate": 0.00014400288041766058, + "loss": 0.2765, + "step": 17380 + }, + { + "epoch": 1.408052495139339, + "grad_norm": 0.040832191705703735, + "learning_rate": 0.00014399837976506594, + "loss": 0.2718, + "step": 17381 + }, + { + "epoch": 
1.4081335061568374, + "grad_norm": 0.052016451954841614, + "learning_rate": 0.0001439938791124713, + "loss": 0.3092, + "step": 17382 + }, + { + "epoch": 1.4082145171743357, + "grad_norm": 0.051831454038619995, + "learning_rate": 0.00014398937845987668, + "loss": 0.2816, + "step": 17383 + }, + { + "epoch": 1.4082955281918341, + "grad_norm": 0.05814917013049126, + "learning_rate": 0.00014398487780728207, + "loss": 0.3025, + "step": 17384 + }, + { + "epoch": 1.4083765392093324, + "grad_norm": 0.044517673552036285, + "learning_rate": 0.00014398037715468743, + "loss": 0.302, + "step": 17385 + }, + { + "epoch": 1.4084575502268308, + "grad_norm": 0.04865877702832222, + "learning_rate": 0.00014397587650209282, + "loss": 0.2901, + "step": 17386 + }, + { + "epoch": 1.4085385612443293, + "grad_norm": 0.04901030287146568, + "learning_rate": 0.00014397137584949818, + "loss": 0.2944, + "step": 17387 + }, + { + "epoch": 1.4086195722618275, + "grad_norm": 0.049782563000917435, + "learning_rate": 0.00014396687519690354, + "loss": 0.3291, + "step": 17388 + }, + { + "epoch": 1.408700583279326, + "grad_norm": 0.04484162852168083, + "learning_rate": 0.00014396237454430895, + "loss": 0.2685, + "step": 17389 + }, + { + "epoch": 1.4087815942968245, + "grad_norm": 0.044998399913311005, + "learning_rate": 0.00014395787389171431, + "loss": 0.3116, + "step": 17390 + }, + { + "epoch": 1.4088626053143227, + "grad_norm": 0.05644839629530907, + "learning_rate": 0.00014395337323911967, + "loss": 0.3065, + "step": 17391 + }, + { + "epoch": 1.4089436163318212, + "grad_norm": 0.04989921674132347, + "learning_rate": 0.00014394887258652506, + "loss": 0.305, + "step": 17392 + }, + { + "epoch": 1.4090246273493194, + "grad_norm": 0.04435759410262108, + "learning_rate": 0.00014394437193393042, + "loss": 0.2975, + "step": 17393 + }, + { + "epoch": 1.409105638366818, + "grad_norm": 0.03662458434700966, + "learning_rate": 0.00014393987128133578, + "loss": 0.237, + "step": 17394 + }, + { + "epoch": 1.4091866493843161, + "grad_norm": 0.04757731780409813, + "learning_rate": 0.0001439353706287412, + "loss": 0.2674, + "step": 17395 + }, + { + "epoch": 1.4092676604018146, + "grad_norm": 0.04408986121416092, + "learning_rate": 0.00014393086997614656, + "loss": 0.2879, + "step": 17396 + }, + { + "epoch": 1.409348671419313, + "grad_norm": 0.04086359590291977, + "learning_rate": 0.00014392636932355192, + "loss": 0.2643, + "step": 17397 + }, + { + "epoch": 1.4094296824368113, + "grad_norm": 0.04684007167816162, + "learning_rate": 0.0001439218686709573, + "loss": 0.3393, + "step": 17398 + }, + { + "epoch": 1.4095106934543098, + "grad_norm": 0.05799132585525513, + "learning_rate": 0.00014391736801836266, + "loss": 0.3512, + "step": 17399 + }, + { + "epoch": 1.4095917044718083, + "grad_norm": 0.046153489500284195, + "learning_rate": 0.00014391286736576802, + "loss": 0.2964, + "step": 17400 + }, + { + "epoch": 1.4096727154893065, + "grad_norm": 0.041574712842702866, + "learning_rate": 0.00014390836671317344, + "loss": 0.306, + "step": 17401 + }, + { + "epoch": 1.409753726506805, + "grad_norm": 0.046612098813056946, + "learning_rate": 0.0001439038660605788, + "loss": 0.3065, + "step": 17402 + }, + { + "epoch": 1.4098347375243034, + "grad_norm": 0.056140147149562836, + "learning_rate": 0.00014389936540798416, + "loss": 0.3139, + "step": 17403 + }, + { + "epoch": 1.4099157485418017, + "grad_norm": 0.04897418990731239, + "learning_rate": 0.00014389486475538954, + "loss": 0.3261, + "step": 17404 + }, + { + "epoch": 1.4099967595593, + "grad_norm": 
0.046919066458940506, + "learning_rate": 0.0001438903641027949, + "loss": 0.2988, + "step": 17405 + }, + { + "epoch": 1.4100777705767984, + "grad_norm": 0.044254470616579056, + "learning_rate": 0.00014388586345020026, + "loss": 0.2725, + "step": 17406 + }, + { + "epoch": 1.4101587815942969, + "grad_norm": 0.04446631669998169, + "learning_rate": 0.00014388136279760568, + "loss": 0.2901, + "step": 17407 + }, + { + "epoch": 1.410239792611795, + "grad_norm": 0.041478581726551056, + "learning_rate": 0.00014387686214501104, + "loss": 0.2538, + "step": 17408 + }, + { + "epoch": 1.4103208036292936, + "grad_norm": 0.04481646791100502, + "learning_rate": 0.0001438723614924164, + "loss": 0.2382, + "step": 17409 + }, + { + "epoch": 1.410401814646792, + "grad_norm": 0.05845337361097336, + "learning_rate": 0.00014386786083982179, + "loss": 0.2876, + "step": 17410 + }, + { + "epoch": 1.4104828256642903, + "grad_norm": 0.04668666422367096, + "learning_rate": 0.00014386336018722715, + "loss": 0.2543, + "step": 17411 + }, + { + "epoch": 1.4105638366817888, + "grad_norm": 0.05035103112459183, + "learning_rate": 0.00014385885953463253, + "loss": 0.31, + "step": 17412 + }, + { + "epoch": 1.4106448476992872, + "grad_norm": 0.050838127732276917, + "learning_rate": 0.00014385435888203792, + "loss": 0.3168, + "step": 17413 + }, + { + "epoch": 1.4107258587167855, + "grad_norm": 0.045839011669158936, + "learning_rate": 0.00014384985822944328, + "loss": 0.3024, + "step": 17414 + }, + { + "epoch": 1.410806869734284, + "grad_norm": 0.05352982133626938, + "learning_rate": 0.00014384535757684864, + "loss": 0.244, + "step": 17415 + }, + { + "epoch": 1.4108878807517822, + "grad_norm": 0.03870326653122902, + "learning_rate": 0.00014384085692425403, + "loss": 0.3024, + "step": 17416 + }, + { + "epoch": 1.4109688917692806, + "grad_norm": 0.04420241713523865, + "learning_rate": 0.0001438363562716594, + "loss": 0.3003, + "step": 17417 + }, + { + "epoch": 1.4110499027867789, + "grad_norm": 0.0470711812376976, + "learning_rate": 0.00014383185561906477, + "loss": 0.314, + "step": 17418 + }, + { + "epoch": 1.4111309138042774, + "grad_norm": 0.05128021910786629, + "learning_rate": 0.00014382735496647016, + "loss": 0.2769, + "step": 17419 + }, + { + "epoch": 1.4112119248217758, + "grad_norm": 0.05623095855116844, + "learning_rate": 0.00014382285431387552, + "loss": 0.2973, + "step": 17420 + }, + { + "epoch": 1.411292935839274, + "grad_norm": 0.04474678263068199, + "learning_rate": 0.00014381835366128088, + "loss": 0.2599, + "step": 17421 + }, + { + "epoch": 1.4113739468567725, + "grad_norm": 0.047019362449645996, + "learning_rate": 0.00014381385300868627, + "loss": 0.3274, + "step": 17422 + }, + { + "epoch": 1.411454957874271, + "grad_norm": 0.048342298716306686, + "learning_rate": 0.00014380935235609163, + "loss": 0.2736, + "step": 17423 + }, + { + "epoch": 1.4115359688917692, + "grad_norm": 0.05731989070773125, + "learning_rate": 0.00014380485170349702, + "loss": 0.3377, + "step": 17424 + }, + { + "epoch": 1.4116169799092677, + "grad_norm": 0.05092157796025276, + "learning_rate": 0.0001438003510509024, + "loss": 0.2952, + "step": 17425 + }, + { + "epoch": 1.4116979909267662, + "grad_norm": 0.05478844791650772, + "learning_rate": 0.00014379585039830776, + "loss": 0.2794, + "step": 17426 + }, + { + "epoch": 1.4117790019442644, + "grad_norm": 0.055522721260786057, + "learning_rate": 0.00014379134974571312, + "loss": 0.3056, + "step": 17427 + }, + { + "epoch": 1.4118600129617627, + "grad_norm": 0.058090608566999435, + "learning_rate": 
0.0001437868490931185, + "loss": 0.316, + "step": 17428 + }, + { + "epoch": 1.4119410239792611, + "grad_norm": 0.050185300409793854, + "learning_rate": 0.00014378234844052387, + "loss": 0.3293, + "step": 17429 + }, + { + "epoch": 1.4120220349967596, + "grad_norm": 0.053385164588689804, + "learning_rate": 0.00014377784778792926, + "loss": 0.273, + "step": 17430 + }, + { + "epoch": 1.4121030460142578, + "grad_norm": 0.039984747767448425, + "learning_rate": 0.00014377334713533464, + "loss": 0.2756, + "step": 17431 + }, + { + "epoch": 1.4121840570317563, + "grad_norm": 0.04153714329004288, + "learning_rate": 0.00014376884648274, + "loss": 0.2774, + "step": 17432 + }, + { + "epoch": 1.4122650680492548, + "grad_norm": 0.04483303055167198, + "learning_rate": 0.00014376434583014536, + "loss": 0.2972, + "step": 17433 + }, + { + "epoch": 1.412346079066753, + "grad_norm": 0.048719923943281174, + "learning_rate": 0.00014375984517755075, + "loss": 0.2931, + "step": 17434 + }, + { + "epoch": 1.4124270900842515, + "grad_norm": 0.05421128123998642, + "learning_rate": 0.0001437553445249561, + "loss": 0.3014, + "step": 17435 + }, + { + "epoch": 1.41250810110175, + "grad_norm": 0.04559450224041939, + "learning_rate": 0.0001437508438723615, + "loss": 0.2765, + "step": 17436 + }, + { + "epoch": 1.4125891121192482, + "grad_norm": 0.04924327880144119, + "learning_rate": 0.0001437463432197669, + "loss": 0.2795, + "step": 17437 + }, + { + "epoch": 1.4126701231367467, + "grad_norm": 0.047822628170251846, + "learning_rate": 0.00014374184256717225, + "loss": 0.2936, + "step": 17438 + }, + { + "epoch": 1.412751134154245, + "grad_norm": 0.05012073740363121, + "learning_rate": 0.0001437373419145776, + "loss": 0.2822, + "step": 17439 + }, + { + "epoch": 1.4128321451717434, + "grad_norm": 0.04607747495174408, + "learning_rate": 0.000143732841261983, + "loss": 0.2702, + "step": 17440 + }, + { + "epoch": 1.4129131561892416, + "grad_norm": 0.0424031987786293, + "learning_rate": 0.00014372834060938838, + "loss": 0.2642, + "step": 17441 + }, + { + "epoch": 1.41299416720674, + "grad_norm": 0.04610888659954071, + "learning_rate": 0.00014372383995679374, + "loss": 0.3052, + "step": 17442 + }, + { + "epoch": 1.4130751782242386, + "grad_norm": 0.05413646996021271, + "learning_rate": 0.00014371933930419913, + "loss": 0.3381, + "step": 17443 + }, + { + "epoch": 1.4131561892417368, + "grad_norm": 0.041760869324207306, + "learning_rate": 0.0001437148386516045, + "loss": 0.2994, + "step": 17444 + }, + { + "epoch": 1.4132372002592353, + "grad_norm": 0.05390756204724312, + "learning_rate": 0.00014371033799900985, + "loss": 0.3173, + "step": 17445 + }, + { + "epoch": 1.4133182112767337, + "grad_norm": 0.04660959541797638, + "learning_rate": 0.00014370583734641524, + "loss": 0.2799, + "step": 17446 + }, + { + "epoch": 1.413399222294232, + "grad_norm": 0.05528154969215393, + "learning_rate": 0.00014370133669382062, + "loss": 0.2931, + "step": 17447 + }, + { + "epoch": 1.4134802333117304, + "grad_norm": 0.052793197333812714, + "learning_rate": 0.00014369683604122598, + "loss": 0.3628, + "step": 17448 + }, + { + "epoch": 1.4135612443292287, + "grad_norm": 0.059102971106767654, + "learning_rate": 0.00014369233538863137, + "loss": 0.2787, + "step": 17449 + }, + { + "epoch": 1.4136422553467272, + "grad_norm": 0.04352337867021561, + "learning_rate": 0.00014368783473603673, + "loss": 0.2759, + "step": 17450 + }, + { + "epoch": 1.4137232663642254, + "grad_norm": 0.04289750009775162, + "learning_rate": 0.0001436833340834421, + "loss": 0.2716, + 
"step": 17451 + }, + { + "epoch": 1.4138042773817239, + "grad_norm": 0.04225050285458565, + "learning_rate": 0.00014367883343084748, + "loss": 0.2609, + "step": 17452 + }, + { + "epoch": 1.4138852883992223, + "grad_norm": 0.0459427535533905, + "learning_rate": 0.00014367433277825286, + "loss": 0.3019, + "step": 17453 + }, + { + "epoch": 1.4139662994167206, + "grad_norm": 0.04791085422039032, + "learning_rate": 0.00014366983212565822, + "loss": 0.27, + "step": 17454 + }, + { + "epoch": 1.414047310434219, + "grad_norm": 0.043755799531936646, + "learning_rate": 0.0001436653314730636, + "loss": 0.2919, + "step": 17455 + }, + { + "epoch": 1.4141283214517175, + "grad_norm": 0.048999980092048645, + "learning_rate": 0.00014366083082046897, + "loss": 0.2995, + "step": 17456 + }, + { + "epoch": 1.4142093324692158, + "grad_norm": 0.05153946951031685, + "learning_rate": 0.00014365633016787433, + "loss": 0.3496, + "step": 17457 + }, + { + "epoch": 1.4142903434867142, + "grad_norm": 0.047688182443380356, + "learning_rate": 0.00014365182951527972, + "loss": 0.2802, + "step": 17458 + }, + { + "epoch": 1.4143713545042127, + "grad_norm": 0.049873966723680496, + "learning_rate": 0.0001436473288626851, + "loss": 0.2901, + "step": 17459 + }, + { + "epoch": 1.414452365521711, + "grad_norm": 0.038455478847026825, + "learning_rate": 0.00014364282821009047, + "loss": 0.2676, + "step": 17460 + }, + { + "epoch": 1.4145333765392094, + "grad_norm": 0.045070916414260864, + "learning_rate": 0.00014363832755749585, + "loss": 0.3148, + "step": 17461 + }, + { + "epoch": 1.4146143875567077, + "grad_norm": 0.051428597420454025, + "learning_rate": 0.0001436338269049012, + "loss": 0.3067, + "step": 17462 + }, + { + "epoch": 1.4146953985742061, + "grad_norm": 0.044932324439287186, + "learning_rate": 0.00014362932625230657, + "loss": 0.284, + "step": 17463 + }, + { + "epoch": 1.4147764095917044, + "grad_norm": 0.04706451669335365, + "learning_rate": 0.00014362482559971196, + "loss": 0.2753, + "step": 17464 + }, + { + "epoch": 1.4148574206092028, + "grad_norm": 0.045721590518951416, + "learning_rate": 0.00014362032494711735, + "loss": 0.318, + "step": 17465 + }, + { + "epoch": 1.4149384316267013, + "grad_norm": 0.04566137120127678, + "learning_rate": 0.0001436158242945227, + "loss": 0.2943, + "step": 17466 + }, + { + "epoch": 1.4150194426441995, + "grad_norm": 0.05262121185660362, + "learning_rate": 0.0001436113236419281, + "loss": 0.2875, + "step": 17467 + }, + { + "epoch": 1.415100453661698, + "grad_norm": 0.03956965357065201, + "learning_rate": 0.00014360682298933345, + "loss": 0.2832, + "step": 17468 + }, + { + "epoch": 1.4151814646791965, + "grad_norm": 0.05302854999899864, + "learning_rate": 0.00014360232233673881, + "loss": 0.3504, + "step": 17469 + }, + { + "epoch": 1.4152624756966947, + "grad_norm": 0.04107096791267395, + "learning_rate": 0.00014359782168414423, + "loss": 0.2568, + "step": 17470 + }, + { + "epoch": 1.4153434867141932, + "grad_norm": 0.04127948731184006, + "learning_rate": 0.0001435933210315496, + "loss": 0.2456, + "step": 17471 + }, + { + "epoch": 1.4154244977316914, + "grad_norm": 0.05177254602313042, + "learning_rate": 0.00014358882037895495, + "loss": 0.2714, + "step": 17472 + }, + { + "epoch": 1.41550550874919, + "grad_norm": 0.051889799535274506, + "learning_rate": 0.00014358431972636034, + "loss": 0.3258, + "step": 17473 + }, + { + "epoch": 1.4155865197666881, + "grad_norm": 0.05505822226405144, + "learning_rate": 0.0001435798190737657, + "loss": 0.3176, + "step": 17474 + }, + { + "epoch": 
1.4156675307841866, + "grad_norm": 0.045118533074855804, + "learning_rate": 0.00014357531842117106, + "loss": 0.2549, + "step": 17475 + }, + { + "epoch": 1.415748541801685, + "grad_norm": 0.04382086545228958, + "learning_rate": 0.00014357081776857647, + "loss": 0.2793, + "step": 17476 + }, + { + "epoch": 1.4158295528191833, + "grad_norm": 0.04673447832465172, + "learning_rate": 0.00014356631711598183, + "loss": 0.2626, + "step": 17477 + }, + { + "epoch": 1.4159105638366818, + "grad_norm": 0.05098240077495575, + "learning_rate": 0.0001435618164633872, + "loss": 0.3066, + "step": 17478 + }, + { + "epoch": 1.4159915748541803, + "grad_norm": 0.044868770986795425, + "learning_rate": 0.00014355731581079258, + "loss": 0.3084, + "step": 17479 + }, + { + "epoch": 1.4160725858716785, + "grad_norm": 0.05144832283258438, + "learning_rate": 0.00014355281515819794, + "loss": 0.3084, + "step": 17480 + }, + { + "epoch": 1.416153596889177, + "grad_norm": 0.04495812952518463, + "learning_rate": 0.0001435483145056033, + "loss": 0.2875, + "step": 17481 + }, + { + "epoch": 1.4162346079066754, + "grad_norm": 0.0467311292886734, + "learning_rate": 0.0001435438138530087, + "loss": 0.2889, + "step": 17482 + }, + { + "epoch": 1.4163156189241737, + "grad_norm": 0.041240394115448, + "learning_rate": 0.00014353931320041407, + "loss": 0.2478, + "step": 17483 + }, + { + "epoch": 1.4163966299416721, + "grad_norm": 0.05013001710176468, + "learning_rate": 0.00014353481254781943, + "loss": 0.2951, + "step": 17484 + }, + { + "epoch": 1.4164776409591704, + "grad_norm": 0.043379612267017365, + "learning_rate": 0.00014353031189522482, + "loss": 0.3091, + "step": 17485 + }, + { + "epoch": 1.4165586519766689, + "grad_norm": 0.043509263545274734, + "learning_rate": 0.00014352581124263018, + "loss": 0.2935, + "step": 17486 + }, + { + "epoch": 1.416639662994167, + "grad_norm": 0.0534079447388649, + "learning_rate": 0.00014352131059003554, + "loss": 0.3182, + "step": 17487 + }, + { + "epoch": 1.4167206740116656, + "grad_norm": 0.03979664295911789, + "learning_rate": 0.00014351680993744095, + "loss": 0.2867, + "step": 17488 + }, + { + "epoch": 1.416801685029164, + "grad_norm": 0.044355884194374084, + "learning_rate": 0.00014351230928484631, + "loss": 0.2988, + "step": 17489 + }, + { + "epoch": 1.4168826960466623, + "grad_norm": 0.05444491282105446, + "learning_rate": 0.00014350780863225167, + "loss": 0.3639, + "step": 17490 + }, + { + "epoch": 1.4169637070641607, + "grad_norm": 0.04946650564670563, + "learning_rate": 0.00014350330797965706, + "loss": 0.313, + "step": 17491 + }, + { + "epoch": 1.4170447180816592, + "grad_norm": 0.04110720753669739, + "learning_rate": 0.00014349880732706242, + "loss": 0.2727, + "step": 17492 + }, + { + "epoch": 1.4171257290991575, + "grad_norm": 0.04805957153439522, + "learning_rate": 0.0001434943066744678, + "loss": 0.2919, + "step": 17493 + }, + { + "epoch": 1.417206740116656, + "grad_norm": 0.04576924815773964, + "learning_rate": 0.0001434898060218732, + "loss": 0.3147, + "step": 17494 + }, + { + "epoch": 1.4172877511341542, + "grad_norm": 0.04689079895615578, + "learning_rate": 0.00014348530536927856, + "loss": 0.2974, + "step": 17495 + }, + { + "epoch": 1.4173687621516526, + "grad_norm": 0.04951557517051697, + "learning_rate": 0.00014348080471668392, + "loss": 0.298, + "step": 17496 + }, + { + "epoch": 1.4174497731691509, + "grad_norm": 0.051957134157419205, + "learning_rate": 0.0001434763040640893, + "loss": 0.3061, + "step": 17497 + }, + { + "epoch": 1.4175307841866494, + "grad_norm": 
0.04447033628821373, + "learning_rate": 0.00014347180341149466, + "loss": 0.3085, + "step": 17498 + }, + { + "epoch": 1.4176117952041478, + "grad_norm": 0.04421267658472061, + "learning_rate": 0.00014346730275890005, + "loss": 0.2995, + "step": 17499 + }, + { + "epoch": 1.417692806221646, + "grad_norm": 0.051344774663448334, + "learning_rate": 0.00014346280210630544, + "loss": 0.3357, + "step": 17500 + }, + { + "epoch": 1.4177738172391445, + "grad_norm": 0.04639267548918724, + "learning_rate": 0.0001434583014537108, + "loss": 0.2876, + "step": 17501 + }, + { + "epoch": 1.417854828256643, + "grad_norm": 0.052176062017679214, + "learning_rate": 0.00014345380080111616, + "loss": 0.3093, + "step": 17502 + }, + { + "epoch": 1.4179358392741412, + "grad_norm": 0.043765075504779816, + "learning_rate": 0.00014344930014852154, + "loss": 0.2827, + "step": 17503 + }, + { + "epoch": 1.4180168502916397, + "grad_norm": 0.04381469264626503, + "learning_rate": 0.0001434447994959269, + "loss": 0.3041, + "step": 17504 + }, + { + "epoch": 1.4180978613091382, + "grad_norm": 0.04380662366747856, + "learning_rate": 0.0001434402988433323, + "loss": 0.2814, + "step": 17505 + }, + { + "epoch": 1.4181788723266364, + "grad_norm": 0.049800705164670944, + "learning_rate": 0.00014343579819073768, + "loss": 0.2786, + "step": 17506 + }, + { + "epoch": 1.4182598833441347, + "grad_norm": 0.039393678307533264, + "learning_rate": 0.00014343129753814304, + "loss": 0.2672, + "step": 17507 + }, + { + "epoch": 1.4183408943616331, + "grad_norm": 0.04441035911440849, + "learning_rate": 0.0001434267968855484, + "loss": 0.3138, + "step": 17508 + }, + { + "epoch": 1.4184219053791316, + "grad_norm": 0.06868467479944229, + "learning_rate": 0.00014342229623295379, + "loss": 0.3052, + "step": 17509 + }, + { + "epoch": 1.4185029163966298, + "grad_norm": 0.038274139165878296, + "learning_rate": 0.00014341779558035915, + "loss": 0.2415, + "step": 17510 + }, + { + "epoch": 1.4185839274141283, + "grad_norm": 0.04708916321396828, + "learning_rate": 0.00014341329492776453, + "loss": 0.2847, + "step": 17511 + }, + { + "epoch": 1.4186649384316268, + "grad_norm": 0.03978331759572029, + "learning_rate": 0.00014340879427516992, + "loss": 0.2361, + "step": 17512 + }, + { + "epoch": 1.418745949449125, + "grad_norm": 0.04263559356331825, + "learning_rate": 0.00014340429362257528, + "loss": 0.278, + "step": 17513 + }, + { + "epoch": 1.4188269604666235, + "grad_norm": 0.04371342808008194, + "learning_rate": 0.00014339979296998064, + "loss": 0.2796, + "step": 17514 + }, + { + "epoch": 1.418907971484122, + "grad_norm": 0.05186539515852928, + "learning_rate": 0.00014339529231738603, + "loss": 0.2925, + "step": 17515 + }, + { + "epoch": 1.4189889825016202, + "grad_norm": 0.045742131769657135, + "learning_rate": 0.0001433907916647914, + "loss": 0.3061, + "step": 17516 + }, + { + "epoch": 1.4190699935191187, + "grad_norm": 0.05176788941025734, + "learning_rate": 0.00014338629101219677, + "loss": 0.2963, + "step": 17517 + }, + { + "epoch": 1.419151004536617, + "grad_norm": 0.05797381326556206, + "learning_rate": 0.00014338179035960216, + "loss": 0.2565, + "step": 17518 + }, + { + "epoch": 1.4192320155541154, + "grad_norm": 0.03917957469820976, + "learning_rate": 0.00014337728970700752, + "loss": 0.25, + "step": 17519 + }, + { + "epoch": 1.4193130265716136, + "grad_norm": 0.05365482345223427, + "learning_rate": 0.00014337278905441288, + "loss": 0.2898, + "step": 17520 + }, + { + "epoch": 1.419394037589112, + "grad_norm": 0.03978351876139641, + "learning_rate": 
0.00014336828840181827, + "loss": 0.2653, + "step": 17521 + }, + { + "epoch": 1.4194750486066106, + "grad_norm": 0.05424314737319946, + "learning_rate": 0.00014336378774922366, + "loss": 0.3284, + "step": 17522 + }, + { + "epoch": 1.4195560596241088, + "grad_norm": 0.05543599650263786, + "learning_rate": 0.00014335928709662902, + "loss": 0.3098, + "step": 17523 + }, + { + "epoch": 1.4196370706416073, + "grad_norm": 0.04948757216334343, + "learning_rate": 0.0001433547864440344, + "loss": 0.3232, + "step": 17524 + }, + { + "epoch": 1.4197180816591057, + "grad_norm": 0.05123332887887955, + "learning_rate": 0.00014335028579143976, + "loss": 0.3639, + "step": 17525 + }, + { + "epoch": 1.419799092676604, + "grad_norm": 0.05080258101224899, + "learning_rate": 0.00014334578513884512, + "loss": 0.2935, + "step": 17526 + }, + { + "epoch": 1.4198801036941024, + "grad_norm": 0.0436762310564518, + "learning_rate": 0.0001433412844862505, + "loss": 0.2761, + "step": 17527 + }, + { + "epoch": 1.419961114711601, + "grad_norm": 0.04575483128428459, + "learning_rate": 0.0001433367838336559, + "loss": 0.2589, + "step": 17528 + }, + { + "epoch": 1.4200421257290992, + "grad_norm": 0.04396912455558777, + "learning_rate": 0.00014333228318106126, + "loss": 0.298, + "step": 17529 + }, + { + "epoch": 1.4201231367465974, + "grad_norm": 0.05272604525089264, + "learning_rate": 0.00014332778252846665, + "loss": 0.3248, + "step": 17530 + }, + { + "epoch": 1.4202041477640959, + "grad_norm": 0.048338234424591064, + "learning_rate": 0.000143323281875872, + "loss": 0.3337, + "step": 17531 + }, + { + "epoch": 1.4202851587815943, + "grad_norm": 0.05578792467713356, + "learning_rate": 0.00014331878122327737, + "loss": 0.3559, + "step": 17532 + }, + { + "epoch": 1.4203661697990926, + "grad_norm": 0.04565225541591644, + "learning_rate": 0.00014331428057068275, + "loss": 0.3109, + "step": 17533 + }, + { + "epoch": 1.420447180816591, + "grad_norm": 0.04514371603727341, + "learning_rate": 0.00014330977991808814, + "loss": 0.3046, + "step": 17534 + }, + { + "epoch": 1.4205281918340895, + "grad_norm": 0.04526843503117561, + "learning_rate": 0.0001433052792654935, + "loss": 0.2795, + "step": 17535 + }, + { + "epoch": 1.4206092028515878, + "grad_norm": 0.04618031159043312, + "learning_rate": 0.0001433007786128989, + "loss": 0.2451, + "step": 17536 + }, + { + "epoch": 1.4206902138690862, + "grad_norm": 0.05096264183521271, + "learning_rate": 0.00014329627796030425, + "loss": 0.302, + "step": 17537 + }, + { + "epoch": 1.4207712248865847, + "grad_norm": 0.048887792974710464, + "learning_rate": 0.0001432917773077096, + "loss": 0.3095, + "step": 17538 + }, + { + "epoch": 1.420852235904083, + "grad_norm": 0.04953059181571007, + "learning_rate": 0.000143287276655115, + "loss": 0.2865, + "step": 17539 + }, + { + "epoch": 1.4209332469215814, + "grad_norm": 0.050928402692079544, + "learning_rate": 0.00014328277600252038, + "loss": 0.2743, + "step": 17540 + }, + { + "epoch": 1.4210142579390797, + "grad_norm": 0.05457817390561104, + "learning_rate": 0.00014327827534992574, + "loss": 0.2768, + "step": 17541 + }, + { + "epoch": 1.4210952689565781, + "grad_norm": 0.05337541922926903, + "learning_rate": 0.00014327377469733113, + "loss": 0.312, + "step": 17542 + }, + { + "epoch": 1.4211762799740764, + "grad_norm": 0.049907129257917404, + "learning_rate": 0.0001432692740447365, + "loss": 0.294, + "step": 17543 + }, + { + "epoch": 1.4212572909915748, + "grad_norm": 0.04843693599104881, + "learning_rate": 0.00014326477339214185, + "loss": 0.2941, + "step": 
17544 + }, + { + "epoch": 1.4213383020090733, + "grad_norm": 0.05297977849841118, + "learning_rate": 0.00014326027273954726, + "loss": 0.31, + "step": 17545 + }, + { + "epoch": 1.4214193130265715, + "grad_norm": 0.051067106425762177, + "learning_rate": 0.00014325577208695262, + "loss": 0.3499, + "step": 17546 + }, + { + "epoch": 1.42150032404407, + "grad_norm": 0.05172743275761604, + "learning_rate": 0.00014325127143435798, + "loss": 0.319, + "step": 17547 + }, + { + "epoch": 1.4215813350615685, + "grad_norm": 0.04919605702161789, + "learning_rate": 0.00014324677078176337, + "loss": 0.2782, + "step": 17548 + }, + { + "epoch": 1.4216623460790667, + "grad_norm": 0.05195494741201401, + "learning_rate": 0.00014324227012916873, + "loss": 0.3252, + "step": 17549 + }, + { + "epoch": 1.4217433570965652, + "grad_norm": 0.045493561774492264, + "learning_rate": 0.0001432377694765741, + "loss": 0.2898, + "step": 17550 + }, + { + "epoch": 1.4218243681140634, + "grad_norm": 0.04597517102956772, + "learning_rate": 0.0001432332688239795, + "loss": 0.2682, + "step": 17551 + }, + { + "epoch": 1.421905379131562, + "grad_norm": 0.04705383628606796, + "learning_rate": 0.00014322876817138486, + "loss": 0.3145, + "step": 17552 + }, + { + "epoch": 1.4219863901490601, + "grad_norm": 0.0446794331073761, + "learning_rate": 0.00014322426751879022, + "loss": 0.2827, + "step": 17553 + }, + { + "epoch": 1.4220674011665586, + "grad_norm": 0.051160577684640884, + "learning_rate": 0.0001432197668661956, + "loss": 0.2759, + "step": 17554 + }, + { + "epoch": 1.422148412184057, + "grad_norm": 0.04348822310566902, + "learning_rate": 0.00014321526621360097, + "loss": 0.2852, + "step": 17555 + }, + { + "epoch": 1.4222294232015553, + "grad_norm": 0.053231049329042435, + "learning_rate": 0.00014321076556100633, + "loss": 0.2997, + "step": 17556 + }, + { + "epoch": 1.4223104342190538, + "grad_norm": 0.052900660783052444, + "learning_rate": 0.00014320626490841175, + "loss": 0.3744, + "step": 17557 + }, + { + "epoch": 1.4223914452365523, + "grad_norm": 0.04624491184949875, + "learning_rate": 0.0001432017642558171, + "loss": 0.3021, + "step": 17558 + }, + { + "epoch": 1.4224724562540505, + "grad_norm": 0.04583517462015152, + "learning_rate": 0.00014319726360322247, + "loss": 0.2795, + "step": 17559 + }, + { + "epoch": 1.422553467271549, + "grad_norm": 0.049809325486421585, + "learning_rate": 0.00014319276295062785, + "loss": 0.2634, + "step": 17560 + }, + { + "epoch": 1.4226344782890474, + "grad_norm": 0.04780822992324829, + "learning_rate": 0.0001431882622980332, + "loss": 0.2788, + "step": 17561 + }, + { + "epoch": 1.4227154893065457, + "grad_norm": 0.049446288496255875, + "learning_rate": 0.00014318376164543857, + "loss": 0.2799, + "step": 17562 + }, + { + "epoch": 1.4227965003240441, + "grad_norm": 0.0466834157705307, + "learning_rate": 0.000143179260992844, + "loss": 0.3056, + "step": 17563 + }, + { + "epoch": 1.4228775113415424, + "grad_norm": 0.05659082531929016, + "learning_rate": 0.00014317476034024935, + "loss": 0.3108, + "step": 17564 + }, + { + "epoch": 1.4229585223590409, + "grad_norm": 0.045440223067998886, + "learning_rate": 0.0001431702596876547, + "loss": 0.3025, + "step": 17565 + }, + { + "epoch": 1.423039533376539, + "grad_norm": 0.04813716188073158, + "learning_rate": 0.0001431657590350601, + "loss": 0.2921, + "step": 17566 + }, + { + "epoch": 1.4231205443940376, + "grad_norm": 0.05100923404097557, + "learning_rate": 0.00014316125838246545, + "loss": 0.2936, + "step": 17567 + }, + { + "epoch": 1.423201555411536, + 
"grad_norm": 0.05094525218009949, + "learning_rate": 0.00014315675772987081, + "loss": 0.2931, + "step": 17568 + }, + { + "epoch": 1.4232825664290343, + "grad_norm": 0.042763397097587585, + "learning_rate": 0.00014315225707727623, + "loss": 0.26, + "step": 17569 + }, + { + "epoch": 1.4233635774465327, + "grad_norm": 0.0470680370926857, + "learning_rate": 0.0001431477564246816, + "loss": 0.287, + "step": 17570 + }, + { + "epoch": 1.4234445884640312, + "grad_norm": 0.04994115233421326, + "learning_rate": 0.00014314325577208695, + "loss": 0.3198, + "step": 17571 + }, + { + "epoch": 1.4235255994815295, + "grad_norm": 0.04325047507882118, + "learning_rate": 0.00014313875511949234, + "loss": 0.3214, + "step": 17572 + }, + { + "epoch": 1.423606610499028, + "grad_norm": 0.049197446554899216, + "learning_rate": 0.0001431342544668977, + "loss": 0.3107, + "step": 17573 + }, + { + "epoch": 1.4236876215165262, + "grad_norm": 0.043233487755060196, + "learning_rate": 0.00014312975381430308, + "loss": 0.2531, + "step": 17574 + }, + { + "epoch": 1.4237686325340246, + "grad_norm": 0.045737169682979584, + "learning_rate": 0.00014312525316170847, + "loss": 0.2693, + "step": 17575 + }, + { + "epoch": 1.4238496435515229, + "grad_norm": 0.05091886594891548, + "learning_rate": 0.00014312075250911383, + "loss": 0.2524, + "step": 17576 + }, + { + "epoch": 1.4239306545690213, + "grad_norm": 0.04994532838463783, + "learning_rate": 0.0001431162518565192, + "loss": 0.3172, + "step": 17577 + }, + { + "epoch": 1.4240116655865198, + "grad_norm": 0.044858817011117935, + "learning_rate": 0.00014311175120392458, + "loss": 0.2679, + "step": 17578 + }, + { + "epoch": 1.424092676604018, + "grad_norm": 0.05132998526096344, + "learning_rate": 0.00014310725055132994, + "loss": 0.2839, + "step": 17579 + }, + { + "epoch": 1.4241736876215165, + "grad_norm": 0.04050062224268913, + "learning_rate": 0.00014310274989873533, + "loss": 0.2747, + "step": 17580 + }, + { + "epoch": 1.424254698639015, + "grad_norm": 0.048599354922771454, + "learning_rate": 0.0001430982492461407, + "loss": 0.3611, + "step": 17581 + }, + { + "epoch": 1.4243357096565132, + "grad_norm": 0.05495349317789078, + "learning_rate": 0.00014309374859354607, + "loss": 0.3496, + "step": 17582 + }, + { + "epoch": 1.4244167206740117, + "grad_norm": 0.046302735805511475, + "learning_rate": 0.00014308924794095143, + "loss": 0.3011, + "step": 17583 + }, + { + "epoch": 1.4244977316915102, + "grad_norm": 0.04650885984301567, + "learning_rate": 0.00014308474728835682, + "loss": 0.2765, + "step": 17584 + }, + { + "epoch": 1.4245787427090084, + "grad_norm": 0.043730463832616806, + "learning_rate": 0.00014308024663576218, + "loss": 0.2923, + "step": 17585 + }, + { + "epoch": 1.4246597537265069, + "grad_norm": 0.040796149522066116, + "learning_rate": 0.00014307574598316757, + "loss": 0.264, + "step": 17586 + }, + { + "epoch": 1.4247407647440051, + "grad_norm": 0.05266737937927246, + "learning_rate": 0.00014307124533057295, + "loss": 0.3034, + "step": 17587 + }, + { + "epoch": 1.4248217757615036, + "grad_norm": 0.051856864243745804, + "learning_rate": 0.00014306674467797831, + "loss": 0.3652, + "step": 17588 + }, + { + "epoch": 1.4249027867790018, + "grad_norm": 0.053262967616319656, + "learning_rate": 0.00014306224402538367, + "loss": 0.3236, + "step": 17589 + }, + { + "epoch": 1.4249837977965003, + "grad_norm": 0.05611581355333328, + "learning_rate": 0.00014305774337278906, + "loss": 0.3119, + "step": 17590 + }, + { + "epoch": 1.4250648088139988, + "grad_norm": 0.0478108711540699, + 
"learning_rate": 0.00014305324272019442, + "loss": 0.2652, + "step": 17591 + }, + { + "epoch": 1.425145819831497, + "grad_norm": 0.04079686850309372, + "learning_rate": 0.0001430487420675998, + "loss": 0.2671, + "step": 17592 + }, + { + "epoch": 1.4252268308489955, + "grad_norm": 0.05292508751153946, + "learning_rate": 0.0001430442414150052, + "loss": 0.3286, + "step": 17593 + }, + { + "epoch": 1.425307841866494, + "grad_norm": 0.040990427136421204, + "learning_rate": 0.00014303974076241056, + "loss": 0.2554, + "step": 17594 + }, + { + "epoch": 1.4253888528839922, + "grad_norm": 0.044480640441179276, + "learning_rate": 0.00014303524010981592, + "loss": 0.2897, + "step": 17595 + }, + { + "epoch": 1.4254698639014907, + "grad_norm": 0.0434565506875515, + "learning_rate": 0.0001430307394572213, + "loss": 0.2837, + "step": 17596 + }, + { + "epoch": 1.425550874918989, + "grad_norm": 0.053456861525774, + "learning_rate": 0.0001430262388046267, + "loss": 0.3343, + "step": 17597 + }, + { + "epoch": 1.4256318859364874, + "grad_norm": 0.05187445133924484, + "learning_rate": 0.00014302173815203205, + "loss": 0.3253, + "step": 17598 + }, + { + "epoch": 1.4257128969539856, + "grad_norm": 0.046234335750341415, + "learning_rate": 0.00014301723749943744, + "loss": 0.2808, + "step": 17599 + }, + { + "epoch": 1.425793907971484, + "grad_norm": 0.048222288489341736, + "learning_rate": 0.0001430127368468428, + "loss": 0.2329, + "step": 17600 + }, + { + "epoch": 1.4258749189889826, + "grad_norm": 0.05416186898946762, + "learning_rate": 0.00014300823619424816, + "loss": 0.3028, + "step": 17601 + }, + { + "epoch": 1.4259559300064808, + "grad_norm": 0.04630477353930473, + "learning_rate": 0.00014300373554165354, + "loss": 0.3023, + "step": 17602 + }, + { + "epoch": 1.4260369410239793, + "grad_norm": 0.05828891322016716, + "learning_rate": 0.00014299923488905893, + "loss": 0.3259, + "step": 17603 + }, + { + "epoch": 1.4261179520414777, + "grad_norm": 0.04590104520320892, + "learning_rate": 0.0001429947342364643, + "loss": 0.2762, + "step": 17604 + }, + { + "epoch": 1.426198963058976, + "grad_norm": 0.050497934222221375, + "learning_rate": 0.00014299023358386968, + "loss": 0.2782, + "step": 17605 + }, + { + "epoch": 1.4262799740764744, + "grad_norm": 0.04419134929776192, + "learning_rate": 0.00014298573293127504, + "loss": 0.2795, + "step": 17606 + }, + { + "epoch": 1.426360985093973, + "grad_norm": 0.047383926808834076, + "learning_rate": 0.0001429812322786804, + "loss": 0.2885, + "step": 17607 + }, + { + "epoch": 1.4264419961114712, + "grad_norm": 0.04415738582611084, + "learning_rate": 0.00014297673162608579, + "loss": 0.2795, + "step": 17608 + }, + { + "epoch": 1.4265230071289696, + "grad_norm": 0.048009831458330154, + "learning_rate": 0.00014297223097349117, + "loss": 0.312, + "step": 17609 + }, + { + "epoch": 1.4266040181464679, + "grad_norm": 0.04848635196685791, + "learning_rate": 0.00014296773032089653, + "loss": 0.3205, + "step": 17610 + }, + { + "epoch": 1.4266850291639663, + "grad_norm": 0.04432026669383049, + "learning_rate": 0.00014296322966830192, + "loss": 0.277, + "step": 17611 + }, + { + "epoch": 1.4267660401814646, + "grad_norm": 0.04767496883869171, + "learning_rate": 0.00014295872901570728, + "loss": 0.3138, + "step": 17612 + }, + { + "epoch": 1.426847051198963, + "grad_norm": 0.04357754439115524, + "learning_rate": 0.00014295422836311264, + "loss": 0.2943, + "step": 17613 + }, + { + "epoch": 1.4269280622164615, + "grad_norm": 0.05366687476634979, + "learning_rate": 0.00014294972771051803, + 
"loss": 0.2843, + "step": 17614 + }, + { + "epoch": 1.4270090732339598, + "grad_norm": 0.03847344592213631, + "learning_rate": 0.00014294522705792341, + "loss": 0.2662, + "step": 17615 + }, + { + "epoch": 1.4270900842514582, + "grad_norm": 0.0483989343047142, + "learning_rate": 0.00014294072640532878, + "loss": 0.2377, + "step": 17616 + }, + { + "epoch": 1.4271710952689567, + "grad_norm": 0.0436316654086113, + "learning_rate": 0.00014293622575273416, + "loss": 0.2917, + "step": 17617 + }, + { + "epoch": 1.427252106286455, + "grad_norm": 0.052246369421482086, + "learning_rate": 0.00014293172510013952, + "loss": 0.3499, + "step": 17618 + }, + { + "epoch": 1.4273331173039534, + "grad_norm": 0.058432210236787796, + "learning_rate": 0.00014292722444754488, + "loss": 0.2948, + "step": 17619 + }, + { + "epoch": 1.4274141283214516, + "grad_norm": 0.052339889109134674, + "learning_rate": 0.00014292272379495027, + "loss": 0.3119, + "step": 17620 + }, + { + "epoch": 1.4274951393389501, + "grad_norm": 0.045229263603687286, + "learning_rate": 0.00014291822314235566, + "loss": 0.3068, + "step": 17621 + }, + { + "epoch": 1.4275761503564484, + "grad_norm": 0.04981428384780884, + "learning_rate": 0.00014291372248976102, + "loss": 0.3281, + "step": 17622 + }, + { + "epoch": 1.4276571613739468, + "grad_norm": 0.047702182084321976, + "learning_rate": 0.0001429092218371664, + "loss": 0.3222, + "step": 17623 + }, + { + "epoch": 1.4277381723914453, + "grad_norm": 0.043431397527456284, + "learning_rate": 0.00014290472118457176, + "loss": 0.2738, + "step": 17624 + }, + { + "epoch": 1.4278191834089435, + "grad_norm": 0.037868376821279526, + "learning_rate": 0.00014290022053197712, + "loss": 0.2335, + "step": 17625 + }, + { + "epoch": 1.427900194426442, + "grad_norm": 0.04401480779051781, + "learning_rate": 0.00014289571987938254, + "loss": 0.3073, + "step": 17626 + }, + { + "epoch": 1.4279812054439405, + "grad_norm": 0.05242225527763367, + "learning_rate": 0.0001428912192267879, + "loss": 0.352, + "step": 17627 + }, + { + "epoch": 1.4280622164614387, + "grad_norm": 0.049203623086214066, + "learning_rate": 0.00014288671857419326, + "loss": 0.2981, + "step": 17628 + }, + { + "epoch": 1.4281432274789372, + "grad_norm": 0.04736964777112007, + "learning_rate": 0.00014288221792159865, + "loss": 0.2753, + "step": 17629 + }, + { + "epoch": 1.4282242384964356, + "grad_norm": 0.053792353719472885, + "learning_rate": 0.000142877717269004, + "loss": 0.2722, + "step": 17630 + }, + { + "epoch": 1.428305249513934, + "grad_norm": 0.04648834094405174, + "learning_rate": 0.00014287321661640937, + "loss": 0.2952, + "step": 17631 + }, + { + "epoch": 1.4283862605314321, + "grad_norm": 0.04978673905134201, + "learning_rate": 0.00014286871596381478, + "loss": 0.3041, + "step": 17632 + }, + { + "epoch": 1.4284672715489306, + "grad_norm": 0.053194884210824966, + "learning_rate": 0.00014286421531122014, + "loss": 0.2755, + "step": 17633 + }, + { + "epoch": 1.428548282566429, + "grad_norm": 0.048594143241643906, + "learning_rate": 0.0001428597146586255, + "loss": 0.3009, + "step": 17634 + }, + { + "epoch": 1.4286292935839273, + "grad_norm": 0.039179448038339615, + "learning_rate": 0.0001428552140060309, + "loss": 0.2707, + "step": 17635 + }, + { + "epoch": 1.4287103046014258, + "grad_norm": 0.052394479513168335, + "learning_rate": 0.00014285071335343625, + "loss": 0.3635, + "step": 17636 + }, + { + "epoch": 1.4287913156189243, + "grad_norm": 0.04384802281856537, + "learning_rate": 0.0001428462127008416, + "loss": 0.2835, + "step": 17637 + }, 
+ { + "epoch": 1.4288723266364225, + "grad_norm": 0.05427468195557594, + "learning_rate": 0.00014284171204824702, + "loss": 0.3465, + "step": 17638 + }, + { + "epoch": 1.428953337653921, + "grad_norm": 0.047030430287122726, + "learning_rate": 0.00014283721139565238, + "loss": 0.3357, + "step": 17639 + }, + { + "epoch": 1.4290343486714194, + "grad_norm": 0.04885796457529068, + "learning_rate": 0.00014283271074305774, + "loss": 0.2944, + "step": 17640 + }, + { + "epoch": 1.4291153596889177, + "grad_norm": 0.05539722368121147, + "learning_rate": 0.00014282821009046313, + "loss": 0.2876, + "step": 17641 + }, + { + "epoch": 1.4291963707064161, + "grad_norm": 0.03706725314259529, + "learning_rate": 0.0001428237094378685, + "loss": 0.269, + "step": 17642 + }, + { + "epoch": 1.4292773817239144, + "grad_norm": 0.047710664570331573, + "learning_rate": 0.00014281920878527385, + "loss": 0.3027, + "step": 17643 + }, + { + "epoch": 1.4293583927414129, + "grad_norm": 0.04649528115987778, + "learning_rate": 0.00014281470813267926, + "loss": 0.2662, + "step": 17644 + }, + { + "epoch": 1.429439403758911, + "grad_norm": 0.052990663796663284, + "learning_rate": 0.00014281020748008462, + "loss": 0.3236, + "step": 17645 + }, + { + "epoch": 1.4295204147764096, + "grad_norm": 0.04301619529724121, + "learning_rate": 0.00014280570682748998, + "loss": 0.2903, + "step": 17646 + }, + { + "epoch": 1.429601425793908, + "grad_norm": 0.04583217203617096, + "learning_rate": 0.00014280120617489537, + "loss": 0.3078, + "step": 17647 + }, + { + "epoch": 1.4296824368114063, + "grad_norm": 0.04579533636569977, + "learning_rate": 0.00014279670552230073, + "loss": 0.269, + "step": 17648 + }, + { + "epoch": 1.4297634478289047, + "grad_norm": 0.048855386674404144, + "learning_rate": 0.0001427922048697061, + "loss": 0.2819, + "step": 17649 + }, + { + "epoch": 1.4298444588464032, + "grad_norm": 0.045893169939517975, + "learning_rate": 0.0001427877042171115, + "loss": 0.3297, + "step": 17650 + }, + { + "epoch": 1.4299254698639015, + "grad_norm": 0.04148982837796211, + "learning_rate": 0.00014278320356451686, + "loss": 0.2879, + "step": 17651 + }, + { + "epoch": 1.4300064808814, + "grad_norm": 0.04670432209968567, + "learning_rate": 0.00014277870291192222, + "loss": 0.2744, + "step": 17652 + }, + { + "epoch": 1.4300874918988984, + "grad_norm": 0.05058503895998001, + "learning_rate": 0.0001427742022593276, + "loss": 0.2655, + "step": 17653 + }, + { + "epoch": 1.4301685029163966, + "grad_norm": 0.04702136293053627, + "learning_rate": 0.00014276970160673297, + "loss": 0.3089, + "step": 17654 + }, + { + "epoch": 1.4302495139338949, + "grad_norm": 0.04836324602365494, + "learning_rate": 0.00014276520095413836, + "loss": 0.2993, + "step": 17655 + }, + { + "epoch": 1.4303305249513933, + "grad_norm": 0.0631798654794693, + "learning_rate": 0.00014276070030154375, + "loss": 0.2981, + "step": 17656 + }, + { + "epoch": 1.4304115359688918, + "grad_norm": 0.06408338248729706, + "learning_rate": 0.0001427561996489491, + "loss": 0.302, + "step": 17657 + }, + { + "epoch": 1.43049254698639, + "grad_norm": 0.04548690468072891, + "learning_rate": 0.00014275169899635447, + "loss": 0.2829, + "step": 17658 + }, + { + "epoch": 1.4305735580038885, + "grad_norm": 0.053524602204561234, + "learning_rate": 0.00014274719834375985, + "loss": 0.3103, + "step": 17659 + }, + { + "epoch": 1.430654569021387, + "grad_norm": 0.05356777086853981, + "learning_rate": 0.00014274269769116521, + "loss": 0.2945, + "step": 17660 + }, + { + "epoch": 1.4307355800388852, + 
"grad_norm": 0.046562310308218, + "learning_rate": 0.0001427381970385706, + "loss": 0.3177, + "step": 17661 + }, + { + "epoch": 1.4308165910563837, + "grad_norm": 0.043342217803001404, + "learning_rate": 0.000142733696385976, + "loss": 0.2433, + "step": 17662 + }, + { + "epoch": 1.4308976020738822, + "grad_norm": 0.04943990707397461, + "learning_rate": 0.00014272919573338135, + "loss": 0.2833, + "step": 17663 + }, + { + "epoch": 1.4309786130913804, + "grad_norm": 0.0495469756424427, + "learning_rate": 0.0001427246950807867, + "loss": 0.3207, + "step": 17664 + }, + { + "epoch": 1.4310596241088789, + "grad_norm": 0.041657302528619766, + "learning_rate": 0.0001427201944281921, + "loss": 0.2768, + "step": 17665 + }, + { + "epoch": 1.4311406351263771, + "grad_norm": 0.054395124316215515, + "learning_rate": 0.00014271569377559746, + "loss": 0.32, + "step": 17666 + }, + { + "epoch": 1.4312216461438756, + "grad_norm": 0.04485293850302696, + "learning_rate": 0.00014271119312300284, + "loss": 0.3026, + "step": 17667 + }, + { + "epoch": 1.4313026571613738, + "grad_norm": 0.03999423235654831, + "learning_rate": 0.00014270669247040823, + "loss": 0.2654, + "step": 17668 + }, + { + "epoch": 1.4313836681788723, + "grad_norm": 0.05643171817064285, + "learning_rate": 0.0001427021918178136, + "loss": 0.3557, + "step": 17669 + }, + { + "epoch": 1.4314646791963708, + "grad_norm": 0.05252460390329361, + "learning_rate": 0.00014269769116521895, + "loss": 0.3132, + "step": 17670 + }, + { + "epoch": 1.431545690213869, + "grad_norm": 0.05533226206898689, + "learning_rate": 0.00014269319051262434, + "loss": 0.3212, + "step": 17671 + }, + { + "epoch": 1.4316267012313675, + "grad_norm": 0.04171612858772278, + "learning_rate": 0.0001426886898600297, + "loss": 0.2695, + "step": 17672 + }, + { + "epoch": 1.431707712248866, + "grad_norm": 0.04758473113179207, + "learning_rate": 0.00014268418920743508, + "loss": 0.2862, + "step": 17673 + }, + { + "epoch": 1.4317887232663642, + "grad_norm": 0.03944450989365578, + "learning_rate": 0.00014267968855484047, + "loss": 0.2567, + "step": 17674 + }, + { + "epoch": 1.4318697342838627, + "grad_norm": 0.052580054849386215, + "learning_rate": 0.00014267518790224583, + "loss": 0.2806, + "step": 17675 + }, + { + "epoch": 1.431950745301361, + "grad_norm": 0.04589229077100754, + "learning_rate": 0.0001426706872496512, + "loss": 0.2795, + "step": 17676 + }, + { + "epoch": 1.4320317563188594, + "grad_norm": 0.05369507148861885, + "learning_rate": 0.00014266618659705658, + "loss": 0.2978, + "step": 17677 + }, + { + "epoch": 1.4321127673363576, + "grad_norm": 0.06819190084934235, + "learning_rate": 0.00014266168594446197, + "loss": 0.3306, + "step": 17678 + }, + { + "epoch": 1.432193778353856, + "grad_norm": 0.04986560717225075, + "learning_rate": 0.00014265718529186733, + "loss": 0.3172, + "step": 17679 + }, + { + "epoch": 1.4322747893713546, + "grad_norm": 0.06651494652032852, + "learning_rate": 0.0001426526846392727, + "loss": 0.3052, + "step": 17680 + }, + { + "epoch": 1.4323558003888528, + "grad_norm": 0.04793441295623779, + "learning_rate": 0.00014264818398667807, + "loss": 0.2924, + "step": 17681 + }, + { + "epoch": 1.4324368114063513, + "grad_norm": 0.04924444481730461, + "learning_rate": 0.00014264368333408343, + "loss": 0.2638, + "step": 17682 + }, + { + "epoch": 1.4325178224238497, + "grad_norm": 0.047013383358716965, + "learning_rate": 0.00014263918268148882, + "loss": 0.3146, + "step": 17683 + }, + { + "epoch": 1.432598833441348, + "grad_norm": 0.04840421304106712, + 
"learning_rate": 0.0001426346820288942, + "loss": 0.2787, + "step": 17684 + }, + { + "epoch": 1.4326798444588464, + "grad_norm": 0.0506095290184021, + "learning_rate": 0.00014263018137629957, + "loss": 0.3159, + "step": 17685 + }, + { + "epoch": 1.432760855476345, + "grad_norm": 0.04708279296755791, + "learning_rate": 0.00014262568072370495, + "loss": 0.2614, + "step": 17686 + }, + { + "epoch": 1.4328418664938432, + "grad_norm": 0.05179835110902786, + "learning_rate": 0.00014262118007111031, + "loss": 0.2564, + "step": 17687 + }, + { + "epoch": 1.4329228775113416, + "grad_norm": 0.04784620180726051, + "learning_rate": 0.00014261667941851567, + "loss": 0.3049, + "step": 17688 + }, + { + "epoch": 1.4330038885288399, + "grad_norm": 0.04964848980307579, + "learning_rate": 0.00014261217876592106, + "loss": 0.2776, + "step": 17689 + }, + { + "epoch": 1.4330848995463383, + "grad_norm": 0.049182306975126266, + "learning_rate": 0.00014260767811332645, + "loss": 0.3259, + "step": 17690 + }, + { + "epoch": 1.4331659105638366, + "grad_norm": 0.05033789947628975, + "learning_rate": 0.0001426031774607318, + "loss": 0.3005, + "step": 17691 + }, + { + "epoch": 1.433246921581335, + "grad_norm": 0.04850676655769348, + "learning_rate": 0.0001425986768081372, + "loss": 0.2947, + "step": 17692 + }, + { + "epoch": 1.4333279325988335, + "grad_norm": 0.041882604360580444, + "learning_rate": 0.00014259417615554256, + "loss": 0.2313, + "step": 17693 + }, + { + "epoch": 1.4334089436163318, + "grad_norm": 0.053109001368284225, + "learning_rate": 0.00014258967550294792, + "loss": 0.2912, + "step": 17694 + }, + { + "epoch": 1.4334899546338302, + "grad_norm": 0.0499955415725708, + "learning_rate": 0.0001425851748503533, + "loss": 0.3335, + "step": 17695 + }, + { + "epoch": 1.4335709656513287, + "grad_norm": 0.044821664690971375, + "learning_rate": 0.0001425806741977587, + "loss": 0.288, + "step": 17696 + }, + { + "epoch": 1.433651976668827, + "grad_norm": 0.04659029468894005, + "learning_rate": 0.00014257617354516405, + "loss": 0.2719, + "step": 17697 + }, + { + "epoch": 1.4337329876863254, + "grad_norm": 0.04900295287370682, + "learning_rate": 0.00014257167289256944, + "loss": 0.2859, + "step": 17698 + }, + { + "epoch": 1.4338139987038236, + "grad_norm": 0.04085109010338783, + "learning_rate": 0.0001425671722399748, + "loss": 0.2879, + "step": 17699 + }, + { + "epoch": 1.4338950097213221, + "grad_norm": 0.044099897146224976, + "learning_rate": 0.00014256267158738016, + "loss": 0.2745, + "step": 17700 + }, + { + "epoch": 1.4339760207388204, + "grad_norm": 0.04478868842124939, + "learning_rate": 0.00014255817093478554, + "loss": 0.2946, + "step": 17701 + }, + { + "epoch": 1.4340570317563188, + "grad_norm": 0.0422249473631382, + "learning_rate": 0.00014255367028219093, + "loss": 0.2904, + "step": 17702 + }, + { + "epoch": 1.4341380427738173, + "grad_norm": 0.048670295625925064, + "learning_rate": 0.0001425491696295963, + "loss": 0.3119, + "step": 17703 + }, + { + "epoch": 1.4342190537913155, + "grad_norm": 0.046880774199962616, + "learning_rate": 0.00014254466897700168, + "loss": 0.2731, + "step": 17704 + }, + { + "epoch": 1.434300064808814, + "grad_norm": 0.05380629748106003, + "learning_rate": 0.00014254016832440704, + "loss": 0.3093, + "step": 17705 + }, + { + "epoch": 1.4343810758263125, + "grad_norm": 0.05190601944923401, + "learning_rate": 0.0001425356676718124, + "loss": 0.2951, + "step": 17706 + }, + { + "epoch": 1.4344620868438107, + "grad_norm": 0.0382777564227581, + "learning_rate": 0.00014253116701921781, + 
"loss": 0.2484, + "step": 17707 + }, + { + "epoch": 1.4345430978613092, + "grad_norm": 0.04491351172327995, + "learning_rate": 0.00014252666636662317, + "loss": 0.2674, + "step": 17708 + }, + { + "epoch": 1.4346241088788076, + "grad_norm": 0.05805452540516853, + "learning_rate": 0.00014252216571402853, + "loss": 0.2843, + "step": 17709 + }, + { + "epoch": 1.434705119896306, + "grad_norm": 0.03996812179684639, + "learning_rate": 0.00014251766506143392, + "loss": 0.2579, + "step": 17710 + }, + { + "epoch": 1.4347861309138044, + "grad_norm": 0.047434549778699875, + "learning_rate": 0.00014251316440883928, + "loss": 0.2858, + "step": 17711 + }, + { + "epoch": 1.4348671419313026, + "grad_norm": 0.0471944697201252, + "learning_rate": 0.00014250866375624464, + "loss": 0.2849, + "step": 17712 + }, + { + "epoch": 1.434948152948801, + "grad_norm": 0.06150485947728157, + "learning_rate": 0.00014250416310365006, + "loss": 0.3582, + "step": 17713 + }, + { + "epoch": 1.4350291639662993, + "grad_norm": 0.050793759524822235, + "learning_rate": 0.00014249966245105542, + "loss": 0.2473, + "step": 17714 + }, + { + "epoch": 1.4351101749837978, + "grad_norm": 0.04835623875260353, + "learning_rate": 0.00014249516179846078, + "loss": 0.3284, + "step": 17715 + }, + { + "epoch": 1.4351911860012962, + "grad_norm": 0.047694023698568344, + "learning_rate": 0.00014249066114586616, + "loss": 0.2609, + "step": 17716 + }, + { + "epoch": 1.4352721970187945, + "grad_norm": 0.0514262430369854, + "learning_rate": 0.00014248616049327152, + "loss": 0.3275, + "step": 17717 + }, + { + "epoch": 1.435353208036293, + "grad_norm": 0.046545013785362244, + "learning_rate": 0.00014248165984067688, + "loss": 0.2731, + "step": 17718 + }, + { + "epoch": 1.4354342190537914, + "grad_norm": 0.043400492519140244, + "learning_rate": 0.0001424771591880823, + "loss": 0.2886, + "step": 17719 + }, + { + "epoch": 1.4355152300712897, + "grad_norm": 0.05241835489869118, + "learning_rate": 0.00014247265853548766, + "loss": 0.3394, + "step": 17720 + }, + { + "epoch": 1.4355962410887881, + "grad_norm": 0.046961188316345215, + "learning_rate": 0.00014246815788289302, + "loss": 0.2782, + "step": 17721 + }, + { + "epoch": 1.4356772521062864, + "grad_norm": 0.05200467258691788, + "learning_rate": 0.0001424636572302984, + "loss": 0.3346, + "step": 17722 + }, + { + "epoch": 1.4357582631237849, + "grad_norm": 0.04980026185512543, + "learning_rate": 0.00014245915657770376, + "loss": 0.3204, + "step": 17723 + }, + { + "epoch": 1.435839274141283, + "grad_norm": 0.04377777874469757, + "learning_rate": 0.00014245465592510912, + "loss": 0.2862, + "step": 17724 + }, + { + "epoch": 1.4359202851587816, + "grad_norm": 0.04981323331594467, + "learning_rate": 0.00014245015527251454, + "loss": 0.3012, + "step": 17725 + }, + { + "epoch": 1.43600129617628, + "grad_norm": 0.049664206802845, + "learning_rate": 0.0001424456546199199, + "loss": 0.3252, + "step": 17726 + }, + { + "epoch": 1.4360823071937783, + "grad_norm": 0.05872774124145508, + "learning_rate": 0.00014244115396732526, + "loss": 0.3248, + "step": 17727 + }, + { + "epoch": 1.4361633182112767, + "grad_norm": 0.0413481779396534, + "learning_rate": 0.00014243665331473065, + "loss": 0.2742, + "step": 17728 + }, + { + "epoch": 1.4362443292287752, + "grad_norm": 0.05045654624700546, + "learning_rate": 0.000142432152662136, + "loss": 0.299, + "step": 17729 + }, + { + "epoch": 1.4363253402462735, + "grad_norm": 0.050098661333322525, + "learning_rate": 0.0001424276520095414, + "loss": 0.2855, + "step": 17730 + }, + { + 
"epoch": 1.436406351263772, + "grad_norm": 0.043931033462285995, + "learning_rate": 0.00014242315135694678, + "loss": 0.2678, + "step": 17731 + }, + { + "epoch": 1.4364873622812704, + "grad_norm": 0.04665660858154297, + "learning_rate": 0.00014241865070435214, + "loss": 0.2913, + "step": 17732 + }, + { + "epoch": 1.4365683732987686, + "grad_norm": 0.04515928775072098, + "learning_rate": 0.0001424141500517575, + "loss": 0.2995, + "step": 17733 + }, + { + "epoch": 1.4366493843162669, + "grad_norm": 0.05133425071835518, + "learning_rate": 0.0001424096493991629, + "loss": 0.3253, + "step": 17734 + }, + { + "epoch": 1.4367303953337653, + "grad_norm": 0.05313633382320404, + "learning_rate": 0.00014240514874656825, + "loss": 0.3065, + "step": 17735 + }, + { + "epoch": 1.4368114063512638, + "grad_norm": 0.04846735671162605, + "learning_rate": 0.00014240064809397363, + "loss": 0.2985, + "step": 17736 + }, + { + "epoch": 1.436892417368762, + "grad_norm": 0.04319280758500099, + "learning_rate": 0.00014239614744137902, + "loss": 0.2498, + "step": 17737 + }, + { + "epoch": 1.4369734283862605, + "grad_norm": 0.03988509252667427, + "learning_rate": 0.00014239164678878438, + "loss": 0.2677, + "step": 17738 + }, + { + "epoch": 1.437054439403759, + "grad_norm": 0.04044670984148979, + "learning_rate": 0.00014238714613618974, + "loss": 0.2926, + "step": 17739 + }, + { + "epoch": 1.4371354504212572, + "grad_norm": 0.05485132709145546, + "learning_rate": 0.00014238264548359513, + "loss": 0.3302, + "step": 17740 + }, + { + "epoch": 1.4372164614387557, + "grad_norm": 0.04441394284367561, + "learning_rate": 0.0001423781448310005, + "loss": 0.2643, + "step": 17741 + }, + { + "epoch": 1.4372974724562542, + "grad_norm": 0.04969778284430504, + "learning_rate": 0.00014237364417840588, + "loss": 0.2784, + "step": 17742 + }, + { + "epoch": 1.4373784834737524, + "grad_norm": 0.04707542806863785, + "learning_rate": 0.00014236914352581126, + "loss": 0.2669, + "step": 17743 + }, + { + "epoch": 1.4374594944912509, + "grad_norm": 0.04709019884467125, + "learning_rate": 0.00014236464287321662, + "loss": 0.2504, + "step": 17744 + }, + { + "epoch": 1.4375405055087491, + "grad_norm": 0.04919714853167534, + "learning_rate": 0.00014236014222062198, + "loss": 0.2899, + "step": 17745 + }, + { + "epoch": 1.4376215165262476, + "grad_norm": 0.043658845126628876, + "learning_rate": 0.00014235564156802737, + "loss": 0.3024, + "step": 17746 + }, + { + "epoch": 1.4377025275437458, + "grad_norm": 0.04478023573756218, + "learning_rate": 0.00014235114091543273, + "loss": 0.3129, + "step": 17747 + }, + { + "epoch": 1.4377835385612443, + "grad_norm": 0.04204754903912544, + "learning_rate": 0.00014234664026283812, + "loss": 0.2523, + "step": 17748 + }, + { + "epoch": 1.4378645495787428, + "grad_norm": 0.04705436900258064, + "learning_rate": 0.0001423421396102435, + "loss": 0.3134, + "step": 17749 + }, + { + "epoch": 1.437945560596241, + "grad_norm": 0.045828547328710556, + "learning_rate": 0.00014233763895764886, + "loss": 0.2992, + "step": 17750 + }, + { + "epoch": 1.4380265716137395, + "grad_norm": 0.04419634863734245, + "learning_rate": 0.00014233313830505423, + "loss": 0.2926, + "step": 17751 + }, + { + "epoch": 1.438107582631238, + "grad_norm": 0.047092728316783905, + "learning_rate": 0.0001423286376524596, + "loss": 0.251, + "step": 17752 + }, + { + "epoch": 1.4381885936487362, + "grad_norm": 0.0492546483874321, + "learning_rate": 0.00014232413699986497, + "loss": 0.2937, + "step": 17753 + }, + { + "epoch": 1.4382696046662347, + "grad_norm": 
0.04803192988038063, + "learning_rate": 0.00014231963634727036, + "loss": 0.2688, + "step": 17754 + }, + { + "epoch": 1.4383506156837331, + "grad_norm": 0.05287334322929382, + "learning_rate": 0.00014231513569467575, + "loss": 0.3084, + "step": 17755 + }, + { + "epoch": 1.4384316267012314, + "grad_norm": 0.04357105493545532, + "learning_rate": 0.0001423106350420811, + "loss": 0.2838, + "step": 17756 + }, + { + "epoch": 1.4385126377187296, + "grad_norm": 0.03843948245048523, + "learning_rate": 0.00014230613438948647, + "loss": 0.2891, + "step": 17757 + }, + { + "epoch": 1.438593648736228, + "grad_norm": 0.04217882826924324, + "learning_rate": 0.00014230163373689185, + "loss": 0.278, + "step": 17758 + }, + { + "epoch": 1.4386746597537265, + "grad_norm": 0.04510761424899101, + "learning_rate": 0.00014229713308429724, + "loss": 0.2839, + "step": 17759 + }, + { + "epoch": 1.4387556707712248, + "grad_norm": 0.04868185892701149, + "learning_rate": 0.0001422926324317026, + "loss": 0.3012, + "step": 17760 + }, + { + "epoch": 1.4388366817887233, + "grad_norm": 0.04734332486987114, + "learning_rate": 0.000142288131779108, + "loss": 0.3107, + "step": 17761 + }, + { + "epoch": 1.4389176928062217, + "grad_norm": 0.05641593411564827, + "learning_rate": 0.00014228363112651335, + "loss": 0.3275, + "step": 17762 + }, + { + "epoch": 1.43899870382372, + "grad_norm": 0.04929044097661972, + "learning_rate": 0.0001422791304739187, + "loss": 0.3088, + "step": 17763 + }, + { + "epoch": 1.4390797148412184, + "grad_norm": 0.055661290884017944, + "learning_rate": 0.0001422746298213241, + "loss": 0.2993, + "step": 17764 + }, + { + "epoch": 1.439160725858717, + "grad_norm": 0.04595927894115448, + "learning_rate": 0.00014227012916872948, + "loss": 0.2959, + "step": 17765 + }, + { + "epoch": 1.4392417368762151, + "grad_norm": 0.04587925225496292, + "learning_rate": 0.00014226562851613484, + "loss": 0.2938, + "step": 17766 + }, + { + "epoch": 1.4393227478937136, + "grad_norm": 0.043285440653562546, + "learning_rate": 0.00014226112786354023, + "loss": 0.2862, + "step": 17767 + }, + { + "epoch": 1.4394037589112119, + "grad_norm": 0.04577759653329849, + "learning_rate": 0.0001422566272109456, + "loss": 0.284, + "step": 17768 + }, + { + "epoch": 1.4394847699287103, + "grad_norm": 0.05022251978516579, + "learning_rate": 0.00014225212655835095, + "loss": 0.2691, + "step": 17769 + }, + { + "epoch": 1.4395657809462086, + "grad_norm": 0.04994913190603256, + "learning_rate": 0.00014224762590575634, + "loss": 0.347, + "step": 17770 + }, + { + "epoch": 1.439646791963707, + "grad_norm": 0.04189305379986763, + "learning_rate": 0.00014224312525316172, + "loss": 0.2782, + "step": 17771 + }, + { + "epoch": 1.4397278029812055, + "grad_norm": 0.05979640409350395, + "learning_rate": 0.00014223862460056708, + "loss": 0.275, + "step": 17772 + }, + { + "epoch": 1.4398088139987038, + "grad_norm": 0.041147999465465546, + "learning_rate": 0.00014223412394797247, + "loss": 0.2508, + "step": 17773 + }, + { + "epoch": 1.4398898250162022, + "grad_norm": 0.05074213445186615, + "learning_rate": 0.00014222962329537783, + "loss": 0.2646, + "step": 17774 + }, + { + "epoch": 1.4399708360337007, + "grad_norm": 0.0547347255051136, + "learning_rate": 0.0001422251226427832, + "loss": 0.2874, + "step": 17775 + }, + { + "epoch": 1.440051847051199, + "grad_norm": 0.04395623505115509, + "learning_rate": 0.00014222062199018858, + "loss": 0.2624, + "step": 17776 + }, + { + "epoch": 1.4401328580686974, + "grad_norm": 0.05009927973151207, + "learning_rate": 
0.00014221612133759397, + "loss": 0.3157, + "step": 17777 + }, + { + "epoch": 1.4402138690861956, + "grad_norm": 0.05450264737010002, + "learning_rate": 0.00014221162068499933, + "loss": 0.3138, + "step": 17778 + }, + { + "epoch": 1.440294880103694, + "grad_norm": 0.05897742137312889, + "learning_rate": 0.0001422071200324047, + "loss": 0.3221, + "step": 17779 + }, + { + "epoch": 1.4403758911211924, + "grad_norm": 0.049845121800899506, + "learning_rate": 0.00014220261937981007, + "loss": 0.2763, + "step": 17780 + }, + { + "epoch": 1.4404569021386908, + "grad_norm": 0.04709333926439285, + "learning_rate": 0.00014219811872721543, + "loss": 0.3036, + "step": 17781 + }, + { + "epoch": 1.4405379131561893, + "grad_norm": 0.0525975339114666, + "learning_rate": 0.00014219361807462082, + "loss": 0.3123, + "step": 17782 + }, + { + "epoch": 1.4406189241736875, + "grad_norm": 0.04980793967843056, + "learning_rate": 0.0001421891174220262, + "loss": 0.3106, + "step": 17783 + }, + { + "epoch": 1.440699935191186, + "grad_norm": 0.04696754738688469, + "learning_rate": 0.00014218461676943157, + "loss": 0.3115, + "step": 17784 + }, + { + "epoch": 1.4407809462086845, + "grad_norm": 0.046658169478178024, + "learning_rate": 0.00014218011611683695, + "loss": 0.2919, + "step": 17785 + }, + { + "epoch": 1.4408619572261827, + "grad_norm": 0.058640480041503906, + "learning_rate": 0.00014217561546424231, + "loss": 0.3256, + "step": 17786 + }, + { + "epoch": 1.4409429682436812, + "grad_norm": 0.04588627442717552, + "learning_rate": 0.00014217111481164767, + "loss": 0.2918, + "step": 17787 + }, + { + "epoch": 1.4410239792611796, + "grad_norm": 0.04578271880745888, + "learning_rate": 0.0001421666141590531, + "loss": 0.2941, + "step": 17788 + }, + { + "epoch": 1.4411049902786779, + "grad_norm": 0.05437834560871124, + "learning_rate": 0.00014216211350645845, + "loss": 0.3152, + "step": 17789 + }, + { + "epoch": 1.4411860012961764, + "grad_norm": 0.05292617529630661, + "learning_rate": 0.0001421576128538638, + "loss": 0.3266, + "step": 17790 + }, + { + "epoch": 1.4412670123136746, + "grad_norm": 0.04186420515179634, + "learning_rate": 0.0001421531122012692, + "loss": 0.2849, + "step": 17791 + }, + { + "epoch": 1.441348023331173, + "grad_norm": 0.04509038105607033, + "learning_rate": 0.00014214861154867456, + "loss": 0.3231, + "step": 17792 + }, + { + "epoch": 1.4414290343486713, + "grad_norm": 0.053367290645837784, + "learning_rate": 0.00014214411089607992, + "loss": 0.338, + "step": 17793 + }, + { + "epoch": 1.4415100453661698, + "grad_norm": 0.04629664123058319, + "learning_rate": 0.00014213961024348533, + "loss": 0.2907, + "step": 17794 + }, + { + "epoch": 1.4415910563836682, + "grad_norm": 0.04417245090007782, + "learning_rate": 0.0001421351095908907, + "loss": 0.2831, + "step": 17795 + }, + { + "epoch": 1.4416720674011665, + "grad_norm": 0.04873376712203026, + "learning_rate": 0.00014213060893829605, + "loss": 0.2693, + "step": 17796 + }, + { + "epoch": 1.441753078418665, + "grad_norm": 0.0454351082444191, + "learning_rate": 0.00014212610828570144, + "loss": 0.2783, + "step": 17797 + }, + { + "epoch": 1.4418340894361634, + "grad_norm": 0.052286408841609955, + "learning_rate": 0.0001421216076331068, + "loss": 0.3376, + "step": 17798 + }, + { + "epoch": 1.4419151004536617, + "grad_norm": 0.04228602349758148, + "learning_rate": 0.00014211710698051216, + "loss": 0.257, + "step": 17799 + }, + { + "epoch": 1.4419961114711601, + "grad_norm": 0.05968720465898514, + "learning_rate": 0.00014211260632791757, + "loss": 0.3144, + 
"step": 17800 + }, + { + "epoch": 1.4420771224886584, + "grad_norm": 0.05366584286093712, + "learning_rate": 0.00014210810567532293, + "loss": 0.3307, + "step": 17801 + }, + { + "epoch": 1.4421581335061568, + "grad_norm": 0.05395115166902542, + "learning_rate": 0.0001421036050227283, + "loss": 0.3175, + "step": 17802 + }, + { + "epoch": 1.442239144523655, + "grad_norm": 0.05399424210190773, + "learning_rate": 0.00014209910437013368, + "loss": 0.3105, + "step": 17803 + }, + { + "epoch": 1.4423201555411536, + "grad_norm": 0.041070666164159775, + "learning_rate": 0.00014209460371753904, + "loss": 0.2223, + "step": 17804 + }, + { + "epoch": 1.442401166558652, + "grad_norm": 0.05013779550790787, + "learning_rate": 0.0001420901030649444, + "loss": 0.3038, + "step": 17805 + }, + { + "epoch": 1.4424821775761503, + "grad_norm": 0.04620078206062317, + "learning_rate": 0.00014208560241234981, + "loss": 0.2561, + "step": 17806 + }, + { + "epoch": 1.4425631885936487, + "grad_norm": 0.050512149930000305, + "learning_rate": 0.00014208110175975517, + "loss": 0.2422, + "step": 17807 + }, + { + "epoch": 1.4426441996111472, + "grad_norm": 0.041955072432756424, + "learning_rate": 0.00014207660110716053, + "loss": 0.3296, + "step": 17808 + }, + { + "epoch": 1.4427252106286454, + "grad_norm": 0.043352846056222916, + "learning_rate": 0.00014207210045456592, + "loss": 0.2497, + "step": 17809 + }, + { + "epoch": 1.442806221646144, + "grad_norm": 0.05135820806026459, + "learning_rate": 0.00014206759980197128, + "loss": 0.2877, + "step": 17810 + }, + { + "epoch": 1.4428872326636424, + "grad_norm": 0.04289088025689125, + "learning_rate": 0.00014206309914937667, + "loss": 0.2951, + "step": 17811 + }, + { + "epoch": 1.4429682436811406, + "grad_norm": 0.053548287600278854, + "learning_rate": 0.00014205859849678206, + "loss": 0.2862, + "step": 17812 + }, + { + "epoch": 1.443049254698639, + "grad_norm": 0.04844128340482712, + "learning_rate": 0.00014205409784418742, + "loss": 0.2694, + "step": 17813 + }, + { + "epoch": 1.4431302657161373, + "grad_norm": 0.059893593192100525, + "learning_rate": 0.00014204959719159278, + "loss": 0.3669, + "step": 17814 + }, + { + "epoch": 1.4432112767336358, + "grad_norm": 0.04999880865216255, + "learning_rate": 0.00014204509653899816, + "loss": 0.2994, + "step": 17815 + }, + { + "epoch": 1.443292287751134, + "grad_norm": 0.054112501442432404, + "learning_rate": 0.00014204059588640352, + "loss": 0.3448, + "step": 17816 + }, + { + "epoch": 1.4433732987686325, + "grad_norm": 0.057990577071905136, + "learning_rate": 0.0001420360952338089, + "loss": 0.3429, + "step": 17817 + }, + { + "epoch": 1.443454309786131, + "grad_norm": 0.047677554190158844, + "learning_rate": 0.0001420315945812143, + "loss": 0.3265, + "step": 17818 + }, + { + "epoch": 1.4435353208036292, + "grad_norm": 0.04634621739387512, + "learning_rate": 0.00014202709392861966, + "loss": 0.3059, + "step": 17819 + }, + { + "epoch": 1.4436163318211277, + "grad_norm": 0.05056798458099365, + "learning_rate": 0.00014202259327602502, + "loss": 0.3319, + "step": 17820 + }, + { + "epoch": 1.4436973428386262, + "grad_norm": 0.04358064383268356, + "learning_rate": 0.0001420180926234304, + "loss": 0.3111, + "step": 17821 + }, + { + "epoch": 1.4437783538561244, + "grad_norm": 0.05352209135890007, + "learning_rate": 0.00014201359197083576, + "loss": 0.2741, + "step": 17822 + }, + { + "epoch": 1.4438593648736229, + "grad_norm": 0.05255879834294319, + "learning_rate": 0.00014200909131824115, + "loss": 0.3207, + "step": 17823 + }, + { + "epoch": 
1.4439403758911211, + "grad_norm": 0.048375453799963, + "learning_rate": 0.00014200459066564654, + "loss": 0.2694, + "step": 17824 + }, + { + "epoch": 1.4440213869086196, + "grad_norm": 0.04612603783607483, + "learning_rate": 0.0001420000900130519, + "loss": 0.2852, + "step": 17825 + }, + { + "epoch": 1.4441023979261178, + "grad_norm": 0.041807424277067184, + "learning_rate": 0.00014199558936045726, + "loss": 0.3001, + "step": 17826 + }, + { + "epoch": 1.4441834089436163, + "grad_norm": 0.04739182069897652, + "learning_rate": 0.00014199108870786265, + "loss": 0.337, + "step": 17827 + }, + { + "epoch": 1.4442644199611148, + "grad_norm": 0.04741252213716507, + "learning_rate": 0.000141986588055268, + "loss": 0.2582, + "step": 17828 + }, + { + "epoch": 1.444345430978613, + "grad_norm": 0.05981162562966347, + "learning_rate": 0.0001419820874026734, + "loss": 0.3234, + "step": 17829 + }, + { + "epoch": 1.4444264419961115, + "grad_norm": 0.04911743104457855, + "learning_rate": 0.00014197758675007878, + "loss": 0.3054, + "step": 17830 + }, + { + "epoch": 1.44450745301361, + "grad_norm": 0.04073718190193176, + "learning_rate": 0.00014197308609748414, + "loss": 0.2575, + "step": 17831 + }, + { + "epoch": 1.4445884640311082, + "grad_norm": 0.06219786778092384, + "learning_rate": 0.0001419685854448895, + "loss": 0.3347, + "step": 17832 + }, + { + "epoch": 1.4446694750486067, + "grad_norm": 0.054439812898635864, + "learning_rate": 0.0001419640847922949, + "loss": 0.2808, + "step": 17833 + }, + { + "epoch": 1.4447504860661051, + "grad_norm": 0.05290693789720535, + "learning_rate": 0.00014195958413970025, + "loss": 0.2597, + "step": 17834 + }, + { + "epoch": 1.4448314970836034, + "grad_norm": 0.04396319016814232, + "learning_rate": 0.00014195508348710563, + "loss": 0.287, + "step": 17835 + }, + { + "epoch": 1.4449125081011016, + "grad_norm": 0.04777374863624573, + "learning_rate": 0.00014195058283451102, + "loss": 0.3194, + "step": 17836 + }, + { + "epoch": 1.4449935191186, + "grad_norm": 0.041372284293174744, + "learning_rate": 0.00014194608218191638, + "loss": 0.2489, + "step": 17837 + }, + { + "epoch": 1.4450745301360985, + "grad_norm": 0.048189952969551086, + "learning_rate": 0.00014194158152932174, + "loss": 0.2694, + "step": 17838 + }, + { + "epoch": 1.4451555411535968, + "grad_norm": 0.04953427240252495, + "learning_rate": 0.00014193708087672713, + "loss": 0.3139, + "step": 17839 + }, + { + "epoch": 1.4452365521710953, + "grad_norm": 0.0434766449034214, + "learning_rate": 0.00014193258022413252, + "loss": 0.3034, + "step": 17840 + }, + { + "epoch": 1.4453175631885937, + "grad_norm": 0.0509701706469059, + "learning_rate": 0.00014192807957153788, + "loss": 0.2699, + "step": 17841 + }, + { + "epoch": 1.445398574206092, + "grad_norm": 0.044415779411792755, + "learning_rate": 0.00014192357891894326, + "loss": 0.2832, + "step": 17842 + }, + { + "epoch": 1.4454795852235904, + "grad_norm": 0.048741765320301056, + "learning_rate": 0.00014191907826634862, + "loss": 0.2821, + "step": 17843 + }, + { + "epoch": 1.445560596241089, + "grad_norm": 0.043712157756090164, + "learning_rate": 0.00014191457761375398, + "loss": 0.286, + "step": 17844 + }, + { + "epoch": 1.4456416072585871, + "grad_norm": 0.051524609327316284, + "learning_rate": 0.00014191007696115937, + "loss": 0.3056, + "step": 17845 + }, + { + "epoch": 1.4457226182760856, + "grad_norm": 0.051667921245098114, + "learning_rate": 0.00014190557630856476, + "loss": 0.278, + "step": 17846 + }, + { + "epoch": 1.4458036292935839, + "grad_norm": 
0.05290085822343826, + "learning_rate": 0.00014190107565597012, + "loss": 0.2713, + "step": 17847 + }, + { + "epoch": 1.4458846403110823, + "grad_norm": 0.04937123507261276, + "learning_rate": 0.0001418965750033755, + "loss": 0.2541, + "step": 17848 + }, + { + "epoch": 1.4459656513285806, + "grad_norm": 0.04817288741469383, + "learning_rate": 0.00014189207435078087, + "loss": 0.2999, + "step": 17849 + }, + { + "epoch": 1.446046662346079, + "grad_norm": 0.061793092638254166, + "learning_rate": 0.00014188757369818623, + "loss": 0.3782, + "step": 17850 + }, + { + "epoch": 1.4461276733635775, + "grad_norm": 0.04492030665278435, + "learning_rate": 0.0001418830730455916, + "loss": 0.2565, + "step": 17851 + }, + { + "epoch": 1.4462086843810757, + "grad_norm": 0.04402526840567589, + "learning_rate": 0.000141878572392997, + "loss": 0.2974, + "step": 17852 + }, + { + "epoch": 1.4462896953985742, + "grad_norm": 0.04675083979964256, + "learning_rate": 0.00014187407174040236, + "loss": 0.2617, + "step": 17853 + }, + { + "epoch": 1.4463707064160727, + "grad_norm": 0.046562857925891876, + "learning_rate": 0.00014186957108780775, + "loss": 0.2916, + "step": 17854 + }, + { + "epoch": 1.446451717433571, + "grad_norm": 0.05099884793162346, + "learning_rate": 0.0001418650704352131, + "loss": 0.3285, + "step": 17855 + }, + { + "epoch": 1.4465327284510694, + "grad_norm": 0.04533091560006142, + "learning_rate": 0.00014186056978261847, + "loss": 0.2499, + "step": 17856 + }, + { + "epoch": 1.4466137394685679, + "grad_norm": 0.053434841334819794, + "learning_rate": 0.00014185606913002385, + "loss": 0.2926, + "step": 17857 + }, + { + "epoch": 1.446694750486066, + "grad_norm": 0.04376845434308052, + "learning_rate": 0.00014185156847742924, + "loss": 0.2999, + "step": 17858 + }, + { + "epoch": 1.4467757615035644, + "grad_norm": 0.046161361038684845, + "learning_rate": 0.0001418470678248346, + "loss": 0.2671, + "step": 17859 + }, + { + "epoch": 1.4468567725210628, + "grad_norm": 0.0418320931494236, + "learning_rate": 0.00014184256717224, + "loss": 0.2765, + "step": 17860 + }, + { + "epoch": 1.4469377835385613, + "grad_norm": 0.04614844545722008, + "learning_rate": 0.00014183806651964535, + "loss": 0.2622, + "step": 17861 + }, + { + "epoch": 1.4470187945560595, + "grad_norm": 0.04278077557682991, + "learning_rate": 0.0001418335658670507, + "loss": 0.2763, + "step": 17862 + }, + { + "epoch": 1.447099805573558, + "grad_norm": 0.051653698086738586, + "learning_rate": 0.00014182906521445612, + "loss": 0.289, + "step": 17863 + }, + { + "epoch": 1.4471808165910565, + "grad_norm": 0.04692598059773445, + "learning_rate": 0.00014182456456186148, + "loss": 0.2912, + "step": 17864 + }, + { + "epoch": 1.4472618276085547, + "grad_norm": 0.05240803584456444, + "learning_rate": 0.00014182006390926684, + "loss": 0.3039, + "step": 17865 + }, + { + "epoch": 1.4473428386260532, + "grad_norm": 0.04631833732128143, + "learning_rate": 0.00014181556325667223, + "loss": 0.2821, + "step": 17866 + }, + { + "epoch": 1.4474238496435516, + "grad_norm": 0.05302568897604942, + "learning_rate": 0.0001418110626040776, + "loss": 0.3315, + "step": 17867 + }, + { + "epoch": 1.4475048606610499, + "grad_norm": 0.0616886280477047, + "learning_rate": 0.00014180656195148295, + "loss": 0.3201, + "step": 17868 + }, + { + "epoch": 1.4475858716785484, + "grad_norm": 0.04690217599272728, + "learning_rate": 0.00014180206129888836, + "loss": 0.2642, + "step": 17869 + }, + { + "epoch": 1.4476668826960466, + "grad_norm": 0.059810154139995575, + "learning_rate": 
0.00014179756064629372, + "loss": 0.3053, + "step": 17870 + }, + { + "epoch": 1.447747893713545, + "grad_norm": 0.04747282341122627, + "learning_rate": 0.00014179305999369908, + "loss": 0.3423, + "step": 17871 + }, + { + "epoch": 1.4478289047310433, + "grad_norm": 0.05117661878466606, + "learning_rate": 0.00014178855934110447, + "loss": 0.3011, + "step": 17872 + }, + { + "epoch": 1.4479099157485418, + "grad_norm": 0.045453134924173355, + "learning_rate": 0.00014178405868850983, + "loss": 0.2932, + "step": 17873 + }, + { + "epoch": 1.4479909267660402, + "grad_norm": 0.04899902641773224, + "learning_rate": 0.00014177955803591522, + "loss": 0.3111, + "step": 17874 + }, + { + "epoch": 1.4480719377835385, + "grad_norm": 0.04835022613406181, + "learning_rate": 0.0001417750573833206, + "loss": 0.2965, + "step": 17875 + }, + { + "epoch": 1.448152948801037, + "grad_norm": 0.04425729438662529, + "learning_rate": 0.00014177055673072597, + "loss": 0.2807, + "step": 17876 + }, + { + "epoch": 1.4482339598185354, + "grad_norm": 0.04642229899764061, + "learning_rate": 0.00014176605607813133, + "loss": 0.2362, + "step": 17877 + }, + { + "epoch": 1.4483149708360337, + "grad_norm": 0.04590180143713951, + "learning_rate": 0.0001417615554255367, + "loss": 0.2883, + "step": 17878 + }, + { + "epoch": 1.4483959818535321, + "grad_norm": 0.046345289796590805, + "learning_rate": 0.00014175705477294207, + "loss": 0.2911, + "step": 17879 + }, + { + "epoch": 1.4484769928710304, + "grad_norm": 0.05325405299663544, + "learning_rate": 0.00014175255412034746, + "loss": 0.2957, + "step": 17880 + }, + { + "epoch": 1.4485580038885288, + "grad_norm": 0.05683707445859909, + "learning_rate": 0.00014174805346775285, + "loss": 0.2585, + "step": 17881 + }, + { + "epoch": 1.448639014906027, + "grad_norm": 0.04651009663939476, + "learning_rate": 0.0001417435528151582, + "loss": 0.2747, + "step": 17882 + }, + { + "epoch": 1.4487200259235256, + "grad_norm": 0.04978650435805321, + "learning_rate": 0.00014173905216256357, + "loss": 0.2927, + "step": 17883 + }, + { + "epoch": 1.448801036941024, + "grad_norm": 0.05127064511179924, + "learning_rate": 0.00014173455150996895, + "loss": 0.3347, + "step": 17884 + }, + { + "epoch": 1.4488820479585223, + "grad_norm": 0.04449443891644478, + "learning_rate": 0.00014173005085737431, + "loss": 0.2603, + "step": 17885 + }, + { + "epoch": 1.4489630589760207, + "grad_norm": 0.051162779331207275, + "learning_rate": 0.0001417255502047797, + "loss": 0.3022, + "step": 17886 + }, + { + "epoch": 1.4490440699935192, + "grad_norm": 0.04203864932060242, + "learning_rate": 0.0001417210495521851, + "loss": 0.2839, + "step": 17887 + }, + { + "epoch": 1.4491250810110174, + "grad_norm": 0.05202589929103851, + "learning_rate": 0.00014171654889959045, + "loss": 0.2748, + "step": 17888 + }, + { + "epoch": 1.449206092028516, + "grad_norm": 0.05412520840764046, + "learning_rate": 0.0001417120482469958, + "loss": 0.3208, + "step": 17889 + }, + { + "epoch": 1.4492871030460144, + "grad_norm": 0.040271785110235214, + "learning_rate": 0.0001417075475944012, + "loss": 0.2777, + "step": 17890 + }, + { + "epoch": 1.4493681140635126, + "grad_norm": 0.04579610377550125, + "learning_rate": 0.00014170304694180656, + "loss": 0.293, + "step": 17891 + }, + { + "epoch": 1.449449125081011, + "grad_norm": 0.05482166260480881, + "learning_rate": 0.00014169854628921194, + "loss": 0.3704, + "step": 17892 + }, + { + "epoch": 1.4495301360985093, + "grad_norm": 0.05623460188508034, + "learning_rate": 0.00014169404563661733, + "loss": 0.3182, + 
"step": 17893 + }, + { + "epoch": 1.4496111471160078, + "grad_norm": 0.04352636635303497, + "learning_rate": 0.0001416895449840227, + "loss": 0.2911, + "step": 17894 + }, + { + "epoch": 1.449692158133506, + "grad_norm": 0.04971605911850929, + "learning_rate": 0.00014168504433142805, + "loss": 0.3116, + "step": 17895 + }, + { + "epoch": 1.4497731691510045, + "grad_norm": 0.04764750599861145, + "learning_rate": 0.00014168054367883344, + "loss": 0.2911, + "step": 17896 + }, + { + "epoch": 1.449854180168503, + "grad_norm": 0.0546216182410717, + "learning_rate": 0.0001416760430262388, + "loss": 0.3103, + "step": 17897 + }, + { + "epoch": 1.4499351911860012, + "grad_norm": 0.046539004892110825, + "learning_rate": 0.00014167154237364419, + "loss": 0.2938, + "step": 17898 + }, + { + "epoch": 1.4500162022034997, + "grad_norm": 0.043932583183050156, + "learning_rate": 0.00014166704172104957, + "loss": 0.2989, + "step": 17899 + }, + { + "epoch": 1.4500972132209982, + "grad_norm": 0.0462028943002224, + "learning_rate": 0.00014166254106845493, + "loss": 0.2953, + "step": 17900 + }, + { + "epoch": 1.4501782242384964, + "grad_norm": 0.04697510227560997, + "learning_rate": 0.0001416580404158603, + "loss": 0.3176, + "step": 17901 + }, + { + "epoch": 1.4502592352559949, + "grad_norm": 0.04830094054341316, + "learning_rate": 0.00014165353976326568, + "loss": 0.2974, + "step": 17902 + }, + { + "epoch": 1.4503402462734931, + "grad_norm": 0.05398035794496536, + "learning_rate": 0.00014164903911067104, + "loss": 0.3185, + "step": 17903 + }, + { + "epoch": 1.4504212572909916, + "grad_norm": 0.04325325787067413, + "learning_rate": 0.00014164453845807643, + "loss": 0.2901, + "step": 17904 + }, + { + "epoch": 1.4505022683084898, + "grad_norm": 0.05543939396739006, + "learning_rate": 0.00014164003780548181, + "loss": 0.299, + "step": 17905 + }, + { + "epoch": 1.4505832793259883, + "grad_norm": 0.046884406358003616, + "learning_rate": 0.00014163553715288717, + "loss": 0.3183, + "step": 17906 + }, + { + "epoch": 1.4506642903434868, + "grad_norm": 0.04522902891039848, + "learning_rate": 0.00014163103650029253, + "loss": 0.3069, + "step": 17907 + }, + { + "epoch": 1.450745301360985, + "grad_norm": 0.04732300341129303, + "learning_rate": 0.00014162653584769792, + "loss": 0.2952, + "step": 17908 + }, + { + "epoch": 1.4508263123784835, + "grad_norm": 0.04480345919728279, + "learning_rate": 0.00014162203519510328, + "loss": 0.2469, + "step": 17909 + }, + { + "epoch": 1.450907323395982, + "grad_norm": 0.04138052836060524, + "learning_rate": 0.00014161753454250867, + "loss": 0.259, + "step": 17910 + }, + { + "epoch": 1.4509883344134802, + "grad_norm": 0.04558480530977249, + "learning_rate": 0.00014161303388991406, + "loss": 0.2778, + "step": 17911 + }, + { + "epoch": 1.4510693454309787, + "grad_norm": 0.04157237336039543, + "learning_rate": 0.00014160853323731942, + "loss": 0.2622, + "step": 17912 + }, + { + "epoch": 1.4511503564484771, + "grad_norm": 0.04621091112494469, + "learning_rate": 0.00014160403258472478, + "loss": 0.3095, + "step": 17913 + }, + { + "epoch": 1.4512313674659754, + "grad_norm": 0.04249318689107895, + "learning_rate": 0.00014159953193213016, + "loss": 0.2948, + "step": 17914 + }, + { + "epoch": 1.4513123784834738, + "grad_norm": 0.05524726212024689, + "learning_rate": 0.00014159503127953555, + "loss": 0.3546, + "step": 17915 + }, + { + "epoch": 1.451393389500972, + "grad_norm": 0.05138620734214783, + "learning_rate": 0.0001415905306269409, + "loss": 0.2997, + "step": 17916 + }, + { + "epoch": 
1.4514744005184705, + "grad_norm": 0.04627459496259689, + "learning_rate": 0.0001415860299743463, + "loss": 0.269, + "step": 17917 + }, + { + "epoch": 1.4515554115359688, + "grad_norm": 0.042437851428985596, + "learning_rate": 0.00014158152932175166, + "loss": 0.3065, + "step": 17918 + }, + { + "epoch": 1.4516364225534673, + "grad_norm": 0.048960473388433456, + "learning_rate": 0.00014157702866915702, + "loss": 0.3007, + "step": 17919 + }, + { + "epoch": 1.4517174335709657, + "grad_norm": 0.04249783605337143, + "learning_rate": 0.0001415725280165624, + "loss": 0.2606, + "step": 17920 + }, + { + "epoch": 1.451798444588464, + "grad_norm": 0.0489497147500515, + "learning_rate": 0.0001415680273639678, + "loss": 0.3152, + "step": 17921 + }, + { + "epoch": 1.4518794556059624, + "grad_norm": 0.04541374370455742, + "learning_rate": 0.00014156352671137315, + "loss": 0.3066, + "step": 17922 + }, + { + "epoch": 1.451960466623461, + "grad_norm": 0.05575599893927574, + "learning_rate": 0.00014155902605877854, + "loss": 0.2829, + "step": 17923 + }, + { + "epoch": 1.4520414776409591, + "grad_norm": 0.05299101397395134, + "learning_rate": 0.0001415545254061839, + "loss": 0.3202, + "step": 17924 + }, + { + "epoch": 1.4521224886584576, + "grad_norm": 0.03805427998304367, + "learning_rate": 0.00014155002475358926, + "loss": 0.247, + "step": 17925 + }, + { + "epoch": 1.4522034996759559, + "grad_norm": 0.047305621206760406, + "learning_rate": 0.00014154552410099465, + "loss": 0.2711, + "step": 17926 + }, + { + "epoch": 1.4522845106934543, + "grad_norm": 0.05216123163700104, + "learning_rate": 0.00014154102344840003, + "loss": 0.2862, + "step": 17927 + }, + { + "epoch": 1.4523655217109526, + "grad_norm": 0.04718941077589989, + "learning_rate": 0.0001415365227958054, + "loss": 0.33, + "step": 17928 + }, + { + "epoch": 1.452446532728451, + "grad_norm": 0.04569919407367706, + "learning_rate": 0.00014153202214321078, + "loss": 0.2696, + "step": 17929 + }, + { + "epoch": 1.4525275437459495, + "grad_norm": 0.03612606227397919, + "learning_rate": 0.00014152752149061614, + "loss": 0.2425, + "step": 17930 + }, + { + "epoch": 1.4526085547634477, + "grad_norm": 0.05211987718939781, + "learning_rate": 0.0001415230208380215, + "loss": 0.2915, + "step": 17931 + }, + { + "epoch": 1.4526895657809462, + "grad_norm": 0.04864644259214401, + "learning_rate": 0.0001415185201854269, + "loss": 0.2833, + "step": 17932 + }, + { + "epoch": 1.4527705767984447, + "grad_norm": 0.05485297366976738, + "learning_rate": 0.00014151401953283228, + "loss": 0.2679, + "step": 17933 + }, + { + "epoch": 1.452851587815943, + "grad_norm": 0.05420330911874771, + "learning_rate": 0.00014150951888023764, + "loss": 0.3259, + "step": 17934 + }, + { + "epoch": 1.4529325988334414, + "grad_norm": 0.04696902632713318, + "learning_rate": 0.00014150501822764302, + "loss": 0.2796, + "step": 17935 + }, + { + "epoch": 1.4530136098509399, + "grad_norm": 0.05307691916823387, + "learning_rate": 0.00014150051757504838, + "loss": 0.2913, + "step": 17936 + }, + { + "epoch": 1.453094620868438, + "grad_norm": 0.05445937439799309, + "learning_rate": 0.00014149601692245374, + "loss": 0.2901, + "step": 17937 + }, + { + "epoch": 1.4531756318859366, + "grad_norm": 0.0543481819331646, + "learning_rate": 0.00014149151626985913, + "loss": 0.3006, + "step": 17938 + }, + { + "epoch": 1.4532566429034348, + "grad_norm": 0.0560697577893734, + "learning_rate": 0.00014148701561726452, + "loss": 0.2659, + "step": 17939 + }, + { + "epoch": 1.4533376539209333, + "grad_norm": 
0.04972623661160469, + "learning_rate": 0.00014148251496466988, + "loss": 0.2573, + "step": 17940 + }, + { + "epoch": 1.4534186649384315, + "grad_norm": 0.05344128608703613, + "learning_rate": 0.00014147801431207526, + "loss": 0.2826, + "step": 17941 + }, + { + "epoch": 1.45349967595593, + "grad_norm": 0.04564177617430687, + "learning_rate": 0.00014147351365948062, + "loss": 0.287, + "step": 17942 + }, + { + "epoch": 1.4535806869734285, + "grad_norm": 0.04605825990438461, + "learning_rate": 0.000141469013006886, + "loss": 0.2749, + "step": 17943 + }, + { + "epoch": 1.4536616979909267, + "grad_norm": 0.04860905557870865, + "learning_rate": 0.0001414645123542914, + "loss": 0.3092, + "step": 17944 + }, + { + "epoch": 1.4537427090084252, + "grad_norm": 0.048022761940956116, + "learning_rate": 0.00014146001170169676, + "loss": 0.2806, + "step": 17945 + }, + { + "epoch": 1.4538237200259236, + "grad_norm": 0.05513114109635353, + "learning_rate": 0.00014145551104910212, + "loss": 0.2919, + "step": 17946 + }, + { + "epoch": 1.4539047310434219, + "grad_norm": 0.04558014124631882, + "learning_rate": 0.0001414510103965075, + "loss": 0.2748, + "step": 17947 + }, + { + "epoch": 1.4539857420609203, + "grad_norm": 0.05039425566792488, + "learning_rate": 0.00014144650974391287, + "loss": 0.2663, + "step": 17948 + }, + { + "epoch": 1.4540667530784186, + "grad_norm": 0.03847365826368332, + "learning_rate": 0.00014144200909131825, + "loss": 0.2411, + "step": 17949 + }, + { + "epoch": 1.454147764095917, + "grad_norm": 0.04872952029109001, + "learning_rate": 0.00014143750843872364, + "loss": 0.3062, + "step": 17950 + }, + { + "epoch": 1.4542287751134153, + "grad_norm": 0.04281361401081085, + "learning_rate": 0.000141433007786129, + "loss": 0.285, + "step": 17951 + }, + { + "epoch": 1.4543097861309138, + "grad_norm": 0.04651939123868942, + "learning_rate": 0.00014142850713353436, + "loss": 0.2774, + "step": 17952 + }, + { + "epoch": 1.4543907971484122, + "grad_norm": 0.04647616669535637, + "learning_rate": 0.00014142400648093975, + "loss": 0.2742, + "step": 17953 + }, + { + "epoch": 1.4544718081659105, + "grad_norm": 0.05316271260380745, + "learning_rate": 0.0001414195058283451, + "loss": 0.3138, + "step": 17954 + }, + { + "epoch": 1.454552819183409, + "grad_norm": 0.048011794686317444, + "learning_rate": 0.0001414150051757505, + "loss": 0.3148, + "step": 17955 + }, + { + "epoch": 1.4546338302009074, + "grad_norm": 0.046368878334760666, + "learning_rate": 0.00014141050452315588, + "loss": 0.3113, + "step": 17956 + }, + { + "epoch": 1.4547148412184057, + "grad_norm": 0.05619870498776436, + "learning_rate": 0.00014140600387056124, + "loss": 0.2799, + "step": 17957 + }, + { + "epoch": 1.4547958522359041, + "grad_norm": 0.04389476403594017, + "learning_rate": 0.0001414015032179666, + "loss": 0.2668, + "step": 17958 + }, + { + "epoch": 1.4548768632534026, + "grad_norm": 0.0636439397931099, + "learning_rate": 0.000141397002565372, + "loss": 0.3739, + "step": 17959 + }, + { + "epoch": 1.4549578742709008, + "grad_norm": 0.04439515620470047, + "learning_rate": 0.00014139250191277735, + "loss": 0.2836, + "step": 17960 + }, + { + "epoch": 1.455038885288399, + "grad_norm": 0.05382426083087921, + "learning_rate": 0.00014138800126018274, + "loss": 0.2935, + "step": 17961 + }, + { + "epoch": 1.4551198963058976, + "grad_norm": 0.04022745415568352, + "learning_rate": 0.00014138350060758812, + "loss": 0.2283, + "step": 17962 + }, + { + "epoch": 1.455200907323396, + "grad_norm": 0.047797948122024536, + "learning_rate": 
0.00014137899995499348, + "loss": 0.2851, + "step": 17963 + }, + { + "epoch": 1.4552819183408943, + "grad_norm": 0.0559130422770977, + "learning_rate": 0.00014137449930239884, + "loss": 0.3301, + "step": 17964 + }, + { + "epoch": 1.4553629293583927, + "grad_norm": 0.05886486545205116, + "learning_rate": 0.00014136999864980423, + "loss": 0.3114, + "step": 17965 + }, + { + "epoch": 1.4554439403758912, + "grad_norm": 0.04601144418120384, + "learning_rate": 0.0001413654979972096, + "loss": 0.2874, + "step": 17966 + }, + { + "epoch": 1.4555249513933894, + "grad_norm": 0.05067246034741402, + "learning_rate": 0.00014136099734461498, + "loss": 0.3187, + "step": 17967 + }, + { + "epoch": 1.455605962410888, + "grad_norm": 0.04644129052758217, + "learning_rate": 0.00014135649669202036, + "loss": 0.2851, + "step": 17968 + }, + { + "epoch": 1.4556869734283864, + "grad_norm": 0.04704859480261803, + "learning_rate": 0.00014135199603942572, + "loss": 0.3031, + "step": 17969 + }, + { + "epoch": 1.4557679844458846, + "grad_norm": 0.04019991680979729, + "learning_rate": 0.00014134749538683108, + "loss": 0.293, + "step": 17970 + }, + { + "epoch": 1.455848995463383, + "grad_norm": 0.04760279878973961, + "learning_rate": 0.00014134299473423647, + "loss": 0.3085, + "step": 17971 + }, + { + "epoch": 1.4559300064808813, + "grad_norm": 0.03884115442633629, + "learning_rate": 0.00014133849408164183, + "loss": 0.2446, + "step": 17972 + }, + { + "epoch": 1.4560110174983798, + "grad_norm": 0.04347531870007515, + "learning_rate": 0.00014133399342904722, + "loss": 0.2866, + "step": 17973 + }, + { + "epoch": 1.456092028515878, + "grad_norm": 0.045292872935533524, + "learning_rate": 0.0001413294927764526, + "loss": 0.305, + "step": 17974 + }, + { + "epoch": 1.4561730395333765, + "grad_norm": 0.04531371593475342, + "learning_rate": 0.00014132499212385797, + "loss": 0.3083, + "step": 17975 + }, + { + "epoch": 1.456254050550875, + "grad_norm": 0.05037960782647133, + "learning_rate": 0.00014132049147126333, + "loss": 0.3167, + "step": 17976 + }, + { + "epoch": 1.4563350615683732, + "grad_norm": 0.04448464885354042, + "learning_rate": 0.00014131599081866871, + "loss": 0.2932, + "step": 17977 + }, + { + "epoch": 1.4564160725858717, + "grad_norm": 0.06180466711521149, + "learning_rate": 0.00014131149016607407, + "loss": 0.3675, + "step": 17978 + }, + { + "epoch": 1.4564970836033702, + "grad_norm": 0.04839387536048889, + "learning_rate": 0.00014130698951347946, + "loss": 0.2794, + "step": 17979 + }, + { + "epoch": 1.4565780946208684, + "grad_norm": 0.04605800285935402, + "learning_rate": 0.00014130248886088485, + "loss": 0.2985, + "step": 17980 + }, + { + "epoch": 1.4566591056383669, + "grad_norm": 0.04858919233083725, + "learning_rate": 0.0001412979882082902, + "loss": 0.3068, + "step": 17981 + }, + { + "epoch": 1.4567401166558653, + "grad_norm": 0.0511137954890728, + "learning_rate": 0.00014129348755569557, + "loss": 0.3077, + "step": 17982 + }, + { + "epoch": 1.4568211276733636, + "grad_norm": 0.040559422224760056, + "learning_rate": 0.00014128898690310096, + "loss": 0.2673, + "step": 17983 + }, + { + "epoch": 1.4569021386908618, + "grad_norm": 0.05546125769615173, + "learning_rate": 0.00014128448625050632, + "loss": 0.2827, + "step": 17984 + }, + { + "epoch": 1.4569831497083603, + "grad_norm": 0.047785788774490356, + "learning_rate": 0.0001412799855979117, + "loss": 0.2556, + "step": 17985 + }, + { + "epoch": 1.4570641607258588, + "grad_norm": 0.04613126814365387, + "learning_rate": 0.0001412754849453171, + "loss": 0.2681, + 
"step": 17986 + }, + { + "epoch": 1.457145171743357, + "grad_norm": 0.053333960473537445, + "learning_rate": 0.00014127098429272245, + "loss": 0.3264, + "step": 17987 + }, + { + "epoch": 1.4572261827608555, + "grad_norm": 0.0531015619635582, + "learning_rate": 0.0001412664836401278, + "loss": 0.3412, + "step": 17988 + }, + { + "epoch": 1.457307193778354, + "grad_norm": 0.04451011121273041, + "learning_rate": 0.0001412619829875332, + "loss": 0.282, + "step": 17989 + }, + { + "epoch": 1.4573882047958522, + "grad_norm": 0.04917840659618378, + "learning_rate": 0.00014125748233493856, + "loss": 0.2605, + "step": 17990 + }, + { + "epoch": 1.4574692158133506, + "grad_norm": 0.04786407947540283, + "learning_rate": 0.00014125298168234394, + "loss": 0.2776, + "step": 17991 + }, + { + "epoch": 1.4575502268308491, + "grad_norm": 0.04443423077464104, + "learning_rate": 0.00014124848102974933, + "loss": 0.2901, + "step": 17992 + }, + { + "epoch": 1.4576312378483474, + "grad_norm": 0.05426434054970741, + "learning_rate": 0.0001412439803771547, + "loss": 0.3013, + "step": 17993 + }, + { + "epoch": 1.4577122488658458, + "grad_norm": 0.05337275192141533, + "learning_rate": 0.00014123947972456005, + "loss": 0.2869, + "step": 17994 + }, + { + "epoch": 1.457793259883344, + "grad_norm": 0.04778950288891792, + "learning_rate": 0.00014123497907196544, + "loss": 0.2652, + "step": 17995 + }, + { + "epoch": 1.4578742709008425, + "grad_norm": 0.05062709003686905, + "learning_rate": 0.00014123047841937083, + "loss": 0.329, + "step": 17996 + }, + { + "epoch": 1.4579552819183408, + "grad_norm": 0.045372236520051956, + "learning_rate": 0.00014122597776677619, + "loss": 0.2665, + "step": 17997 + }, + { + "epoch": 1.4580362929358393, + "grad_norm": 0.058160923421382904, + "learning_rate": 0.00014122147711418157, + "loss": 0.3171, + "step": 17998 + }, + { + "epoch": 1.4581173039533377, + "grad_norm": 0.052749160677194595, + "learning_rate": 0.00014121697646158693, + "loss": 0.2994, + "step": 17999 + }, + { + "epoch": 1.458198314970836, + "grad_norm": 0.04014609754085541, + "learning_rate": 0.0001412124758089923, + "loss": 0.2543, + "step": 18000 + }, + { + "epoch": 1.4582793259883344, + "grad_norm": 0.050106558948755264, + "learning_rate": 0.00014120797515639768, + "loss": 0.3141, + "step": 18001 + }, + { + "epoch": 1.458360337005833, + "grad_norm": 0.061030786484479904, + "learning_rate": 0.00014120347450380307, + "loss": 0.2815, + "step": 18002 + }, + { + "epoch": 1.4584413480233311, + "grad_norm": 0.047040242701768875, + "learning_rate": 0.00014119897385120843, + "loss": 0.2907, + "step": 18003 + }, + { + "epoch": 1.4585223590408296, + "grad_norm": 0.06359859555959702, + "learning_rate": 0.00014119447319861381, + "loss": 0.34, + "step": 18004 + }, + { + "epoch": 1.4586033700583279, + "grad_norm": 0.04778430610895157, + "learning_rate": 0.00014118997254601917, + "loss": 0.3083, + "step": 18005 + }, + { + "epoch": 1.4586843810758263, + "grad_norm": 0.04784518852829933, + "learning_rate": 0.00014118547189342453, + "loss": 0.2844, + "step": 18006 + }, + { + "epoch": 1.4587653920933246, + "grad_norm": 0.04674403741955757, + "learning_rate": 0.00014118097124082992, + "loss": 0.3182, + "step": 18007 + }, + { + "epoch": 1.458846403110823, + "grad_norm": 0.04453251138329506, + "learning_rate": 0.0001411764705882353, + "loss": 0.2985, + "step": 18008 + }, + { + "epoch": 1.4589274141283215, + "grad_norm": 0.06165555119514465, + "learning_rate": 0.00014117196993564067, + "loss": 0.3341, + "step": 18009 + }, + { + "epoch": 
1.4590084251458197, + "grad_norm": 0.04634040594100952, + "learning_rate": 0.00014116746928304606, + "loss": 0.3042, + "step": 18010 + }, + { + "epoch": 1.4590894361633182, + "grad_norm": 0.04452341049909592, + "learning_rate": 0.00014116296863045142, + "loss": 0.3252, + "step": 18011 + }, + { + "epoch": 1.4591704471808167, + "grad_norm": 0.041508760303258896, + "learning_rate": 0.0001411584679778568, + "loss": 0.2849, + "step": 18012 + }, + { + "epoch": 1.459251458198315, + "grad_norm": 0.05098029226064682, + "learning_rate": 0.00014115396732526216, + "loss": 0.2989, + "step": 18013 + }, + { + "epoch": 1.4593324692158134, + "grad_norm": 0.04109809920191765, + "learning_rate": 0.00014114946667266755, + "loss": 0.2533, + "step": 18014 + }, + { + "epoch": 1.4594134802333119, + "grad_norm": 0.043970998376607895, + "learning_rate": 0.0001411449660200729, + "loss": 0.2887, + "step": 18015 + }, + { + "epoch": 1.45949449125081, + "grad_norm": 0.043616171926259995, + "learning_rate": 0.0001411404653674783, + "loss": 0.2799, + "step": 18016 + }, + { + "epoch": 1.4595755022683086, + "grad_norm": 0.046028878539800644, + "learning_rate": 0.00014113596471488366, + "loss": 0.2528, + "step": 18017 + }, + { + "epoch": 1.4596565132858068, + "grad_norm": 0.037560880184173584, + "learning_rate": 0.00014113146406228904, + "loss": 0.2436, + "step": 18018 + }, + { + "epoch": 1.4597375243033053, + "grad_norm": 0.059593748301267624, + "learning_rate": 0.0001411269634096944, + "loss": 0.3001, + "step": 18019 + }, + { + "epoch": 1.4598185353208035, + "grad_norm": 0.046373482793569565, + "learning_rate": 0.0001411224627570998, + "loss": 0.232, + "step": 18020 + }, + { + "epoch": 1.459899546338302, + "grad_norm": 0.05016429349780083, + "learning_rate": 0.00014111796210450515, + "loss": 0.2768, + "step": 18021 + }, + { + "epoch": 1.4599805573558005, + "grad_norm": 0.05345659330487251, + "learning_rate": 0.00014111346145191054, + "loss": 0.2539, + "step": 18022 + }, + { + "epoch": 1.4600615683732987, + "grad_norm": 0.05080629140138626, + "learning_rate": 0.0001411089607993159, + "loss": 0.2969, + "step": 18023 + }, + { + "epoch": 1.4601425793907972, + "grad_norm": 0.049367163330316544, + "learning_rate": 0.0001411044601467213, + "loss": 0.2725, + "step": 18024 + }, + { + "epoch": 1.4602235904082956, + "grad_norm": 0.048777006566524506, + "learning_rate": 0.00014109995949412667, + "loss": 0.2831, + "step": 18025 + }, + { + "epoch": 1.4603046014257939, + "grad_norm": 0.05571383982896805, + "learning_rate": 0.00014109545884153203, + "loss": 0.3142, + "step": 18026 + }, + { + "epoch": 1.4603856124432923, + "grad_norm": 0.051831867545843124, + "learning_rate": 0.0001410909581889374, + "loss": 0.2662, + "step": 18027 + }, + { + "epoch": 1.4604666234607906, + "grad_norm": 0.0479249432682991, + "learning_rate": 0.00014108645753634278, + "loss": 0.279, + "step": 18028 + }, + { + "epoch": 1.460547634478289, + "grad_norm": 0.05813343822956085, + "learning_rate": 0.00014108195688374814, + "loss": 0.3218, + "step": 18029 + }, + { + "epoch": 1.4606286454957873, + "grad_norm": 0.06661345064640045, + "learning_rate": 0.00014107745623115353, + "loss": 0.3647, + "step": 18030 + }, + { + "epoch": 1.4607096565132858, + "grad_norm": 0.048136595636606216, + "learning_rate": 0.00014107295557855892, + "loss": 0.3022, + "step": 18031 + }, + { + "epoch": 1.4607906675307842, + "grad_norm": 0.046726103872060776, + "learning_rate": 0.00014106845492596428, + "loss": 0.3248, + "step": 18032 + }, + { + "epoch": 1.4608716785482825, + "grad_norm": 
0.04466182738542557, + "learning_rate": 0.00014106395427336964, + "loss": 0.2979, + "step": 18033 + }, + { + "epoch": 1.460952689565781, + "grad_norm": 0.04451071098446846, + "learning_rate": 0.00014105945362077502, + "loss": 0.2743, + "step": 18034 + }, + { + "epoch": 1.4610337005832794, + "grad_norm": 0.045306671410799026, + "learning_rate": 0.00014105495296818038, + "loss": 0.2636, + "step": 18035 + }, + { + "epoch": 1.4611147116007777, + "grad_norm": 0.044049546122550964, + "learning_rate": 0.00014105045231558577, + "loss": 0.2532, + "step": 18036 + }, + { + "epoch": 1.4611957226182761, + "grad_norm": 0.04672233387827873, + "learning_rate": 0.00014104595166299116, + "loss": 0.2601, + "step": 18037 + }, + { + "epoch": 1.4612767336357746, + "grad_norm": 0.05258664861321449, + "learning_rate": 0.00014104145101039652, + "loss": 0.3111, + "step": 18038 + }, + { + "epoch": 1.4613577446532728, + "grad_norm": 0.04627470672130585, + "learning_rate": 0.00014103695035780188, + "loss": 0.3116, + "step": 18039 + }, + { + "epoch": 1.4614387556707713, + "grad_norm": 0.054817069321870804, + "learning_rate": 0.00014103244970520726, + "loss": 0.3016, + "step": 18040 + }, + { + "epoch": 1.4615197666882696, + "grad_norm": 0.045628152787685394, + "learning_rate": 0.00014102794905261262, + "loss": 0.2678, + "step": 18041 + }, + { + "epoch": 1.461600777705768, + "grad_norm": 0.04165344685316086, + "learning_rate": 0.000141023448400018, + "loss": 0.3255, + "step": 18042 + }, + { + "epoch": 1.4616817887232663, + "grad_norm": 0.056146860122680664, + "learning_rate": 0.0001410189477474234, + "loss": 0.3322, + "step": 18043 + }, + { + "epoch": 1.4617627997407647, + "grad_norm": 0.04300034046173096, + "learning_rate": 0.00014101444709482876, + "loss": 0.2555, + "step": 18044 + }, + { + "epoch": 1.4618438107582632, + "grad_norm": 0.04808373376727104, + "learning_rate": 0.00014100994644223412, + "loss": 0.3001, + "step": 18045 + }, + { + "epoch": 1.4619248217757614, + "grad_norm": 0.04786072298884392, + "learning_rate": 0.0001410054457896395, + "loss": 0.3113, + "step": 18046 + }, + { + "epoch": 1.46200583279326, + "grad_norm": 0.04329356551170349, + "learning_rate": 0.00014100094513704487, + "loss": 0.2971, + "step": 18047 + }, + { + "epoch": 1.4620868438107584, + "grad_norm": 0.0741897001862526, + "learning_rate": 0.00014099644448445025, + "loss": 0.3311, + "step": 18048 + }, + { + "epoch": 1.4621678548282566, + "grad_norm": 0.0446975976228714, + "learning_rate": 0.00014099194383185564, + "loss": 0.2803, + "step": 18049 + }, + { + "epoch": 1.462248865845755, + "grad_norm": 0.054658737033605576, + "learning_rate": 0.000140987443179261, + "loss": 0.3161, + "step": 18050 + }, + { + "epoch": 1.4623298768632533, + "grad_norm": 0.053584374487400055, + "learning_rate": 0.00014098294252666636, + "loss": 0.2971, + "step": 18051 + }, + { + "epoch": 1.4624108878807518, + "grad_norm": 0.04646073654294014, + "learning_rate": 0.00014097844187407175, + "loss": 0.2612, + "step": 18052 + }, + { + "epoch": 1.46249189889825, + "grad_norm": 0.049993809312582016, + "learning_rate": 0.0001409739412214771, + "loss": 0.3111, + "step": 18053 + }, + { + "epoch": 1.4625729099157485, + "grad_norm": 0.05515061691403389, + "learning_rate": 0.0001409694405688825, + "loss": 0.2937, + "step": 18054 + }, + { + "epoch": 1.462653920933247, + "grad_norm": 0.042825594544410706, + "learning_rate": 0.00014096493991628788, + "loss": 0.2597, + "step": 18055 + }, + { + "epoch": 1.4627349319507452, + "grad_norm": 0.04601466655731201, + "learning_rate": 
0.00014096043926369324, + "loss": 0.2796, + "step": 18056 + }, + { + "epoch": 1.4628159429682437, + "grad_norm": 0.04693853110074997, + "learning_rate": 0.0001409559386110986, + "loss": 0.2861, + "step": 18057 + }, + { + "epoch": 1.4628969539857422, + "grad_norm": 0.05535530298948288, + "learning_rate": 0.000140951437958504, + "loss": 0.3323, + "step": 18058 + }, + { + "epoch": 1.4629779650032404, + "grad_norm": 0.047633446753025055, + "learning_rate": 0.00014094693730590935, + "loss": 0.3258, + "step": 18059 + }, + { + "epoch": 1.4630589760207389, + "grad_norm": 0.05180913954973221, + "learning_rate": 0.00014094243665331474, + "loss": 0.2995, + "step": 18060 + }, + { + "epoch": 1.4631399870382373, + "grad_norm": 0.04927406460046768, + "learning_rate": 0.00014093793600072012, + "loss": 0.3096, + "step": 18061 + }, + { + "epoch": 1.4632209980557356, + "grad_norm": 0.04759169742465019, + "learning_rate": 0.00014093343534812548, + "loss": 0.2886, + "step": 18062 + }, + { + "epoch": 1.4633020090732338, + "grad_norm": 0.04811166226863861, + "learning_rate": 0.00014092893469553084, + "loss": 0.2862, + "step": 18063 + }, + { + "epoch": 1.4633830200907323, + "grad_norm": 0.05206162855029106, + "learning_rate": 0.00014092443404293623, + "loss": 0.2675, + "step": 18064 + }, + { + "epoch": 1.4634640311082308, + "grad_norm": 0.051354411989450455, + "learning_rate": 0.0001409199333903416, + "loss": 0.2809, + "step": 18065 + }, + { + "epoch": 1.463545042125729, + "grad_norm": 0.05393500253558159, + "learning_rate": 0.00014091543273774698, + "loss": 0.299, + "step": 18066 + }, + { + "epoch": 1.4636260531432275, + "grad_norm": 0.048284344375133514, + "learning_rate": 0.00014091093208515237, + "loss": 0.3017, + "step": 18067 + }, + { + "epoch": 1.463707064160726, + "grad_norm": 0.05360013619065285, + "learning_rate": 0.00014090643143255773, + "loss": 0.2872, + "step": 18068 + }, + { + "epoch": 1.4637880751782242, + "grad_norm": 0.04138386994600296, + "learning_rate": 0.00014090193077996309, + "loss": 0.2826, + "step": 18069 + }, + { + "epoch": 1.4638690861957226, + "grad_norm": 0.053617946803569794, + "learning_rate": 0.00014089743012736847, + "loss": 0.3046, + "step": 18070 + }, + { + "epoch": 1.4639500972132211, + "grad_norm": 0.04091386869549751, + "learning_rate": 0.00014089292947477383, + "loss": 0.2847, + "step": 18071 + }, + { + "epoch": 1.4640311082307194, + "grad_norm": 0.05604439973831177, + "learning_rate": 0.00014088842882217922, + "loss": 0.3217, + "step": 18072 + }, + { + "epoch": 1.4641121192482178, + "grad_norm": 0.049055736511945724, + "learning_rate": 0.0001408839281695846, + "loss": 0.2878, + "step": 18073 + }, + { + "epoch": 1.464193130265716, + "grad_norm": 0.05496743321418762, + "learning_rate": 0.00014087942751698997, + "loss": 0.2818, + "step": 18074 + }, + { + "epoch": 1.4642741412832145, + "grad_norm": 0.05350017547607422, + "learning_rate": 0.00014087492686439533, + "loss": 0.3004, + "step": 18075 + }, + { + "epoch": 1.4643551523007128, + "grad_norm": 0.060675207525491714, + "learning_rate": 0.00014087042621180071, + "loss": 0.2802, + "step": 18076 + }, + { + "epoch": 1.4644361633182112, + "grad_norm": 0.06121957674622536, + "learning_rate": 0.0001408659255592061, + "loss": 0.3384, + "step": 18077 + }, + { + "epoch": 1.4645171743357097, + "grad_norm": 0.05118240416049957, + "learning_rate": 0.00014086142490661146, + "loss": 0.2998, + "step": 18078 + }, + { + "epoch": 1.464598185353208, + "grad_norm": 0.05059054493904114, + "learning_rate": 0.00014085692425401685, + "loss": 
0.3121, + "step": 18079 + }, + { + "epoch": 1.4646791963707064, + "grad_norm": 0.05860761180520058, + "learning_rate": 0.0001408524236014222, + "loss": 0.3112, + "step": 18080 + }, + { + "epoch": 1.464760207388205, + "grad_norm": 0.0478266216814518, + "learning_rate": 0.0001408479229488276, + "loss": 0.2879, + "step": 18081 + }, + { + "epoch": 1.4648412184057031, + "grad_norm": 0.04629899561405182, + "learning_rate": 0.00014084342229623296, + "loss": 0.2616, + "step": 18082 + }, + { + "epoch": 1.4649222294232016, + "grad_norm": 0.05281487852334976, + "learning_rate": 0.00014083892164363834, + "loss": 0.3508, + "step": 18083 + }, + { + "epoch": 1.4650032404407, + "grad_norm": 0.04183907061815262, + "learning_rate": 0.0001408344209910437, + "loss": 0.2721, + "step": 18084 + }, + { + "epoch": 1.4650842514581983, + "grad_norm": 0.052894480526447296, + "learning_rate": 0.0001408299203384491, + "loss": 0.2953, + "step": 18085 + }, + { + "epoch": 1.4651652624756966, + "grad_norm": 0.04844099283218384, + "learning_rate": 0.00014082541968585445, + "loss": 0.3042, + "step": 18086 + }, + { + "epoch": 1.465246273493195, + "grad_norm": 0.04348544776439667, + "learning_rate": 0.00014082091903325984, + "loss": 0.2652, + "step": 18087 + }, + { + "epoch": 1.4653272845106935, + "grad_norm": 0.04488521069288254, + "learning_rate": 0.0001408164183806652, + "loss": 0.2853, + "step": 18088 + }, + { + "epoch": 1.4654082955281917, + "grad_norm": 0.05188070237636566, + "learning_rate": 0.00014081191772807058, + "loss": 0.2873, + "step": 18089 + }, + { + "epoch": 1.4654893065456902, + "grad_norm": 0.05340287834405899, + "learning_rate": 0.00014080741707547594, + "loss": 0.2951, + "step": 18090 + }, + { + "epoch": 1.4655703175631887, + "grad_norm": 0.05681544169783592, + "learning_rate": 0.00014080291642288133, + "loss": 0.2965, + "step": 18091 + }, + { + "epoch": 1.465651328580687, + "grad_norm": 0.043873131275177, + "learning_rate": 0.0001407984157702867, + "loss": 0.2541, + "step": 18092 + }, + { + "epoch": 1.4657323395981854, + "grad_norm": 0.058677103370428085, + "learning_rate": 0.00014079391511769208, + "loss": 0.3375, + "step": 18093 + }, + { + "epoch": 1.4658133506156839, + "grad_norm": 0.05628986656665802, + "learning_rate": 0.00014078941446509744, + "loss": 0.3147, + "step": 18094 + }, + { + "epoch": 1.465894361633182, + "grad_norm": 0.049282487481832504, + "learning_rate": 0.00014078491381250283, + "loss": 0.2791, + "step": 18095 + }, + { + "epoch": 1.4659753726506806, + "grad_norm": 0.053728487342596054, + "learning_rate": 0.00014078041315990819, + "loss": 0.34, + "step": 18096 + }, + { + "epoch": 1.4660563836681788, + "grad_norm": 0.04620833694934845, + "learning_rate": 0.00014077591250731357, + "loss": 0.272, + "step": 18097 + }, + { + "epoch": 1.4661373946856773, + "grad_norm": 0.048192430287599564, + "learning_rate": 0.00014077141185471893, + "loss": 0.2744, + "step": 18098 + }, + { + "epoch": 1.4662184057031755, + "grad_norm": 0.0455770380795002, + "learning_rate": 0.00014076691120212432, + "loss": 0.2665, + "step": 18099 + }, + { + "epoch": 1.466299416720674, + "grad_norm": 0.04394909366965294, + "learning_rate": 0.0001407624105495297, + "loss": 0.282, + "step": 18100 + }, + { + "epoch": 1.4663804277381725, + "grad_norm": 0.054294608533382416, + "learning_rate": 0.00014075790989693507, + "loss": 0.2904, + "step": 18101 + }, + { + "epoch": 1.4664614387556707, + "grad_norm": 0.04804273322224617, + "learning_rate": 0.00014075340924434043, + "loss": 0.3221, + "step": 18102 + }, + { + "epoch": 
1.4665424497731692, + "grad_norm": 0.04664277285337448, + "learning_rate": 0.00014074890859174581, + "loss": 0.2732, + "step": 18103 + }, + { + "epoch": 1.4666234607906676, + "grad_norm": 0.04305239021778107, + "learning_rate": 0.00014074440793915117, + "loss": 0.2846, + "step": 18104 + }, + { + "epoch": 1.4667044718081659, + "grad_norm": 0.0597095862030983, + "learning_rate": 0.00014073990728655656, + "loss": 0.3724, + "step": 18105 + }, + { + "epoch": 1.4667854828256643, + "grad_norm": 0.050561290234327316, + "learning_rate": 0.00014073540663396195, + "loss": 0.3046, + "step": 18106 + }, + { + "epoch": 1.4668664938431626, + "grad_norm": 0.039581701159477234, + "learning_rate": 0.0001407309059813673, + "loss": 0.2853, + "step": 18107 + }, + { + "epoch": 1.466947504860661, + "grad_norm": 0.05390976369380951, + "learning_rate": 0.00014072640532877267, + "loss": 0.2613, + "step": 18108 + }, + { + "epoch": 1.4670285158781593, + "grad_norm": 0.06380044668912888, + "learning_rate": 0.00014072190467617806, + "loss": 0.2974, + "step": 18109 + }, + { + "epoch": 1.4671095268956578, + "grad_norm": 0.04904315248131752, + "learning_rate": 0.00014071740402358342, + "loss": 0.2834, + "step": 18110 + }, + { + "epoch": 1.4671905379131562, + "grad_norm": 0.048541679978370667, + "learning_rate": 0.0001407129033709888, + "loss": 0.2899, + "step": 18111 + }, + { + "epoch": 1.4672715489306545, + "grad_norm": 0.04984492063522339, + "learning_rate": 0.0001407084027183942, + "loss": 0.2583, + "step": 18112 + }, + { + "epoch": 1.467352559948153, + "grad_norm": 0.0442902147769928, + "learning_rate": 0.00014070390206579955, + "loss": 0.2719, + "step": 18113 + }, + { + "epoch": 1.4674335709656514, + "grad_norm": 0.046335652470588684, + "learning_rate": 0.0001406994014132049, + "loss": 0.3099, + "step": 18114 + }, + { + "epoch": 1.4675145819831497, + "grad_norm": 0.051748134195804596, + "learning_rate": 0.0001406949007606103, + "loss": 0.3051, + "step": 18115 + }, + { + "epoch": 1.4675955930006481, + "grad_norm": 0.04839160665869713, + "learning_rate": 0.00014069040010801566, + "loss": 0.2853, + "step": 18116 + }, + { + "epoch": 1.4676766040181466, + "grad_norm": 0.053315989673137665, + "learning_rate": 0.00014068589945542105, + "loss": 0.3041, + "step": 18117 + }, + { + "epoch": 1.4677576150356448, + "grad_norm": 0.048422813415527344, + "learning_rate": 0.00014068139880282643, + "loss": 0.3255, + "step": 18118 + }, + { + "epoch": 1.4678386260531433, + "grad_norm": 0.045284856110811234, + "learning_rate": 0.0001406768981502318, + "loss": 0.3213, + "step": 18119 + }, + { + "epoch": 1.4679196370706415, + "grad_norm": 0.04589143767952919, + "learning_rate": 0.00014067239749763715, + "loss": 0.2953, + "step": 18120 + }, + { + "epoch": 1.46800064808814, + "grad_norm": 0.04163743555545807, + "learning_rate": 0.00014066789684504254, + "loss": 0.2669, + "step": 18121 + }, + { + "epoch": 1.4680816591056383, + "grad_norm": 0.050901882350444794, + "learning_rate": 0.0001406633961924479, + "loss": 0.2972, + "step": 18122 + }, + { + "epoch": 1.4681626701231367, + "grad_norm": 0.042865533381700516, + "learning_rate": 0.0001406588955398533, + "loss": 0.2819, + "step": 18123 + }, + { + "epoch": 1.4682436811406352, + "grad_norm": 0.04150066897273064, + "learning_rate": 0.00014065439488725867, + "loss": 0.2596, + "step": 18124 + }, + { + "epoch": 1.4683246921581334, + "grad_norm": 0.053796712309122086, + "learning_rate": 0.00014064989423466403, + "loss": 0.3035, + "step": 18125 + }, + { + "epoch": 1.468405703175632, + "grad_norm": 
0.045569829642772675, + "learning_rate": 0.0001406453935820694, + "loss": 0.2902, + "step": 18126 + }, + { + "epoch": 1.4684867141931304, + "grad_norm": 0.05256570503115654, + "learning_rate": 0.00014064089292947478, + "loss": 0.307, + "step": 18127 + }, + { + "epoch": 1.4685677252106286, + "grad_norm": 0.054923005402088165, + "learning_rate": 0.00014063639227688014, + "loss": 0.3389, + "step": 18128 + }, + { + "epoch": 1.468648736228127, + "grad_norm": 0.05528505519032478, + "learning_rate": 0.00014063189162428553, + "loss": 0.2991, + "step": 18129 + }, + { + "epoch": 1.4687297472456253, + "grad_norm": 0.059665605425834656, + "learning_rate": 0.00014062739097169092, + "loss": 0.3061, + "step": 18130 + }, + { + "epoch": 1.4688107582631238, + "grad_norm": 0.058723267167806625, + "learning_rate": 0.00014062289031909628, + "loss": 0.3146, + "step": 18131 + }, + { + "epoch": 1.468891769280622, + "grad_norm": 0.055252302438020706, + "learning_rate": 0.00014061838966650164, + "loss": 0.3149, + "step": 18132 + }, + { + "epoch": 1.4689727802981205, + "grad_norm": 0.053718701004981995, + "learning_rate": 0.00014061388901390702, + "loss": 0.3216, + "step": 18133 + }, + { + "epoch": 1.469053791315619, + "grad_norm": 0.05037958547472954, + "learning_rate": 0.00014060938836131238, + "loss": 0.2632, + "step": 18134 + }, + { + "epoch": 1.4691348023331172, + "grad_norm": 0.043746188282966614, + "learning_rate": 0.00014060488770871777, + "loss": 0.2798, + "step": 18135 + }, + { + "epoch": 1.4692158133506157, + "grad_norm": 0.06655730307102203, + "learning_rate": 0.00014060038705612316, + "loss": 0.2882, + "step": 18136 + }, + { + "epoch": 1.4692968243681142, + "grad_norm": 0.04868955910205841, + "learning_rate": 0.00014059588640352852, + "loss": 0.2769, + "step": 18137 + }, + { + "epoch": 1.4693778353856124, + "grad_norm": 0.04965856298804283, + "learning_rate": 0.00014059138575093388, + "loss": 0.287, + "step": 18138 + }, + { + "epoch": 1.4694588464031109, + "grad_norm": 0.053828127682209015, + "learning_rate": 0.00014058688509833926, + "loss": 0.3166, + "step": 18139 + }, + { + "epoch": 1.4695398574206093, + "grad_norm": 0.045842841267585754, + "learning_rate": 0.00014058238444574462, + "loss": 0.2803, + "step": 18140 + }, + { + "epoch": 1.4696208684381076, + "grad_norm": 0.04934623837471008, + "learning_rate": 0.00014057788379315, + "loss": 0.2944, + "step": 18141 + }, + { + "epoch": 1.469701879455606, + "grad_norm": 0.051355887204408646, + "learning_rate": 0.0001405733831405554, + "loss": 0.2672, + "step": 18142 + }, + { + "epoch": 1.4697828904731043, + "grad_norm": 0.045832522213459015, + "learning_rate": 0.00014056888248796076, + "loss": 0.2666, + "step": 18143 + }, + { + "epoch": 1.4698639014906028, + "grad_norm": 0.04589347168803215, + "learning_rate": 0.00014056438183536612, + "loss": 0.2691, + "step": 18144 + }, + { + "epoch": 1.469944912508101, + "grad_norm": 0.05721011757850647, + "learning_rate": 0.0001405598811827715, + "loss": 0.3372, + "step": 18145 + }, + { + "epoch": 1.4700259235255995, + "grad_norm": 0.04498670995235443, + "learning_rate": 0.00014055538053017687, + "loss": 0.2897, + "step": 18146 + }, + { + "epoch": 1.470106934543098, + "grad_norm": 0.04440313205122948, + "learning_rate": 0.00014055087987758225, + "loss": 0.2951, + "step": 18147 + }, + { + "epoch": 1.4701879455605962, + "grad_norm": 0.046017423272132874, + "learning_rate": 0.00014054637922498764, + "loss": 0.2621, + "step": 18148 + }, + { + "epoch": 1.4702689565780946, + "grad_norm": 0.05513910576701164, + 
"learning_rate": 0.000140541878572393, + "loss": 0.3399, + "step": 18149 + }, + { + "epoch": 1.470349967595593, + "grad_norm": 0.054075244814157486, + "learning_rate": 0.0001405373779197984, + "loss": 0.3114, + "step": 18150 + }, + { + "epoch": 1.4704309786130914, + "grad_norm": 0.04777602478861809, + "learning_rate": 0.00014053287726720375, + "loss": 0.2559, + "step": 18151 + }, + { + "epoch": 1.4705119896305898, + "grad_norm": 0.03741517290472984, + "learning_rate": 0.0001405283766146091, + "loss": 0.2615, + "step": 18152 + }, + { + "epoch": 1.470593000648088, + "grad_norm": 0.045710425823926926, + "learning_rate": 0.0001405238759620145, + "loss": 0.2705, + "step": 18153 + }, + { + "epoch": 1.4706740116655865, + "grad_norm": 0.05262228474020958, + "learning_rate": 0.00014051937530941988, + "loss": 0.2863, + "step": 18154 + }, + { + "epoch": 1.4707550226830848, + "grad_norm": 0.049241792410612106, + "learning_rate": 0.00014051487465682524, + "loss": 0.3101, + "step": 18155 + }, + { + "epoch": 1.4708360337005832, + "grad_norm": 0.047971051186323166, + "learning_rate": 0.00014051037400423063, + "loss": 0.3168, + "step": 18156 + }, + { + "epoch": 1.4709170447180817, + "grad_norm": 0.04645678028464317, + "learning_rate": 0.000140505873351636, + "loss": 0.3053, + "step": 18157 + }, + { + "epoch": 1.47099805573558, + "grad_norm": 0.044148728251457214, + "learning_rate": 0.00014050137269904138, + "loss": 0.2624, + "step": 18158 + }, + { + "epoch": 1.4710790667530784, + "grad_norm": 0.05452611297369003, + "learning_rate": 0.00014049687204644674, + "loss": 0.3388, + "step": 18159 + }, + { + "epoch": 1.471160077770577, + "grad_norm": 0.05382085219025612, + "learning_rate": 0.00014049237139385212, + "loss": 0.309, + "step": 18160 + }, + { + "epoch": 1.4712410887880751, + "grad_norm": 0.05108357220888138, + "learning_rate": 0.00014048787074125748, + "loss": 0.275, + "step": 18161 + }, + { + "epoch": 1.4713220998055736, + "grad_norm": 0.0608837716281414, + "learning_rate": 0.00014048337008866287, + "loss": 0.3417, + "step": 18162 + }, + { + "epoch": 1.471403110823072, + "grad_norm": 0.036789074540138245, + "learning_rate": 0.00014047886943606823, + "loss": 0.2618, + "step": 18163 + }, + { + "epoch": 1.4714841218405703, + "grad_norm": 0.038801342248916626, + "learning_rate": 0.00014047436878347362, + "loss": 0.2433, + "step": 18164 + }, + { + "epoch": 1.4715651328580686, + "grad_norm": 0.04561394080519676, + "learning_rate": 0.00014046986813087898, + "loss": 0.2788, + "step": 18165 + }, + { + "epoch": 1.471646143875567, + "grad_norm": 0.04409138858318329, + "learning_rate": 0.00014046536747828437, + "loss": 0.2491, + "step": 18166 + }, + { + "epoch": 1.4717271548930655, + "grad_norm": 0.04505815729498863, + "learning_rate": 0.00014046086682568973, + "loss": 0.2643, + "step": 18167 + }, + { + "epoch": 1.4718081659105637, + "grad_norm": 0.05399011820554733, + "learning_rate": 0.0001404563661730951, + "loss": 0.2834, + "step": 18168 + }, + { + "epoch": 1.4718891769280622, + "grad_norm": 0.049549635499715805, + "learning_rate": 0.00014045186552050047, + "loss": 0.2641, + "step": 18169 + }, + { + "epoch": 1.4719701879455607, + "grad_norm": 0.04148612171411514, + "learning_rate": 0.00014044736486790586, + "loss": 0.3006, + "step": 18170 + }, + { + "epoch": 1.472051198963059, + "grad_norm": 0.042439449578523636, + "learning_rate": 0.00014044286421531122, + "loss": 0.2713, + "step": 18171 + }, + { + "epoch": 1.4721322099805574, + "grad_norm": 0.046045321971178055, + "learning_rate": 0.0001404383635627166, + 
"loss": 0.2845, + "step": 18172 + }, + { + "epoch": 1.4722132209980558, + "grad_norm": 0.04349822551012039, + "learning_rate": 0.00014043386291012197, + "loss": 0.2893, + "step": 18173 + }, + { + "epoch": 1.472294232015554, + "grad_norm": 0.04683893546462059, + "learning_rate": 0.00014042936225752735, + "loss": 0.2782, + "step": 18174 + }, + { + "epoch": 1.4723752430330526, + "grad_norm": 0.05124137923121452, + "learning_rate": 0.00014042486160493271, + "loss": 0.3123, + "step": 18175 + }, + { + "epoch": 1.4724562540505508, + "grad_norm": 0.050686314702034, + "learning_rate": 0.0001404203609523381, + "loss": 0.3128, + "step": 18176 + }, + { + "epoch": 1.4725372650680493, + "grad_norm": 0.04599520564079285, + "learning_rate": 0.00014041586029974346, + "loss": 0.275, + "step": 18177 + }, + { + "epoch": 1.4726182760855475, + "grad_norm": 0.056231122463941574, + "learning_rate": 0.00014041135964714885, + "loss": 0.3083, + "step": 18178 + }, + { + "epoch": 1.472699287103046, + "grad_norm": 0.0451853983104229, + "learning_rate": 0.0001404068589945542, + "loss": 0.2657, + "step": 18179 + }, + { + "epoch": 1.4727802981205445, + "grad_norm": 0.04859994724392891, + "learning_rate": 0.0001404023583419596, + "loss": 0.2684, + "step": 18180 + }, + { + "epoch": 1.4728613091380427, + "grad_norm": 0.05537387728691101, + "learning_rate": 0.00014039785768936498, + "loss": 0.309, + "step": 18181 + }, + { + "epoch": 1.4729423201555412, + "grad_norm": 0.04671725258231163, + "learning_rate": 0.00014039335703677034, + "loss": 0.3001, + "step": 18182 + }, + { + "epoch": 1.4730233311730396, + "grad_norm": 0.05779655650258064, + "learning_rate": 0.0001403888563841757, + "loss": 0.3482, + "step": 18183 + }, + { + "epoch": 1.4731043421905379, + "grad_norm": 0.04974488168954849, + "learning_rate": 0.0001403843557315811, + "loss": 0.2858, + "step": 18184 + }, + { + "epoch": 1.4731853532080363, + "grad_norm": 0.047579389065504074, + "learning_rate": 0.00014037985507898645, + "loss": 0.2808, + "step": 18185 + }, + { + "epoch": 1.4732663642255348, + "grad_norm": 0.05018430948257446, + "learning_rate": 0.00014037535442639184, + "loss": 0.3242, + "step": 18186 + }, + { + "epoch": 1.473347375243033, + "grad_norm": 0.04664905369281769, + "learning_rate": 0.00014037085377379722, + "loss": 0.2626, + "step": 18187 + }, + { + "epoch": 1.4734283862605313, + "grad_norm": 0.043412186205387115, + "learning_rate": 0.00014036635312120258, + "loss": 0.2443, + "step": 18188 + }, + { + "epoch": 1.4735093972780298, + "grad_norm": 0.04253380000591278, + "learning_rate": 0.00014036185246860794, + "loss": 0.2591, + "step": 18189 + }, + { + "epoch": 1.4735904082955282, + "grad_norm": 0.05158968269824982, + "learning_rate": 0.00014035735181601333, + "loss": 0.2936, + "step": 18190 + }, + { + "epoch": 1.4736714193130265, + "grad_norm": 0.05277436599135399, + "learning_rate": 0.0001403528511634187, + "loss": 0.2995, + "step": 18191 + }, + { + "epoch": 1.473752430330525, + "grad_norm": 0.047883354127407074, + "learning_rate": 0.00014034835051082408, + "loss": 0.2825, + "step": 18192 + }, + { + "epoch": 1.4738334413480234, + "grad_norm": 0.04544740915298462, + "learning_rate": 0.00014034384985822947, + "loss": 0.2947, + "step": 18193 + }, + { + "epoch": 1.4739144523655217, + "grad_norm": 0.05491040274500847, + "learning_rate": 0.00014033934920563483, + "loss": 0.2906, + "step": 18194 + }, + { + "epoch": 1.4739954633830201, + "grad_norm": 0.057347312569618225, + "learning_rate": 0.00014033484855304019, + "loss": 0.2865, + "step": 18195 + }, + { + 
"epoch": 1.4740764744005186, + "grad_norm": 0.048794638365507126, + "learning_rate": 0.00014033034790044557, + "loss": 0.2695, + "step": 18196 + }, + { + "epoch": 1.4741574854180168, + "grad_norm": 0.04354074224829674, + "learning_rate": 0.00014032584724785093, + "loss": 0.2649, + "step": 18197 + }, + { + "epoch": 1.4742384964355153, + "grad_norm": 0.05033900961279869, + "learning_rate": 0.00014032134659525632, + "loss": 0.2949, + "step": 18198 + }, + { + "epoch": 1.4743195074530135, + "grad_norm": 0.046822357922792435, + "learning_rate": 0.0001403168459426617, + "loss": 0.2547, + "step": 18199 + }, + { + "epoch": 1.474400518470512, + "grad_norm": 0.05028039216995239, + "learning_rate": 0.00014031234529006707, + "loss": 0.3164, + "step": 18200 + }, + { + "epoch": 1.4744815294880103, + "grad_norm": 0.058645930141210556, + "learning_rate": 0.00014030784463747243, + "loss": 0.2958, + "step": 18201 + }, + { + "epoch": 1.4745625405055087, + "grad_norm": 0.05373714864253998, + "learning_rate": 0.00014030334398487782, + "loss": 0.2801, + "step": 18202 + }, + { + "epoch": 1.4746435515230072, + "grad_norm": 0.04682587832212448, + "learning_rate": 0.00014029884333228318, + "loss": 0.2834, + "step": 18203 + }, + { + "epoch": 1.4747245625405054, + "grad_norm": 0.057389553636312485, + "learning_rate": 0.00014029434267968856, + "loss": 0.3034, + "step": 18204 + }, + { + "epoch": 1.474805573558004, + "grad_norm": 0.05309831351041794, + "learning_rate": 0.00014028984202709395, + "loss": 0.3266, + "step": 18205 + }, + { + "epoch": 1.4748865845755024, + "grad_norm": 0.05439450964331627, + "learning_rate": 0.0001402853413744993, + "loss": 0.3163, + "step": 18206 + }, + { + "epoch": 1.4749675955930006, + "grad_norm": 0.04491687938570976, + "learning_rate": 0.00014028084072190467, + "loss": 0.2546, + "step": 18207 + }, + { + "epoch": 1.475048606610499, + "grad_norm": 0.04674319177865982, + "learning_rate": 0.00014027634006931006, + "loss": 0.2771, + "step": 18208 + }, + { + "epoch": 1.4751296176279973, + "grad_norm": 0.044882722198963165, + "learning_rate": 0.00014027183941671542, + "loss": 0.3102, + "step": 18209 + }, + { + "epoch": 1.4752106286454958, + "grad_norm": 0.049309100955724716, + "learning_rate": 0.0001402673387641208, + "loss": 0.2976, + "step": 18210 + }, + { + "epoch": 1.475291639662994, + "grad_norm": 0.04568256065249443, + "learning_rate": 0.0001402628381115262, + "loss": 0.2624, + "step": 18211 + }, + { + "epoch": 1.4753726506804925, + "grad_norm": 0.048085831105709076, + "learning_rate": 0.00014025833745893155, + "loss": 0.2628, + "step": 18212 + }, + { + "epoch": 1.475453661697991, + "grad_norm": 0.045278891921043396, + "learning_rate": 0.0001402538368063369, + "loss": 0.2605, + "step": 18213 + }, + { + "epoch": 1.4755346727154892, + "grad_norm": 0.04354943707585335, + "learning_rate": 0.0001402493361537423, + "loss": 0.2852, + "step": 18214 + }, + { + "epoch": 1.4756156837329877, + "grad_norm": 0.051201093941926956, + "learning_rate": 0.00014024483550114766, + "loss": 0.2679, + "step": 18215 + }, + { + "epoch": 1.4756966947504861, + "grad_norm": 0.0424734428524971, + "learning_rate": 0.00014024033484855305, + "loss": 0.2383, + "step": 18216 + }, + { + "epoch": 1.4757777057679844, + "grad_norm": 0.052323777228593826, + "learning_rate": 0.00014023583419595843, + "loss": 0.3315, + "step": 18217 + }, + { + "epoch": 1.4758587167854829, + "grad_norm": 0.04114263877272606, + "learning_rate": 0.0001402313335433638, + "loss": 0.2762, + "step": 18218 + }, + { + "epoch": 1.4759397278029813, + 
"grad_norm": 0.046002596616744995, + "learning_rate": 0.00014022683289076918, + "loss": 0.2701, + "step": 18219 + }, + { + "epoch": 1.4760207388204796, + "grad_norm": 0.049378395080566406, + "learning_rate": 0.00014022233223817454, + "loss": 0.2435, + "step": 18220 + }, + { + "epoch": 1.476101749837978, + "grad_norm": 0.04540861397981644, + "learning_rate": 0.0001402178315855799, + "loss": 0.3062, + "step": 18221 + }, + { + "epoch": 1.4761827608554763, + "grad_norm": 0.047455836087465286, + "learning_rate": 0.0001402133309329853, + "loss": 0.3133, + "step": 18222 + }, + { + "epoch": 1.4762637718729748, + "grad_norm": 0.04688198119401932, + "learning_rate": 0.00014020883028039067, + "loss": 0.2991, + "step": 18223 + }, + { + "epoch": 1.476344782890473, + "grad_norm": 0.04360269755125046, + "learning_rate": 0.00014020432962779603, + "loss": 0.2659, + "step": 18224 + }, + { + "epoch": 1.4764257939079715, + "grad_norm": 0.04572906717658043, + "learning_rate": 0.00014019982897520142, + "loss": 0.2849, + "step": 18225 + }, + { + "epoch": 1.47650680492547, + "grad_norm": 0.04466778039932251, + "learning_rate": 0.00014019532832260678, + "loss": 0.2693, + "step": 18226 + }, + { + "epoch": 1.4765878159429682, + "grad_norm": 0.04478795453906059, + "learning_rate": 0.00014019082767001214, + "loss": 0.2815, + "step": 18227 + }, + { + "epoch": 1.4766688269604666, + "grad_norm": 0.04503241553902626, + "learning_rate": 0.00014018632701741753, + "loss": 0.2585, + "step": 18228 + }, + { + "epoch": 1.476749837977965, + "grad_norm": 0.04465152323246002, + "learning_rate": 0.00014018182636482292, + "loss": 0.2921, + "step": 18229 + }, + { + "epoch": 1.4768308489954634, + "grad_norm": 0.04942317679524422, + "learning_rate": 0.00014017732571222828, + "loss": 0.326, + "step": 18230 + }, + { + "epoch": 1.4769118600129618, + "grad_norm": 0.05225396901369095, + "learning_rate": 0.00014017282505963366, + "loss": 0.2962, + "step": 18231 + }, + { + "epoch": 1.47699287103046, + "grad_norm": 0.042505279183387756, + "learning_rate": 0.00014016832440703902, + "loss": 0.2689, + "step": 18232 + }, + { + "epoch": 1.4770738820479585, + "grad_norm": 0.055289316922426224, + "learning_rate": 0.0001401638237544444, + "loss": 0.3329, + "step": 18233 + }, + { + "epoch": 1.4771548930654568, + "grad_norm": 0.04561949148774147, + "learning_rate": 0.00014015932310184977, + "loss": 0.3152, + "step": 18234 + }, + { + "epoch": 1.4772359040829552, + "grad_norm": 0.041787758469581604, + "learning_rate": 0.00014015482244925516, + "loss": 0.2709, + "step": 18235 + }, + { + "epoch": 1.4773169151004537, + "grad_norm": 0.055366311222314835, + "learning_rate": 0.00014015032179666052, + "loss": 0.2999, + "step": 18236 + }, + { + "epoch": 1.477397926117952, + "grad_norm": 0.047048453241586685, + "learning_rate": 0.0001401458211440659, + "loss": 0.2852, + "step": 18237 + }, + { + "epoch": 1.4774789371354504, + "grad_norm": 0.048917848616838455, + "learning_rate": 0.00014014132049147126, + "loss": 0.2683, + "step": 18238 + }, + { + "epoch": 1.4775599481529489, + "grad_norm": 0.04917580261826515, + "learning_rate": 0.00014013681983887665, + "loss": 0.286, + "step": 18239 + }, + { + "epoch": 1.4776409591704471, + "grad_norm": 0.04912577569484711, + "learning_rate": 0.000140132319186282, + "loss": 0.261, + "step": 18240 + }, + { + "epoch": 1.4777219701879456, + "grad_norm": 0.04626433923840523, + "learning_rate": 0.0001401278185336874, + "loss": 0.2988, + "step": 18241 + }, + { + "epoch": 1.477802981205444, + "grad_norm": 0.05389011278748512, + 
"learning_rate": 0.00014012331788109276, + "loss": 0.3133, + "step": 18242 + }, + { + "epoch": 1.4778839922229423, + "grad_norm": 0.051110610365867615, + "learning_rate": 0.00014011881722849815, + "loss": 0.3139, + "step": 18243 + }, + { + "epoch": 1.4779650032404408, + "grad_norm": 0.04332983121275902, + "learning_rate": 0.0001401143165759035, + "loss": 0.2587, + "step": 18244 + }, + { + "epoch": 1.478046014257939, + "grad_norm": 0.04385901615023613, + "learning_rate": 0.0001401098159233089, + "loss": 0.2861, + "step": 18245 + }, + { + "epoch": 1.4781270252754375, + "grad_norm": 0.04720534384250641, + "learning_rate": 0.00014010531527071425, + "loss": 0.3258, + "step": 18246 + }, + { + "epoch": 1.4782080362929357, + "grad_norm": 0.04445433244109154, + "learning_rate": 0.00014010081461811964, + "loss": 0.2608, + "step": 18247 + }, + { + "epoch": 1.4782890473104342, + "grad_norm": 0.04828941076993942, + "learning_rate": 0.000140096313965525, + "loss": 0.231, + "step": 18248 + }, + { + "epoch": 1.4783700583279327, + "grad_norm": 0.04584532603621483, + "learning_rate": 0.0001400918133129304, + "loss": 0.3124, + "step": 18249 + }, + { + "epoch": 1.478451069345431, + "grad_norm": 0.04655173793435097, + "learning_rate": 0.00014008731266033575, + "loss": 0.2961, + "step": 18250 + }, + { + "epoch": 1.4785320803629294, + "grad_norm": 0.06158711016178131, + "learning_rate": 0.00014008281200774114, + "loss": 0.3231, + "step": 18251 + }, + { + "epoch": 1.4786130913804278, + "grad_norm": 0.05093095451593399, + "learning_rate": 0.0001400783113551465, + "loss": 0.2938, + "step": 18252 + }, + { + "epoch": 1.478694102397926, + "grad_norm": 0.06089789792895317, + "learning_rate": 0.00014007381070255188, + "loss": 0.3123, + "step": 18253 + }, + { + "epoch": 1.4787751134154246, + "grad_norm": 0.04535433277487755, + "learning_rate": 0.00014006931004995724, + "loss": 0.2581, + "step": 18254 + }, + { + "epoch": 1.4788561244329228, + "grad_norm": 0.04125452786684036, + "learning_rate": 0.00014006480939736263, + "loss": 0.2553, + "step": 18255 + }, + { + "epoch": 1.4789371354504213, + "grad_norm": 0.056558143347501755, + "learning_rate": 0.000140060308744768, + "loss": 0.278, + "step": 18256 + }, + { + "epoch": 1.4790181464679195, + "grad_norm": 0.04998468607664108, + "learning_rate": 0.00014005580809217338, + "loss": 0.2844, + "step": 18257 + }, + { + "epoch": 1.479099157485418, + "grad_norm": 0.05126415938138962, + "learning_rate": 0.00014005130743957874, + "loss": 0.3271, + "step": 18258 + }, + { + "epoch": 1.4791801685029164, + "grad_norm": 0.04823947697877884, + "learning_rate": 0.00014004680678698412, + "loss": 0.2878, + "step": 18259 + }, + { + "epoch": 1.4792611795204147, + "grad_norm": 0.04105127230286598, + "learning_rate": 0.00014004230613438948, + "loss": 0.2413, + "step": 18260 + }, + { + "epoch": 1.4793421905379132, + "grad_norm": 0.0530543178319931, + "learning_rate": 0.00014003780548179487, + "loss": 0.287, + "step": 18261 + }, + { + "epoch": 1.4794232015554116, + "grad_norm": 0.04239355027675629, + "learning_rate": 0.00014003330482920026, + "loss": 0.3181, + "step": 18262 + }, + { + "epoch": 1.4795042125729099, + "grad_norm": 0.043683599680662155, + "learning_rate": 0.00014002880417660562, + "loss": 0.2634, + "step": 18263 + }, + { + "epoch": 1.4795852235904083, + "grad_norm": 0.05439092591404915, + "learning_rate": 0.00014002430352401098, + "loss": 0.3294, + "step": 18264 + }, + { + "epoch": 1.4796662346079068, + "grad_norm": 0.04399878531694412, + "learning_rate": 0.00014001980287141637, + 
"loss": 0.3174, + "step": 18265 + }, + { + "epoch": 1.479747245625405, + "grad_norm": 0.04555346071720123, + "learning_rate": 0.00014001530221882173, + "loss": 0.3127, + "step": 18266 + }, + { + "epoch": 1.4798282566429035, + "grad_norm": 0.041780564934015274, + "learning_rate": 0.0001400108015662271, + "loss": 0.2853, + "step": 18267 + }, + { + "epoch": 1.4799092676604018, + "grad_norm": 0.046299781650304794, + "learning_rate": 0.0001400063009136325, + "loss": 0.3038, + "step": 18268 + }, + { + "epoch": 1.4799902786779002, + "grad_norm": 0.04602280631661415, + "learning_rate": 0.00014000180026103786, + "loss": 0.2824, + "step": 18269 + }, + { + "epoch": 1.4800712896953985, + "grad_norm": 0.04503730311989784, + "learning_rate": 0.00013999729960844322, + "loss": 0.27, + "step": 18270 + }, + { + "epoch": 1.480152300712897, + "grad_norm": 0.04678992182016373, + "learning_rate": 0.0001399927989558486, + "loss": 0.3059, + "step": 18271 + }, + { + "epoch": 1.4802333117303954, + "grad_norm": 0.04083758592605591, + "learning_rate": 0.00013998829830325397, + "loss": 0.3109, + "step": 18272 + }, + { + "epoch": 1.4803143227478937, + "grad_norm": 0.04555083438754082, + "learning_rate": 0.00013998379765065935, + "loss": 0.2654, + "step": 18273 + }, + { + "epoch": 1.4803953337653921, + "grad_norm": 0.04763795807957649, + "learning_rate": 0.00013997929699806474, + "loss": 0.2976, + "step": 18274 + }, + { + "epoch": 1.4804763447828906, + "grad_norm": 0.06001761183142662, + "learning_rate": 0.0001399747963454701, + "loss": 0.3181, + "step": 18275 + }, + { + "epoch": 1.4805573558003888, + "grad_norm": 0.04997747391462326, + "learning_rate": 0.00013997029569287546, + "loss": 0.3341, + "step": 18276 + }, + { + "epoch": 1.4806383668178873, + "grad_norm": 0.05390239134430885, + "learning_rate": 0.00013996579504028085, + "loss": 0.3128, + "step": 18277 + }, + { + "epoch": 1.4807193778353855, + "grad_norm": 0.04573750123381615, + "learning_rate": 0.0001399612943876862, + "loss": 0.2834, + "step": 18278 + }, + { + "epoch": 1.480800388852884, + "grad_norm": 0.05009123310446739, + "learning_rate": 0.0001399567937350916, + "loss": 0.2786, + "step": 18279 + }, + { + "epoch": 1.4808813998703823, + "grad_norm": 0.049583666026592255, + "learning_rate": 0.00013995229308249698, + "loss": 0.2841, + "step": 18280 + }, + { + "epoch": 1.4809624108878807, + "grad_norm": 0.04602464661002159, + "learning_rate": 0.00013994779242990234, + "loss": 0.2817, + "step": 18281 + }, + { + "epoch": 1.4810434219053792, + "grad_norm": 0.04475036635994911, + "learning_rate": 0.0001399432917773077, + "loss": 0.272, + "step": 18282 + }, + { + "epoch": 1.4811244329228774, + "grad_norm": 0.04411635920405388, + "learning_rate": 0.0001399387911247131, + "loss": 0.2847, + "step": 18283 + }, + { + "epoch": 1.481205443940376, + "grad_norm": 0.057060644030570984, + "learning_rate": 0.00013993429047211845, + "loss": 0.3075, + "step": 18284 + }, + { + "epoch": 1.4812864549578744, + "grad_norm": 0.04523240029811859, + "learning_rate": 0.00013992978981952384, + "loss": 0.2952, + "step": 18285 + }, + { + "epoch": 1.4813674659753726, + "grad_norm": 0.059247277677059174, + "learning_rate": 0.00013992528916692922, + "loss": 0.3341, + "step": 18286 + }, + { + "epoch": 1.481448476992871, + "grad_norm": 0.043466899544000626, + "learning_rate": 0.00013992078851433458, + "loss": 0.2429, + "step": 18287 + }, + { + "epoch": 1.4815294880103695, + "grad_norm": 0.05750332400202751, + "learning_rate": 0.00013991628786173997, + "loss": 0.2758, + "step": 18288 + }, + { + 
"epoch": 1.4816104990278678, + "grad_norm": 0.05220554396510124, + "learning_rate": 0.00013991178720914533, + "loss": 0.2578, + "step": 18289 + }, + { + "epoch": 1.481691510045366, + "grad_norm": 0.05209901183843613, + "learning_rate": 0.0001399072865565507, + "loss": 0.2796, + "step": 18290 + }, + { + "epoch": 1.4817725210628645, + "grad_norm": 0.043190717697143555, + "learning_rate": 0.00013990278590395608, + "loss": 0.2821, + "step": 18291 + }, + { + "epoch": 1.481853532080363, + "grad_norm": 0.04831504821777344, + "learning_rate": 0.00013989828525136147, + "loss": 0.3245, + "step": 18292 + }, + { + "epoch": 1.4819345430978612, + "grad_norm": 0.04756138473749161, + "learning_rate": 0.00013989378459876683, + "loss": 0.3114, + "step": 18293 + }, + { + "epoch": 1.4820155541153597, + "grad_norm": 0.04918740317225456, + "learning_rate": 0.00013988928394617221, + "loss": 0.3163, + "step": 18294 + }, + { + "epoch": 1.4820965651328581, + "grad_norm": 0.04362611472606659, + "learning_rate": 0.00013988478329357757, + "loss": 0.2815, + "step": 18295 + }, + { + "epoch": 1.4821775761503564, + "grad_norm": 0.043086010962724686, + "learning_rate": 0.00013988028264098293, + "loss": 0.2774, + "step": 18296 + }, + { + "epoch": 1.4822585871678549, + "grad_norm": 0.051327820867300034, + "learning_rate": 0.00013987578198838832, + "loss": 0.2935, + "step": 18297 + }, + { + "epoch": 1.4823395981853533, + "grad_norm": 0.0406213216483593, + "learning_rate": 0.0001398712813357937, + "loss": 0.2591, + "step": 18298 + }, + { + "epoch": 1.4824206092028516, + "grad_norm": 0.050979651510715485, + "learning_rate": 0.00013986678068319907, + "loss": 0.2859, + "step": 18299 + }, + { + "epoch": 1.48250162022035, + "grad_norm": 0.04462830349802971, + "learning_rate": 0.00013986228003060446, + "loss": 0.2924, + "step": 18300 + }, + { + "epoch": 1.4825826312378483, + "grad_norm": 0.04354723170399666, + "learning_rate": 0.00013985777937800982, + "loss": 0.2744, + "step": 18301 + }, + { + "epoch": 1.4826636422553467, + "grad_norm": 0.05323687568306923, + "learning_rate": 0.00013985327872541518, + "loss": 0.2964, + "step": 18302 + }, + { + "epoch": 1.482744653272845, + "grad_norm": 0.046059656888246536, + "learning_rate": 0.00013984877807282056, + "loss": 0.3088, + "step": 18303 + }, + { + "epoch": 1.4828256642903435, + "grad_norm": 0.05356958881020546, + "learning_rate": 0.00013984427742022595, + "loss": 0.3491, + "step": 18304 + }, + { + "epoch": 1.482906675307842, + "grad_norm": 0.042450789362192154, + "learning_rate": 0.0001398397767676313, + "loss": 0.2752, + "step": 18305 + }, + { + "epoch": 1.4829876863253402, + "grad_norm": 0.05261091887950897, + "learning_rate": 0.0001398352761150367, + "loss": 0.3322, + "step": 18306 + }, + { + "epoch": 1.4830686973428386, + "grad_norm": 0.04929700866341591, + "learning_rate": 0.00013983077546244206, + "loss": 0.2929, + "step": 18307 + }, + { + "epoch": 1.483149708360337, + "grad_norm": 0.044822052121162415, + "learning_rate": 0.00013982627480984742, + "loss": 0.2821, + "step": 18308 + }, + { + "epoch": 1.4832307193778353, + "grad_norm": 0.04936167225241661, + "learning_rate": 0.0001398217741572528, + "loss": 0.2719, + "step": 18309 + }, + { + "epoch": 1.4833117303953338, + "grad_norm": 0.051214780658483505, + "learning_rate": 0.0001398172735046582, + "loss": 0.274, + "step": 18310 + }, + { + "epoch": 1.4833927414128323, + "grad_norm": 0.048769284039735794, + "learning_rate": 0.00013981277285206355, + "loss": 0.3083, + "step": 18311 + }, + { + "epoch": 1.4834737524303305, + 
"grad_norm": 0.050249192863702774, + "learning_rate": 0.00013980827219946894, + "loss": 0.324, + "step": 18312 + }, + { + "epoch": 1.4835547634478288, + "grad_norm": 0.042009156197309494, + "learning_rate": 0.0001398037715468743, + "loss": 0.2717, + "step": 18313 + }, + { + "epoch": 1.4836357744653272, + "grad_norm": 0.05800290405750275, + "learning_rate": 0.00013979927089427969, + "loss": 0.3219, + "step": 18314 + }, + { + "epoch": 1.4837167854828257, + "grad_norm": 0.04696693271398544, + "learning_rate": 0.00013979477024168505, + "loss": 0.2772, + "step": 18315 + }, + { + "epoch": 1.483797796500324, + "grad_norm": 0.04265100136399269, + "learning_rate": 0.00013979026958909043, + "loss": 0.3088, + "step": 18316 + }, + { + "epoch": 1.4838788075178224, + "grad_norm": 0.05752689763903618, + "learning_rate": 0.0001397857689364958, + "loss": 0.3209, + "step": 18317 + }, + { + "epoch": 1.4839598185353209, + "grad_norm": 0.04664149880409241, + "learning_rate": 0.00013978126828390118, + "loss": 0.3288, + "step": 18318 + }, + { + "epoch": 1.4840408295528191, + "grad_norm": 0.06104827672243118, + "learning_rate": 0.00013977676763130654, + "loss": 0.3064, + "step": 18319 + }, + { + "epoch": 1.4841218405703176, + "grad_norm": 0.055936019867658615, + "learning_rate": 0.00013977226697871193, + "loss": 0.2811, + "step": 18320 + }, + { + "epoch": 1.484202851587816, + "grad_norm": 0.0556662380695343, + "learning_rate": 0.0001397677663261173, + "loss": 0.2917, + "step": 18321 + }, + { + "epoch": 1.4842838626053143, + "grad_norm": 0.04936107248067856, + "learning_rate": 0.00013976326567352267, + "loss": 0.2785, + "step": 18322 + }, + { + "epoch": 1.4843648736228128, + "grad_norm": 0.04691338539123535, + "learning_rate": 0.00013975876502092803, + "loss": 0.2867, + "step": 18323 + }, + { + "epoch": 1.484445884640311, + "grad_norm": 0.04699557274580002, + "learning_rate": 0.00013975426436833342, + "loss": 0.2698, + "step": 18324 + }, + { + "epoch": 1.4845268956578095, + "grad_norm": 0.046980343759059906, + "learning_rate": 0.00013974976371573878, + "loss": 0.3092, + "step": 18325 + }, + { + "epoch": 1.4846079066753077, + "grad_norm": 0.04569510743021965, + "learning_rate": 0.00013974526306314417, + "loss": 0.292, + "step": 18326 + }, + { + "epoch": 1.4846889176928062, + "grad_norm": 0.05107861012220383, + "learning_rate": 0.00013974076241054953, + "loss": 0.3182, + "step": 18327 + }, + { + "epoch": 1.4847699287103047, + "grad_norm": 0.040237490087747574, + "learning_rate": 0.00013973626175795492, + "loss": 0.2539, + "step": 18328 + }, + { + "epoch": 1.484850939727803, + "grad_norm": 0.052254654467105865, + "learning_rate": 0.00013973176110536028, + "loss": 0.3111, + "step": 18329 + }, + { + "epoch": 1.4849319507453014, + "grad_norm": 0.03984718397259712, + "learning_rate": 0.00013972726045276566, + "loss": 0.2724, + "step": 18330 + }, + { + "epoch": 1.4850129617627998, + "grad_norm": 0.05064450949430466, + "learning_rate": 0.00013972275980017102, + "loss": 0.2959, + "step": 18331 + }, + { + "epoch": 1.485093972780298, + "grad_norm": 0.04890420287847519, + "learning_rate": 0.0001397182591475764, + "loss": 0.2624, + "step": 18332 + }, + { + "epoch": 1.4851749837977966, + "grad_norm": 0.05717543512582779, + "learning_rate": 0.00013971375849498177, + "loss": 0.3479, + "step": 18333 + }, + { + "epoch": 1.4852559948152948, + "grad_norm": 0.05508338287472725, + "learning_rate": 0.00013970925784238716, + "loss": 0.3087, + "step": 18334 + }, + { + "epoch": 1.4853370058327933, + "grad_norm": 0.04428192973136902, + 
"learning_rate": 0.00013970475718979252, + "loss": 0.2528, + "step": 18335 + }, + { + "epoch": 1.4854180168502915, + "grad_norm": 0.046977389603853226, + "learning_rate": 0.0001397002565371979, + "loss": 0.2712, + "step": 18336 + }, + { + "epoch": 1.48549902786779, + "grad_norm": 0.051317162811756134, + "learning_rate": 0.00013969575588460327, + "loss": 0.2886, + "step": 18337 + }, + { + "epoch": 1.4855800388852884, + "grad_norm": 0.04508852958679199, + "learning_rate": 0.00013969125523200865, + "loss": 0.2637, + "step": 18338 + }, + { + "epoch": 1.4856610499027867, + "grad_norm": 0.04501248523592949, + "learning_rate": 0.000139686754579414, + "loss": 0.2792, + "step": 18339 + }, + { + "epoch": 1.4857420609202852, + "grad_norm": 0.05640905722975731, + "learning_rate": 0.0001396822539268194, + "loss": 0.3121, + "step": 18340 + }, + { + "epoch": 1.4858230719377836, + "grad_norm": 0.048217132687568665, + "learning_rate": 0.00013967775327422476, + "loss": 0.2848, + "step": 18341 + }, + { + "epoch": 1.4859040829552819, + "grad_norm": 0.04802275076508522, + "learning_rate": 0.00013967325262163015, + "loss": 0.2975, + "step": 18342 + }, + { + "epoch": 1.4859850939727803, + "grad_norm": 0.04685540497303009, + "learning_rate": 0.00013966875196903553, + "loss": 0.2761, + "step": 18343 + }, + { + "epoch": 1.4860661049902788, + "grad_norm": 0.04356532543897629, + "learning_rate": 0.0001396642513164409, + "loss": 0.2649, + "step": 18344 + }, + { + "epoch": 1.486147116007777, + "grad_norm": 0.04948470741510391, + "learning_rate": 0.00013965975066384625, + "loss": 0.2986, + "step": 18345 + }, + { + "epoch": 1.4862281270252755, + "grad_norm": 0.04743214324116707, + "learning_rate": 0.00013965525001125164, + "loss": 0.2842, + "step": 18346 + }, + { + "epoch": 1.4863091380427738, + "grad_norm": 0.05075887218117714, + "learning_rate": 0.000139650749358657, + "loss": 0.3015, + "step": 18347 + }, + { + "epoch": 1.4863901490602722, + "grad_norm": 0.052088137716054916, + "learning_rate": 0.0001396462487060624, + "loss": 0.2715, + "step": 18348 + }, + { + "epoch": 1.4864711600777705, + "grad_norm": 0.04909933730959892, + "learning_rate": 0.00013964174805346778, + "loss": 0.2645, + "step": 18349 + }, + { + "epoch": 1.486552171095269, + "grad_norm": 0.04945594444870949, + "learning_rate": 0.00013963724740087314, + "loss": 0.2919, + "step": 18350 + }, + { + "epoch": 1.4866331821127674, + "grad_norm": 0.05588537082076073, + "learning_rate": 0.0001396327467482785, + "loss": 0.3082, + "step": 18351 + }, + { + "epoch": 1.4867141931302656, + "grad_norm": 0.058845289051532745, + "learning_rate": 0.00013962824609568388, + "loss": 0.3624, + "step": 18352 + }, + { + "epoch": 1.4867952041477641, + "grad_norm": 0.047321610152721405, + "learning_rate": 0.00013962374544308924, + "loss": 0.3048, + "step": 18353 + }, + { + "epoch": 1.4868762151652626, + "grad_norm": 0.046751320362091064, + "learning_rate": 0.00013961924479049463, + "loss": 0.2997, + "step": 18354 + }, + { + "epoch": 1.4869572261827608, + "grad_norm": 0.05394889786839485, + "learning_rate": 0.00013961474413790002, + "loss": 0.3257, + "step": 18355 + }, + { + "epoch": 1.4870382372002593, + "grad_norm": 0.06627120077610016, + "learning_rate": 0.00013961024348530538, + "loss": 0.2903, + "step": 18356 + }, + { + "epoch": 1.4871192482177575, + "grad_norm": 0.05737738311290741, + "learning_rate": 0.00013960574283271076, + "loss": 0.3475, + "step": 18357 + }, + { + "epoch": 1.487200259235256, + "grad_norm": 0.05117543041706085, + "learning_rate": 0.00013960124218011612, + 
"loss": 0.2446, + "step": 18358 + }, + { + "epoch": 1.4872812702527543, + "grad_norm": 0.04747667536139488, + "learning_rate": 0.00013959674152752148, + "loss": 0.2513, + "step": 18359 + }, + { + "epoch": 1.4873622812702527, + "grad_norm": 0.046115368604660034, + "learning_rate": 0.00013959224087492687, + "loss": 0.3149, + "step": 18360 + }, + { + "epoch": 1.4874432922877512, + "grad_norm": 0.05121514946222305, + "learning_rate": 0.00013958774022233226, + "loss": 0.2996, + "step": 18361 + }, + { + "epoch": 1.4875243033052494, + "grad_norm": 0.05462023615837097, + "learning_rate": 0.00013958323956973762, + "loss": 0.2928, + "step": 18362 + }, + { + "epoch": 1.487605314322748, + "grad_norm": 0.052688416093587875, + "learning_rate": 0.000139578738917143, + "loss": 0.3073, + "step": 18363 + }, + { + "epoch": 1.4876863253402464, + "grad_norm": 0.04555663838982582, + "learning_rate": 0.00013957423826454837, + "loss": 0.2668, + "step": 18364 + }, + { + "epoch": 1.4877673363577446, + "grad_norm": 0.051788438111543655, + "learning_rate": 0.00013956973761195373, + "loss": 0.2916, + "step": 18365 + }, + { + "epoch": 1.487848347375243, + "grad_norm": 0.04320453852415085, + "learning_rate": 0.0001395652369593591, + "loss": 0.2672, + "step": 18366 + }, + { + "epoch": 1.4879293583927415, + "grad_norm": 0.055264852941036224, + "learning_rate": 0.0001395607363067645, + "loss": 0.3008, + "step": 18367 + }, + { + "epoch": 1.4880103694102398, + "grad_norm": 0.05236586555838585, + "learning_rate": 0.00013955623565416986, + "loss": 0.282, + "step": 18368 + }, + { + "epoch": 1.4880913804277383, + "grad_norm": 0.04332033917307854, + "learning_rate": 0.00013955173500157525, + "loss": 0.2821, + "step": 18369 + }, + { + "epoch": 1.4881723914452365, + "grad_norm": 0.054549943655729294, + "learning_rate": 0.0001395472343489806, + "loss": 0.2878, + "step": 18370 + }, + { + "epoch": 1.488253402462735, + "grad_norm": 0.04600951448082924, + "learning_rate": 0.00013954273369638597, + "loss": 0.2934, + "step": 18371 + }, + { + "epoch": 1.4883344134802332, + "grad_norm": 0.042095400393009186, + "learning_rate": 0.00013953823304379135, + "loss": 0.2413, + "step": 18372 + }, + { + "epoch": 1.4884154244977317, + "grad_norm": 0.056848566979169846, + "learning_rate": 0.00013953373239119674, + "loss": 0.3536, + "step": 18373 + }, + { + "epoch": 1.4884964355152301, + "grad_norm": 0.046874742954969406, + "learning_rate": 0.0001395292317386021, + "loss": 0.2433, + "step": 18374 + }, + { + "epoch": 1.4885774465327284, + "grad_norm": 0.053148914128541946, + "learning_rate": 0.0001395247310860075, + "loss": 0.2906, + "step": 18375 + }, + { + "epoch": 1.4886584575502269, + "grad_norm": 0.04982958361506462, + "learning_rate": 0.00013952023043341285, + "loss": 0.3081, + "step": 18376 + }, + { + "epoch": 1.4887394685677253, + "grad_norm": 0.050669070333242416, + "learning_rate": 0.0001395157297808182, + "loss": 0.3039, + "step": 18377 + }, + { + "epoch": 1.4888204795852236, + "grad_norm": 0.05367177352309227, + "learning_rate": 0.0001395112291282236, + "loss": 0.3278, + "step": 18378 + }, + { + "epoch": 1.488901490602722, + "grad_norm": 0.04868924245238304, + "learning_rate": 0.00013950672847562898, + "loss": 0.3139, + "step": 18379 + }, + { + "epoch": 1.4889825016202203, + "grad_norm": 0.04303275793790817, + "learning_rate": 0.00013950222782303434, + "loss": 0.2567, + "step": 18380 + }, + { + "epoch": 1.4890635126377187, + "grad_norm": 0.05001796782016754, + "learning_rate": 0.00013949772717043973, + "loss": 0.3035, + "step": 18381 + }, + 
{ + "epoch": 1.489144523655217, + "grad_norm": 0.05062073841691017, + "learning_rate": 0.0001394932265178451, + "loss": 0.2901, + "step": 18382 + }, + { + "epoch": 1.4892255346727155, + "grad_norm": 0.04815378040075302, + "learning_rate": 0.00013948872586525045, + "loss": 0.2813, + "step": 18383 + }, + { + "epoch": 1.489306545690214, + "grad_norm": 0.04104358330368996, + "learning_rate": 0.00013948422521265584, + "loss": 0.2862, + "step": 18384 + }, + { + "epoch": 1.4893875567077122, + "grad_norm": 0.05661662295460701, + "learning_rate": 0.00013947972456006123, + "loss": 0.3342, + "step": 18385 + }, + { + "epoch": 1.4894685677252106, + "grad_norm": 0.04493676871061325, + "learning_rate": 0.00013947522390746659, + "loss": 0.2763, + "step": 18386 + }, + { + "epoch": 1.489549578742709, + "grad_norm": 0.05062812939286232, + "learning_rate": 0.00013947072325487197, + "loss": 0.2884, + "step": 18387 + }, + { + "epoch": 1.4896305897602073, + "grad_norm": 0.043501146137714386, + "learning_rate": 0.00013946622260227733, + "loss": 0.2742, + "step": 18388 + }, + { + "epoch": 1.4897116007777058, + "grad_norm": 0.04774140566587448, + "learning_rate": 0.0001394617219496827, + "loss": 0.2644, + "step": 18389 + }, + { + "epoch": 1.4897926117952043, + "grad_norm": 0.05543723329901695, + "learning_rate": 0.00013945722129708808, + "loss": 0.2662, + "step": 18390 + }, + { + "epoch": 1.4898736228127025, + "grad_norm": 0.04897398129105568, + "learning_rate": 0.00013945272064449347, + "loss": 0.2799, + "step": 18391 + }, + { + "epoch": 1.4899546338302008, + "grad_norm": 0.04249687120318413, + "learning_rate": 0.00013944821999189883, + "loss": 0.2538, + "step": 18392 + }, + { + "epoch": 1.4900356448476992, + "grad_norm": 0.03824577480554581, + "learning_rate": 0.00013944371933930421, + "loss": 0.219, + "step": 18393 + }, + { + "epoch": 1.4901166558651977, + "grad_norm": 0.0580143965780735, + "learning_rate": 0.00013943921868670957, + "loss": 0.285, + "step": 18394 + }, + { + "epoch": 1.490197666882696, + "grad_norm": 0.04311606287956238, + "learning_rate": 0.00013943471803411496, + "loss": 0.2878, + "step": 18395 + }, + { + "epoch": 1.4902786779001944, + "grad_norm": 0.05549796298146248, + "learning_rate": 0.00013943021738152032, + "loss": 0.3463, + "step": 18396 + }, + { + "epoch": 1.4903596889176929, + "grad_norm": 0.05039701983332634, + "learning_rate": 0.0001394257167289257, + "loss": 0.2625, + "step": 18397 + }, + { + "epoch": 1.4904406999351911, + "grad_norm": 0.04013196378946304, + "learning_rate": 0.00013942121607633107, + "loss": 0.2531, + "step": 18398 + }, + { + "epoch": 1.4905217109526896, + "grad_norm": 0.04465265944600105, + "learning_rate": 0.00013941671542373646, + "loss": 0.2879, + "step": 18399 + }, + { + "epoch": 1.490602721970188, + "grad_norm": 0.04678081348538399, + "learning_rate": 0.00013941221477114182, + "loss": 0.2783, + "step": 18400 + }, + { + "epoch": 1.4906837329876863, + "grad_norm": 0.04981692135334015, + "learning_rate": 0.0001394077141185472, + "loss": 0.3072, + "step": 18401 + }, + { + "epoch": 1.4907647440051848, + "grad_norm": 0.05292908474802971, + "learning_rate": 0.00013940321346595256, + "loss": 0.3052, + "step": 18402 + }, + { + "epoch": 1.490845755022683, + "grad_norm": 0.06737762689590454, + "learning_rate": 0.00013939871281335795, + "loss": 0.307, + "step": 18403 + }, + { + "epoch": 1.4909267660401815, + "grad_norm": 0.04568171873688698, + "learning_rate": 0.0001393942121607633, + "loss": 0.2601, + "step": 18404 + }, + { + "epoch": 1.4910077770576797, + "grad_norm": 
0.052503764629364014, + "learning_rate": 0.0001393897115081687, + "loss": 0.3105, + "step": 18405 + }, + { + "epoch": 1.4910887880751782, + "grad_norm": 0.046734701842069626, + "learning_rate": 0.00013938521085557406, + "loss": 0.292, + "step": 18406 + }, + { + "epoch": 1.4911697990926767, + "grad_norm": 0.042353659868240356, + "learning_rate": 0.00013938071020297944, + "loss": 0.2559, + "step": 18407 + }, + { + "epoch": 1.491250810110175, + "grad_norm": 0.04739280045032501, + "learning_rate": 0.0001393762095503848, + "loss": 0.2775, + "step": 18408 + }, + { + "epoch": 1.4913318211276734, + "grad_norm": 0.04475112631917, + "learning_rate": 0.0001393717088977902, + "loss": 0.286, + "step": 18409 + }, + { + "epoch": 1.4914128321451718, + "grad_norm": 0.047658856958150864, + "learning_rate": 0.00013936720824519555, + "loss": 0.2744, + "step": 18410 + }, + { + "epoch": 1.49149384316267, + "grad_norm": 0.0562983863055706, + "learning_rate": 0.00013936270759260094, + "loss": 0.2906, + "step": 18411 + }, + { + "epoch": 1.4915748541801686, + "grad_norm": 0.047967586666345596, + "learning_rate": 0.0001393582069400063, + "loss": 0.3272, + "step": 18412 + }, + { + "epoch": 1.491655865197667, + "grad_norm": 0.047111138701438904, + "learning_rate": 0.00013935370628741169, + "loss": 0.2879, + "step": 18413 + }, + { + "epoch": 1.4917368762151653, + "grad_norm": 0.047560688108205795, + "learning_rate": 0.00013934920563481705, + "loss": 0.2566, + "step": 18414 + }, + { + "epoch": 1.4918178872326635, + "grad_norm": 0.04752170667052269, + "learning_rate": 0.00013934470498222243, + "loss": 0.26, + "step": 18415 + }, + { + "epoch": 1.491898898250162, + "grad_norm": 0.04229968041181564, + "learning_rate": 0.0001393402043296278, + "loss": 0.269, + "step": 18416 + }, + { + "epoch": 1.4919799092676604, + "grad_norm": 0.05088653042912483, + "learning_rate": 0.00013933570367703318, + "loss": 0.2976, + "step": 18417 + }, + { + "epoch": 1.4920609202851587, + "grad_norm": 0.04122448340058327, + "learning_rate": 0.00013933120302443857, + "loss": 0.2717, + "step": 18418 + }, + { + "epoch": 1.4921419313026572, + "grad_norm": 0.0648375079035759, + "learning_rate": 0.00013932670237184393, + "loss": 0.2909, + "step": 18419 + }, + { + "epoch": 1.4922229423201556, + "grad_norm": 0.056340668350458145, + "learning_rate": 0.00013932220171924931, + "loss": 0.3037, + "step": 18420 + }, + { + "epoch": 1.4923039533376539, + "grad_norm": 0.043164364993572235, + "learning_rate": 0.00013931770106665467, + "loss": 0.2863, + "step": 18421 + }, + { + "epoch": 1.4923849643551523, + "grad_norm": 0.049361515790224075, + "learning_rate": 0.00013931320041406003, + "loss": 0.2761, + "step": 18422 + }, + { + "epoch": 1.4924659753726508, + "grad_norm": 0.044015221297740936, + "learning_rate": 0.00013930869976146542, + "loss": 0.3073, + "step": 18423 + }, + { + "epoch": 1.492546986390149, + "grad_norm": 0.0538572296500206, + "learning_rate": 0.0001393041991088708, + "loss": 0.3024, + "step": 18424 + }, + { + "epoch": 1.4926279974076475, + "grad_norm": 0.04860401153564453, + "learning_rate": 0.00013929969845627617, + "loss": 0.2891, + "step": 18425 + }, + { + "epoch": 1.4927090084251458, + "grad_norm": 0.04993215203285217, + "learning_rate": 0.00013929519780368156, + "loss": 0.2824, + "step": 18426 + }, + { + "epoch": 1.4927900194426442, + "grad_norm": 0.047310274094343185, + "learning_rate": 0.00013929069715108692, + "loss": 0.251, + "step": 18427 + }, + { + "epoch": 1.4928710304601425, + "grad_norm": 0.043921127915382385, + "learning_rate": 
0.00013928619649849228, + "loss": 0.2705, + "step": 18428 + }, + { + "epoch": 1.492952041477641, + "grad_norm": 0.050883155316114426, + "learning_rate": 0.00013928169584589766, + "loss": 0.3099, + "step": 18429 + }, + { + "epoch": 1.4930330524951394, + "grad_norm": 0.0460064597427845, + "learning_rate": 0.00013927719519330305, + "loss": 0.2787, + "step": 18430 + }, + { + "epoch": 1.4931140635126376, + "grad_norm": 0.050332728773355484, + "learning_rate": 0.0001392726945407084, + "loss": 0.285, + "step": 18431 + }, + { + "epoch": 1.4931950745301361, + "grad_norm": 0.049601174890995026, + "learning_rate": 0.0001392681938881138, + "loss": 0.2961, + "step": 18432 + }, + { + "epoch": 1.4932760855476346, + "grad_norm": 0.04399363324046135, + "learning_rate": 0.00013926369323551916, + "loss": 0.2833, + "step": 18433 + }, + { + "epoch": 1.4933570965651328, + "grad_norm": 0.044266752898693085, + "learning_rate": 0.00013925919258292452, + "loss": 0.2742, + "step": 18434 + }, + { + "epoch": 1.4934381075826313, + "grad_norm": 0.05331169441342354, + "learning_rate": 0.0001392546919303299, + "loss": 0.2892, + "step": 18435 + }, + { + "epoch": 1.4935191186001295, + "grad_norm": 0.0475342683494091, + "learning_rate": 0.0001392501912777353, + "loss": 0.2838, + "step": 18436 + }, + { + "epoch": 1.493600129617628, + "grad_norm": 0.04593534767627716, + "learning_rate": 0.00013924569062514065, + "loss": 0.3068, + "step": 18437 + }, + { + "epoch": 1.4936811406351262, + "grad_norm": 0.04650498926639557, + "learning_rate": 0.00013924118997254604, + "loss": 0.272, + "step": 18438 + }, + { + "epoch": 1.4937621516526247, + "grad_norm": 0.04349982365965843, + "learning_rate": 0.0001392366893199514, + "loss": 0.2705, + "step": 18439 + }, + { + "epoch": 1.4938431626701232, + "grad_norm": 0.05977478250861168, + "learning_rate": 0.00013923218866735676, + "loss": 0.313, + "step": 18440 + }, + { + "epoch": 1.4939241736876214, + "grad_norm": 0.04337127506732941, + "learning_rate": 0.00013922768801476215, + "loss": 0.2555, + "step": 18441 + }, + { + "epoch": 1.49400518470512, + "grad_norm": 0.05166735500097275, + "learning_rate": 0.00013922318736216753, + "loss": 0.3171, + "step": 18442 + }, + { + "epoch": 1.4940861957226184, + "grad_norm": 0.05477170646190643, + "learning_rate": 0.0001392186867095729, + "loss": 0.2734, + "step": 18443 + }, + { + "epoch": 1.4941672067401166, + "grad_norm": 0.05633719637989998, + "learning_rate": 0.00013921418605697828, + "loss": 0.2861, + "step": 18444 + }, + { + "epoch": 1.494248217757615, + "grad_norm": 0.05167591571807861, + "learning_rate": 0.00013920968540438364, + "loss": 0.2632, + "step": 18445 + }, + { + "epoch": 1.4943292287751135, + "grad_norm": 0.05309131368994713, + "learning_rate": 0.000139205184751789, + "loss": 0.2883, + "step": 18446 + }, + { + "epoch": 1.4944102397926118, + "grad_norm": 0.04940500110387802, + "learning_rate": 0.0001392006840991944, + "loss": 0.2707, + "step": 18447 + }, + { + "epoch": 1.4944912508101102, + "grad_norm": 0.04576598480343819, + "learning_rate": 0.00013919618344659978, + "loss": 0.2856, + "step": 18448 + }, + { + "epoch": 1.4945722618276085, + "grad_norm": 0.04570867866277695, + "learning_rate": 0.00013919168279400514, + "loss": 0.2681, + "step": 18449 + }, + { + "epoch": 1.494653272845107, + "grad_norm": 0.04411808401346207, + "learning_rate": 0.00013918718214141052, + "loss": 0.2846, + "step": 18450 + }, + { + "epoch": 1.4947342838626052, + "grad_norm": 0.050253257155418396, + "learning_rate": 0.00013918268148881588, + "loss": 0.2648, + 
"step": 18451 + }, + { + "epoch": 1.4948152948801037, + "grad_norm": 0.05088942497968674, + "learning_rate": 0.00013917818083622124, + "loss": 0.2748, + "step": 18452 + }, + { + "epoch": 1.4948963058976021, + "grad_norm": 0.045565392822027206, + "learning_rate": 0.00013917368018362663, + "loss": 0.2779, + "step": 18453 + }, + { + "epoch": 1.4949773169151004, + "grad_norm": 0.04917675629258156, + "learning_rate": 0.00013916917953103202, + "loss": 0.3069, + "step": 18454 + }, + { + "epoch": 1.4950583279325989, + "grad_norm": 0.05241185799241066, + "learning_rate": 0.00013916467887843738, + "loss": 0.3057, + "step": 18455 + }, + { + "epoch": 1.4951393389500973, + "grad_norm": 0.045868489891290665, + "learning_rate": 0.00013916017822584276, + "loss": 0.2771, + "step": 18456 + }, + { + "epoch": 1.4952203499675956, + "grad_norm": 0.04369146004319191, + "learning_rate": 0.00013915567757324812, + "loss": 0.2512, + "step": 18457 + }, + { + "epoch": 1.495301360985094, + "grad_norm": 0.044587358832359314, + "learning_rate": 0.00013915117692065348, + "loss": 0.2714, + "step": 18458 + }, + { + "epoch": 1.4953823720025923, + "grad_norm": 0.050855766981840134, + "learning_rate": 0.00013914667626805887, + "loss": 0.2898, + "step": 18459 + }, + { + "epoch": 1.4954633830200907, + "grad_norm": 0.05688999593257904, + "learning_rate": 0.00013914217561546426, + "loss": 0.2924, + "step": 18460 + }, + { + "epoch": 1.495544394037589, + "grad_norm": 0.05517961457371712, + "learning_rate": 0.00013913767496286962, + "loss": 0.268, + "step": 18461 + }, + { + "epoch": 1.4956254050550875, + "grad_norm": 0.05347253754734993, + "learning_rate": 0.000139133174310275, + "loss": 0.2947, + "step": 18462 + }, + { + "epoch": 1.495706416072586, + "grad_norm": 0.048167914152145386, + "learning_rate": 0.00013912867365768037, + "loss": 0.2519, + "step": 18463 + }, + { + "epoch": 1.4957874270900842, + "grad_norm": 0.045652590692043304, + "learning_rate": 0.00013912417300508573, + "loss": 0.2295, + "step": 18464 + }, + { + "epoch": 1.4958684381075826, + "grad_norm": 0.04998354986310005, + "learning_rate": 0.0001391196723524911, + "loss": 0.2637, + "step": 18465 + }, + { + "epoch": 1.495949449125081, + "grad_norm": 0.05486319586634636, + "learning_rate": 0.0001391151716998965, + "loss": 0.334, + "step": 18466 + }, + { + "epoch": 1.4960304601425793, + "grad_norm": 0.048164885491132736, + "learning_rate": 0.00013911067104730186, + "loss": 0.3039, + "step": 18467 + }, + { + "epoch": 1.4961114711600778, + "grad_norm": 0.03973591327667236, + "learning_rate": 0.00013910617039470725, + "loss": 0.2758, + "step": 18468 + }, + { + "epoch": 1.4961924821775763, + "grad_norm": 0.05702631548047066, + "learning_rate": 0.0001391016697421126, + "loss": 0.2889, + "step": 18469 + }, + { + "epoch": 1.4962734931950745, + "grad_norm": 0.05530570447444916, + "learning_rate": 0.00013909716908951797, + "loss": 0.2989, + "step": 18470 + }, + { + "epoch": 1.496354504212573, + "grad_norm": 0.04951141029596329, + "learning_rate": 0.00013909266843692335, + "loss": 0.3221, + "step": 18471 + }, + { + "epoch": 1.4964355152300712, + "grad_norm": 0.04481812193989754, + "learning_rate": 0.00013908816778432874, + "loss": 0.2597, + "step": 18472 + }, + { + "epoch": 1.4965165262475697, + "grad_norm": 0.05551238730549812, + "learning_rate": 0.0001390836671317341, + "loss": 0.2723, + "step": 18473 + }, + { + "epoch": 1.496597537265068, + "grad_norm": 0.052483901381492615, + "learning_rate": 0.0001390791664791395, + "loss": 0.283, + "step": 18474 + }, + { + "epoch": 
1.4966785482825664, + "grad_norm": 0.0454094335436821, + "learning_rate": 0.00013907466582654485, + "loss": 0.2775, + "step": 18475 + }, + { + "epoch": 1.4967595593000649, + "grad_norm": 0.05346366763114929, + "learning_rate": 0.00013907016517395024, + "loss": 0.3174, + "step": 18476 + }, + { + "epoch": 1.4968405703175631, + "grad_norm": 0.044364653527736664, + "learning_rate": 0.0001390656645213556, + "loss": 0.2586, + "step": 18477 + }, + { + "epoch": 1.4969215813350616, + "grad_norm": 0.04217236861586571, + "learning_rate": 0.00013906116386876098, + "loss": 0.2678, + "step": 18478 + }, + { + "epoch": 1.49700259235256, + "grad_norm": 0.04457024112343788, + "learning_rate": 0.00013905666321616634, + "loss": 0.2623, + "step": 18479 + }, + { + "epoch": 1.4970836033700583, + "grad_norm": 0.0516212061047554, + "learning_rate": 0.00013905216256357173, + "loss": 0.2941, + "step": 18480 + }, + { + "epoch": 1.4971646143875568, + "grad_norm": 0.05036289617419243, + "learning_rate": 0.0001390476619109771, + "loss": 0.3116, + "step": 18481 + }, + { + "epoch": 1.497245625405055, + "grad_norm": 0.045464541763067245, + "learning_rate": 0.00013904316125838248, + "loss": 0.3038, + "step": 18482 + }, + { + "epoch": 1.4973266364225535, + "grad_norm": 0.05470053851604462, + "learning_rate": 0.00013903866060578784, + "loss": 0.2821, + "step": 18483 + }, + { + "epoch": 1.4974076474400517, + "grad_norm": 0.05759792774915695, + "learning_rate": 0.00013903415995319323, + "loss": 0.3298, + "step": 18484 + }, + { + "epoch": 1.4974886584575502, + "grad_norm": 0.049921244382858276, + "learning_rate": 0.00013902965930059859, + "loss": 0.2963, + "step": 18485 + }, + { + "epoch": 1.4975696694750487, + "grad_norm": 0.04551756754517555, + "learning_rate": 0.00013902515864800397, + "loss": 0.2899, + "step": 18486 + }, + { + "epoch": 1.497650680492547, + "grad_norm": 0.053860023617744446, + "learning_rate": 0.00013902065799540933, + "loss": 0.2752, + "step": 18487 + }, + { + "epoch": 1.4977316915100454, + "grad_norm": 0.060332879424095154, + "learning_rate": 0.00013901615734281472, + "loss": 0.3393, + "step": 18488 + }, + { + "epoch": 1.4978127025275438, + "grad_norm": 0.05206407979130745, + "learning_rate": 0.0001390116566902201, + "loss": 0.2747, + "step": 18489 + }, + { + "epoch": 1.497893713545042, + "grad_norm": 0.0485624261200428, + "learning_rate": 0.00013900715603762547, + "loss": 0.2808, + "step": 18490 + }, + { + "epoch": 1.4979747245625405, + "grad_norm": 0.059844858944416046, + "learning_rate": 0.00013900265538503083, + "loss": 0.3153, + "step": 18491 + }, + { + "epoch": 1.498055735580039, + "grad_norm": 0.04991476610302925, + "learning_rate": 0.00013899815473243621, + "loss": 0.2955, + "step": 18492 + }, + { + "epoch": 1.4981367465975373, + "grad_norm": 0.05374494194984436, + "learning_rate": 0.00013899365407984157, + "loss": 0.3298, + "step": 18493 + }, + { + "epoch": 1.4982177576150357, + "grad_norm": 0.057440925389528275, + "learning_rate": 0.00013898915342724696, + "loss": 0.3143, + "step": 18494 + }, + { + "epoch": 1.498298768632534, + "grad_norm": 0.04277382791042328, + "learning_rate": 0.00013898465277465235, + "loss": 0.2817, + "step": 18495 + }, + { + "epoch": 1.4983797796500324, + "grad_norm": 0.04948005452752113, + "learning_rate": 0.0001389801521220577, + "loss": 0.3178, + "step": 18496 + }, + { + "epoch": 1.4984607906675307, + "grad_norm": 0.06216001138091087, + "learning_rate": 0.00013897565146946307, + "loss": 0.3059, + "step": 18497 + }, + { + "epoch": 1.4985418016850292, + "grad_norm": 
0.05815086141228676, + "learning_rate": 0.00013897115081686846, + "loss": 0.2974, + "step": 18498 + }, + { + "epoch": 1.4986228127025276, + "grad_norm": 0.04554952308535576, + "learning_rate": 0.00013896665016427384, + "loss": 0.2602, + "step": 18499 + }, + { + "epoch": 1.4987038237200259, + "grad_norm": 0.046857476234436035, + "learning_rate": 0.0001389621495116792, + "loss": 0.2645, + "step": 18500 + }, + { + "epoch": 1.4987848347375243, + "grad_norm": 0.049220532178878784, + "learning_rate": 0.0001389576488590846, + "loss": 0.3211, + "step": 18501 + }, + { + "epoch": 1.4988658457550228, + "grad_norm": 0.04458710551261902, + "learning_rate": 0.00013895314820648995, + "loss": 0.3119, + "step": 18502 + }, + { + "epoch": 1.498946856772521, + "grad_norm": 0.05195513367652893, + "learning_rate": 0.0001389486475538953, + "loss": 0.2737, + "step": 18503 + }, + { + "epoch": 1.4990278677900195, + "grad_norm": 0.043182797729969025, + "learning_rate": 0.0001389441469013007, + "loss": 0.2691, + "step": 18504 + }, + { + "epoch": 1.4991088788075178, + "grad_norm": 0.049426015466451645, + "learning_rate": 0.00013893964624870608, + "loss": 0.2943, + "step": 18505 + }, + { + "epoch": 1.4991898898250162, + "grad_norm": 0.04895054176449776, + "learning_rate": 0.00013893514559611144, + "loss": 0.2965, + "step": 18506 + }, + { + "epoch": 1.4992709008425145, + "grad_norm": 0.05293383076786995, + "learning_rate": 0.00013893064494351683, + "loss": 0.3132, + "step": 18507 + }, + { + "epoch": 1.499351911860013, + "grad_norm": 0.053053028881549835, + "learning_rate": 0.0001389261442909222, + "loss": 0.2741, + "step": 18508 + }, + { + "epoch": 1.4994329228775114, + "grad_norm": 0.04623028635978699, + "learning_rate": 0.00013892164363832755, + "loss": 0.2794, + "step": 18509 + }, + { + "epoch": 1.4995139338950096, + "grad_norm": 0.05131025239825249, + "learning_rate": 0.00013891714298573294, + "loss": 0.3244, + "step": 18510 + }, + { + "epoch": 1.499594944912508, + "grad_norm": 0.05024973303079605, + "learning_rate": 0.00013891264233313833, + "loss": 0.2817, + "step": 18511 + }, + { + "epoch": 1.4996759559300066, + "grad_norm": 0.048978786915540695, + "learning_rate": 0.00013890814168054369, + "loss": 0.2873, + "step": 18512 + }, + { + "epoch": 1.4997569669475048, + "grad_norm": 0.045770205557346344, + "learning_rate": 0.00013890364102794907, + "loss": 0.2728, + "step": 18513 + }, + { + "epoch": 1.4998379779650033, + "grad_norm": 0.05459889397025108, + "learning_rate": 0.00013889914037535443, + "loss": 0.2726, + "step": 18514 + }, + { + "epoch": 1.4999189889825018, + "grad_norm": 0.058012884110212326, + "learning_rate": 0.0001388946397227598, + "loss": 0.2823, + "step": 18515 + }, + { + "epoch": 1.5, + "grad_norm": 0.06093365326523781, + "learning_rate": 0.00013889013907016518, + "loss": 0.2882, + "step": 18516 + }, + { + "epoch": 1.5000810110174982, + "grad_norm": 0.054403156042099, + "learning_rate": 0.00013888563841757057, + "loss": 0.328, + "step": 18517 + }, + { + "epoch": 1.5001620220349967, + "grad_norm": 0.05934557691216469, + "learning_rate": 0.00013888113776497593, + "loss": 0.2826, + "step": 18518 + }, + { + "epoch": 1.5002430330524952, + "grad_norm": 0.05087953060865402, + "learning_rate": 0.00013887663711238132, + "loss": 0.2786, + "step": 18519 + }, + { + "epoch": 1.5003240440699934, + "grad_norm": 0.04411671683192253, + "learning_rate": 0.00013887213645978668, + "loss": 0.2637, + "step": 18520 + }, + { + "epoch": 1.500405055087492, + "grad_norm": 0.049634527415037155, + "learning_rate": 
0.00013886763580719204, + "loss": 0.2708, + "step": 18521 + }, + { + "epoch": 1.5004860661049904, + "grad_norm": 0.0397411547601223, + "learning_rate": 0.00013886313515459742, + "loss": 0.2411, + "step": 18522 + }, + { + "epoch": 1.5005670771224886, + "grad_norm": 0.060061804950237274, + "learning_rate": 0.0001388586345020028, + "loss": 0.3434, + "step": 18523 + }, + { + "epoch": 1.500648088139987, + "grad_norm": 0.05206239968538284, + "learning_rate": 0.00013885413384940817, + "loss": 0.2641, + "step": 18524 + }, + { + "epoch": 1.5007290991574855, + "grad_norm": 0.04855002462863922, + "learning_rate": 0.00013884963319681356, + "loss": 0.3072, + "step": 18525 + }, + { + "epoch": 1.5008101101749838, + "grad_norm": 0.0422656387090683, + "learning_rate": 0.00013884513254421892, + "loss": 0.2511, + "step": 18526 + }, + { + "epoch": 1.500891121192482, + "grad_norm": 0.05039331316947937, + "learning_rate": 0.00013884063189162428, + "loss": 0.2744, + "step": 18527 + }, + { + "epoch": 1.5009721322099807, + "grad_norm": 0.04326654598116875, + "learning_rate": 0.00013883613123902966, + "loss": 0.2671, + "step": 18528 + }, + { + "epoch": 1.501053143227479, + "grad_norm": 0.04627027362585068, + "learning_rate": 0.00013883163058643505, + "loss": 0.2412, + "step": 18529 + }, + { + "epoch": 1.5011341542449772, + "grad_norm": 0.04790741577744484, + "learning_rate": 0.0001388271299338404, + "loss": 0.3072, + "step": 18530 + }, + { + "epoch": 1.5012151652624757, + "grad_norm": 0.05516636371612549, + "learning_rate": 0.0001388226292812458, + "loss": 0.3146, + "step": 18531 + }, + { + "epoch": 1.5012961762799741, + "grad_norm": 0.0491386316716671, + "learning_rate": 0.00013881812862865116, + "loss": 0.2919, + "step": 18532 + }, + { + "epoch": 1.5013771872974724, + "grad_norm": 0.05206599831581116, + "learning_rate": 0.00013881362797605652, + "loss": 0.3314, + "step": 18533 + }, + { + "epoch": 1.5014581983149708, + "grad_norm": 0.05277208983898163, + "learning_rate": 0.0001388091273234619, + "loss": 0.3017, + "step": 18534 + }, + { + "epoch": 1.5015392093324693, + "grad_norm": 0.049126457422971725, + "learning_rate": 0.0001388046266708673, + "loss": 0.2696, + "step": 18535 + }, + { + "epoch": 1.5016202203499676, + "grad_norm": 0.04647735878825188, + "learning_rate": 0.00013880012601827265, + "loss": 0.2639, + "step": 18536 + }, + { + "epoch": 1.501701231367466, + "grad_norm": 0.04798891395330429, + "learning_rate": 0.00013879562536567804, + "loss": 0.283, + "step": 18537 + }, + { + "epoch": 1.5017822423849645, + "grad_norm": 0.04328464716672897, + "learning_rate": 0.0001387911247130834, + "loss": 0.2765, + "step": 18538 + }, + { + "epoch": 1.5018632534024627, + "grad_norm": 0.05452942103147507, + "learning_rate": 0.00013878662406048876, + "loss": 0.2826, + "step": 18539 + }, + { + "epoch": 1.501944264419961, + "grad_norm": 0.045899324119091034, + "learning_rate": 0.00013878212340789415, + "loss": 0.2913, + "step": 18540 + }, + { + "epoch": 1.5020252754374595, + "grad_norm": 0.042319826781749725, + "learning_rate": 0.00013877762275529953, + "loss": 0.2545, + "step": 18541 + }, + { + "epoch": 1.502106286454958, + "grad_norm": 0.05380789935588837, + "learning_rate": 0.0001387731221027049, + "loss": 0.2885, + "step": 18542 + }, + { + "epoch": 1.5021872974724562, + "grad_norm": 0.05620346963405609, + "learning_rate": 0.00013876862145011028, + "loss": 0.3163, + "step": 18543 + }, + { + "epoch": 1.5022683084899546, + "grad_norm": 0.05543176829814911, + "learning_rate": 0.00013876412079751564, + "loss": 0.2977, + 
"step": 18544 + }, + { + "epoch": 1.502349319507453, + "grad_norm": 0.047539882361888885, + "learning_rate": 0.000138759620144921, + "loss": 0.2925, + "step": 18545 + }, + { + "epoch": 1.5024303305249513, + "grad_norm": 0.05766825005412102, + "learning_rate": 0.0001387551194923264, + "loss": 0.3395, + "step": 18546 + }, + { + "epoch": 1.5025113415424498, + "grad_norm": 0.04431447759270668, + "learning_rate": 0.00013875061883973178, + "loss": 0.2685, + "step": 18547 + }, + { + "epoch": 1.5025923525599483, + "grad_norm": 0.061709705740213394, + "learning_rate": 0.00013874611818713714, + "loss": 0.3308, + "step": 18548 + }, + { + "epoch": 1.5026733635774465, + "grad_norm": 0.043706364929676056, + "learning_rate": 0.00013874161753454252, + "loss": 0.2554, + "step": 18549 + }, + { + "epoch": 1.5027543745949448, + "grad_norm": 0.04865025728940964, + "learning_rate": 0.00013873711688194788, + "loss": 0.2923, + "step": 18550 + }, + { + "epoch": 1.5028353856124435, + "grad_norm": 0.05347156152129173, + "learning_rate": 0.00013873261622935327, + "loss": 0.3071, + "step": 18551 + }, + { + "epoch": 1.5029163966299417, + "grad_norm": 0.055005237460136414, + "learning_rate": 0.00013872811557675863, + "loss": 0.3183, + "step": 18552 + }, + { + "epoch": 1.50299740764744, + "grad_norm": 0.049281034618616104, + "learning_rate": 0.00013872361492416402, + "loss": 0.2497, + "step": 18553 + }, + { + "epoch": 1.5030784186649384, + "grad_norm": 0.05435588210821152, + "learning_rate": 0.00013871911427156938, + "loss": 0.3302, + "step": 18554 + }, + { + "epoch": 1.5031594296824369, + "grad_norm": 0.04774954169988632, + "learning_rate": 0.00013871461361897476, + "loss": 0.2671, + "step": 18555 + }, + { + "epoch": 1.5032404406999351, + "grad_norm": 0.05450746417045593, + "learning_rate": 0.00013871011296638012, + "loss": 0.3222, + "step": 18556 + }, + { + "epoch": 1.5033214517174336, + "grad_norm": 0.04396272823214531, + "learning_rate": 0.0001387056123137855, + "loss": 0.2874, + "step": 18557 + }, + { + "epoch": 1.503402462734932, + "grad_norm": 0.04642244800925255, + "learning_rate": 0.0001387011116611909, + "loss": 0.3038, + "step": 18558 + }, + { + "epoch": 1.5034834737524303, + "grad_norm": 0.04344436153769493, + "learning_rate": 0.00013869661100859626, + "loss": 0.264, + "step": 18559 + }, + { + "epoch": 1.5035644847699285, + "grad_norm": 0.04258277639746666, + "learning_rate": 0.00013869211035600162, + "loss": 0.2747, + "step": 18560 + }, + { + "epoch": 1.5036454957874272, + "grad_norm": 0.05755390226840973, + "learning_rate": 0.000138687609703407, + "loss": 0.3079, + "step": 18561 + }, + { + "epoch": 1.5037265068049255, + "grad_norm": 0.046239741146564484, + "learning_rate": 0.00013868310905081237, + "loss": 0.289, + "step": 18562 + }, + { + "epoch": 1.5038075178224237, + "grad_norm": 0.053038790822029114, + "learning_rate": 0.00013867860839821775, + "loss": 0.2833, + "step": 18563 + }, + { + "epoch": 1.5038885288399222, + "grad_norm": 0.04764937609434128, + "learning_rate": 0.00013867410774562314, + "loss": 0.2666, + "step": 18564 + }, + { + "epoch": 1.5039695398574207, + "grad_norm": 0.05545291304588318, + "learning_rate": 0.0001386696070930285, + "loss": 0.282, + "step": 18565 + }, + { + "epoch": 1.504050550874919, + "grad_norm": 0.061405330896377563, + "learning_rate": 0.00013866510644043386, + "loss": 0.3036, + "step": 18566 + }, + { + "epoch": 1.5041315618924174, + "grad_norm": 0.046580031514167786, + "learning_rate": 0.00013866060578783925, + "loss": 0.2773, + "step": 18567 + }, + { + "epoch": 
1.5042125729099158, + "grad_norm": 0.055441372096538544, + "learning_rate": 0.0001386561051352446, + "loss": 0.3012, + "step": 18568 + }, + { + "epoch": 1.504293583927414, + "grad_norm": 0.06004418060183525, + "learning_rate": 0.00013865160448265, + "loss": 0.2965, + "step": 18569 + }, + { + "epoch": 1.5043745949449125, + "grad_norm": 0.05109797790646553, + "learning_rate": 0.00013864710383005538, + "loss": 0.313, + "step": 18570 + }, + { + "epoch": 1.504455605962411, + "grad_norm": 0.0582164004445076, + "learning_rate": 0.00013864260317746074, + "loss": 0.3293, + "step": 18571 + }, + { + "epoch": 1.5045366169799093, + "grad_norm": 0.05205508694052696, + "learning_rate": 0.0001386381025248661, + "loss": 0.3073, + "step": 18572 + }, + { + "epoch": 1.5046176279974075, + "grad_norm": 0.05729497969150543, + "learning_rate": 0.0001386336018722715, + "loss": 0.2782, + "step": 18573 + }, + { + "epoch": 1.5046986390149062, + "grad_norm": 0.05498183146119118, + "learning_rate": 0.00013862910121967685, + "loss": 0.2763, + "step": 18574 + }, + { + "epoch": 1.5047796500324044, + "grad_norm": 0.05813434720039368, + "learning_rate": 0.00013862460056708224, + "loss": 0.2855, + "step": 18575 + }, + { + "epoch": 1.5048606610499027, + "grad_norm": 0.050552502274513245, + "learning_rate": 0.00013862009991448762, + "loss": 0.3153, + "step": 18576 + }, + { + "epoch": 1.5049416720674011, + "grad_norm": 0.0465339720249176, + "learning_rate": 0.00013861559926189298, + "loss": 0.2752, + "step": 18577 + }, + { + "epoch": 1.5050226830848996, + "grad_norm": 0.05752887949347496, + "learning_rate": 0.00013861109860929834, + "loss": 0.2684, + "step": 18578 + }, + { + "epoch": 1.5051036941023979, + "grad_norm": 0.05408168584108353, + "learning_rate": 0.00013860659795670373, + "loss": 0.3078, + "step": 18579 + }, + { + "epoch": 1.5051847051198963, + "grad_norm": 0.04079076275229454, + "learning_rate": 0.00013860209730410912, + "loss": 0.2744, + "step": 18580 + }, + { + "epoch": 1.5052657161373948, + "grad_norm": 0.04741482436656952, + "learning_rate": 0.00013859759665151448, + "loss": 0.2624, + "step": 18581 + }, + { + "epoch": 1.505346727154893, + "grad_norm": 0.04578164219856262, + "learning_rate": 0.00013859309599891987, + "loss": 0.2917, + "step": 18582 + }, + { + "epoch": 1.5054277381723913, + "grad_norm": 0.04694436863064766, + "learning_rate": 0.00013858859534632523, + "loss": 0.3019, + "step": 18583 + }, + { + "epoch": 1.50550874918989, + "grad_norm": 0.05345213785767555, + "learning_rate": 0.00013858409469373059, + "loss": 0.2916, + "step": 18584 + }, + { + "epoch": 1.5055897602073882, + "grad_norm": 0.0502551831305027, + "learning_rate": 0.00013857959404113597, + "loss": 0.2881, + "step": 18585 + }, + { + "epoch": 1.5056707712248865, + "grad_norm": 0.04362354055047035, + "learning_rate": 0.00013857509338854136, + "loss": 0.2879, + "step": 18586 + }, + { + "epoch": 1.505751782242385, + "grad_norm": 0.061859581619501114, + "learning_rate": 0.00013857059273594672, + "loss": 0.3341, + "step": 18587 + }, + { + "epoch": 1.5058327932598834, + "grad_norm": 0.048323486000299454, + "learning_rate": 0.0001385660920833521, + "loss": 0.3062, + "step": 18588 + }, + { + "epoch": 1.5059138042773816, + "grad_norm": 0.05235842987895012, + "learning_rate": 0.00013856159143075747, + "loss": 0.31, + "step": 18589 + }, + { + "epoch": 1.50599481529488, + "grad_norm": 0.05464842915534973, + "learning_rate": 0.00013855709077816283, + "loss": 0.3411, + "step": 18590 + }, + { + "epoch": 1.5060758263123786, + "grad_norm": 
0.05268260836601257, + "learning_rate": 0.00013855259012556821, + "loss": 0.3265, + "step": 18591 + }, + { + "epoch": 1.5061568373298768, + "grad_norm": 0.05023358017206192, + "learning_rate": 0.0001385480894729736, + "loss": 0.2897, + "step": 18592 + }, + { + "epoch": 1.5062378483473753, + "grad_norm": 0.046102315187454224, + "learning_rate": 0.00013854358882037896, + "loss": 0.2753, + "step": 18593 + }, + { + "epoch": 1.5063188593648738, + "grad_norm": 0.04473813995718956, + "learning_rate": 0.00013853908816778435, + "loss": 0.2897, + "step": 18594 + }, + { + "epoch": 1.506399870382372, + "grad_norm": 0.049142077565193176, + "learning_rate": 0.0001385345875151897, + "loss": 0.2912, + "step": 18595 + }, + { + "epoch": 1.5064808813998702, + "grad_norm": 0.05272606760263443, + "learning_rate": 0.00013853008686259507, + "loss": 0.3167, + "step": 18596 + }, + { + "epoch": 1.5065618924173687, + "grad_norm": 0.06326889991760254, + "learning_rate": 0.00013852558621000046, + "loss": 0.3176, + "step": 18597 + }, + { + "epoch": 1.5066429034348672, + "grad_norm": 0.04475260153412819, + "learning_rate": 0.00013852108555740584, + "loss": 0.2458, + "step": 18598 + }, + { + "epoch": 1.5067239144523654, + "grad_norm": 0.051189832389354706, + "learning_rate": 0.0001385165849048112, + "loss": 0.3195, + "step": 18599 + }, + { + "epoch": 1.5068049254698639, + "grad_norm": 0.05150541290640831, + "learning_rate": 0.0001385120842522166, + "loss": 0.2626, + "step": 18600 + }, + { + "epoch": 1.5068859364873624, + "grad_norm": 0.05144593492150307, + "learning_rate": 0.00013850758359962195, + "loss": 0.2497, + "step": 18601 + }, + { + "epoch": 1.5069669475048606, + "grad_norm": 0.04862655699253082, + "learning_rate": 0.0001385030829470273, + "loss": 0.2796, + "step": 18602 + }, + { + "epoch": 1.507047958522359, + "grad_norm": 0.03998184576630592, + "learning_rate": 0.0001384985822944327, + "loss": 0.2604, + "step": 18603 + }, + { + "epoch": 1.5071289695398575, + "grad_norm": 0.0496944859623909, + "learning_rate": 0.00013849408164183808, + "loss": 0.3112, + "step": 18604 + }, + { + "epoch": 1.5072099805573558, + "grad_norm": 0.04353320598602295, + "learning_rate": 0.00013848958098924344, + "loss": 0.2425, + "step": 18605 + }, + { + "epoch": 1.507290991574854, + "grad_norm": 0.046205393970012665, + "learning_rate": 0.00013848508033664883, + "loss": 0.2865, + "step": 18606 + }, + { + "epoch": 1.5073720025923527, + "grad_norm": 0.06040682643651962, + "learning_rate": 0.0001384805796840542, + "loss": 0.3052, + "step": 18607 + }, + { + "epoch": 1.507453013609851, + "grad_norm": 0.05146654695272446, + "learning_rate": 0.00013847607903145955, + "loss": 0.2685, + "step": 18608 + }, + { + "epoch": 1.5075340246273492, + "grad_norm": 0.05239380523562431, + "learning_rate": 0.00013847157837886494, + "loss": 0.2962, + "step": 18609 + }, + { + "epoch": 1.5076150356448477, + "grad_norm": 0.04868777096271515, + "learning_rate": 0.00013846707772627033, + "loss": 0.2733, + "step": 18610 + }, + { + "epoch": 1.5076960466623461, + "grad_norm": 0.06022943556308746, + "learning_rate": 0.0001384625770736757, + "loss": 0.2893, + "step": 18611 + }, + { + "epoch": 1.5077770576798444, + "grad_norm": 0.045286670327186584, + "learning_rate": 0.00013845807642108107, + "loss": 0.3286, + "step": 18612 + }, + { + "epoch": 1.5078580686973428, + "grad_norm": 0.04287097230553627, + "learning_rate": 0.00013845357576848643, + "loss": 0.2638, + "step": 18613 + }, + { + "epoch": 1.5079390797148413, + "grad_norm": 0.058371007442474365, + "learning_rate": 
0.0001384490751158918, + "loss": 0.3051, + "step": 18614 + }, + { + "epoch": 1.5080200907323396, + "grad_norm": 0.04612262547016144, + "learning_rate": 0.00013844457446329718, + "loss": 0.2739, + "step": 18615 + }, + { + "epoch": 1.508101101749838, + "grad_norm": 0.04888421297073364, + "learning_rate": 0.00013844007381070257, + "loss": 0.2859, + "step": 18616 + }, + { + "epoch": 1.5081821127673365, + "grad_norm": 0.04447241500020027, + "learning_rate": 0.00013843557315810793, + "loss": 0.3008, + "step": 18617 + }, + { + "epoch": 1.5082631237848347, + "grad_norm": 0.047536179423332214, + "learning_rate": 0.00013843107250551332, + "loss": 0.2653, + "step": 18618 + }, + { + "epoch": 1.508344134802333, + "grad_norm": 0.052498817443847656, + "learning_rate": 0.00013842657185291868, + "loss": 0.2943, + "step": 18619 + }, + { + "epoch": 1.5084251458198314, + "grad_norm": 0.053822360932826996, + "learning_rate": 0.00013842207120032404, + "loss": 0.322, + "step": 18620 + }, + { + "epoch": 1.50850615683733, + "grad_norm": 0.058178890496492386, + "learning_rate": 0.00013841757054772942, + "loss": 0.2767, + "step": 18621 + }, + { + "epoch": 1.5085871678548282, + "grad_norm": 0.06313327699899673, + "learning_rate": 0.0001384130698951348, + "loss": 0.3065, + "step": 18622 + }, + { + "epoch": 1.5086681788723266, + "grad_norm": 0.06296181678771973, + "learning_rate": 0.00013840856924254017, + "loss": 0.259, + "step": 18623 + }, + { + "epoch": 1.508749189889825, + "grad_norm": 0.04204053431749344, + "learning_rate": 0.00013840406858994556, + "loss": 0.2567, + "step": 18624 + }, + { + "epoch": 1.5088302009073233, + "grad_norm": 0.049650777131319046, + "learning_rate": 0.00013839956793735092, + "loss": 0.2601, + "step": 18625 + }, + { + "epoch": 1.5089112119248218, + "grad_norm": 0.050487373024225235, + "learning_rate": 0.00013839506728475628, + "loss": 0.2811, + "step": 18626 + }, + { + "epoch": 1.5089922229423203, + "grad_norm": 0.05244138464331627, + "learning_rate": 0.0001383905666321617, + "loss": 0.3, + "step": 18627 + }, + { + "epoch": 1.5090732339598185, + "grad_norm": 0.04472174495458603, + "learning_rate": 0.00013838606597956705, + "loss": 0.2397, + "step": 18628 + }, + { + "epoch": 1.5091542449773168, + "grad_norm": 0.057056084275245667, + "learning_rate": 0.0001383815653269724, + "loss": 0.3052, + "step": 18629 + }, + { + "epoch": 1.5092352559948155, + "grad_norm": 0.0568094402551651, + "learning_rate": 0.0001383770646743778, + "loss": 0.3133, + "step": 18630 + }, + { + "epoch": 1.5093162670123137, + "grad_norm": 0.040568944066762924, + "learning_rate": 0.00013837256402178316, + "loss": 0.2348, + "step": 18631 + }, + { + "epoch": 1.509397278029812, + "grad_norm": 0.05801571533083916, + "learning_rate": 0.00013836806336918855, + "loss": 0.3339, + "step": 18632 + }, + { + "epoch": 1.5094782890473104, + "grad_norm": 0.04366452246904373, + "learning_rate": 0.00013836356271659393, + "loss": 0.2626, + "step": 18633 + }, + { + "epoch": 1.5095593000648089, + "grad_norm": 0.05549009516835213, + "learning_rate": 0.0001383590620639993, + "loss": 0.331, + "step": 18634 + }, + { + "epoch": 1.5096403110823071, + "grad_norm": 0.0553416907787323, + "learning_rate": 0.00013835456141140465, + "loss": 0.2957, + "step": 18635 + }, + { + "epoch": 1.5097213220998056, + "grad_norm": 0.05349811166524887, + "learning_rate": 0.00013835006075881004, + "loss": 0.2911, + "step": 18636 + }, + { + "epoch": 1.509802333117304, + "grad_norm": 0.04866451397538185, + "learning_rate": 0.0001383455601062154, + "loss": 0.2892, + 
"step": 18637 + }, + { + "epoch": 1.5098833441348023, + "grad_norm": 0.046243034303188324, + "learning_rate": 0.0001383410594536208, + "loss": 0.3073, + "step": 18638 + }, + { + "epoch": 1.5099643551523008, + "grad_norm": 0.053792405873537064, + "learning_rate": 0.00013833655880102617, + "loss": 0.3153, + "step": 18639 + }, + { + "epoch": 1.5100453661697992, + "grad_norm": 0.05517613887786865, + "learning_rate": 0.00013833205814843153, + "loss": 0.3032, + "step": 18640 + }, + { + "epoch": 1.5101263771872975, + "grad_norm": 0.04403368756175041, + "learning_rate": 0.0001383275574958369, + "loss": 0.2745, + "step": 18641 + }, + { + "epoch": 1.5102073882047957, + "grad_norm": 0.04711280018091202, + "learning_rate": 0.00013832305684324228, + "loss": 0.2498, + "step": 18642 + }, + { + "epoch": 1.5102883992222942, + "grad_norm": 0.04522211104631424, + "learning_rate": 0.00013831855619064764, + "loss": 0.2616, + "step": 18643 + }, + { + "epoch": 1.5103694102397927, + "grad_norm": 0.04840913414955139, + "learning_rate": 0.00013831405553805303, + "loss": 0.2929, + "step": 18644 + }, + { + "epoch": 1.510450421257291, + "grad_norm": 0.045142967253923416, + "learning_rate": 0.00013830955488545842, + "loss": 0.2936, + "step": 18645 + }, + { + "epoch": 1.5105314322747894, + "grad_norm": 0.04537534341216087, + "learning_rate": 0.00013830505423286378, + "loss": 0.2616, + "step": 18646 + }, + { + "epoch": 1.5106124432922878, + "grad_norm": 0.04791952297091484, + "learning_rate": 0.00013830055358026914, + "loss": 0.2661, + "step": 18647 + }, + { + "epoch": 1.510693454309786, + "grad_norm": 0.051988594233989716, + "learning_rate": 0.00013829605292767452, + "loss": 0.2785, + "step": 18648 + }, + { + "epoch": 1.5107744653272845, + "grad_norm": 0.043577536940574646, + "learning_rate": 0.00013829155227507988, + "loss": 0.2803, + "step": 18649 + }, + { + "epoch": 1.510855476344783, + "grad_norm": 0.04489186778664589, + "learning_rate": 0.00013828705162248527, + "loss": 0.2837, + "step": 18650 + }, + { + "epoch": 1.5109364873622813, + "grad_norm": 0.04787694290280342, + "learning_rate": 0.00013828255096989066, + "loss": 0.3055, + "step": 18651 + }, + { + "epoch": 1.5110174983797795, + "grad_norm": 0.06286022067070007, + "learning_rate": 0.00013827805031729602, + "loss": 0.3101, + "step": 18652 + }, + { + "epoch": 1.5110985093972782, + "grad_norm": 0.05875331163406372, + "learning_rate": 0.00013827354966470138, + "loss": 0.3048, + "step": 18653 + }, + { + "epoch": 1.5111795204147764, + "grad_norm": 0.050677020102739334, + "learning_rate": 0.00013826904901210677, + "loss": 0.3073, + "step": 18654 + }, + { + "epoch": 1.5112605314322747, + "grad_norm": 0.04322303086519241, + "learning_rate": 0.00013826454835951213, + "loss": 0.2828, + "step": 18655 + }, + { + "epoch": 1.5113415424497731, + "grad_norm": 0.0534433051943779, + "learning_rate": 0.0001382600477069175, + "loss": 0.3363, + "step": 18656 + }, + { + "epoch": 1.5114225534672716, + "grad_norm": 0.046610474586486816, + "learning_rate": 0.0001382555470543229, + "loss": 0.3099, + "step": 18657 + }, + { + "epoch": 1.5115035644847699, + "grad_norm": 0.051539093255996704, + "learning_rate": 0.00013825104640172826, + "loss": 0.2926, + "step": 18658 + }, + { + "epoch": 1.5115845755022683, + "grad_norm": 0.047712355852127075, + "learning_rate": 0.00013824654574913362, + "loss": 0.2777, + "step": 18659 + }, + { + "epoch": 1.5116655865197668, + "grad_norm": 0.0468960702419281, + "learning_rate": 0.000138242045096539, + "loss": 0.2748, + "step": 18660 + }, + { + "epoch": 
1.511746597537265, + "grad_norm": 0.05287209525704384, + "learning_rate": 0.0001382375444439444, + "loss": 0.2902, + "step": 18661 + }, + { + "epoch": 1.5118276085547635, + "grad_norm": 0.04535992816090584, + "learning_rate": 0.00013823304379134975, + "loss": 0.2915, + "step": 18662 + }, + { + "epoch": 1.511908619572262, + "grad_norm": 0.05408494547009468, + "learning_rate": 0.00013822854313875514, + "loss": 0.3081, + "step": 18663 + }, + { + "epoch": 1.5119896305897602, + "grad_norm": 0.04434020444750786, + "learning_rate": 0.0001382240424861605, + "loss": 0.281, + "step": 18664 + }, + { + "epoch": 1.5120706416072585, + "grad_norm": 0.05857977271080017, + "learning_rate": 0.00013821954183356586, + "loss": 0.255, + "step": 18665 + }, + { + "epoch": 1.512151652624757, + "grad_norm": 0.04309207201004028, + "learning_rate": 0.00013821504118097125, + "loss": 0.2897, + "step": 18666 + }, + { + "epoch": 1.5122326636422554, + "grad_norm": 0.06811527162790298, + "learning_rate": 0.00013821054052837664, + "loss": 0.2611, + "step": 18667 + }, + { + "epoch": 1.5123136746597536, + "grad_norm": 0.047615617513656616, + "learning_rate": 0.000138206039875782, + "loss": 0.31, + "step": 18668 + }, + { + "epoch": 1.512394685677252, + "grad_norm": 0.05192350596189499, + "learning_rate": 0.00013820153922318738, + "loss": 0.2876, + "step": 18669 + }, + { + "epoch": 1.5124756966947506, + "grad_norm": 0.04546111449599266, + "learning_rate": 0.00013819703857059274, + "loss": 0.2567, + "step": 18670 + }, + { + "epoch": 1.5125567077122488, + "grad_norm": 0.04693352431058884, + "learning_rate": 0.0001381925379179981, + "loss": 0.2647, + "step": 18671 + }, + { + "epoch": 1.5126377187297473, + "grad_norm": 0.0454537570476532, + "learning_rate": 0.0001381880372654035, + "loss": 0.2871, + "step": 18672 + }, + { + "epoch": 1.5127187297472457, + "grad_norm": 0.05148732289671898, + "learning_rate": 0.00013818353661280888, + "loss": 0.3049, + "step": 18673 + }, + { + "epoch": 1.512799740764744, + "grad_norm": 0.059178005903959274, + "learning_rate": 0.00013817903596021424, + "loss": 0.303, + "step": 18674 + }, + { + "epoch": 1.5128807517822422, + "grad_norm": 0.05884253978729248, + "learning_rate": 0.00013817453530761962, + "loss": 0.3042, + "step": 18675 + }, + { + "epoch": 1.512961762799741, + "grad_norm": 0.05346214771270752, + "learning_rate": 0.00013817003465502498, + "loss": 0.3251, + "step": 18676 + }, + { + "epoch": 1.5130427738172392, + "grad_norm": 0.049198366701602936, + "learning_rate": 0.00013816553400243034, + "loss": 0.2834, + "step": 18677 + }, + { + "epoch": 1.5131237848347374, + "grad_norm": 0.054034922271966934, + "learning_rate": 0.00013816103334983573, + "loss": 0.2889, + "step": 18678 + }, + { + "epoch": 1.5132047958522359, + "grad_norm": 0.0477595180273056, + "learning_rate": 0.00013815653269724112, + "loss": 0.2607, + "step": 18679 + }, + { + "epoch": 1.5132858068697344, + "grad_norm": 0.05005710944533348, + "learning_rate": 0.00013815203204464648, + "loss": 0.2908, + "step": 18680 + }, + { + "epoch": 1.5133668178872326, + "grad_norm": 0.055473003536462784, + "learning_rate": 0.00013814753139205187, + "loss": 0.3023, + "step": 18681 + }, + { + "epoch": 1.513447828904731, + "grad_norm": 0.05821644887328148, + "learning_rate": 0.00013814303073945723, + "loss": 0.28, + "step": 18682 + }, + { + "epoch": 1.5135288399222295, + "grad_norm": 0.047872528433799744, + "learning_rate": 0.00013813853008686259, + "loss": 0.2639, + "step": 18683 + }, + { + "epoch": 1.5136098509397278, + "grad_norm": 
0.05764663219451904, + "learning_rate": 0.00013813402943426797, + "loss": 0.3254, + "step": 18684 + }, + { + "epoch": 1.513690861957226, + "grad_norm": 0.06259345263242722, + "learning_rate": 0.00013812952878167336, + "loss": 0.3194, + "step": 18685 + }, + { + "epoch": 1.5137718729747247, + "grad_norm": 0.054814137518405914, + "learning_rate": 0.00013812502812907872, + "loss": 0.2866, + "step": 18686 + }, + { + "epoch": 1.513852883992223, + "grad_norm": 0.04742126166820526, + "learning_rate": 0.0001381205274764841, + "loss": 0.2908, + "step": 18687 + }, + { + "epoch": 1.5139338950097212, + "grad_norm": 0.04217066615819931, + "learning_rate": 0.00013811602682388947, + "loss": 0.2882, + "step": 18688 + }, + { + "epoch": 1.5140149060272197, + "grad_norm": 0.04281475394964218, + "learning_rate": 0.00013811152617129483, + "loss": 0.3041, + "step": 18689 + }, + { + "epoch": 1.5140959170447181, + "grad_norm": 0.0419379360973835, + "learning_rate": 0.00013810702551870021, + "loss": 0.2662, + "step": 18690 + }, + { + "epoch": 1.5141769280622164, + "grad_norm": 0.050524819642305374, + "learning_rate": 0.0001381025248661056, + "loss": 0.2754, + "step": 18691 + }, + { + "epoch": 1.5142579390797148, + "grad_norm": 0.04706466943025589, + "learning_rate": 0.00013809802421351096, + "loss": 0.2768, + "step": 18692 + }, + { + "epoch": 1.5143389500972133, + "grad_norm": 0.05239817500114441, + "learning_rate": 0.00013809352356091635, + "loss": 0.2826, + "step": 18693 + }, + { + "epoch": 1.5144199611147116, + "grad_norm": 0.04976906254887581, + "learning_rate": 0.0001380890229083217, + "loss": 0.2582, + "step": 18694 + }, + { + "epoch": 1.51450097213221, + "grad_norm": 0.05384444817900658, + "learning_rate": 0.00013808452225572707, + "loss": 0.2548, + "step": 18695 + }, + { + "epoch": 1.5145819831497085, + "grad_norm": 0.043319009244441986, + "learning_rate": 0.00013808002160313248, + "loss": 0.2722, + "step": 18696 + }, + { + "epoch": 1.5146629941672067, + "grad_norm": 0.05653691291809082, + "learning_rate": 0.00013807552095053784, + "loss": 0.327, + "step": 18697 + }, + { + "epoch": 1.514744005184705, + "grad_norm": 0.05521196871995926, + "learning_rate": 0.0001380710202979432, + "loss": 0.3193, + "step": 18698 + }, + { + "epoch": 1.5148250162022034, + "grad_norm": 0.05279504880309105, + "learning_rate": 0.0001380665196453486, + "loss": 0.3488, + "step": 18699 + }, + { + "epoch": 1.514906027219702, + "grad_norm": 0.04920048266649246, + "learning_rate": 0.00013806201899275395, + "loss": 0.2944, + "step": 18700 + }, + { + "epoch": 1.5149870382372002, + "grad_norm": 0.05808188021183014, + "learning_rate": 0.0001380575183401593, + "loss": 0.2877, + "step": 18701 + }, + { + "epoch": 1.5150680492546986, + "grad_norm": 0.04960033297538757, + "learning_rate": 0.00013805301768756473, + "loss": 0.3101, + "step": 18702 + }, + { + "epoch": 1.515149060272197, + "grad_norm": 0.042428918182849884, + "learning_rate": 0.00013804851703497009, + "loss": 0.2628, + "step": 18703 + }, + { + "epoch": 1.5152300712896953, + "grad_norm": 0.04198756814002991, + "learning_rate": 0.00013804401638237545, + "loss": 0.2959, + "step": 18704 + }, + { + "epoch": 1.5153110823071938, + "grad_norm": 0.04299665987491608, + "learning_rate": 0.00013803951572978083, + "loss": 0.2875, + "step": 18705 + }, + { + "epoch": 1.5153920933246923, + "grad_norm": 0.039975300431251526, + "learning_rate": 0.0001380350150771862, + "loss": 0.2438, + "step": 18706 + }, + { + "epoch": 1.5154731043421905, + "grad_norm": 0.06994131207466125, + "learning_rate": 
0.00013803051442459155, + "loss": 0.3259, + "step": 18707 + }, + { + "epoch": 1.5155541153596888, + "grad_norm": 0.044267553836107254, + "learning_rate": 0.00013802601377199697, + "loss": 0.2604, + "step": 18708 + }, + { + "epoch": 1.5156351263771874, + "grad_norm": 0.04812091961503029, + "learning_rate": 0.00013802151311940233, + "loss": 0.2525, + "step": 18709 + }, + { + "epoch": 1.5157161373946857, + "grad_norm": 0.05721137672662735, + "learning_rate": 0.0001380170124668077, + "loss": 0.2777, + "step": 18710 + }, + { + "epoch": 1.515797148412184, + "grad_norm": 0.05760475993156433, + "learning_rate": 0.00013801251181421307, + "loss": 0.3565, + "step": 18711 + }, + { + "epoch": 1.5158781594296824, + "grad_norm": 0.04711325094103813, + "learning_rate": 0.00013800801116161843, + "loss": 0.2974, + "step": 18712 + }, + { + "epoch": 1.5159591704471809, + "grad_norm": 0.05328639969229698, + "learning_rate": 0.00013800351050902382, + "loss": 0.3143, + "step": 18713 + }, + { + "epoch": 1.5160401814646791, + "grad_norm": 0.0565636083483696, + "learning_rate": 0.0001379990098564292, + "loss": 0.2871, + "step": 18714 + }, + { + "epoch": 1.5161211924821776, + "grad_norm": 0.05294475704431534, + "learning_rate": 0.00013799450920383457, + "loss": 0.3358, + "step": 18715 + }, + { + "epoch": 1.516202203499676, + "grad_norm": 0.047887083142995834, + "learning_rate": 0.00013799000855123993, + "loss": 0.2812, + "step": 18716 + }, + { + "epoch": 1.5162832145171743, + "grad_norm": 0.05028738081455231, + "learning_rate": 0.00013798550789864532, + "loss": 0.2868, + "step": 18717 + }, + { + "epoch": 1.5163642255346728, + "grad_norm": 0.05214919522404671, + "learning_rate": 0.00013798100724605068, + "loss": 0.3086, + "step": 18718 + }, + { + "epoch": 1.5164452365521712, + "grad_norm": 0.047210779041051865, + "learning_rate": 0.00013797650659345606, + "loss": 0.2988, + "step": 18719 + }, + { + "epoch": 1.5165262475696695, + "grad_norm": 0.04988611117005348, + "learning_rate": 0.00013797200594086145, + "loss": 0.2667, + "step": 18720 + }, + { + "epoch": 1.5166072585871677, + "grad_norm": 0.052436117082834244, + "learning_rate": 0.0001379675052882668, + "loss": 0.3187, + "step": 18721 + }, + { + "epoch": 1.5166882696046662, + "grad_norm": 0.05738939344882965, + "learning_rate": 0.00013796300463567217, + "loss": 0.283, + "step": 18722 + }, + { + "epoch": 1.5167692806221647, + "grad_norm": 0.04664130136370659, + "learning_rate": 0.00013795850398307756, + "loss": 0.3041, + "step": 18723 + }, + { + "epoch": 1.516850291639663, + "grad_norm": 0.044622667133808136, + "learning_rate": 0.00013795400333048292, + "loss": 0.2655, + "step": 18724 + }, + { + "epoch": 1.5169313026571614, + "grad_norm": 0.052109766751527786, + "learning_rate": 0.0001379495026778883, + "loss": 0.2748, + "step": 18725 + }, + { + "epoch": 1.5170123136746598, + "grad_norm": 0.05363732948899269, + "learning_rate": 0.0001379450020252937, + "loss": 0.3003, + "step": 18726 + }, + { + "epoch": 1.517093324692158, + "grad_norm": 0.04929107427597046, + "learning_rate": 0.00013794050137269905, + "loss": 0.2836, + "step": 18727 + }, + { + "epoch": 1.5171743357096565, + "grad_norm": 0.04485325142741203, + "learning_rate": 0.0001379360007201044, + "loss": 0.2683, + "step": 18728 + }, + { + "epoch": 1.517255346727155, + "grad_norm": 0.05199460685253143, + "learning_rate": 0.0001379315000675098, + "loss": 0.2702, + "step": 18729 + }, + { + "epoch": 1.5173363577446533, + "grad_norm": 0.04943700134754181, + "learning_rate": 0.00013792699941491516, + "loss": 0.2759, + 
"step": 18730 + }, + { + "epoch": 1.5174173687621515, + "grad_norm": 0.05386276915669441, + "learning_rate": 0.00013792249876232055, + "loss": 0.2885, + "step": 18731 + }, + { + "epoch": 1.5174983797796502, + "grad_norm": 0.04591602087020874, + "learning_rate": 0.00013791799810972593, + "loss": 0.2807, + "step": 18732 + }, + { + "epoch": 1.5175793907971484, + "grad_norm": 0.041390374302864075, + "learning_rate": 0.0001379134974571313, + "loss": 0.2853, + "step": 18733 + }, + { + "epoch": 1.5176604018146467, + "grad_norm": 0.050120241940021515, + "learning_rate": 0.00013790899680453665, + "loss": 0.3006, + "step": 18734 + }, + { + "epoch": 1.5177414128321451, + "grad_norm": 0.04909467324614525, + "learning_rate": 0.00013790449615194204, + "loss": 0.2909, + "step": 18735 + }, + { + "epoch": 1.5178224238496436, + "grad_norm": 0.05132598802447319, + "learning_rate": 0.00013789999549934743, + "loss": 0.2679, + "step": 18736 + }, + { + "epoch": 1.5179034348671419, + "grad_norm": 0.060010362416505814, + "learning_rate": 0.0001378954948467528, + "loss": 0.3035, + "step": 18737 + }, + { + "epoch": 1.5179844458846403, + "grad_norm": 0.07251916080713272, + "learning_rate": 0.00013789099419415817, + "loss": 0.3148, + "step": 18738 + }, + { + "epoch": 1.5180654569021388, + "grad_norm": 0.042271800339221954, + "learning_rate": 0.00013788649354156353, + "loss": 0.2836, + "step": 18739 + }, + { + "epoch": 1.518146467919637, + "grad_norm": 0.055395521223545074, + "learning_rate": 0.0001378819928889689, + "loss": 0.3016, + "step": 18740 + }, + { + "epoch": 1.5182274789371355, + "grad_norm": 0.050871629267930984, + "learning_rate": 0.00013787749223637428, + "loss": 0.2927, + "step": 18741 + }, + { + "epoch": 1.518308489954634, + "grad_norm": 0.06203100457787514, + "learning_rate": 0.00013787299158377967, + "loss": 0.2865, + "step": 18742 + }, + { + "epoch": 1.5183895009721322, + "grad_norm": 0.042529165744781494, + "learning_rate": 0.00013786849093118503, + "loss": 0.2828, + "step": 18743 + }, + { + "epoch": 1.5184705119896305, + "grad_norm": 0.04327407479286194, + "learning_rate": 0.00013786399027859042, + "loss": 0.29, + "step": 18744 + }, + { + "epoch": 1.518551523007129, + "grad_norm": 0.05121006444096565, + "learning_rate": 0.00013785948962599578, + "loss": 0.2783, + "step": 18745 + }, + { + "epoch": 1.5186325340246274, + "grad_norm": 0.051384586840867996, + "learning_rate": 0.00013785498897340114, + "loss": 0.3032, + "step": 18746 + }, + { + "epoch": 1.5187135450421256, + "grad_norm": 0.0496654212474823, + "learning_rate": 0.00013785048832080652, + "loss": 0.2931, + "step": 18747 + }, + { + "epoch": 1.518794556059624, + "grad_norm": 0.05933926999568939, + "learning_rate": 0.0001378459876682119, + "loss": 0.3127, + "step": 18748 + }, + { + "epoch": 1.5188755670771226, + "grad_norm": 0.045520905405282974, + "learning_rate": 0.00013784148701561727, + "loss": 0.2372, + "step": 18749 + }, + { + "epoch": 1.5189565780946208, + "grad_norm": 0.05414096266031265, + "learning_rate": 0.00013783698636302266, + "loss": 0.2762, + "step": 18750 + }, + { + "epoch": 1.5190375891121193, + "grad_norm": 0.044945355504751205, + "learning_rate": 0.00013783248571042802, + "loss": 0.2687, + "step": 18751 + }, + { + "epoch": 1.5191186001296177, + "grad_norm": 0.04823799431324005, + "learning_rate": 0.00013782798505783338, + "loss": 0.2872, + "step": 18752 + }, + { + "epoch": 1.519199611147116, + "grad_norm": 0.05766402184963226, + "learning_rate": 0.00013782348440523877, + "loss": 0.3247, + "step": 18753 + }, + { + "epoch": 
1.5192806221646142, + "grad_norm": 0.05425436422228813, + "learning_rate": 0.00013781898375264415, + "loss": 0.3004, + "step": 18754 + }, + { + "epoch": 1.519361633182113, + "grad_norm": 0.05799352005124092, + "learning_rate": 0.0001378144831000495, + "loss": 0.2595, + "step": 18755 + }, + { + "epoch": 1.5194426441996112, + "grad_norm": 0.0714273452758789, + "learning_rate": 0.0001378099824474549, + "loss": 0.28, + "step": 18756 + }, + { + "epoch": 1.5195236552171094, + "grad_norm": 0.046737149357795715, + "learning_rate": 0.00013780548179486026, + "loss": 0.2781, + "step": 18757 + }, + { + "epoch": 1.5196046662346079, + "grad_norm": 0.06238202005624771, + "learning_rate": 0.00013780098114226562, + "loss": 0.3323, + "step": 18758 + }, + { + "epoch": 1.5196856772521063, + "grad_norm": 0.05099009349942207, + "learning_rate": 0.000137796480489671, + "loss": 0.3094, + "step": 18759 + }, + { + "epoch": 1.5197666882696046, + "grad_norm": 0.04891882464289665, + "learning_rate": 0.0001377919798370764, + "loss": 0.2804, + "step": 18760 + }, + { + "epoch": 1.519847699287103, + "grad_norm": 0.04960760846734047, + "learning_rate": 0.00013778747918448175, + "loss": 0.2865, + "step": 18761 + }, + { + "epoch": 1.5199287103046015, + "grad_norm": 0.04393989220261574, + "learning_rate": 0.00013778297853188714, + "loss": 0.3369, + "step": 18762 + }, + { + "epoch": 1.5200097213220998, + "grad_norm": 0.05547909438610077, + "learning_rate": 0.0001377784778792925, + "loss": 0.2722, + "step": 18763 + }, + { + "epoch": 1.5200907323395982, + "grad_norm": 0.04545421525835991, + "learning_rate": 0.00013777397722669786, + "loss": 0.2816, + "step": 18764 + }, + { + "epoch": 1.5201717433570967, + "grad_norm": 0.051297880709171295, + "learning_rate": 0.00013776947657410328, + "loss": 0.2571, + "step": 18765 + }, + { + "epoch": 1.520252754374595, + "grad_norm": 0.05652231723070145, + "learning_rate": 0.00013776497592150864, + "loss": 0.3263, + "step": 18766 + }, + { + "epoch": 1.5203337653920932, + "grad_norm": 0.041101183742284775, + "learning_rate": 0.000137760475268914, + "loss": 0.2426, + "step": 18767 + }, + { + "epoch": 1.5204147764095917, + "grad_norm": 0.047826338559389114, + "learning_rate": 0.00013775597461631938, + "loss": 0.2619, + "step": 18768 + }, + { + "epoch": 1.5204957874270901, + "grad_norm": 0.04008450359106064, + "learning_rate": 0.00013775147396372474, + "loss": 0.2381, + "step": 18769 + }, + { + "epoch": 1.5205767984445884, + "grad_norm": 0.057629168033599854, + "learning_rate": 0.0001377469733111301, + "loss": 0.2777, + "step": 18770 + }, + { + "epoch": 1.5206578094620868, + "grad_norm": 0.050094276666641235, + "learning_rate": 0.00013774247265853552, + "loss": 0.262, + "step": 18771 + }, + { + "epoch": 1.5207388204795853, + "grad_norm": 0.05211780220270157, + "learning_rate": 0.00013773797200594088, + "loss": 0.3132, + "step": 18772 + }, + { + "epoch": 1.5208198314970836, + "grad_norm": 0.05060528963804245, + "learning_rate": 0.00013773347135334624, + "loss": 0.2381, + "step": 18773 + }, + { + "epoch": 1.520900842514582, + "grad_norm": 0.047037769109010696, + "learning_rate": 0.00013772897070075162, + "loss": 0.3041, + "step": 18774 + }, + { + "epoch": 1.5209818535320805, + "grad_norm": 0.05610048770904541, + "learning_rate": 0.00013772447004815698, + "loss": 0.2731, + "step": 18775 + }, + { + "epoch": 1.5210628645495787, + "grad_norm": 0.05277544632554054, + "learning_rate": 0.00013771996939556234, + "loss": 0.3047, + "step": 18776 + }, + { + "epoch": 1.521143875567077, + "grad_norm": 
0.047693025320768356, + "learning_rate": 0.00013771546874296776, + "loss": 0.2823, + "step": 18777 + }, + { + "epoch": 1.5212248865845757, + "grad_norm": 0.050775110721588135, + "learning_rate": 0.00013771096809037312, + "loss": 0.2768, + "step": 18778 + }, + { + "epoch": 1.521305897602074, + "grad_norm": 0.058379460126161575, + "learning_rate": 0.00013770646743777848, + "loss": 0.2978, + "step": 18779 + }, + { + "epoch": 1.5213869086195722, + "grad_norm": 0.053059257566928864, + "learning_rate": 0.00013770196678518387, + "loss": 0.2895, + "step": 18780 + }, + { + "epoch": 1.5214679196370706, + "grad_norm": 0.04597696289420128, + "learning_rate": 0.00013769746613258923, + "loss": 0.3044, + "step": 18781 + }, + { + "epoch": 1.521548930654569, + "grad_norm": 0.04516984522342682, + "learning_rate": 0.00013769296547999459, + "loss": 0.2924, + "step": 18782 + }, + { + "epoch": 1.5216299416720673, + "grad_norm": 0.04692981392145157, + "learning_rate": 0.0001376884648274, + "loss": 0.2911, + "step": 18783 + }, + { + "epoch": 1.5217109526895658, + "grad_norm": 0.049040842801332474, + "learning_rate": 0.00013768396417480536, + "loss": 0.2863, + "step": 18784 + }, + { + "epoch": 1.5217919637070643, + "grad_norm": 0.047845106571912766, + "learning_rate": 0.00013767946352221072, + "loss": 0.2823, + "step": 18785 + }, + { + "epoch": 1.5218729747245625, + "grad_norm": 0.047967370599508286, + "learning_rate": 0.0001376749628696161, + "loss": 0.3035, + "step": 18786 + }, + { + "epoch": 1.5219539857420608, + "grad_norm": 0.047568053007125854, + "learning_rate": 0.00013767046221702147, + "loss": 0.2859, + "step": 18787 + }, + { + "epoch": 1.5220349967595594, + "grad_norm": 0.046472664922475815, + "learning_rate": 0.00013766596156442683, + "loss": 0.2749, + "step": 18788 + }, + { + "epoch": 1.5221160077770577, + "grad_norm": 0.04872645437717438, + "learning_rate": 0.00013766146091183224, + "loss": 0.2833, + "step": 18789 + }, + { + "epoch": 1.522197018794556, + "grad_norm": 0.05547712370753288, + "learning_rate": 0.0001376569602592376, + "loss": 0.2678, + "step": 18790 + }, + { + "epoch": 1.5222780298120544, + "grad_norm": 0.046784743666648865, + "learning_rate": 0.00013765245960664296, + "loss": 0.2886, + "step": 18791 + }, + { + "epoch": 1.5223590408295529, + "grad_norm": 0.048536427319049835, + "learning_rate": 0.00013764795895404835, + "loss": 0.2632, + "step": 18792 + }, + { + "epoch": 1.5224400518470511, + "grad_norm": 0.05491678789258003, + "learning_rate": 0.0001376434583014537, + "loss": 0.2646, + "step": 18793 + }, + { + "epoch": 1.5225210628645496, + "grad_norm": 0.04058699309825897, + "learning_rate": 0.0001376389576488591, + "loss": 0.2609, + "step": 18794 + }, + { + "epoch": 1.522602073882048, + "grad_norm": 0.06221689283847809, + "learning_rate": 0.00013763445699626448, + "loss": 0.2851, + "step": 18795 + }, + { + "epoch": 1.5226830848995463, + "grad_norm": 0.05283937230706215, + "learning_rate": 0.00013762995634366984, + "loss": 0.3339, + "step": 18796 + }, + { + "epoch": 1.5227640959170448, + "grad_norm": 0.05966833978891373, + "learning_rate": 0.0001376254556910752, + "loss": 0.2732, + "step": 18797 + }, + { + "epoch": 1.5228451069345432, + "grad_norm": 0.047837648540735245, + "learning_rate": 0.0001376209550384806, + "loss": 0.2711, + "step": 18798 + }, + { + "epoch": 1.5229261179520415, + "grad_norm": 0.05150927975773811, + "learning_rate": 0.00013761645438588595, + "loss": 0.2826, + "step": 18799 + }, + { + "epoch": 1.5230071289695397, + "grad_norm": 0.053468842059373856, + 
"learning_rate": 0.00013761195373329134, + "loss": 0.2756, + "step": 18800 + }, + { + "epoch": 1.5230881399870384, + "grad_norm": 0.04660942032933235, + "learning_rate": 0.00013760745308069673, + "loss": 0.2749, + "step": 18801 + }, + { + "epoch": 1.5231691510045366, + "grad_norm": 0.05439922958612442, + "learning_rate": 0.00013760295242810209, + "loss": 0.2701, + "step": 18802 + }, + { + "epoch": 1.523250162022035, + "grad_norm": 0.06845547258853912, + "learning_rate": 0.00013759845177550745, + "loss": 0.3109, + "step": 18803 + }, + { + "epoch": 1.5233311730395334, + "grad_norm": 0.07089272886514664, + "learning_rate": 0.00013759395112291283, + "loss": 0.301, + "step": 18804 + }, + { + "epoch": 1.5234121840570318, + "grad_norm": 0.05223238095641136, + "learning_rate": 0.0001375894504703182, + "loss": 0.2846, + "step": 18805 + }, + { + "epoch": 1.52349319507453, + "grad_norm": 0.06159967929124832, + "learning_rate": 0.00013758494981772358, + "loss": 0.2898, + "step": 18806 + }, + { + "epoch": 1.5235742060920285, + "grad_norm": 0.048389732837677, + "learning_rate": 0.00013758044916512897, + "loss": 0.293, + "step": 18807 + }, + { + "epoch": 1.523655217109527, + "grad_norm": 0.04446495324373245, + "learning_rate": 0.00013757594851253433, + "loss": 0.2769, + "step": 18808 + }, + { + "epoch": 1.5237362281270252, + "grad_norm": 0.05596426874399185, + "learning_rate": 0.0001375714478599397, + "loss": 0.3366, + "step": 18809 + }, + { + "epoch": 1.5238172391445235, + "grad_norm": 0.04864346235990524, + "learning_rate": 0.00013756694720734507, + "loss": 0.3128, + "step": 18810 + }, + { + "epoch": 1.5238982501620222, + "grad_norm": 0.042249538004398346, + "learning_rate": 0.00013756244655475043, + "loss": 0.2685, + "step": 18811 + }, + { + "epoch": 1.5239792611795204, + "grad_norm": 0.05110679939389229, + "learning_rate": 0.00013755794590215582, + "loss": 0.282, + "step": 18812 + }, + { + "epoch": 1.5240602721970187, + "grad_norm": 0.05548878759145737, + "learning_rate": 0.0001375534452495612, + "loss": 0.3002, + "step": 18813 + }, + { + "epoch": 1.5241412832145171, + "grad_norm": 0.04173737019300461, + "learning_rate": 0.00013754894459696657, + "loss": 0.2399, + "step": 18814 + }, + { + "epoch": 1.5242222942320156, + "grad_norm": 0.0537974052131176, + "learning_rate": 0.00013754444394437193, + "loss": 0.2716, + "step": 18815 + }, + { + "epoch": 1.5243033052495139, + "grad_norm": 0.05061505362391472, + "learning_rate": 0.00013753994329177732, + "loss": 0.2845, + "step": 18816 + }, + { + "epoch": 1.5243843162670123, + "grad_norm": 0.055333398282527924, + "learning_rate": 0.0001375354426391827, + "loss": 0.3048, + "step": 18817 + }, + { + "epoch": 1.5244653272845108, + "grad_norm": 0.05376095324754715, + "learning_rate": 0.00013753094198658806, + "loss": 0.3129, + "step": 18818 + }, + { + "epoch": 1.524546338302009, + "grad_norm": 0.053078003227710724, + "learning_rate": 0.00013752644133399345, + "loss": 0.3172, + "step": 18819 + }, + { + "epoch": 1.5246273493195075, + "grad_norm": 0.05460762605071068, + "learning_rate": 0.0001375219406813988, + "loss": 0.3039, + "step": 18820 + }, + { + "epoch": 1.524708360337006, + "grad_norm": 0.055081307888031006, + "learning_rate": 0.00013751744002880417, + "loss": 0.2923, + "step": 18821 + }, + { + "epoch": 1.5247893713545042, + "grad_norm": 0.055108651518821716, + "learning_rate": 0.00013751293937620956, + "loss": 0.2926, + "step": 18822 + }, + { + "epoch": 1.5248703823720025, + "grad_norm": 0.050001360476017, + "learning_rate": 0.00013750843872361494, + 
"loss": 0.2775, + "step": 18823 + }, + { + "epoch": 1.524951393389501, + "grad_norm": 0.042685020714998245, + "learning_rate": 0.0001375039380710203, + "loss": 0.2413, + "step": 18824 + }, + { + "epoch": 1.5250324044069994, + "grad_norm": 0.055367305874824524, + "learning_rate": 0.0001374994374184257, + "loss": 0.2962, + "step": 18825 + }, + { + "epoch": 1.5251134154244976, + "grad_norm": 0.05210021138191223, + "learning_rate": 0.00013749493676583105, + "loss": 0.3033, + "step": 18826 + }, + { + "epoch": 1.525194426441996, + "grad_norm": 0.04955118149518967, + "learning_rate": 0.0001374904361132364, + "loss": 0.277, + "step": 18827 + }, + { + "epoch": 1.5252754374594946, + "grad_norm": 0.04781867936253548, + "learning_rate": 0.0001374859354606418, + "loss": 0.2971, + "step": 18828 + }, + { + "epoch": 1.5253564484769928, + "grad_norm": 0.05286426097154617, + "learning_rate": 0.00013748143480804719, + "loss": 0.2948, + "step": 18829 + }, + { + "epoch": 1.5254374594944913, + "grad_norm": 0.04718099907040596, + "learning_rate": 0.00013747693415545255, + "loss": 0.2907, + "step": 18830 + }, + { + "epoch": 1.5255184705119897, + "grad_norm": 0.05359350144863129, + "learning_rate": 0.00013747243350285793, + "loss": 0.2927, + "step": 18831 + }, + { + "epoch": 1.525599481529488, + "grad_norm": 0.06179993227124214, + "learning_rate": 0.0001374679328502633, + "loss": 0.3047, + "step": 18832 + }, + { + "epoch": 1.5256804925469862, + "grad_norm": 0.048131536692380905, + "learning_rate": 0.00013746343219766865, + "loss": 0.294, + "step": 18833 + }, + { + "epoch": 1.525761503564485, + "grad_norm": 0.04800281301140785, + "learning_rate": 0.00013745893154507404, + "loss": 0.2949, + "step": 18834 + }, + { + "epoch": 1.5258425145819832, + "grad_norm": 0.03917567431926727, + "learning_rate": 0.00013745443089247943, + "loss": 0.2439, + "step": 18835 + }, + { + "epoch": 1.5259235255994814, + "grad_norm": 0.047527339309453964, + "learning_rate": 0.0001374499302398848, + "loss": 0.3206, + "step": 18836 + }, + { + "epoch": 1.5260045366169799, + "grad_norm": 0.04575043171644211, + "learning_rate": 0.00013744542958729018, + "loss": 0.2978, + "step": 18837 + }, + { + "epoch": 1.5260855476344783, + "grad_norm": 0.04156474769115448, + "learning_rate": 0.00013744092893469554, + "loss": 0.2449, + "step": 18838 + }, + { + "epoch": 1.5261665586519766, + "grad_norm": 0.0502975694835186, + "learning_rate": 0.0001374364282821009, + "loss": 0.2913, + "step": 18839 + }, + { + "epoch": 1.526247569669475, + "grad_norm": 0.048595551401376724, + "learning_rate": 0.00013743192762950628, + "loss": 0.2757, + "step": 18840 + }, + { + "epoch": 1.5263285806869735, + "grad_norm": 0.049081310629844666, + "learning_rate": 0.00013742742697691167, + "loss": 0.2851, + "step": 18841 + }, + { + "epoch": 1.5264095917044718, + "grad_norm": 0.0556858591735363, + "learning_rate": 0.00013742292632431703, + "loss": 0.2821, + "step": 18842 + }, + { + "epoch": 1.5264906027219702, + "grad_norm": 0.053837850689888, + "learning_rate": 0.00013741842567172242, + "loss": 0.3199, + "step": 18843 + }, + { + "epoch": 1.5265716137394687, + "grad_norm": 0.055824149399995804, + "learning_rate": 0.00013741392501912778, + "loss": 0.2556, + "step": 18844 + }, + { + "epoch": 1.526652624756967, + "grad_norm": 0.05074144899845123, + "learning_rate": 0.00013740942436653314, + "loss": 0.2915, + "step": 18845 + }, + { + "epoch": 1.5267336357744652, + "grad_norm": 0.057336222380399704, + "learning_rate": 0.00013740492371393855, + "loss": 0.27, + "step": 18846 + }, + { + 
"epoch": 1.5268146467919637, + "grad_norm": 0.054322924464941025, + "learning_rate": 0.0001374004230613439, + "loss": 0.3092, + "step": 18847 + }, + { + "epoch": 1.5268956578094621, + "grad_norm": 0.04016140475869179, + "learning_rate": 0.00013739592240874927, + "loss": 0.2597, + "step": 18848 + }, + { + "epoch": 1.5269766688269604, + "grad_norm": 0.05268358811736107, + "learning_rate": 0.00013739142175615466, + "loss": 0.2882, + "step": 18849 + }, + { + "epoch": 1.5270576798444588, + "grad_norm": 0.059245605021715164, + "learning_rate": 0.00013738692110356002, + "loss": 0.3328, + "step": 18850 + }, + { + "epoch": 1.5271386908619573, + "grad_norm": 0.05614931508898735, + "learning_rate": 0.00013738242045096538, + "loss": 0.383, + "step": 18851 + }, + { + "epoch": 1.5272197018794555, + "grad_norm": 0.05188550800085068, + "learning_rate": 0.0001373779197983708, + "loss": 0.2679, + "step": 18852 + }, + { + "epoch": 1.527300712896954, + "grad_norm": 0.049801215529441833, + "learning_rate": 0.00013737341914577615, + "loss": 0.2703, + "step": 18853 + }, + { + "epoch": 1.5273817239144525, + "grad_norm": 0.046407975256443024, + "learning_rate": 0.0001373689184931815, + "loss": 0.2531, + "step": 18854 + }, + { + "epoch": 1.5274627349319507, + "grad_norm": 0.051928356289863586, + "learning_rate": 0.0001373644178405869, + "loss": 0.3049, + "step": 18855 + }, + { + "epoch": 1.527543745949449, + "grad_norm": 0.04042185842990875, + "learning_rate": 0.00013735991718799226, + "loss": 0.2441, + "step": 18856 + }, + { + "epoch": 1.5276247569669477, + "grad_norm": 0.04542936757206917, + "learning_rate": 0.00013735541653539762, + "loss": 0.3054, + "step": 18857 + }, + { + "epoch": 1.527705767984446, + "grad_norm": 0.044426001608371735, + "learning_rate": 0.00013735091588280303, + "loss": 0.3067, + "step": 18858 + }, + { + "epoch": 1.5277867790019442, + "grad_norm": 0.0555790439248085, + "learning_rate": 0.0001373464152302084, + "loss": 0.301, + "step": 18859 + }, + { + "epoch": 1.5278677900194426, + "grad_norm": 0.045266322791576385, + "learning_rate": 0.00013734191457761375, + "loss": 0.2609, + "step": 18860 + }, + { + "epoch": 1.527948801036941, + "grad_norm": 0.04733710736036301, + "learning_rate": 0.00013733741392501914, + "loss": 0.2707, + "step": 18861 + }, + { + "epoch": 1.5280298120544393, + "grad_norm": 0.051505301147699356, + "learning_rate": 0.0001373329132724245, + "loss": 0.295, + "step": 18862 + }, + { + "epoch": 1.5281108230719378, + "grad_norm": 0.04492027685046196, + "learning_rate": 0.00013732841261982986, + "loss": 0.3219, + "step": 18863 + }, + { + "epoch": 1.5281918340894363, + "grad_norm": 0.04752475023269653, + "learning_rate": 0.00013732391196723528, + "loss": 0.2978, + "step": 18864 + }, + { + "epoch": 1.5282728451069345, + "grad_norm": 0.04412756860256195, + "learning_rate": 0.00013731941131464064, + "loss": 0.2908, + "step": 18865 + }, + { + "epoch": 1.528353856124433, + "grad_norm": 0.045019157230854034, + "learning_rate": 0.000137314910662046, + "loss": 0.2837, + "step": 18866 + }, + { + "epoch": 1.5284348671419314, + "grad_norm": 0.05006730556488037, + "learning_rate": 0.00013731041000945138, + "loss": 0.3036, + "step": 18867 + }, + { + "epoch": 1.5285158781594297, + "grad_norm": 0.04813380166888237, + "learning_rate": 0.00013730590935685674, + "loss": 0.2999, + "step": 18868 + }, + { + "epoch": 1.528596889176928, + "grad_norm": 0.052330706268548965, + "learning_rate": 0.00013730140870426213, + "loss": 0.2947, + "step": 18869 + }, + { + "epoch": 1.5286779001944264, + "grad_norm": 
0.05194742977619171, + "learning_rate": 0.00013729690805166752, + "loss": 0.2825, + "step": 18870 + }, + { + "epoch": 1.5287589112119249, + "grad_norm": 0.052682504057884216, + "learning_rate": 0.00013729240739907288, + "loss": 0.3046, + "step": 18871 + }, + { + "epoch": 1.528839922229423, + "grad_norm": 0.0457267127931118, + "learning_rate": 0.00013728790674647824, + "loss": 0.2767, + "step": 18872 + }, + { + "epoch": 1.5289209332469216, + "grad_norm": 0.05539494380354881, + "learning_rate": 0.00013728340609388362, + "loss": 0.2687, + "step": 18873 + }, + { + "epoch": 1.52900194426442, + "grad_norm": 0.04881107062101364, + "learning_rate": 0.00013727890544128898, + "loss": 0.2965, + "step": 18874 + }, + { + "epoch": 1.5290829552819183, + "grad_norm": 0.05195486173033714, + "learning_rate": 0.00013727440478869437, + "loss": 0.3044, + "step": 18875 + }, + { + "epoch": 1.5291639662994168, + "grad_norm": 0.03716466948390007, + "learning_rate": 0.00013726990413609976, + "loss": 0.2507, + "step": 18876 + }, + { + "epoch": 1.5292449773169152, + "grad_norm": 0.05379246175289154, + "learning_rate": 0.00013726540348350512, + "loss": 0.3013, + "step": 18877 + }, + { + "epoch": 1.5293259883344135, + "grad_norm": 0.04568881914019585, + "learning_rate": 0.00013726090283091048, + "loss": 0.2533, + "step": 18878 + }, + { + "epoch": 1.5294069993519117, + "grad_norm": 0.05620739236474037, + "learning_rate": 0.00013725640217831587, + "loss": 0.2854, + "step": 18879 + }, + { + "epoch": 1.5294880103694104, + "grad_norm": 0.053102824836969376, + "learning_rate": 0.00013725190152572123, + "loss": 0.2861, + "step": 18880 + }, + { + "epoch": 1.5295690213869086, + "grad_norm": 0.046912338584661484, + "learning_rate": 0.00013724740087312661, + "loss": 0.2486, + "step": 18881 + }, + { + "epoch": 1.529650032404407, + "grad_norm": 0.04645490646362305, + "learning_rate": 0.000137242900220532, + "loss": 0.3039, + "step": 18882 + }, + { + "epoch": 1.5297310434219054, + "grad_norm": 0.04673031345009804, + "learning_rate": 0.00013723839956793736, + "loss": 0.2477, + "step": 18883 + }, + { + "epoch": 1.5298120544394038, + "grad_norm": 0.04913533106446266, + "learning_rate": 0.00013723389891534272, + "loss": 0.2624, + "step": 18884 + }, + { + "epoch": 1.529893065456902, + "grad_norm": 0.04520237073302269, + "learning_rate": 0.0001372293982627481, + "loss": 0.2663, + "step": 18885 + }, + { + "epoch": 1.5299740764744005, + "grad_norm": 0.048991698771715164, + "learning_rate": 0.00013722489761015347, + "loss": 0.2824, + "step": 18886 + }, + { + "epoch": 1.530055087491899, + "grad_norm": 0.05292793735861778, + "learning_rate": 0.00013722039695755886, + "loss": 0.2912, + "step": 18887 + }, + { + "epoch": 1.5301360985093972, + "grad_norm": 0.0511743500828743, + "learning_rate": 0.00013721589630496424, + "loss": 0.3158, + "step": 18888 + }, + { + "epoch": 1.5302171095268955, + "grad_norm": 0.05098418518900871, + "learning_rate": 0.0001372113956523696, + "loss": 0.2752, + "step": 18889 + }, + { + "epoch": 1.5302981205443942, + "grad_norm": 0.04406699538230896, + "learning_rate": 0.00013720689499977496, + "loss": 0.2852, + "step": 18890 + }, + { + "epoch": 1.5303791315618924, + "grad_norm": 0.04527914896607399, + "learning_rate": 0.00013720239434718035, + "loss": 0.317, + "step": 18891 + }, + { + "epoch": 1.5304601425793907, + "grad_norm": 0.049641575664281845, + "learning_rate": 0.0001371978936945857, + "loss": 0.2888, + "step": 18892 + }, + { + "epoch": 1.5305411535968891, + "grad_norm": 0.06409697979688644, + "learning_rate": 
0.0001371933930419911, + "loss": 0.3378, + "step": 18893 + }, + { + "epoch": 1.5306221646143876, + "grad_norm": 0.04698451608419418, + "learning_rate": 0.00013718889238939648, + "loss": 0.2883, + "step": 18894 + }, + { + "epoch": 1.5307031756318858, + "grad_norm": 0.04558132216334343, + "learning_rate": 0.00013718439173680184, + "loss": 0.2941, + "step": 18895 + }, + { + "epoch": 1.5307841866493843, + "grad_norm": 0.04831308871507645, + "learning_rate": 0.0001371798910842072, + "loss": 0.2903, + "step": 18896 + }, + { + "epoch": 1.5308651976668828, + "grad_norm": 0.05204075947403908, + "learning_rate": 0.0001371753904316126, + "loss": 0.2744, + "step": 18897 + }, + { + "epoch": 1.530946208684381, + "grad_norm": 0.043518051505088806, + "learning_rate": 0.00013717088977901798, + "loss": 0.2612, + "step": 18898 + }, + { + "epoch": 1.5310272197018795, + "grad_norm": 0.04689870402216911, + "learning_rate": 0.00013716638912642334, + "loss": 0.2682, + "step": 18899 + }, + { + "epoch": 1.531108230719378, + "grad_norm": 0.040445707738399506, + "learning_rate": 0.00013716188847382873, + "loss": 0.2518, + "step": 18900 + }, + { + "epoch": 1.5311892417368762, + "grad_norm": 0.0630839467048645, + "learning_rate": 0.00013715738782123409, + "loss": 0.3021, + "step": 18901 + }, + { + "epoch": 1.5312702527543745, + "grad_norm": 0.05061734840273857, + "learning_rate": 0.00013715288716863945, + "loss": 0.3078, + "step": 18902 + }, + { + "epoch": 1.5313512637718731, + "grad_norm": 0.05375457555055618, + "learning_rate": 0.00013714838651604483, + "loss": 0.2956, + "step": 18903 + }, + { + "epoch": 1.5314322747893714, + "grad_norm": 0.043118610978126526, + "learning_rate": 0.00013714388586345022, + "loss": 0.2787, + "step": 18904 + }, + { + "epoch": 1.5315132858068696, + "grad_norm": 0.04651355743408203, + "learning_rate": 0.00013713938521085558, + "loss": 0.3131, + "step": 18905 + }, + { + "epoch": 1.531594296824368, + "grad_norm": 0.04447954148054123, + "learning_rate": 0.00013713488455826097, + "loss": 0.2437, + "step": 18906 + }, + { + "epoch": 1.5316753078418666, + "grad_norm": 0.04906405508518219, + "learning_rate": 0.00013713038390566633, + "loss": 0.3045, + "step": 18907 + }, + { + "epoch": 1.5317563188593648, + "grad_norm": 0.04233681783080101, + "learning_rate": 0.0001371258832530717, + "loss": 0.2579, + "step": 18908 + }, + { + "epoch": 1.5318373298768633, + "grad_norm": 0.05201317369937897, + "learning_rate": 0.00013712138260047707, + "loss": 0.2673, + "step": 18909 + }, + { + "epoch": 1.5319183408943617, + "grad_norm": 0.04613606631755829, + "learning_rate": 0.00013711688194788246, + "loss": 0.2893, + "step": 18910 + }, + { + "epoch": 1.53199935191186, + "grad_norm": 0.04684216529130936, + "learning_rate": 0.00013711238129528782, + "loss": 0.2955, + "step": 18911 + }, + { + "epoch": 1.5320803629293582, + "grad_norm": 0.05115676671266556, + "learning_rate": 0.0001371078806426932, + "loss": 0.2879, + "step": 18912 + }, + { + "epoch": 1.532161373946857, + "grad_norm": 0.04565107822418213, + "learning_rate": 0.00013710337999009857, + "loss": 0.2841, + "step": 18913 + }, + { + "epoch": 1.5322423849643552, + "grad_norm": 0.0487804040312767, + "learning_rate": 0.00013709887933750393, + "loss": 0.2481, + "step": 18914 + }, + { + "epoch": 1.5323233959818534, + "grad_norm": 0.060450732707977295, + "learning_rate": 0.00013709437868490932, + "loss": 0.3028, + "step": 18915 + }, + { + "epoch": 1.5324044069993519, + "grad_norm": 0.05281240865588188, + "learning_rate": 0.0001370898780323147, + "loss": 0.3199, + 
"step": 18916 + }, + { + "epoch": 1.5324854180168503, + "grad_norm": 0.053525056689977646, + "learning_rate": 0.00013708537737972006, + "loss": 0.3291, + "step": 18917 + }, + { + "epoch": 1.5325664290343486, + "grad_norm": 0.053674425929784775, + "learning_rate": 0.00013708087672712545, + "loss": 0.2716, + "step": 18918 + }, + { + "epoch": 1.532647440051847, + "grad_norm": 0.05458691343665123, + "learning_rate": 0.0001370763760745308, + "loss": 0.2458, + "step": 18919 + }, + { + "epoch": 1.5327284510693455, + "grad_norm": 0.05632995441555977, + "learning_rate": 0.00013707187542193617, + "loss": 0.3023, + "step": 18920 + }, + { + "epoch": 1.5328094620868438, + "grad_norm": 0.04286112263798714, + "learning_rate": 0.00013706737476934158, + "loss": 0.2662, + "step": 18921 + }, + { + "epoch": 1.5328904731043422, + "grad_norm": 0.05146702751517296, + "learning_rate": 0.00013706287411674694, + "loss": 0.2854, + "step": 18922 + }, + { + "epoch": 1.5329714841218407, + "grad_norm": 0.05799100548028946, + "learning_rate": 0.0001370583734641523, + "loss": 0.3491, + "step": 18923 + }, + { + "epoch": 1.533052495139339, + "grad_norm": 0.05911766365170479, + "learning_rate": 0.0001370538728115577, + "loss": 0.2962, + "step": 18924 + }, + { + "epoch": 1.5331335061568372, + "grad_norm": 0.047158051282167435, + "learning_rate": 0.00013704937215896305, + "loss": 0.3015, + "step": 18925 + }, + { + "epoch": 1.5332145171743357, + "grad_norm": 0.04690094664692879, + "learning_rate": 0.0001370448715063684, + "loss": 0.2827, + "step": 18926 + }, + { + "epoch": 1.5332955281918341, + "grad_norm": 0.0472753569483757, + "learning_rate": 0.00013704037085377383, + "loss": 0.286, + "step": 18927 + }, + { + "epoch": 1.5333765392093324, + "grad_norm": 0.051584504544734955, + "learning_rate": 0.0001370358702011792, + "loss": 0.2563, + "step": 18928 + }, + { + "epoch": 1.5334575502268308, + "grad_norm": 0.04185318574309349, + "learning_rate": 0.00013703136954858455, + "loss": 0.2376, + "step": 18929 + }, + { + "epoch": 1.5335385612443293, + "grad_norm": 0.047741927206516266, + "learning_rate": 0.00013702686889598993, + "loss": 0.2723, + "step": 18930 + }, + { + "epoch": 1.5336195722618275, + "grad_norm": 0.04596942290663719, + "learning_rate": 0.0001370223682433953, + "loss": 0.2821, + "step": 18931 + }, + { + "epoch": 1.533700583279326, + "grad_norm": 0.041286639869213104, + "learning_rate": 0.00013701786759080065, + "loss": 0.2836, + "step": 18932 + }, + { + "epoch": 1.5337815942968245, + "grad_norm": 0.048663485795259476, + "learning_rate": 0.00013701336693820607, + "loss": 0.3046, + "step": 18933 + }, + { + "epoch": 1.5338626053143227, + "grad_norm": 0.05503995344042778, + "learning_rate": 0.00013700886628561143, + "loss": 0.2844, + "step": 18934 + }, + { + "epoch": 1.533943616331821, + "grad_norm": 0.04451589286327362, + "learning_rate": 0.0001370043656330168, + "loss": 0.2738, + "step": 18935 + }, + { + "epoch": 1.5340246273493197, + "grad_norm": 0.04549826681613922, + "learning_rate": 0.00013699986498042218, + "loss": 0.2936, + "step": 18936 + }, + { + "epoch": 1.534105638366818, + "grad_norm": 0.046200018376111984, + "learning_rate": 0.00013699536432782754, + "loss": 0.3045, + "step": 18937 + }, + { + "epoch": 1.5341866493843161, + "grad_norm": 0.05498679727315903, + "learning_rate": 0.0001369908636752329, + "loss": 0.3277, + "step": 18938 + }, + { + "epoch": 1.5342676604018146, + "grad_norm": 0.057283565402030945, + "learning_rate": 0.0001369863630226383, + "loss": 0.2978, + "step": 18939 + }, + { + "epoch": 
1.534348671419313, + "grad_norm": 0.05207530036568642, + "learning_rate": 0.00013698186237004367, + "loss": 0.2682, + "step": 18940 + }, + { + "epoch": 1.5344296824368113, + "grad_norm": 0.05777692794799805, + "learning_rate": 0.00013697736171744903, + "loss": 0.319, + "step": 18941 + }, + { + "epoch": 1.5345106934543098, + "grad_norm": 0.04652559012174606, + "learning_rate": 0.00013697286106485442, + "loss": 0.2649, + "step": 18942 + }, + { + "epoch": 1.5345917044718083, + "grad_norm": 0.05349148064851761, + "learning_rate": 0.00013696836041225978, + "loss": 0.3342, + "step": 18943 + }, + { + "epoch": 1.5346727154893065, + "grad_norm": 0.057243578135967255, + "learning_rate": 0.00013696385975966514, + "loss": 0.3424, + "step": 18944 + }, + { + "epoch": 1.534753726506805, + "grad_norm": 0.051278337836265564, + "learning_rate": 0.00013695935910707055, + "loss": 0.294, + "step": 18945 + }, + { + "epoch": 1.5348347375243034, + "grad_norm": 0.04856126755475998, + "learning_rate": 0.0001369548584544759, + "loss": 0.2559, + "step": 18946 + }, + { + "epoch": 1.5349157485418017, + "grad_norm": 0.05090457201004028, + "learning_rate": 0.00013695035780188127, + "loss": 0.2822, + "step": 18947 + }, + { + "epoch": 1.5349967595593, + "grad_norm": 0.05697944760322571, + "learning_rate": 0.00013694585714928666, + "loss": 0.2799, + "step": 18948 + }, + { + "epoch": 1.5350777705767984, + "grad_norm": 0.04438178613781929, + "learning_rate": 0.00013694135649669202, + "loss": 0.2901, + "step": 18949 + }, + { + "epoch": 1.5351587815942969, + "grad_norm": 0.05179465189576149, + "learning_rate": 0.0001369368558440974, + "loss": 0.2867, + "step": 18950 + }, + { + "epoch": 1.535239792611795, + "grad_norm": 0.044153694063425064, + "learning_rate": 0.0001369323551915028, + "loss": 0.2616, + "step": 18951 + }, + { + "epoch": 1.5353208036292936, + "grad_norm": 0.05822839215397835, + "learning_rate": 0.00013692785453890815, + "loss": 0.3369, + "step": 18952 + }, + { + "epoch": 1.535401814646792, + "grad_norm": 0.044818926602602005, + "learning_rate": 0.0001369233538863135, + "loss": 0.2568, + "step": 18953 + }, + { + "epoch": 1.5354828256642903, + "grad_norm": 0.050371669232845306, + "learning_rate": 0.0001369188532337189, + "loss": 0.3114, + "step": 18954 + }, + { + "epoch": 1.5355638366817888, + "grad_norm": 0.04291696846485138, + "learning_rate": 0.00013691435258112426, + "loss": 0.2632, + "step": 18955 + }, + { + "epoch": 1.5356448476992872, + "grad_norm": 0.04437808692455292, + "learning_rate": 0.00013690985192852965, + "loss": 0.251, + "step": 18956 + }, + { + "epoch": 1.5357258587167855, + "grad_norm": 0.04808518663048744, + "learning_rate": 0.00013690535127593503, + "loss": 0.2611, + "step": 18957 + }, + { + "epoch": 1.5358068697342837, + "grad_norm": 0.0442044772207737, + "learning_rate": 0.0001369008506233404, + "loss": 0.3265, + "step": 18958 + }, + { + "epoch": 1.5358878807517824, + "grad_norm": 0.04689866304397583, + "learning_rate": 0.00013689634997074575, + "loss": 0.307, + "step": 18959 + }, + { + "epoch": 1.5359688917692806, + "grad_norm": 0.051713429391384125, + "learning_rate": 0.00013689184931815114, + "loss": 0.3164, + "step": 18960 + }, + { + "epoch": 1.5360499027867789, + "grad_norm": 0.04567249119281769, + "learning_rate": 0.0001368873486655565, + "loss": 0.2978, + "step": 18961 + }, + { + "epoch": 1.5361309138042774, + "grad_norm": 0.045768845826387405, + "learning_rate": 0.0001368828480129619, + "loss": 0.2633, + "step": 18962 + }, + { + "epoch": 1.5362119248217758, + "grad_norm": 
0.04977133870124817, + "learning_rate": 0.00013687834736036728, + "loss": 0.2855, + "step": 18963 + }, + { + "epoch": 1.536292935839274, + "grad_norm": 0.0451236255466938, + "learning_rate": 0.00013687384670777264, + "loss": 0.3, + "step": 18964 + }, + { + "epoch": 1.5363739468567725, + "grad_norm": 0.05996592342853546, + "learning_rate": 0.000136869346055178, + "loss": 0.2787, + "step": 18965 + }, + { + "epoch": 1.536454957874271, + "grad_norm": 0.052222318947315216, + "learning_rate": 0.00013686484540258338, + "loss": 0.2754, + "step": 18966 + }, + { + "epoch": 1.5365359688917692, + "grad_norm": 0.05358374863862991, + "learning_rate": 0.00013686034474998874, + "loss": 0.3048, + "step": 18967 + }, + { + "epoch": 1.5366169799092677, + "grad_norm": 0.0475376695394516, + "learning_rate": 0.00013685584409739413, + "loss": 0.2867, + "step": 18968 + }, + { + "epoch": 1.5366979909267662, + "grad_norm": 0.04334684833884239, + "learning_rate": 0.00013685134344479952, + "loss": 0.2459, + "step": 18969 + }, + { + "epoch": 1.5367790019442644, + "grad_norm": 0.0507650263607502, + "learning_rate": 0.00013684684279220488, + "loss": 0.2975, + "step": 18970 + }, + { + "epoch": 1.5368600129617627, + "grad_norm": 0.05293979123234749, + "learning_rate": 0.00013684234213961024, + "loss": 0.2737, + "step": 18971 + }, + { + "epoch": 1.5369410239792611, + "grad_norm": 0.051188874989748, + "learning_rate": 0.00013683784148701563, + "loss": 0.2817, + "step": 18972 + }, + { + "epoch": 1.5370220349967596, + "grad_norm": 0.05214584991335869, + "learning_rate": 0.00013683334083442099, + "loss": 0.2927, + "step": 18973 + }, + { + "epoch": 1.5371030460142578, + "grad_norm": 0.05490950495004654, + "learning_rate": 0.00013682884018182637, + "loss": 0.3274, + "step": 18974 + }, + { + "epoch": 1.5371840570317563, + "grad_norm": 0.05883365496993065, + "learning_rate": 0.00013682433952923176, + "loss": 0.3159, + "step": 18975 + }, + { + "epoch": 1.5372650680492548, + "grad_norm": 0.058409273624420166, + "learning_rate": 0.00013681983887663712, + "loss": 0.3258, + "step": 18976 + }, + { + "epoch": 1.537346079066753, + "grad_norm": 0.05302930995821953, + "learning_rate": 0.00013681533822404248, + "loss": 0.2753, + "step": 18977 + }, + { + "epoch": 1.5374270900842515, + "grad_norm": 0.052349865436553955, + "learning_rate": 0.00013681083757144787, + "loss": 0.2985, + "step": 18978 + }, + { + "epoch": 1.53750810110175, + "grad_norm": 0.045233648270368576, + "learning_rate": 0.00013680633691885325, + "loss": 0.2495, + "step": 18979 + }, + { + "epoch": 1.5375891121192482, + "grad_norm": 0.053387340158224106, + "learning_rate": 0.00013680183626625861, + "loss": 0.2737, + "step": 18980 + }, + { + "epoch": 1.5376701231367464, + "grad_norm": 0.05499692261219025, + "learning_rate": 0.000136797335613664, + "loss": 0.3166, + "step": 18981 + }, + { + "epoch": 1.5377511341542451, + "grad_norm": 0.049109093844890594, + "learning_rate": 0.00013679283496106936, + "loss": 0.2929, + "step": 18982 + }, + { + "epoch": 1.5378321451717434, + "grad_norm": 0.05261330306529999, + "learning_rate": 0.00013678833430847472, + "loss": 0.3063, + "step": 18983 + }, + { + "epoch": 1.5379131561892416, + "grad_norm": 0.04580008238554001, + "learning_rate": 0.0001367838336558801, + "loss": 0.248, + "step": 18984 + }, + { + "epoch": 1.53799416720674, + "grad_norm": 0.051151324063539505, + "learning_rate": 0.0001367793330032855, + "loss": 0.3288, + "step": 18985 + }, + { + "epoch": 1.5380751782242386, + "grad_norm": 0.04817868024110794, + "learning_rate": 
0.00013677483235069086, + "loss": 0.2552, + "step": 18986 + }, + { + "epoch": 1.5381561892417368, + "grad_norm": 0.06041828915476799, + "learning_rate": 0.00013677033169809624, + "loss": 0.3565, + "step": 18987 + }, + { + "epoch": 1.5382372002592353, + "grad_norm": 0.045897360891103745, + "learning_rate": 0.0001367658310455016, + "loss": 0.2806, + "step": 18988 + }, + { + "epoch": 1.5383182112767337, + "grad_norm": 0.041819456964731216, + "learning_rate": 0.00013676133039290696, + "loss": 0.2634, + "step": 18989 + }, + { + "epoch": 1.538399222294232, + "grad_norm": 0.05615771561861038, + "learning_rate": 0.00013675682974031235, + "loss": 0.2944, + "step": 18990 + }, + { + "epoch": 1.5384802333117304, + "grad_norm": 0.047729093581438065, + "learning_rate": 0.00013675232908771774, + "loss": 0.2856, + "step": 18991 + }, + { + "epoch": 1.538561244329229, + "grad_norm": 0.041438955813646317, + "learning_rate": 0.0001367478284351231, + "loss": 0.256, + "step": 18992 + }, + { + "epoch": 1.5386422553467272, + "grad_norm": 0.04753054305911064, + "learning_rate": 0.00013674332778252848, + "loss": 0.2723, + "step": 18993 + }, + { + "epoch": 1.5387232663642254, + "grad_norm": 0.055781301110982895, + "learning_rate": 0.00013673882712993384, + "loss": 0.2838, + "step": 18994 + }, + { + "epoch": 1.5388042773817239, + "grad_norm": 0.046481624245643616, + "learning_rate": 0.0001367343264773392, + "loss": 0.298, + "step": 18995 + }, + { + "epoch": 1.5388852883992223, + "grad_norm": 0.0620138980448246, + "learning_rate": 0.0001367298258247446, + "loss": 0.2808, + "step": 18996 + }, + { + "epoch": 1.5389662994167206, + "grad_norm": 0.05862954258918762, + "learning_rate": 0.00013672532517214998, + "loss": 0.2687, + "step": 18997 + }, + { + "epoch": 1.539047310434219, + "grad_norm": 0.051817674189805984, + "learning_rate": 0.00013672082451955534, + "loss": 0.2347, + "step": 18998 + }, + { + "epoch": 1.5391283214517175, + "grad_norm": 0.04890400543808937, + "learning_rate": 0.00013671632386696073, + "loss": 0.2528, + "step": 18999 + }, + { + "epoch": 1.5392093324692158, + "grad_norm": 0.06076984480023384, + "learning_rate": 0.00013671182321436609, + "loss": 0.3509, + "step": 19000 + }, + { + "epoch": 1.5392903434867142, + "grad_norm": 0.046450335532426834, + "learning_rate": 0.00013670732256177145, + "loss": 0.3023, + "step": 19001 + }, + { + "epoch": 1.5393713545042127, + "grad_norm": 0.043737445026636124, + "learning_rate": 0.00013670282190917686, + "loss": 0.2698, + "step": 19002 + }, + { + "epoch": 1.539452365521711, + "grad_norm": 0.0688759982585907, + "learning_rate": 0.00013669832125658222, + "loss": 0.3433, + "step": 19003 + }, + { + "epoch": 1.5395333765392092, + "grad_norm": 0.048470985144376755, + "learning_rate": 0.00013669382060398758, + "loss": 0.2475, + "step": 19004 + }, + { + "epoch": 1.5396143875567079, + "grad_norm": 0.04027277231216431, + "learning_rate": 0.00013668931995139297, + "loss": 0.2401, + "step": 19005 + }, + { + "epoch": 1.5396953985742061, + "grad_norm": 0.05675550922751427, + "learning_rate": 0.00013668481929879833, + "loss": 0.317, + "step": 19006 + }, + { + "epoch": 1.5397764095917044, + "grad_norm": 0.04998868331313133, + "learning_rate": 0.0001366803186462037, + "loss": 0.3076, + "step": 19007 + }, + { + "epoch": 1.5398574206092028, + "grad_norm": 0.04275386407971382, + "learning_rate": 0.0001366758179936091, + "loss": 0.26, + "step": 19008 + }, + { + "epoch": 1.5399384316267013, + "grad_norm": 0.05047708749771118, + "learning_rate": 0.00013667131734101446, + "loss": 0.2602, 
+ "step": 19009 + }, + { + "epoch": 1.5400194426441995, + "grad_norm": 0.05603862553834915, + "learning_rate": 0.00013666681668841982, + "loss": 0.2823, + "step": 19010 + }, + { + "epoch": 1.540100453661698, + "grad_norm": 0.0549708716571331, + "learning_rate": 0.0001366623160358252, + "loss": 0.2937, + "step": 19011 + }, + { + "epoch": 1.5401814646791965, + "grad_norm": 0.06376846134662628, + "learning_rate": 0.00013665781538323057, + "loss": 0.3173, + "step": 19012 + }, + { + "epoch": 1.5402624756966947, + "grad_norm": 0.05238895118236542, + "learning_rate": 0.00013665331473063593, + "loss": 0.2934, + "step": 19013 + }, + { + "epoch": 1.540343486714193, + "grad_norm": 0.04790802299976349, + "learning_rate": 0.00013664881407804134, + "loss": 0.3099, + "step": 19014 + }, + { + "epoch": 1.5404244977316917, + "grad_norm": 0.05614732950925827, + "learning_rate": 0.0001366443134254467, + "loss": 0.2684, + "step": 19015 + }, + { + "epoch": 1.54050550874919, + "grad_norm": 0.044618602842092514, + "learning_rate": 0.00013663981277285206, + "loss": 0.2466, + "step": 19016 + }, + { + "epoch": 1.5405865197666881, + "grad_norm": 0.04293638467788696, + "learning_rate": 0.00013663531212025745, + "loss": 0.2836, + "step": 19017 + }, + { + "epoch": 1.5406675307841866, + "grad_norm": 0.044261954724788666, + "learning_rate": 0.0001366308114676628, + "loss": 0.269, + "step": 19018 + }, + { + "epoch": 1.540748541801685, + "grad_norm": 0.048489660024642944, + "learning_rate": 0.00013662631081506817, + "loss": 0.2914, + "step": 19019 + }, + { + "epoch": 1.5408295528191833, + "grad_norm": 0.04978737235069275, + "learning_rate": 0.00013662181016247359, + "loss": 0.2629, + "step": 19020 + }, + { + "epoch": 1.5409105638366818, + "grad_norm": 0.04418829083442688, + "learning_rate": 0.00013661730950987895, + "loss": 0.2401, + "step": 19021 + }, + { + "epoch": 1.5409915748541803, + "grad_norm": 0.05238080397248268, + "learning_rate": 0.0001366128088572843, + "loss": 0.2939, + "step": 19022 + }, + { + "epoch": 1.5410725858716785, + "grad_norm": 0.05178670585155487, + "learning_rate": 0.0001366083082046897, + "loss": 0.3246, + "step": 19023 + }, + { + "epoch": 1.541153596889177, + "grad_norm": 0.046030569821596146, + "learning_rate": 0.00013660380755209505, + "loss": 0.2812, + "step": 19024 + }, + { + "epoch": 1.5412346079066754, + "grad_norm": 0.03960884362459183, + "learning_rate": 0.0001365993068995004, + "loss": 0.2723, + "step": 19025 + }, + { + "epoch": 1.5413156189241737, + "grad_norm": 0.048268433660268784, + "learning_rate": 0.00013659480624690583, + "loss": 0.3143, + "step": 19026 + }, + { + "epoch": 1.541396629941672, + "grad_norm": 0.05451720952987671, + "learning_rate": 0.0001365903055943112, + "loss": 0.2771, + "step": 19027 + }, + { + "epoch": 1.5414776409591704, + "grad_norm": 0.05472222715616226, + "learning_rate": 0.00013658580494171655, + "loss": 0.2614, + "step": 19028 + }, + { + "epoch": 1.5415586519766689, + "grad_norm": 0.05245329067111015, + "learning_rate": 0.00013658130428912193, + "loss": 0.3151, + "step": 19029 + }, + { + "epoch": 1.541639662994167, + "grad_norm": 0.059196244925260544, + "learning_rate": 0.0001365768036365273, + "loss": 0.319, + "step": 19030 + }, + { + "epoch": 1.5417206740116656, + "grad_norm": 0.05338888615369797, + "learning_rate": 0.00013657230298393268, + "loss": 0.2921, + "step": 19031 + }, + { + "epoch": 1.541801685029164, + "grad_norm": 0.04819253832101822, + "learning_rate": 0.00013656780233133807, + "loss": 0.265, + "step": 19032 + }, + { + "epoch": 
1.5418826960466623, + "grad_norm": 0.05281459912657738, + "learning_rate": 0.00013656330167874343, + "loss": 0.2933, + "step": 19033 + }, + { + "epoch": 1.5419637070641607, + "grad_norm": 0.043195176869630814, + "learning_rate": 0.0001365588010261488, + "loss": 0.2945, + "step": 19034 + }, + { + "epoch": 1.5420447180816592, + "grad_norm": 0.049393683671951294, + "learning_rate": 0.00013655430037355418, + "loss": 0.3001, + "step": 19035 + }, + { + "epoch": 1.5421257290991575, + "grad_norm": 0.04863373190164566, + "learning_rate": 0.00013654979972095954, + "loss": 0.2658, + "step": 19036 + }, + { + "epoch": 1.5422067401166557, + "grad_norm": 0.05779989808797836, + "learning_rate": 0.00013654529906836492, + "loss": 0.2974, + "step": 19037 + }, + { + "epoch": 1.5422877511341544, + "grad_norm": 0.04628223553299904, + "learning_rate": 0.0001365407984157703, + "loss": 0.27, + "step": 19038 + }, + { + "epoch": 1.5423687621516526, + "grad_norm": 0.05051897466182709, + "learning_rate": 0.00013653629776317567, + "loss": 0.2776, + "step": 19039 + }, + { + "epoch": 1.5424497731691509, + "grad_norm": 0.05451449379324913, + "learning_rate": 0.00013653179711058103, + "loss": 0.2728, + "step": 19040 + }, + { + "epoch": 1.5425307841866494, + "grad_norm": 0.05515547841787338, + "learning_rate": 0.00013652729645798642, + "loss": 0.2856, + "step": 19041 + }, + { + "epoch": 1.5426117952041478, + "grad_norm": 0.06005766987800598, + "learning_rate": 0.00013652279580539178, + "loss": 0.3029, + "step": 19042 + }, + { + "epoch": 1.542692806221646, + "grad_norm": 0.0674663782119751, + "learning_rate": 0.00013651829515279716, + "loss": 0.2887, + "step": 19043 + }, + { + "epoch": 1.5427738172391445, + "grad_norm": 0.0524708591401577, + "learning_rate": 0.00013651379450020255, + "loss": 0.2841, + "step": 19044 + }, + { + "epoch": 1.542854828256643, + "grad_norm": 0.04781711474061012, + "learning_rate": 0.0001365092938476079, + "loss": 0.2882, + "step": 19045 + }, + { + "epoch": 1.5429358392741412, + "grad_norm": 0.05105120688676834, + "learning_rate": 0.00013650479319501327, + "loss": 0.3169, + "step": 19046 + }, + { + "epoch": 1.5430168502916397, + "grad_norm": 0.05355464294552803, + "learning_rate": 0.00013650029254241866, + "loss": 0.2709, + "step": 19047 + }, + { + "epoch": 1.5430978613091382, + "grad_norm": 0.045331329107284546, + "learning_rate": 0.00013649579188982402, + "loss": 0.2627, + "step": 19048 + }, + { + "epoch": 1.5431788723266364, + "grad_norm": 0.045703526586294174, + "learning_rate": 0.0001364912912372294, + "loss": 0.2689, + "step": 19049 + }, + { + "epoch": 1.5432598833441347, + "grad_norm": 0.05270133540034294, + "learning_rate": 0.0001364867905846348, + "loss": 0.2694, + "step": 19050 + }, + { + "epoch": 1.5433408943616331, + "grad_norm": 0.05833232030272484, + "learning_rate": 0.00013648228993204015, + "loss": 0.3095, + "step": 19051 + }, + { + "epoch": 1.5434219053791316, + "grad_norm": 0.06126684695482254, + "learning_rate": 0.0001364777892794455, + "loss": 0.2659, + "step": 19052 + }, + { + "epoch": 1.5435029163966298, + "grad_norm": 0.06940959393978119, + "learning_rate": 0.0001364732886268509, + "loss": 0.2783, + "step": 19053 + }, + { + "epoch": 1.5435839274141283, + "grad_norm": 0.06364569813013077, + "learning_rate": 0.0001364687879742563, + "loss": 0.3155, + "step": 19054 + }, + { + "epoch": 1.5436649384316268, + "grad_norm": 0.04655701667070389, + "learning_rate": 0.00013646428732166165, + "loss": 0.2484, + "step": 19055 + }, + { + "epoch": 1.543745949449125, + "grad_norm": 
0.05707899108529091, + "learning_rate": 0.00013645978666906703, + "loss": 0.2915, + "step": 19056 + }, + { + "epoch": 1.5438269604666235, + "grad_norm": 0.05672769248485565, + "learning_rate": 0.0001364552860164724, + "loss": 0.2799, + "step": 19057 + }, + { + "epoch": 1.543907971484122, + "grad_norm": 0.06137559935450554, + "learning_rate": 0.00013645078536387775, + "loss": 0.301, + "step": 19058 + }, + { + "epoch": 1.5439889825016202, + "grad_norm": 0.051159653812646866, + "learning_rate": 0.00013644628471128314, + "loss": 0.2557, + "step": 19059 + }, + { + "epoch": 1.5440699935191184, + "grad_norm": 0.047979846596717834, + "learning_rate": 0.00013644178405868853, + "loss": 0.2928, + "step": 19060 + }, + { + "epoch": 1.5441510045366171, + "grad_norm": 0.04678960144519806, + "learning_rate": 0.0001364372834060939, + "loss": 0.3019, + "step": 19061 + }, + { + "epoch": 1.5442320155541154, + "grad_norm": 0.053505491465330124, + "learning_rate": 0.00013643278275349928, + "loss": 0.2764, + "step": 19062 + }, + { + "epoch": 1.5443130265716136, + "grad_norm": 0.04693808779120445, + "learning_rate": 0.00013642828210090464, + "loss": 0.2854, + "step": 19063 + }, + { + "epoch": 1.544394037589112, + "grad_norm": 0.0580628328025341, + "learning_rate": 0.00013642378144831, + "loss": 0.2941, + "step": 19064 + }, + { + "epoch": 1.5444750486066106, + "grad_norm": 0.06445372849702835, + "learning_rate": 0.00013641928079571538, + "loss": 0.322, + "step": 19065 + }, + { + "epoch": 1.5445560596241088, + "grad_norm": 0.04730147495865822, + "learning_rate": 0.00013641478014312077, + "loss": 0.2475, + "step": 19066 + }, + { + "epoch": 1.5446370706416073, + "grad_norm": 0.052395354956388474, + "learning_rate": 0.00013641027949052613, + "loss": 0.3442, + "step": 19067 + }, + { + "epoch": 1.5447180816591057, + "grad_norm": 0.060794439166784286, + "learning_rate": 0.00013640577883793152, + "loss": 0.2869, + "step": 19068 + }, + { + "epoch": 1.544799092676604, + "grad_norm": 0.062032055109739304, + "learning_rate": 0.00013640127818533688, + "loss": 0.2932, + "step": 19069 + }, + { + "epoch": 1.5448801036941024, + "grad_norm": 0.04806280508637428, + "learning_rate": 0.00013639677753274224, + "loss": 0.3021, + "step": 19070 + }, + { + "epoch": 1.544961114711601, + "grad_norm": 0.05165525898337364, + "learning_rate": 0.00013639227688014763, + "loss": 0.2776, + "step": 19071 + }, + { + "epoch": 1.5450421257290992, + "grad_norm": 0.051210954785346985, + "learning_rate": 0.000136387776227553, + "loss": 0.3015, + "step": 19072 + }, + { + "epoch": 1.5451231367465974, + "grad_norm": 0.059677209705114365, + "learning_rate": 0.00013638327557495837, + "loss": 0.2757, + "step": 19073 + }, + { + "epoch": 1.5452041477640959, + "grad_norm": 0.04551670327782631, + "learning_rate": 0.00013637877492236376, + "loss": 0.2604, + "step": 19074 + }, + { + "epoch": 1.5452851587815943, + "grad_norm": 0.05329824611544609, + "learning_rate": 0.00013637427426976912, + "loss": 0.259, + "step": 19075 + }, + { + "epoch": 1.5453661697990926, + "grad_norm": 0.05617989972233772, + "learning_rate": 0.00013636977361717448, + "loss": 0.3109, + "step": 19076 + }, + { + "epoch": 1.545447180816591, + "grad_norm": 0.04428596794605255, + "learning_rate": 0.00013636527296457987, + "loss": 0.2748, + "step": 19077 + }, + { + "epoch": 1.5455281918340895, + "grad_norm": 0.057310551404953, + "learning_rate": 0.00013636077231198525, + "loss": 0.267, + "step": 19078 + }, + { + "epoch": 1.5456092028515878, + "grad_norm": 0.043585795909166336, + "learning_rate": 
0.00013635627165939061, + "loss": 0.2734, + "step": 19079 + }, + { + "epoch": 1.5456902138690862, + "grad_norm": 0.05618955194950104, + "learning_rate": 0.000136351771006796, + "loss": 0.3172, + "step": 19080 + }, + { + "epoch": 1.5457712248865847, + "grad_norm": 0.0505533441901207, + "learning_rate": 0.00013634727035420136, + "loss": 0.2817, + "step": 19081 + }, + { + "epoch": 1.545852235904083, + "grad_norm": 0.04784523695707321, + "learning_rate": 0.00013634276970160672, + "loss": 0.2702, + "step": 19082 + }, + { + "epoch": 1.5459332469215812, + "grad_norm": 0.04198610410094261, + "learning_rate": 0.00013633826904901214, + "loss": 0.2668, + "step": 19083 + }, + { + "epoch": 1.5460142579390799, + "grad_norm": 0.05166913568973541, + "learning_rate": 0.0001363337683964175, + "loss": 0.2733, + "step": 19084 + }, + { + "epoch": 1.5460952689565781, + "grad_norm": 0.053657166659832, + "learning_rate": 0.00013632926774382286, + "loss": 0.2881, + "step": 19085 + }, + { + "epoch": 1.5461762799740764, + "grad_norm": 0.05604879558086395, + "learning_rate": 0.00013632476709122824, + "loss": 0.287, + "step": 19086 + }, + { + "epoch": 1.5462572909915748, + "grad_norm": 0.049701027572155, + "learning_rate": 0.0001363202664386336, + "loss": 0.3058, + "step": 19087 + }, + { + "epoch": 1.5463383020090733, + "grad_norm": 0.04624010622501373, + "learning_rate": 0.00013631576578603896, + "loss": 0.2654, + "step": 19088 + }, + { + "epoch": 1.5464193130265715, + "grad_norm": 0.052133120596408844, + "learning_rate": 0.00013631126513344438, + "loss": 0.2648, + "step": 19089 + }, + { + "epoch": 1.54650032404407, + "grad_norm": 0.04207282140851021, + "learning_rate": 0.00013630676448084974, + "loss": 0.2635, + "step": 19090 + }, + { + "epoch": 1.5465813350615685, + "grad_norm": 0.04825148731470108, + "learning_rate": 0.0001363022638282551, + "loss": 0.2842, + "step": 19091 + }, + { + "epoch": 1.5466623460790667, + "grad_norm": 0.04433571919798851, + "learning_rate": 0.00013629776317566048, + "loss": 0.2424, + "step": 19092 + }, + { + "epoch": 1.5467433570965652, + "grad_norm": 0.05187336727976799, + "learning_rate": 0.00013629326252306584, + "loss": 0.3201, + "step": 19093 + }, + { + "epoch": 1.5468243681140637, + "grad_norm": 0.05122842639684677, + "learning_rate": 0.0001362887618704712, + "loss": 0.2831, + "step": 19094 + }, + { + "epoch": 1.546905379131562, + "grad_norm": 0.042315494269132614, + "learning_rate": 0.00013628426121787662, + "loss": 0.2557, + "step": 19095 + }, + { + "epoch": 1.5469863901490601, + "grad_norm": 0.05235358327627182, + "learning_rate": 0.00013627976056528198, + "loss": 0.2863, + "step": 19096 + }, + { + "epoch": 1.5470674011665586, + "grad_norm": 0.04973796010017395, + "learning_rate": 0.00013627525991268734, + "loss": 0.2852, + "step": 19097 + }, + { + "epoch": 1.547148412184057, + "grad_norm": 0.05600909888744354, + "learning_rate": 0.00013627075926009273, + "loss": 0.3081, + "step": 19098 + }, + { + "epoch": 1.5472294232015553, + "grad_norm": 0.05479155853390694, + "learning_rate": 0.00013626625860749809, + "loss": 0.2887, + "step": 19099 + }, + { + "epoch": 1.5473104342190538, + "grad_norm": 0.04682200029492378, + "learning_rate": 0.00013626175795490345, + "loss": 0.2567, + "step": 19100 + }, + { + "epoch": 1.5473914452365523, + "grad_norm": 0.04580902308225632, + "learning_rate": 0.00013625725730230886, + "loss": 0.2551, + "step": 19101 + }, + { + "epoch": 1.5474724562540505, + "grad_norm": 0.047862354665994644, + "learning_rate": 0.00013625275664971422, + "loss": 0.2854, + 
"step": 19102 + }, + { + "epoch": 1.547553467271549, + "grad_norm": 0.058732111006975174, + "learning_rate": 0.00013624825599711958, + "loss": 0.2954, + "step": 19103 + }, + { + "epoch": 1.5476344782890474, + "grad_norm": 0.05385908856987953, + "learning_rate": 0.00013624375534452497, + "loss": 0.2788, + "step": 19104 + }, + { + "epoch": 1.5477154893065457, + "grad_norm": 0.05026691034436226, + "learning_rate": 0.00013623925469193033, + "loss": 0.2716, + "step": 19105 + }, + { + "epoch": 1.547796500324044, + "grad_norm": 0.05301313474774361, + "learning_rate": 0.0001362347540393357, + "loss": 0.3034, + "step": 19106 + }, + { + "epoch": 1.5478775113415426, + "grad_norm": 0.0488019734621048, + "learning_rate": 0.0001362302533867411, + "loss": 0.3002, + "step": 19107 + }, + { + "epoch": 1.5479585223590409, + "grad_norm": 0.05719370022416115, + "learning_rate": 0.00013622575273414646, + "loss": 0.2721, + "step": 19108 + }, + { + "epoch": 1.548039533376539, + "grad_norm": 0.05312721058726311, + "learning_rate": 0.00013622125208155182, + "loss": 0.3002, + "step": 19109 + }, + { + "epoch": 1.5481205443940376, + "grad_norm": 0.05826451629400253, + "learning_rate": 0.0001362167514289572, + "loss": 0.3005, + "step": 19110 + }, + { + "epoch": 1.548201555411536, + "grad_norm": 0.046084482222795486, + "learning_rate": 0.00013621225077636257, + "loss": 0.3012, + "step": 19111 + }, + { + "epoch": 1.5482825664290343, + "grad_norm": 0.05233805999159813, + "learning_rate": 0.00013620775012376796, + "loss": 0.2818, + "step": 19112 + }, + { + "epoch": 1.5483635774465327, + "grad_norm": 0.05259188264608383, + "learning_rate": 0.00013620324947117334, + "loss": 0.2608, + "step": 19113 + }, + { + "epoch": 1.5484445884640312, + "grad_norm": 0.05448652058839798, + "learning_rate": 0.0001361987488185787, + "loss": 0.2965, + "step": 19114 + }, + { + "epoch": 1.5485255994815295, + "grad_norm": 0.055655933916568756, + "learning_rate": 0.00013619424816598406, + "loss": 0.2867, + "step": 19115 + }, + { + "epoch": 1.5486066104990277, + "grad_norm": 0.05749504640698433, + "learning_rate": 0.00013618974751338945, + "loss": 0.3009, + "step": 19116 + }, + { + "epoch": 1.5486876215165264, + "grad_norm": 0.05507310479879379, + "learning_rate": 0.0001361852468607948, + "loss": 0.2982, + "step": 19117 + }, + { + "epoch": 1.5487686325340246, + "grad_norm": 0.0514855682849884, + "learning_rate": 0.0001361807462082002, + "loss": 0.2551, + "step": 19118 + }, + { + "epoch": 1.5488496435515229, + "grad_norm": 0.03809289261698723, + "learning_rate": 0.00013617624555560559, + "loss": 0.2445, + "step": 19119 + }, + { + "epoch": 1.5489306545690213, + "grad_norm": 0.04524914175271988, + "learning_rate": 0.00013617174490301095, + "loss": 0.2473, + "step": 19120 + }, + { + "epoch": 1.5490116655865198, + "grad_norm": 0.04850262776017189, + "learning_rate": 0.0001361672442504163, + "loss": 0.2869, + "step": 19121 + }, + { + "epoch": 1.549092676604018, + "grad_norm": 0.04947483539581299, + "learning_rate": 0.0001361627435978217, + "loss": 0.2706, + "step": 19122 + }, + { + "epoch": 1.5491736876215165, + "grad_norm": 0.04799078777432442, + "learning_rate": 0.00013615824294522705, + "loss": 0.2938, + "step": 19123 + }, + { + "epoch": 1.549254698639015, + "grad_norm": 0.058111388236284256, + "learning_rate": 0.00013615374229263244, + "loss": 0.2702, + "step": 19124 + }, + { + "epoch": 1.5493357096565132, + "grad_norm": 0.055344365537166595, + "learning_rate": 0.00013614924164003783, + "loss": 0.3008, + "step": 19125 + }, + { + "epoch": 
1.5494167206740117, + "grad_norm": 0.04902368038892746, + "learning_rate": 0.0001361447409874432, + "loss": 0.2692, + "step": 19126 + }, + { + "epoch": 1.5494977316915102, + "grad_norm": 0.04455145075917244, + "learning_rate": 0.00013614024033484855, + "loss": 0.3004, + "step": 19127 + }, + { + "epoch": 1.5495787427090084, + "grad_norm": 0.04834412783384323, + "learning_rate": 0.00013613573968225393, + "loss": 0.2658, + "step": 19128 + }, + { + "epoch": 1.5496597537265067, + "grad_norm": 0.04589447006583214, + "learning_rate": 0.0001361312390296593, + "loss": 0.3171, + "step": 19129 + }, + { + "epoch": 1.5497407647440054, + "grad_norm": 0.04412047564983368, + "learning_rate": 0.00013612673837706468, + "loss": 0.299, + "step": 19130 + }, + { + "epoch": 1.5498217757615036, + "grad_norm": 0.042024120688438416, + "learning_rate": 0.00013612223772447007, + "loss": 0.2811, + "step": 19131 + }, + { + "epoch": 1.5499027867790018, + "grad_norm": 0.04573988541960716, + "learning_rate": 0.00013611773707187543, + "loss": 0.3066, + "step": 19132 + }, + { + "epoch": 1.5499837977965003, + "grad_norm": 0.044483110308647156, + "learning_rate": 0.0001361132364192808, + "loss": 0.2522, + "step": 19133 + }, + { + "epoch": 1.5500648088139988, + "grad_norm": 0.050241101533174515, + "learning_rate": 0.00013610873576668618, + "loss": 0.3022, + "step": 19134 + }, + { + "epoch": 1.550145819831497, + "grad_norm": 0.05022579804062843, + "learning_rate": 0.00013610423511409156, + "loss": 0.3209, + "step": 19135 + }, + { + "epoch": 1.5502268308489955, + "grad_norm": 0.045545876026153564, + "learning_rate": 0.00013609973446149692, + "loss": 0.2856, + "step": 19136 + }, + { + "epoch": 1.550307841866494, + "grad_norm": 0.04494267329573631, + "learning_rate": 0.0001360952338089023, + "loss": 0.2814, + "step": 19137 + }, + { + "epoch": 1.5503888528839922, + "grad_norm": 0.050585806369781494, + "learning_rate": 0.00013609073315630767, + "loss": 0.3115, + "step": 19138 + }, + { + "epoch": 1.5504698639014904, + "grad_norm": 0.05736057087779045, + "learning_rate": 0.00013608623250371303, + "loss": 0.2699, + "step": 19139 + }, + { + "epoch": 1.5505508749189891, + "grad_norm": 0.04995905980467796, + "learning_rate": 0.00013608173185111842, + "loss": 0.2802, + "step": 19140 + }, + { + "epoch": 1.5506318859364874, + "grad_norm": 0.05028783157467842, + "learning_rate": 0.0001360772311985238, + "loss": 0.3247, + "step": 19141 + }, + { + "epoch": 1.5507128969539856, + "grad_norm": 0.05537936091423035, + "learning_rate": 0.00013607273054592916, + "loss": 0.2982, + "step": 19142 + }, + { + "epoch": 1.550793907971484, + "grad_norm": 0.049491897225379944, + "learning_rate": 0.00013606822989333455, + "loss": 0.3079, + "step": 19143 + }, + { + "epoch": 1.5508749189889826, + "grad_norm": 0.042341284453868866, + "learning_rate": 0.0001360637292407399, + "loss": 0.289, + "step": 19144 + }, + { + "epoch": 1.5509559300064808, + "grad_norm": 0.05065532401204109, + "learning_rate": 0.00013605922858814527, + "loss": 0.3116, + "step": 19145 + }, + { + "epoch": 1.5510369410239793, + "grad_norm": 0.0424003005027771, + "learning_rate": 0.00013605472793555066, + "loss": 0.253, + "step": 19146 + }, + { + "epoch": 1.5511179520414777, + "grad_norm": 0.05388662964105606, + "learning_rate": 0.00013605022728295605, + "loss": 0.3141, + "step": 19147 + }, + { + "epoch": 1.551198963058976, + "grad_norm": 0.06315270066261292, + "learning_rate": 0.0001360457266303614, + "loss": 0.293, + "step": 19148 + }, + { + "epoch": 1.5512799740764744, + "grad_norm": 
0.05217668414115906, + "learning_rate": 0.0001360412259777668, + "loss": 0.2742, + "step": 19149 + }, + { + "epoch": 1.551360985093973, + "grad_norm": 0.04942014440894127, + "learning_rate": 0.00013603672532517215, + "loss": 0.2802, + "step": 19150 + }, + { + "epoch": 1.5514419961114712, + "grad_norm": 0.05215258523821831, + "learning_rate": 0.00013603222467257751, + "loss": 0.2602, + "step": 19151 + }, + { + "epoch": 1.5515230071289694, + "grad_norm": 0.05975009500980377, + "learning_rate": 0.0001360277240199829, + "loss": 0.3159, + "step": 19152 + }, + { + "epoch": 1.5516040181464679, + "grad_norm": 0.05319590866565704, + "learning_rate": 0.0001360232233673883, + "loss": 0.3004, + "step": 19153 + }, + { + "epoch": 1.5516850291639663, + "grad_norm": 0.05167746916413307, + "learning_rate": 0.00013601872271479365, + "loss": 0.2637, + "step": 19154 + }, + { + "epoch": 1.5517660401814646, + "grad_norm": 0.05901259556412697, + "learning_rate": 0.00013601422206219904, + "loss": 0.3177, + "step": 19155 + }, + { + "epoch": 1.551847051198963, + "grad_norm": 0.051348935812711716, + "learning_rate": 0.0001360097214096044, + "loss": 0.2617, + "step": 19156 + }, + { + "epoch": 1.5519280622164615, + "grad_norm": 0.055392563343048096, + "learning_rate": 0.00013600522075700976, + "loss": 0.2699, + "step": 19157 + }, + { + "epoch": 1.5520090732339598, + "grad_norm": 0.05384603887796402, + "learning_rate": 0.00013600072010441514, + "loss": 0.2647, + "step": 19158 + }, + { + "epoch": 1.5520900842514582, + "grad_norm": 0.05610581114888191, + "learning_rate": 0.00013599621945182053, + "loss": 0.3151, + "step": 19159 + }, + { + "epoch": 1.5521710952689567, + "grad_norm": 0.04605025053024292, + "learning_rate": 0.0001359917187992259, + "loss": 0.244, + "step": 19160 + }, + { + "epoch": 1.552252106286455, + "grad_norm": 0.045179493725299835, + "learning_rate": 0.00013598721814663128, + "loss": 0.2821, + "step": 19161 + }, + { + "epoch": 1.5523331173039532, + "grad_norm": 0.04526267200708389, + "learning_rate": 0.00013598271749403664, + "loss": 0.2885, + "step": 19162 + }, + { + "epoch": 1.5524141283214519, + "grad_norm": 0.04857548326253891, + "learning_rate": 0.000135978216841442, + "loss": 0.3125, + "step": 19163 + }, + { + "epoch": 1.5524951393389501, + "grad_norm": 0.051715102046728134, + "learning_rate": 0.0001359737161888474, + "loss": 0.2556, + "step": 19164 + }, + { + "epoch": 1.5525761503564484, + "grad_norm": 0.05461445078253746, + "learning_rate": 0.00013596921553625277, + "loss": 0.2873, + "step": 19165 + }, + { + "epoch": 1.5526571613739468, + "grad_norm": 0.044770218431949615, + "learning_rate": 0.00013596471488365813, + "loss": 0.2829, + "step": 19166 + }, + { + "epoch": 1.5527381723914453, + "grad_norm": 0.044490501284599304, + "learning_rate": 0.00013596021423106352, + "loss": 0.3031, + "step": 19167 + }, + { + "epoch": 1.5528191834089435, + "grad_norm": 0.046804603189229965, + "learning_rate": 0.00013595571357846888, + "loss": 0.2768, + "step": 19168 + }, + { + "epoch": 1.552900194426442, + "grad_norm": 0.04840075969696045, + "learning_rate": 0.00013595121292587424, + "loss": 0.2539, + "step": 19169 + }, + { + "epoch": 1.5529812054439405, + "grad_norm": 0.05297040939331055, + "learning_rate": 0.00013594671227327965, + "loss": 0.3076, + "step": 19170 + }, + { + "epoch": 1.5530622164614387, + "grad_norm": 0.050705526024103165, + "learning_rate": 0.000135942211620685, + "loss": 0.2785, + "step": 19171 + }, + { + "epoch": 1.5531432274789372, + "grad_norm": 0.053877465426921844, + "learning_rate": 
0.00013593771096809037, + "loss": 0.2833, + "step": 19172 + }, + { + "epoch": 1.5532242384964356, + "grad_norm": 0.04980199784040451, + "learning_rate": 0.00013593321031549576, + "loss": 0.2553, + "step": 19173 + }, + { + "epoch": 1.553305249513934, + "grad_norm": 0.050213202834129333, + "learning_rate": 0.00013592870966290112, + "loss": 0.2904, + "step": 19174 + }, + { + "epoch": 1.5533862605314321, + "grad_norm": 0.04667943716049194, + "learning_rate": 0.00013592420901030648, + "loss": 0.2767, + "step": 19175 + }, + { + "epoch": 1.5534672715489306, + "grad_norm": 0.04889126121997833, + "learning_rate": 0.0001359197083577119, + "loss": 0.2961, + "step": 19176 + }, + { + "epoch": 1.553548282566429, + "grad_norm": 0.04985184967517853, + "learning_rate": 0.00013591520770511725, + "loss": 0.2659, + "step": 19177 + }, + { + "epoch": 1.5536292935839273, + "grad_norm": 0.0674809142947197, + "learning_rate": 0.00013591070705252261, + "loss": 0.3292, + "step": 19178 + }, + { + "epoch": 1.5537103046014258, + "grad_norm": 0.05704353377223015, + "learning_rate": 0.000135906206399928, + "loss": 0.294, + "step": 19179 + }, + { + "epoch": 1.5537913156189243, + "grad_norm": 0.049726054072380066, + "learning_rate": 0.00013590170574733336, + "loss": 0.2835, + "step": 19180 + }, + { + "epoch": 1.5538723266364225, + "grad_norm": 0.044575802981853485, + "learning_rate": 0.00013589720509473872, + "loss": 0.3227, + "step": 19181 + }, + { + "epoch": 1.553953337653921, + "grad_norm": 0.04700697213411331, + "learning_rate": 0.00013589270444214414, + "loss": 0.2811, + "step": 19182 + }, + { + "epoch": 1.5540343486714194, + "grad_norm": 0.05900681018829346, + "learning_rate": 0.0001358882037895495, + "loss": 0.3242, + "step": 19183 + }, + { + "epoch": 1.5541153596889177, + "grad_norm": 0.05375407263636589, + "learning_rate": 0.00013588370313695486, + "loss": 0.2843, + "step": 19184 + }, + { + "epoch": 1.554196370706416, + "grad_norm": 0.04696367681026459, + "learning_rate": 0.00013587920248436024, + "loss": 0.2742, + "step": 19185 + }, + { + "epoch": 1.5542773817239146, + "grad_norm": 0.051585521548986435, + "learning_rate": 0.0001358747018317656, + "loss": 0.3225, + "step": 19186 + }, + { + "epoch": 1.5543583927414129, + "grad_norm": 0.05937698483467102, + "learning_rate": 0.000135870201179171, + "loss": 0.3203, + "step": 19187 + }, + { + "epoch": 1.554439403758911, + "grad_norm": 0.05009043589234352, + "learning_rate": 0.00013586570052657638, + "loss": 0.3167, + "step": 19188 + }, + { + "epoch": 1.5545204147764096, + "grad_norm": 0.05212171748280525, + "learning_rate": 0.00013586119987398174, + "loss": 0.2756, + "step": 19189 + }, + { + "epoch": 1.554601425793908, + "grad_norm": 0.04361918941140175, + "learning_rate": 0.0001358566992213871, + "loss": 0.2835, + "step": 19190 + }, + { + "epoch": 1.5546824368114063, + "grad_norm": 0.05416923388838768, + "learning_rate": 0.00013585219856879248, + "loss": 0.3242, + "step": 19191 + }, + { + "epoch": 1.5547634478289047, + "grad_norm": 0.0524650439620018, + "learning_rate": 0.00013584769791619784, + "loss": 0.2967, + "step": 19192 + }, + { + "epoch": 1.5548444588464032, + "grad_norm": 0.052620600908994675, + "learning_rate": 0.00013584319726360323, + "loss": 0.2755, + "step": 19193 + }, + { + "epoch": 1.5549254698639015, + "grad_norm": 0.05024907365441322, + "learning_rate": 0.00013583869661100862, + "loss": 0.2904, + "step": 19194 + }, + { + "epoch": 1.5550064808814, + "grad_norm": 0.04997064173221588, + "learning_rate": 0.00013583419595841398, + "loss": 0.2797, + 
"step": 19195 + }, + { + "epoch": 1.5550874918988984, + "grad_norm": 0.05072508379817009, + "learning_rate": 0.00013582969530581934, + "loss": 0.2913, + "step": 19196 + }, + { + "epoch": 1.5551685029163966, + "grad_norm": 0.04423803091049194, + "learning_rate": 0.00013582519465322473, + "loss": 0.237, + "step": 19197 + }, + { + "epoch": 1.5552495139338949, + "grad_norm": 0.048984941095113754, + "learning_rate": 0.0001358206940006301, + "loss": 0.2689, + "step": 19198 + }, + { + "epoch": 1.5553305249513933, + "grad_norm": 0.04901856929063797, + "learning_rate": 0.00013581619334803547, + "loss": 0.2846, + "step": 19199 + }, + { + "epoch": 1.5554115359688918, + "grad_norm": 0.05620553344488144, + "learning_rate": 0.00013581169269544086, + "loss": 0.2786, + "step": 19200 + }, + { + "epoch": 1.55549254698639, + "grad_norm": 0.06047564372420311, + "learning_rate": 0.00013580719204284622, + "loss": 0.2953, + "step": 19201 + }, + { + "epoch": 1.5555735580038885, + "grad_norm": 0.049802668392658234, + "learning_rate": 0.00013580269139025158, + "loss": 0.282, + "step": 19202 + }, + { + "epoch": 1.555654569021387, + "grad_norm": 0.04509425163269043, + "learning_rate": 0.00013579819073765697, + "loss": 0.2552, + "step": 19203 + }, + { + "epoch": 1.5557355800388852, + "grad_norm": 0.053334422409534454, + "learning_rate": 0.00013579369008506233, + "loss": 0.2578, + "step": 19204 + }, + { + "epoch": 1.5558165910563837, + "grad_norm": 0.05020352452993393, + "learning_rate": 0.00013578918943246772, + "loss": 0.3115, + "step": 19205 + }, + { + "epoch": 1.5558976020738822, + "grad_norm": 0.05819111317396164, + "learning_rate": 0.0001357846887798731, + "loss": 0.2912, + "step": 19206 + }, + { + "epoch": 1.5559786130913804, + "grad_norm": 0.06057784706354141, + "learning_rate": 0.00013578018812727846, + "loss": 0.3445, + "step": 19207 + }, + { + "epoch": 1.5560596241088787, + "grad_norm": 0.050647422671318054, + "learning_rate": 0.00013577568747468382, + "loss": 0.3067, + "step": 19208 + }, + { + "epoch": 1.5561406351263773, + "grad_norm": 0.05491769313812256, + "learning_rate": 0.0001357711868220892, + "loss": 0.3065, + "step": 19209 + }, + { + "epoch": 1.5562216461438756, + "grad_norm": 0.0505046546459198, + "learning_rate": 0.00013576668616949457, + "loss": 0.2424, + "step": 19210 + }, + { + "epoch": 1.5563026571613738, + "grad_norm": 0.05184290558099747, + "learning_rate": 0.00013576218551689996, + "loss": 0.3057, + "step": 19211 + }, + { + "epoch": 1.5563836681788723, + "grad_norm": 0.050715941935777664, + "learning_rate": 0.00013575768486430534, + "loss": 0.3058, + "step": 19212 + }, + { + "epoch": 1.5564646791963708, + "grad_norm": 0.049122605472803116, + "learning_rate": 0.0001357531842117107, + "loss": 0.3018, + "step": 19213 + }, + { + "epoch": 1.556545690213869, + "grad_norm": 0.0555947870016098, + "learning_rate": 0.00013574868355911606, + "loss": 0.3167, + "step": 19214 + }, + { + "epoch": 1.5566267012313675, + "grad_norm": 0.046853404492139816, + "learning_rate": 0.00013574418290652145, + "loss": 0.2695, + "step": 19215 + }, + { + "epoch": 1.556707712248866, + "grad_norm": 0.05228375643491745, + "learning_rate": 0.00013573968225392684, + "loss": 0.3139, + "step": 19216 + }, + { + "epoch": 1.5567887232663642, + "grad_norm": 0.05127185583114624, + "learning_rate": 0.0001357351816013322, + "loss": 0.2837, + "step": 19217 + }, + { + "epoch": 1.5568697342838627, + "grad_norm": 0.05675850808620453, + "learning_rate": 0.00013573068094873759, + "loss": 0.3022, + "step": 19218 + }, + { + "epoch": 
1.5569507453013611, + "grad_norm": 0.050446517765522, + "learning_rate": 0.00013572618029614295, + "loss": 0.2719, + "step": 19219 + }, + { + "epoch": 1.5570317563188594, + "grad_norm": 0.06610075384378433, + "learning_rate": 0.0001357216796435483, + "loss": 0.3132, + "step": 19220 + }, + { + "epoch": 1.5571127673363576, + "grad_norm": 0.046209342777729034, + "learning_rate": 0.0001357171789909537, + "loss": 0.262, + "step": 19221 + }, + { + "epoch": 1.557193778353856, + "grad_norm": 0.05675473064184189, + "learning_rate": 0.00013571267833835908, + "loss": 0.2933, + "step": 19222 + }, + { + "epoch": 1.5572747893713546, + "grad_norm": 0.05571332573890686, + "learning_rate": 0.00013570817768576444, + "loss": 0.2948, + "step": 19223 + }, + { + "epoch": 1.5573558003888528, + "grad_norm": 0.06254886090755463, + "learning_rate": 0.00013570367703316983, + "loss": 0.3251, + "step": 19224 + }, + { + "epoch": 1.5574368114063513, + "grad_norm": 0.05003569275140762, + "learning_rate": 0.0001356991763805752, + "loss": 0.3413, + "step": 19225 + }, + { + "epoch": 1.5575178224238497, + "grad_norm": 0.048256766051054, + "learning_rate": 0.00013569467572798055, + "loss": 0.2815, + "step": 19226 + }, + { + "epoch": 1.557598833441348, + "grad_norm": 0.05164124816656113, + "learning_rate": 0.00013569017507538593, + "loss": 0.278, + "step": 19227 + }, + { + "epoch": 1.5576798444588464, + "grad_norm": 0.05035339668393135, + "learning_rate": 0.00013568567442279132, + "loss": 0.2995, + "step": 19228 + }, + { + "epoch": 1.557760855476345, + "grad_norm": 0.037863705307245255, + "learning_rate": 0.00013568117377019668, + "loss": 0.2534, + "step": 19229 + }, + { + "epoch": 1.5578418664938432, + "grad_norm": 0.048973795026540756, + "learning_rate": 0.00013567667311760207, + "loss": 0.2748, + "step": 19230 + }, + { + "epoch": 1.5579228775113414, + "grad_norm": 0.043724387884140015, + "learning_rate": 0.00013567217246500743, + "loss": 0.2589, + "step": 19231 + }, + { + "epoch": 1.55800388852884, + "grad_norm": 0.052051693201065063, + "learning_rate": 0.0001356676718124128, + "loss": 0.3129, + "step": 19232 + }, + { + "epoch": 1.5580848995463383, + "grad_norm": 0.04797010496258736, + "learning_rate": 0.00013566317115981818, + "loss": 0.277, + "step": 19233 + }, + { + "epoch": 1.5581659105638366, + "grad_norm": 0.04128566384315491, + "learning_rate": 0.00013565867050722356, + "loss": 0.2721, + "step": 19234 + }, + { + "epoch": 1.558246921581335, + "grad_norm": 0.041017647832632065, + "learning_rate": 0.00013565416985462892, + "loss": 0.2811, + "step": 19235 + }, + { + "epoch": 1.5583279325988335, + "grad_norm": 0.04845331236720085, + "learning_rate": 0.0001356496692020343, + "loss": 0.2946, + "step": 19236 + }, + { + "epoch": 1.5584089436163318, + "grad_norm": 0.0455239862203598, + "learning_rate": 0.00013564516854943967, + "loss": 0.2714, + "step": 19237 + }, + { + "epoch": 1.5584899546338302, + "grad_norm": 0.058418456465005875, + "learning_rate": 0.00013564066789684503, + "loss": 0.2962, + "step": 19238 + }, + { + "epoch": 1.5585709656513287, + "grad_norm": 0.046155910938978195, + "learning_rate": 0.00013563616724425044, + "loss": 0.2602, + "step": 19239 + }, + { + "epoch": 1.558651976668827, + "grad_norm": 0.05215545743703842, + "learning_rate": 0.0001356316665916558, + "loss": 0.2988, + "step": 19240 + }, + { + "epoch": 1.5587329876863252, + "grad_norm": 0.04679625481367111, + "learning_rate": 0.00013562716593906117, + "loss": 0.2687, + "step": 19241 + }, + { + "epoch": 1.5588139987038239, + "grad_norm": 
0.055831775069236755, + "learning_rate": 0.00013562266528646655, + "loss": 0.3001, + "step": 19242 + }, + { + "epoch": 1.5588950097213221, + "grad_norm": 0.05438535660505295, + "learning_rate": 0.0001356181646338719, + "loss": 0.3026, + "step": 19243 + }, + { + "epoch": 1.5589760207388204, + "grad_norm": 0.04468968138098717, + "learning_rate": 0.00013561366398127727, + "loss": 0.2537, + "step": 19244 + }, + { + "epoch": 1.5590570317563188, + "grad_norm": 0.06018408387899399, + "learning_rate": 0.0001356091633286827, + "loss": 0.3102, + "step": 19245 + }, + { + "epoch": 1.5591380427738173, + "grad_norm": 0.050145700573921204, + "learning_rate": 0.00013560466267608805, + "loss": 0.2801, + "step": 19246 + }, + { + "epoch": 1.5592190537913155, + "grad_norm": 0.07588254660367966, + "learning_rate": 0.0001356001620234934, + "loss": 0.3548, + "step": 19247 + }, + { + "epoch": 1.559300064808814, + "grad_norm": 0.05625752732157707, + "learning_rate": 0.0001355956613708988, + "loss": 0.3069, + "step": 19248 + }, + { + "epoch": 1.5593810758263125, + "grad_norm": 0.0515391044318676, + "learning_rate": 0.00013559116071830415, + "loss": 0.3146, + "step": 19249 + }, + { + "epoch": 1.5594620868438107, + "grad_norm": 0.05147033929824829, + "learning_rate": 0.00013558666006570951, + "loss": 0.2481, + "step": 19250 + }, + { + "epoch": 1.5595430978613092, + "grad_norm": 0.05439019575715065, + "learning_rate": 0.00013558215941311493, + "loss": 0.3062, + "step": 19251 + }, + { + "epoch": 1.5596241088788076, + "grad_norm": 0.04817438870668411, + "learning_rate": 0.0001355776587605203, + "loss": 0.2924, + "step": 19252 + }, + { + "epoch": 1.559705119896306, + "grad_norm": 0.059530194848775864, + "learning_rate": 0.00013557315810792565, + "loss": 0.3252, + "step": 19253 + }, + { + "epoch": 1.5597861309138041, + "grad_norm": 0.051332466304302216, + "learning_rate": 0.00013556865745533104, + "loss": 0.2866, + "step": 19254 + }, + { + "epoch": 1.5598671419313026, + "grad_norm": 0.04718302935361862, + "learning_rate": 0.0001355641568027364, + "loss": 0.2859, + "step": 19255 + }, + { + "epoch": 1.559948152948801, + "grad_norm": 0.044280778616666794, + "learning_rate": 0.00013555965615014176, + "loss": 0.2865, + "step": 19256 + }, + { + "epoch": 1.5600291639662993, + "grad_norm": 0.04140870273113251, + "learning_rate": 0.00013555515549754717, + "loss": 0.2405, + "step": 19257 + }, + { + "epoch": 1.5601101749837978, + "grad_norm": 0.046388860791921616, + "learning_rate": 0.00013555065484495253, + "loss": 0.2829, + "step": 19258 + }, + { + "epoch": 1.5601911860012962, + "grad_norm": 0.04737057164311409, + "learning_rate": 0.0001355461541923579, + "loss": 0.2739, + "step": 19259 + }, + { + "epoch": 1.5602721970187945, + "grad_norm": 0.04849565029144287, + "learning_rate": 0.00013554165353976328, + "loss": 0.2338, + "step": 19260 + }, + { + "epoch": 1.560353208036293, + "grad_norm": 0.05050482973456383, + "learning_rate": 0.00013553715288716864, + "loss": 0.2674, + "step": 19261 + }, + { + "epoch": 1.5604342190537914, + "grad_norm": 0.04807204753160477, + "learning_rate": 0.000135532652234574, + "loss": 0.2819, + "step": 19262 + }, + { + "epoch": 1.5605152300712897, + "grad_norm": 0.05020787939429283, + "learning_rate": 0.0001355281515819794, + "loss": 0.2694, + "step": 19263 + }, + { + "epoch": 1.560596241088788, + "grad_norm": 0.044631484895944595, + "learning_rate": 0.00013552365092938477, + "loss": 0.2222, + "step": 19264 + }, + { + "epoch": 1.5606772521062866, + "grad_norm": 0.0597679540514946, + "learning_rate": 
0.00013551915027679013, + "loss": 0.3144, + "step": 19265 + }, + { + "epoch": 1.5607582631237849, + "grad_norm": 0.049190703779459, + "learning_rate": 0.00013551464962419552, + "loss": 0.2971, + "step": 19266 + }, + { + "epoch": 1.560839274141283, + "grad_norm": 0.05174947530031204, + "learning_rate": 0.00013551014897160088, + "loss": 0.2619, + "step": 19267 + }, + { + "epoch": 1.5609202851587816, + "grad_norm": 0.053329844027757645, + "learning_rate": 0.00013550564831900627, + "loss": 0.2714, + "step": 19268 + }, + { + "epoch": 1.56100129617628, + "grad_norm": 0.05406653881072998, + "learning_rate": 0.00013550114766641165, + "loss": 0.3264, + "step": 19269 + }, + { + "epoch": 1.5610823071937783, + "grad_norm": 0.05473935604095459, + "learning_rate": 0.000135496647013817, + "loss": 0.2865, + "step": 19270 + }, + { + "epoch": 1.5611633182112767, + "grad_norm": 0.05382101982831955, + "learning_rate": 0.00013549214636122237, + "loss": 0.2634, + "step": 19271 + }, + { + "epoch": 1.5612443292287752, + "grad_norm": 0.06472937762737274, + "learning_rate": 0.00013548764570862776, + "loss": 0.2841, + "step": 19272 + }, + { + "epoch": 1.5613253402462735, + "grad_norm": 0.04545477405190468, + "learning_rate": 0.00013548314505603312, + "loss": 0.2482, + "step": 19273 + }, + { + "epoch": 1.561406351263772, + "grad_norm": 0.05186472088098526, + "learning_rate": 0.0001354786444034385, + "loss": 0.2807, + "step": 19274 + }, + { + "epoch": 1.5614873622812704, + "grad_norm": 0.062175050377845764, + "learning_rate": 0.0001354741437508439, + "loss": 0.3009, + "step": 19275 + }, + { + "epoch": 1.5615683732987686, + "grad_norm": 0.0523802749812603, + "learning_rate": 0.00013546964309824925, + "loss": 0.2879, + "step": 19276 + }, + { + "epoch": 1.5616493843162669, + "grad_norm": 0.06917037814855576, + "learning_rate": 0.00013546514244565461, + "loss": 0.3506, + "step": 19277 + }, + { + "epoch": 1.5617303953337653, + "grad_norm": 0.05295359715819359, + "learning_rate": 0.00013546064179306, + "loss": 0.3047, + "step": 19278 + }, + { + "epoch": 1.5618114063512638, + "grad_norm": 0.05048537999391556, + "learning_rate": 0.00013545614114046536, + "loss": 0.3138, + "step": 19279 + }, + { + "epoch": 1.561892417368762, + "grad_norm": 0.0454871729016304, + "learning_rate": 0.00013545164048787075, + "loss": 0.3103, + "step": 19280 + }, + { + "epoch": 1.5619734283862605, + "grad_norm": 0.048254676163196564, + "learning_rate": 0.00013544713983527614, + "loss": 0.2625, + "step": 19281 + }, + { + "epoch": 1.562054439403759, + "grad_norm": 0.052300721406936646, + "learning_rate": 0.0001354426391826815, + "loss": 0.2706, + "step": 19282 + }, + { + "epoch": 1.5621354504212572, + "grad_norm": 0.04279913753271103, + "learning_rate": 0.00013543813853008686, + "loss": 0.2616, + "step": 19283 + }, + { + "epoch": 1.5622164614387557, + "grad_norm": 0.04854980483651161, + "learning_rate": 0.00013543363787749224, + "loss": 0.2711, + "step": 19284 + }, + { + "epoch": 1.5622974724562542, + "grad_norm": 0.06649527698755264, + "learning_rate": 0.0001354291372248976, + "loss": 0.2833, + "step": 19285 + }, + { + "epoch": 1.5623784834737524, + "grad_norm": 0.04639168828725815, + "learning_rate": 0.000135424636572303, + "loss": 0.2559, + "step": 19286 + }, + { + "epoch": 1.5624594944912507, + "grad_norm": 0.049177560955286026, + "learning_rate": 0.00013542013591970838, + "loss": 0.3056, + "step": 19287 + }, + { + "epoch": 1.5625405055087493, + "grad_norm": 0.05000094324350357, + "learning_rate": 0.00013541563526711374, + "loss": 0.2694, + "step": 
19288 + }, + { + "epoch": 1.5626215165262476, + "grad_norm": 0.048982229083776474, + "learning_rate": 0.0001354111346145191, + "loss": 0.2824, + "step": 19289 + }, + { + "epoch": 1.5627025275437458, + "grad_norm": 0.05096309632062912, + "learning_rate": 0.00013540663396192449, + "loss": 0.2573, + "step": 19290 + }, + { + "epoch": 1.5627835385612443, + "grad_norm": 0.04594533517956734, + "learning_rate": 0.00013540213330932985, + "loss": 0.2853, + "step": 19291 + }, + { + "epoch": 1.5628645495787428, + "grad_norm": 0.049910467118024826, + "learning_rate": 0.00013539763265673523, + "loss": 0.2632, + "step": 19292 + }, + { + "epoch": 1.562945560596241, + "grad_norm": 0.05282336473464966, + "learning_rate": 0.00013539313200414062, + "loss": 0.2825, + "step": 19293 + }, + { + "epoch": 1.5630265716137395, + "grad_norm": 0.04863569512963295, + "learning_rate": 0.00013538863135154598, + "loss": 0.3071, + "step": 19294 + }, + { + "epoch": 1.563107582631238, + "grad_norm": 0.05358542501926422, + "learning_rate": 0.00013538413069895134, + "loss": 0.2847, + "step": 19295 + }, + { + "epoch": 1.5631885936487362, + "grad_norm": 0.0446048267185688, + "learning_rate": 0.00013537963004635673, + "loss": 0.2695, + "step": 19296 + }, + { + "epoch": 1.5632696046662347, + "grad_norm": 0.05163099244236946, + "learning_rate": 0.00013537512939376211, + "loss": 0.2833, + "step": 19297 + }, + { + "epoch": 1.5633506156837331, + "grad_norm": 0.04648342728614807, + "learning_rate": 0.00013537062874116747, + "loss": 0.2864, + "step": 19298 + }, + { + "epoch": 1.5634316267012314, + "grad_norm": 0.059515487402677536, + "learning_rate": 0.00013536612808857286, + "loss": 0.2898, + "step": 19299 + }, + { + "epoch": 1.5635126377187296, + "grad_norm": 0.04971949756145477, + "learning_rate": 0.00013536162743597822, + "loss": 0.2834, + "step": 19300 + }, + { + "epoch": 1.563593648736228, + "grad_norm": 0.05322429537773132, + "learning_rate": 0.00013535712678338358, + "loss": 0.2776, + "step": 19301 + }, + { + "epoch": 1.5636746597537265, + "grad_norm": 0.051670897752046585, + "learning_rate": 0.00013535262613078897, + "loss": 0.2525, + "step": 19302 + }, + { + "epoch": 1.5637556707712248, + "grad_norm": 0.043443743139505386, + "learning_rate": 0.00013534812547819436, + "loss": 0.2671, + "step": 19303 + }, + { + "epoch": 1.5638366817887233, + "grad_norm": 0.0499313585460186, + "learning_rate": 0.00013534362482559972, + "loss": 0.3068, + "step": 19304 + }, + { + "epoch": 1.5639176928062217, + "grad_norm": 0.057684704661369324, + "learning_rate": 0.0001353391241730051, + "loss": 0.3108, + "step": 19305 + }, + { + "epoch": 1.56399870382372, + "grad_norm": 0.0660921260714531, + "learning_rate": 0.00013533462352041046, + "loss": 0.2984, + "step": 19306 + }, + { + "epoch": 1.5640797148412184, + "grad_norm": 0.048565756529569626, + "learning_rate": 0.00013533012286781582, + "loss": 0.2767, + "step": 19307 + }, + { + "epoch": 1.564160725858717, + "grad_norm": 0.04774582386016846, + "learning_rate": 0.0001353256222152212, + "loss": 0.2724, + "step": 19308 + }, + { + "epoch": 1.5642417368762151, + "grad_norm": 0.048284415155649185, + "learning_rate": 0.0001353211215626266, + "loss": 0.2749, + "step": 19309 + }, + { + "epoch": 1.5643227478937134, + "grad_norm": 0.04952572286128998, + "learning_rate": 0.00013531662091003196, + "loss": 0.304, + "step": 19310 + }, + { + "epoch": 1.564403758911212, + "grad_norm": 0.047845348715782166, + "learning_rate": 0.00013531212025743734, + "loss": 0.2561, + "step": 19311 + }, + { + "epoch": 
1.5644847699287103, + "grad_norm": 0.04719892144203186, + "learning_rate": 0.0001353076196048427, + "loss": 0.3392, + "step": 19312 + }, + { + "epoch": 1.5645657809462086, + "grad_norm": 0.045906711369752884, + "learning_rate": 0.00013530311895224806, + "loss": 0.2525, + "step": 19313 + }, + { + "epoch": 1.564646791963707, + "grad_norm": 0.0455571748316288, + "learning_rate": 0.00013529861829965345, + "loss": 0.2435, + "step": 19314 + }, + { + "epoch": 1.5647278029812055, + "grad_norm": 0.048583488911390305, + "learning_rate": 0.00013529411764705884, + "loss": 0.3033, + "step": 19315 + }, + { + "epoch": 1.5648088139987038, + "grad_norm": 0.055878374725580215, + "learning_rate": 0.0001352896169944642, + "loss": 0.3001, + "step": 19316 + }, + { + "epoch": 1.5648898250162022, + "grad_norm": 0.04683026298880577, + "learning_rate": 0.00013528511634186959, + "loss": 0.2882, + "step": 19317 + }, + { + "epoch": 1.5649708360337007, + "grad_norm": 0.055208344012498856, + "learning_rate": 0.00013528061568927495, + "loss": 0.3108, + "step": 19318 + }, + { + "epoch": 1.565051847051199, + "grad_norm": 0.06561359763145447, + "learning_rate": 0.0001352761150366803, + "loss": 0.2945, + "step": 19319 + }, + { + "epoch": 1.5651328580686974, + "grad_norm": 0.05128493160009384, + "learning_rate": 0.00013527161438408572, + "loss": 0.2801, + "step": 19320 + }, + { + "epoch": 1.5652138690861959, + "grad_norm": 0.04719381034374237, + "learning_rate": 0.00013526711373149108, + "loss": 0.2506, + "step": 19321 + }, + { + "epoch": 1.565294880103694, + "grad_norm": 0.05548151209950447, + "learning_rate": 0.00013526261307889644, + "loss": 0.2729, + "step": 19322 + }, + { + "epoch": 1.5653758911211924, + "grad_norm": 0.05958663299679756, + "learning_rate": 0.00013525811242630183, + "loss": 0.2758, + "step": 19323 + }, + { + "epoch": 1.5654569021386908, + "grad_norm": 0.04910269379615784, + "learning_rate": 0.0001352536117737072, + "loss": 0.2814, + "step": 19324 + }, + { + "epoch": 1.5655379131561893, + "grad_norm": 0.05134573206305504, + "learning_rate": 0.00013524911112111255, + "loss": 0.2896, + "step": 19325 + }, + { + "epoch": 1.5656189241736875, + "grad_norm": 0.05062694102525711, + "learning_rate": 0.00013524461046851796, + "loss": 0.2615, + "step": 19326 + }, + { + "epoch": 1.565699935191186, + "grad_norm": 0.05874665826559067, + "learning_rate": 0.00013524010981592332, + "loss": 0.2627, + "step": 19327 + }, + { + "epoch": 1.5657809462086845, + "grad_norm": 0.05786015838384628, + "learning_rate": 0.00013523560916332868, + "loss": 0.2801, + "step": 19328 + }, + { + "epoch": 1.5658619572261827, + "grad_norm": 0.05415690690279007, + "learning_rate": 0.00013523110851073407, + "loss": 0.3098, + "step": 19329 + }, + { + "epoch": 1.5659429682436812, + "grad_norm": 0.05289224535226822, + "learning_rate": 0.00013522660785813943, + "loss": 0.3232, + "step": 19330 + }, + { + "epoch": 1.5660239792611796, + "grad_norm": 0.04391355440020561, + "learning_rate": 0.0001352221072055448, + "loss": 0.2806, + "step": 19331 + }, + { + "epoch": 1.5661049902786779, + "grad_norm": 0.04492342099547386, + "learning_rate": 0.0001352176065529502, + "loss": 0.2628, + "step": 19332 + }, + { + "epoch": 1.5661860012961761, + "grad_norm": 0.047401949763298035, + "learning_rate": 0.00013521310590035556, + "loss": 0.3076, + "step": 19333 + }, + { + "epoch": 1.5662670123136748, + "grad_norm": 0.05914795398712158, + "learning_rate": 0.00013520860524776092, + "loss": 0.2709, + "step": 19334 + }, + { + "epoch": 1.566348023331173, + "grad_norm": 
0.052210431545972824, + "learning_rate": 0.0001352041045951663, + "loss": 0.2864, + "step": 19335 + }, + { + "epoch": 1.5664290343486713, + "grad_norm": 0.0464906245470047, + "learning_rate": 0.00013519960394257167, + "loss": 0.2522, + "step": 19336 + }, + { + "epoch": 1.5665100453661698, + "grad_norm": 0.05061884596943855, + "learning_rate": 0.00013519510328997703, + "loss": 0.2815, + "step": 19337 + }, + { + "epoch": 1.5665910563836682, + "grad_norm": 0.05345157906413078, + "learning_rate": 0.00013519060263738245, + "loss": 0.2781, + "step": 19338 + }, + { + "epoch": 1.5666720674011665, + "grad_norm": 0.048576343804597855, + "learning_rate": 0.0001351861019847878, + "loss": 0.3225, + "step": 19339 + }, + { + "epoch": 1.566753078418665, + "grad_norm": 0.05282697454094887, + "learning_rate": 0.00013518160133219317, + "loss": 0.2956, + "step": 19340 + }, + { + "epoch": 1.5668340894361634, + "grad_norm": 0.05145876109600067, + "learning_rate": 0.00013517710067959855, + "loss": 0.2932, + "step": 19341 + }, + { + "epoch": 1.5669151004536617, + "grad_norm": 0.05020388588309288, + "learning_rate": 0.0001351726000270039, + "loss": 0.2749, + "step": 19342 + }, + { + "epoch": 1.56699611147116, + "grad_norm": 0.05083455145359039, + "learning_rate": 0.00013516809937440927, + "loss": 0.2608, + "step": 19343 + }, + { + "epoch": 1.5670771224886586, + "grad_norm": 0.04994390159845352, + "learning_rate": 0.0001351635987218147, + "loss": 0.2783, + "step": 19344 + }, + { + "epoch": 1.5671581335061568, + "grad_norm": 0.05404075235128403, + "learning_rate": 0.00013515909806922005, + "loss": 0.2697, + "step": 19345 + }, + { + "epoch": 1.567239144523655, + "grad_norm": 0.05101686343550682, + "learning_rate": 0.0001351545974166254, + "loss": 0.2923, + "step": 19346 + }, + { + "epoch": 1.5673201555411536, + "grad_norm": 0.04621126502752304, + "learning_rate": 0.0001351500967640308, + "loss": 0.3011, + "step": 19347 + }, + { + "epoch": 1.567401166558652, + "grad_norm": 0.058542583137750626, + "learning_rate": 0.00013514559611143615, + "loss": 0.2787, + "step": 19348 + }, + { + "epoch": 1.5674821775761503, + "grad_norm": 0.055109862238168716, + "learning_rate": 0.00013514109545884154, + "loss": 0.324, + "step": 19349 + }, + { + "epoch": 1.5675631885936487, + "grad_norm": 0.042174868285655975, + "learning_rate": 0.00013513659480624693, + "loss": 0.2538, + "step": 19350 + }, + { + "epoch": 1.5676441996111472, + "grad_norm": 0.05382600799202919, + "learning_rate": 0.0001351320941536523, + "loss": 0.283, + "step": 19351 + }, + { + "epoch": 1.5677252106286454, + "grad_norm": 0.049266114830970764, + "learning_rate": 0.00013512759350105765, + "loss": 0.2964, + "step": 19352 + }, + { + "epoch": 1.567806221646144, + "grad_norm": 0.04278041794896126, + "learning_rate": 0.00013512309284846304, + "loss": 0.2726, + "step": 19353 + }, + { + "epoch": 1.5678872326636424, + "grad_norm": 0.05372599512338638, + "learning_rate": 0.0001351185921958684, + "loss": 0.3168, + "step": 19354 + }, + { + "epoch": 1.5679682436811406, + "grad_norm": 0.05103114992380142, + "learning_rate": 0.00013511409154327378, + "loss": 0.2903, + "step": 19355 + }, + { + "epoch": 1.5680492546986389, + "grad_norm": 0.053942613303661346, + "learning_rate": 0.00013510959089067917, + "loss": 0.2771, + "step": 19356 + }, + { + "epoch": 1.5681302657161373, + "grad_norm": 0.04800404980778694, + "learning_rate": 0.00013510509023808453, + "loss": 0.3018, + "step": 19357 + }, + { + "epoch": 1.5682112767336358, + "grad_norm": 0.04979465529322624, + "learning_rate": 
0.0001351005895854899, + "loss": 0.2866, + "step": 19358 + }, + { + "epoch": 1.568292287751134, + "grad_norm": 0.047334734350442886, + "learning_rate": 0.00013509608893289528, + "loss": 0.2854, + "step": 19359 + }, + { + "epoch": 1.5683732987686325, + "grad_norm": 0.04779450595378876, + "learning_rate": 0.00013509158828030064, + "loss": 0.2819, + "step": 19360 + }, + { + "epoch": 1.568454309786131, + "grad_norm": 0.0574072003364563, + "learning_rate": 0.00013508708762770602, + "loss": 0.2799, + "step": 19361 + }, + { + "epoch": 1.5685353208036292, + "grad_norm": 0.0440792478621006, + "learning_rate": 0.0001350825869751114, + "loss": 0.2604, + "step": 19362 + }, + { + "epoch": 1.5686163318211277, + "grad_norm": 0.06242513284087181, + "learning_rate": 0.00013507808632251677, + "loss": 0.3305, + "step": 19363 + }, + { + "epoch": 1.5686973428386262, + "grad_norm": 0.06059674918651581, + "learning_rate": 0.00013507358566992213, + "loss": 0.2981, + "step": 19364 + }, + { + "epoch": 1.5687783538561244, + "grad_norm": 0.04905066266655922, + "learning_rate": 0.00013506908501732752, + "loss": 0.2793, + "step": 19365 + }, + { + "epoch": 1.5688593648736227, + "grad_norm": 0.04303139075636864, + "learning_rate": 0.00013506458436473288, + "loss": 0.2432, + "step": 19366 + }, + { + "epoch": 1.5689403758911213, + "grad_norm": 0.04874027520418167, + "learning_rate": 0.00013506008371213827, + "loss": 0.2696, + "step": 19367 + }, + { + "epoch": 1.5690213869086196, + "grad_norm": 0.05774131789803505, + "learning_rate": 0.00013505558305954365, + "loss": 0.2821, + "step": 19368 + }, + { + "epoch": 1.5691023979261178, + "grad_norm": 0.04771944135427475, + "learning_rate": 0.000135051082406949, + "loss": 0.2732, + "step": 19369 + }, + { + "epoch": 1.5691834089436163, + "grad_norm": 0.054138049483299255, + "learning_rate": 0.00013504658175435437, + "loss": 0.3147, + "step": 19370 + }, + { + "epoch": 1.5692644199611148, + "grad_norm": 0.05368463322520256, + "learning_rate": 0.00013504208110175976, + "loss": 0.2969, + "step": 19371 + }, + { + "epoch": 1.569345430978613, + "grad_norm": 0.05088932812213898, + "learning_rate": 0.00013503758044916515, + "loss": 0.2483, + "step": 19372 + }, + { + "epoch": 1.5694264419961115, + "grad_norm": 0.049424633383750916, + "learning_rate": 0.0001350330797965705, + "loss": 0.2875, + "step": 19373 + }, + { + "epoch": 1.56950745301361, + "grad_norm": 0.05756201595067978, + "learning_rate": 0.0001350285791439759, + "loss": 0.3183, + "step": 19374 + }, + { + "epoch": 1.5695884640311082, + "grad_norm": 0.047381263226270676, + "learning_rate": 0.00013502407849138125, + "loss": 0.271, + "step": 19375 + }, + { + "epoch": 1.5696694750486067, + "grad_norm": 0.053029779344797134, + "learning_rate": 0.00013501957783878662, + "loss": 0.299, + "step": 19376 + }, + { + "epoch": 1.5697504860661051, + "grad_norm": 0.053522516041994095, + "learning_rate": 0.000135015077186192, + "loss": 0.3147, + "step": 19377 + }, + { + "epoch": 1.5698314970836034, + "grad_norm": 0.04558708518743515, + "learning_rate": 0.0001350105765335974, + "loss": 0.2693, + "step": 19378 + }, + { + "epoch": 1.5699125081011016, + "grad_norm": 0.055690888315439224, + "learning_rate": 0.00013500607588100275, + "loss": 0.3297, + "step": 19379 + }, + { + "epoch": 1.5699935191186, + "grad_norm": 0.0678800642490387, + "learning_rate": 0.00013500157522840814, + "loss": 0.3371, + "step": 19380 + }, + { + "epoch": 1.5700745301360985, + "grad_norm": 0.04769781231880188, + "learning_rate": 0.0001349970745758135, + "loss": 0.2699, + "step": 
19381 + }, + { + "epoch": 1.5701555411535968, + "grad_norm": 0.05501679331064224, + "learning_rate": 0.00013499257392321886, + "loss": 0.2611, + "step": 19382 + }, + { + "epoch": 1.5702365521710953, + "grad_norm": 0.05800247937440872, + "learning_rate": 0.00013498807327062424, + "loss": 0.312, + "step": 19383 + }, + { + "epoch": 1.5703175631885937, + "grad_norm": 0.05244293808937073, + "learning_rate": 0.00013498357261802963, + "loss": 0.2793, + "step": 19384 + }, + { + "epoch": 1.570398574206092, + "grad_norm": 0.04592202231287956, + "learning_rate": 0.000134979071965435, + "loss": 0.2493, + "step": 19385 + }, + { + "epoch": 1.5704795852235904, + "grad_norm": 0.059504128992557526, + "learning_rate": 0.00013497457131284038, + "loss": 0.3381, + "step": 19386 + }, + { + "epoch": 1.570560596241089, + "grad_norm": 0.04630604386329651, + "learning_rate": 0.00013497007066024574, + "loss": 0.3126, + "step": 19387 + }, + { + "epoch": 1.5706416072585871, + "grad_norm": 0.05985010787844658, + "learning_rate": 0.0001349655700076511, + "loss": 0.3084, + "step": 19388 + }, + { + "epoch": 1.5707226182760854, + "grad_norm": 0.04947219043970108, + "learning_rate": 0.00013496106935505649, + "loss": 0.2638, + "step": 19389 + }, + { + "epoch": 1.570803629293584, + "grad_norm": 0.05288197100162506, + "learning_rate": 0.00013495656870246187, + "loss": 0.2529, + "step": 19390 + }, + { + "epoch": 1.5708846403110823, + "grad_norm": 0.04978443309664726, + "learning_rate": 0.00013495206804986723, + "loss": 0.2473, + "step": 19391 + }, + { + "epoch": 1.5709656513285806, + "grad_norm": 0.05209624394774437, + "learning_rate": 0.00013494756739727262, + "loss": 0.2942, + "step": 19392 + }, + { + "epoch": 1.571046662346079, + "grad_norm": 0.04603233560919762, + "learning_rate": 0.00013494306674467798, + "loss": 0.2777, + "step": 19393 + }, + { + "epoch": 1.5711276733635775, + "grad_norm": 0.052153099328279495, + "learning_rate": 0.00013493856609208334, + "loss": 0.2729, + "step": 19394 + }, + { + "epoch": 1.5712086843810757, + "grad_norm": 0.044548213481903076, + "learning_rate": 0.00013493406543948873, + "loss": 0.2765, + "step": 19395 + }, + { + "epoch": 1.5712896953985742, + "grad_norm": 0.04247027635574341, + "learning_rate": 0.00013492956478689411, + "loss": 0.2657, + "step": 19396 + }, + { + "epoch": 1.5713707064160727, + "grad_norm": 0.05424690991640091, + "learning_rate": 0.00013492506413429947, + "loss": 0.3083, + "step": 19397 + }, + { + "epoch": 1.571451717433571, + "grad_norm": 0.05457203835248947, + "learning_rate": 0.00013492056348170486, + "loss": 0.2551, + "step": 19398 + }, + { + "epoch": 1.5715327284510694, + "grad_norm": 0.054207317531108856, + "learning_rate": 0.00013491606282911022, + "loss": 0.3004, + "step": 19399 + }, + { + "epoch": 1.5716137394685679, + "grad_norm": 0.05946018546819687, + "learning_rate": 0.00013491156217651558, + "loss": 0.3862, + "step": 19400 + }, + { + "epoch": 1.571694750486066, + "grad_norm": 0.05369342491030693, + "learning_rate": 0.000134907061523921, + "loss": 0.2731, + "step": 19401 + }, + { + "epoch": 1.5717757615035644, + "grad_norm": 0.04194348305463791, + "learning_rate": 0.00013490256087132636, + "loss": 0.2552, + "step": 19402 + }, + { + "epoch": 1.5718567725210628, + "grad_norm": 0.048225175589323044, + "learning_rate": 0.00013489806021873172, + "loss": 0.2676, + "step": 19403 + }, + { + "epoch": 1.5719377835385613, + "grad_norm": 0.04498755931854248, + "learning_rate": 0.0001348935595661371, + "loss": 0.2717, + "step": 19404 + }, + { + "epoch": 
1.5720187945560595, + "grad_norm": 0.048275288194417953, + "learning_rate": 0.00013488905891354246, + "loss": 0.2988, + "step": 19405 + }, + { + "epoch": 1.572099805573558, + "grad_norm": 0.05003470182418823, + "learning_rate": 0.00013488455826094782, + "loss": 0.3039, + "step": 19406 + }, + { + "epoch": 1.5721808165910565, + "grad_norm": 0.04922705143690109, + "learning_rate": 0.00013488005760835324, + "loss": 0.3035, + "step": 19407 + }, + { + "epoch": 1.5722618276085547, + "grad_norm": 0.06346455216407776, + "learning_rate": 0.0001348755569557586, + "loss": 0.3198, + "step": 19408 + }, + { + "epoch": 1.5723428386260532, + "grad_norm": 0.052734699100255966, + "learning_rate": 0.00013487105630316396, + "loss": 0.2844, + "step": 19409 + }, + { + "epoch": 1.5724238496435516, + "grad_norm": 0.05498180910944939, + "learning_rate": 0.00013486655565056934, + "loss": 0.3042, + "step": 19410 + }, + { + "epoch": 1.5725048606610499, + "grad_norm": 0.040842171758413315, + "learning_rate": 0.0001348620549979747, + "loss": 0.2255, + "step": 19411 + }, + { + "epoch": 1.5725858716785481, + "grad_norm": 0.047706086188554764, + "learning_rate": 0.00013485755434538006, + "loss": 0.2813, + "step": 19412 + }, + { + "epoch": 1.5726668826960468, + "grad_norm": 0.04232097789645195, + "learning_rate": 0.00013485305369278548, + "loss": 0.2824, + "step": 19413 + }, + { + "epoch": 1.572747893713545, + "grad_norm": 0.04460098221898079, + "learning_rate": 0.00013484855304019084, + "loss": 0.2744, + "step": 19414 + }, + { + "epoch": 1.5728289047310433, + "grad_norm": 0.06017705798149109, + "learning_rate": 0.0001348440523875962, + "loss": 0.2867, + "step": 19415 + }, + { + "epoch": 1.5729099157485418, + "grad_norm": 0.05444073677062988, + "learning_rate": 0.00013483955173500159, + "loss": 0.2994, + "step": 19416 + }, + { + "epoch": 1.5729909267660402, + "grad_norm": 0.047964904457330704, + "learning_rate": 0.00013483505108240695, + "loss": 0.2811, + "step": 19417 + }, + { + "epoch": 1.5730719377835385, + "grad_norm": 0.049424637109041214, + "learning_rate": 0.0001348305504298123, + "loss": 0.2927, + "step": 19418 + }, + { + "epoch": 1.573152948801037, + "grad_norm": 0.05039997026324272, + "learning_rate": 0.00013482604977721772, + "loss": 0.2835, + "step": 19419 + }, + { + "epoch": 1.5732339598185354, + "grad_norm": 0.04230662062764168, + "learning_rate": 0.00013482154912462308, + "loss": 0.2777, + "step": 19420 + }, + { + "epoch": 1.5733149708360337, + "grad_norm": 0.05108647048473358, + "learning_rate": 0.00013481704847202844, + "loss": 0.2719, + "step": 19421 + }, + { + "epoch": 1.5733959818535321, + "grad_norm": 0.045273154973983765, + "learning_rate": 0.00013481254781943383, + "loss": 0.2714, + "step": 19422 + }, + { + "epoch": 1.5734769928710306, + "grad_norm": 0.05585255101323128, + "learning_rate": 0.0001348080471668392, + "loss": 0.3056, + "step": 19423 + }, + { + "epoch": 1.5735580038885288, + "grad_norm": 0.04518725723028183, + "learning_rate": 0.00013480354651424455, + "loss": 0.2783, + "step": 19424 + }, + { + "epoch": 1.573639014906027, + "grad_norm": 0.0475885234773159, + "learning_rate": 0.00013479904586164996, + "loss": 0.3019, + "step": 19425 + }, + { + "epoch": 1.5737200259235256, + "grad_norm": 0.05177724361419678, + "learning_rate": 0.00013479454520905532, + "loss": 0.3063, + "step": 19426 + }, + { + "epoch": 1.573801036941024, + "grad_norm": 0.053175684064626694, + "learning_rate": 0.00013479004455646068, + "loss": 0.3155, + "step": 19427 + }, + { + "epoch": 1.5738820479585223, + "grad_norm": 
0.05215033143758774, + "learning_rate": 0.00013478554390386607, + "loss": 0.2957, + "step": 19428 + }, + { + "epoch": 1.5739630589760207, + "grad_norm": 0.04903462529182434, + "learning_rate": 0.00013478104325127143, + "loss": 0.2765, + "step": 19429 + }, + { + "epoch": 1.5740440699935192, + "grad_norm": 0.058999255299568176, + "learning_rate": 0.00013477654259867682, + "loss": 0.3445, + "step": 19430 + }, + { + "epoch": 1.5741250810110174, + "grad_norm": 0.042476482689380646, + "learning_rate": 0.0001347720419460822, + "loss": 0.2704, + "step": 19431 + }, + { + "epoch": 1.574206092028516, + "grad_norm": 0.05020613968372345, + "learning_rate": 0.00013476754129348756, + "loss": 0.2711, + "step": 19432 + }, + { + "epoch": 1.5742871030460144, + "grad_norm": 0.05600776523351669, + "learning_rate": 0.00013476304064089292, + "loss": 0.2579, + "step": 19433 + }, + { + "epoch": 1.5743681140635126, + "grad_norm": 0.07813206315040588, + "learning_rate": 0.0001347585399882983, + "loss": 0.393, + "step": 19434 + }, + { + "epoch": 1.5744491250810109, + "grad_norm": 0.055210404098033905, + "learning_rate": 0.00013475403933570367, + "loss": 0.3479, + "step": 19435 + }, + { + "epoch": 1.5745301360985096, + "grad_norm": 0.042050011456012726, + "learning_rate": 0.00013474953868310906, + "loss": 0.2318, + "step": 19436 + }, + { + "epoch": 1.5746111471160078, + "grad_norm": 0.05371921509504318, + "learning_rate": 0.00013474503803051445, + "loss": 0.2841, + "step": 19437 + }, + { + "epoch": 1.574692158133506, + "grad_norm": 0.05059438198804855, + "learning_rate": 0.0001347405373779198, + "loss": 0.2818, + "step": 19438 + }, + { + "epoch": 1.5747731691510045, + "grad_norm": 0.054289672523736954, + "learning_rate": 0.00013473603672532517, + "loss": 0.3056, + "step": 19439 + }, + { + "epoch": 1.574854180168503, + "grad_norm": 0.049676887691020966, + "learning_rate": 0.00013473153607273055, + "loss": 0.2827, + "step": 19440 + }, + { + "epoch": 1.5749351911860012, + "grad_norm": 0.055509135127067566, + "learning_rate": 0.0001347270354201359, + "loss": 0.3255, + "step": 19441 + }, + { + "epoch": 1.5750162022034997, + "grad_norm": 0.04096323624253273, + "learning_rate": 0.0001347225347675413, + "loss": 0.2607, + "step": 19442 + }, + { + "epoch": 1.5750972132209982, + "grad_norm": 0.05271543562412262, + "learning_rate": 0.0001347180341149467, + "loss": 0.2707, + "step": 19443 + }, + { + "epoch": 1.5751782242384964, + "grad_norm": 0.05153711512684822, + "learning_rate": 0.00013471353346235205, + "loss": 0.291, + "step": 19444 + }, + { + "epoch": 1.5752592352559946, + "grad_norm": 0.05824122950434685, + "learning_rate": 0.0001347090328097574, + "loss": 0.2906, + "step": 19445 + }, + { + "epoch": 1.5753402462734933, + "grad_norm": 0.055363208055496216, + "learning_rate": 0.0001347045321571628, + "loss": 0.2877, + "step": 19446 + }, + { + "epoch": 1.5754212572909916, + "grad_norm": 0.048949237912893295, + "learning_rate": 0.00013470003150456815, + "loss": 0.289, + "step": 19447 + }, + { + "epoch": 1.5755022683084898, + "grad_norm": 0.05374382436275482, + "learning_rate": 0.00013469553085197354, + "loss": 0.2762, + "step": 19448 + }, + { + "epoch": 1.5755832793259883, + "grad_norm": 0.052326034754514694, + "learning_rate": 0.00013469103019937893, + "loss": 0.2752, + "step": 19449 + }, + { + "epoch": 1.5756642903434868, + "grad_norm": 0.052585724741220474, + "learning_rate": 0.0001346865295467843, + "loss": 0.3296, + "step": 19450 + }, + { + "epoch": 1.575745301360985, + "grad_norm": 0.06618500500917435, + "learning_rate": 
0.00013468202889418965, + "loss": 0.2835, + "step": 19451 + }, + { + "epoch": 1.5758263123784835, + "grad_norm": 0.046448152512311935, + "learning_rate": 0.00013467752824159504, + "loss": 0.3083, + "step": 19452 + }, + { + "epoch": 1.575907323395982, + "grad_norm": 0.053853485733270645, + "learning_rate": 0.00013467302758900042, + "loss": 0.3046, + "step": 19453 + }, + { + "epoch": 1.5759883344134802, + "grad_norm": 0.056492555886507034, + "learning_rate": 0.00013466852693640578, + "loss": 0.3049, + "step": 19454 + }, + { + "epoch": 1.5760693454309787, + "grad_norm": 0.06340819597244263, + "learning_rate": 0.00013466402628381117, + "loss": 0.3437, + "step": 19455 + }, + { + "epoch": 1.5761503564484771, + "grad_norm": 0.04808042198419571, + "learning_rate": 0.00013465952563121653, + "loss": 0.2704, + "step": 19456 + }, + { + "epoch": 1.5762313674659754, + "grad_norm": 0.04911893606185913, + "learning_rate": 0.0001346550249786219, + "loss": 0.2651, + "step": 19457 + }, + { + "epoch": 1.5763123784834736, + "grad_norm": 0.05891195312142372, + "learning_rate": 0.00013465052432602728, + "loss": 0.269, + "step": 19458 + }, + { + "epoch": 1.5763933895009723, + "grad_norm": 0.06033793464303017, + "learning_rate": 0.00013464602367343266, + "loss": 0.2942, + "step": 19459 + }, + { + "epoch": 1.5764744005184705, + "grad_norm": 0.05624804645776749, + "learning_rate": 0.00013464152302083802, + "loss": 0.2491, + "step": 19460 + }, + { + "epoch": 1.5765554115359688, + "grad_norm": 0.054478585720062256, + "learning_rate": 0.0001346370223682434, + "loss": 0.2932, + "step": 19461 + }, + { + "epoch": 1.5766364225534673, + "grad_norm": 0.050502706319093704, + "learning_rate": 0.00013463252171564877, + "loss": 0.2737, + "step": 19462 + }, + { + "epoch": 1.5767174335709657, + "grad_norm": 0.05171595513820648, + "learning_rate": 0.00013462802106305413, + "loss": 0.2842, + "step": 19463 + }, + { + "epoch": 1.576798444588464, + "grad_norm": 0.04873314127326012, + "learning_rate": 0.00013462352041045952, + "loss": 0.2706, + "step": 19464 + }, + { + "epoch": 1.5768794556059624, + "grad_norm": 0.052232805639505386, + "learning_rate": 0.0001346190197578649, + "loss": 0.2655, + "step": 19465 + }, + { + "epoch": 1.576960466623461, + "grad_norm": 0.04653653874993324, + "learning_rate": 0.00013461451910527027, + "loss": 0.2775, + "step": 19466 + }, + { + "epoch": 1.5770414776409591, + "grad_norm": 0.055312540382146835, + "learning_rate": 0.00013461001845267565, + "loss": 0.3084, + "step": 19467 + }, + { + "epoch": 1.5771224886584574, + "grad_norm": 0.06885188072919846, + "learning_rate": 0.00013460551780008101, + "loss": 0.2853, + "step": 19468 + }, + { + "epoch": 1.577203499675956, + "grad_norm": 0.055660393089056015, + "learning_rate": 0.00013460101714748637, + "loss": 0.3098, + "step": 19469 + }, + { + "epoch": 1.5772845106934543, + "grad_norm": 0.04958134889602661, + "learning_rate": 0.00013459651649489176, + "loss": 0.2592, + "step": 19470 + }, + { + "epoch": 1.5773655217109526, + "grad_norm": 0.04702504724264145, + "learning_rate": 0.00013459201584229715, + "loss": 0.2825, + "step": 19471 + }, + { + "epoch": 1.577446532728451, + "grad_norm": 0.04876773804426193, + "learning_rate": 0.0001345875151897025, + "loss": 0.2493, + "step": 19472 + }, + { + "epoch": 1.5775275437459495, + "grad_norm": 0.05104687064886093, + "learning_rate": 0.0001345830145371079, + "loss": 0.3036, + "step": 19473 + }, + { + "epoch": 1.5776085547634477, + "grad_norm": 0.048596564680337906, + "learning_rate": 0.00013457851388451326, + "loss": 
0.2729, + "step": 19474 + }, + { + "epoch": 1.5776895657809462, + "grad_norm": 0.05432022362947464, + "learning_rate": 0.00013457401323191862, + "loss": 0.3191, + "step": 19475 + }, + { + "epoch": 1.5777705767984447, + "grad_norm": 0.05890238285064697, + "learning_rate": 0.000134569512579324, + "loss": 0.3281, + "step": 19476 + }, + { + "epoch": 1.577851587815943, + "grad_norm": 0.0422152616083622, + "learning_rate": 0.0001345650119267294, + "loss": 0.2609, + "step": 19477 + }, + { + "epoch": 1.5779325988334414, + "grad_norm": 0.048545897006988525, + "learning_rate": 0.00013456051127413475, + "loss": 0.2838, + "step": 19478 + }, + { + "epoch": 1.5780136098509399, + "grad_norm": 0.048626262694597244, + "learning_rate": 0.00013455601062154014, + "loss": 0.2753, + "step": 19479 + }, + { + "epoch": 1.578094620868438, + "grad_norm": 0.0606694258749485, + "learning_rate": 0.0001345515099689455, + "loss": 0.3308, + "step": 19480 + }, + { + "epoch": 1.5781756318859363, + "grad_norm": 0.04808412119746208, + "learning_rate": 0.00013454700931635086, + "loss": 0.2622, + "step": 19481 + }, + { + "epoch": 1.5782566429034348, + "grad_norm": 0.056980349123477936, + "learning_rate": 0.00013454250866375627, + "loss": 0.3184, + "step": 19482 + }, + { + "epoch": 1.5783376539209333, + "grad_norm": 0.047187671065330505, + "learning_rate": 0.00013453800801116163, + "loss": 0.2792, + "step": 19483 + }, + { + "epoch": 1.5784186649384315, + "grad_norm": 0.05486692488193512, + "learning_rate": 0.000134533507358567, + "loss": 0.2921, + "step": 19484 + }, + { + "epoch": 1.57849967595593, + "grad_norm": 0.042787306010723114, + "learning_rate": 0.00013452900670597238, + "loss": 0.2742, + "step": 19485 + }, + { + "epoch": 1.5785806869734285, + "grad_norm": 0.04654007405042648, + "learning_rate": 0.00013452450605337774, + "loss": 0.2752, + "step": 19486 + }, + { + "epoch": 1.5786616979909267, + "grad_norm": 0.04305479675531387, + "learning_rate": 0.0001345200054007831, + "loss": 0.2936, + "step": 19487 + }, + { + "epoch": 1.5787427090084252, + "grad_norm": 0.04199616238474846, + "learning_rate": 0.0001345155047481885, + "loss": 0.251, + "step": 19488 + }, + { + "epoch": 1.5788237200259236, + "grad_norm": 0.055879976600408554, + "learning_rate": 0.00013451100409559387, + "loss": 0.3394, + "step": 19489 + }, + { + "epoch": 1.5789047310434219, + "grad_norm": 0.04630248621106148, + "learning_rate": 0.00013450650344299923, + "loss": 0.2817, + "step": 19490 + }, + { + "epoch": 1.5789857420609201, + "grad_norm": 0.06566954404115677, + "learning_rate": 0.00013450200279040462, + "loss": 0.2839, + "step": 19491 + }, + { + "epoch": 1.5790667530784188, + "grad_norm": 0.058554135262966156, + "learning_rate": 0.00013449750213780998, + "loss": 0.2627, + "step": 19492 + }, + { + "epoch": 1.579147764095917, + "grad_norm": 0.05380035191774368, + "learning_rate": 0.00013449300148521534, + "loss": 0.2689, + "step": 19493 + }, + { + "epoch": 1.5792287751134153, + "grad_norm": 0.042370859533548355, + "learning_rate": 0.00013448850083262075, + "loss": 0.2608, + "step": 19494 + }, + { + "epoch": 1.5793097861309138, + "grad_norm": 0.05102236941456795, + "learning_rate": 0.00013448400018002611, + "loss": 0.2601, + "step": 19495 + }, + { + "epoch": 1.5793907971484122, + "grad_norm": 0.05376815423369408, + "learning_rate": 0.00013447949952743147, + "loss": 0.2965, + "step": 19496 + }, + { + "epoch": 1.5794718081659105, + "grad_norm": 0.055364444851875305, + "learning_rate": 0.00013447499887483686, + "loss": 0.3266, + "step": 19497 + }, + { + 
"epoch": 1.579552819183409, + "grad_norm": 0.0610194057226181, + "learning_rate": 0.00013447049822224222, + "loss": 0.2914, + "step": 19498 + }, + { + "epoch": 1.5796338302009074, + "grad_norm": 0.051366176456213, + "learning_rate": 0.00013446599756964758, + "loss": 0.2758, + "step": 19499 + }, + { + "epoch": 1.5797148412184057, + "grad_norm": 0.05474573001265526, + "learning_rate": 0.000134461496917053, + "loss": 0.264, + "step": 19500 + }, + { + "epoch": 1.5797958522359041, + "grad_norm": 0.058302175253629684, + "learning_rate": 0.00013445699626445836, + "loss": 0.2763, + "step": 19501 + }, + { + "epoch": 1.5798768632534026, + "grad_norm": 0.04867679625749588, + "learning_rate": 0.00013445249561186372, + "loss": 0.2755, + "step": 19502 + }, + { + "epoch": 1.5799578742709008, + "grad_norm": 0.051118914037942886, + "learning_rate": 0.0001344479949592691, + "loss": 0.2846, + "step": 19503 + }, + { + "epoch": 1.580038885288399, + "grad_norm": 0.05177982151508331, + "learning_rate": 0.00013444349430667446, + "loss": 0.2764, + "step": 19504 + }, + { + "epoch": 1.5801198963058976, + "grad_norm": 0.050728704780340195, + "learning_rate": 0.00013443899365407985, + "loss": 0.2439, + "step": 19505 + }, + { + "epoch": 1.580200907323396, + "grad_norm": 0.055721383541822433, + "learning_rate": 0.00013443449300148524, + "loss": 0.317, + "step": 19506 + }, + { + "epoch": 1.5802819183408943, + "grad_norm": 0.0517570935189724, + "learning_rate": 0.0001344299923488906, + "loss": 0.2538, + "step": 19507 + }, + { + "epoch": 1.5803629293583927, + "grad_norm": 0.04586503654718399, + "learning_rate": 0.00013442549169629596, + "loss": 0.308, + "step": 19508 + }, + { + "epoch": 1.5804439403758912, + "grad_norm": 0.03953614458441734, + "learning_rate": 0.00013442099104370134, + "loss": 0.2506, + "step": 19509 + }, + { + "epoch": 1.5805249513933894, + "grad_norm": 0.05850844085216522, + "learning_rate": 0.0001344164903911067, + "loss": 0.3596, + "step": 19510 + }, + { + "epoch": 1.580605962410888, + "grad_norm": 0.05324858799576759, + "learning_rate": 0.0001344119897385121, + "loss": 0.3396, + "step": 19511 + }, + { + "epoch": 1.5806869734283864, + "grad_norm": 0.04520611837506294, + "learning_rate": 0.00013440748908591748, + "loss": 0.2488, + "step": 19512 + }, + { + "epoch": 1.5807679844458846, + "grad_norm": 0.0491422601044178, + "learning_rate": 0.00013440298843332284, + "loss": 0.2924, + "step": 19513 + }, + { + "epoch": 1.5808489954633829, + "grad_norm": 0.05382819101214409, + "learning_rate": 0.0001343984877807282, + "loss": 0.2977, + "step": 19514 + }, + { + "epoch": 1.5809300064808816, + "grad_norm": 0.04993279650807381, + "learning_rate": 0.0001343939871281336, + "loss": 0.3074, + "step": 19515 + }, + { + "epoch": 1.5810110174983798, + "grad_norm": 0.0456116609275341, + "learning_rate": 0.00013438948647553895, + "loss": 0.3097, + "step": 19516 + }, + { + "epoch": 1.581092028515878, + "grad_norm": 0.05514628440141678, + "learning_rate": 0.00013438498582294433, + "loss": 0.3011, + "step": 19517 + }, + { + "epoch": 1.5811730395333765, + "grad_norm": 0.05581878498196602, + "learning_rate": 0.00013438048517034972, + "loss": 0.3181, + "step": 19518 + }, + { + "epoch": 1.581254050550875, + "grad_norm": 0.052173078060150146, + "learning_rate": 0.00013437598451775508, + "loss": 0.2768, + "step": 19519 + }, + { + "epoch": 1.5813350615683732, + "grad_norm": 0.04671182110905647, + "learning_rate": 0.00013437148386516044, + "loss": 0.2637, + "step": 19520 + }, + { + "epoch": 1.5814160725858717, + "grad_norm": 
0.051288314163684845, + "learning_rate": 0.00013436698321256583, + "loss": 0.2879, + "step": 19521 + }, + { + "epoch": 1.5814970836033702, + "grad_norm": 0.055354394018650055, + "learning_rate": 0.0001343624825599712, + "loss": 0.3061, + "step": 19522 + }, + { + "epoch": 1.5815780946208684, + "grad_norm": 0.05607502534985542, + "learning_rate": 0.00013435798190737658, + "loss": 0.2576, + "step": 19523 + }, + { + "epoch": 1.5816591056383669, + "grad_norm": 0.047366395592689514, + "learning_rate": 0.00013435348125478196, + "loss": 0.2879, + "step": 19524 + }, + { + "epoch": 1.5817401166558653, + "grad_norm": 0.048803966492414474, + "learning_rate": 0.00013434898060218732, + "loss": 0.2782, + "step": 19525 + }, + { + "epoch": 1.5818211276733636, + "grad_norm": 0.05734861642122269, + "learning_rate": 0.00013434447994959268, + "loss": 0.3287, + "step": 19526 + }, + { + "epoch": 1.5819021386908618, + "grad_norm": 0.04951030761003494, + "learning_rate": 0.00013433997929699807, + "loss": 0.2746, + "step": 19527 + }, + { + "epoch": 1.5819831497083603, + "grad_norm": 0.051852792501449585, + "learning_rate": 0.00013433547864440343, + "loss": 0.2498, + "step": 19528 + }, + { + "epoch": 1.5820641607258588, + "grad_norm": 0.05859070271253586, + "learning_rate": 0.00013433097799180882, + "loss": 0.3057, + "step": 19529 + }, + { + "epoch": 1.582145171743357, + "grad_norm": 0.0653352215886116, + "learning_rate": 0.0001343264773392142, + "loss": 0.3034, + "step": 19530 + }, + { + "epoch": 1.5822261827608555, + "grad_norm": 0.051609303802251816, + "learning_rate": 0.00013432197668661956, + "loss": 0.2612, + "step": 19531 + }, + { + "epoch": 1.582307193778354, + "grad_norm": 0.05950654670596123, + "learning_rate": 0.00013431747603402492, + "loss": 0.3185, + "step": 19532 + }, + { + "epoch": 1.5823882047958522, + "grad_norm": 0.047630857676267624, + "learning_rate": 0.0001343129753814303, + "loss": 0.2937, + "step": 19533 + }, + { + "epoch": 1.5824692158133506, + "grad_norm": 0.04461874067783356, + "learning_rate": 0.0001343084747288357, + "loss": 0.2714, + "step": 19534 + }, + { + "epoch": 1.5825502268308491, + "grad_norm": 0.0678873211145401, + "learning_rate": 0.00013430397407624106, + "loss": 0.3534, + "step": 19535 + }, + { + "epoch": 1.5826312378483474, + "grad_norm": 0.04869205504655838, + "learning_rate": 0.00013429947342364645, + "loss": 0.2626, + "step": 19536 + }, + { + "epoch": 1.5827122488658456, + "grad_norm": 0.044049911201000214, + "learning_rate": 0.0001342949727710518, + "loss": 0.2942, + "step": 19537 + }, + { + "epoch": 1.5827932598833443, + "grad_norm": 0.05476945638656616, + "learning_rate": 0.00013429047211845717, + "loss": 0.2983, + "step": 19538 + }, + { + "epoch": 1.5828742709008425, + "grad_norm": 0.044789087027311325, + "learning_rate": 0.00013428597146586255, + "loss": 0.2706, + "step": 19539 + }, + { + "epoch": 1.5829552819183408, + "grad_norm": 0.0458502434194088, + "learning_rate": 0.00013428147081326794, + "loss": 0.2838, + "step": 19540 + }, + { + "epoch": 1.5830362929358393, + "grad_norm": 0.04885415360331535, + "learning_rate": 0.0001342769701606733, + "loss": 0.2562, + "step": 19541 + }, + { + "epoch": 1.5831173039533377, + "grad_norm": 0.044314928352832794, + "learning_rate": 0.0001342724695080787, + "loss": 0.2976, + "step": 19542 + }, + { + "epoch": 1.583198314970836, + "grad_norm": 0.05950617417693138, + "learning_rate": 0.00013426796885548405, + "loss": 0.2917, + "step": 19543 + }, + { + "epoch": 1.5832793259883344, + "grad_norm": 0.06253184378147125, + 
"learning_rate": 0.0001342634682028894, + "loss": 0.3023, + "step": 19544 + }, + { + "epoch": 1.583360337005833, + "grad_norm": 0.047593940049409866, + "learning_rate": 0.0001342589675502948, + "loss": 0.2945, + "step": 19545 + }, + { + "epoch": 1.5834413480233311, + "grad_norm": 0.05673489719629288, + "learning_rate": 0.00013425446689770018, + "loss": 0.3181, + "step": 19546 + }, + { + "epoch": 1.5835223590408296, + "grad_norm": 0.06391898542642593, + "learning_rate": 0.00013424996624510554, + "loss": 0.3179, + "step": 19547 + }, + { + "epoch": 1.583603370058328, + "grad_norm": 0.049235813319683075, + "learning_rate": 0.00013424546559251093, + "loss": 0.2842, + "step": 19548 + }, + { + "epoch": 1.5836843810758263, + "grad_norm": 0.048863064497709274, + "learning_rate": 0.0001342409649399163, + "loss": 0.2508, + "step": 19549 + }, + { + "epoch": 1.5837653920933246, + "grad_norm": 0.06420667469501495, + "learning_rate": 0.00013423646428732165, + "loss": 0.2819, + "step": 19550 + }, + { + "epoch": 1.583846403110823, + "grad_norm": 0.048935070633888245, + "learning_rate": 0.00013423196363472704, + "loss": 0.2987, + "step": 19551 + }, + { + "epoch": 1.5839274141283215, + "grad_norm": 0.04862522333860397, + "learning_rate": 0.00013422746298213242, + "loss": 0.2812, + "step": 19552 + }, + { + "epoch": 1.5840084251458197, + "grad_norm": 0.053085774183273315, + "learning_rate": 0.00013422296232953778, + "loss": 0.2789, + "step": 19553 + }, + { + "epoch": 1.5840894361633182, + "grad_norm": 0.048895008862018585, + "learning_rate": 0.00013421846167694317, + "loss": 0.2916, + "step": 19554 + }, + { + "epoch": 1.5841704471808167, + "grad_norm": 0.0500405989587307, + "learning_rate": 0.00013421396102434853, + "loss": 0.2506, + "step": 19555 + }, + { + "epoch": 1.584251458198315, + "grad_norm": 0.05650602653622627, + "learning_rate": 0.0001342094603717539, + "loss": 0.2992, + "step": 19556 + }, + { + "epoch": 1.5843324692158134, + "grad_norm": 0.050165776163339615, + "learning_rate": 0.0001342049597191593, + "loss": 0.2973, + "step": 19557 + }, + { + "epoch": 1.5844134802333119, + "grad_norm": 0.0493108294904232, + "learning_rate": 0.00013420045906656467, + "loss": 0.2982, + "step": 19558 + }, + { + "epoch": 1.58449449125081, + "grad_norm": 0.0474187433719635, + "learning_rate": 0.00013419595841397003, + "loss": 0.262, + "step": 19559 + }, + { + "epoch": 1.5845755022683083, + "grad_norm": 0.04100741446018219, + "learning_rate": 0.0001341914577613754, + "loss": 0.2536, + "step": 19560 + }, + { + "epoch": 1.584656513285807, + "grad_norm": 0.04271881654858589, + "learning_rate": 0.00013418695710878077, + "loss": 0.2489, + "step": 19561 + }, + { + "epoch": 1.5847375243033053, + "grad_norm": 0.055077798664569855, + "learning_rate": 0.00013418245645618613, + "loss": 0.2785, + "step": 19562 + }, + { + "epoch": 1.5848185353208035, + "grad_norm": 0.05069742351770401, + "learning_rate": 0.00013417795580359155, + "loss": 0.2351, + "step": 19563 + }, + { + "epoch": 1.584899546338302, + "grad_norm": 0.053563978523015976, + "learning_rate": 0.0001341734551509969, + "loss": 0.2767, + "step": 19564 + }, + { + "epoch": 1.5849805573558005, + "grad_norm": 0.049226801842451096, + "learning_rate": 0.00013416895449840227, + "loss": 0.2993, + "step": 19565 + }, + { + "epoch": 1.5850615683732987, + "grad_norm": 0.05453573539853096, + "learning_rate": 0.00013416445384580765, + "loss": 0.3365, + "step": 19566 + }, + { + "epoch": 1.5851425793907972, + "grad_norm": 0.05534984543919563, + "learning_rate": 0.00013415995319321301, + 
"loss": 0.2636, + "step": 19567 + }, + { + "epoch": 1.5852235904082956, + "grad_norm": 0.0382307767868042, + "learning_rate": 0.00013415545254061837, + "loss": 0.2521, + "step": 19568 + }, + { + "epoch": 1.5853046014257939, + "grad_norm": 0.04936755821108818, + "learning_rate": 0.0001341509518880238, + "loss": 0.2632, + "step": 19569 + }, + { + "epoch": 1.5853856124432921, + "grad_norm": 0.047801628708839417, + "learning_rate": 0.00013414645123542915, + "loss": 0.2908, + "step": 19570 + }, + { + "epoch": 1.5854666234607908, + "grad_norm": 0.047462377697229385, + "learning_rate": 0.0001341419505828345, + "loss": 0.2765, + "step": 19571 + }, + { + "epoch": 1.585547634478289, + "grad_norm": 0.047430410981178284, + "learning_rate": 0.0001341374499302399, + "loss": 0.2358, + "step": 19572 + }, + { + "epoch": 1.5856286454957873, + "grad_norm": 0.047434475272893906, + "learning_rate": 0.00013413294927764526, + "loss": 0.2591, + "step": 19573 + }, + { + "epoch": 1.5857096565132858, + "grad_norm": 0.044510822743177414, + "learning_rate": 0.00013412844862505062, + "loss": 0.2849, + "step": 19574 + }, + { + "epoch": 1.5857906675307842, + "grad_norm": 0.04769321531057358, + "learning_rate": 0.00013412394797245603, + "loss": 0.2614, + "step": 19575 + }, + { + "epoch": 1.5858716785482825, + "grad_norm": 0.06467246264219284, + "learning_rate": 0.0001341194473198614, + "loss": 0.2783, + "step": 19576 + }, + { + "epoch": 1.585952689565781, + "grad_norm": 0.05072154104709625, + "learning_rate": 0.00013411494666726675, + "loss": 0.3038, + "step": 19577 + }, + { + "epoch": 1.5860337005832794, + "grad_norm": 0.057482898235321045, + "learning_rate": 0.00013411044601467214, + "loss": 0.3215, + "step": 19578 + }, + { + "epoch": 1.5861147116007777, + "grad_norm": 0.05687712877988815, + "learning_rate": 0.0001341059453620775, + "loss": 0.319, + "step": 19579 + }, + { + "epoch": 1.5861957226182761, + "grad_norm": 0.047046370804309845, + "learning_rate": 0.00013410144470948286, + "loss": 0.2611, + "step": 19580 + }, + { + "epoch": 1.5862767336357746, + "grad_norm": 0.04093106836080551, + "learning_rate": 0.00013409694405688827, + "loss": 0.2503, + "step": 19581 + }, + { + "epoch": 1.5863577446532728, + "grad_norm": 0.0708787590265274, + "learning_rate": 0.00013409244340429363, + "loss": 0.3086, + "step": 19582 + }, + { + "epoch": 1.586438755670771, + "grad_norm": 0.05671603977680206, + "learning_rate": 0.000134087942751699, + "loss": 0.2799, + "step": 19583 + }, + { + "epoch": 1.5865197666882696, + "grad_norm": 0.051028985530138016, + "learning_rate": 0.00013408344209910438, + "loss": 0.3173, + "step": 19584 + }, + { + "epoch": 1.586600777705768, + "grad_norm": 0.05028403177857399, + "learning_rate": 0.00013407894144650974, + "loss": 0.271, + "step": 19585 + }, + { + "epoch": 1.5866817887232663, + "grad_norm": 0.056971270591020584, + "learning_rate": 0.00013407444079391513, + "loss": 0.3144, + "step": 19586 + }, + { + "epoch": 1.5867627997407647, + "grad_norm": 0.05798279494047165, + "learning_rate": 0.0001340699401413205, + "loss": 0.297, + "step": 19587 + }, + { + "epoch": 1.5868438107582632, + "grad_norm": 0.05333030968904495, + "learning_rate": 0.00013406543948872587, + "loss": 0.2364, + "step": 19588 + }, + { + "epoch": 1.5869248217757614, + "grad_norm": 0.055028073489665985, + "learning_rate": 0.00013406093883613123, + "loss": 0.3158, + "step": 19589 + }, + { + "epoch": 1.58700583279326, + "grad_norm": 0.048676956444978714, + "learning_rate": 0.00013405643818353662, + "loss": 0.2999, + "step": 19590 + }, + { + 
"epoch": 1.5870868438107584, + "grad_norm": 0.06379703432321548, + "learning_rate": 0.00013405193753094198, + "loss": 0.2945, + "step": 19591 + }, + { + "epoch": 1.5871678548282566, + "grad_norm": 0.056056320667266846, + "learning_rate": 0.00013404743687834737, + "loss": 0.2736, + "step": 19592 + }, + { + "epoch": 1.5872488658457549, + "grad_norm": 0.05605302378535271, + "learning_rate": 0.00013404293622575275, + "loss": 0.303, + "step": 19593 + }, + { + "epoch": 1.5873298768632536, + "grad_norm": 0.04912743717432022, + "learning_rate": 0.00013403843557315811, + "loss": 0.2689, + "step": 19594 + }, + { + "epoch": 1.5874108878807518, + "grad_norm": 0.06005407124757767, + "learning_rate": 0.00013403393492056347, + "loss": 0.3133, + "step": 19595 + }, + { + "epoch": 1.58749189889825, + "grad_norm": 0.05698971077799797, + "learning_rate": 0.00013402943426796886, + "loss": 0.2711, + "step": 19596 + }, + { + "epoch": 1.5875729099157485, + "grad_norm": 0.040859419852495193, + "learning_rate": 0.00013402493361537422, + "loss": 0.2427, + "step": 19597 + }, + { + "epoch": 1.587653920933247, + "grad_norm": 0.05441635474562645, + "learning_rate": 0.0001340204329627796, + "loss": 0.2553, + "step": 19598 + }, + { + "epoch": 1.5877349319507452, + "grad_norm": 0.049378473311662674, + "learning_rate": 0.000134015932310185, + "loss": 0.2659, + "step": 19599 + }, + { + "epoch": 1.5878159429682437, + "grad_norm": 0.043783094733953476, + "learning_rate": 0.00013401143165759036, + "loss": 0.2473, + "step": 19600 + }, + { + "epoch": 1.5878969539857422, + "grad_norm": 0.05409387871623039, + "learning_rate": 0.00013400693100499572, + "loss": 0.2935, + "step": 19601 + }, + { + "epoch": 1.5879779650032404, + "grad_norm": 0.058873020112514496, + "learning_rate": 0.0001340024303524011, + "loss": 0.3162, + "step": 19602 + }, + { + "epoch": 1.5880589760207389, + "grad_norm": 0.05617973953485489, + "learning_rate": 0.00013399792969980646, + "loss": 0.2918, + "step": 19603 + }, + { + "epoch": 1.5881399870382373, + "grad_norm": 0.046545181423425674, + "learning_rate": 0.00013399342904721185, + "loss": 0.2495, + "step": 19604 + }, + { + "epoch": 1.5882209980557356, + "grad_norm": 0.04627370461821556, + "learning_rate": 0.00013398892839461724, + "loss": 0.2722, + "step": 19605 + }, + { + "epoch": 1.5883020090732338, + "grad_norm": 0.04813306778669357, + "learning_rate": 0.0001339844277420226, + "loss": 0.2627, + "step": 19606 + }, + { + "epoch": 1.5883830200907323, + "grad_norm": 0.04957534372806549, + "learning_rate": 0.00013397992708942796, + "loss": 0.2822, + "step": 19607 + }, + { + "epoch": 1.5884640311082308, + "grad_norm": 0.04994957521557808, + "learning_rate": 0.00013397542643683335, + "loss": 0.2609, + "step": 19608 + }, + { + "epoch": 1.588545042125729, + "grad_norm": 0.049995407462120056, + "learning_rate": 0.0001339709257842387, + "loss": 0.2599, + "step": 19609 + }, + { + "epoch": 1.5886260531432275, + "grad_norm": 0.05555260181427002, + "learning_rate": 0.0001339664251316441, + "loss": 0.3067, + "step": 19610 + }, + { + "epoch": 1.588707064160726, + "grad_norm": 0.05339108407497406, + "learning_rate": 0.00013396192447904948, + "loss": 0.3451, + "step": 19611 + }, + { + "epoch": 1.5887880751782242, + "grad_norm": 0.04250529035925865, + "learning_rate": 0.00013395742382645484, + "loss": 0.2602, + "step": 19612 + }, + { + "epoch": 1.5888690861957226, + "grad_norm": 0.04517268389463425, + "learning_rate": 0.0001339529231738602, + "loss": 0.2728, + "step": 19613 + }, + { + "epoch": 1.5889500972132211, + 
"grad_norm": 0.04415430128574371, + "learning_rate": 0.0001339484225212656, + "loss": 0.2613, + "step": 19614 + }, + { + "epoch": 1.5890311082307194, + "grad_norm": 0.03891155496239662, + "learning_rate": 0.00013394392186867097, + "loss": 0.2542, + "step": 19615 + }, + { + "epoch": 1.5891121192482176, + "grad_norm": 0.055569179356098175, + "learning_rate": 0.00013393942121607633, + "loss": 0.3395, + "step": 19616 + }, + { + "epoch": 1.5891931302657163, + "grad_norm": 0.05368742346763611, + "learning_rate": 0.00013393492056348172, + "loss": 0.2745, + "step": 19617 + }, + { + "epoch": 1.5892741412832145, + "grad_norm": 0.051335424184799194, + "learning_rate": 0.00013393041991088708, + "loss": 0.2891, + "step": 19618 + }, + { + "epoch": 1.5893551523007128, + "grad_norm": 0.05128169432282448, + "learning_rate": 0.00013392591925829244, + "loss": 0.2831, + "step": 19619 + }, + { + "epoch": 1.5894361633182112, + "grad_norm": 0.04808347672224045, + "learning_rate": 0.00013392141860569783, + "loss": 0.2637, + "step": 19620 + }, + { + "epoch": 1.5895171743357097, + "grad_norm": 0.04751835763454437, + "learning_rate": 0.00013391691795310322, + "loss": 0.2618, + "step": 19621 + }, + { + "epoch": 1.589598185353208, + "grad_norm": 0.04695792868733406, + "learning_rate": 0.00013391241730050858, + "loss": 0.2314, + "step": 19622 + }, + { + "epoch": 1.5896791963707064, + "grad_norm": 0.06030832231044769, + "learning_rate": 0.00013390791664791396, + "loss": 0.2612, + "step": 19623 + }, + { + "epoch": 1.589760207388205, + "grad_norm": 0.0565250962972641, + "learning_rate": 0.00013390341599531932, + "loss": 0.2887, + "step": 19624 + }, + { + "epoch": 1.5898412184057031, + "grad_norm": 0.05243346095085144, + "learning_rate": 0.00013389891534272468, + "loss": 0.3233, + "step": 19625 + }, + { + "epoch": 1.5899222294232016, + "grad_norm": 0.058184653520584106, + "learning_rate": 0.00013389441469013007, + "loss": 0.3108, + "step": 19626 + }, + { + "epoch": 1.5900032404407, + "grad_norm": 0.06172959506511688, + "learning_rate": 0.00013388991403753546, + "loss": 0.2936, + "step": 19627 + }, + { + "epoch": 1.5900842514581983, + "grad_norm": 0.04676850140094757, + "learning_rate": 0.00013388541338494082, + "loss": 0.2531, + "step": 19628 + }, + { + "epoch": 1.5901652624756966, + "grad_norm": 0.052669767290353775, + "learning_rate": 0.0001338809127323462, + "loss": 0.3351, + "step": 19629 + }, + { + "epoch": 1.590246273493195, + "grad_norm": 0.04854946210980415, + "learning_rate": 0.00013387641207975156, + "loss": 0.2523, + "step": 19630 + }, + { + "epoch": 1.5903272845106935, + "grad_norm": 0.0540703609585762, + "learning_rate": 0.00013387191142715692, + "loss": 0.2923, + "step": 19631 + }, + { + "epoch": 1.5904082955281917, + "grad_norm": 0.06427466869354248, + "learning_rate": 0.0001338674107745623, + "loss": 0.3333, + "step": 19632 + }, + { + "epoch": 1.5904893065456902, + "grad_norm": 0.05325065180659294, + "learning_rate": 0.0001338629101219677, + "loss": 0.3103, + "step": 19633 + }, + { + "epoch": 1.5905703175631887, + "grad_norm": 0.047619082033634186, + "learning_rate": 0.00013385840946937306, + "loss": 0.2728, + "step": 19634 + }, + { + "epoch": 1.590651328580687, + "grad_norm": 0.057228464633226395, + "learning_rate": 0.00013385390881677845, + "loss": 0.2492, + "step": 19635 + }, + { + "epoch": 1.5907323395981854, + "grad_norm": 0.056406740099191666, + "learning_rate": 0.0001338494081641838, + "loss": 0.2636, + "step": 19636 + }, + { + "epoch": 1.5908133506156839, + "grad_norm": 0.0460297130048275, + 
"learning_rate": 0.00013384490751158917, + "loss": 0.2481, + "step": 19637 + }, + { + "epoch": 1.590894361633182, + "grad_norm": 0.05521533265709877, + "learning_rate": 0.00013384040685899458, + "loss": 0.3225, + "step": 19638 + }, + { + "epoch": 1.5909753726506803, + "grad_norm": 0.06045277416706085, + "learning_rate": 0.00013383590620639994, + "loss": 0.3055, + "step": 19639 + }, + { + "epoch": 1.591056383668179, + "grad_norm": 0.06774187088012695, + "learning_rate": 0.0001338314055538053, + "loss": 0.3209, + "step": 19640 + }, + { + "epoch": 1.5911373946856773, + "grad_norm": 0.05821792781352997, + "learning_rate": 0.0001338269049012107, + "loss": 0.3121, + "step": 19641 + }, + { + "epoch": 1.5912184057031755, + "grad_norm": 0.0538976714015007, + "learning_rate": 0.00013382240424861605, + "loss": 0.2769, + "step": 19642 + }, + { + "epoch": 1.591299416720674, + "grad_norm": 0.04965272173285484, + "learning_rate": 0.0001338179035960214, + "loss": 0.2695, + "step": 19643 + }, + { + "epoch": 1.5913804277381725, + "grad_norm": 0.04983029142022133, + "learning_rate": 0.00013381340294342682, + "loss": 0.2999, + "step": 19644 + }, + { + "epoch": 1.5914614387556707, + "grad_norm": 0.058221280574798584, + "learning_rate": 0.00013380890229083218, + "loss": 0.2875, + "step": 19645 + }, + { + "epoch": 1.5915424497731692, + "grad_norm": 0.06419771909713745, + "learning_rate": 0.00013380440163823754, + "loss": 0.3187, + "step": 19646 + }, + { + "epoch": 1.5916234607906676, + "grad_norm": 0.05506671965122223, + "learning_rate": 0.00013379990098564293, + "loss": 0.3442, + "step": 19647 + }, + { + "epoch": 1.5917044718081659, + "grad_norm": 0.04870070889592171, + "learning_rate": 0.0001337954003330483, + "loss": 0.2922, + "step": 19648 + }, + { + "epoch": 1.5917854828256643, + "grad_norm": 0.04817147180438042, + "learning_rate": 0.00013379089968045368, + "loss": 0.2788, + "step": 19649 + }, + { + "epoch": 1.5918664938431628, + "grad_norm": 0.0634249746799469, + "learning_rate": 0.00013378639902785906, + "loss": 0.2812, + "step": 19650 + }, + { + "epoch": 1.591947504860661, + "grad_norm": 0.05736738443374634, + "learning_rate": 0.00013378189837526442, + "loss": 0.2951, + "step": 19651 + }, + { + "epoch": 1.5920285158781593, + "grad_norm": 0.055955443531274796, + "learning_rate": 0.00013377739772266978, + "loss": 0.3102, + "step": 19652 + }, + { + "epoch": 1.5921095268956578, + "grad_norm": 0.05353270098567009, + "learning_rate": 0.00013377289707007517, + "loss": 0.2824, + "step": 19653 + }, + { + "epoch": 1.5921905379131562, + "grad_norm": 0.054427746683359146, + "learning_rate": 0.00013376839641748053, + "loss": 0.2642, + "step": 19654 + }, + { + "epoch": 1.5922715489306545, + "grad_norm": 0.05478702858090401, + "learning_rate": 0.00013376389576488592, + "loss": 0.2947, + "step": 19655 + }, + { + "epoch": 1.592352559948153, + "grad_norm": 0.05507731810212135, + "learning_rate": 0.0001337593951122913, + "loss": 0.3027, + "step": 19656 + }, + { + "epoch": 1.5924335709656514, + "grad_norm": 0.05065590888261795, + "learning_rate": 0.00013375489445969667, + "loss": 0.229, + "step": 19657 + }, + { + "epoch": 1.5925145819831497, + "grad_norm": 0.047295115888118744, + "learning_rate": 0.00013375039380710203, + "loss": 0.2758, + "step": 19658 + }, + { + "epoch": 1.5925955930006481, + "grad_norm": 0.050888653844594955, + "learning_rate": 0.0001337458931545074, + "loss": 0.3103, + "step": 19659 + }, + { + "epoch": 1.5926766040181466, + "grad_norm": 0.05038614943623543, + "learning_rate": 0.00013374139250191277, + 
"loss": 0.2487, + "step": 19660 + }, + { + "epoch": 1.5927576150356448, + "grad_norm": 0.042133983224630356, + "learning_rate": 0.00013373689184931816, + "loss": 0.2702, + "step": 19661 + }, + { + "epoch": 1.592838626053143, + "grad_norm": 0.05329374596476555, + "learning_rate": 0.00013373239119672355, + "loss": 0.2634, + "step": 19662 + }, + { + "epoch": 1.5929196370706418, + "grad_norm": 0.06329121440649033, + "learning_rate": 0.0001337278905441289, + "loss": 0.3362, + "step": 19663 + }, + { + "epoch": 1.59300064808814, + "grad_norm": 0.05555858090519905, + "learning_rate": 0.00013372338989153427, + "loss": 0.2875, + "step": 19664 + }, + { + "epoch": 1.5930816591056383, + "grad_norm": 0.061923980712890625, + "learning_rate": 0.00013371888923893965, + "loss": 0.3068, + "step": 19665 + }, + { + "epoch": 1.5931626701231367, + "grad_norm": 0.054352499544620514, + "learning_rate": 0.00013371438858634501, + "loss": 0.3165, + "step": 19666 + }, + { + "epoch": 1.5932436811406352, + "grad_norm": 0.06146497651934624, + "learning_rate": 0.0001337098879337504, + "loss": 0.3136, + "step": 19667 + }, + { + "epoch": 1.5933246921581334, + "grad_norm": 0.05410310998558998, + "learning_rate": 0.0001337053872811558, + "loss": 0.2703, + "step": 19668 + }, + { + "epoch": 1.593405703175632, + "grad_norm": 0.043867308646440506, + "learning_rate": 0.00013370088662856115, + "loss": 0.2674, + "step": 19669 + }, + { + "epoch": 1.5934867141931304, + "grad_norm": 0.0502147302031517, + "learning_rate": 0.0001336963859759665, + "loss": 0.3127, + "step": 19670 + }, + { + "epoch": 1.5935677252106286, + "grad_norm": 0.05285639315843582, + "learning_rate": 0.0001336918853233719, + "loss": 0.3093, + "step": 19671 + }, + { + "epoch": 1.5936487362281269, + "grad_norm": 0.04711088910698891, + "learning_rate": 0.00013368738467077726, + "loss": 0.2931, + "step": 19672 + }, + { + "epoch": 1.5937297472456255, + "grad_norm": 0.04537595063447952, + "learning_rate": 0.00013368288401818264, + "loss": 0.2638, + "step": 19673 + }, + { + "epoch": 1.5938107582631238, + "grad_norm": 0.05486955866217613, + "learning_rate": 0.00013367838336558803, + "loss": 0.2727, + "step": 19674 + }, + { + "epoch": 1.593891769280622, + "grad_norm": 0.06857334822416306, + "learning_rate": 0.0001336738827129934, + "loss": 0.3224, + "step": 19675 + }, + { + "epoch": 1.5939727802981205, + "grad_norm": 0.04904793202877045, + "learning_rate": 0.00013366938206039875, + "loss": 0.2678, + "step": 19676 + }, + { + "epoch": 1.594053791315619, + "grad_norm": 0.04866638034582138, + "learning_rate": 0.00013366488140780414, + "loss": 0.2629, + "step": 19677 + }, + { + "epoch": 1.5941348023331172, + "grad_norm": 0.05716971680521965, + "learning_rate": 0.0001336603807552095, + "loss": 0.2929, + "step": 19678 + }, + { + "epoch": 1.5942158133506157, + "grad_norm": 0.058549296110868454, + "learning_rate": 0.00013365588010261488, + "loss": 0.3016, + "step": 19679 + }, + { + "epoch": 1.5942968243681142, + "grad_norm": 0.04250664636492729, + "learning_rate": 0.00013365137945002027, + "loss": 0.29, + "step": 19680 + }, + { + "epoch": 1.5943778353856124, + "grad_norm": 0.0584678053855896, + "learning_rate": 0.00013364687879742563, + "loss": 0.2894, + "step": 19681 + }, + { + "epoch": 1.5944588464031109, + "grad_norm": 0.04140187054872513, + "learning_rate": 0.000133642378144831, + "loss": 0.2392, + "step": 19682 + }, + { + "epoch": 1.5945398574206093, + "grad_norm": 0.044475093483924866, + "learning_rate": 0.00013363787749223638, + "loss": 0.2629, + "step": 19683 + }, + { + 
"epoch": 1.5946208684381076, + "grad_norm": 0.05308195948600769, + "learning_rate": 0.00013363337683964174, + "loss": 0.2899, + "step": 19684 + }, + { + "epoch": 1.5947018794556058, + "grad_norm": 0.05468233302235603, + "learning_rate": 0.00013362887618704713, + "loss": 0.3294, + "step": 19685 + }, + { + "epoch": 1.5947828904731045, + "grad_norm": 0.05329752340912819, + "learning_rate": 0.0001336243755344525, + "loss": 0.3098, + "step": 19686 + }, + { + "epoch": 1.5948639014906028, + "grad_norm": 0.05600878596305847, + "learning_rate": 0.00013361987488185787, + "loss": 0.272, + "step": 19687 + }, + { + "epoch": 1.594944912508101, + "grad_norm": 0.061277762055397034, + "learning_rate": 0.00013361537422926323, + "loss": 0.2922, + "step": 19688 + }, + { + "epoch": 1.5950259235255995, + "grad_norm": 0.045982733368873596, + "learning_rate": 0.00013361087357666862, + "loss": 0.281, + "step": 19689 + }, + { + "epoch": 1.595106934543098, + "grad_norm": 0.06412628293037415, + "learning_rate": 0.000133606372924074, + "loss": 0.2422, + "step": 19690 + }, + { + "epoch": 1.5951879455605962, + "grad_norm": 0.04596994072198868, + "learning_rate": 0.00013360187227147937, + "loss": 0.28, + "step": 19691 + }, + { + "epoch": 1.5952689565780946, + "grad_norm": 0.05474567785859108, + "learning_rate": 0.00013359737161888475, + "loss": 0.2654, + "step": 19692 + }, + { + "epoch": 1.595349967595593, + "grad_norm": 0.04921437427401543, + "learning_rate": 0.00013359287096629012, + "loss": 0.2825, + "step": 19693 + }, + { + "epoch": 1.5954309786130914, + "grad_norm": 0.05111038312315941, + "learning_rate": 0.00013358837031369548, + "loss": 0.2894, + "step": 19694 + }, + { + "epoch": 1.5955119896305896, + "grad_norm": 0.04755771532654762, + "learning_rate": 0.00013358386966110086, + "loss": 0.2627, + "step": 19695 + }, + { + "epoch": 1.5955930006480883, + "grad_norm": 0.047934528440237045, + "learning_rate": 0.00013357936900850625, + "loss": 0.252, + "step": 19696 + }, + { + "epoch": 1.5956740116655865, + "grad_norm": 0.058660369366407394, + "learning_rate": 0.0001335748683559116, + "loss": 0.2857, + "step": 19697 + }, + { + "epoch": 1.5957550226830848, + "grad_norm": 0.049679480493068695, + "learning_rate": 0.000133570367703317, + "loss": 0.2798, + "step": 19698 + }, + { + "epoch": 1.5958360337005832, + "grad_norm": 0.05734862759709358, + "learning_rate": 0.00013356586705072236, + "loss": 0.2779, + "step": 19699 + }, + { + "epoch": 1.5959170447180817, + "grad_norm": 0.056212328374385834, + "learning_rate": 0.00013356136639812772, + "loss": 0.2877, + "step": 19700 + }, + { + "epoch": 1.59599805573558, + "grad_norm": 0.056470245122909546, + "learning_rate": 0.0001335568657455331, + "loss": 0.2883, + "step": 19701 + }, + { + "epoch": 1.5960790667530784, + "grad_norm": 0.04747643321752548, + "learning_rate": 0.0001335523650929385, + "loss": 0.2784, + "step": 19702 + }, + { + "epoch": 1.596160077770577, + "grad_norm": 0.05902129039168358, + "learning_rate": 0.00013354786444034385, + "loss": 0.2904, + "step": 19703 + }, + { + "epoch": 1.5962410887880751, + "grad_norm": 0.057399291545152664, + "learning_rate": 0.00013354336378774924, + "loss": 0.2585, + "step": 19704 + }, + { + "epoch": 1.5963220998055736, + "grad_norm": 0.05625062435865402, + "learning_rate": 0.0001335388631351546, + "loss": 0.3098, + "step": 19705 + }, + { + "epoch": 1.596403110823072, + "grad_norm": 0.06203244626522064, + "learning_rate": 0.00013353436248255996, + "loss": 0.3249, + "step": 19706 + }, + { + "epoch": 1.5964841218405703, + "grad_norm": 
0.056399717926979065, + "learning_rate": 0.00013352986182996535, + "loss": 0.2994, + "step": 19707 + }, + { + "epoch": 1.5965651328580686, + "grad_norm": 0.05485859513282776, + "learning_rate": 0.00013352536117737073, + "loss": 0.2942, + "step": 19708 + }, + { + "epoch": 1.596646143875567, + "grad_norm": 0.047439463436603546, + "learning_rate": 0.0001335208605247761, + "loss": 0.2994, + "step": 19709 + }, + { + "epoch": 1.5967271548930655, + "grad_norm": 0.05042650178074837, + "learning_rate": 0.00013351635987218148, + "loss": 0.2681, + "step": 19710 + }, + { + "epoch": 1.5968081659105637, + "grad_norm": 0.06080637127161026, + "learning_rate": 0.00013351185921958684, + "loss": 0.3173, + "step": 19711 + }, + { + "epoch": 1.5968891769280622, + "grad_norm": 0.06810148060321808, + "learning_rate": 0.0001335073585669922, + "loss": 0.3148, + "step": 19712 + }, + { + "epoch": 1.5969701879455607, + "grad_norm": 0.0476883128285408, + "learning_rate": 0.0001335028579143976, + "loss": 0.2561, + "step": 19713 + }, + { + "epoch": 1.597051198963059, + "grad_norm": 0.058156874030828476, + "learning_rate": 0.00013349835726180297, + "loss": 0.2933, + "step": 19714 + }, + { + "epoch": 1.5971322099805574, + "grad_norm": 0.046778421849012375, + "learning_rate": 0.00013349385660920833, + "loss": 0.2485, + "step": 19715 + }, + { + "epoch": 1.5972132209980558, + "grad_norm": 0.06392556428909302, + "learning_rate": 0.00013348935595661372, + "loss": 0.3113, + "step": 19716 + }, + { + "epoch": 1.597294232015554, + "grad_norm": 0.04123203828930855, + "learning_rate": 0.00013348485530401908, + "loss": 0.2385, + "step": 19717 + }, + { + "epoch": 1.5973752430330523, + "grad_norm": 0.04342777654528618, + "learning_rate": 0.00013348035465142447, + "loss": 0.2473, + "step": 19718 + }, + { + "epoch": 1.597456254050551, + "grad_norm": 0.04915757104754448, + "learning_rate": 0.00013347585399882986, + "loss": 0.283, + "step": 19719 + }, + { + "epoch": 1.5975372650680493, + "grad_norm": 0.04471985995769501, + "learning_rate": 0.00013347135334623522, + "loss": 0.2644, + "step": 19720 + }, + { + "epoch": 1.5976182760855475, + "grad_norm": 0.05780022218823433, + "learning_rate": 0.00013346685269364058, + "loss": 0.3069, + "step": 19721 + }, + { + "epoch": 1.597699287103046, + "grad_norm": 0.05209074914455414, + "learning_rate": 0.00013346235204104596, + "loss": 0.2777, + "step": 19722 + }, + { + "epoch": 1.5977802981205445, + "grad_norm": 0.05138828232884407, + "learning_rate": 0.00013345785138845132, + "loss": 0.3253, + "step": 19723 + }, + { + "epoch": 1.5978613091380427, + "grad_norm": 0.05990460887551308, + "learning_rate": 0.0001334533507358567, + "loss": 0.3176, + "step": 19724 + }, + { + "epoch": 1.5979423201555412, + "grad_norm": 0.04816710948944092, + "learning_rate": 0.0001334488500832621, + "loss": 0.24, + "step": 19725 + }, + { + "epoch": 1.5980233311730396, + "grad_norm": 0.045972611755132675, + "learning_rate": 0.00013344434943066746, + "loss": 0.2509, + "step": 19726 + }, + { + "epoch": 1.5981043421905379, + "grad_norm": 0.04820011183619499, + "learning_rate": 0.00013343984877807282, + "loss": 0.2836, + "step": 19727 + }, + { + "epoch": 1.5981853532080363, + "grad_norm": 0.06232719495892525, + "learning_rate": 0.0001334353481254782, + "loss": 0.3038, + "step": 19728 + }, + { + "epoch": 1.5982663642255348, + "grad_norm": 0.05783186852931976, + "learning_rate": 0.00013343084747288356, + "loss": 0.3069, + "step": 19729 + }, + { + "epoch": 1.598347375243033, + "grad_norm": 0.059091247618198395, + "learning_rate": 
0.00013342634682028895, + "loss": 0.2702, + "step": 19730 + }, + { + "epoch": 1.5984283862605313, + "grad_norm": 0.05414794012904167, + "learning_rate": 0.00013342184616769434, + "loss": 0.2929, + "step": 19731 + }, + { + "epoch": 1.5985093972780298, + "grad_norm": 0.0421329103410244, + "learning_rate": 0.0001334173455150997, + "loss": 0.2436, + "step": 19732 + }, + { + "epoch": 1.5985904082955282, + "grad_norm": 0.053817491978406906, + "learning_rate": 0.00013341284486250506, + "loss": 0.2695, + "step": 19733 + }, + { + "epoch": 1.5986714193130265, + "grad_norm": 0.059979382902383804, + "learning_rate": 0.00013340834420991045, + "loss": 0.2983, + "step": 19734 + }, + { + "epoch": 1.598752430330525, + "grad_norm": 0.04933563247323036, + "learning_rate": 0.0001334038435573158, + "loss": 0.2356, + "step": 19735 + }, + { + "epoch": 1.5988334413480234, + "grad_norm": 0.046898871660232544, + "learning_rate": 0.0001333993429047212, + "loss": 0.273, + "step": 19736 + }, + { + "epoch": 1.5989144523655217, + "grad_norm": 0.0689791813492775, + "learning_rate": 0.00013339484225212658, + "loss": 0.3238, + "step": 19737 + }, + { + "epoch": 1.5989954633830201, + "grad_norm": 0.05081493780016899, + "learning_rate": 0.00013339034159953194, + "loss": 0.2774, + "step": 19738 + }, + { + "epoch": 1.5990764744005186, + "grad_norm": 0.052922870963811874, + "learning_rate": 0.0001333858409469373, + "loss": 0.2425, + "step": 19739 + }, + { + "epoch": 1.5991574854180168, + "grad_norm": 0.049743879586458206, + "learning_rate": 0.0001333813402943427, + "loss": 0.2914, + "step": 19740 + }, + { + "epoch": 1.599238496435515, + "grad_norm": 0.05588243901729584, + "learning_rate": 0.00013337683964174805, + "loss": 0.2706, + "step": 19741 + }, + { + "epoch": 1.5993195074530138, + "grad_norm": 0.04906295984983444, + "learning_rate": 0.00013337233898915344, + "loss": 0.261, + "step": 19742 + }, + { + "epoch": 1.599400518470512, + "grad_norm": 0.04569677263498306, + "learning_rate": 0.00013336783833655882, + "loss": 0.2512, + "step": 19743 + }, + { + "epoch": 1.5994815294880103, + "grad_norm": 0.05717282369732857, + "learning_rate": 0.00013336333768396418, + "loss": 0.2593, + "step": 19744 + }, + { + "epoch": 1.5995625405055087, + "grad_norm": 0.05262308940291405, + "learning_rate": 0.00013335883703136954, + "loss": 0.2638, + "step": 19745 + }, + { + "epoch": 1.5996435515230072, + "grad_norm": 0.05189024284482002, + "learning_rate": 0.00013335433637877493, + "loss": 0.3071, + "step": 19746 + }, + { + "epoch": 1.5997245625405054, + "grad_norm": 0.06069159135222435, + "learning_rate": 0.0001333498357261803, + "loss": 0.3037, + "step": 19747 + }, + { + "epoch": 1.599805573558004, + "grad_norm": 0.05159150809049606, + "learning_rate": 0.00013334533507358568, + "loss": 0.2855, + "step": 19748 + }, + { + "epoch": 1.5998865845755024, + "grad_norm": 0.055219776928424835, + "learning_rate": 0.00013334083442099106, + "loss": 0.2991, + "step": 19749 + }, + { + "epoch": 1.5999675955930006, + "grad_norm": 0.049742020666599274, + "learning_rate": 0.00013333633376839642, + "loss": 0.2937, + "step": 19750 + }, + { + "epoch": 1.600048606610499, + "grad_norm": 0.04801604896783829, + "learning_rate": 0.00013333183311580178, + "loss": 0.2799, + "step": 19751 + }, + { + "epoch": 1.6001296176279975, + "grad_norm": 0.05257720500230789, + "learning_rate": 0.00013332733246320717, + "loss": 0.2931, + "step": 19752 + }, + { + "epoch": 1.6002106286454958, + "grad_norm": 0.05499967560172081, + "learning_rate": 0.00013332283181061253, + "loss": 0.2917, + 
"step": 19753 + }, + { + "epoch": 1.600291639662994, + "grad_norm": 0.04537876695394516, + "learning_rate": 0.00013331833115801792, + "loss": 0.2593, + "step": 19754 + }, + { + "epoch": 1.6003726506804925, + "grad_norm": 0.05684947222471237, + "learning_rate": 0.0001333138305054233, + "loss": 0.2915, + "step": 19755 + }, + { + "epoch": 1.600453661697991, + "grad_norm": 0.051478736102581024, + "learning_rate": 0.00013330932985282867, + "loss": 0.2961, + "step": 19756 + }, + { + "epoch": 1.6005346727154892, + "grad_norm": 0.045026905834674835, + "learning_rate": 0.00013330482920023403, + "loss": 0.2744, + "step": 19757 + }, + { + "epoch": 1.6006156837329877, + "grad_norm": 0.05228782445192337, + "learning_rate": 0.0001333003285476394, + "loss": 0.3082, + "step": 19758 + }, + { + "epoch": 1.6006966947504861, + "grad_norm": 0.050884321331977844, + "learning_rate": 0.00013329582789504477, + "loss": 0.2914, + "step": 19759 + }, + { + "epoch": 1.6007777057679844, + "grad_norm": 0.050620514899492264, + "learning_rate": 0.00013329132724245016, + "loss": 0.2926, + "step": 19760 + }, + { + "epoch": 1.6008587167854829, + "grad_norm": 0.061932142823934555, + "learning_rate": 0.00013328682658985555, + "loss": 0.2993, + "step": 19761 + }, + { + "epoch": 1.6009397278029813, + "grad_norm": 0.05216047912836075, + "learning_rate": 0.0001332823259372609, + "loss": 0.2462, + "step": 19762 + }, + { + "epoch": 1.6010207388204796, + "grad_norm": 0.05384761095046997, + "learning_rate": 0.00013327782528466627, + "loss": 0.3201, + "step": 19763 + }, + { + "epoch": 1.6011017498379778, + "grad_norm": 0.048409927636384964, + "learning_rate": 0.00013327332463207165, + "loss": 0.2529, + "step": 19764 + }, + { + "epoch": 1.6011827608554765, + "grad_norm": 0.05935129523277283, + "learning_rate": 0.00013326882397947701, + "loss": 0.2723, + "step": 19765 + }, + { + "epoch": 1.6012637718729748, + "grad_norm": 0.05299336835741997, + "learning_rate": 0.0001332643233268824, + "loss": 0.2942, + "step": 19766 + }, + { + "epoch": 1.601344782890473, + "grad_norm": 0.04570074379444122, + "learning_rate": 0.0001332598226742878, + "loss": 0.3037, + "step": 19767 + }, + { + "epoch": 1.6014257939079715, + "grad_norm": 0.05521856248378754, + "learning_rate": 0.00013325532202169315, + "loss": 0.313, + "step": 19768 + }, + { + "epoch": 1.60150680492547, + "grad_norm": 0.05584537982940674, + "learning_rate": 0.0001332508213690985, + "loss": 0.268, + "step": 19769 + }, + { + "epoch": 1.6015878159429682, + "grad_norm": 0.05476771667599678, + "learning_rate": 0.0001332463207165039, + "loss": 0.2673, + "step": 19770 + }, + { + "epoch": 1.6016688269604666, + "grad_norm": 0.047139693051576614, + "learning_rate": 0.00013324182006390928, + "loss": 0.2551, + "step": 19771 + }, + { + "epoch": 1.601749837977965, + "grad_norm": 0.053314223885536194, + "learning_rate": 0.00013323731941131464, + "loss": 0.2799, + "step": 19772 + }, + { + "epoch": 1.6018308489954634, + "grad_norm": 0.04546458646655083, + "learning_rate": 0.00013323281875872003, + "loss": 0.2599, + "step": 19773 + }, + { + "epoch": 1.6019118600129616, + "grad_norm": 0.05735450237989426, + "learning_rate": 0.0001332283181061254, + "loss": 0.3001, + "step": 19774 + }, + { + "epoch": 1.6019928710304603, + "grad_norm": 0.049065109342336655, + "learning_rate": 0.00013322381745353075, + "loss": 0.2525, + "step": 19775 + }, + { + "epoch": 1.6020738820479585, + "grad_norm": 0.05018934980034828, + "learning_rate": 0.00013321931680093614, + "loss": 0.3187, + "step": 19776 + }, + { + "epoch": 
1.6021548930654568, + "grad_norm": 0.05642695724964142, + "learning_rate": 0.00013321481614834152, + "loss": 0.3033, + "step": 19777 + }, + { + "epoch": 1.6022359040829552, + "grad_norm": 0.058584533631801605, + "learning_rate": 0.00013321031549574688, + "loss": 0.3525, + "step": 19778 + }, + { + "epoch": 1.6023169151004537, + "grad_norm": 0.051006946712732315, + "learning_rate": 0.00013320581484315227, + "loss": 0.2552, + "step": 19779 + }, + { + "epoch": 1.602397926117952, + "grad_norm": 0.05385277792811394, + "learning_rate": 0.00013320131419055763, + "loss": 0.2986, + "step": 19780 + }, + { + "epoch": 1.6024789371354504, + "grad_norm": 0.04713079333305359, + "learning_rate": 0.000133196813537963, + "loss": 0.246, + "step": 19781 + }, + { + "epoch": 1.6025599481529489, + "grad_norm": 0.043642837554216385, + "learning_rate": 0.00013319231288536838, + "loss": 0.2647, + "step": 19782 + }, + { + "epoch": 1.6026409591704471, + "grad_norm": 0.05561329796910286, + "learning_rate": 0.00013318781223277377, + "loss": 0.3199, + "step": 19783 + }, + { + "epoch": 1.6027219701879456, + "grad_norm": 0.05162304639816284, + "learning_rate": 0.00013318331158017913, + "loss": 0.264, + "step": 19784 + }, + { + "epoch": 1.602802981205444, + "grad_norm": 0.048306021839380264, + "learning_rate": 0.00013317881092758451, + "loss": 0.2482, + "step": 19785 + }, + { + "epoch": 1.6028839922229423, + "grad_norm": 0.053587399423122406, + "learning_rate": 0.00013317431027498987, + "loss": 0.296, + "step": 19786 + }, + { + "epoch": 1.6029650032404406, + "grad_norm": 0.06057516857981682, + "learning_rate": 0.00013316980962239526, + "loss": 0.288, + "step": 19787 + }, + { + "epoch": 1.6030460142579392, + "grad_norm": 0.05490785837173462, + "learning_rate": 0.00013316530896980062, + "loss": 0.3423, + "step": 19788 + }, + { + "epoch": 1.6031270252754375, + "grad_norm": 0.06102534383535385, + "learning_rate": 0.000133160808317206, + "loss": 0.2629, + "step": 19789 + }, + { + "epoch": 1.6032080362929357, + "grad_norm": 0.05123463645577431, + "learning_rate": 0.00013315630766461137, + "loss": 0.2602, + "step": 19790 + }, + { + "epoch": 1.6032890473104342, + "grad_norm": 0.052425283938646317, + "learning_rate": 0.00013315180701201676, + "loss": 0.285, + "step": 19791 + }, + { + "epoch": 1.6033700583279327, + "grad_norm": 0.05635687708854675, + "learning_rate": 0.00013314730635942212, + "loss": 0.26, + "step": 19792 + }, + { + "epoch": 1.603451069345431, + "grad_norm": 0.05245514586567879, + "learning_rate": 0.0001331428057068275, + "loss": 0.3, + "step": 19793 + }, + { + "epoch": 1.6035320803629294, + "grad_norm": 0.04669433832168579, + "learning_rate": 0.00013313830505423286, + "loss": 0.2645, + "step": 19794 + }, + { + "epoch": 1.6036130913804278, + "grad_norm": 0.060190096497535706, + "learning_rate": 0.00013313380440163825, + "loss": 0.3014, + "step": 19795 + }, + { + "epoch": 1.603694102397926, + "grad_norm": 0.05049053579568863, + "learning_rate": 0.0001331293037490436, + "loss": 0.2762, + "step": 19796 + }, + { + "epoch": 1.6037751134154243, + "grad_norm": 0.052670665085315704, + "learning_rate": 0.000133124803096449, + "loss": 0.3005, + "step": 19797 + }, + { + "epoch": 1.603856124432923, + "grad_norm": 0.044620417058467865, + "learning_rate": 0.00013312030244385436, + "loss": 0.2919, + "step": 19798 + }, + { + "epoch": 1.6039371354504213, + "grad_norm": 0.04406053200364113, + "learning_rate": 0.00013311580179125974, + "loss": 0.2886, + "step": 19799 + }, + { + "epoch": 1.6040181464679195, + "grad_norm": 
0.055919043719768524, + "learning_rate": 0.00013311130113866513, + "loss": 0.3127, + "step": 19800 + }, + { + "epoch": 1.604099157485418, + "grad_norm": 0.05733250826597214, + "learning_rate": 0.0001331068004860705, + "loss": 0.2822, + "step": 19801 + }, + { + "epoch": 1.6041801685029164, + "grad_norm": 0.056601881980895996, + "learning_rate": 0.00013310229983347585, + "loss": 0.3001, + "step": 19802 + }, + { + "epoch": 1.6042611795204147, + "grad_norm": 0.056987252086400986, + "learning_rate": 0.00013309779918088124, + "loss": 0.3274, + "step": 19803 + }, + { + "epoch": 1.6043421905379132, + "grad_norm": 0.051350705325603485, + "learning_rate": 0.0001330932985282866, + "loss": 0.2828, + "step": 19804 + }, + { + "epoch": 1.6044232015554116, + "grad_norm": 0.04618507996201515, + "learning_rate": 0.00013308879787569199, + "loss": 0.2887, + "step": 19805 + }, + { + "epoch": 1.6045042125729099, + "grad_norm": 0.04378882050514221, + "learning_rate": 0.00013308429722309737, + "loss": 0.2404, + "step": 19806 + }, + { + "epoch": 1.6045852235904083, + "grad_norm": 0.05531271919608116, + "learning_rate": 0.00013307979657050273, + "loss": 0.3079, + "step": 19807 + }, + { + "epoch": 1.6046662346079068, + "grad_norm": 0.046200696378946304, + "learning_rate": 0.0001330752959179081, + "loss": 0.247, + "step": 19808 + }, + { + "epoch": 1.604747245625405, + "grad_norm": 0.04871489107608795, + "learning_rate": 0.00013307079526531348, + "loss": 0.2675, + "step": 19809 + }, + { + "epoch": 1.6048282566429033, + "grad_norm": 0.05007130280137062, + "learning_rate": 0.00013306629461271884, + "loss": 0.2578, + "step": 19810 + }, + { + "epoch": 1.6049092676604018, + "grad_norm": 0.04867855831980705, + "learning_rate": 0.00013306179396012423, + "loss": 0.2535, + "step": 19811 + }, + { + "epoch": 1.6049902786779002, + "grad_norm": 0.06068941205739975, + "learning_rate": 0.00013305729330752961, + "loss": 0.3194, + "step": 19812 + }, + { + "epoch": 1.6050712896953985, + "grad_norm": 0.05833900719881058, + "learning_rate": 0.00013305279265493497, + "loss": 0.3053, + "step": 19813 + }, + { + "epoch": 1.605152300712897, + "grad_norm": 0.05458652228116989, + "learning_rate": 0.00013304829200234033, + "loss": 0.2762, + "step": 19814 + }, + { + "epoch": 1.6052333117303954, + "grad_norm": 0.04864118620753288, + "learning_rate": 0.00013304379134974572, + "loss": 0.2749, + "step": 19815 + }, + { + "epoch": 1.6053143227478937, + "grad_norm": 0.050415292382240295, + "learning_rate": 0.00013303929069715108, + "loss": 0.2581, + "step": 19816 + }, + { + "epoch": 1.6053953337653921, + "grad_norm": 0.05480793118476868, + "learning_rate": 0.00013303479004455647, + "loss": 0.2748, + "step": 19817 + }, + { + "epoch": 1.6054763447828906, + "grad_norm": 0.06063258647918701, + "learning_rate": 0.00013303028939196186, + "loss": 0.3672, + "step": 19818 + }, + { + "epoch": 1.6055573558003888, + "grad_norm": 0.045498237013816833, + "learning_rate": 0.00013302578873936722, + "loss": 0.2592, + "step": 19819 + }, + { + "epoch": 1.605638366817887, + "grad_norm": 0.05446869507431984, + "learning_rate": 0.00013302128808677258, + "loss": 0.3247, + "step": 19820 + }, + { + "epoch": 1.6057193778353858, + "grad_norm": 0.05108707770705223, + "learning_rate": 0.00013301678743417796, + "loss": 0.3059, + "step": 19821 + }, + { + "epoch": 1.605800388852884, + "grad_norm": 0.051279496401548386, + "learning_rate": 0.00013301228678158332, + "loss": 0.3078, + "step": 19822 + }, + { + "epoch": 1.6058813998703823, + "grad_norm": 0.04803336411714554, + 
"learning_rate": 0.0001330077861289887, + "loss": 0.2683, + "step": 19823 + }, + { + "epoch": 1.6059624108878807, + "grad_norm": 0.04614466428756714, + "learning_rate": 0.0001330032854763941, + "loss": 0.3015, + "step": 19824 + }, + { + "epoch": 1.6060434219053792, + "grad_norm": 0.04936172068119049, + "learning_rate": 0.00013299878482379946, + "loss": 0.2777, + "step": 19825 + }, + { + "epoch": 1.6061244329228774, + "grad_norm": 0.05479852110147476, + "learning_rate": 0.00013299428417120482, + "loss": 0.2587, + "step": 19826 + }, + { + "epoch": 1.606205443940376, + "grad_norm": 0.05028698593378067, + "learning_rate": 0.0001329897835186102, + "loss": 0.2459, + "step": 19827 + }, + { + "epoch": 1.6062864549578744, + "grad_norm": 0.04531967639923096, + "learning_rate": 0.00013298528286601557, + "loss": 0.2751, + "step": 19828 + }, + { + "epoch": 1.6063674659753726, + "grad_norm": 0.057223107665777206, + "learning_rate": 0.00013298078221342095, + "loss": 0.2888, + "step": 19829 + }, + { + "epoch": 1.606448476992871, + "grad_norm": 0.05254938453435898, + "learning_rate": 0.00013297628156082634, + "loss": 0.2792, + "step": 19830 + }, + { + "epoch": 1.6065294880103695, + "grad_norm": 0.04757395759224892, + "learning_rate": 0.0001329717809082317, + "loss": 0.2652, + "step": 19831 + }, + { + "epoch": 1.6066104990278678, + "grad_norm": 0.04999027028679848, + "learning_rate": 0.00013296728025563706, + "loss": 0.253, + "step": 19832 + }, + { + "epoch": 1.606691510045366, + "grad_norm": 0.04609188064932823, + "learning_rate": 0.00013296277960304245, + "loss": 0.2631, + "step": 19833 + }, + { + "epoch": 1.6067725210628645, + "grad_norm": 0.0687926635146141, + "learning_rate": 0.0001329582789504478, + "loss": 0.2908, + "step": 19834 + }, + { + "epoch": 1.606853532080363, + "grad_norm": 0.04869447648525238, + "learning_rate": 0.0001329537782978532, + "loss": 0.2789, + "step": 19835 + }, + { + "epoch": 1.6069345430978612, + "grad_norm": 0.056731611490249634, + "learning_rate": 0.00013294927764525858, + "loss": 0.2591, + "step": 19836 + }, + { + "epoch": 1.6070155541153597, + "grad_norm": 0.04279259964823723, + "learning_rate": 0.00013294477699266394, + "loss": 0.2566, + "step": 19837 + }, + { + "epoch": 1.6070965651328581, + "grad_norm": 0.04434427246451378, + "learning_rate": 0.0001329402763400693, + "loss": 0.2493, + "step": 19838 + }, + { + "epoch": 1.6071775761503564, + "grad_norm": 0.051402896642684937, + "learning_rate": 0.0001329357756874747, + "loss": 0.2977, + "step": 19839 + }, + { + "epoch": 1.6072585871678549, + "grad_norm": 0.043560683727264404, + "learning_rate": 0.00013293127503488005, + "loss": 0.2336, + "step": 19840 + }, + { + "epoch": 1.6073395981853533, + "grad_norm": 0.04773182049393654, + "learning_rate": 0.00013292677438228544, + "loss": 0.2619, + "step": 19841 + }, + { + "epoch": 1.6074206092028516, + "grad_norm": 0.06585206091403961, + "learning_rate": 0.00013292227372969082, + "loss": 0.3395, + "step": 19842 + }, + { + "epoch": 1.6075016202203498, + "grad_norm": 0.05797537788748741, + "learning_rate": 0.00013291777307709618, + "loss": 0.3004, + "step": 19843 + }, + { + "epoch": 1.6075826312378485, + "grad_norm": 0.04809495434165001, + "learning_rate": 0.00013291327242450154, + "loss": 0.2938, + "step": 19844 + }, + { + "epoch": 1.6076636422553467, + "grad_norm": 0.05231109634041786, + "learning_rate": 0.00013290877177190693, + "loss": 0.3025, + "step": 19845 + }, + { + "epoch": 1.607744653272845, + "grad_norm": 0.050500620156526566, + "learning_rate": 0.0001329042711193123, + 
"loss": 0.2904, + "step": 19846 + }, + { + "epoch": 1.6078256642903435, + "grad_norm": 0.04343164339661598, + "learning_rate": 0.00013289977046671768, + "loss": 0.2609, + "step": 19847 + }, + { + "epoch": 1.607906675307842, + "grad_norm": 0.047382425516843796, + "learning_rate": 0.00013289526981412306, + "loss": 0.2377, + "step": 19848 + }, + { + "epoch": 1.6079876863253402, + "grad_norm": 0.06244940310716629, + "learning_rate": 0.00013289076916152842, + "loss": 0.3594, + "step": 19849 + }, + { + "epoch": 1.6080686973428386, + "grad_norm": 0.04549206793308258, + "learning_rate": 0.00013288626850893378, + "loss": 0.2358, + "step": 19850 + }, + { + "epoch": 1.608149708360337, + "grad_norm": 0.05260597914457321, + "learning_rate": 0.00013288176785633917, + "loss": 0.258, + "step": 19851 + }, + { + "epoch": 1.6082307193778353, + "grad_norm": 0.05020173266530037, + "learning_rate": 0.00013287726720374456, + "loss": 0.3011, + "step": 19852 + }, + { + "epoch": 1.6083117303953338, + "grad_norm": 0.0409814827144146, + "learning_rate": 0.00013287276655114992, + "loss": 0.2598, + "step": 19853 + }, + { + "epoch": 1.6083927414128323, + "grad_norm": 0.05606376752257347, + "learning_rate": 0.0001328682658985553, + "loss": 0.2746, + "step": 19854 + }, + { + "epoch": 1.6084737524303305, + "grad_norm": 0.05606624856591225, + "learning_rate": 0.00013286376524596067, + "loss": 0.3034, + "step": 19855 + }, + { + "epoch": 1.6085547634478288, + "grad_norm": 0.04946485906839371, + "learning_rate": 0.00013285926459336605, + "loss": 0.3051, + "step": 19856 + }, + { + "epoch": 1.6086357744653272, + "grad_norm": 0.05261649191379547, + "learning_rate": 0.0001328547639407714, + "loss": 0.2926, + "step": 19857 + }, + { + "epoch": 1.6087167854828257, + "grad_norm": 0.0560835599899292, + "learning_rate": 0.0001328502632881768, + "loss": 0.3266, + "step": 19858 + }, + { + "epoch": 1.608797796500324, + "grad_norm": 0.04868018627166748, + "learning_rate": 0.00013284576263558216, + "loss": 0.2871, + "step": 19859 + }, + { + "epoch": 1.6088788075178224, + "grad_norm": 0.051668599247932434, + "learning_rate": 0.00013284126198298755, + "loss": 0.2739, + "step": 19860 + }, + { + "epoch": 1.6089598185353209, + "grad_norm": 0.0633268728852272, + "learning_rate": 0.0001328367613303929, + "loss": 0.2766, + "step": 19861 + }, + { + "epoch": 1.6090408295528191, + "grad_norm": 0.05197061970829964, + "learning_rate": 0.0001328322606777983, + "loss": 0.2537, + "step": 19862 + }, + { + "epoch": 1.6091218405703176, + "grad_norm": 0.051223527640104294, + "learning_rate": 0.00013282776002520365, + "loss": 0.2814, + "step": 19863 + }, + { + "epoch": 1.609202851587816, + "grad_norm": 0.05875227227807045, + "learning_rate": 0.00013282325937260904, + "loss": 0.3074, + "step": 19864 + }, + { + "epoch": 1.6092838626053143, + "grad_norm": 0.046049121767282486, + "learning_rate": 0.0001328187587200144, + "loss": 0.2814, + "step": 19865 + }, + { + "epoch": 1.6093648736228126, + "grad_norm": 0.04564657807350159, + "learning_rate": 0.0001328142580674198, + "loss": 0.2924, + "step": 19866 + }, + { + "epoch": 1.6094458846403112, + "grad_norm": 0.058303602039813995, + "learning_rate": 0.00013280975741482515, + "loss": 0.3222, + "step": 19867 + }, + { + "epoch": 1.6095268956578095, + "grad_norm": 0.07431328296661377, + "learning_rate": 0.00013280525676223054, + "loss": 0.3287, + "step": 19868 + }, + { + "epoch": 1.6096079066753077, + "grad_norm": 0.054437801241874695, + "learning_rate": 0.0001328007561096359, + "loss": 0.2953, + "step": 19869 + }, + { + 
"epoch": 1.6096889176928062, + "grad_norm": 0.04726843908429146, + "learning_rate": 0.00013279625545704128, + "loss": 0.2388, + "step": 19870 + }, + { + "epoch": 1.6097699287103047, + "grad_norm": 0.052995022386312485, + "learning_rate": 0.00013279175480444664, + "loss": 0.2557, + "step": 19871 + }, + { + "epoch": 1.609850939727803, + "grad_norm": 0.06081311032176018, + "learning_rate": 0.00013278725415185203, + "loss": 0.3259, + "step": 19872 + }, + { + "epoch": 1.6099319507453014, + "grad_norm": 0.0430576428771019, + "learning_rate": 0.0001327827534992574, + "loss": 0.2449, + "step": 19873 + }, + { + "epoch": 1.6100129617627998, + "grad_norm": 0.050903283059597015, + "learning_rate": 0.00013277825284666278, + "loss": 0.2343, + "step": 19874 + }, + { + "epoch": 1.610093972780298, + "grad_norm": 0.053889211267232895, + "learning_rate": 0.00013277375219406817, + "loss": 0.2699, + "step": 19875 + }, + { + "epoch": 1.6101749837977966, + "grad_norm": 0.04701241850852966, + "learning_rate": 0.00013276925154147353, + "loss": 0.2732, + "step": 19876 + }, + { + "epoch": 1.610255994815295, + "grad_norm": 0.060669828206300735, + "learning_rate": 0.00013276475088887889, + "loss": 0.3078, + "step": 19877 + }, + { + "epoch": 1.6103370058327933, + "grad_norm": 0.0490998774766922, + "learning_rate": 0.00013276025023628427, + "loss": 0.2881, + "step": 19878 + }, + { + "epoch": 1.6104180168502915, + "grad_norm": 0.04689916968345642, + "learning_rate": 0.00013275574958368963, + "loss": 0.2816, + "step": 19879 + }, + { + "epoch": 1.61049902786779, + "grad_norm": 0.04962588846683502, + "learning_rate": 0.00013275124893109502, + "loss": 0.2833, + "step": 19880 + }, + { + "epoch": 1.6105800388852884, + "grad_norm": 0.05080372467637062, + "learning_rate": 0.0001327467482785004, + "loss": 0.2993, + "step": 19881 + }, + { + "epoch": 1.6106610499027867, + "grad_norm": 0.051091890782117844, + "learning_rate": 0.00013274224762590577, + "loss": 0.2686, + "step": 19882 + }, + { + "epoch": 1.6107420609202852, + "grad_norm": 0.04882129281759262, + "learning_rate": 0.00013273774697331113, + "loss": 0.2645, + "step": 19883 + }, + { + "epoch": 1.6108230719377836, + "grad_norm": 0.04933195188641548, + "learning_rate": 0.00013273324632071651, + "loss": 0.2666, + "step": 19884 + }, + { + "epoch": 1.6109040829552819, + "grad_norm": 0.05477169156074524, + "learning_rate": 0.00013272874566812187, + "loss": 0.309, + "step": 19885 + }, + { + "epoch": 1.6109850939727803, + "grad_norm": 0.05535150319337845, + "learning_rate": 0.00013272424501552726, + "loss": 0.2667, + "step": 19886 + }, + { + "epoch": 1.6110661049902788, + "grad_norm": 0.06334707140922546, + "learning_rate": 0.00013271974436293265, + "loss": 0.3023, + "step": 19887 + }, + { + "epoch": 1.611147116007777, + "grad_norm": 0.054047200828790665, + "learning_rate": 0.000132715243710338, + "loss": 0.2959, + "step": 19888 + }, + { + "epoch": 1.6112281270252753, + "grad_norm": 0.058965399861335754, + "learning_rate": 0.00013271074305774337, + "loss": 0.2971, + "step": 19889 + }, + { + "epoch": 1.611309138042774, + "grad_norm": 0.054235897958278656, + "learning_rate": 0.00013270624240514876, + "loss": 0.2655, + "step": 19890 + }, + { + "epoch": 1.6113901490602722, + "grad_norm": 0.04498407617211342, + "learning_rate": 0.00013270174175255412, + "loss": 0.2698, + "step": 19891 + }, + { + "epoch": 1.6114711600777705, + "grad_norm": 0.04842450097203255, + "learning_rate": 0.0001326972410999595, + "loss": 0.2538, + "step": 19892 + }, + { + "epoch": 1.611552171095269, + "grad_norm": 
0.05681498348712921, + "learning_rate": 0.0001326927404473649, + "loss": 0.306, + "step": 19893 + }, + { + "epoch": 1.6116331821127674, + "grad_norm": 0.058336373418569565, + "learning_rate": 0.00013268823979477025, + "loss": 0.302, + "step": 19894 + }, + { + "epoch": 1.6117141931302656, + "grad_norm": 0.060595858842134476, + "learning_rate": 0.0001326837391421756, + "loss": 0.3151, + "step": 19895 + }, + { + "epoch": 1.6117952041477641, + "grad_norm": 0.05249301716685295, + "learning_rate": 0.000132679238489581, + "loss": 0.2786, + "step": 19896 + }, + { + "epoch": 1.6118762151652626, + "grad_norm": 0.05370105430483818, + "learning_rate": 0.00013267473783698636, + "loss": 0.2927, + "step": 19897 + }, + { + "epoch": 1.6119572261827608, + "grad_norm": 0.06065599247813225, + "learning_rate": 0.00013267023718439174, + "loss": 0.3331, + "step": 19898 + }, + { + "epoch": 1.612038237200259, + "grad_norm": 0.05296957865357399, + "learning_rate": 0.00013266573653179713, + "loss": 0.3013, + "step": 19899 + }, + { + "epoch": 1.6121192482177578, + "grad_norm": 0.043871358036994934, + "learning_rate": 0.0001326612358792025, + "loss": 0.2553, + "step": 19900 + }, + { + "epoch": 1.612200259235256, + "grad_norm": 0.053302399814128876, + "learning_rate": 0.00013265673522660785, + "loss": 0.2641, + "step": 19901 + }, + { + "epoch": 1.6122812702527543, + "grad_norm": 0.05802002549171448, + "learning_rate": 0.00013265223457401324, + "loss": 0.2746, + "step": 19902 + }, + { + "epoch": 1.6123622812702527, + "grad_norm": 0.05712159350514412, + "learning_rate": 0.0001326477339214186, + "loss": 0.3001, + "step": 19903 + }, + { + "epoch": 1.6124432922877512, + "grad_norm": 0.05771899223327637, + "learning_rate": 0.00013264323326882399, + "loss": 0.2807, + "step": 19904 + }, + { + "epoch": 1.6125243033052494, + "grad_norm": 0.0562940388917923, + "learning_rate": 0.00013263873261622937, + "loss": 0.2796, + "step": 19905 + }, + { + "epoch": 1.612605314322748, + "grad_norm": 0.054468363523483276, + "learning_rate": 0.00013263423196363473, + "loss": 0.2586, + "step": 19906 + }, + { + "epoch": 1.6126863253402464, + "grad_norm": 0.05557860806584358, + "learning_rate": 0.0001326297313110401, + "loss": 0.2958, + "step": 19907 + }, + { + "epoch": 1.6127673363577446, + "grad_norm": 0.04691595211625099, + "learning_rate": 0.00013262523065844548, + "loss": 0.2893, + "step": 19908 + }, + { + "epoch": 1.612848347375243, + "grad_norm": 0.04499751329421997, + "learning_rate": 0.00013262073000585084, + "loss": 0.2617, + "step": 19909 + }, + { + "epoch": 1.6129293583927415, + "grad_norm": 0.05714256316423416, + "learning_rate": 0.00013261622935325623, + "loss": 0.3022, + "step": 19910 + }, + { + "epoch": 1.6130103694102398, + "grad_norm": 0.04927900433540344, + "learning_rate": 0.00013261172870066161, + "loss": 0.2652, + "step": 19911 + }, + { + "epoch": 1.613091380427738, + "grad_norm": 0.05857254937291145, + "learning_rate": 0.00013260722804806697, + "loss": 0.28, + "step": 19912 + }, + { + "epoch": 1.6131723914452365, + "grad_norm": 0.04966528341174126, + "learning_rate": 0.00013260272739547233, + "loss": 0.2972, + "step": 19913 + }, + { + "epoch": 1.613253402462735, + "grad_norm": 0.0434284582734108, + "learning_rate": 0.00013259822674287772, + "loss": 0.2709, + "step": 19914 + }, + { + "epoch": 1.6133344134802332, + "grad_norm": 0.0510571151971817, + "learning_rate": 0.00013259372609028308, + "loss": 0.2709, + "step": 19915 + }, + { + "epoch": 1.6134154244977317, + "grad_norm": 0.04903542995452881, + "learning_rate": 
0.00013258922543768847, + "loss": 0.2988, + "step": 19916 + }, + { + "epoch": 1.6134964355152301, + "grad_norm": 0.047472693026065826, + "learning_rate": 0.00013258472478509386, + "loss": 0.2568, + "step": 19917 + }, + { + "epoch": 1.6135774465327284, + "grad_norm": 0.04385467246174812, + "learning_rate": 0.00013258022413249922, + "loss": 0.2541, + "step": 19918 + }, + { + "epoch": 1.6136584575502269, + "grad_norm": 0.05597635731101036, + "learning_rate": 0.00013257572347990458, + "loss": 0.2855, + "step": 19919 + }, + { + "epoch": 1.6137394685677253, + "grad_norm": 0.04206670820713043, + "learning_rate": 0.00013257122282730996, + "loss": 0.2741, + "step": 19920 + }, + { + "epoch": 1.6138204795852236, + "grad_norm": 0.04178616777062416, + "learning_rate": 0.00013256672217471532, + "loss": 0.2366, + "step": 19921 + }, + { + "epoch": 1.6139014906027218, + "grad_norm": 0.046484146267175674, + "learning_rate": 0.0001325622215221207, + "loss": 0.248, + "step": 19922 + }, + { + "epoch": 1.6139825016202205, + "grad_norm": 0.05964015796780586, + "learning_rate": 0.0001325577208695261, + "loss": 0.3048, + "step": 19923 + }, + { + "epoch": 1.6140635126377187, + "grad_norm": 0.04161230847239494, + "learning_rate": 0.00013255322021693146, + "loss": 0.2623, + "step": 19924 + }, + { + "epoch": 1.614144523655217, + "grad_norm": 0.05486117675900459, + "learning_rate": 0.00013254871956433685, + "loss": 0.2727, + "step": 19925 + }, + { + "epoch": 1.6142255346727155, + "grad_norm": 0.053675271570682526, + "learning_rate": 0.0001325442189117422, + "loss": 0.2751, + "step": 19926 + }, + { + "epoch": 1.614306545690214, + "grad_norm": 0.054313309490680695, + "learning_rate": 0.00013253971825914757, + "loss": 0.3051, + "step": 19927 + }, + { + "epoch": 1.6143875567077122, + "grad_norm": 0.06386713683605194, + "learning_rate": 0.00013253521760655295, + "loss": 0.2903, + "step": 19928 + }, + { + "epoch": 1.6144685677252106, + "grad_norm": 0.049881864339113235, + "learning_rate": 0.00013253071695395834, + "loss": 0.2439, + "step": 19929 + }, + { + "epoch": 1.614549578742709, + "grad_norm": 0.05016208440065384, + "learning_rate": 0.0001325262163013637, + "loss": 0.2485, + "step": 19930 + }, + { + "epoch": 1.6146305897602073, + "grad_norm": 0.055789947509765625, + "learning_rate": 0.0001325217156487691, + "loss": 0.2693, + "step": 19931 + }, + { + "epoch": 1.6147116007777058, + "grad_norm": 0.04924854636192322, + "learning_rate": 0.00013251721499617445, + "loss": 0.2696, + "step": 19932 + }, + { + "epoch": 1.6147926117952043, + "grad_norm": 0.052969809621572495, + "learning_rate": 0.00013251271434357983, + "loss": 0.2935, + "step": 19933 + }, + { + "epoch": 1.6148736228127025, + "grad_norm": 0.046944133937358856, + "learning_rate": 0.0001325082136909852, + "loss": 0.2736, + "step": 19934 + }, + { + "epoch": 1.6149546338302008, + "grad_norm": 0.05402558296918869, + "learning_rate": 0.00013250371303839058, + "loss": 0.2807, + "step": 19935 + }, + { + "epoch": 1.6150356448476992, + "grad_norm": 0.05391369387507439, + "learning_rate": 0.00013249921238579594, + "loss": 0.3001, + "step": 19936 + }, + { + "epoch": 1.6151166558651977, + "grad_norm": 0.04467498138546944, + "learning_rate": 0.00013249471173320133, + "loss": 0.2821, + "step": 19937 + }, + { + "epoch": 1.615197666882696, + "grad_norm": 0.047838423401117325, + "learning_rate": 0.0001324902110806067, + "loss": 0.2755, + "step": 19938 + }, + { + "epoch": 1.6152786779001944, + "grad_norm": 0.04848472401499748, + "learning_rate": 0.00013248571042801208, + "loss": 
0.2691, + "step": 19939 + }, + { + "epoch": 1.6153596889176929, + "grad_norm": 0.050038691610097885, + "learning_rate": 0.00013248120977541744, + "loss": 0.2876, + "step": 19940 + }, + { + "epoch": 1.6154406999351911, + "grad_norm": 0.04588980972766876, + "learning_rate": 0.00013247670912282282, + "loss": 0.2601, + "step": 19941 + }, + { + "epoch": 1.6155217109526896, + "grad_norm": 0.0561276376247406, + "learning_rate": 0.00013247220847022818, + "loss": 0.2882, + "step": 19942 + }, + { + "epoch": 1.615602721970188, + "grad_norm": 0.05049389973282814, + "learning_rate": 0.00013246770781763357, + "loss": 0.2887, + "step": 19943 + }, + { + "epoch": 1.6156837329876863, + "grad_norm": 0.045459870249032974, + "learning_rate": 0.00013246320716503893, + "loss": 0.2607, + "step": 19944 + }, + { + "epoch": 1.6157647440051845, + "grad_norm": 0.06378405541181564, + "learning_rate": 0.00013245870651244432, + "loss": 0.2883, + "step": 19945 + }, + { + "epoch": 1.6158457550226832, + "grad_norm": 0.042002785950899124, + "learning_rate": 0.00013245420585984968, + "loss": 0.245, + "step": 19946 + }, + { + "epoch": 1.6159267660401815, + "grad_norm": 0.058416012674570084, + "learning_rate": 0.00013244970520725506, + "loss": 0.2586, + "step": 19947 + }, + { + "epoch": 1.6160077770576797, + "grad_norm": 0.06198345497250557, + "learning_rate": 0.00013244520455466042, + "loss": 0.2756, + "step": 19948 + }, + { + "epoch": 1.6160887880751782, + "grad_norm": 0.04815270006656647, + "learning_rate": 0.0001324407039020658, + "loss": 0.2747, + "step": 19949 + }, + { + "epoch": 1.6161697990926767, + "grad_norm": 0.05506781488656998, + "learning_rate": 0.00013243620324947117, + "loss": 0.2645, + "step": 19950 + }, + { + "epoch": 1.616250810110175, + "grad_norm": 0.053383927792310715, + "learning_rate": 0.00013243170259687656, + "loss": 0.2612, + "step": 19951 + }, + { + "epoch": 1.6163318211276734, + "grad_norm": 0.05213800072669983, + "learning_rate": 0.00013242720194428192, + "loss": 0.3022, + "step": 19952 + }, + { + "epoch": 1.6164128321451718, + "grad_norm": 0.05713577941060066, + "learning_rate": 0.0001324227012916873, + "loss": 0.272, + "step": 19953 + }, + { + "epoch": 1.61649384316267, + "grad_norm": 0.06162557750940323, + "learning_rate": 0.00013241820063909267, + "loss": 0.2899, + "step": 19954 + }, + { + "epoch": 1.6165748541801686, + "grad_norm": 0.0560946948826313, + "learning_rate": 0.00013241369998649805, + "loss": 0.2882, + "step": 19955 + }, + { + "epoch": 1.616655865197667, + "grad_norm": 0.0490209124982357, + "learning_rate": 0.00013240919933390344, + "loss": 0.258, + "step": 19956 + }, + { + "epoch": 1.6167368762151653, + "grad_norm": 0.05507722124457359, + "learning_rate": 0.0001324046986813088, + "loss": 0.3084, + "step": 19957 + }, + { + "epoch": 1.6168178872326635, + "grad_norm": 0.05552484840154648, + "learning_rate": 0.00013240019802871416, + "loss": 0.2671, + "step": 19958 + }, + { + "epoch": 1.616898898250162, + "grad_norm": 0.05078752711415291, + "learning_rate": 0.00013239569737611955, + "loss": 0.2699, + "step": 19959 + }, + { + "epoch": 1.6169799092676604, + "grad_norm": 0.058282483369112015, + "learning_rate": 0.0001323911967235249, + "loss": 0.2638, + "step": 19960 + }, + { + "epoch": 1.6170609202851587, + "grad_norm": 0.05486408621072769, + "learning_rate": 0.0001323866960709303, + "loss": 0.3054, + "step": 19961 + }, + { + "epoch": 1.6171419313026572, + "grad_norm": 0.06649196892976761, + "learning_rate": 0.00013238219541833568, + "loss": 0.2942, + "step": 19962 + }, + { + "epoch": 
1.6172229423201556, + "grad_norm": 0.04470723867416382, + "learning_rate": 0.00013237769476574104, + "loss": 0.2525, + "step": 19963 + }, + { + "epoch": 1.6173039533376539, + "grad_norm": 0.04812270402908325, + "learning_rate": 0.0001323731941131464, + "loss": 0.307, + "step": 19964 + }, + { + "epoch": 1.6173849643551523, + "grad_norm": 0.04237162694334984, + "learning_rate": 0.0001323686934605518, + "loss": 0.2718, + "step": 19965 + }, + { + "epoch": 1.6174659753726508, + "grad_norm": 0.05187264829874039, + "learning_rate": 0.00013236419280795715, + "loss": 0.2814, + "step": 19966 + }, + { + "epoch": 1.617546986390149, + "grad_norm": 0.04562138766050339, + "learning_rate": 0.00013235969215536254, + "loss": 0.2613, + "step": 19967 + }, + { + "epoch": 1.6176279974076473, + "grad_norm": 0.057726096361875534, + "learning_rate": 0.00013235519150276792, + "loss": 0.2808, + "step": 19968 + }, + { + "epoch": 1.617709008425146, + "grad_norm": 0.04597414657473564, + "learning_rate": 0.00013235069085017328, + "loss": 0.2576, + "step": 19969 + }, + { + "epoch": 1.6177900194426442, + "grad_norm": 0.0467311330139637, + "learning_rate": 0.00013234619019757864, + "loss": 0.2876, + "step": 19970 + }, + { + "epoch": 1.6178710304601425, + "grad_norm": 0.05734841153025627, + "learning_rate": 0.00013234168954498403, + "loss": 0.2855, + "step": 19971 + }, + { + "epoch": 1.617952041477641, + "grad_norm": 0.05373970791697502, + "learning_rate": 0.0001323371888923894, + "loss": 0.2681, + "step": 19972 + }, + { + "epoch": 1.6180330524951394, + "grad_norm": 0.054225221276283264, + "learning_rate": 0.00013233268823979478, + "loss": 0.2718, + "step": 19973 + }, + { + "epoch": 1.6181140635126376, + "grad_norm": 0.059341464191675186, + "learning_rate": 0.00013232818758720017, + "loss": 0.298, + "step": 19974 + }, + { + "epoch": 1.6181950745301361, + "grad_norm": 0.05167774483561516, + "learning_rate": 0.00013232368693460553, + "loss": 0.2943, + "step": 19975 + }, + { + "epoch": 1.6182760855476346, + "grad_norm": 0.04827655479311943, + "learning_rate": 0.00013231918628201089, + "loss": 0.3108, + "step": 19976 + }, + { + "epoch": 1.6183570965651328, + "grad_norm": 0.058139413595199585, + "learning_rate": 0.00013231468562941627, + "loss": 0.3123, + "step": 19977 + }, + { + "epoch": 1.6184381075826313, + "grad_norm": 0.053694140166044235, + "learning_rate": 0.00013231018497682163, + "loss": 0.2752, + "step": 19978 + }, + { + "epoch": 1.6185191186001298, + "grad_norm": 0.06099880859255791, + "learning_rate": 0.00013230568432422702, + "loss": 0.3095, + "step": 19979 + }, + { + "epoch": 1.618600129617628, + "grad_norm": 0.04789900779724121, + "learning_rate": 0.0001323011836716324, + "loss": 0.3085, + "step": 19980 + }, + { + "epoch": 1.6186811406351262, + "grad_norm": 0.055885620415210724, + "learning_rate": 0.00013229668301903777, + "loss": 0.2826, + "step": 19981 + }, + { + "epoch": 1.6187621516526247, + "grad_norm": 0.05799628049135208, + "learning_rate": 0.00013229218236644313, + "loss": 0.3371, + "step": 19982 + }, + { + "epoch": 1.6188431626701232, + "grad_norm": 0.06020451337099075, + "learning_rate": 0.00013228768171384851, + "loss": 0.3252, + "step": 19983 + }, + { + "epoch": 1.6189241736876214, + "grad_norm": 0.04801985248923302, + "learning_rate": 0.00013228318106125387, + "loss": 0.2793, + "step": 19984 + }, + { + "epoch": 1.61900518470512, + "grad_norm": 0.055071376264095306, + "learning_rate": 0.00013227868040865926, + "loss": 0.2328, + "step": 19985 + }, + { + "epoch": 1.6190861957226184, + "grad_norm": 
0.050464339554309845, + "learning_rate": 0.00013227417975606465, + "loss": 0.3128, + "step": 19986 + }, + { + "epoch": 1.6191672067401166, + "grad_norm": 0.048161376267671585, + "learning_rate": 0.00013226967910347, + "loss": 0.2921, + "step": 19987 + }, + { + "epoch": 1.619248217757615, + "grad_norm": 0.049764443188905716, + "learning_rate": 0.00013226517845087537, + "loss": 0.2977, + "step": 19988 + }, + { + "epoch": 1.6193292287751135, + "grad_norm": 0.04596319422125816, + "learning_rate": 0.00013226067779828076, + "loss": 0.2582, + "step": 19989 + }, + { + "epoch": 1.6194102397926118, + "grad_norm": 0.048334065824747086, + "learning_rate": 0.00013225617714568612, + "loss": 0.2833, + "step": 19990 + }, + { + "epoch": 1.61949125081011, + "grad_norm": 0.046483226120471954, + "learning_rate": 0.0001322516764930915, + "loss": 0.2516, + "step": 19991 + }, + { + "epoch": 1.6195722618276087, + "grad_norm": 0.05041767284274101, + "learning_rate": 0.0001322471758404969, + "loss": 0.2894, + "step": 19992 + }, + { + "epoch": 1.619653272845107, + "grad_norm": 0.06127709522843361, + "learning_rate": 0.00013224267518790225, + "loss": 0.2809, + "step": 19993 + }, + { + "epoch": 1.6197342838626052, + "grad_norm": 0.04631965979933739, + "learning_rate": 0.00013223817453530764, + "loss": 0.2726, + "step": 19994 + }, + { + "epoch": 1.6198152948801037, + "grad_norm": 0.04897136241197586, + "learning_rate": 0.000132233673882713, + "loss": 0.2937, + "step": 19995 + }, + { + "epoch": 1.6198963058976021, + "grad_norm": 0.045244332402944565, + "learning_rate": 0.00013222917323011836, + "loss": 0.2655, + "step": 19996 + }, + { + "epoch": 1.6199773169151004, + "grad_norm": 0.054183993488550186, + "learning_rate": 0.00013222467257752374, + "loss": 0.3365, + "step": 19997 + }, + { + "epoch": 1.6200583279325989, + "grad_norm": 0.047290198504924774, + "learning_rate": 0.00013222017192492913, + "loss": 0.2607, + "step": 19998 + }, + { + "epoch": 1.6201393389500973, + "grad_norm": 0.06739253550767899, + "learning_rate": 0.0001322156712723345, + "loss": 0.3229, + "step": 19999 + }, + { + "epoch": 1.6202203499675956, + "grad_norm": 0.05656042322516441, + "learning_rate": 0.00013221117061973988, + "loss": 0.2965, + "step": 20000 + }, + { + "epoch": 1.6203013609850938, + "grad_norm": 0.05531647056341171, + "learning_rate": 0.00013220666996714524, + "loss": 0.2771, + "step": 20001 + }, + { + "epoch": 1.6203823720025925, + "grad_norm": 0.04169746860861778, + "learning_rate": 0.0001322021693145506, + "loss": 0.2543, + "step": 20002 + }, + { + "epoch": 1.6204633830200907, + "grad_norm": 0.05271003022789955, + "learning_rate": 0.00013219766866195599, + "loss": 0.3237, + "step": 20003 + }, + { + "epoch": 1.620544394037589, + "grad_norm": 0.04714246466755867, + "learning_rate": 0.00013219316800936137, + "loss": 0.2578, + "step": 20004 + }, + { + "epoch": 1.6206254050550875, + "grad_norm": 0.04622301459312439, + "learning_rate": 0.00013218866735676673, + "loss": 0.2473, + "step": 20005 + }, + { + "epoch": 1.620706416072586, + "grad_norm": 0.06228908896446228, + "learning_rate": 0.00013218416670417212, + "loss": 0.2811, + "step": 20006 + }, + { + "epoch": 1.6207874270900842, + "grad_norm": 0.04656405746936798, + "learning_rate": 0.00013217966605157748, + "loss": 0.2645, + "step": 20007 + }, + { + "epoch": 1.6208684381075826, + "grad_norm": 0.04981234669685364, + "learning_rate": 0.00013217516539898287, + "loss": 0.3067, + "step": 20008 + }, + { + "epoch": 1.620949449125081, + "grad_norm": 0.059026286005973816, + "learning_rate": 
0.00013217066474638823, + "loss": 0.2659, + "step": 20009 + }, + { + "epoch": 1.6210304601425793, + "grad_norm": 0.05904344469308853, + "learning_rate": 0.00013216616409379362, + "loss": 0.2942, + "step": 20010 + }, + { + "epoch": 1.6211114711600778, + "grad_norm": 0.057300008833408356, + "learning_rate": 0.00013216166344119898, + "loss": 0.363, + "step": 20011 + }, + { + "epoch": 1.6211924821775763, + "grad_norm": 0.04654074087738991, + "learning_rate": 0.00013215716278860436, + "loss": 0.2622, + "step": 20012 + }, + { + "epoch": 1.6212734931950745, + "grad_norm": 0.04676036536693573, + "learning_rate": 0.00013215266213600972, + "loss": 0.2864, + "step": 20013 + }, + { + "epoch": 1.6213545042125728, + "grad_norm": 0.05033322051167488, + "learning_rate": 0.0001321481614834151, + "loss": 0.2451, + "step": 20014 + }, + { + "epoch": 1.6214355152300715, + "grad_norm": 0.05171377584338188, + "learning_rate": 0.00013214366083082047, + "loss": 0.2608, + "step": 20015 + }, + { + "epoch": 1.6215165262475697, + "grad_norm": 0.0688476637005806, + "learning_rate": 0.00013213916017822586, + "loss": 0.3154, + "step": 20016 + }, + { + "epoch": 1.621597537265068, + "grad_norm": 0.06028394401073456, + "learning_rate": 0.00013213465952563122, + "loss": 0.2567, + "step": 20017 + }, + { + "epoch": 1.6216785482825664, + "grad_norm": 0.05227206274867058, + "learning_rate": 0.0001321301588730366, + "loss": 0.2897, + "step": 20018 + }, + { + "epoch": 1.6217595593000649, + "grad_norm": 0.06116931885480881, + "learning_rate": 0.00013212565822044196, + "loss": 0.2995, + "step": 20019 + }, + { + "epoch": 1.6218405703175631, + "grad_norm": 0.06551968306303024, + "learning_rate": 0.00013212115756784735, + "loss": 0.3206, + "step": 20020 + }, + { + "epoch": 1.6219215813350616, + "grad_norm": 0.052293311804533005, + "learning_rate": 0.0001321166569152527, + "loss": 0.269, + "step": 20021 + }, + { + "epoch": 1.62200259235256, + "grad_norm": 0.04668476805090904, + "learning_rate": 0.0001321121562626581, + "loss": 0.2801, + "step": 20022 + }, + { + "epoch": 1.6220836033700583, + "grad_norm": 0.044271472841501236, + "learning_rate": 0.00013210765561006346, + "loss": 0.2416, + "step": 20023 + }, + { + "epoch": 1.6221646143875565, + "grad_norm": 0.04560382664203644, + "learning_rate": 0.00013210315495746885, + "loss": 0.256, + "step": 20024 + }, + { + "epoch": 1.6222456254050552, + "grad_norm": 0.06311310082674026, + "learning_rate": 0.0001320986543048742, + "loss": 0.3124, + "step": 20025 + }, + { + "epoch": 1.6223266364225535, + "grad_norm": 0.05169247090816498, + "learning_rate": 0.0001320941536522796, + "loss": 0.2653, + "step": 20026 + }, + { + "epoch": 1.6224076474400517, + "grad_norm": 0.05335776507854462, + "learning_rate": 0.00013208965299968495, + "loss": 0.2366, + "step": 20027 + }, + { + "epoch": 1.6224886584575502, + "grad_norm": 0.06243913248181343, + "learning_rate": 0.00013208515234709034, + "loss": 0.3162, + "step": 20028 + }, + { + "epoch": 1.6225696694750487, + "grad_norm": 0.04636109620332718, + "learning_rate": 0.0001320806516944957, + "loss": 0.2772, + "step": 20029 + }, + { + "epoch": 1.622650680492547, + "grad_norm": 0.042730703949928284, + "learning_rate": 0.0001320761510419011, + "loss": 0.2821, + "step": 20030 + }, + { + "epoch": 1.6227316915100454, + "grad_norm": 0.05869164690375328, + "learning_rate": 0.00013207165038930645, + "loss": 0.2794, + "step": 20031 + }, + { + "epoch": 1.6228127025275438, + "grad_norm": 0.05897437408566475, + "learning_rate": 0.00013206714973671183, + "loss": 0.2435, + 
"step": 20032 + }, + { + "epoch": 1.622893713545042, + "grad_norm": 0.04350670427083969, + "learning_rate": 0.0001320626490841172, + "loss": 0.2421, + "step": 20033 + }, + { + "epoch": 1.6229747245625405, + "grad_norm": 0.05823887139558792, + "learning_rate": 0.00013205814843152258, + "loss": 0.2929, + "step": 20034 + }, + { + "epoch": 1.623055735580039, + "grad_norm": 0.0536017045378685, + "learning_rate": 0.00013205364777892794, + "loss": 0.2449, + "step": 20035 + }, + { + "epoch": 1.6231367465975373, + "grad_norm": 0.05177351087331772, + "learning_rate": 0.00013204914712633333, + "loss": 0.2554, + "step": 20036 + }, + { + "epoch": 1.6232177576150355, + "grad_norm": 0.04570000618696213, + "learning_rate": 0.00013204464647373872, + "loss": 0.2591, + "step": 20037 + }, + { + "epoch": 1.623298768632534, + "grad_norm": 0.046484798192977905, + "learning_rate": 0.00013204014582114408, + "loss": 0.258, + "step": 20038 + }, + { + "epoch": 1.6233797796500324, + "grad_norm": 0.057637009769678116, + "learning_rate": 0.00013203564516854944, + "loss": 0.2751, + "step": 20039 + }, + { + "epoch": 1.6234607906675307, + "grad_norm": 0.05245767906308174, + "learning_rate": 0.00013203114451595482, + "loss": 0.2826, + "step": 20040 + }, + { + "epoch": 1.6235418016850292, + "grad_norm": 0.047266315668821335, + "learning_rate": 0.00013202664386336018, + "loss": 0.2923, + "step": 20041 + }, + { + "epoch": 1.6236228127025276, + "grad_norm": 0.051065824925899506, + "learning_rate": 0.00013202214321076557, + "loss": 0.3063, + "step": 20042 + }, + { + "epoch": 1.6237038237200259, + "grad_norm": 0.046526215970516205, + "learning_rate": 0.00013201764255817096, + "loss": 0.2932, + "step": 20043 + }, + { + "epoch": 1.6237848347375243, + "grad_norm": 0.04801933839917183, + "learning_rate": 0.00013201314190557632, + "loss": 0.2875, + "step": 20044 + }, + { + "epoch": 1.6238658457550228, + "grad_norm": 0.050072457641363144, + "learning_rate": 0.00013200864125298168, + "loss": 0.2944, + "step": 20045 + }, + { + "epoch": 1.623946856772521, + "grad_norm": 0.06301888823509216, + "learning_rate": 0.00013200414060038706, + "loss": 0.28, + "step": 20046 + }, + { + "epoch": 1.6240278677900193, + "grad_norm": 0.05195078253746033, + "learning_rate": 0.00013199963994779242, + "loss": 0.2753, + "step": 20047 + }, + { + "epoch": 1.624108878807518, + "grad_norm": 0.05736146122217178, + "learning_rate": 0.0001319951392951978, + "loss": 0.2927, + "step": 20048 + }, + { + "epoch": 1.6241898898250162, + "grad_norm": 0.046378325670957565, + "learning_rate": 0.0001319906386426032, + "loss": 0.2689, + "step": 20049 + }, + { + "epoch": 1.6242709008425145, + "grad_norm": 0.046528127044439316, + "learning_rate": 0.00013198613799000856, + "loss": 0.2705, + "step": 20050 + }, + { + "epoch": 1.624351911860013, + "grad_norm": 0.06076221540570259, + "learning_rate": 0.00013198163733741392, + "loss": 0.296, + "step": 20051 + }, + { + "epoch": 1.6244329228775114, + "grad_norm": 0.060062017291784286, + "learning_rate": 0.0001319771366848193, + "loss": 0.2803, + "step": 20052 + }, + { + "epoch": 1.6245139338950096, + "grad_norm": 0.05728905275464058, + "learning_rate": 0.00013197263603222467, + "loss": 0.2699, + "step": 20053 + }, + { + "epoch": 1.624594944912508, + "grad_norm": 0.047682132571935654, + "learning_rate": 0.00013196813537963005, + "loss": 0.3047, + "step": 20054 + }, + { + "epoch": 1.6246759559300066, + "grad_norm": 0.05280459299683571, + "learning_rate": 0.00013196363472703544, + "loss": 0.291, + "step": 20055 + }, + { + "epoch": 
1.6247569669475048, + "grad_norm": 0.06270458549261093, + "learning_rate": 0.0001319591340744408, + "loss": 0.333, + "step": 20056 + }, + { + "epoch": 1.6248379779650033, + "grad_norm": 0.056303154677152634, + "learning_rate": 0.00013195463342184616, + "loss": 0.2765, + "step": 20057 + }, + { + "epoch": 1.6249189889825018, + "grad_norm": 0.046801600605249405, + "learning_rate": 0.00013195013276925155, + "loss": 0.2613, + "step": 20058 + }, + { + "epoch": 1.625, + "grad_norm": 0.0449674017727375, + "learning_rate": 0.0001319456321166569, + "loss": 0.3219, + "step": 20059 + }, + { + "epoch": 1.6250810110174982, + "grad_norm": 0.05680710822343826, + "learning_rate": 0.0001319411314640623, + "loss": 0.3005, + "step": 20060 + }, + { + "epoch": 1.6251620220349967, + "grad_norm": 0.043305426836013794, + "learning_rate": 0.00013193663081146768, + "loss": 0.2581, + "step": 20061 + }, + { + "epoch": 1.6252430330524952, + "grad_norm": 0.05441425368189812, + "learning_rate": 0.00013193213015887304, + "loss": 0.3147, + "step": 20062 + }, + { + "epoch": 1.6253240440699934, + "grad_norm": 0.05140216648578644, + "learning_rate": 0.00013192762950627843, + "loss": 0.2828, + "step": 20063 + }, + { + "epoch": 1.625405055087492, + "grad_norm": 0.06091989949345589, + "learning_rate": 0.0001319231288536838, + "loss": 0.3423, + "step": 20064 + }, + { + "epoch": 1.6254860661049904, + "grad_norm": 0.05574041232466698, + "learning_rate": 0.00013191862820108915, + "loss": 0.2845, + "step": 20065 + }, + { + "epoch": 1.6255670771224886, + "grad_norm": 0.049806464463472366, + "learning_rate": 0.00013191412754849454, + "loss": 0.2564, + "step": 20066 + }, + { + "epoch": 1.625648088139987, + "grad_norm": 0.05343364179134369, + "learning_rate": 0.00013190962689589992, + "loss": 0.2985, + "step": 20067 + }, + { + "epoch": 1.6257290991574855, + "grad_norm": 0.04434232786297798, + "learning_rate": 0.00013190512624330528, + "loss": 0.2855, + "step": 20068 + }, + { + "epoch": 1.6258101101749838, + "grad_norm": 0.05006261169910431, + "learning_rate": 0.00013190062559071067, + "loss": 0.2845, + "step": 20069 + }, + { + "epoch": 1.625891121192482, + "grad_norm": 0.05992110073566437, + "learning_rate": 0.00013189612493811603, + "loss": 0.2911, + "step": 20070 + }, + { + "epoch": 1.6259721322099807, + "grad_norm": 0.05126938223838806, + "learning_rate": 0.0001318916242855214, + "loss": 0.3202, + "step": 20071 + }, + { + "epoch": 1.626053143227479, + "grad_norm": 0.05510419234633446, + "learning_rate": 0.00013188712363292678, + "loss": 0.3291, + "step": 20072 + }, + { + "epoch": 1.6261341542449772, + "grad_norm": 0.05981983244419098, + "learning_rate": 0.00013188262298033217, + "loss": 0.2688, + "step": 20073 + }, + { + "epoch": 1.6262151652624757, + "grad_norm": 0.045334115624427795, + "learning_rate": 0.00013187812232773753, + "loss": 0.257, + "step": 20074 + }, + { + "epoch": 1.6262961762799741, + "grad_norm": 0.060123879462480545, + "learning_rate": 0.0001318736216751429, + "loss": 0.308, + "step": 20075 + }, + { + "epoch": 1.6263771872974724, + "grad_norm": 0.055499814450740814, + "learning_rate": 0.00013186912102254827, + "loss": 0.2708, + "step": 20076 + }, + { + "epoch": 1.6264581983149708, + "grad_norm": 0.06412205100059509, + "learning_rate": 0.00013186462036995363, + "loss": 0.3135, + "step": 20077 + }, + { + "epoch": 1.6265392093324693, + "grad_norm": 0.05298837646842003, + "learning_rate": 0.00013186011971735902, + "loss": 0.2762, + "step": 20078 + }, + { + "epoch": 1.6266202203499676, + "grad_norm": 0.05011403560638428, 
+ "learning_rate": 0.0001318556190647644, + "loss": 0.2516, + "step": 20079 + }, + { + "epoch": 1.626701231367466, + "grad_norm": 0.060437921434640884, + "learning_rate": 0.00013185111841216977, + "loss": 0.3206, + "step": 20080 + }, + { + "epoch": 1.6267822423849645, + "grad_norm": 0.04542511701583862, + "learning_rate": 0.00013184661775957515, + "loss": 0.2584, + "step": 20081 + }, + { + "epoch": 1.6268632534024627, + "grad_norm": 0.059091124683618546, + "learning_rate": 0.00013184211710698051, + "loss": 0.2532, + "step": 20082 + }, + { + "epoch": 1.626944264419961, + "grad_norm": 0.04803014174103737, + "learning_rate": 0.00013183761645438587, + "loss": 0.2554, + "step": 20083 + }, + { + "epoch": 1.6270252754374595, + "grad_norm": 0.056689485907554626, + "learning_rate": 0.00013183311580179126, + "loss": 0.29, + "step": 20084 + }, + { + "epoch": 1.627106286454958, + "grad_norm": 0.05078176409006119, + "learning_rate": 0.00013182861514919665, + "loss": 0.229, + "step": 20085 + }, + { + "epoch": 1.6271872974724562, + "grad_norm": 0.0545271635055542, + "learning_rate": 0.000131824114496602, + "loss": 0.2475, + "step": 20086 + }, + { + "epoch": 1.6272683084899546, + "grad_norm": 0.052732549607753754, + "learning_rate": 0.0001318196138440074, + "loss": 0.307, + "step": 20087 + }, + { + "epoch": 1.627349319507453, + "grad_norm": 0.05406218767166138, + "learning_rate": 0.00013181511319141276, + "loss": 0.2839, + "step": 20088 + }, + { + "epoch": 1.6274303305249513, + "grad_norm": 0.05025879293680191, + "learning_rate": 0.00013181061253881814, + "loss": 0.2795, + "step": 20089 + }, + { + "epoch": 1.6275113415424498, + "grad_norm": 0.04761442542076111, + "learning_rate": 0.0001318061118862235, + "loss": 0.266, + "step": 20090 + }, + { + "epoch": 1.6275923525599483, + "grad_norm": 0.06434717029333115, + "learning_rate": 0.0001318016112336289, + "loss": 0.303, + "step": 20091 + }, + { + "epoch": 1.6276733635774465, + "grad_norm": 0.06810448318719864, + "learning_rate": 0.00013179711058103425, + "loss": 0.3327, + "step": 20092 + }, + { + "epoch": 1.6277543745949448, + "grad_norm": 0.04770027473568916, + "learning_rate": 0.00013179260992843964, + "loss": 0.2729, + "step": 20093 + }, + { + "epoch": 1.6278353856124435, + "grad_norm": 0.056478384882211685, + "learning_rate": 0.000131788109275845, + "loss": 0.2728, + "step": 20094 + }, + { + "epoch": 1.6279163966299417, + "grad_norm": 0.04458216577768326, + "learning_rate": 0.00013178360862325038, + "loss": 0.2847, + "step": 20095 + }, + { + "epoch": 1.62799740764744, + "grad_norm": 0.04367813467979431, + "learning_rate": 0.00013177910797065574, + "loss": 0.2731, + "step": 20096 + }, + { + "epoch": 1.6280784186649384, + "grad_norm": 0.05187131091952324, + "learning_rate": 0.00013177460731806113, + "loss": 0.2605, + "step": 20097 + }, + { + "epoch": 1.6281594296824369, + "grad_norm": 0.05495690554380417, + "learning_rate": 0.0001317701066654665, + "loss": 0.2637, + "step": 20098 + }, + { + "epoch": 1.6282404406999351, + "grad_norm": 0.055277954787015915, + "learning_rate": 0.00013176560601287188, + "loss": 0.2953, + "step": 20099 + }, + { + "epoch": 1.6283214517174336, + "grad_norm": 0.04511606693267822, + "learning_rate": 0.00013176110536027724, + "loss": 0.2593, + "step": 20100 + }, + { + "epoch": 1.628402462734932, + "grad_norm": 0.05307050421833992, + "learning_rate": 0.00013175660470768263, + "loss": 0.2926, + "step": 20101 + }, + { + "epoch": 1.6284834737524303, + "grad_norm": 0.057604528963565826, + "learning_rate": 0.000131752104055088, + "loss": 
0.2901, + "step": 20102 + }, + { + "epoch": 1.6285644847699285, + "grad_norm": 0.06348736584186554, + "learning_rate": 0.00013174760340249337, + "loss": 0.2771, + "step": 20103 + }, + { + "epoch": 1.6286454957874272, + "grad_norm": 0.04991592839360237, + "learning_rate": 0.00013174310274989873, + "loss": 0.2991, + "step": 20104 + }, + { + "epoch": 1.6287265068049255, + "grad_norm": 0.058820828795433044, + "learning_rate": 0.00013173860209730412, + "loss": 0.2914, + "step": 20105 + }, + { + "epoch": 1.6288075178224237, + "grad_norm": 0.05092411860823631, + "learning_rate": 0.00013173410144470948, + "loss": 0.2716, + "step": 20106 + }, + { + "epoch": 1.6288885288399222, + "grad_norm": 0.05166938900947571, + "learning_rate": 0.00013172960079211487, + "loss": 0.2684, + "step": 20107 + }, + { + "epoch": 1.6289695398574207, + "grad_norm": 0.0568322129547596, + "learning_rate": 0.00013172510013952023, + "loss": 0.2474, + "step": 20108 + }, + { + "epoch": 1.629050550874919, + "grad_norm": 0.05410854145884514, + "learning_rate": 0.00013172059948692562, + "loss": 0.3125, + "step": 20109 + }, + { + "epoch": 1.6291315618924174, + "grad_norm": 0.05433070659637451, + "learning_rate": 0.00013171609883433098, + "loss": 0.2614, + "step": 20110 + }, + { + "epoch": 1.6292125729099158, + "grad_norm": 0.05375772714614868, + "learning_rate": 0.00013171159818173636, + "loss": 0.2543, + "step": 20111 + }, + { + "epoch": 1.629293583927414, + "grad_norm": 0.05799659341573715, + "learning_rate": 0.00013170709752914172, + "loss": 0.3079, + "step": 20112 + }, + { + "epoch": 1.6293745949449125, + "grad_norm": 0.0654786229133606, + "learning_rate": 0.0001317025968765471, + "loss": 0.3204, + "step": 20113 + }, + { + "epoch": 1.629455605962411, + "grad_norm": 0.059415362775325775, + "learning_rate": 0.00013169809622395247, + "loss": 0.2798, + "step": 20114 + }, + { + "epoch": 1.6295366169799093, + "grad_norm": 0.0585675872862339, + "learning_rate": 0.00013169359557135786, + "loss": 0.2913, + "step": 20115 + }, + { + "epoch": 1.6296176279974075, + "grad_norm": 0.05293847620487213, + "learning_rate": 0.00013168909491876322, + "loss": 0.2866, + "step": 20116 + }, + { + "epoch": 1.6296986390149062, + "grad_norm": 0.05343810096383095, + "learning_rate": 0.0001316845942661686, + "loss": 0.2786, + "step": 20117 + }, + { + "epoch": 1.6297796500324044, + "grad_norm": 0.05106763541698456, + "learning_rate": 0.000131680093613574, + "loss": 0.2988, + "step": 20118 + }, + { + "epoch": 1.6298606610499027, + "grad_norm": 0.054290104657411575, + "learning_rate": 0.00013167559296097935, + "loss": 0.316, + "step": 20119 + }, + { + "epoch": 1.6299416720674011, + "grad_norm": 0.05953530967235565, + "learning_rate": 0.0001316710923083847, + "loss": 0.2736, + "step": 20120 + }, + { + "epoch": 1.6300226830848996, + "grad_norm": 0.05168008804321289, + "learning_rate": 0.0001316665916557901, + "loss": 0.3099, + "step": 20121 + }, + { + "epoch": 1.6301036941023979, + "grad_norm": 0.04760206490755081, + "learning_rate": 0.00013166209100319546, + "loss": 0.2531, + "step": 20122 + }, + { + "epoch": 1.6301847051198963, + "grad_norm": 0.05584317818284035, + "learning_rate": 0.00013165759035060085, + "loss": 0.2728, + "step": 20123 + }, + { + "epoch": 1.6302657161373948, + "grad_norm": 0.05340051278471947, + "learning_rate": 0.00013165308969800623, + "loss": 0.2668, + "step": 20124 + }, + { + "epoch": 1.630346727154893, + "grad_norm": 0.04660770669579506, + "learning_rate": 0.0001316485890454116, + "loss": 0.224, + "step": 20125 + }, + { + "epoch": 
1.6304277381723913, + "grad_norm": 0.05097239837050438, + "learning_rate": 0.00013164408839281695, + "loss": 0.2676, + "step": 20126 + }, + { + "epoch": 1.63050874918989, + "grad_norm": 0.04739508777856827, + "learning_rate": 0.00013163958774022234, + "loss": 0.2661, + "step": 20127 + }, + { + "epoch": 1.6305897602073882, + "grad_norm": 0.0610460564494133, + "learning_rate": 0.0001316350870876277, + "loss": 0.266, + "step": 20128 + }, + { + "epoch": 1.6306707712248865, + "grad_norm": 0.05964686721563339, + "learning_rate": 0.0001316305864350331, + "loss": 0.2859, + "step": 20129 + }, + { + "epoch": 1.630751782242385, + "grad_norm": 0.04790445789694786, + "learning_rate": 0.00013162608578243847, + "loss": 0.2922, + "step": 20130 + }, + { + "epoch": 1.6308327932598834, + "grad_norm": 0.06071170046925545, + "learning_rate": 0.00013162158512984383, + "loss": 0.2737, + "step": 20131 + }, + { + "epoch": 1.6309138042773816, + "grad_norm": 0.05310392007231712, + "learning_rate": 0.00013161708447724922, + "loss": 0.2589, + "step": 20132 + }, + { + "epoch": 1.63099481529488, + "grad_norm": 0.05222730338573456, + "learning_rate": 0.00013161258382465458, + "loss": 0.2686, + "step": 20133 + }, + { + "epoch": 1.6310758263123786, + "grad_norm": 0.04885387793183327, + "learning_rate": 0.00013160808317205994, + "loss": 0.2792, + "step": 20134 + }, + { + "epoch": 1.6311568373298768, + "grad_norm": 0.042033977806568146, + "learning_rate": 0.00013160358251946533, + "loss": 0.2782, + "step": 20135 + }, + { + "epoch": 1.6312378483473753, + "grad_norm": 0.0582573227584362, + "learning_rate": 0.00013159908186687072, + "loss": 0.3166, + "step": 20136 + }, + { + "epoch": 1.6313188593648738, + "grad_norm": 0.056182943284511566, + "learning_rate": 0.00013159458121427608, + "loss": 0.296, + "step": 20137 + }, + { + "epoch": 1.631399870382372, + "grad_norm": 0.06435632705688477, + "learning_rate": 0.00013159008056168146, + "loss": 0.2847, + "step": 20138 + }, + { + "epoch": 1.6314808813998702, + "grad_norm": 0.055193573236465454, + "learning_rate": 0.00013158557990908682, + "loss": 0.3055, + "step": 20139 + }, + { + "epoch": 1.6315618924173687, + "grad_norm": 0.05577018857002258, + "learning_rate": 0.00013158107925649218, + "loss": 0.2911, + "step": 20140 + }, + { + "epoch": 1.6316429034348672, + "grad_norm": 0.04244306683540344, + "learning_rate": 0.00013157657860389757, + "loss": 0.253, + "step": 20141 + }, + { + "epoch": 1.6317239144523654, + "grad_norm": 0.04200892895460129, + "learning_rate": 0.00013157207795130296, + "loss": 0.251, + "step": 20142 + }, + { + "epoch": 1.6318049254698639, + "grad_norm": 0.05072421580553055, + "learning_rate": 0.00013156757729870832, + "loss": 0.2786, + "step": 20143 + }, + { + "epoch": 1.6318859364873624, + "grad_norm": 0.044243331998586655, + "learning_rate": 0.0001315630766461137, + "loss": 0.2764, + "step": 20144 + }, + { + "epoch": 1.6319669475048606, + "grad_norm": 0.04605749621987343, + "learning_rate": 0.00013155857599351907, + "loss": 0.255, + "step": 20145 + }, + { + "epoch": 1.632047958522359, + "grad_norm": 0.052316538989543915, + "learning_rate": 0.00013155407534092443, + "loss": 0.2871, + "step": 20146 + }, + { + "epoch": 1.6321289695398575, + "grad_norm": 0.0470651313662529, + "learning_rate": 0.0001315495746883298, + "loss": 0.2831, + "step": 20147 + }, + { + "epoch": 1.6322099805573558, + "grad_norm": 0.053917501121759415, + "learning_rate": 0.0001315450740357352, + "loss": 0.3259, + "step": 20148 + }, + { + "epoch": 1.632290991574854, + "grad_norm": 
0.06069952994585037, + "learning_rate": 0.00013154057338314056, + "loss": 0.2908, + "step": 20149 + }, + { + "epoch": 1.6323720025923527, + "grad_norm": 0.04950543865561485, + "learning_rate": 0.00013153607273054595, + "loss": 0.3107, + "step": 20150 + }, + { + "epoch": 1.632453013609851, + "grad_norm": 0.05352965369820595, + "learning_rate": 0.0001315315720779513, + "loss": 0.2724, + "step": 20151 + }, + { + "epoch": 1.6325340246273492, + "grad_norm": 0.0512455552816391, + "learning_rate": 0.00013152707142535667, + "loss": 0.2733, + "step": 20152 + }, + { + "epoch": 1.6326150356448477, + "grad_norm": 0.046733301132917404, + "learning_rate": 0.00013152257077276205, + "loss": 0.2442, + "step": 20153 + }, + { + "epoch": 1.6326960466623461, + "grad_norm": 0.0573970265686512, + "learning_rate": 0.00013151807012016744, + "loss": 0.2914, + "step": 20154 + }, + { + "epoch": 1.6327770576798444, + "grad_norm": 0.049615781754255295, + "learning_rate": 0.0001315135694675728, + "loss": 0.2836, + "step": 20155 + }, + { + "epoch": 1.6328580686973428, + "grad_norm": 0.05010844022035599, + "learning_rate": 0.0001315090688149782, + "loss": 0.2526, + "step": 20156 + }, + { + "epoch": 1.6329390797148413, + "grad_norm": 0.05412207543849945, + "learning_rate": 0.00013150456816238355, + "loss": 0.2725, + "step": 20157 + }, + { + "epoch": 1.6330200907323396, + "grad_norm": 0.055283691734075546, + "learning_rate": 0.0001315000675097889, + "loss": 0.3277, + "step": 20158 + }, + { + "epoch": 1.633101101749838, + "grad_norm": 0.058319512754678726, + "learning_rate": 0.0001314955668571943, + "loss": 0.2579, + "step": 20159 + }, + { + "epoch": 1.6331821127673365, + "grad_norm": 0.05270714685320854, + "learning_rate": 0.00013149106620459968, + "loss": 0.294, + "step": 20160 + }, + { + "epoch": 1.6332631237848347, + "grad_norm": 0.05729326605796814, + "learning_rate": 0.00013148656555200504, + "loss": 0.2738, + "step": 20161 + }, + { + "epoch": 1.633344134802333, + "grad_norm": 0.04980797320604324, + "learning_rate": 0.00013148206489941043, + "loss": 0.2803, + "step": 20162 + }, + { + "epoch": 1.6334251458198314, + "grad_norm": 0.059038545936346054, + "learning_rate": 0.0001314775642468158, + "loss": 0.2587, + "step": 20163 + }, + { + "epoch": 1.63350615683733, + "grad_norm": 0.04804708808660507, + "learning_rate": 0.00013147306359422115, + "loss": 0.2635, + "step": 20164 + }, + { + "epoch": 1.6335871678548282, + "grad_norm": 0.05786438658833504, + "learning_rate": 0.00013146856294162654, + "loss": 0.3201, + "step": 20165 + }, + { + "epoch": 1.6336681788723266, + "grad_norm": 0.05506935343146324, + "learning_rate": 0.00013146406228903192, + "loss": 0.2825, + "step": 20166 + }, + { + "epoch": 1.633749189889825, + "grad_norm": 0.04705440625548363, + "learning_rate": 0.00013145956163643728, + "loss": 0.2446, + "step": 20167 + }, + { + "epoch": 1.6338302009073233, + "grad_norm": 0.05770174041390419, + "learning_rate": 0.00013145506098384267, + "loss": 0.2974, + "step": 20168 + }, + { + "epoch": 1.6339112119248218, + "grad_norm": 0.049085672944784164, + "learning_rate": 0.00013145056033124803, + "loss": 0.2468, + "step": 20169 + }, + { + "epoch": 1.6339922229423203, + "grad_norm": 0.053372543305158615, + "learning_rate": 0.00013144605967865342, + "loss": 0.303, + "step": 20170 + }, + { + "epoch": 1.6340732339598185, + "grad_norm": 0.06096953526139259, + "learning_rate": 0.00013144155902605878, + "loss": 0.2802, + "step": 20171 + }, + { + "epoch": 1.6341542449773168, + "grad_norm": 0.047225162386894226, + "learning_rate": 
0.00013143705837346417, + "loss": 0.2732, + "step": 20172 + }, + { + "epoch": 1.6342352559948155, + "grad_norm": 0.06632104516029358, + "learning_rate": 0.00013143255772086953, + "loss": 0.3016, + "step": 20173 + }, + { + "epoch": 1.6343162670123137, + "grad_norm": 0.05281345173716545, + "learning_rate": 0.0001314280570682749, + "loss": 0.2807, + "step": 20174 + }, + { + "epoch": 1.634397278029812, + "grad_norm": 0.048660047352313995, + "learning_rate": 0.00013142355641568027, + "loss": 0.2648, + "step": 20175 + }, + { + "epoch": 1.6344782890473104, + "grad_norm": 0.050357669591903687, + "learning_rate": 0.00013141905576308566, + "loss": 0.2743, + "step": 20176 + }, + { + "epoch": 1.6345593000648089, + "grad_norm": 0.059288132935762405, + "learning_rate": 0.00013141455511049102, + "loss": 0.3192, + "step": 20177 + }, + { + "epoch": 1.6346403110823071, + "grad_norm": 0.04744809865951538, + "learning_rate": 0.0001314100544578964, + "loss": 0.246, + "step": 20178 + }, + { + "epoch": 1.6347213220998056, + "grad_norm": 0.0480501614511013, + "learning_rate": 0.00013140555380530177, + "loss": 0.2612, + "step": 20179 + }, + { + "epoch": 1.634802333117304, + "grad_norm": 0.04682370275259018, + "learning_rate": 0.00013140105315270715, + "loss": 0.2773, + "step": 20180 + }, + { + "epoch": 1.6348833441348023, + "grad_norm": 0.04360784590244293, + "learning_rate": 0.00013139655250011251, + "loss": 0.2636, + "step": 20181 + }, + { + "epoch": 1.6349643551523008, + "grad_norm": 0.051908232271671295, + "learning_rate": 0.0001313920518475179, + "loss": 0.3092, + "step": 20182 + }, + { + "epoch": 1.6350453661697992, + "grad_norm": 0.051509421318769455, + "learning_rate": 0.00013138755119492326, + "loss": 0.2803, + "step": 20183 + }, + { + "epoch": 1.6351263771872975, + "grad_norm": 0.05230452120304108, + "learning_rate": 0.00013138305054232865, + "loss": 0.3067, + "step": 20184 + }, + { + "epoch": 1.6352073882047957, + "grad_norm": 0.043822553008794785, + "learning_rate": 0.000131378549889734, + "loss": 0.2649, + "step": 20185 + }, + { + "epoch": 1.6352883992222942, + "grad_norm": 0.058286089450120926, + "learning_rate": 0.0001313740492371394, + "loss": 0.2641, + "step": 20186 + }, + { + "epoch": 1.6353694102397927, + "grad_norm": 0.051882125437259674, + "learning_rate": 0.00013136954858454476, + "loss": 0.2829, + "step": 20187 + }, + { + "epoch": 1.635450421257291, + "grad_norm": 0.055661335587501526, + "learning_rate": 0.00013136504793195014, + "loss": 0.3405, + "step": 20188 + }, + { + "epoch": 1.6355314322747894, + "grad_norm": 0.05822267755866051, + "learning_rate": 0.0001313605472793555, + "loss": 0.2748, + "step": 20189 + }, + { + "epoch": 1.6356124432922878, + "grad_norm": 0.04890811815857887, + "learning_rate": 0.0001313560466267609, + "loss": 0.2865, + "step": 20190 + }, + { + "epoch": 1.635693454309786, + "grad_norm": 0.04698796942830086, + "learning_rate": 0.00013135154597416625, + "loss": 0.2644, + "step": 20191 + }, + { + "epoch": 1.6357744653272845, + "grad_norm": 0.04925640672445297, + "learning_rate": 0.00013134704532157164, + "loss": 0.3161, + "step": 20192 + }, + { + "epoch": 1.635855476344783, + "grad_norm": 0.0510898195207119, + "learning_rate": 0.00013134254466897703, + "loss": 0.2785, + "step": 20193 + }, + { + "epoch": 1.6359364873622813, + "grad_norm": 0.050242744386196136, + "learning_rate": 0.00013133804401638239, + "loss": 0.2656, + "step": 20194 + }, + { + "epoch": 1.6360174983797795, + "grad_norm": 0.04777007922530174, + "learning_rate": 0.00013133354336378777, + "loss": 0.3204, 
+ "step": 20195 + }, + { + "epoch": 1.6360985093972782, + "grad_norm": 0.05214406177401543, + "learning_rate": 0.00013132904271119313, + "loss": 0.2745, + "step": 20196 + }, + { + "epoch": 1.6361795204147764, + "grad_norm": 0.04864123463630676, + "learning_rate": 0.0001313245420585985, + "loss": 0.2579, + "step": 20197 + }, + { + "epoch": 1.6362605314322747, + "grad_norm": 0.05020648613572121, + "learning_rate": 0.00013132004140600388, + "loss": 0.2676, + "step": 20198 + }, + { + "epoch": 1.6363415424497731, + "grad_norm": 0.056893881410360336, + "learning_rate": 0.00013131554075340927, + "loss": 0.2787, + "step": 20199 + }, + { + "epoch": 1.6364225534672716, + "grad_norm": 0.05220147222280502, + "learning_rate": 0.00013131104010081463, + "loss": 0.2912, + "step": 20200 + }, + { + "epoch": 1.6365035644847699, + "grad_norm": 0.054814036935567856, + "learning_rate": 0.00013130653944822001, + "loss": 0.279, + "step": 20201 + }, + { + "epoch": 1.6365845755022683, + "grad_norm": 0.04547103866934776, + "learning_rate": 0.00013130203879562537, + "loss": 0.2545, + "step": 20202 + }, + { + "epoch": 1.6366655865197668, + "grad_norm": 0.05352579429745674, + "learning_rate": 0.00013129753814303073, + "loss": 0.3117, + "step": 20203 + }, + { + "epoch": 1.636746597537265, + "grad_norm": 0.058115895837545395, + "learning_rate": 0.00013129303749043612, + "loss": 0.2895, + "step": 20204 + }, + { + "epoch": 1.6368276085547635, + "grad_norm": 0.04940159246325493, + "learning_rate": 0.0001312885368378415, + "loss": 0.2506, + "step": 20205 + }, + { + "epoch": 1.636908619572262, + "grad_norm": 0.04255246743559837, + "learning_rate": 0.00013128403618524687, + "loss": 0.2632, + "step": 20206 + }, + { + "epoch": 1.6369896305897602, + "grad_norm": 0.053756408393383026, + "learning_rate": 0.00013127953553265226, + "loss": 0.2918, + "step": 20207 + }, + { + "epoch": 1.6370706416072585, + "grad_norm": 0.05477878823876381, + "learning_rate": 0.00013127503488005762, + "loss": 0.3305, + "step": 20208 + }, + { + "epoch": 1.637151652624757, + "grad_norm": 0.04882469028234482, + "learning_rate": 0.00013127053422746298, + "loss": 0.229, + "step": 20209 + }, + { + "epoch": 1.6372326636422554, + "grad_norm": 0.049400970339775085, + "learning_rate": 0.00013126603357486836, + "loss": 0.2977, + "step": 20210 + }, + { + "epoch": 1.6373136746597536, + "grad_norm": 0.05772639438509941, + "learning_rate": 0.00013126153292227375, + "loss": 0.3003, + "step": 20211 + }, + { + "epoch": 1.637394685677252, + "grad_norm": 0.04149458929896355, + "learning_rate": 0.0001312570322696791, + "loss": 0.223, + "step": 20212 + }, + { + "epoch": 1.6374756966947506, + "grad_norm": 0.047275107353925705, + "learning_rate": 0.0001312525316170845, + "loss": 0.2717, + "step": 20213 + }, + { + "epoch": 1.6375567077122488, + "grad_norm": 0.04378741979598999, + "learning_rate": 0.00013124803096448986, + "loss": 0.2655, + "step": 20214 + }, + { + "epoch": 1.6376377187297473, + "grad_norm": 0.04953139275312424, + "learning_rate": 0.00013124353031189522, + "loss": 0.267, + "step": 20215 + }, + { + "epoch": 1.6377187297472457, + "grad_norm": 0.04930621385574341, + "learning_rate": 0.0001312390296593006, + "loss": 0.2901, + "step": 20216 + }, + { + "epoch": 1.637799740764744, + "grad_norm": 0.056873247027397156, + "learning_rate": 0.000131234529006706, + "loss": 0.287, + "step": 20217 + }, + { + "epoch": 1.6378807517822422, + "grad_norm": 0.06545663625001907, + "learning_rate": 0.00013123002835411135, + "loss": 0.2924, + "step": 20218 + }, + { + "epoch": 
1.637961762799741, + "grad_norm": 0.050055429339408875, + "learning_rate": 0.00013122552770151674, + "loss": 0.2541, + "step": 20219 + }, + { + "epoch": 1.6380427738172392, + "grad_norm": 0.05391817167401314, + "learning_rate": 0.0001312210270489221, + "loss": 0.2993, + "step": 20220 + }, + { + "epoch": 1.6381237848347374, + "grad_norm": 0.060411565005779266, + "learning_rate": 0.00013121652639632746, + "loss": 0.2711, + "step": 20221 + }, + { + "epoch": 1.6382047958522359, + "grad_norm": 0.05216376855969429, + "learning_rate": 0.00013121202574373285, + "loss": 0.2893, + "step": 20222 + }, + { + "epoch": 1.6382858068697344, + "grad_norm": 0.058760929852724075, + "learning_rate": 0.00013120752509113823, + "loss": 0.3197, + "step": 20223 + }, + { + "epoch": 1.6383668178872326, + "grad_norm": 0.05227070674300194, + "learning_rate": 0.0001312030244385436, + "loss": 0.2757, + "step": 20224 + }, + { + "epoch": 1.638447828904731, + "grad_norm": 0.059265658259391785, + "learning_rate": 0.00013119852378594898, + "loss": 0.3337, + "step": 20225 + }, + { + "epoch": 1.6385288399222295, + "grad_norm": 0.0466594323515892, + "learning_rate": 0.00013119402313335434, + "loss": 0.2669, + "step": 20226 + }, + { + "epoch": 1.6386098509397278, + "grad_norm": 0.05499688908457756, + "learning_rate": 0.0001311895224807597, + "loss": 0.2749, + "step": 20227 + }, + { + "epoch": 1.638690861957226, + "grad_norm": 0.05702508985996246, + "learning_rate": 0.0001311850218281651, + "loss": 0.3247, + "step": 20228 + }, + { + "epoch": 1.6387718729747247, + "grad_norm": 0.06296319514513016, + "learning_rate": 0.00013118052117557047, + "loss": 0.3166, + "step": 20229 + }, + { + "epoch": 1.638852883992223, + "grad_norm": 0.04582956060767174, + "learning_rate": 0.00013117602052297583, + "loss": 0.2367, + "step": 20230 + }, + { + "epoch": 1.6389338950097212, + "grad_norm": 0.0633278340101242, + "learning_rate": 0.00013117151987038122, + "loss": 0.2992, + "step": 20231 + }, + { + "epoch": 1.6390149060272197, + "grad_norm": 0.05358787998557091, + "learning_rate": 0.00013116701921778658, + "loss": 0.3292, + "step": 20232 + }, + { + "epoch": 1.6390959170447181, + "grad_norm": 0.040785521268844604, + "learning_rate": 0.00013116251856519194, + "loss": 0.2723, + "step": 20233 + }, + { + "epoch": 1.6391769280622164, + "grad_norm": 0.049127183854579926, + "learning_rate": 0.00013115801791259733, + "loss": 0.2379, + "step": 20234 + }, + { + "epoch": 1.6392579390797148, + "grad_norm": 0.05054868385195732, + "learning_rate": 0.00013115351726000272, + "loss": 0.2747, + "step": 20235 + }, + { + "epoch": 1.6393389500972133, + "grad_norm": 0.05272538959980011, + "learning_rate": 0.00013114901660740808, + "loss": 0.2752, + "step": 20236 + }, + { + "epoch": 1.6394199611147116, + "grad_norm": 0.04339519515633583, + "learning_rate": 0.00013114451595481346, + "loss": 0.2689, + "step": 20237 + }, + { + "epoch": 1.63950097213221, + "grad_norm": 0.04754365608096123, + "learning_rate": 0.00013114001530221882, + "loss": 0.2559, + "step": 20238 + }, + { + "epoch": 1.6395819831497085, + "grad_norm": 0.058999720960855484, + "learning_rate": 0.00013113551464962418, + "loss": 0.2714, + "step": 20239 + }, + { + "epoch": 1.6396629941672067, + "grad_norm": 0.05868087708950043, + "learning_rate": 0.00013113101399702957, + "loss": 0.2717, + "step": 20240 + }, + { + "epoch": 1.639744005184705, + "grad_norm": 0.049035605043172836, + "learning_rate": 0.00013112651334443496, + "loss": 0.2686, + "step": 20241 + }, + { + "epoch": 1.6398250162022034, + "grad_norm": 
0.051105234771966934, + "learning_rate": 0.00013112201269184032, + "loss": 0.2869, + "step": 20242 + }, + { + "epoch": 1.639906027219702, + "grad_norm": 0.052198246121406555, + "learning_rate": 0.0001311175120392457, + "loss": 0.2844, + "step": 20243 + }, + { + "epoch": 1.6399870382372002, + "grad_norm": 0.05529845878481865, + "learning_rate": 0.00013111301138665107, + "loss": 0.2922, + "step": 20244 + }, + { + "epoch": 1.6400680492546986, + "grad_norm": 0.057615384459495544, + "learning_rate": 0.00013110851073405643, + "loss": 0.281, + "step": 20245 + }, + { + "epoch": 1.640149060272197, + "grad_norm": 0.05965334177017212, + "learning_rate": 0.0001311040100814618, + "loss": 0.3059, + "step": 20246 + }, + { + "epoch": 1.6402300712896953, + "grad_norm": 0.05527438223361969, + "learning_rate": 0.0001310995094288672, + "loss": 0.2756, + "step": 20247 + }, + { + "epoch": 1.6403110823071938, + "grad_norm": 0.0498809777200222, + "learning_rate": 0.00013109500877627256, + "loss": 0.2556, + "step": 20248 + }, + { + "epoch": 1.6403920933246923, + "grad_norm": 0.0581757053732872, + "learning_rate": 0.00013109050812367795, + "loss": 0.3066, + "step": 20249 + }, + { + "epoch": 1.6404731043421905, + "grad_norm": 0.05269348621368408, + "learning_rate": 0.0001310860074710833, + "loss": 0.2849, + "step": 20250 + }, + { + "epoch": 1.6405541153596888, + "grad_norm": 0.05979086831212044, + "learning_rate": 0.0001310815068184887, + "loss": 0.2938, + "step": 20251 + }, + { + "epoch": 1.6406351263771874, + "grad_norm": 0.04561556130647659, + "learning_rate": 0.00013107700616589405, + "loss": 0.2656, + "step": 20252 + }, + { + "epoch": 1.6407161373946857, + "grad_norm": 0.06042017787694931, + "learning_rate": 0.00013107250551329944, + "loss": 0.3043, + "step": 20253 + }, + { + "epoch": 1.640797148412184, + "grad_norm": 0.047696713358163834, + "learning_rate": 0.0001310680048607048, + "loss": 0.272, + "step": 20254 + }, + { + "epoch": 1.6408781594296824, + "grad_norm": 0.052089713513851166, + "learning_rate": 0.0001310635042081102, + "loss": 0.3056, + "step": 20255 + }, + { + "epoch": 1.6409591704471809, + "grad_norm": 0.05706511810421944, + "learning_rate": 0.00013105900355551555, + "loss": 0.2903, + "step": 20256 + }, + { + "epoch": 1.6410401814646791, + "grad_norm": 0.0495259165763855, + "learning_rate": 0.00013105450290292094, + "loss": 0.2448, + "step": 20257 + }, + { + "epoch": 1.6411211924821776, + "grad_norm": 0.04729805141687393, + "learning_rate": 0.0001310500022503263, + "loss": 0.2734, + "step": 20258 + }, + { + "epoch": 1.641202203499676, + "grad_norm": 0.057043250650167465, + "learning_rate": 0.00013104550159773168, + "loss": 0.2919, + "step": 20259 + }, + { + "epoch": 1.6412832145171743, + "grad_norm": 0.05842726677656174, + "learning_rate": 0.00013104100094513704, + "loss": 0.3415, + "step": 20260 + }, + { + "epoch": 1.6413642255346728, + "grad_norm": 0.048161424696445465, + "learning_rate": 0.00013103650029254243, + "loss": 0.2791, + "step": 20261 + }, + { + "epoch": 1.6414452365521712, + "grad_norm": 0.045487869530916214, + "learning_rate": 0.0001310319996399478, + "loss": 0.2468, + "step": 20262 + }, + { + "epoch": 1.6415262475696695, + "grad_norm": 0.05410471558570862, + "learning_rate": 0.00013102749898735318, + "loss": 0.2646, + "step": 20263 + }, + { + "epoch": 1.6416072585871677, + "grad_norm": 0.04716672748327255, + "learning_rate": 0.00013102299833475856, + "loss": 0.2815, + "step": 20264 + }, + { + "epoch": 1.6416882696046662, + "grad_norm": 0.0537717230618, + "learning_rate": 
0.00013101849768216392, + "loss": 0.3024, + "step": 20265 + }, + { + "epoch": 1.6417692806221647, + "grad_norm": 0.04909708723425865, + "learning_rate": 0.00013101399702956928, + "loss": 0.2653, + "step": 20266 + }, + { + "epoch": 1.641850291639663, + "grad_norm": 0.05597676336765289, + "learning_rate": 0.00013100949637697467, + "loss": 0.2867, + "step": 20267 + }, + { + "epoch": 1.6419313026571614, + "grad_norm": 0.05452214181423187, + "learning_rate": 0.00013100499572438003, + "loss": 0.2578, + "step": 20268 + }, + { + "epoch": 1.6420123136746598, + "grad_norm": 0.05111711472272873, + "learning_rate": 0.00013100049507178542, + "loss": 0.2473, + "step": 20269 + }, + { + "epoch": 1.642093324692158, + "grad_norm": 0.059686530381441116, + "learning_rate": 0.0001309959944191908, + "loss": 0.2795, + "step": 20270 + }, + { + "epoch": 1.6421743357096565, + "grad_norm": 0.047529637813568115, + "learning_rate": 0.00013099149376659617, + "loss": 0.2571, + "step": 20271 + }, + { + "epoch": 1.642255346727155, + "grad_norm": 0.05156063660979271, + "learning_rate": 0.00013098699311400153, + "loss": 0.287, + "step": 20272 + }, + { + "epoch": 1.6423363577446533, + "grad_norm": 0.05362094193696976, + "learning_rate": 0.0001309824924614069, + "loss": 0.2528, + "step": 20273 + }, + { + "epoch": 1.6424173687621515, + "grad_norm": 0.05864579230546951, + "learning_rate": 0.0001309779918088123, + "loss": 0.2787, + "step": 20274 + }, + { + "epoch": 1.6424983797796502, + "grad_norm": 0.04949674382805824, + "learning_rate": 0.00013097349115621766, + "loss": 0.228, + "step": 20275 + }, + { + "epoch": 1.6425793907971484, + "grad_norm": 0.0509091354906559, + "learning_rate": 0.00013096899050362305, + "loss": 0.2809, + "step": 20276 + }, + { + "epoch": 1.6426604018146467, + "grad_norm": 0.058638475835323334, + "learning_rate": 0.0001309644898510284, + "loss": 0.332, + "step": 20277 + }, + { + "epoch": 1.6427414128321451, + "grad_norm": 0.049978744238615036, + "learning_rate": 0.00013095998919843377, + "loss": 0.2658, + "step": 20278 + }, + { + "epoch": 1.6428224238496436, + "grad_norm": 0.07554782927036285, + "learning_rate": 0.00013095548854583916, + "loss": 0.3293, + "step": 20279 + }, + { + "epoch": 1.6429034348671419, + "grad_norm": 0.06077819690108299, + "learning_rate": 0.00013095098789324454, + "loss": 0.2941, + "step": 20280 + }, + { + "epoch": 1.6429844458846403, + "grad_norm": 0.060247037559747696, + "learning_rate": 0.0001309464872406499, + "loss": 0.314, + "step": 20281 + }, + { + "epoch": 1.6430654569021388, + "grad_norm": 0.055557847023010254, + "learning_rate": 0.0001309419865880553, + "loss": 0.3071, + "step": 20282 + }, + { + "epoch": 1.643146467919637, + "grad_norm": 0.05224480479955673, + "learning_rate": 0.00013093748593546065, + "loss": 0.2662, + "step": 20283 + }, + { + "epoch": 1.6432274789371355, + "grad_norm": 0.0507444366812706, + "learning_rate": 0.000130932985282866, + "loss": 0.2676, + "step": 20284 + }, + { + "epoch": 1.643308489954634, + "grad_norm": 0.052977025508880615, + "learning_rate": 0.0001309284846302714, + "loss": 0.2647, + "step": 20285 + }, + { + "epoch": 1.6433895009721322, + "grad_norm": 0.05128326639533043, + "learning_rate": 0.00013092398397767678, + "loss": 0.2753, + "step": 20286 + }, + { + "epoch": 1.6434705119896305, + "grad_norm": 0.048690736293792725, + "learning_rate": 0.00013091948332508214, + "loss": 0.2403, + "step": 20287 + }, + { + "epoch": 1.643551523007129, + "grad_norm": 0.05496395751833916, + "learning_rate": 0.00013091498267248753, + "loss": 0.2716, + 
"step": 20288 + }, + { + "epoch": 1.6436325340246274, + "grad_norm": 0.056154485791921616, + "learning_rate": 0.0001309104820198929, + "loss": 0.309, + "step": 20289 + }, + { + "epoch": 1.6437135450421256, + "grad_norm": 0.04780879244208336, + "learning_rate": 0.00013090598136729825, + "loss": 0.2867, + "step": 20290 + }, + { + "epoch": 1.643794556059624, + "grad_norm": 0.05079265311360359, + "learning_rate": 0.00013090148071470364, + "loss": 0.2678, + "step": 20291 + }, + { + "epoch": 1.6438755670771226, + "grad_norm": 0.05555180087685585, + "learning_rate": 0.00013089698006210903, + "loss": 0.2979, + "step": 20292 + }, + { + "epoch": 1.6439565780946208, + "grad_norm": 0.05565644055604935, + "learning_rate": 0.00013089247940951439, + "loss": 0.2806, + "step": 20293 + }, + { + "epoch": 1.6440375891121193, + "grad_norm": 0.06228690221905708, + "learning_rate": 0.00013088797875691977, + "loss": 0.3504, + "step": 20294 + }, + { + "epoch": 1.6441186001296177, + "grad_norm": 0.06760898977518082, + "learning_rate": 0.00013088347810432513, + "loss": 0.377, + "step": 20295 + }, + { + "epoch": 1.644199611147116, + "grad_norm": 0.05215907469391823, + "learning_rate": 0.0001308789774517305, + "loss": 0.3102, + "step": 20296 + }, + { + "epoch": 1.6442806221646142, + "grad_norm": 0.059776682406663895, + "learning_rate": 0.00013087447679913588, + "loss": 0.324, + "step": 20297 + }, + { + "epoch": 1.644361633182113, + "grad_norm": 0.04926168546080589, + "learning_rate": 0.00013086997614654127, + "loss": 0.2834, + "step": 20298 + }, + { + "epoch": 1.6444426441996112, + "grad_norm": 0.05643089860677719, + "learning_rate": 0.00013086547549394663, + "loss": 0.2819, + "step": 20299 + }, + { + "epoch": 1.6445236552171094, + "grad_norm": 0.0547204464673996, + "learning_rate": 0.00013086097484135201, + "loss": 0.3061, + "step": 20300 + }, + { + "epoch": 1.6446046662346079, + "grad_norm": 0.04373621568083763, + "learning_rate": 0.00013085647418875737, + "loss": 0.2539, + "step": 20301 + }, + { + "epoch": 1.6446856772521063, + "grad_norm": 0.06481010466814041, + "learning_rate": 0.00013085197353616273, + "loss": 0.2944, + "step": 20302 + }, + { + "epoch": 1.6447666882696046, + "grad_norm": 0.047319646924734116, + "learning_rate": 0.00013084747288356812, + "loss": 0.2712, + "step": 20303 + }, + { + "epoch": 1.644847699287103, + "grad_norm": 0.04775365814566612, + "learning_rate": 0.0001308429722309735, + "loss": 0.2608, + "step": 20304 + }, + { + "epoch": 1.6449287103046015, + "grad_norm": 0.08077125251293182, + "learning_rate": 0.00013083847157837887, + "loss": 0.2927, + "step": 20305 + }, + { + "epoch": 1.6450097213220998, + "grad_norm": 0.05693648383021355, + "learning_rate": 0.00013083397092578426, + "loss": 0.3129, + "step": 20306 + }, + { + "epoch": 1.6450907323395982, + "grad_norm": 0.06557469815015793, + "learning_rate": 0.00013082947027318962, + "loss": 0.2719, + "step": 20307 + }, + { + "epoch": 1.6451717433570967, + "grad_norm": 0.047028761357069016, + "learning_rate": 0.00013082496962059498, + "loss": 0.2601, + "step": 20308 + }, + { + "epoch": 1.645252754374595, + "grad_norm": 0.05751359462738037, + "learning_rate": 0.00013082046896800036, + "loss": 0.3182, + "step": 20309 + }, + { + "epoch": 1.6453337653920932, + "grad_norm": 0.044193051755428314, + "learning_rate": 0.00013081596831540575, + "loss": 0.2716, + "step": 20310 + }, + { + "epoch": 1.6454147764095917, + "grad_norm": 0.05198691040277481, + "learning_rate": 0.0001308114676628111, + "loss": 0.2334, + "step": 20311 + }, + { + "epoch": 
1.6454957874270901, + "grad_norm": 0.04755130782723427, + "learning_rate": 0.0001308069670102165, + "loss": 0.2808, + "step": 20312 + }, + { + "epoch": 1.6455767984445884, + "grad_norm": 0.051487136632204056, + "learning_rate": 0.00013080246635762186, + "loss": 0.2478, + "step": 20313 + }, + { + "epoch": 1.6456578094620868, + "grad_norm": 0.0572948083281517, + "learning_rate": 0.00013079796570502722, + "loss": 0.2927, + "step": 20314 + }, + { + "epoch": 1.6457388204795853, + "grad_norm": 0.060806773602962494, + "learning_rate": 0.0001307934650524326, + "loss": 0.2897, + "step": 20315 + }, + { + "epoch": 1.6458198314970836, + "grad_norm": 0.05558852106332779, + "learning_rate": 0.000130788964399838, + "loss": 0.3071, + "step": 20316 + }, + { + "epoch": 1.645900842514582, + "grad_norm": 0.042439140379428864, + "learning_rate": 0.00013078446374724335, + "loss": 0.2648, + "step": 20317 + }, + { + "epoch": 1.6459818535320805, + "grad_norm": 0.04296138882637024, + "learning_rate": 0.00013077996309464874, + "loss": 0.2554, + "step": 20318 + }, + { + "epoch": 1.6460628645495787, + "grad_norm": 0.060558926314115524, + "learning_rate": 0.0001307754624420541, + "loss": 0.2917, + "step": 20319 + }, + { + "epoch": 1.646143875567077, + "grad_norm": 0.058144014328718185, + "learning_rate": 0.00013077096178945946, + "loss": 0.2486, + "step": 20320 + }, + { + "epoch": 1.6462248865845757, + "grad_norm": 0.05100826546549797, + "learning_rate": 0.00013076646113686485, + "loss": 0.2704, + "step": 20321 + }, + { + "epoch": 1.646305897602074, + "grad_norm": 0.0650191381573677, + "learning_rate": 0.00013076196048427023, + "loss": 0.3276, + "step": 20322 + }, + { + "epoch": 1.6463869086195722, + "grad_norm": 0.0722268596291542, + "learning_rate": 0.0001307574598316756, + "loss": 0.2804, + "step": 20323 + }, + { + "epoch": 1.6464679196370706, + "grad_norm": 0.06000415235757828, + "learning_rate": 0.00013075295917908098, + "loss": 0.2547, + "step": 20324 + }, + { + "epoch": 1.646548930654569, + "grad_norm": 0.06664841622114182, + "learning_rate": 0.00013074845852648634, + "loss": 0.3037, + "step": 20325 + }, + { + "epoch": 1.6466299416720673, + "grad_norm": 0.06420580297708511, + "learning_rate": 0.00013074395787389173, + "loss": 0.2886, + "step": 20326 + }, + { + "epoch": 1.6467109526895658, + "grad_norm": 0.05521161109209061, + "learning_rate": 0.0001307394572212971, + "loss": 0.3071, + "step": 20327 + }, + { + "epoch": 1.6467919637070643, + "grad_norm": 0.05648175626993179, + "learning_rate": 0.00013073495656870248, + "loss": 0.279, + "step": 20328 + }, + { + "epoch": 1.6468729747245625, + "grad_norm": 0.055062033236026764, + "learning_rate": 0.00013073045591610784, + "loss": 0.289, + "step": 20329 + }, + { + "epoch": 1.6469539857420608, + "grad_norm": 0.04393686354160309, + "learning_rate": 0.00013072595526351322, + "loss": 0.2747, + "step": 20330 + }, + { + "epoch": 1.6470349967595594, + "grad_norm": 0.06465456634759903, + "learning_rate": 0.00013072145461091858, + "loss": 0.3107, + "step": 20331 + }, + { + "epoch": 1.6471160077770577, + "grad_norm": 0.048442304134368896, + "learning_rate": 0.00013071695395832397, + "loss": 0.3138, + "step": 20332 + }, + { + "epoch": 1.647197018794556, + "grad_norm": 0.04944649338722229, + "learning_rate": 0.00013071245330572936, + "loss": 0.2602, + "step": 20333 + }, + { + "epoch": 1.6472780298120544, + "grad_norm": 0.05407331883907318, + "learning_rate": 0.00013070795265313472, + "loss": 0.3204, + "step": 20334 + }, + { + "epoch": 1.6473590408295529, + "grad_norm": 
0.04247947037220001, + "learning_rate": 0.00013070345200054008, + "loss": 0.2753, + "step": 20335 + }, + { + "epoch": 1.6474400518470511, + "grad_norm": 0.055948950350284576, + "learning_rate": 0.00013069895134794546, + "loss": 0.3221, + "step": 20336 + }, + { + "epoch": 1.6475210628645496, + "grad_norm": 0.04786483943462372, + "learning_rate": 0.00013069445069535082, + "loss": 0.2718, + "step": 20337 + }, + { + "epoch": 1.647602073882048, + "grad_norm": 0.05264323949813843, + "learning_rate": 0.0001306899500427562, + "loss": 0.293, + "step": 20338 + }, + { + "epoch": 1.6476830848995463, + "grad_norm": 0.0563749223947525, + "learning_rate": 0.0001306854493901616, + "loss": 0.3238, + "step": 20339 + }, + { + "epoch": 1.6477640959170448, + "grad_norm": 0.05304846167564392, + "learning_rate": 0.00013068094873756696, + "loss": 0.2754, + "step": 20340 + }, + { + "epoch": 1.6478451069345432, + "grad_norm": 0.04724569618701935, + "learning_rate": 0.00013067644808497232, + "loss": 0.2655, + "step": 20341 + }, + { + "epoch": 1.6479261179520415, + "grad_norm": 0.05889305844902992, + "learning_rate": 0.0001306719474323777, + "loss": 0.2864, + "step": 20342 + }, + { + "epoch": 1.6480071289695397, + "grad_norm": 0.05639440193772316, + "learning_rate": 0.00013066744677978307, + "loss": 0.2899, + "step": 20343 + }, + { + "epoch": 1.6480881399870384, + "grad_norm": 0.05589323863387108, + "learning_rate": 0.00013066294612718845, + "loss": 0.287, + "step": 20344 + }, + { + "epoch": 1.6481691510045366, + "grad_norm": 0.04943550378084183, + "learning_rate": 0.00013065844547459384, + "loss": 0.2824, + "step": 20345 + }, + { + "epoch": 1.648250162022035, + "grad_norm": 0.05775272101163864, + "learning_rate": 0.0001306539448219992, + "loss": 0.2698, + "step": 20346 + }, + { + "epoch": 1.6483311730395334, + "grad_norm": 0.05911261960864067, + "learning_rate": 0.00013064944416940456, + "loss": 0.3123, + "step": 20347 + }, + { + "epoch": 1.6484121840570318, + "grad_norm": 0.05397322401404381, + "learning_rate": 0.00013064494351680995, + "loss": 0.2795, + "step": 20348 + }, + { + "epoch": 1.64849319507453, + "grad_norm": 0.04608829692006111, + "learning_rate": 0.0001306404428642153, + "loss": 0.3049, + "step": 20349 + }, + { + "epoch": 1.6485742060920285, + "grad_norm": 0.051778387278318405, + "learning_rate": 0.0001306359422116207, + "loss": 0.2885, + "step": 20350 + }, + { + "epoch": 1.648655217109527, + "grad_norm": 0.04542142152786255, + "learning_rate": 0.00013063144155902608, + "loss": 0.2235, + "step": 20351 + }, + { + "epoch": 1.6487362281270252, + "grad_norm": 0.04840834438800812, + "learning_rate": 0.00013062694090643144, + "loss": 0.2946, + "step": 20352 + }, + { + "epoch": 1.6488172391445235, + "grad_norm": 0.05071964114904404, + "learning_rate": 0.0001306224402538368, + "loss": 0.271, + "step": 20353 + }, + { + "epoch": 1.6488982501620222, + "grad_norm": 0.05756606161594391, + "learning_rate": 0.0001306179396012422, + "loss": 0.2519, + "step": 20354 + }, + { + "epoch": 1.6489792611795204, + "grad_norm": 0.05126466229557991, + "learning_rate": 0.00013061343894864758, + "loss": 0.2839, + "step": 20355 + }, + { + "epoch": 1.6490602721970187, + "grad_norm": 0.06566730886697769, + "learning_rate": 0.00013060893829605294, + "loss": 0.3015, + "step": 20356 + }, + { + "epoch": 1.6491412832145171, + "grad_norm": 0.06146741658449173, + "learning_rate": 0.00013060443764345832, + "loss": 0.2913, + "step": 20357 + }, + { + "epoch": 1.6492222942320156, + "grad_norm": 0.04742487892508507, + "learning_rate": 
0.00013059993699086368, + "loss": 0.2492, + "step": 20358 + }, + { + "epoch": 1.6493033052495139, + "grad_norm": 0.051782794296741486, + "learning_rate": 0.00013059543633826904, + "loss": 0.299, + "step": 20359 + }, + { + "epoch": 1.6493843162670123, + "grad_norm": 0.056378141045570374, + "learning_rate": 0.00013059093568567443, + "loss": 0.2448, + "step": 20360 + }, + { + "epoch": 1.6494653272845108, + "grad_norm": 0.06598246842622757, + "learning_rate": 0.00013058643503307982, + "loss": 0.3373, + "step": 20361 + }, + { + "epoch": 1.649546338302009, + "grad_norm": 0.05629661679267883, + "learning_rate": 0.00013058193438048518, + "loss": 0.2929, + "step": 20362 + }, + { + "epoch": 1.6496273493195075, + "grad_norm": 0.05046766623854637, + "learning_rate": 0.00013057743372789056, + "loss": 0.2524, + "step": 20363 + }, + { + "epoch": 1.649708360337006, + "grad_norm": 0.0475756861269474, + "learning_rate": 0.00013057293307529592, + "loss": 0.2779, + "step": 20364 + }, + { + "epoch": 1.6497893713545042, + "grad_norm": 0.045517344027757645, + "learning_rate": 0.00013056843242270128, + "loss": 0.2772, + "step": 20365 + }, + { + "epoch": 1.6498703823720025, + "grad_norm": 0.054667480289936066, + "learning_rate": 0.00013056393177010667, + "loss": 0.2758, + "step": 20366 + }, + { + "epoch": 1.649951393389501, + "grad_norm": 0.04701853170990944, + "learning_rate": 0.00013055943111751206, + "loss": 0.2884, + "step": 20367 + }, + { + "epoch": 1.6500324044069994, + "grad_norm": 0.049440670758485794, + "learning_rate": 0.00013055493046491742, + "loss": 0.2548, + "step": 20368 + }, + { + "epoch": 1.6501134154244976, + "grad_norm": 0.05815757438540459, + "learning_rate": 0.0001305504298123228, + "loss": 0.2529, + "step": 20369 + }, + { + "epoch": 1.650194426441996, + "grad_norm": 0.049093883484601974, + "learning_rate": 0.00013054592915972817, + "loss": 0.3112, + "step": 20370 + }, + { + "epoch": 1.6502754374594946, + "grad_norm": 0.05279753729701042, + "learning_rate": 0.00013054142850713353, + "loss": 0.2654, + "step": 20371 + }, + { + "epoch": 1.6503564484769928, + "grad_norm": 0.04632333666086197, + "learning_rate": 0.00013053692785453891, + "loss": 0.2417, + "step": 20372 + }, + { + "epoch": 1.6504374594944913, + "grad_norm": 0.043918710201978683, + "learning_rate": 0.0001305324272019443, + "loss": 0.2442, + "step": 20373 + }, + { + "epoch": 1.6505184705119897, + "grad_norm": 0.04563755542039871, + "learning_rate": 0.00013052792654934966, + "loss": 0.2821, + "step": 20374 + }, + { + "epoch": 1.650599481529488, + "grad_norm": 0.05067136138677597, + "learning_rate": 0.00013052342589675505, + "loss": 0.236, + "step": 20375 + }, + { + "epoch": 1.6506804925469862, + "grad_norm": 0.05287901684641838, + "learning_rate": 0.0001305189252441604, + "loss": 0.2115, + "step": 20376 + }, + { + "epoch": 1.650761503564485, + "grad_norm": 0.04865317791700363, + "learning_rate": 0.00013051442459156577, + "loss": 0.2497, + "step": 20377 + }, + { + "epoch": 1.6508425145819832, + "grad_norm": 0.059927087277173996, + "learning_rate": 0.00013050992393897116, + "loss": 0.2981, + "step": 20378 + }, + { + "epoch": 1.6509235255994814, + "grad_norm": 0.044385023415088654, + "learning_rate": 0.00013050542328637654, + "loss": 0.2481, + "step": 20379 + }, + { + "epoch": 1.6510045366169799, + "grad_norm": 0.05484599247574806, + "learning_rate": 0.0001305009226337819, + "loss": 0.2793, + "step": 20380 + }, + { + "epoch": 1.6510855476344783, + "grad_norm": 0.053661175072193146, + "learning_rate": 0.0001304964219811873, + "loss": 
0.2613, + "step": 20381 + }, + { + "epoch": 1.6511665586519766, + "grad_norm": 0.050644651055336, + "learning_rate": 0.00013049192132859265, + "loss": 0.2869, + "step": 20382 + }, + { + "epoch": 1.651247569669475, + "grad_norm": 0.0510164275765419, + "learning_rate": 0.000130487420675998, + "loss": 0.2726, + "step": 20383 + }, + { + "epoch": 1.6513285806869735, + "grad_norm": 0.05315467342734337, + "learning_rate": 0.0001304829200234034, + "loss": 0.2751, + "step": 20384 + }, + { + "epoch": 1.6514095917044718, + "grad_norm": 0.05707908421754837, + "learning_rate": 0.00013047841937080878, + "loss": 0.2527, + "step": 20385 + }, + { + "epoch": 1.6514906027219702, + "grad_norm": 0.05162080377340317, + "learning_rate": 0.00013047391871821414, + "loss": 0.2513, + "step": 20386 + }, + { + "epoch": 1.6515716137394687, + "grad_norm": 0.06012650206685066, + "learning_rate": 0.00013046941806561953, + "loss": 0.2579, + "step": 20387 + }, + { + "epoch": 1.651652624756967, + "grad_norm": 0.05233855918049812, + "learning_rate": 0.0001304649174130249, + "loss": 0.2584, + "step": 20388 + }, + { + "epoch": 1.6517336357744652, + "grad_norm": 0.06599611043930054, + "learning_rate": 0.00013046041676043025, + "loss": 0.2751, + "step": 20389 + }, + { + "epoch": 1.6518146467919637, + "grad_norm": 0.07051976770162582, + "learning_rate": 0.00013045591610783564, + "loss": 0.3284, + "step": 20390 + }, + { + "epoch": 1.6518956578094621, + "grad_norm": 0.052024535834789276, + "learning_rate": 0.00013045141545524103, + "loss": 0.291, + "step": 20391 + }, + { + "epoch": 1.6519766688269604, + "grad_norm": 0.05041825398802757, + "learning_rate": 0.00013044691480264639, + "loss": 0.2656, + "step": 20392 + }, + { + "epoch": 1.6520576798444588, + "grad_norm": 0.059028733521699905, + "learning_rate": 0.00013044241415005177, + "loss": 0.3197, + "step": 20393 + }, + { + "epoch": 1.6521386908619573, + "grad_norm": 0.06829122453927994, + "learning_rate": 0.00013043791349745713, + "loss": 0.2976, + "step": 20394 + }, + { + "epoch": 1.6522197018794555, + "grad_norm": 0.050032202154397964, + "learning_rate": 0.0001304334128448625, + "loss": 0.2844, + "step": 20395 + }, + { + "epoch": 1.652300712896954, + "grad_norm": 0.05230595916509628, + "learning_rate": 0.00013042891219226788, + "loss": 0.2435, + "step": 20396 + }, + { + "epoch": 1.6523817239144525, + "grad_norm": 0.048366427421569824, + "learning_rate": 0.00013042441153967327, + "loss": 0.3016, + "step": 20397 + }, + { + "epoch": 1.6524627349319507, + "grad_norm": 0.05470036715269089, + "learning_rate": 0.00013041991088707863, + "loss": 0.2688, + "step": 20398 + }, + { + "epoch": 1.652543745949449, + "grad_norm": 0.048787783831357956, + "learning_rate": 0.00013041541023448401, + "loss": 0.3097, + "step": 20399 + }, + { + "epoch": 1.6526247569669477, + "grad_norm": 0.0499483160674572, + "learning_rate": 0.00013041090958188937, + "loss": 0.2896, + "step": 20400 + }, + { + "epoch": 1.652705767984446, + "grad_norm": 0.05546579509973526, + "learning_rate": 0.00013040640892929473, + "loss": 0.2932, + "step": 20401 + }, + { + "epoch": 1.6527867790019442, + "grad_norm": 0.05673975497484207, + "learning_rate": 0.00013040190827670015, + "loss": 0.3236, + "step": 20402 + }, + { + "epoch": 1.6528677900194426, + "grad_norm": 0.05085534229874611, + "learning_rate": 0.0001303974076241055, + "loss": 0.2284, + "step": 20403 + }, + { + "epoch": 1.652948801036941, + "grad_norm": 0.05804755911231041, + "learning_rate": 0.00013039290697151087, + "loss": 0.3285, + "step": 20404 + }, + { + "epoch": 
1.6530298120544393, + "grad_norm": 0.05439860746264458, + "learning_rate": 0.00013038840631891626, + "loss": 0.3477, + "step": 20405 + }, + { + "epoch": 1.6531108230719378, + "grad_norm": 0.04965033009648323, + "learning_rate": 0.00013038390566632162, + "loss": 0.266, + "step": 20406 + }, + { + "epoch": 1.6531918340894363, + "grad_norm": 0.05848463624715805, + "learning_rate": 0.000130379405013727, + "loss": 0.3379, + "step": 20407 + }, + { + "epoch": 1.6532728451069345, + "grad_norm": 0.05859372764825821, + "learning_rate": 0.0001303749043611324, + "loss": 0.2968, + "step": 20408 + }, + { + "epoch": 1.653353856124433, + "grad_norm": 0.05018116161227226, + "learning_rate": 0.00013037040370853775, + "loss": 0.2596, + "step": 20409 + }, + { + "epoch": 1.6534348671419314, + "grad_norm": 0.046670421957969666, + "learning_rate": 0.0001303659030559431, + "loss": 0.2622, + "step": 20410 + }, + { + "epoch": 1.6535158781594297, + "grad_norm": 0.0573502741754055, + "learning_rate": 0.0001303614024033485, + "loss": 0.2577, + "step": 20411 + }, + { + "epoch": 1.653596889176928, + "grad_norm": 0.05708976462483406, + "learning_rate": 0.00013035690175075386, + "loss": 0.2701, + "step": 20412 + }, + { + "epoch": 1.6536779001944264, + "grad_norm": 0.05246642231941223, + "learning_rate": 0.00013035240109815924, + "loss": 0.2621, + "step": 20413 + }, + { + "epoch": 1.6537589112119249, + "grad_norm": 0.0493154302239418, + "learning_rate": 0.00013034790044556463, + "loss": 0.2561, + "step": 20414 + }, + { + "epoch": 1.653839922229423, + "grad_norm": 0.061303410679101944, + "learning_rate": 0.00013034339979297, + "loss": 0.2716, + "step": 20415 + }, + { + "epoch": 1.6539209332469216, + "grad_norm": 0.05280701443552971, + "learning_rate": 0.00013033889914037535, + "loss": 0.2537, + "step": 20416 + }, + { + "epoch": 1.65400194426442, + "grad_norm": 0.059893831610679626, + "learning_rate": 0.00013033439848778074, + "loss": 0.2837, + "step": 20417 + }, + { + "epoch": 1.6540829552819183, + "grad_norm": 0.07370199263095856, + "learning_rate": 0.0001303298978351861, + "loss": 0.2741, + "step": 20418 + }, + { + "epoch": 1.6541639662994168, + "grad_norm": 0.06630618870258331, + "learning_rate": 0.0001303253971825915, + "loss": 0.286, + "step": 20419 + }, + { + "epoch": 1.6542449773169152, + "grad_norm": 0.053009796887636185, + "learning_rate": 0.00013032089652999687, + "loss": 0.256, + "step": 20420 + }, + { + "epoch": 1.6543259883344135, + "grad_norm": 0.05130352824926376, + "learning_rate": 0.00013031639587740223, + "loss": 0.3012, + "step": 20421 + }, + { + "epoch": 1.6544069993519117, + "grad_norm": 0.04923132061958313, + "learning_rate": 0.0001303118952248076, + "loss": 0.2647, + "step": 20422 + }, + { + "epoch": 1.6544880103694104, + "grad_norm": 0.062321193516254425, + "learning_rate": 0.00013030739457221298, + "loss": 0.3192, + "step": 20423 + }, + { + "epoch": 1.6545690213869086, + "grad_norm": 0.05298849195241928, + "learning_rate": 0.00013030289391961834, + "loss": 0.3241, + "step": 20424 + }, + { + "epoch": 1.654650032404407, + "grad_norm": 0.057993028312921524, + "learning_rate": 0.00013029839326702373, + "loss": 0.3059, + "step": 20425 + }, + { + "epoch": 1.6547310434219054, + "grad_norm": 0.05777161940932274, + "learning_rate": 0.00013029389261442912, + "loss": 0.308, + "step": 20426 + }, + { + "epoch": 1.6548120544394038, + "grad_norm": 0.049483541399240494, + "learning_rate": 0.00013028939196183448, + "loss": 0.2899, + "step": 20427 + }, + { + "epoch": 1.654893065456902, + "grad_norm": 
0.04754691198468208, + "learning_rate": 0.00013028489130923984, + "loss": 0.2583, + "step": 20428 + }, + { + "epoch": 1.6549740764744005, + "grad_norm": 0.0468088760972023, + "learning_rate": 0.00013028039065664522, + "loss": 0.2979, + "step": 20429 + }, + { + "epoch": 1.655055087491899, + "grad_norm": 0.05034726858139038, + "learning_rate": 0.00013027589000405058, + "loss": 0.2969, + "step": 20430 + }, + { + "epoch": 1.6551360985093972, + "grad_norm": 0.05580078065395355, + "learning_rate": 0.00013027138935145597, + "loss": 0.2764, + "step": 20431 + }, + { + "epoch": 1.6552171095268955, + "grad_norm": 0.0637517049908638, + "learning_rate": 0.00013026688869886136, + "loss": 0.3075, + "step": 20432 + }, + { + "epoch": 1.6552981205443942, + "grad_norm": 0.03854849934577942, + "learning_rate": 0.00013026238804626672, + "loss": 0.2383, + "step": 20433 + }, + { + "epoch": 1.6553791315618924, + "grad_norm": 0.04968152195215225, + "learning_rate": 0.00013025788739367208, + "loss": 0.2882, + "step": 20434 + }, + { + "epoch": 1.6554601425793907, + "grad_norm": 0.0447208434343338, + "learning_rate": 0.00013025338674107746, + "loss": 0.3073, + "step": 20435 + }, + { + "epoch": 1.6555411535968891, + "grad_norm": 0.07282092422246933, + "learning_rate": 0.00013024888608848285, + "loss": 0.3316, + "step": 20436 + }, + { + "epoch": 1.6556221646143876, + "grad_norm": 0.06616365164518356, + "learning_rate": 0.0001302443854358882, + "loss": 0.3561, + "step": 20437 + }, + { + "epoch": 1.6557031756318858, + "grad_norm": 0.04712116718292236, + "learning_rate": 0.0001302398847832936, + "loss": 0.2482, + "step": 20438 + }, + { + "epoch": 1.6557841866493843, + "grad_norm": 0.05836176499724388, + "learning_rate": 0.00013023538413069896, + "loss": 0.3062, + "step": 20439 + }, + { + "epoch": 1.6558651976668828, + "grad_norm": 0.05766526237130165, + "learning_rate": 0.00013023088347810432, + "loss": 0.2625, + "step": 20440 + }, + { + "epoch": 1.655946208684381, + "grad_norm": 0.04732430353760719, + "learning_rate": 0.0001302263828255097, + "loss": 0.3104, + "step": 20441 + }, + { + "epoch": 1.6560272197018795, + "grad_norm": 0.05533413216471672, + "learning_rate": 0.0001302218821729151, + "loss": 0.332, + "step": 20442 + }, + { + "epoch": 1.656108230719378, + "grad_norm": 0.04660717025399208, + "learning_rate": 0.00013021738152032045, + "loss": 0.2553, + "step": 20443 + }, + { + "epoch": 1.6561892417368762, + "grad_norm": 0.04941389709711075, + "learning_rate": 0.00013021288086772584, + "loss": 0.2431, + "step": 20444 + }, + { + "epoch": 1.6562702527543745, + "grad_norm": 0.052802614867687225, + "learning_rate": 0.0001302083802151312, + "loss": 0.2749, + "step": 20445 + }, + { + "epoch": 1.6563512637718731, + "grad_norm": 0.059079330414533615, + "learning_rate": 0.00013020387956253656, + "loss": 0.3094, + "step": 20446 + }, + { + "epoch": 1.6564322747893714, + "grad_norm": 0.04814727231860161, + "learning_rate": 0.00013019937890994195, + "loss": 0.2567, + "step": 20447 + }, + { + "epoch": 1.6565132858068696, + "grad_norm": 0.049222610890865326, + "learning_rate": 0.00013019487825734733, + "loss": 0.2786, + "step": 20448 + }, + { + "epoch": 1.656594296824368, + "grad_norm": 0.0641581192612648, + "learning_rate": 0.0001301903776047527, + "loss": 0.2967, + "step": 20449 + }, + { + "epoch": 1.6566753078418666, + "grad_norm": 0.0636487528681755, + "learning_rate": 0.00013018587695215808, + "loss": 0.3429, + "step": 20450 + }, + { + "epoch": 1.6567563188593648, + "grad_norm": 0.06307125836610794, + "learning_rate": 
0.00013018137629956344, + "loss": 0.306, + "step": 20451 + }, + { + "epoch": 1.6568373298768633, + "grad_norm": 0.054315872490406036, + "learning_rate": 0.0001301768756469688, + "loss": 0.283, + "step": 20452 + }, + { + "epoch": 1.6569183408943617, + "grad_norm": 0.05288204923272133, + "learning_rate": 0.0001301723749943742, + "loss": 0.2725, + "step": 20453 + }, + { + "epoch": 1.65699935191186, + "grad_norm": 0.05411860719323158, + "learning_rate": 0.00013016787434177958, + "loss": 0.2966, + "step": 20454 + }, + { + "epoch": 1.6570803629293582, + "grad_norm": 0.05066904053092003, + "learning_rate": 0.00013016337368918494, + "loss": 0.2858, + "step": 20455 + }, + { + "epoch": 1.657161373946857, + "grad_norm": 0.06001361459493637, + "learning_rate": 0.00013015887303659032, + "loss": 0.303, + "step": 20456 + }, + { + "epoch": 1.6572423849643552, + "grad_norm": 0.05340325087308884, + "learning_rate": 0.00013015437238399568, + "loss": 0.2669, + "step": 20457 + }, + { + "epoch": 1.6573233959818534, + "grad_norm": 0.050688330084085464, + "learning_rate": 0.00013014987173140104, + "loss": 0.2609, + "step": 20458 + }, + { + "epoch": 1.6574044069993519, + "grad_norm": 0.05546950921416283, + "learning_rate": 0.00013014537107880643, + "loss": 0.3079, + "step": 20459 + }, + { + "epoch": 1.6574854180168503, + "grad_norm": 0.05770743638277054, + "learning_rate": 0.00013014087042621182, + "loss": 0.2666, + "step": 20460 + }, + { + "epoch": 1.6575664290343486, + "grad_norm": 0.05666600912809372, + "learning_rate": 0.00013013636977361718, + "loss": 0.2775, + "step": 20461 + }, + { + "epoch": 1.657647440051847, + "grad_norm": 0.05800339952111244, + "learning_rate": 0.00013013186912102257, + "loss": 0.2936, + "step": 20462 + }, + { + "epoch": 1.6577284510693455, + "grad_norm": 0.04769282788038254, + "learning_rate": 0.00013012736846842793, + "loss": 0.2965, + "step": 20463 + }, + { + "epoch": 1.6578094620868438, + "grad_norm": 0.05040494725108147, + "learning_rate": 0.00013012286781583329, + "loss": 0.2545, + "step": 20464 + }, + { + "epoch": 1.6578904731043422, + "grad_norm": 0.058276791125535965, + "learning_rate": 0.00013011836716323867, + "loss": 0.3043, + "step": 20465 + }, + { + "epoch": 1.6579714841218407, + "grad_norm": 0.05103198066353798, + "learning_rate": 0.00013011386651064406, + "loss": 0.293, + "step": 20466 + }, + { + "epoch": 1.658052495139339, + "grad_norm": 0.050009891390800476, + "learning_rate": 0.00013010936585804942, + "loss": 0.2961, + "step": 20467 + }, + { + "epoch": 1.6581335061568372, + "grad_norm": 0.04965313896536827, + "learning_rate": 0.0001301048652054548, + "loss": 0.2734, + "step": 20468 + }, + { + "epoch": 1.6582145171743357, + "grad_norm": 0.0604669414460659, + "learning_rate": 0.00013010036455286017, + "loss": 0.2839, + "step": 20469 + }, + { + "epoch": 1.6582955281918341, + "grad_norm": 0.052262432873249054, + "learning_rate": 0.00013009586390026553, + "loss": 0.2666, + "step": 20470 + }, + { + "epoch": 1.6583765392093324, + "grad_norm": 0.06164982542395592, + "learning_rate": 0.00013009136324767094, + "loss": 0.27, + "step": 20471 + }, + { + "epoch": 1.6584575502268308, + "grad_norm": 0.049257468432188034, + "learning_rate": 0.0001300868625950763, + "loss": 0.2946, + "step": 20472 + }, + { + "epoch": 1.6585385612443293, + "grad_norm": 0.04953254014253616, + "learning_rate": 0.00013008236194248166, + "loss": 0.2726, + "step": 20473 + }, + { + "epoch": 1.6586195722618275, + "grad_norm": 0.047944556921720505, + "learning_rate": 0.00013007786128988705, + "loss": 0.2686, + 
"step": 20474 + }, + { + "epoch": 1.658700583279326, + "grad_norm": 0.0578407347202301, + "learning_rate": 0.0001300733606372924, + "loss": 0.308, + "step": 20475 + }, + { + "epoch": 1.6587815942968245, + "grad_norm": 0.048618391156196594, + "learning_rate": 0.00013006885998469777, + "loss": 0.2911, + "step": 20476 + }, + { + "epoch": 1.6588626053143227, + "grad_norm": 0.05097212642431259, + "learning_rate": 0.00013006435933210318, + "loss": 0.2919, + "step": 20477 + }, + { + "epoch": 1.658943616331821, + "grad_norm": 0.05795508995652199, + "learning_rate": 0.00013005985867950854, + "loss": 0.2925, + "step": 20478 + }, + { + "epoch": 1.6590246273493197, + "grad_norm": 0.04863395914435387, + "learning_rate": 0.0001300553580269139, + "loss": 0.2612, + "step": 20479 + }, + { + "epoch": 1.659105638366818, + "grad_norm": 0.0567551851272583, + "learning_rate": 0.0001300508573743193, + "loss": 0.2638, + "step": 20480 + }, + { + "epoch": 1.6591866493843161, + "grad_norm": 0.05892769247293472, + "learning_rate": 0.00013004635672172465, + "loss": 0.3301, + "step": 20481 + }, + { + "epoch": 1.6592676604018146, + "grad_norm": 0.049185313284397125, + "learning_rate": 0.00013004185606913, + "loss": 0.246, + "step": 20482 + }, + { + "epoch": 1.659348671419313, + "grad_norm": 0.055416181683540344, + "learning_rate": 0.00013003735541653542, + "loss": 0.3175, + "step": 20483 + }, + { + "epoch": 1.6594296824368113, + "grad_norm": 0.06165776774287224, + "learning_rate": 0.00013003285476394078, + "loss": 0.2874, + "step": 20484 + }, + { + "epoch": 1.6595106934543098, + "grad_norm": 0.04640460014343262, + "learning_rate": 0.00013002835411134614, + "loss": 0.257, + "step": 20485 + }, + { + "epoch": 1.6595917044718083, + "grad_norm": 0.04604089632630348, + "learning_rate": 0.00013002385345875153, + "loss": 0.2438, + "step": 20486 + }, + { + "epoch": 1.6596727154893065, + "grad_norm": 0.04631912335753441, + "learning_rate": 0.0001300193528061569, + "loss": 0.272, + "step": 20487 + }, + { + "epoch": 1.659753726506805, + "grad_norm": 0.04735549911856651, + "learning_rate": 0.00013001485215356228, + "loss": 0.2845, + "step": 20488 + }, + { + "epoch": 1.6598347375243034, + "grad_norm": 0.05785086750984192, + "learning_rate": 0.00013001035150096767, + "loss": 0.3138, + "step": 20489 + }, + { + "epoch": 1.6599157485418017, + "grad_norm": 0.046076275408267975, + "learning_rate": 0.00013000585084837303, + "loss": 0.2419, + "step": 20490 + }, + { + "epoch": 1.6599967595593, + "grad_norm": 0.04774550348520279, + "learning_rate": 0.00013000135019577839, + "loss": 0.2504, + "step": 20491 + }, + { + "epoch": 1.6600777705767984, + "grad_norm": 0.04708302021026611, + "learning_rate": 0.00012999684954318377, + "loss": 0.2562, + "step": 20492 + }, + { + "epoch": 1.6601587815942969, + "grad_norm": 0.048026781529188156, + "learning_rate": 0.00012999234889058913, + "loss": 0.2327, + "step": 20493 + }, + { + "epoch": 1.660239792611795, + "grad_norm": 0.05444342643022537, + "learning_rate": 0.00012998784823799452, + "loss": 0.2696, + "step": 20494 + }, + { + "epoch": 1.6603208036292936, + "grad_norm": 0.05067823454737663, + "learning_rate": 0.0001299833475853999, + "loss": 0.2818, + "step": 20495 + }, + { + "epoch": 1.660401814646792, + "grad_norm": 0.051078833639621735, + "learning_rate": 0.00012997884693280527, + "loss": 0.2722, + "step": 20496 + }, + { + "epoch": 1.6604828256642903, + "grad_norm": 0.06498068571090698, + "learning_rate": 0.00012997434628021063, + "loss": 0.2903, + "step": 20497 + }, + { + "epoch": 1.6605638366817888, 
+ "grad_norm": 0.05589541792869568, + "learning_rate": 0.00012996984562761601, + "loss": 0.2993, + "step": 20498 + }, + { + "epoch": 1.6606448476992872, + "grad_norm": 0.06637276709079742, + "learning_rate": 0.00012996534497502137, + "loss": 0.2834, + "step": 20499 + }, + { + "epoch": 1.6607258587167855, + "grad_norm": 0.060838427394628525, + "learning_rate": 0.00012996084432242676, + "loss": 0.2652, + "step": 20500 + }, + { + "epoch": 1.6608068697342837, + "grad_norm": 0.0554727278649807, + "learning_rate": 0.00012995634366983215, + "loss": 0.2752, + "step": 20501 + }, + { + "epoch": 1.6608878807517824, + "grad_norm": 0.06209820508956909, + "learning_rate": 0.0001299518430172375, + "loss": 0.302, + "step": 20502 + }, + { + "epoch": 1.6609688917692806, + "grad_norm": 0.05828804150223732, + "learning_rate": 0.00012994734236464287, + "loss": 0.2439, + "step": 20503 + }, + { + "epoch": 1.6610499027867789, + "grad_norm": 0.05182472616434097, + "learning_rate": 0.00012994284171204826, + "loss": 0.2688, + "step": 20504 + }, + { + "epoch": 1.6611309138042774, + "grad_norm": 0.0592801459133625, + "learning_rate": 0.00012993834105945362, + "loss": 0.2649, + "step": 20505 + }, + { + "epoch": 1.6612119248217758, + "grad_norm": 0.047676533460617065, + "learning_rate": 0.000129933840406859, + "loss": 0.2698, + "step": 20506 + }, + { + "epoch": 1.661292935839274, + "grad_norm": 0.058383118361234665, + "learning_rate": 0.0001299293397542644, + "loss": 0.3119, + "step": 20507 + }, + { + "epoch": 1.6613739468567725, + "grad_norm": 0.05639087036252022, + "learning_rate": 0.00012992483910166975, + "loss": 0.2971, + "step": 20508 + }, + { + "epoch": 1.661454957874271, + "grad_norm": 0.05297534912824631, + "learning_rate": 0.0001299203384490751, + "loss": 0.3108, + "step": 20509 + }, + { + "epoch": 1.6615359688917692, + "grad_norm": 0.0611218698322773, + "learning_rate": 0.0001299158377964805, + "loss": 0.3098, + "step": 20510 + }, + { + "epoch": 1.6616169799092677, + "grad_norm": 0.04799456521868706, + "learning_rate": 0.00012991133714388589, + "loss": 0.2553, + "step": 20511 + }, + { + "epoch": 1.6616979909267662, + "grad_norm": 0.0584605410695076, + "learning_rate": 0.00012990683649129125, + "loss": 0.2777, + "step": 20512 + }, + { + "epoch": 1.6617790019442644, + "grad_norm": 0.057419463992118835, + "learning_rate": 0.00012990233583869663, + "loss": 0.2732, + "step": 20513 + }, + { + "epoch": 1.6618600129617627, + "grad_norm": 0.05271988362073898, + "learning_rate": 0.000129897835186102, + "loss": 0.2453, + "step": 20514 + }, + { + "epoch": 1.6619410239792611, + "grad_norm": 0.04664996638894081, + "learning_rate": 0.00012989333453350735, + "loss": 0.2789, + "step": 20515 + }, + { + "epoch": 1.6620220349967596, + "grad_norm": 0.05341833829879761, + "learning_rate": 0.00012988883388091274, + "loss": 0.3222, + "step": 20516 + }, + { + "epoch": 1.6621030460142578, + "grad_norm": 0.06119184568524361, + "learning_rate": 0.00012988433322831813, + "loss": 0.3275, + "step": 20517 + }, + { + "epoch": 1.6621840570317563, + "grad_norm": 0.05141368880867958, + "learning_rate": 0.0001298798325757235, + "loss": 0.2669, + "step": 20518 + }, + { + "epoch": 1.6622650680492548, + "grad_norm": 0.05545364320278168, + "learning_rate": 0.00012987533192312887, + "loss": 0.299, + "step": 20519 + }, + { + "epoch": 1.662346079066753, + "grad_norm": 0.060117967426776886, + "learning_rate": 0.00012987083127053423, + "loss": 0.2914, + "step": 20520 + }, + { + "epoch": 1.6624270900842515, + "grad_norm": 0.04995759576559067, + 
"learning_rate": 0.0001298663306179396, + "loss": 0.2584, + "step": 20521 + }, + { + "epoch": 1.66250810110175, + "grad_norm": 0.061563752591609955, + "learning_rate": 0.00012986182996534498, + "loss": 0.2928, + "step": 20522 + }, + { + "epoch": 1.6625891121192482, + "grad_norm": 0.05292750149965286, + "learning_rate": 0.00012985732931275037, + "loss": 0.277, + "step": 20523 + }, + { + "epoch": 1.6626701231367464, + "grad_norm": 0.0493243932723999, + "learning_rate": 0.00012985282866015573, + "loss": 0.2379, + "step": 20524 + }, + { + "epoch": 1.6627511341542451, + "grad_norm": 0.05584977939724922, + "learning_rate": 0.00012984832800756112, + "loss": 0.3169, + "step": 20525 + }, + { + "epoch": 1.6628321451717434, + "grad_norm": 0.05710172653198242, + "learning_rate": 0.00012984382735496648, + "loss": 0.3137, + "step": 20526 + }, + { + "epoch": 1.6629131561892416, + "grad_norm": 0.05750979483127594, + "learning_rate": 0.00012983932670237184, + "loss": 0.2796, + "step": 20527 + }, + { + "epoch": 1.66299416720674, + "grad_norm": 0.04641050845384598, + "learning_rate": 0.00012983482604977722, + "loss": 0.2926, + "step": 20528 + }, + { + "epoch": 1.6630751782242386, + "grad_norm": 0.046793535351753235, + "learning_rate": 0.0001298303253971826, + "loss": 0.3039, + "step": 20529 + }, + { + "epoch": 1.6631561892417368, + "grad_norm": 0.06085945665836334, + "learning_rate": 0.00012982582474458797, + "loss": 0.2852, + "step": 20530 + }, + { + "epoch": 1.6632372002592353, + "grad_norm": 0.05577259510755539, + "learning_rate": 0.00012982132409199336, + "loss": 0.2757, + "step": 20531 + }, + { + "epoch": 1.6633182112767337, + "grad_norm": 0.06003398075699806, + "learning_rate": 0.00012981682343939872, + "loss": 0.3303, + "step": 20532 + }, + { + "epoch": 1.663399222294232, + "grad_norm": 0.04969777911901474, + "learning_rate": 0.00012981232278680408, + "loss": 0.2768, + "step": 20533 + }, + { + "epoch": 1.6634802333117304, + "grad_norm": 0.04817051440477371, + "learning_rate": 0.00012980782213420946, + "loss": 0.285, + "step": 20534 + }, + { + "epoch": 1.663561244329229, + "grad_norm": 0.05781611427664757, + "learning_rate": 0.00012980332148161485, + "loss": 0.3096, + "step": 20535 + }, + { + "epoch": 1.6636422553467272, + "grad_norm": 0.05585617944598198, + "learning_rate": 0.0001297988208290202, + "loss": 0.2738, + "step": 20536 + }, + { + "epoch": 1.6637232663642254, + "grad_norm": 0.06109308823943138, + "learning_rate": 0.0001297943201764256, + "loss": 0.3138, + "step": 20537 + }, + { + "epoch": 1.6638042773817239, + "grad_norm": 0.06365103274583817, + "learning_rate": 0.00012978981952383096, + "loss": 0.2642, + "step": 20538 + }, + { + "epoch": 1.6638852883992223, + "grad_norm": 0.05765408277511597, + "learning_rate": 0.00012978531887123632, + "loss": 0.2954, + "step": 20539 + }, + { + "epoch": 1.6639662994167206, + "grad_norm": 0.05007627233862877, + "learning_rate": 0.00012978081821864173, + "loss": 0.2646, + "step": 20540 + }, + { + "epoch": 1.664047310434219, + "grad_norm": 0.05124497786164284, + "learning_rate": 0.0001297763175660471, + "loss": 0.2546, + "step": 20541 + }, + { + "epoch": 1.6641283214517175, + "grad_norm": 0.05661545321345329, + "learning_rate": 0.00012977181691345245, + "loss": 0.2833, + "step": 20542 + }, + { + "epoch": 1.6642093324692158, + "grad_norm": 0.05698556825518608, + "learning_rate": 0.00012976731626085784, + "loss": 0.3098, + "step": 20543 + }, + { + "epoch": 1.6642903434867142, + "grad_norm": 0.052347540855407715, + "learning_rate": 0.0001297628156082632, + 
"loss": 0.2823, + "step": 20544 + }, + { + "epoch": 1.6643713545042127, + "grad_norm": 0.05601882189512253, + "learning_rate": 0.00012975831495566856, + "loss": 0.2739, + "step": 20545 + }, + { + "epoch": 1.664452365521711, + "grad_norm": 0.04795810952782631, + "learning_rate": 0.00012975381430307397, + "loss": 0.3018, + "step": 20546 + }, + { + "epoch": 1.6645333765392092, + "grad_norm": 0.053273119032382965, + "learning_rate": 0.00012974931365047933, + "loss": 0.2787, + "step": 20547 + }, + { + "epoch": 1.6646143875567079, + "grad_norm": 0.058366890996694565, + "learning_rate": 0.0001297448129978847, + "loss": 0.2966, + "step": 20548 + }, + { + "epoch": 1.6646953985742061, + "grad_norm": 0.05622422322630882, + "learning_rate": 0.00012974031234529008, + "loss": 0.2686, + "step": 20549 + }, + { + "epoch": 1.6647764095917044, + "grad_norm": 0.07095862179994583, + "learning_rate": 0.00012973581169269544, + "loss": 0.3119, + "step": 20550 + }, + { + "epoch": 1.6648574206092028, + "grad_norm": 0.056710973381996155, + "learning_rate": 0.0001297313110401008, + "loss": 0.2593, + "step": 20551 + }, + { + "epoch": 1.6649384316267013, + "grad_norm": 0.05565338581800461, + "learning_rate": 0.00012972681038750622, + "loss": 0.2868, + "step": 20552 + }, + { + "epoch": 1.6650194426441995, + "grad_norm": 0.05398929491639137, + "learning_rate": 0.00012972230973491158, + "loss": 0.3339, + "step": 20553 + }, + { + "epoch": 1.665100453661698, + "grad_norm": 0.05687215179204941, + "learning_rate": 0.00012971780908231694, + "loss": 0.2824, + "step": 20554 + }, + { + "epoch": 1.6651814646791965, + "grad_norm": 0.06144799664616585, + "learning_rate": 0.00012971330842972232, + "loss": 0.2891, + "step": 20555 + }, + { + "epoch": 1.6652624756966947, + "grad_norm": 0.05557479336857796, + "learning_rate": 0.00012970880777712768, + "loss": 0.2883, + "step": 20556 + }, + { + "epoch": 1.665343486714193, + "grad_norm": 0.050593696534633636, + "learning_rate": 0.00012970430712453304, + "loss": 0.2786, + "step": 20557 + }, + { + "epoch": 1.6654244977316917, + "grad_norm": 0.049495626240968704, + "learning_rate": 0.00012969980647193846, + "loss": 0.28, + "step": 20558 + }, + { + "epoch": 1.66550550874919, + "grad_norm": 0.05402037128806114, + "learning_rate": 0.00012969530581934382, + "loss": 0.3113, + "step": 20559 + }, + { + "epoch": 1.6655865197666881, + "grad_norm": 0.06765372306108475, + "learning_rate": 0.00012969080516674918, + "loss": 0.332, + "step": 20560 + }, + { + "epoch": 1.6656675307841866, + "grad_norm": 0.056443896144628525, + "learning_rate": 0.00012968630451415457, + "loss": 0.2614, + "step": 20561 + }, + { + "epoch": 1.665748541801685, + "grad_norm": 0.058400485664606094, + "learning_rate": 0.00012968180386155993, + "loss": 0.2766, + "step": 20562 + }, + { + "epoch": 1.6658295528191833, + "grad_norm": 0.05166015774011612, + "learning_rate": 0.0001296773032089653, + "loss": 0.2637, + "step": 20563 + }, + { + "epoch": 1.6659105638366818, + "grad_norm": 0.0569235160946846, + "learning_rate": 0.0001296728025563707, + "loss": 0.3026, + "step": 20564 + }, + { + "epoch": 1.6659915748541803, + "grad_norm": 0.06572642177343369, + "learning_rate": 0.00012966830190377606, + "loss": 0.297, + "step": 20565 + }, + { + "epoch": 1.6660725858716785, + "grad_norm": 0.04707733914256096, + "learning_rate": 0.00012966380125118142, + "loss": 0.2663, + "step": 20566 + }, + { + "epoch": 1.666153596889177, + "grad_norm": 0.06494346261024475, + "learning_rate": 0.0001296593005985868, + "loss": 0.3112, + "step": 20567 + }, + { + 
"epoch": 1.6662346079066754, + "grad_norm": 0.05284969136118889, + "learning_rate": 0.00012965479994599217, + "loss": 0.2605, + "step": 20568 + }, + { + "epoch": 1.6663156189241737, + "grad_norm": 0.06947685033082962, + "learning_rate": 0.00012965029929339755, + "loss": 0.2734, + "step": 20569 + }, + { + "epoch": 1.666396629941672, + "grad_norm": 0.056318532675504684, + "learning_rate": 0.00012964579864080294, + "loss": 0.2526, + "step": 20570 + }, + { + "epoch": 1.6664776409591704, + "grad_norm": 0.05064530298113823, + "learning_rate": 0.0001296412979882083, + "loss": 0.289, + "step": 20571 + }, + { + "epoch": 1.6665586519766689, + "grad_norm": 0.04759865626692772, + "learning_rate": 0.00012963679733561366, + "loss": 0.2512, + "step": 20572 + }, + { + "epoch": 1.666639662994167, + "grad_norm": 0.04251798987388611, + "learning_rate": 0.00012963229668301905, + "loss": 0.2449, + "step": 20573 + }, + { + "epoch": 1.6667206740116656, + "grad_norm": 0.05017111077904701, + "learning_rate": 0.0001296277960304244, + "loss": 0.2238, + "step": 20574 + }, + { + "epoch": 1.666801685029164, + "grad_norm": 0.0470963716506958, + "learning_rate": 0.0001296232953778298, + "loss": 0.2688, + "step": 20575 + }, + { + "epoch": 1.6668826960466623, + "grad_norm": 0.057013124227523804, + "learning_rate": 0.00012961879472523518, + "loss": 0.2855, + "step": 20576 + }, + { + "epoch": 1.6669637070641607, + "grad_norm": 0.0556722991168499, + "learning_rate": 0.00012961429407264054, + "loss": 0.2772, + "step": 20577 + }, + { + "epoch": 1.6670447180816592, + "grad_norm": 0.04770893603563309, + "learning_rate": 0.0001296097934200459, + "loss": 0.2668, + "step": 20578 + }, + { + "epoch": 1.6671257290991575, + "grad_norm": 0.05699243023991585, + "learning_rate": 0.0001296052927674513, + "loss": 0.3188, + "step": 20579 + }, + { + "epoch": 1.6672067401166557, + "grad_norm": 0.055579714477062225, + "learning_rate": 0.00012960079211485665, + "loss": 0.3047, + "step": 20580 + }, + { + "epoch": 1.6672877511341544, + "grad_norm": 0.05369487777352333, + "learning_rate": 0.00012959629146226204, + "loss": 0.2819, + "step": 20581 + }, + { + "epoch": 1.6673687621516526, + "grad_norm": 0.049619536846876144, + "learning_rate": 0.00012959179080966742, + "loss": 0.2517, + "step": 20582 + }, + { + "epoch": 1.6674497731691509, + "grad_norm": 0.04203467816114426, + "learning_rate": 0.00012958729015707278, + "loss": 0.2601, + "step": 20583 + }, + { + "epoch": 1.6675307841866494, + "grad_norm": 0.05514657124876976, + "learning_rate": 0.00012958278950447814, + "loss": 0.2758, + "step": 20584 + }, + { + "epoch": 1.6676117952041478, + "grad_norm": 0.052887558937072754, + "learning_rate": 0.00012957828885188353, + "loss": 0.2788, + "step": 20585 + }, + { + "epoch": 1.667692806221646, + "grad_norm": 0.045983172953128815, + "learning_rate": 0.0001295737881992889, + "loss": 0.2397, + "step": 20586 + }, + { + "epoch": 1.6677738172391445, + "grad_norm": 0.062085479497909546, + "learning_rate": 0.00012956928754669428, + "loss": 0.324, + "step": 20587 + }, + { + "epoch": 1.667854828256643, + "grad_norm": 0.049464598298072815, + "learning_rate": 0.00012956478689409967, + "loss": 0.2674, + "step": 20588 + }, + { + "epoch": 1.6679358392741412, + "grad_norm": 0.053387902677059174, + "learning_rate": 0.00012956028624150503, + "loss": 0.3101, + "step": 20589 + }, + { + "epoch": 1.6680168502916397, + "grad_norm": 0.06461282074451447, + "learning_rate": 0.00012955578558891039, + "loss": 0.3134, + "step": 20590 + }, + { + "epoch": 1.6680978613091382, + 
"grad_norm": 0.05711589381098747, + "learning_rate": 0.00012955128493631577, + "loss": 0.3026, + "step": 20591 + }, + { + "epoch": 1.6681788723266364, + "grad_norm": 0.0830380916595459, + "learning_rate": 0.00012954678428372116, + "loss": 0.2814, + "step": 20592 + }, + { + "epoch": 1.6682598833441347, + "grad_norm": 0.062311798334121704, + "learning_rate": 0.00012954228363112652, + "loss": 0.3021, + "step": 20593 + }, + { + "epoch": 1.6683408943616331, + "grad_norm": 0.04709800332784653, + "learning_rate": 0.0001295377829785319, + "loss": 0.2514, + "step": 20594 + }, + { + "epoch": 1.6684219053791316, + "grad_norm": 0.05270976200699806, + "learning_rate": 0.00012953328232593727, + "loss": 0.2879, + "step": 20595 + }, + { + "epoch": 1.6685029163966298, + "grad_norm": 0.0634923130273819, + "learning_rate": 0.00012952878167334263, + "loss": 0.2683, + "step": 20596 + }, + { + "epoch": 1.6685839274141283, + "grad_norm": 0.055968694388866425, + "learning_rate": 0.00012952428102074802, + "loss": 0.2837, + "step": 20597 + }, + { + "epoch": 1.6686649384316268, + "grad_norm": 0.04898613691329956, + "learning_rate": 0.0001295197803681534, + "loss": 0.2484, + "step": 20598 + }, + { + "epoch": 1.668745949449125, + "grad_norm": 0.05628184974193573, + "learning_rate": 0.00012951527971555876, + "loss": 0.2705, + "step": 20599 + }, + { + "epoch": 1.6688269604666235, + "grad_norm": 0.05504019185900688, + "learning_rate": 0.00012951077906296415, + "loss": 0.3015, + "step": 20600 + }, + { + "epoch": 1.668907971484122, + "grad_norm": 0.05534826219081879, + "learning_rate": 0.0001295062784103695, + "loss": 0.2768, + "step": 20601 + }, + { + "epoch": 1.6689889825016202, + "grad_norm": 0.06548382341861725, + "learning_rate": 0.00012950177775777487, + "loss": 0.2393, + "step": 20602 + }, + { + "epoch": 1.6690699935191184, + "grad_norm": 0.0632016733288765, + "learning_rate": 0.00012949727710518026, + "loss": 0.2924, + "step": 20603 + }, + { + "epoch": 1.6691510045366171, + "grad_norm": 0.06453005969524384, + "learning_rate": 0.00012949277645258564, + "loss": 0.3037, + "step": 20604 + }, + { + "epoch": 1.6692320155541154, + "grad_norm": 0.06613216549158096, + "learning_rate": 0.000129488275799991, + "loss": 0.3031, + "step": 20605 + }, + { + "epoch": 1.6693130265716136, + "grad_norm": 0.050734955817461014, + "learning_rate": 0.0001294837751473964, + "loss": 0.3052, + "step": 20606 + }, + { + "epoch": 1.669394037589112, + "grad_norm": 0.05854317173361778, + "learning_rate": 0.00012947927449480175, + "loss": 0.3104, + "step": 20607 + }, + { + "epoch": 1.6694750486066106, + "grad_norm": 0.060037367045879364, + "learning_rate": 0.0001294747738422071, + "loss": 0.2905, + "step": 20608 + }, + { + "epoch": 1.6695560596241088, + "grad_norm": 0.056980401277542114, + "learning_rate": 0.0001294702731896125, + "loss": 0.2875, + "step": 20609 + }, + { + "epoch": 1.6696370706416073, + "grad_norm": 0.05283212289214134, + "learning_rate": 0.00012946577253701789, + "loss": 0.2639, + "step": 20610 + }, + { + "epoch": 1.6697180816591057, + "grad_norm": 0.06313985586166382, + "learning_rate": 0.00012946127188442325, + "loss": 0.2819, + "step": 20611 + }, + { + "epoch": 1.669799092676604, + "grad_norm": 0.05628564953804016, + "learning_rate": 0.00012945677123182863, + "loss": 0.3137, + "step": 20612 + }, + { + "epoch": 1.6698801036941024, + "grad_norm": 0.06897181272506714, + "learning_rate": 0.000129452270579234, + "loss": 0.2928, + "step": 20613 + }, + { + "epoch": 1.669961114711601, + "grad_norm": 0.05419771000742912, + 
"learning_rate": 0.00012944776992663935, + "loss": 0.2446, + "step": 20614 + }, + { + "epoch": 1.6700421257290992, + "grad_norm": 0.05473244562745094, + "learning_rate": 0.00012944326927404474, + "loss": 0.2953, + "step": 20615 + }, + { + "epoch": 1.6701231367465974, + "grad_norm": 0.05732650309801102, + "learning_rate": 0.00012943876862145013, + "loss": 0.3044, + "step": 20616 + }, + { + "epoch": 1.6702041477640959, + "grad_norm": 0.052806396037340164, + "learning_rate": 0.0001294342679688555, + "loss": 0.2676, + "step": 20617 + }, + { + "epoch": 1.6702851587815943, + "grad_norm": 0.05250782519578934, + "learning_rate": 0.00012942976731626087, + "loss": 0.2572, + "step": 20618 + }, + { + "epoch": 1.6703661697990926, + "grad_norm": 0.05007362365722656, + "learning_rate": 0.00012942526666366623, + "loss": 0.2733, + "step": 20619 + }, + { + "epoch": 1.670447180816591, + "grad_norm": 0.058031823486089706, + "learning_rate": 0.0001294207660110716, + "loss": 0.2791, + "step": 20620 + }, + { + "epoch": 1.6705281918340895, + "grad_norm": 0.04678702726960182, + "learning_rate": 0.000129416265358477, + "loss": 0.2754, + "step": 20621 + }, + { + "epoch": 1.6706092028515878, + "grad_norm": 0.05354061350226402, + "learning_rate": 0.00012941176470588237, + "loss": 0.2984, + "step": 20622 + }, + { + "epoch": 1.6706902138690862, + "grad_norm": 0.04879770427942276, + "learning_rate": 0.00012940726405328773, + "loss": 0.2545, + "step": 20623 + }, + { + "epoch": 1.6707712248865847, + "grad_norm": 0.046921178698539734, + "learning_rate": 0.00012940276340069312, + "loss": 0.2406, + "step": 20624 + }, + { + "epoch": 1.670852235904083, + "grad_norm": 0.046346645802259445, + "learning_rate": 0.00012939826274809848, + "loss": 0.2841, + "step": 20625 + }, + { + "epoch": 1.6709332469215812, + "grad_norm": 0.054475318640470505, + "learning_rate": 0.00012939376209550384, + "loss": 0.3229, + "step": 20626 + }, + { + "epoch": 1.6710142579390799, + "grad_norm": 0.05155748873949051, + "learning_rate": 0.00012938926144290925, + "loss": 0.2663, + "step": 20627 + }, + { + "epoch": 1.6710952689565781, + "grad_norm": 0.05086018890142441, + "learning_rate": 0.0001293847607903146, + "loss": 0.2469, + "step": 20628 + }, + { + "epoch": 1.6711762799740764, + "grad_norm": 0.05731143429875374, + "learning_rate": 0.00012938026013771997, + "loss": 0.2846, + "step": 20629 + }, + { + "epoch": 1.6712572909915748, + "grad_norm": 0.06155941262841225, + "learning_rate": 0.00012937575948512536, + "loss": 0.2922, + "step": 20630 + }, + { + "epoch": 1.6713383020090733, + "grad_norm": 0.043573517352342606, + "learning_rate": 0.00012937125883253072, + "loss": 0.2457, + "step": 20631 + }, + { + "epoch": 1.6714193130265715, + "grad_norm": 0.04966678470373154, + "learning_rate": 0.00012936675817993608, + "loss": 0.2833, + "step": 20632 + }, + { + "epoch": 1.67150032404407, + "grad_norm": 0.05184457451105118, + "learning_rate": 0.0001293622575273415, + "loss": 0.2837, + "step": 20633 + }, + { + "epoch": 1.6715813350615685, + "grad_norm": 0.052014995366334915, + "learning_rate": 0.00012935775687474685, + "loss": 0.2912, + "step": 20634 + }, + { + "epoch": 1.6716623460790667, + "grad_norm": 0.053042974323034286, + "learning_rate": 0.0001293532562221522, + "loss": 0.2888, + "step": 20635 + }, + { + "epoch": 1.6717433570965652, + "grad_norm": 0.0523151159286499, + "learning_rate": 0.0001293487555695576, + "loss": 0.2608, + "step": 20636 + }, + { + "epoch": 1.6718243681140637, + "grad_norm": 0.05375725403428078, + "learning_rate": 0.00012934425491696296, 
+ "loss": 0.2976, + "step": 20637 + }, + { + "epoch": 1.671905379131562, + "grad_norm": 0.05377538502216339, + "learning_rate": 0.00012933975426436832, + "loss": 0.2689, + "step": 20638 + }, + { + "epoch": 1.6719863901490601, + "grad_norm": 0.058378856629133224, + "learning_rate": 0.00012933525361177373, + "loss": 0.2767, + "step": 20639 + }, + { + "epoch": 1.6720674011665586, + "grad_norm": 0.05989628657698631, + "learning_rate": 0.0001293307529591791, + "loss": 0.3311, + "step": 20640 + }, + { + "epoch": 1.672148412184057, + "grad_norm": 0.05216878652572632, + "learning_rate": 0.00012932625230658445, + "loss": 0.2917, + "step": 20641 + }, + { + "epoch": 1.6722294232015553, + "grad_norm": 0.05707364156842232, + "learning_rate": 0.00012932175165398984, + "loss": 0.3096, + "step": 20642 + }, + { + "epoch": 1.6723104342190538, + "grad_norm": 0.04194723442196846, + "learning_rate": 0.0001293172510013952, + "loss": 0.2297, + "step": 20643 + }, + { + "epoch": 1.6723914452365523, + "grad_norm": 0.04870683327317238, + "learning_rate": 0.0001293127503488006, + "loss": 0.2698, + "step": 20644 + }, + { + "epoch": 1.6724724562540505, + "grad_norm": 0.05523337423801422, + "learning_rate": 0.00012930824969620598, + "loss": 0.3107, + "step": 20645 + }, + { + "epoch": 1.672553467271549, + "grad_norm": 0.04324561730027199, + "learning_rate": 0.00012930374904361134, + "loss": 0.2683, + "step": 20646 + }, + { + "epoch": 1.6726344782890474, + "grad_norm": 0.049787040799856186, + "learning_rate": 0.0001292992483910167, + "loss": 0.3126, + "step": 20647 + }, + { + "epoch": 1.6727154893065457, + "grad_norm": 0.059789739549160004, + "learning_rate": 0.00012929474773842208, + "loss": 0.2588, + "step": 20648 + }, + { + "epoch": 1.672796500324044, + "grad_norm": 0.05197549983859062, + "learning_rate": 0.00012929024708582744, + "loss": 0.2771, + "step": 20649 + }, + { + "epoch": 1.6728775113415426, + "grad_norm": 0.05637570470571518, + "learning_rate": 0.00012928574643323283, + "loss": 0.2943, + "step": 20650 + }, + { + "epoch": 1.6729585223590409, + "grad_norm": 0.06383640319108963, + "learning_rate": 0.00012928124578063822, + "loss": 0.278, + "step": 20651 + }, + { + "epoch": 1.673039533376539, + "grad_norm": 0.05009127035737038, + "learning_rate": 0.00012927674512804358, + "loss": 0.2547, + "step": 20652 + }, + { + "epoch": 1.6731205443940376, + "grad_norm": 0.05628490820527077, + "learning_rate": 0.00012927224447544894, + "loss": 0.2892, + "step": 20653 + }, + { + "epoch": 1.673201555411536, + "grad_norm": 0.0458667129278183, + "learning_rate": 0.00012926774382285432, + "loss": 0.2209, + "step": 20654 + }, + { + "epoch": 1.6732825664290343, + "grad_norm": 0.049527622759342194, + "learning_rate": 0.00012926324317025968, + "loss": 0.2369, + "step": 20655 + }, + { + "epoch": 1.6733635774465327, + "grad_norm": 0.06553801894187927, + "learning_rate": 0.00012925874251766507, + "loss": 0.294, + "step": 20656 + }, + { + "epoch": 1.6734445884640312, + "grad_norm": 0.06354054808616638, + "learning_rate": 0.00012925424186507046, + "loss": 0.294, + "step": 20657 + }, + { + "epoch": 1.6735255994815295, + "grad_norm": 0.06330928206443787, + "learning_rate": 0.00012924974121247582, + "loss": 0.3038, + "step": 20658 + }, + { + "epoch": 1.6736066104990277, + "grad_norm": 0.05782151222229004, + "learning_rate": 0.00012924524055988118, + "loss": 0.2555, + "step": 20659 + }, + { + "epoch": 1.6736876215165264, + "grad_norm": 0.05652369186282158, + "learning_rate": 0.00012924073990728657, + "loss": 0.3064, + "step": 20660 + }, + { + 
"epoch": 1.6737686325340246, + "grad_norm": 0.04747147858142853, + "learning_rate": 0.00012923623925469193, + "loss": 0.2983, + "step": 20661 + }, + { + "epoch": 1.6738496435515229, + "grad_norm": 0.047677140682935715, + "learning_rate": 0.0001292317386020973, + "loss": 0.2811, + "step": 20662 + }, + { + "epoch": 1.6739306545690213, + "grad_norm": 0.048979323357343674, + "learning_rate": 0.0001292272379495027, + "loss": 0.2795, + "step": 20663 + }, + { + "epoch": 1.6740116655865198, + "grad_norm": 0.04414774477481842, + "learning_rate": 0.00012922273729690806, + "loss": 0.2478, + "step": 20664 + }, + { + "epoch": 1.674092676604018, + "grad_norm": 0.060810793191194534, + "learning_rate": 0.00012921823664431342, + "loss": 0.2607, + "step": 20665 + }, + { + "epoch": 1.6741736876215165, + "grad_norm": 0.0502205528318882, + "learning_rate": 0.0001292137359917188, + "loss": 0.2426, + "step": 20666 + }, + { + "epoch": 1.674254698639015, + "grad_norm": 0.05981326103210449, + "learning_rate": 0.00012920923533912417, + "loss": 0.2912, + "step": 20667 + }, + { + "epoch": 1.6743357096565132, + "grad_norm": 0.05012542009353638, + "learning_rate": 0.00012920473468652955, + "loss": 0.2721, + "step": 20668 + }, + { + "epoch": 1.6744167206740117, + "grad_norm": 0.05499397963285446, + "learning_rate": 0.00012920023403393494, + "loss": 0.3, + "step": 20669 + }, + { + "epoch": 1.6744977316915102, + "grad_norm": 0.057382624596357346, + "learning_rate": 0.0001291957333813403, + "loss": 0.278, + "step": 20670 + }, + { + "epoch": 1.6745787427090084, + "grad_norm": 0.05538110062479973, + "learning_rate": 0.00012919123272874566, + "loss": 0.2779, + "step": 20671 + }, + { + "epoch": 1.6746597537265067, + "grad_norm": 0.061685092747211456, + "learning_rate": 0.00012918673207615105, + "loss": 0.289, + "step": 20672 + }, + { + "epoch": 1.6747407647440054, + "grad_norm": 0.07038842141628265, + "learning_rate": 0.00012918223142355644, + "loss": 0.3155, + "step": 20673 + }, + { + "epoch": 1.6748217757615036, + "grad_norm": 0.060771241784095764, + "learning_rate": 0.0001291777307709618, + "loss": 0.3103, + "step": 20674 + }, + { + "epoch": 1.6749027867790018, + "grad_norm": 0.052663542330265045, + "learning_rate": 0.00012917323011836718, + "loss": 0.2545, + "step": 20675 + }, + { + "epoch": 1.6749837977965003, + "grad_norm": 0.0837271586060524, + "learning_rate": 0.00012916872946577254, + "loss": 0.3386, + "step": 20676 + }, + { + "epoch": 1.6750648088139988, + "grad_norm": 0.045294780284166336, + "learning_rate": 0.0001291642288131779, + "loss": 0.2472, + "step": 20677 + }, + { + "epoch": 1.675145819831497, + "grad_norm": 0.05026855319738388, + "learning_rate": 0.0001291597281605833, + "loss": 0.2859, + "step": 20678 + }, + { + "epoch": 1.6752268308489955, + "grad_norm": 0.058996520936489105, + "learning_rate": 0.00012915522750798868, + "loss": 0.2834, + "step": 20679 + }, + { + "epoch": 1.675307841866494, + "grad_norm": 0.05485003441572189, + "learning_rate": 0.00012915072685539404, + "loss": 0.2755, + "step": 20680 + }, + { + "epoch": 1.6753888528839922, + "grad_norm": 0.05620567500591278, + "learning_rate": 0.00012914622620279942, + "loss": 0.2792, + "step": 20681 + }, + { + "epoch": 1.6754698639014904, + "grad_norm": 0.051583871245384216, + "learning_rate": 0.00012914172555020478, + "loss": 0.2537, + "step": 20682 + }, + { + "epoch": 1.6755508749189891, + "grad_norm": 0.050795476883649826, + "learning_rate": 0.00012913722489761014, + "loss": 0.308, + "step": 20683 + }, + { + "epoch": 1.6756318859364874, + "grad_norm": 
0.052895255386829376, + "learning_rate": 0.00012913272424501553, + "loss": 0.28, + "step": 20684 + }, + { + "epoch": 1.6757128969539856, + "grad_norm": 0.051595091819763184, + "learning_rate": 0.00012912822359242092, + "loss": 0.3025, + "step": 20685 + }, + { + "epoch": 1.675793907971484, + "grad_norm": 0.06392206996679306, + "learning_rate": 0.00012912372293982628, + "loss": 0.3176, + "step": 20686 + }, + { + "epoch": 1.6758749189889826, + "grad_norm": 0.055683355778455734, + "learning_rate": 0.00012911922228723167, + "loss": 0.2902, + "step": 20687 + }, + { + "epoch": 1.6759559300064808, + "grad_norm": 0.05265095829963684, + "learning_rate": 0.00012911472163463703, + "loss": 0.2988, + "step": 20688 + }, + { + "epoch": 1.6760369410239793, + "grad_norm": 0.05111074075102806, + "learning_rate": 0.0001291102209820424, + "loss": 0.2888, + "step": 20689 + }, + { + "epoch": 1.6761179520414777, + "grad_norm": 0.04726038873195648, + "learning_rate": 0.00012910572032944777, + "loss": 0.2649, + "step": 20690 + }, + { + "epoch": 1.676198963058976, + "grad_norm": 0.05326800048351288, + "learning_rate": 0.00012910121967685316, + "loss": 0.2836, + "step": 20691 + }, + { + "epoch": 1.6762799740764744, + "grad_norm": 0.061078645288944244, + "learning_rate": 0.00012909671902425852, + "loss": 0.2928, + "step": 20692 + }, + { + "epoch": 1.676360985093973, + "grad_norm": 0.051358435302972794, + "learning_rate": 0.0001290922183716639, + "loss": 0.2802, + "step": 20693 + }, + { + "epoch": 1.6764419961114712, + "grad_norm": 0.04708100110292435, + "learning_rate": 0.00012908771771906927, + "loss": 0.3042, + "step": 20694 + }, + { + "epoch": 1.6765230071289694, + "grad_norm": 0.05056946352124214, + "learning_rate": 0.00012908321706647463, + "loss": 0.2921, + "step": 20695 + }, + { + "epoch": 1.6766040181464679, + "grad_norm": 0.05706246197223663, + "learning_rate": 0.00012907871641388004, + "loss": 0.3176, + "step": 20696 + }, + { + "epoch": 1.6766850291639663, + "grad_norm": 0.06805767118930817, + "learning_rate": 0.0001290742157612854, + "loss": 0.2965, + "step": 20697 + }, + { + "epoch": 1.6767660401814646, + "grad_norm": 0.06454560905694962, + "learning_rate": 0.00012906971510869076, + "loss": 0.2863, + "step": 20698 + }, + { + "epoch": 1.676847051198963, + "grad_norm": 0.052051447331905365, + "learning_rate": 0.00012906521445609615, + "loss": 0.297, + "step": 20699 + }, + { + "epoch": 1.6769280622164615, + "grad_norm": 0.05402744561433792, + "learning_rate": 0.0001290607138035015, + "loss": 0.2792, + "step": 20700 + }, + { + "epoch": 1.6770090732339598, + "grad_norm": 0.0522739440202713, + "learning_rate": 0.00012905621315090687, + "loss": 0.3004, + "step": 20701 + }, + { + "epoch": 1.6770900842514582, + "grad_norm": 0.04815400764346123, + "learning_rate": 0.00012905171249831228, + "loss": 0.2512, + "step": 20702 + }, + { + "epoch": 1.6771710952689567, + "grad_norm": 0.05232715234160423, + "learning_rate": 0.00012904721184571764, + "loss": 0.2482, + "step": 20703 + }, + { + "epoch": 1.677252106286455, + "grad_norm": 0.058511216193437576, + "learning_rate": 0.000129042711193123, + "loss": 0.2865, + "step": 20704 + }, + { + "epoch": 1.6773331173039532, + "grad_norm": 0.05682501196861267, + "learning_rate": 0.0001290382105405284, + "loss": 0.2623, + "step": 20705 + }, + { + "epoch": 1.6774141283214519, + "grad_norm": 0.05637514218688011, + "learning_rate": 0.00012903370988793375, + "loss": 0.2702, + "step": 20706 + }, + { + "epoch": 1.6774951393389501, + "grad_norm": 0.05251268669962883, + "learning_rate": 
0.0001290292092353391, + "loss": 0.3053, + "step": 20707 + }, + { + "epoch": 1.6775761503564484, + "grad_norm": 0.04838861525058746, + "learning_rate": 0.00012902470858274453, + "loss": 0.2284, + "step": 20708 + }, + { + "epoch": 1.6776571613739468, + "grad_norm": 0.05945704132318497, + "learning_rate": 0.00012902020793014989, + "loss": 0.2693, + "step": 20709 + }, + { + "epoch": 1.6777381723914453, + "grad_norm": 0.051180288195610046, + "learning_rate": 0.00012901570727755525, + "loss": 0.248, + "step": 20710 + }, + { + "epoch": 1.6778191834089435, + "grad_norm": 0.053558263927698135, + "learning_rate": 0.00012901120662496063, + "loss": 0.282, + "step": 20711 + }, + { + "epoch": 1.677900194426442, + "grad_norm": 0.05868763476610184, + "learning_rate": 0.000129006705972366, + "loss": 0.3009, + "step": 20712 + }, + { + "epoch": 1.6779812054439405, + "grad_norm": 0.05742897465825081, + "learning_rate": 0.00012900220531977135, + "loss": 0.296, + "step": 20713 + }, + { + "epoch": 1.6780622164614387, + "grad_norm": 0.05803952366113663, + "learning_rate": 0.00012899770466717677, + "loss": 0.2632, + "step": 20714 + }, + { + "epoch": 1.6781432274789372, + "grad_norm": 0.053818389773368835, + "learning_rate": 0.00012899320401458213, + "loss": 0.2453, + "step": 20715 + }, + { + "epoch": 1.6782242384964356, + "grad_norm": 0.06133642420172691, + "learning_rate": 0.0001289887033619875, + "loss": 0.2794, + "step": 20716 + }, + { + "epoch": 1.678305249513934, + "grad_norm": 0.05301635339856148, + "learning_rate": 0.00012898420270939287, + "loss": 0.3004, + "step": 20717 + }, + { + "epoch": 1.6783862605314321, + "grad_norm": 0.05932917818427086, + "learning_rate": 0.00012897970205679823, + "loss": 0.2614, + "step": 20718 + }, + { + "epoch": 1.6784672715489306, + "grad_norm": 0.04853160306811333, + "learning_rate": 0.0001289752014042036, + "loss": 0.2496, + "step": 20719 + }, + { + "epoch": 1.678548282566429, + "grad_norm": 0.058354031294584274, + "learning_rate": 0.000128970700751609, + "loss": 0.3044, + "step": 20720 + }, + { + "epoch": 1.6786292935839273, + "grad_norm": 0.05149131640791893, + "learning_rate": 0.00012896620009901437, + "loss": 0.2766, + "step": 20721 + }, + { + "epoch": 1.6787103046014258, + "grad_norm": 0.05151427909731865, + "learning_rate": 0.00012896169944641973, + "loss": 0.2734, + "step": 20722 + }, + { + "epoch": 1.6787913156189243, + "grad_norm": 0.05959561467170715, + "learning_rate": 0.00012895719879382512, + "loss": 0.2876, + "step": 20723 + }, + { + "epoch": 1.6788723266364225, + "grad_norm": 0.05226140841841698, + "learning_rate": 0.00012895269814123048, + "loss": 0.261, + "step": 20724 + }, + { + "epoch": 1.678953337653921, + "grad_norm": 0.044787511229515076, + "learning_rate": 0.00012894819748863586, + "loss": 0.2482, + "step": 20725 + }, + { + "epoch": 1.6790343486714194, + "grad_norm": 0.04887351393699646, + "learning_rate": 0.00012894369683604125, + "loss": 0.2702, + "step": 20726 + }, + { + "epoch": 1.6791153596889177, + "grad_norm": 0.06846939772367477, + "learning_rate": 0.0001289391961834466, + "loss": 0.3236, + "step": 20727 + }, + { + "epoch": 1.679196370706416, + "grad_norm": 0.04261481389403343, + "learning_rate": 0.00012893469553085197, + "loss": 0.2375, + "step": 20728 + }, + { + "epoch": 1.6792773817239146, + "grad_norm": 0.05724846199154854, + "learning_rate": 0.00012893019487825736, + "loss": 0.2916, + "step": 20729 + }, + { + "epoch": 1.6793583927414129, + "grad_norm": 0.053449541330337524, + "learning_rate": 0.00012892569422566272, + "loss": 0.2881, + 
"step": 20730 + }, + { + "epoch": 1.679439403758911, + "grad_norm": 0.06028129532933235, + "learning_rate": 0.0001289211935730681, + "loss": 0.3287, + "step": 20731 + }, + { + "epoch": 1.6795204147764096, + "grad_norm": 0.0621492937207222, + "learning_rate": 0.0001289166929204735, + "loss": 0.2622, + "step": 20732 + }, + { + "epoch": 1.679601425793908, + "grad_norm": 0.060144782066345215, + "learning_rate": 0.00012891219226787885, + "loss": 0.3052, + "step": 20733 + }, + { + "epoch": 1.6796824368114063, + "grad_norm": 0.05058223754167557, + "learning_rate": 0.0001289076916152842, + "loss": 0.2842, + "step": 20734 + }, + { + "epoch": 1.6797634478289047, + "grad_norm": 0.04709825664758682, + "learning_rate": 0.0001289031909626896, + "loss": 0.2284, + "step": 20735 + }, + { + "epoch": 1.6798444588464032, + "grad_norm": 0.05158427730202675, + "learning_rate": 0.00012889869031009496, + "loss": 0.2426, + "step": 20736 + }, + { + "epoch": 1.6799254698639015, + "grad_norm": 0.07317867130041122, + "learning_rate": 0.00012889418965750035, + "loss": 0.2963, + "step": 20737 + }, + { + "epoch": 1.6800064808814, + "grad_norm": 0.05780772864818573, + "learning_rate": 0.00012888968900490573, + "loss": 0.3051, + "step": 20738 + }, + { + "epoch": 1.6800874918988984, + "grad_norm": 0.05150393769145012, + "learning_rate": 0.0001288851883523111, + "loss": 0.2998, + "step": 20739 + }, + { + "epoch": 1.6801685029163966, + "grad_norm": 0.048123449087142944, + "learning_rate": 0.00012888068769971645, + "loss": 0.25, + "step": 20740 + }, + { + "epoch": 1.6802495139338949, + "grad_norm": 0.054436638951301575, + "learning_rate": 0.00012887618704712184, + "loss": 0.3114, + "step": 20741 + }, + { + "epoch": 1.6803305249513933, + "grad_norm": 0.05005558952689171, + "learning_rate": 0.0001288716863945272, + "loss": 0.2566, + "step": 20742 + }, + { + "epoch": 1.6804115359688918, + "grad_norm": 0.060406871140003204, + "learning_rate": 0.0001288671857419326, + "loss": 0.2977, + "step": 20743 + }, + { + "epoch": 1.68049254698639, + "grad_norm": 0.05145399272441864, + "learning_rate": 0.00012886268508933798, + "loss": 0.2792, + "step": 20744 + }, + { + "epoch": 1.6805735580038885, + "grad_norm": 0.04409467428922653, + "learning_rate": 0.00012885818443674334, + "loss": 0.253, + "step": 20745 + }, + { + "epoch": 1.680654569021387, + "grad_norm": 0.05243460834026337, + "learning_rate": 0.0001288536837841487, + "loss": 0.2766, + "step": 20746 + }, + { + "epoch": 1.6807355800388852, + "grad_norm": 0.05014029145240784, + "learning_rate": 0.00012884918313155408, + "loss": 0.2773, + "step": 20747 + }, + { + "epoch": 1.6808165910563837, + "grad_norm": 0.05909423530101776, + "learning_rate": 0.00012884468247895944, + "loss": 0.2925, + "step": 20748 + }, + { + "epoch": 1.6808976020738822, + "grad_norm": 0.053994834423065186, + "learning_rate": 0.00012884018182636483, + "loss": 0.2547, + "step": 20749 + }, + { + "epoch": 1.6809786130913804, + "grad_norm": 0.059154659509658813, + "learning_rate": 0.00012883568117377022, + "loss": 0.3333, + "step": 20750 + }, + { + "epoch": 1.6810596241088787, + "grad_norm": 0.0678883045911789, + "learning_rate": 0.00012883118052117558, + "loss": 0.3044, + "step": 20751 + }, + { + "epoch": 1.6811406351263773, + "grad_norm": 0.049253012984991074, + "learning_rate": 0.00012882667986858094, + "loss": 0.2851, + "step": 20752 + }, + { + "epoch": 1.6812216461438756, + "grad_norm": 0.06529238820075989, + "learning_rate": 0.00012882217921598632, + "loss": 0.3103, + "step": 20753 + }, + { + "epoch": 
1.6813026571613738, + "grad_norm": 0.05641709268093109, + "learning_rate": 0.0001288176785633917, + "loss": 0.269, + "step": 20754 + }, + { + "epoch": 1.6813836681788723, + "grad_norm": 0.05562283471226692, + "learning_rate": 0.00012881317791079707, + "loss": 0.299, + "step": 20755 + }, + { + "epoch": 1.6814646791963708, + "grad_norm": 0.042370688170194626, + "learning_rate": 0.00012880867725820246, + "loss": 0.2386, + "step": 20756 + }, + { + "epoch": 1.681545690213869, + "grad_norm": 0.05039699375629425, + "learning_rate": 0.00012880417660560782, + "loss": 0.2541, + "step": 20757 + }, + { + "epoch": 1.6816267012313675, + "grad_norm": 0.06104245409369469, + "learning_rate": 0.00012879967595301318, + "loss": 0.2773, + "step": 20758 + }, + { + "epoch": 1.681707712248866, + "grad_norm": 0.05837586894631386, + "learning_rate": 0.00012879517530041857, + "loss": 0.295, + "step": 20759 + }, + { + "epoch": 1.6817887232663642, + "grad_norm": 0.046630583703517914, + "learning_rate": 0.00012879067464782395, + "loss": 0.264, + "step": 20760 + }, + { + "epoch": 1.6818697342838627, + "grad_norm": 0.05475366860628128, + "learning_rate": 0.0001287861739952293, + "loss": 0.2939, + "step": 20761 + }, + { + "epoch": 1.6819507453013611, + "grad_norm": 0.048055436462163925, + "learning_rate": 0.0001287816733426347, + "loss": 0.2697, + "step": 20762 + }, + { + "epoch": 1.6820317563188594, + "grad_norm": 0.055682431906461716, + "learning_rate": 0.00012877717269004006, + "loss": 0.2667, + "step": 20763 + }, + { + "epoch": 1.6821127673363576, + "grad_norm": 0.04282234236598015, + "learning_rate": 0.00012877267203744542, + "loss": 0.2542, + "step": 20764 + }, + { + "epoch": 1.682193778353856, + "grad_norm": 0.05483906716108322, + "learning_rate": 0.0001287681713848508, + "loss": 0.3043, + "step": 20765 + }, + { + "epoch": 1.6822747893713546, + "grad_norm": 0.044364769011735916, + "learning_rate": 0.0001287636707322562, + "loss": 0.2642, + "step": 20766 + }, + { + "epoch": 1.6823558003888528, + "grad_norm": 0.06031596660614014, + "learning_rate": 0.00012875917007966155, + "loss": 0.295, + "step": 20767 + }, + { + "epoch": 1.6824368114063513, + "grad_norm": 0.04557067155838013, + "learning_rate": 0.00012875466942706694, + "loss": 0.2715, + "step": 20768 + }, + { + "epoch": 1.6825178224238497, + "grad_norm": 0.058976978063583374, + "learning_rate": 0.0001287501687744723, + "loss": 0.2877, + "step": 20769 + }, + { + "epoch": 1.682598833441348, + "grad_norm": 0.0493365079164505, + "learning_rate": 0.00012874566812187766, + "loss": 0.2472, + "step": 20770 + }, + { + "epoch": 1.6826798444588464, + "grad_norm": 0.07643086463212967, + "learning_rate": 0.00012874116746928305, + "loss": 0.3133, + "step": 20771 + }, + { + "epoch": 1.682760855476345, + "grad_norm": 0.06138523668050766, + "learning_rate": 0.00012873666681668844, + "loss": 0.3085, + "step": 20772 + }, + { + "epoch": 1.6828418664938432, + "grad_norm": 0.049202244728803635, + "learning_rate": 0.0001287321661640938, + "loss": 0.2776, + "step": 20773 + }, + { + "epoch": 1.6829228775113414, + "grad_norm": 0.05692125856876373, + "learning_rate": 0.00012872766551149918, + "loss": 0.2871, + "step": 20774 + }, + { + "epoch": 1.68300388852884, + "grad_norm": 0.04877998307347298, + "learning_rate": 0.00012872316485890454, + "loss": 0.291, + "step": 20775 + }, + { + "epoch": 1.6830848995463383, + "grad_norm": 0.05860499292612076, + "learning_rate": 0.0001287186642063099, + "loss": 0.3119, + "step": 20776 + }, + { + "epoch": 1.6831659105638366, + "grad_norm": 
0.05695262551307678, + "learning_rate": 0.00012871416355371532, + "loss": 0.2751, + "step": 20777 + }, + { + "epoch": 1.683246921581335, + "grad_norm": 0.0618031769990921, + "learning_rate": 0.00012870966290112068, + "loss": 0.3365, + "step": 20778 + }, + { + "epoch": 1.6833279325988335, + "grad_norm": 0.05980094522237778, + "learning_rate": 0.00012870516224852604, + "loss": 0.3221, + "step": 20779 + }, + { + "epoch": 1.6834089436163318, + "grad_norm": 0.04977751895785332, + "learning_rate": 0.00012870066159593143, + "loss": 0.3018, + "step": 20780 + }, + { + "epoch": 1.6834899546338302, + "grad_norm": 0.04806521162390709, + "learning_rate": 0.00012869616094333679, + "loss": 0.2362, + "step": 20781 + }, + { + "epoch": 1.6835709656513287, + "grad_norm": 0.06428217142820358, + "learning_rate": 0.00012869166029074215, + "loss": 0.2974, + "step": 20782 + }, + { + "epoch": 1.683651976668827, + "grad_norm": 0.048949554562568665, + "learning_rate": 0.00012868715963814756, + "loss": 0.2726, + "step": 20783 + }, + { + "epoch": 1.6837329876863252, + "grad_norm": 0.04711005091667175, + "learning_rate": 0.00012868265898555292, + "loss": 0.2634, + "step": 20784 + }, + { + "epoch": 1.6838139987038239, + "grad_norm": 0.05052475258708, + "learning_rate": 0.00012867815833295828, + "loss": 0.2577, + "step": 20785 + }, + { + "epoch": 1.6838950097213221, + "grad_norm": 0.062074240297079086, + "learning_rate": 0.00012867365768036367, + "loss": 0.3477, + "step": 20786 + }, + { + "epoch": 1.6839760207388204, + "grad_norm": 0.05127348750829697, + "learning_rate": 0.00012866915702776903, + "loss": 0.3066, + "step": 20787 + }, + { + "epoch": 1.6840570317563188, + "grad_norm": 0.0517883263528347, + "learning_rate": 0.0001286646563751744, + "loss": 0.2778, + "step": 20788 + }, + { + "epoch": 1.6841380427738173, + "grad_norm": 0.050224028527736664, + "learning_rate": 0.0001286601557225798, + "loss": 0.2827, + "step": 20789 + }, + { + "epoch": 1.6842190537913155, + "grad_norm": 0.05457613989710808, + "learning_rate": 0.00012865565506998516, + "loss": 0.3157, + "step": 20790 + }, + { + "epoch": 1.684300064808814, + "grad_norm": 0.05149427428841591, + "learning_rate": 0.00012865115441739052, + "loss": 0.2896, + "step": 20791 + }, + { + "epoch": 1.6843810758263125, + "grad_norm": 0.050849854946136475, + "learning_rate": 0.0001286466537647959, + "loss": 0.311, + "step": 20792 + }, + { + "epoch": 1.6844620868438107, + "grad_norm": 0.05668675899505615, + "learning_rate": 0.00012864215311220127, + "loss": 0.326, + "step": 20793 + }, + { + "epoch": 1.6845430978613092, + "grad_norm": 0.05928093567490578, + "learning_rate": 0.00012863765245960663, + "loss": 0.3129, + "step": 20794 + }, + { + "epoch": 1.6846241088788076, + "grad_norm": 0.049157604575157166, + "learning_rate": 0.00012863315180701204, + "loss": 0.2941, + "step": 20795 + }, + { + "epoch": 1.684705119896306, + "grad_norm": 0.05986670032143593, + "learning_rate": 0.0001286286511544174, + "loss": 0.282, + "step": 20796 + }, + { + "epoch": 1.6847861309138041, + "grad_norm": 0.05676200985908508, + "learning_rate": 0.00012862415050182276, + "loss": 0.2784, + "step": 20797 + }, + { + "epoch": 1.6848671419313026, + "grad_norm": 0.0540945827960968, + "learning_rate": 0.00012861964984922815, + "loss": 0.2402, + "step": 20798 + }, + { + "epoch": 1.684948152948801, + "grad_norm": 0.055026695132255554, + "learning_rate": 0.0001286151491966335, + "loss": 0.2869, + "step": 20799 + }, + { + "epoch": 1.6850291639662993, + "grad_norm": 0.057930923998355865, + "learning_rate": 
0.00012861064854403887, + "loss": 0.2931, + "step": 20800 + }, + { + "epoch": 1.6851101749837978, + "grad_norm": 0.05763215571641922, + "learning_rate": 0.00012860614789144428, + "loss": 0.2743, + "step": 20801 + }, + { + "epoch": 1.6851911860012962, + "grad_norm": 0.061730217188596725, + "learning_rate": 0.00012860164723884964, + "loss": 0.2572, + "step": 20802 + }, + { + "epoch": 1.6852721970187945, + "grad_norm": 0.05967223271727562, + "learning_rate": 0.000128597146586255, + "loss": 0.2968, + "step": 20803 + }, + { + "epoch": 1.685353208036293, + "grad_norm": 0.06001397594809532, + "learning_rate": 0.0001285926459336604, + "loss": 0.3049, + "step": 20804 + }, + { + "epoch": 1.6854342190537914, + "grad_norm": 0.05998661741614342, + "learning_rate": 0.00012858814528106575, + "loss": 0.2757, + "step": 20805 + }, + { + "epoch": 1.6855152300712897, + "grad_norm": 0.05021259933710098, + "learning_rate": 0.00012858364462847114, + "loss": 0.2494, + "step": 20806 + }, + { + "epoch": 1.685596241088788, + "grad_norm": 0.06738859415054321, + "learning_rate": 0.00012857914397587653, + "loss": 0.2915, + "step": 20807 + }, + { + "epoch": 1.6856772521062866, + "grad_norm": 0.04895463213324547, + "learning_rate": 0.00012857464332328189, + "loss": 0.2639, + "step": 20808 + }, + { + "epoch": 1.6857582631237849, + "grad_norm": 0.05599305033683777, + "learning_rate": 0.00012857014267068725, + "loss": 0.2374, + "step": 20809 + }, + { + "epoch": 1.685839274141283, + "grad_norm": 0.07651516050100327, + "learning_rate": 0.00012856564201809263, + "loss": 0.2839, + "step": 20810 + }, + { + "epoch": 1.6859202851587816, + "grad_norm": 0.046530772000551224, + "learning_rate": 0.000128561141365498, + "loss": 0.2518, + "step": 20811 + }, + { + "epoch": 1.68600129617628, + "grad_norm": 0.06377851963043213, + "learning_rate": 0.00012855664071290338, + "loss": 0.2948, + "step": 20812 + }, + { + "epoch": 1.6860823071937783, + "grad_norm": 0.04535413905978203, + "learning_rate": 0.00012855214006030877, + "loss": 0.2587, + "step": 20813 + }, + { + "epoch": 1.6861633182112767, + "grad_norm": 0.06376946717500687, + "learning_rate": 0.00012854763940771413, + "loss": 0.2953, + "step": 20814 + }, + { + "epoch": 1.6862443292287752, + "grad_norm": 0.061817001551389694, + "learning_rate": 0.0001285431387551195, + "loss": 0.2642, + "step": 20815 + }, + { + "epoch": 1.6863253402462735, + "grad_norm": 0.05439610034227371, + "learning_rate": 0.00012853863810252487, + "loss": 0.283, + "step": 20816 + }, + { + "epoch": 1.686406351263772, + "grad_norm": 0.05513634905219078, + "learning_rate": 0.00012853413744993023, + "loss": 0.2958, + "step": 20817 + }, + { + "epoch": 1.6864873622812704, + "grad_norm": 0.05512907728552818, + "learning_rate": 0.00012852963679733562, + "loss": 0.2607, + "step": 20818 + }, + { + "epoch": 1.6865683732987686, + "grad_norm": 0.046912264078855515, + "learning_rate": 0.000128525136144741, + "loss": 0.2632, + "step": 20819 + }, + { + "epoch": 1.6866493843162669, + "grad_norm": 0.06997843831777573, + "learning_rate": 0.00012852063549214637, + "loss": 0.3013, + "step": 20820 + }, + { + "epoch": 1.6867303953337653, + "grad_norm": 0.0577542781829834, + "learning_rate": 0.00012851613483955173, + "loss": 0.3108, + "step": 20821 + }, + { + "epoch": 1.6868114063512638, + "grad_norm": 0.057242099195718765, + "learning_rate": 0.00012851163418695712, + "loss": 0.2931, + "step": 20822 + }, + { + "epoch": 1.686892417368762, + "grad_norm": 0.05348726734519005, + "learning_rate": 0.00012850713353436248, + "loss": 0.2881, + 
"step": 20823 + }, + { + "epoch": 1.6869734283862605, + "grad_norm": 0.05593732371926308, + "learning_rate": 0.00012850263288176786, + "loss": 0.2489, + "step": 20824 + }, + { + "epoch": 1.687054439403759, + "grad_norm": 0.05484984442591667, + "learning_rate": 0.00012849813222917325, + "loss": 0.2887, + "step": 20825 + }, + { + "epoch": 1.6871354504212572, + "grad_norm": 0.0500054694712162, + "learning_rate": 0.0001284936315765786, + "loss": 0.2889, + "step": 20826 + }, + { + "epoch": 1.6872164614387557, + "grad_norm": 0.04467097297310829, + "learning_rate": 0.00012848913092398397, + "loss": 0.2735, + "step": 20827 + }, + { + "epoch": 1.6872974724562542, + "grad_norm": 0.05153514817357063, + "learning_rate": 0.00012848463027138936, + "loss": 0.2521, + "step": 20828 + }, + { + "epoch": 1.6873784834737524, + "grad_norm": 0.04148275777697563, + "learning_rate": 0.00012848012961879475, + "loss": 0.2359, + "step": 20829 + }, + { + "epoch": 1.6874594944912507, + "grad_norm": 0.057666435837745667, + "learning_rate": 0.0001284756289662001, + "loss": 0.2951, + "step": 20830 + }, + { + "epoch": 1.6875405055087493, + "grad_norm": 0.05120784044265747, + "learning_rate": 0.0001284711283136055, + "loss": 0.2797, + "step": 20831 + }, + { + "epoch": 1.6876215165262476, + "grad_norm": 0.04996515065431595, + "learning_rate": 0.00012846662766101085, + "loss": 0.2624, + "step": 20832 + }, + { + "epoch": 1.6877025275437458, + "grad_norm": 0.07200618833303452, + "learning_rate": 0.0001284621270084162, + "loss": 0.296, + "step": 20833 + }, + { + "epoch": 1.6877835385612443, + "grad_norm": 0.06274188309907913, + "learning_rate": 0.0001284576263558216, + "loss": 0.3114, + "step": 20834 + }, + { + "epoch": 1.6878645495787428, + "grad_norm": 0.058538395911455154, + "learning_rate": 0.000128453125703227, + "loss": 0.3231, + "step": 20835 + }, + { + "epoch": 1.687945560596241, + "grad_norm": 0.05318986251950264, + "learning_rate": 0.00012844862505063235, + "loss": 0.2907, + "step": 20836 + }, + { + "epoch": 1.6880265716137395, + "grad_norm": 0.04951918497681618, + "learning_rate": 0.00012844412439803773, + "loss": 0.2628, + "step": 20837 + }, + { + "epoch": 1.688107582631238, + "grad_norm": 0.03856775164604187, + "learning_rate": 0.0001284396237454431, + "loss": 0.232, + "step": 20838 + }, + { + "epoch": 1.6881885936487362, + "grad_norm": 0.051587074995040894, + "learning_rate": 0.00012843512309284845, + "loss": 0.3128, + "step": 20839 + }, + { + "epoch": 1.6882696046662347, + "grad_norm": 0.04774363711476326, + "learning_rate": 0.00012843062244025384, + "loss": 0.2544, + "step": 20840 + }, + { + "epoch": 1.6883506156837331, + "grad_norm": 0.05891667678952217, + "learning_rate": 0.00012842612178765923, + "loss": 0.3391, + "step": 20841 + }, + { + "epoch": 1.6884316267012314, + "grad_norm": 0.04155726358294487, + "learning_rate": 0.0001284216211350646, + "loss": 0.2679, + "step": 20842 + }, + { + "epoch": 1.6885126377187296, + "grad_norm": 0.060029737651348114, + "learning_rate": 0.00012841712048246998, + "loss": 0.2772, + "step": 20843 + }, + { + "epoch": 1.688593648736228, + "grad_norm": 0.05514264106750488, + "learning_rate": 0.00012841261982987534, + "loss": 0.3129, + "step": 20844 + }, + { + "epoch": 1.6886746597537265, + "grad_norm": 0.05384942144155502, + "learning_rate": 0.0001284081191772807, + "loss": 0.283, + "step": 20845 + }, + { + "epoch": 1.6887556707712248, + "grad_norm": 0.043618954718112946, + "learning_rate": 0.00012840361852468608, + "loss": 0.2453, + "step": 20846 + }, + { + "epoch": 
1.6888366817887233, + "grad_norm": 0.05581897497177124, + "learning_rate": 0.00012839911787209147, + "loss": 0.2889, + "step": 20847 + }, + { + "epoch": 1.6889176928062217, + "grad_norm": 0.05275612697005272, + "learning_rate": 0.00012839461721949683, + "loss": 0.2718, + "step": 20848 + }, + { + "epoch": 1.68899870382372, + "grad_norm": 0.045593030750751495, + "learning_rate": 0.00012839011656690222, + "loss": 0.2462, + "step": 20849 + }, + { + "epoch": 1.6890797148412184, + "grad_norm": 0.05088045448064804, + "learning_rate": 0.00012838561591430758, + "loss": 0.2864, + "step": 20850 + }, + { + "epoch": 1.689160725858717, + "grad_norm": 0.056569818407297134, + "learning_rate": 0.00012838111526171294, + "loss": 0.2831, + "step": 20851 + }, + { + "epoch": 1.6892417368762151, + "grad_norm": 0.06638054549694061, + "learning_rate": 0.00012837661460911832, + "loss": 0.2928, + "step": 20852 + }, + { + "epoch": 1.6893227478937134, + "grad_norm": 0.05728859081864357, + "learning_rate": 0.0001283721139565237, + "loss": 0.3075, + "step": 20853 + }, + { + "epoch": 1.689403758911212, + "grad_norm": 0.04625353589653969, + "learning_rate": 0.00012836761330392907, + "loss": 0.2441, + "step": 20854 + }, + { + "epoch": 1.6894847699287103, + "grad_norm": 0.0503210686147213, + "learning_rate": 0.00012836311265133446, + "loss": 0.2689, + "step": 20855 + }, + { + "epoch": 1.6895657809462086, + "grad_norm": 0.054897889494895935, + "learning_rate": 0.00012835861199873982, + "loss": 0.2731, + "step": 20856 + }, + { + "epoch": 1.689646791963707, + "grad_norm": 0.04353231564164162, + "learning_rate": 0.00012835411134614518, + "loss": 0.2226, + "step": 20857 + }, + { + "epoch": 1.6897278029812055, + "grad_norm": 0.07019881904125214, + "learning_rate": 0.0001283496106935506, + "loss": 0.308, + "step": 20858 + }, + { + "epoch": 1.6898088139987038, + "grad_norm": 0.061276018619537354, + "learning_rate": 0.00012834511004095595, + "loss": 0.2721, + "step": 20859 + }, + { + "epoch": 1.6898898250162022, + "grad_norm": 0.05636150389909744, + "learning_rate": 0.0001283406093883613, + "loss": 0.3094, + "step": 20860 + }, + { + "epoch": 1.6899708360337007, + "grad_norm": 0.05796360224485397, + "learning_rate": 0.0001283361087357667, + "loss": 0.3013, + "step": 20861 + }, + { + "epoch": 1.690051847051199, + "grad_norm": 0.06425435841083527, + "learning_rate": 0.00012833160808317206, + "loss": 0.2917, + "step": 20862 + }, + { + "epoch": 1.6901328580686974, + "grad_norm": 0.046928420662879944, + "learning_rate": 0.00012832710743057742, + "loss": 0.254, + "step": 20863 + }, + { + "epoch": 1.6902138690861959, + "grad_norm": 0.05220559984445572, + "learning_rate": 0.00012832260677798283, + "loss": 0.2858, + "step": 20864 + }, + { + "epoch": 1.690294880103694, + "grad_norm": 0.04551084712147713, + "learning_rate": 0.0001283181061253882, + "loss": 0.2597, + "step": 20865 + }, + { + "epoch": 1.6903758911211924, + "grad_norm": 0.05507994815707207, + "learning_rate": 0.00012831360547279356, + "loss": 0.2619, + "step": 20866 + }, + { + "epoch": 1.6904569021386908, + "grad_norm": 0.06035435572266579, + "learning_rate": 0.00012830910482019894, + "loss": 0.3125, + "step": 20867 + }, + { + "epoch": 1.6905379131561893, + "grad_norm": 0.05258706584572792, + "learning_rate": 0.0001283046041676043, + "loss": 0.2726, + "step": 20868 + }, + { + "epoch": 1.6906189241736875, + "grad_norm": 0.05856553837656975, + "learning_rate": 0.00012830010351500966, + "loss": 0.2849, + "step": 20869 + }, + { + "epoch": 1.690699935191186, + "grad_norm": 
0.05880199745297432, + "learning_rate": 0.00012829560286241508, + "loss": 0.2828, + "step": 20870 + }, + { + "epoch": 1.6907809462086845, + "grad_norm": 0.049865689128637314, + "learning_rate": 0.00012829110220982044, + "loss": 0.2599, + "step": 20871 + }, + { + "epoch": 1.6908619572261827, + "grad_norm": 0.054322708398103714, + "learning_rate": 0.0001282866015572258, + "loss": 0.2874, + "step": 20872 + }, + { + "epoch": 1.6909429682436812, + "grad_norm": 0.05771753937005997, + "learning_rate": 0.00012828210090463118, + "loss": 0.3064, + "step": 20873 + }, + { + "epoch": 1.6910239792611796, + "grad_norm": 0.057738374918699265, + "learning_rate": 0.00012827760025203654, + "loss": 0.3114, + "step": 20874 + }, + { + "epoch": 1.6911049902786779, + "grad_norm": 0.0632070004940033, + "learning_rate": 0.0001282730995994419, + "loss": 0.2768, + "step": 20875 + }, + { + "epoch": 1.6911860012961761, + "grad_norm": 0.05157562717795372, + "learning_rate": 0.00012826859894684732, + "loss": 0.2667, + "step": 20876 + }, + { + "epoch": 1.6912670123136748, + "grad_norm": 0.06026139110326767, + "learning_rate": 0.00012826409829425268, + "loss": 0.2678, + "step": 20877 + }, + { + "epoch": 1.691348023331173, + "grad_norm": 0.05859403684735298, + "learning_rate": 0.00012825959764165804, + "loss": 0.313, + "step": 20878 + }, + { + "epoch": 1.6914290343486713, + "grad_norm": 0.06271016597747803, + "learning_rate": 0.00012825509698906343, + "loss": 0.2845, + "step": 20879 + }, + { + "epoch": 1.6915100453661698, + "grad_norm": 0.07072103023529053, + "learning_rate": 0.00012825059633646879, + "loss": 0.266, + "step": 20880 + }, + { + "epoch": 1.6915910563836682, + "grad_norm": 0.05164683610200882, + "learning_rate": 0.00012824609568387417, + "loss": 0.2517, + "step": 20881 + }, + { + "epoch": 1.6916720674011665, + "grad_norm": 0.06269783526659012, + "learning_rate": 0.00012824159503127956, + "loss": 0.298, + "step": 20882 + }, + { + "epoch": 1.691753078418665, + "grad_norm": 0.06345339119434357, + "learning_rate": 0.00012823709437868492, + "loss": 0.2779, + "step": 20883 + }, + { + "epoch": 1.6918340894361634, + "grad_norm": 0.05781438946723938, + "learning_rate": 0.00012823259372609028, + "loss": 0.267, + "step": 20884 + }, + { + "epoch": 1.6919151004536617, + "grad_norm": 0.054122913628816605, + "learning_rate": 0.00012822809307349567, + "loss": 0.297, + "step": 20885 + }, + { + "epoch": 1.69199611147116, + "grad_norm": 0.0550195574760437, + "learning_rate": 0.00012822359242090103, + "loss": 0.2628, + "step": 20886 + }, + { + "epoch": 1.6920771224886586, + "grad_norm": 0.06621591001749039, + "learning_rate": 0.00012821909176830641, + "loss": 0.294, + "step": 20887 + }, + { + "epoch": 1.6921581335061568, + "grad_norm": 0.04980989173054695, + "learning_rate": 0.0001282145911157118, + "loss": 0.3155, + "step": 20888 + }, + { + "epoch": 1.692239144523655, + "grad_norm": 0.04660594090819359, + "learning_rate": 0.00012821009046311716, + "loss": 0.256, + "step": 20889 + }, + { + "epoch": 1.6923201555411536, + "grad_norm": 0.05284009501338005, + "learning_rate": 0.00012820558981052252, + "loss": 0.2647, + "step": 20890 + }, + { + "epoch": 1.692401166558652, + "grad_norm": 0.042124852538108826, + "learning_rate": 0.0001282010891579279, + "loss": 0.2752, + "step": 20891 + }, + { + "epoch": 1.6924821775761503, + "grad_norm": 0.05570278316736221, + "learning_rate": 0.00012819658850533327, + "loss": 0.2909, + "step": 20892 + }, + { + "epoch": 1.6925631885936487, + "grad_norm": 0.06244548037648201, + "learning_rate": 
0.00012819208785273866, + "loss": 0.3273, + "step": 20893 + }, + { + "epoch": 1.6926441996111472, + "grad_norm": 0.050186578184366226, + "learning_rate": 0.00012818758720014404, + "loss": 0.2584, + "step": 20894 + }, + { + "epoch": 1.6927252106286454, + "grad_norm": 0.055262841284275055, + "learning_rate": 0.0001281830865475494, + "loss": 0.307, + "step": 20895 + }, + { + "epoch": 1.692806221646144, + "grad_norm": 0.051545362919569016, + "learning_rate": 0.00012817858589495476, + "loss": 0.2571, + "step": 20896 + }, + { + "epoch": 1.6928872326636424, + "grad_norm": 0.04908451810479164, + "learning_rate": 0.00012817408524236015, + "loss": 0.2695, + "step": 20897 + }, + { + "epoch": 1.6929682436811406, + "grad_norm": 0.07136844098567963, + "learning_rate": 0.0001281695845897655, + "loss": 0.3325, + "step": 20898 + }, + { + "epoch": 1.6930492546986389, + "grad_norm": 0.057610828429460526, + "learning_rate": 0.0001281650839371709, + "loss": 0.225, + "step": 20899 + }, + { + "epoch": 1.6931302657161373, + "grad_norm": 0.05767635256052017, + "learning_rate": 0.00012816058328457628, + "loss": 0.2623, + "step": 20900 + }, + { + "epoch": 1.6932112767336358, + "grad_norm": 0.04903646931052208, + "learning_rate": 0.00012815608263198164, + "loss": 0.2907, + "step": 20901 + }, + { + "epoch": 1.693292287751134, + "grad_norm": 0.04549427703022957, + "learning_rate": 0.000128151581979387, + "loss": 0.2858, + "step": 20902 + }, + { + "epoch": 1.6933732987686325, + "grad_norm": 0.06195332109928131, + "learning_rate": 0.0001281470813267924, + "loss": 0.2713, + "step": 20903 + }, + { + "epoch": 1.693454309786131, + "grad_norm": 0.04301120340824127, + "learning_rate": 0.00012814258067419775, + "loss": 0.269, + "step": 20904 + }, + { + "epoch": 1.6935353208036292, + "grad_norm": 0.05006838217377663, + "learning_rate": 0.00012813808002160314, + "loss": 0.2541, + "step": 20905 + }, + { + "epoch": 1.6936163318211277, + "grad_norm": 0.06199532374739647, + "learning_rate": 0.00012813357936900853, + "loss": 0.2884, + "step": 20906 + }, + { + "epoch": 1.6936973428386262, + "grad_norm": 0.07248139381408691, + "learning_rate": 0.00012812907871641389, + "loss": 0.3172, + "step": 20907 + }, + { + "epoch": 1.6937783538561244, + "grad_norm": 0.05518472567200661, + "learning_rate": 0.00012812457806381925, + "loss": 0.3236, + "step": 20908 + }, + { + "epoch": 1.6938593648736227, + "grad_norm": 0.05791405588388443, + "learning_rate": 0.00012812007741122463, + "loss": 0.2963, + "step": 20909 + }, + { + "epoch": 1.6939403758911213, + "grad_norm": 0.06537389755249023, + "learning_rate": 0.00012811557675863002, + "loss": 0.3128, + "step": 20910 + }, + { + "epoch": 1.6940213869086196, + "grad_norm": 0.05233273655176163, + "learning_rate": 0.00012811107610603538, + "loss": 0.2617, + "step": 20911 + }, + { + "epoch": 1.6941023979261178, + "grad_norm": 0.05526323616504669, + "learning_rate": 0.00012810657545344077, + "loss": 0.2473, + "step": 20912 + }, + { + "epoch": 1.6941834089436163, + "grad_norm": 0.05334017053246498, + "learning_rate": 0.00012810207480084613, + "loss": 0.291, + "step": 20913 + }, + { + "epoch": 1.6942644199611148, + "grad_norm": 0.05555533617734909, + "learning_rate": 0.0001280975741482515, + "loss": 0.2981, + "step": 20914 + }, + { + "epoch": 1.694345430978613, + "grad_norm": 0.04688166454434395, + "learning_rate": 0.00012809307349565688, + "loss": 0.2667, + "step": 20915 + }, + { + "epoch": 1.6944264419961115, + "grad_norm": 0.054582808166742325, + "learning_rate": 0.00012808857284306226, + "loss": 0.2771, + 
"step": 20916 + }, + { + "epoch": 1.69450745301361, + "grad_norm": 0.05647823214530945, + "learning_rate": 0.00012808407219046762, + "loss": 0.2949, + "step": 20917 + }, + { + "epoch": 1.6945884640311082, + "grad_norm": 0.05077538266777992, + "learning_rate": 0.000128079571537873, + "loss": 0.2537, + "step": 20918 + }, + { + "epoch": 1.6946694750486067, + "grad_norm": 0.059872034937143326, + "learning_rate": 0.00012807507088527837, + "loss": 0.2731, + "step": 20919 + }, + { + "epoch": 1.6947504860661051, + "grad_norm": 0.05296333506703377, + "learning_rate": 0.00012807057023268373, + "loss": 0.2547, + "step": 20920 + }, + { + "epoch": 1.6948314970836034, + "grad_norm": 0.0454367958009243, + "learning_rate": 0.00012806606958008912, + "loss": 0.2442, + "step": 20921 + }, + { + "epoch": 1.6949125081011016, + "grad_norm": 0.05745544284582138, + "learning_rate": 0.0001280615689274945, + "loss": 0.2773, + "step": 20922 + }, + { + "epoch": 1.6949935191186, + "grad_norm": 0.05502451956272125, + "learning_rate": 0.00012805706827489986, + "loss": 0.2551, + "step": 20923 + }, + { + "epoch": 1.6950745301360985, + "grad_norm": 0.0566684864461422, + "learning_rate": 0.00012805256762230525, + "loss": 0.3057, + "step": 20924 + }, + { + "epoch": 1.6951555411535968, + "grad_norm": 0.0476890504360199, + "learning_rate": 0.0001280480669697106, + "loss": 0.3012, + "step": 20925 + }, + { + "epoch": 1.6952365521710953, + "grad_norm": 0.05982305482029915, + "learning_rate": 0.00012804356631711597, + "loss": 0.304, + "step": 20926 + }, + { + "epoch": 1.6953175631885937, + "grad_norm": 0.05153541639447212, + "learning_rate": 0.00012803906566452136, + "loss": 0.2715, + "step": 20927 + }, + { + "epoch": 1.695398574206092, + "grad_norm": 0.05451453849673271, + "learning_rate": 0.00012803456501192675, + "loss": 0.2833, + "step": 20928 + }, + { + "epoch": 1.6954795852235904, + "grad_norm": 0.056824252009391785, + "learning_rate": 0.0001280300643593321, + "loss": 0.2743, + "step": 20929 + }, + { + "epoch": 1.695560596241089, + "grad_norm": 0.05149870365858078, + "learning_rate": 0.0001280255637067375, + "loss": 0.2686, + "step": 20930 + }, + { + "epoch": 1.6956416072585871, + "grad_norm": 0.06263695657253265, + "learning_rate": 0.00012802106305414285, + "loss": 0.2669, + "step": 20931 + }, + { + "epoch": 1.6957226182760854, + "grad_norm": 0.05872713401913643, + "learning_rate": 0.0001280165624015482, + "loss": 0.3088, + "step": 20932 + }, + { + "epoch": 1.695803629293584, + "grad_norm": 0.05683526769280434, + "learning_rate": 0.0001280120617489536, + "loss": 0.3005, + "step": 20933 + }, + { + "epoch": 1.6958846403110823, + "grad_norm": 0.04795790836215019, + "learning_rate": 0.000128007561096359, + "loss": 0.2629, + "step": 20934 + }, + { + "epoch": 1.6959656513285806, + "grad_norm": 0.05641131475567818, + "learning_rate": 0.00012800306044376435, + "loss": 0.2666, + "step": 20935 + }, + { + "epoch": 1.696046662346079, + "grad_norm": 0.05589877441525459, + "learning_rate": 0.00012799855979116973, + "loss": 0.2851, + "step": 20936 + }, + { + "epoch": 1.6961276733635775, + "grad_norm": 0.0438174232840538, + "learning_rate": 0.0001279940591385751, + "loss": 0.2198, + "step": 20937 + }, + { + "epoch": 1.6962086843810757, + "grad_norm": 0.05916321277618408, + "learning_rate": 0.00012798955848598045, + "loss": 0.2808, + "step": 20938 + }, + { + "epoch": 1.6962896953985742, + "grad_norm": 0.04272516444325447, + "learning_rate": 0.00012798505783338587, + "loss": 0.2337, + "step": 20939 + }, + { + "epoch": 1.6963707064160727, + 
"grad_norm": 0.05487491562962532, + "learning_rate": 0.00012798055718079123, + "loss": 0.2671, + "step": 20940 + }, + { + "epoch": 1.696451717433571, + "grad_norm": 0.042466625571250916, + "learning_rate": 0.0001279760565281966, + "loss": 0.2555, + "step": 20941 + }, + { + "epoch": 1.6965327284510694, + "grad_norm": 0.05536244064569473, + "learning_rate": 0.00012797155587560198, + "loss": 0.2683, + "step": 20942 + }, + { + "epoch": 1.6966137394685679, + "grad_norm": 0.047094427049160004, + "learning_rate": 0.00012796705522300734, + "loss": 0.2975, + "step": 20943 + }, + { + "epoch": 1.696694750486066, + "grad_norm": 0.05219883844256401, + "learning_rate": 0.0001279625545704127, + "loss": 0.2904, + "step": 20944 + }, + { + "epoch": 1.6967757615035644, + "grad_norm": 0.04873950034379959, + "learning_rate": 0.0001279580539178181, + "loss": 0.2689, + "step": 20945 + }, + { + "epoch": 1.6968567725210628, + "grad_norm": 0.04848574101924896, + "learning_rate": 0.00012795355326522347, + "loss": 0.2856, + "step": 20946 + }, + { + "epoch": 1.6969377835385613, + "grad_norm": 0.047113820910453796, + "learning_rate": 0.00012794905261262883, + "loss": 0.2732, + "step": 20947 + }, + { + "epoch": 1.6970187945560595, + "grad_norm": 0.05355425179004669, + "learning_rate": 0.00012794455196003422, + "loss": 0.2782, + "step": 20948 + }, + { + "epoch": 1.697099805573558, + "grad_norm": 0.05074724927544594, + "learning_rate": 0.00012794005130743958, + "loss": 0.2804, + "step": 20949 + }, + { + "epoch": 1.6971808165910565, + "grad_norm": 0.0466444194316864, + "learning_rate": 0.00012793555065484494, + "loss": 0.2729, + "step": 20950 + }, + { + "epoch": 1.6972618276085547, + "grad_norm": 0.05626093968749046, + "learning_rate": 0.00012793105000225035, + "loss": 0.2727, + "step": 20951 + }, + { + "epoch": 1.6973428386260532, + "grad_norm": 0.05510025471448898, + "learning_rate": 0.0001279265493496557, + "loss": 0.2864, + "step": 20952 + }, + { + "epoch": 1.6974238496435516, + "grad_norm": 0.0634729415178299, + "learning_rate": 0.00012792204869706107, + "loss": 0.3169, + "step": 20953 + }, + { + "epoch": 1.6975048606610499, + "grad_norm": 0.05881008878350258, + "learning_rate": 0.00012791754804446646, + "loss": 0.2871, + "step": 20954 + }, + { + "epoch": 1.6975858716785481, + "grad_norm": 0.05638792738318443, + "learning_rate": 0.00012791304739187182, + "loss": 0.3048, + "step": 20955 + }, + { + "epoch": 1.6976668826960468, + "grad_norm": 0.05283451825380325, + "learning_rate": 0.00012790854673927718, + "loss": 0.2665, + "step": 20956 + }, + { + "epoch": 1.697747893713545, + "grad_norm": 0.05943385511636734, + "learning_rate": 0.0001279040460866826, + "loss": 0.2945, + "step": 20957 + }, + { + "epoch": 1.6978289047310433, + "grad_norm": 0.050613295286893845, + "learning_rate": 0.00012789954543408795, + "loss": 0.2932, + "step": 20958 + }, + { + "epoch": 1.6979099157485418, + "grad_norm": 0.0523209348320961, + "learning_rate": 0.00012789504478149331, + "loss": 0.2743, + "step": 20959 + }, + { + "epoch": 1.6979909267660402, + "grad_norm": 0.0570591501891613, + "learning_rate": 0.0001278905441288987, + "loss": 0.3079, + "step": 20960 + }, + { + "epoch": 1.6980719377835385, + "grad_norm": 0.05599812790751457, + "learning_rate": 0.00012788604347630406, + "loss": 0.2843, + "step": 20961 + }, + { + "epoch": 1.698152948801037, + "grad_norm": 0.05278458818793297, + "learning_rate": 0.00012788154282370945, + "loss": 0.2517, + "step": 20962 + }, + { + "epoch": 1.6982339598185354, + "grad_norm": 0.051319461315870285, + 
"learning_rate": 0.00012787704217111484, + "loss": 0.2643, + "step": 20963 + }, + { + "epoch": 1.6983149708360337, + "grad_norm": 0.05669070780277252, + "learning_rate": 0.0001278725415185202, + "loss": 0.2677, + "step": 20964 + }, + { + "epoch": 1.6983959818535321, + "grad_norm": 0.046938829123973846, + "learning_rate": 0.00012786804086592556, + "loss": 0.2722, + "step": 20965 + }, + { + "epoch": 1.6984769928710306, + "grad_norm": 0.05684944614768028, + "learning_rate": 0.00012786354021333094, + "loss": 0.2751, + "step": 20966 + }, + { + "epoch": 1.6985580038885288, + "grad_norm": 0.05575013533234596, + "learning_rate": 0.0001278590395607363, + "loss": 0.2971, + "step": 20967 + }, + { + "epoch": 1.698639014906027, + "grad_norm": 0.052737586200237274, + "learning_rate": 0.0001278545389081417, + "loss": 0.2252, + "step": 20968 + }, + { + "epoch": 1.6987200259235256, + "grad_norm": 0.05768506973981857, + "learning_rate": 0.00012785003825554708, + "loss": 0.261, + "step": 20969 + }, + { + "epoch": 1.698801036941024, + "grad_norm": 0.04958980903029442, + "learning_rate": 0.00012784553760295244, + "loss": 0.2788, + "step": 20970 + }, + { + "epoch": 1.6988820479585223, + "grad_norm": 0.05882476642727852, + "learning_rate": 0.0001278410369503578, + "loss": 0.2598, + "step": 20971 + }, + { + "epoch": 1.6989630589760207, + "grad_norm": 0.052960388362407684, + "learning_rate": 0.00012783653629776318, + "loss": 0.2885, + "step": 20972 + }, + { + "epoch": 1.6990440699935192, + "grad_norm": 0.06784273684024811, + "learning_rate": 0.00012783203564516854, + "loss": 0.3332, + "step": 20973 + }, + { + "epoch": 1.6991250810110174, + "grad_norm": 0.05525969713926315, + "learning_rate": 0.00012782753499257393, + "loss": 0.3079, + "step": 20974 + }, + { + "epoch": 1.699206092028516, + "grad_norm": 0.06042307987809181, + "learning_rate": 0.00012782303433997932, + "loss": 0.327, + "step": 20975 + }, + { + "epoch": 1.6992871030460144, + "grad_norm": 0.0597507506608963, + "learning_rate": 0.00012781853368738468, + "loss": 0.2855, + "step": 20976 + }, + { + "epoch": 1.6993681140635126, + "grad_norm": 0.05462580919265747, + "learning_rate": 0.00012781403303479004, + "loss": 0.2772, + "step": 20977 + }, + { + "epoch": 1.6994491250810109, + "grad_norm": 0.04822581261396408, + "learning_rate": 0.00012780953238219543, + "loss": 0.2503, + "step": 20978 + }, + { + "epoch": 1.6995301360985096, + "grad_norm": 0.061402879655361176, + "learning_rate": 0.00012780503172960079, + "loss": 0.3026, + "step": 20979 + }, + { + "epoch": 1.6996111471160078, + "grad_norm": 0.057321321219205856, + "learning_rate": 0.00012780053107700617, + "loss": 0.264, + "step": 20980 + }, + { + "epoch": 1.699692158133506, + "grad_norm": 0.05499900504946709, + "learning_rate": 0.00012779603042441156, + "loss": 0.288, + "step": 20981 + }, + { + "epoch": 1.6997731691510045, + "grad_norm": 0.059335388243198395, + "learning_rate": 0.00012779152977181692, + "loss": 0.2493, + "step": 20982 + }, + { + "epoch": 1.699854180168503, + "grad_norm": 0.04036781191825867, + "learning_rate": 0.00012778702911922228, + "loss": 0.2461, + "step": 20983 + }, + { + "epoch": 1.6999351911860012, + "grad_norm": 0.057941023260354996, + "learning_rate": 0.00012778252846662767, + "loss": 0.2883, + "step": 20984 + }, + { + "epoch": 1.7000162022034997, + "grad_norm": 0.050086475908756256, + "learning_rate": 0.00012777802781403303, + "loss": 0.26, + "step": 20985 + }, + { + "epoch": 1.7000972132209982, + "grad_norm": 0.06684888154268265, + "learning_rate": 0.00012777352716143841, + 
"loss": 0.3185, + "step": 20986 + }, + { + "epoch": 1.7001782242384964, + "grad_norm": 0.05445714667439461, + "learning_rate": 0.0001277690265088438, + "loss": 0.2718, + "step": 20987 + }, + { + "epoch": 1.7002592352559946, + "grad_norm": 0.054723452776670456, + "learning_rate": 0.00012776452585624916, + "loss": 0.2237, + "step": 20988 + }, + { + "epoch": 1.7003402462734933, + "grad_norm": 0.06936323642730713, + "learning_rate": 0.00012776002520365452, + "loss": 0.3069, + "step": 20989 + }, + { + "epoch": 1.7004212572909916, + "grad_norm": 0.059990838170051575, + "learning_rate": 0.0001277555245510599, + "loss": 0.2868, + "step": 20990 + }, + { + "epoch": 1.7005022683084898, + "grad_norm": 0.05394691601395607, + "learning_rate": 0.0001277510238984653, + "loss": 0.2586, + "step": 20991 + }, + { + "epoch": 1.7005832793259883, + "grad_norm": 0.05593397095799446, + "learning_rate": 0.00012774652324587066, + "loss": 0.2665, + "step": 20992 + }, + { + "epoch": 1.7006642903434868, + "grad_norm": 0.052219316363334656, + "learning_rate": 0.00012774202259327604, + "loss": 0.2339, + "step": 20993 + }, + { + "epoch": 1.700745301360985, + "grad_norm": 0.06903288513422012, + "learning_rate": 0.0001277375219406814, + "loss": 0.2955, + "step": 20994 + }, + { + "epoch": 1.7008263123784835, + "grad_norm": 0.06040949001908302, + "learning_rate": 0.00012773302128808676, + "loss": 0.3226, + "step": 20995 + }, + { + "epoch": 1.700907323395982, + "grad_norm": 0.05372748523950577, + "learning_rate": 0.00012772852063549215, + "loss": 0.2893, + "step": 20996 + }, + { + "epoch": 1.7009883344134802, + "grad_norm": 0.05498870834708214, + "learning_rate": 0.00012772401998289754, + "loss": 0.2618, + "step": 20997 + }, + { + "epoch": 1.7010693454309787, + "grad_norm": 0.0433398000895977, + "learning_rate": 0.0001277195193303029, + "loss": 0.2516, + "step": 20998 + }, + { + "epoch": 1.7011503564484771, + "grad_norm": 0.042253199964761734, + "learning_rate": 0.00012771501867770828, + "loss": 0.2629, + "step": 20999 + }, + { + "epoch": 1.7012313674659754, + "grad_norm": 0.060787223279476166, + "learning_rate": 0.00012771051802511364, + "loss": 0.3057, + "step": 21000 + }, + { + "epoch": 1.7013123784834736, + "grad_norm": 0.08020738512277603, + "learning_rate": 0.000127706017372519, + "loss": 0.3053, + "step": 21001 + }, + { + "epoch": 1.7013933895009723, + "grad_norm": 0.058047182857990265, + "learning_rate": 0.0001277015167199244, + "loss": 0.2418, + "step": 21002 + }, + { + "epoch": 1.7014744005184705, + "grad_norm": 0.052321989089250565, + "learning_rate": 0.00012769701606732978, + "loss": 0.2736, + "step": 21003 + }, + { + "epoch": 1.7015554115359688, + "grad_norm": 0.052635811269283295, + "learning_rate": 0.00012769251541473514, + "loss": 0.3232, + "step": 21004 + }, + { + "epoch": 1.7016364225534673, + "grad_norm": 0.050415851175785065, + "learning_rate": 0.00012768801476214053, + "loss": 0.2611, + "step": 21005 + }, + { + "epoch": 1.7017174335709657, + "grad_norm": 0.05760395526885986, + "learning_rate": 0.0001276835141095459, + "loss": 0.2937, + "step": 21006 + }, + { + "epoch": 1.701798444588464, + "grad_norm": 0.05417398735880852, + "learning_rate": 0.00012767901345695125, + "loss": 0.2681, + "step": 21007 + }, + { + "epoch": 1.7018794556059624, + "grad_norm": 0.05250127241015434, + "learning_rate": 0.00012767451280435663, + "loss": 0.2809, + "step": 21008 + }, + { + "epoch": 1.701960466623461, + "grad_norm": 0.05703813210129738, + "learning_rate": 0.00012767001215176202, + "loss": 0.274, + "step": 21009 + }, + { 
+ "epoch": 1.7020414776409591, + "grad_norm": 0.050707802176475525, + "learning_rate": 0.00012766551149916738, + "loss": 0.2451, + "step": 21010 + }, + { + "epoch": 1.7021224886584574, + "grad_norm": 0.06028733402490616, + "learning_rate": 0.00012766101084657277, + "loss": 0.2741, + "step": 21011 + }, + { + "epoch": 1.702203499675956, + "grad_norm": 0.059985797852277756, + "learning_rate": 0.00012765651019397813, + "loss": 0.3051, + "step": 21012 + }, + { + "epoch": 1.7022845106934543, + "grad_norm": 0.05215641111135483, + "learning_rate": 0.0001276520095413835, + "loss": 0.2691, + "step": 21013 + }, + { + "epoch": 1.7023655217109526, + "grad_norm": 0.055474214255809784, + "learning_rate": 0.0001276475088887889, + "loss": 0.2895, + "step": 21014 + }, + { + "epoch": 1.702446532728451, + "grad_norm": 0.05044548586010933, + "learning_rate": 0.00012764300823619426, + "loss": 0.3022, + "step": 21015 + }, + { + "epoch": 1.7025275437459495, + "grad_norm": 0.053043123334646225, + "learning_rate": 0.00012763850758359962, + "loss": 0.2758, + "step": 21016 + }, + { + "epoch": 1.7026085547634477, + "grad_norm": 0.0499655157327652, + "learning_rate": 0.000127634006931005, + "loss": 0.2501, + "step": 21017 + }, + { + "epoch": 1.7026895657809462, + "grad_norm": 0.05497471243143082, + "learning_rate": 0.00012762950627841037, + "loss": 0.2631, + "step": 21018 + }, + { + "epoch": 1.7027705767984447, + "grad_norm": 0.05439502373337746, + "learning_rate": 0.00012762500562581573, + "loss": 0.285, + "step": 21019 + }, + { + "epoch": 1.702851587815943, + "grad_norm": 0.05842931941151619, + "learning_rate": 0.00012762050497322114, + "loss": 0.2786, + "step": 21020 + }, + { + "epoch": 1.7029325988334414, + "grad_norm": 0.05893060192465782, + "learning_rate": 0.0001276160043206265, + "loss": 0.3007, + "step": 21021 + }, + { + "epoch": 1.7030136098509399, + "grad_norm": 0.05791548639535904, + "learning_rate": 0.00012761150366803186, + "loss": 0.2726, + "step": 21022 + }, + { + "epoch": 1.703094620868438, + "grad_norm": 0.041637811809778214, + "learning_rate": 0.00012760700301543725, + "loss": 0.2336, + "step": 21023 + }, + { + "epoch": 1.7031756318859363, + "grad_norm": 0.05405241623520851, + "learning_rate": 0.0001276025023628426, + "loss": 0.2907, + "step": 21024 + }, + { + "epoch": 1.7032566429034348, + "grad_norm": 0.047728147357702255, + "learning_rate": 0.00012759800171024797, + "loss": 0.2697, + "step": 21025 + }, + { + "epoch": 1.7033376539209333, + "grad_norm": 0.05584067478775978, + "learning_rate": 0.00012759350105765339, + "loss": 0.3092, + "step": 21026 + }, + { + "epoch": 1.7034186649384315, + "grad_norm": 0.05352660268545151, + "learning_rate": 0.00012758900040505875, + "loss": 0.2769, + "step": 21027 + }, + { + "epoch": 1.70349967595593, + "grad_norm": 0.052968524396419525, + "learning_rate": 0.0001275844997524641, + "loss": 0.3083, + "step": 21028 + }, + { + "epoch": 1.7035806869734285, + "grad_norm": 0.06345254927873611, + "learning_rate": 0.0001275799990998695, + "loss": 0.2946, + "step": 21029 + }, + { + "epoch": 1.7036616979909267, + "grad_norm": 0.058345187455415726, + "learning_rate": 0.00012757549844727485, + "loss": 0.3017, + "step": 21030 + }, + { + "epoch": 1.7037427090084252, + "grad_norm": 0.06407354027032852, + "learning_rate": 0.0001275709977946802, + "loss": 0.2447, + "step": 21031 + }, + { + "epoch": 1.7038237200259236, + "grad_norm": 0.049316179007291794, + "learning_rate": 0.00012756649714208563, + "loss": 0.2748, + "step": 21032 + }, + { + "epoch": 1.7039047310434219, + 
"grad_norm": 0.06267198920249939, + "learning_rate": 0.000127561996489491, + "loss": 0.2882, + "step": 21033 + }, + { + "epoch": 1.7039857420609201, + "grad_norm": 0.0582430474460125, + "learning_rate": 0.00012755749583689635, + "loss": 0.2848, + "step": 21034 + }, + { + "epoch": 1.7040667530784188, + "grad_norm": 0.0576762817800045, + "learning_rate": 0.00012755299518430173, + "loss": 0.2623, + "step": 21035 + }, + { + "epoch": 1.704147764095917, + "grad_norm": 0.05833594501018524, + "learning_rate": 0.0001275484945317071, + "loss": 0.2756, + "step": 21036 + }, + { + "epoch": 1.7042287751134153, + "grad_norm": 0.061021964997053146, + "learning_rate": 0.00012754399387911245, + "loss": 0.2743, + "step": 21037 + }, + { + "epoch": 1.7043097861309138, + "grad_norm": 0.05440381169319153, + "learning_rate": 0.00012753949322651787, + "loss": 0.2968, + "step": 21038 + }, + { + "epoch": 1.7043907971484122, + "grad_norm": 0.048326920717954636, + "learning_rate": 0.00012753499257392323, + "loss": 0.2562, + "step": 21039 + }, + { + "epoch": 1.7044718081659105, + "grad_norm": 0.05831613019108772, + "learning_rate": 0.0001275304919213286, + "loss": 0.2828, + "step": 21040 + }, + { + "epoch": 1.704552819183409, + "grad_norm": 0.057130247354507446, + "learning_rate": 0.00012752599126873398, + "loss": 0.2707, + "step": 21041 + }, + { + "epoch": 1.7046338302009074, + "grad_norm": 0.04975264146924019, + "learning_rate": 0.00012752149061613934, + "loss": 0.2518, + "step": 21042 + }, + { + "epoch": 1.7047148412184057, + "grad_norm": 0.05217575281858444, + "learning_rate": 0.00012751698996354472, + "loss": 0.2548, + "step": 21043 + }, + { + "epoch": 1.7047958522359041, + "grad_norm": 0.07900968939065933, + "learning_rate": 0.0001275124893109501, + "loss": 0.2852, + "step": 21044 + }, + { + "epoch": 1.7048768632534026, + "grad_norm": 0.058848753571510315, + "learning_rate": 0.00012750798865835547, + "loss": 0.3091, + "step": 21045 + }, + { + "epoch": 1.7049578742709008, + "grad_norm": 0.052460964769124985, + "learning_rate": 0.00012750348800576083, + "loss": 0.2722, + "step": 21046 + }, + { + "epoch": 1.705038885288399, + "grad_norm": 0.052080314606428146, + "learning_rate": 0.00012749898735316622, + "loss": 0.297, + "step": 21047 + }, + { + "epoch": 1.7051198963058976, + "grad_norm": 0.05389990657567978, + "learning_rate": 0.00012749448670057158, + "loss": 0.2982, + "step": 21048 + }, + { + "epoch": 1.705200907323396, + "grad_norm": 0.0437735840678215, + "learning_rate": 0.00012748998604797697, + "loss": 0.2651, + "step": 21049 + }, + { + "epoch": 1.7052819183408943, + "grad_norm": 0.05131865665316582, + "learning_rate": 0.00012748548539538235, + "loss": 0.2609, + "step": 21050 + }, + { + "epoch": 1.7053629293583927, + "grad_norm": 0.04363410547375679, + "learning_rate": 0.0001274809847427877, + "loss": 0.2436, + "step": 21051 + }, + { + "epoch": 1.7054439403758912, + "grad_norm": 0.05448685958981514, + "learning_rate": 0.00012747648409019307, + "loss": 0.3368, + "step": 21052 + }, + { + "epoch": 1.7055249513933894, + "grad_norm": 0.055532172322273254, + "learning_rate": 0.00012747198343759846, + "loss": 0.2764, + "step": 21053 + }, + { + "epoch": 1.705605962410888, + "grad_norm": 0.05029525235295296, + "learning_rate": 0.00012746748278500382, + "loss": 0.268, + "step": 21054 + }, + { + "epoch": 1.7056869734283864, + "grad_norm": 0.058983251452445984, + "learning_rate": 0.0001274629821324092, + "loss": 0.2468, + "step": 21055 + }, + { + "epoch": 1.7057679844458846, + "grad_norm": 0.05035523325204849, + 
"learning_rate": 0.0001274584814798146, + "loss": 0.2519, + "step": 21056 + }, + { + "epoch": 1.7058489954633829, + "grad_norm": 0.05484575033187866, + "learning_rate": 0.00012745398082721995, + "loss": 0.3013, + "step": 21057 + }, + { + "epoch": 1.7059300064808816, + "grad_norm": 0.05905700847506523, + "learning_rate": 0.00012744948017462531, + "loss": 0.274, + "step": 21058 + }, + { + "epoch": 1.7060110174983798, + "grad_norm": 0.04861075058579445, + "learning_rate": 0.0001274449795220307, + "loss": 0.2592, + "step": 21059 + }, + { + "epoch": 1.706092028515878, + "grad_norm": 0.056762732565402985, + "learning_rate": 0.00012744047886943606, + "loss": 0.2554, + "step": 21060 + }, + { + "epoch": 1.7061730395333765, + "grad_norm": 0.0626283660531044, + "learning_rate": 0.00012743597821684145, + "loss": 0.2907, + "step": 21061 + }, + { + "epoch": 1.706254050550875, + "grad_norm": 0.056917816400527954, + "learning_rate": 0.00012743147756424684, + "loss": 0.2593, + "step": 21062 + }, + { + "epoch": 1.7063350615683732, + "grad_norm": 0.055883992463350296, + "learning_rate": 0.0001274269769116522, + "loss": 0.2603, + "step": 21063 + }, + { + "epoch": 1.7064160725858717, + "grad_norm": 0.07410957664251328, + "learning_rate": 0.00012742247625905756, + "loss": 0.3146, + "step": 21064 + }, + { + "epoch": 1.7064970836033702, + "grad_norm": 0.06119558587670326, + "learning_rate": 0.00012741797560646294, + "loss": 0.3055, + "step": 21065 + }, + { + "epoch": 1.7065780946208684, + "grad_norm": 0.056091416627168655, + "learning_rate": 0.00012741347495386833, + "loss": 0.3078, + "step": 21066 + }, + { + "epoch": 1.7066591056383669, + "grad_norm": 0.06130021810531616, + "learning_rate": 0.0001274089743012737, + "loss": 0.3076, + "step": 21067 + }, + { + "epoch": 1.7067401166558653, + "grad_norm": 0.053993724286556244, + "learning_rate": 0.00012740447364867908, + "loss": 0.2357, + "step": 21068 + }, + { + "epoch": 1.7068211276733636, + "grad_norm": 0.06275437027215958, + "learning_rate": 0.00012739997299608444, + "loss": 0.3032, + "step": 21069 + }, + { + "epoch": 1.7069021386908618, + "grad_norm": 0.05278387293219566, + "learning_rate": 0.0001273954723434898, + "loss": 0.2554, + "step": 21070 + }, + { + "epoch": 1.7069831497083603, + "grad_norm": 0.05508885532617569, + "learning_rate": 0.00012739097169089518, + "loss": 0.2938, + "step": 21071 + }, + { + "epoch": 1.7070641607258588, + "grad_norm": 0.05782973766326904, + "learning_rate": 0.00012738647103830057, + "loss": 0.2583, + "step": 21072 + }, + { + "epoch": 1.707145171743357, + "grad_norm": 0.0582059770822525, + "learning_rate": 0.00012738197038570593, + "loss": 0.3355, + "step": 21073 + }, + { + "epoch": 1.7072261827608555, + "grad_norm": 0.055913038551807404, + "learning_rate": 0.00012737746973311132, + "loss": 0.2826, + "step": 21074 + }, + { + "epoch": 1.707307193778354, + "grad_norm": 0.04921237379312515, + "learning_rate": 0.00012737296908051668, + "loss": 0.2739, + "step": 21075 + }, + { + "epoch": 1.7073882047958522, + "grad_norm": 0.056289199739694595, + "learning_rate": 0.00012736846842792204, + "loss": 0.3051, + "step": 21076 + }, + { + "epoch": 1.7074692158133506, + "grad_norm": 0.05416957661509514, + "learning_rate": 0.00012736396777532743, + "loss": 0.2725, + "step": 21077 + }, + { + "epoch": 1.7075502268308491, + "grad_norm": 0.05085984617471695, + "learning_rate": 0.0001273594671227328, + "loss": 0.2759, + "step": 21078 + }, + { + "epoch": 1.7076312378483474, + "grad_norm": 0.049407776445150375, + "learning_rate": 0.00012735496647013817, 
+ "loss": 0.294, + "step": 21079 + }, + { + "epoch": 1.7077122488658456, + "grad_norm": 0.06257401406764984, + "learning_rate": 0.00012735046581754356, + "loss": 0.2922, + "step": 21080 + }, + { + "epoch": 1.7077932598833443, + "grad_norm": 0.046495795249938965, + "learning_rate": 0.00012734596516494892, + "loss": 0.2558, + "step": 21081 + }, + { + "epoch": 1.7078742709008425, + "grad_norm": 0.06849236786365509, + "learning_rate": 0.00012734146451235428, + "loss": 0.2532, + "step": 21082 + }, + { + "epoch": 1.7079552819183408, + "grad_norm": 0.05374249443411827, + "learning_rate": 0.00012733696385975967, + "loss": 0.2892, + "step": 21083 + }, + { + "epoch": 1.7080362929358393, + "grad_norm": 0.05467557534575462, + "learning_rate": 0.00012733246320716505, + "loss": 0.2718, + "step": 21084 + }, + { + "epoch": 1.7081173039533377, + "grad_norm": 0.05555357038974762, + "learning_rate": 0.00012732796255457041, + "loss": 0.289, + "step": 21085 + }, + { + "epoch": 1.708198314970836, + "grad_norm": 0.05055621638894081, + "learning_rate": 0.0001273234619019758, + "loss": 0.266, + "step": 21086 + }, + { + "epoch": 1.7082793259883344, + "grad_norm": 0.05175207182765007, + "learning_rate": 0.00012731896124938116, + "loss": 0.268, + "step": 21087 + }, + { + "epoch": 1.708360337005833, + "grad_norm": 0.05274539813399315, + "learning_rate": 0.00012731446059678652, + "loss": 0.2707, + "step": 21088 + }, + { + "epoch": 1.7084413480233311, + "grad_norm": 0.058866798877716064, + "learning_rate": 0.0001273099599441919, + "loss": 0.2413, + "step": 21089 + }, + { + "epoch": 1.7085223590408296, + "grad_norm": 0.059128813445568085, + "learning_rate": 0.0001273054592915973, + "loss": 0.2794, + "step": 21090 + }, + { + "epoch": 1.708603370058328, + "grad_norm": 0.05331343784928322, + "learning_rate": 0.00012730095863900266, + "loss": 0.2991, + "step": 21091 + }, + { + "epoch": 1.7086843810758263, + "grad_norm": 0.052910611033439636, + "learning_rate": 0.00012729645798640804, + "loss": 0.2632, + "step": 21092 + }, + { + "epoch": 1.7087653920933246, + "grad_norm": 0.04627866670489311, + "learning_rate": 0.0001272919573338134, + "loss": 0.2509, + "step": 21093 + }, + { + "epoch": 1.708846403110823, + "grad_norm": 0.053217582404613495, + "learning_rate": 0.00012728745668121876, + "loss": 0.2715, + "step": 21094 + }, + { + "epoch": 1.7089274141283215, + "grad_norm": 0.06612826138734818, + "learning_rate": 0.00012728295602862418, + "loss": 0.3198, + "step": 21095 + }, + { + "epoch": 1.7090084251458197, + "grad_norm": 0.053610753268003464, + "learning_rate": 0.00012727845537602954, + "loss": 0.2619, + "step": 21096 + }, + { + "epoch": 1.7090894361633182, + "grad_norm": 0.043761976063251495, + "learning_rate": 0.0001272739547234349, + "loss": 0.2487, + "step": 21097 + }, + { + "epoch": 1.7091704471808167, + "grad_norm": 0.04693054035305977, + "learning_rate": 0.00012726945407084029, + "loss": 0.2636, + "step": 21098 + }, + { + "epoch": 1.709251458198315, + "grad_norm": 0.05634808540344238, + "learning_rate": 0.00012726495341824565, + "loss": 0.2397, + "step": 21099 + }, + { + "epoch": 1.7093324692158134, + "grad_norm": 0.062135498970746994, + "learning_rate": 0.000127260452765651, + "loss": 0.3048, + "step": 21100 + }, + { + "epoch": 1.7094134802333119, + "grad_norm": 0.060942426323890686, + "learning_rate": 0.00012725595211305642, + "loss": 0.3448, + "step": 21101 + }, + { + "epoch": 1.70949449125081, + "grad_norm": 0.05401855334639549, + "learning_rate": 0.00012725145146046178, + "loss": 0.2581, + "step": 21102 + }, + { + 
"epoch": 1.7095755022683083, + "grad_norm": 0.050278641283512115, + "learning_rate": 0.00012724695080786714, + "loss": 0.2573, + "step": 21103 + }, + { + "epoch": 1.709656513285807, + "grad_norm": 0.05164032429456711, + "learning_rate": 0.00012724245015527253, + "loss": 0.2528, + "step": 21104 + }, + { + "epoch": 1.7097375243033053, + "grad_norm": 0.04855099692940712, + "learning_rate": 0.0001272379495026779, + "loss": 0.2553, + "step": 21105 + }, + { + "epoch": 1.7098185353208035, + "grad_norm": 0.04928193241357803, + "learning_rate": 0.00012723344885008325, + "loss": 0.286, + "step": 21106 + }, + { + "epoch": 1.709899546338302, + "grad_norm": 0.05188463255763054, + "learning_rate": 0.00012722894819748866, + "loss": 0.2621, + "step": 21107 + }, + { + "epoch": 1.7099805573558005, + "grad_norm": 0.05235172063112259, + "learning_rate": 0.00012722444754489402, + "loss": 0.31, + "step": 21108 + }, + { + "epoch": 1.7100615683732987, + "grad_norm": 0.056308262050151825, + "learning_rate": 0.00012721994689229938, + "loss": 0.2707, + "step": 21109 + }, + { + "epoch": 1.7101425793907972, + "grad_norm": 0.044385965913534164, + "learning_rate": 0.00012721544623970477, + "loss": 0.2338, + "step": 21110 + }, + { + "epoch": 1.7102235904082956, + "grad_norm": 0.04382346197962761, + "learning_rate": 0.00012721094558711013, + "loss": 0.2703, + "step": 21111 + }, + { + "epoch": 1.7103046014257939, + "grad_norm": 0.053613610565662384, + "learning_rate": 0.0001272064449345155, + "loss": 0.2488, + "step": 21112 + }, + { + "epoch": 1.7103856124432921, + "grad_norm": 0.0616731233894825, + "learning_rate": 0.0001272019442819209, + "loss": 0.2805, + "step": 21113 + }, + { + "epoch": 1.7104666234607908, + "grad_norm": 0.046198032796382904, + "learning_rate": 0.00012719744362932626, + "loss": 0.2395, + "step": 21114 + }, + { + "epoch": 1.710547634478289, + "grad_norm": 0.06092916801571846, + "learning_rate": 0.00012719294297673162, + "loss": 0.3167, + "step": 21115 + }, + { + "epoch": 1.7106286454957873, + "grad_norm": 0.06428792327642441, + "learning_rate": 0.000127188442324137, + "loss": 0.3205, + "step": 21116 + }, + { + "epoch": 1.7107096565132858, + "grad_norm": 0.055565331131219864, + "learning_rate": 0.00012718394167154237, + "loss": 0.2697, + "step": 21117 + }, + { + "epoch": 1.7107906675307842, + "grad_norm": 0.05767647922039032, + "learning_rate": 0.00012717944101894773, + "loss": 0.2547, + "step": 21118 + }, + { + "epoch": 1.7108716785482825, + "grad_norm": 0.060147978365421295, + "learning_rate": 0.00012717494036635314, + "loss": 0.2619, + "step": 21119 + }, + { + "epoch": 1.710952689565781, + "grad_norm": 0.05821974575519562, + "learning_rate": 0.0001271704397137585, + "loss": 0.2597, + "step": 21120 + }, + { + "epoch": 1.7110337005832794, + "grad_norm": 0.05406796559691429, + "learning_rate": 0.00012716593906116386, + "loss": 0.2909, + "step": 21121 + }, + { + "epoch": 1.7111147116007777, + "grad_norm": 0.05455823615193367, + "learning_rate": 0.00012716143840856925, + "loss": 0.2887, + "step": 21122 + }, + { + "epoch": 1.7111957226182761, + "grad_norm": 0.055845241993665695, + "learning_rate": 0.0001271569377559746, + "loss": 0.2761, + "step": 21123 + }, + { + "epoch": 1.7112767336357746, + "grad_norm": 0.04812893643975258, + "learning_rate": 0.00012715243710338, + "loss": 0.2447, + "step": 21124 + }, + { + "epoch": 1.7113577446532728, + "grad_norm": 0.04706289619207382, + "learning_rate": 0.00012714793645078539, + "loss": 0.2817, + "step": 21125 + }, + { + "epoch": 1.711438755670771, + "grad_norm": 
0.044318217784166336, + "learning_rate": 0.00012714343579819075, + "loss": 0.2766, + "step": 21126 + }, + { + "epoch": 1.7115197666882696, + "grad_norm": 0.05528712272644043, + "learning_rate": 0.0001271389351455961, + "loss": 0.2571, + "step": 21127 + }, + { + "epoch": 1.711600777705768, + "grad_norm": 0.051084212958812714, + "learning_rate": 0.0001271344344930015, + "loss": 0.2526, + "step": 21128 + }, + { + "epoch": 1.7116817887232663, + "grad_norm": 0.05133767053484917, + "learning_rate": 0.00012712993384040685, + "loss": 0.2854, + "step": 21129 + }, + { + "epoch": 1.7117627997407647, + "grad_norm": 0.04583508148789406, + "learning_rate": 0.00012712543318781224, + "loss": 0.23, + "step": 21130 + }, + { + "epoch": 1.7118438107582632, + "grad_norm": 0.06236959248781204, + "learning_rate": 0.00012712093253521763, + "loss": 0.2824, + "step": 21131 + }, + { + "epoch": 1.7119248217757614, + "grad_norm": 0.05616011098027229, + "learning_rate": 0.000127116431882623, + "loss": 0.2727, + "step": 21132 + }, + { + "epoch": 1.71200583279326, + "grad_norm": 0.051108259707689285, + "learning_rate": 0.00012711193123002835, + "loss": 0.2703, + "step": 21133 + }, + { + "epoch": 1.7120868438107584, + "grad_norm": 0.07083071768283844, + "learning_rate": 0.00012710743057743373, + "loss": 0.2931, + "step": 21134 + }, + { + "epoch": 1.7121678548282566, + "grad_norm": 0.040604688227176666, + "learning_rate": 0.0001271029299248391, + "loss": 0.2411, + "step": 21135 + }, + { + "epoch": 1.7122488658457549, + "grad_norm": 0.05927375331521034, + "learning_rate": 0.00012709842927224448, + "loss": 0.2877, + "step": 21136 + }, + { + "epoch": 1.7123298768632536, + "grad_norm": 0.05343782901763916, + "learning_rate": 0.00012709392861964987, + "loss": 0.2738, + "step": 21137 + }, + { + "epoch": 1.7124108878807518, + "grad_norm": 0.06753844767808914, + "learning_rate": 0.00012708942796705523, + "loss": 0.3114, + "step": 21138 + }, + { + "epoch": 1.71249189889825, + "grad_norm": 0.06401346623897552, + "learning_rate": 0.0001270849273144606, + "loss": 0.2567, + "step": 21139 + }, + { + "epoch": 1.7125729099157485, + "grad_norm": 0.054790738970041275, + "learning_rate": 0.00012708042666186598, + "loss": 0.2751, + "step": 21140 + }, + { + "epoch": 1.712653920933247, + "grad_norm": 0.06184843182563782, + "learning_rate": 0.00012707592600927134, + "loss": 0.2772, + "step": 21141 + }, + { + "epoch": 1.7127349319507452, + "grad_norm": 0.046240661293268204, + "learning_rate": 0.00012707142535667672, + "loss": 0.281, + "step": 21142 + }, + { + "epoch": 1.7128159429682437, + "grad_norm": 0.057482510805130005, + "learning_rate": 0.0001270669247040821, + "loss": 0.2881, + "step": 21143 + }, + { + "epoch": 1.7128969539857422, + "grad_norm": 0.0650155320763588, + "learning_rate": 0.00012706242405148747, + "loss": 0.3222, + "step": 21144 + }, + { + "epoch": 1.7129779650032404, + "grad_norm": 0.05420124903321266, + "learning_rate": 0.00012705792339889283, + "loss": 0.2459, + "step": 21145 + }, + { + "epoch": 1.7130589760207389, + "grad_norm": 0.05785535275936127, + "learning_rate": 0.00012705342274629822, + "loss": 0.3015, + "step": 21146 + }, + { + "epoch": 1.7131399870382373, + "grad_norm": 0.0573989674448967, + "learning_rate": 0.0001270489220937036, + "loss": 0.3189, + "step": 21147 + }, + { + "epoch": 1.7132209980557356, + "grad_norm": 0.05121051147580147, + "learning_rate": 0.00012704442144110897, + "loss": 0.2484, + "step": 21148 + }, + { + "epoch": 1.7133020090732338, + "grad_norm": 0.0666457861661911, + "learning_rate": 
0.00012703992078851435, + "loss": 0.3251, + "step": 21149 + }, + { + "epoch": 1.7133830200907323, + "grad_norm": 0.042105793952941895, + "learning_rate": 0.0001270354201359197, + "loss": 0.2847, + "step": 21150 + }, + { + "epoch": 1.7134640311082308, + "grad_norm": 0.05290107801556587, + "learning_rate": 0.00012703091948332507, + "loss": 0.2648, + "step": 21151 + }, + { + "epoch": 1.713545042125729, + "grad_norm": 0.04666683450341225, + "learning_rate": 0.00012702641883073046, + "loss": 0.305, + "step": 21152 + }, + { + "epoch": 1.7136260531432275, + "grad_norm": 0.07131568342447281, + "learning_rate": 0.00012702191817813585, + "loss": 0.2696, + "step": 21153 + }, + { + "epoch": 1.713707064160726, + "grad_norm": 0.04963874816894531, + "learning_rate": 0.0001270174175255412, + "loss": 0.2648, + "step": 21154 + }, + { + "epoch": 1.7137880751782242, + "grad_norm": 0.0497848279774189, + "learning_rate": 0.0001270129168729466, + "loss": 0.2732, + "step": 21155 + }, + { + "epoch": 1.7138690861957226, + "grad_norm": 0.06622777879238129, + "learning_rate": 0.00012700841622035195, + "loss": 0.2878, + "step": 21156 + }, + { + "epoch": 1.7139500972132211, + "grad_norm": 0.050237640738487244, + "learning_rate": 0.00012700391556775731, + "loss": 0.2419, + "step": 21157 + }, + { + "epoch": 1.7140311082307194, + "grad_norm": 0.06899549067020416, + "learning_rate": 0.0001269994149151627, + "loss": 0.314, + "step": 21158 + }, + { + "epoch": 1.7141121192482176, + "grad_norm": 0.05233056843280792, + "learning_rate": 0.0001269949142625681, + "loss": 0.2767, + "step": 21159 + }, + { + "epoch": 1.7141931302657163, + "grad_norm": 0.06126559525728226, + "learning_rate": 0.00012699041360997345, + "loss": 0.26, + "step": 21160 + }, + { + "epoch": 1.7142741412832145, + "grad_norm": 0.060388606041669846, + "learning_rate": 0.00012698591295737884, + "loss": 0.2673, + "step": 21161 + }, + { + "epoch": 1.7143551523007128, + "grad_norm": 0.06104142591357231, + "learning_rate": 0.0001269814123047842, + "loss": 0.2809, + "step": 21162 + }, + { + "epoch": 1.7144361633182112, + "grad_norm": 0.06470298022031784, + "learning_rate": 0.00012697691165218956, + "loss": 0.2765, + "step": 21163 + }, + { + "epoch": 1.7145171743357097, + "grad_norm": 0.048042502254247665, + "learning_rate": 0.00012697241099959494, + "loss": 0.2597, + "step": 21164 + }, + { + "epoch": 1.714598185353208, + "grad_norm": 0.05685548484325409, + "learning_rate": 0.00012696791034700033, + "loss": 0.3263, + "step": 21165 + }, + { + "epoch": 1.7146791963707064, + "grad_norm": 0.059012167155742645, + "learning_rate": 0.0001269634096944057, + "loss": 0.2723, + "step": 21166 + }, + { + "epoch": 1.714760207388205, + "grad_norm": 0.058648496866226196, + "learning_rate": 0.00012695890904181108, + "loss": 0.3046, + "step": 21167 + }, + { + "epoch": 1.7148412184057031, + "grad_norm": 0.04576666280627251, + "learning_rate": 0.00012695440838921644, + "loss": 0.2526, + "step": 21168 + }, + { + "epoch": 1.7149222294232016, + "grad_norm": 0.054590385407209396, + "learning_rate": 0.0001269499077366218, + "loss": 0.2629, + "step": 21169 + }, + { + "epoch": 1.7150032404407, + "grad_norm": 0.06748390197753906, + "learning_rate": 0.00012694540708402718, + "loss": 0.295, + "step": 21170 + }, + { + "epoch": 1.7150842514581983, + "grad_norm": 0.05063813179731369, + "learning_rate": 0.00012694090643143257, + "loss": 0.2501, + "step": 21171 + }, + { + "epoch": 1.7151652624756966, + "grad_norm": 0.05567986145615578, + "learning_rate": 0.00012693640577883793, + "loss": 0.2925, + 
"step": 21172 + }, + { + "epoch": 1.715246273493195, + "grad_norm": 0.050066396594047546, + "learning_rate": 0.00012693190512624332, + "loss": 0.2446, + "step": 21173 + }, + { + "epoch": 1.7153272845106935, + "grad_norm": 0.055447474122047424, + "learning_rate": 0.00012692740447364868, + "loss": 0.2916, + "step": 21174 + }, + { + "epoch": 1.7154082955281917, + "grad_norm": 0.06282706558704376, + "learning_rate": 0.00012692290382105404, + "loss": 0.3065, + "step": 21175 + }, + { + "epoch": 1.7154893065456902, + "grad_norm": 0.04857902228832245, + "learning_rate": 0.00012691840316845945, + "loss": 0.2397, + "step": 21176 + }, + { + "epoch": 1.7155703175631887, + "grad_norm": 0.04898069053888321, + "learning_rate": 0.0001269139025158648, + "loss": 0.2604, + "step": 21177 + }, + { + "epoch": 1.715651328580687, + "grad_norm": 0.04747678339481354, + "learning_rate": 0.00012690940186327017, + "loss": 0.2599, + "step": 21178 + }, + { + "epoch": 1.7157323395981854, + "grad_norm": 0.05100768059492111, + "learning_rate": 0.00012690490121067556, + "loss": 0.2213, + "step": 21179 + }, + { + "epoch": 1.7158133506156839, + "grad_norm": 0.06935994327068329, + "learning_rate": 0.00012690040055808092, + "loss": 0.2827, + "step": 21180 + }, + { + "epoch": 1.715894361633182, + "grad_norm": 0.05088813230395317, + "learning_rate": 0.00012689589990548628, + "loss": 0.3469, + "step": 21181 + }, + { + "epoch": 1.7159753726506803, + "grad_norm": 0.04963594675064087, + "learning_rate": 0.0001268913992528917, + "loss": 0.2108, + "step": 21182 + }, + { + "epoch": 1.716056383668179, + "grad_norm": 0.06332466751337051, + "learning_rate": 0.00012688689860029706, + "loss": 0.2625, + "step": 21183 + }, + { + "epoch": 1.7161373946856773, + "grad_norm": 0.05151652917265892, + "learning_rate": 0.00012688239794770242, + "loss": 0.2509, + "step": 21184 + }, + { + "epoch": 1.7162184057031755, + "grad_norm": 0.05739894509315491, + "learning_rate": 0.0001268778972951078, + "loss": 0.3009, + "step": 21185 + }, + { + "epoch": 1.716299416720674, + "grad_norm": 0.06268756836652756, + "learning_rate": 0.00012687339664251316, + "loss": 0.2762, + "step": 21186 + }, + { + "epoch": 1.7163804277381725, + "grad_norm": 0.04882485046982765, + "learning_rate": 0.00012686889598991852, + "loss": 0.2938, + "step": 21187 + }, + { + "epoch": 1.7164614387556707, + "grad_norm": 0.04620731621980667, + "learning_rate": 0.00012686439533732394, + "loss": 0.2605, + "step": 21188 + }, + { + "epoch": 1.7165424497731692, + "grad_norm": 0.04816884547472, + "learning_rate": 0.0001268598946847293, + "loss": 0.2662, + "step": 21189 + }, + { + "epoch": 1.7166234607906676, + "grad_norm": 0.04603031650185585, + "learning_rate": 0.00012685539403213466, + "loss": 0.2633, + "step": 21190 + }, + { + "epoch": 1.7167044718081659, + "grad_norm": 0.05901835113763809, + "learning_rate": 0.00012685089337954004, + "loss": 0.2855, + "step": 21191 + }, + { + "epoch": 1.7167854828256643, + "grad_norm": 0.05132526904344559, + "learning_rate": 0.0001268463927269454, + "loss": 0.2198, + "step": 21192 + }, + { + "epoch": 1.7168664938431628, + "grad_norm": 0.06519816070795059, + "learning_rate": 0.00012684189207435076, + "loss": 0.2984, + "step": 21193 + }, + { + "epoch": 1.716947504860661, + "grad_norm": 0.057430390268564224, + "learning_rate": 0.00012683739142175618, + "loss": 0.2549, + "step": 21194 + }, + { + "epoch": 1.7170285158781593, + "grad_norm": 0.04745124652981758, + "learning_rate": 0.00012683289076916154, + "loss": 0.2417, + "step": 21195 + }, + { + "epoch": 
1.7171095268956578, + "grad_norm": 0.05713975429534912, + "learning_rate": 0.0001268283901165669, + "loss": 0.2787, + "step": 21196 + }, + { + "epoch": 1.7171905379131562, + "grad_norm": 0.054375555366277695, + "learning_rate": 0.00012682388946397229, + "loss": 0.2505, + "step": 21197 + }, + { + "epoch": 1.7172715489306545, + "grad_norm": 0.05335111916065216, + "learning_rate": 0.00012681938881137765, + "loss": 0.2593, + "step": 21198 + }, + { + "epoch": 1.717352559948153, + "grad_norm": 0.0593978688120842, + "learning_rate": 0.00012681488815878303, + "loss": 0.2923, + "step": 21199 + }, + { + "epoch": 1.7174335709656514, + "grad_norm": 0.061143700033426285, + "learning_rate": 0.00012681038750618842, + "loss": 0.2594, + "step": 21200 + }, + { + "epoch": 1.7175145819831497, + "grad_norm": 0.06332940608263016, + "learning_rate": 0.00012680588685359378, + "loss": 0.3053, + "step": 21201 + }, + { + "epoch": 1.7175955930006481, + "grad_norm": 0.04996534436941147, + "learning_rate": 0.00012680138620099914, + "loss": 0.2429, + "step": 21202 + }, + { + "epoch": 1.7176766040181466, + "grad_norm": 0.05738399922847748, + "learning_rate": 0.00012679688554840453, + "loss": 0.3105, + "step": 21203 + }, + { + "epoch": 1.7177576150356448, + "grad_norm": 0.0711902305483818, + "learning_rate": 0.0001267923848958099, + "loss": 0.3255, + "step": 21204 + }, + { + "epoch": 1.717838626053143, + "grad_norm": 0.05467765033245087, + "learning_rate": 0.00012678788424321527, + "loss": 0.2763, + "step": 21205 + }, + { + "epoch": 1.7179196370706418, + "grad_norm": 0.0483541339635849, + "learning_rate": 0.00012678338359062066, + "loss": 0.2314, + "step": 21206 + }, + { + "epoch": 1.71800064808814, + "grad_norm": 0.05823575705289841, + "learning_rate": 0.00012677888293802602, + "loss": 0.2994, + "step": 21207 + }, + { + "epoch": 1.7180816591056383, + "grad_norm": 0.049455784261226654, + "learning_rate": 0.00012677438228543138, + "loss": 0.305, + "step": 21208 + }, + { + "epoch": 1.7181626701231367, + "grad_norm": 0.05627220496535301, + "learning_rate": 0.00012676988163283677, + "loss": 0.2779, + "step": 21209 + }, + { + "epoch": 1.7182436811406352, + "grad_norm": 0.05031132698059082, + "learning_rate": 0.00012676538098024213, + "loss": 0.2967, + "step": 21210 + }, + { + "epoch": 1.7183246921581334, + "grad_norm": 0.04256313294172287, + "learning_rate": 0.00012676088032764752, + "loss": 0.2588, + "step": 21211 + }, + { + "epoch": 1.718405703175632, + "grad_norm": 0.057716649025678635, + "learning_rate": 0.0001267563796750529, + "loss": 0.293, + "step": 21212 + }, + { + "epoch": 1.7184867141931304, + "grad_norm": 0.048773281276226044, + "learning_rate": 0.00012675187902245826, + "loss": 0.271, + "step": 21213 + }, + { + "epoch": 1.7185677252106286, + "grad_norm": 0.052334122359752655, + "learning_rate": 0.00012674737836986362, + "loss": 0.2585, + "step": 21214 + }, + { + "epoch": 1.7186487362281269, + "grad_norm": 0.058943185955286026, + "learning_rate": 0.000126742877717269, + "loss": 0.2928, + "step": 21215 + }, + { + "epoch": 1.7187297472456255, + "grad_norm": 0.05530468001961708, + "learning_rate": 0.00012673837706467437, + "loss": 0.267, + "step": 21216 + }, + { + "epoch": 1.7188107582631238, + "grad_norm": 0.05892956256866455, + "learning_rate": 0.00012673387641207976, + "loss": 0.2927, + "step": 21217 + }, + { + "epoch": 1.718891769280622, + "grad_norm": 0.06373018026351929, + "learning_rate": 0.00012672937575948514, + "loss": 0.3176, + "step": 21218 + }, + { + "epoch": 1.7189727802981205, + "grad_norm": 
0.05370737612247467, + "learning_rate": 0.0001267248751068905, + "loss": 0.2462, + "step": 21219 + }, + { + "epoch": 1.719053791315619, + "grad_norm": 0.053730808198451996, + "learning_rate": 0.00012672037445429586, + "loss": 0.2505, + "step": 21220 + }, + { + "epoch": 1.7191348023331172, + "grad_norm": 0.05969685688614845, + "learning_rate": 0.00012671587380170125, + "loss": 0.2559, + "step": 21221 + }, + { + "epoch": 1.7192158133506157, + "grad_norm": 0.051641613245010376, + "learning_rate": 0.0001267113731491066, + "loss": 0.2581, + "step": 21222 + }, + { + "epoch": 1.7192968243681142, + "grad_norm": 0.06095960736274719, + "learning_rate": 0.000126706872496512, + "loss": 0.2565, + "step": 21223 + }, + { + "epoch": 1.7193778353856124, + "grad_norm": 0.05608683079481125, + "learning_rate": 0.00012670237184391739, + "loss": 0.2934, + "step": 21224 + }, + { + "epoch": 1.7194588464031109, + "grad_norm": 0.0689942017197609, + "learning_rate": 0.00012669787119132275, + "loss": 0.2505, + "step": 21225 + }, + { + "epoch": 1.7195398574206093, + "grad_norm": 0.056214869022369385, + "learning_rate": 0.0001266933705387281, + "loss": 0.2552, + "step": 21226 + }, + { + "epoch": 1.7196208684381076, + "grad_norm": 0.04623086377978325, + "learning_rate": 0.0001266888698861335, + "loss": 0.2916, + "step": 21227 + }, + { + "epoch": 1.7197018794556058, + "grad_norm": 0.06557148694992065, + "learning_rate": 0.00012668436923353888, + "loss": 0.2901, + "step": 21228 + }, + { + "epoch": 1.7197828904731045, + "grad_norm": 0.05431944876909256, + "learning_rate": 0.00012667986858094424, + "loss": 0.2534, + "step": 21229 + }, + { + "epoch": 1.7198639014906028, + "grad_norm": 0.060326166450977325, + "learning_rate": 0.00012667536792834963, + "loss": 0.3101, + "step": 21230 + }, + { + "epoch": 1.719944912508101, + "grad_norm": 0.055251408368349075, + "learning_rate": 0.000126670867275755, + "loss": 0.3074, + "step": 21231 + }, + { + "epoch": 1.7200259235255995, + "grad_norm": 0.05667218938469887, + "learning_rate": 0.00012666636662316035, + "loss": 0.3127, + "step": 21232 + }, + { + "epoch": 1.720106934543098, + "grad_norm": 0.06880618631839752, + "learning_rate": 0.00012666186597056574, + "loss": 0.2992, + "step": 21233 + }, + { + "epoch": 1.7201879455605962, + "grad_norm": 0.04143282026052475, + "learning_rate": 0.00012665736531797112, + "loss": 0.2202, + "step": 21234 + }, + { + "epoch": 1.7202689565780946, + "grad_norm": 0.064445361495018, + "learning_rate": 0.00012665286466537648, + "loss": 0.2515, + "step": 21235 + }, + { + "epoch": 1.720349967595593, + "grad_norm": 0.051159944385290146, + "learning_rate": 0.00012664836401278187, + "loss": 0.2752, + "step": 21236 + }, + { + "epoch": 1.7204309786130914, + "grad_norm": 0.053983334451913834, + "learning_rate": 0.00012664386336018723, + "loss": 0.2789, + "step": 21237 + }, + { + "epoch": 1.7205119896305896, + "grad_norm": 0.05558694526553154, + "learning_rate": 0.0001266393627075926, + "loss": 0.2809, + "step": 21238 + }, + { + "epoch": 1.7205930006480883, + "grad_norm": 0.05079452320933342, + "learning_rate": 0.00012663486205499798, + "loss": 0.2771, + "step": 21239 + }, + { + "epoch": 1.7206740116655865, + "grad_norm": 0.054594215005636215, + "learning_rate": 0.00012663036140240336, + "loss": 0.2361, + "step": 21240 + }, + { + "epoch": 1.7207550226830848, + "grad_norm": 0.052411917597055435, + "learning_rate": 0.00012662586074980872, + "loss": 0.2781, + "step": 21241 + }, + { + "epoch": 1.7208360337005832, + "grad_norm": 0.06520796567201614, + "learning_rate": 
0.0001266213600972141, + "loss": 0.2616, + "step": 21242 + }, + { + "epoch": 1.7209170447180817, + "grad_norm": 0.05025561898946762, + "learning_rate": 0.00012661685944461947, + "loss": 0.2573, + "step": 21243 + }, + { + "epoch": 1.72099805573558, + "grad_norm": 0.060252200812101364, + "learning_rate": 0.00012661235879202483, + "loss": 0.2936, + "step": 21244 + }, + { + "epoch": 1.7210790667530784, + "grad_norm": 0.0471336767077446, + "learning_rate": 0.00012660785813943022, + "loss": 0.2514, + "step": 21245 + }, + { + "epoch": 1.721160077770577, + "grad_norm": 0.05307598039507866, + "learning_rate": 0.0001266033574868356, + "loss": 0.2775, + "step": 21246 + }, + { + "epoch": 1.7212410887880751, + "grad_norm": 0.05043487623333931, + "learning_rate": 0.00012659885683424097, + "loss": 0.2566, + "step": 21247 + }, + { + "epoch": 1.7213220998055736, + "grad_norm": 0.055017393082380295, + "learning_rate": 0.00012659435618164635, + "loss": 0.2598, + "step": 21248 + }, + { + "epoch": 1.721403110823072, + "grad_norm": 0.058290716260671616, + "learning_rate": 0.0001265898555290517, + "loss": 0.3069, + "step": 21249 + }, + { + "epoch": 1.7214841218405703, + "grad_norm": 0.06543973088264465, + "learning_rate": 0.00012658535487645707, + "loss": 0.2883, + "step": 21250 + }, + { + "epoch": 1.7215651328580686, + "grad_norm": 0.07460454106330872, + "learning_rate": 0.00012658085422386246, + "loss": 0.3185, + "step": 21251 + }, + { + "epoch": 1.721646143875567, + "grad_norm": 0.051480378955602646, + "learning_rate": 0.00012657635357126785, + "loss": 0.2741, + "step": 21252 + }, + { + "epoch": 1.7217271548930655, + "grad_norm": 0.04900343716144562, + "learning_rate": 0.0001265718529186732, + "loss": 0.3012, + "step": 21253 + }, + { + "epoch": 1.7218081659105637, + "grad_norm": 0.06127531826496124, + "learning_rate": 0.0001265673522660786, + "loss": 0.3124, + "step": 21254 + }, + { + "epoch": 1.7218891769280622, + "grad_norm": 0.056050658226013184, + "learning_rate": 0.00012656285161348395, + "loss": 0.2646, + "step": 21255 + }, + { + "epoch": 1.7219701879455607, + "grad_norm": 0.05161750316619873, + "learning_rate": 0.00012655835096088931, + "loss": 0.2661, + "step": 21256 + }, + { + "epoch": 1.722051198963059, + "grad_norm": 0.05862666293978691, + "learning_rate": 0.00012655385030829473, + "loss": 0.3148, + "step": 21257 + }, + { + "epoch": 1.7221322099805574, + "grad_norm": 0.053529586642980576, + "learning_rate": 0.0001265493496557001, + "loss": 0.2803, + "step": 21258 + }, + { + "epoch": 1.7222132209980558, + "grad_norm": 0.05283923074603081, + "learning_rate": 0.00012654484900310545, + "loss": 0.2592, + "step": 21259 + }, + { + "epoch": 1.722294232015554, + "grad_norm": 0.04250847548246384, + "learning_rate": 0.00012654034835051084, + "loss": 0.2548, + "step": 21260 + }, + { + "epoch": 1.7223752430330523, + "grad_norm": 0.05477706715464592, + "learning_rate": 0.0001265358476979162, + "loss": 0.2956, + "step": 21261 + }, + { + "epoch": 1.722456254050551, + "grad_norm": 0.04699578508734703, + "learning_rate": 0.00012653134704532156, + "loss": 0.2689, + "step": 21262 + }, + { + "epoch": 1.7225372650680493, + "grad_norm": 0.048608992248773575, + "learning_rate": 0.00012652684639272697, + "loss": 0.2915, + "step": 21263 + }, + { + "epoch": 1.7226182760855475, + "grad_norm": 0.050241369754076004, + "learning_rate": 0.00012652234574013233, + "loss": 0.2958, + "step": 21264 + }, + { + "epoch": 1.722699287103046, + "grad_norm": 0.05378851667046547, + "learning_rate": 0.0001265178450875377, + "loss": 0.2629, + 
"step": 21265 + }, + { + "epoch": 1.7227802981205445, + "grad_norm": 0.05486956611275673, + "learning_rate": 0.00012651334443494308, + "loss": 0.253, + "step": 21266 + }, + { + "epoch": 1.7228613091380427, + "grad_norm": 0.04376472532749176, + "learning_rate": 0.00012650884378234844, + "loss": 0.2434, + "step": 21267 + }, + { + "epoch": 1.7229423201555412, + "grad_norm": 0.05031654238700867, + "learning_rate": 0.0001265043431297538, + "loss": 0.2712, + "step": 21268 + }, + { + "epoch": 1.7230233311730396, + "grad_norm": 0.05202542617917061, + "learning_rate": 0.0001264998424771592, + "loss": 0.3016, + "step": 21269 + }, + { + "epoch": 1.7231043421905379, + "grad_norm": 0.06132112443447113, + "learning_rate": 0.00012649534182456457, + "loss": 0.3207, + "step": 21270 + }, + { + "epoch": 1.7231853532080363, + "grad_norm": 0.0606006421148777, + "learning_rate": 0.00012649084117196993, + "loss": 0.2808, + "step": 21271 + }, + { + "epoch": 1.7232663642255348, + "grad_norm": 0.05827093496918678, + "learning_rate": 0.00012648634051937532, + "loss": 0.2759, + "step": 21272 + }, + { + "epoch": 1.723347375243033, + "grad_norm": 0.060442544519901276, + "learning_rate": 0.00012648183986678068, + "loss": 0.2757, + "step": 21273 + }, + { + "epoch": 1.7234283862605313, + "grad_norm": 0.055618658661842346, + "learning_rate": 0.00012647733921418604, + "loss": 0.2845, + "step": 21274 + }, + { + "epoch": 1.7235093972780298, + "grad_norm": 0.05738990753889084, + "learning_rate": 0.00012647283856159145, + "loss": 0.2523, + "step": 21275 + }, + { + "epoch": 1.7235904082955282, + "grad_norm": 0.061367228627204895, + "learning_rate": 0.00012646833790899681, + "loss": 0.2478, + "step": 21276 + }, + { + "epoch": 1.7236714193130265, + "grad_norm": 0.0630490705370903, + "learning_rate": 0.00012646383725640217, + "loss": 0.2989, + "step": 21277 + }, + { + "epoch": 1.723752430330525, + "grad_norm": 0.05338551476597786, + "learning_rate": 0.00012645933660380756, + "loss": 0.2698, + "step": 21278 + }, + { + "epoch": 1.7238334413480234, + "grad_norm": 0.04899115115404129, + "learning_rate": 0.00012645483595121292, + "loss": 0.2392, + "step": 21279 + }, + { + "epoch": 1.7239144523655217, + "grad_norm": 0.05391428619623184, + "learning_rate": 0.0001264503352986183, + "loss": 0.2887, + "step": 21280 + }, + { + "epoch": 1.7239954633830201, + "grad_norm": 0.0557735338807106, + "learning_rate": 0.0001264458346460237, + "loss": 0.2377, + "step": 21281 + }, + { + "epoch": 1.7240764744005186, + "grad_norm": 0.05717554688453674, + "learning_rate": 0.00012644133399342906, + "loss": 0.3062, + "step": 21282 + }, + { + "epoch": 1.7241574854180168, + "grad_norm": 0.04615462198853493, + "learning_rate": 0.00012643683334083442, + "loss": 0.2586, + "step": 21283 + }, + { + "epoch": 1.724238496435515, + "grad_norm": 0.04595242440700531, + "learning_rate": 0.0001264323326882398, + "loss": 0.2649, + "step": 21284 + }, + { + "epoch": 1.7243195074530138, + "grad_norm": 0.06683401763439178, + "learning_rate": 0.00012642783203564516, + "loss": 0.346, + "step": 21285 + }, + { + "epoch": 1.724400518470512, + "grad_norm": 0.061918869614601135, + "learning_rate": 0.00012642333138305055, + "loss": 0.26, + "step": 21286 + }, + { + "epoch": 1.7244815294880103, + "grad_norm": 0.04123949259519577, + "learning_rate": 0.00012641883073045594, + "loss": 0.254, + "step": 21287 + }, + { + "epoch": 1.7245625405055087, + "grad_norm": 0.058282338082790375, + "learning_rate": 0.0001264143300778613, + "loss": 0.2479, + "step": 21288 + }, + { + "epoch": 
1.7246435515230072, + "grad_norm": 0.06941943615674973, + "learning_rate": 0.00012640982942526666, + "loss": 0.2915, + "step": 21289 + }, + { + "epoch": 1.7247245625405054, + "grad_norm": 0.06298353523015976, + "learning_rate": 0.00012640532877267204, + "loss": 0.2921, + "step": 21290 + }, + { + "epoch": 1.724805573558004, + "grad_norm": 0.066420778632164, + "learning_rate": 0.0001264008281200774, + "loss": 0.3566, + "step": 21291 + }, + { + "epoch": 1.7248865845755024, + "grad_norm": 0.051810406148433685, + "learning_rate": 0.0001263963274674828, + "loss": 0.2673, + "step": 21292 + }, + { + "epoch": 1.7249675955930006, + "grad_norm": 0.06557074189186096, + "learning_rate": 0.00012639182681488818, + "loss": 0.3117, + "step": 21293 + }, + { + "epoch": 1.725048606610499, + "grad_norm": 0.05122154951095581, + "learning_rate": 0.00012638732616229354, + "loss": 0.2707, + "step": 21294 + }, + { + "epoch": 1.7251296176279975, + "grad_norm": 0.0532211996614933, + "learning_rate": 0.0001263828255096989, + "loss": 0.2401, + "step": 21295 + }, + { + "epoch": 1.7252106286454958, + "grad_norm": 0.053490567952394485, + "learning_rate": 0.00012637832485710429, + "loss": 0.3249, + "step": 21296 + }, + { + "epoch": 1.725291639662994, + "grad_norm": 0.05019410327076912, + "learning_rate": 0.00012637382420450965, + "loss": 0.2748, + "step": 21297 + }, + { + "epoch": 1.7253726506804925, + "grad_norm": 0.06262815743684769, + "learning_rate": 0.00012636932355191503, + "loss": 0.2599, + "step": 21298 + }, + { + "epoch": 1.725453661697991, + "grad_norm": 0.04839923605322838, + "learning_rate": 0.00012636482289932042, + "loss": 0.2763, + "step": 21299 + }, + { + "epoch": 1.7255346727154892, + "grad_norm": 0.05693989619612694, + "learning_rate": 0.00012636032224672578, + "loss": 0.2895, + "step": 21300 + }, + { + "epoch": 1.7256156837329877, + "grad_norm": 0.057713817805051804, + "learning_rate": 0.00012635582159413114, + "loss": 0.2793, + "step": 21301 + }, + { + "epoch": 1.7256966947504861, + "grad_norm": 0.05226074531674385, + "learning_rate": 0.00012635132094153653, + "loss": 0.257, + "step": 21302 + }, + { + "epoch": 1.7257777057679844, + "grad_norm": 0.0561804436147213, + "learning_rate": 0.0001263468202889419, + "loss": 0.2764, + "step": 21303 + }, + { + "epoch": 1.7258587167854829, + "grad_norm": 0.05232897773385048, + "learning_rate": 0.00012634231963634727, + "loss": 0.2765, + "step": 21304 + }, + { + "epoch": 1.7259397278029813, + "grad_norm": 0.05661951005458832, + "learning_rate": 0.00012633781898375266, + "loss": 0.3073, + "step": 21305 + }, + { + "epoch": 1.7260207388204796, + "grad_norm": 0.05507947504520416, + "learning_rate": 0.00012633331833115802, + "loss": 0.3042, + "step": 21306 + }, + { + "epoch": 1.7261017498379778, + "grad_norm": 0.049649450927972794, + "learning_rate": 0.00012632881767856338, + "loss": 0.2322, + "step": 21307 + }, + { + "epoch": 1.7261827608554765, + "grad_norm": 0.0585404708981514, + "learning_rate": 0.00012632431702596877, + "loss": 0.2929, + "step": 21308 + }, + { + "epoch": 1.7262637718729748, + "grad_norm": 0.05449943616986275, + "learning_rate": 0.00012631981637337416, + "loss": 0.2779, + "step": 21309 + }, + { + "epoch": 1.726344782890473, + "grad_norm": 0.05993398651480675, + "learning_rate": 0.00012631531572077952, + "loss": 0.2702, + "step": 21310 + }, + { + "epoch": 1.7264257939079715, + "grad_norm": 0.06984718143939972, + "learning_rate": 0.0001263108150681849, + "loss": 0.2983, + "step": 21311 + }, + { + "epoch": 1.72650680492547, + "grad_norm": 
0.0720304548740387, + "learning_rate": 0.00012630631441559026, + "loss": 0.3044, + "step": 21312 + }, + { + "epoch": 1.7265878159429682, + "grad_norm": 0.06032625213265419, + "learning_rate": 0.00012630181376299562, + "loss": 0.2737, + "step": 21313 + }, + { + "epoch": 1.7266688269604666, + "grad_norm": 0.056495506316423416, + "learning_rate": 0.000126297313110401, + "loss": 0.2768, + "step": 21314 + }, + { + "epoch": 1.726749837977965, + "grad_norm": 0.058696284890174866, + "learning_rate": 0.0001262928124578064, + "loss": 0.3179, + "step": 21315 + }, + { + "epoch": 1.7268308489954634, + "grad_norm": 0.050367239862680435, + "learning_rate": 0.00012628831180521176, + "loss": 0.2669, + "step": 21316 + }, + { + "epoch": 1.7269118600129616, + "grad_norm": 0.05638197809457779, + "learning_rate": 0.00012628381115261714, + "loss": 0.2807, + "step": 21317 + }, + { + "epoch": 1.7269928710304603, + "grad_norm": 0.048176951706409454, + "learning_rate": 0.0001262793105000225, + "loss": 0.2527, + "step": 21318 + }, + { + "epoch": 1.7270738820479585, + "grad_norm": 0.05604461580514908, + "learning_rate": 0.00012627480984742787, + "loss": 0.2796, + "step": 21319 + }, + { + "epoch": 1.7271548930654568, + "grad_norm": 0.06629718095064163, + "learning_rate": 0.00012627030919483325, + "loss": 0.2862, + "step": 21320 + }, + { + "epoch": 1.7272359040829552, + "grad_norm": 0.0682777464389801, + "learning_rate": 0.00012626580854223864, + "loss": 0.2772, + "step": 21321 + }, + { + "epoch": 1.7273169151004537, + "grad_norm": 0.060906145721673965, + "learning_rate": 0.000126261307889644, + "loss": 0.2877, + "step": 21322 + }, + { + "epoch": 1.727397926117952, + "grad_norm": 0.0503285676240921, + "learning_rate": 0.0001262568072370494, + "loss": 0.2496, + "step": 21323 + }, + { + "epoch": 1.7274789371354504, + "grad_norm": 0.05899946019053459, + "learning_rate": 0.00012625230658445475, + "loss": 0.2627, + "step": 21324 + }, + { + "epoch": 1.7275599481529489, + "grad_norm": 0.06631388515233994, + "learning_rate": 0.0001262478059318601, + "loss": 0.2691, + "step": 21325 + }, + { + "epoch": 1.7276409591704471, + "grad_norm": 0.06886113435029984, + "learning_rate": 0.0001262433052792655, + "loss": 0.2729, + "step": 21326 + }, + { + "epoch": 1.7277219701879456, + "grad_norm": 0.05362020060420036, + "learning_rate": 0.00012623880462667088, + "loss": 0.2615, + "step": 21327 + }, + { + "epoch": 1.727802981205444, + "grad_norm": 0.04796961322426796, + "learning_rate": 0.00012623430397407624, + "loss": 0.2526, + "step": 21328 + }, + { + "epoch": 1.7278839922229423, + "grad_norm": 0.04980582743883133, + "learning_rate": 0.00012622980332148163, + "loss": 0.283, + "step": 21329 + }, + { + "epoch": 1.7279650032404406, + "grad_norm": 0.06693169474601746, + "learning_rate": 0.000126225302668887, + "loss": 0.2906, + "step": 21330 + }, + { + "epoch": 1.7280460142579392, + "grad_norm": 0.06786801666021347, + "learning_rate": 0.00012622080201629235, + "loss": 0.2715, + "step": 21331 + }, + { + "epoch": 1.7281270252754375, + "grad_norm": 0.051919642835855484, + "learning_rate": 0.00012621630136369776, + "loss": 0.2674, + "step": 21332 + }, + { + "epoch": 1.7282080362929357, + "grad_norm": 0.06100764498114586, + "learning_rate": 0.00012621180071110312, + "loss": 0.251, + "step": 21333 + }, + { + "epoch": 1.7282890473104342, + "grad_norm": 0.060728829354047775, + "learning_rate": 0.00012620730005850848, + "loss": 0.2945, + "step": 21334 + }, + { + "epoch": 1.7283700583279327, + "grad_norm": 0.078863225877285, + "learning_rate": 
0.00012620279940591387, + "loss": 0.3004, + "step": 21335 + }, + { + "epoch": 1.728451069345431, + "grad_norm": 0.06468357890844345, + "learning_rate": 0.00012619829875331923, + "loss": 0.2717, + "step": 21336 + }, + { + "epoch": 1.7285320803629294, + "grad_norm": 0.05709939822554588, + "learning_rate": 0.0001261937981007246, + "loss": 0.2574, + "step": 21337 + }, + { + "epoch": 1.7286130913804278, + "grad_norm": 0.058976322412490845, + "learning_rate": 0.00012618929744813, + "loss": 0.272, + "step": 21338 + }, + { + "epoch": 1.728694102397926, + "grad_norm": 0.04370775446295738, + "learning_rate": 0.00012618479679553536, + "loss": 0.2272, + "step": 21339 + }, + { + "epoch": 1.7287751134154243, + "grad_norm": 0.05771368741989136, + "learning_rate": 0.00012618029614294072, + "loss": 0.2465, + "step": 21340 + }, + { + "epoch": 1.728856124432923, + "grad_norm": 0.04556897655129433, + "learning_rate": 0.0001261757954903461, + "loss": 0.287, + "step": 21341 + }, + { + "epoch": 1.7289371354504213, + "grad_norm": 0.05363953858613968, + "learning_rate": 0.00012617129483775147, + "loss": 0.2829, + "step": 21342 + }, + { + "epoch": 1.7290181464679195, + "grad_norm": 0.05831297114491463, + "learning_rate": 0.00012616679418515683, + "loss": 0.2739, + "step": 21343 + }, + { + "epoch": 1.729099157485418, + "grad_norm": 0.063968226313591, + "learning_rate": 0.00012616229353256225, + "loss": 0.2947, + "step": 21344 + }, + { + "epoch": 1.7291801685029164, + "grad_norm": 0.05874845013022423, + "learning_rate": 0.0001261577928799676, + "loss": 0.2953, + "step": 21345 + }, + { + "epoch": 1.7292611795204147, + "grad_norm": 0.05044439807534218, + "learning_rate": 0.00012615329222737297, + "loss": 0.3164, + "step": 21346 + }, + { + "epoch": 1.7293421905379132, + "grad_norm": 0.06613511592149734, + "learning_rate": 0.00012614879157477835, + "loss": 0.3009, + "step": 21347 + }, + { + "epoch": 1.7294232015554116, + "grad_norm": 0.05768156796693802, + "learning_rate": 0.0001261442909221837, + "loss": 0.3061, + "step": 21348 + }, + { + "epoch": 1.7295042125729099, + "grad_norm": 0.051086124032735825, + "learning_rate": 0.00012613979026958907, + "loss": 0.2436, + "step": 21349 + }, + { + "epoch": 1.7295852235904083, + "grad_norm": 0.05139658600091934, + "learning_rate": 0.0001261352896169945, + "loss": 0.2624, + "step": 21350 + }, + { + "epoch": 1.7296662346079068, + "grad_norm": 0.05288831517100334, + "learning_rate": 0.00012613078896439985, + "loss": 0.2851, + "step": 21351 + }, + { + "epoch": 1.729747245625405, + "grad_norm": 0.054695550352334976, + "learning_rate": 0.0001261262883118052, + "loss": 0.2491, + "step": 21352 + }, + { + "epoch": 1.7298282566429033, + "grad_norm": 0.055590204894542694, + "learning_rate": 0.0001261217876592106, + "loss": 0.2727, + "step": 21353 + }, + { + "epoch": 1.7299092676604018, + "grad_norm": 0.052829891443252563, + "learning_rate": 0.00012611728700661595, + "loss": 0.3052, + "step": 21354 + }, + { + "epoch": 1.7299902786779002, + "grad_norm": 0.06215556710958481, + "learning_rate": 0.00012611278635402131, + "loss": 0.2647, + "step": 21355 + }, + { + "epoch": 1.7300712896953985, + "grad_norm": 0.05681098252534866, + "learning_rate": 0.00012610828570142673, + "loss": 0.291, + "step": 21356 + }, + { + "epoch": 1.730152300712897, + "grad_norm": 0.06204934045672417, + "learning_rate": 0.0001261037850488321, + "loss": 0.304, + "step": 21357 + }, + { + "epoch": 1.7302333117303954, + "grad_norm": 0.05352215841412544, + "learning_rate": 0.00012609928439623745, + "loss": 0.2687, + "step": 
21358 + }, + { + "epoch": 1.7303143227478937, + "grad_norm": 0.048397473990917206, + "learning_rate": 0.00012609478374364284, + "loss": 0.2847, + "step": 21359 + }, + { + "epoch": 1.7303953337653921, + "grad_norm": 0.055357180535793304, + "learning_rate": 0.0001260902830910482, + "loss": 0.297, + "step": 21360 + }, + { + "epoch": 1.7304763447828906, + "grad_norm": 0.05453452840447426, + "learning_rate": 0.00012608578243845358, + "loss": 0.3122, + "step": 21361 + }, + { + "epoch": 1.7305573558003888, + "grad_norm": 0.05287787318229675, + "learning_rate": 0.00012608128178585897, + "loss": 0.2694, + "step": 21362 + }, + { + "epoch": 1.730638366817887, + "grad_norm": 0.045254334807395935, + "learning_rate": 0.00012607678113326433, + "loss": 0.2705, + "step": 21363 + }, + { + "epoch": 1.7307193778353858, + "grad_norm": 0.05910949781537056, + "learning_rate": 0.0001260722804806697, + "loss": 0.2813, + "step": 21364 + }, + { + "epoch": 1.730800388852884, + "grad_norm": 0.04513555392622948, + "learning_rate": 0.00012606777982807508, + "loss": 0.2551, + "step": 21365 + }, + { + "epoch": 1.7308813998703823, + "grad_norm": 0.047856781631708145, + "learning_rate": 0.00012606327917548044, + "loss": 0.245, + "step": 21366 + }, + { + "epoch": 1.7309624108878807, + "grad_norm": 0.06202912703156471, + "learning_rate": 0.00012605877852288583, + "loss": 0.2554, + "step": 21367 + }, + { + "epoch": 1.7310434219053792, + "grad_norm": 0.06017141789197922, + "learning_rate": 0.0001260542778702912, + "loss": 0.2724, + "step": 21368 + }, + { + "epoch": 1.7311244329228774, + "grad_norm": 0.049296289682388306, + "learning_rate": 0.00012604977721769657, + "loss": 0.2767, + "step": 21369 + }, + { + "epoch": 1.731205443940376, + "grad_norm": 0.05470065027475357, + "learning_rate": 0.00012604527656510193, + "loss": 0.2262, + "step": 21370 + }, + { + "epoch": 1.7312864549578744, + "grad_norm": 0.05836993455886841, + "learning_rate": 0.00012604077591250732, + "loss": 0.2958, + "step": 21371 + }, + { + "epoch": 1.7313674659753726, + "grad_norm": 0.0632636696100235, + "learning_rate": 0.00012603627525991268, + "loss": 0.228, + "step": 21372 + }, + { + "epoch": 1.731448476992871, + "grad_norm": 0.05101475864648819, + "learning_rate": 0.00012603177460731807, + "loss": 0.2638, + "step": 21373 + }, + { + "epoch": 1.7315294880103695, + "grad_norm": 0.06122167408466339, + "learning_rate": 0.00012602727395472345, + "loss": 0.3064, + "step": 21374 + }, + { + "epoch": 1.7316104990278678, + "grad_norm": 0.06043681502342224, + "learning_rate": 0.00012602277330212881, + "loss": 0.3018, + "step": 21375 + }, + { + "epoch": 1.731691510045366, + "grad_norm": 0.05266754329204559, + "learning_rate": 0.00012601827264953417, + "loss": 0.2605, + "step": 21376 + }, + { + "epoch": 1.7317725210628645, + "grad_norm": 0.06432856619358063, + "learning_rate": 0.00012601377199693956, + "loss": 0.305, + "step": 21377 + }, + { + "epoch": 1.731853532080363, + "grad_norm": 0.05559670925140381, + "learning_rate": 0.00012600927134434492, + "loss": 0.2968, + "step": 21378 + }, + { + "epoch": 1.7319345430978612, + "grad_norm": 0.05518518015742302, + "learning_rate": 0.0001260047706917503, + "loss": 0.2731, + "step": 21379 + }, + { + "epoch": 1.7320155541153597, + "grad_norm": 0.06477613002061844, + "learning_rate": 0.0001260002700391557, + "loss": 0.2575, + "step": 21380 + }, + { + "epoch": 1.7320965651328581, + "grad_norm": 0.0499190092086792, + "learning_rate": 0.00012599576938656106, + "loss": 0.2738, + "step": 21381 + }, + { + "epoch": 1.7321775761503564, + 
"grad_norm": 0.048186007887125015, + "learning_rate": 0.00012599126873396642, + "loss": 0.2514, + "step": 21382 + }, + { + "epoch": 1.7322585871678549, + "grad_norm": 0.047534551471471786, + "learning_rate": 0.0001259867680813718, + "loss": 0.279, + "step": 21383 + }, + { + "epoch": 1.7323395981853533, + "grad_norm": 0.053618431091308594, + "learning_rate": 0.0001259822674287772, + "loss": 0.2716, + "step": 21384 + }, + { + "epoch": 1.7324206092028516, + "grad_norm": 0.051217447966337204, + "learning_rate": 0.00012597776677618255, + "loss": 0.2633, + "step": 21385 + }, + { + "epoch": 1.7325016202203498, + "grad_norm": 0.05521465465426445, + "learning_rate": 0.00012597326612358794, + "loss": 0.2639, + "step": 21386 + }, + { + "epoch": 1.7325826312378485, + "grad_norm": 0.04279023036360741, + "learning_rate": 0.0001259687654709933, + "loss": 0.2439, + "step": 21387 + }, + { + "epoch": 1.7326636422553467, + "grad_norm": 0.057732559740543365, + "learning_rate": 0.00012596426481839866, + "loss": 0.2406, + "step": 21388 + }, + { + "epoch": 1.732744653272845, + "grad_norm": 0.06207024306058884, + "learning_rate": 0.00012595976416580404, + "loss": 0.2888, + "step": 21389 + }, + { + "epoch": 1.7328256642903435, + "grad_norm": 0.05113453418016434, + "learning_rate": 0.00012595526351320943, + "loss": 0.2911, + "step": 21390 + }, + { + "epoch": 1.732906675307842, + "grad_norm": 0.050154559314250946, + "learning_rate": 0.0001259507628606148, + "loss": 0.2753, + "step": 21391 + }, + { + "epoch": 1.7329876863253402, + "grad_norm": 0.06298622488975525, + "learning_rate": 0.00012594626220802018, + "loss": 0.2862, + "step": 21392 + }, + { + "epoch": 1.7330686973428386, + "grad_norm": 0.047300562262535095, + "learning_rate": 0.00012594176155542554, + "loss": 0.2832, + "step": 21393 + }, + { + "epoch": 1.733149708360337, + "grad_norm": 0.0653621256351471, + "learning_rate": 0.0001259372609028309, + "loss": 0.2837, + "step": 21394 + }, + { + "epoch": 1.7332307193778353, + "grad_norm": 0.05709261819720268, + "learning_rate": 0.00012593276025023629, + "loss": 0.2791, + "step": 21395 + }, + { + "epoch": 1.7333117303953338, + "grad_norm": 0.0497063584625721, + "learning_rate": 0.00012592825959764167, + "loss": 0.2478, + "step": 21396 + }, + { + "epoch": 1.7333927414128323, + "grad_norm": 0.0732504278421402, + "learning_rate": 0.00012592375894504703, + "loss": 0.2567, + "step": 21397 + }, + { + "epoch": 1.7334737524303305, + "grad_norm": 0.06695054471492767, + "learning_rate": 0.00012591925829245242, + "loss": 0.2997, + "step": 21398 + }, + { + "epoch": 1.7335547634478288, + "grad_norm": 0.06759504228830338, + "learning_rate": 0.00012591475763985778, + "loss": 0.3161, + "step": 21399 + }, + { + "epoch": 1.7336357744653272, + "grad_norm": 0.05656924098730087, + "learning_rate": 0.00012591025698726314, + "loss": 0.2954, + "step": 21400 + }, + { + "epoch": 1.7337167854828257, + "grad_norm": 0.06173291429877281, + "learning_rate": 0.00012590575633466853, + "loss": 0.3053, + "step": 21401 + }, + { + "epoch": 1.733797796500324, + "grad_norm": 0.056654833257198334, + "learning_rate": 0.00012590125568207391, + "loss": 0.2904, + "step": 21402 + }, + { + "epoch": 1.7338788075178224, + "grad_norm": 0.050510480999946594, + "learning_rate": 0.00012589675502947927, + "loss": 0.2386, + "step": 21403 + }, + { + "epoch": 1.7339598185353209, + "grad_norm": 0.05793347209692001, + "learning_rate": 0.00012589225437688466, + "loss": 0.2795, + "step": 21404 + }, + { + "epoch": 1.7340408295528191, + "grad_norm": 0.05403798446059227, + 
"learning_rate": 0.00012588775372429002, + "loss": 0.2917, + "step": 21405 + }, + { + "epoch": 1.7341218405703176, + "grad_norm": 0.05910486355423927, + "learning_rate": 0.00012588325307169538, + "loss": 0.3166, + "step": 21406 + }, + { + "epoch": 1.734202851587816, + "grad_norm": 0.05428627133369446, + "learning_rate": 0.00012587875241910077, + "loss": 0.2969, + "step": 21407 + }, + { + "epoch": 1.7342838626053143, + "grad_norm": 0.0675114244222641, + "learning_rate": 0.00012587425176650616, + "loss": 0.3124, + "step": 21408 + }, + { + "epoch": 1.7343648736228126, + "grad_norm": 0.060393840074539185, + "learning_rate": 0.00012586975111391152, + "loss": 0.3345, + "step": 21409 + }, + { + "epoch": 1.7344458846403112, + "grad_norm": 0.05710717663168907, + "learning_rate": 0.0001258652504613169, + "loss": 0.2679, + "step": 21410 + }, + { + "epoch": 1.7345268956578095, + "grad_norm": 0.05533914640545845, + "learning_rate": 0.00012586074980872226, + "loss": 0.3104, + "step": 21411 + }, + { + "epoch": 1.7346079066753077, + "grad_norm": 0.057103849947452545, + "learning_rate": 0.00012585624915612762, + "loss": 0.3109, + "step": 21412 + }, + { + "epoch": 1.7346889176928062, + "grad_norm": 0.04883692413568497, + "learning_rate": 0.00012585174850353304, + "loss": 0.2732, + "step": 21413 + }, + { + "epoch": 1.7347699287103047, + "grad_norm": 0.056350503116846085, + "learning_rate": 0.0001258472478509384, + "loss": 0.3007, + "step": 21414 + }, + { + "epoch": 1.734850939727803, + "grad_norm": 0.05377821624279022, + "learning_rate": 0.00012584274719834376, + "loss": 0.2833, + "step": 21415 + }, + { + "epoch": 1.7349319507453014, + "grad_norm": 0.06554847955703735, + "learning_rate": 0.00012583824654574915, + "loss": 0.3022, + "step": 21416 + }, + { + "epoch": 1.7350129617627998, + "grad_norm": 0.054987650364637375, + "learning_rate": 0.0001258337458931545, + "loss": 0.2718, + "step": 21417 + }, + { + "epoch": 1.735093972780298, + "grad_norm": 0.046197839081287384, + "learning_rate": 0.00012582924524055987, + "loss": 0.2457, + "step": 21418 + }, + { + "epoch": 1.7351749837977966, + "grad_norm": 0.055975694209337234, + "learning_rate": 0.00012582474458796528, + "loss": 0.2791, + "step": 21419 + }, + { + "epoch": 1.735255994815295, + "grad_norm": 0.06019577011466026, + "learning_rate": 0.00012582024393537064, + "loss": 0.3089, + "step": 21420 + }, + { + "epoch": 1.7353370058327933, + "grad_norm": 0.05753401666879654, + "learning_rate": 0.000125815743282776, + "loss": 0.2629, + "step": 21421 + }, + { + "epoch": 1.7354180168502915, + "grad_norm": 0.05200933665037155, + "learning_rate": 0.0001258112426301814, + "loss": 0.2343, + "step": 21422 + }, + { + "epoch": 1.73549902786779, + "grad_norm": 0.05833006277680397, + "learning_rate": 0.00012580674197758675, + "loss": 0.2695, + "step": 21423 + }, + { + "epoch": 1.7355800388852884, + "grad_norm": 0.05324200913310051, + "learning_rate": 0.00012580224132499213, + "loss": 0.2712, + "step": 21424 + }, + { + "epoch": 1.7356610499027867, + "grad_norm": 0.05839523673057556, + "learning_rate": 0.00012579774067239752, + "loss": 0.2812, + "step": 21425 + }, + { + "epoch": 1.7357420609202852, + "grad_norm": 0.04021824151277542, + "learning_rate": 0.00012579324001980288, + "loss": 0.2395, + "step": 21426 + }, + { + "epoch": 1.7358230719377836, + "grad_norm": 0.07867847383022308, + "learning_rate": 0.00012578873936720824, + "loss": 0.2878, + "step": 21427 + }, + { + "epoch": 1.7359040829552819, + "grad_norm": 0.05782991647720337, + "learning_rate": 0.00012578423871461363, + 
"loss": 0.2531, + "step": 21428 + }, + { + "epoch": 1.7359850939727803, + "grad_norm": 0.0577692985534668, + "learning_rate": 0.000125779738062019, + "loss": 0.2641, + "step": 21429 + }, + { + "epoch": 1.7360661049902788, + "grad_norm": 0.05876109004020691, + "learning_rate": 0.00012577523740942438, + "loss": 0.2589, + "step": 21430 + }, + { + "epoch": 1.736147116007777, + "grad_norm": 0.05080138146877289, + "learning_rate": 0.00012577073675682976, + "loss": 0.2635, + "step": 21431 + }, + { + "epoch": 1.7362281270252753, + "grad_norm": 0.06783095002174377, + "learning_rate": 0.00012576623610423512, + "loss": 0.2848, + "step": 21432 + }, + { + "epoch": 1.736309138042774, + "grad_norm": 0.059699445962905884, + "learning_rate": 0.00012576173545164048, + "loss": 0.3167, + "step": 21433 + }, + { + "epoch": 1.7363901490602722, + "grad_norm": 0.060464613139629364, + "learning_rate": 0.00012575723479904587, + "loss": 0.2712, + "step": 21434 + }, + { + "epoch": 1.7364711600777705, + "grad_norm": 0.05330680310726166, + "learning_rate": 0.00012575273414645123, + "loss": 0.2919, + "step": 21435 + }, + { + "epoch": 1.736552171095269, + "grad_norm": 0.04967203736305237, + "learning_rate": 0.00012574823349385662, + "loss": 0.2596, + "step": 21436 + }, + { + "epoch": 1.7366331821127674, + "grad_norm": 0.054990921169519424, + "learning_rate": 0.000125743732841262, + "loss": 0.2646, + "step": 21437 + }, + { + "epoch": 1.7367141931302656, + "grad_norm": 0.052907802164554596, + "learning_rate": 0.00012573923218866736, + "loss": 0.2712, + "step": 21438 + }, + { + "epoch": 1.7367952041477641, + "grad_norm": 0.05345708131790161, + "learning_rate": 0.00012573473153607272, + "loss": 0.2591, + "step": 21439 + }, + { + "epoch": 1.7368762151652626, + "grad_norm": 0.0520918108522892, + "learning_rate": 0.0001257302308834781, + "loss": 0.2642, + "step": 21440 + }, + { + "epoch": 1.7369572261827608, + "grad_norm": 0.056074172258377075, + "learning_rate": 0.00012572573023088347, + "loss": 0.2667, + "step": 21441 + }, + { + "epoch": 1.737038237200259, + "grad_norm": 0.046990297734737396, + "learning_rate": 0.00012572122957828886, + "loss": 0.2657, + "step": 21442 + }, + { + "epoch": 1.7371192482177578, + "grad_norm": 0.056683171540498734, + "learning_rate": 0.00012571672892569425, + "loss": 0.2743, + "step": 21443 + }, + { + "epoch": 1.737200259235256, + "grad_norm": 0.048928774893283844, + "learning_rate": 0.0001257122282730996, + "loss": 0.274, + "step": 21444 + }, + { + "epoch": 1.7372812702527543, + "grad_norm": 0.055514540523290634, + "learning_rate": 0.00012570772762050497, + "loss": 0.2615, + "step": 21445 + }, + { + "epoch": 1.7373622812702527, + "grad_norm": 0.06316448748111725, + "learning_rate": 0.00012570322696791035, + "loss": 0.3063, + "step": 21446 + }, + { + "epoch": 1.7374432922877512, + "grad_norm": 0.054943282157182693, + "learning_rate": 0.0001256987263153157, + "loss": 0.2994, + "step": 21447 + }, + { + "epoch": 1.7375243033052494, + "grad_norm": 0.05165160447359085, + "learning_rate": 0.0001256942256627211, + "loss": 0.2552, + "step": 21448 + }, + { + "epoch": 1.737605314322748, + "grad_norm": 0.050460558384656906, + "learning_rate": 0.0001256897250101265, + "loss": 0.2646, + "step": 21449 + }, + { + "epoch": 1.7376863253402464, + "grad_norm": 0.05472885072231293, + "learning_rate": 0.00012568522435753185, + "loss": 0.285, + "step": 21450 + }, + { + "epoch": 1.7377673363577446, + "grad_norm": 0.06435894221067429, + "learning_rate": 0.0001256807237049372, + "loss": 0.2957, + "step": 21451 + }, + { + 
"epoch": 1.737848347375243, + "grad_norm": 0.060838378965854645, + "learning_rate": 0.0001256762230523426, + "loss": 0.2639, + "step": 21452 + }, + { + "epoch": 1.7379293583927415, + "grad_norm": 0.05674123764038086, + "learning_rate": 0.00012567172239974796, + "loss": 0.309, + "step": 21453 + }, + { + "epoch": 1.7380103694102398, + "grad_norm": 0.06330017745494843, + "learning_rate": 0.00012566722174715334, + "loss": 0.2949, + "step": 21454 + }, + { + "epoch": 1.738091380427738, + "grad_norm": 0.05393190309405327, + "learning_rate": 0.00012566272109455873, + "loss": 0.2424, + "step": 21455 + }, + { + "epoch": 1.7381723914452365, + "grad_norm": 0.042466968297958374, + "learning_rate": 0.0001256582204419641, + "loss": 0.2584, + "step": 21456 + }, + { + "epoch": 1.738253402462735, + "grad_norm": 0.05222369357943535, + "learning_rate": 0.00012565371978936945, + "loss": 0.2665, + "step": 21457 + }, + { + "epoch": 1.7383344134802332, + "grad_norm": 0.04583865404129028, + "learning_rate": 0.00012564921913677484, + "loss": 0.2534, + "step": 21458 + }, + { + "epoch": 1.7384154244977317, + "grad_norm": 0.05196063220500946, + "learning_rate": 0.0001256447184841802, + "loss": 0.2622, + "step": 21459 + }, + { + "epoch": 1.7384964355152301, + "grad_norm": 0.055964767932891846, + "learning_rate": 0.00012564021783158558, + "loss": 0.2734, + "step": 21460 + }, + { + "epoch": 1.7385774465327284, + "grad_norm": 0.0522189661860466, + "learning_rate": 0.00012563571717899097, + "loss": 0.2734, + "step": 21461 + }, + { + "epoch": 1.7386584575502269, + "grad_norm": 0.05658310279250145, + "learning_rate": 0.00012563121652639633, + "loss": 0.2604, + "step": 21462 + }, + { + "epoch": 1.7387394685677253, + "grad_norm": 0.05104950815439224, + "learning_rate": 0.0001256267158738017, + "loss": 0.296, + "step": 21463 + }, + { + "epoch": 1.7388204795852236, + "grad_norm": 0.0631055012345314, + "learning_rate": 0.00012562221522120708, + "loss": 0.2642, + "step": 21464 + }, + { + "epoch": 1.7389014906027218, + "grad_norm": 0.05673608556389809, + "learning_rate": 0.00012561771456861247, + "loss": 0.302, + "step": 21465 + }, + { + "epoch": 1.7389825016202205, + "grad_norm": 0.04988215118646622, + "learning_rate": 0.00012561321391601783, + "loss": 0.256, + "step": 21466 + }, + { + "epoch": 1.7390635126377187, + "grad_norm": 0.053265515714883804, + "learning_rate": 0.0001256087132634232, + "loss": 0.2588, + "step": 21467 + }, + { + "epoch": 1.739144523655217, + "grad_norm": 0.04829196259379387, + "learning_rate": 0.00012560421261082857, + "loss": 0.2508, + "step": 21468 + }, + { + "epoch": 1.7392255346727155, + "grad_norm": 0.06507780402898788, + "learning_rate": 0.00012559971195823393, + "loss": 0.3162, + "step": 21469 + }, + { + "epoch": 1.739306545690214, + "grad_norm": 0.05773133784532547, + "learning_rate": 0.00012559521130563932, + "loss": 0.2679, + "step": 21470 + }, + { + "epoch": 1.7393875567077122, + "grad_norm": 0.055706221610307693, + "learning_rate": 0.0001255907106530447, + "loss": 0.2997, + "step": 21471 + }, + { + "epoch": 1.7394685677252106, + "grad_norm": 0.05406045913696289, + "learning_rate": 0.00012558621000045007, + "loss": 0.2501, + "step": 21472 + }, + { + "epoch": 1.739549578742709, + "grad_norm": 0.05830344930291176, + "learning_rate": 0.00012558170934785545, + "loss": 0.3011, + "step": 21473 + }, + { + "epoch": 1.7396305897602073, + "grad_norm": 0.05689018964767456, + "learning_rate": 0.00012557720869526081, + "loss": 0.2535, + "step": 21474 + }, + { + "epoch": 1.7397116007777058, + "grad_norm": 
0.05018286034464836, + "learning_rate": 0.00012557270804266617, + "loss": 0.2671, + "step": 21475 + }, + { + "epoch": 1.7397926117952043, + "grad_norm": 0.052559975534677505, + "learning_rate": 0.00012556820739007156, + "loss": 0.2961, + "step": 21476 + }, + { + "epoch": 1.7398736228127025, + "grad_norm": 0.06947226077318192, + "learning_rate": 0.00012556370673747695, + "loss": 0.3438, + "step": 21477 + }, + { + "epoch": 1.7399546338302008, + "grad_norm": 0.05610761046409607, + "learning_rate": 0.0001255592060848823, + "loss": 0.2982, + "step": 21478 + }, + { + "epoch": 1.7400356448476992, + "grad_norm": 0.060067109763622284, + "learning_rate": 0.0001255547054322877, + "loss": 0.2656, + "step": 21479 + }, + { + "epoch": 1.7401166558651977, + "grad_norm": 0.05526788532733917, + "learning_rate": 0.00012555020477969306, + "loss": 0.2666, + "step": 21480 + }, + { + "epoch": 1.740197666882696, + "grad_norm": 0.05276999995112419, + "learning_rate": 0.00012554570412709842, + "loss": 0.2765, + "step": 21481 + }, + { + "epoch": 1.7402786779001944, + "grad_norm": 0.054623451083898544, + "learning_rate": 0.0001255412034745038, + "loss": 0.2761, + "step": 21482 + }, + { + "epoch": 1.7403596889176929, + "grad_norm": 0.04773684963583946, + "learning_rate": 0.0001255367028219092, + "loss": 0.2923, + "step": 21483 + }, + { + "epoch": 1.7404406999351911, + "grad_norm": 0.053866446018218994, + "learning_rate": 0.00012553220216931455, + "loss": 0.2435, + "step": 21484 + }, + { + "epoch": 1.7405217109526896, + "grad_norm": 0.049850448966026306, + "learning_rate": 0.00012552770151671994, + "loss": 0.2933, + "step": 21485 + }, + { + "epoch": 1.740602721970188, + "grad_norm": 0.059759512543678284, + "learning_rate": 0.0001255232008641253, + "loss": 0.2967, + "step": 21486 + }, + { + "epoch": 1.7406837329876863, + "grad_norm": 0.04076186940073967, + "learning_rate": 0.00012551870021153066, + "loss": 0.2648, + "step": 21487 + }, + { + "epoch": 1.7407647440051845, + "grad_norm": 0.05791622772812843, + "learning_rate": 0.00012551419955893604, + "loss": 0.2974, + "step": 21488 + }, + { + "epoch": 1.7408457550226832, + "grad_norm": 0.05344267562031746, + "learning_rate": 0.00012550969890634143, + "loss": 0.2804, + "step": 21489 + }, + { + "epoch": 1.7409267660401815, + "grad_norm": 0.05635475367307663, + "learning_rate": 0.0001255051982537468, + "loss": 0.3058, + "step": 21490 + }, + { + "epoch": 1.7410077770576797, + "grad_norm": 0.07314221560955048, + "learning_rate": 0.00012550069760115218, + "loss": 0.3249, + "step": 21491 + }, + { + "epoch": 1.7410887880751782, + "grad_norm": 0.05568360164761543, + "learning_rate": 0.00012549619694855754, + "loss": 0.2766, + "step": 21492 + }, + { + "epoch": 1.7411697990926767, + "grad_norm": 0.04688483104109764, + "learning_rate": 0.00012549169629596293, + "loss": 0.2597, + "step": 21493 + }, + { + "epoch": 1.741250810110175, + "grad_norm": 0.047986626625061035, + "learning_rate": 0.0001254871956433683, + "loss": 0.2538, + "step": 21494 + }, + { + "epoch": 1.7413318211276734, + "grad_norm": 0.05898340418934822, + "learning_rate": 0.00012548269499077367, + "loss": 0.3109, + "step": 21495 + }, + { + "epoch": 1.7414128321451718, + "grad_norm": 0.062086861580610275, + "learning_rate": 0.00012547819433817903, + "loss": 0.2648, + "step": 21496 + }, + { + "epoch": 1.74149384316267, + "grad_norm": 0.05001772567629814, + "learning_rate": 0.00012547369368558442, + "loss": 0.2583, + "step": 21497 + }, + { + "epoch": 1.7415748541801686, + "grad_norm": 0.0589098185300827, + "learning_rate": 
0.00012546919303298978, + "loss": 0.2771, + "step": 21498 + }, + { + "epoch": 1.741655865197667, + "grad_norm": 0.06397674232721329, + "learning_rate": 0.00012546469238039517, + "loss": 0.2931, + "step": 21499 + }, + { + "epoch": 1.7417368762151653, + "grad_norm": 0.059052225202322006, + "learning_rate": 0.00012546019172780056, + "loss": 0.2862, + "step": 21500 + }, + { + "epoch": 1.7418178872326635, + "grad_norm": 0.05825584754347801, + "learning_rate": 0.00012545569107520592, + "loss": 0.3101, + "step": 21501 + }, + { + "epoch": 1.741898898250162, + "grad_norm": 0.05482599884271622, + "learning_rate": 0.00012545119042261128, + "loss": 0.2734, + "step": 21502 + }, + { + "epoch": 1.7419799092676604, + "grad_norm": 0.06473010033369064, + "learning_rate": 0.00012544668977001666, + "loss": 0.2913, + "step": 21503 + }, + { + "epoch": 1.7420609202851587, + "grad_norm": 0.05840296298265457, + "learning_rate": 0.00012544218911742202, + "loss": 0.2555, + "step": 21504 + }, + { + "epoch": 1.7421419313026572, + "grad_norm": 0.06706037372350693, + "learning_rate": 0.0001254376884648274, + "loss": 0.2609, + "step": 21505 + }, + { + "epoch": 1.7422229423201556, + "grad_norm": 0.052381958812475204, + "learning_rate": 0.0001254331878122328, + "loss": 0.2467, + "step": 21506 + }, + { + "epoch": 1.7423039533376539, + "grad_norm": 0.04638232663273811, + "learning_rate": 0.00012542868715963816, + "loss": 0.2378, + "step": 21507 + }, + { + "epoch": 1.7423849643551523, + "grad_norm": 0.051347095519304276, + "learning_rate": 0.00012542418650704352, + "loss": 0.2355, + "step": 21508 + }, + { + "epoch": 1.7424659753726508, + "grad_norm": 0.06403027474880219, + "learning_rate": 0.0001254196858544489, + "loss": 0.3214, + "step": 21509 + }, + { + "epoch": 1.742546986390149, + "grad_norm": 0.049564070999622345, + "learning_rate": 0.00012541518520185426, + "loss": 0.2531, + "step": 21510 + }, + { + "epoch": 1.7426279974076473, + "grad_norm": 0.06536975502967834, + "learning_rate": 0.00012541068454925965, + "loss": 0.3425, + "step": 21511 + }, + { + "epoch": 1.742709008425146, + "grad_norm": 0.048163220286369324, + "learning_rate": 0.00012540618389666504, + "loss": 0.2339, + "step": 21512 + }, + { + "epoch": 1.7427900194426442, + "grad_norm": 0.049840670078992844, + "learning_rate": 0.0001254016832440704, + "loss": 0.2806, + "step": 21513 + }, + { + "epoch": 1.7428710304601425, + "grad_norm": 0.0577770434319973, + "learning_rate": 0.00012539718259147576, + "loss": 0.2912, + "step": 21514 + }, + { + "epoch": 1.742952041477641, + "grad_norm": 0.05158194527029991, + "learning_rate": 0.00012539268193888115, + "loss": 0.2386, + "step": 21515 + }, + { + "epoch": 1.7430330524951394, + "grad_norm": 0.0499204657971859, + "learning_rate": 0.0001253881812862865, + "loss": 0.251, + "step": 21516 + }, + { + "epoch": 1.7431140635126376, + "grad_norm": 0.048412248492240906, + "learning_rate": 0.0001253836806336919, + "loss": 0.2594, + "step": 21517 + }, + { + "epoch": 1.7431950745301361, + "grad_norm": 0.04767746105790138, + "learning_rate": 0.00012537917998109728, + "loss": 0.2352, + "step": 21518 + }, + { + "epoch": 1.7432760855476346, + "grad_norm": 0.04811496660113335, + "learning_rate": 0.00012537467932850264, + "loss": 0.2538, + "step": 21519 + }, + { + "epoch": 1.7433570965651328, + "grad_norm": 0.054618820548057556, + "learning_rate": 0.000125370178675908, + "loss": 0.2469, + "step": 21520 + }, + { + "epoch": 1.7434381075826313, + "grad_norm": 0.055555637925863266, + "learning_rate": 0.0001253656780233134, + "loss": 0.2955, + 
"step": 21521 + }, + { + "epoch": 1.7435191186001298, + "grad_norm": 0.04947299137711525, + "learning_rate": 0.00012536117737071875, + "loss": 0.27, + "step": 21522 + }, + { + "epoch": 1.743600129617628, + "grad_norm": 0.05181947723031044, + "learning_rate": 0.00012535667671812413, + "loss": 0.2476, + "step": 21523 + }, + { + "epoch": 1.7436811406351262, + "grad_norm": 0.055776968598365784, + "learning_rate": 0.00012535217606552952, + "loss": 0.2784, + "step": 21524 + }, + { + "epoch": 1.7437621516526247, + "grad_norm": 0.0568946897983551, + "learning_rate": 0.00012534767541293488, + "loss": 0.2588, + "step": 21525 + }, + { + "epoch": 1.7438431626701232, + "grad_norm": 0.06434997171163559, + "learning_rate": 0.00012534317476034024, + "loss": 0.2989, + "step": 21526 + }, + { + "epoch": 1.7439241736876214, + "grad_norm": 0.059009552001953125, + "learning_rate": 0.00012533867410774563, + "loss": 0.3146, + "step": 21527 + }, + { + "epoch": 1.74400518470512, + "grad_norm": 0.05180266872048378, + "learning_rate": 0.000125334173455151, + "loss": 0.2498, + "step": 21528 + }, + { + "epoch": 1.7440861957226184, + "grad_norm": 0.0628051683306694, + "learning_rate": 0.00012532967280255638, + "loss": 0.2804, + "step": 21529 + }, + { + "epoch": 1.7441672067401166, + "grad_norm": 0.07517543435096741, + "learning_rate": 0.00012532517214996176, + "loss": 0.2747, + "step": 21530 + }, + { + "epoch": 1.744248217757615, + "grad_norm": 0.06764915585517883, + "learning_rate": 0.00012532067149736712, + "loss": 0.2639, + "step": 21531 + }, + { + "epoch": 1.7443292287751135, + "grad_norm": 0.07854727655649185, + "learning_rate": 0.00012531617084477248, + "loss": 0.3395, + "step": 21532 + }, + { + "epoch": 1.7444102397926118, + "grad_norm": 0.05045973137021065, + "learning_rate": 0.00012531167019217787, + "loss": 0.3025, + "step": 21533 + }, + { + "epoch": 1.74449125081011, + "grad_norm": 0.054208904504776, + "learning_rate": 0.00012530716953958323, + "loss": 0.2857, + "step": 21534 + }, + { + "epoch": 1.7445722618276087, + "grad_norm": 0.04966440424323082, + "learning_rate": 0.00012530266888698862, + "loss": 0.2402, + "step": 21535 + }, + { + "epoch": 1.744653272845107, + "grad_norm": 0.05039123818278313, + "learning_rate": 0.000125298168234394, + "loss": 0.2566, + "step": 21536 + }, + { + "epoch": 1.7447342838626052, + "grad_norm": 0.04494204372167587, + "learning_rate": 0.00012529366758179936, + "loss": 0.2608, + "step": 21537 + }, + { + "epoch": 1.7448152948801037, + "grad_norm": 0.05890116095542908, + "learning_rate": 0.00012528916692920472, + "loss": 0.2842, + "step": 21538 + }, + { + "epoch": 1.7448963058976021, + "grad_norm": 0.049471139907836914, + "learning_rate": 0.0001252846662766101, + "loss": 0.3005, + "step": 21539 + }, + { + "epoch": 1.7449773169151004, + "grad_norm": 0.060563184320926666, + "learning_rate": 0.00012528016562401547, + "loss": 0.2672, + "step": 21540 + }, + { + "epoch": 1.7450583279325989, + "grad_norm": 0.051124975085258484, + "learning_rate": 0.00012527566497142086, + "loss": 0.2533, + "step": 21541 + }, + { + "epoch": 1.7451393389500973, + "grad_norm": 0.04515647143125534, + "learning_rate": 0.00012527116431882625, + "loss": 0.2504, + "step": 21542 + }, + { + "epoch": 1.7452203499675956, + "grad_norm": 0.046405233442783356, + "learning_rate": 0.0001252666636662316, + "loss": 0.2772, + "step": 21543 + }, + { + "epoch": 1.7453013609850938, + "grad_norm": 0.051825813949108124, + "learning_rate": 0.00012526216301363697, + "loss": 0.2524, + "step": 21544 + }, + { + "epoch": 
1.7453823720025925, + "grad_norm": 0.052078425884246826, + "learning_rate": 0.00012525766236104235, + "loss": 0.2721, + "step": 21545 + }, + { + "epoch": 1.7454633830200907, + "grad_norm": 0.06504826992750168, + "learning_rate": 0.00012525316170844774, + "loss": 0.2336, + "step": 21546 + }, + { + "epoch": 1.745544394037589, + "grad_norm": 0.056292686611413956, + "learning_rate": 0.0001252486610558531, + "loss": 0.2983, + "step": 21547 + }, + { + "epoch": 1.7456254050550875, + "grad_norm": 0.062441788613796234, + "learning_rate": 0.0001252441604032585, + "loss": 0.3117, + "step": 21548 + }, + { + "epoch": 1.745706416072586, + "grad_norm": 0.05644739791750908, + "learning_rate": 0.00012523965975066385, + "loss": 0.2612, + "step": 21549 + }, + { + "epoch": 1.7457874270900842, + "grad_norm": 0.05442476272583008, + "learning_rate": 0.0001252351590980692, + "loss": 0.2705, + "step": 21550 + }, + { + "epoch": 1.7458684381075826, + "grad_norm": 0.05974116176366806, + "learning_rate": 0.0001252306584454746, + "loss": 0.2695, + "step": 21551 + }, + { + "epoch": 1.745949449125081, + "grad_norm": 0.05387002229690552, + "learning_rate": 0.00012522615779287998, + "loss": 0.2855, + "step": 21552 + }, + { + "epoch": 1.7460304601425793, + "grad_norm": 0.05770234391093254, + "learning_rate": 0.00012522165714028534, + "loss": 0.3043, + "step": 21553 + }, + { + "epoch": 1.7461114711600778, + "grad_norm": 0.0670681819319725, + "learning_rate": 0.00012521715648769073, + "loss": 0.2737, + "step": 21554 + }, + { + "epoch": 1.7461924821775763, + "grad_norm": 0.07340843975543976, + "learning_rate": 0.0001252126558350961, + "loss": 0.2869, + "step": 21555 + }, + { + "epoch": 1.7462734931950745, + "grad_norm": 0.059023816138505936, + "learning_rate": 0.00012520815518250145, + "loss": 0.2981, + "step": 21556 + }, + { + "epoch": 1.7463545042125728, + "grad_norm": 0.062153011560440063, + "learning_rate": 0.00012520365452990684, + "loss": 0.2648, + "step": 21557 + }, + { + "epoch": 1.7464355152300715, + "grad_norm": 0.05502208322286606, + "learning_rate": 0.00012519915387731222, + "loss": 0.2592, + "step": 21558 + }, + { + "epoch": 1.7465165262475697, + "grad_norm": 0.0649203211069107, + "learning_rate": 0.00012519465322471758, + "loss": 0.2915, + "step": 21559 + }, + { + "epoch": 1.746597537265068, + "grad_norm": 0.05525941774249077, + "learning_rate": 0.00012519015257212297, + "loss": 0.2721, + "step": 21560 + }, + { + "epoch": 1.7466785482825664, + "grad_norm": 0.062348414212465286, + "learning_rate": 0.00012518565191952833, + "loss": 0.3265, + "step": 21561 + }, + { + "epoch": 1.7467595593000649, + "grad_norm": 0.061026740819215775, + "learning_rate": 0.00012518115126693372, + "loss": 0.3212, + "step": 21562 + }, + { + "epoch": 1.7468405703175631, + "grad_norm": 0.05713002756237984, + "learning_rate": 0.00012517665061433908, + "loss": 0.2846, + "step": 21563 + }, + { + "epoch": 1.7469215813350616, + "grad_norm": 0.04872580245137215, + "learning_rate": 0.00012517214996174447, + "loss": 0.2637, + "step": 21564 + }, + { + "epoch": 1.74700259235256, + "grad_norm": 0.05415729805827141, + "learning_rate": 0.00012516764930914983, + "loss": 0.2787, + "step": 21565 + }, + { + "epoch": 1.7470836033700583, + "grad_norm": 0.054065488278865814, + "learning_rate": 0.0001251631486565552, + "loss": 0.3008, + "step": 21566 + }, + { + "epoch": 1.7471646143875565, + "grad_norm": 0.05611393228173256, + "learning_rate": 0.00012515864800396057, + "loss": 0.2688, + "step": 21567 + }, + { + "epoch": 1.7472456254050552, + "grad_norm": 
0.05508045852184296, + "learning_rate": 0.00012515414735136596, + "loss": 0.2431, + "step": 21568 + }, + { + "epoch": 1.7473266364225535, + "grad_norm": 0.05065127834677696, + "learning_rate": 0.00012514964669877132, + "loss": 0.3143, + "step": 21569 + }, + { + "epoch": 1.7474076474400517, + "grad_norm": 0.04621940851211548, + "learning_rate": 0.0001251451460461767, + "loss": 0.2991, + "step": 21570 + }, + { + "epoch": 1.7474886584575502, + "grad_norm": 0.04384281113743782, + "learning_rate": 0.00012514064539358207, + "loss": 0.2465, + "step": 21571 + }, + { + "epoch": 1.7475696694750487, + "grad_norm": 0.05889362469315529, + "learning_rate": 0.00012513614474098745, + "loss": 0.2807, + "step": 21572 + }, + { + "epoch": 1.747650680492547, + "grad_norm": 0.05274774879217148, + "learning_rate": 0.00012513164408839281, + "loss": 0.2664, + "step": 21573 + }, + { + "epoch": 1.7477316915100454, + "grad_norm": 0.059176184237003326, + "learning_rate": 0.0001251271434357982, + "loss": 0.2772, + "step": 21574 + }, + { + "epoch": 1.7478127025275438, + "grad_norm": 0.05871306732296944, + "learning_rate": 0.0001251226427832036, + "loss": 0.2698, + "step": 21575 + }, + { + "epoch": 1.747893713545042, + "grad_norm": 0.052813295274972916, + "learning_rate": 0.00012511814213060895, + "loss": 0.2481, + "step": 21576 + }, + { + "epoch": 1.7479747245625405, + "grad_norm": 0.06352829933166504, + "learning_rate": 0.0001251136414780143, + "loss": 0.3312, + "step": 21577 + }, + { + "epoch": 1.748055735580039, + "grad_norm": 0.046175677329301834, + "learning_rate": 0.0001251091408254197, + "loss": 0.2593, + "step": 21578 + }, + { + "epoch": 1.7481367465975373, + "grad_norm": 0.06400038301944733, + "learning_rate": 0.00012510464017282506, + "loss": 0.3031, + "step": 21579 + }, + { + "epoch": 1.7482177576150355, + "grad_norm": 0.06217432767152786, + "learning_rate": 0.00012510013952023044, + "loss": 0.2741, + "step": 21580 + }, + { + "epoch": 1.748298768632534, + "grad_norm": 0.06696798652410507, + "learning_rate": 0.00012509563886763583, + "loss": 0.3025, + "step": 21581 + }, + { + "epoch": 1.7483797796500324, + "grad_norm": 0.0568382665514946, + "learning_rate": 0.0001250911382150412, + "loss": 0.3002, + "step": 21582 + }, + { + "epoch": 1.7484607906675307, + "grad_norm": 0.06164439395070076, + "learning_rate": 0.00012508663756244655, + "loss": 0.2556, + "step": 21583 + }, + { + "epoch": 1.7485418016850292, + "grad_norm": 0.05397419631481171, + "learning_rate": 0.00012508213690985194, + "loss": 0.2729, + "step": 21584 + }, + { + "epoch": 1.7486228127025276, + "grad_norm": 0.05543617904186249, + "learning_rate": 0.0001250776362572573, + "loss": 0.305, + "step": 21585 + }, + { + "epoch": 1.7487038237200259, + "grad_norm": 0.05711469426751137, + "learning_rate": 0.00012507313560466268, + "loss": 0.2885, + "step": 21586 + }, + { + "epoch": 1.7487848347375243, + "grad_norm": 0.061703696846961975, + "learning_rate": 0.00012506863495206807, + "loss": 0.2795, + "step": 21587 + }, + { + "epoch": 1.7488658457550228, + "grad_norm": 0.053626950830221176, + "learning_rate": 0.00012506413429947343, + "loss": 0.2935, + "step": 21588 + }, + { + "epoch": 1.748946856772521, + "grad_norm": 0.0490972176194191, + "learning_rate": 0.0001250596336468788, + "loss": 0.2555, + "step": 21589 + }, + { + "epoch": 1.7490278677900193, + "grad_norm": 0.05172901600599289, + "learning_rate": 0.00012505513299428418, + "loss": 0.295, + "step": 21590 + }, + { + "epoch": 1.749108878807518, + "grad_norm": 0.05565916746854782, + "learning_rate": 
0.00012505063234168954, + "loss": 0.268, + "step": 21591 + }, + { + "epoch": 1.7491898898250162, + "grad_norm": 0.06226447597146034, + "learning_rate": 0.00012504613168909493, + "loss": 0.3178, + "step": 21592 + }, + { + "epoch": 1.7492709008425145, + "grad_norm": 0.051055144518613815, + "learning_rate": 0.00012504163103650031, + "loss": 0.2619, + "step": 21593 + }, + { + "epoch": 1.749351911860013, + "grad_norm": 0.05917363241314888, + "learning_rate": 0.00012503713038390567, + "loss": 0.2706, + "step": 21594 + }, + { + "epoch": 1.7494329228775114, + "grad_norm": 0.05820334702730179, + "learning_rate": 0.00012503262973131103, + "loss": 0.2764, + "step": 21595 + }, + { + "epoch": 1.7495139338950096, + "grad_norm": 0.06279566884040833, + "learning_rate": 0.00012502812907871642, + "loss": 0.2629, + "step": 21596 + }, + { + "epoch": 1.749594944912508, + "grad_norm": 0.05095963552594185, + "learning_rate": 0.00012502362842612178, + "loss": 0.2827, + "step": 21597 + }, + { + "epoch": 1.7496759559300066, + "grad_norm": 0.05769157037138939, + "learning_rate": 0.00012501912777352717, + "loss": 0.2404, + "step": 21598 + }, + { + "epoch": 1.7497569669475048, + "grad_norm": 0.05776160582900047, + "learning_rate": 0.00012501462712093256, + "loss": 0.3016, + "step": 21599 + }, + { + "epoch": 1.7498379779650033, + "grad_norm": 0.059620942920446396, + "learning_rate": 0.00012501012646833792, + "loss": 0.2767, + "step": 21600 + }, + { + "epoch": 1.7499189889825018, + "grad_norm": 0.051635947078466415, + "learning_rate": 0.00012500562581574328, + "loss": 0.2555, + "step": 21601 + }, + { + "epoch": 1.75, + "grad_norm": 0.061463210731744766, + "learning_rate": 0.00012500112516314866, + "loss": 0.2926, + "step": 21602 + }, + { + "epoch": 1.7500810110174982, + "grad_norm": 0.05254808068275452, + "learning_rate": 0.00012499662451055402, + "loss": 0.2554, + "step": 21603 + }, + { + "epoch": 1.7501620220349967, + "grad_norm": 0.05625847354531288, + "learning_rate": 0.0001249921238579594, + "loss": 0.2881, + "step": 21604 + }, + { + "epoch": 1.7502430330524952, + "grad_norm": 0.05712534114718437, + "learning_rate": 0.0001249876232053648, + "loss": 0.2898, + "step": 21605 + }, + { + "epoch": 1.7503240440699934, + "grad_norm": 0.05910136178135872, + "learning_rate": 0.00012498312255277016, + "loss": 0.2484, + "step": 21606 + }, + { + "epoch": 1.750405055087492, + "grad_norm": 0.05727291479706764, + "learning_rate": 0.00012497862190017552, + "loss": 0.2755, + "step": 21607 + }, + { + "epoch": 1.7504860661049904, + "grad_norm": 0.05712650343775749, + "learning_rate": 0.0001249741212475809, + "loss": 0.2926, + "step": 21608 + }, + { + "epoch": 1.7505670771224886, + "grad_norm": 0.04607681557536125, + "learning_rate": 0.00012496962059498626, + "loss": 0.2639, + "step": 21609 + }, + { + "epoch": 1.750648088139987, + "grad_norm": 0.06823698431253433, + "learning_rate": 0.00012496511994239165, + "loss": 0.2575, + "step": 21610 + }, + { + "epoch": 1.7507290991574855, + "grad_norm": 0.04931235313415527, + "learning_rate": 0.00012496061928979704, + "loss": 0.2968, + "step": 21611 + }, + { + "epoch": 1.7508101101749838, + "grad_norm": 0.05911986157298088, + "learning_rate": 0.0001249561186372024, + "loss": 0.3131, + "step": 21612 + }, + { + "epoch": 1.750891121192482, + "grad_norm": 0.054763313382864, + "learning_rate": 0.00012495161798460776, + "loss": 0.2549, + "step": 21613 + }, + { + "epoch": 1.7509721322099807, + "grad_norm": 0.048321280628442764, + "learning_rate": 0.00012494711733201315, + "loss": 0.2996, + "step": 
21614 + }, + { + "epoch": 1.751053143227479, + "grad_norm": 0.05358283594250679, + "learning_rate": 0.0001249426166794185, + "loss": 0.2784, + "step": 21615 + }, + { + "epoch": 1.7511341542449772, + "grad_norm": 0.052051570266485214, + "learning_rate": 0.0001249381160268239, + "loss": 0.2844, + "step": 21616 + }, + { + "epoch": 1.7512151652624757, + "grad_norm": 0.04813947528600693, + "learning_rate": 0.00012493361537422928, + "loss": 0.2421, + "step": 21617 + }, + { + "epoch": 1.7512961762799741, + "grad_norm": 0.06113769859075546, + "learning_rate": 0.00012492911472163464, + "loss": 0.2889, + "step": 21618 + }, + { + "epoch": 1.7513771872974724, + "grad_norm": 0.06924999505281448, + "learning_rate": 0.00012492461406904, + "loss": 0.284, + "step": 21619 + }, + { + "epoch": 1.7514581983149708, + "grad_norm": 0.05498860776424408, + "learning_rate": 0.0001249201134164454, + "loss": 0.2622, + "step": 21620 + }, + { + "epoch": 1.7515392093324693, + "grad_norm": 0.044673047959804535, + "learning_rate": 0.00012491561276385075, + "loss": 0.2655, + "step": 21621 + }, + { + "epoch": 1.7516202203499676, + "grad_norm": 0.05713099613785744, + "learning_rate": 0.00012491111211125613, + "loss": 0.2567, + "step": 21622 + }, + { + "epoch": 1.751701231367466, + "grad_norm": 0.04379938170313835, + "learning_rate": 0.00012490661145866152, + "loss": 0.2534, + "step": 21623 + }, + { + "epoch": 1.7517822423849645, + "grad_norm": 0.055278170853853226, + "learning_rate": 0.00012490211080606688, + "loss": 0.2903, + "step": 21624 + }, + { + "epoch": 1.7518632534024627, + "grad_norm": 0.05559024214744568, + "learning_rate": 0.00012489761015347224, + "loss": 0.2833, + "step": 21625 + }, + { + "epoch": 1.751944264419961, + "grad_norm": 0.04830685257911682, + "learning_rate": 0.00012489310950087763, + "loss": 0.2773, + "step": 21626 + }, + { + "epoch": 1.7520252754374595, + "grad_norm": 0.04247341677546501, + "learning_rate": 0.00012488860884828302, + "loss": 0.3078, + "step": 21627 + }, + { + "epoch": 1.752106286454958, + "grad_norm": 0.05888223275542259, + "learning_rate": 0.00012488410819568838, + "loss": 0.2679, + "step": 21628 + }, + { + "epoch": 1.7521872974724562, + "grad_norm": 0.045269403606653214, + "learning_rate": 0.00012487960754309376, + "loss": 0.2646, + "step": 21629 + }, + { + "epoch": 1.7522683084899546, + "grad_norm": 0.05917354300618172, + "learning_rate": 0.00012487510689049912, + "loss": 0.277, + "step": 21630 + }, + { + "epoch": 1.752349319507453, + "grad_norm": 0.06229737028479576, + "learning_rate": 0.0001248706062379045, + "loss": 0.2812, + "step": 21631 + }, + { + "epoch": 1.7524303305249513, + "grad_norm": 0.05246162414550781, + "learning_rate": 0.00012486610558530987, + "loss": 0.2297, + "step": 21632 + }, + { + "epoch": 1.7525113415424498, + "grad_norm": 0.060903310775756836, + "learning_rate": 0.00012486160493271526, + "loss": 0.2643, + "step": 21633 + }, + { + "epoch": 1.7525923525599483, + "grad_norm": 0.05659567192196846, + "learning_rate": 0.00012485710428012062, + "loss": 0.2648, + "step": 21634 + }, + { + "epoch": 1.7526733635774465, + "grad_norm": 0.058088213205337524, + "learning_rate": 0.000124852603627526, + "loss": 0.2457, + "step": 21635 + }, + { + "epoch": 1.7527543745949448, + "grad_norm": 0.06439466029405594, + "learning_rate": 0.00012484810297493137, + "loss": 0.2749, + "step": 21636 + }, + { + "epoch": 1.7528353856124435, + "grad_norm": 0.056994613260030746, + "learning_rate": 0.00012484360232233675, + "loss": 0.275, + "step": 21637 + }, + { + "epoch": 1.7529163966299417, 
+ "grad_norm": 0.05250370502471924, + "learning_rate": 0.0001248391016697421, + "loss": 0.28, + "step": 21638 + }, + { + "epoch": 1.75299740764744, + "grad_norm": 0.0519607812166214, + "learning_rate": 0.0001248346010171475, + "loss": 0.2656, + "step": 21639 + }, + { + "epoch": 1.7530784186649384, + "grad_norm": 0.0754295364022255, + "learning_rate": 0.00012483010036455286, + "loss": 0.3041, + "step": 21640 + }, + { + "epoch": 1.7531594296824369, + "grad_norm": 0.07012224197387695, + "learning_rate": 0.00012482559971195825, + "loss": 0.3193, + "step": 21641 + }, + { + "epoch": 1.7532404406999351, + "grad_norm": 0.050598498433828354, + "learning_rate": 0.0001248210990593636, + "loss": 0.3008, + "step": 21642 + }, + { + "epoch": 1.7533214517174336, + "grad_norm": 0.05320628732442856, + "learning_rate": 0.000124816598406769, + "loss": 0.2498, + "step": 21643 + }, + { + "epoch": 1.753402462734932, + "grad_norm": 0.05083597078919411, + "learning_rate": 0.00012481209775417435, + "loss": 0.2969, + "step": 21644 + }, + { + "epoch": 1.7534834737524303, + "grad_norm": 0.06301596015691757, + "learning_rate": 0.00012480759710157974, + "loss": 0.3163, + "step": 21645 + }, + { + "epoch": 1.7535644847699285, + "grad_norm": 0.05585845932364464, + "learning_rate": 0.0001248030964489851, + "loss": 0.2774, + "step": 21646 + }, + { + "epoch": 1.7536454957874272, + "grad_norm": 0.0647146999835968, + "learning_rate": 0.0001247985957963905, + "loss": 0.3017, + "step": 21647 + }, + { + "epoch": 1.7537265068049255, + "grad_norm": 0.08223205804824829, + "learning_rate": 0.00012479409514379585, + "loss": 0.2949, + "step": 21648 + }, + { + "epoch": 1.7538075178224237, + "grad_norm": 0.05526803061366081, + "learning_rate": 0.00012478959449120124, + "loss": 0.3022, + "step": 21649 + }, + { + "epoch": 1.7538885288399222, + "grad_norm": 0.04275866970419884, + "learning_rate": 0.00012478509383860662, + "loss": 0.2533, + "step": 21650 + }, + { + "epoch": 1.7539695398574207, + "grad_norm": 0.05600523203611374, + "learning_rate": 0.00012478059318601198, + "loss": 0.2719, + "step": 21651 + }, + { + "epoch": 1.754050550874919, + "grad_norm": 0.053683191537857056, + "learning_rate": 0.00012477609253341734, + "loss": 0.2669, + "step": 21652 + }, + { + "epoch": 1.7541315618924174, + "grad_norm": 0.05389159545302391, + "learning_rate": 0.00012477159188082273, + "loss": 0.2572, + "step": 21653 + }, + { + "epoch": 1.7542125729099158, + "grad_norm": 0.057773686945438385, + "learning_rate": 0.0001247670912282281, + "loss": 0.2924, + "step": 21654 + }, + { + "epoch": 1.754293583927414, + "grad_norm": 0.05425887927412987, + "learning_rate": 0.00012476259057563348, + "loss": 0.2633, + "step": 21655 + }, + { + "epoch": 1.7543745949449125, + "grad_norm": 0.05559500679373741, + "learning_rate": 0.00012475808992303886, + "loss": 0.2678, + "step": 21656 + }, + { + "epoch": 1.754455605962411, + "grad_norm": 0.05029148608446121, + "learning_rate": 0.00012475358927044422, + "loss": 0.2664, + "step": 21657 + }, + { + "epoch": 1.7545366169799093, + "grad_norm": 0.053293295204639435, + "learning_rate": 0.00012474908861784958, + "loss": 0.2915, + "step": 21658 + }, + { + "epoch": 1.7546176279974075, + "grad_norm": 0.04977700486779213, + "learning_rate": 0.00012474458796525497, + "loss": 0.2921, + "step": 21659 + }, + { + "epoch": 1.7546986390149062, + "grad_norm": 0.05130726844072342, + "learning_rate": 0.00012474008731266033, + "loss": 0.2822, + "step": 21660 + }, + { + "epoch": 1.7547796500324044, + "grad_norm": 0.04962471127510071, + 
"learning_rate": 0.00012473558666006572, + "loss": 0.2613, + "step": 21661 + }, + { + "epoch": 1.7548606610499027, + "grad_norm": 0.05113421380519867, + "learning_rate": 0.0001247310860074711, + "loss": 0.2958, + "step": 21662 + }, + { + "epoch": 1.7549416720674011, + "grad_norm": 0.05285904183983803, + "learning_rate": 0.00012472658535487647, + "loss": 0.2974, + "step": 21663 + }, + { + "epoch": 1.7550226830848996, + "grad_norm": 0.05132170394062996, + "learning_rate": 0.00012472208470228183, + "loss": 0.2654, + "step": 21664 + }, + { + "epoch": 1.7551036941023979, + "grad_norm": 0.054761797189712524, + "learning_rate": 0.0001247175840496872, + "loss": 0.3014, + "step": 21665 + }, + { + "epoch": 1.7551847051198963, + "grad_norm": 0.060858312994241714, + "learning_rate": 0.00012471308339709257, + "loss": 0.2973, + "step": 21666 + }, + { + "epoch": 1.7552657161373948, + "grad_norm": 0.0532330721616745, + "learning_rate": 0.00012470858274449796, + "loss": 0.2534, + "step": 21667 + }, + { + "epoch": 1.755346727154893, + "grad_norm": 0.05100312829017639, + "learning_rate": 0.00012470408209190335, + "loss": 0.2647, + "step": 21668 + }, + { + "epoch": 1.7554277381723913, + "grad_norm": 0.06273045390844345, + "learning_rate": 0.0001246995814393087, + "loss": 0.2898, + "step": 21669 + }, + { + "epoch": 1.75550874918989, + "grad_norm": 0.05596870183944702, + "learning_rate": 0.00012469508078671407, + "loss": 0.3194, + "step": 21670 + }, + { + "epoch": 1.7555897602073882, + "grad_norm": 0.057777270674705505, + "learning_rate": 0.00012469058013411945, + "loss": 0.26, + "step": 21671 + }, + { + "epoch": 1.7556707712248865, + "grad_norm": 0.07411547005176544, + "learning_rate": 0.00012468607948152481, + "loss": 0.3289, + "step": 21672 + }, + { + "epoch": 1.755751782242385, + "grad_norm": 0.051789868623018265, + "learning_rate": 0.0001246815788289302, + "loss": 0.2488, + "step": 21673 + }, + { + "epoch": 1.7558327932598834, + "grad_norm": 0.049402810633182526, + "learning_rate": 0.0001246770781763356, + "loss": 0.2336, + "step": 21674 + }, + { + "epoch": 1.7559138042773816, + "grad_norm": 0.05984452739357948, + "learning_rate": 0.00012467257752374095, + "loss": 0.2871, + "step": 21675 + }, + { + "epoch": 1.75599481529488, + "grad_norm": 0.050105199217796326, + "learning_rate": 0.0001246680768711463, + "loss": 0.251, + "step": 21676 + }, + { + "epoch": 1.7560758263123786, + "grad_norm": 0.05107852816581726, + "learning_rate": 0.0001246635762185517, + "loss": 0.268, + "step": 21677 + }, + { + "epoch": 1.7561568373298768, + "grad_norm": 0.05321214348077774, + "learning_rate": 0.00012465907556595706, + "loss": 0.2779, + "step": 21678 + }, + { + "epoch": 1.7562378483473753, + "grad_norm": 0.048276063054800034, + "learning_rate": 0.00012465457491336244, + "loss": 0.2588, + "step": 21679 + }, + { + "epoch": 1.7563188593648738, + "grad_norm": 0.0636172741651535, + "learning_rate": 0.00012465007426076783, + "loss": 0.2582, + "step": 21680 + }, + { + "epoch": 1.756399870382372, + "grad_norm": 0.05773542821407318, + "learning_rate": 0.0001246455736081732, + "loss": 0.2962, + "step": 21681 + }, + { + "epoch": 1.7564808813998702, + "grad_norm": 0.0537274032831192, + "learning_rate": 0.00012464107295557855, + "loss": 0.2705, + "step": 21682 + }, + { + "epoch": 1.7565618924173687, + "grad_norm": 0.06137649714946747, + "learning_rate": 0.00012463657230298394, + "loss": 0.2856, + "step": 21683 + }, + { + "epoch": 1.7566429034348672, + "grad_norm": 0.05260062217712402, + "learning_rate": 0.0001246320716503893, + "loss": 
0.2577, + "step": 21684 + }, + { + "epoch": 1.7567239144523654, + "grad_norm": 0.0494031198322773, + "learning_rate": 0.00012462757099779469, + "loss": 0.2966, + "step": 21685 + }, + { + "epoch": 1.7568049254698639, + "grad_norm": 0.0548611544072628, + "learning_rate": 0.00012462307034520007, + "loss": 0.2493, + "step": 21686 + }, + { + "epoch": 1.7568859364873624, + "grad_norm": 0.0449042022228241, + "learning_rate": 0.00012461856969260543, + "loss": 0.2628, + "step": 21687 + }, + { + "epoch": 1.7569669475048606, + "grad_norm": 0.063282310962677, + "learning_rate": 0.0001246140690400108, + "loss": 0.266, + "step": 21688 + }, + { + "epoch": 1.757047958522359, + "grad_norm": 0.06522276997566223, + "learning_rate": 0.00012460956838741618, + "loss": 0.3117, + "step": 21689 + }, + { + "epoch": 1.7571289695398575, + "grad_norm": 0.05102567747235298, + "learning_rate": 0.00012460506773482154, + "loss": 0.2384, + "step": 21690 + }, + { + "epoch": 1.7572099805573558, + "grad_norm": 0.05092918500304222, + "learning_rate": 0.00012460056708222693, + "loss": 0.3042, + "step": 21691 + }, + { + "epoch": 1.757290991574854, + "grad_norm": 0.048716794699430466, + "learning_rate": 0.00012459606642963231, + "loss": 0.2535, + "step": 21692 + }, + { + "epoch": 1.7573720025923527, + "grad_norm": 0.05294806882739067, + "learning_rate": 0.00012459156577703767, + "loss": 0.2843, + "step": 21693 + }, + { + "epoch": 1.757453013609851, + "grad_norm": 0.04733522981405258, + "learning_rate": 0.00012458706512444303, + "loss": 0.2403, + "step": 21694 + }, + { + "epoch": 1.7575340246273492, + "grad_norm": 0.04993963986635208, + "learning_rate": 0.00012458256447184842, + "loss": 0.2286, + "step": 21695 + }, + { + "epoch": 1.7576150356448477, + "grad_norm": 0.059795740991830826, + "learning_rate": 0.00012457806381925378, + "loss": 0.3168, + "step": 21696 + }, + { + "epoch": 1.7576960466623461, + "grad_norm": 0.054991334676742554, + "learning_rate": 0.00012457356316665917, + "loss": 0.2736, + "step": 21697 + }, + { + "epoch": 1.7577770576798444, + "grad_norm": 0.06179995834827423, + "learning_rate": 0.00012456906251406456, + "loss": 0.2566, + "step": 21698 + }, + { + "epoch": 1.7578580686973428, + "grad_norm": 0.04543781280517578, + "learning_rate": 0.00012456456186146992, + "loss": 0.256, + "step": 21699 + }, + { + "epoch": 1.7579390797148413, + "grad_norm": 0.06514349579811096, + "learning_rate": 0.0001245600612088753, + "loss": 0.2745, + "step": 21700 + }, + { + "epoch": 1.7580200907323396, + "grad_norm": 0.062302350997924805, + "learning_rate": 0.00012455556055628066, + "loss": 0.2294, + "step": 21701 + }, + { + "epoch": 1.758101101749838, + "grad_norm": 0.04693538323044777, + "learning_rate": 0.00012455105990368605, + "loss": 0.2805, + "step": 21702 + }, + { + "epoch": 1.7581821127673365, + "grad_norm": 0.0487443283200264, + "learning_rate": 0.0001245465592510914, + "loss": 0.2562, + "step": 21703 + }, + { + "epoch": 1.7582631237848347, + "grad_norm": 0.05158422887325287, + "learning_rate": 0.0001245420585984968, + "loss": 0.2566, + "step": 21704 + }, + { + "epoch": 1.758344134802333, + "grad_norm": 0.056412965059280396, + "learning_rate": 0.00012453755794590216, + "loss": 0.2555, + "step": 21705 + }, + { + "epoch": 1.7584251458198314, + "grad_norm": 0.05551760271191597, + "learning_rate": 0.00012453305729330754, + "loss": 0.2889, + "step": 21706 + }, + { + "epoch": 1.75850615683733, + "grad_norm": 0.07467301934957504, + "learning_rate": 0.0001245285566407129, + "loss": 0.2919, + "step": 21707 + }, + { + "epoch": 
1.7585871678548282, + "grad_norm": 0.05988988280296326, + "learning_rate": 0.0001245240559881183, + "loss": 0.2587, + "step": 21708 + }, + { + "epoch": 1.7586681788723266, + "grad_norm": 0.047254081815481186, + "learning_rate": 0.00012451955533552365, + "loss": 0.2535, + "step": 21709 + }, + { + "epoch": 1.758749189889825, + "grad_norm": 0.060444850474596024, + "learning_rate": 0.00012451505468292904, + "loss": 0.271, + "step": 21710 + }, + { + "epoch": 1.7588302009073233, + "grad_norm": 0.05495789274573326, + "learning_rate": 0.0001245105540303344, + "loss": 0.277, + "step": 21711 + }, + { + "epoch": 1.7589112119248218, + "grad_norm": 0.05612171068787575, + "learning_rate": 0.00012450605337773979, + "loss": 0.2637, + "step": 21712 + }, + { + "epoch": 1.7589922229423203, + "grad_norm": 0.05255566164851189, + "learning_rate": 0.00012450155272514515, + "loss": 0.2884, + "step": 21713 + }, + { + "epoch": 1.7590732339598185, + "grad_norm": 0.053682245314121246, + "learning_rate": 0.00012449705207255053, + "loss": 0.2974, + "step": 21714 + }, + { + "epoch": 1.7591542449773168, + "grad_norm": 0.05426940321922302, + "learning_rate": 0.0001244925514199559, + "loss": 0.2631, + "step": 21715 + }, + { + "epoch": 1.7592352559948155, + "grad_norm": 0.07131505757570267, + "learning_rate": 0.00012448805076736128, + "loss": 0.2999, + "step": 21716 + }, + { + "epoch": 1.7593162670123137, + "grad_norm": 0.06050477921962738, + "learning_rate": 0.00012448355011476664, + "loss": 0.2971, + "step": 21717 + }, + { + "epoch": 1.759397278029812, + "grad_norm": 0.049917127937078476, + "learning_rate": 0.00012447904946217203, + "loss": 0.2515, + "step": 21718 + }, + { + "epoch": 1.7594782890473104, + "grad_norm": 0.055279117077589035, + "learning_rate": 0.0001244745488095774, + "loss": 0.2781, + "step": 21719 + }, + { + "epoch": 1.7595593000648089, + "grad_norm": 0.051920387893915176, + "learning_rate": 0.00012447004815698277, + "loss": 0.2621, + "step": 21720 + }, + { + "epoch": 1.7596403110823071, + "grad_norm": 0.06155714765191078, + "learning_rate": 0.00012446554750438813, + "loss": 0.2914, + "step": 21721 + }, + { + "epoch": 1.7597213220998056, + "grad_norm": 0.0600089430809021, + "learning_rate": 0.00012446104685179352, + "loss": 0.275, + "step": 21722 + }, + { + "epoch": 1.759802333117304, + "grad_norm": 0.05426739528775215, + "learning_rate": 0.00012445654619919888, + "loss": 0.2991, + "step": 21723 + }, + { + "epoch": 1.7598833441348023, + "grad_norm": 0.04382641613483429, + "learning_rate": 0.00012445204554660427, + "loss": 0.2369, + "step": 21724 + }, + { + "epoch": 1.7599643551523008, + "grad_norm": 0.05670471489429474, + "learning_rate": 0.00012444754489400963, + "loss": 0.2556, + "step": 21725 + }, + { + "epoch": 1.7600453661697992, + "grad_norm": 0.05996417999267578, + "learning_rate": 0.00012444304424141502, + "loss": 0.2899, + "step": 21726 + }, + { + "epoch": 1.7601263771872975, + "grad_norm": 0.05591500923037529, + "learning_rate": 0.00012443854358882038, + "loss": 0.2728, + "step": 21727 + }, + { + "epoch": 1.7602073882047957, + "grad_norm": 0.050028007477521896, + "learning_rate": 0.00012443404293622576, + "loss": 0.2995, + "step": 21728 + }, + { + "epoch": 1.7602883992222942, + "grad_norm": 0.05522231385111809, + "learning_rate": 0.00012442954228363112, + "loss": 0.2866, + "step": 21729 + }, + { + "epoch": 1.7603694102397927, + "grad_norm": 0.055305320769548416, + "learning_rate": 0.0001244250416310365, + "loss": 0.2995, + "step": 21730 + }, + { + "epoch": 1.760450421257291, + "grad_norm": 
0.05761830508708954, + "learning_rate": 0.0001244205409784419, + "loss": 0.3019, + "step": 21731 + }, + { + "epoch": 1.7605314322747894, + "grad_norm": 0.04823017120361328, + "learning_rate": 0.00012441604032584726, + "loss": 0.2601, + "step": 21732 + }, + { + "epoch": 1.7606124432922878, + "grad_norm": 0.05907674878835678, + "learning_rate": 0.00012441153967325262, + "loss": 0.2999, + "step": 21733 + }, + { + "epoch": 1.760693454309786, + "grad_norm": 0.050825346261262894, + "learning_rate": 0.000124407039020658, + "loss": 0.2835, + "step": 21734 + }, + { + "epoch": 1.7607744653272845, + "grad_norm": 0.06334000825881958, + "learning_rate": 0.00012440253836806337, + "loss": 0.2775, + "step": 21735 + }, + { + "epoch": 1.760855476344783, + "grad_norm": 0.05819164589047432, + "learning_rate": 0.00012439803771546875, + "loss": 0.2616, + "step": 21736 + }, + { + "epoch": 1.7609364873622813, + "grad_norm": 0.04646517336368561, + "learning_rate": 0.00012439353706287414, + "loss": 0.244, + "step": 21737 + }, + { + "epoch": 1.7610174983797795, + "grad_norm": 0.0714096948504448, + "learning_rate": 0.0001243890364102795, + "loss": 0.2959, + "step": 21738 + }, + { + "epoch": 1.7610985093972782, + "grad_norm": 0.04353515803813934, + "learning_rate": 0.00012438453575768486, + "loss": 0.264, + "step": 21739 + }, + { + "epoch": 1.7611795204147764, + "grad_norm": 0.05200977995991707, + "learning_rate": 0.00012438003510509025, + "loss": 0.2807, + "step": 21740 + }, + { + "epoch": 1.7612605314322747, + "grad_norm": 0.06344801187515259, + "learning_rate": 0.0001243755344524956, + "loss": 0.286, + "step": 21741 + }, + { + "epoch": 1.7613415424497731, + "grad_norm": 0.05688883364200592, + "learning_rate": 0.000124371033799901, + "loss": 0.2627, + "step": 21742 + }, + { + "epoch": 1.7614225534672716, + "grad_norm": 0.06464666873216629, + "learning_rate": 0.00012436653314730638, + "loss": 0.3134, + "step": 21743 + }, + { + "epoch": 1.7615035644847699, + "grad_norm": 0.053593527525663376, + "learning_rate": 0.00012436203249471174, + "loss": 0.3007, + "step": 21744 + }, + { + "epoch": 1.7615845755022683, + "grad_norm": 0.038453370332717896, + "learning_rate": 0.0001243575318421171, + "loss": 0.2245, + "step": 21745 + }, + { + "epoch": 1.7616655865197668, + "grad_norm": 0.05070507526397705, + "learning_rate": 0.0001243530311895225, + "loss": 0.2719, + "step": 21746 + }, + { + "epoch": 1.761746597537265, + "grad_norm": 0.04365245997905731, + "learning_rate": 0.00012434853053692785, + "loss": 0.2587, + "step": 21747 + }, + { + "epoch": 1.7618276085547635, + "grad_norm": 0.053488414734601974, + "learning_rate": 0.00012434402988433324, + "loss": 0.2996, + "step": 21748 + }, + { + "epoch": 1.761908619572262, + "grad_norm": 0.053450509905815125, + "learning_rate": 0.00012433952923173862, + "loss": 0.2702, + "step": 21749 + }, + { + "epoch": 1.7619896305897602, + "grad_norm": 0.046528153121471405, + "learning_rate": 0.00012433502857914398, + "loss": 0.2456, + "step": 21750 + }, + { + "epoch": 1.7620706416072585, + "grad_norm": 0.0616072341799736, + "learning_rate": 0.00012433052792654934, + "loss": 0.2709, + "step": 21751 + }, + { + "epoch": 1.762151652624757, + "grad_norm": 0.050160426646471024, + "learning_rate": 0.00012432602727395473, + "loss": 0.2721, + "step": 21752 + }, + { + "epoch": 1.7622326636422554, + "grad_norm": 0.05370565503835678, + "learning_rate": 0.0001243215266213601, + "loss": 0.2803, + "step": 21753 + }, + { + "epoch": 1.7623136746597536, + "grad_norm": 0.05295388028025627, + "learning_rate": 
0.00012431702596876548, + "loss": 0.286, + "step": 21754 + }, + { + "epoch": 1.762394685677252, + "grad_norm": 0.06044628098607063, + "learning_rate": 0.00012431252531617086, + "loss": 0.2924, + "step": 21755 + }, + { + "epoch": 1.7624756966947506, + "grad_norm": 0.05766693502664566, + "learning_rate": 0.00012430802466357622, + "loss": 0.321, + "step": 21756 + }, + { + "epoch": 1.7625567077122488, + "grad_norm": 0.04584493860602379, + "learning_rate": 0.00012430352401098158, + "loss": 0.2753, + "step": 21757 + }, + { + "epoch": 1.7626377187297473, + "grad_norm": 0.05395643785595894, + "learning_rate": 0.00012429902335838697, + "loss": 0.2543, + "step": 21758 + }, + { + "epoch": 1.7627187297472457, + "grad_norm": 0.0653577521443367, + "learning_rate": 0.00012429452270579233, + "loss": 0.3025, + "step": 21759 + }, + { + "epoch": 1.762799740764744, + "grad_norm": 0.054335497319698334, + "learning_rate": 0.00012429002205319772, + "loss": 0.2753, + "step": 21760 + }, + { + "epoch": 1.7628807517822422, + "grad_norm": 0.05730433762073517, + "learning_rate": 0.0001242855214006031, + "loss": 0.3175, + "step": 21761 + }, + { + "epoch": 1.762961762799741, + "grad_norm": 0.04931226372718811, + "learning_rate": 0.00012428102074800847, + "loss": 0.2591, + "step": 21762 + }, + { + "epoch": 1.7630427738172392, + "grad_norm": 0.056040529161691666, + "learning_rate": 0.00012427652009541383, + "loss": 0.2592, + "step": 21763 + }, + { + "epoch": 1.7631237848347374, + "grad_norm": 0.05200287699699402, + "learning_rate": 0.0001242720194428192, + "loss": 0.2913, + "step": 21764 + }, + { + "epoch": 1.7632047958522359, + "grad_norm": 0.06672247499227524, + "learning_rate": 0.00012426751879022457, + "loss": 0.2925, + "step": 21765 + }, + { + "epoch": 1.7632858068697344, + "grad_norm": 0.05634213238954544, + "learning_rate": 0.00012426301813762996, + "loss": 0.2776, + "step": 21766 + }, + { + "epoch": 1.7633668178872326, + "grad_norm": 0.05127370357513428, + "learning_rate": 0.00012425851748503535, + "loss": 0.2567, + "step": 21767 + }, + { + "epoch": 1.763447828904731, + "grad_norm": 0.05843063443899155, + "learning_rate": 0.0001242540168324407, + "loss": 0.2992, + "step": 21768 + }, + { + "epoch": 1.7635288399222295, + "grad_norm": 0.07764243334531784, + "learning_rate": 0.0001242495161798461, + "loss": 0.3162, + "step": 21769 + }, + { + "epoch": 1.7636098509397278, + "grad_norm": 0.05555042251944542, + "learning_rate": 0.00012424501552725146, + "loss": 0.283, + "step": 21770 + }, + { + "epoch": 1.763690861957226, + "grad_norm": 0.05444716289639473, + "learning_rate": 0.00012424051487465682, + "loss": 0.3248, + "step": 21771 + }, + { + "epoch": 1.7637718729747247, + "grad_norm": 0.0517263300716877, + "learning_rate": 0.0001242360142220622, + "loss": 0.2594, + "step": 21772 + }, + { + "epoch": 1.763852883992223, + "grad_norm": 0.06352398544549942, + "learning_rate": 0.0001242315135694676, + "loss": 0.2863, + "step": 21773 + }, + { + "epoch": 1.7639338950097212, + "grad_norm": 0.04792700335383415, + "learning_rate": 0.00012422701291687295, + "loss": 0.2934, + "step": 21774 + }, + { + "epoch": 1.7640149060272197, + "grad_norm": 0.05038423463702202, + "learning_rate": 0.00012422251226427834, + "loss": 0.2686, + "step": 21775 + }, + { + "epoch": 1.7640959170447181, + "grad_norm": 0.054630909115076065, + "learning_rate": 0.0001242180116116837, + "loss": 0.2918, + "step": 21776 + }, + { + "epoch": 1.7641769280622164, + "grad_norm": 0.05571306124329567, + "learning_rate": 0.00012421351095908906, + "loss": 0.2747, + 
"step": 21777 + }, + { + "epoch": 1.7642579390797148, + "grad_norm": 0.04750450700521469, + "learning_rate": 0.00012420901030649444, + "loss": 0.242, + "step": 21778 + }, + { + "epoch": 1.7643389500972133, + "grad_norm": 0.060028474777936935, + "learning_rate": 0.00012420450965389983, + "loss": 0.2681, + "step": 21779 + }, + { + "epoch": 1.7644199611147116, + "grad_norm": 0.05656592175364494, + "learning_rate": 0.0001242000090013052, + "loss": 0.2401, + "step": 21780 + }, + { + "epoch": 1.76450097213221, + "grad_norm": 0.05683549866080284, + "learning_rate": 0.00012419550834871058, + "loss": 0.2718, + "step": 21781 + }, + { + "epoch": 1.7645819831497085, + "grad_norm": 0.055099744349718094, + "learning_rate": 0.00012419100769611594, + "loss": 0.2593, + "step": 21782 + }, + { + "epoch": 1.7646629941672067, + "grad_norm": 0.0482744500041008, + "learning_rate": 0.00012418650704352133, + "loss": 0.2658, + "step": 21783 + }, + { + "epoch": 1.764744005184705, + "grad_norm": 0.05808193236589432, + "learning_rate": 0.00012418200639092669, + "loss": 0.2852, + "step": 21784 + }, + { + "epoch": 1.7648250162022034, + "grad_norm": 0.06865742057561874, + "learning_rate": 0.00012417750573833207, + "loss": 0.3209, + "step": 21785 + }, + { + "epoch": 1.764906027219702, + "grad_norm": 0.07194961607456207, + "learning_rate": 0.00012417300508573743, + "loss": 0.3156, + "step": 21786 + }, + { + "epoch": 1.7649870382372002, + "grad_norm": 0.06195151433348656, + "learning_rate": 0.00012416850443314282, + "loss": 0.2867, + "step": 21787 + }, + { + "epoch": 1.7650680492546986, + "grad_norm": 0.051346033811569214, + "learning_rate": 0.00012416400378054818, + "loss": 0.2669, + "step": 21788 + }, + { + "epoch": 1.765149060272197, + "grad_norm": 0.0620853491127491, + "learning_rate": 0.00012415950312795357, + "loss": 0.2745, + "step": 21789 + }, + { + "epoch": 1.7652300712896953, + "grad_norm": 0.06049535796046257, + "learning_rate": 0.00012415500247535893, + "loss": 0.2704, + "step": 21790 + }, + { + "epoch": 1.7653110823071938, + "grad_norm": 0.06817255169153214, + "learning_rate": 0.00012415050182276431, + "loss": 0.2856, + "step": 21791 + }, + { + "epoch": 1.7653920933246923, + "grad_norm": 0.05738371983170509, + "learning_rate": 0.00012414600117016967, + "loss": 0.2306, + "step": 21792 + }, + { + "epoch": 1.7654731043421905, + "grad_norm": 0.05548636242747307, + "learning_rate": 0.00012414150051757506, + "loss": 0.2965, + "step": 21793 + }, + { + "epoch": 1.7655541153596888, + "grad_norm": 0.04842370003461838, + "learning_rate": 0.00012413699986498042, + "loss": 0.2579, + "step": 21794 + }, + { + "epoch": 1.7656351263771874, + "grad_norm": 0.05322146788239479, + "learning_rate": 0.0001241324992123858, + "loss": 0.257, + "step": 21795 + }, + { + "epoch": 1.7657161373946857, + "grad_norm": 0.055748313665390015, + "learning_rate": 0.00012412799855979117, + "loss": 0.2782, + "step": 21796 + }, + { + "epoch": 1.765797148412184, + "grad_norm": 0.05629035830497742, + "learning_rate": 0.00012412349790719656, + "loss": 0.2514, + "step": 21797 + }, + { + "epoch": 1.7658781594296824, + "grad_norm": 0.06065845862030983, + "learning_rate": 0.00012411899725460192, + "loss": 0.2579, + "step": 21798 + }, + { + "epoch": 1.7659591704471809, + "grad_norm": 0.05282067880034447, + "learning_rate": 0.0001241144966020073, + "loss": 0.2431, + "step": 21799 + }, + { + "epoch": 1.7660401814646791, + "grad_norm": 0.06059788167476654, + "learning_rate": 0.00012410999594941266, + "loss": 0.2792, + "step": 21800 + }, + { + "epoch": 
1.7661211924821776, + "grad_norm": 0.04922725260257721, + "learning_rate": 0.00012410549529681805, + "loss": 0.3058, + "step": 21801 + }, + { + "epoch": 1.766202203499676, + "grad_norm": 0.05176448076963425, + "learning_rate": 0.0001241009946442234, + "loss": 0.2627, + "step": 21802 + }, + { + "epoch": 1.7662832145171743, + "grad_norm": 0.06321108341217041, + "learning_rate": 0.0001240964939916288, + "loss": 0.2944, + "step": 21803 + }, + { + "epoch": 1.7663642255346728, + "grad_norm": 0.04272797331213951, + "learning_rate": 0.00012409199333903416, + "loss": 0.2438, + "step": 21804 + }, + { + "epoch": 1.7664452365521712, + "grad_norm": 0.06427010148763657, + "learning_rate": 0.00012408749268643954, + "loss": 0.2724, + "step": 21805 + }, + { + "epoch": 1.7665262475696695, + "grad_norm": 0.058782003819942474, + "learning_rate": 0.0001240829920338449, + "loss": 0.2592, + "step": 21806 + }, + { + "epoch": 1.7666072585871677, + "grad_norm": 0.046798817813396454, + "learning_rate": 0.0001240784913812503, + "loss": 0.2545, + "step": 21807 + }, + { + "epoch": 1.7666882696046662, + "grad_norm": 0.05633607506752014, + "learning_rate": 0.00012407399072865565, + "loss": 0.2996, + "step": 21808 + }, + { + "epoch": 1.7667692806221647, + "grad_norm": 0.05841773748397827, + "learning_rate": 0.00012406949007606104, + "loss": 0.2579, + "step": 21809 + }, + { + "epoch": 1.766850291639663, + "grad_norm": 0.061462823301553726, + "learning_rate": 0.0001240649894234664, + "loss": 0.2993, + "step": 21810 + }, + { + "epoch": 1.7669313026571614, + "grad_norm": 0.06246998906135559, + "learning_rate": 0.00012406048877087179, + "loss": 0.2886, + "step": 21811 + }, + { + "epoch": 1.7670123136746598, + "grad_norm": 0.05299236625432968, + "learning_rate": 0.00012405598811827717, + "loss": 0.2694, + "step": 21812 + }, + { + "epoch": 1.767093324692158, + "grad_norm": 0.05541946738958359, + "learning_rate": 0.00012405148746568253, + "loss": 0.2743, + "step": 21813 + }, + { + "epoch": 1.7671743357096565, + "grad_norm": 0.07307790964841843, + "learning_rate": 0.0001240469868130879, + "loss": 0.2877, + "step": 21814 + }, + { + "epoch": 1.767255346727155, + "grad_norm": 0.04940599203109741, + "learning_rate": 0.00012404248616049328, + "loss": 0.2501, + "step": 21815 + }, + { + "epoch": 1.7673363577446533, + "grad_norm": 0.0704098492860794, + "learning_rate": 0.00012403798550789864, + "loss": 0.2915, + "step": 21816 + }, + { + "epoch": 1.7674173687621515, + "grad_norm": 0.049868229776620865, + "learning_rate": 0.00012403348485530403, + "loss": 0.2508, + "step": 21817 + }, + { + "epoch": 1.7674983797796502, + "grad_norm": 0.0582960806787014, + "learning_rate": 0.00012402898420270942, + "loss": 0.3132, + "step": 21818 + }, + { + "epoch": 1.7675793907971484, + "grad_norm": 0.05498791113495827, + "learning_rate": 0.00012402448355011478, + "loss": 0.2758, + "step": 21819 + }, + { + "epoch": 1.7676604018146467, + "grad_norm": 0.062076665461063385, + "learning_rate": 0.00012401998289752014, + "loss": 0.3287, + "step": 21820 + }, + { + "epoch": 1.7677414128321451, + "grad_norm": 0.053407374769449234, + "learning_rate": 0.00012401548224492552, + "loss": 0.2525, + "step": 21821 + }, + { + "epoch": 1.7678224238496436, + "grad_norm": 0.05482996627688408, + "learning_rate": 0.00012401098159233088, + "loss": 0.3037, + "step": 21822 + }, + { + "epoch": 1.7679034348671419, + "grad_norm": 0.05839741975069046, + "learning_rate": 0.00012400648093973627, + "loss": 0.3059, + "step": 21823 + }, + { + "epoch": 1.7679844458846403, + "grad_norm": 
0.04174405336380005, + "learning_rate": 0.00012400198028714166, + "loss": 0.2515, + "step": 21824 + }, + { + "epoch": 1.7680654569021388, + "grad_norm": 0.05571863055229187, + "learning_rate": 0.00012399747963454702, + "loss": 0.2738, + "step": 21825 + }, + { + "epoch": 1.768146467919637, + "grad_norm": 0.062394920736551285, + "learning_rate": 0.00012399297898195238, + "loss": 0.2931, + "step": 21826 + }, + { + "epoch": 1.7682274789371355, + "grad_norm": 0.05461779236793518, + "learning_rate": 0.00012398847832935776, + "loss": 0.2779, + "step": 21827 + }, + { + "epoch": 1.768308489954634, + "grad_norm": 0.0779416486620903, + "learning_rate": 0.00012398397767676312, + "loss": 0.2979, + "step": 21828 + }, + { + "epoch": 1.7683895009721322, + "grad_norm": 0.05171630531549454, + "learning_rate": 0.0001239794770241685, + "loss": 0.3044, + "step": 21829 + }, + { + "epoch": 1.7684705119896305, + "grad_norm": 0.06141390651464462, + "learning_rate": 0.0001239749763715739, + "loss": 0.3078, + "step": 21830 + }, + { + "epoch": 1.768551523007129, + "grad_norm": 0.05810209736227989, + "learning_rate": 0.00012397047571897926, + "loss": 0.2944, + "step": 21831 + }, + { + "epoch": 1.7686325340246274, + "grad_norm": 0.054347582161426544, + "learning_rate": 0.00012396597506638462, + "loss": 0.2931, + "step": 21832 + }, + { + "epoch": 1.7687135450421256, + "grad_norm": 0.058234091848134995, + "learning_rate": 0.00012396147441379, + "loss": 0.2658, + "step": 21833 + }, + { + "epoch": 1.768794556059624, + "grad_norm": 0.07164759188890457, + "learning_rate": 0.00012395697376119537, + "loss": 0.2929, + "step": 21834 + }, + { + "epoch": 1.7688755670771226, + "grad_norm": 0.0488571897149086, + "learning_rate": 0.00012395247310860075, + "loss": 0.2444, + "step": 21835 + }, + { + "epoch": 1.7689565780946208, + "grad_norm": 0.05238921195268631, + "learning_rate": 0.00012394797245600614, + "loss": 0.302, + "step": 21836 + }, + { + "epoch": 1.7690375891121193, + "grad_norm": 0.057009343057870865, + "learning_rate": 0.0001239434718034115, + "loss": 0.2741, + "step": 21837 + }, + { + "epoch": 1.7691186001296177, + "grad_norm": 0.04948584735393524, + "learning_rate": 0.0001239389711508169, + "loss": 0.3014, + "step": 21838 + }, + { + "epoch": 1.769199611147116, + "grad_norm": 0.04947437718510628, + "learning_rate": 0.00012393447049822225, + "loss": 0.293, + "step": 21839 + }, + { + "epoch": 1.7692806221646142, + "grad_norm": 0.04769972339272499, + "learning_rate": 0.0001239299698456276, + "loss": 0.2778, + "step": 21840 + }, + { + "epoch": 1.769361633182113, + "grad_norm": 0.06893935054540634, + "learning_rate": 0.000123925469193033, + "loss": 0.2989, + "step": 21841 + }, + { + "epoch": 1.7694426441996112, + "grad_norm": 0.06745298951864243, + "learning_rate": 0.00012392096854043838, + "loss": 0.2669, + "step": 21842 + }, + { + "epoch": 1.7695236552171094, + "grad_norm": 0.04827446863055229, + "learning_rate": 0.00012391646788784374, + "loss": 0.2794, + "step": 21843 + }, + { + "epoch": 1.7696046662346079, + "grad_norm": 0.05109957233071327, + "learning_rate": 0.00012391196723524913, + "loss": 0.2207, + "step": 21844 + }, + { + "epoch": 1.7696856772521063, + "grad_norm": 0.04495451971888542, + "learning_rate": 0.0001239074665826545, + "loss": 0.2511, + "step": 21845 + }, + { + "epoch": 1.7697666882696046, + "grad_norm": 0.05750264972448349, + "learning_rate": 0.00012390296593005985, + "loss": 0.2837, + "step": 21846 + }, + { + "epoch": 1.769847699287103, + "grad_norm": 0.04733563959598541, + "learning_rate": 
0.00012389846527746524, + "loss": 0.2341, + "step": 21847 + }, + { + "epoch": 1.7699287103046015, + "grad_norm": 0.054377175867557526, + "learning_rate": 0.00012389396462487062, + "loss": 0.2818, + "step": 21848 + }, + { + "epoch": 1.7700097213220998, + "grad_norm": 0.06388578563928604, + "learning_rate": 0.00012388946397227598, + "loss": 0.2798, + "step": 21849 + }, + { + "epoch": 1.7700907323395982, + "grad_norm": 0.07040360569953918, + "learning_rate": 0.00012388496331968137, + "loss": 0.2625, + "step": 21850 + }, + { + "epoch": 1.7701717433570967, + "grad_norm": 0.07146581262350082, + "learning_rate": 0.00012388046266708673, + "loss": 0.3742, + "step": 21851 + }, + { + "epoch": 1.770252754374595, + "grad_norm": 0.05052236095070839, + "learning_rate": 0.0001238759620144921, + "loss": 0.2262, + "step": 21852 + }, + { + "epoch": 1.7703337653920932, + "grad_norm": 0.05325867608189583, + "learning_rate": 0.00012387146136189748, + "loss": 0.26, + "step": 21853 + }, + { + "epoch": 1.7704147764095917, + "grad_norm": 0.057723164558410645, + "learning_rate": 0.00012386696070930286, + "loss": 0.2815, + "step": 21854 + }, + { + "epoch": 1.7704957874270901, + "grad_norm": 0.06256304681301117, + "learning_rate": 0.00012386246005670822, + "loss": 0.2963, + "step": 21855 + }, + { + "epoch": 1.7705767984445884, + "grad_norm": 0.0576959066092968, + "learning_rate": 0.0001238579594041136, + "loss": 0.3087, + "step": 21856 + }, + { + "epoch": 1.7706578094620868, + "grad_norm": 0.07264872640371323, + "learning_rate": 0.00012385345875151897, + "loss": 0.3027, + "step": 21857 + }, + { + "epoch": 1.7707388204795853, + "grad_norm": 0.05662224814295769, + "learning_rate": 0.00012384895809892433, + "loss": 0.2873, + "step": 21858 + }, + { + "epoch": 1.7708198314970836, + "grad_norm": 0.050668567419052124, + "learning_rate": 0.00012384445744632972, + "loss": 0.2247, + "step": 21859 + }, + { + "epoch": 1.770900842514582, + "grad_norm": 0.04291321709752083, + "learning_rate": 0.0001238399567937351, + "loss": 0.2235, + "step": 21860 + }, + { + "epoch": 1.7709818535320805, + "grad_norm": 0.06177298352122307, + "learning_rate": 0.00012383545614114047, + "loss": 0.2879, + "step": 21861 + }, + { + "epoch": 1.7710628645495787, + "grad_norm": 0.06481605023145676, + "learning_rate": 0.00012383095548854585, + "loss": 0.2883, + "step": 21862 + }, + { + "epoch": 1.771143875567077, + "grad_norm": 0.05115228146314621, + "learning_rate": 0.00012382645483595121, + "loss": 0.2964, + "step": 21863 + }, + { + "epoch": 1.7712248865845757, + "grad_norm": 0.05299142003059387, + "learning_rate": 0.0001238219541833566, + "loss": 0.2637, + "step": 21864 + }, + { + "epoch": 1.771305897602074, + "grad_norm": 0.05036075785756111, + "learning_rate": 0.00012381745353076196, + "loss": 0.2516, + "step": 21865 + }, + { + "epoch": 1.7713869086195722, + "grad_norm": 0.059381112456321716, + "learning_rate": 0.00012381295287816735, + "loss": 0.278, + "step": 21866 + }, + { + "epoch": 1.7714679196370706, + "grad_norm": 0.04535800218582153, + "learning_rate": 0.0001238084522255727, + "loss": 0.2224, + "step": 21867 + }, + { + "epoch": 1.771548930654569, + "grad_norm": 0.05935453623533249, + "learning_rate": 0.0001238039515729781, + "loss": 0.3078, + "step": 21868 + }, + { + "epoch": 1.7716299416720673, + "grad_norm": 0.057842377573251724, + "learning_rate": 0.00012379945092038346, + "loss": 0.2609, + "step": 21869 + }, + { + "epoch": 1.7717109526895658, + "grad_norm": 0.05465754121541977, + "learning_rate": 0.00012379495026778884, + "loss": 0.3241, + 
"step": 21870 + }, + { + "epoch": 1.7717919637070643, + "grad_norm": 0.05112515017390251, + "learning_rate": 0.0001237904496151942, + "loss": 0.2686, + "step": 21871 + }, + { + "epoch": 1.7718729747245625, + "grad_norm": 0.05687601864337921, + "learning_rate": 0.0001237859489625996, + "loss": 0.3015, + "step": 21872 + }, + { + "epoch": 1.7719539857420608, + "grad_norm": 0.053299058228731155, + "learning_rate": 0.00012378144831000495, + "loss": 0.2401, + "step": 21873 + }, + { + "epoch": 1.7720349967595594, + "grad_norm": 0.06761613488197327, + "learning_rate": 0.00012377694765741034, + "loss": 0.2663, + "step": 21874 + }, + { + "epoch": 1.7721160077770577, + "grad_norm": 0.04997372627258301, + "learning_rate": 0.0001237724470048157, + "loss": 0.275, + "step": 21875 + }, + { + "epoch": 1.772197018794556, + "grad_norm": 0.04821192845702171, + "learning_rate": 0.00012376794635222108, + "loss": 0.2531, + "step": 21876 + }, + { + "epoch": 1.7722780298120544, + "grad_norm": 0.06274958699941635, + "learning_rate": 0.00012376344569962644, + "loss": 0.2769, + "step": 21877 + }, + { + "epoch": 1.7723590408295529, + "grad_norm": 0.04555333033204079, + "learning_rate": 0.00012375894504703183, + "loss": 0.2701, + "step": 21878 + }, + { + "epoch": 1.7724400518470511, + "grad_norm": 0.062332700937986374, + "learning_rate": 0.0001237544443944372, + "loss": 0.2862, + "step": 21879 + }, + { + "epoch": 1.7725210628645496, + "grad_norm": 0.05699625611305237, + "learning_rate": 0.00012374994374184258, + "loss": 0.2738, + "step": 21880 + }, + { + "epoch": 1.772602073882048, + "grad_norm": 0.06004469096660614, + "learning_rate": 0.00012374544308924794, + "loss": 0.2787, + "step": 21881 + }, + { + "epoch": 1.7726830848995463, + "grad_norm": 0.058492615818977356, + "learning_rate": 0.00012374094243665333, + "loss": 0.2883, + "step": 21882 + }, + { + "epoch": 1.7727640959170448, + "grad_norm": 0.051991090178489685, + "learning_rate": 0.00012373644178405869, + "loss": 0.3014, + "step": 21883 + }, + { + "epoch": 1.7728451069345432, + "grad_norm": 0.053816016763448715, + "learning_rate": 0.00012373194113146407, + "loss": 0.2341, + "step": 21884 + }, + { + "epoch": 1.7729261179520415, + "grad_norm": 0.0617956668138504, + "learning_rate": 0.00012372744047886943, + "loss": 0.3296, + "step": 21885 + }, + { + "epoch": 1.7730071289695397, + "grad_norm": 0.06008381024003029, + "learning_rate": 0.00012372293982627482, + "loss": 0.2914, + "step": 21886 + }, + { + "epoch": 1.7730881399870384, + "grad_norm": 0.04700002446770668, + "learning_rate": 0.0001237184391736802, + "loss": 0.2774, + "step": 21887 + }, + { + "epoch": 1.7731691510045366, + "grad_norm": 0.06209200248122215, + "learning_rate": 0.00012371393852108557, + "loss": 0.2875, + "step": 21888 + }, + { + "epoch": 1.773250162022035, + "grad_norm": 0.05754224210977554, + "learning_rate": 0.00012370943786849093, + "loss": 0.2831, + "step": 21889 + }, + { + "epoch": 1.7733311730395334, + "grad_norm": 0.06822988390922546, + "learning_rate": 0.00012370493721589631, + "loss": 0.2533, + "step": 21890 + }, + { + "epoch": 1.7734121840570318, + "grad_norm": 0.05238701403141022, + "learning_rate": 0.00012370043656330167, + "loss": 0.2797, + "step": 21891 + }, + { + "epoch": 1.77349319507453, + "grad_norm": 0.05083979293704033, + "learning_rate": 0.00012369593591070706, + "loss": 0.2718, + "step": 21892 + }, + { + "epoch": 1.7735742060920285, + "grad_norm": 0.047063197940588, + "learning_rate": 0.00012369143525811245, + "loss": 0.2665, + "step": 21893 + }, + { + "epoch": 
1.773655217109527, + "grad_norm": 0.05058155208826065, + "learning_rate": 0.0001236869346055178, + "loss": 0.2259, + "step": 21894 + }, + { + "epoch": 1.7737362281270252, + "grad_norm": 0.054182857275009155, + "learning_rate": 0.00012368243395292317, + "loss": 0.2337, + "step": 21895 + }, + { + "epoch": 1.7738172391445235, + "grad_norm": 0.058875586837530136, + "learning_rate": 0.00012367793330032856, + "loss": 0.3079, + "step": 21896 + }, + { + "epoch": 1.7738982501620222, + "grad_norm": 0.04809205234050751, + "learning_rate": 0.00012367343264773392, + "loss": 0.2898, + "step": 21897 + }, + { + "epoch": 1.7739792611795204, + "grad_norm": 0.04986289516091347, + "learning_rate": 0.0001236689319951393, + "loss": 0.285, + "step": 21898 + }, + { + "epoch": 1.7740602721970187, + "grad_norm": 0.057436879724264145, + "learning_rate": 0.0001236644313425447, + "loss": 0.2737, + "step": 21899 + }, + { + "epoch": 1.7741412832145171, + "grad_norm": 0.060833774507045746, + "learning_rate": 0.00012365993068995005, + "loss": 0.2656, + "step": 21900 + }, + { + "epoch": 1.7742222942320156, + "grad_norm": 0.05589460954070091, + "learning_rate": 0.00012365543003735544, + "loss": 0.2759, + "step": 21901 + }, + { + "epoch": 1.7743033052495139, + "grad_norm": 0.045339442789554596, + "learning_rate": 0.0001236509293847608, + "loss": 0.2471, + "step": 21902 + }, + { + "epoch": 1.7743843162670123, + "grad_norm": 0.05823316425085068, + "learning_rate": 0.00012364642873216616, + "loss": 0.2818, + "step": 21903 + }, + { + "epoch": 1.7744653272845108, + "grad_norm": 0.047641415148973465, + "learning_rate": 0.00012364192807957155, + "loss": 0.2875, + "step": 21904 + }, + { + "epoch": 1.774546338302009, + "grad_norm": 0.05201798304915428, + "learning_rate": 0.00012363742742697693, + "loss": 0.2797, + "step": 21905 + }, + { + "epoch": 1.7746273493195075, + "grad_norm": 0.05504504218697548, + "learning_rate": 0.0001236329267743823, + "loss": 0.2578, + "step": 21906 + }, + { + "epoch": 1.774708360337006, + "grad_norm": 0.0527983233332634, + "learning_rate": 0.00012362842612178768, + "loss": 0.2803, + "step": 21907 + }, + { + "epoch": 1.7747893713545042, + "grad_norm": 0.06161266937851906, + "learning_rate": 0.00012362392546919304, + "loss": 0.26, + "step": 21908 + }, + { + "epoch": 1.7748703823720025, + "grad_norm": 0.06064946949481964, + "learning_rate": 0.0001236194248165984, + "loss": 0.2583, + "step": 21909 + }, + { + "epoch": 1.774951393389501, + "grad_norm": 0.05657583102583885, + "learning_rate": 0.0001236149241640038, + "loss": 0.262, + "step": 21910 + }, + { + "epoch": 1.7750324044069994, + "grad_norm": 0.06834515929222107, + "learning_rate": 0.00012361042351140917, + "loss": 0.2948, + "step": 21911 + }, + { + "epoch": 1.7751134154244976, + "grad_norm": 0.05012359470129013, + "learning_rate": 0.00012360592285881453, + "loss": 0.2707, + "step": 21912 + }, + { + "epoch": 1.775194426441996, + "grad_norm": 0.05883803591132164, + "learning_rate": 0.00012360142220621992, + "loss": 0.2749, + "step": 21913 + }, + { + "epoch": 1.7752754374594946, + "grad_norm": 0.06639762222766876, + "learning_rate": 0.00012359692155362528, + "loss": 0.3117, + "step": 21914 + }, + { + "epoch": 1.7753564484769928, + "grad_norm": 0.0774698555469513, + "learning_rate": 0.00012359242090103064, + "loss": 0.2825, + "step": 21915 + }, + { + "epoch": 1.7754374594944913, + "grad_norm": 0.05529485270380974, + "learning_rate": 0.00012358792024843603, + "loss": 0.2406, + "step": 21916 + }, + { + "epoch": 1.7755184705119897, + "grad_norm": 
0.06073933467268944, + "learning_rate": 0.00012358341959584142, + "loss": 0.2711, + "step": 21917 + }, + { + "epoch": 1.775599481529488, + "grad_norm": 0.06365562230348587, + "learning_rate": 0.00012357891894324678, + "loss": 0.3226, + "step": 21918 + }, + { + "epoch": 1.7756804925469862, + "grad_norm": 0.06966453790664673, + "learning_rate": 0.00012357441829065216, + "loss": 0.3034, + "step": 21919 + }, + { + "epoch": 1.775761503564485, + "grad_norm": 0.05651353299617767, + "learning_rate": 0.00012356991763805752, + "loss": 0.2836, + "step": 21920 + }, + { + "epoch": 1.7758425145819832, + "grad_norm": 0.049086764454841614, + "learning_rate": 0.00012356541698546288, + "loss": 0.2772, + "step": 21921 + }, + { + "epoch": 1.7759235255994814, + "grad_norm": 0.05163026601076126, + "learning_rate": 0.00012356091633286827, + "loss": 0.2773, + "step": 21922 + }, + { + "epoch": 1.7760045366169799, + "grad_norm": 0.05657094717025757, + "learning_rate": 0.00012355641568027366, + "loss": 0.2571, + "step": 21923 + }, + { + "epoch": 1.7760855476344783, + "grad_norm": 0.04952248930931091, + "learning_rate": 0.00012355191502767902, + "loss": 0.241, + "step": 21924 + }, + { + "epoch": 1.7761665586519766, + "grad_norm": 0.059065740555524826, + "learning_rate": 0.0001235474143750844, + "loss": 0.2768, + "step": 21925 + }, + { + "epoch": 1.776247569669475, + "grad_norm": 0.05678890645503998, + "learning_rate": 0.00012354291372248976, + "loss": 0.2842, + "step": 21926 + }, + { + "epoch": 1.7763285806869735, + "grad_norm": 0.050089024007320404, + "learning_rate": 0.00012353841306989512, + "loss": 0.2733, + "step": 21927 + }, + { + "epoch": 1.7764095917044718, + "grad_norm": 0.05137062445282936, + "learning_rate": 0.0001235339124173005, + "loss": 0.2842, + "step": 21928 + }, + { + "epoch": 1.7764906027219702, + "grad_norm": 0.05186197906732559, + "learning_rate": 0.0001235294117647059, + "loss": 0.2657, + "step": 21929 + }, + { + "epoch": 1.7765716137394687, + "grad_norm": 0.060374390333890915, + "learning_rate": 0.00012352491111211126, + "loss": 0.2829, + "step": 21930 + }, + { + "epoch": 1.776652624756967, + "grad_norm": 0.047595467418432236, + "learning_rate": 0.00012352041045951665, + "loss": 0.2654, + "step": 21931 + }, + { + "epoch": 1.7767336357744652, + "grad_norm": 0.0663878321647644, + "learning_rate": 0.000123515909806922, + "loss": 0.3388, + "step": 21932 + }, + { + "epoch": 1.7768146467919637, + "grad_norm": 0.05642055347561836, + "learning_rate": 0.00012351140915432737, + "loss": 0.3008, + "step": 21933 + }, + { + "epoch": 1.7768956578094621, + "grad_norm": 0.056756485253572464, + "learning_rate": 0.00012350690850173275, + "loss": 0.2959, + "step": 21934 + }, + { + "epoch": 1.7769766688269604, + "grad_norm": 0.06318248808383942, + "learning_rate": 0.00012350240784913814, + "loss": 0.2685, + "step": 21935 + }, + { + "epoch": 1.7770576798444588, + "grad_norm": 0.06705424189567566, + "learning_rate": 0.0001234979071965435, + "loss": 0.2758, + "step": 21936 + }, + { + "epoch": 1.7771386908619573, + "grad_norm": 0.05629139393568039, + "learning_rate": 0.0001234934065439489, + "loss": 0.2615, + "step": 21937 + }, + { + "epoch": 1.7772197018794555, + "grad_norm": 0.054760128259658813, + "learning_rate": 0.00012348890589135425, + "loss": 0.262, + "step": 21938 + }, + { + "epoch": 1.777300712896954, + "grad_norm": 0.055271271616220474, + "learning_rate": 0.0001234844052387596, + "loss": 0.288, + "step": 21939 + }, + { + "epoch": 1.7773817239144525, + "grad_norm": 0.05202984809875488, + "learning_rate": 
0.000123479904586165, + "loss": 0.2842, + "step": 21940 + }, + { + "epoch": 1.7774627349319507, + "grad_norm": 0.05875534936785698, + "learning_rate": 0.00012347540393357038, + "loss": 0.2525, + "step": 21941 + }, + { + "epoch": 1.777543745949449, + "grad_norm": 0.05911627784371376, + "learning_rate": 0.00012347090328097574, + "loss": 0.2638, + "step": 21942 + }, + { + "epoch": 1.7776247569669477, + "grad_norm": 0.07096484303474426, + "learning_rate": 0.00012346640262838113, + "loss": 0.3125, + "step": 21943 + }, + { + "epoch": 1.777705767984446, + "grad_norm": 0.060509003698825836, + "learning_rate": 0.0001234619019757865, + "loss": 0.2735, + "step": 21944 + }, + { + "epoch": 1.7777867790019442, + "grad_norm": 0.06088875234127045, + "learning_rate": 0.00012345740132319188, + "loss": 0.3082, + "step": 21945 + }, + { + "epoch": 1.7778677900194426, + "grad_norm": 0.04923175275325775, + "learning_rate": 0.00012345290067059724, + "loss": 0.2385, + "step": 21946 + }, + { + "epoch": 1.777948801036941, + "grad_norm": 0.04954548925161362, + "learning_rate": 0.00012344840001800262, + "loss": 0.2509, + "step": 21947 + }, + { + "epoch": 1.7780298120544393, + "grad_norm": 0.05132404714822769, + "learning_rate": 0.00012344389936540798, + "loss": 0.2914, + "step": 21948 + }, + { + "epoch": 1.7781108230719378, + "grad_norm": 0.05896401032805443, + "learning_rate": 0.00012343939871281337, + "loss": 0.3064, + "step": 21949 + }, + { + "epoch": 1.7781918340894363, + "grad_norm": 0.0579485148191452, + "learning_rate": 0.00012343489806021873, + "loss": 0.2937, + "step": 21950 + }, + { + "epoch": 1.7782728451069345, + "grad_norm": 0.052562858909368515, + "learning_rate": 0.00012343039740762412, + "loss": 0.2935, + "step": 21951 + }, + { + "epoch": 1.778353856124433, + "grad_norm": 0.05139998719096184, + "learning_rate": 0.00012342589675502948, + "loss": 0.2585, + "step": 21952 + }, + { + "epoch": 1.7784348671419314, + "grad_norm": 0.058107078075408936, + "learning_rate": 0.00012342139610243487, + "loss": 0.2711, + "step": 21953 + }, + { + "epoch": 1.7785158781594297, + "grad_norm": 0.05365680903196335, + "learning_rate": 0.00012341689544984023, + "loss": 0.2968, + "step": 21954 + }, + { + "epoch": 1.778596889176928, + "grad_norm": 0.06430280208587646, + "learning_rate": 0.0001234123947972456, + "loss": 0.2667, + "step": 21955 + }, + { + "epoch": 1.7786779001944264, + "grad_norm": 0.06153976917266846, + "learning_rate": 0.00012340789414465097, + "loss": 0.2984, + "step": 21956 + }, + { + "epoch": 1.7787589112119249, + "grad_norm": 0.06513801217079163, + "learning_rate": 0.00012340339349205636, + "loss": 0.2628, + "step": 21957 + }, + { + "epoch": 1.778839922229423, + "grad_norm": 0.05367812514305115, + "learning_rate": 0.00012339889283946172, + "loss": 0.2542, + "step": 21958 + }, + { + "epoch": 1.7789209332469216, + "grad_norm": 0.0550832636654377, + "learning_rate": 0.0001233943921868671, + "loss": 0.3059, + "step": 21959 + }, + { + "epoch": 1.77900194426442, + "grad_norm": 0.05532095953822136, + "learning_rate": 0.00012338989153427247, + "loss": 0.2297, + "step": 21960 + }, + { + "epoch": 1.7790829552819183, + "grad_norm": 0.061939723789691925, + "learning_rate": 0.00012338539088167785, + "loss": 0.3022, + "step": 21961 + }, + { + "epoch": 1.7791639662994168, + "grad_norm": 0.06562153249979019, + "learning_rate": 0.00012338089022908321, + "loss": 0.268, + "step": 21962 + }, + { + "epoch": 1.7792449773169152, + "grad_norm": 0.05717784911394119, + "learning_rate": 0.0001233763895764886, + "loss": 0.2602, + 
"step": 21963 + }, + { + "epoch": 1.7793259883344135, + "grad_norm": 0.06313645839691162, + "learning_rate": 0.00012337188892389396, + "loss": 0.2716, + "step": 21964 + }, + { + "epoch": 1.7794069993519117, + "grad_norm": 0.05977176874876022, + "learning_rate": 0.00012336738827129935, + "loss": 0.2899, + "step": 21965 + }, + { + "epoch": 1.7794880103694104, + "grad_norm": 0.05052109807729721, + "learning_rate": 0.0001233628876187047, + "loss": 0.2595, + "step": 21966 + }, + { + "epoch": 1.7795690213869086, + "grad_norm": 0.05418974906206131, + "learning_rate": 0.0001233583869661101, + "loss": 0.2872, + "step": 21967 + }, + { + "epoch": 1.779650032404407, + "grad_norm": 0.04954787716269493, + "learning_rate": 0.00012335388631351548, + "loss": 0.2637, + "step": 21968 + }, + { + "epoch": 1.7797310434219054, + "grad_norm": 0.05969540402293205, + "learning_rate": 0.00012334938566092084, + "loss": 0.302, + "step": 21969 + }, + { + "epoch": 1.7798120544394038, + "grad_norm": 0.05559484288096428, + "learning_rate": 0.00012334488500832623, + "loss": 0.2331, + "step": 21970 + }, + { + "epoch": 1.779893065456902, + "grad_norm": 0.05428197979927063, + "learning_rate": 0.0001233403843557316, + "loss": 0.2521, + "step": 21971 + }, + { + "epoch": 1.7799740764744005, + "grad_norm": 0.05862327292561531, + "learning_rate": 0.00012333588370313695, + "loss": 0.2702, + "step": 21972 + }, + { + "epoch": 1.780055087491899, + "grad_norm": 0.06107109412550926, + "learning_rate": 0.00012333138305054234, + "loss": 0.2893, + "step": 21973 + }, + { + "epoch": 1.7801360985093972, + "grad_norm": 0.059656549245119095, + "learning_rate": 0.00012332688239794772, + "loss": 0.3108, + "step": 21974 + }, + { + "epoch": 1.7802171095268955, + "grad_norm": 0.052469126880168915, + "learning_rate": 0.00012332238174535308, + "loss": 0.2605, + "step": 21975 + }, + { + "epoch": 1.7802981205443942, + "grad_norm": 0.05808718129992485, + "learning_rate": 0.00012331788109275847, + "loss": 0.2265, + "step": 21976 + }, + { + "epoch": 1.7803791315618924, + "grad_norm": 0.07221174240112305, + "learning_rate": 0.00012331338044016383, + "loss": 0.2782, + "step": 21977 + }, + { + "epoch": 1.7804601425793907, + "grad_norm": 0.06101597473025322, + "learning_rate": 0.0001233088797875692, + "loss": 0.2695, + "step": 21978 + }, + { + "epoch": 1.7805411535968891, + "grad_norm": 0.05770976096391678, + "learning_rate": 0.00012330437913497458, + "loss": 0.261, + "step": 21979 + }, + { + "epoch": 1.7806221646143876, + "grad_norm": 0.05689411237835884, + "learning_rate": 0.00012329987848237997, + "loss": 0.2714, + "step": 21980 + }, + { + "epoch": 1.7807031756318858, + "grad_norm": 0.060095496475696564, + "learning_rate": 0.00012329537782978533, + "loss": 0.3046, + "step": 21981 + }, + { + "epoch": 1.7807841866493843, + "grad_norm": 0.06570778787136078, + "learning_rate": 0.0001232908771771907, + "loss": 0.2626, + "step": 21982 + }, + { + "epoch": 1.7808651976668828, + "grad_norm": 0.0634298175573349, + "learning_rate": 0.00012328637652459607, + "loss": 0.3217, + "step": 21983 + }, + { + "epoch": 1.780946208684381, + "grad_norm": 0.0634029358625412, + "learning_rate": 0.00012328187587200143, + "loss": 0.2632, + "step": 21984 + }, + { + "epoch": 1.7810272197018795, + "grad_norm": 0.04596779868006706, + "learning_rate": 0.00012327737521940682, + "loss": 0.2562, + "step": 21985 + }, + { + "epoch": 1.781108230719378, + "grad_norm": 0.0523015595972538, + "learning_rate": 0.0001232728745668122, + "loss": 0.2597, + "step": 21986 + }, + { + "epoch": 
1.7811892417368762, + "grad_norm": 0.06286849826574326, + "learning_rate": 0.00012326837391421757, + "loss": 0.315, + "step": 21987 + }, + { + "epoch": 1.7812702527543745, + "grad_norm": 0.054060183465480804, + "learning_rate": 0.00012326387326162295, + "loss": 0.3178, + "step": 21988 + }, + { + "epoch": 1.7813512637718731, + "grad_norm": 0.058555398136377335, + "learning_rate": 0.00012325937260902831, + "loss": 0.2405, + "step": 21989 + }, + { + "epoch": 1.7814322747893714, + "grad_norm": 0.05825028941035271, + "learning_rate": 0.00012325487195643367, + "loss": 0.2865, + "step": 21990 + }, + { + "epoch": 1.7815132858068696, + "grad_norm": 0.04288549721240997, + "learning_rate": 0.00012325037130383906, + "loss": 0.2355, + "step": 21991 + }, + { + "epoch": 1.781594296824368, + "grad_norm": 0.051817573606967926, + "learning_rate": 0.00012324587065124445, + "loss": 0.2712, + "step": 21992 + }, + { + "epoch": 1.7816753078418666, + "grad_norm": 0.058279577642679214, + "learning_rate": 0.0001232413699986498, + "loss": 0.2988, + "step": 21993 + }, + { + "epoch": 1.7817563188593648, + "grad_norm": 0.06698846071958542, + "learning_rate": 0.0001232368693460552, + "loss": 0.351, + "step": 21994 + }, + { + "epoch": 1.7818373298768633, + "grad_norm": 0.05766688287258148, + "learning_rate": 0.00012323236869346056, + "loss": 0.2839, + "step": 21995 + }, + { + "epoch": 1.7819183408943617, + "grad_norm": 0.06342483311891556, + "learning_rate": 0.00012322786804086592, + "loss": 0.2469, + "step": 21996 + }, + { + "epoch": 1.78199935191186, + "grad_norm": 0.05495030805468559, + "learning_rate": 0.0001232233673882713, + "loss": 0.2595, + "step": 21997 + }, + { + "epoch": 1.7820803629293582, + "grad_norm": 0.05256880074739456, + "learning_rate": 0.0001232188667356767, + "loss": 0.2761, + "step": 21998 + }, + { + "epoch": 1.782161373946857, + "grad_norm": 0.052520014345645905, + "learning_rate": 0.00012321436608308205, + "loss": 0.3011, + "step": 21999 + }, + { + "epoch": 1.7822423849643552, + "grad_norm": 0.04679098725318909, + "learning_rate": 0.00012320986543048744, + "loss": 0.2715, + "step": 22000 + }, + { + "epoch": 1.7823233959818534, + "grad_norm": 0.06614166498184204, + "learning_rate": 0.0001232053647778928, + "loss": 0.281, + "step": 22001 + }, + { + "epoch": 1.7824044069993519, + "grad_norm": 0.04454330727458, + "learning_rate": 0.00012320086412529816, + "loss": 0.2277, + "step": 22002 + }, + { + "epoch": 1.7824854180168503, + "grad_norm": 0.0626412034034729, + "learning_rate": 0.00012319636347270355, + "loss": 0.3442, + "step": 22003 + }, + { + "epoch": 1.7825664290343486, + "grad_norm": 0.053170133382081985, + "learning_rate": 0.00012319186282010893, + "loss": 0.2938, + "step": 22004 + }, + { + "epoch": 1.782647440051847, + "grad_norm": 0.05421067774295807, + "learning_rate": 0.0001231873621675143, + "loss": 0.2778, + "step": 22005 + }, + { + "epoch": 1.7827284510693455, + "grad_norm": 0.05278918147087097, + "learning_rate": 0.00012318286151491968, + "loss": 0.3152, + "step": 22006 + }, + { + "epoch": 1.7828094620868438, + "grad_norm": 0.050274867564439774, + "learning_rate": 0.00012317836086232504, + "loss": 0.2494, + "step": 22007 + }, + { + "epoch": 1.7828904731043422, + "grad_norm": 0.04210267215967178, + "learning_rate": 0.0001231738602097304, + "loss": 0.2431, + "step": 22008 + }, + { + "epoch": 1.7829714841218407, + "grad_norm": 0.048283085227012634, + "learning_rate": 0.0001231693595571358, + "loss": 0.2714, + "step": 22009 + }, + { + "epoch": 1.783052495139339, + "grad_norm": 
0.04342495650053024, + "learning_rate": 0.00012316485890454117, + "loss": 0.22, + "step": 22010 + }, + { + "epoch": 1.7831335061568372, + "grad_norm": 0.05618229880928993, + "learning_rate": 0.00012316035825194653, + "loss": 0.2851, + "step": 22011 + }, + { + "epoch": 1.7832145171743357, + "grad_norm": 0.057552892714738846, + "learning_rate": 0.00012315585759935192, + "loss": 0.3044, + "step": 22012 + }, + { + "epoch": 1.7832955281918341, + "grad_norm": 0.0542028471827507, + "learning_rate": 0.00012315135694675728, + "loss": 0.249, + "step": 22013 + }, + { + "epoch": 1.7833765392093324, + "grad_norm": 0.049775153398513794, + "learning_rate": 0.00012314685629416264, + "loss": 0.2795, + "step": 22014 + }, + { + "epoch": 1.7834575502268308, + "grad_norm": 0.056546781212091446, + "learning_rate": 0.00012314235564156803, + "loss": 0.2498, + "step": 22015 + }, + { + "epoch": 1.7835385612443293, + "grad_norm": 0.050292108207941055, + "learning_rate": 0.00012313785498897342, + "loss": 0.253, + "step": 22016 + }, + { + "epoch": 1.7836195722618275, + "grad_norm": 0.05830508843064308, + "learning_rate": 0.00012313335433637878, + "loss": 0.276, + "step": 22017 + }, + { + "epoch": 1.783700583279326, + "grad_norm": 0.061981040984392166, + "learning_rate": 0.00012312885368378416, + "loss": 0.2502, + "step": 22018 + }, + { + "epoch": 1.7837815942968245, + "grad_norm": 0.06097181513905525, + "learning_rate": 0.00012312435303118952, + "loss": 0.2522, + "step": 22019 + }, + { + "epoch": 1.7838626053143227, + "grad_norm": 0.05192512273788452, + "learning_rate": 0.0001231198523785949, + "loss": 0.2536, + "step": 22020 + }, + { + "epoch": 1.783943616331821, + "grad_norm": 0.05766737088561058, + "learning_rate": 0.00012311535172600027, + "loss": 0.2954, + "step": 22021 + }, + { + "epoch": 1.7840246273493197, + "grad_norm": 0.05775097757577896, + "learning_rate": 0.00012311085107340566, + "loss": 0.2491, + "step": 22022 + }, + { + "epoch": 1.784105638366818, + "grad_norm": 0.05492521449923515, + "learning_rate": 0.00012310635042081102, + "loss": 0.2884, + "step": 22023 + }, + { + "epoch": 1.7841866493843161, + "grad_norm": 0.047551821917295456, + "learning_rate": 0.0001231018497682164, + "loss": 0.2681, + "step": 22024 + }, + { + "epoch": 1.7842676604018146, + "grad_norm": 0.06832152605056763, + "learning_rate": 0.00012309734911562176, + "loss": 0.282, + "step": 22025 + }, + { + "epoch": 1.784348671419313, + "grad_norm": 0.059465307742357254, + "learning_rate": 0.00012309284846302715, + "loss": 0.3006, + "step": 22026 + }, + { + "epoch": 1.7844296824368113, + "grad_norm": 0.05935780331492424, + "learning_rate": 0.0001230883478104325, + "loss": 0.2667, + "step": 22027 + }, + { + "epoch": 1.7845106934543098, + "grad_norm": 0.05664028972387314, + "learning_rate": 0.0001230838471578379, + "loss": 0.2788, + "step": 22028 + }, + { + "epoch": 1.7845917044718083, + "grad_norm": 0.05232711881399155, + "learning_rate": 0.00012307934650524326, + "loss": 0.2356, + "step": 22029 + }, + { + "epoch": 1.7846727154893065, + "grad_norm": 0.0531197190284729, + "learning_rate": 0.00012307484585264865, + "loss": 0.2531, + "step": 22030 + }, + { + "epoch": 1.784753726506805, + "grad_norm": 0.056127868592739105, + "learning_rate": 0.000123070345200054, + "loss": 0.28, + "step": 22031 + }, + { + "epoch": 1.7848347375243034, + "grad_norm": 0.05154069513082504, + "learning_rate": 0.0001230658445474594, + "loss": 0.2483, + "step": 22032 + }, + { + "epoch": 1.7849157485418017, + "grad_norm": 0.06221969425678253, + "learning_rate": 
0.00012306134389486475, + "loss": 0.2853, + "step": 22033 + }, + { + "epoch": 1.7849967595593, + "grad_norm": 0.0606851764023304, + "learning_rate": 0.00012305684324227014, + "loss": 0.2935, + "step": 22034 + }, + { + "epoch": 1.7850777705767984, + "grad_norm": 0.06411600857973099, + "learning_rate": 0.0001230523425896755, + "loss": 0.2681, + "step": 22035 + }, + { + "epoch": 1.7851587815942969, + "grad_norm": 0.05162372812628746, + "learning_rate": 0.0001230478419370809, + "loss": 0.2874, + "step": 22036 + }, + { + "epoch": 1.785239792611795, + "grad_norm": 0.07114414870738983, + "learning_rate": 0.00012304334128448625, + "loss": 0.271, + "step": 22037 + }, + { + "epoch": 1.7853208036292936, + "grad_norm": 0.06527965515851974, + "learning_rate": 0.00012303884063189163, + "loss": 0.284, + "step": 22038 + }, + { + "epoch": 1.785401814646792, + "grad_norm": 0.05137494578957558, + "learning_rate": 0.00012303433997929702, + "loss": 0.2445, + "step": 22039 + }, + { + "epoch": 1.7854828256642903, + "grad_norm": 0.05669848993420601, + "learning_rate": 0.00012302983932670238, + "loss": 0.2642, + "step": 22040 + }, + { + "epoch": 1.7855638366817888, + "grad_norm": 0.054273076355457306, + "learning_rate": 0.00012302533867410774, + "loss": 0.2527, + "step": 22041 + }, + { + "epoch": 1.7856448476992872, + "grad_norm": 0.051785897463560104, + "learning_rate": 0.00012302083802151313, + "loss": 0.2829, + "step": 22042 + }, + { + "epoch": 1.7857258587167855, + "grad_norm": 0.06361639499664307, + "learning_rate": 0.0001230163373689185, + "loss": 0.3006, + "step": 22043 + }, + { + "epoch": 1.7858068697342837, + "grad_norm": 0.05482267215847969, + "learning_rate": 0.00012301183671632388, + "loss": 0.2613, + "step": 22044 + }, + { + "epoch": 1.7858878807517824, + "grad_norm": 0.06259823590517044, + "learning_rate": 0.00012300733606372926, + "loss": 0.3061, + "step": 22045 + }, + { + "epoch": 1.7859688917692806, + "grad_norm": 0.047054145485162735, + "learning_rate": 0.00012300283541113462, + "loss": 0.2448, + "step": 22046 + }, + { + "epoch": 1.7860499027867789, + "grad_norm": 0.06172563135623932, + "learning_rate": 0.00012299833475853998, + "loss": 0.3046, + "step": 22047 + }, + { + "epoch": 1.7861309138042774, + "grad_norm": 0.04755101725459099, + "learning_rate": 0.00012299383410594537, + "loss": 0.2811, + "step": 22048 + }, + { + "epoch": 1.7862119248217758, + "grad_norm": 0.057041235268116, + "learning_rate": 0.00012298933345335076, + "loss": 0.2506, + "step": 22049 + }, + { + "epoch": 1.786292935839274, + "grad_norm": 0.05601425841450691, + "learning_rate": 0.00012298483280075612, + "loss": 0.3156, + "step": 22050 + }, + { + "epoch": 1.7863739468567725, + "grad_norm": 0.04828924313187599, + "learning_rate": 0.0001229803321481615, + "loss": 0.2683, + "step": 22051 + }, + { + "epoch": 1.786454957874271, + "grad_norm": 0.06615084409713745, + "learning_rate": 0.00012297583149556687, + "loss": 0.2838, + "step": 22052 + }, + { + "epoch": 1.7865359688917692, + "grad_norm": 0.05322642624378204, + "learning_rate": 0.00012297133084297223, + "loss": 0.2804, + "step": 22053 + }, + { + "epoch": 1.7866169799092677, + "grad_norm": 0.055921897292137146, + "learning_rate": 0.0001229668301903776, + "loss": 0.3104, + "step": 22054 + }, + { + "epoch": 1.7866979909267662, + "grad_norm": 0.06767680495977402, + "learning_rate": 0.000122962329537783, + "loss": 0.3315, + "step": 22055 + }, + { + "epoch": 1.7867790019442644, + "grad_norm": 0.06112572178244591, + "learning_rate": 0.00012295782888518836, + "loss": 0.244, + "step": 
22056 + }, + { + "epoch": 1.7868600129617627, + "grad_norm": 0.05754351243376732, + "learning_rate": 0.00012295332823259375, + "loss": 0.2735, + "step": 22057 + }, + { + "epoch": 1.7869410239792611, + "grad_norm": 0.04415886104106903, + "learning_rate": 0.0001229488275799991, + "loss": 0.2288, + "step": 22058 + }, + { + "epoch": 1.7870220349967596, + "grad_norm": 0.04353123903274536, + "learning_rate": 0.00012294432692740447, + "loss": 0.2522, + "step": 22059 + }, + { + "epoch": 1.7871030460142578, + "grad_norm": 0.05958475172519684, + "learning_rate": 0.00012293982627480985, + "loss": 0.2736, + "step": 22060 + }, + { + "epoch": 1.7871840570317563, + "grad_norm": 0.056693777441978455, + "learning_rate": 0.00012293532562221524, + "loss": 0.2444, + "step": 22061 + }, + { + "epoch": 1.7872650680492548, + "grad_norm": 0.0533333495259285, + "learning_rate": 0.0001229308249696206, + "loss": 0.2346, + "step": 22062 + }, + { + "epoch": 1.787346079066753, + "grad_norm": 0.05624905973672867, + "learning_rate": 0.000122926324317026, + "loss": 0.2425, + "step": 22063 + }, + { + "epoch": 1.7874270900842515, + "grad_norm": 0.056994516402482986, + "learning_rate": 0.00012292182366443135, + "loss": 0.3258, + "step": 22064 + }, + { + "epoch": 1.78750810110175, + "grad_norm": 0.058526236563920975, + "learning_rate": 0.0001229173230118367, + "loss": 0.2613, + "step": 22065 + }, + { + "epoch": 1.7875891121192482, + "grad_norm": 0.05824572220444679, + "learning_rate": 0.0001229128223592421, + "loss": 0.2641, + "step": 22066 + }, + { + "epoch": 1.7876701231367464, + "grad_norm": 0.06035739183425903, + "learning_rate": 0.00012290832170664748, + "loss": 0.272, + "step": 22067 + }, + { + "epoch": 1.7877511341542451, + "grad_norm": 0.06125498563051224, + "learning_rate": 0.00012290382105405284, + "loss": 0.2784, + "step": 22068 + }, + { + "epoch": 1.7878321451717434, + "grad_norm": 0.050116147845983505, + "learning_rate": 0.00012289932040145823, + "loss": 0.2784, + "step": 22069 + }, + { + "epoch": 1.7879131561892416, + "grad_norm": 0.05024263635277748, + "learning_rate": 0.0001228948197488636, + "loss": 0.2598, + "step": 22070 + }, + { + "epoch": 1.78799416720674, + "grad_norm": 0.052744414657354355, + "learning_rate": 0.00012289031909626895, + "loss": 0.2655, + "step": 22071 + }, + { + "epoch": 1.7880751782242386, + "grad_norm": 0.062270790338516235, + "learning_rate": 0.00012288581844367434, + "loss": 0.2957, + "step": 22072 + }, + { + "epoch": 1.7881561892417368, + "grad_norm": 0.052221402525901794, + "learning_rate": 0.00012288131779107972, + "loss": 0.2482, + "step": 22073 + }, + { + "epoch": 1.7882372002592353, + "grad_norm": 0.04572257772088051, + "learning_rate": 0.00012287681713848508, + "loss": 0.2437, + "step": 22074 + }, + { + "epoch": 1.7883182112767337, + "grad_norm": 0.055134136229753494, + "learning_rate": 0.00012287231648589047, + "loss": 0.3352, + "step": 22075 + }, + { + "epoch": 1.788399222294232, + "grad_norm": 0.04828047752380371, + "learning_rate": 0.00012286781583329583, + "loss": 0.2628, + "step": 22076 + }, + { + "epoch": 1.7884802333117304, + "grad_norm": 0.05978702753782272, + "learning_rate": 0.0001228633151807012, + "loss": 0.2827, + "step": 22077 + }, + { + "epoch": 1.788561244329229, + "grad_norm": 0.04657856374979019, + "learning_rate": 0.00012285881452810658, + "loss": 0.2604, + "step": 22078 + }, + { + "epoch": 1.7886422553467272, + "grad_norm": 0.05295327305793762, + "learning_rate": 0.00012285431387551197, + "loss": 0.2527, + "step": 22079 + }, + { + "epoch": 
1.7887232663642254, + "grad_norm": 0.061362508684396744, + "learning_rate": 0.00012284981322291733, + "loss": 0.2992, + "step": 22080 + }, + { + "epoch": 1.7888042773817239, + "grad_norm": 0.060083989053964615, + "learning_rate": 0.0001228453125703227, + "loss": 0.2715, + "step": 22081 + }, + { + "epoch": 1.7888852883992223, + "grad_norm": 0.051934026181697845, + "learning_rate": 0.00012284081191772807, + "loss": 0.2642, + "step": 22082 + }, + { + "epoch": 1.7889662994167206, + "grad_norm": 0.042310990393161774, + "learning_rate": 0.00012283631126513343, + "loss": 0.2597, + "step": 22083 + }, + { + "epoch": 1.789047310434219, + "grad_norm": 0.052813880145549774, + "learning_rate": 0.00012283181061253882, + "loss": 0.2509, + "step": 22084 + }, + { + "epoch": 1.7891283214517175, + "grad_norm": 0.059147439897060394, + "learning_rate": 0.0001228273099599442, + "loss": 0.3004, + "step": 22085 + }, + { + "epoch": 1.7892093324692158, + "grad_norm": 0.0584903247654438, + "learning_rate": 0.00012282280930734957, + "loss": 0.2706, + "step": 22086 + }, + { + "epoch": 1.7892903434867142, + "grad_norm": 0.061702847480773926, + "learning_rate": 0.00012281830865475496, + "loss": 0.2919, + "step": 22087 + }, + { + "epoch": 1.7893713545042127, + "grad_norm": 0.06372623145580292, + "learning_rate": 0.00012281380800216032, + "loss": 0.2651, + "step": 22088 + }, + { + "epoch": 1.789452365521711, + "grad_norm": 0.06399668008089066, + "learning_rate": 0.00012280930734956568, + "loss": 0.2835, + "step": 22089 + }, + { + "epoch": 1.7895333765392092, + "grad_norm": 0.052177559584379196, + "learning_rate": 0.00012280480669697106, + "loss": 0.2472, + "step": 22090 + }, + { + "epoch": 1.7896143875567079, + "grad_norm": 0.05263921990990639, + "learning_rate": 0.00012280030604437645, + "loss": 0.2679, + "step": 22091 + }, + { + "epoch": 1.7896953985742061, + "grad_norm": 0.05937456712126732, + "learning_rate": 0.0001227958053917818, + "loss": 0.3181, + "step": 22092 + }, + { + "epoch": 1.7897764095917044, + "grad_norm": 0.057407818734645844, + "learning_rate": 0.0001227913047391872, + "loss": 0.278, + "step": 22093 + }, + { + "epoch": 1.7898574206092028, + "grad_norm": 0.05732586607336998, + "learning_rate": 0.00012278680408659256, + "loss": 0.2701, + "step": 22094 + }, + { + "epoch": 1.7899384316267013, + "grad_norm": 0.05332402139902115, + "learning_rate": 0.00012278230343399792, + "loss": 0.2583, + "step": 22095 + }, + { + "epoch": 1.7900194426441995, + "grad_norm": 0.05595818907022476, + "learning_rate": 0.0001227778027814033, + "loss": 0.2632, + "step": 22096 + }, + { + "epoch": 1.790100453661698, + "grad_norm": 0.07867451012134552, + "learning_rate": 0.0001227733021288087, + "loss": 0.312, + "step": 22097 + }, + { + "epoch": 1.7901814646791965, + "grad_norm": 0.05248883366584778, + "learning_rate": 0.00012276880147621405, + "loss": 0.2347, + "step": 22098 + }, + { + "epoch": 1.7902624756966947, + "grad_norm": 0.06724679470062256, + "learning_rate": 0.00012276430082361944, + "loss": 0.2738, + "step": 22099 + }, + { + "epoch": 1.790343486714193, + "grad_norm": 0.0722748190164566, + "learning_rate": 0.0001227598001710248, + "loss": 0.283, + "step": 22100 + }, + { + "epoch": 1.7904244977316917, + "grad_norm": 0.05698879063129425, + "learning_rate": 0.00012275529951843019, + "loss": 0.3087, + "step": 22101 + }, + { + "epoch": 1.79050550874919, + "grad_norm": 0.05652936175465584, + "learning_rate": 0.00012275079886583555, + "loss": 0.3073, + "step": 22102 + }, + { + "epoch": 1.7905865197666881, + "grad_norm": 
0.04519682005047798, + "learning_rate": 0.00012274629821324093, + "loss": 0.2467, + "step": 22103 + }, + { + "epoch": 1.7906675307841866, + "grad_norm": 0.05654732510447502, + "learning_rate": 0.0001227417975606463, + "loss": 0.2601, + "step": 22104 + }, + { + "epoch": 1.790748541801685, + "grad_norm": 0.06253427267074585, + "learning_rate": 0.00012273729690805168, + "loss": 0.2781, + "step": 22105 + }, + { + "epoch": 1.7908295528191833, + "grad_norm": 0.045378677546978, + "learning_rate": 0.00012273279625545704, + "loss": 0.2848, + "step": 22106 + }, + { + "epoch": 1.7909105638366818, + "grad_norm": 0.056497104465961456, + "learning_rate": 0.00012272829560286243, + "loss": 0.2824, + "step": 22107 + }, + { + "epoch": 1.7909915748541803, + "grad_norm": 0.05572156980633736, + "learning_rate": 0.00012272379495026781, + "loss": 0.3007, + "step": 22108 + }, + { + "epoch": 1.7910725858716785, + "grad_norm": 0.05481686443090439, + "learning_rate": 0.00012271929429767317, + "loss": 0.2619, + "step": 22109 + }, + { + "epoch": 1.791153596889177, + "grad_norm": 0.053460247814655304, + "learning_rate": 0.00012271479364507853, + "loss": 0.2852, + "step": 22110 + }, + { + "epoch": 1.7912346079066754, + "grad_norm": 0.05808936804533005, + "learning_rate": 0.00012271029299248392, + "loss": 0.2542, + "step": 22111 + }, + { + "epoch": 1.7913156189241737, + "grad_norm": 0.04818173125386238, + "learning_rate": 0.00012270579233988928, + "loss": 0.2825, + "step": 22112 + }, + { + "epoch": 1.791396629941672, + "grad_norm": 0.05348360911011696, + "learning_rate": 0.00012270129168729467, + "loss": 0.2481, + "step": 22113 + }, + { + "epoch": 1.7914776409591704, + "grad_norm": 0.04549575224518776, + "learning_rate": 0.00012269679103470006, + "loss": 0.238, + "step": 22114 + }, + { + "epoch": 1.7915586519766689, + "grad_norm": 0.05501910299062729, + "learning_rate": 0.00012269229038210542, + "loss": 0.2706, + "step": 22115 + }, + { + "epoch": 1.791639662994167, + "grad_norm": 0.05364497750997543, + "learning_rate": 0.00012268778972951078, + "loss": 0.275, + "step": 22116 + }, + { + "epoch": 1.7917206740116656, + "grad_norm": 0.0561637319624424, + "learning_rate": 0.00012268328907691616, + "loss": 0.2714, + "step": 22117 + }, + { + "epoch": 1.791801685029164, + "grad_norm": 0.06201014667749405, + "learning_rate": 0.00012267878842432152, + "loss": 0.2872, + "step": 22118 + }, + { + "epoch": 1.7918826960466623, + "grad_norm": 0.064414381980896, + "learning_rate": 0.0001226742877717269, + "loss": 0.2662, + "step": 22119 + }, + { + "epoch": 1.7919637070641607, + "grad_norm": 0.05276533216238022, + "learning_rate": 0.0001226697871191323, + "loss": 0.2668, + "step": 22120 + }, + { + "epoch": 1.7920447180816592, + "grad_norm": 0.06760087609291077, + "learning_rate": 0.00012266528646653766, + "loss": 0.2928, + "step": 22121 + }, + { + "epoch": 1.7921257290991575, + "grad_norm": 0.0584164634346962, + "learning_rate": 0.00012266078581394302, + "loss": 0.2329, + "step": 22122 + }, + { + "epoch": 1.7922067401166557, + "grad_norm": 0.049879640340805054, + "learning_rate": 0.0001226562851613484, + "loss": 0.2632, + "step": 22123 + }, + { + "epoch": 1.7922877511341544, + "grad_norm": 0.06702165305614471, + "learning_rate": 0.00012265178450875376, + "loss": 0.3171, + "step": 22124 + }, + { + "epoch": 1.7923687621516526, + "grad_norm": 0.0711006447672844, + "learning_rate": 0.00012264728385615915, + "loss": 0.3067, + "step": 22125 + }, + { + "epoch": 1.7924497731691509, + "grad_norm": 0.057895757257938385, + "learning_rate": 
0.00012264278320356454, + "loss": 0.2707, + "step": 22126 + }, + { + "epoch": 1.7925307841866494, + "grad_norm": 0.048520952463150024, + "learning_rate": 0.0001226382825509699, + "loss": 0.2701, + "step": 22127 + }, + { + "epoch": 1.7926117952041478, + "grad_norm": 0.060298990458250046, + "learning_rate": 0.00012263378189837526, + "loss": 0.2497, + "step": 22128 + }, + { + "epoch": 1.792692806221646, + "grad_norm": 0.052584659308195114, + "learning_rate": 0.00012262928124578065, + "loss": 0.3016, + "step": 22129 + }, + { + "epoch": 1.7927738172391445, + "grad_norm": 0.04596573859453201, + "learning_rate": 0.00012262478059318603, + "loss": 0.2578, + "step": 22130 + }, + { + "epoch": 1.792854828256643, + "grad_norm": 0.05442513898015022, + "learning_rate": 0.0001226202799405914, + "loss": 0.2382, + "step": 22131 + }, + { + "epoch": 1.7929358392741412, + "grad_norm": 0.055967625230550766, + "learning_rate": 0.00012261577928799678, + "loss": 0.2909, + "step": 22132 + }, + { + "epoch": 1.7930168502916397, + "grad_norm": 0.06419744342565536, + "learning_rate": 0.00012261127863540214, + "loss": 0.273, + "step": 22133 + }, + { + "epoch": 1.7930978613091382, + "grad_norm": 0.050524089485406876, + "learning_rate": 0.0001226067779828075, + "loss": 0.2832, + "step": 22134 + }, + { + "epoch": 1.7931788723266364, + "grad_norm": 0.05179031565785408, + "learning_rate": 0.0001226022773302129, + "loss": 0.3183, + "step": 22135 + }, + { + "epoch": 1.7932598833441347, + "grad_norm": 0.049778230488300323, + "learning_rate": 0.00012259777667761828, + "loss": 0.2723, + "step": 22136 + }, + { + "epoch": 1.7933408943616331, + "grad_norm": 0.05179633945226669, + "learning_rate": 0.00012259327602502364, + "loss": 0.2419, + "step": 22137 + }, + { + "epoch": 1.7934219053791316, + "grad_norm": 0.06944955140352249, + "learning_rate": 0.00012258877537242902, + "loss": 0.2636, + "step": 22138 + }, + { + "epoch": 1.7935029163966298, + "grad_norm": 0.05182751640677452, + "learning_rate": 0.00012258427471983438, + "loss": 0.2735, + "step": 22139 + }, + { + "epoch": 1.7935839274141283, + "grad_norm": 0.06322464346885681, + "learning_rate": 0.00012257977406723974, + "loss": 0.3229, + "step": 22140 + }, + { + "epoch": 1.7936649384316268, + "grad_norm": 0.04878616705536842, + "learning_rate": 0.00012257527341464513, + "loss": 0.2562, + "step": 22141 + }, + { + "epoch": 1.793745949449125, + "grad_norm": 0.05315592512488365, + "learning_rate": 0.00012257077276205052, + "loss": 0.302, + "step": 22142 + }, + { + "epoch": 1.7938269604666235, + "grad_norm": 0.056052401661872864, + "learning_rate": 0.00012256627210945588, + "loss": 0.2733, + "step": 22143 + }, + { + "epoch": 1.793907971484122, + "grad_norm": 0.05101040005683899, + "learning_rate": 0.00012256177145686126, + "loss": 0.2602, + "step": 22144 + }, + { + "epoch": 1.7939889825016202, + "grad_norm": 0.0631868839263916, + "learning_rate": 0.00012255727080426662, + "loss": 0.2495, + "step": 22145 + }, + { + "epoch": 1.7940699935191184, + "grad_norm": 0.0552617684006691, + "learning_rate": 0.00012255277015167198, + "loss": 0.2988, + "step": 22146 + }, + { + "epoch": 1.7941510045366171, + "grad_norm": 0.06353448331356049, + "learning_rate": 0.00012254826949907737, + "loss": 0.3198, + "step": 22147 + }, + { + "epoch": 1.7942320155541154, + "grad_norm": 0.053773414343595505, + "learning_rate": 0.00012254376884648276, + "loss": 0.2663, + "step": 22148 + }, + { + "epoch": 1.7943130265716136, + "grad_norm": 0.05944092199206352, + "learning_rate": 0.00012253926819388812, + "loss": 
0.2718, + "step": 22149 + }, + { + "epoch": 1.794394037589112, + "grad_norm": 0.06545424461364746, + "learning_rate": 0.0001225347675412935, + "loss": 0.2893, + "step": 22150 + }, + { + "epoch": 1.7944750486066106, + "grad_norm": 0.06336667388677597, + "learning_rate": 0.00012253026688869887, + "loss": 0.2825, + "step": 22151 + }, + { + "epoch": 1.7945560596241088, + "grad_norm": 0.0653478354215622, + "learning_rate": 0.00012252576623610423, + "loss": 0.3082, + "step": 22152 + }, + { + "epoch": 1.7946370706416073, + "grad_norm": 0.04935803264379501, + "learning_rate": 0.0001225212655835096, + "loss": 0.2803, + "step": 22153 + }, + { + "epoch": 1.7947180816591057, + "grad_norm": 0.06616376340389252, + "learning_rate": 0.000122516764930915, + "loss": 0.2731, + "step": 22154 + }, + { + "epoch": 1.794799092676604, + "grad_norm": 0.04812661558389664, + "learning_rate": 0.00012251226427832036, + "loss": 0.2609, + "step": 22155 + }, + { + "epoch": 1.7948801036941024, + "grad_norm": 0.048841774463653564, + "learning_rate": 0.00012250776362572575, + "loss": 0.2774, + "step": 22156 + }, + { + "epoch": 1.794961114711601, + "grad_norm": 0.06260140985250473, + "learning_rate": 0.0001225032629731311, + "loss": 0.3173, + "step": 22157 + }, + { + "epoch": 1.7950421257290992, + "grad_norm": 0.061257071793079376, + "learning_rate": 0.00012249876232053647, + "loss": 0.2533, + "step": 22158 + }, + { + "epoch": 1.7951231367465974, + "grad_norm": 0.04774602875113487, + "learning_rate": 0.00012249426166794185, + "loss": 0.2251, + "step": 22159 + }, + { + "epoch": 1.7952041477640959, + "grad_norm": 0.06033805012702942, + "learning_rate": 0.00012248976101534724, + "loss": 0.2844, + "step": 22160 + }, + { + "epoch": 1.7952851587815943, + "grad_norm": 0.0505126528441906, + "learning_rate": 0.0001224852603627526, + "loss": 0.2389, + "step": 22161 + }, + { + "epoch": 1.7953661697990926, + "grad_norm": 0.053559351712465286, + "learning_rate": 0.000122480759710158, + "loss": 0.2679, + "step": 22162 + }, + { + "epoch": 1.795447180816591, + "grad_norm": 0.05731525272130966, + "learning_rate": 0.00012247625905756335, + "loss": 0.2822, + "step": 22163 + }, + { + "epoch": 1.7955281918340895, + "grad_norm": 0.05027606710791588, + "learning_rate": 0.0001224717584049687, + "loss": 0.251, + "step": 22164 + }, + { + "epoch": 1.7956092028515878, + "grad_norm": 0.05858727917075157, + "learning_rate": 0.0001224672577523741, + "loss": 0.2776, + "step": 22165 + }, + { + "epoch": 1.7956902138690862, + "grad_norm": 0.0573890395462513, + "learning_rate": 0.00012246275709977948, + "loss": 0.3017, + "step": 22166 + }, + { + "epoch": 1.7957712248865847, + "grad_norm": 0.0540848970413208, + "learning_rate": 0.00012245825644718484, + "loss": 0.2345, + "step": 22167 + }, + { + "epoch": 1.795852235904083, + "grad_norm": 0.06505483388900757, + "learning_rate": 0.00012245375579459023, + "loss": 0.3001, + "step": 22168 + }, + { + "epoch": 1.7959332469215812, + "grad_norm": 0.05077476054430008, + "learning_rate": 0.0001224492551419956, + "loss": 0.2605, + "step": 22169 + }, + { + "epoch": 1.7960142579390799, + "grad_norm": 0.06261571496725082, + "learning_rate": 0.00012244475448940095, + "loss": 0.2615, + "step": 22170 + }, + { + "epoch": 1.7960952689565781, + "grad_norm": 0.05674409121274948, + "learning_rate": 0.00012244025383680634, + "loss": 0.2662, + "step": 22171 + }, + { + "epoch": 1.7961762799740764, + "grad_norm": 0.05636698380112648, + "learning_rate": 0.00012243575318421172, + "loss": 0.303, + "step": 22172 + }, + { + "epoch": 
1.7962572909915748, + "grad_norm": 0.05232566222548485, + "learning_rate": 0.00012243125253161708, + "loss": 0.2749, + "step": 22173 + }, + { + "epoch": 1.7963383020090733, + "grad_norm": 0.05814569815993309, + "learning_rate": 0.00012242675187902247, + "loss": 0.2433, + "step": 22174 + }, + { + "epoch": 1.7964193130265715, + "grad_norm": 0.05443737283349037, + "learning_rate": 0.00012242225122642783, + "loss": 0.2652, + "step": 22175 + }, + { + "epoch": 1.79650032404407, + "grad_norm": 0.056092407554388046, + "learning_rate": 0.0001224177505738332, + "loss": 0.2928, + "step": 22176 + }, + { + "epoch": 1.7965813350615685, + "grad_norm": 0.04900359734892845, + "learning_rate": 0.0001224132499212386, + "loss": 0.26, + "step": 22177 + }, + { + "epoch": 1.7966623460790667, + "grad_norm": 0.05376352742314339, + "learning_rate": 0.00012240874926864397, + "loss": 0.2808, + "step": 22178 + }, + { + "epoch": 1.7967433570965652, + "grad_norm": 0.054264407604932785, + "learning_rate": 0.00012240424861604933, + "loss": 0.2627, + "step": 22179 + }, + { + "epoch": 1.7968243681140637, + "grad_norm": 0.06078781932592392, + "learning_rate": 0.00012239974796345471, + "loss": 0.316, + "step": 22180 + }, + { + "epoch": 1.796905379131562, + "grad_norm": 0.0540931411087513, + "learning_rate": 0.00012239524731086007, + "loss": 0.2805, + "step": 22181 + }, + { + "epoch": 1.7969863901490601, + "grad_norm": 0.0742974653840065, + "learning_rate": 0.00012239074665826546, + "loss": 0.2752, + "step": 22182 + }, + { + "epoch": 1.7970674011665586, + "grad_norm": 0.05611839145421982, + "learning_rate": 0.00012238624600567085, + "loss": 0.2656, + "step": 22183 + }, + { + "epoch": 1.797148412184057, + "grad_norm": 0.06015536189079285, + "learning_rate": 0.0001223817453530762, + "loss": 0.3339, + "step": 22184 + }, + { + "epoch": 1.7972294232015553, + "grad_norm": 0.06692757457494736, + "learning_rate": 0.00012237724470048157, + "loss": 0.2854, + "step": 22185 + }, + { + "epoch": 1.7973104342190538, + "grad_norm": 0.0594097375869751, + "learning_rate": 0.00012237274404788696, + "loss": 0.2938, + "step": 22186 + }, + { + "epoch": 1.7973914452365523, + "grad_norm": 0.07080462574958801, + "learning_rate": 0.00012236824339529232, + "loss": 0.3276, + "step": 22187 + }, + { + "epoch": 1.7974724562540505, + "grad_norm": 0.06359456479549408, + "learning_rate": 0.0001223637427426977, + "loss": 0.2843, + "step": 22188 + }, + { + "epoch": 1.797553467271549, + "grad_norm": 0.056201670318841934, + "learning_rate": 0.0001223592420901031, + "loss": 0.3357, + "step": 22189 + }, + { + "epoch": 1.7976344782890474, + "grad_norm": 0.05176905542612076, + "learning_rate": 0.00012235474143750845, + "loss": 0.2508, + "step": 22190 + }, + { + "epoch": 1.7977154893065457, + "grad_norm": 0.04799371585249901, + "learning_rate": 0.0001223502407849138, + "loss": 0.2863, + "step": 22191 + }, + { + "epoch": 1.797796500324044, + "grad_norm": 0.05569925531744957, + "learning_rate": 0.0001223457401323192, + "loss": 0.2732, + "step": 22192 + }, + { + "epoch": 1.7978775113415426, + "grad_norm": 0.05573749542236328, + "learning_rate": 0.00012234123947972456, + "loss": 0.2429, + "step": 22193 + }, + { + "epoch": 1.7979585223590409, + "grad_norm": 0.06206778809428215, + "learning_rate": 0.00012233673882712994, + "loss": 0.3125, + "step": 22194 + }, + { + "epoch": 1.798039533376539, + "grad_norm": 0.05589371919631958, + "learning_rate": 0.00012233223817453533, + "loss": 0.3008, + "step": 22195 + }, + { + "epoch": 1.7981205443940376, + "grad_norm": 
0.05611521378159523, + "learning_rate": 0.0001223277375219407, + "loss": 0.2635, + "step": 22196 + }, + { + "epoch": 1.798201555411536, + "grad_norm": 0.057951584458351135, + "learning_rate": 0.00012232323686934605, + "loss": 0.2921, + "step": 22197 + }, + { + "epoch": 1.7982825664290343, + "grad_norm": 0.05601397156715393, + "learning_rate": 0.00012231873621675144, + "loss": 0.2778, + "step": 22198 + }, + { + "epoch": 1.7983635774465327, + "grad_norm": 0.06642493605613708, + "learning_rate": 0.0001223142355641568, + "loss": 0.2739, + "step": 22199 + }, + { + "epoch": 1.7984445884640312, + "grad_norm": 0.058593589812517166, + "learning_rate": 0.00012230973491156219, + "loss": 0.2916, + "step": 22200 + }, + { + "epoch": 1.7985255994815295, + "grad_norm": 0.06586059182882309, + "learning_rate": 0.00012230523425896757, + "loss": 0.3034, + "step": 22201 + }, + { + "epoch": 1.7986066104990277, + "grad_norm": 0.054388418793678284, + "learning_rate": 0.00012230073360637293, + "loss": 0.2716, + "step": 22202 + }, + { + "epoch": 1.7986876215165264, + "grad_norm": 0.059615492820739746, + "learning_rate": 0.0001222962329537783, + "loss": 0.3011, + "step": 22203 + }, + { + "epoch": 1.7987686325340246, + "grad_norm": 0.06519076973199844, + "learning_rate": 0.00012229173230118368, + "loss": 0.2707, + "step": 22204 + }, + { + "epoch": 1.7988496435515229, + "grad_norm": 0.05637694150209427, + "learning_rate": 0.00012228723164858907, + "loss": 0.2512, + "step": 22205 + }, + { + "epoch": 1.7989306545690213, + "grad_norm": 0.05849280208349228, + "learning_rate": 0.00012228273099599443, + "loss": 0.3303, + "step": 22206 + }, + { + "epoch": 1.7990116655865198, + "grad_norm": 0.06378426402807236, + "learning_rate": 0.00012227823034339981, + "loss": 0.2797, + "step": 22207 + }, + { + "epoch": 1.799092676604018, + "grad_norm": 0.05572065711021423, + "learning_rate": 0.00012227372969080517, + "loss": 0.2524, + "step": 22208 + }, + { + "epoch": 1.7991736876215165, + "grad_norm": 0.0632311999797821, + "learning_rate": 0.00012226922903821053, + "loss": 0.2885, + "step": 22209 + }, + { + "epoch": 1.799254698639015, + "grad_norm": 0.068609319627285, + "learning_rate": 0.00012226472838561592, + "loss": 0.2961, + "step": 22210 + }, + { + "epoch": 1.7993357096565132, + "grad_norm": 0.05970097705721855, + "learning_rate": 0.0001222602277330213, + "loss": 0.2667, + "step": 22211 + }, + { + "epoch": 1.7994167206740117, + "grad_norm": 0.05331006273627281, + "learning_rate": 0.00012225572708042667, + "loss": 0.2654, + "step": 22212 + }, + { + "epoch": 1.7994977316915102, + "grad_norm": 0.057156920433044434, + "learning_rate": 0.00012225122642783206, + "loss": 0.2611, + "step": 22213 + }, + { + "epoch": 1.7995787427090084, + "grad_norm": 0.04627132788300514, + "learning_rate": 0.00012224672577523742, + "loss": 0.2464, + "step": 22214 + }, + { + "epoch": 1.7996597537265067, + "grad_norm": 0.07109607756137848, + "learning_rate": 0.00012224222512264278, + "loss": 0.3506, + "step": 22215 + }, + { + "epoch": 1.7997407647440054, + "grad_norm": 0.057411983609199524, + "learning_rate": 0.00012223772447004816, + "loss": 0.2649, + "step": 22216 + }, + { + "epoch": 1.7998217757615036, + "grad_norm": 0.06819972395896912, + "learning_rate": 0.00012223322381745355, + "loss": 0.3188, + "step": 22217 + }, + { + "epoch": 1.7999027867790018, + "grad_norm": 0.05879756435751915, + "learning_rate": 0.0001222287231648589, + "loss": 0.2628, + "step": 22218 + }, + { + "epoch": 1.7999837977965003, + "grad_norm": 0.049354761838912964, + "learning_rate": 
0.0001222242225122643, + "loss": 0.2617, + "step": 22219 + }, + { + "epoch": 1.8000648088139988, + "grad_norm": 0.04880581423640251, + "learning_rate": 0.00012221972185966966, + "loss": 0.2302, + "step": 22220 + }, + { + "epoch": 1.800145819831497, + "grad_norm": 0.0527828149497509, + "learning_rate": 0.00012221522120707502, + "loss": 0.266, + "step": 22221 + }, + { + "epoch": 1.8002268308489955, + "grad_norm": 0.05588414892554283, + "learning_rate": 0.0001222107205544804, + "loss": 0.2983, + "step": 22222 + }, + { + "epoch": 1.800307841866494, + "grad_norm": 0.056125205010175705, + "learning_rate": 0.0001222062199018858, + "loss": 0.2798, + "step": 22223 + }, + { + "epoch": 1.8003888528839922, + "grad_norm": 0.05799616128206253, + "learning_rate": 0.00012220171924929115, + "loss": 0.26, + "step": 22224 + }, + { + "epoch": 1.8004698639014904, + "grad_norm": 0.05869988352060318, + "learning_rate": 0.00012219721859669654, + "loss": 0.3042, + "step": 22225 + }, + { + "epoch": 1.8005508749189891, + "grad_norm": 0.0588398352265358, + "learning_rate": 0.0001221927179441019, + "loss": 0.2731, + "step": 22226 + }, + { + "epoch": 1.8006318859364874, + "grad_norm": 0.05252890661358833, + "learning_rate": 0.00012218821729150726, + "loss": 0.2559, + "step": 22227 + }, + { + "epoch": 1.8007128969539856, + "grad_norm": 0.04973560944199562, + "learning_rate": 0.00012218371663891265, + "loss": 0.2765, + "step": 22228 + }, + { + "epoch": 1.800793907971484, + "grad_norm": 0.05808442831039429, + "learning_rate": 0.00012217921598631803, + "loss": 0.272, + "step": 22229 + }, + { + "epoch": 1.8008749189889826, + "grad_norm": 0.052194830030202866, + "learning_rate": 0.0001221747153337234, + "loss": 0.2757, + "step": 22230 + }, + { + "epoch": 1.8009559300064808, + "grad_norm": 0.06407959759235382, + "learning_rate": 0.00012217021468112878, + "loss": 0.3018, + "step": 22231 + }, + { + "epoch": 1.8010369410239793, + "grad_norm": 0.048292580991983414, + "learning_rate": 0.00012216571402853414, + "loss": 0.2565, + "step": 22232 + }, + { + "epoch": 1.8011179520414777, + "grad_norm": 0.06521427631378174, + "learning_rate": 0.0001221612133759395, + "loss": 0.2806, + "step": 22233 + }, + { + "epoch": 1.801198963058976, + "grad_norm": 0.06741520762443542, + "learning_rate": 0.0001221567127233449, + "loss": 0.2611, + "step": 22234 + }, + { + "epoch": 1.8012799740764744, + "grad_norm": 0.05765705928206444, + "learning_rate": 0.00012215221207075028, + "loss": 0.3098, + "step": 22235 + }, + { + "epoch": 1.801360985093973, + "grad_norm": 0.05621275678277016, + "learning_rate": 0.00012214771141815564, + "loss": 0.2491, + "step": 22236 + }, + { + "epoch": 1.8014419961114712, + "grad_norm": 0.05360456183552742, + "learning_rate": 0.00012214321076556102, + "loss": 0.2526, + "step": 22237 + }, + { + "epoch": 1.8015230071289694, + "grad_norm": 0.06470076739788055, + "learning_rate": 0.00012213871011296638, + "loss": 0.291, + "step": 22238 + }, + { + "epoch": 1.8016040181464679, + "grad_norm": 0.06331278383731842, + "learning_rate": 0.00012213420946037174, + "loss": 0.2857, + "step": 22239 + }, + { + "epoch": 1.8016850291639663, + "grad_norm": 0.06826566159725189, + "learning_rate": 0.00012212970880777713, + "loss": 0.2561, + "step": 22240 + }, + { + "epoch": 1.8017660401814646, + "grad_norm": 0.05896066501736641, + "learning_rate": 0.00012212520815518252, + "loss": 0.2735, + "step": 22241 + }, + { + "epoch": 1.801847051198963, + "grad_norm": 0.0709279254078865, + "learning_rate": 0.00012212070750258788, + "loss": 0.2911, + "step": 
22242 + }, + { + "epoch": 1.8019280622164615, + "grad_norm": 0.05172071233391762, + "learning_rate": 0.00012211620684999326, + "loss": 0.261, + "step": 22243 + }, + { + "epoch": 1.8020090732339598, + "grad_norm": 0.06091113016009331, + "learning_rate": 0.00012211170619739862, + "loss": 0.2672, + "step": 22244 + }, + { + "epoch": 1.8020900842514582, + "grad_norm": 0.04631185904145241, + "learning_rate": 0.00012210720554480398, + "loss": 0.2359, + "step": 22245 + }, + { + "epoch": 1.8021710952689567, + "grad_norm": 0.06016520410776138, + "learning_rate": 0.0001221027048922094, + "loss": 0.2634, + "step": 22246 + }, + { + "epoch": 1.802252106286455, + "grad_norm": 0.05439654365181923, + "learning_rate": 0.00012209820423961476, + "loss": 0.3049, + "step": 22247 + }, + { + "epoch": 1.8023331173039532, + "grad_norm": 0.05835814028978348, + "learning_rate": 0.00012209370358702012, + "loss": 0.2788, + "step": 22248 + }, + { + "epoch": 1.8024141283214519, + "grad_norm": 0.04965536668896675, + "learning_rate": 0.0001220892029344255, + "loss": 0.2963, + "step": 22249 + }, + { + "epoch": 1.8024951393389501, + "grad_norm": 0.048756249248981476, + "learning_rate": 0.00012208470228183087, + "loss": 0.2227, + "step": 22250 + }, + { + "epoch": 1.8025761503564484, + "grad_norm": 0.059522707015275955, + "learning_rate": 0.00012208020162923623, + "loss": 0.2688, + "step": 22251 + }, + { + "epoch": 1.8026571613739468, + "grad_norm": 0.07265980541706085, + "learning_rate": 0.00012207570097664164, + "loss": 0.2594, + "step": 22252 + }, + { + "epoch": 1.8027381723914453, + "grad_norm": 0.05598118528723717, + "learning_rate": 0.000122071200324047, + "loss": 0.2712, + "step": 22253 + }, + { + "epoch": 1.8028191834089435, + "grad_norm": 0.05440227687358856, + "learning_rate": 0.00012206669967145236, + "loss": 0.2689, + "step": 22254 + }, + { + "epoch": 1.802900194426442, + "grad_norm": 0.062089111655950546, + "learning_rate": 0.00012206219901885773, + "loss": 0.2761, + "step": 22255 + }, + { + "epoch": 1.8029812054439405, + "grad_norm": 0.07075740396976471, + "learning_rate": 0.00012205769836626311, + "loss": 0.2533, + "step": 22256 + }, + { + "epoch": 1.8030622164614387, + "grad_norm": 0.07244139164686203, + "learning_rate": 0.00012205319771366848, + "loss": 0.266, + "step": 22257 + }, + { + "epoch": 1.8031432274789372, + "grad_norm": 0.05973348021507263, + "learning_rate": 0.00012204869706107387, + "loss": 0.2593, + "step": 22258 + }, + { + "epoch": 1.8032242384964356, + "grad_norm": 0.06082494184374809, + "learning_rate": 0.00012204419640847924, + "loss": 0.2516, + "step": 22259 + }, + { + "epoch": 1.803305249513934, + "grad_norm": 0.06084740534424782, + "learning_rate": 0.00012203969575588462, + "loss": 0.286, + "step": 22260 + }, + { + "epoch": 1.8033862605314321, + "grad_norm": 0.046079982072114944, + "learning_rate": 0.00012203519510328998, + "loss": 0.2514, + "step": 22261 + }, + { + "epoch": 1.8034672715489306, + "grad_norm": 0.04780830070376396, + "learning_rate": 0.00012203069445069535, + "loss": 0.2542, + "step": 22262 + }, + { + "epoch": 1.803548282566429, + "grad_norm": 0.05270274356007576, + "learning_rate": 0.00012202619379810074, + "loss": 0.2598, + "step": 22263 + }, + { + "epoch": 1.8036292935839273, + "grad_norm": 0.05802077427506447, + "learning_rate": 0.00012202169314550611, + "loss": 0.2476, + "step": 22264 + }, + { + "epoch": 1.8037103046014258, + "grad_norm": 0.043057069182395935, + "learning_rate": 0.00012201719249291148, + "loss": 0.2448, + "step": 22265 + }, + { + "epoch": 
1.8037913156189243, + "grad_norm": 0.06099724769592285, + "learning_rate": 0.00012201269184031686, + "loss": 0.2601, + "step": 22266 + }, + { + "epoch": 1.8038723266364225, + "grad_norm": 0.07051218301057816, + "learning_rate": 0.00012200819118772222, + "loss": 0.2879, + "step": 22267 + }, + { + "epoch": 1.803953337653921, + "grad_norm": 0.05437973514199257, + "learning_rate": 0.00012200369053512759, + "loss": 0.2669, + "step": 22268 + }, + { + "epoch": 1.8040343486714194, + "grad_norm": 0.06001882627606392, + "learning_rate": 0.00012199918988253298, + "loss": 0.2822, + "step": 22269 + }, + { + "epoch": 1.8041153596889177, + "grad_norm": 0.08208142966032028, + "learning_rate": 0.00012199468922993835, + "loss": 0.2962, + "step": 22270 + }, + { + "epoch": 1.804196370706416, + "grad_norm": 0.055241961032152176, + "learning_rate": 0.00012199018857734373, + "loss": 0.2445, + "step": 22271 + }, + { + "epoch": 1.8042773817239146, + "grad_norm": 0.05752910301089287, + "learning_rate": 0.0001219856879247491, + "loss": 0.2782, + "step": 22272 + }, + { + "epoch": 1.8043583927414129, + "grad_norm": 0.06530386209487915, + "learning_rate": 0.00012198118727215446, + "loss": 0.304, + "step": 22273 + }, + { + "epoch": 1.804439403758911, + "grad_norm": 0.05831904336810112, + "learning_rate": 0.00012197668661955983, + "loss": 0.2749, + "step": 22274 + }, + { + "epoch": 1.8045204147764096, + "grad_norm": 0.05550654977560043, + "learning_rate": 0.00012197218596696522, + "loss": 0.2881, + "step": 22275 + }, + { + "epoch": 1.804601425793908, + "grad_norm": 0.05844729021191597, + "learning_rate": 0.00012196768531437059, + "loss": 0.2886, + "step": 22276 + }, + { + "epoch": 1.8046824368114063, + "grad_norm": 0.05544528737664223, + "learning_rate": 0.00012196318466177597, + "loss": 0.2513, + "step": 22277 + }, + { + "epoch": 1.8047634478289047, + "grad_norm": 0.05155603587627411, + "learning_rate": 0.00012195868400918134, + "loss": 0.3023, + "step": 22278 + }, + { + "epoch": 1.8048444588464032, + "grad_norm": 0.050557348877191544, + "learning_rate": 0.0001219541833565867, + "loss": 0.2795, + "step": 22279 + }, + { + "epoch": 1.8049254698639015, + "grad_norm": 0.06218627095222473, + "learning_rate": 0.00012194968270399207, + "loss": 0.2854, + "step": 22280 + }, + { + "epoch": 1.8050064808814, + "grad_norm": 0.05242801457643509, + "learning_rate": 0.00012194518205139746, + "loss": 0.237, + "step": 22281 + }, + { + "epoch": 1.8050874918988984, + "grad_norm": 0.05768498033285141, + "learning_rate": 0.00012194068139880283, + "loss": 0.249, + "step": 22282 + }, + { + "epoch": 1.8051685029163966, + "grad_norm": 0.052621133625507355, + "learning_rate": 0.00012193618074620821, + "loss": 0.2474, + "step": 22283 + }, + { + "epoch": 1.8052495139338949, + "grad_norm": 0.053806059062480927, + "learning_rate": 0.00012193168009361358, + "loss": 0.2419, + "step": 22284 + }, + { + "epoch": 1.8053305249513933, + "grad_norm": 0.055902380496263504, + "learning_rate": 0.00012192717944101894, + "loss": 0.252, + "step": 22285 + }, + { + "epoch": 1.8054115359688918, + "grad_norm": 0.04919757321476936, + "learning_rate": 0.00012192267878842434, + "loss": 0.2879, + "step": 22286 + }, + { + "epoch": 1.80549254698639, + "grad_norm": 0.05439651384949684, + "learning_rate": 0.0001219181781358297, + "loss": 0.2766, + "step": 22287 + }, + { + "epoch": 1.8055735580038885, + "grad_norm": 0.07050889730453491, + "learning_rate": 0.00012191367748323508, + "loss": 0.3027, + "step": 22288 + }, + { + "epoch": 1.805654569021387, + "grad_norm": 
0.05229153111577034, + "learning_rate": 0.00012190917683064045, + "loss": 0.2309, + "step": 22289 + }, + { + "epoch": 1.8057355800388852, + "grad_norm": 0.06367629021406174, + "learning_rate": 0.00012190467617804582, + "loss": 0.2515, + "step": 22290 + }, + { + "epoch": 1.8058165910563837, + "grad_norm": 0.06692226976156235, + "learning_rate": 0.00012190017552545118, + "loss": 0.2764, + "step": 22291 + }, + { + "epoch": 1.8058976020738822, + "grad_norm": 0.05514880269765854, + "learning_rate": 0.00012189567487285658, + "loss": 0.2416, + "step": 22292 + }, + { + "epoch": 1.8059786130913804, + "grad_norm": 0.07039035856723785, + "learning_rate": 0.00012189117422026194, + "loss": 0.3171, + "step": 22293 + }, + { + "epoch": 1.8060596241088787, + "grad_norm": 0.06593021005392075, + "learning_rate": 0.00012188667356766732, + "loss": 0.2675, + "step": 22294 + }, + { + "epoch": 1.8061406351263773, + "grad_norm": 0.059571146965026855, + "learning_rate": 0.00012188217291507269, + "loss": 0.3039, + "step": 22295 + }, + { + "epoch": 1.8062216461438756, + "grad_norm": 0.0600927360355854, + "learning_rate": 0.00012187767226247807, + "loss": 0.256, + "step": 22296 + }, + { + "epoch": 1.8063026571613738, + "grad_norm": 0.04874737560749054, + "learning_rate": 0.00012187317160988343, + "loss": 0.2198, + "step": 22297 + }, + { + "epoch": 1.8063836681788723, + "grad_norm": 0.05537532642483711, + "learning_rate": 0.00012186867095728883, + "loss": 0.2644, + "step": 22298 + }, + { + "epoch": 1.8064646791963708, + "grad_norm": 0.049281734973192215, + "learning_rate": 0.00012186417030469419, + "loss": 0.2587, + "step": 22299 + }, + { + "epoch": 1.806545690213869, + "grad_norm": 0.06185423582792282, + "learning_rate": 0.00012185966965209956, + "loss": 0.2584, + "step": 22300 + }, + { + "epoch": 1.8066267012313675, + "grad_norm": 0.06315252184867859, + "learning_rate": 0.00012185516899950493, + "loss": 0.2874, + "step": 22301 + }, + { + "epoch": 1.806707712248866, + "grad_norm": 0.049367886036634445, + "learning_rate": 0.00012185066834691031, + "loss": 0.281, + "step": 22302 + }, + { + "epoch": 1.8067887232663642, + "grad_norm": 0.061743155121803284, + "learning_rate": 0.00012184616769431567, + "loss": 0.2474, + "step": 22303 + }, + { + "epoch": 1.8068697342838627, + "grad_norm": 0.05364508181810379, + "learning_rate": 0.00012184166704172107, + "loss": 0.2641, + "step": 22304 + }, + { + "epoch": 1.8069507453013611, + "grad_norm": 0.055584825575351715, + "learning_rate": 0.00012183716638912643, + "loss": 0.2548, + "step": 22305 + }, + { + "epoch": 1.8070317563188594, + "grad_norm": 0.0605003759264946, + "learning_rate": 0.0001218326657365318, + "loss": 0.2925, + "step": 22306 + }, + { + "epoch": 1.8071127673363576, + "grad_norm": 0.05850888788700104, + "learning_rate": 0.00012182816508393717, + "loss": 0.2722, + "step": 22307 + }, + { + "epoch": 1.807193778353856, + "grad_norm": 0.05131843313574791, + "learning_rate": 0.00012182366443134255, + "loss": 0.3039, + "step": 22308 + }, + { + "epoch": 1.8072747893713546, + "grad_norm": 0.06271613389253616, + "learning_rate": 0.00012181916377874791, + "loss": 0.2712, + "step": 22309 + }, + { + "epoch": 1.8073558003888528, + "grad_norm": 0.055650744587183, + "learning_rate": 0.00012181466312615331, + "loss": 0.259, + "step": 22310 + }, + { + "epoch": 1.8074368114063513, + "grad_norm": 0.053837329149246216, + "learning_rate": 0.00012181016247355867, + "loss": 0.3005, + "step": 22311 + }, + { + "epoch": 1.8075178224238497, + "grad_norm": 0.05122172087430954, + "learning_rate": 
0.00012180566182096404, + "loss": 0.2534, + "step": 22312 + }, + { + "epoch": 1.807598833441348, + "grad_norm": 0.05293326824903488, + "learning_rate": 0.00012180116116836942, + "loss": 0.3092, + "step": 22313 + }, + { + "epoch": 1.8076798444588464, + "grad_norm": 0.05141449719667435, + "learning_rate": 0.00012179666051577479, + "loss": 0.2599, + "step": 22314 + }, + { + "epoch": 1.807760855476345, + "grad_norm": 0.04616117104887962, + "learning_rate": 0.00012179215986318018, + "loss": 0.2346, + "step": 22315 + }, + { + "epoch": 1.8078418664938432, + "grad_norm": 0.060469768941402435, + "learning_rate": 0.00012178765921058555, + "loss": 0.2593, + "step": 22316 + }, + { + "epoch": 1.8079228775113414, + "grad_norm": 0.05977886915206909, + "learning_rate": 0.00012178315855799091, + "loss": 0.22, + "step": 22317 + }, + { + "epoch": 1.80800388852884, + "grad_norm": 0.05252216011285782, + "learning_rate": 0.00012177865790539628, + "loss": 0.2765, + "step": 22318 + }, + { + "epoch": 1.8080848995463383, + "grad_norm": 0.05202179774641991, + "learning_rate": 0.00012177415725280166, + "loss": 0.2723, + "step": 22319 + }, + { + "epoch": 1.8081659105638366, + "grad_norm": 0.0671328529715538, + "learning_rate": 0.00012176965660020703, + "loss": 0.2777, + "step": 22320 + }, + { + "epoch": 1.808246921581335, + "grad_norm": 0.06827513873577118, + "learning_rate": 0.00012176515594761242, + "loss": 0.2782, + "step": 22321 + }, + { + "epoch": 1.8083279325988335, + "grad_norm": 0.06724611669778824, + "learning_rate": 0.00012176065529501779, + "loss": 0.2963, + "step": 22322 + }, + { + "epoch": 1.8084089436163318, + "grad_norm": 0.05954742431640625, + "learning_rate": 0.00012175615464242315, + "loss": 0.2595, + "step": 22323 + }, + { + "epoch": 1.8084899546338302, + "grad_norm": 0.055643219500780106, + "learning_rate": 0.00012175165398982853, + "loss": 0.2942, + "step": 22324 + }, + { + "epoch": 1.8085709656513287, + "grad_norm": 0.0529947504401207, + "learning_rate": 0.0001217471533372339, + "loss": 0.2943, + "step": 22325 + }, + { + "epoch": 1.808651976668827, + "grad_norm": 0.0744687095284462, + "learning_rate": 0.00012174265268463927, + "loss": 0.2802, + "step": 22326 + }, + { + "epoch": 1.8087329876863252, + "grad_norm": 0.05510347709059715, + "learning_rate": 0.00012173815203204466, + "loss": 0.2314, + "step": 22327 + }, + { + "epoch": 1.8088139987038239, + "grad_norm": 0.05206920579075813, + "learning_rate": 0.00012173365137945003, + "loss": 0.2583, + "step": 22328 + }, + { + "epoch": 1.8088950097213221, + "grad_norm": 0.0690377876162529, + "learning_rate": 0.00012172915072685541, + "loss": 0.323, + "step": 22329 + }, + { + "epoch": 1.8089760207388204, + "grad_norm": 0.057178206741809845, + "learning_rate": 0.00012172465007426077, + "loss": 0.271, + "step": 22330 + }, + { + "epoch": 1.8090570317563188, + "grad_norm": 0.057551007717847824, + "learning_rate": 0.00012172014942166614, + "loss": 0.2791, + "step": 22331 + }, + { + "epoch": 1.8091380427738173, + "grad_norm": 0.07293134182691574, + "learning_rate": 0.00012171564876907151, + "loss": 0.3088, + "step": 22332 + }, + { + "epoch": 1.8092190537913155, + "grad_norm": 0.06492959707975388, + "learning_rate": 0.0001217111481164769, + "loss": 0.2929, + "step": 22333 + }, + { + "epoch": 1.809300064808814, + "grad_norm": 0.048726316541433334, + "learning_rate": 0.00012170664746388228, + "loss": 0.2456, + "step": 22334 + }, + { + "epoch": 1.8093810758263125, + "grad_norm": 0.04845815151929855, + "learning_rate": 0.00012170214681128765, + "loss": 0.2668, + 
"step": 22335 + }, + { + "epoch": 1.8094620868438107, + "grad_norm": 0.059659332036972046, + "learning_rate": 0.00012169764615869301, + "loss": 0.3305, + "step": 22336 + }, + { + "epoch": 1.8095430978613092, + "grad_norm": 0.04906664416193962, + "learning_rate": 0.00012169314550609838, + "loss": 0.2406, + "step": 22337 + }, + { + "epoch": 1.8096241088788076, + "grad_norm": 0.06083128973841667, + "learning_rate": 0.00012168864485350377, + "loss": 0.2755, + "step": 22338 + }, + { + "epoch": 1.809705119896306, + "grad_norm": 0.05594184622168541, + "learning_rate": 0.00012168414420090914, + "loss": 0.2585, + "step": 22339 + }, + { + "epoch": 1.8097861309138041, + "grad_norm": 0.059506163001060486, + "learning_rate": 0.00012167964354831452, + "loss": 0.2971, + "step": 22340 + }, + { + "epoch": 1.8098671419313026, + "grad_norm": 0.05431336164474487, + "learning_rate": 0.00012167514289571989, + "loss": 0.2766, + "step": 22341 + }, + { + "epoch": 1.809948152948801, + "grad_norm": 0.05741867795586586, + "learning_rate": 0.00012167064224312525, + "loss": 0.2573, + "step": 22342 + }, + { + "epoch": 1.8100291639662993, + "grad_norm": 0.05569988489151001, + "learning_rate": 0.00012166614159053062, + "loss": 0.2713, + "step": 22343 + }, + { + "epoch": 1.8101101749837978, + "grad_norm": 0.0600249357521534, + "learning_rate": 0.00012166164093793601, + "loss": 0.261, + "step": 22344 + }, + { + "epoch": 1.8101911860012962, + "grad_norm": 0.04880553111433983, + "learning_rate": 0.00012165714028534139, + "loss": 0.2457, + "step": 22345 + }, + { + "epoch": 1.8102721970187945, + "grad_norm": 0.05315748229622841, + "learning_rate": 0.00012165263963274676, + "loss": 0.2952, + "step": 22346 + }, + { + "epoch": 1.810353208036293, + "grad_norm": 0.04898060858249664, + "learning_rate": 0.00012164813898015213, + "loss": 0.2647, + "step": 22347 + }, + { + "epoch": 1.8104342190537914, + "grad_norm": 0.05683436244726181, + "learning_rate": 0.00012164363832755749, + "loss": 0.2885, + "step": 22348 + }, + { + "epoch": 1.8105152300712897, + "grad_norm": 0.05105239525437355, + "learning_rate": 0.00012163913767496287, + "loss": 0.26, + "step": 22349 + }, + { + "epoch": 1.810596241088788, + "grad_norm": 0.05811845883727074, + "learning_rate": 0.00012163463702236825, + "loss": 0.2592, + "step": 22350 + }, + { + "epoch": 1.8106772521062866, + "grad_norm": 0.06090196967124939, + "learning_rate": 0.00012163013636977363, + "loss": 0.2933, + "step": 22351 + }, + { + "epoch": 1.8107582631237849, + "grad_norm": 0.05475502833724022, + "learning_rate": 0.000121625635717179, + "loss": 0.2608, + "step": 22352 + }, + { + "epoch": 1.810839274141283, + "grad_norm": 0.050672296434640884, + "learning_rate": 0.00012162113506458437, + "loss": 0.2541, + "step": 22353 + }, + { + "epoch": 1.8109202851587816, + "grad_norm": 0.06462455540895462, + "learning_rate": 0.00012161663441198973, + "loss": 0.3082, + "step": 22354 + }, + { + "epoch": 1.81100129617628, + "grad_norm": 0.05794956535100937, + "learning_rate": 0.00012161213375939511, + "loss": 0.3007, + "step": 22355 + }, + { + "epoch": 1.8110823071937783, + "grad_norm": 0.06365778297185898, + "learning_rate": 0.0001216076331068005, + "loss": 0.2515, + "step": 22356 + }, + { + "epoch": 1.8111633182112767, + "grad_norm": 0.06921739131212234, + "learning_rate": 0.00012160313245420587, + "loss": 0.2999, + "step": 22357 + }, + { + "epoch": 1.8112443292287752, + "grad_norm": 0.05210886895656586, + "learning_rate": 0.00012159863180161124, + "loss": 0.2865, + "step": 22358 + }, + { + "epoch": 
1.8113253402462735, + "grad_norm": 0.053139787167310715, + "learning_rate": 0.00012159413114901662, + "loss": 0.2675, + "step": 22359 + }, + { + "epoch": 1.811406351263772, + "grad_norm": 0.06554105877876282, + "learning_rate": 0.00012158963049642198, + "loss": 0.3042, + "step": 22360 + }, + { + "epoch": 1.8114873622812704, + "grad_norm": 0.05704993009567261, + "learning_rate": 0.00012158512984382735, + "loss": 0.2726, + "step": 22361 + }, + { + "epoch": 1.8115683732987686, + "grad_norm": 0.06123867630958557, + "learning_rate": 0.00012158062919123274, + "loss": 0.2919, + "step": 22362 + }, + { + "epoch": 1.8116493843162669, + "grad_norm": 0.04844033345580101, + "learning_rate": 0.00012157612853863811, + "loss": 0.2447, + "step": 22363 + }, + { + "epoch": 1.8117303953337653, + "grad_norm": 0.0578579381108284, + "learning_rate": 0.00012157162788604348, + "loss": 0.3067, + "step": 22364 + }, + { + "epoch": 1.8118114063512638, + "grad_norm": 0.061319511383771896, + "learning_rate": 0.00012156712723344886, + "loss": 0.2953, + "step": 22365 + }, + { + "epoch": 1.811892417368762, + "grad_norm": 0.0670505166053772, + "learning_rate": 0.00012156262658085422, + "loss": 0.2841, + "step": 22366 + }, + { + "epoch": 1.8119734283862605, + "grad_norm": 0.05111420527100563, + "learning_rate": 0.00012155812592825962, + "loss": 0.2544, + "step": 22367 + }, + { + "epoch": 1.812054439403759, + "grad_norm": 0.05738413333892822, + "learning_rate": 0.00012155362527566498, + "loss": 0.261, + "step": 22368 + }, + { + "epoch": 1.8121354504212572, + "grad_norm": 0.06913864612579346, + "learning_rate": 0.00012154912462307035, + "loss": 0.319, + "step": 22369 + }, + { + "epoch": 1.8122164614387557, + "grad_norm": 0.06058168411254883, + "learning_rate": 0.00012154462397047573, + "loss": 0.2474, + "step": 22370 + }, + { + "epoch": 1.8122974724562542, + "grad_norm": 0.06141381338238716, + "learning_rate": 0.0001215401233178811, + "loss": 0.2828, + "step": 22371 + }, + { + "epoch": 1.8123784834737524, + "grad_norm": 0.05288195610046387, + "learning_rate": 0.00012153562266528646, + "loss": 0.2572, + "step": 22372 + }, + { + "epoch": 1.8124594944912507, + "grad_norm": 0.06417234987020493, + "learning_rate": 0.00012153112201269186, + "loss": 0.3048, + "step": 22373 + }, + { + "epoch": 1.8125405055087493, + "grad_norm": 0.06014291197061539, + "learning_rate": 0.00012152662136009722, + "loss": 0.2615, + "step": 22374 + }, + { + "epoch": 1.8126215165262476, + "grad_norm": 0.050298839807510376, + "learning_rate": 0.0001215221207075026, + "loss": 0.2565, + "step": 22375 + }, + { + "epoch": 1.8127025275437458, + "grad_norm": 0.057888805866241455, + "learning_rate": 0.00012151762005490797, + "loss": 0.2453, + "step": 22376 + }, + { + "epoch": 1.8127835385612443, + "grad_norm": 0.05726255103945732, + "learning_rate": 0.00012151311940231334, + "loss": 0.2976, + "step": 22377 + }, + { + "epoch": 1.8128645495787428, + "grad_norm": 0.04736103117465973, + "learning_rate": 0.0001215086187497187, + "loss": 0.2447, + "step": 22378 + }, + { + "epoch": 1.812945560596241, + "grad_norm": 0.05614374577999115, + "learning_rate": 0.0001215041180971241, + "loss": 0.3026, + "step": 22379 + }, + { + "epoch": 1.8130265716137395, + "grad_norm": 0.059921517968177795, + "learning_rate": 0.00012149961744452946, + "loss": 0.3009, + "step": 22380 + }, + { + "epoch": 1.813107582631238, + "grad_norm": 0.058286361396312714, + "learning_rate": 0.00012149511679193484, + "loss": 0.2828, + "step": 22381 + }, + { + "epoch": 1.8131885936487362, + "grad_norm": 
0.050801943987607956, + "learning_rate": 0.00012149061613934021, + "loss": 0.2702, + "step": 22382 + }, + { + "epoch": 1.8132696046662347, + "grad_norm": 0.05966091528534889, + "learning_rate": 0.00012148611548674558, + "loss": 0.2639, + "step": 22383 + }, + { + "epoch": 1.8133506156837331, + "grad_norm": 0.055925920605659485, + "learning_rate": 0.00012148161483415094, + "loss": 0.2673, + "step": 22384 + }, + { + "epoch": 1.8134316267012314, + "grad_norm": 0.056826021522283554, + "learning_rate": 0.00012147711418155634, + "loss": 0.3034, + "step": 22385 + }, + { + "epoch": 1.8135126377187296, + "grad_norm": 0.054868802428245544, + "learning_rate": 0.0001214726135289617, + "loss": 0.2957, + "step": 22386 + }, + { + "epoch": 1.813593648736228, + "grad_norm": 0.06180058419704437, + "learning_rate": 0.00012146811287636708, + "loss": 0.2785, + "step": 22387 + }, + { + "epoch": 1.8136746597537265, + "grad_norm": 0.04991845041513443, + "learning_rate": 0.00012146361222377245, + "loss": 0.2504, + "step": 22388 + }, + { + "epoch": 1.8137556707712248, + "grad_norm": 0.059130534529685974, + "learning_rate": 0.00012145911157117782, + "loss": 0.2916, + "step": 22389 + }, + { + "epoch": 1.8138366817887233, + "grad_norm": 0.06077086552977562, + "learning_rate": 0.00012145461091858318, + "loss": 0.2737, + "step": 22390 + }, + { + "epoch": 1.8139176928062217, + "grad_norm": 0.061018723994493484, + "learning_rate": 0.00012145011026598858, + "loss": 0.2709, + "step": 22391 + }, + { + "epoch": 1.81399870382372, + "grad_norm": 0.0630197674036026, + "learning_rate": 0.00012144560961339394, + "loss": 0.2471, + "step": 22392 + }, + { + "epoch": 1.8140797148412184, + "grad_norm": 0.05346192419528961, + "learning_rate": 0.00012144110896079932, + "loss": 0.2635, + "step": 22393 + }, + { + "epoch": 1.814160725858717, + "grad_norm": 0.05909271538257599, + "learning_rate": 0.00012143660830820469, + "loss": 0.2552, + "step": 22394 + }, + { + "epoch": 1.8142417368762151, + "grad_norm": 0.061077363789081573, + "learning_rate": 0.00012143210765561007, + "loss": 0.2628, + "step": 22395 + }, + { + "epoch": 1.8143227478937134, + "grad_norm": 0.0485931932926178, + "learning_rate": 0.00012142760700301545, + "loss": 0.2526, + "step": 22396 + }, + { + "epoch": 1.814403758911212, + "grad_norm": 0.07086597383022308, + "learning_rate": 0.00012142310635042083, + "loss": 0.3017, + "step": 22397 + }, + { + "epoch": 1.8144847699287103, + "grad_norm": 0.05354897305369377, + "learning_rate": 0.0001214186056978262, + "loss": 0.2934, + "step": 22398 + }, + { + "epoch": 1.8145657809462086, + "grad_norm": 0.060347460210323334, + "learning_rate": 0.00012141410504523156, + "loss": 0.2427, + "step": 22399 + }, + { + "epoch": 1.814646791963707, + "grad_norm": 0.07105302065610886, + "learning_rate": 0.00012140960439263693, + "loss": 0.3205, + "step": 22400 + }, + { + "epoch": 1.8147278029812055, + "grad_norm": 0.07145987451076508, + "learning_rate": 0.00012140510374004231, + "loss": 0.3012, + "step": 22401 + }, + { + "epoch": 1.8148088139987038, + "grad_norm": 0.05859648063778877, + "learning_rate": 0.0001214006030874477, + "loss": 0.2454, + "step": 22402 + }, + { + "epoch": 1.8148898250162022, + "grad_norm": 0.06137736886739731, + "learning_rate": 0.00012139610243485307, + "loss": 0.2506, + "step": 22403 + }, + { + "epoch": 1.8149708360337007, + "grad_norm": 0.0615663081407547, + "learning_rate": 0.00012139160178225844, + "loss": 0.2891, + "step": 22404 + }, + { + "epoch": 1.815051847051199, + "grad_norm": 0.07232252508401871, + "learning_rate": 
0.0001213871011296638, + "loss": 0.2762, + "step": 22405 + }, + { + "epoch": 1.8151328580686974, + "grad_norm": 0.04825594276189804, + "learning_rate": 0.00012138260047706918, + "loss": 0.2539, + "step": 22406 + }, + { + "epoch": 1.8152138690861959, + "grad_norm": 0.061036497354507446, + "learning_rate": 0.00012137809982447455, + "loss": 0.2656, + "step": 22407 + }, + { + "epoch": 1.815294880103694, + "grad_norm": 0.04653603583574295, + "learning_rate": 0.00012137359917187994, + "loss": 0.2401, + "step": 22408 + }, + { + "epoch": 1.8153758911211924, + "grad_norm": 0.05841079726815224, + "learning_rate": 0.00012136909851928531, + "loss": 0.2981, + "step": 22409 + }, + { + "epoch": 1.8154569021386908, + "grad_norm": 0.05416569858789444, + "learning_rate": 0.00012136459786669068, + "loss": 0.2842, + "step": 22410 + }, + { + "epoch": 1.8155379131561893, + "grad_norm": 0.05413185432553291, + "learning_rate": 0.00012136009721409604, + "loss": 0.2934, + "step": 22411 + }, + { + "epoch": 1.8156189241736875, + "grad_norm": 0.05708204209804535, + "learning_rate": 0.00012135559656150142, + "loss": 0.2823, + "step": 22412 + }, + { + "epoch": 1.815699935191186, + "grad_norm": 0.05461487919092178, + "learning_rate": 0.00012135109590890679, + "loss": 0.2544, + "step": 22413 + }, + { + "epoch": 1.8157809462086845, + "grad_norm": 0.05890343710780144, + "learning_rate": 0.00012134659525631218, + "loss": 0.2831, + "step": 22414 + }, + { + "epoch": 1.8158619572261827, + "grad_norm": 0.04829421639442444, + "learning_rate": 0.00012134209460371755, + "loss": 0.2755, + "step": 22415 + }, + { + "epoch": 1.8159429682436812, + "grad_norm": 0.05846787244081497, + "learning_rate": 0.00012133759395112292, + "loss": 0.2883, + "step": 22416 + }, + { + "epoch": 1.8160239792611796, + "grad_norm": 0.05573653057217598, + "learning_rate": 0.00012133309329852828, + "loss": 0.2377, + "step": 22417 + }, + { + "epoch": 1.8161049902786779, + "grad_norm": 0.05826255679130554, + "learning_rate": 0.00012132859264593366, + "loss": 0.2521, + "step": 22418 + }, + { + "epoch": 1.8161860012961761, + "grad_norm": 0.07741843163967133, + "learning_rate": 0.00012132409199333905, + "loss": 0.3201, + "step": 22419 + }, + { + "epoch": 1.8162670123136748, + "grad_norm": 0.058812353760004044, + "learning_rate": 0.00012131959134074442, + "loss": 0.2444, + "step": 22420 + }, + { + "epoch": 1.816348023331173, + "grad_norm": 0.05757759511470795, + "learning_rate": 0.00012131509068814979, + "loss": 0.2913, + "step": 22421 + }, + { + "epoch": 1.8164290343486713, + "grad_norm": 0.06558262556791306, + "learning_rate": 0.00012131059003555517, + "loss": 0.2681, + "step": 22422 + }, + { + "epoch": 1.8165100453661698, + "grad_norm": 0.058929912745952606, + "learning_rate": 0.00012130608938296053, + "loss": 0.3001, + "step": 22423 + }, + { + "epoch": 1.8165910563836682, + "grad_norm": 0.055346451699733734, + "learning_rate": 0.0001213015887303659, + "loss": 0.2694, + "step": 22424 + }, + { + "epoch": 1.8166720674011665, + "grad_norm": 0.05668604001402855, + "learning_rate": 0.00012129708807777129, + "loss": 0.3082, + "step": 22425 + }, + { + "epoch": 1.816753078418665, + "grad_norm": 0.06419747322797775, + "learning_rate": 0.00012129258742517666, + "loss": 0.3171, + "step": 22426 + }, + { + "epoch": 1.8168340894361634, + "grad_norm": 0.05885817110538483, + "learning_rate": 0.00012128808677258203, + "loss": 0.3019, + "step": 22427 + }, + { + "epoch": 1.8169151004536617, + "grad_norm": 0.05111012980341911, + "learning_rate": 0.00012128358611998741, + "loss": 
0.2704, + "step": 22428 + }, + { + "epoch": 1.81699611147116, + "grad_norm": 0.07075147330760956, + "learning_rate": 0.00012127908546739277, + "loss": 0.307, + "step": 22429 + }, + { + "epoch": 1.8170771224886586, + "grad_norm": 0.058915186673402786, + "learning_rate": 0.00012127458481479814, + "loss": 0.2834, + "step": 22430 + }, + { + "epoch": 1.8171581335061568, + "grad_norm": 0.051177240908145905, + "learning_rate": 0.00012127008416220353, + "loss": 0.2605, + "step": 22431 + }, + { + "epoch": 1.817239144523655, + "grad_norm": 0.06430277973413467, + "learning_rate": 0.0001212655835096089, + "loss": 0.2775, + "step": 22432 + }, + { + "epoch": 1.8173201555411536, + "grad_norm": 0.05540946498513222, + "learning_rate": 0.00012126108285701428, + "loss": 0.286, + "step": 22433 + }, + { + "epoch": 1.817401166558652, + "grad_norm": 0.05893925949931145, + "learning_rate": 0.00012125658220441965, + "loss": 0.311, + "step": 22434 + }, + { + "epoch": 1.8174821775761503, + "grad_norm": 0.06361778825521469, + "learning_rate": 0.00012125208155182501, + "loss": 0.2978, + "step": 22435 + }, + { + "epoch": 1.8175631885936487, + "grad_norm": 0.05619451403617859, + "learning_rate": 0.00012124758089923038, + "loss": 0.2462, + "step": 22436 + }, + { + "epoch": 1.8176441996111472, + "grad_norm": 0.06951126456260681, + "learning_rate": 0.00012124308024663577, + "loss": 0.309, + "step": 22437 + }, + { + "epoch": 1.8177252106286454, + "grad_norm": 0.06178022921085358, + "learning_rate": 0.00012123857959404114, + "loss": 0.2908, + "step": 22438 + }, + { + "epoch": 1.817806221646144, + "grad_norm": 0.0636647567152977, + "learning_rate": 0.00012123407894144652, + "loss": 0.2445, + "step": 22439 + }, + { + "epoch": 1.8178872326636424, + "grad_norm": 0.07269550859928131, + "learning_rate": 0.00012122957828885189, + "loss": 0.3106, + "step": 22440 + }, + { + "epoch": 1.8179682436811406, + "grad_norm": 0.05728549882769585, + "learning_rate": 0.00012122507763625725, + "loss": 0.2696, + "step": 22441 + }, + { + "epoch": 1.8180492546986389, + "grad_norm": 0.05145451799035072, + "learning_rate": 0.00012122057698366262, + "loss": 0.274, + "step": 22442 + }, + { + "epoch": 1.8181302657161373, + "grad_norm": 0.04999382793903351, + "learning_rate": 0.00012121607633106801, + "loss": 0.2461, + "step": 22443 + }, + { + "epoch": 1.8182112767336358, + "grad_norm": 0.04859451577067375, + "learning_rate": 0.00012121157567847339, + "loss": 0.2628, + "step": 22444 + }, + { + "epoch": 1.818292287751134, + "grad_norm": 0.049572959542274475, + "learning_rate": 0.00012120707502587876, + "loss": 0.2608, + "step": 22445 + }, + { + "epoch": 1.8183732987686325, + "grad_norm": 0.05134311690926552, + "learning_rate": 0.00012120257437328413, + "loss": 0.2482, + "step": 22446 + }, + { + "epoch": 1.818454309786131, + "grad_norm": 0.0611451230943203, + "learning_rate": 0.00012119807372068949, + "loss": 0.2283, + "step": 22447 + }, + { + "epoch": 1.8185353208036292, + "grad_norm": 0.052968040108680725, + "learning_rate": 0.0001211935730680949, + "loss": 0.2263, + "step": 22448 + }, + { + "epoch": 1.8186163318211277, + "grad_norm": 0.05763326212763786, + "learning_rate": 0.00012118907241550025, + "loss": 0.2637, + "step": 22449 + }, + { + "epoch": 1.8186973428386262, + "grad_norm": 0.05025547742843628, + "learning_rate": 0.00012118457176290563, + "loss": 0.3065, + "step": 22450 + }, + { + "epoch": 1.8187783538561244, + "grad_norm": 0.07037966698408127, + "learning_rate": 0.000121180071110311, + "loss": 0.2658, + "step": 22451 + }, + { + "epoch": 
1.8188593648736227, + "grad_norm": 0.07079332321882248, + "learning_rate": 0.00012117557045771637, + "loss": 0.3103, + "step": 22452 + }, + { + "epoch": 1.8189403758911213, + "grad_norm": 0.05972728133201599, + "learning_rate": 0.00012117106980512173, + "loss": 0.3073, + "step": 22453 + }, + { + "epoch": 1.8190213869086196, + "grad_norm": 0.05895204469561577, + "learning_rate": 0.00012116656915252714, + "loss": 0.2826, + "step": 22454 + }, + { + "epoch": 1.8191023979261178, + "grad_norm": 0.0534057579934597, + "learning_rate": 0.0001211620684999325, + "loss": 0.2624, + "step": 22455 + }, + { + "epoch": 1.8191834089436163, + "grad_norm": 0.05191171541810036, + "learning_rate": 0.00012115756784733787, + "loss": 0.273, + "step": 22456 + }, + { + "epoch": 1.8192644199611148, + "grad_norm": 0.06158572435379028, + "learning_rate": 0.00012115306719474324, + "loss": 0.3172, + "step": 22457 + }, + { + "epoch": 1.819345430978613, + "grad_norm": 0.05240388587117195, + "learning_rate": 0.00012114856654214862, + "loss": 0.2498, + "step": 22458 + }, + { + "epoch": 1.8194264419961115, + "grad_norm": 0.06356768310070038, + "learning_rate": 0.00012114406588955398, + "loss": 0.3298, + "step": 22459 + }, + { + "epoch": 1.81950745301361, + "grad_norm": 0.06706894189119339, + "learning_rate": 0.00012113956523695938, + "loss": 0.3273, + "step": 22460 + }, + { + "epoch": 1.8195884640311082, + "grad_norm": 0.05673682317137718, + "learning_rate": 0.00012113506458436474, + "loss": 0.2624, + "step": 22461 + }, + { + "epoch": 1.8196694750486067, + "grad_norm": 0.05635488033294678, + "learning_rate": 0.00012113056393177011, + "loss": 0.2737, + "step": 22462 + }, + { + "epoch": 1.8197504860661051, + "grad_norm": 0.060851920396089554, + "learning_rate": 0.00012112606327917548, + "loss": 0.2629, + "step": 22463 + }, + { + "epoch": 1.8198314970836034, + "grad_norm": 0.06031657010316849, + "learning_rate": 0.00012112156262658086, + "loss": 0.2957, + "step": 22464 + }, + { + "epoch": 1.8199125081011016, + "grad_norm": 0.06104261055588722, + "learning_rate": 0.00012111706197398622, + "loss": 0.308, + "step": 22465 + }, + { + "epoch": 1.8199935191186, + "grad_norm": 0.05476023256778717, + "learning_rate": 0.00012111256132139162, + "loss": 0.2818, + "step": 22466 + }, + { + "epoch": 1.8200745301360985, + "grad_norm": 0.046863965690135956, + "learning_rate": 0.00012110806066879699, + "loss": 0.2285, + "step": 22467 + }, + { + "epoch": 1.8201555411535968, + "grad_norm": 0.06625159829854965, + "learning_rate": 0.00012110356001620235, + "loss": 0.2851, + "step": 22468 + }, + { + "epoch": 1.8202365521710953, + "grad_norm": 0.05833403021097183, + "learning_rate": 0.00012109905936360773, + "loss": 0.2991, + "step": 22469 + }, + { + "epoch": 1.8203175631885937, + "grad_norm": 0.06144651770591736, + "learning_rate": 0.0001210945587110131, + "loss": 0.2788, + "step": 22470 + }, + { + "epoch": 1.820398574206092, + "grad_norm": 0.06062743067741394, + "learning_rate": 0.00012109005805841849, + "loss": 0.2588, + "step": 22471 + }, + { + "epoch": 1.8204795852235904, + "grad_norm": 0.07043924182653427, + "learning_rate": 0.00012108555740582386, + "loss": 0.2513, + "step": 22472 + }, + { + "epoch": 1.820560596241089, + "grad_norm": 0.06090352311730385, + "learning_rate": 0.00012108105675322923, + "loss": 0.2469, + "step": 22473 + }, + { + "epoch": 1.8206416072585871, + "grad_norm": 0.0642428919672966, + "learning_rate": 0.0001210765561006346, + "loss": 0.2853, + "step": 22474 + }, + { + "epoch": 1.8207226182760854, + "grad_norm": 
0.07086139917373657, + "learning_rate": 0.00012107205544803997, + "loss": 0.2791, + "step": 22475 + }, + { + "epoch": 1.820803629293584, + "grad_norm": 0.05878929793834686, + "learning_rate": 0.00012106755479544534, + "loss": 0.2854, + "step": 22476 + }, + { + "epoch": 1.8208846403110823, + "grad_norm": 0.07627884298563004, + "learning_rate": 0.00012106305414285073, + "loss": 0.287, + "step": 22477 + }, + { + "epoch": 1.8209656513285806, + "grad_norm": 0.06491941958665848, + "learning_rate": 0.0001210585534902561, + "loss": 0.2926, + "step": 22478 + }, + { + "epoch": 1.821046662346079, + "grad_norm": 0.06161245331168175, + "learning_rate": 0.00012105405283766148, + "loss": 0.2606, + "step": 22479 + }, + { + "epoch": 1.8211276733635775, + "grad_norm": 0.060672350227832794, + "learning_rate": 0.00012104955218506684, + "loss": 0.3011, + "step": 22480 + }, + { + "epoch": 1.8212086843810757, + "grad_norm": 0.054367486387491226, + "learning_rate": 0.00012104505153247221, + "loss": 0.2692, + "step": 22481 + }, + { + "epoch": 1.8212896953985742, + "grad_norm": 0.05251113697886467, + "learning_rate": 0.00012104055087987758, + "loss": 0.2704, + "step": 22482 + }, + { + "epoch": 1.8213707064160727, + "grad_norm": 0.058134134858846664, + "learning_rate": 0.00012103605022728297, + "loss": 0.2472, + "step": 22483 + }, + { + "epoch": 1.821451717433571, + "grad_norm": 0.05237071216106415, + "learning_rate": 0.00012103154957468834, + "loss": 0.2662, + "step": 22484 + }, + { + "epoch": 1.8215327284510694, + "grad_norm": 0.05184619873762131, + "learning_rate": 0.00012102704892209372, + "loss": 0.2549, + "step": 22485 + }, + { + "epoch": 1.8216137394685679, + "grad_norm": 0.060612794011831284, + "learning_rate": 0.00012102254826949908, + "loss": 0.2947, + "step": 22486 + }, + { + "epoch": 1.821694750486066, + "grad_norm": 0.057740937918424606, + "learning_rate": 0.00012101804761690445, + "loss": 0.2396, + "step": 22487 + }, + { + "epoch": 1.8217757615035644, + "grad_norm": 0.0475936233997345, + "learning_rate": 0.00012101354696430982, + "loss": 0.3043, + "step": 22488 + }, + { + "epoch": 1.8218567725210628, + "grad_norm": 0.052612967789173126, + "learning_rate": 0.00012100904631171521, + "loss": 0.2571, + "step": 22489 + }, + { + "epoch": 1.8219377835385613, + "grad_norm": 0.04497424140572548, + "learning_rate": 0.00012100454565912058, + "loss": 0.2296, + "step": 22490 + }, + { + "epoch": 1.8220187945560595, + "grad_norm": 0.054526977241039276, + "learning_rate": 0.00012100004500652596, + "loss": 0.2669, + "step": 22491 + }, + { + "epoch": 1.822099805573558, + "grad_norm": 0.06448981910943985, + "learning_rate": 0.00012099554435393132, + "loss": 0.2401, + "step": 22492 + }, + { + "epoch": 1.8221808165910565, + "grad_norm": 0.05319571495056152, + "learning_rate": 0.00012099104370133669, + "loss": 0.2617, + "step": 22493 + }, + { + "epoch": 1.8222618276085547, + "grad_norm": 0.056535281240940094, + "learning_rate": 0.00012098654304874207, + "loss": 0.2707, + "step": 22494 + }, + { + "epoch": 1.8223428386260532, + "grad_norm": 0.05342821404337883, + "learning_rate": 0.00012098204239614745, + "loss": 0.2579, + "step": 22495 + }, + { + "epoch": 1.8224238496435516, + "grad_norm": 0.041714731603860855, + "learning_rate": 0.00012097754174355283, + "loss": 0.2334, + "step": 22496 + }, + { + "epoch": 1.8225048606610499, + "grad_norm": 0.06172311678528786, + "learning_rate": 0.0001209730410909582, + "loss": 0.2553, + "step": 22497 + }, + { + "epoch": 1.8225858716785481, + "grad_norm": 0.06267467141151428, + 
"learning_rate": 0.00012096854043836356, + "loss": 0.2719, + "step": 22498 + }, + { + "epoch": 1.8226668826960468, + "grad_norm": 0.0709090530872345, + "learning_rate": 0.00012096403978576893, + "loss": 0.2472, + "step": 22499 + }, + { + "epoch": 1.822747893713545, + "grad_norm": 0.06478728353977203, + "learning_rate": 0.00012095953913317432, + "loss": 0.2908, + "step": 22500 + }, + { + "epoch": 1.8228289047310433, + "grad_norm": 0.053202226758003235, + "learning_rate": 0.0001209550384805797, + "loss": 0.2836, + "step": 22501 + }, + { + "epoch": 1.8229099157485418, + "grad_norm": 0.06402086466550827, + "learning_rate": 0.00012095053782798507, + "loss": 0.2871, + "step": 22502 + }, + { + "epoch": 1.8229909267660402, + "grad_norm": 0.04878729209303856, + "learning_rate": 0.00012094603717539044, + "loss": 0.2367, + "step": 22503 + }, + { + "epoch": 1.8230719377835385, + "grad_norm": 0.062460657209157944, + "learning_rate": 0.0001209415365227958, + "loss": 0.2806, + "step": 22504 + }, + { + "epoch": 1.823152948801037, + "grad_norm": 0.0667148008942604, + "learning_rate": 0.00012093703587020118, + "loss": 0.2773, + "step": 22505 + }, + { + "epoch": 1.8232339598185354, + "grad_norm": 0.0478152334690094, + "learning_rate": 0.00012093253521760656, + "loss": 0.2446, + "step": 22506 + }, + { + "epoch": 1.8233149708360337, + "grad_norm": 0.0614163763821125, + "learning_rate": 0.00012092803456501194, + "loss": 0.2799, + "step": 22507 + }, + { + "epoch": 1.8233959818535321, + "grad_norm": 0.05350629240274429, + "learning_rate": 0.00012092353391241731, + "loss": 0.2635, + "step": 22508 + }, + { + "epoch": 1.8234769928710306, + "grad_norm": 0.053439583629369736, + "learning_rate": 0.00012091903325982268, + "loss": 0.2455, + "step": 22509 + }, + { + "epoch": 1.8235580038885288, + "grad_norm": 0.059095874428749084, + "learning_rate": 0.00012091453260722804, + "loss": 0.3058, + "step": 22510 + }, + { + "epoch": 1.823639014906027, + "grad_norm": 0.06233348697423935, + "learning_rate": 0.00012091003195463342, + "loss": 0.3011, + "step": 22511 + }, + { + "epoch": 1.8237200259235256, + "grad_norm": 0.05984274297952652, + "learning_rate": 0.0001209055313020388, + "loss": 0.2479, + "step": 22512 + }, + { + "epoch": 1.823801036941024, + "grad_norm": 0.06020957976579666, + "learning_rate": 0.00012090103064944418, + "loss": 0.2889, + "step": 22513 + }, + { + "epoch": 1.8238820479585223, + "grad_norm": 0.06470814347267151, + "learning_rate": 0.00012089652999684955, + "loss": 0.3046, + "step": 22514 + }, + { + "epoch": 1.8239630589760207, + "grad_norm": 0.06276507675647736, + "learning_rate": 0.00012089202934425493, + "loss": 0.282, + "step": 22515 + }, + { + "epoch": 1.8240440699935192, + "grad_norm": 0.061302684247493744, + "learning_rate": 0.00012088752869166029, + "loss": 0.3058, + "step": 22516 + }, + { + "epoch": 1.8241250810110174, + "grad_norm": 0.07292281836271286, + "learning_rate": 0.00012088302803906566, + "loss": 0.2499, + "step": 22517 + }, + { + "epoch": 1.824206092028516, + "grad_norm": 0.05192513391375542, + "learning_rate": 0.00012087852738647105, + "loss": 0.2858, + "step": 22518 + }, + { + "epoch": 1.8242871030460144, + "grad_norm": 0.06113511696457863, + "learning_rate": 0.00012087402673387642, + "loss": 0.2708, + "step": 22519 + }, + { + "epoch": 1.8243681140635126, + "grad_norm": 0.06051720306277275, + "learning_rate": 0.00012086952608128179, + "loss": 0.2703, + "step": 22520 + }, + { + "epoch": 1.8244491250810109, + "grad_norm": 0.05868927389383316, + "learning_rate": 0.00012086502542868717, + 
"loss": 0.2879, + "step": 22521 + }, + { + "epoch": 1.8245301360985096, + "grad_norm": 0.05062629282474518, + "learning_rate": 0.00012086052477609253, + "loss": 0.2258, + "step": 22522 + }, + { + "epoch": 1.8246111471160078, + "grad_norm": 0.061690833419561386, + "learning_rate": 0.00012085602412349793, + "loss": 0.2863, + "step": 22523 + }, + { + "epoch": 1.824692158133506, + "grad_norm": 0.05033199116587639, + "learning_rate": 0.00012085152347090329, + "loss": 0.2558, + "step": 22524 + }, + { + "epoch": 1.8247731691510045, + "grad_norm": 0.06216473504900932, + "learning_rate": 0.00012084702281830866, + "loss": 0.3475, + "step": 22525 + }, + { + "epoch": 1.824854180168503, + "grad_norm": 0.05927857756614685, + "learning_rate": 0.00012084252216571403, + "loss": 0.2856, + "step": 22526 + }, + { + "epoch": 1.8249351911860012, + "grad_norm": 0.05006115138530731, + "learning_rate": 0.00012083802151311941, + "loss": 0.2688, + "step": 22527 + }, + { + "epoch": 1.8250162022034997, + "grad_norm": 0.05684928968548775, + "learning_rate": 0.00012083352086052477, + "loss": 0.2859, + "step": 22528 + }, + { + "epoch": 1.8250972132209982, + "grad_norm": 0.06044033542275429, + "learning_rate": 0.00012082902020793017, + "loss": 0.2736, + "step": 22529 + }, + { + "epoch": 1.8251782242384964, + "grad_norm": 0.0559692308306694, + "learning_rate": 0.00012082451955533553, + "loss": 0.229, + "step": 22530 + }, + { + "epoch": 1.8252592352559946, + "grad_norm": 0.05422244593501091, + "learning_rate": 0.0001208200189027409, + "loss": 0.3124, + "step": 22531 + }, + { + "epoch": 1.8253402462734933, + "grad_norm": 0.04698093235492706, + "learning_rate": 0.00012081551825014628, + "loss": 0.2666, + "step": 22532 + }, + { + "epoch": 1.8254212572909916, + "grad_norm": 0.048311080783605576, + "learning_rate": 0.00012081101759755165, + "loss": 0.236, + "step": 22533 + }, + { + "epoch": 1.8255022683084898, + "grad_norm": 0.05981531739234924, + "learning_rate": 0.00012080651694495701, + "loss": 0.2503, + "step": 22534 + }, + { + "epoch": 1.8255832793259883, + "grad_norm": 0.0633094534277916, + "learning_rate": 0.00012080201629236241, + "loss": 0.2585, + "step": 22535 + }, + { + "epoch": 1.8256642903434868, + "grad_norm": 0.05595608428120613, + "learning_rate": 0.00012079751563976778, + "loss": 0.2563, + "step": 22536 + }, + { + "epoch": 1.825745301360985, + "grad_norm": 0.055708158761262894, + "learning_rate": 0.00012079301498717314, + "loss": 0.2438, + "step": 22537 + }, + { + "epoch": 1.8258263123784835, + "grad_norm": 0.054020561277866364, + "learning_rate": 0.00012078851433457852, + "loss": 0.2861, + "step": 22538 + }, + { + "epoch": 1.825907323395982, + "grad_norm": 0.0549539290368557, + "learning_rate": 0.00012078401368198389, + "loss": 0.2523, + "step": 22539 + }, + { + "epoch": 1.8259883344134802, + "grad_norm": 0.04126188904047012, + "learning_rate": 0.00012077951302938925, + "loss": 0.2345, + "step": 22540 + }, + { + "epoch": 1.8260693454309787, + "grad_norm": 0.05515645444393158, + "learning_rate": 0.00012077501237679465, + "loss": 0.2437, + "step": 22541 + }, + { + "epoch": 1.8261503564484771, + "grad_norm": 0.05556858330965042, + "learning_rate": 0.00012077051172420003, + "loss": 0.2532, + "step": 22542 + }, + { + "epoch": 1.8262313674659754, + "grad_norm": 0.05830145254731178, + "learning_rate": 0.00012076601107160539, + "loss": 0.2999, + "step": 22543 + }, + { + "epoch": 1.8263123784834736, + "grad_norm": 0.05237169936299324, + "learning_rate": 0.00012076151041901076, + "loss": 0.2503, + "step": 22544 + }, + { 
+ "epoch": 1.8263933895009723, + "grad_norm": 0.056736260652542114, + "learning_rate": 0.00012075700976641613, + "loss": 0.254, + "step": 22545 + }, + { + "epoch": 1.8264744005184705, + "grad_norm": 0.055458515882492065, + "learning_rate": 0.00012075250911382149, + "loss": 0.2374, + "step": 22546 + }, + { + "epoch": 1.8265554115359688, + "grad_norm": 0.05906181037425995, + "learning_rate": 0.0001207480084612269, + "loss": 0.2214, + "step": 22547 + }, + { + "epoch": 1.8266364225534673, + "grad_norm": 0.05620548501610756, + "learning_rate": 0.00012074350780863227, + "loss": 0.3144, + "step": 22548 + }, + { + "epoch": 1.8267174335709657, + "grad_norm": 0.065769724547863, + "learning_rate": 0.00012073900715603763, + "loss": 0.2542, + "step": 22549 + }, + { + "epoch": 1.826798444588464, + "grad_norm": 0.06099202483892441, + "learning_rate": 0.000120734506503443, + "loss": 0.2195, + "step": 22550 + }, + { + "epoch": 1.8268794556059624, + "grad_norm": 0.053566206246614456, + "learning_rate": 0.00012073000585084837, + "loss": 0.2438, + "step": 22551 + }, + { + "epoch": 1.826960466623461, + "grad_norm": 0.06798861920833588, + "learning_rate": 0.00012072550519825376, + "loss": 0.2738, + "step": 22552 + }, + { + "epoch": 1.8270414776409591, + "grad_norm": 0.05625709891319275, + "learning_rate": 0.00012072100454565914, + "loss": 0.2551, + "step": 22553 + }, + { + "epoch": 1.8271224886584574, + "grad_norm": 0.06199916824698448, + "learning_rate": 0.00012071650389306451, + "loss": 0.2348, + "step": 22554 + }, + { + "epoch": 1.827203499675956, + "grad_norm": 0.05430760234594345, + "learning_rate": 0.00012071200324046987, + "loss": 0.2629, + "step": 22555 + }, + { + "epoch": 1.8272845106934543, + "grad_norm": 0.07473081350326538, + "learning_rate": 0.00012070750258787524, + "loss": 0.3294, + "step": 22556 + }, + { + "epoch": 1.8273655217109526, + "grad_norm": 0.06008787825703621, + "learning_rate": 0.00012070300193528062, + "loss": 0.2457, + "step": 22557 + }, + { + "epoch": 1.827446532728451, + "grad_norm": 0.06692630052566528, + "learning_rate": 0.000120698501282686, + "loss": 0.2728, + "step": 22558 + }, + { + "epoch": 1.8275275437459495, + "grad_norm": 0.06054568663239479, + "learning_rate": 0.00012069400063009138, + "loss": 0.2718, + "step": 22559 + }, + { + "epoch": 1.8276085547634477, + "grad_norm": 0.06602507084608078, + "learning_rate": 0.00012068949997749675, + "loss": 0.2654, + "step": 22560 + }, + { + "epoch": 1.8276895657809462, + "grad_norm": 0.04704922437667847, + "learning_rate": 0.00012068499932490211, + "loss": 0.24, + "step": 22561 + }, + { + "epoch": 1.8277705767984447, + "grad_norm": 0.06324727833271027, + "learning_rate": 0.00012068049867230748, + "loss": 0.2936, + "step": 22562 + }, + { + "epoch": 1.827851587815943, + "grad_norm": 0.060656383633613586, + "learning_rate": 0.00012067599801971286, + "loss": 0.293, + "step": 22563 + }, + { + "epoch": 1.8279325988334414, + "grad_norm": 0.05807242542505264, + "learning_rate": 0.00012067149736711825, + "loss": 0.2623, + "step": 22564 + }, + { + "epoch": 1.8280136098509399, + "grad_norm": 0.06374472379684448, + "learning_rate": 0.00012066699671452362, + "loss": 0.2685, + "step": 22565 + }, + { + "epoch": 1.828094620868438, + "grad_norm": 0.05179629474878311, + "learning_rate": 0.00012066249606192899, + "loss": 0.2176, + "step": 22566 + }, + { + "epoch": 1.8281756318859363, + "grad_norm": 0.05624118819832802, + "learning_rate": 0.00012065799540933435, + "loss": 0.2816, + "step": 22567 + }, + { + "epoch": 1.8282566429034348, + "grad_norm": 
0.06993228197097778, + "learning_rate": 0.00012065349475673973, + "loss": 0.2854, + "step": 22568 + }, + { + "epoch": 1.8283376539209333, + "grad_norm": 0.06136844679713249, + "learning_rate": 0.0001206489941041451, + "loss": 0.2589, + "step": 22569 + }, + { + "epoch": 1.8284186649384315, + "grad_norm": 0.05738857388496399, + "learning_rate": 0.00012064449345155049, + "loss": 0.2419, + "step": 22570 + }, + { + "epoch": 1.82849967595593, + "grad_norm": 0.05129186809062958, + "learning_rate": 0.00012063999279895586, + "loss": 0.2566, + "step": 22571 + }, + { + "epoch": 1.8285806869734285, + "grad_norm": 0.05626427382230759, + "learning_rate": 0.00012063549214636123, + "loss": 0.2962, + "step": 22572 + }, + { + "epoch": 1.8286616979909267, + "grad_norm": 0.06274127215147018, + "learning_rate": 0.0001206309914937666, + "loss": 0.304, + "step": 22573 + }, + { + "epoch": 1.8287427090084252, + "grad_norm": 0.06296470761299133, + "learning_rate": 0.00012062649084117197, + "loss": 0.2672, + "step": 22574 + }, + { + "epoch": 1.8288237200259236, + "grad_norm": 0.04759762063622475, + "learning_rate": 0.00012062199018857734, + "loss": 0.2423, + "step": 22575 + }, + { + "epoch": 1.8289047310434219, + "grad_norm": 0.04579182341694832, + "learning_rate": 0.00012061748953598273, + "loss": 0.2312, + "step": 22576 + }, + { + "epoch": 1.8289857420609201, + "grad_norm": 0.05473625659942627, + "learning_rate": 0.0001206129888833881, + "loss": 0.3182, + "step": 22577 + }, + { + "epoch": 1.8290667530784188, + "grad_norm": 0.06306344270706177, + "learning_rate": 0.00012060848823079348, + "loss": 0.2836, + "step": 22578 + }, + { + "epoch": 1.829147764095917, + "grad_norm": 0.059901390224695206, + "learning_rate": 0.00012060398757819884, + "loss": 0.276, + "step": 22579 + }, + { + "epoch": 1.8292287751134153, + "grad_norm": 0.06785433739423752, + "learning_rate": 0.00012059948692560421, + "loss": 0.2666, + "step": 22580 + }, + { + "epoch": 1.8293097861309138, + "grad_norm": 0.059782687574625015, + "learning_rate": 0.0001205949862730096, + "loss": 0.2356, + "step": 22581 + }, + { + "epoch": 1.8293907971484122, + "grad_norm": 0.05681954324245453, + "learning_rate": 0.00012059048562041497, + "loss": 0.2974, + "step": 22582 + }, + { + "epoch": 1.8294718081659105, + "grad_norm": 0.05416959896683693, + "learning_rate": 0.00012058598496782034, + "loss": 0.2732, + "step": 22583 + }, + { + "epoch": 1.829552819183409, + "grad_norm": 0.05503580719232559, + "learning_rate": 0.00012058148431522572, + "loss": 0.267, + "step": 22584 + }, + { + "epoch": 1.8296338302009074, + "grad_norm": 0.04536353796720505, + "learning_rate": 0.00012057698366263108, + "loss": 0.2597, + "step": 22585 + }, + { + "epoch": 1.8297148412184057, + "grad_norm": 0.05878216773271561, + "learning_rate": 0.00012057248301003645, + "loss": 0.2619, + "step": 22586 + }, + { + "epoch": 1.8297958522359041, + "grad_norm": 0.06327325850725174, + "learning_rate": 0.00012056798235744184, + "loss": 0.2762, + "step": 22587 + }, + { + "epoch": 1.8298768632534026, + "grad_norm": 0.04429469630122185, + "learning_rate": 0.00012056348170484721, + "loss": 0.2335, + "step": 22588 + }, + { + "epoch": 1.8299578742709008, + "grad_norm": 0.058379221707582474, + "learning_rate": 0.00012055898105225259, + "loss": 0.2927, + "step": 22589 + }, + { + "epoch": 1.830038885288399, + "grad_norm": 0.05577626824378967, + "learning_rate": 0.00012055448039965796, + "loss": 0.2665, + "step": 22590 + }, + { + "epoch": 1.8301198963058976, + "grad_norm": 0.06345677375793457, + "learning_rate": 
0.00012054997974706332, + "loss": 0.2933, + "step": 22591 + }, + { + "epoch": 1.830200907323396, + "grad_norm": 0.0531621053814888, + "learning_rate": 0.00012054547909446869, + "loss": 0.27, + "step": 22592 + }, + { + "epoch": 1.8302819183408943, + "grad_norm": 0.06574061512947083, + "learning_rate": 0.00012054097844187408, + "loss": 0.2579, + "step": 22593 + }, + { + "epoch": 1.8303629293583927, + "grad_norm": 0.05340947210788727, + "learning_rate": 0.00012053647778927945, + "loss": 0.2503, + "step": 22594 + }, + { + "epoch": 1.8304439403758912, + "grad_norm": 0.06001076102256775, + "learning_rate": 0.00012053197713668483, + "loss": 0.2756, + "step": 22595 + }, + { + "epoch": 1.8305249513933894, + "grad_norm": 0.04578939825296402, + "learning_rate": 0.0001205274764840902, + "loss": 0.2726, + "step": 22596 + }, + { + "epoch": 1.830605962410888, + "grad_norm": 0.056231167167425156, + "learning_rate": 0.00012052297583149556, + "loss": 0.2572, + "step": 22597 + }, + { + "epoch": 1.8306869734283864, + "grad_norm": 0.050092145800590515, + "learning_rate": 0.00012051847517890093, + "loss": 0.2801, + "step": 22598 + }, + { + "epoch": 1.8307679844458846, + "grad_norm": 0.046886369585990906, + "learning_rate": 0.00012051397452630632, + "loss": 0.2583, + "step": 22599 + }, + { + "epoch": 1.8308489954633829, + "grad_norm": 0.06768523156642914, + "learning_rate": 0.0001205094738737117, + "loss": 0.3177, + "step": 22600 + }, + { + "epoch": 1.8309300064808816, + "grad_norm": 0.06049235910177231, + "learning_rate": 0.00012050497322111707, + "loss": 0.2889, + "step": 22601 + }, + { + "epoch": 1.8310110174983798, + "grad_norm": 0.04988478869199753, + "learning_rate": 0.00012050047256852244, + "loss": 0.3033, + "step": 22602 + }, + { + "epoch": 1.831092028515878, + "grad_norm": 0.053116343915462494, + "learning_rate": 0.0001204959719159278, + "loss": 0.2863, + "step": 22603 + }, + { + "epoch": 1.8311730395333765, + "grad_norm": 0.05728102847933769, + "learning_rate": 0.0001204914712633332, + "loss": 0.2863, + "step": 22604 + }, + { + "epoch": 1.831254050550875, + "grad_norm": 0.05136289820075035, + "learning_rate": 0.00012048697061073858, + "loss": 0.2814, + "step": 22605 + }, + { + "epoch": 1.8313350615683732, + "grad_norm": 0.05675578489899635, + "learning_rate": 0.00012048246995814394, + "loss": 0.2291, + "step": 22606 + }, + { + "epoch": 1.8314160725858717, + "grad_norm": 0.05223593860864639, + "learning_rate": 0.00012047796930554931, + "loss": 0.2373, + "step": 22607 + }, + { + "epoch": 1.8314970836033702, + "grad_norm": 0.05824046954512596, + "learning_rate": 0.00012047346865295468, + "loss": 0.264, + "step": 22608 + }, + { + "epoch": 1.8315780946208684, + "grad_norm": 0.059614140540361404, + "learning_rate": 0.00012046896800036004, + "loss": 0.2839, + "step": 22609 + }, + { + "epoch": 1.8316591056383669, + "grad_norm": 0.06695158779621124, + "learning_rate": 0.00012046446734776544, + "loss": 0.2891, + "step": 22610 + }, + { + "epoch": 1.8317401166558653, + "grad_norm": 0.061037980020046234, + "learning_rate": 0.00012045996669517082, + "loss": 0.3029, + "step": 22611 + }, + { + "epoch": 1.8318211276733636, + "grad_norm": 0.06011229753494263, + "learning_rate": 0.00012045546604257618, + "loss": 0.2629, + "step": 22612 + }, + { + "epoch": 1.8319021386908618, + "grad_norm": 0.04871377348899841, + "learning_rate": 0.00012045096538998155, + "loss": 0.2463, + "step": 22613 + }, + { + "epoch": 1.8319831497083603, + "grad_norm": 0.06099603325128555, + "learning_rate": 0.00012044646473738693, + "loss": 0.2795, 
+ "step": 22614 + }, + { + "epoch": 1.8320641607258588, + "grad_norm": 0.05151505768299103, + "learning_rate": 0.00012044196408479229, + "loss": 0.2828, + "step": 22615 + }, + { + "epoch": 1.832145171743357, + "grad_norm": 0.06807029992341995, + "learning_rate": 0.00012043746343219769, + "loss": 0.2978, + "step": 22616 + }, + { + "epoch": 1.8322261827608555, + "grad_norm": 0.0644865408539772, + "learning_rate": 0.00012043296277960306, + "loss": 0.2447, + "step": 22617 + }, + { + "epoch": 1.832307193778354, + "grad_norm": 0.06668511033058167, + "learning_rate": 0.00012042846212700842, + "loss": 0.2266, + "step": 22618 + }, + { + "epoch": 1.8323882047958522, + "grad_norm": 0.0608198456466198, + "learning_rate": 0.0001204239614744138, + "loss": 0.2927, + "step": 22619 + }, + { + "epoch": 1.8324692158133506, + "grad_norm": 0.06732763350009918, + "learning_rate": 0.00012041946082181917, + "loss": 0.2836, + "step": 22620 + }, + { + "epoch": 1.8325502268308491, + "grad_norm": 0.052625562995672226, + "learning_rate": 0.00012041496016922453, + "loss": 0.2439, + "step": 22621 + }, + { + "epoch": 1.8326312378483474, + "grad_norm": 0.055617038160562515, + "learning_rate": 0.00012041045951662993, + "loss": 0.2672, + "step": 22622 + }, + { + "epoch": 1.8327122488658456, + "grad_norm": 0.06397406756877899, + "learning_rate": 0.0001204059588640353, + "loss": 0.2955, + "step": 22623 + }, + { + "epoch": 1.8327932598833443, + "grad_norm": 0.05224163830280304, + "learning_rate": 0.00012040145821144066, + "loss": 0.2504, + "step": 22624 + }, + { + "epoch": 1.8328742709008425, + "grad_norm": 0.057418178766965866, + "learning_rate": 0.00012039695755884603, + "loss": 0.2887, + "step": 22625 + }, + { + "epoch": 1.8329552819183408, + "grad_norm": 0.05472564697265625, + "learning_rate": 0.00012039245690625141, + "loss": 0.279, + "step": 22626 + }, + { + "epoch": 1.8330362929358393, + "grad_norm": 0.05555223673582077, + "learning_rate": 0.00012038795625365677, + "loss": 0.2792, + "step": 22627 + }, + { + "epoch": 1.8331173039533377, + "grad_norm": 0.05100194364786148, + "learning_rate": 0.00012038345560106217, + "loss": 0.2292, + "step": 22628 + }, + { + "epoch": 1.833198314970836, + "grad_norm": 0.057672880589962006, + "learning_rate": 0.00012037895494846754, + "loss": 0.3163, + "step": 22629 + }, + { + "epoch": 1.8332793259883344, + "grad_norm": 0.05746078118681908, + "learning_rate": 0.0001203744542958729, + "loss": 0.273, + "step": 22630 + }, + { + "epoch": 1.833360337005833, + "grad_norm": 0.04709317535161972, + "learning_rate": 0.00012036995364327828, + "loss": 0.2454, + "step": 22631 + }, + { + "epoch": 1.8334413480233311, + "grad_norm": 0.058300264179706573, + "learning_rate": 0.00012036545299068365, + "loss": 0.2748, + "step": 22632 + }, + { + "epoch": 1.8335223590408296, + "grad_norm": 0.050654564052820206, + "learning_rate": 0.00012036095233808904, + "loss": 0.2443, + "step": 22633 + }, + { + "epoch": 1.833603370058328, + "grad_norm": 0.05865953490138054, + "learning_rate": 0.00012035645168549441, + "loss": 0.2854, + "step": 22634 + }, + { + "epoch": 1.8336843810758263, + "grad_norm": 0.05909755825996399, + "learning_rate": 0.00012035195103289978, + "loss": 0.2888, + "step": 22635 + }, + { + "epoch": 1.8337653920933246, + "grad_norm": 0.06934584677219391, + "learning_rate": 0.00012034745038030514, + "loss": 0.2787, + "step": 22636 + }, + { + "epoch": 1.833846403110823, + "grad_norm": 0.0527278296649456, + "learning_rate": 0.00012034294972771052, + "loss": 0.2694, + "step": 22637 + }, + { + "epoch": 
1.8339274141283215, + "grad_norm": 0.05175149068236351, + "learning_rate": 0.00012033844907511589, + "loss": 0.241, + "step": 22638 + }, + { + "epoch": 1.8340084251458197, + "grad_norm": 0.05036207661032677, + "learning_rate": 0.00012033394842252128, + "loss": 0.2634, + "step": 22639 + }, + { + "epoch": 1.8340894361633182, + "grad_norm": 0.05339133366942406, + "learning_rate": 0.00012032944776992665, + "loss": 0.2955, + "step": 22640 + }, + { + "epoch": 1.8341704471808167, + "grad_norm": 0.055165499448776245, + "learning_rate": 0.00012032494711733203, + "loss": 0.2867, + "step": 22641 + }, + { + "epoch": 1.834251458198315, + "grad_norm": 0.07881447672843933, + "learning_rate": 0.00012032044646473739, + "loss": 0.3049, + "step": 22642 + }, + { + "epoch": 1.8343324692158134, + "grad_norm": 0.05756150186061859, + "learning_rate": 0.00012031594581214276, + "loss": 0.2966, + "step": 22643 + }, + { + "epoch": 1.8344134802333119, + "grad_norm": 0.05701543390750885, + "learning_rate": 0.00012031144515954813, + "loss": 0.2866, + "step": 22644 + }, + { + "epoch": 1.83449449125081, + "grad_norm": 0.04993041232228279, + "learning_rate": 0.00012030694450695352, + "loss": 0.2601, + "step": 22645 + }, + { + "epoch": 1.8345755022683083, + "grad_norm": 0.05136844143271446, + "learning_rate": 0.0001203024438543589, + "loss": 0.3156, + "step": 22646 + }, + { + "epoch": 1.834656513285807, + "grad_norm": 0.060809340327978134, + "learning_rate": 0.00012029794320176427, + "loss": 0.3136, + "step": 22647 + }, + { + "epoch": 1.8347375243033053, + "grad_norm": 0.05689241737127304, + "learning_rate": 0.00012029344254916963, + "loss": 0.2737, + "step": 22648 + }, + { + "epoch": 1.8348185353208035, + "grad_norm": 0.06734489649534225, + "learning_rate": 0.000120288941896575, + "loss": 0.3179, + "step": 22649 + }, + { + "epoch": 1.834899546338302, + "grad_norm": 0.05590895935893059, + "learning_rate": 0.00012028444124398038, + "loss": 0.2887, + "step": 22650 + }, + { + "epoch": 1.8349805573558005, + "grad_norm": 0.06098778545856476, + "learning_rate": 0.00012027994059138576, + "loss": 0.2639, + "step": 22651 + }, + { + "epoch": 1.8350615683732987, + "grad_norm": 0.06430383026599884, + "learning_rate": 0.00012027543993879114, + "loss": 0.2732, + "step": 22652 + }, + { + "epoch": 1.8351425793907972, + "grad_norm": 0.060666609555482864, + "learning_rate": 0.00012027093928619651, + "loss": 0.2682, + "step": 22653 + }, + { + "epoch": 1.8352235904082956, + "grad_norm": 0.062334634363651276, + "learning_rate": 0.00012026643863360187, + "loss": 0.2374, + "step": 22654 + }, + { + "epoch": 1.8353046014257939, + "grad_norm": 0.06325268745422363, + "learning_rate": 0.00012026193798100724, + "loss": 0.2965, + "step": 22655 + }, + { + "epoch": 1.8353856124432921, + "grad_norm": 0.05274970829486847, + "learning_rate": 0.00012025743732841263, + "loss": 0.2384, + "step": 22656 + }, + { + "epoch": 1.8354666234607908, + "grad_norm": 0.05541786924004555, + "learning_rate": 0.000120252936675818, + "loss": 0.2724, + "step": 22657 + }, + { + "epoch": 1.835547634478289, + "grad_norm": 0.0628451406955719, + "learning_rate": 0.00012024843602322338, + "loss": 0.2188, + "step": 22658 + }, + { + "epoch": 1.8356286454957873, + "grad_norm": 0.05656782537698746, + "learning_rate": 0.00012024393537062875, + "loss": 0.2519, + "step": 22659 + }, + { + "epoch": 1.8357096565132858, + "grad_norm": 0.05050123482942581, + "learning_rate": 0.00012023943471803411, + "loss": 0.2645, + "step": 22660 + }, + { + "epoch": 1.8357906675307842, + "grad_norm": 
0.0647178515791893, + "learning_rate": 0.00012023493406543948, + "loss": 0.3063, + "step": 22661 + }, + { + "epoch": 1.8358716785482825, + "grad_norm": 0.06083545461297035, + "learning_rate": 0.00012023043341284487, + "loss": 0.2363, + "step": 22662 + }, + { + "epoch": 1.835952689565781, + "grad_norm": 0.06450481712818146, + "learning_rate": 0.00012022593276025025, + "loss": 0.2894, + "step": 22663 + }, + { + "epoch": 1.8360337005832794, + "grad_norm": 0.059075742959976196, + "learning_rate": 0.00012022143210765562, + "loss": 0.2459, + "step": 22664 + }, + { + "epoch": 1.8361147116007777, + "grad_norm": 0.05525489151477814, + "learning_rate": 0.00012021693145506099, + "loss": 0.2711, + "step": 22665 + }, + { + "epoch": 1.8361957226182761, + "grad_norm": 0.05348137393593788, + "learning_rate": 0.00012021243080246635, + "loss": 0.3088, + "step": 22666 + }, + { + "epoch": 1.8362767336357746, + "grad_norm": 0.05961565673351288, + "learning_rate": 0.00012020793014987173, + "loss": 0.2618, + "step": 22667 + }, + { + "epoch": 1.8363577446532728, + "grad_norm": 0.056940242648124695, + "learning_rate": 0.00012020342949727711, + "loss": 0.283, + "step": 22668 + }, + { + "epoch": 1.836438755670771, + "grad_norm": 0.053428273648023605, + "learning_rate": 0.00012019892884468249, + "loss": 0.2821, + "step": 22669 + }, + { + "epoch": 1.8365197666882696, + "grad_norm": 0.05401530861854553, + "learning_rate": 0.00012019442819208786, + "loss": 0.2354, + "step": 22670 + }, + { + "epoch": 1.836600777705768, + "grad_norm": 0.07518289238214493, + "learning_rate": 0.00012018992753949323, + "loss": 0.2414, + "step": 22671 + }, + { + "epoch": 1.8366817887232663, + "grad_norm": 0.05973779782652855, + "learning_rate": 0.0001201854268868986, + "loss": 0.2419, + "step": 22672 + }, + { + "epoch": 1.8367627997407647, + "grad_norm": 0.05145569518208504, + "learning_rate": 0.00012018092623430397, + "loss": 0.2599, + "step": 22673 + }, + { + "epoch": 1.8368438107582632, + "grad_norm": 0.06088968366384506, + "learning_rate": 0.00012017642558170937, + "loss": 0.238, + "step": 22674 + }, + { + "epoch": 1.8369248217757614, + "grad_norm": 0.055228862911462784, + "learning_rate": 0.00012017192492911473, + "loss": 0.278, + "step": 22675 + }, + { + "epoch": 1.83700583279326, + "grad_norm": 0.04872765392065048, + "learning_rate": 0.0001201674242765201, + "loss": 0.2543, + "step": 22676 + }, + { + "epoch": 1.8370868438107584, + "grad_norm": 0.05401741713285446, + "learning_rate": 0.00012016292362392548, + "loss": 0.2407, + "step": 22677 + }, + { + "epoch": 1.8371678548282566, + "grad_norm": 0.059674959629774094, + "learning_rate": 0.00012015842297133084, + "loss": 0.2633, + "step": 22678 + }, + { + "epoch": 1.8372488658457549, + "grad_norm": 0.07011188566684723, + "learning_rate": 0.00012015392231873621, + "loss": 0.2824, + "step": 22679 + }, + { + "epoch": 1.8373298768632536, + "grad_norm": 0.05733582004904747, + "learning_rate": 0.00012014942166614161, + "loss": 0.2592, + "step": 22680 + }, + { + "epoch": 1.8374108878807518, + "grad_norm": 0.06655251979827881, + "learning_rate": 0.00012014492101354697, + "loss": 0.2793, + "step": 22681 + }, + { + "epoch": 1.83749189889825, + "grad_norm": 0.05587971210479736, + "learning_rate": 0.00012014042036095234, + "loss": 0.2447, + "step": 22682 + }, + { + "epoch": 1.8375729099157485, + "grad_norm": 0.052102141082286835, + "learning_rate": 0.00012013591970835772, + "loss": 0.2994, + "step": 22683 + }, + { + "epoch": 1.837653920933247, + "grad_norm": 0.05093378946185112, + "learning_rate": 
0.00012013141905576308, + "loss": 0.2451, + "step": 22684 + }, + { + "epoch": 1.8377349319507452, + "grad_norm": 0.05560063570737839, + "learning_rate": 0.00012012691840316848, + "loss": 0.2608, + "step": 22685 + }, + { + "epoch": 1.8378159429682437, + "grad_norm": 0.07167977094650269, + "learning_rate": 0.00012012241775057385, + "loss": 0.2806, + "step": 22686 + }, + { + "epoch": 1.8378969539857422, + "grad_norm": 0.0680895745754242, + "learning_rate": 0.00012011791709797921, + "loss": 0.2729, + "step": 22687 + }, + { + "epoch": 1.8379779650032404, + "grad_norm": 0.06179485470056534, + "learning_rate": 0.00012011341644538459, + "loss": 0.2948, + "step": 22688 + }, + { + "epoch": 1.8380589760207389, + "grad_norm": 0.06343559920787811, + "learning_rate": 0.00012010891579278996, + "loss": 0.2774, + "step": 22689 + }, + { + "epoch": 1.8381399870382373, + "grad_norm": 0.07578915357589722, + "learning_rate": 0.00012010441514019532, + "loss": 0.3282, + "step": 22690 + }, + { + "epoch": 1.8382209980557356, + "grad_norm": 0.060906898230314255, + "learning_rate": 0.00012009991448760072, + "loss": 0.269, + "step": 22691 + }, + { + "epoch": 1.8383020090732338, + "grad_norm": 0.052891168743371964, + "learning_rate": 0.0001200954138350061, + "loss": 0.2587, + "step": 22692 + }, + { + "epoch": 1.8383830200907323, + "grad_norm": 0.04828254505991936, + "learning_rate": 0.00012009091318241145, + "loss": 0.28, + "step": 22693 + }, + { + "epoch": 1.8384640311082308, + "grad_norm": 0.06219147890806198, + "learning_rate": 0.00012008641252981683, + "loss": 0.2745, + "step": 22694 + }, + { + "epoch": 1.838545042125729, + "grad_norm": 0.06701318919658661, + "learning_rate": 0.0001200819118772222, + "loss": 0.2384, + "step": 22695 + }, + { + "epoch": 1.8386260531432275, + "grad_norm": 0.04830949380993843, + "learning_rate": 0.00012007741122462756, + "loss": 0.2375, + "step": 22696 + }, + { + "epoch": 1.838707064160726, + "grad_norm": 0.05603285878896713, + "learning_rate": 0.00012007291057203296, + "loss": 0.2444, + "step": 22697 + }, + { + "epoch": 1.8387880751782242, + "grad_norm": 0.06188029795885086, + "learning_rate": 0.00012006840991943834, + "loss": 0.2722, + "step": 22698 + }, + { + "epoch": 1.8388690861957226, + "grad_norm": 0.07010263204574585, + "learning_rate": 0.0001200639092668437, + "loss": 0.28, + "step": 22699 + }, + { + "epoch": 1.8389500972132211, + "grad_norm": 0.059973932802677155, + "learning_rate": 0.00012005940861424907, + "loss": 0.2692, + "step": 22700 + }, + { + "epoch": 1.8390311082307194, + "grad_norm": 0.0619681179523468, + "learning_rate": 0.00012005490796165444, + "loss": 0.3075, + "step": 22701 + }, + { + "epoch": 1.8391121192482176, + "grad_norm": 0.06769229471683502, + "learning_rate": 0.0001200504073090598, + "loss": 0.2876, + "step": 22702 + }, + { + "epoch": 1.8391931302657163, + "grad_norm": 0.060972172766923904, + "learning_rate": 0.0001200459066564652, + "loss": 0.2646, + "step": 22703 + }, + { + "epoch": 1.8392741412832145, + "grad_norm": 0.06796864420175552, + "learning_rate": 0.00012004140600387058, + "loss": 0.251, + "step": 22704 + }, + { + "epoch": 1.8393551523007128, + "grad_norm": 0.060075342655181885, + "learning_rate": 0.00012003690535127594, + "loss": 0.2577, + "step": 22705 + }, + { + "epoch": 1.8394361633182112, + "grad_norm": 0.06749142706394196, + "learning_rate": 0.00012003240469868131, + "loss": 0.2506, + "step": 22706 + }, + { + "epoch": 1.8395171743357097, + "grad_norm": 0.07474429905414581, + "learning_rate": 0.00012002790404608668, + "loss": 0.2846, + 
"step": 22707 + }, + { + "epoch": 1.839598185353208, + "grad_norm": 0.05930938944220543, + "learning_rate": 0.00012002340339349207, + "loss": 0.3026, + "step": 22708 + }, + { + "epoch": 1.8396791963707064, + "grad_norm": 0.06845049560070038, + "learning_rate": 0.00012001890274089744, + "loss": 0.2588, + "step": 22709 + }, + { + "epoch": 1.839760207388205, + "grad_norm": 0.059819675981998444, + "learning_rate": 0.00012001440208830282, + "loss": 0.2679, + "step": 22710 + }, + { + "epoch": 1.8398412184057031, + "grad_norm": 0.07567431777715683, + "learning_rate": 0.00012000990143570818, + "loss": 0.3168, + "step": 22711 + }, + { + "epoch": 1.8399222294232016, + "grad_norm": 0.06516402959823608, + "learning_rate": 0.00012000540078311355, + "loss": 0.2703, + "step": 22712 + }, + { + "epoch": 1.8400032404407, + "grad_norm": 0.05498339235782623, + "learning_rate": 0.00012000090013051893, + "loss": 0.2821, + "step": 22713 + }, + { + "epoch": 1.8400842514581983, + "grad_norm": 0.06723517179489136, + "learning_rate": 0.00011999639947792431, + "loss": 0.301, + "step": 22714 + }, + { + "epoch": 1.8401652624756966, + "grad_norm": 0.061191376298666, + "learning_rate": 0.00011999189882532969, + "loss": 0.2895, + "step": 22715 + }, + { + "epoch": 1.840246273493195, + "grad_norm": 0.0644366443157196, + "learning_rate": 0.00011998739817273506, + "loss": 0.2767, + "step": 22716 + }, + { + "epoch": 1.8403272845106935, + "grad_norm": 0.04802727699279785, + "learning_rate": 0.00011998289752014042, + "loss": 0.2429, + "step": 22717 + }, + { + "epoch": 1.8404082955281917, + "grad_norm": 0.061694443225860596, + "learning_rate": 0.0001199783968675458, + "loss": 0.3248, + "step": 22718 + }, + { + "epoch": 1.8404893065456902, + "grad_norm": 0.055218540132045746, + "learning_rate": 0.00011997389621495117, + "loss": 0.2767, + "step": 22719 + }, + { + "epoch": 1.8405703175631887, + "grad_norm": 0.06819997727870941, + "learning_rate": 0.00011996939556235655, + "loss": 0.2622, + "step": 22720 + }, + { + "epoch": 1.840651328580687, + "grad_norm": 0.04921705648303032, + "learning_rate": 0.00011996489490976193, + "loss": 0.2548, + "step": 22721 + }, + { + "epoch": 1.8407323395981854, + "grad_norm": 0.05520812049508095, + "learning_rate": 0.0001199603942571673, + "loss": 0.2925, + "step": 22722 + }, + { + "epoch": 1.8408133506156839, + "grad_norm": 0.05166729539632797, + "learning_rate": 0.00011995589360457266, + "loss": 0.2553, + "step": 22723 + }, + { + "epoch": 1.840894361633182, + "grad_norm": 0.045433759689331055, + "learning_rate": 0.00011995139295197804, + "loss": 0.246, + "step": 22724 + }, + { + "epoch": 1.8409753726506803, + "grad_norm": 0.044920552521944046, + "learning_rate": 0.00011994689229938341, + "loss": 0.2227, + "step": 22725 + }, + { + "epoch": 1.841056383668179, + "grad_norm": 0.05813392996788025, + "learning_rate": 0.0001199423916467888, + "loss": 0.2632, + "step": 22726 + }, + { + "epoch": 1.8411373946856773, + "grad_norm": 0.06702584773302078, + "learning_rate": 0.00011993789099419417, + "loss": 0.2737, + "step": 22727 + }, + { + "epoch": 1.8412184057031755, + "grad_norm": 0.05693779140710831, + "learning_rate": 0.00011993339034159954, + "loss": 0.2882, + "step": 22728 + }, + { + "epoch": 1.841299416720674, + "grad_norm": 0.06254604458808899, + "learning_rate": 0.0001199288896890049, + "loss": 0.2929, + "step": 22729 + }, + { + "epoch": 1.8413804277381725, + "grad_norm": 0.05122413486242294, + "learning_rate": 0.00011992438903641028, + "loss": 0.2891, + "step": 22730 + }, + { + "epoch": 
1.8414614387556707, + "grad_norm": 0.05988462269306183, + "learning_rate": 0.00011991988838381565, + "loss": 0.2963, + "step": 22731 + }, + { + "epoch": 1.8415424497731692, + "grad_norm": 0.05026065558195114, + "learning_rate": 0.00011991538773122104, + "loss": 0.2593, + "step": 22732 + }, + { + "epoch": 1.8416234607906676, + "grad_norm": 0.05141877382993698, + "learning_rate": 0.00011991088707862641, + "loss": 0.2585, + "step": 22733 + }, + { + "epoch": 1.8417044718081659, + "grad_norm": 0.05695733428001404, + "learning_rate": 0.00011990638642603178, + "loss": 0.3031, + "step": 22734 + }, + { + "epoch": 1.8417854828256643, + "grad_norm": 0.051710471510887146, + "learning_rate": 0.00011990188577343714, + "loss": 0.279, + "step": 22735 + }, + { + "epoch": 1.8418664938431628, + "grad_norm": 0.05895623937249184, + "learning_rate": 0.00011989738512084252, + "loss": 0.3285, + "step": 22736 + }, + { + "epoch": 1.841947504860661, + "grad_norm": 0.06714142858982086, + "learning_rate": 0.00011989288446824792, + "loss": 0.3021, + "step": 22737 + }, + { + "epoch": 1.8420285158781593, + "grad_norm": 0.06259685754776001, + "learning_rate": 0.00011988838381565328, + "loss": 0.2606, + "step": 22738 + }, + { + "epoch": 1.8421095268956578, + "grad_norm": 0.06058111786842346, + "learning_rate": 0.00011988388316305865, + "loss": 0.2916, + "step": 22739 + }, + { + "epoch": 1.8421905379131562, + "grad_norm": 0.06061285734176636, + "learning_rate": 0.00011987938251046403, + "loss": 0.2886, + "step": 22740 + }, + { + "epoch": 1.8422715489306545, + "grad_norm": 0.053787924349308014, + "learning_rate": 0.00011987488185786939, + "loss": 0.2605, + "step": 22741 + }, + { + "epoch": 1.842352559948153, + "grad_norm": 0.06060533970594406, + "learning_rate": 0.00011987038120527476, + "loss": 0.2729, + "step": 22742 + }, + { + "epoch": 1.8424335709656514, + "grad_norm": 0.052182767540216446, + "learning_rate": 0.00011986588055268016, + "loss": 0.2454, + "step": 22743 + }, + { + "epoch": 1.8425145819831497, + "grad_norm": 0.05701451748609543, + "learning_rate": 0.00011986137990008552, + "loss": 0.2899, + "step": 22744 + }, + { + "epoch": 1.8425955930006481, + "grad_norm": 0.07026814669370651, + "learning_rate": 0.0001198568792474909, + "loss": 0.2672, + "step": 22745 + }, + { + "epoch": 1.8426766040181466, + "grad_norm": 0.056616608053445816, + "learning_rate": 0.00011985237859489627, + "loss": 0.2921, + "step": 22746 + }, + { + "epoch": 1.8427576150356448, + "grad_norm": 0.05835625156760216, + "learning_rate": 0.00011984787794230163, + "loss": 0.2585, + "step": 22747 + }, + { + "epoch": 1.842838626053143, + "grad_norm": 0.05612039193511009, + "learning_rate": 0.000119843377289707, + "loss": 0.2238, + "step": 22748 + }, + { + "epoch": 1.8429196370706418, + "grad_norm": 0.062194813042879105, + "learning_rate": 0.0001198388766371124, + "loss": 0.2807, + "step": 22749 + }, + { + "epoch": 1.84300064808814, + "grad_norm": 0.057809729129076004, + "learning_rate": 0.00011983437598451776, + "loss": 0.264, + "step": 22750 + }, + { + "epoch": 1.8430816591056383, + "grad_norm": 0.07250712811946869, + "learning_rate": 0.00011982987533192314, + "loss": 0.2787, + "step": 22751 + }, + { + "epoch": 1.8431626701231367, + "grad_norm": 0.061580806970596313, + "learning_rate": 0.00011982537467932851, + "loss": 0.2796, + "step": 22752 + }, + { + "epoch": 1.8432436811406352, + "grad_norm": 0.0518670380115509, + "learning_rate": 0.00011982087402673387, + "loss": 0.263, + "step": 22753 + }, + { + "epoch": 1.8433246921581334, + "grad_norm": 
0.07268506288528442, + "learning_rate": 0.00011981637337413924, + "loss": 0.2785, + "step": 22754 + }, + { + "epoch": 1.843405703175632, + "grad_norm": 0.05438043922185898, + "learning_rate": 0.00011981187272154464, + "loss": 0.2325, + "step": 22755 + }, + { + "epoch": 1.8434867141931304, + "grad_norm": 0.05696878209710121, + "learning_rate": 0.00011980737206895, + "loss": 0.2576, + "step": 22756 + }, + { + "epoch": 1.8435677252106286, + "grad_norm": 0.049444738775491714, + "learning_rate": 0.00011980287141635538, + "loss": 0.237, + "step": 22757 + }, + { + "epoch": 1.8436487362281269, + "grad_norm": 0.0545518733561039, + "learning_rate": 0.00011979837076376075, + "loss": 0.2593, + "step": 22758 + }, + { + "epoch": 1.8437297472456255, + "grad_norm": 0.050957489758729935, + "learning_rate": 0.00011979387011116611, + "loss": 0.2338, + "step": 22759 + }, + { + "epoch": 1.8438107582631238, + "grad_norm": 0.06445847451686859, + "learning_rate": 0.00011978936945857148, + "loss": 0.2653, + "step": 22760 + }, + { + "epoch": 1.843891769280622, + "grad_norm": 0.057379335165023804, + "learning_rate": 0.00011978486880597689, + "loss": 0.2853, + "step": 22761 + }, + { + "epoch": 1.8439727802981205, + "grad_norm": 0.05096464604139328, + "learning_rate": 0.00011978036815338225, + "loss": 0.2465, + "step": 22762 + }, + { + "epoch": 1.844053791315619, + "grad_norm": 0.06210287660360336, + "learning_rate": 0.00011977586750078762, + "loss": 0.2893, + "step": 22763 + }, + { + "epoch": 1.8441348023331172, + "grad_norm": 0.05852840095758438, + "learning_rate": 0.00011977136684819299, + "loss": 0.2559, + "step": 22764 + }, + { + "epoch": 1.8442158133506157, + "grad_norm": 0.06159196048974991, + "learning_rate": 0.00011976686619559835, + "loss": 0.2812, + "step": 22765 + }, + { + "epoch": 1.8442968243681142, + "grad_norm": 0.06781520694494247, + "learning_rate": 0.00011976236554300375, + "loss": 0.2419, + "step": 22766 + }, + { + "epoch": 1.8443778353856124, + "grad_norm": 0.06899692863225937, + "learning_rate": 0.00011975786489040913, + "loss": 0.2556, + "step": 22767 + }, + { + "epoch": 1.8444588464031109, + "grad_norm": 0.057173509150743484, + "learning_rate": 0.00011975336423781449, + "loss": 0.2945, + "step": 22768 + }, + { + "epoch": 1.8445398574206093, + "grad_norm": 0.05989151448011398, + "learning_rate": 0.00011974886358521986, + "loss": 0.2691, + "step": 22769 + }, + { + "epoch": 1.8446208684381076, + "grad_norm": 0.05088137835264206, + "learning_rate": 0.00011974436293262523, + "loss": 0.244, + "step": 22770 + }, + { + "epoch": 1.8447018794556058, + "grad_norm": 0.056217145174741745, + "learning_rate": 0.0001197398622800306, + "loss": 0.2451, + "step": 22771 + }, + { + "epoch": 1.8447828904731045, + "grad_norm": 0.062338441610336304, + "learning_rate": 0.000119735361627436, + "loss": 0.2763, + "step": 22772 + }, + { + "epoch": 1.8448639014906028, + "grad_norm": 0.05098249763250351, + "learning_rate": 0.00011973086097484137, + "loss": 0.2299, + "step": 22773 + }, + { + "epoch": 1.844944912508101, + "grad_norm": 0.06756927073001862, + "learning_rate": 0.00011972636032224673, + "loss": 0.2627, + "step": 22774 + }, + { + "epoch": 1.8450259235255995, + "grad_norm": 0.04866158589720726, + "learning_rate": 0.0001197218596696521, + "loss": 0.2783, + "step": 22775 + }, + { + "epoch": 1.845106934543098, + "grad_norm": 0.051096439361572266, + "learning_rate": 0.00011971735901705748, + "loss": 0.2673, + "step": 22776 + }, + { + "epoch": 1.8451879455605962, + "grad_norm": 0.057585157454013824, + "learning_rate": 
0.00011971285836446285, + "loss": 0.2833, + "step": 22777 + }, + { + "epoch": 1.8452689565780946, + "grad_norm": 0.057650353759527206, + "learning_rate": 0.00011970835771186824, + "loss": 0.2923, + "step": 22778 + }, + { + "epoch": 1.845349967595593, + "grad_norm": 0.05406733974814415, + "learning_rate": 0.00011970385705927361, + "loss": 0.2456, + "step": 22779 + }, + { + "epoch": 1.8454309786130914, + "grad_norm": 0.05556425824761391, + "learning_rate": 0.00011969935640667897, + "loss": 0.2827, + "step": 22780 + }, + { + "epoch": 1.8455119896305896, + "grad_norm": 0.07153059542179108, + "learning_rate": 0.00011969485575408434, + "loss": 0.2633, + "step": 22781 + }, + { + "epoch": 1.8455930006480883, + "grad_norm": 0.06352782249450684, + "learning_rate": 0.00011969035510148972, + "loss": 0.2709, + "step": 22782 + }, + { + "epoch": 1.8456740116655865, + "grad_norm": 0.05468868836760521, + "learning_rate": 0.00011968585444889509, + "loss": 0.2873, + "step": 22783 + }, + { + "epoch": 1.8457550226830848, + "grad_norm": 0.055515628308057785, + "learning_rate": 0.00011968135379630048, + "loss": 0.2938, + "step": 22784 + }, + { + "epoch": 1.8458360337005832, + "grad_norm": 0.05868731066584587, + "learning_rate": 0.00011967685314370585, + "loss": 0.2835, + "step": 22785 + }, + { + "epoch": 1.8459170447180817, + "grad_norm": 0.05362274497747421, + "learning_rate": 0.00011967235249111121, + "loss": 0.2513, + "step": 22786 + }, + { + "epoch": 1.84599805573558, + "grad_norm": 0.05573923885822296, + "learning_rate": 0.00011966785183851659, + "loss": 0.2909, + "step": 22787 + }, + { + "epoch": 1.8460790667530784, + "grad_norm": 0.052229754626750946, + "learning_rate": 0.00011966335118592196, + "loss": 0.2706, + "step": 22788 + }, + { + "epoch": 1.846160077770577, + "grad_norm": 0.049126509577035904, + "learning_rate": 0.00011965885053332735, + "loss": 0.2298, + "step": 22789 + }, + { + "epoch": 1.8462410887880751, + "grad_norm": 0.05795193836092949, + "learning_rate": 0.00011965434988073272, + "loss": 0.3102, + "step": 22790 + }, + { + "epoch": 1.8463220998055736, + "grad_norm": 0.0662761703133583, + "learning_rate": 0.0001196498492281381, + "loss": 0.2588, + "step": 22791 + }, + { + "epoch": 1.846403110823072, + "grad_norm": 0.060541246086359024, + "learning_rate": 0.00011964534857554345, + "loss": 0.2954, + "step": 22792 + }, + { + "epoch": 1.8464841218405703, + "grad_norm": 0.055643677711486816, + "learning_rate": 0.00011964084792294883, + "loss": 0.2557, + "step": 22793 + }, + { + "epoch": 1.8465651328580686, + "grad_norm": 0.04613005742430687, + "learning_rate": 0.0001196363472703542, + "loss": 0.2715, + "step": 22794 + }, + { + "epoch": 1.846646143875567, + "grad_norm": 0.05167005583643913, + "learning_rate": 0.00011963184661775959, + "loss": 0.2513, + "step": 22795 + }, + { + "epoch": 1.8467271548930655, + "grad_norm": 0.05315526947379112, + "learning_rate": 0.00011962734596516496, + "loss": 0.238, + "step": 22796 + }, + { + "epoch": 1.8468081659105637, + "grad_norm": 0.05100571736693382, + "learning_rate": 0.00011962284531257034, + "loss": 0.2399, + "step": 22797 + }, + { + "epoch": 1.8468891769280622, + "grad_norm": 0.05591544508934021, + "learning_rate": 0.0001196183446599757, + "loss": 0.2991, + "step": 22798 + }, + { + "epoch": 1.8469701879455607, + "grad_norm": 0.055552974343299866, + "learning_rate": 0.00011961384400738107, + "loss": 0.249, + "step": 22799 + }, + { + "epoch": 1.847051198963059, + "grad_norm": 0.059836044907569885, + "learning_rate": 0.00011960934335478644, + "loss": 
0.2737, + "step": 22800 + }, + { + "epoch": 1.8471322099805574, + "grad_norm": 0.058615636080503464, + "learning_rate": 0.00011960484270219183, + "loss": 0.2711, + "step": 22801 + }, + { + "epoch": 1.8472132209980558, + "grad_norm": 0.05661217123270035, + "learning_rate": 0.0001196003420495972, + "loss": 0.2498, + "step": 22802 + }, + { + "epoch": 1.847294232015554, + "grad_norm": 0.05448070913553238, + "learning_rate": 0.00011959584139700258, + "loss": 0.3288, + "step": 22803 + }, + { + "epoch": 1.8473752430330523, + "grad_norm": 0.05980324000120163, + "learning_rate": 0.00011959134074440794, + "loss": 0.2745, + "step": 22804 + }, + { + "epoch": 1.847456254050551, + "grad_norm": 0.060643620789051056, + "learning_rate": 0.00011958684009181331, + "loss": 0.2992, + "step": 22805 + }, + { + "epoch": 1.8475372650680493, + "grad_norm": 0.05414064601063728, + "learning_rate": 0.00011958233943921868, + "loss": 0.237, + "step": 22806 + }, + { + "epoch": 1.8476182760855475, + "grad_norm": 0.06307387351989746, + "learning_rate": 0.00011957783878662407, + "loss": 0.3098, + "step": 22807 + }, + { + "epoch": 1.847699287103046, + "grad_norm": 0.054019246250391006, + "learning_rate": 0.00011957333813402945, + "loss": 0.283, + "step": 22808 + }, + { + "epoch": 1.8477802981205445, + "grad_norm": 0.056204576045274734, + "learning_rate": 0.00011956883748143482, + "loss": 0.2118, + "step": 22809 + }, + { + "epoch": 1.8478613091380427, + "grad_norm": 0.05422830581665039, + "learning_rate": 0.00011956433682884018, + "loss": 0.2927, + "step": 22810 + }, + { + "epoch": 1.8479423201555412, + "grad_norm": 0.0619785338640213, + "learning_rate": 0.00011955983617624555, + "loss": 0.2982, + "step": 22811 + }, + { + "epoch": 1.8480233311730396, + "grad_norm": 0.04841968044638634, + "learning_rate": 0.00011955533552365093, + "loss": 0.2854, + "step": 22812 + }, + { + "epoch": 1.8481043421905379, + "grad_norm": 0.05482259392738342, + "learning_rate": 0.00011955083487105631, + "loss": 0.3004, + "step": 22813 + }, + { + "epoch": 1.8481853532080363, + "grad_norm": 0.05952038988471031, + "learning_rate": 0.00011954633421846169, + "loss": 0.274, + "step": 22814 + }, + { + "epoch": 1.8482663642255348, + "grad_norm": 0.06057178974151611, + "learning_rate": 0.00011954183356586706, + "loss": 0.3035, + "step": 22815 + }, + { + "epoch": 1.848347375243033, + "grad_norm": 0.05620408430695534, + "learning_rate": 0.00011953733291327242, + "loss": 0.2811, + "step": 22816 + }, + { + "epoch": 1.8484283862605313, + "grad_norm": 0.05309577286243439, + "learning_rate": 0.0001195328322606778, + "loss": 0.2644, + "step": 22817 + }, + { + "epoch": 1.8485093972780298, + "grad_norm": 0.0493699349462986, + "learning_rate": 0.0001195283316080832, + "loss": 0.225, + "step": 22818 + }, + { + "epoch": 1.8485904082955282, + "grad_norm": 0.04509785398840904, + "learning_rate": 0.00011952383095548855, + "loss": 0.2254, + "step": 22819 + }, + { + "epoch": 1.8486714193130265, + "grad_norm": 0.059793125838041306, + "learning_rate": 0.00011951933030289393, + "loss": 0.2752, + "step": 22820 + }, + { + "epoch": 1.848752430330525, + "grad_norm": 0.05882219597697258, + "learning_rate": 0.0001195148296502993, + "loss": 0.278, + "step": 22821 + }, + { + "epoch": 1.8488334413480234, + "grad_norm": 0.05098733305931091, + "learning_rate": 0.00011951032899770466, + "loss": 0.2775, + "step": 22822 + }, + { + "epoch": 1.8489144523655217, + "grad_norm": 0.06307312101125717, + "learning_rate": 0.00011950582834511004, + "loss": 0.2493, + "step": 22823 + }, + { + "epoch": 
1.8489954633830201, + "grad_norm": 0.056472472846508026, + "learning_rate": 0.00011950132769251544, + "loss": 0.2778, + "step": 22824 + }, + { + "epoch": 1.8490764744005186, + "grad_norm": 0.045031849294900894, + "learning_rate": 0.0001194968270399208, + "loss": 0.2271, + "step": 22825 + }, + { + "epoch": 1.8491574854180168, + "grad_norm": 0.07028354704380035, + "learning_rate": 0.00011949232638732617, + "loss": 0.2795, + "step": 22826 + }, + { + "epoch": 1.849238496435515, + "grad_norm": 0.06089423596858978, + "learning_rate": 0.00011948782573473154, + "loss": 0.2195, + "step": 22827 + }, + { + "epoch": 1.8493195074530138, + "grad_norm": 0.05401669070124626, + "learning_rate": 0.0001194833250821369, + "loss": 0.2605, + "step": 22828 + }, + { + "epoch": 1.849400518470512, + "grad_norm": 0.05383060872554779, + "learning_rate": 0.00011947882442954228, + "loss": 0.2788, + "step": 22829 + }, + { + "epoch": 1.8494815294880103, + "grad_norm": 0.053320858627557755, + "learning_rate": 0.00011947432377694768, + "loss": 0.2573, + "step": 22830 + }, + { + "epoch": 1.8495625405055087, + "grad_norm": 0.048075009137392044, + "learning_rate": 0.00011946982312435304, + "loss": 0.2569, + "step": 22831 + }, + { + "epoch": 1.8496435515230072, + "grad_norm": 0.05769873037934303, + "learning_rate": 0.00011946532247175841, + "loss": 0.2581, + "step": 22832 + }, + { + "epoch": 1.8497245625405054, + "grad_norm": 0.053351398557424545, + "learning_rate": 0.00011946082181916379, + "loss": 0.2496, + "step": 22833 + }, + { + "epoch": 1.849805573558004, + "grad_norm": 0.05518243834376335, + "learning_rate": 0.00011945632116656915, + "loss": 0.2513, + "step": 22834 + }, + { + "epoch": 1.8498865845755024, + "grad_norm": 0.04904431477189064, + "learning_rate": 0.00011945182051397452, + "loss": 0.2684, + "step": 22835 + }, + { + "epoch": 1.8499675955930006, + "grad_norm": 0.06256154924631119, + "learning_rate": 0.00011944731986137992, + "loss": 0.2536, + "step": 22836 + }, + { + "epoch": 1.850048606610499, + "grad_norm": 0.05640607327222824, + "learning_rate": 0.00011944281920878528, + "loss": 0.235, + "step": 22837 + }, + { + "epoch": 1.8501296176279975, + "grad_norm": 0.06628984957933426, + "learning_rate": 0.00011943831855619065, + "loss": 0.273, + "step": 22838 + }, + { + "epoch": 1.8502106286454958, + "grad_norm": 0.07006203383207321, + "learning_rate": 0.00011943381790359603, + "loss": 0.2872, + "step": 22839 + }, + { + "epoch": 1.850291639662994, + "grad_norm": 0.05886678397655487, + "learning_rate": 0.0001194293172510014, + "loss": 0.2358, + "step": 22840 + }, + { + "epoch": 1.8503726506804925, + "grad_norm": 0.05628499016165733, + "learning_rate": 0.00011942481659840679, + "loss": 0.2739, + "step": 22841 + }, + { + "epoch": 1.850453661697991, + "grad_norm": 0.052815746515989304, + "learning_rate": 0.00011942031594581216, + "loss": 0.2539, + "step": 22842 + }, + { + "epoch": 1.8505346727154892, + "grad_norm": 0.07488470524549484, + "learning_rate": 0.00011941581529321752, + "loss": 0.3107, + "step": 22843 + }, + { + "epoch": 1.8506156837329877, + "grad_norm": 0.0484716072678566, + "learning_rate": 0.0001194113146406229, + "loss": 0.2826, + "step": 22844 + }, + { + "epoch": 1.8506966947504861, + "grad_norm": 0.05985249578952789, + "learning_rate": 0.00011940681398802827, + "loss": 0.2711, + "step": 22845 + }, + { + "epoch": 1.8507777057679844, + "grad_norm": 0.05810718238353729, + "learning_rate": 0.00011940231333543364, + "loss": 0.274, + "step": 22846 + }, + { + "epoch": 1.8508587167854829, + "grad_norm": 
0.05745183303952217, + "learning_rate": 0.00011939781268283903, + "loss": 0.279, + "step": 22847 + }, + { + "epoch": 1.8509397278029813, + "grad_norm": 0.050858549773693085, + "learning_rate": 0.0001193933120302444, + "loss": 0.2828, + "step": 22848 + }, + { + "epoch": 1.8510207388204796, + "grad_norm": 0.04445550590753555, + "learning_rate": 0.00011938881137764976, + "loss": 0.2621, + "step": 22849 + }, + { + "epoch": 1.8511017498379778, + "grad_norm": 0.05249026045203209, + "learning_rate": 0.00011938431072505514, + "loss": 0.2748, + "step": 22850 + }, + { + "epoch": 1.8511827608554765, + "grad_norm": 0.05515358969569206, + "learning_rate": 0.00011937981007246051, + "loss": 0.2476, + "step": 22851 + }, + { + "epoch": 1.8512637718729748, + "grad_norm": 0.05456148460507393, + "learning_rate": 0.00011937530941986588, + "loss": 0.3071, + "step": 22852 + }, + { + "epoch": 1.851344782890473, + "grad_norm": 0.05495483800768852, + "learning_rate": 0.00011937080876727127, + "loss": 0.2823, + "step": 22853 + }, + { + "epoch": 1.8514257939079715, + "grad_norm": 0.056710973381996155, + "learning_rate": 0.00011936630811467664, + "loss": 0.26, + "step": 22854 + }, + { + "epoch": 1.85150680492547, + "grad_norm": 0.05013580992817879, + "learning_rate": 0.000119361807462082, + "loss": 0.2832, + "step": 22855 + }, + { + "epoch": 1.8515878159429682, + "grad_norm": 0.0600869245827198, + "learning_rate": 0.00011935730680948738, + "loss": 0.2805, + "step": 22856 + }, + { + "epoch": 1.8516688269604666, + "grad_norm": 0.059895481914281845, + "learning_rate": 0.00011935280615689275, + "loss": 0.3037, + "step": 22857 + }, + { + "epoch": 1.851749837977965, + "grad_norm": 0.055746156722307205, + "learning_rate": 0.00011934830550429813, + "loss": 0.2847, + "step": 22858 + }, + { + "epoch": 1.8518308489954634, + "grad_norm": 0.0649481788277626, + "learning_rate": 0.00011934380485170351, + "loss": 0.2879, + "step": 22859 + }, + { + "epoch": 1.8519118600129616, + "grad_norm": 0.0539991594851017, + "learning_rate": 0.00011933930419910889, + "loss": 0.2815, + "step": 22860 + }, + { + "epoch": 1.8519928710304603, + "grad_norm": 0.056314416229724884, + "learning_rate": 0.00011933480354651425, + "loss": 0.2749, + "step": 22861 + }, + { + "epoch": 1.8520738820479585, + "grad_norm": 0.052914902567863464, + "learning_rate": 0.00011933030289391962, + "loss": 0.2394, + "step": 22862 + }, + { + "epoch": 1.8521548930654568, + "grad_norm": 0.056165698915719986, + "learning_rate": 0.00011932580224132499, + "loss": 0.2677, + "step": 22863 + }, + { + "epoch": 1.8522359040829552, + "grad_norm": 0.06530257314443588, + "learning_rate": 0.00011932130158873037, + "loss": 0.2919, + "step": 22864 + }, + { + "epoch": 1.8523169151004537, + "grad_norm": 0.06353907287120819, + "learning_rate": 0.00011931680093613575, + "loss": 0.2929, + "step": 22865 + }, + { + "epoch": 1.852397926117952, + "grad_norm": 0.06988402456045151, + "learning_rate": 0.00011931230028354113, + "loss": 0.2786, + "step": 22866 + }, + { + "epoch": 1.8524789371354504, + "grad_norm": 0.055794063955545425, + "learning_rate": 0.00011930779963094649, + "loss": 0.2701, + "step": 22867 + }, + { + "epoch": 1.8525599481529489, + "grad_norm": 0.06117599457502365, + "learning_rate": 0.00011930329897835186, + "loss": 0.2802, + "step": 22868 + }, + { + "epoch": 1.8526409591704471, + "grad_norm": 0.04874401167035103, + "learning_rate": 0.00011929879832575723, + "loss": 0.2767, + "step": 22869 + }, + { + "epoch": 1.8527219701879456, + "grad_norm": 0.04821452498435974, + "learning_rate": 
0.00011929429767316262, + "loss": 0.253, + "step": 22870 + }, + { + "epoch": 1.852802981205444, + "grad_norm": 0.0526653490960598, + "learning_rate": 0.000119289797020568, + "loss": 0.2845, + "step": 22871 + }, + { + "epoch": 1.8528839922229423, + "grad_norm": 0.05268080160021782, + "learning_rate": 0.00011928529636797337, + "loss": 0.2513, + "step": 22872 + }, + { + "epoch": 1.8529650032404406, + "grad_norm": 0.06358671933412552, + "learning_rate": 0.00011928079571537873, + "loss": 0.2685, + "step": 22873 + }, + { + "epoch": 1.8530460142579392, + "grad_norm": 0.04742802679538727, + "learning_rate": 0.0001192762950627841, + "loss": 0.3032, + "step": 22874 + }, + { + "epoch": 1.8531270252754375, + "grad_norm": 0.06022234261035919, + "learning_rate": 0.00011927179441018948, + "loss": 0.2919, + "step": 22875 + }, + { + "epoch": 1.8532080362929357, + "grad_norm": 0.06292662769556046, + "learning_rate": 0.00011926729375759486, + "loss": 0.2676, + "step": 22876 + }, + { + "epoch": 1.8532890473104342, + "grad_norm": 0.0531792975962162, + "learning_rate": 0.00011926279310500024, + "loss": 0.2299, + "step": 22877 + }, + { + "epoch": 1.8533700583279327, + "grad_norm": 0.0589904747903347, + "learning_rate": 0.00011925829245240561, + "loss": 0.263, + "step": 22878 + }, + { + "epoch": 1.853451069345431, + "grad_norm": 0.05586778372526169, + "learning_rate": 0.00011925379179981097, + "loss": 0.2393, + "step": 22879 + }, + { + "epoch": 1.8535320803629294, + "grad_norm": 0.05040878430008888, + "learning_rate": 0.00011924929114721634, + "loss": 0.3131, + "step": 22880 + }, + { + "epoch": 1.8536130913804278, + "grad_norm": 0.060290947556495667, + "learning_rate": 0.00011924479049462172, + "loss": 0.2753, + "step": 22881 + }, + { + "epoch": 1.853694102397926, + "grad_norm": 0.0618286095559597, + "learning_rate": 0.0001192402898420271, + "loss": 0.3063, + "step": 22882 + }, + { + "epoch": 1.8537751134154243, + "grad_norm": 0.06807377934455872, + "learning_rate": 0.00011923578918943248, + "loss": 0.2963, + "step": 22883 + }, + { + "epoch": 1.853856124432923, + "grad_norm": 0.0539591945707798, + "learning_rate": 0.00011923128853683785, + "loss": 0.2659, + "step": 22884 + }, + { + "epoch": 1.8539371354504213, + "grad_norm": 0.0648304671049118, + "learning_rate": 0.00011922678788424321, + "loss": 0.2938, + "step": 22885 + }, + { + "epoch": 1.8540181464679195, + "grad_norm": 0.0472523532807827, + "learning_rate": 0.00011922228723164859, + "loss": 0.2033, + "step": 22886 + }, + { + "epoch": 1.854099157485418, + "grad_norm": 0.07764612138271332, + "learning_rate": 0.00011921778657905396, + "loss": 0.3092, + "step": 22887 + }, + { + "epoch": 1.8541801685029164, + "grad_norm": 0.05178683251142502, + "learning_rate": 0.00011921328592645935, + "loss": 0.2803, + "step": 22888 + }, + { + "epoch": 1.8542611795204147, + "grad_norm": 0.06465435773134232, + "learning_rate": 0.00011920878527386472, + "loss": 0.2492, + "step": 22889 + }, + { + "epoch": 1.8543421905379132, + "grad_norm": 0.06032474339008331, + "learning_rate": 0.0001192042846212701, + "loss": 0.2915, + "step": 22890 + }, + { + "epoch": 1.8544232015554116, + "grad_norm": 0.06960516422986984, + "learning_rate": 0.00011919978396867545, + "loss": 0.2857, + "step": 22891 + }, + { + "epoch": 1.8545042125729099, + "grad_norm": 0.05207527056336403, + "learning_rate": 0.00011919528331608083, + "loss": 0.2604, + "step": 22892 + }, + { + "epoch": 1.8545852235904083, + "grad_norm": 0.049169376492500305, + "learning_rate": 0.0001191907826634862, + "loss": 0.2482, + "step": 
22893 + }, + { + "epoch": 1.8546662346079068, + "grad_norm": 0.047690924257040024, + "learning_rate": 0.00011918628201089159, + "loss": 0.2336, + "step": 22894 + }, + { + "epoch": 1.854747245625405, + "grad_norm": 0.05421299859881401, + "learning_rate": 0.00011918178135829696, + "loss": 0.2772, + "step": 22895 + }, + { + "epoch": 1.8548282566429033, + "grad_norm": 0.056196488440036774, + "learning_rate": 0.00011917728070570234, + "loss": 0.2582, + "step": 22896 + }, + { + "epoch": 1.8549092676604018, + "grad_norm": 0.06981717050075531, + "learning_rate": 0.0001191727800531077, + "loss": 0.2615, + "step": 22897 + }, + { + "epoch": 1.8549902786779002, + "grad_norm": 0.06893657892942429, + "learning_rate": 0.00011916827940051307, + "loss": 0.2834, + "step": 22898 + }, + { + "epoch": 1.8550712896953985, + "grad_norm": 0.05434175953269005, + "learning_rate": 0.00011916377874791847, + "loss": 0.2508, + "step": 22899 + }, + { + "epoch": 1.855152300712897, + "grad_norm": 0.055693451315164566, + "learning_rate": 0.00011915927809532383, + "loss": 0.2654, + "step": 22900 + }, + { + "epoch": 1.8552333117303954, + "grad_norm": 0.05152764916419983, + "learning_rate": 0.0001191547774427292, + "loss": 0.2714, + "step": 22901 + }, + { + "epoch": 1.8553143227478937, + "grad_norm": 0.05723453685641289, + "learning_rate": 0.00011915027679013458, + "loss": 0.3002, + "step": 22902 + }, + { + "epoch": 1.8553953337653921, + "grad_norm": 0.06853441894054413, + "learning_rate": 0.00011914577613753994, + "loss": 0.281, + "step": 22903 + }, + { + "epoch": 1.8554763447828906, + "grad_norm": 0.06737224757671356, + "learning_rate": 0.00011914127548494531, + "loss": 0.2943, + "step": 22904 + }, + { + "epoch": 1.8555573558003888, + "grad_norm": 0.061686307191848755, + "learning_rate": 0.00011913677483235071, + "loss": 0.2613, + "step": 22905 + }, + { + "epoch": 1.855638366817887, + "grad_norm": 0.05410462245345116, + "learning_rate": 0.00011913227417975607, + "loss": 0.2382, + "step": 22906 + }, + { + "epoch": 1.8557193778353858, + "grad_norm": 0.05027051642537117, + "learning_rate": 0.00011912777352716145, + "loss": 0.2349, + "step": 22907 + }, + { + "epoch": 1.855800388852884, + "grad_norm": 0.06542790681123734, + "learning_rate": 0.00011912327287456682, + "loss": 0.2648, + "step": 22908 + }, + { + "epoch": 1.8558813998703823, + "grad_norm": 0.06300154328346252, + "learning_rate": 0.00011911877222197219, + "loss": 0.2792, + "step": 22909 + }, + { + "epoch": 1.8559624108878807, + "grad_norm": 0.05228601396083832, + "learning_rate": 0.00011911427156937755, + "loss": 0.2633, + "step": 22910 + }, + { + "epoch": 1.8560434219053792, + "grad_norm": 0.05761516094207764, + "learning_rate": 0.00011910977091678295, + "loss": 0.271, + "step": 22911 + }, + { + "epoch": 1.8561244329228774, + "grad_norm": 0.05890168249607086, + "learning_rate": 0.00011910527026418831, + "loss": 0.297, + "step": 22912 + }, + { + "epoch": 1.856205443940376, + "grad_norm": 0.056910499930381775, + "learning_rate": 0.00011910076961159369, + "loss": 0.2503, + "step": 22913 + }, + { + "epoch": 1.8562864549578744, + "grad_norm": 0.06466339528560638, + "learning_rate": 0.00011909626895899906, + "loss": 0.2915, + "step": 22914 + }, + { + "epoch": 1.8563674659753726, + "grad_norm": 0.060335755348205566, + "learning_rate": 0.00011909176830640443, + "loss": 0.28, + "step": 22915 + }, + { + "epoch": 1.856448476992871, + "grad_norm": 0.05601664260029793, + "learning_rate": 0.0001190872676538098, + "loss": 0.2845, + "step": 22916 + }, + { + "epoch": 
1.8565294880103695, + "grad_norm": 0.04754171147942543, + "learning_rate": 0.0001190827670012152, + "loss": 0.2347, + "step": 22917 + }, + { + "epoch": 1.8566104990278678, + "grad_norm": 0.06566859036684036, + "learning_rate": 0.00011907826634862055, + "loss": 0.295, + "step": 22918 + }, + { + "epoch": 1.856691510045366, + "grad_norm": 0.043993715196847916, + "learning_rate": 0.00011907376569602593, + "loss": 0.2561, + "step": 22919 + }, + { + "epoch": 1.8567725210628645, + "grad_norm": 0.05596703663468361, + "learning_rate": 0.0001190692650434313, + "loss": 0.276, + "step": 22920 + }, + { + "epoch": 1.856853532080363, + "grad_norm": 0.04608578234910965, + "learning_rate": 0.00011906476439083668, + "loss": 0.2464, + "step": 22921 + }, + { + "epoch": 1.8569345430978612, + "grad_norm": 0.06296337395906448, + "learning_rate": 0.00011906026373824206, + "loss": 0.3519, + "step": 22922 + }, + { + "epoch": 1.8570155541153597, + "grad_norm": 0.05028759315609932, + "learning_rate": 0.00011905576308564744, + "loss": 0.2515, + "step": 22923 + }, + { + "epoch": 1.8570965651328581, + "grad_norm": 0.05610145628452301, + "learning_rate": 0.0001190512624330528, + "loss": 0.2745, + "step": 22924 + }, + { + "epoch": 1.8571775761503564, + "grad_norm": 0.05411072447896004, + "learning_rate": 0.00011904676178045817, + "loss": 0.2521, + "step": 22925 + }, + { + "epoch": 1.8572585871678549, + "grad_norm": 0.047831691801548004, + "learning_rate": 0.00011904226112786354, + "loss": 0.2593, + "step": 22926 + }, + { + "epoch": 1.8573395981853533, + "grad_norm": 0.045878808945417404, + "learning_rate": 0.00011903776047526892, + "loss": 0.2364, + "step": 22927 + }, + { + "epoch": 1.8574206092028516, + "grad_norm": 0.05370170623064041, + "learning_rate": 0.0001190332598226743, + "loss": 0.2517, + "step": 22928 + }, + { + "epoch": 1.8575016202203498, + "grad_norm": 0.06451074033975601, + "learning_rate": 0.00011902875917007968, + "loss": 0.3187, + "step": 22929 + }, + { + "epoch": 1.8575826312378485, + "grad_norm": 0.0566953606903553, + "learning_rate": 0.00011902425851748504, + "loss": 0.2799, + "step": 22930 + }, + { + "epoch": 1.8576636422553467, + "grad_norm": 0.06248341128230095, + "learning_rate": 0.00011901975786489041, + "loss": 0.295, + "step": 22931 + }, + { + "epoch": 1.857744653272845, + "grad_norm": 0.05930943787097931, + "learning_rate": 0.00011901525721229579, + "loss": 0.2952, + "step": 22932 + }, + { + "epoch": 1.8578256642903435, + "grad_norm": 0.05437745898962021, + "learning_rate": 0.00011901075655970116, + "loss": 0.2561, + "step": 22933 + }, + { + "epoch": 1.857906675307842, + "grad_norm": 0.04583446681499481, + "learning_rate": 0.00011900625590710655, + "loss": 0.2377, + "step": 22934 + }, + { + "epoch": 1.8579876863253402, + "grad_norm": 0.06268307566642761, + "learning_rate": 0.00011900175525451192, + "loss": 0.2747, + "step": 22935 + }, + { + "epoch": 1.8580686973428386, + "grad_norm": 0.05475815385580063, + "learning_rate": 0.00011899725460191728, + "loss": 0.2221, + "step": 22936 + }, + { + "epoch": 1.858149708360337, + "grad_norm": 0.06827379018068314, + "learning_rate": 0.00011899275394932265, + "loss": 0.2984, + "step": 22937 + }, + { + "epoch": 1.8582307193778353, + "grad_norm": 0.046038705855607986, + "learning_rate": 0.00011898825329672803, + "loss": 0.2572, + "step": 22938 + }, + { + "epoch": 1.8583117303953338, + "grad_norm": 0.0644463449716568, + "learning_rate": 0.0001189837526441334, + "loss": 0.278, + "step": 22939 + }, + { + "epoch": 1.8583927414128323, + "grad_norm": 
0.061292245984077454, + "learning_rate": 0.00011897925199153879, + "loss": 0.2861, + "step": 22940 + }, + { + "epoch": 1.8584737524303305, + "grad_norm": 0.05138443410396576, + "learning_rate": 0.00011897475133894416, + "loss": 0.2923, + "step": 22941 + }, + { + "epoch": 1.8585547634478288, + "grad_norm": 0.046949900686740875, + "learning_rate": 0.00011897025068634952, + "loss": 0.2688, + "step": 22942 + }, + { + "epoch": 1.8586357744653272, + "grad_norm": 0.052126020193099976, + "learning_rate": 0.0001189657500337549, + "loss": 0.2404, + "step": 22943 + }, + { + "epoch": 1.8587167854828257, + "grad_norm": 0.060159020125865936, + "learning_rate": 0.00011896124938116027, + "loss": 0.2904, + "step": 22944 + }, + { + "epoch": 1.858797796500324, + "grad_norm": 0.05633169785141945, + "learning_rate": 0.00011895674872856564, + "loss": 0.2633, + "step": 22945 + }, + { + "epoch": 1.8588788075178224, + "grad_norm": 0.05782944709062576, + "learning_rate": 0.00011895224807597103, + "loss": 0.2877, + "step": 22946 + }, + { + "epoch": 1.8589598185353209, + "grad_norm": 0.04998238384723663, + "learning_rate": 0.0001189477474233764, + "loss": 0.2682, + "step": 22947 + }, + { + "epoch": 1.8590408295528191, + "grad_norm": 0.06594345718622208, + "learning_rate": 0.00011894324677078176, + "loss": 0.3158, + "step": 22948 + }, + { + "epoch": 1.8591218405703176, + "grad_norm": 0.06773028522729874, + "learning_rate": 0.00011893874611818714, + "loss": 0.2936, + "step": 22949 + }, + { + "epoch": 1.859202851587816, + "grad_norm": 0.04972933232784271, + "learning_rate": 0.00011893424546559251, + "loss": 0.2655, + "step": 22950 + }, + { + "epoch": 1.8592838626053143, + "grad_norm": 0.06286988407373428, + "learning_rate": 0.0001189297448129979, + "loss": 0.295, + "step": 22951 + }, + { + "epoch": 1.8593648736228126, + "grad_norm": 0.05481676012277603, + "learning_rate": 0.00011892524416040327, + "loss": 0.2801, + "step": 22952 + }, + { + "epoch": 1.8594458846403112, + "grad_norm": 0.060597535222768784, + "learning_rate": 0.00011892074350780864, + "loss": 0.2892, + "step": 22953 + }, + { + "epoch": 1.8595268956578095, + "grad_norm": 0.06514524668455124, + "learning_rate": 0.000118916242855214, + "loss": 0.2844, + "step": 22954 + }, + { + "epoch": 1.8596079066753077, + "grad_norm": 0.051577962934970856, + "learning_rate": 0.00011891174220261938, + "loss": 0.2692, + "step": 22955 + }, + { + "epoch": 1.8596889176928062, + "grad_norm": 0.0652921050786972, + "learning_rate": 0.00011890724155002475, + "loss": 0.2766, + "step": 22956 + }, + { + "epoch": 1.8597699287103047, + "grad_norm": 0.05151598900556564, + "learning_rate": 0.00011890274089743014, + "loss": 0.2708, + "step": 22957 + }, + { + "epoch": 1.859850939727803, + "grad_norm": 0.05443265661597252, + "learning_rate": 0.00011889824024483551, + "loss": 0.2313, + "step": 22958 + }, + { + "epoch": 1.8599319507453014, + "grad_norm": 0.06853888183832169, + "learning_rate": 0.00011889373959224089, + "loss": 0.3092, + "step": 22959 + }, + { + "epoch": 1.8600129617627998, + "grad_norm": 0.06651543080806732, + "learning_rate": 0.00011888923893964625, + "loss": 0.324, + "step": 22960 + }, + { + "epoch": 1.860093972780298, + "grad_norm": 0.06134466454386711, + "learning_rate": 0.00011888473828705162, + "loss": 0.2937, + "step": 22961 + }, + { + "epoch": 1.8601749837977966, + "grad_norm": 0.10911976546049118, + "learning_rate": 0.000118880237634457, + "loss": 0.2954, + "step": 22962 + }, + { + "epoch": 1.860255994815295, + "grad_norm": 0.07487057894468307, + "learning_rate": 
0.00011887573698186238, + "loss": 0.3127, + "step": 22963 + }, + { + "epoch": 1.8603370058327933, + "grad_norm": 0.04409882053732872, + "learning_rate": 0.00011887123632926775, + "loss": 0.247, + "step": 22964 + }, + { + "epoch": 1.8604180168502915, + "grad_norm": 0.06299306452274323, + "learning_rate": 0.00011886673567667313, + "loss": 0.304, + "step": 22965 + }, + { + "epoch": 1.86049902786779, + "grad_norm": 0.06553803384304047, + "learning_rate": 0.00011886223502407849, + "loss": 0.2936, + "step": 22966 + }, + { + "epoch": 1.8605800388852884, + "grad_norm": 0.05951612442731857, + "learning_rate": 0.00011885773437148386, + "loss": 0.235, + "step": 22967 + }, + { + "epoch": 1.8606610499027867, + "grad_norm": 0.05883181095123291, + "learning_rate": 0.00011885323371888924, + "loss": 0.2694, + "step": 22968 + }, + { + "epoch": 1.8607420609202852, + "grad_norm": 0.0556342713534832, + "learning_rate": 0.00011884873306629462, + "loss": 0.2635, + "step": 22969 + }, + { + "epoch": 1.8608230719377836, + "grad_norm": 0.06378420442342758, + "learning_rate": 0.0001188442324137, + "loss": 0.2705, + "step": 22970 + }, + { + "epoch": 1.8609040829552819, + "grad_norm": 0.059256576001644135, + "learning_rate": 0.00011883973176110537, + "loss": 0.2802, + "step": 22971 + }, + { + "epoch": 1.8609850939727803, + "grad_norm": 0.06668049097061157, + "learning_rate": 0.00011883523110851073, + "loss": 0.2518, + "step": 22972 + }, + { + "epoch": 1.8610661049902788, + "grad_norm": 0.06027417257428169, + "learning_rate": 0.0001188307304559161, + "loss": 0.2843, + "step": 22973 + }, + { + "epoch": 1.861147116007777, + "grad_norm": 0.06474443525075912, + "learning_rate": 0.0001188262298033215, + "loss": 0.3096, + "step": 22974 + }, + { + "epoch": 1.8612281270252753, + "grad_norm": 0.05829734355211258, + "learning_rate": 0.00011882172915072686, + "loss": 0.2901, + "step": 22975 + }, + { + "epoch": 1.861309138042774, + "grad_norm": 0.06739259511232376, + "learning_rate": 0.00011881722849813224, + "loss": 0.2961, + "step": 22976 + }, + { + "epoch": 1.8613901490602722, + "grad_norm": 0.06400362402200699, + "learning_rate": 0.00011881272784553761, + "loss": 0.3034, + "step": 22977 + }, + { + "epoch": 1.8614711600777705, + "grad_norm": 0.06838753819465637, + "learning_rate": 0.00011880822719294298, + "loss": 0.3016, + "step": 22978 + }, + { + "epoch": 1.861552171095269, + "grad_norm": 0.06485739350318909, + "learning_rate": 0.00011880372654034834, + "loss": 0.2499, + "step": 22979 + }, + { + "epoch": 1.8616331821127674, + "grad_norm": 0.06050259992480278, + "learning_rate": 0.00011879922588775375, + "loss": 0.269, + "step": 22980 + }, + { + "epoch": 1.8617141931302656, + "grad_norm": 0.06689231097698212, + "learning_rate": 0.0001187947252351591, + "loss": 0.2854, + "step": 22981 + }, + { + "epoch": 1.8617952041477641, + "grad_norm": 0.0670228898525238, + "learning_rate": 0.00011879022458256448, + "loss": 0.304, + "step": 22982 + }, + { + "epoch": 1.8618762151652626, + "grad_norm": 0.05257980152964592, + "learning_rate": 0.00011878572392996985, + "loss": 0.2687, + "step": 22983 + }, + { + "epoch": 1.8619572261827608, + "grad_norm": 0.05486253276467323, + "learning_rate": 0.00011878122327737523, + "loss": 0.3058, + "step": 22984 + }, + { + "epoch": 1.862038237200259, + "grad_norm": 0.05932844430208206, + "learning_rate": 0.00011877672262478059, + "loss": 0.2972, + "step": 22985 + }, + { + "epoch": 1.8621192482177578, + "grad_norm": 0.05444946512579918, + "learning_rate": 0.00011877222197218599, + "loss": 0.2695, + "step": 
22986 + }, + { + "epoch": 1.862200259235256, + "grad_norm": 0.062618188560009, + "learning_rate": 0.00011876772131959135, + "loss": 0.2801, + "step": 22987 + }, + { + "epoch": 1.8622812702527543, + "grad_norm": 0.061284326016902924, + "learning_rate": 0.00011876322066699672, + "loss": 0.282, + "step": 22988 + }, + { + "epoch": 1.8623622812702527, + "grad_norm": 0.049036409705877304, + "learning_rate": 0.0001187587200144021, + "loss": 0.2739, + "step": 22989 + }, + { + "epoch": 1.8624432922877512, + "grad_norm": 0.053588204085826874, + "learning_rate": 0.00011875421936180747, + "loss": 0.2873, + "step": 22990 + }, + { + "epoch": 1.8625243033052494, + "grad_norm": 0.06565236300230026, + "learning_rate": 0.00011874971870921283, + "loss": 0.2702, + "step": 22991 + }, + { + "epoch": 1.862605314322748, + "grad_norm": 0.06319651007652283, + "learning_rate": 0.00011874521805661823, + "loss": 0.2731, + "step": 22992 + }, + { + "epoch": 1.8626863253402464, + "grad_norm": 0.0632275864481926, + "learning_rate": 0.00011874071740402359, + "loss": 0.3067, + "step": 22993 + }, + { + "epoch": 1.8627673363577446, + "grad_norm": 0.06545563787221909, + "learning_rate": 0.00011873621675142896, + "loss": 0.3187, + "step": 22994 + }, + { + "epoch": 1.862848347375243, + "grad_norm": 0.0605737641453743, + "learning_rate": 0.00011873171609883434, + "loss": 0.2951, + "step": 22995 + }, + { + "epoch": 1.8629293583927415, + "grad_norm": 0.051222190260887146, + "learning_rate": 0.00011872721544623971, + "loss": 0.246, + "step": 22996 + }, + { + "epoch": 1.8630103694102398, + "grad_norm": 0.054123520851135254, + "learning_rate": 0.00011872271479364507, + "loss": 0.2858, + "step": 22997 + }, + { + "epoch": 1.863091380427738, + "grad_norm": 0.0514240488409996, + "learning_rate": 0.00011871821414105047, + "loss": 0.225, + "step": 22998 + }, + { + "epoch": 1.8631723914452365, + "grad_norm": 0.05652325227856636, + "learning_rate": 0.00011871371348845583, + "loss": 0.2703, + "step": 22999 + }, + { + "epoch": 1.863253402462735, + "grad_norm": 0.0580107681453228, + "learning_rate": 0.0001187092128358612, + "loss": 0.2836, + "step": 23000 + }, + { + "epoch": 1.8633344134802332, + "grad_norm": 0.0637790709733963, + "learning_rate": 0.00011870471218326658, + "loss": 0.2706, + "step": 23001 + }, + { + "epoch": 1.8634154244977317, + "grad_norm": 0.053458504378795624, + "learning_rate": 0.00011870021153067195, + "loss": 0.2236, + "step": 23002 + }, + { + "epoch": 1.8634964355152301, + "grad_norm": 0.0517844595015049, + "learning_rate": 0.00011869571087807734, + "loss": 0.2756, + "step": 23003 + }, + { + "epoch": 1.8635774465327284, + "grad_norm": 0.06121333688497543, + "learning_rate": 0.00011869121022548271, + "loss": 0.2615, + "step": 23004 + }, + { + "epoch": 1.8636584575502269, + "grad_norm": 0.07658272236585617, + "learning_rate": 0.00011868670957288807, + "loss": 0.2696, + "step": 23005 + }, + { + "epoch": 1.8637394685677253, + "grad_norm": 0.06371904164552689, + "learning_rate": 0.00011868220892029345, + "loss": 0.2684, + "step": 23006 + }, + { + "epoch": 1.8638204795852236, + "grad_norm": 0.05698368325829506, + "learning_rate": 0.00011867770826769882, + "loss": 0.2737, + "step": 23007 + }, + { + "epoch": 1.8639014906027218, + "grad_norm": 0.065656878054142, + "learning_rate": 0.00011867320761510419, + "loss": 0.2917, + "step": 23008 + }, + { + "epoch": 1.8639825016202205, + "grad_norm": 0.053923673927783966, + "learning_rate": 0.00011866870696250958, + "loss": 0.2278, + "step": 23009 + }, + { + "epoch": 1.8640635126377187, + 
"grad_norm": 0.06553805619478226, + "learning_rate": 0.00011866420630991495, + "loss": 0.2594, + "step": 23010 + }, + { + "epoch": 1.864144523655217, + "grad_norm": 0.05535467341542244, + "learning_rate": 0.00011865970565732031, + "loss": 0.2843, + "step": 23011 + }, + { + "epoch": 1.8642255346727155, + "grad_norm": 0.05905730649828911, + "learning_rate": 0.00011865520500472569, + "loss": 0.2295, + "step": 23012 + }, + { + "epoch": 1.864306545690214, + "grad_norm": 0.0544164814054966, + "learning_rate": 0.00011865070435213106, + "loss": 0.2827, + "step": 23013 + }, + { + "epoch": 1.8643875567077122, + "grad_norm": 0.055385880172252655, + "learning_rate": 0.00011864620369953643, + "loss": 0.2745, + "step": 23014 + }, + { + "epoch": 1.8644685677252106, + "grad_norm": 0.064468614757061, + "learning_rate": 0.00011864170304694182, + "loss": 0.3074, + "step": 23015 + }, + { + "epoch": 1.864549578742709, + "grad_norm": 0.05835357680916786, + "learning_rate": 0.0001186372023943472, + "loss": 0.29, + "step": 23016 + }, + { + "epoch": 1.8646305897602073, + "grad_norm": 0.05793554335832596, + "learning_rate": 0.00011863270174175256, + "loss": 0.2754, + "step": 23017 + }, + { + "epoch": 1.8647116007777058, + "grad_norm": 0.0771193578839302, + "learning_rate": 0.00011862820108915793, + "loss": 0.2701, + "step": 23018 + }, + { + "epoch": 1.8647926117952043, + "grad_norm": 0.05392744392156601, + "learning_rate": 0.0001186237004365633, + "loss": 0.3021, + "step": 23019 + }, + { + "epoch": 1.8648736228127025, + "grad_norm": 0.05138259381055832, + "learning_rate": 0.00011861919978396868, + "loss": 0.2318, + "step": 23020 + }, + { + "epoch": 1.8649546338302008, + "grad_norm": 0.05715327709913254, + "learning_rate": 0.00011861469913137406, + "loss": 0.2691, + "step": 23021 + }, + { + "epoch": 1.8650356448476992, + "grad_norm": 0.07019861042499542, + "learning_rate": 0.00011861019847877944, + "loss": 0.2658, + "step": 23022 + }, + { + "epoch": 1.8651166558651977, + "grad_norm": 0.05146124213933945, + "learning_rate": 0.0001186056978261848, + "loss": 0.2548, + "step": 23023 + }, + { + "epoch": 1.865197666882696, + "grad_norm": 0.06404336541891098, + "learning_rate": 0.00011860119717359017, + "loss": 0.2954, + "step": 23024 + }, + { + "epoch": 1.8652786779001944, + "grad_norm": 0.05009884759783745, + "learning_rate": 0.00011859669652099554, + "loss": 0.3042, + "step": 23025 + }, + { + "epoch": 1.8653596889176929, + "grad_norm": 0.060626138001680374, + "learning_rate": 0.00011859219586840093, + "loss": 0.3105, + "step": 23026 + }, + { + "epoch": 1.8654406999351911, + "grad_norm": 0.055893126875162125, + "learning_rate": 0.0001185876952158063, + "loss": 0.2749, + "step": 23027 + }, + { + "epoch": 1.8655217109526896, + "grad_norm": 0.055800460278987885, + "learning_rate": 0.00011858319456321168, + "loss": 0.2762, + "step": 23028 + }, + { + "epoch": 1.865602721970188, + "grad_norm": 0.05607381463050842, + "learning_rate": 0.00011857869391061704, + "loss": 0.2577, + "step": 23029 + }, + { + "epoch": 1.8656837329876863, + "grad_norm": 0.054439570754766464, + "learning_rate": 0.00011857419325802241, + "loss": 0.2712, + "step": 23030 + }, + { + "epoch": 1.8657647440051845, + "grad_norm": 0.04881187528371811, + "learning_rate": 0.00011856969260542779, + "loss": 0.2301, + "step": 23031 + }, + { + "epoch": 1.8658457550226832, + "grad_norm": 0.05401236563920975, + "learning_rate": 0.00011856519195283317, + "loss": 0.2479, + "step": 23032 + }, + { + "epoch": 1.8659267660401815, + "grad_norm": 0.05074911192059517, + 
"learning_rate": 0.00011856069130023855, + "loss": 0.2441, + "step": 23033 + }, + { + "epoch": 1.8660077770576797, + "grad_norm": 0.04326274245977402, + "learning_rate": 0.00011855619064764392, + "loss": 0.2199, + "step": 23034 + }, + { + "epoch": 1.8660887880751782, + "grad_norm": 0.07047049701213837, + "learning_rate": 0.00011855168999504928, + "loss": 0.3273, + "step": 23035 + }, + { + "epoch": 1.8661697990926767, + "grad_norm": 0.05269942805171013, + "learning_rate": 0.00011854718934245465, + "loss": 0.2514, + "step": 23036 + }, + { + "epoch": 1.866250810110175, + "grad_norm": 0.05708518996834755, + "learning_rate": 0.00011854268868986003, + "loss": 0.2811, + "step": 23037 + }, + { + "epoch": 1.8663318211276734, + "grad_norm": 0.05749276280403137, + "learning_rate": 0.00011853818803726541, + "loss": 0.264, + "step": 23038 + }, + { + "epoch": 1.8664128321451718, + "grad_norm": 0.064975805580616, + "learning_rate": 0.00011853368738467079, + "loss": 0.3185, + "step": 23039 + }, + { + "epoch": 1.86649384316267, + "grad_norm": 0.05995302274823189, + "learning_rate": 0.00011852918673207616, + "loss": 0.2414, + "step": 23040 + }, + { + "epoch": 1.8665748541801686, + "grad_norm": 0.05492790415883064, + "learning_rate": 0.00011852468607948152, + "loss": 0.2714, + "step": 23041 + }, + { + "epoch": 1.866655865197667, + "grad_norm": 0.06112586334347725, + "learning_rate": 0.0001185201854268869, + "loss": 0.2785, + "step": 23042 + }, + { + "epoch": 1.8667368762151653, + "grad_norm": 0.05609976127743721, + "learning_rate": 0.00011851568477429227, + "loss": 0.2709, + "step": 23043 + }, + { + "epoch": 1.8668178872326635, + "grad_norm": 0.06333284080028534, + "learning_rate": 0.00011851118412169766, + "loss": 0.3069, + "step": 23044 + }, + { + "epoch": 1.866898898250162, + "grad_norm": 0.053372643887996674, + "learning_rate": 0.00011850668346910303, + "loss": 0.2847, + "step": 23045 + }, + { + "epoch": 1.8669799092676604, + "grad_norm": 0.06115709990262985, + "learning_rate": 0.0001185021828165084, + "loss": 0.2317, + "step": 23046 + }, + { + "epoch": 1.8670609202851587, + "grad_norm": 0.04994625970721245, + "learning_rate": 0.00011849768216391378, + "loss": 0.2491, + "step": 23047 + }, + { + "epoch": 1.8671419313026572, + "grad_norm": 0.049292415380477905, + "learning_rate": 0.00011849318151131914, + "loss": 0.2145, + "step": 23048 + }, + { + "epoch": 1.8672229423201556, + "grad_norm": 0.05687323212623596, + "learning_rate": 0.00011848868085872451, + "loss": 0.2504, + "step": 23049 + }, + { + "epoch": 1.8673039533376539, + "grad_norm": 0.05730355903506279, + "learning_rate": 0.0001184841802061299, + "loss": 0.238, + "step": 23050 + }, + { + "epoch": 1.8673849643551523, + "grad_norm": 0.062119435518980026, + "learning_rate": 0.00011847967955353527, + "loss": 0.2294, + "step": 23051 + }, + { + "epoch": 1.8674659753726508, + "grad_norm": 0.05226528272032738, + "learning_rate": 0.00011847517890094064, + "loss": 0.295, + "step": 23052 + }, + { + "epoch": 1.867546986390149, + "grad_norm": 0.07073201984167099, + "learning_rate": 0.00011847067824834602, + "loss": 0.277, + "step": 23053 + }, + { + "epoch": 1.8676279974076473, + "grad_norm": 0.0602298267185688, + "learning_rate": 0.00011846617759575138, + "loss": 0.2832, + "step": 23054 + }, + { + "epoch": 1.867709008425146, + "grad_norm": 0.06587371230125427, + "learning_rate": 0.00011846167694315678, + "loss": 0.2758, + "step": 23055 + }, + { + "epoch": 1.8677900194426442, + "grad_norm": 0.06909722834825516, + "learning_rate": 0.00011845717629056214, + 
"loss": 0.2748, + "step": 23056 + }, + { + "epoch": 1.8678710304601425, + "grad_norm": 0.055311381816864014, + "learning_rate": 0.00011845267563796751, + "loss": 0.2827, + "step": 23057 + }, + { + "epoch": 1.867952041477641, + "grad_norm": 0.04789602756500244, + "learning_rate": 0.00011844817498537289, + "loss": 0.2163, + "step": 23058 + }, + { + "epoch": 1.8680330524951394, + "grad_norm": 0.06209869682788849, + "learning_rate": 0.00011844367433277826, + "loss": 0.2775, + "step": 23059 + }, + { + "epoch": 1.8681140635126376, + "grad_norm": 0.05467294529080391, + "learning_rate": 0.00011843917368018362, + "loss": 0.2751, + "step": 23060 + }, + { + "epoch": 1.8681950745301361, + "grad_norm": 0.061567649245262146, + "learning_rate": 0.00011843467302758902, + "loss": 0.2685, + "step": 23061 + }, + { + "epoch": 1.8682760855476346, + "grad_norm": 0.06625506281852722, + "learning_rate": 0.00011843017237499438, + "loss": 0.2651, + "step": 23062 + }, + { + "epoch": 1.8683570965651328, + "grad_norm": 0.05260910838842392, + "learning_rate": 0.00011842567172239975, + "loss": 0.2451, + "step": 23063 + }, + { + "epoch": 1.8684381075826313, + "grad_norm": 0.04116278514266014, + "learning_rate": 0.00011842117106980513, + "loss": 0.2206, + "step": 23064 + }, + { + "epoch": 1.8685191186001298, + "grad_norm": 0.05465328320860863, + "learning_rate": 0.0001184166704172105, + "loss": 0.2535, + "step": 23065 + }, + { + "epoch": 1.868600129617628, + "grad_norm": 0.06118228659033775, + "learning_rate": 0.00011841216976461586, + "loss": 0.2564, + "step": 23066 + }, + { + "epoch": 1.8686811406351262, + "grad_norm": 0.045729734003543854, + "learning_rate": 0.00011840766911202126, + "loss": 0.2892, + "step": 23067 + }, + { + "epoch": 1.8687621516526247, + "grad_norm": 0.06286770105361938, + "learning_rate": 0.00011840316845942662, + "loss": 0.3109, + "step": 23068 + }, + { + "epoch": 1.8688431626701232, + "grad_norm": 0.05661400035023689, + "learning_rate": 0.000118398667806832, + "loss": 0.2494, + "step": 23069 + }, + { + "epoch": 1.8689241736876214, + "grad_norm": 0.059357624500989914, + "learning_rate": 0.00011839416715423737, + "loss": 0.2707, + "step": 23070 + }, + { + "epoch": 1.86900518470512, + "grad_norm": 0.05716922879219055, + "learning_rate": 0.00011838966650164274, + "loss": 0.2725, + "step": 23071 + }, + { + "epoch": 1.8690861957226184, + "grad_norm": 0.05681823939085007, + "learning_rate": 0.0001183851658490481, + "loss": 0.2519, + "step": 23072 + }, + { + "epoch": 1.8691672067401166, + "grad_norm": 0.05614323541522026, + "learning_rate": 0.0001183806651964535, + "loss": 0.2836, + "step": 23073 + }, + { + "epoch": 1.869248217757615, + "grad_norm": 0.059444133192300797, + "learning_rate": 0.00011837616454385886, + "loss": 0.278, + "step": 23074 + }, + { + "epoch": 1.8693292287751135, + "grad_norm": 0.0667402595281601, + "learning_rate": 0.00011837166389126424, + "loss": 0.2654, + "step": 23075 + }, + { + "epoch": 1.8694102397926118, + "grad_norm": 0.059592802077531815, + "learning_rate": 0.00011836716323866961, + "loss": 0.2652, + "step": 23076 + }, + { + "epoch": 1.86949125081011, + "grad_norm": 0.05447734519839287, + "learning_rate": 0.00011836266258607498, + "loss": 0.2343, + "step": 23077 + }, + { + "epoch": 1.8695722618276087, + "grad_norm": 0.06101779267191887, + "learning_rate": 0.00011835816193348035, + "loss": 0.3185, + "step": 23078 + }, + { + "epoch": 1.869653272845107, + "grad_norm": 0.05614207684993744, + "learning_rate": 0.00011835366128088575, + "loss": 0.2759, + "step": 23079 + }, + { + 
"epoch": 1.8697342838626052, + "grad_norm": 0.05248326435685158, + "learning_rate": 0.0001183491606282911, + "loss": 0.2659, + "step": 23080 + }, + { + "epoch": 1.8698152948801037, + "grad_norm": 0.07007522135972977, + "learning_rate": 0.00011834465997569648, + "loss": 0.3193, + "step": 23081 + }, + { + "epoch": 1.8698963058976021, + "grad_norm": 0.04993303865194321, + "learning_rate": 0.00011834015932310185, + "loss": 0.226, + "step": 23082 + }, + { + "epoch": 1.8699773169151004, + "grad_norm": 0.058122795075178146, + "learning_rate": 0.00011833565867050723, + "loss": 0.263, + "step": 23083 + }, + { + "epoch": 1.8700583279325989, + "grad_norm": 0.049717120826244354, + "learning_rate": 0.00011833115801791261, + "loss": 0.2541, + "step": 23084 + }, + { + "epoch": 1.8701393389500973, + "grad_norm": 0.06780257821083069, + "learning_rate": 0.00011832665736531799, + "loss": 0.2463, + "step": 23085 + }, + { + "epoch": 1.8702203499675956, + "grad_norm": 0.06661541759967804, + "learning_rate": 0.00011832215671272335, + "loss": 0.3024, + "step": 23086 + }, + { + "epoch": 1.8703013609850938, + "grad_norm": 0.050316132605075836, + "learning_rate": 0.00011831765606012872, + "loss": 0.2392, + "step": 23087 + }, + { + "epoch": 1.8703823720025925, + "grad_norm": 0.07037214189767838, + "learning_rate": 0.0001183131554075341, + "loss": 0.281, + "step": 23088 + }, + { + "epoch": 1.8704633830200907, + "grad_norm": 0.08435565233230591, + "learning_rate": 0.00011830865475493947, + "loss": 0.3307, + "step": 23089 + }, + { + "epoch": 1.870544394037589, + "grad_norm": 0.06200847774744034, + "learning_rate": 0.00011830415410234486, + "loss": 0.2522, + "step": 23090 + }, + { + "epoch": 1.8706254050550875, + "grad_norm": 0.05395513400435448, + "learning_rate": 0.00011829965344975023, + "loss": 0.28, + "step": 23091 + }, + { + "epoch": 1.870706416072586, + "grad_norm": 0.047511208802461624, + "learning_rate": 0.00011829515279715559, + "loss": 0.2718, + "step": 23092 + }, + { + "epoch": 1.8707874270900842, + "grad_norm": 0.05486256256699562, + "learning_rate": 0.00011829065214456096, + "loss": 0.2614, + "step": 23093 + }, + { + "epoch": 1.8708684381075826, + "grad_norm": 0.06447124481201172, + "learning_rate": 0.00011828615149196634, + "loss": 0.2584, + "step": 23094 + }, + { + "epoch": 1.870949449125081, + "grad_norm": 0.05701237916946411, + "learning_rate": 0.00011828165083937171, + "loss": 0.3066, + "step": 23095 + }, + { + "epoch": 1.8710304601425793, + "grad_norm": 0.06130368262529373, + "learning_rate": 0.0001182771501867771, + "loss": 0.281, + "step": 23096 + }, + { + "epoch": 1.8711114711600778, + "grad_norm": 0.05731342360377312, + "learning_rate": 0.00011827264953418247, + "loss": 0.2568, + "step": 23097 + }, + { + "epoch": 1.8711924821775763, + "grad_norm": 0.06326236575841904, + "learning_rate": 0.00011826814888158783, + "loss": 0.2683, + "step": 23098 + }, + { + "epoch": 1.8712734931950745, + "grad_norm": 0.04846350848674774, + "learning_rate": 0.0001182636482289932, + "loss": 0.2413, + "step": 23099 + }, + { + "epoch": 1.8713545042125728, + "grad_norm": 0.059385381639003754, + "learning_rate": 0.00011825914757639858, + "loss": 0.2771, + "step": 23100 + }, + { + "epoch": 1.8714355152300715, + "grad_norm": 0.051562074571847916, + "learning_rate": 0.00011825464692380395, + "loss": 0.2283, + "step": 23101 + }, + { + "epoch": 1.8715165262475697, + "grad_norm": 0.05749623849987984, + "learning_rate": 0.00011825014627120934, + "loss": 0.2467, + "step": 23102 + }, + { + "epoch": 1.871597537265068, + "grad_norm": 
0.05762708559632301, + "learning_rate": 0.00011824564561861471, + "loss": 0.3291, + "step": 23103 + }, + { + "epoch": 1.8716785482825664, + "grad_norm": 0.07784654945135117, + "learning_rate": 0.00011824114496602007, + "loss": 0.3403, + "step": 23104 + }, + { + "epoch": 1.8717595593000649, + "grad_norm": 0.07061202079057693, + "learning_rate": 0.00011823664431342545, + "loss": 0.2642, + "step": 23105 + }, + { + "epoch": 1.8718405703175631, + "grad_norm": 0.07585369050502777, + "learning_rate": 0.00011823214366083082, + "loss": 0.331, + "step": 23106 + }, + { + "epoch": 1.8719215813350616, + "grad_norm": 0.05564757063984871, + "learning_rate": 0.0001182276430082362, + "loss": 0.2967, + "step": 23107 + }, + { + "epoch": 1.87200259235256, + "grad_norm": 0.07339231669902802, + "learning_rate": 0.00011822314235564158, + "loss": 0.2784, + "step": 23108 + }, + { + "epoch": 1.8720836033700583, + "grad_norm": 0.06239980831742287, + "learning_rate": 0.00011821864170304695, + "loss": 0.2765, + "step": 23109 + }, + { + "epoch": 1.8721646143875565, + "grad_norm": 0.06124987080693245, + "learning_rate": 0.00011821414105045231, + "loss": 0.2962, + "step": 23110 + }, + { + "epoch": 1.8722456254050552, + "grad_norm": 0.05769716203212738, + "learning_rate": 0.00011820964039785769, + "loss": 0.3079, + "step": 23111 + }, + { + "epoch": 1.8723266364225535, + "grad_norm": 0.07039839774370193, + "learning_rate": 0.00011820513974526306, + "loss": 0.2915, + "step": 23112 + }, + { + "epoch": 1.8724076474400517, + "grad_norm": 0.05476633086800575, + "learning_rate": 0.00011820063909266845, + "loss": 0.2654, + "step": 23113 + }, + { + "epoch": 1.8724886584575502, + "grad_norm": 0.04585075005888939, + "learning_rate": 0.00011819613844007382, + "loss": 0.2339, + "step": 23114 + }, + { + "epoch": 1.8725696694750487, + "grad_norm": 0.058606356382369995, + "learning_rate": 0.0001181916377874792, + "loss": 0.2471, + "step": 23115 + }, + { + "epoch": 1.872650680492547, + "grad_norm": 0.0552472360432148, + "learning_rate": 0.00011818713713488457, + "loss": 0.2488, + "step": 23116 + }, + { + "epoch": 1.8727316915100454, + "grad_norm": 0.04606689140200615, + "learning_rate": 0.00011818263648228993, + "loss": 0.2438, + "step": 23117 + }, + { + "epoch": 1.8728127025275438, + "grad_norm": 0.05656420439481735, + "learning_rate": 0.0001181781358296953, + "loss": 0.3072, + "step": 23118 + }, + { + "epoch": 1.872893713545042, + "grad_norm": 0.06550390273332596, + "learning_rate": 0.00011817363517710069, + "loss": 0.2704, + "step": 23119 + }, + { + "epoch": 1.8729747245625405, + "grad_norm": 0.05345557630062103, + "learning_rate": 0.00011816913452450606, + "loss": 0.2793, + "step": 23120 + }, + { + "epoch": 1.873055735580039, + "grad_norm": 0.06245110183954239, + "learning_rate": 0.00011816463387191144, + "loss": 0.2841, + "step": 23121 + }, + { + "epoch": 1.8731367465975373, + "grad_norm": 0.055512577295303345, + "learning_rate": 0.00011816013321931681, + "loss": 0.2806, + "step": 23122 + }, + { + "epoch": 1.8732177576150355, + "grad_norm": 0.0590149387717247, + "learning_rate": 0.00011815563256672217, + "loss": 0.2875, + "step": 23123 + }, + { + "epoch": 1.873298768632534, + "grad_norm": 0.05215387046337128, + "learning_rate": 0.00011815113191412754, + "loss": 0.2524, + "step": 23124 + }, + { + "epoch": 1.8733797796500324, + "grad_norm": 0.05460435152053833, + "learning_rate": 0.00011814663126153293, + "loss": 0.2551, + "step": 23125 + }, + { + "epoch": 1.8734607906675307, + "grad_norm": 0.06229287013411522, + "learning_rate": 
0.0001181421306089383, + "loss": 0.2828, + "step": 23126 + }, + { + "epoch": 1.8735418016850292, + "grad_norm": 0.06163933128118515, + "learning_rate": 0.00011813762995634368, + "loss": 0.2937, + "step": 23127 + }, + { + "epoch": 1.8736228127025276, + "grad_norm": 0.0614483542740345, + "learning_rate": 0.00011813312930374905, + "loss": 0.2905, + "step": 23128 + }, + { + "epoch": 1.8737038237200259, + "grad_norm": 0.06532017141580582, + "learning_rate": 0.00011812862865115441, + "loss": 0.2622, + "step": 23129 + }, + { + "epoch": 1.8737848347375243, + "grad_norm": 0.0652865469455719, + "learning_rate": 0.00011812412799855979, + "loss": 0.3087, + "step": 23130 + }, + { + "epoch": 1.8738658457550228, + "grad_norm": 0.05394424498081207, + "learning_rate": 0.00011811962734596517, + "loss": 0.2498, + "step": 23131 + }, + { + "epoch": 1.873946856772521, + "grad_norm": 0.05773965269327164, + "learning_rate": 0.00011811512669337055, + "loss": 0.2645, + "step": 23132 + }, + { + "epoch": 1.8740278677900193, + "grad_norm": 0.06872682273387909, + "learning_rate": 0.00011811062604077592, + "loss": 0.3171, + "step": 23133 + }, + { + "epoch": 1.874108878807518, + "grad_norm": 0.05397693067789078, + "learning_rate": 0.0001181061253881813, + "loss": 0.2779, + "step": 23134 + }, + { + "epoch": 1.8741898898250162, + "grad_norm": 0.062232982367277145, + "learning_rate": 0.00011810162473558665, + "loss": 0.2677, + "step": 23135 + }, + { + "epoch": 1.8742709008425145, + "grad_norm": 0.06239263713359833, + "learning_rate": 0.00011809712408299205, + "loss": 0.3113, + "step": 23136 + }, + { + "epoch": 1.874351911860013, + "grad_norm": 0.07032819837331772, + "learning_rate": 0.00011809262343039741, + "loss": 0.2985, + "step": 23137 + }, + { + "epoch": 1.8744329228775114, + "grad_norm": 0.06495737284421921, + "learning_rate": 0.00011808812277780279, + "loss": 0.3248, + "step": 23138 + }, + { + "epoch": 1.8745139338950096, + "grad_norm": 0.06768009811639786, + "learning_rate": 0.00011808362212520816, + "loss": 0.2638, + "step": 23139 + }, + { + "epoch": 1.874594944912508, + "grad_norm": 0.060301005840301514, + "learning_rate": 0.00011807912147261354, + "loss": 0.3211, + "step": 23140 + }, + { + "epoch": 1.8746759559300066, + "grad_norm": 0.05018927529454231, + "learning_rate": 0.0001180746208200189, + "loss": 0.2727, + "step": 23141 + }, + { + "epoch": 1.8747569669475048, + "grad_norm": 0.05934358388185501, + "learning_rate": 0.0001180701201674243, + "loss": 0.2497, + "step": 23142 + }, + { + "epoch": 1.8748379779650033, + "grad_norm": 0.06375506520271301, + "learning_rate": 0.00011806561951482966, + "loss": 0.244, + "step": 23143 + }, + { + "epoch": 1.8749189889825018, + "grad_norm": 0.04853232577443123, + "learning_rate": 0.00011806111886223503, + "loss": 0.2322, + "step": 23144 + }, + { + "epoch": 1.875, + "grad_norm": 0.07027478516101837, + "learning_rate": 0.0001180566182096404, + "loss": 0.2801, + "step": 23145 + }, + { + "epoch": 1.8750810110174982, + "grad_norm": 0.057535093277692795, + "learning_rate": 0.00011805211755704578, + "loss": 0.2648, + "step": 23146 + }, + { + "epoch": 1.8751620220349967, + "grad_norm": 0.059583697468042374, + "learning_rate": 0.00011804761690445114, + "loss": 0.2737, + "step": 23147 + }, + { + "epoch": 1.8752430330524952, + "grad_norm": 0.05222093313932419, + "learning_rate": 0.00011804311625185654, + "loss": 0.2267, + "step": 23148 + }, + { + "epoch": 1.8753240440699934, + "grad_norm": 0.0696893259882927, + "learning_rate": 0.0001180386155992619, + "loss": 0.2535, + "step": 23149 
+ }, + { + "epoch": 1.875405055087492, + "grad_norm": 0.05335691198706627, + "learning_rate": 0.00011803411494666727, + "loss": 0.2489, + "step": 23150 + }, + { + "epoch": 1.8754860661049904, + "grad_norm": 0.055933184921741486, + "learning_rate": 0.00011802961429407265, + "loss": 0.294, + "step": 23151 + }, + { + "epoch": 1.8755670771224886, + "grad_norm": 0.05710383132100105, + "learning_rate": 0.00011802511364147802, + "loss": 0.2836, + "step": 23152 + }, + { + "epoch": 1.875648088139987, + "grad_norm": 0.05327145382761955, + "learning_rate": 0.00011802061298888338, + "loss": 0.2964, + "step": 23153 + }, + { + "epoch": 1.8757290991574855, + "grad_norm": 0.05328657105565071, + "learning_rate": 0.00011801611233628878, + "loss": 0.2742, + "step": 23154 + }, + { + "epoch": 1.8758101101749838, + "grad_norm": 0.052094716578722, + "learning_rate": 0.00011801161168369414, + "loss": 0.2712, + "step": 23155 + }, + { + "epoch": 1.875891121192482, + "grad_norm": 0.05217659845948219, + "learning_rate": 0.00011800711103109951, + "loss": 0.2587, + "step": 23156 + }, + { + "epoch": 1.8759721322099807, + "grad_norm": 0.05159446597099304, + "learning_rate": 0.00011800261037850489, + "loss": 0.2881, + "step": 23157 + }, + { + "epoch": 1.876053143227479, + "grad_norm": 0.07257717847824097, + "learning_rate": 0.00011799810972591026, + "loss": 0.2887, + "step": 23158 + }, + { + "epoch": 1.8761341542449772, + "grad_norm": 0.07235502451658249, + "learning_rate": 0.00011799360907331565, + "loss": 0.2695, + "step": 23159 + }, + { + "epoch": 1.8762151652624757, + "grad_norm": 0.0665421336889267, + "learning_rate": 0.00011798910842072102, + "loss": 0.2892, + "step": 23160 + }, + { + "epoch": 1.8762961762799741, + "grad_norm": 0.0503656379878521, + "learning_rate": 0.00011798460776812638, + "loss": 0.2646, + "step": 23161 + }, + { + "epoch": 1.8763771872974724, + "grad_norm": 0.060483045876026154, + "learning_rate": 0.00011798010711553175, + "loss": 0.2591, + "step": 23162 + }, + { + "epoch": 1.8764581983149708, + "grad_norm": 0.060321781784296036, + "learning_rate": 0.00011797560646293713, + "loss": 0.2755, + "step": 23163 + }, + { + "epoch": 1.8765392093324693, + "grad_norm": 0.06695659458637238, + "learning_rate": 0.0001179711058103425, + "loss": 0.2676, + "step": 23164 + }, + { + "epoch": 1.8766202203499676, + "grad_norm": 0.052316345274448395, + "learning_rate": 0.00011796660515774789, + "loss": 0.2758, + "step": 23165 + }, + { + "epoch": 1.876701231367466, + "grad_norm": 0.061475787311792374, + "learning_rate": 0.00011796210450515326, + "loss": 0.2605, + "step": 23166 + }, + { + "epoch": 1.8767822423849645, + "grad_norm": 0.06417156010866165, + "learning_rate": 0.00011795760385255862, + "loss": 0.2547, + "step": 23167 + }, + { + "epoch": 1.8768632534024627, + "grad_norm": 0.0625377893447876, + "learning_rate": 0.000117953103199964, + "loss": 0.2984, + "step": 23168 + }, + { + "epoch": 1.876944264419961, + "grad_norm": 0.06442003697156906, + "learning_rate": 0.00011794860254736937, + "loss": 0.2942, + "step": 23169 + }, + { + "epoch": 1.8770252754374595, + "grad_norm": 0.06341394782066345, + "learning_rate": 0.00011794410189477474, + "loss": 0.2621, + "step": 23170 + }, + { + "epoch": 1.877106286454958, + "grad_norm": 0.0712093785405159, + "learning_rate": 0.00011793960124218013, + "loss": 0.3148, + "step": 23171 + }, + { + "epoch": 1.8771872974724562, + "grad_norm": 0.06914816051721573, + "learning_rate": 0.0001179351005895855, + "loss": 0.2966, + "step": 23172 + }, + { + "epoch": 1.8772683084899546, + 
"grad_norm": 0.06750152260065079, + "learning_rate": 0.00011793059993699086, + "loss": 0.3091, + "step": 23173 + }, + { + "epoch": 1.877349319507453, + "grad_norm": 0.06350599229335785, + "learning_rate": 0.00011792609928439624, + "loss": 0.3225, + "step": 23174 + }, + { + "epoch": 1.8774303305249513, + "grad_norm": 0.05254534259438515, + "learning_rate": 0.00011792159863180161, + "loss": 0.2607, + "step": 23175 + }, + { + "epoch": 1.8775113415424498, + "grad_norm": 0.0773734524846077, + "learning_rate": 0.00011791709797920699, + "loss": 0.2863, + "step": 23176 + }, + { + "epoch": 1.8775923525599483, + "grad_norm": 0.0571201853454113, + "learning_rate": 0.00011791259732661237, + "loss": 0.2958, + "step": 23177 + }, + { + "epoch": 1.8776733635774465, + "grad_norm": 0.06771673262119293, + "learning_rate": 0.00011790809667401775, + "loss": 0.3361, + "step": 23178 + }, + { + "epoch": 1.8777543745949448, + "grad_norm": 0.05999191105365753, + "learning_rate": 0.0001179035960214231, + "loss": 0.2586, + "step": 23179 + }, + { + "epoch": 1.8778353856124435, + "grad_norm": 0.053210508078336716, + "learning_rate": 0.00011789909536882848, + "loss": 0.2406, + "step": 23180 + }, + { + "epoch": 1.8779163966299417, + "grad_norm": 0.06333492696285248, + "learning_rate": 0.00011789459471623385, + "loss": 0.2541, + "step": 23181 + }, + { + "epoch": 1.87799740764744, + "grad_norm": 0.06045440956950188, + "learning_rate": 0.00011789009406363923, + "loss": 0.2516, + "step": 23182 + }, + { + "epoch": 1.8780784186649384, + "grad_norm": 0.06109251827001572, + "learning_rate": 0.00011788559341104461, + "loss": 0.2754, + "step": 23183 + }, + { + "epoch": 1.8781594296824369, + "grad_norm": 0.054190631955862045, + "learning_rate": 0.00011788109275844999, + "loss": 0.2625, + "step": 23184 + }, + { + "epoch": 1.8782404406999351, + "grad_norm": 0.05592363327741623, + "learning_rate": 0.00011787659210585536, + "loss": 0.2804, + "step": 23185 + }, + { + "epoch": 1.8783214517174336, + "grad_norm": 0.05816115438938141, + "learning_rate": 0.00011787209145326072, + "loss": 0.2654, + "step": 23186 + }, + { + "epoch": 1.878402462734932, + "grad_norm": 0.04598617181181908, + "learning_rate": 0.0001178675908006661, + "loss": 0.2338, + "step": 23187 + }, + { + "epoch": 1.8784834737524303, + "grad_norm": 0.05558772012591362, + "learning_rate": 0.00011786309014807148, + "loss": 0.2792, + "step": 23188 + }, + { + "epoch": 1.8785644847699285, + "grad_norm": 0.054704394191503525, + "learning_rate": 0.00011785858949547686, + "loss": 0.2761, + "step": 23189 + }, + { + "epoch": 1.8786454957874272, + "grad_norm": 0.05882862210273743, + "learning_rate": 0.00011785408884288223, + "loss": 0.2817, + "step": 23190 + }, + { + "epoch": 1.8787265068049255, + "grad_norm": 0.05930648744106293, + "learning_rate": 0.0001178495881902876, + "loss": 0.2687, + "step": 23191 + }, + { + "epoch": 1.8788075178224237, + "grad_norm": 0.06882467865943909, + "learning_rate": 0.00011784508753769296, + "loss": 0.2748, + "step": 23192 + }, + { + "epoch": 1.8788885288399222, + "grad_norm": 0.0692213848233223, + "learning_rate": 0.00011784058688509834, + "loss": 0.2884, + "step": 23193 + }, + { + "epoch": 1.8789695398574207, + "grad_norm": 0.06592894345521927, + "learning_rate": 0.00011783608623250372, + "loss": 0.3105, + "step": 23194 + }, + { + "epoch": 1.879050550874919, + "grad_norm": 0.053590722382068634, + "learning_rate": 0.0001178315855799091, + "loss": 0.28, + "step": 23195 + }, + { + "epoch": 1.8791315618924174, + "grad_norm": 0.06279432028532028, + 
"learning_rate": 0.00011782708492731447, + "loss": 0.2687, + "step": 23196 + }, + { + "epoch": 1.8792125729099158, + "grad_norm": 0.06744203716516495, + "learning_rate": 0.00011782258427471984, + "loss": 0.252, + "step": 23197 + }, + { + "epoch": 1.879293583927414, + "grad_norm": 0.06443815678358078, + "learning_rate": 0.0001178180836221252, + "loss": 0.2666, + "step": 23198 + }, + { + "epoch": 1.8793745949449125, + "grad_norm": 0.05662263557314873, + "learning_rate": 0.00011781358296953058, + "loss": 0.251, + "step": 23199 + }, + { + "epoch": 1.879455605962411, + "grad_norm": 0.05632168427109718, + "learning_rate": 0.00011780908231693597, + "loss": 0.3433, + "step": 23200 + }, + { + "epoch": 1.8795366169799093, + "grad_norm": 0.05025065690279007, + "learning_rate": 0.00011780458166434134, + "loss": 0.256, + "step": 23201 + }, + { + "epoch": 1.8796176279974075, + "grad_norm": 0.05598134547472, + "learning_rate": 0.00011780008101174671, + "loss": 0.2565, + "step": 23202 + }, + { + "epoch": 1.8796986390149062, + "grad_norm": 0.048147015273571014, + "learning_rate": 0.00011779558035915209, + "loss": 0.2558, + "step": 23203 + }, + { + "epoch": 1.8797796500324044, + "grad_norm": 0.052048951387405396, + "learning_rate": 0.00011779107970655745, + "loss": 0.2569, + "step": 23204 + }, + { + "epoch": 1.8798606610499027, + "grad_norm": 0.050691064447164536, + "learning_rate": 0.00011778657905396282, + "loss": 0.2632, + "step": 23205 + }, + { + "epoch": 1.8799416720674011, + "grad_norm": 0.058509405702352524, + "learning_rate": 0.00011778207840136821, + "loss": 0.2606, + "step": 23206 + }, + { + "epoch": 1.8800226830848996, + "grad_norm": 0.06845378130674362, + "learning_rate": 0.00011777757774877358, + "loss": 0.2833, + "step": 23207 + }, + { + "epoch": 1.8801036941023979, + "grad_norm": 0.055425599217414856, + "learning_rate": 0.00011777307709617895, + "loss": 0.2456, + "step": 23208 + }, + { + "epoch": 1.8801847051198963, + "grad_norm": 0.056326232850551605, + "learning_rate": 0.00011776857644358433, + "loss": 0.3153, + "step": 23209 + }, + { + "epoch": 1.8802657161373948, + "grad_norm": 0.0537690632045269, + "learning_rate": 0.00011776407579098969, + "loss": 0.2587, + "step": 23210 + }, + { + "epoch": 1.880346727154893, + "grad_norm": 0.055282801389694214, + "learning_rate": 0.00011775957513839506, + "loss": 0.2548, + "step": 23211 + }, + { + "epoch": 1.8804277381723913, + "grad_norm": 0.056459132581949234, + "learning_rate": 0.00011775507448580045, + "loss": 0.2467, + "step": 23212 + }, + { + "epoch": 1.88050874918989, + "grad_norm": 0.049248144030570984, + "learning_rate": 0.00011775057383320582, + "loss": 0.2457, + "step": 23213 + }, + { + "epoch": 1.8805897602073882, + "grad_norm": 0.061972279101610184, + "learning_rate": 0.0001177460731806112, + "loss": 0.2712, + "step": 23214 + }, + { + "epoch": 1.8806707712248865, + "grad_norm": 0.05165662243962288, + "learning_rate": 0.00011774157252801657, + "loss": 0.2853, + "step": 23215 + }, + { + "epoch": 1.880751782242385, + "grad_norm": 0.05435626208782196, + "learning_rate": 0.00011773707187542193, + "loss": 0.2512, + "step": 23216 + }, + { + "epoch": 1.8808327932598834, + "grad_norm": 0.05616835877299309, + "learning_rate": 0.00011773257122282733, + "loss": 0.2569, + "step": 23217 + }, + { + "epoch": 1.8809138042773816, + "grad_norm": 0.05482323840260506, + "learning_rate": 0.00011772807057023269, + "loss": 0.2906, + "step": 23218 + }, + { + "epoch": 1.88099481529488, + "grad_norm": 0.05005818232893944, + "learning_rate": 0.00011772356991763806, + 
"loss": 0.278, + "step": 23219 + }, + { + "epoch": 1.8810758263123786, + "grad_norm": 0.07082431018352509, + "learning_rate": 0.00011771906926504344, + "loss": 0.2867, + "step": 23220 + }, + { + "epoch": 1.8811568373298768, + "grad_norm": 0.056544482707977295, + "learning_rate": 0.00011771456861244881, + "loss": 0.298, + "step": 23221 + }, + { + "epoch": 1.8812378483473753, + "grad_norm": 0.08427192270755768, + "learning_rate": 0.00011771006795985417, + "loss": 0.2707, + "step": 23222 + }, + { + "epoch": 1.8813188593648738, + "grad_norm": 0.06930147111415863, + "learning_rate": 0.00011770556730725957, + "loss": 0.2753, + "step": 23223 + }, + { + "epoch": 1.881399870382372, + "grad_norm": 0.06457947194576263, + "learning_rate": 0.00011770106665466493, + "loss": 0.2841, + "step": 23224 + }, + { + "epoch": 1.8814808813998702, + "grad_norm": 0.06023853272199631, + "learning_rate": 0.0001176965660020703, + "loss": 0.2636, + "step": 23225 + }, + { + "epoch": 1.8815618924173687, + "grad_norm": 0.05933365970849991, + "learning_rate": 0.00011769206534947568, + "loss": 0.2428, + "step": 23226 + }, + { + "epoch": 1.8816429034348672, + "grad_norm": 0.05406753346323967, + "learning_rate": 0.00011768756469688105, + "loss": 0.2264, + "step": 23227 + }, + { + "epoch": 1.8817239144523654, + "grad_norm": 0.05009690299630165, + "learning_rate": 0.00011768306404428641, + "loss": 0.2921, + "step": 23228 + }, + { + "epoch": 1.8818049254698639, + "grad_norm": 0.062386028468608856, + "learning_rate": 0.00011767856339169181, + "loss": 0.274, + "step": 23229 + }, + { + "epoch": 1.8818859364873624, + "grad_norm": 0.060382407158613205, + "learning_rate": 0.00011767406273909717, + "loss": 0.2765, + "step": 23230 + }, + { + "epoch": 1.8819669475048606, + "grad_norm": 0.05769483372569084, + "learning_rate": 0.00011766956208650255, + "loss": 0.2419, + "step": 23231 + }, + { + "epoch": 1.882047958522359, + "grad_norm": 0.04864031821489334, + "learning_rate": 0.00011766506143390792, + "loss": 0.2734, + "step": 23232 + }, + { + "epoch": 1.8821289695398575, + "grad_norm": 0.05230722203850746, + "learning_rate": 0.0001176605607813133, + "loss": 0.2695, + "step": 23233 + }, + { + "epoch": 1.8822099805573558, + "grad_norm": 0.050069410353899, + "learning_rate": 0.00011765606012871865, + "loss": 0.2594, + "step": 23234 + }, + { + "epoch": 1.882290991574854, + "grad_norm": 0.05620177835226059, + "learning_rate": 0.00011765155947612405, + "loss": 0.2829, + "step": 23235 + }, + { + "epoch": 1.8823720025923527, + "grad_norm": 0.059488844126462936, + "learning_rate": 0.00011764705882352942, + "loss": 0.2484, + "step": 23236 + }, + { + "epoch": 1.882453013609851, + "grad_norm": 0.049712467938661575, + "learning_rate": 0.00011764255817093479, + "loss": 0.2615, + "step": 23237 + }, + { + "epoch": 1.8825340246273492, + "grad_norm": 0.055734626948833466, + "learning_rate": 0.00011763805751834016, + "loss": 0.2662, + "step": 23238 + }, + { + "epoch": 1.8826150356448477, + "grad_norm": 0.057213060557842255, + "learning_rate": 0.00011763355686574554, + "loss": 0.2276, + "step": 23239 + }, + { + "epoch": 1.8826960466623461, + "grad_norm": 0.05141379311680794, + "learning_rate": 0.00011762905621315092, + "loss": 0.2531, + "step": 23240 + }, + { + "epoch": 1.8827770576798444, + "grad_norm": 0.07517049461603165, + "learning_rate": 0.0001176245555605563, + "loss": 0.2728, + "step": 23241 + }, + { + "epoch": 1.8828580686973428, + "grad_norm": 0.06808876246213913, + "learning_rate": 0.00011762005490796166, + "loss": 0.289, + "step": 23242 + }, + { 
+ "epoch": 1.8829390797148413, + "grad_norm": 0.057982414960861206, + "learning_rate": 0.00011761555425536703, + "loss": 0.275, + "step": 23243 + }, + { + "epoch": 1.8830200907323396, + "grad_norm": 0.051993537694215775, + "learning_rate": 0.0001176110536027724, + "loss": 0.2556, + "step": 23244 + }, + { + "epoch": 1.883101101749838, + "grad_norm": 0.04890101030468941, + "learning_rate": 0.00011760655295017778, + "loss": 0.2163, + "step": 23245 + }, + { + "epoch": 1.8831821127673365, + "grad_norm": 0.06038685515522957, + "learning_rate": 0.00011760205229758316, + "loss": 0.3004, + "step": 23246 + }, + { + "epoch": 1.8832631237848347, + "grad_norm": 0.05634760111570358, + "learning_rate": 0.00011759755164498854, + "loss": 0.2671, + "step": 23247 + }, + { + "epoch": 1.883344134802333, + "grad_norm": 0.06269721686840057, + "learning_rate": 0.0001175930509923939, + "loss": 0.2915, + "step": 23248 + }, + { + "epoch": 1.8834251458198314, + "grad_norm": 0.05876925587654114, + "learning_rate": 0.00011758855033979927, + "loss": 0.2553, + "step": 23249 + }, + { + "epoch": 1.88350615683733, + "grad_norm": 0.05880272015929222, + "learning_rate": 0.00011758404968720465, + "loss": 0.2712, + "step": 23250 + }, + { + "epoch": 1.8835871678548282, + "grad_norm": 0.05663580447435379, + "learning_rate": 0.00011757954903461002, + "loss": 0.2741, + "step": 23251 + }, + { + "epoch": 1.8836681788723266, + "grad_norm": 0.06358767300844193, + "learning_rate": 0.0001175750483820154, + "loss": 0.2841, + "step": 23252 + }, + { + "epoch": 1.883749189889825, + "grad_norm": 0.06005439907312393, + "learning_rate": 0.00011757054772942078, + "loss": 0.2268, + "step": 23253 + }, + { + "epoch": 1.8838302009073233, + "grad_norm": 0.06091054901480675, + "learning_rate": 0.00011756604707682615, + "loss": 0.3052, + "step": 23254 + }, + { + "epoch": 1.8839112119248218, + "grad_norm": 0.07007857412099838, + "learning_rate": 0.00011756154642423151, + "loss": 0.2703, + "step": 23255 + }, + { + "epoch": 1.8839922229423203, + "grad_norm": 0.05151437595486641, + "learning_rate": 0.00011755704577163689, + "loss": 0.2233, + "step": 23256 + }, + { + "epoch": 1.8840732339598185, + "grad_norm": 0.05768553167581558, + "learning_rate": 0.00011755254511904226, + "loss": 0.2686, + "step": 23257 + }, + { + "epoch": 1.8841542449773168, + "grad_norm": 0.04945516213774681, + "learning_rate": 0.00011754804446644765, + "loss": 0.234, + "step": 23258 + }, + { + "epoch": 1.8842352559948155, + "grad_norm": 0.053539324551820755, + "learning_rate": 0.00011754354381385302, + "loss": 0.2857, + "step": 23259 + }, + { + "epoch": 1.8843162670123137, + "grad_norm": 0.065309077501297, + "learning_rate": 0.0001175390431612584, + "loss": 0.2945, + "step": 23260 + }, + { + "epoch": 1.884397278029812, + "grad_norm": 0.05602163448929787, + "learning_rate": 0.00011753454250866376, + "loss": 0.2522, + "step": 23261 + }, + { + "epoch": 1.8844782890473104, + "grad_norm": 0.06638114154338837, + "learning_rate": 0.00011753004185606913, + "loss": 0.2979, + "step": 23262 + }, + { + "epoch": 1.8845593000648089, + "grad_norm": 0.04951305314898491, + "learning_rate": 0.0001175255412034745, + "loss": 0.2588, + "step": 23263 + }, + { + "epoch": 1.8846403110823071, + "grad_norm": 0.053892266005277634, + "learning_rate": 0.00011752104055087989, + "loss": 0.2561, + "step": 23264 + }, + { + "epoch": 1.8847213220998056, + "grad_norm": 0.05633193626999855, + "learning_rate": 0.00011751653989828526, + "loss": 0.3307, + "step": 23265 + }, + { + "epoch": 1.884802333117304, + "grad_norm": 
0.06259032338857651, + "learning_rate": 0.00011751203924569064, + "loss": 0.2476, + "step": 23266 + }, + { + "epoch": 1.8848833441348023, + "grad_norm": 0.054782312363386154, + "learning_rate": 0.000117507538593096, + "loss": 0.2651, + "step": 23267 + }, + { + "epoch": 1.8849643551523008, + "grad_norm": 0.07018223404884338, + "learning_rate": 0.00011750303794050137, + "loss": 0.2427, + "step": 23268 + }, + { + "epoch": 1.8850453661697992, + "grad_norm": 0.05568501725792885, + "learning_rate": 0.00011749853728790676, + "loss": 0.2359, + "step": 23269 + }, + { + "epoch": 1.8851263771872975, + "grad_norm": 0.06143895909190178, + "learning_rate": 0.00011749403663531213, + "loss": 0.3427, + "step": 23270 + }, + { + "epoch": 1.8852073882047957, + "grad_norm": 0.0545257069170475, + "learning_rate": 0.0001174895359827175, + "loss": 0.2596, + "step": 23271 + }, + { + "epoch": 1.8852883992222942, + "grad_norm": 0.055278100073337555, + "learning_rate": 0.00011748503533012288, + "loss": 0.268, + "step": 23272 + }, + { + "epoch": 1.8853694102397927, + "grad_norm": 0.05664905160665512, + "learning_rate": 0.00011748053467752824, + "loss": 0.2745, + "step": 23273 + }, + { + "epoch": 1.885450421257291, + "grad_norm": 0.06449475884437561, + "learning_rate": 0.00011747603402493361, + "loss": 0.287, + "step": 23274 + }, + { + "epoch": 1.8855314322747894, + "grad_norm": 0.05762218311429024, + "learning_rate": 0.000117471533372339, + "loss": 0.2556, + "step": 23275 + }, + { + "epoch": 1.8856124432922878, + "grad_norm": 0.057085197418928146, + "learning_rate": 0.00011746703271974437, + "loss": 0.2809, + "step": 23276 + }, + { + "epoch": 1.885693454309786, + "grad_norm": 0.0630897730588913, + "learning_rate": 0.00011746253206714975, + "loss": 0.2833, + "step": 23277 + }, + { + "epoch": 1.8857744653272845, + "grad_norm": 0.03931886702775955, + "learning_rate": 0.00011745803141455512, + "loss": 0.2068, + "step": 23278 + }, + { + "epoch": 1.885855476344783, + "grad_norm": 0.06200547143816948, + "learning_rate": 0.00011745353076196048, + "loss": 0.2294, + "step": 23279 + }, + { + "epoch": 1.8859364873622813, + "grad_norm": 0.04909854009747505, + "learning_rate": 0.00011744903010936585, + "loss": 0.2435, + "step": 23280 + }, + { + "epoch": 1.8860174983797795, + "grad_norm": 0.05622778832912445, + "learning_rate": 0.00011744452945677124, + "loss": 0.2574, + "step": 23281 + }, + { + "epoch": 1.8860985093972782, + "grad_norm": 0.05940768122673035, + "learning_rate": 0.00011744002880417661, + "loss": 0.2683, + "step": 23282 + }, + { + "epoch": 1.8861795204147764, + "grad_norm": 0.05599218234419823, + "learning_rate": 0.00011743552815158199, + "loss": 0.2492, + "step": 23283 + }, + { + "epoch": 1.8862605314322747, + "grad_norm": 0.061045896261930466, + "learning_rate": 0.00011743102749898736, + "loss": 0.2534, + "step": 23284 + }, + { + "epoch": 1.8863415424497731, + "grad_norm": 0.06886105239391327, + "learning_rate": 0.00011742652684639272, + "loss": 0.3065, + "step": 23285 + }, + { + "epoch": 1.8864225534672716, + "grad_norm": 0.04585834592580795, + "learning_rate": 0.0001174220261937981, + "loss": 0.2462, + "step": 23286 + }, + { + "epoch": 1.8865035644847699, + "grad_norm": 0.059051238000392914, + "learning_rate": 0.00011741752554120348, + "loss": 0.2488, + "step": 23287 + }, + { + "epoch": 1.8865845755022683, + "grad_norm": 0.07089319825172424, + "learning_rate": 0.00011741302488860886, + "loss": 0.3039, + "step": 23288 + }, + { + "epoch": 1.8866655865197668, + "grad_norm": 0.05801811441779137, + "learning_rate": 
0.00011740852423601423, + "loss": 0.2833, + "step": 23289 + }, + { + "epoch": 1.886746597537265, + "grad_norm": 0.05861096456646919, + "learning_rate": 0.0001174040235834196, + "loss": 0.2818, + "step": 23290 + }, + { + "epoch": 1.8868276085547635, + "grad_norm": 0.05898521840572357, + "learning_rate": 0.00011739952293082496, + "loss": 0.2713, + "step": 23291 + }, + { + "epoch": 1.886908619572262, + "grad_norm": 0.05487682670354843, + "learning_rate": 0.00011739502227823036, + "loss": 0.2395, + "step": 23292 + }, + { + "epoch": 1.8869896305897602, + "grad_norm": 0.061498358845710754, + "learning_rate": 0.00011739052162563572, + "loss": 0.2987, + "step": 23293 + }, + { + "epoch": 1.8870706416072585, + "grad_norm": 0.06627925485372543, + "learning_rate": 0.0001173860209730411, + "loss": 0.2631, + "step": 23294 + }, + { + "epoch": 1.887151652624757, + "grad_norm": 0.05386059358716011, + "learning_rate": 0.00011738152032044647, + "loss": 0.2437, + "step": 23295 + }, + { + "epoch": 1.8872326636422554, + "grad_norm": 0.06662789732217789, + "learning_rate": 0.00011737701966785184, + "loss": 0.2781, + "step": 23296 + }, + { + "epoch": 1.8873136746597536, + "grad_norm": 0.05340142175555229, + "learning_rate": 0.0001173725190152572, + "loss": 0.2402, + "step": 23297 + }, + { + "epoch": 1.887394685677252, + "grad_norm": 0.06220469996333122, + "learning_rate": 0.0001173680183626626, + "loss": 0.2493, + "step": 23298 + }, + { + "epoch": 1.8874756966947506, + "grad_norm": 0.06774033606052399, + "learning_rate": 0.00011736351771006797, + "loss": 0.2628, + "step": 23299 + }, + { + "epoch": 1.8875567077122488, + "grad_norm": 0.056173328310251236, + "learning_rate": 0.00011735901705747334, + "loss": 0.2477, + "step": 23300 + }, + { + "epoch": 1.8876377187297473, + "grad_norm": 0.04532039538025856, + "learning_rate": 0.00011735451640487871, + "loss": 0.237, + "step": 23301 + }, + { + "epoch": 1.8877187297472457, + "grad_norm": 0.06891242414712906, + "learning_rate": 0.00011735001575228409, + "loss": 0.2505, + "step": 23302 + }, + { + "epoch": 1.887799740764744, + "grad_norm": 0.05437375232577324, + "learning_rate": 0.00011734551509968945, + "loss": 0.2327, + "step": 23303 + }, + { + "epoch": 1.8878807517822422, + "grad_norm": 0.0711214691400528, + "learning_rate": 0.00011734101444709485, + "loss": 0.3406, + "step": 23304 + }, + { + "epoch": 1.887961762799741, + "grad_norm": 0.06552078574895859, + "learning_rate": 0.00011733651379450021, + "loss": 0.2973, + "step": 23305 + }, + { + "epoch": 1.8880427738172392, + "grad_norm": 0.0602584071457386, + "learning_rate": 0.00011733201314190558, + "loss": 0.2509, + "step": 23306 + }, + { + "epoch": 1.8881237848347374, + "grad_norm": 0.054096248000860214, + "learning_rate": 0.00011732751248931095, + "loss": 0.2961, + "step": 23307 + }, + { + "epoch": 1.8882047958522359, + "grad_norm": 0.05824704095721245, + "learning_rate": 0.00011732301183671633, + "loss": 0.2853, + "step": 23308 + }, + { + "epoch": 1.8882858068697344, + "grad_norm": 0.0697273537516594, + "learning_rate": 0.00011731851118412169, + "loss": 0.2621, + "step": 23309 + }, + { + "epoch": 1.8883668178872326, + "grad_norm": 0.06556077301502228, + "learning_rate": 0.00011731401053152709, + "loss": 0.2689, + "step": 23310 + }, + { + "epoch": 1.888447828904731, + "grad_norm": 0.05337255448102951, + "learning_rate": 0.00011730950987893245, + "loss": 0.2712, + "step": 23311 + }, + { + "epoch": 1.8885288399222295, + "grad_norm": 0.06646385788917542, + "learning_rate": 0.00011730500922633782, + "loss": 0.2881, + 
"step": 23312 + }, + { + "epoch": 1.8886098509397278, + "grad_norm": 0.054512329399585724, + "learning_rate": 0.0001173005085737432, + "loss": 0.2768, + "step": 23313 + }, + { + "epoch": 1.888690861957226, + "grad_norm": 0.05661356449127197, + "learning_rate": 0.00011729600792114857, + "loss": 0.3054, + "step": 23314 + }, + { + "epoch": 1.8887718729747247, + "grad_norm": 0.06277420371770859, + "learning_rate": 0.00011729150726855393, + "loss": 0.2674, + "step": 23315 + }, + { + "epoch": 1.888852883992223, + "grad_norm": 0.06473672389984131, + "learning_rate": 0.00011728700661595933, + "loss": 0.2659, + "step": 23316 + }, + { + "epoch": 1.8889338950097212, + "grad_norm": 0.05304969847202301, + "learning_rate": 0.00011728250596336469, + "loss": 0.2537, + "step": 23317 + }, + { + "epoch": 1.8890149060272197, + "grad_norm": 0.06111942231655121, + "learning_rate": 0.00011727800531077006, + "loss": 0.2421, + "step": 23318 + }, + { + "epoch": 1.8890959170447181, + "grad_norm": 0.06297849118709564, + "learning_rate": 0.00011727350465817544, + "loss": 0.2667, + "step": 23319 + }, + { + "epoch": 1.8891769280622164, + "grad_norm": 0.06840529292821884, + "learning_rate": 0.00011726900400558081, + "loss": 0.2848, + "step": 23320 + }, + { + "epoch": 1.8892579390797148, + "grad_norm": 0.05544573813676834, + "learning_rate": 0.0001172645033529862, + "loss": 0.2567, + "step": 23321 + }, + { + "epoch": 1.8893389500972133, + "grad_norm": 0.06516019999980927, + "learning_rate": 0.00011726000270039157, + "loss": 0.2713, + "step": 23322 + }, + { + "epoch": 1.8894199611147116, + "grad_norm": 0.07645253837108612, + "learning_rate": 0.00011725550204779695, + "loss": 0.2767, + "step": 23323 + }, + { + "epoch": 1.88950097213221, + "grad_norm": 0.05969347059726715, + "learning_rate": 0.0001172510013952023, + "loss": 0.2833, + "step": 23324 + }, + { + "epoch": 1.8895819831497085, + "grad_norm": 0.061069682240486145, + "learning_rate": 0.00011724650074260768, + "loss": 0.2896, + "step": 23325 + }, + { + "epoch": 1.8896629941672067, + "grad_norm": 0.05962633714079857, + "learning_rate": 0.00011724200009001305, + "loss": 0.2753, + "step": 23326 + }, + { + "epoch": 1.889744005184705, + "grad_norm": 0.06852693110704422, + "learning_rate": 0.00011723749943741844, + "loss": 0.2671, + "step": 23327 + }, + { + "epoch": 1.8898250162022034, + "grad_norm": 0.06853719055652618, + "learning_rate": 0.00011723299878482381, + "loss": 0.3764, + "step": 23328 + }, + { + "epoch": 1.889906027219702, + "grad_norm": 0.05661075562238693, + "learning_rate": 0.00011722849813222919, + "loss": 0.2504, + "step": 23329 + }, + { + "epoch": 1.8899870382372002, + "grad_norm": 0.06189139559864998, + "learning_rate": 0.00011722399747963455, + "loss": 0.2636, + "step": 23330 + }, + { + "epoch": 1.8900680492546986, + "grad_norm": 0.06755541265010834, + "learning_rate": 0.00011721949682703992, + "loss": 0.3142, + "step": 23331 + }, + { + "epoch": 1.890149060272197, + "grad_norm": 0.06374403089284897, + "learning_rate": 0.0001172149961744453, + "loss": 0.2426, + "step": 23332 + }, + { + "epoch": 1.8902300712896953, + "grad_norm": 0.05989615246653557, + "learning_rate": 0.00011721049552185068, + "loss": 0.2686, + "step": 23333 + }, + { + "epoch": 1.8903110823071938, + "grad_norm": 0.06072322651743889, + "learning_rate": 0.00011720599486925606, + "loss": 0.2924, + "step": 23334 + }, + { + "epoch": 1.8903920933246923, + "grad_norm": 0.06039504334330559, + "learning_rate": 0.00011720149421666143, + "loss": 0.3068, + "step": 23335 + }, + { + "epoch": 
1.8904731043421905, + "grad_norm": 0.058001559227705, + "learning_rate": 0.00011719699356406679, + "loss": 0.3194, + "step": 23336 + }, + { + "epoch": 1.8905541153596888, + "grad_norm": 0.05137511342763901, + "learning_rate": 0.00011719249291147216, + "loss": 0.283, + "step": 23337 + }, + { + "epoch": 1.8906351263771874, + "grad_norm": 0.06535633653402328, + "learning_rate": 0.00011718799225887754, + "loss": 0.282, + "step": 23338 + }, + { + "epoch": 1.8907161373946857, + "grad_norm": 0.0476398840546608, + "learning_rate": 0.00011718349160628292, + "loss": 0.22, + "step": 23339 + }, + { + "epoch": 1.890797148412184, + "grad_norm": 0.057031337171792984, + "learning_rate": 0.0001171789909536883, + "loss": 0.2837, + "step": 23340 + }, + { + "epoch": 1.8908781594296824, + "grad_norm": 0.05744462460279465, + "learning_rate": 0.00011717449030109367, + "loss": 0.2553, + "step": 23341 + }, + { + "epoch": 1.8909591704471809, + "grad_norm": 0.06022125110030174, + "learning_rate": 0.00011716998964849903, + "loss": 0.2846, + "step": 23342 + }, + { + "epoch": 1.8910401814646791, + "grad_norm": 0.06611776351928711, + "learning_rate": 0.0001171654889959044, + "loss": 0.2775, + "step": 23343 + }, + { + "epoch": 1.8911211924821776, + "grad_norm": 0.05349963158369064, + "learning_rate": 0.00011716098834330979, + "loss": 0.2582, + "step": 23344 + }, + { + "epoch": 1.891202203499676, + "grad_norm": 0.05367043614387512, + "learning_rate": 0.00011715648769071516, + "loss": 0.2746, + "step": 23345 + }, + { + "epoch": 1.8912832145171743, + "grad_norm": 0.04462164267897606, + "learning_rate": 0.00011715198703812054, + "loss": 0.2464, + "step": 23346 + }, + { + "epoch": 1.8913642255346728, + "grad_norm": 0.04876317083835602, + "learning_rate": 0.00011714748638552591, + "loss": 0.2535, + "step": 23347 + }, + { + "epoch": 1.8914452365521712, + "grad_norm": 0.060143206268548965, + "learning_rate": 0.00011714298573293127, + "loss": 0.2652, + "step": 23348 + }, + { + "epoch": 1.8915262475696695, + "grad_norm": 0.05717812106013298, + "learning_rate": 0.00011713848508033665, + "loss": 0.2712, + "step": 23349 + }, + { + "epoch": 1.8916072585871677, + "grad_norm": 0.07034508883953094, + "learning_rate": 0.00011713398442774203, + "loss": 0.2975, + "step": 23350 + }, + { + "epoch": 1.8916882696046662, + "grad_norm": 0.06371060758829117, + "learning_rate": 0.0001171294837751474, + "loss": 0.2906, + "step": 23351 + }, + { + "epoch": 1.8917692806221647, + "grad_norm": 0.06165608391165733, + "learning_rate": 0.00011712498312255278, + "loss": 0.2296, + "step": 23352 + }, + { + "epoch": 1.891850291639663, + "grad_norm": 0.05758389085531235, + "learning_rate": 0.00011712048246995815, + "loss": 0.2453, + "step": 23353 + }, + { + "epoch": 1.8919313026571614, + "grad_norm": 0.0606408528983593, + "learning_rate": 0.00011711598181736351, + "loss": 0.2498, + "step": 23354 + }, + { + "epoch": 1.8920123136746598, + "grad_norm": 0.06578253954648972, + "learning_rate": 0.00011711148116476889, + "loss": 0.2658, + "step": 23355 + }, + { + "epoch": 1.892093324692158, + "grad_norm": 0.06750598549842834, + "learning_rate": 0.00011710698051217427, + "loss": 0.2561, + "step": 23356 + }, + { + "epoch": 1.8921743357096565, + "grad_norm": 0.06298558413982391, + "learning_rate": 0.00011710247985957965, + "loss": 0.2603, + "step": 23357 + }, + { + "epoch": 1.892255346727155, + "grad_norm": 0.05482904985547066, + "learning_rate": 0.00011709797920698502, + "loss": 0.2428, + "step": 23358 + }, + { + "epoch": 1.8923363577446533, + "grad_norm": 
0.05431557446718216, + "learning_rate": 0.0001170934785543904, + "loss": 0.2608, + "step": 23359 + }, + { + "epoch": 1.8924173687621515, + "grad_norm": 0.060233451426029205, + "learning_rate": 0.00011708897790179576, + "loss": 0.2297, + "step": 23360 + }, + { + "epoch": 1.8924983797796502, + "grad_norm": 0.06667815148830414, + "learning_rate": 0.00011708447724920113, + "loss": 0.2879, + "step": 23361 + }, + { + "epoch": 1.8925793907971484, + "grad_norm": 0.06704255938529968, + "learning_rate": 0.00011707997659660652, + "loss": 0.2846, + "step": 23362 + }, + { + "epoch": 1.8926604018146467, + "grad_norm": 0.06251055747270584, + "learning_rate": 0.00011707547594401189, + "loss": 0.2894, + "step": 23363 + }, + { + "epoch": 1.8927414128321451, + "grad_norm": 0.06847628951072693, + "learning_rate": 0.00011707097529141726, + "loss": 0.32, + "step": 23364 + }, + { + "epoch": 1.8928224238496436, + "grad_norm": 0.06425163149833679, + "learning_rate": 0.00011706647463882264, + "loss": 0.2243, + "step": 23365 + }, + { + "epoch": 1.8929034348671419, + "grad_norm": 0.0618412122130394, + "learning_rate": 0.000117061973986228, + "loss": 0.2973, + "step": 23366 + }, + { + "epoch": 1.8929844458846403, + "grad_norm": 0.05935633182525635, + "learning_rate": 0.00011705747333363337, + "loss": 0.246, + "step": 23367 + }, + { + "epoch": 1.8930654569021388, + "grad_norm": 0.04758918657898903, + "learning_rate": 0.00011705297268103876, + "loss": 0.2194, + "step": 23368 + }, + { + "epoch": 1.893146467919637, + "grad_norm": 0.05723010003566742, + "learning_rate": 0.00011704847202844413, + "loss": 0.2448, + "step": 23369 + }, + { + "epoch": 1.8932274789371355, + "grad_norm": 0.05666911602020264, + "learning_rate": 0.0001170439713758495, + "loss": 0.2836, + "step": 23370 + }, + { + "epoch": 1.893308489954634, + "grad_norm": 0.059430863708257675, + "learning_rate": 0.00011703947072325488, + "loss": 0.2815, + "step": 23371 + }, + { + "epoch": 1.8933895009721322, + "grad_norm": 0.05578245595097542, + "learning_rate": 0.00011703497007066024, + "loss": 0.2414, + "step": 23372 + }, + { + "epoch": 1.8934705119896305, + "grad_norm": 0.05080593377351761, + "learning_rate": 0.00011703046941806564, + "loss": 0.2321, + "step": 23373 + }, + { + "epoch": 1.893551523007129, + "grad_norm": 0.0652099996805191, + "learning_rate": 0.000117025968765471, + "loss": 0.2908, + "step": 23374 + }, + { + "epoch": 1.8936325340246274, + "grad_norm": 0.05565506964921951, + "learning_rate": 0.00011702146811287637, + "loss": 0.2847, + "step": 23375 + }, + { + "epoch": 1.8937135450421256, + "grad_norm": 0.05552058666944504, + "learning_rate": 0.00011701696746028175, + "loss": 0.3026, + "step": 23376 + }, + { + "epoch": 1.893794556059624, + "grad_norm": 0.051754243671894073, + "learning_rate": 0.00011701246680768712, + "loss": 0.2425, + "step": 23377 + }, + { + "epoch": 1.8938755670771226, + "grad_norm": 0.0491461418569088, + "learning_rate": 0.00011700796615509248, + "loss": 0.2677, + "step": 23378 + }, + { + "epoch": 1.8939565780946208, + "grad_norm": 0.052235040813684464, + "learning_rate": 0.00011700346550249788, + "loss": 0.2612, + "step": 23379 + }, + { + "epoch": 1.8940375891121193, + "grad_norm": 0.06482629477977753, + "learning_rate": 0.00011699896484990324, + "loss": 0.2645, + "step": 23380 + }, + { + "epoch": 1.8941186001296177, + "grad_norm": 0.052297309041023254, + "learning_rate": 0.00011699446419730861, + "loss": 0.2516, + "step": 23381 + }, + { + "epoch": 1.894199611147116, + "grad_norm": 0.05444755405187607, + "learning_rate": 
0.00011698996354471399, + "loss": 0.2544, + "step": 23382 + }, + { + "epoch": 1.8942806221646142, + "grad_norm": 0.061681777238845825, + "learning_rate": 0.00011698546289211936, + "loss": 0.2908, + "step": 23383 + }, + { + "epoch": 1.894361633182113, + "grad_norm": 0.06857453286647797, + "learning_rate": 0.00011698096223952472, + "loss": 0.2886, + "step": 23384 + }, + { + "epoch": 1.8944426441996112, + "grad_norm": 0.0629449114203453, + "learning_rate": 0.00011697646158693012, + "loss": 0.2858, + "step": 23385 + }, + { + "epoch": 1.8945236552171094, + "grad_norm": 0.07253113389015198, + "learning_rate": 0.0001169719609343355, + "loss": 0.3053, + "step": 23386 + }, + { + "epoch": 1.8946046662346079, + "grad_norm": 0.05951322987675667, + "learning_rate": 0.00011696746028174086, + "loss": 0.2928, + "step": 23387 + }, + { + "epoch": 1.8946856772521063, + "grad_norm": 0.05505599081516266, + "learning_rate": 0.00011696295962914623, + "loss": 0.2499, + "step": 23388 + }, + { + "epoch": 1.8947666882696046, + "grad_norm": 0.05070142447948456, + "learning_rate": 0.0001169584589765516, + "loss": 0.3039, + "step": 23389 + }, + { + "epoch": 1.894847699287103, + "grad_norm": 0.05721290409564972, + "learning_rate": 0.00011695395832395696, + "loss": 0.2911, + "step": 23390 + }, + { + "epoch": 1.8949287103046015, + "grad_norm": 0.05515219643712044, + "learning_rate": 0.00011694945767136236, + "loss": 0.2888, + "step": 23391 + }, + { + "epoch": 1.8950097213220998, + "grad_norm": 0.060376644134521484, + "learning_rate": 0.00011694495701876774, + "loss": 0.2923, + "step": 23392 + }, + { + "epoch": 1.8950907323395982, + "grad_norm": 0.060023244470357895, + "learning_rate": 0.0001169404563661731, + "loss": 0.2979, + "step": 23393 + }, + { + "epoch": 1.8951717433570967, + "grad_norm": 0.0582108199596405, + "learning_rate": 0.00011693595571357847, + "loss": 0.2626, + "step": 23394 + }, + { + "epoch": 1.895252754374595, + "grad_norm": 0.06270253658294678, + "learning_rate": 0.00011693145506098385, + "loss": 0.2814, + "step": 23395 + }, + { + "epoch": 1.8953337653920932, + "grad_norm": 0.06579922139644623, + "learning_rate": 0.0001169269544083892, + "loss": 0.2559, + "step": 23396 + }, + { + "epoch": 1.8954147764095917, + "grad_norm": 0.055605754256248474, + "learning_rate": 0.0001169224537557946, + "loss": 0.31, + "step": 23397 + }, + { + "epoch": 1.8954957874270901, + "grad_norm": 0.047143884003162384, + "learning_rate": 0.00011691795310319998, + "loss": 0.2312, + "step": 23398 + }, + { + "epoch": 1.8955767984445884, + "grad_norm": 0.061128631234169006, + "learning_rate": 0.00011691345245060534, + "loss": 0.2586, + "step": 23399 + }, + { + "epoch": 1.8956578094620868, + "grad_norm": 0.05848472937941551, + "learning_rate": 0.00011690895179801071, + "loss": 0.3144, + "step": 23400 + }, + { + "epoch": 1.8957388204795853, + "grad_norm": 0.05355636402964592, + "learning_rate": 0.00011690445114541609, + "loss": 0.268, + "step": 23401 + }, + { + "epoch": 1.8958198314970836, + "grad_norm": 0.0633152574300766, + "learning_rate": 0.00011689995049282147, + "loss": 0.2618, + "step": 23402 + }, + { + "epoch": 1.895900842514582, + "grad_norm": 0.05251619592308998, + "learning_rate": 0.00011689544984022685, + "loss": 0.2679, + "step": 23403 + }, + { + "epoch": 1.8959818535320805, + "grad_norm": 0.05748763680458069, + "learning_rate": 0.00011689094918763222, + "loss": 0.2709, + "step": 23404 + }, + { + "epoch": 1.8960628645495787, + "grad_norm": 0.05484431982040405, + "learning_rate": 0.00011688644853503758, + "loss": 0.2634, + 
"step": 23405 + }, + { + "epoch": 1.896143875567077, + "grad_norm": 0.054440487176179886, + "learning_rate": 0.00011688194788244295, + "loss": 0.2566, + "step": 23406 + }, + { + "epoch": 1.8962248865845757, + "grad_norm": 0.06458580493927002, + "learning_rate": 0.00011687744722984833, + "loss": 0.2492, + "step": 23407 + }, + { + "epoch": 1.896305897602074, + "grad_norm": 0.054434191435575485, + "learning_rate": 0.00011687294657725372, + "loss": 0.2476, + "step": 23408 + }, + { + "epoch": 1.8963869086195722, + "grad_norm": 0.05186142027378082, + "learning_rate": 0.00011686844592465909, + "loss": 0.2583, + "step": 23409 + }, + { + "epoch": 1.8964679196370706, + "grad_norm": 0.05285156890749931, + "learning_rate": 0.00011686394527206446, + "loss": 0.2602, + "step": 23410 + }, + { + "epoch": 1.896548930654569, + "grad_norm": 0.0519791916012764, + "learning_rate": 0.00011685944461946982, + "loss": 0.2512, + "step": 23411 + }, + { + "epoch": 1.8966299416720673, + "grad_norm": 0.05681449547410011, + "learning_rate": 0.0001168549439668752, + "loss": 0.2388, + "step": 23412 + }, + { + "epoch": 1.8967109526895658, + "grad_norm": 0.05287312716245651, + "learning_rate": 0.00011685044331428057, + "loss": 0.2576, + "step": 23413 + }, + { + "epoch": 1.8967919637070643, + "grad_norm": 0.06390195339918137, + "learning_rate": 0.00011684594266168596, + "loss": 0.2953, + "step": 23414 + }, + { + "epoch": 1.8968729747245625, + "grad_norm": 0.06533109396696091, + "learning_rate": 0.00011684144200909133, + "loss": 0.2578, + "step": 23415 + }, + { + "epoch": 1.8969539857420608, + "grad_norm": 0.06771045178174973, + "learning_rate": 0.0001168369413564967, + "loss": 0.2826, + "step": 23416 + }, + { + "epoch": 1.8970349967595594, + "grad_norm": 0.06620460003614426, + "learning_rate": 0.00011683244070390206, + "loss": 0.3149, + "step": 23417 + }, + { + "epoch": 1.8971160077770577, + "grad_norm": 0.062103889882564545, + "learning_rate": 0.00011682794005130744, + "loss": 0.2722, + "step": 23418 + }, + { + "epoch": 1.897197018794556, + "grad_norm": 0.058835145086050034, + "learning_rate": 0.00011682343939871281, + "loss": 0.2414, + "step": 23419 + }, + { + "epoch": 1.8972780298120544, + "grad_norm": 0.05964101105928421, + "learning_rate": 0.0001168189387461182, + "loss": 0.2734, + "step": 23420 + }, + { + "epoch": 1.8973590408295529, + "grad_norm": 0.052001770585775375, + "learning_rate": 0.00011681443809352357, + "loss": 0.233, + "step": 23421 + }, + { + "epoch": 1.8974400518470511, + "grad_norm": 0.06389396637678146, + "learning_rate": 0.00011680993744092895, + "loss": 0.3054, + "step": 23422 + }, + { + "epoch": 1.8975210628645496, + "grad_norm": 0.049117472022771835, + "learning_rate": 0.0001168054367883343, + "loss": 0.2342, + "step": 23423 + }, + { + "epoch": 1.897602073882048, + "grad_norm": 0.05265875905752182, + "learning_rate": 0.00011680093613573968, + "loss": 0.2859, + "step": 23424 + }, + { + "epoch": 1.8976830848995463, + "grad_norm": 0.05852856487035751, + "learning_rate": 0.00011679643548314507, + "loss": 0.2698, + "step": 23425 + }, + { + "epoch": 1.8977640959170448, + "grad_norm": 0.05060741677880287, + "learning_rate": 0.00011679193483055044, + "loss": 0.2452, + "step": 23426 + }, + { + "epoch": 1.8978451069345432, + "grad_norm": 0.06805577129125595, + "learning_rate": 0.00011678743417795581, + "loss": 0.283, + "step": 23427 + }, + { + "epoch": 1.8979261179520415, + "grad_norm": 0.055861569941043854, + "learning_rate": 0.00011678293352536119, + "loss": 0.2498, + "step": 23428 + }, + { + "epoch": 
1.8980071289695397, + "grad_norm": 0.06261233240365982, + "learning_rate": 0.00011677843287276655, + "loss": 0.2589, + "step": 23429 + }, + { + "epoch": 1.8980881399870384, + "grad_norm": 0.05829406529664993, + "learning_rate": 0.00011677393222017192, + "loss": 0.2646, + "step": 23430 + }, + { + "epoch": 1.8981691510045366, + "grad_norm": 0.05647638067603111, + "learning_rate": 0.00011676943156757731, + "loss": 0.2909, + "step": 23431 + }, + { + "epoch": 1.898250162022035, + "grad_norm": 0.0669042319059372, + "learning_rate": 0.00011676493091498268, + "loss": 0.2999, + "step": 23432 + }, + { + "epoch": 1.8983311730395334, + "grad_norm": 0.0586322546005249, + "learning_rate": 0.00011676043026238806, + "loss": 0.273, + "step": 23433 + }, + { + "epoch": 1.8984121840570318, + "grad_norm": 0.048080138862133026, + "learning_rate": 0.00011675592960979343, + "loss": 0.2836, + "step": 23434 + }, + { + "epoch": 1.89849319507453, + "grad_norm": 0.05639676749706268, + "learning_rate": 0.00011675142895719879, + "loss": 0.2426, + "step": 23435 + }, + { + "epoch": 1.8985742060920285, + "grad_norm": 0.05663561075925827, + "learning_rate": 0.00011674692830460416, + "loss": 0.3069, + "step": 23436 + }, + { + "epoch": 1.898655217109527, + "grad_norm": 0.05259792134165764, + "learning_rate": 0.00011674242765200955, + "loss": 0.2756, + "step": 23437 + }, + { + "epoch": 1.8987362281270252, + "grad_norm": 0.06394267082214355, + "learning_rate": 0.00011673792699941492, + "loss": 0.3269, + "step": 23438 + }, + { + "epoch": 1.8988172391445235, + "grad_norm": 0.07205379009246826, + "learning_rate": 0.0001167334263468203, + "loss": 0.2702, + "step": 23439 + }, + { + "epoch": 1.8988982501620222, + "grad_norm": 0.06035115197300911, + "learning_rate": 0.00011672892569422567, + "loss": 0.2676, + "step": 23440 + }, + { + "epoch": 1.8989792611795204, + "grad_norm": 0.0664551630616188, + "learning_rate": 0.00011672442504163103, + "loss": 0.281, + "step": 23441 + }, + { + "epoch": 1.8990602721970187, + "grad_norm": 0.06344987452030182, + "learning_rate": 0.0001167199243890364, + "loss": 0.2896, + "step": 23442 + }, + { + "epoch": 1.8991412832145171, + "grad_norm": 0.061063800007104874, + "learning_rate": 0.00011671542373644179, + "loss": 0.2513, + "step": 23443 + }, + { + "epoch": 1.8992222942320156, + "grad_norm": 0.05384545028209686, + "learning_rate": 0.00011671092308384717, + "loss": 0.265, + "step": 23444 + }, + { + "epoch": 1.8993033052495139, + "grad_norm": 0.06857175379991531, + "learning_rate": 0.00011670642243125254, + "loss": 0.2713, + "step": 23445 + }, + { + "epoch": 1.8993843162670123, + "grad_norm": 0.0652710348367691, + "learning_rate": 0.00011670192177865791, + "loss": 0.2431, + "step": 23446 + }, + { + "epoch": 1.8994653272845108, + "grad_norm": 0.04748218506574631, + "learning_rate": 0.00011669742112606327, + "loss": 0.2993, + "step": 23447 + }, + { + "epoch": 1.899546338302009, + "grad_norm": 0.052324965596199036, + "learning_rate": 0.00011669292047346865, + "loss": 0.2602, + "step": 23448 + }, + { + "epoch": 1.8996273493195075, + "grad_norm": 0.059723157435655594, + "learning_rate": 0.00011668841982087403, + "loss": 0.2643, + "step": 23449 + }, + { + "epoch": 1.899708360337006, + "grad_norm": 0.05854262039065361, + "learning_rate": 0.00011668391916827941, + "loss": 0.2597, + "step": 23450 + }, + { + "epoch": 1.8997893713545042, + "grad_norm": 0.053819689899683, + "learning_rate": 0.00011667941851568478, + "loss": 0.2708, + "step": 23451 + }, + { + "epoch": 1.8998703823720025, + "grad_norm": 
0.060173001140356064, + "learning_rate": 0.00011667491786309015, + "loss": 0.2611, + "step": 23452 + }, + { + "epoch": 1.899951393389501, + "grad_norm": 0.044513072818517685, + "learning_rate": 0.00011667041721049551, + "loss": 0.2144, + "step": 23453 + }, + { + "epoch": 1.9000324044069994, + "grad_norm": 0.054686374962329865, + "learning_rate": 0.00011666591655790091, + "loss": 0.2517, + "step": 23454 + }, + { + "epoch": 1.9001134154244976, + "grad_norm": 0.07276096940040588, + "learning_rate": 0.00011666141590530629, + "loss": 0.2826, + "step": 23455 + }, + { + "epoch": 1.900194426441996, + "grad_norm": 0.06193507835268974, + "learning_rate": 0.00011665691525271165, + "loss": 0.2671, + "step": 23456 + }, + { + "epoch": 1.9002754374594946, + "grad_norm": 0.05841159448027611, + "learning_rate": 0.00011665241460011702, + "loss": 0.2911, + "step": 23457 + }, + { + "epoch": 1.9003564484769928, + "grad_norm": 0.05510144308209419, + "learning_rate": 0.0001166479139475224, + "loss": 0.2653, + "step": 23458 + }, + { + "epoch": 1.9004374594944913, + "grad_norm": 0.06022772938013077, + "learning_rate": 0.00011664341329492776, + "loss": 0.3302, + "step": 23459 + }, + { + "epoch": 1.9005184705119897, + "grad_norm": 0.05794879421591759, + "learning_rate": 0.00011663891264233316, + "loss": 0.3144, + "step": 23460 + }, + { + "epoch": 1.900599481529488, + "grad_norm": 0.0524263009428978, + "learning_rate": 0.00011663441198973853, + "loss": 0.2423, + "step": 23461 + }, + { + "epoch": 1.9006804925469862, + "grad_norm": 0.062246453016996384, + "learning_rate": 0.00011662991133714389, + "loss": 0.2428, + "step": 23462 + }, + { + "epoch": 1.900761503564485, + "grad_norm": 0.05451060086488724, + "learning_rate": 0.00011662541068454926, + "loss": 0.2493, + "step": 23463 + }, + { + "epoch": 1.9008425145819832, + "grad_norm": 0.06086164712905884, + "learning_rate": 0.00011662091003195464, + "loss": 0.2448, + "step": 23464 + }, + { + "epoch": 1.9009235255994814, + "grad_norm": 0.06377673149108887, + "learning_rate": 0.00011661640937936, + "loss": 0.2726, + "step": 23465 + }, + { + "epoch": 1.9010045366169799, + "grad_norm": 0.05496484041213989, + "learning_rate": 0.0001166119087267654, + "loss": 0.2791, + "step": 23466 + }, + { + "epoch": 1.9010855476344783, + "grad_norm": 0.06893599033355713, + "learning_rate": 0.00011660740807417077, + "loss": 0.285, + "step": 23467 + }, + { + "epoch": 1.9011665586519766, + "grad_norm": 0.0625278428196907, + "learning_rate": 0.00011660290742157613, + "loss": 0.2588, + "step": 23468 + }, + { + "epoch": 1.901247569669475, + "grad_norm": 0.06080259010195732, + "learning_rate": 0.0001165984067689815, + "loss": 0.3061, + "step": 23469 + }, + { + "epoch": 1.9013285806869735, + "grad_norm": 0.053016725927591324, + "learning_rate": 0.00011659390611638688, + "loss": 0.2483, + "step": 23470 + }, + { + "epoch": 1.9014095917044718, + "grad_norm": 0.058317508548498154, + "learning_rate": 0.00011658940546379224, + "loss": 0.2967, + "step": 23471 + }, + { + "epoch": 1.9014906027219702, + "grad_norm": 0.06212649866938591, + "learning_rate": 0.00011658490481119764, + "loss": 0.2737, + "step": 23472 + }, + { + "epoch": 1.9015716137394687, + "grad_norm": 0.05819268524646759, + "learning_rate": 0.00011658040415860301, + "loss": 0.2462, + "step": 23473 + }, + { + "epoch": 1.901652624756967, + "grad_norm": 0.048252273350954056, + "learning_rate": 0.00011657590350600837, + "loss": 0.2439, + "step": 23474 + }, + { + "epoch": 1.9017336357744652, + "grad_norm": 0.05994917079806328, + "learning_rate": 
0.00011657140285341375, + "loss": 0.2448, + "step": 23475 + }, + { + "epoch": 1.9018146467919637, + "grad_norm": 0.04080428555607796, + "learning_rate": 0.00011656690220081912, + "loss": 0.2051, + "step": 23476 + }, + { + "epoch": 1.9018956578094621, + "grad_norm": 0.06264632940292358, + "learning_rate": 0.00011656240154822451, + "loss": 0.2444, + "step": 23477 + }, + { + "epoch": 1.9019766688269604, + "grad_norm": 0.06154443323612213, + "learning_rate": 0.00011655790089562988, + "loss": 0.3366, + "step": 23478 + }, + { + "epoch": 1.9020576798444588, + "grad_norm": 0.05270412564277649, + "learning_rate": 0.00011655340024303525, + "loss": 0.2472, + "step": 23479 + }, + { + "epoch": 1.9021386908619573, + "grad_norm": 0.05738883838057518, + "learning_rate": 0.00011654889959044061, + "loss": 0.2514, + "step": 23480 + }, + { + "epoch": 1.9022197018794555, + "grad_norm": 0.0528935082256794, + "learning_rate": 0.00011654439893784599, + "loss": 0.265, + "step": 23481 + }, + { + "epoch": 1.902300712896954, + "grad_norm": 0.06092529743909836, + "learning_rate": 0.00011653989828525136, + "loss": 0.2405, + "step": 23482 + }, + { + "epoch": 1.9023817239144525, + "grad_norm": 0.06586316227912903, + "learning_rate": 0.00011653539763265675, + "loss": 0.2804, + "step": 23483 + }, + { + "epoch": 1.9024627349319507, + "grad_norm": 0.06744181364774704, + "learning_rate": 0.00011653089698006212, + "loss": 0.226, + "step": 23484 + }, + { + "epoch": 1.902543745949449, + "grad_norm": 0.057407937943935394, + "learning_rate": 0.0001165263963274675, + "loss": 0.2548, + "step": 23485 + }, + { + "epoch": 1.9026247569669477, + "grad_norm": 0.06098683550953865, + "learning_rate": 0.00011652189567487286, + "loss": 0.2307, + "step": 23486 + }, + { + "epoch": 1.902705767984446, + "grad_norm": 0.04744044691324234, + "learning_rate": 0.00011651739502227823, + "loss": 0.2395, + "step": 23487 + }, + { + "epoch": 1.9027867790019442, + "grad_norm": 0.05935365706682205, + "learning_rate": 0.0001165128943696836, + "loss": 0.2953, + "step": 23488 + }, + { + "epoch": 1.9028677900194426, + "grad_norm": 0.059553083032369614, + "learning_rate": 0.00011650839371708899, + "loss": 0.2726, + "step": 23489 + }, + { + "epoch": 1.902948801036941, + "grad_norm": 0.06768973916769028, + "learning_rate": 0.00011650389306449436, + "loss": 0.3151, + "step": 23490 + }, + { + "epoch": 1.9030298120544393, + "grad_norm": 0.06006639450788498, + "learning_rate": 0.00011649939241189974, + "loss": 0.2463, + "step": 23491 + }, + { + "epoch": 1.9031108230719378, + "grad_norm": 0.06431041657924652, + "learning_rate": 0.0001164948917593051, + "loss": 0.2973, + "step": 23492 + }, + { + "epoch": 1.9031918340894363, + "grad_norm": 0.05177253112196922, + "learning_rate": 0.00011649039110671047, + "loss": 0.249, + "step": 23493 + }, + { + "epoch": 1.9032728451069345, + "grad_norm": 0.06623294949531555, + "learning_rate": 0.00011648589045411585, + "loss": 0.289, + "step": 23494 + }, + { + "epoch": 1.903353856124433, + "grad_norm": 0.0612802617251873, + "learning_rate": 0.00011648138980152123, + "loss": 0.2727, + "step": 23495 + }, + { + "epoch": 1.9034348671419314, + "grad_norm": 0.0497586615383625, + "learning_rate": 0.0001164768891489266, + "loss": 0.293, + "step": 23496 + }, + { + "epoch": 1.9035158781594297, + "grad_norm": 0.05529085174202919, + "learning_rate": 0.00011647238849633198, + "loss": 0.2956, + "step": 23497 + }, + { + "epoch": 1.903596889176928, + "grad_norm": 0.06677936017513275, + "learning_rate": 0.00011646788784373734, + "loss": 0.2632, + "step": 
23498 + }, + { + "epoch": 1.9036779001944264, + "grad_norm": 0.05843420699238777, + "learning_rate": 0.00011646338719114271, + "loss": 0.2722, + "step": 23499 + }, + { + "epoch": 1.9037589112119249, + "grad_norm": 0.0610148124396801, + "learning_rate": 0.00011645888653854809, + "loss": 0.2534, + "step": 23500 + }, + { + "epoch": 1.903839922229423, + "grad_norm": 0.050915516912937164, + "learning_rate": 0.00011645438588595347, + "loss": 0.2339, + "step": 23501 + }, + { + "epoch": 1.9039209332469216, + "grad_norm": 0.07522425800561905, + "learning_rate": 0.00011644988523335885, + "loss": 0.3088, + "step": 23502 + }, + { + "epoch": 1.90400194426442, + "grad_norm": 0.06760836392641068, + "learning_rate": 0.00011644538458076422, + "loss": 0.2598, + "step": 23503 + }, + { + "epoch": 1.9040829552819183, + "grad_norm": 0.058208782225847244, + "learning_rate": 0.00011644088392816958, + "loss": 0.2923, + "step": 23504 + }, + { + "epoch": 1.9041639662994168, + "grad_norm": 0.05135008320212364, + "learning_rate": 0.00011643638327557495, + "loss": 0.2455, + "step": 23505 + }, + { + "epoch": 1.9042449773169152, + "grad_norm": 0.0543479323387146, + "learning_rate": 0.00011643188262298034, + "loss": 0.2697, + "step": 23506 + }, + { + "epoch": 1.9043259883344135, + "grad_norm": 0.05527025833725929, + "learning_rate": 0.00011642738197038572, + "loss": 0.2645, + "step": 23507 + }, + { + "epoch": 1.9044069993519117, + "grad_norm": 0.0615333653986454, + "learning_rate": 0.00011642288131779109, + "loss": 0.2426, + "step": 23508 + }, + { + "epoch": 1.9044880103694104, + "grad_norm": 0.061779141426086426, + "learning_rate": 0.00011641838066519646, + "loss": 0.3022, + "step": 23509 + }, + { + "epoch": 1.9045690213869086, + "grad_norm": 0.06118670105934143, + "learning_rate": 0.00011641388001260182, + "loss": 0.2446, + "step": 23510 + }, + { + "epoch": 1.904650032404407, + "grad_norm": 0.07236479222774506, + "learning_rate": 0.0001164093793600072, + "loss": 0.2911, + "step": 23511 + }, + { + "epoch": 1.9047310434219054, + "grad_norm": 0.07088925689458847, + "learning_rate": 0.00011640487870741258, + "loss": 0.2768, + "step": 23512 + }, + { + "epoch": 1.9048120544394038, + "grad_norm": 0.06011257693171501, + "learning_rate": 0.00011640037805481796, + "loss": 0.314, + "step": 23513 + }, + { + "epoch": 1.904893065456902, + "grad_norm": 0.054274432361125946, + "learning_rate": 0.00011639587740222333, + "loss": 0.2681, + "step": 23514 + }, + { + "epoch": 1.9049740764744005, + "grad_norm": 0.04443292319774628, + "learning_rate": 0.0001163913767496287, + "loss": 0.2424, + "step": 23515 + }, + { + "epoch": 1.905055087491899, + "grad_norm": 0.06765972822904587, + "learning_rate": 0.00011638687609703406, + "loss": 0.2851, + "step": 23516 + }, + { + "epoch": 1.9051360985093972, + "grad_norm": 0.056600090116262436, + "learning_rate": 0.00011638237544443944, + "loss": 0.2824, + "step": 23517 + }, + { + "epoch": 1.9052171095268955, + "grad_norm": 0.04816935211420059, + "learning_rate": 0.00011637787479184483, + "loss": 0.2828, + "step": 23518 + }, + { + "epoch": 1.9052981205443942, + "grad_norm": 0.05863537639379501, + "learning_rate": 0.0001163733741392502, + "loss": 0.253, + "step": 23519 + }, + { + "epoch": 1.9053791315618924, + "grad_norm": 0.05766700580716133, + "learning_rate": 0.00011636887348665557, + "loss": 0.2414, + "step": 23520 + }, + { + "epoch": 1.9054601425793907, + "grad_norm": 0.07518936693668365, + "learning_rate": 0.00011636437283406095, + "loss": 0.2811, + "step": 23521 + }, + { + "epoch": 
1.9055411535968891, + "grad_norm": 0.05701868608593941, + "learning_rate": 0.0001163598721814663, + "loss": 0.2862, + "step": 23522 + }, + { + "epoch": 1.9056221646143876, + "grad_norm": 0.04992794618010521, + "learning_rate": 0.00011635537152887168, + "loss": 0.2316, + "step": 23523 + }, + { + "epoch": 1.9057031756318858, + "grad_norm": 0.06036796420812607, + "learning_rate": 0.00011635087087627708, + "loss": 0.2712, + "step": 23524 + }, + { + "epoch": 1.9057841866493843, + "grad_norm": 0.06520384550094604, + "learning_rate": 0.00011634637022368244, + "loss": 0.2483, + "step": 23525 + }, + { + "epoch": 1.9058651976668828, + "grad_norm": 0.06383537501096725, + "learning_rate": 0.00011634186957108781, + "loss": 0.2824, + "step": 23526 + }, + { + "epoch": 1.905946208684381, + "grad_norm": 0.05158648267388344, + "learning_rate": 0.00011633736891849319, + "loss": 0.2697, + "step": 23527 + }, + { + "epoch": 1.9060272197018795, + "grad_norm": 0.10046553611755371, + "learning_rate": 0.00011633286826589855, + "loss": 0.2859, + "step": 23528 + }, + { + "epoch": 1.906108230719378, + "grad_norm": 0.04792208969593048, + "learning_rate": 0.00011632836761330395, + "loss": 0.2174, + "step": 23529 + }, + { + "epoch": 1.9061892417368762, + "grad_norm": 0.05839664489030838, + "learning_rate": 0.00011632386696070932, + "loss": 0.2843, + "step": 23530 + }, + { + "epoch": 1.9062702527543745, + "grad_norm": 0.07664167135953903, + "learning_rate": 0.00011631936630811468, + "loss": 0.3275, + "step": 23531 + }, + { + "epoch": 1.9063512637718731, + "grad_norm": 0.057959817349910736, + "learning_rate": 0.00011631486565552006, + "loss": 0.2393, + "step": 23532 + }, + { + "epoch": 1.9064322747893714, + "grad_norm": 0.06964612007141113, + "learning_rate": 0.00011631036500292543, + "loss": 0.2542, + "step": 23533 + }, + { + "epoch": 1.9065132858068696, + "grad_norm": 0.058878980576992035, + "learning_rate": 0.00011630586435033079, + "loss": 0.268, + "step": 23534 + }, + { + "epoch": 1.906594296824368, + "grad_norm": 0.06393729895353317, + "learning_rate": 0.00011630136369773619, + "loss": 0.2873, + "step": 23535 + }, + { + "epoch": 1.9066753078418666, + "grad_norm": 0.052830591797828674, + "learning_rate": 0.00011629686304514156, + "loss": 0.273, + "step": 23536 + }, + { + "epoch": 1.9067563188593648, + "grad_norm": 0.06132220849394798, + "learning_rate": 0.00011629236239254692, + "loss": 0.2793, + "step": 23537 + }, + { + "epoch": 1.9068373298768633, + "grad_norm": 0.06467705965042114, + "learning_rate": 0.0001162878617399523, + "loss": 0.304, + "step": 23538 + }, + { + "epoch": 1.9069183408943617, + "grad_norm": 0.05416596680879593, + "learning_rate": 0.00011628336108735767, + "loss": 0.2695, + "step": 23539 + }, + { + "epoch": 1.90699935191186, + "grad_norm": 0.05750744789838791, + "learning_rate": 0.00011627886043476303, + "loss": 0.2528, + "step": 23540 + }, + { + "epoch": 1.9070803629293582, + "grad_norm": 0.06359507888555527, + "learning_rate": 0.00011627435978216843, + "loss": 0.2945, + "step": 23541 + }, + { + "epoch": 1.907161373946857, + "grad_norm": 0.06286890804767609, + "learning_rate": 0.0001162698591295738, + "loss": 0.2768, + "step": 23542 + }, + { + "epoch": 1.9072423849643552, + "grad_norm": 0.06291428953409195, + "learning_rate": 0.00011626535847697917, + "loss": 0.284, + "step": 23543 + }, + { + "epoch": 1.9073233959818534, + "grad_norm": 0.057130761444568634, + "learning_rate": 0.00011626085782438454, + "loss": 0.2401, + "step": 23544 + }, + { + "epoch": 1.9074044069993519, + "grad_norm": 
0.06668264418840408, + "learning_rate": 0.00011625635717178991, + "loss": 0.2621, + "step": 23545 + }, + { + "epoch": 1.9074854180168503, + "grad_norm": 0.06476175040006638, + "learning_rate": 0.00011625185651919527, + "loss": 0.2796, + "step": 23546 + }, + { + "epoch": 1.9075664290343486, + "grad_norm": 0.060899537056684494, + "learning_rate": 0.00011624735586660067, + "loss": 0.2448, + "step": 23547 + }, + { + "epoch": 1.907647440051847, + "grad_norm": 0.0622684583067894, + "learning_rate": 0.00011624285521400605, + "loss": 0.2399, + "step": 23548 + }, + { + "epoch": 1.9077284510693455, + "grad_norm": 0.06308908015489578, + "learning_rate": 0.00011623835456141141, + "loss": 0.2758, + "step": 23549 + }, + { + "epoch": 1.9078094620868438, + "grad_norm": 0.061859551817178726, + "learning_rate": 0.00011623385390881678, + "loss": 0.2855, + "step": 23550 + }, + { + "epoch": 1.9078904731043422, + "grad_norm": 0.06682777404785156, + "learning_rate": 0.00011622935325622215, + "loss": 0.3057, + "step": 23551 + }, + { + "epoch": 1.9079714841218407, + "grad_norm": 0.06256937980651855, + "learning_rate": 0.00011622485260362751, + "loss": 0.2912, + "step": 23552 + }, + { + "epoch": 1.908052495139339, + "grad_norm": 0.051607646048069, + "learning_rate": 0.00011622035195103292, + "loss": 0.2626, + "step": 23553 + }, + { + "epoch": 1.9081335061568372, + "grad_norm": 0.054773516952991486, + "learning_rate": 0.00011621585129843829, + "loss": 0.2572, + "step": 23554 + }, + { + "epoch": 1.9082145171743357, + "grad_norm": 0.07187357544898987, + "learning_rate": 0.00011621135064584365, + "loss": 0.3005, + "step": 23555 + }, + { + "epoch": 1.9082955281918341, + "grad_norm": 0.05505804717540741, + "learning_rate": 0.00011620684999324902, + "loss": 0.2324, + "step": 23556 + }, + { + "epoch": 1.9083765392093324, + "grad_norm": 0.057166606187820435, + "learning_rate": 0.0001162023493406544, + "loss": 0.2488, + "step": 23557 + }, + { + "epoch": 1.9084575502268308, + "grad_norm": 0.0596490204334259, + "learning_rate": 0.00011619784868805978, + "loss": 0.2589, + "step": 23558 + }, + { + "epoch": 1.9085385612443293, + "grad_norm": 0.0551486536860466, + "learning_rate": 0.00011619334803546516, + "loss": 0.2311, + "step": 23559 + }, + { + "epoch": 1.9086195722618275, + "grad_norm": 0.05850379914045334, + "learning_rate": 0.00011618884738287053, + "loss": 0.2763, + "step": 23560 + }, + { + "epoch": 1.908700583279326, + "grad_norm": 0.06534396857023239, + "learning_rate": 0.00011618434673027589, + "loss": 0.2837, + "step": 23561 + }, + { + "epoch": 1.9087815942968245, + "grad_norm": 0.05430319532752037, + "learning_rate": 0.00011617984607768126, + "loss": 0.2355, + "step": 23562 + }, + { + "epoch": 1.9088626053143227, + "grad_norm": 0.04827059432864189, + "learning_rate": 0.00011617534542508664, + "loss": 0.253, + "step": 23563 + }, + { + "epoch": 1.908943616331821, + "grad_norm": 0.059288423508405685, + "learning_rate": 0.00011617084477249202, + "loss": 0.2538, + "step": 23564 + }, + { + "epoch": 1.9090246273493197, + "grad_norm": 0.06148166209459305, + "learning_rate": 0.0001161663441198974, + "loss": 0.2547, + "step": 23565 + }, + { + "epoch": 1.909105638366818, + "grad_norm": 0.07299099117517471, + "learning_rate": 0.00011616184346730277, + "loss": 0.2814, + "step": 23566 + }, + { + "epoch": 1.9091866493843161, + "grad_norm": 0.05289170518517494, + "learning_rate": 0.00011615734281470813, + "loss": 0.2467, + "step": 23567 + }, + { + "epoch": 1.9092676604018146, + "grad_norm": 0.07755734026432037, + "learning_rate": 
0.0001161528421621135, + "loss": 0.3104, + "step": 23568 + }, + { + "epoch": 1.909348671419313, + "grad_norm": 0.0510355643928051, + "learning_rate": 0.00011614834150951888, + "loss": 0.248, + "step": 23569 + }, + { + "epoch": 1.9094296824368113, + "grad_norm": 0.06598133593797684, + "learning_rate": 0.00011614384085692427, + "loss": 0.2952, + "step": 23570 + }, + { + "epoch": 1.9095106934543098, + "grad_norm": 0.0684957429766655, + "learning_rate": 0.00011613934020432964, + "loss": 0.2586, + "step": 23571 + }, + { + "epoch": 1.9095917044718083, + "grad_norm": 0.058603137731552124, + "learning_rate": 0.00011613483955173501, + "loss": 0.2781, + "step": 23572 + }, + { + "epoch": 1.9096727154893065, + "grad_norm": 0.052800316363573074, + "learning_rate": 0.00011613033889914037, + "loss": 0.3278, + "step": 23573 + }, + { + "epoch": 1.909753726506805, + "grad_norm": 0.05046352744102478, + "learning_rate": 0.00011612583824654575, + "loss": 0.2318, + "step": 23574 + }, + { + "epoch": 1.9098347375243034, + "grad_norm": 0.061404839158058167, + "learning_rate": 0.00011612133759395112, + "loss": 0.2591, + "step": 23575 + }, + { + "epoch": 1.9099157485418017, + "grad_norm": 0.06698551028966904, + "learning_rate": 0.00011611683694135651, + "loss": 0.272, + "step": 23576 + }, + { + "epoch": 1.9099967595593, + "grad_norm": 0.05472571775317192, + "learning_rate": 0.00011611233628876188, + "loss": 0.2662, + "step": 23577 + }, + { + "epoch": 1.9100777705767984, + "grad_norm": 0.05898464098572731, + "learning_rate": 0.00011610783563616726, + "loss": 0.2572, + "step": 23578 + }, + { + "epoch": 1.9101587815942969, + "grad_norm": 0.07992018014192581, + "learning_rate": 0.00011610333498357262, + "loss": 0.2764, + "step": 23579 + }, + { + "epoch": 1.910239792611795, + "grad_norm": 0.04683117941021919, + "learning_rate": 0.00011609883433097799, + "loss": 0.2345, + "step": 23580 + }, + { + "epoch": 1.9103208036292936, + "grad_norm": 0.07582516223192215, + "learning_rate": 0.00011609433367838336, + "loss": 0.3301, + "step": 23581 + }, + { + "epoch": 1.910401814646792, + "grad_norm": 0.0542750246822834, + "learning_rate": 0.00011608983302578875, + "loss": 0.2332, + "step": 23582 + }, + { + "epoch": 1.9104828256642903, + "grad_norm": 0.061518967151641846, + "learning_rate": 0.00011608533237319412, + "loss": 0.2615, + "step": 23583 + }, + { + "epoch": 1.9105638366817888, + "grad_norm": 0.0602375864982605, + "learning_rate": 0.0001160808317205995, + "loss": 0.2639, + "step": 23584 + }, + { + "epoch": 1.9106448476992872, + "grad_norm": 0.054494086652994156, + "learning_rate": 0.00011607633106800486, + "loss": 0.2504, + "step": 23585 + }, + { + "epoch": 1.9107258587167855, + "grad_norm": 0.060139261186122894, + "learning_rate": 0.00011607183041541023, + "loss": 0.287, + "step": 23586 + }, + { + "epoch": 1.9108068697342837, + "grad_norm": 0.07051198184490204, + "learning_rate": 0.00011606732976281562, + "loss": 0.3095, + "step": 23587 + }, + { + "epoch": 1.9108878807517824, + "grad_norm": 0.05830878019332886, + "learning_rate": 0.00011606282911022099, + "loss": 0.2962, + "step": 23588 + }, + { + "epoch": 1.9109688917692806, + "grad_norm": 0.06247595325112343, + "learning_rate": 0.00011605832845762636, + "loss": 0.2658, + "step": 23589 + }, + { + "epoch": 1.9110499027867789, + "grad_norm": 0.05308188498020172, + "learning_rate": 0.00011605382780503174, + "loss": 0.2442, + "step": 23590 + }, + { + "epoch": 1.9111309138042774, + "grad_norm": 0.05700519308447838, + "learning_rate": 0.0001160493271524371, + "loss": 0.2994, + 
"step": 23591 + }, + { + "epoch": 1.9112119248217758, + "grad_norm": 0.05724016949534416, + "learning_rate": 0.00011604482649984247, + "loss": 0.2816, + "step": 23592 + }, + { + "epoch": 1.911292935839274, + "grad_norm": 0.04718222841620445, + "learning_rate": 0.00011604032584724787, + "loss": 0.2446, + "step": 23593 + }, + { + "epoch": 1.9113739468567725, + "grad_norm": 0.04728047549724579, + "learning_rate": 0.00011603582519465323, + "loss": 0.2632, + "step": 23594 + }, + { + "epoch": 1.911454957874271, + "grad_norm": 0.06270313262939453, + "learning_rate": 0.0001160313245420586, + "loss": 0.3105, + "step": 23595 + }, + { + "epoch": 1.9115359688917692, + "grad_norm": 0.05434510111808777, + "learning_rate": 0.00011602682388946398, + "loss": 0.2654, + "step": 23596 + }, + { + "epoch": 1.9116169799092677, + "grad_norm": 0.0677875280380249, + "learning_rate": 0.00011602232323686934, + "loss": 0.3015, + "step": 23597 + }, + { + "epoch": 1.9116979909267662, + "grad_norm": 0.07220727950334549, + "learning_rate": 0.00011601782258427471, + "loss": 0.277, + "step": 23598 + }, + { + "epoch": 1.9117790019442644, + "grad_norm": 0.06096704676747322, + "learning_rate": 0.00011601332193168011, + "loss": 0.2713, + "step": 23599 + }, + { + "epoch": 1.9118600129617627, + "grad_norm": 0.06041645258665085, + "learning_rate": 0.00011600882127908547, + "loss": 0.2569, + "step": 23600 + }, + { + "epoch": 1.9119410239792611, + "grad_norm": 0.05308665335178375, + "learning_rate": 0.00011600432062649085, + "loss": 0.2709, + "step": 23601 + }, + { + "epoch": 1.9120220349967596, + "grad_norm": 0.055644433945417404, + "learning_rate": 0.00011599981997389622, + "loss": 0.2551, + "step": 23602 + }, + { + "epoch": 1.9121030460142578, + "grad_norm": 0.06518996506929398, + "learning_rate": 0.00011599531932130158, + "loss": 0.2585, + "step": 23603 + }, + { + "epoch": 1.9121840570317563, + "grad_norm": 0.06019129976630211, + "learning_rate": 0.00011599081866870696, + "loss": 0.2829, + "step": 23604 + }, + { + "epoch": 1.9122650680492548, + "grad_norm": 0.04989992454648018, + "learning_rate": 0.00011598631801611236, + "loss": 0.2676, + "step": 23605 + }, + { + "epoch": 1.912346079066753, + "grad_norm": 0.057164739817380905, + "learning_rate": 0.00011598181736351772, + "loss": 0.2936, + "step": 23606 + }, + { + "epoch": 1.9124270900842515, + "grad_norm": 0.06150704249739647, + "learning_rate": 0.00011597731671092309, + "loss": 0.2621, + "step": 23607 + }, + { + "epoch": 1.91250810110175, + "grad_norm": 0.054209765046834946, + "learning_rate": 0.00011597281605832846, + "loss": 0.2534, + "step": 23608 + }, + { + "epoch": 1.9125891121192482, + "grad_norm": 0.053575582802295685, + "learning_rate": 0.00011596831540573382, + "loss": 0.2486, + "step": 23609 + }, + { + "epoch": 1.9126701231367464, + "grad_norm": 0.05911843851208687, + "learning_rate": 0.00011596381475313922, + "loss": 0.2303, + "step": 23610 + }, + { + "epoch": 1.9127511341542451, + "grad_norm": 0.05687384307384491, + "learning_rate": 0.0001159593141005446, + "loss": 0.2235, + "step": 23611 + }, + { + "epoch": 1.9128321451717434, + "grad_norm": 0.07763361930847168, + "learning_rate": 0.00011595481344794996, + "loss": 0.3259, + "step": 23612 + }, + { + "epoch": 1.9129131561892416, + "grad_norm": 0.05966458469629288, + "learning_rate": 0.00011595031279535533, + "loss": 0.2486, + "step": 23613 + }, + { + "epoch": 1.91299416720674, + "grad_norm": 0.058465708047151566, + "learning_rate": 0.0001159458121427607, + "loss": 0.2704, + "step": 23614 + }, + { + "epoch": 
1.9130751782242386, + "grad_norm": 0.05656788498163223, + "learning_rate": 0.00011594131149016606, + "loss": 0.2215, + "step": 23615 + }, + { + "epoch": 1.9131561892417368, + "grad_norm": 0.055962447077035904, + "learning_rate": 0.00011593681083757147, + "loss": 0.2751, + "step": 23616 + }, + { + "epoch": 1.9132372002592353, + "grad_norm": 0.04729706794023514, + "learning_rate": 0.00011593231018497684, + "loss": 0.2564, + "step": 23617 + }, + { + "epoch": 1.9133182112767337, + "grad_norm": 0.06427638232707977, + "learning_rate": 0.0001159278095323822, + "loss": 0.2754, + "step": 23618 + }, + { + "epoch": 1.913399222294232, + "grad_norm": 0.05865442752838135, + "learning_rate": 0.00011592330887978757, + "loss": 0.255, + "step": 23619 + }, + { + "epoch": 1.9134802333117304, + "grad_norm": 0.06961038708686829, + "learning_rate": 0.00011591880822719295, + "loss": 0.3402, + "step": 23620 + }, + { + "epoch": 1.913561244329229, + "grad_norm": 0.0628763884305954, + "learning_rate": 0.0001159143075745983, + "loss": 0.2438, + "step": 23621 + }, + { + "epoch": 1.9136422553467272, + "grad_norm": 0.05884828045964241, + "learning_rate": 0.00011590980692200371, + "loss": 0.33, + "step": 23622 + }, + { + "epoch": 1.9137232663642254, + "grad_norm": 0.05961216613650322, + "learning_rate": 0.00011590530626940908, + "loss": 0.273, + "step": 23623 + }, + { + "epoch": 1.9138042773817239, + "grad_norm": 0.07995451241731644, + "learning_rate": 0.00011590080561681444, + "loss": 0.2605, + "step": 23624 + }, + { + "epoch": 1.9138852883992223, + "grad_norm": 0.05430275946855545, + "learning_rate": 0.00011589630496421981, + "loss": 0.2286, + "step": 23625 + }, + { + "epoch": 1.9139662994167206, + "grad_norm": 0.060730792582035065, + "learning_rate": 0.00011589180431162519, + "loss": 0.2459, + "step": 23626 + }, + { + "epoch": 1.914047310434219, + "grad_norm": 0.07754159718751907, + "learning_rate": 0.00011588730365903056, + "loss": 0.2733, + "step": 23627 + }, + { + "epoch": 1.9141283214517175, + "grad_norm": 0.05745822936296463, + "learning_rate": 0.00011588280300643595, + "loss": 0.2714, + "step": 23628 + }, + { + "epoch": 1.9142093324692158, + "grad_norm": 0.05793623626232147, + "learning_rate": 0.00011587830235384132, + "loss": 0.2276, + "step": 23629 + }, + { + "epoch": 1.9142903434867142, + "grad_norm": 0.05226112902164459, + "learning_rate": 0.00011587380170124668, + "loss": 0.2898, + "step": 23630 + }, + { + "epoch": 1.9143713545042127, + "grad_norm": 0.05570800229907036, + "learning_rate": 0.00011586930104865206, + "loss": 0.2533, + "step": 23631 + }, + { + "epoch": 1.914452365521711, + "grad_norm": 0.06118570268154144, + "learning_rate": 0.00011586480039605743, + "loss": 0.2697, + "step": 23632 + }, + { + "epoch": 1.9145333765392092, + "grad_norm": 0.062098145484924316, + "learning_rate": 0.0001158602997434628, + "loss": 0.2491, + "step": 23633 + }, + { + "epoch": 1.9146143875567079, + "grad_norm": 0.059037044644355774, + "learning_rate": 0.00011585579909086819, + "loss": 0.2991, + "step": 23634 + }, + { + "epoch": 1.9146953985742061, + "grad_norm": 0.052922844886779785, + "learning_rate": 0.00011585129843827356, + "loss": 0.2744, + "step": 23635 + }, + { + "epoch": 1.9147764095917044, + "grad_norm": 0.07369931787252426, + "learning_rate": 0.00011584679778567892, + "loss": 0.2997, + "step": 23636 + }, + { + "epoch": 1.9148574206092028, + "grad_norm": 0.06724896281957626, + "learning_rate": 0.0001158422971330843, + "loss": 0.2649, + "step": 23637 + }, + { + "epoch": 1.9149384316267013, + "grad_norm": 
0.05919881537556648, + "learning_rate": 0.00011583779648048967, + "loss": 0.2736, + "step": 23638 + }, + { + "epoch": 1.9150194426441995, + "grad_norm": 0.0630001425743103, + "learning_rate": 0.00011583329582789506, + "loss": 0.26, + "step": 23639 + }, + { + "epoch": 1.915100453661698, + "grad_norm": 0.0480809323489666, + "learning_rate": 0.00011582879517530043, + "loss": 0.2376, + "step": 23640 + }, + { + "epoch": 1.9151814646791965, + "grad_norm": 0.06232346221804619, + "learning_rate": 0.0001158242945227058, + "loss": 0.2549, + "step": 23641 + }, + { + "epoch": 1.9152624756966947, + "grad_norm": 0.057114098221063614, + "learning_rate": 0.00011581979387011117, + "loss": 0.2361, + "step": 23642 + }, + { + "epoch": 1.915343486714193, + "grad_norm": 0.06052986532449722, + "learning_rate": 0.00011581529321751654, + "loss": 0.2683, + "step": 23643 + }, + { + "epoch": 1.9154244977316917, + "grad_norm": 0.06299544125795364, + "learning_rate": 0.00011581079256492191, + "loss": 0.2722, + "step": 23644 + }, + { + "epoch": 1.91550550874919, + "grad_norm": 0.049486126750707626, + "learning_rate": 0.0001158062919123273, + "loss": 0.2546, + "step": 23645 + }, + { + "epoch": 1.9155865197666881, + "grad_norm": 0.052669722586870193, + "learning_rate": 0.00011580179125973267, + "loss": 0.2571, + "step": 23646 + }, + { + "epoch": 1.9156675307841866, + "grad_norm": 0.06197255104780197, + "learning_rate": 0.00011579729060713805, + "loss": 0.2522, + "step": 23647 + }, + { + "epoch": 1.915748541801685, + "grad_norm": 0.05383479222655296, + "learning_rate": 0.00011579278995454341, + "loss": 0.2535, + "step": 23648 + }, + { + "epoch": 1.9158295528191833, + "grad_norm": 0.052105970680713654, + "learning_rate": 0.00011578828930194878, + "loss": 0.2658, + "step": 23649 + }, + { + "epoch": 1.9159105638366818, + "grad_norm": 0.06237861514091492, + "learning_rate": 0.00011578378864935415, + "loss": 0.2534, + "step": 23650 + }, + { + "epoch": 1.9159915748541803, + "grad_norm": 0.050857868045568466, + "learning_rate": 0.00011577928799675954, + "loss": 0.245, + "step": 23651 + }, + { + "epoch": 1.9160725858716785, + "grad_norm": 0.062176115810871124, + "learning_rate": 0.00011577478734416492, + "loss": 0.2344, + "step": 23652 + }, + { + "epoch": 1.916153596889177, + "grad_norm": 0.059934522956609726, + "learning_rate": 0.00011577028669157029, + "loss": 0.2677, + "step": 23653 + }, + { + "epoch": 1.9162346079066754, + "grad_norm": 0.05889790877699852, + "learning_rate": 0.00011576578603897565, + "loss": 0.2557, + "step": 23654 + }, + { + "epoch": 1.9163156189241737, + "grad_norm": 0.05746990069746971, + "learning_rate": 0.00011576128538638102, + "loss": 0.2714, + "step": 23655 + }, + { + "epoch": 1.916396629941672, + "grad_norm": 0.07142087072134018, + "learning_rate": 0.0001157567847337864, + "loss": 0.2866, + "step": 23656 + }, + { + "epoch": 1.9164776409591704, + "grad_norm": 0.06459254026412964, + "learning_rate": 0.00011575228408119178, + "loss": 0.3052, + "step": 23657 + }, + { + "epoch": 1.9165586519766689, + "grad_norm": 0.07262136787176132, + "learning_rate": 0.00011574778342859716, + "loss": 0.2893, + "step": 23658 + }, + { + "epoch": 1.916639662994167, + "grad_norm": 0.06700246036052704, + "learning_rate": 0.00011574328277600253, + "loss": 0.3159, + "step": 23659 + }, + { + "epoch": 1.9167206740116656, + "grad_norm": 0.05940024182200432, + "learning_rate": 0.00011573878212340789, + "loss": 0.269, + "step": 23660 + }, + { + "epoch": 1.916801685029164, + "grad_norm": 0.0663234069943428, + "learning_rate": 
0.00011573428147081326, + "loss": 0.2781, + "step": 23661 + }, + { + "epoch": 1.9168826960466623, + "grad_norm": 0.051529187709093094, + "learning_rate": 0.00011572978081821866, + "loss": 0.257, + "step": 23662 + }, + { + "epoch": 1.9169637070641607, + "grad_norm": 0.06385305523872375, + "learning_rate": 0.00011572528016562402, + "loss": 0.2746, + "step": 23663 + }, + { + "epoch": 1.9170447180816592, + "grad_norm": 0.053908463567495346, + "learning_rate": 0.0001157207795130294, + "loss": 0.2617, + "step": 23664 + }, + { + "epoch": 1.9171257290991575, + "grad_norm": 0.06621986627578735, + "learning_rate": 0.00011571627886043477, + "loss": 0.2714, + "step": 23665 + }, + { + "epoch": 1.9172067401166557, + "grad_norm": 0.05236555263400078, + "learning_rate": 0.00011571177820784013, + "loss": 0.2463, + "step": 23666 + }, + { + "epoch": 1.9172877511341544, + "grad_norm": 0.06412264704704285, + "learning_rate": 0.0001157072775552455, + "loss": 0.2906, + "step": 23667 + }, + { + "epoch": 1.9173687621516526, + "grad_norm": 0.0691026896238327, + "learning_rate": 0.0001157027769026509, + "loss": 0.2795, + "step": 23668 + }, + { + "epoch": 1.9174497731691509, + "grad_norm": 0.05732530727982521, + "learning_rate": 0.00011569827625005627, + "loss": 0.2636, + "step": 23669 + }, + { + "epoch": 1.9175307841866494, + "grad_norm": 0.06166663393378258, + "learning_rate": 0.00011569377559746164, + "loss": 0.263, + "step": 23670 + }, + { + "epoch": 1.9176117952041478, + "grad_norm": 0.05083499476313591, + "learning_rate": 0.00011568927494486701, + "loss": 0.2531, + "step": 23671 + }, + { + "epoch": 1.917692806221646, + "grad_norm": 0.07012742757797241, + "learning_rate": 0.00011568477429227237, + "loss": 0.2805, + "step": 23672 + }, + { + "epoch": 1.9177738172391445, + "grad_norm": 0.053490299731492996, + "learning_rate": 0.00011568027363967775, + "loss": 0.2436, + "step": 23673 + }, + { + "epoch": 1.917854828256643, + "grad_norm": 0.045660991221666336, + "learning_rate": 0.00011567577298708315, + "loss": 0.261, + "step": 23674 + }, + { + "epoch": 1.9179358392741412, + "grad_norm": 0.06296227127313614, + "learning_rate": 0.00011567127233448851, + "loss": 0.315, + "step": 23675 + }, + { + "epoch": 1.9180168502916397, + "grad_norm": 0.05819511413574219, + "learning_rate": 0.00011566677168189388, + "loss": 0.2216, + "step": 23676 + }, + { + "epoch": 1.9180978613091382, + "grad_norm": 0.07294635474681854, + "learning_rate": 0.00011566227102929926, + "loss": 0.2784, + "step": 23677 + }, + { + "epoch": 1.9181788723266364, + "grad_norm": 0.05275367572903633, + "learning_rate": 0.00011565777037670462, + "loss": 0.2433, + "step": 23678 + }, + { + "epoch": 1.9182598833441347, + "grad_norm": 0.058120809495449066, + "learning_rate": 0.00011565326972410999, + "loss": 0.242, + "step": 23679 + }, + { + "epoch": 1.9183408943616331, + "grad_norm": 0.05177418887615204, + "learning_rate": 0.00011564876907151539, + "loss": 0.2525, + "step": 23680 + }, + { + "epoch": 1.9184219053791316, + "grad_norm": 0.06865722686052322, + "learning_rate": 0.00011564426841892075, + "loss": 0.2977, + "step": 23681 + }, + { + "epoch": 1.9185029163966298, + "grad_norm": 0.05706919729709625, + "learning_rate": 0.00011563976776632612, + "loss": 0.2953, + "step": 23682 + }, + { + "epoch": 1.9185839274141283, + "grad_norm": 0.059390176087617874, + "learning_rate": 0.0001156352671137315, + "loss": 0.2857, + "step": 23683 + }, + { + "epoch": 1.9186649384316268, + "grad_norm": 0.061236340552568436, + "learning_rate": 0.00011563076646113686, + "loss": 
0.2785, + "step": 23684 + }, + { + "epoch": 1.918745949449125, + "grad_norm": 0.059098824858665466, + "learning_rate": 0.00011562626580854223, + "loss": 0.2793, + "step": 23685 + }, + { + "epoch": 1.9188269604666235, + "grad_norm": 0.060659877955913544, + "learning_rate": 0.00011562176515594763, + "loss": 0.2926, + "step": 23686 + }, + { + "epoch": 1.918907971484122, + "grad_norm": 0.061666518449783325, + "learning_rate": 0.00011561726450335299, + "loss": 0.305, + "step": 23687 + }, + { + "epoch": 1.9189889825016202, + "grad_norm": 0.0629463717341423, + "learning_rate": 0.00011561276385075837, + "loss": 0.2721, + "step": 23688 + }, + { + "epoch": 1.9190699935191184, + "grad_norm": 0.06427877396345139, + "learning_rate": 0.00011560826319816374, + "loss": 0.2555, + "step": 23689 + }, + { + "epoch": 1.9191510045366171, + "grad_norm": 0.06070510298013687, + "learning_rate": 0.0001156037625455691, + "loss": 0.2377, + "step": 23690 + }, + { + "epoch": 1.9192320155541154, + "grad_norm": 0.06346534192562103, + "learning_rate": 0.0001155992618929745, + "loss": 0.2747, + "step": 23691 + }, + { + "epoch": 1.9193130265716136, + "grad_norm": 0.055210500955581665, + "learning_rate": 0.00011559476124037987, + "loss": 0.2394, + "step": 23692 + }, + { + "epoch": 1.919394037589112, + "grad_norm": 0.07612962275743484, + "learning_rate": 0.00011559026058778523, + "loss": 0.2888, + "step": 23693 + }, + { + "epoch": 1.9194750486066106, + "grad_norm": 0.05394081771373749, + "learning_rate": 0.0001155857599351906, + "loss": 0.2794, + "step": 23694 + }, + { + "epoch": 1.9195560596241088, + "grad_norm": 0.052274949848651886, + "learning_rate": 0.00011558125928259598, + "loss": 0.2271, + "step": 23695 + }, + { + "epoch": 1.9196370706416073, + "grad_norm": 0.05973504111170769, + "learning_rate": 0.00011557675863000135, + "loss": 0.2481, + "step": 23696 + }, + { + "epoch": 1.9197180816591057, + "grad_norm": 0.05261189118027687, + "learning_rate": 0.00011557225797740674, + "loss": 0.2702, + "step": 23697 + }, + { + "epoch": 1.919799092676604, + "grad_norm": 0.08073947578668594, + "learning_rate": 0.00011556775732481211, + "loss": 0.2427, + "step": 23698 + }, + { + "epoch": 1.9198801036941024, + "grad_norm": 0.06485044956207275, + "learning_rate": 0.00011556325667221747, + "loss": 0.2742, + "step": 23699 + }, + { + "epoch": 1.919961114711601, + "grad_norm": 0.0512852780520916, + "learning_rate": 0.00011555875601962285, + "loss": 0.2425, + "step": 23700 + }, + { + "epoch": 1.9200421257290992, + "grad_norm": 0.06893439590930939, + "learning_rate": 0.00011555425536702822, + "loss": 0.3144, + "step": 23701 + }, + { + "epoch": 1.9201231367465974, + "grad_norm": 0.0645238384604454, + "learning_rate": 0.0001155497547144336, + "loss": 0.2899, + "step": 23702 + }, + { + "epoch": 1.9202041477640959, + "grad_norm": 0.0568263940513134, + "learning_rate": 0.00011554525406183898, + "loss": 0.2815, + "step": 23703 + }, + { + "epoch": 1.9202851587815943, + "grad_norm": 0.055893611162900925, + "learning_rate": 0.00011554075340924436, + "loss": 0.2716, + "step": 23704 + }, + { + "epoch": 1.9203661697990926, + "grad_norm": 0.0569830983877182, + "learning_rate": 0.00011553625275664972, + "loss": 0.2727, + "step": 23705 + }, + { + "epoch": 1.920447180816591, + "grad_norm": 0.052357595413923264, + "learning_rate": 0.00011553175210405509, + "loss": 0.3041, + "step": 23706 + }, + { + "epoch": 1.9205281918340895, + "grad_norm": 0.06113898754119873, + "learning_rate": 0.00011552725145146046, + "loss": 0.2864, + "step": 23707 + }, + { + "epoch": 
1.9206092028515878, + "grad_norm": 0.051144078373909, + "learning_rate": 0.00011552275079886584, + "loss": 0.2892, + "step": 23708 + }, + { + "epoch": 1.9206902138690862, + "grad_norm": 0.052653051912784576, + "learning_rate": 0.00011551825014627122, + "loss": 0.2612, + "step": 23709 + }, + { + "epoch": 1.9207712248865847, + "grad_norm": 0.062111809849739075, + "learning_rate": 0.0001155137494936766, + "loss": 0.2928, + "step": 23710 + }, + { + "epoch": 1.920852235904083, + "grad_norm": 0.04654501751065254, + "learning_rate": 0.00011550924884108196, + "loss": 0.2916, + "step": 23711 + }, + { + "epoch": 1.9209332469215812, + "grad_norm": 0.05940484255552292, + "learning_rate": 0.00011550474818848733, + "loss": 0.32, + "step": 23712 + }, + { + "epoch": 1.9210142579390799, + "grad_norm": 0.05449722334742546, + "learning_rate": 0.0001155002475358927, + "loss": 0.2713, + "step": 23713 + }, + { + "epoch": 1.9210952689565781, + "grad_norm": 0.04990445077419281, + "learning_rate": 0.00011549574688329808, + "loss": 0.2451, + "step": 23714 + }, + { + "epoch": 1.9211762799740764, + "grad_norm": 0.048222944140434265, + "learning_rate": 0.00011549124623070347, + "loss": 0.2643, + "step": 23715 + }, + { + "epoch": 1.9212572909915748, + "grad_norm": 0.053241066634655, + "learning_rate": 0.00011548674557810884, + "loss": 0.3193, + "step": 23716 + }, + { + "epoch": 1.9213383020090733, + "grad_norm": 0.05876283720135689, + "learning_rate": 0.0001154822449255142, + "loss": 0.2738, + "step": 23717 + }, + { + "epoch": 1.9214193130265715, + "grad_norm": 0.07297758758068085, + "learning_rate": 0.00011547774427291957, + "loss": 0.2597, + "step": 23718 + }, + { + "epoch": 1.92150032404407, + "grad_norm": 0.051740966737270355, + "learning_rate": 0.00011547324362032495, + "loss": 0.2848, + "step": 23719 + }, + { + "epoch": 1.9215813350615685, + "grad_norm": 0.07227837294340134, + "learning_rate": 0.00011546874296773033, + "loss": 0.2866, + "step": 23720 + }, + { + "epoch": 1.9216623460790667, + "grad_norm": 0.07501929998397827, + "learning_rate": 0.00011546424231513571, + "loss": 0.249, + "step": 23721 + }, + { + "epoch": 1.9217433570965652, + "grad_norm": 0.06517595797777176, + "learning_rate": 0.00011545974166254108, + "loss": 0.2708, + "step": 23722 + }, + { + "epoch": 1.9218243681140637, + "grad_norm": 0.06509903818368912, + "learning_rate": 0.00011545524100994644, + "loss": 0.2839, + "step": 23723 + }, + { + "epoch": 1.921905379131562, + "grad_norm": 0.07787120342254639, + "learning_rate": 0.00011545074035735181, + "loss": 0.3032, + "step": 23724 + }, + { + "epoch": 1.9219863901490601, + "grad_norm": 0.050759267061948776, + "learning_rate": 0.00011544623970475719, + "loss": 0.2508, + "step": 23725 + }, + { + "epoch": 1.9220674011665586, + "grad_norm": 0.06383932381868362, + "learning_rate": 0.00011544173905216258, + "loss": 0.3205, + "step": 23726 + }, + { + "epoch": 1.922148412184057, + "grad_norm": 0.06522666662931442, + "learning_rate": 0.00011543723839956795, + "loss": 0.2566, + "step": 23727 + }, + { + "epoch": 1.9222294232015553, + "grad_norm": 0.04603106901049614, + "learning_rate": 0.00011543273774697332, + "loss": 0.2596, + "step": 23728 + }, + { + "epoch": 1.9223104342190538, + "grad_norm": 0.06357534229755402, + "learning_rate": 0.00011542823709437868, + "loss": 0.2922, + "step": 23729 + }, + { + "epoch": 1.9223914452365523, + "grad_norm": 0.06124119833111763, + "learning_rate": 0.00011542373644178406, + "loss": 0.2854, + "step": 23730 + }, + { + "epoch": 1.9224724562540505, + "grad_norm": 
0.05823444202542305, + "learning_rate": 0.00011541923578918943, + "loss": 0.2564, + "step": 23731 + }, + { + "epoch": 1.922553467271549, + "grad_norm": 0.05559345334768295, + "learning_rate": 0.00011541473513659482, + "loss": 0.2842, + "step": 23732 + }, + { + "epoch": 1.9226344782890474, + "grad_norm": 0.05219922587275505, + "learning_rate": 0.00011541023448400019, + "loss": 0.2688, + "step": 23733 + }, + { + "epoch": 1.9227154893065457, + "grad_norm": 0.061427246779203415, + "learning_rate": 0.00011540573383140556, + "loss": 0.2675, + "step": 23734 + }, + { + "epoch": 1.922796500324044, + "grad_norm": 0.061376236379146576, + "learning_rate": 0.00011540123317881092, + "loss": 0.2625, + "step": 23735 + }, + { + "epoch": 1.9228775113415426, + "grad_norm": 0.06096119061112404, + "learning_rate": 0.0001153967325262163, + "loss": 0.2711, + "step": 23736 + }, + { + "epoch": 1.9229585223590409, + "grad_norm": 0.05492965131998062, + "learning_rate": 0.00011539223187362167, + "loss": 0.2323, + "step": 23737 + }, + { + "epoch": 1.923039533376539, + "grad_norm": 0.053612884134054184, + "learning_rate": 0.00011538773122102706, + "loss": 0.2624, + "step": 23738 + }, + { + "epoch": 1.9231205443940376, + "grad_norm": 0.05468292906880379, + "learning_rate": 0.00011538323056843243, + "loss": 0.278, + "step": 23739 + }, + { + "epoch": 1.923201555411536, + "grad_norm": 0.057623159140348434, + "learning_rate": 0.0001153787299158378, + "loss": 0.2608, + "step": 23740 + }, + { + "epoch": 1.9232825664290343, + "grad_norm": 0.06276316940784454, + "learning_rate": 0.00011537422926324317, + "loss": 0.2601, + "step": 23741 + }, + { + "epoch": 1.9233635774465327, + "grad_norm": 0.059519946575164795, + "learning_rate": 0.00011536972861064854, + "loss": 0.3006, + "step": 23742 + }, + { + "epoch": 1.9234445884640312, + "grad_norm": 0.06149646267294884, + "learning_rate": 0.00011536522795805394, + "loss": 0.2556, + "step": 23743 + }, + { + "epoch": 1.9235255994815295, + "grad_norm": 0.06678318977355957, + "learning_rate": 0.0001153607273054593, + "loss": 0.2883, + "step": 23744 + }, + { + "epoch": 1.9236066104990277, + "grad_norm": 0.05350562557578087, + "learning_rate": 0.00011535622665286467, + "loss": 0.2249, + "step": 23745 + }, + { + "epoch": 1.9236876215165264, + "grad_norm": 0.05622348561882973, + "learning_rate": 0.00011535172600027005, + "loss": 0.2765, + "step": 23746 + }, + { + "epoch": 1.9237686325340246, + "grad_norm": 0.07468672096729279, + "learning_rate": 0.00011534722534767541, + "loss": 0.3001, + "step": 23747 + }, + { + "epoch": 1.9238496435515229, + "grad_norm": 0.0751064196228981, + "learning_rate": 0.00011534272469508078, + "loss": 0.2626, + "step": 23748 + }, + { + "epoch": 1.9239306545690213, + "grad_norm": 0.05530063062906265, + "learning_rate": 0.00011533822404248618, + "loss": 0.2431, + "step": 23749 + }, + { + "epoch": 1.9240116655865198, + "grad_norm": 0.053154587745666504, + "learning_rate": 0.00011533372338989154, + "loss": 0.2823, + "step": 23750 + }, + { + "epoch": 1.924092676604018, + "grad_norm": 0.06156206130981445, + "learning_rate": 0.00011532922273729692, + "loss": 0.2349, + "step": 23751 + }, + { + "epoch": 1.9241736876215165, + "grad_norm": 0.06651521474123001, + "learning_rate": 0.00011532472208470229, + "loss": 0.2662, + "step": 23752 + }, + { + "epoch": 1.924254698639015, + "grad_norm": 0.04997168108820915, + "learning_rate": 0.00011532022143210765, + "loss": 0.2556, + "step": 23753 + }, + { + "epoch": 1.9243357096565132, + "grad_norm": 0.05996481329202652, + "learning_rate": 
0.00011531572077951302, + "loss": 0.3115, + "step": 23754 + }, + { + "epoch": 1.9244167206740117, + "grad_norm": 0.05364224314689636, + "learning_rate": 0.00011531122012691842, + "loss": 0.2766, + "step": 23755 + }, + { + "epoch": 1.9244977316915102, + "grad_norm": 0.07042060792446136, + "learning_rate": 0.00011530671947432378, + "loss": 0.2657, + "step": 23756 + }, + { + "epoch": 1.9245787427090084, + "grad_norm": 0.04777385666966438, + "learning_rate": 0.00011530221882172916, + "loss": 0.2287, + "step": 23757 + }, + { + "epoch": 1.9246597537265067, + "grad_norm": 0.06700602918863297, + "learning_rate": 0.00011529771816913453, + "loss": 0.2736, + "step": 23758 + }, + { + "epoch": 1.9247407647440054, + "grad_norm": 0.06722524762153625, + "learning_rate": 0.00011529321751653989, + "loss": 0.2707, + "step": 23759 + }, + { + "epoch": 1.9248217757615036, + "grad_norm": 0.05077163130044937, + "learning_rate": 0.00011528871686394526, + "loss": 0.2763, + "step": 23760 + }, + { + "epoch": 1.9249027867790018, + "grad_norm": 0.05121629685163498, + "learning_rate": 0.00011528421621135067, + "loss": 0.2735, + "step": 23761 + }, + { + "epoch": 1.9249837977965003, + "grad_norm": 0.07214384526014328, + "learning_rate": 0.00011527971555875603, + "loss": 0.2928, + "step": 23762 + }, + { + "epoch": 1.9250648088139988, + "grad_norm": 0.0521952249109745, + "learning_rate": 0.0001152752149061614, + "loss": 0.2367, + "step": 23763 + }, + { + "epoch": 1.925145819831497, + "grad_norm": 0.06913160532712936, + "learning_rate": 0.00011527071425356677, + "loss": 0.3283, + "step": 23764 + }, + { + "epoch": 1.9252268308489955, + "grad_norm": 0.05566168576478958, + "learning_rate": 0.00011526621360097215, + "loss": 0.2849, + "step": 23765 + }, + { + "epoch": 1.925307841866494, + "grad_norm": 0.06802644580602646, + "learning_rate": 0.0001152617129483775, + "loss": 0.2753, + "step": 23766 + }, + { + "epoch": 1.9253888528839922, + "grad_norm": 0.07014881074428558, + "learning_rate": 0.00011525721229578291, + "loss": 0.2517, + "step": 23767 + }, + { + "epoch": 1.9254698639014904, + "grad_norm": 0.05886785313487053, + "learning_rate": 0.00011525271164318827, + "loss": 0.2763, + "step": 23768 + }, + { + "epoch": 1.9255508749189891, + "grad_norm": 0.054348427802324295, + "learning_rate": 0.00011524821099059364, + "loss": 0.2648, + "step": 23769 + }, + { + "epoch": 1.9256318859364874, + "grad_norm": 0.060900118201971054, + "learning_rate": 0.00011524371033799901, + "loss": 0.2349, + "step": 23770 + }, + { + "epoch": 1.9257128969539856, + "grad_norm": 0.06138508766889572, + "learning_rate": 0.00011523920968540439, + "loss": 0.282, + "step": 23771 + }, + { + "epoch": 1.925793907971484, + "grad_norm": 0.053859367966651917, + "learning_rate": 0.00011523470903280977, + "loss": 0.2704, + "step": 23772 + }, + { + "epoch": 1.9258749189889826, + "grad_norm": 0.056411322206258774, + "learning_rate": 0.00011523020838021515, + "loss": 0.2781, + "step": 23773 + }, + { + "epoch": 1.9259559300064808, + "grad_norm": 0.06078789383172989, + "learning_rate": 0.00011522570772762051, + "loss": 0.268, + "step": 23774 + }, + { + "epoch": 1.9260369410239793, + "grad_norm": 0.052285823971033096, + "learning_rate": 0.00011522120707502588, + "loss": 0.2564, + "step": 23775 + }, + { + "epoch": 1.9261179520414777, + "grad_norm": 0.06265722960233688, + "learning_rate": 0.00011521670642243126, + "loss": 0.2741, + "step": 23776 + }, + { + "epoch": 1.926198963058976, + "grad_norm": 0.05709337443113327, + "learning_rate": 0.00011521220576983663, + "loss": 
0.2382, + "step": 23777 + }, + { + "epoch": 1.9262799740764744, + "grad_norm": 0.06749071180820465, + "learning_rate": 0.00011520770511724202, + "loss": 0.2617, + "step": 23778 + }, + { + "epoch": 1.926360985093973, + "grad_norm": 0.05918231979012489, + "learning_rate": 0.00011520320446464739, + "loss": 0.2637, + "step": 23779 + }, + { + "epoch": 1.9264419961114712, + "grad_norm": 0.07993515580892563, + "learning_rate": 0.00011519870381205275, + "loss": 0.2783, + "step": 23780 + }, + { + "epoch": 1.9265230071289694, + "grad_norm": 0.07001268863677979, + "learning_rate": 0.00011519420315945812, + "loss": 0.3366, + "step": 23781 + }, + { + "epoch": 1.9266040181464679, + "grad_norm": 0.05565613880753517, + "learning_rate": 0.0001151897025068635, + "loss": 0.2487, + "step": 23782 + }, + { + "epoch": 1.9266850291639663, + "grad_norm": 0.0639483854174614, + "learning_rate": 0.00011518520185426887, + "loss": 0.2915, + "step": 23783 + }, + { + "epoch": 1.9267660401814646, + "grad_norm": 0.06700640171766281, + "learning_rate": 0.00011518070120167426, + "loss": 0.2872, + "step": 23784 + }, + { + "epoch": 1.926847051198963, + "grad_norm": 0.055324066430330276, + "learning_rate": 0.00011517620054907963, + "loss": 0.259, + "step": 23785 + }, + { + "epoch": 1.9269280622164615, + "grad_norm": 0.06429389864206314, + "learning_rate": 0.00011517169989648499, + "loss": 0.2721, + "step": 23786 + }, + { + "epoch": 1.9270090732339598, + "grad_norm": 0.05660184845328331, + "learning_rate": 0.00011516719924389037, + "loss": 0.2632, + "step": 23787 + }, + { + "epoch": 1.9270900842514582, + "grad_norm": 0.051351770758628845, + "learning_rate": 0.00011516269859129574, + "loss": 0.2405, + "step": 23788 + }, + { + "epoch": 1.9271710952689567, + "grad_norm": 0.055790483951568604, + "learning_rate": 0.00011515819793870111, + "loss": 0.2598, + "step": 23789 + }, + { + "epoch": 1.927252106286455, + "grad_norm": 0.06330342590808868, + "learning_rate": 0.0001151536972861065, + "loss": 0.2544, + "step": 23790 + }, + { + "epoch": 1.9273331173039532, + "grad_norm": 0.05117948725819588, + "learning_rate": 0.00011514919663351187, + "loss": 0.2295, + "step": 23791 + }, + { + "epoch": 1.9274141283214519, + "grad_norm": 0.06005241349339485, + "learning_rate": 0.00011514469598091723, + "loss": 0.2551, + "step": 23792 + }, + { + "epoch": 1.9274951393389501, + "grad_norm": 0.057109806686639786, + "learning_rate": 0.00011514019532832261, + "loss": 0.2375, + "step": 23793 + }, + { + "epoch": 1.9275761503564484, + "grad_norm": 0.06928063929080963, + "learning_rate": 0.00011513569467572798, + "loss": 0.304, + "step": 23794 + }, + { + "epoch": 1.9276571613739468, + "grad_norm": 0.06174888089299202, + "learning_rate": 0.00011513119402313337, + "loss": 0.2762, + "step": 23795 + }, + { + "epoch": 1.9277381723914453, + "grad_norm": 0.07365316152572632, + "learning_rate": 0.00011512669337053874, + "loss": 0.2722, + "step": 23796 + }, + { + "epoch": 1.9278191834089435, + "grad_norm": 0.06484461575746536, + "learning_rate": 0.00011512219271794411, + "loss": 0.2766, + "step": 23797 + }, + { + "epoch": 1.927900194426442, + "grad_norm": 0.0619257427752018, + "learning_rate": 0.00011511769206534947, + "loss": 0.2866, + "step": 23798 + }, + { + "epoch": 1.9279812054439405, + "grad_norm": 0.0514339804649353, + "learning_rate": 0.00011511319141275485, + "loss": 0.2801, + "step": 23799 + }, + { + "epoch": 1.9280622164614387, + "grad_norm": 0.06011634320020676, + "learning_rate": 0.00011510869076016022, + "loss": 0.2558, + "step": 23800 + }, + { + 
"epoch": 1.9281432274789372, + "grad_norm": 0.05926699936389923, + "learning_rate": 0.00011510419010756561, + "loss": 0.2728, + "step": 23801 + }, + { + "epoch": 1.9282242384964356, + "grad_norm": 0.06783153116703033, + "learning_rate": 0.00011509968945497098, + "loss": 0.3053, + "step": 23802 + }, + { + "epoch": 1.928305249513934, + "grad_norm": 0.06083691492676735, + "learning_rate": 0.00011509518880237636, + "loss": 0.2789, + "step": 23803 + }, + { + "epoch": 1.9283862605314321, + "grad_norm": 0.051245927810668945, + "learning_rate": 0.00011509068814978172, + "loss": 0.263, + "step": 23804 + }, + { + "epoch": 1.9284672715489306, + "grad_norm": 0.05796998366713524, + "learning_rate": 0.00011508618749718709, + "loss": 0.2657, + "step": 23805 + }, + { + "epoch": 1.928548282566429, + "grad_norm": 0.06028568372130394, + "learning_rate": 0.00011508168684459246, + "loss": 0.2762, + "step": 23806 + }, + { + "epoch": 1.9286292935839273, + "grad_norm": 0.062084048986434937, + "learning_rate": 0.00011507718619199785, + "loss": 0.2563, + "step": 23807 + }, + { + "epoch": 1.9287103046014258, + "grad_norm": 0.050911158323287964, + "learning_rate": 0.00011507268553940322, + "loss": 0.2637, + "step": 23808 + }, + { + "epoch": 1.9287913156189243, + "grad_norm": 0.0765126645565033, + "learning_rate": 0.0001150681848868086, + "loss": 0.288, + "step": 23809 + }, + { + "epoch": 1.9288723266364225, + "grad_norm": 0.06888175010681152, + "learning_rate": 0.00011506368423421396, + "loss": 0.2965, + "step": 23810 + }, + { + "epoch": 1.928953337653921, + "grad_norm": 0.06394674628973007, + "learning_rate": 0.00011505918358161933, + "loss": 0.2659, + "step": 23811 + }, + { + "epoch": 1.9290343486714194, + "grad_norm": 0.061555393040180206, + "learning_rate": 0.0001150546829290247, + "loss": 0.263, + "step": 23812 + }, + { + "epoch": 1.9291153596889177, + "grad_norm": 0.05561353638768196, + "learning_rate": 0.00011505018227643009, + "loss": 0.2886, + "step": 23813 + }, + { + "epoch": 1.929196370706416, + "grad_norm": 0.05581795051693916, + "learning_rate": 0.00011504568162383547, + "loss": 0.2484, + "step": 23814 + }, + { + "epoch": 1.9292773817239146, + "grad_norm": 0.06461822986602783, + "learning_rate": 0.00011504118097124084, + "loss": 0.2971, + "step": 23815 + }, + { + "epoch": 1.9293583927414129, + "grad_norm": 0.05509224161505699, + "learning_rate": 0.0001150366803186462, + "loss": 0.2461, + "step": 23816 + }, + { + "epoch": 1.929439403758911, + "grad_norm": 0.05018146336078644, + "learning_rate": 0.00011503217966605157, + "loss": 0.2369, + "step": 23817 + }, + { + "epoch": 1.9295204147764096, + "grad_norm": 0.05055820196866989, + "learning_rate": 0.00011502767901345695, + "loss": 0.276, + "step": 23818 + }, + { + "epoch": 1.929601425793908, + "grad_norm": 0.050321076065301895, + "learning_rate": 0.00011502317836086233, + "loss": 0.269, + "step": 23819 + }, + { + "epoch": 1.9296824368114063, + "grad_norm": 0.06445109844207764, + "learning_rate": 0.00011501867770826771, + "loss": 0.2553, + "step": 23820 + }, + { + "epoch": 1.9297634478289047, + "grad_norm": 0.054344166070222855, + "learning_rate": 0.00011501417705567308, + "loss": 0.2479, + "step": 23821 + }, + { + "epoch": 1.9298444588464032, + "grad_norm": 0.05837761610746384, + "learning_rate": 0.00011500967640307844, + "loss": 0.2913, + "step": 23822 + }, + { + "epoch": 1.9299254698639015, + "grad_norm": 0.06546605378389359, + "learning_rate": 0.00011500517575048382, + "loss": 0.2784, + "step": 23823 + }, + { + "epoch": 1.9300064808814, + "grad_norm": 
0.0638062059879303, + "learning_rate": 0.00011500067509788922, + "loss": 0.2531, + "step": 23824 + }, + { + "epoch": 1.9300874918988984, + "grad_norm": 0.06651133298873901, + "learning_rate": 0.00011499617444529458, + "loss": 0.2692, + "step": 23825 + }, + { + "epoch": 1.9301685029163966, + "grad_norm": 0.06613379716873169, + "learning_rate": 0.00011499167379269995, + "loss": 0.2671, + "step": 23826 + }, + { + "epoch": 1.9302495139338949, + "grad_norm": 0.051686592400074005, + "learning_rate": 0.00011498717314010532, + "loss": 0.2468, + "step": 23827 + }, + { + "epoch": 1.9303305249513933, + "grad_norm": 0.05653015151619911, + "learning_rate": 0.00011498267248751068, + "loss": 0.2876, + "step": 23828 + }, + { + "epoch": 1.9304115359688918, + "grad_norm": 0.06232219934463501, + "learning_rate": 0.00011497817183491606, + "loss": 0.2862, + "step": 23829 + }, + { + "epoch": 1.93049254698639, + "grad_norm": 0.052821218967437744, + "learning_rate": 0.00011497367118232146, + "loss": 0.2503, + "step": 23830 + }, + { + "epoch": 1.9305735580038885, + "grad_norm": 0.06216568499803543, + "learning_rate": 0.00011496917052972682, + "loss": 0.2524, + "step": 23831 + }, + { + "epoch": 1.930654569021387, + "grad_norm": 0.07735279202461243, + "learning_rate": 0.00011496466987713219, + "loss": 0.3003, + "step": 23832 + }, + { + "epoch": 1.9307355800388852, + "grad_norm": 0.06235320121049881, + "learning_rate": 0.00011496016922453756, + "loss": 0.2338, + "step": 23833 + }, + { + "epoch": 1.9308165910563837, + "grad_norm": 0.05040643364191055, + "learning_rate": 0.00011495566857194294, + "loss": 0.2185, + "step": 23834 + }, + { + "epoch": 1.9308976020738822, + "grad_norm": 0.07056865096092224, + "learning_rate": 0.0001149511679193483, + "loss": 0.2625, + "step": 23835 + }, + { + "epoch": 1.9309786130913804, + "grad_norm": 0.04554782807826996, + "learning_rate": 0.0001149466672667537, + "loss": 0.2648, + "step": 23836 + }, + { + "epoch": 1.9310596241088787, + "grad_norm": 0.07224829494953156, + "learning_rate": 0.00011494216661415906, + "loss": 0.2497, + "step": 23837 + }, + { + "epoch": 1.9311406351263773, + "grad_norm": 0.05645201355218887, + "learning_rate": 0.00011493766596156443, + "loss": 0.2642, + "step": 23838 + }, + { + "epoch": 1.9312216461438756, + "grad_norm": 0.06240091100335121, + "learning_rate": 0.0001149331653089698, + "loss": 0.2762, + "step": 23839 + }, + { + "epoch": 1.9313026571613738, + "grad_norm": 0.05922730639576912, + "learning_rate": 0.00011492866465637518, + "loss": 0.2615, + "step": 23840 + }, + { + "epoch": 1.9313836681788723, + "grad_norm": 0.05924151837825775, + "learning_rate": 0.00011492416400378054, + "loss": 0.2664, + "step": 23841 + }, + { + "epoch": 1.9314646791963708, + "grad_norm": 0.05077677592635155, + "learning_rate": 0.00011491966335118594, + "loss": 0.2478, + "step": 23842 + }, + { + "epoch": 1.931545690213869, + "grad_norm": 0.06003196910023689, + "learning_rate": 0.0001149151626985913, + "loss": 0.3053, + "step": 23843 + }, + { + "epoch": 1.9316267012313675, + "grad_norm": 0.05602165311574936, + "learning_rate": 0.00011491066204599667, + "loss": 0.2536, + "step": 23844 + }, + { + "epoch": 1.931707712248866, + "grad_norm": 0.05465425178408623, + "learning_rate": 0.00011490616139340205, + "loss": 0.2706, + "step": 23845 + }, + { + "epoch": 1.9317887232663642, + "grad_norm": 0.0503137931227684, + "learning_rate": 0.00011490166074080742, + "loss": 0.293, + "step": 23846 + }, + { + "epoch": 1.9318697342838627, + "grad_norm": 0.042224958539009094, + "learning_rate": 
0.00011489716008821281, + "loss": 0.2338, + "step": 23847 + }, + { + "epoch": 1.9319507453013611, + "grad_norm": 0.06504608690738678, + "learning_rate": 0.00011489265943561818, + "loss": 0.326, + "step": 23848 + }, + { + "epoch": 1.9320317563188594, + "grad_norm": 0.057590994983911514, + "learning_rate": 0.00011488815878302354, + "loss": 0.2648, + "step": 23849 + }, + { + "epoch": 1.9321127673363576, + "grad_norm": 0.057391226291656494, + "learning_rate": 0.00011488365813042892, + "loss": 0.2405, + "step": 23850 + }, + { + "epoch": 1.932193778353856, + "grad_norm": 0.06676268577575684, + "learning_rate": 0.00011487915747783429, + "loss": 0.2861, + "step": 23851 + }, + { + "epoch": 1.9322747893713546, + "grad_norm": 0.05596553161740303, + "learning_rate": 0.00011487465682523966, + "loss": 0.2852, + "step": 23852 + }, + { + "epoch": 1.9323558003888528, + "grad_norm": 0.05663205310702324, + "learning_rate": 0.00011487015617264505, + "loss": 0.2519, + "step": 23853 + }, + { + "epoch": 1.9324368114063513, + "grad_norm": 0.06211337074637413, + "learning_rate": 0.00011486565552005042, + "loss": 0.2877, + "step": 23854 + }, + { + "epoch": 1.9325178224238497, + "grad_norm": 0.0637897327542305, + "learning_rate": 0.00011486115486745578, + "loss": 0.278, + "step": 23855 + }, + { + "epoch": 1.932598833441348, + "grad_norm": 0.05674593150615692, + "learning_rate": 0.00011485665421486116, + "loss": 0.2694, + "step": 23856 + }, + { + "epoch": 1.9326798444588464, + "grad_norm": 0.07816286385059357, + "learning_rate": 0.00011485215356226653, + "loss": 0.3347, + "step": 23857 + }, + { + "epoch": 1.932760855476345, + "grad_norm": 0.0642632469534874, + "learning_rate": 0.0001148476529096719, + "loss": 0.2657, + "step": 23858 + }, + { + "epoch": 1.9328418664938432, + "grad_norm": 0.06931416690349579, + "learning_rate": 0.00011484315225707729, + "loss": 0.2807, + "step": 23859 + }, + { + "epoch": 1.9329228775113414, + "grad_norm": 0.05714231729507446, + "learning_rate": 0.00011483865160448267, + "loss": 0.2979, + "step": 23860 + }, + { + "epoch": 1.93300388852884, + "grad_norm": 0.06332039088010788, + "learning_rate": 0.00011483415095188803, + "loss": 0.2682, + "step": 23861 + }, + { + "epoch": 1.9330848995463383, + "grad_norm": 0.06614676117897034, + "learning_rate": 0.0001148296502992934, + "loss": 0.2662, + "step": 23862 + }, + { + "epoch": 1.9331659105638366, + "grad_norm": 0.05901767313480377, + "learning_rate": 0.00011482514964669877, + "loss": 0.3002, + "step": 23863 + }, + { + "epoch": 1.933246921581335, + "grad_norm": 0.0599125437438488, + "learning_rate": 0.00011482064899410415, + "loss": 0.258, + "step": 23864 + }, + { + "epoch": 1.9333279325988335, + "grad_norm": 0.05454184114933014, + "learning_rate": 0.00011481614834150953, + "loss": 0.2551, + "step": 23865 + }, + { + "epoch": 1.9334089436163318, + "grad_norm": 0.0639750063419342, + "learning_rate": 0.00011481164768891491, + "loss": 0.261, + "step": 23866 + }, + { + "epoch": 1.9334899546338302, + "grad_norm": 0.06324272602796555, + "learning_rate": 0.00011480714703632027, + "loss": 0.2616, + "step": 23867 + }, + { + "epoch": 1.9335709656513287, + "grad_norm": 0.05212435871362686, + "learning_rate": 0.00011480264638372564, + "loss": 0.2473, + "step": 23868 + }, + { + "epoch": 1.933651976668827, + "grad_norm": 0.058949582278728485, + "learning_rate": 0.00011479814573113101, + "loss": 0.3239, + "step": 23869 + }, + { + "epoch": 1.9337329876863252, + "grad_norm": 0.057215847074985504, + "learning_rate": 0.00011479364507853639, + "loss": 0.2682, + 
"step": 23870 + }, + { + "epoch": 1.9338139987038239, + "grad_norm": 0.053002744913101196, + "learning_rate": 0.00011478914442594178, + "loss": 0.2705, + "step": 23871 + }, + { + "epoch": 1.9338950097213221, + "grad_norm": 0.07106682658195496, + "learning_rate": 0.00011478464377334715, + "loss": 0.2788, + "step": 23872 + }, + { + "epoch": 1.9339760207388204, + "grad_norm": 0.06341718137264252, + "learning_rate": 0.00011478014312075251, + "loss": 0.2676, + "step": 23873 + }, + { + "epoch": 1.9340570317563188, + "grad_norm": 0.05477464199066162, + "learning_rate": 0.00011477564246815788, + "loss": 0.223, + "step": 23874 + }, + { + "epoch": 1.9341380427738173, + "grad_norm": 0.06264764815568924, + "learning_rate": 0.00011477114181556326, + "loss": 0.2839, + "step": 23875 + }, + { + "epoch": 1.9342190537913155, + "grad_norm": 0.049226321280002594, + "learning_rate": 0.00011476664116296864, + "loss": 0.2423, + "step": 23876 + }, + { + "epoch": 1.934300064808814, + "grad_norm": 0.062404386699199677, + "learning_rate": 0.00011476214051037402, + "loss": 0.297, + "step": 23877 + }, + { + "epoch": 1.9343810758263125, + "grad_norm": 0.05783528834581375, + "learning_rate": 0.00011475763985777939, + "loss": 0.267, + "step": 23878 + }, + { + "epoch": 1.9344620868438107, + "grad_norm": 0.06114105135202408, + "learning_rate": 0.00011475313920518475, + "loss": 0.2823, + "step": 23879 + }, + { + "epoch": 1.9345430978613092, + "grad_norm": 0.06154955178499222, + "learning_rate": 0.00011474863855259012, + "loss": 0.2849, + "step": 23880 + }, + { + "epoch": 1.9346241088788076, + "grad_norm": 0.06020995229482651, + "learning_rate": 0.0001147441378999955, + "loss": 0.3081, + "step": 23881 + }, + { + "epoch": 1.934705119896306, + "grad_norm": 0.05826554074883461, + "learning_rate": 0.00011473963724740088, + "loss": 0.2419, + "step": 23882 + }, + { + "epoch": 1.9347861309138041, + "grad_norm": 0.05434232950210571, + "learning_rate": 0.00011473513659480626, + "loss": 0.2449, + "step": 23883 + }, + { + "epoch": 1.9348671419313026, + "grad_norm": 0.06052308902144432, + "learning_rate": 0.00011473063594221163, + "loss": 0.2925, + "step": 23884 + }, + { + "epoch": 1.934948152948801, + "grad_norm": 0.05621659383177757, + "learning_rate": 0.00011472613528961699, + "loss": 0.2648, + "step": 23885 + }, + { + "epoch": 1.9350291639662993, + "grad_norm": 0.057274747639894485, + "learning_rate": 0.00011472163463702237, + "loss": 0.2678, + "step": 23886 + }, + { + "epoch": 1.9351101749837978, + "grad_norm": 0.05058053135871887, + "learning_rate": 0.00011471713398442774, + "loss": 0.2499, + "step": 23887 + }, + { + "epoch": 1.9351911860012962, + "grad_norm": 0.04877978190779686, + "learning_rate": 0.00011471263333183313, + "loss": 0.2143, + "step": 23888 + }, + { + "epoch": 1.9352721970187945, + "grad_norm": 0.05602328106760979, + "learning_rate": 0.0001147081326792385, + "loss": 0.2916, + "step": 23889 + }, + { + "epoch": 1.935353208036293, + "grad_norm": 0.055223964154720306, + "learning_rate": 0.00011470363202664387, + "loss": 0.2723, + "step": 23890 + }, + { + "epoch": 1.9354342190537914, + "grad_norm": 0.0630248561501503, + "learning_rate": 0.00011469913137404923, + "loss": 0.2176, + "step": 23891 + }, + { + "epoch": 1.9355152300712897, + "grad_norm": 0.05983347073197365, + "learning_rate": 0.00011469463072145461, + "loss": 0.2666, + "step": 23892 + }, + { + "epoch": 1.935596241088788, + "grad_norm": 0.05666651949286461, + "learning_rate": 0.00011469013006885998, + "loss": 0.2425, + "step": 23893 + }, + { + "epoch": 
1.9356772521062866, + "grad_norm": 0.05697185918688774, + "learning_rate": 0.00011468562941626537, + "loss": 0.256, + "step": 23894 + }, + { + "epoch": 1.9357582631237849, + "grad_norm": 0.05312450975179672, + "learning_rate": 0.00011468112876367074, + "loss": 0.2724, + "step": 23895 + }, + { + "epoch": 1.935839274141283, + "grad_norm": 0.050729118287563324, + "learning_rate": 0.00011467662811107612, + "loss": 0.2551, + "step": 23896 + }, + { + "epoch": 1.9359202851587816, + "grad_norm": 0.056781407445669174, + "learning_rate": 0.00011467212745848148, + "loss": 0.2648, + "step": 23897 + }, + { + "epoch": 1.93600129617628, + "grad_norm": 0.06569044291973114, + "learning_rate": 0.00011466762680588685, + "loss": 0.2704, + "step": 23898 + }, + { + "epoch": 1.9360823071937783, + "grad_norm": 0.06088702008128166, + "learning_rate": 0.00011466312615329222, + "loss": 0.27, + "step": 23899 + }, + { + "epoch": 1.9361633182112767, + "grad_norm": 0.055795930325984955, + "learning_rate": 0.00011465862550069761, + "loss": 0.2446, + "step": 23900 + }, + { + "epoch": 1.9362443292287752, + "grad_norm": 0.06180226802825928, + "learning_rate": 0.00011465412484810298, + "loss": 0.2701, + "step": 23901 + }, + { + "epoch": 1.9363253402462735, + "grad_norm": 0.06375572085380554, + "learning_rate": 0.00011464962419550836, + "loss": 0.2565, + "step": 23902 + }, + { + "epoch": 1.936406351263772, + "grad_norm": 0.04969800263643265, + "learning_rate": 0.00011464512354291373, + "loss": 0.2377, + "step": 23903 + }, + { + "epoch": 1.9364873622812704, + "grad_norm": 0.07344190031290054, + "learning_rate": 0.00011464062289031909, + "loss": 0.2966, + "step": 23904 + }, + { + "epoch": 1.9365683732987686, + "grad_norm": 0.05501018092036247, + "learning_rate": 0.00011463612223772449, + "loss": 0.2251, + "step": 23905 + }, + { + "epoch": 1.9366493843162669, + "grad_norm": 0.048425883054733276, + "learning_rate": 0.00011463162158512985, + "loss": 0.2583, + "step": 23906 + }, + { + "epoch": 1.9367303953337653, + "grad_norm": 0.06696183234453201, + "learning_rate": 0.00011462712093253522, + "loss": 0.2684, + "step": 23907 + }, + { + "epoch": 1.9368114063512638, + "grad_norm": 0.05589906871318817, + "learning_rate": 0.0001146226202799406, + "loss": 0.2343, + "step": 23908 + }, + { + "epoch": 1.936892417368762, + "grad_norm": 0.07046809047460556, + "learning_rate": 0.00011461811962734597, + "loss": 0.3071, + "step": 23909 + }, + { + "epoch": 1.9369734283862605, + "grad_norm": 0.07077688723802567, + "learning_rate": 0.00011461361897475133, + "loss": 0.3055, + "step": 23910 + }, + { + "epoch": 1.937054439403759, + "grad_norm": 0.07673512399196625, + "learning_rate": 0.00011460911832215673, + "loss": 0.2839, + "step": 23911 + }, + { + "epoch": 1.9371354504212572, + "grad_norm": 0.066664919257164, + "learning_rate": 0.00011460461766956209, + "loss": 0.2812, + "step": 23912 + }, + { + "epoch": 1.9372164614387557, + "grad_norm": 0.06907733529806137, + "learning_rate": 0.00011460011701696747, + "loss": 0.2709, + "step": 23913 + }, + { + "epoch": 1.9372974724562542, + "grad_norm": 0.0514480397105217, + "learning_rate": 0.00011459561636437284, + "loss": 0.2514, + "step": 23914 + }, + { + "epoch": 1.9373784834737524, + "grad_norm": 0.06700679659843445, + "learning_rate": 0.00011459111571177821, + "loss": 0.2833, + "step": 23915 + }, + { + "epoch": 1.9374594944912507, + "grad_norm": 0.06083435192704201, + "learning_rate": 0.00011458661505918357, + "loss": 0.2955, + "step": 23916 + }, + { + "epoch": 1.9375405055087493, + "grad_norm": 
0.051247239112854004, + "learning_rate": 0.00011458211440658897, + "loss": 0.2426, + "step": 23917 + }, + { + "epoch": 1.9376215165262476, + "grad_norm": 0.048243001103401184, + "learning_rate": 0.00011457761375399433, + "loss": 0.2177, + "step": 23918 + }, + { + "epoch": 1.9377025275437458, + "grad_norm": 0.05603655427694321, + "learning_rate": 0.00011457311310139971, + "loss": 0.3136, + "step": 23919 + }, + { + "epoch": 1.9377835385612443, + "grad_norm": 0.06207847222685814, + "learning_rate": 0.00011456861244880508, + "loss": 0.2679, + "step": 23920 + }, + { + "epoch": 1.9378645495787428, + "grad_norm": 0.05992251634597778, + "learning_rate": 0.00011456411179621046, + "loss": 0.2982, + "step": 23921 + }, + { + "epoch": 1.937945560596241, + "grad_norm": 0.05770555138587952, + "learning_rate": 0.00011455961114361582, + "loss": 0.2643, + "step": 23922 + }, + { + "epoch": 1.9380265716137395, + "grad_norm": 0.05730791762471199, + "learning_rate": 0.00011455511049102122, + "loss": 0.2514, + "step": 23923 + }, + { + "epoch": 1.938107582631238, + "grad_norm": 0.0661020278930664, + "learning_rate": 0.00011455060983842658, + "loss": 0.2773, + "step": 23924 + }, + { + "epoch": 1.9381885936487362, + "grad_norm": 0.05453161150217056, + "learning_rate": 0.00011454610918583195, + "loss": 0.2417, + "step": 23925 + }, + { + "epoch": 1.9382696046662347, + "grad_norm": 0.06748737394809723, + "learning_rate": 0.00011454160853323732, + "loss": 0.2934, + "step": 23926 + }, + { + "epoch": 1.9383506156837331, + "grad_norm": 0.05424464866518974, + "learning_rate": 0.0001145371078806427, + "loss": 0.2694, + "step": 23927 + }, + { + "epoch": 1.9384316267012314, + "grad_norm": 0.06523843854665756, + "learning_rate": 0.00011453260722804808, + "loss": 0.2623, + "step": 23928 + }, + { + "epoch": 1.9385126377187296, + "grad_norm": 0.06635954976081848, + "learning_rate": 0.00011452810657545346, + "loss": 0.2637, + "step": 23929 + }, + { + "epoch": 1.938593648736228, + "grad_norm": 0.06299407035112381, + "learning_rate": 0.00011452360592285882, + "loss": 0.2927, + "step": 23930 + }, + { + "epoch": 1.9386746597537265, + "grad_norm": 0.06170092523097992, + "learning_rate": 0.00011451910527026419, + "loss": 0.2784, + "step": 23931 + }, + { + "epoch": 1.9387556707712248, + "grad_norm": 0.08529407531023026, + "learning_rate": 0.00011451460461766956, + "loss": 0.2693, + "step": 23932 + }, + { + "epoch": 1.9388366817887233, + "grad_norm": 0.06642553955316544, + "learning_rate": 0.00011451010396507494, + "loss": 0.2497, + "step": 23933 + }, + { + "epoch": 1.9389176928062217, + "grad_norm": 0.05886387825012207, + "learning_rate": 0.00011450560331248033, + "loss": 0.275, + "step": 23934 + }, + { + "epoch": 1.93899870382372, + "grad_norm": 0.05601823702454567, + "learning_rate": 0.0001145011026598857, + "loss": 0.2583, + "step": 23935 + }, + { + "epoch": 1.9390797148412184, + "grad_norm": 0.057005684822797775, + "learning_rate": 0.00011449660200729106, + "loss": 0.2406, + "step": 23936 + }, + { + "epoch": 1.939160725858717, + "grad_norm": 0.05442437157034874, + "learning_rate": 0.00011449210135469643, + "loss": 0.2484, + "step": 23937 + }, + { + "epoch": 1.9392417368762151, + "grad_norm": 0.059382639825344086, + "learning_rate": 0.0001144876007021018, + "loss": 0.2651, + "step": 23938 + }, + { + "epoch": 1.9393227478937134, + "grad_norm": 0.06507833302021027, + "learning_rate": 0.00011448310004950718, + "loss": 0.2799, + "step": 23939 + }, + { + "epoch": 1.939403758911212, + "grad_norm": 0.06146455928683281, + "learning_rate": 
0.00011447859939691257, + "loss": 0.2335, + "step": 23940 + }, + { + "epoch": 1.9394847699287103, + "grad_norm": 0.04766790568828583, + "learning_rate": 0.00011447409874431794, + "loss": 0.2231, + "step": 23941 + }, + { + "epoch": 1.9395657809462086, + "grad_norm": 0.06149807572364807, + "learning_rate": 0.0001144695980917233, + "loss": 0.2733, + "step": 23942 + }, + { + "epoch": 1.939646791963707, + "grad_norm": 0.07344269007444382, + "learning_rate": 0.00011446509743912867, + "loss": 0.3203, + "step": 23943 + }, + { + "epoch": 1.9397278029812055, + "grad_norm": 0.06012146174907684, + "learning_rate": 0.00011446059678653405, + "loss": 0.2655, + "step": 23944 + }, + { + "epoch": 1.9398088139987038, + "grad_norm": 0.06117866188287735, + "learning_rate": 0.00011445609613393942, + "loss": 0.2646, + "step": 23945 + }, + { + "epoch": 1.9398898250162022, + "grad_norm": 0.06094307452440262, + "learning_rate": 0.00011445159548134481, + "loss": 0.3039, + "step": 23946 + }, + { + "epoch": 1.9399708360337007, + "grad_norm": 0.060897525399923325, + "learning_rate": 0.00011444709482875018, + "loss": 0.2616, + "step": 23947 + }, + { + "epoch": 1.940051847051199, + "grad_norm": 0.054656002670526505, + "learning_rate": 0.00011444259417615554, + "loss": 0.2567, + "step": 23948 + }, + { + "epoch": 1.9401328580686974, + "grad_norm": 0.055229902267456055, + "learning_rate": 0.00011443809352356092, + "loss": 0.3029, + "step": 23949 + }, + { + "epoch": 1.9402138690861959, + "grad_norm": 0.0635809451341629, + "learning_rate": 0.00011443359287096629, + "loss": 0.263, + "step": 23950 + }, + { + "epoch": 1.940294880103694, + "grad_norm": 0.05155356600880623, + "learning_rate": 0.00011442909221837166, + "loss": 0.2497, + "step": 23951 + }, + { + "epoch": 1.9403758911211924, + "grad_norm": 0.05434060096740723, + "learning_rate": 0.00011442459156577705, + "loss": 0.2649, + "step": 23952 + }, + { + "epoch": 1.9404569021386908, + "grad_norm": 0.06424754858016968, + "learning_rate": 0.00011442009091318242, + "loss": 0.2904, + "step": 23953 + }, + { + "epoch": 1.9405379131561893, + "grad_norm": 0.08113347738981247, + "learning_rate": 0.00011441559026058778, + "loss": 0.2875, + "step": 23954 + }, + { + "epoch": 1.9406189241736875, + "grad_norm": 0.06629473716020584, + "learning_rate": 0.00011441108960799316, + "loss": 0.2856, + "step": 23955 + }, + { + "epoch": 1.940699935191186, + "grad_norm": 0.05168033763766289, + "learning_rate": 0.00011440658895539853, + "loss": 0.2272, + "step": 23956 + }, + { + "epoch": 1.9407809462086845, + "grad_norm": 0.0798877626657486, + "learning_rate": 0.00011440208830280392, + "loss": 0.2911, + "step": 23957 + }, + { + "epoch": 1.9408619572261827, + "grad_norm": 0.06408312171697617, + "learning_rate": 0.00011439758765020929, + "loss": 0.2583, + "step": 23958 + }, + { + "epoch": 1.9409429682436812, + "grad_norm": 0.04707522317767143, + "learning_rate": 0.00011439308699761467, + "loss": 0.2339, + "step": 23959 + }, + { + "epoch": 1.9410239792611796, + "grad_norm": 0.05191976577043533, + "learning_rate": 0.00011438858634502003, + "loss": 0.2347, + "step": 23960 + }, + { + "epoch": 1.9411049902786779, + "grad_norm": 0.052793506532907486, + "learning_rate": 0.0001143840856924254, + "loss": 0.2961, + "step": 23961 + }, + { + "epoch": 1.9411860012961761, + "grad_norm": 0.057689033448696136, + "learning_rate": 0.00011437958503983077, + "loss": 0.2579, + "step": 23962 + }, + { + "epoch": 1.9412670123136748, + "grad_norm": 0.056980159133672714, + "learning_rate": 0.00011437508438723616, + "loss": 
0.2993, + "step": 23963 + }, + { + "epoch": 1.941348023331173, + "grad_norm": 0.05391747131943703, + "learning_rate": 0.00011437058373464153, + "loss": 0.2379, + "step": 23964 + }, + { + "epoch": 1.9414290343486713, + "grad_norm": 0.0595475398004055, + "learning_rate": 0.00011436608308204691, + "loss": 0.25, + "step": 23965 + }, + { + "epoch": 1.9415100453661698, + "grad_norm": 0.04893352836370468, + "learning_rate": 0.00011436158242945228, + "loss": 0.2692, + "step": 23966 + }, + { + "epoch": 1.9415910563836682, + "grad_norm": 0.06128464266657829, + "learning_rate": 0.00011435708177685764, + "loss": 0.2725, + "step": 23967 + }, + { + "epoch": 1.9416720674011665, + "grad_norm": 0.05946241691708565, + "learning_rate": 0.00011435258112426301, + "loss": 0.2785, + "step": 23968 + }, + { + "epoch": 1.941753078418665, + "grad_norm": 0.06088683009147644, + "learning_rate": 0.0001143480804716684, + "loss": 0.2832, + "step": 23969 + }, + { + "epoch": 1.9418340894361634, + "grad_norm": 0.05622805282473564, + "learning_rate": 0.00011434357981907378, + "loss": 0.2521, + "step": 23970 + }, + { + "epoch": 1.9419151004536617, + "grad_norm": 0.06270037591457367, + "learning_rate": 0.00011433907916647915, + "loss": 0.2689, + "step": 23971 + }, + { + "epoch": 1.94199611147116, + "grad_norm": 0.06490737944841385, + "learning_rate": 0.00011433457851388452, + "loss": 0.2398, + "step": 23972 + }, + { + "epoch": 1.9420771224886586, + "grad_norm": 0.0594317726790905, + "learning_rate": 0.00011433007786128988, + "loss": 0.243, + "step": 23973 + }, + { + "epoch": 1.9421581335061568, + "grad_norm": 0.06923339515924454, + "learning_rate": 0.00011432557720869526, + "loss": 0.2784, + "step": 23974 + }, + { + "epoch": 1.942239144523655, + "grad_norm": 0.062373723834753036, + "learning_rate": 0.00011432107655610064, + "loss": 0.2768, + "step": 23975 + }, + { + "epoch": 1.9423201555411536, + "grad_norm": 0.0614754743874073, + "learning_rate": 0.00011431657590350602, + "loss": 0.2646, + "step": 23976 + }, + { + "epoch": 1.942401166558652, + "grad_norm": 0.05733213946223259, + "learning_rate": 0.00011431207525091139, + "loss": 0.2638, + "step": 23977 + }, + { + "epoch": 1.9424821775761503, + "grad_norm": 0.053427550941705704, + "learning_rate": 0.00011430757459831676, + "loss": 0.2055, + "step": 23978 + }, + { + "epoch": 1.9425631885936487, + "grad_norm": 0.06927467882633209, + "learning_rate": 0.00011430307394572212, + "loss": 0.2574, + "step": 23979 + }, + { + "epoch": 1.9426441996111472, + "grad_norm": 0.061050258576869965, + "learning_rate": 0.00011429857329312752, + "loss": 0.2588, + "step": 23980 + }, + { + "epoch": 1.9427252106286454, + "grad_norm": 0.06572843343019485, + "learning_rate": 0.00011429407264053289, + "loss": 0.2726, + "step": 23981 + }, + { + "epoch": 1.942806221646144, + "grad_norm": 0.0645068809390068, + "learning_rate": 0.00011428957198793826, + "loss": 0.2822, + "step": 23982 + }, + { + "epoch": 1.9428872326636424, + "grad_norm": 0.07286891341209412, + "learning_rate": 0.00011428507133534363, + "loss": 0.2695, + "step": 23983 + }, + { + "epoch": 1.9429682436811406, + "grad_norm": 0.06508053094148636, + "learning_rate": 0.000114280570682749, + "loss": 0.2857, + "step": 23984 + }, + { + "epoch": 1.9430492546986389, + "grad_norm": 0.05973774939775467, + "learning_rate": 0.00011427607003015437, + "loss": 0.2527, + "step": 23985 + }, + { + "epoch": 1.9431302657161373, + "grad_norm": 0.057456836104393005, + "learning_rate": 0.00011427156937755977, + "loss": 0.266, + "step": 23986 + }, + { + "epoch": 
1.9432112767336358, + "grad_norm": 0.06490325182676315, + "learning_rate": 0.00011426706872496513, + "loss": 0.2791, + "step": 23987 + }, + { + "epoch": 1.943292287751134, + "grad_norm": 0.07369411736726761, + "learning_rate": 0.0001142625680723705, + "loss": 0.3054, + "step": 23988 + }, + { + "epoch": 1.9433732987686325, + "grad_norm": 0.0538899265229702, + "learning_rate": 0.00011425806741977587, + "loss": 0.2425, + "step": 23989 + }, + { + "epoch": 1.943454309786131, + "grad_norm": 0.05909901484847069, + "learning_rate": 0.00011425356676718125, + "loss": 0.275, + "step": 23990 + }, + { + "epoch": 1.9435353208036292, + "grad_norm": 0.05489085987210274, + "learning_rate": 0.00011424906611458661, + "loss": 0.2841, + "step": 23991 + }, + { + "epoch": 1.9436163318211277, + "grad_norm": 0.05336972326040268, + "learning_rate": 0.00011424456546199201, + "loss": 0.2588, + "step": 23992 + }, + { + "epoch": 1.9436973428386262, + "grad_norm": 0.053867846727371216, + "learning_rate": 0.00011424006480939737, + "loss": 0.2954, + "step": 23993 + }, + { + "epoch": 1.9437783538561244, + "grad_norm": 0.053872622549533844, + "learning_rate": 0.00011423556415680274, + "loss": 0.262, + "step": 23994 + }, + { + "epoch": 1.9438593648736227, + "grad_norm": 0.06062573567032814, + "learning_rate": 0.00011423106350420812, + "loss": 0.3082, + "step": 23995 + }, + { + "epoch": 1.9439403758911213, + "grad_norm": 0.05445629358291626, + "learning_rate": 0.00011422656285161349, + "loss": 0.2608, + "step": 23996 + }, + { + "epoch": 1.9440213869086196, + "grad_norm": 0.060416966676712036, + "learning_rate": 0.00011422206219901885, + "loss": 0.27, + "step": 23997 + }, + { + "epoch": 1.9441023979261178, + "grad_norm": 0.0546443797647953, + "learning_rate": 0.00011421756154642425, + "loss": 0.2463, + "step": 23998 + }, + { + "epoch": 1.9441834089436163, + "grad_norm": 0.054451312869787216, + "learning_rate": 0.00011421306089382961, + "loss": 0.2277, + "step": 23999 + }, + { + "epoch": 1.9442644199611148, + "grad_norm": 0.06382211297750473, + "learning_rate": 0.00011420856024123498, + "loss": 0.236, + "step": 24000 + }, + { + "epoch": 1.944345430978613, + "grad_norm": 0.049225952476263046, + "learning_rate": 0.00011420405958864036, + "loss": 0.2737, + "step": 24001 + }, + { + "epoch": 1.9444264419961115, + "grad_norm": 0.05174494534730911, + "learning_rate": 0.00011419955893604573, + "loss": 0.2588, + "step": 24002 + }, + { + "epoch": 1.94450745301361, + "grad_norm": 0.05987345799803734, + "learning_rate": 0.00011419505828345109, + "loss": 0.2853, + "step": 24003 + }, + { + "epoch": 1.9445884640311082, + "grad_norm": 0.05934160202741623, + "learning_rate": 0.00011419055763085649, + "loss": 0.2377, + "step": 24004 + }, + { + "epoch": 1.9446694750486067, + "grad_norm": 0.058130960911512375, + "learning_rate": 0.00011418605697826185, + "loss": 0.2487, + "step": 24005 + }, + { + "epoch": 1.9447504860661051, + "grad_norm": 0.052440762519836426, + "learning_rate": 0.00011418155632566723, + "loss": 0.2556, + "step": 24006 + }, + { + "epoch": 1.9448314970836034, + "grad_norm": 0.06635487079620361, + "learning_rate": 0.0001141770556730726, + "loss": 0.3016, + "step": 24007 + }, + { + "epoch": 1.9449125081011016, + "grad_norm": 0.04897871986031532, + "learning_rate": 0.00011417255502047797, + "loss": 0.2753, + "step": 24008 + }, + { + "epoch": 1.9449935191186, + "grad_norm": 0.058619424700737, + "learning_rate": 0.00011416805436788336, + "loss": 0.2035, + "step": 24009 + }, + { + "epoch": 1.9450745301360985, + "grad_norm": 
0.04933502897620201, + "learning_rate": 0.00011416355371528873, + "loss": 0.2622, + "step": 24010 + }, + { + "epoch": 1.9451555411535968, + "grad_norm": 0.044737957417964935, + "learning_rate": 0.00011415905306269409, + "loss": 0.239, + "step": 24011 + }, + { + "epoch": 1.9452365521710953, + "grad_norm": 0.06307195872068405, + "learning_rate": 0.00011415455241009947, + "loss": 0.2677, + "step": 24012 + }, + { + "epoch": 1.9453175631885937, + "grad_norm": 0.053666554391384125, + "learning_rate": 0.00011415005175750484, + "loss": 0.2509, + "step": 24013 + }, + { + "epoch": 1.945398574206092, + "grad_norm": 0.060802605003118515, + "learning_rate": 0.00011414555110491021, + "loss": 0.2838, + "step": 24014 + }, + { + "epoch": 1.9454795852235904, + "grad_norm": 0.05955837294459343, + "learning_rate": 0.0001141410504523156, + "loss": 0.2905, + "step": 24015 + }, + { + "epoch": 1.945560596241089, + "grad_norm": 0.06257221102714539, + "learning_rate": 0.00011413654979972097, + "loss": 0.2567, + "step": 24016 + }, + { + "epoch": 1.9456416072585871, + "grad_norm": 0.06320682168006897, + "learning_rate": 0.00011413204914712633, + "loss": 0.3398, + "step": 24017 + }, + { + "epoch": 1.9457226182760854, + "grad_norm": 0.06431394815444946, + "learning_rate": 0.00011412754849453171, + "loss": 0.2492, + "step": 24018 + }, + { + "epoch": 1.945803629293584, + "grad_norm": 0.0520576536655426, + "learning_rate": 0.00011412304784193708, + "loss": 0.2671, + "step": 24019 + }, + { + "epoch": 1.9458846403110823, + "grad_norm": 0.06366564333438873, + "learning_rate": 0.00011411854718934246, + "loss": 0.2465, + "step": 24020 + }, + { + "epoch": 1.9459656513285806, + "grad_norm": 0.05639234557747841, + "learning_rate": 0.00011411404653674784, + "loss": 0.2627, + "step": 24021 + }, + { + "epoch": 1.946046662346079, + "grad_norm": 0.05050186067819595, + "learning_rate": 0.00011410954588415322, + "loss": 0.2566, + "step": 24022 + }, + { + "epoch": 1.9461276733635775, + "grad_norm": 0.06824922561645508, + "learning_rate": 0.00011410504523155858, + "loss": 0.2542, + "step": 24023 + }, + { + "epoch": 1.9462086843810757, + "grad_norm": 0.048303570598363876, + "learning_rate": 0.00011410054457896395, + "loss": 0.2629, + "step": 24024 + }, + { + "epoch": 1.9462896953985742, + "grad_norm": 0.058540135622024536, + "learning_rate": 0.00011409604392636932, + "loss": 0.2789, + "step": 24025 + }, + { + "epoch": 1.9463707064160727, + "grad_norm": 0.0661383792757988, + "learning_rate": 0.0001140915432737747, + "loss": 0.245, + "step": 24026 + }, + { + "epoch": 1.946451717433571, + "grad_norm": 0.061160411685705185, + "learning_rate": 0.00011408704262118008, + "loss": 0.2163, + "step": 24027 + }, + { + "epoch": 1.9465327284510694, + "grad_norm": 0.056919898837804794, + "learning_rate": 0.00011408254196858546, + "loss": 0.2187, + "step": 24028 + }, + { + "epoch": 1.9466137394685679, + "grad_norm": 0.06528938561677933, + "learning_rate": 0.00011407804131599082, + "loss": 0.2884, + "step": 24029 + }, + { + "epoch": 1.946694750486066, + "grad_norm": 0.05889112129807472, + "learning_rate": 0.00011407354066339619, + "loss": 0.3225, + "step": 24030 + }, + { + "epoch": 1.9467757615035644, + "grad_norm": 0.061333924531936646, + "learning_rate": 0.00011406904001080157, + "loss": 0.2619, + "step": 24031 + }, + { + "epoch": 1.9468567725210628, + "grad_norm": 0.07009389251470566, + "learning_rate": 0.00011406453935820694, + "loss": 0.2613, + "step": 24032 + }, + { + "epoch": 1.9469377835385613, + "grad_norm": 0.06943484395742416, + "learning_rate": 
0.00011406003870561233, + "loss": 0.256, + "step": 24033 + }, + { + "epoch": 1.9470187945560595, + "grad_norm": 0.04996665567159653, + "learning_rate": 0.0001140555380530177, + "loss": 0.2415, + "step": 24034 + }, + { + "epoch": 1.947099805573558, + "grad_norm": 0.05447015538811684, + "learning_rate": 0.00011405103740042307, + "loss": 0.2911, + "step": 24035 + }, + { + "epoch": 1.9471808165910565, + "grad_norm": 0.05733644962310791, + "learning_rate": 0.00011404653674782843, + "loss": 0.2754, + "step": 24036 + }, + { + "epoch": 1.9472618276085547, + "grad_norm": 0.061473965644836426, + "learning_rate": 0.00011404203609523381, + "loss": 0.2768, + "step": 24037 + }, + { + "epoch": 1.9473428386260532, + "grad_norm": 0.047736383974552155, + "learning_rate": 0.0001140375354426392, + "loss": 0.2346, + "step": 24038 + }, + { + "epoch": 1.9474238496435516, + "grad_norm": 0.06296089291572571, + "learning_rate": 0.00011403303479004457, + "loss": 0.2654, + "step": 24039 + }, + { + "epoch": 1.9475048606610499, + "grad_norm": 0.061912525445222855, + "learning_rate": 0.00011402853413744994, + "loss": 0.2574, + "step": 24040 + }, + { + "epoch": 1.9475858716785481, + "grad_norm": 0.07144110649824142, + "learning_rate": 0.00011402403348485531, + "loss": 0.2667, + "step": 24041 + }, + { + "epoch": 1.9476668826960468, + "grad_norm": 0.06251692026853561, + "learning_rate": 0.00011401953283226067, + "loss": 0.2476, + "step": 24042 + }, + { + "epoch": 1.947747893713545, + "grad_norm": 0.06474420428276062, + "learning_rate": 0.00011401503217966605, + "loss": 0.2716, + "step": 24043 + }, + { + "epoch": 1.9478289047310433, + "grad_norm": 0.061901070177555084, + "learning_rate": 0.00011401053152707144, + "loss": 0.2543, + "step": 24044 + }, + { + "epoch": 1.9479099157485418, + "grad_norm": 0.06367991119623184, + "learning_rate": 0.00011400603087447681, + "loss": 0.2544, + "step": 24045 + }, + { + "epoch": 1.9479909267660402, + "grad_norm": 0.060065027326345444, + "learning_rate": 0.00011400153022188218, + "loss": 0.2548, + "step": 24046 + }, + { + "epoch": 1.9480719377835385, + "grad_norm": 0.049185190349817276, + "learning_rate": 0.00011399702956928756, + "loss": 0.2562, + "step": 24047 + }, + { + "epoch": 1.948152948801037, + "grad_norm": 0.05605795979499817, + "learning_rate": 0.00011399252891669292, + "loss": 0.2622, + "step": 24048 + }, + { + "epoch": 1.9482339598185354, + "grad_norm": 0.06234635040163994, + "learning_rate": 0.00011398802826409829, + "loss": 0.277, + "step": 24049 + }, + { + "epoch": 1.9483149708360337, + "grad_norm": 0.06847386807203293, + "learning_rate": 0.00011398352761150368, + "loss": 0.2943, + "step": 24050 + }, + { + "epoch": 1.9483959818535321, + "grad_norm": 0.06446881592273712, + "learning_rate": 0.00011397902695890905, + "loss": 0.2821, + "step": 24051 + }, + { + "epoch": 1.9484769928710306, + "grad_norm": 0.05356021970510483, + "learning_rate": 0.00011397452630631442, + "loss": 0.2891, + "step": 24052 + }, + { + "epoch": 1.9485580038885288, + "grad_norm": 0.08012793958187103, + "learning_rate": 0.0001139700256537198, + "loss": 0.2671, + "step": 24053 + }, + { + "epoch": 1.948639014906027, + "grad_norm": 0.04903095215559006, + "learning_rate": 0.00011396552500112516, + "loss": 0.2388, + "step": 24054 + }, + { + "epoch": 1.9487200259235256, + "grad_norm": 0.056451983749866486, + "learning_rate": 0.00011396102434853053, + "loss": 0.2749, + "step": 24055 + }, + { + "epoch": 1.948801036941024, + "grad_norm": 0.06300033628940582, + "learning_rate": 0.00011395652369593592, + "loss": 
0.3084, + "step": 24056 + }, + { + "epoch": 1.9488820479585223, + "grad_norm": 0.05597818270325661, + "learning_rate": 0.00011395202304334129, + "loss": 0.2499, + "step": 24057 + }, + { + "epoch": 1.9489630589760207, + "grad_norm": 0.05621341988444328, + "learning_rate": 0.00011394752239074667, + "loss": 0.2607, + "step": 24058 + }, + { + "epoch": 1.9490440699935192, + "grad_norm": 0.05270567163825035, + "learning_rate": 0.00011394302173815204, + "loss": 0.253, + "step": 24059 + }, + { + "epoch": 1.9491250810110174, + "grad_norm": 0.07046864181756973, + "learning_rate": 0.0001139385210855574, + "loss": 0.3114, + "step": 24060 + }, + { + "epoch": 1.949206092028516, + "grad_norm": 0.05155371129512787, + "learning_rate": 0.0001139340204329628, + "loss": 0.2617, + "step": 24061 + }, + { + "epoch": 1.9492871030460144, + "grad_norm": 0.06104248762130737, + "learning_rate": 0.00011392951978036816, + "loss": 0.2578, + "step": 24062 + }, + { + "epoch": 1.9493681140635126, + "grad_norm": 0.05884503200650215, + "learning_rate": 0.00011392501912777353, + "loss": 0.2704, + "step": 24063 + }, + { + "epoch": 1.9494491250810109, + "grad_norm": 0.0626462921500206, + "learning_rate": 0.00011392051847517891, + "loss": 0.2998, + "step": 24064 + }, + { + "epoch": 1.9495301360985096, + "grad_norm": 0.06282604485750198, + "learning_rate": 0.00011391601782258428, + "loss": 0.2836, + "step": 24065 + }, + { + "epoch": 1.9496111471160078, + "grad_norm": 0.05556776747107506, + "learning_rate": 0.00011391151716998964, + "loss": 0.2542, + "step": 24066 + }, + { + "epoch": 1.949692158133506, + "grad_norm": 0.07514247298240662, + "learning_rate": 0.00011390701651739504, + "loss": 0.2671, + "step": 24067 + }, + { + "epoch": 1.9497731691510045, + "grad_norm": 0.07319103181362152, + "learning_rate": 0.0001139025158648004, + "loss": 0.253, + "step": 24068 + }, + { + "epoch": 1.949854180168503, + "grad_norm": 0.07046500593423843, + "learning_rate": 0.00011389801521220578, + "loss": 0.3138, + "step": 24069 + }, + { + "epoch": 1.9499351911860012, + "grad_norm": 0.060062143951654434, + "learning_rate": 0.00011389351455961115, + "loss": 0.2599, + "step": 24070 + }, + { + "epoch": 1.9500162022034997, + "grad_norm": 0.06474334746599197, + "learning_rate": 0.00011388901390701652, + "loss": 0.2578, + "step": 24071 + }, + { + "epoch": 1.9500972132209982, + "grad_norm": 0.06309830397367477, + "learning_rate": 0.00011388451325442188, + "loss": 0.2858, + "step": 24072 + }, + { + "epoch": 1.9501782242384964, + "grad_norm": 0.061201080679893494, + "learning_rate": 0.00011388001260182728, + "loss": 0.2555, + "step": 24073 + }, + { + "epoch": 1.9502592352559946, + "grad_norm": 0.05632198601961136, + "learning_rate": 0.00011387551194923264, + "loss": 0.2351, + "step": 24074 + }, + { + "epoch": 1.9503402462734933, + "grad_norm": 0.05575885251164436, + "learning_rate": 0.00011387101129663802, + "loss": 0.3062, + "step": 24075 + }, + { + "epoch": 1.9504212572909916, + "grad_norm": 0.07558295130729675, + "learning_rate": 0.00011386651064404339, + "loss": 0.2967, + "step": 24076 + }, + { + "epoch": 1.9505022683084898, + "grad_norm": 0.07199109345674515, + "learning_rate": 0.00011386200999144876, + "loss": 0.2833, + "step": 24077 + }, + { + "epoch": 1.9505832793259883, + "grad_norm": 0.06253136694431305, + "learning_rate": 0.00011385750933885412, + "loss": 0.331, + "step": 24078 + }, + { + "epoch": 1.9506642903434868, + "grad_norm": 0.06450769305229187, + "learning_rate": 0.00011385300868625953, + "loss": 0.292, + "step": 24079 + }, + { + "epoch": 
1.950745301360985, + "grad_norm": 0.04632382467389107, + "learning_rate": 0.00011384850803366489, + "loss": 0.2592, + "step": 24080 + }, + { + "epoch": 1.9508263123784835, + "grad_norm": 0.055005017668008804, + "learning_rate": 0.00011384400738107026, + "loss": 0.2463, + "step": 24081 + }, + { + "epoch": 1.950907323395982, + "grad_norm": 0.06214490160346031, + "learning_rate": 0.00011383950672847563, + "loss": 0.2709, + "step": 24082 + }, + { + "epoch": 1.9509883344134802, + "grad_norm": 0.04313374310731888, + "learning_rate": 0.000113835006075881, + "loss": 0.2247, + "step": 24083 + }, + { + "epoch": 1.9510693454309787, + "grad_norm": 0.053844235837459564, + "learning_rate": 0.00011383050542328637, + "loss": 0.2478, + "step": 24084 + }, + { + "epoch": 1.9511503564484771, + "grad_norm": 0.05828912928700447, + "learning_rate": 0.00011382600477069177, + "loss": 0.2478, + "step": 24085 + }, + { + "epoch": 1.9512313674659754, + "grad_norm": 0.049257442355155945, + "learning_rate": 0.00011382150411809713, + "loss": 0.2616, + "step": 24086 + }, + { + "epoch": 1.9513123784834736, + "grad_norm": 0.06783261895179749, + "learning_rate": 0.0001138170034655025, + "loss": 0.2901, + "step": 24087 + }, + { + "epoch": 1.9513933895009723, + "grad_norm": 0.05653469264507294, + "learning_rate": 0.00011381250281290787, + "loss": 0.2835, + "step": 24088 + }, + { + "epoch": 1.9514744005184705, + "grad_norm": 0.054393965750932693, + "learning_rate": 0.00011380800216031325, + "loss": 0.2645, + "step": 24089 + }, + { + "epoch": 1.9515554115359688, + "grad_norm": 0.06505770981311798, + "learning_rate": 0.00011380350150771863, + "loss": 0.2834, + "step": 24090 + }, + { + "epoch": 1.9516364225534673, + "grad_norm": 0.06360580027103424, + "learning_rate": 0.00011379900085512401, + "loss": 0.2592, + "step": 24091 + }, + { + "epoch": 1.9517174335709657, + "grad_norm": 0.06400442868471146, + "learning_rate": 0.00011379450020252937, + "loss": 0.2857, + "step": 24092 + }, + { + "epoch": 1.951798444588464, + "grad_norm": 0.05865645036101341, + "learning_rate": 0.00011378999954993474, + "loss": 0.2713, + "step": 24093 + }, + { + "epoch": 1.9518794556059624, + "grad_norm": 0.06572428345680237, + "learning_rate": 0.00011378549889734012, + "loss": 0.264, + "step": 24094 + }, + { + "epoch": 1.951960466623461, + "grad_norm": 0.05528400465846062, + "learning_rate": 0.00011378099824474549, + "loss": 0.2675, + "step": 24095 + }, + { + "epoch": 1.9520414776409591, + "grad_norm": 0.06614361703395844, + "learning_rate": 0.00011377649759215088, + "loss": 0.294, + "step": 24096 + }, + { + "epoch": 1.9521224886584574, + "grad_norm": 0.05867353081703186, + "learning_rate": 0.00011377199693955625, + "loss": 0.2546, + "step": 24097 + }, + { + "epoch": 1.952203499675956, + "grad_norm": 0.06184687092900276, + "learning_rate": 0.00011376749628696161, + "loss": 0.2794, + "step": 24098 + }, + { + "epoch": 1.9522845106934543, + "grad_norm": 0.05994020774960518, + "learning_rate": 0.00011376299563436698, + "loss": 0.2637, + "step": 24099 + }, + { + "epoch": 1.9523655217109526, + "grad_norm": 0.05696763098239899, + "learning_rate": 0.00011375849498177236, + "loss": 0.31, + "step": 24100 + }, + { + "epoch": 1.952446532728451, + "grad_norm": 0.058388851583004, + "learning_rate": 0.00011375399432917773, + "loss": 0.2826, + "step": 24101 + }, + { + "epoch": 1.9525275437459495, + "grad_norm": 0.05529080703854561, + "learning_rate": 0.00011374949367658312, + "loss": 0.2612, + "step": 24102 + }, + { + "epoch": 1.9526085547634477, + "grad_norm": 
0.05322407931089401, + "learning_rate": 0.00011374499302398849, + "loss": 0.2918, + "step": 24103 + }, + { + "epoch": 1.9526895657809462, + "grad_norm": 0.057907380163669586, + "learning_rate": 0.00011374049237139387, + "loss": 0.2858, + "step": 24104 + }, + { + "epoch": 1.9527705767984447, + "grad_norm": 0.059196989983320236, + "learning_rate": 0.00011373599171879923, + "loss": 0.2977, + "step": 24105 + }, + { + "epoch": 1.952851587815943, + "grad_norm": 0.06501670926809311, + "learning_rate": 0.0001137314910662046, + "loss": 0.2796, + "step": 24106 + }, + { + "epoch": 1.9529325988334414, + "grad_norm": 0.060437873005867004, + "learning_rate": 0.00011372699041360997, + "loss": 0.2585, + "step": 24107 + }, + { + "epoch": 1.9530136098509399, + "grad_norm": 0.057793036103248596, + "learning_rate": 0.00011372248976101536, + "loss": 0.2585, + "step": 24108 + }, + { + "epoch": 1.953094620868438, + "grad_norm": 0.06259094923734665, + "learning_rate": 0.00011371798910842073, + "loss": 0.2546, + "step": 24109 + }, + { + "epoch": 1.9531756318859363, + "grad_norm": 0.055709585547447205, + "learning_rate": 0.00011371348845582611, + "loss": 0.2675, + "step": 24110 + }, + { + "epoch": 1.9532566429034348, + "grad_norm": 0.05864265188574791, + "learning_rate": 0.00011370898780323147, + "loss": 0.2846, + "step": 24111 + }, + { + "epoch": 1.9533376539209333, + "grad_norm": 0.07006549835205078, + "learning_rate": 0.00011370448715063684, + "loss": 0.2695, + "step": 24112 + }, + { + "epoch": 1.9534186649384315, + "grad_norm": 0.07488393038511276, + "learning_rate": 0.00011369998649804223, + "loss": 0.3068, + "step": 24113 + }, + { + "epoch": 1.95349967595593, + "grad_norm": 0.067337766289711, + "learning_rate": 0.0001136954858454476, + "loss": 0.3021, + "step": 24114 + }, + { + "epoch": 1.9535806869734285, + "grad_norm": 0.0490729995071888, + "learning_rate": 0.00011369098519285297, + "loss": 0.2364, + "step": 24115 + }, + { + "epoch": 1.9536616979909267, + "grad_norm": 0.05571191385388374, + "learning_rate": 0.00011368648454025835, + "loss": 0.2307, + "step": 24116 + }, + { + "epoch": 1.9537427090084252, + "grad_norm": 0.05138971656560898, + "learning_rate": 0.00011368198388766371, + "loss": 0.2702, + "step": 24117 + }, + { + "epoch": 1.9538237200259236, + "grad_norm": 0.06012196093797684, + "learning_rate": 0.00011367748323506908, + "loss": 0.2532, + "step": 24118 + }, + { + "epoch": 1.9539047310434219, + "grad_norm": 0.0667654424905777, + "learning_rate": 0.00011367298258247447, + "loss": 0.309, + "step": 24119 + }, + { + "epoch": 1.9539857420609201, + "grad_norm": 0.061375852674245834, + "learning_rate": 0.00011366848192987984, + "loss": 0.2537, + "step": 24120 + }, + { + "epoch": 1.9540667530784188, + "grad_norm": 0.06362592428922653, + "learning_rate": 0.00011366398127728522, + "loss": 0.2813, + "step": 24121 + }, + { + "epoch": 1.954147764095917, + "grad_norm": 0.057681769132614136, + "learning_rate": 0.00011365948062469059, + "loss": 0.2756, + "step": 24122 + }, + { + "epoch": 1.9542287751134153, + "grad_norm": 0.060051437467336655, + "learning_rate": 0.00011365497997209595, + "loss": 0.275, + "step": 24123 + }, + { + "epoch": 1.9543097861309138, + "grad_norm": 0.07187920063734055, + "learning_rate": 0.00011365047931950132, + "loss": 0.3165, + "step": 24124 + }, + { + "epoch": 1.9543907971484122, + "grad_norm": 0.062115393579006195, + "learning_rate": 0.00011364597866690671, + "loss": 0.309, + "step": 24125 + }, + { + "epoch": 1.9544718081659105, + "grad_norm": 0.07206244021654129, + "learning_rate": 
0.00011364147801431208, + "loss": 0.2352, + "step": 24126 + }, + { + "epoch": 1.954552819183409, + "grad_norm": 0.061783768236637115, + "learning_rate": 0.00011363697736171746, + "loss": 0.2734, + "step": 24127 + }, + { + "epoch": 1.9546338302009074, + "grad_norm": 0.05878418684005737, + "learning_rate": 0.00011363247670912283, + "loss": 0.2769, + "step": 24128 + }, + { + "epoch": 1.9547148412184057, + "grad_norm": 0.05013301968574524, + "learning_rate": 0.00011362797605652819, + "loss": 0.2365, + "step": 24129 + }, + { + "epoch": 1.9547958522359041, + "grad_norm": 0.06090492755174637, + "learning_rate": 0.00011362347540393357, + "loss": 0.2881, + "step": 24130 + }, + { + "epoch": 1.9548768632534026, + "grad_norm": 0.066561758518219, + "learning_rate": 0.00011361897475133895, + "loss": 0.2439, + "step": 24131 + }, + { + "epoch": 1.9549578742709008, + "grad_norm": 0.05893224850296974, + "learning_rate": 0.00011361447409874433, + "loss": 0.2768, + "step": 24132 + }, + { + "epoch": 1.955038885288399, + "grad_norm": 0.056500572711229324, + "learning_rate": 0.0001136099734461497, + "loss": 0.3002, + "step": 24133 + }, + { + "epoch": 1.9551198963058976, + "grad_norm": 0.055701520293951035, + "learning_rate": 0.00011360547279355507, + "loss": 0.266, + "step": 24134 + }, + { + "epoch": 1.955200907323396, + "grad_norm": 0.07285144925117493, + "learning_rate": 0.00011360097214096043, + "loss": 0.2558, + "step": 24135 + }, + { + "epoch": 1.9552819183408943, + "grad_norm": 0.05705364793539047, + "learning_rate": 0.00011359647148836581, + "loss": 0.2942, + "step": 24136 + }, + { + "epoch": 1.9553629293583927, + "grad_norm": 0.057131167501211166, + "learning_rate": 0.0001135919708357712, + "loss": 0.2659, + "step": 24137 + }, + { + "epoch": 1.9554439403758912, + "grad_norm": 0.058022141456604004, + "learning_rate": 0.00011358747018317657, + "loss": 0.2422, + "step": 24138 + }, + { + "epoch": 1.9555249513933894, + "grad_norm": 0.05680813640356064, + "learning_rate": 0.00011358296953058194, + "loss": 0.2514, + "step": 24139 + }, + { + "epoch": 1.955605962410888, + "grad_norm": 0.05137019604444504, + "learning_rate": 0.00011357846887798732, + "loss": 0.2994, + "step": 24140 + }, + { + "epoch": 1.9556869734283864, + "grad_norm": 0.0479348823428154, + "learning_rate": 0.00011357396822539268, + "loss": 0.2255, + "step": 24141 + }, + { + "epoch": 1.9557679844458846, + "grad_norm": 0.06023770943284035, + "learning_rate": 0.00011356946757279808, + "loss": 0.3027, + "step": 24142 + }, + { + "epoch": 1.9558489954633829, + "grad_norm": 0.05584618076682091, + "learning_rate": 0.00011356496692020344, + "loss": 0.2731, + "step": 24143 + }, + { + "epoch": 1.9559300064808816, + "grad_norm": 0.05602271482348442, + "learning_rate": 0.00011356046626760881, + "loss": 0.2554, + "step": 24144 + }, + { + "epoch": 1.9560110174983798, + "grad_norm": 0.05980118364095688, + "learning_rate": 0.00011355596561501418, + "loss": 0.2706, + "step": 24145 + }, + { + "epoch": 1.956092028515878, + "grad_norm": 0.04484741389751434, + "learning_rate": 0.00011355146496241956, + "loss": 0.2174, + "step": 24146 + }, + { + "epoch": 1.9561730395333765, + "grad_norm": 0.08422911912202835, + "learning_rate": 0.00011354696430982492, + "loss": 0.3119, + "step": 24147 + }, + { + "epoch": 1.956254050550875, + "grad_norm": 0.06032005324959755, + "learning_rate": 0.00011354246365723032, + "loss": 0.2296, + "step": 24148 + }, + { + "epoch": 1.9563350615683732, + "grad_norm": 0.0618845634162426, + "learning_rate": 0.00011353796300463568, + "loss": 0.2492, + 
"step": 24149 + }, + { + "epoch": 1.9564160725858717, + "grad_norm": 0.058529820293188095, + "learning_rate": 0.00011353346235204105, + "loss": 0.2588, + "step": 24150 + }, + { + "epoch": 1.9564970836033702, + "grad_norm": 0.059740472584962845, + "learning_rate": 0.00011352896169944642, + "loss": 0.2709, + "step": 24151 + }, + { + "epoch": 1.9565780946208684, + "grad_norm": 0.056812260299921036, + "learning_rate": 0.0001135244610468518, + "loss": 0.2424, + "step": 24152 + }, + { + "epoch": 1.9566591056383669, + "grad_norm": 0.04421933740377426, + "learning_rate": 0.00011351996039425716, + "loss": 0.2248, + "step": 24153 + }, + { + "epoch": 1.9567401166558653, + "grad_norm": 0.05292237550020218, + "learning_rate": 0.00011351545974166256, + "loss": 0.2809, + "step": 24154 + }, + { + "epoch": 1.9568211276733636, + "grad_norm": 0.054508842527866364, + "learning_rate": 0.00011351095908906792, + "loss": 0.2455, + "step": 24155 + }, + { + "epoch": 1.9569021386908618, + "grad_norm": 0.06526229530572891, + "learning_rate": 0.00011350645843647329, + "loss": 0.2686, + "step": 24156 + }, + { + "epoch": 1.9569831497083603, + "grad_norm": 0.047157347202301025, + "learning_rate": 0.00011350195778387867, + "loss": 0.2593, + "step": 24157 + }, + { + "epoch": 1.9570641607258588, + "grad_norm": 0.06563350558280945, + "learning_rate": 0.00011349745713128404, + "loss": 0.2474, + "step": 24158 + }, + { + "epoch": 1.957145171743357, + "grad_norm": 0.058557942509651184, + "learning_rate": 0.0001134929564786894, + "loss": 0.2587, + "step": 24159 + }, + { + "epoch": 1.9572261827608555, + "grad_norm": 0.05948895588517189, + "learning_rate": 0.0001134884558260948, + "loss": 0.259, + "step": 24160 + }, + { + "epoch": 1.957307193778354, + "grad_norm": 0.06698385626077652, + "learning_rate": 0.00011348395517350016, + "loss": 0.2659, + "step": 24161 + }, + { + "epoch": 1.9573882047958522, + "grad_norm": 0.06790705025196075, + "learning_rate": 0.00011347945452090553, + "loss": 0.265, + "step": 24162 + }, + { + "epoch": 1.9574692158133506, + "grad_norm": 0.06589134782552719, + "learning_rate": 0.00011347495386831091, + "loss": 0.3139, + "step": 24163 + }, + { + "epoch": 1.9575502268308491, + "grad_norm": 0.048939596861600876, + "learning_rate": 0.00011347045321571628, + "loss": 0.2322, + "step": 24164 + }, + { + "epoch": 1.9576312378483474, + "grad_norm": 0.05785316601395607, + "learning_rate": 0.00011346595256312167, + "loss": 0.286, + "step": 24165 + }, + { + "epoch": 1.9577122488658456, + "grad_norm": 0.05657365918159485, + "learning_rate": 0.00011346145191052704, + "loss": 0.2624, + "step": 24166 + }, + { + "epoch": 1.9577932598833443, + "grad_norm": 0.052017692476511, + "learning_rate": 0.0001134569512579324, + "loss": 0.2513, + "step": 24167 + }, + { + "epoch": 1.9578742709008425, + "grad_norm": 0.059289928525686264, + "learning_rate": 0.00011345245060533778, + "loss": 0.2915, + "step": 24168 + }, + { + "epoch": 1.9579552819183408, + "grad_norm": 0.05871990695595741, + "learning_rate": 0.00011344794995274315, + "loss": 0.2953, + "step": 24169 + }, + { + "epoch": 1.9580362929358393, + "grad_norm": 0.05929981544613838, + "learning_rate": 0.00011344344930014852, + "loss": 0.2598, + "step": 24170 + }, + { + "epoch": 1.9581173039533377, + "grad_norm": 0.06190398707985878, + "learning_rate": 0.00011343894864755391, + "loss": 0.2833, + "step": 24171 + }, + { + "epoch": 1.958198314970836, + "grad_norm": 0.05891365930438042, + "learning_rate": 0.00011343444799495928, + "loss": 0.2244, + "step": 24172 + }, + { + "epoch": 
1.9582793259883344, + "grad_norm": 0.04958629980683327, + "learning_rate": 0.00011342994734236466, + "loss": 0.2591, + "step": 24173 + }, + { + "epoch": 1.958360337005833, + "grad_norm": 0.05412125959992409, + "learning_rate": 0.00011342544668977002, + "loss": 0.2543, + "step": 24174 + }, + { + "epoch": 1.9584413480233311, + "grad_norm": 0.06268096715211868, + "learning_rate": 0.00011342094603717539, + "loss": 0.2434, + "step": 24175 + }, + { + "epoch": 1.9585223590408296, + "grad_norm": 0.0730811133980751, + "learning_rate": 0.00011341644538458076, + "loss": 0.2657, + "step": 24176 + }, + { + "epoch": 1.958603370058328, + "grad_norm": 0.062305256724357605, + "learning_rate": 0.00011341194473198615, + "loss": 0.2843, + "step": 24177 + }, + { + "epoch": 1.9586843810758263, + "grad_norm": 0.060018282383680344, + "learning_rate": 0.00011340744407939153, + "loss": 0.2779, + "step": 24178 + }, + { + "epoch": 1.9587653920933246, + "grad_norm": 0.06363444775342941, + "learning_rate": 0.0001134029434267969, + "loss": 0.2455, + "step": 24179 + }, + { + "epoch": 1.958846403110823, + "grad_norm": 0.06346273422241211, + "learning_rate": 0.00011339844277420226, + "loss": 0.2725, + "step": 24180 + }, + { + "epoch": 1.9589274141283215, + "grad_norm": 0.05639772117137909, + "learning_rate": 0.00011339394212160763, + "loss": 0.2455, + "step": 24181 + }, + { + "epoch": 1.9590084251458197, + "grad_norm": 0.07999150454998016, + "learning_rate": 0.000113389441469013, + "loss": 0.2761, + "step": 24182 + }, + { + "epoch": 1.9590894361633182, + "grad_norm": 0.054590802639722824, + "learning_rate": 0.0001133849408164184, + "loss": 0.2769, + "step": 24183 + }, + { + "epoch": 1.9591704471808167, + "grad_norm": 0.055199988186359406, + "learning_rate": 0.00011338044016382377, + "loss": 0.2401, + "step": 24184 + }, + { + "epoch": 1.959251458198315, + "grad_norm": 0.08141867816448212, + "learning_rate": 0.00011337593951122914, + "loss": 0.2864, + "step": 24185 + }, + { + "epoch": 1.9593324692158134, + "grad_norm": 0.05465199425816536, + "learning_rate": 0.0001133714388586345, + "loss": 0.2519, + "step": 24186 + }, + { + "epoch": 1.9594134802333119, + "grad_norm": 0.0712234228849411, + "learning_rate": 0.00011336693820603987, + "loss": 0.2858, + "step": 24187 + }, + { + "epoch": 1.95949449125081, + "grad_norm": 0.058448389172554016, + "learning_rate": 0.00011336243755344525, + "loss": 0.2639, + "step": 24188 + }, + { + "epoch": 1.9595755022683083, + "grad_norm": 0.06451912224292755, + "learning_rate": 0.00011335793690085064, + "loss": 0.2741, + "step": 24189 + }, + { + "epoch": 1.959656513285807, + "grad_norm": 0.06475438922643661, + "learning_rate": 0.00011335343624825601, + "loss": 0.252, + "step": 24190 + }, + { + "epoch": 1.9597375243033053, + "grad_norm": 0.0741402730345726, + "learning_rate": 0.00011334893559566138, + "loss": 0.2449, + "step": 24191 + }, + { + "epoch": 1.9598185353208035, + "grad_norm": 0.0444721020758152, + "learning_rate": 0.00011334443494306674, + "loss": 0.263, + "step": 24192 + }, + { + "epoch": 1.959899546338302, + "grad_norm": 0.07959980517625809, + "learning_rate": 0.00011333993429047212, + "loss": 0.2654, + "step": 24193 + }, + { + "epoch": 1.9599805573558005, + "grad_norm": 0.05834224820137024, + "learning_rate": 0.0001133354336378775, + "loss": 0.2734, + "step": 24194 + }, + { + "epoch": 1.9600615683732987, + "grad_norm": 0.053961653262376785, + "learning_rate": 0.00011333093298528288, + "loss": 0.2372, + "step": 24195 + }, + { + "epoch": 1.9601425793907972, + "grad_norm": 
0.04977266862988472, + "learning_rate": 0.00011332643233268825, + "loss": 0.2533, + "step": 24196 + }, + { + "epoch": 1.9602235904082956, + "grad_norm": 0.0665312260389328, + "learning_rate": 0.00011332193168009362, + "loss": 0.2908, + "step": 24197 + }, + { + "epoch": 1.9603046014257939, + "grad_norm": 0.062488801777362823, + "learning_rate": 0.00011331743102749898, + "loss": 0.2682, + "step": 24198 + }, + { + "epoch": 1.9603856124432921, + "grad_norm": 0.05820208042860031, + "learning_rate": 0.00011331293037490436, + "loss": 0.2597, + "step": 24199 + }, + { + "epoch": 1.9604666234607908, + "grad_norm": 0.06187162548303604, + "learning_rate": 0.00011330842972230974, + "loss": 0.246, + "step": 24200 + }, + { + "epoch": 1.960547634478289, + "grad_norm": 0.050065185874700546, + "learning_rate": 0.00011330392906971512, + "loss": 0.2378, + "step": 24201 + }, + { + "epoch": 1.9606286454957873, + "grad_norm": 0.04843199625611305, + "learning_rate": 0.00011329942841712049, + "loss": 0.241, + "step": 24202 + }, + { + "epoch": 1.9607096565132858, + "grad_norm": 0.05734392628073692, + "learning_rate": 0.00011329492776452587, + "loss": 0.2769, + "step": 24203 + }, + { + "epoch": 1.9607906675307842, + "grad_norm": 0.059642255306243896, + "learning_rate": 0.00011329042711193123, + "loss": 0.2737, + "step": 24204 + }, + { + "epoch": 1.9608716785482825, + "grad_norm": 0.05199331417679787, + "learning_rate": 0.0001132859264593366, + "loss": 0.3023, + "step": 24205 + }, + { + "epoch": 1.960952689565781, + "grad_norm": 0.058106642216444016, + "learning_rate": 0.00011328142580674199, + "loss": 0.2491, + "step": 24206 + }, + { + "epoch": 1.9610337005832794, + "grad_norm": 0.06125922128558159, + "learning_rate": 0.00011327692515414736, + "loss": 0.256, + "step": 24207 + }, + { + "epoch": 1.9611147116007777, + "grad_norm": 0.05933903902769089, + "learning_rate": 0.00011327242450155273, + "loss": 0.2476, + "step": 24208 + }, + { + "epoch": 1.9611957226182761, + "grad_norm": 0.07027129083871841, + "learning_rate": 0.00011326792384895811, + "loss": 0.2797, + "step": 24209 + }, + { + "epoch": 1.9612767336357746, + "grad_norm": 0.06699054688215256, + "learning_rate": 0.00011326342319636347, + "loss": 0.2781, + "step": 24210 + }, + { + "epoch": 1.9613577446532728, + "grad_norm": 0.06571625918149948, + "learning_rate": 0.00011325892254376884, + "loss": 0.2325, + "step": 24211 + }, + { + "epoch": 1.961438755670771, + "grad_norm": 0.054301898926496506, + "learning_rate": 0.00011325442189117423, + "loss": 0.2817, + "step": 24212 + }, + { + "epoch": 1.9615197666882696, + "grad_norm": 0.05335809662938118, + "learning_rate": 0.0001132499212385796, + "loss": 0.2594, + "step": 24213 + }, + { + "epoch": 1.961600777705768, + "grad_norm": 0.05309063196182251, + "learning_rate": 0.00011324542058598498, + "loss": 0.2543, + "step": 24214 + }, + { + "epoch": 1.9616817887232663, + "grad_norm": 0.07116621732711792, + "learning_rate": 0.00011324091993339035, + "loss": 0.2815, + "step": 24215 + }, + { + "epoch": 1.9617627997407647, + "grad_norm": 0.05735379084944725, + "learning_rate": 0.00011323641928079571, + "loss": 0.2533, + "step": 24216 + }, + { + "epoch": 1.9618438107582632, + "grad_norm": 0.07357499748468399, + "learning_rate": 0.00011323191862820108, + "loss": 0.2909, + "step": 24217 + }, + { + "epoch": 1.9619248217757614, + "grad_norm": 0.051178716123104095, + "learning_rate": 0.00011322741797560647, + "loss": 0.264, + "step": 24218 + }, + { + "epoch": 1.96200583279326, + "grad_norm": 0.05480940639972687, + "learning_rate": 
0.00011322291732301184, + "loss": 0.2515, + "step": 24219 + }, + { + "epoch": 1.9620868438107584, + "grad_norm": 0.04697352275252342, + "learning_rate": 0.00011321841667041722, + "loss": 0.1938, + "step": 24220 + }, + { + "epoch": 1.9621678548282566, + "grad_norm": 0.06244059279561043, + "learning_rate": 0.00011321391601782259, + "loss": 0.2577, + "step": 24221 + }, + { + "epoch": 1.9622488658457549, + "grad_norm": 0.05462726950645447, + "learning_rate": 0.00011320941536522795, + "loss": 0.2353, + "step": 24222 + }, + { + "epoch": 1.9623298768632536, + "grad_norm": 0.055527880787849426, + "learning_rate": 0.00011320491471263335, + "loss": 0.2525, + "step": 24223 + }, + { + "epoch": 1.9624108878807518, + "grad_norm": 0.054965630173683167, + "learning_rate": 0.00011320041406003871, + "loss": 0.255, + "step": 24224 + }, + { + "epoch": 1.96249189889825, + "grad_norm": 0.05207141861319542, + "learning_rate": 0.00011319591340744408, + "loss": 0.2601, + "step": 24225 + }, + { + "epoch": 1.9625729099157485, + "grad_norm": 0.053380418568849564, + "learning_rate": 0.00011319141275484946, + "loss": 0.2242, + "step": 24226 + }, + { + "epoch": 1.962653920933247, + "grad_norm": 0.06348201632499695, + "learning_rate": 0.00011318691210225483, + "loss": 0.2309, + "step": 24227 + }, + { + "epoch": 1.9627349319507452, + "grad_norm": 0.06286928802728653, + "learning_rate": 0.00011318241144966019, + "loss": 0.3057, + "step": 24228 + }, + { + "epoch": 1.9628159429682437, + "grad_norm": 0.06830119341611862, + "learning_rate": 0.00011317791079706559, + "loss": 0.29, + "step": 24229 + }, + { + "epoch": 1.9628969539857422, + "grad_norm": 0.05434748902916908, + "learning_rate": 0.00011317341014447095, + "loss": 0.2584, + "step": 24230 + }, + { + "epoch": 1.9629779650032404, + "grad_norm": 0.05682295933365822, + "learning_rate": 0.00011316890949187633, + "loss": 0.2603, + "step": 24231 + }, + { + "epoch": 1.9630589760207389, + "grad_norm": 0.06411691009998322, + "learning_rate": 0.0001131644088392817, + "loss": 0.2867, + "step": 24232 + }, + { + "epoch": 1.9631399870382373, + "grad_norm": 0.06302022933959961, + "learning_rate": 0.00011315990818668707, + "loss": 0.2794, + "step": 24233 + }, + { + "epoch": 1.9632209980557356, + "grad_norm": 0.06645432859659195, + "learning_rate": 0.00011315540753409243, + "loss": 0.2635, + "step": 24234 + }, + { + "epoch": 1.9633020090732338, + "grad_norm": 0.06499811261892319, + "learning_rate": 0.00011315090688149783, + "loss": 0.2713, + "step": 24235 + }, + { + "epoch": 1.9633830200907323, + "grad_norm": 0.06514635682106018, + "learning_rate": 0.0001131464062289032, + "loss": 0.2812, + "step": 24236 + }, + { + "epoch": 1.9634640311082308, + "grad_norm": 0.04923933371901512, + "learning_rate": 0.00011314190557630857, + "loss": 0.282, + "step": 24237 + }, + { + "epoch": 1.963545042125729, + "grad_norm": 0.062177617102861404, + "learning_rate": 0.00011313740492371394, + "loss": 0.2679, + "step": 24238 + }, + { + "epoch": 1.9636260531432275, + "grad_norm": 0.06502950191497803, + "learning_rate": 0.00011313290427111932, + "loss": 0.2964, + "step": 24239 + }, + { + "epoch": 1.963707064160726, + "grad_norm": 0.05972938984632492, + "learning_rate": 0.00011312840361852468, + "loss": 0.2706, + "step": 24240 + }, + { + "epoch": 1.9637880751782242, + "grad_norm": 0.050397828221321106, + "learning_rate": 0.00011312390296593008, + "loss": 0.2312, + "step": 24241 + }, + { + "epoch": 1.9638690861957226, + "grad_norm": 0.054717596620321274, + "learning_rate": 0.00011311940231333545, + "loss": 
0.2871, + "step": 24242 + }, + { + "epoch": 1.9639500972132211, + "grad_norm": 0.061753761023283005, + "learning_rate": 0.00011311490166074081, + "loss": 0.2899, + "step": 24243 + }, + { + "epoch": 1.9640311082307194, + "grad_norm": 0.053413957357406616, + "learning_rate": 0.00011311040100814618, + "loss": 0.2539, + "step": 24244 + }, + { + "epoch": 1.9641121192482176, + "grad_norm": 0.05676018074154854, + "learning_rate": 0.00011310590035555156, + "loss": 0.2969, + "step": 24245 + }, + { + "epoch": 1.9641931302657163, + "grad_norm": 0.06230397894978523, + "learning_rate": 0.00011310139970295694, + "loss": 0.2845, + "step": 24246 + }, + { + "epoch": 1.9642741412832145, + "grad_norm": 0.06296941637992859, + "learning_rate": 0.00011309689905036232, + "loss": 0.2432, + "step": 24247 + }, + { + "epoch": 1.9643551523007128, + "grad_norm": 0.05645883455872536, + "learning_rate": 0.00011309239839776769, + "loss": 0.2478, + "step": 24248 + }, + { + "epoch": 1.9644361633182112, + "grad_norm": 0.06192861869931221, + "learning_rate": 0.00011308789774517305, + "loss": 0.2954, + "step": 24249 + }, + { + "epoch": 1.9645171743357097, + "grad_norm": 0.06263851374387741, + "learning_rate": 0.00011308339709257842, + "loss": 0.247, + "step": 24250 + }, + { + "epoch": 1.964598185353208, + "grad_norm": 0.06363161653280258, + "learning_rate": 0.0001130788964399838, + "loss": 0.2498, + "step": 24251 + }, + { + "epoch": 1.9646791963707064, + "grad_norm": 0.05741799250245094, + "learning_rate": 0.00011307439578738919, + "loss": 0.3236, + "step": 24252 + }, + { + "epoch": 1.964760207388205, + "grad_norm": 0.05794438719749451, + "learning_rate": 0.00011306989513479456, + "loss": 0.208, + "step": 24253 + }, + { + "epoch": 1.9648412184057031, + "grad_norm": 0.0729365274310112, + "learning_rate": 0.00011306539448219993, + "loss": 0.2937, + "step": 24254 + }, + { + "epoch": 1.9649222294232016, + "grad_norm": 0.07066737860441208, + "learning_rate": 0.00011306089382960529, + "loss": 0.284, + "step": 24255 + }, + { + "epoch": 1.9650032404407, + "grad_norm": 0.05694340541958809, + "learning_rate": 0.00011305639317701067, + "loss": 0.2258, + "step": 24256 + }, + { + "epoch": 1.9650842514581983, + "grad_norm": 0.07903728634119034, + "learning_rate": 0.00011305189252441604, + "loss": 0.2413, + "step": 24257 + }, + { + "epoch": 1.9651652624756966, + "grad_norm": 0.07888397574424744, + "learning_rate": 0.00011304739187182143, + "loss": 0.2586, + "step": 24258 + }, + { + "epoch": 1.965246273493195, + "grad_norm": 0.0666826143860817, + "learning_rate": 0.0001130428912192268, + "loss": 0.2713, + "step": 24259 + }, + { + "epoch": 1.9653272845106935, + "grad_norm": 0.06192653626203537, + "learning_rate": 0.00011303839056663217, + "loss": 0.2717, + "step": 24260 + }, + { + "epoch": 1.9654082955281917, + "grad_norm": 0.05060596019029617, + "learning_rate": 0.00011303388991403753, + "loss": 0.2343, + "step": 24261 + }, + { + "epoch": 1.9654893065456902, + "grad_norm": 0.06154812127351761, + "learning_rate": 0.00011302938926144291, + "loss": 0.2823, + "step": 24262 + }, + { + "epoch": 1.9655703175631887, + "grad_norm": 0.06363443285226822, + "learning_rate": 0.00011302488860884828, + "loss": 0.2412, + "step": 24263 + }, + { + "epoch": 1.965651328580687, + "grad_norm": 0.056647319346666336, + "learning_rate": 0.00011302038795625367, + "loss": 0.2737, + "step": 24264 + }, + { + "epoch": 1.9657323395981854, + "grad_norm": 0.06012161821126938, + "learning_rate": 0.00011301588730365904, + "loss": 0.292, + "step": 24265 + }, + { + "epoch": 
1.9658133506156839, + "grad_norm": 0.0688176155090332, + "learning_rate": 0.00011301138665106442, + "loss": 0.2574, + "step": 24266 + }, + { + "epoch": 1.965894361633182, + "grad_norm": 0.05791863426566124, + "learning_rate": 0.00011300688599846978, + "loss": 0.2391, + "step": 24267 + }, + { + "epoch": 1.9659753726506803, + "grad_norm": 0.06651633232831955, + "learning_rate": 0.00011300238534587515, + "loss": 0.2804, + "step": 24268 + }, + { + "epoch": 1.966056383668179, + "grad_norm": 0.044299155473709106, + "learning_rate": 0.00011299788469328052, + "loss": 0.2518, + "step": 24269 + }, + { + "epoch": 1.9661373946856773, + "grad_norm": 0.06218911334872246, + "learning_rate": 0.00011299338404068591, + "loss": 0.2613, + "step": 24270 + }, + { + "epoch": 1.9662184057031755, + "grad_norm": 0.05687331780791283, + "learning_rate": 0.00011298888338809128, + "loss": 0.2567, + "step": 24271 + }, + { + "epoch": 1.966299416720674, + "grad_norm": 0.056875865906476974, + "learning_rate": 0.00011298438273549666, + "loss": 0.2558, + "step": 24272 + }, + { + "epoch": 1.9663804277381725, + "grad_norm": 0.06565535813570023, + "learning_rate": 0.00011297988208290202, + "loss": 0.299, + "step": 24273 + }, + { + "epoch": 1.9664614387556707, + "grad_norm": 0.0581546276807785, + "learning_rate": 0.00011297538143030739, + "loss": 0.2253, + "step": 24274 + }, + { + "epoch": 1.9665424497731692, + "grad_norm": 0.06149299070239067, + "learning_rate": 0.00011297088077771278, + "loss": 0.2401, + "step": 24275 + }, + { + "epoch": 1.9666234607906676, + "grad_norm": 0.06168157607316971, + "learning_rate": 0.00011296638012511815, + "loss": 0.2851, + "step": 24276 + }, + { + "epoch": 1.9667044718081659, + "grad_norm": 0.08303011953830719, + "learning_rate": 0.00011296187947252353, + "loss": 0.2569, + "step": 24277 + }, + { + "epoch": 1.9667854828256643, + "grad_norm": 0.0567610077559948, + "learning_rate": 0.0001129573788199289, + "loss": 0.2474, + "step": 24278 + }, + { + "epoch": 1.9668664938431628, + "grad_norm": 0.06008369103074074, + "learning_rate": 0.00011295287816733426, + "loss": 0.2717, + "step": 24279 + }, + { + "epoch": 1.966947504860661, + "grad_norm": 0.06307718902826309, + "learning_rate": 0.00011294837751473963, + "loss": 0.2692, + "step": 24280 + }, + { + "epoch": 1.9670285158781593, + "grad_norm": 0.05900444835424423, + "learning_rate": 0.00011294387686214502, + "loss": 0.2119, + "step": 24281 + }, + { + "epoch": 1.9671095268956578, + "grad_norm": 0.06123722344636917, + "learning_rate": 0.0001129393762095504, + "loss": 0.2613, + "step": 24282 + }, + { + "epoch": 1.9671905379131562, + "grad_norm": 0.05989507958292961, + "learning_rate": 0.00011293487555695577, + "loss": 0.2608, + "step": 24283 + }, + { + "epoch": 1.9672715489306545, + "grad_norm": 0.048651110380887985, + "learning_rate": 0.00011293037490436114, + "loss": 0.2409, + "step": 24284 + }, + { + "epoch": 1.967352559948153, + "grad_norm": 0.062375765293836594, + "learning_rate": 0.0001129258742517665, + "loss": 0.2719, + "step": 24285 + }, + { + "epoch": 1.9674335709656514, + "grad_norm": 0.05648590624332428, + "learning_rate": 0.00011292137359917187, + "loss": 0.3004, + "step": 24286 + }, + { + "epoch": 1.9675145819831497, + "grad_norm": 0.06214189529418945, + "learning_rate": 0.00011291687294657726, + "loss": 0.2473, + "step": 24287 + }, + { + "epoch": 1.9675955930006481, + "grad_norm": 0.05842027813196182, + "learning_rate": 0.00011291237229398264, + "loss": 0.263, + "step": 24288 + }, + { + "epoch": 1.9676766040181466, + "grad_norm": 
0.06813821941614151, + "learning_rate": 0.00011290787164138801, + "loss": 0.2804, + "step": 24289 + }, + { + "epoch": 1.9677576150356448, + "grad_norm": 0.06982593238353729, + "learning_rate": 0.00011290337098879338, + "loss": 0.3156, + "step": 24290 + }, + { + "epoch": 1.967838626053143, + "grad_norm": 0.059116560965776443, + "learning_rate": 0.00011289887033619874, + "loss": 0.2764, + "step": 24291 + }, + { + "epoch": 1.9679196370706418, + "grad_norm": 0.06490524858236313, + "learning_rate": 0.00011289436968360412, + "loss": 0.3004, + "step": 24292 + }, + { + "epoch": 1.96800064808814, + "grad_norm": 0.05990765616297722, + "learning_rate": 0.0001128898690310095, + "loss": 0.261, + "step": 24293 + }, + { + "epoch": 1.9680816591056383, + "grad_norm": 0.06079739332199097, + "learning_rate": 0.00011288536837841488, + "loss": 0.2864, + "step": 24294 + }, + { + "epoch": 1.9681626701231367, + "grad_norm": 0.0682620033621788, + "learning_rate": 0.00011288086772582025, + "loss": 0.2691, + "step": 24295 + }, + { + "epoch": 1.9682436811406352, + "grad_norm": 0.06189132481813431, + "learning_rate": 0.00011287636707322562, + "loss": 0.2378, + "step": 24296 + }, + { + "epoch": 1.9683246921581334, + "grad_norm": 0.061714477837085724, + "learning_rate": 0.00011287186642063098, + "loss": 0.2491, + "step": 24297 + }, + { + "epoch": 1.968405703175632, + "grad_norm": 0.056510306894779205, + "learning_rate": 0.00011286736576803639, + "loss": 0.2883, + "step": 24298 + }, + { + "epoch": 1.9684867141931304, + "grad_norm": 0.06037548556923866, + "learning_rate": 0.00011286286511544175, + "loss": 0.2785, + "step": 24299 + }, + { + "epoch": 1.9685677252106286, + "grad_norm": 0.057282455265522, + "learning_rate": 0.00011285836446284712, + "loss": 0.2417, + "step": 24300 + }, + { + "epoch": 1.9686487362281269, + "grad_norm": 0.05612381175160408, + "learning_rate": 0.00011285386381025249, + "loss": 0.2616, + "step": 24301 + }, + { + "epoch": 1.9687297472456255, + "grad_norm": 0.06894572824239731, + "learning_rate": 0.00011284936315765787, + "loss": 0.2798, + "step": 24302 + }, + { + "epoch": 1.9688107582631238, + "grad_norm": 0.06875929981470108, + "learning_rate": 0.00011284486250506323, + "loss": 0.3331, + "step": 24303 + }, + { + "epoch": 1.968891769280622, + "grad_norm": 0.07246767729520798, + "learning_rate": 0.00011284036185246863, + "loss": 0.2901, + "step": 24304 + }, + { + "epoch": 1.9689727802981205, + "grad_norm": 0.0570947490632534, + "learning_rate": 0.00011283586119987399, + "loss": 0.2562, + "step": 24305 + }, + { + "epoch": 1.969053791315619, + "grad_norm": 0.056611426174640656, + "learning_rate": 0.00011283136054727936, + "loss": 0.2627, + "step": 24306 + }, + { + "epoch": 1.9691348023331172, + "grad_norm": 0.05819262936711311, + "learning_rate": 0.00011282685989468473, + "loss": 0.2955, + "step": 24307 + }, + { + "epoch": 1.9692158133506157, + "grad_norm": 0.05799686163663864, + "learning_rate": 0.00011282235924209011, + "loss": 0.2635, + "step": 24308 + }, + { + "epoch": 1.9692968243681142, + "grad_norm": 0.06244715303182602, + "learning_rate": 0.00011281785858949547, + "loss": 0.2691, + "step": 24309 + }, + { + "epoch": 1.9693778353856124, + "grad_norm": 0.05977495759725571, + "learning_rate": 0.00011281335793690087, + "loss": 0.2219, + "step": 24310 + }, + { + "epoch": 1.9694588464031109, + "grad_norm": 0.0680684894323349, + "learning_rate": 0.00011280885728430624, + "loss": 0.3019, + "step": 24311 + }, + { + "epoch": 1.9695398574206093, + "grad_norm": 0.07078555226325989, + "learning_rate": 
0.0001128043566317116, + "loss": 0.2791, + "step": 24312 + }, + { + "epoch": 1.9696208684381076, + "grad_norm": 0.06547614187002182, + "learning_rate": 0.00011279985597911698, + "loss": 0.2702, + "step": 24313 + }, + { + "epoch": 1.9697018794556058, + "grad_norm": 0.04946935921907425, + "learning_rate": 0.00011279535532652235, + "loss": 0.2616, + "step": 24314 + }, + { + "epoch": 1.9697828904731045, + "grad_norm": 0.05585674196481705, + "learning_rate": 0.00011279085467392771, + "loss": 0.2472, + "step": 24315 + }, + { + "epoch": 1.9698639014906028, + "grad_norm": 0.06626023352146149, + "learning_rate": 0.00011278635402133311, + "loss": 0.2707, + "step": 24316 + }, + { + "epoch": 1.969944912508101, + "grad_norm": 0.062406085431575775, + "learning_rate": 0.00011278185336873848, + "loss": 0.2868, + "step": 24317 + }, + { + "epoch": 1.9700259235255995, + "grad_norm": 0.05898802727460861, + "learning_rate": 0.00011277735271614384, + "loss": 0.288, + "step": 24318 + }, + { + "epoch": 1.970106934543098, + "grad_norm": 0.08069593459367752, + "learning_rate": 0.00011277285206354922, + "loss": 0.3254, + "step": 24319 + }, + { + "epoch": 1.9701879455605962, + "grad_norm": 0.04930425062775612, + "learning_rate": 0.00011276835141095459, + "loss": 0.2368, + "step": 24320 + }, + { + "epoch": 1.9702689565780946, + "grad_norm": 0.05644959583878517, + "learning_rate": 0.00011276385075835995, + "loss": 0.2657, + "step": 24321 + }, + { + "epoch": 1.970349967595593, + "grad_norm": 0.06363023817539215, + "learning_rate": 0.00011275935010576535, + "loss": 0.2921, + "step": 24322 + }, + { + "epoch": 1.9704309786130914, + "grad_norm": 0.05638289451599121, + "learning_rate": 0.00011275484945317073, + "loss": 0.2637, + "step": 24323 + }, + { + "epoch": 1.9705119896305896, + "grad_norm": 0.07388167828321457, + "learning_rate": 0.00011275034880057609, + "loss": 0.3125, + "step": 24324 + }, + { + "epoch": 1.9705930006480883, + "grad_norm": 0.07029640674591064, + "learning_rate": 0.00011274584814798146, + "loss": 0.2886, + "step": 24325 + }, + { + "epoch": 1.9706740116655865, + "grad_norm": 0.05631260573863983, + "learning_rate": 0.00011274134749538683, + "loss": 0.2543, + "step": 24326 + }, + { + "epoch": 1.9707550226830848, + "grad_norm": 0.06461930274963379, + "learning_rate": 0.00011273684684279222, + "loss": 0.2516, + "step": 24327 + }, + { + "epoch": 1.9708360337005832, + "grad_norm": 0.05598876625299454, + "learning_rate": 0.00011273234619019759, + "loss": 0.2604, + "step": 24328 + }, + { + "epoch": 1.9709170447180817, + "grad_norm": 0.05257659777998924, + "learning_rate": 0.00011272784553760297, + "loss": 0.2461, + "step": 24329 + }, + { + "epoch": 1.97099805573558, + "grad_norm": 0.06548883765935898, + "learning_rate": 0.00011272334488500833, + "loss": 0.2675, + "step": 24330 + }, + { + "epoch": 1.9710790667530784, + "grad_norm": 0.06943689286708832, + "learning_rate": 0.0001127188442324137, + "loss": 0.2743, + "step": 24331 + }, + { + "epoch": 1.971160077770577, + "grad_norm": 0.0543447881937027, + "learning_rate": 0.00011271434357981907, + "loss": 0.2649, + "step": 24332 + }, + { + "epoch": 1.9712410887880751, + "grad_norm": 0.05939463898539543, + "learning_rate": 0.00011270984292722446, + "loss": 0.2728, + "step": 24333 + }, + { + "epoch": 1.9713220998055736, + "grad_norm": 0.05872873589396477, + "learning_rate": 0.00011270534227462983, + "loss": 0.2517, + "step": 24334 + }, + { + "epoch": 1.971403110823072, + "grad_norm": 0.058184314519166946, + "learning_rate": 0.00011270084162203521, + "loss": 0.2569, + 
"step": 24335 + }, + { + "epoch": 1.9714841218405703, + "grad_norm": 0.05213569477200508, + "learning_rate": 0.00011269634096944057, + "loss": 0.2645, + "step": 24336 + }, + { + "epoch": 1.9715651328580686, + "grad_norm": 0.059691090136766434, + "learning_rate": 0.00011269184031684594, + "loss": 0.2461, + "step": 24337 + }, + { + "epoch": 1.971646143875567, + "grad_norm": 0.05191425606608391, + "learning_rate": 0.00011268733966425132, + "loss": 0.248, + "step": 24338 + }, + { + "epoch": 1.9717271548930655, + "grad_norm": 0.059659551829099655, + "learning_rate": 0.0001126828390116567, + "loss": 0.203, + "step": 24339 + }, + { + "epoch": 1.9718081659105637, + "grad_norm": 0.07564805448055267, + "learning_rate": 0.00011267833835906208, + "loss": 0.3105, + "step": 24340 + }, + { + "epoch": 1.9718891769280622, + "grad_norm": 0.07206302136182785, + "learning_rate": 0.00011267383770646745, + "loss": 0.2776, + "step": 24341 + }, + { + "epoch": 1.9719701879455607, + "grad_norm": 0.06573637574911118, + "learning_rate": 0.00011266933705387281, + "loss": 0.2624, + "step": 24342 + }, + { + "epoch": 1.972051198963059, + "grad_norm": 0.06227828562259674, + "learning_rate": 0.00011266483640127818, + "loss": 0.3009, + "step": 24343 + }, + { + "epoch": 1.9721322099805574, + "grad_norm": 0.07165750116109848, + "learning_rate": 0.00011266033574868356, + "loss": 0.2565, + "step": 24344 + }, + { + "epoch": 1.9722132209980558, + "grad_norm": 0.06627726554870605, + "learning_rate": 0.00011265583509608894, + "loss": 0.2682, + "step": 24345 + }, + { + "epoch": 1.972294232015554, + "grad_norm": 0.061688557267189026, + "learning_rate": 0.00011265133444349432, + "loss": 0.2246, + "step": 24346 + }, + { + "epoch": 1.9723752430330523, + "grad_norm": 0.05644363909959793, + "learning_rate": 0.00011264683379089969, + "loss": 0.2426, + "step": 24347 + }, + { + "epoch": 1.972456254050551, + "grad_norm": 0.060142453759908676, + "learning_rate": 0.00011264233313830505, + "loss": 0.281, + "step": 24348 + }, + { + "epoch": 1.9725372650680493, + "grad_norm": 0.06855808943510056, + "learning_rate": 0.00011263783248571043, + "loss": 0.2884, + "step": 24349 + }, + { + "epoch": 1.9726182760855475, + "grad_norm": 0.05051879212260246, + "learning_rate": 0.00011263333183311581, + "loss": 0.227, + "step": 24350 + }, + { + "epoch": 1.972699287103046, + "grad_norm": 0.04746206849813461, + "learning_rate": 0.00011262883118052119, + "loss": 0.2836, + "step": 24351 + }, + { + "epoch": 1.9727802981205445, + "grad_norm": 0.049681976437568665, + "learning_rate": 0.00011262433052792656, + "loss": 0.209, + "step": 24352 + }, + { + "epoch": 1.9728613091380427, + "grad_norm": 0.04825180396437645, + "learning_rate": 0.00011261982987533193, + "loss": 0.2902, + "step": 24353 + }, + { + "epoch": 1.9729423201555412, + "grad_norm": 0.06472016125917435, + "learning_rate": 0.0001126153292227373, + "loss": 0.2793, + "step": 24354 + }, + { + "epoch": 1.9730233311730396, + "grad_norm": 0.06196343153715134, + "learning_rate": 0.00011261082857014267, + "loss": 0.2756, + "step": 24355 + }, + { + "epoch": 1.9731043421905379, + "grad_norm": 0.059851787984371185, + "learning_rate": 0.00011260632791754805, + "loss": 0.2414, + "step": 24356 + }, + { + "epoch": 1.9731853532080363, + "grad_norm": 0.054491378366947174, + "learning_rate": 0.00011260182726495343, + "loss": 0.2524, + "step": 24357 + }, + { + "epoch": 1.9732663642255348, + "grad_norm": 0.060839369893074036, + "learning_rate": 0.0001125973266123588, + "loss": 0.3168, + "step": 24358 + }, + { + "epoch": 
1.973347375243033, + "grad_norm": 0.06078599765896797, + "learning_rate": 0.00011259282595976417, + "loss": 0.2883, + "step": 24359 + }, + { + "epoch": 1.9734283862605313, + "grad_norm": 0.08065938204526901, + "learning_rate": 0.00011258832530716953, + "loss": 0.3195, + "step": 24360 + }, + { + "epoch": 1.9735093972780298, + "grad_norm": 0.07087505608797073, + "learning_rate": 0.00011258382465457491, + "loss": 0.2759, + "step": 24361 + }, + { + "epoch": 1.9735904082955282, + "grad_norm": 0.05086572840809822, + "learning_rate": 0.0001125793240019803, + "loss": 0.2296, + "step": 24362 + }, + { + "epoch": 1.9736714193130265, + "grad_norm": 0.052488088607788086, + "learning_rate": 0.00011257482334938567, + "loss": 0.2581, + "step": 24363 + }, + { + "epoch": 1.973752430330525, + "grad_norm": 0.06180967763066292, + "learning_rate": 0.00011257032269679104, + "loss": 0.2892, + "step": 24364 + }, + { + "epoch": 1.9738334413480234, + "grad_norm": 0.053815726190805435, + "learning_rate": 0.00011256582204419642, + "loss": 0.2282, + "step": 24365 + }, + { + "epoch": 1.9739144523655217, + "grad_norm": 0.05706044286489487, + "learning_rate": 0.00011256132139160178, + "loss": 0.2592, + "step": 24366 + }, + { + "epoch": 1.9739954633830201, + "grad_norm": 0.05483723431825638, + "learning_rate": 0.00011255682073900715, + "loss": 0.258, + "step": 24367 + }, + { + "epoch": 1.9740764744005186, + "grad_norm": 0.04908522218465805, + "learning_rate": 0.00011255232008641254, + "loss": 0.2459, + "step": 24368 + }, + { + "epoch": 1.9741574854180168, + "grad_norm": 0.046063363552093506, + "learning_rate": 0.00011254781943381791, + "loss": 0.2091, + "step": 24369 + }, + { + "epoch": 1.974238496435515, + "grad_norm": 0.07381352037191391, + "learning_rate": 0.00011254331878122328, + "loss": 0.3122, + "step": 24370 + }, + { + "epoch": 1.9743195074530138, + "grad_norm": 0.04620813950896263, + "learning_rate": 0.00011253881812862866, + "loss": 0.2667, + "step": 24371 + }, + { + "epoch": 1.974400518470512, + "grad_norm": 0.06148213520646095, + "learning_rate": 0.00011253431747603402, + "loss": 0.3344, + "step": 24372 + }, + { + "epoch": 1.9744815294880103, + "grad_norm": 0.05575966835021973, + "learning_rate": 0.00011252981682343939, + "loss": 0.2599, + "step": 24373 + }, + { + "epoch": 1.9745625405055087, + "grad_norm": 0.07938870787620544, + "learning_rate": 0.00011252531617084478, + "loss": 0.3063, + "step": 24374 + }, + { + "epoch": 1.9746435515230072, + "grad_norm": 0.06841164082288742, + "learning_rate": 0.00011252081551825015, + "loss": 0.2779, + "step": 24375 + }, + { + "epoch": 1.9747245625405054, + "grad_norm": 0.057837821543216705, + "learning_rate": 0.00011251631486565553, + "loss": 0.2773, + "step": 24376 + }, + { + "epoch": 1.974805573558004, + "grad_norm": 0.06958066672086716, + "learning_rate": 0.0001125118142130609, + "loss": 0.235, + "step": 24377 + }, + { + "epoch": 1.9748865845755024, + "grad_norm": 0.062445688992738724, + "learning_rate": 0.00011250731356046626, + "loss": 0.2771, + "step": 24378 + }, + { + "epoch": 1.9749675955930006, + "grad_norm": 0.05331667140126228, + "learning_rate": 0.00011250281290787166, + "loss": 0.261, + "step": 24379 + }, + { + "epoch": 1.975048606610499, + "grad_norm": 0.06378906965255737, + "learning_rate": 0.00011249831225527703, + "loss": 0.2597, + "step": 24380 + }, + { + "epoch": 1.9751296176279975, + "grad_norm": 0.05976220220327377, + "learning_rate": 0.0001124938116026824, + "loss": 0.2747, + "step": 24381 + }, + { + "epoch": 1.9752106286454958, + "grad_norm": 
0.05172165110707283, + "learning_rate": 0.00011248931095008777, + "loss": 0.2417, + "step": 24382 + }, + { + "epoch": 1.975291639662994, + "grad_norm": 0.06261162459850311, + "learning_rate": 0.00011248481029749314, + "loss": 0.2536, + "step": 24383 + }, + { + "epoch": 1.9753726506804925, + "grad_norm": 0.06476294994354248, + "learning_rate": 0.0001124803096448985, + "loss": 0.2713, + "step": 24384 + }, + { + "epoch": 1.975453661697991, + "grad_norm": 0.05832285061478615, + "learning_rate": 0.0001124758089923039, + "loss": 0.2588, + "step": 24385 + }, + { + "epoch": 1.9755346727154892, + "grad_norm": 0.058134134858846664, + "learning_rate": 0.00011247130833970928, + "loss": 0.2624, + "step": 24386 + }, + { + "epoch": 1.9756156837329877, + "grad_norm": 0.064618781208992, + "learning_rate": 0.00011246680768711464, + "loss": 0.2951, + "step": 24387 + }, + { + "epoch": 1.9756966947504861, + "grad_norm": 0.05063759908080101, + "learning_rate": 0.00011246230703452001, + "loss": 0.262, + "step": 24388 + }, + { + "epoch": 1.9757777057679844, + "grad_norm": 0.0644262284040451, + "learning_rate": 0.00011245780638192538, + "loss": 0.2836, + "step": 24389 + }, + { + "epoch": 1.9758587167854829, + "grad_norm": 0.04861307144165039, + "learning_rate": 0.00011245330572933074, + "loss": 0.2593, + "step": 24390 + }, + { + "epoch": 1.9759397278029813, + "grad_norm": 0.07954981923103333, + "learning_rate": 0.00011244880507673614, + "loss": 0.2886, + "step": 24391 + }, + { + "epoch": 1.9760207388204796, + "grad_norm": 0.08754291385412216, + "learning_rate": 0.00011244430442414152, + "loss": 0.2864, + "step": 24392 + }, + { + "epoch": 1.9761017498379778, + "grad_norm": 0.05969921499490738, + "learning_rate": 0.00011243980377154688, + "loss": 0.2877, + "step": 24393 + }, + { + "epoch": 1.9761827608554765, + "grad_norm": 0.0576811358332634, + "learning_rate": 0.00011243530311895225, + "loss": 0.2593, + "step": 24394 + }, + { + "epoch": 1.9762637718729748, + "grad_norm": 0.05961635336279869, + "learning_rate": 0.00011243080246635762, + "loss": 0.2802, + "step": 24395 + }, + { + "epoch": 1.976344782890473, + "grad_norm": 0.05381064862012863, + "learning_rate": 0.00011242630181376298, + "loss": 0.2541, + "step": 24396 + }, + { + "epoch": 1.9764257939079715, + "grad_norm": 0.05873046815395355, + "learning_rate": 0.00011242180116116839, + "loss": 0.219, + "step": 24397 + }, + { + "epoch": 1.97650680492547, + "grad_norm": 0.06834851950407028, + "learning_rate": 0.00011241730050857376, + "loss": 0.2985, + "step": 24398 + }, + { + "epoch": 1.9765878159429682, + "grad_norm": 0.07514762133359909, + "learning_rate": 0.00011241279985597912, + "loss": 0.3346, + "step": 24399 + }, + { + "epoch": 1.9766688269604666, + "grad_norm": 0.061792224645614624, + "learning_rate": 0.00011240829920338449, + "loss": 0.2543, + "step": 24400 + }, + { + "epoch": 1.976749837977965, + "grad_norm": 0.05621310696005821, + "learning_rate": 0.00011240379855078987, + "loss": 0.2323, + "step": 24401 + }, + { + "epoch": 1.9768308489954634, + "grad_norm": 0.05764911696314812, + "learning_rate": 0.00011239929789819523, + "loss": 0.2796, + "step": 24402 + }, + { + "epoch": 1.9769118600129616, + "grad_norm": 0.06711073219776154, + "learning_rate": 0.00011239479724560063, + "loss": 0.2955, + "step": 24403 + }, + { + "epoch": 1.9769928710304603, + "grad_norm": 0.04883848503232002, + "learning_rate": 0.000112390296593006, + "loss": 0.2866, + "step": 24404 + }, + { + "epoch": 1.9770738820479585, + "grad_norm": 0.06951811164617538, + "learning_rate": 
0.00011238579594041136, + "loss": 0.2486, + "step": 24405 + }, + { + "epoch": 1.9771548930654568, + "grad_norm": 0.051786020398139954, + "learning_rate": 0.00011238129528781673, + "loss": 0.2421, + "step": 24406 + }, + { + "epoch": 1.9772359040829552, + "grad_norm": 0.07006373256444931, + "learning_rate": 0.00011237679463522211, + "loss": 0.2781, + "step": 24407 + }, + { + "epoch": 1.9773169151004537, + "grad_norm": 0.054151784628629684, + "learning_rate": 0.0001123722939826275, + "loss": 0.2512, + "step": 24408 + }, + { + "epoch": 1.977397926117952, + "grad_norm": 0.062393900007009506, + "learning_rate": 0.00011236779333003287, + "loss": 0.258, + "step": 24409 + }, + { + "epoch": 1.9774789371354504, + "grad_norm": 0.06319965422153473, + "learning_rate": 0.00011236329267743824, + "loss": 0.2443, + "step": 24410 + }, + { + "epoch": 1.9775599481529489, + "grad_norm": 0.06782061606645584, + "learning_rate": 0.0001123587920248436, + "loss": 0.244, + "step": 24411 + }, + { + "epoch": 1.9776409591704471, + "grad_norm": 0.0541662760078907, + "learning_rate": 0.00011235429137224898, + "loss": 0.269, + "step": 24412 + }, + { + "epoch": 1.9777219701879456, + "grad_norm": 0.05658426880836487, + "learning_rate": 0.00011234979071965435, + "loss": 0.2396, + "step": 24413 + }, + { + "epoch": 1.977802981205444, + "grad_norm": 0.06435883790254593, + "learning_rate": 0.00011234529006705974, + "loss": 0.2469, + "step": 24414 + }, + { + "epoch": 1.9778839922229423, + "grad_norm": 0.07199982553720474, + "learning_rate": 0.00011234078941446511, + "loss": 0.2922, + "step": 24415 + }, + { + "epoch": 1.9779650032404406, + "grad_norm": 0.05086003988981247, + "learning_rate": 0.00011233628876187048, + "loss": 0.2537, + "step": 24416 + }, + { + "epoch": 1.9780460142579392, + "grad_norm": 0.055621031671762466, + "learning_rate": 0.00011233178810927584, + "loss": 0.2739, + "step": 24417 + }, + { + "epoch": 1.9781270252754375, + "grad_norm": 0.06266532093286514, + "learning_rate": 0.00011232728745668122, + "loss": 0.2704, + "step": 24418 + }, + { + "epoch": 1.9782080362929357, + "grad_norm": 0.051992930471897125, + "learning_rate": 0.00011232278680408659, + "loss": 0.2495, + "step": 24419 + }, + { + "epoch": 1.9782890473104342, + "grad_norm": 0.06642768532037735, + "learning_rate": 0.00011231828615149198, + "loss": 0.2768, + "step": 24420 + }, + { + "epoch": 1.9783700583279327, + "grad_norm": 0.06322135776281357, + "learning_rate": 0.00011231378549889735, + "loss": 0.2809, + "step": 24421 + }, + { + "epoch": 1.978451069345431, + "grad_norm": 0.05358263850212097, + "learning_rate": 0.00011230928484630273, + "loss": 0.2477, + "step": 24422 + }, + { + "epoch": 1.9785320803629294, + "grad_norm": 0.06326816976070404, + "learning_rate": 0.00011230478419370809, + "loss": 0.2492, + "step": 24423 + }, + { + "epoch": 1.9786130913804278, + "grad_norm": 0.06179438531398773, + "learning_rate": 0.00011230028354111346, + "loss": 0.2542, + "step": 24424 + }, + { + "epoch": 1.978694102397926, + "grad_norm": 0.0613495409488678, + "learning_rate": 0.00011229578288851883, + "loss": 0.2609, + "step": 24425 + }, + { + "epoch": 1.9787751134154243, + "grad_norm": 0.05658883601427078, + "learning_rate": 0.00011229128223592422, + "loss": 0.2791, + "step": 24426 + }, + { + "epoch": 1.978856124432923, + "grad_norm": 0.07112139463424683, + "learning_rate": 0.0001122867815833296, + "loss": 0.2675, + "step": 24427 + }, + { + "epoch": 1.9789371354504213, + "grad_norm": 0.062307700514793396, + "learning_rate": 0.00011228228093073497, + "loss": 0.328, + 
"step": 24428 + }, + { + "epoch": 1.9790181464679195, + "grad_norm": 0.06263001263141632, + "learning_rate": 0.00011227778027814033, + "loss": 0.2461, + "step": 24429 + }, + { + "epoch": 1.979099157485418, + "grad_norm": 0.05385729297995567, + "learning_rate": 0.0001122732796255457, + "loss": 0.275, + "step": 24430 + }, + { + "epoch": 1.9791801685029164, + "grad_norm": 0.05409708619117737, + "learning_rate": 0.00011226877897295109, + "loss": 0.2181, + "step": 24431 + }, + { + "epoch": 1.9792611795204147, + "grad_norm": 0.06311351805925369, + "learning_rate": 0.00011226427832035646, + "loss": 0.3082, + "step": 24432 + }, + { + "epoch": 1.9793421905379132, + "grad_norm": 0.06120266765356064, + "learning_rate": 0.00011225977766776184, + "loss": 0.2868, + "step": 24433 + }, + { + "epoch": 1.9794232015554116, + "grad_norm": 0.04996590316295624, + "learning_rate": 0.00011225527701516721, + "loss": 0.2469, + "step": 24434 + }, + { + "epoch": 1.9795042125729099, + "grad_norm": 0.05709290876984596, + "learning_rate": 0.00011225077636257257, + "loss": 0.2701, + "step": 24435 + }, + { + "epoch": 1.9795852235904083, + "grad_norm": 0.07180941104888916, + "learning_rate": 0.00011224627570997794, + "loss": 0.2976, + "step": 24436 + }, + { + "epoch": 1.9796662346079068, + "grad_norm": 0.05287057161331177, + "learning_rate": 0.00011224177505738333, + "loss": 0.2368, + "step": 24437 + }, + { + "epoch": 1.979747245625405, + "grad_norm": 0.05574708804488182, + "learning_rate": 0.0001122372744047887, + "loss": 0.2576, + "step": 24438 + }, + { + "epoch": 1.9798282566429033, + "grad_norm": 0.05408519506454468, + "learning_rate": 0.00011223277375219408, + "loss": 0.2621, + "step": 24439 + }, + { + "epoch": 1.9799092676604018, + "grad_norm": 0.06887460500001907, + "learning_rate": 0.00011222827309959945, + "loss": 0.2803, + "step": 24440 + }, + { + "epoch": 1.9799902786779002, + "grad_norm": 0.06533532589673996, + "learning_rate": 0.00011222377244700481, + "loss": 0.2892, + "step": 24441 + }, + { + "epoch": 1.9800712896953985, + "grad_norm": 0.08909396827220917, + "learning_rate": 0.00011221927179441018, + "loss": 0.3064, + "step": 24442 + }, + { + "epoch": 1.980152300712897, + "grad_norm": 0.05991598218679428, + "learning_rate": 0.00011221477114181557, + "loss": 0.2813, + "step": 24443 + }, + { + "epoch": 1.9802333117303954, + "grad_norm": 0.07463617622852325, + "learning_rate": 0.00011221027048922094, + "loss": 0.2546, + "step": 24444 + }, + { + "epoch": 1.9803143227478937, + "grad_norm": 0.059813693165779114, + "learning_rate": 0.00011220576983662632, + "loss": 0.2864, + "step": 24445 + }, + { + "epoch": 1.9803953337653921, + "grad_norm": 0.052126556634902954, + "learning_rate": 0.00011220126918403169, + "loss": 0.2353, + "step": 24446 + }, + { + "epoch": 1.9804763447828906, + "grad_norm": 0.06391862779855728, + "learning_rate": 0.00011219676853143705, + "loss": 0.2966, + "step": 24447 + }, + { + "epoch": 1.9805573558003888, + "grad_norm": 0.06347377598285675, + "learning_rate": 0.00011219226787884243, + "loss": 0.244, + "step": 24448 + }, + { + "epoch": 1.980638366817887, + "grad_norm": 0.06185566633939743, + "learning_rate": 0.00011218776722624783, + "loss": 0.2716, + "step": 24449 + }, + { + "epoch": 1.9807193778353858, + "grad_norm": 0.04907141998410225, + "learning_rate": 0.00011218326657365319, + "loss": 0.237, + "step": 24450 + }, + { + "epoch": 1.980800388852884, + "grad_norm": 0.06573867797851562, + "learning_rate": 0.00011217876592105856, + "loss": 0.2799, + "step": 24451 + }, + { + "epoch": 
1.9808813998703823, + "grad_norm": 0.0709458440542221, + "learning_rate": 0.00011217426526846393, + "loss": 0.2982, + "step": 24452 + }, + { + "epoch": 1.9809624108878807, + "grad_norm": 0.06352993100881577, + "learning_rate": 0.0001121697646158693, + "loss": 0.3239, + "step": 24453 + }, + { + "epoch": 1.9810434219053792, + "grad_norm": 0.06232510134577751, + "learning_rate": 0.00011216526396327467, + "loss": 0.2297, + "step": 24454 + }, + { + "epoch": 1.9811244329228774, + "grad_norm": 0.06582966446876526, + "learning_rate": 0.00011216076331068007, + "loss": 0.294, + "step": 24455 + }, + { + "epoch": 1.981205443940376, + "grad_norm": 0.0512259304523468, + "learning_rate": 0.00011215626265808543, + "loss": 0.247, + "step": 24456 + }, + { + "epoch": 1.9812864549578744, + "grad_norm": 0.049582984298467636, + "learning_rate": 0.0001121517620054908, + "loss": 0.2409, + "step": 24457 + }, + { + "epoch": 1.9813674659753726, + "grad_norm": 0.06026807427406311, + "learning_rate": 0.00011214726135289618, + "loss": 0.2568, + "step": 24458 + }, + { + "epoch": 1.981448476992871, + "grad_norm": 0.07120063900947571, + "learning_rate": 0.00011214276070030154, + "loss": 0.2823, + "step": 24459 + }, + { + "epoch": 1.9815294880103695, + "grad_norm": 0.055205926299095154, + "learning_rate": 0.00011213826004770694, + "loss": 0.2424, + "step": 24460 + }, + { + "epoch": 1.9816104990278678, + "grad_norm": 0.04511072859168053, + "learning_rate": 0.00011213375939511231, + "loss": 0.2494, + "step": 24461 + }, + { + "epoch": 1.981691510045366, + "grad_norm": 0.05112173780798912, + "learning_rate": 0.00011212925874251767, + "loss": 0.2506, + "step": 24462 + }, + { + "epoch": 1.9817725210628645, + "grad_norm": 0.0629863440990448, + "learning_rate": 0.00011212475808992304, + "loss": 0.2648, + "step": 24463 + }, + { + "epoch": 1.981853532080363, + "grad_norm": 0.07610534876585007, + "learning_rate": 0.00011212025743732842, + "loss": 0.2665, + "step": 24464 + }, + { + "epoch": 1.9819345430978612, + "grad_norm": 0.057084761559963226, + "learning_rate": 0.00011211575678473378, + "loss": 0.2609, + "step": 24465 + }, + { + "epoch": 1.9820155541153597, + "grad_norm": 0.05006333068013191, + "learning_rate": 0.00011211125613213918, + "loss": 0.2939, + "step": 24466 + }, + { + "epoch": 1.9820965651328581, + "grad_norm": 0.05147801712155342, + "learning_rate": 0.00011210675547954455, + "loss": 0.263, + "step": 24467 + }, + { + "epoch": 1.9821775761503564, + "grad_norm": 0.056295499205589294, + "learning_rate": 0.00011210225482694991, + "loss": 0.2558, + "step": 24468 + }, + { + "epoch": 1.9822585871678549, + "grad_norm": 0.0700087621808052, + "learning_rate": 0.00011209775417435528, + "loss": 0.2555, + "step": 24469 + }, + { + "epoch": 1.9823395981853533, + "grad_norm": 0.06073461472988129, + "learning_rate": 0.00011209325352176066, + "loss": 0.2736, + "step": 24470 + }, + { + "epoch": 1.9824206092028516, + "grad_norm": 0.06145452708005905, + "learning_rate": 0.00011208875286916602, + "loss": 0.2946, + "step": 24471 + }, + { + "epoch": 1.9825016202203498, + "grad_norm": 0.06638166308403015, + "learning_rate": 0.00011208425221657142, + "loss": 0.2725, + "step": 24472 + }, + { + "epoch": 1.9825826312378485, + "grad_norm": 0.06589549779891968, + "learning_rate": 0.00011207975156397679, + "loss": 0.2852, + "step": 24473 + }, + { + "epoch": 1.9826636422553467, + "grad_norm": 0.05530265346169472, + "learning_rate": 0.00011207525091138215, + "loss": 0.2754, + "step": 24474 + }, + { + "epoch": 1.982744653272845, + "grad_norm": 
0.05445459857583046, + "learning_rate": 0.00011207075025878753, + "loss": 0.2451, + "step": 24475 + }, + { + "epoch": 1.9828256642903435, + "grad_norm": 0.04934202507138252, + "learning_rate": 0.0001120662496061929, + "loss": 0.2473, + "step": 24476 + }, + { + "epoch": 1.982906675307842, + "grad_norm": 0.06119426339864731, + "learning_rate": 0.00011206174895359826, + "loss": 0.3041, + "step": 24477 + }, + { + "epoch": 1.9829876863253402, + "grad_norm": 0.05670265480875969, + "learning_rate": 0.00011205724830100366, + "loss": 0.2268, + "step": 24478 + }, + { + "epoch": 1.9830686973428386, + "grad_norm": 0.06675172597169876, + "learning_rate": 0.00011205274764840903, + "loss": 0.2677, + "step": 24479 + }, + { + "epoch": 1.983149708360337, + "grad_norm": 0.055620431900024414, + "learning_rate": 0.0001120482469958144, + "loss": 0.224, + "step": 24480 + }, + { + "epoch": 1.9832307193778353, + "grad_norm": 0.06226406618952751, + "learning_rate": 0.00011204374634321977, + "loss": 0.2093, + "step": 24481 + }, + { + "epoch": 1.9833117303953338, + "grad_norm": 0.051887817680835724, + "learning_rate": 0.00011203924569062514, + "loss": 0.2495, + "step": 24482 + }, + { + "epoch": 1.9833927414128323, + "grad_norm": 0.059647489339113235, + "learning_rate": 0.00011203474503803053, + "loss": 0.2327, + "step": 24483 + }, + { + "epoch": 1.9834737524303305, + "grad_norm": 0.06090124323964119, + "learning_rate": 0.0001120302443854359, + "loss": 0.2628, + "step": 24484 + }, + { + "epoch": 1.9835547634478288, + "grad_norm": 0.05459466204047203, + "learning_rate": 0.00011202574373284128, + "loss": 0.2848, + "step": 24485 + }, + { + "epoch": 1.9836357744653272, + "grad_norm": 0.056634724140167236, + "learning_rate": 0.00011202124308024664, + "loss": 0.3098, + "step": 24486 + }, + { + "epoch": 1.9837167854828257, + "grad_norm": 0.07928376644849777, + "learning_rate": 0.00011201674242765201, + "loss": 0.3164, + "step": 24487 + }, + { + "epoch": 1.983797796500324, + "grad_norm": 0.0586346834897995, + "learning_rate": 0.00011201224177505738, + "loss": 0.2314, + "step": 24488 + }, + { + "epoch": 1.9838788075178224, + "grad_norm": 0.06746040284633636, + "learning_rate": 0.00011200774112246277, + "loss": 0.2514, + "step": 24489 + }, + { + "epoch": 1.9839598185353209, + "grad_norm": 0.049767278134822845, + "learning_rate": 0.00011200324046986814, + "loss": 0.2643, + "step": 24490 + }, + { + "epoch": 1.9840408295528191, + "grad_norm": 0.06641127169132233, + "learning_rate": 0.00011199873981727352, + "loss": 0.2634, + "step": 24491 + }, + { + "epoch": 1.9841218405703176, + "grad_norm": 0.06195981800556183, + "learning_rate": 0.00011199423916467888, + "loss": 0.2632, + "step": 24492 + }, + { + "epoch": 1.984202851587816, + "grad_norm": 0.05313951522111893, + "learning_rate": 0.00011198973851208425, + "loss": 0.2762, + "step": 24493 + }, + { + "epoch": 1.9842838626053143, + "grad_norm": 0.05326806381344795, + "learning_rate": 0.00011198523785948962, + "loss": 0.2396, + "step": 24494 + }, + { + "epoch": 1.9843648736228126, + "grad_norm": 0.07170695811510086, + "learning_rate": 0.00011198073720689501, + "loss": 0.2899, + "step": 24495 + }, + { + "epoch": 1.9844458846403112, + "grad_norm": 0.06008395552635193, + "learning_rate": 0.00011197623655430039, + "loss": 0.2291, + "step": 24496 + }, + { + "epoch": 1.9845268956578095, + "grad_norm": 0.070198193192482, + "learning_rate": 0.00011197173590170576, + "loss": 0.3339, + "step": 24497 + }, + { + "epoch": 1.9846079066753077, + "grad_norm": 0.06424789130687714, + "learning_rate": 
0.00011196723524911112, + "loss": 0.3084, + "step": 24498 + }, + { + "epoch": 1.9846889176928062, + "grad_norm": 0.0610867515206337, + "learning_rate": 0.00011196273459651649, + "loss": 0.3067, + "step": 24499 + }, + { + "epoch": 1.9847699287103047, + "grad_norm": 0.05551750212907791, + "learning_rate": 0.00011195823394392187, + "loss": 0.2685, + "step": 24500 + }, + { + "epoch": 1.984850939727803, + "grad_norm": 0.06064987927675247, + "learning_rate": 0.00011195373329132725, + "loss": 0.3069, + "step": 24501 + }, + { + "epoch": 1.9849319507453014, + "grad_norm": 0.06334603577852249, + "learning_rate": 0.00011194923263873263, + "loss": 0.2555, + "step": 24502 + }, + { + "epoch": 1.9850129617627998, + "grad_norm": 0.06943856179714203, + "learning_rate": 0.000111944731986138, + "loss": 0.2758, + "step": 24503 + }, + { + "epoch": 1.985093972780298, + "grad_norm": 0.055823858827352524, + "learning_rate": 0.00011194023133354336, + "loss": 0.258, + "step": 24504 + }, + { + "epoch": 1.9851749837977966, + "grad_norm": 0.06590847671031952, + "learning_rate": 0.00011193573068094873, + "loss": 0.2694, + "step": 24505 + }, + { + "epoch": 1.985255994815295, + "grad_norm": 0.08120261132717133, + "learning_rate": 0.00011193123002835411, + "loss": 0.3314, + "step": 24506 + }, + { + "epoch": 1.9853370058327933, + "grad_norm": 0.058982912451028824, + "learning_rate": 0.0001119267293757595, + "loss": 0.2636, + "step": 24507 + }, + { + "epoch": 1.9854180168502915, + "grad_norm": 0.09235643595457077, + "learning_rate": 0.00011192222872316487, + "loss": 0.2883, + "step": 24508 + }, + { + "epoch": 1.98549902786779, + "grad_norm": 0.05961127579212189, + "learning_rate": 0.00011191772807057024, + "loss": 0.2508, + "step": 24509 + }, + { + "epoch": 1.9855800388852884, + "grad_norm": 0.06918458640575409, + "learning_rate": 0.0001119132274179756, + "loss": 0.3028, + "step": 24510 + }, + { + "epoch": 1.9856610499027867, + "grad_norm": 0.056944042444229126, + "learning_rate": 0.00011190872676538098, + "loss": 0.2494, + "step": 24511 + }, + { + "epoch": 1.9857420609202852, + "grad_norm": 0.05929713323712349, + "learning_rate": 0.00011190422611278638, + "loss": 0.2547, + "step": 24512 + }, + { + "epoch": 1.9858230719377836, + "grad_norm": 0.05718807131052017, + "learning_rate": 0.00011189972546019174, + "loss": 0.2701, + "step": 24513 + }, + { + "epoch": 1.9859040829552819, + "grad_norm": 0.05005601793527603, + "learning_rate": 0.00011189522480759711, + "loss": 0.2057, + "step": 24514 + }, + { + "epoch": 1.9859850939727803, + "grad_norm": 0.06474079936742783, + "learning_rate": 0.00011189072415500248, + "loss": 0.2971, + "step": 24515 + }, + { + "epoch": 1.9860661049902788, + "grad_norm": 0.04730149731040001, + "learning_rate": 0.00011188622350240784, + "loss": 0.2715, + "step": 24516 + }, + { + "epoch": 1.986147116007777, + "grad_norm": 0.05048287287354469, + "learning_rate": 0.00011188172284981322, + "loss": 0.2503, + "step": 24517 + }, + { + "epoch": 1.9862281270252753, + "grad_norm": 0.0685349553823471, + "learning_rate": 0.00011187722219721862, + "loss": 0.2832, + "step": 24518 + }, + { + "epoch": 1.986309138042774, + "grad_norm": 0.04649600014090538, + "learning_rate": 0.00011187272154462398, + "loss": 0.216, + "step": 24519 + }, + { + "epoch": 1.9863901490602722, + "grad_norm": 0.06072686240077019, + "learning_rate": 0.00011186822089202935, + "loss": 0.2706, + "step": 24520 + }, + { + "epoch": 1.9864711600777705, + "grad_norm": 0.06857617199420929, + "learning_rate": 0.00011186372023943473, + "loss": 0.2356, + 
"step": 24521 + }, + { + "epoch": 1.986552171095269, + "grad_norm": 0.06952265650033951, + "learning_rate": 0.00011185921958684009, + "loss": 0.2902, + "step": 24522 + }, + { + "epoch": 1.9866331821127674, + "grad_norm": 0.05612373724579811, + "learning_rate": 0.00011185471893424546, + "loss": 0.2502, + "step": 24523 + }, + { + "epoch": 1.9867141931302656, + "grad_norm": 0.052051082253456116, + "learning_rate": 0.00011185021828165086, + "loss": 0.2394, + "step": 24524 + }, + { + "epoch": 1.9867952041477641, + "grad_norm": 0.059324733912944794, + "learning_rate": 0.00011184571762905622, + "loss": 0.262, + "step": 24525 + }, + { + "epoch": 1.9868762151652626, + "grad_norm": 0.06145957112312317, + "learning_rate": 0.0001118412169764616, + "loss": 0.2551, + "step": 24526 + }, + { + "epoch": 1.9869572261827608, + "grad_norm": 0.06007521227002144, + "learning_rate": 0.00011183671632386697, + "loss": 0.2511, + "step": 24527 + }, + { + "epoch": 1.987038237200259, + "grad_norm": 0.0481129065155983, + "learning_rate": 0.00011183221567127233, + "loss": 0.2424, + "step": 24528 + }, + { + "epoch": 1.9871192482177578, + "grad_norm": 0.053366199135780334, + "learning_rate": 0.0001118277150186777, + "loss": 0.2566, + "step": 24529 + }, + { + "epoch": 1.987200259235256, + "grad_norm": 0.057617463171482086, + "learning_rate": 0.0001118232143660831, + "loss": 0.2516, + "step": 24530 + }, + { + "epoch": 1.9872812702527543, + "grad_norm": 0.06365573406219482, + "learning_rate": 0.00011181871371348846, + "loss": 0.3051, + "step": 24531 + }, + { + "epoch": 1.9873622812702527, + "grad_norm": 0.050839368253946304, + "learning_rate": 0.00011181421306089384, + "loss": 0.2531, + "step": 24532 + }, + { + "epoch": 1.9874432922877512, + "grad_norm": 0.0506996251642704, + "learning_rate": 0.00011180971240829921, + "loss": 0.2341, + "step": 24533 + }, + { + "epoch": 1.9875243033052494, + "grad_norm": 0.07351569086313248, + "learning_rate": 0.00011180521175570457, + "loss": 0.2372, + "step": 24534 + }, + { + "epoch": 1.987605314322748, + "grad_norm": 0.07951635867357254, + "learning_rate": 0.00011180071110310994, + "loss": 0.2614, + "step": 24535 + }, + { + "epoch": 1.9876863253402464, + "grad_norm": 0.09023235738277435, + "learning_rate": 0.00011179621045051534, + "loss": 0.294, + "step": 24536 + }, + { + "epoch": 1.9877673363577446, + "grad_norm": 0.06043418496847153, + "learning_rate": 0.0001117917097979207, + "loss": 0.2703, + "step": 24537 + }, + { + "epoch": 1.987848347375243, + "grad_norm": 0.06518200039863586, + "learning_rate": 0.00011178720914532608, + "loss": 0.2742, + "step": 24538 + }, + { + "epoch": 1.9879293583927415, + "grad_norm": 0.07014909386634827, + "learning_rate": 0.00011178270849273145, + "loss": 0.3306, + "step": 24539 + }, + { + "epoch": 1.9880103694102398, + "grad_norm": 0.06112273782491684, + "learning_rate": 0.00011177820784013681, + "loss": 0.2638, + "step": 24540 + }, + { + "epoch": 1.988091380427738, + "grad_norm": 0.05691821873188019, + "learning_rate": 0.00011177370718754221, + "loss": 0.2354, + "step": 24541 + }, + { + "epoch": 1.9881723914452365, + "grad_norm": 0.057806871831417084, + "learning_rate": 0.00011176920653494758, + "loss": 0.2906, + "step": 24542 + }, + { + "epoch": 1.988253402462735, + "grad_norm": 0.05050506070256233, + "learning_rate": 0.00011176470588235294, + "loss": 0.249, + "step": 24543 + }, + { + "epoch": 1.9883344134802332, + "grad_norm": 0.056343384087085724, + "learning_rate": 0.00011176020522975832, + "loss": 0.2296, + "step": 24544 + }, + { + "epoch": 
1.9884154244977317, + "grad_norm": 0.049676354974508286, + "learning_rate": 0.00011175570457716369, + "loss": 0.2453, + "step": 24545 + }, + { + "epoch": 1.9884964355152301, + "grad_norm": 0.062451086938381195, + "learning_rate": 0.00011175120392456905, + "loss": 0.302, + "step": 24546 + }, + { + "epoch": 1.9885774465327284, + "grad_norm": 0.05744845047593117, + "learning_rate": 0.00011174670327197445, + "loss": 0.2519, + "step": 24547 + }, + { + "epoch": 1.9886584575502269, + "grad_norm": 0.055483151227235794, + "learning_rate": 0.00011174220261937983, + "loss": 0.2436, + "step": 24548 + }, + { + "epoch": 1.9887394685677253, + "grad_norm": 0.0638405978679657, + "learning_rate": 0.00011173770196678519, + "loss": 0.2653, + "step": 24549 + }, + { + "epoch": 1.9888204795852236, + "grad_norm": 0.05625903606414795, + "learning_rate": 0.00011173320131419056, + "loss": 0.2677, + "step": 24550 + }, + { + "epoch": 1.9889014906027218, + "grad_norm": 0.061679109930992126, + "learning_rate": 0.00011172870066159593, + "loss": 0.287, + "step": 24551 + }, + { + "epoch": 1.9889825016202205, + "grad_norm": 0.05694204196333885, + "learning_rate": 0.00011172420000900131, + "loss": 0.2265, + "step": 24552 + }, + { + "epoch": 1.9890635126377187, + "grad_norm": 0.07806746661663055, + "learning_rate": 0.0001117196993564067, + "loss": 0.2172, + "step": 24553 + }, + { + "epoch": 1.989144523655217, + "grad_norm": 0.048661310225725174, + "learning_rate": 0.00011171519870381207, + "loss": 0.2383, + "step": 24554 + }, + { + "epoch": 1.9892255346727155, + "grad_norm": 0.06622835993766785, + "learning_rate": 0.00011171069805121743, + "loss": 0.2882, + "step": 24555 + }, + { + "epoch": 1.989306545690214, + "grad_norm": 0.05554422736167908, + "learning_rate": 0.0001117061973986228, + "loss": 0.2619, + "step": 24556 + }, + { + "epoch": 1.9893875567077122, + "grad_norm": 0.052647799253463745, + "learning_rate": 0.00011170169674602818, + "loss": 0.2794, + "step": 24557 + }, + { + "epoch": 1.9894685677252106, + "grad_norm": 0.060399044305086136, + "learning_rate": 0.00011169719609343355, + "loss": 0.2619, + "step": 24558 + }, + { + "epoch": 1.989549578742709, + "grad_norm": 0.05348338559269905, + "learning_rate": 0.00011169269544083894, + "loss": 0.2545, + "step": 24559 + }, + { + "epoch": 1.9896305897602073, + "grad_norm": 0.06820795685052872, + "learning_rate": 0.00011168819478824431, + "loss": 0.2736, + "step": 24560 + }, + { + "epoch": 1.9897116007777058, + "grad_norm": 0.06192629784345627, + "learning_rate": 0.00011168369413564967, + "loss": 0.2742, + "step": 24561 + }, + { + "epoch": 1.9897926117952043, + "grad_norm": 0.0603586845099926, + "learning_rate": 0.00011167919348305504, + "loss": 0.2561, + "step": 24562 + }, + { + "epoch": 1.9898736228127025, + "grad_norm": 0.055726949125528336, + "learning_rate": 0.00011167469283046042, + "loss": 0.2532, + "step": 24563 + }, + { + "epoch": 1.9899546338302008, + "grad_norm": 0.06763040274381638, + "learning_rate": 0.0001116701921778658, + "loss": 0.2934, + "step": 24564 + }, + { + "epoch": 1.9900356448476992, + "grad_norm": 0.07034408301115036, + "learning_rate": 0.00011166569152527118, + "loss": 0.2619, + "step": 24565 + }, + { + "epoch": 1.9901166558651977, + "grad_norm": 0.07477734982967377, + "learning_rate": 0.00011166119087267655, + "loss": 0.2686, + "step": 24566 + }, + { + "epoch": 1.990197666882696, + "grad_norm": 0.05332627892494202, + "learning_rate": 0.00011165669022008191, + "loss": 0.2509, + "step": 24567 + }, + { + "epoch": 1.9902786779001944, + "grad_norm": 
0.05626533553004265, + "learning_rate": 0.00011165218956748729, + "loss": 0.3044, + "step": 24568 + }, + { + "epoch": 1.9903596889176929, + "grad_norm": 0.05866987630724907, + "learning_rate": 0.00011164768891489266, + "loss": 0.2438, + "step": 24569 + }, + { + "epoch": 1.9904406999351911, + "grad_norm": 0.05218745395541191, + "learning_rate": 0.00011164318826229805, + "loss": 0.2403, + "step": 24570 + }, + { + "epoch": 1.9905217109526896, + "grad_norm": 0.06421063095331192, + "learning_rate": 0.00011163868760970342, + "loss": 0.2867, + "step": 24571 + }, + { + "epoch": 1.990602721970188, + "grad_norm": 0.057032715529203415, + "learning_rate": 0.00011163418695710879, + "loss": 0.2647, + "step": 24572 + }, + { + "epoch": 1.9906837329876863, + "grad_norm": 0.050951577723026276, + "learning_rate": 0.00011162968630451415, + "loss": 0.2686, + "step": 24573 + }, + { + "epoch": 1.9907647440051845, + "grad_norm": 0.06326782703399658, + "learning_rate": 0.00011162518565191953, + "loss": 0.269, + "step": 24574 + }, + { + "epoch": 1.9908457550226832, + "grad_norm": 0.062102437019348145, + "learning_rate": 0.0001116206849993249, + "loss": 0.2342, + "step": 24575 + }, + { + "epoch": 1.9909267660401815, + "grad_norm": 0.07285258173942566, + "learning_rate": 0.00011161618434673029, + "loss": 0.2705, + "step": 24576 + }, + { + "epoch": 1.9910077770576797, + "grad_norm": 0.06593865901231766, + "learning_rate": 0.00011161168369413566, + "loss": 0.269, + "step": 24577 + }, + { + "epoch": 1.9910887880751782, + "grad_norm": 0.06808798760175705, + "learning_rate": 0.00011160718304154103, + "loss": 0.2833, + "step": 24578 + }, + { + "epoch": 1.9911697990926767, + "grad_norm": 0.06346582621335983, + "learning_rate": 0.0001116026823889464, + "loss": 0.2162, + "step": 24579 + }, + { + "epoch": 1.991250810110175, + "grad_norm": 0.05143898352980614, + "learning_rate": 0.00011159818173635177, + "loss": 0.2506, + "step": 24580 + }, + { + "epoch": 1.9913318211276734, + "grad_norm": 0.05779607594013214, + "learning_rate": 0.00011159368108375714, + "loss": 0.3119, + "step": 24581 + }, + { + "epoch": 1.9914128321451718, + "grad_norm": 0.06664140522480011, + "learning_rate": 0.00011158918043116253, + "loss": 0.269, + "step": 24582 + }, + { + "epoch": 1.99149384316267, + "grad_norm": 0.0591379813849926, + "learning_rate": 0.0001115846797785679, + "loss": 0.2791, + "step": 24583 + }, + { + "epoch": 1.9915748541801686, + "grad_norm": 0.058772388845682144, + "learning_rate": 0.00011158017912597328, + "loss": 0.2951, + "step": 24584 + }, + { + "epoch": 1.991655865197667, + "grad_norm": 0.05919862911105156, + "learning_rate": 0.00011157567847337864, + "loss": 0.2522, + "step": 24585 + }, + { + "epoch": 1.9917368762151653, + "grad_norm": 0.06193426623940468, + "learning_rate": 0.00011157117782078401, + "loss": 0.3214, + "step": 24586 + }, + { + "epoch": 1.9918178872326635, + "grad_norm": 0.062030475586652756, + "learning_rate": 0.00011156667716818938, + "loss": 0.2391, + "step": 24587 + }, + { + "epoch": 1.991898898250162, + "grad_norm": 0.06320808082818985, + "learning_rate": 0.00011156217651559477, + "loss": 0.2907, + "step": 24588 + }, + { + "epoch": 1.9919799092676604, + "grad_norm": 0.057537805289030075, + "learning_rate": 0.00011155767586300014, + "loss": 0.2469, + "step": 24589 + }, + { + "epoch": 1.9920609202851587, + "grad_norm": 0.057037729769945145, + "learning_rate": 0.00011155317521040552, + "loss": 0.2696, + "step": 24590 + }, + { + "epoch": 1.9921419313026572, + "grad_norm": 0.05443946272134781, + "learning_rate": 
0.00011154867455781088, + "loss": 0.2441, + "step": 24591 + }, + { + "epoch": 1.9922229423201556, + "grad_norm": 0.060619425028562546, + "learning_rate": 0.00011154417390521625, + "loss": 0.2568, + "step": 24592 + }, + { + "epoch": 1.9923039533376539, + "grad_norm": 0.06094972416758537, + "learning_rate": 0.00011153967325262165, + "loss": 0.2774, + "step": 24593 + }, + { + "epoch": 1.9923849643551523, + "grad_norm": 0.06160997226834297, + "learning_rate": 0.00011153517260002701, + "loss": 0.2604, + "step": 24594 + }, + { + "epoch": 1.9924659753726508, + "grad_norm": 0.06207242235541344, + "learning_rate": 0.00011153067194743239, + "loss": 0.2543, + "step": 24595 + }, + { + "epoch": 1.992546986390149, + "grad_norm": 0.077674999833107, + "learning_rate": 0.00011152617129483776, + "loss": 0.2803, + "step": 24596 + }, + { + "epoch": 1.9926279974076473, + "grad_norm": 0.0795058086514473, + "learning_rate": 0.00011152167064224312, + "loss": 0.3187, + "step": 24597 + }, + { + "epoch": 1.992709008425146, + "grad_norm": 0.07428909093141556, + "learning_rate": 0.00011151716998964849, + "loss": 0.3045, + "step": 24598 + }, + { + "epoch": 1.9927900194426442, + "grad_norm": 0.044681303203105927, + "learning_rate": 0.0001115126693370539, + "loss": 0.2241, + "step": 24599 + }, + { + "epoch": 1.9928710304601425, + "grad_norm": 0.07117260992527008, + "learning_rate": 0.00011150816868445925, + "loss": 0.2866, + "step": 24600 + }, + { + "epoch": 1.992952041477641, + "grad_norm": 0.06624329090118408, + "learning_rate": 0.00011150366803186463, + "loss": 0.3041, + "step": 24601 + }, + { + "epoch": 1.9930330524951394, + "grad_norm": 0.06437989324331284, + "learning_rate": 0.00011149916737927, + "loss": 0.3017, + "step": 24602 + }, + { + "epoch": 1.9931140635126376, + "grad_norm": 0.052165914326906204, + "learning_rate": 0.00011149466672667536, + "loss": 0.2566, + "step": 24603 + }, + { + "epoch": 1.9931950745301361, + "grad_norm": 0.06342125684022903, + "learning_rate": 0.00011149016607408073, + "loss": 0.2759, + "step": 24604 + }, + { + "epoch": 1.9932760855476346, + "grad_norm": 0.05953681841492653, + "learning_rate": 0.00011148566542148614, + "loss": 0.2445, + "step": 24605 + }, + { + "epoch": 1.9933570965651328, + "grad_norm": 0.05627750977873802, + "learning_rate": 0.0001114811647688915, + "loss": 0.2747, + "step": 24606 + }, + { + "epoch": 1.9934381075826313, + "grad_norm": 0.07000666856765747, + "learning_rate": 0.00011147666411629687, + "loss": 0.2832, + "step": 24607 + }, + { + "epoch": 1.9935191186001298, + "grad_norm": 0.06306980550289154, + "learning_rate": 0.00011147216346370224, + "loss": 0.2462, + "step": 24608 + }, + { + "epoch": 1.993600129617628, + "grad_norm": 0.06391213089227676, + "learning_rate": 0.0001114676628111076, + "loss": 0.261, + "step": 24609 + }, + { + "epoch": 1.9936811406351262, + "grad_norm": 0.06361900269985199, + "learning_rate": 0.00011146316215851298, + "loss": 0.2847, + "step": 24610 + }, + { + "epoch": 1.9937621516526247, + "grad_norm": 0.05595242604613304, + "learning_rate": 0.00011145866150591838, + "loss": 0.2612, + "step": 24611 + }, + { + "epoch": 1.9938431626701232, + "grad_norm": 0.05720612406730652, + "learning_rate": 0.00011145416085332374, + "loss": 0.2928, + "step": 24612 + }, + { + "epoch": 1.9939241736876214, + "grad_norm": 0.07067383825778961, + "learning_rate": 0.00011144966020072911, + "loss": 0.2921, + "step": 24613 + }, + { + "epoch": 1.99400518470512, + "grad_norm": 0.06673353910446167, + "learning_rate": 0.00011144515954813448, + "loss": 0.2906, + 
"step": 24614 + }, + { + "epoch": 1.9940861957226184, + "grad_norm": 0.06661546230316162, + "learning_rate": 0.00011144065889553986, + "loss": 0.284, + "step": 24615 + }, + { + "epoch": 1.9941672067401166, + "grad_norm": 0.06102203577756882, + "learning_rate": 0.00011143615824294525, + "loss": 0.269, + "step": 24616 + }, + { + "epoch": 1.994248217757615, + "grad_norm": 0.055220767855644226, + "learning_rate": 0.00011143165759035062, + "loss": 0.2439, + "step": 24617 + }, + { + "epoch": 1.9943292287751135, + "grad_norm": 0.05387234315276146, + "learning_rate": 0.00011142715693775598, + "loss": 0.225, + "step": 24618 + }, + { + "epoch": 1.9944102397926118, + "grad_norm": 0.05761054530739784, + "learning_rate": 0.00011142265628516135, + "loss": 0.3081, + "step": 24619 + }, + { + "epoch": 1.99449125081011, + "grad_norm": 0.06527112424373627, + "learning_rate": 0.00011141815563256673, + "loss": 0.2999, + "step": 24620 + }, + { + "epoch": 1.9945722618276087, + "grad_norm": 0.06231717765331268, + "learning_rate": 0.0001114136549799721, + "loss": 0.2853, + "step": 24621 + }, + { + "epoch": 1.994653272845107, + "grad_norm": 0.05752355605363846, + "learning_rate": 0.00011140915432737749, + "loss": 0.2365, + "step": 24622 + }, + { + "epoch": 1.9947342838626052, + "grad_norm": 0.05275258794426918, + "learning_rate": 0.00011140465367478286, + "loss": 0.2559, + "step": 24623 + }, + { + "epoch": 1.9948152948801037, + "grad_norm": 0.048421286046504974, + "learning_rate": 0.00011140015302218822, + "loss": 0.2261, + "step": 24624 + }, + { + "epoch": 1.9948963058976021, + "grad_norm": 0.05411406606435776, + "learning_rate": 0.0001113956523695936, + "loss": 0.2123, + "step": 24625 + }, + { + "epoch": 1.9949773169151004, + "grad_norm": 0.06282738596200943, + "learning_rate": 0.00011139115171699897, + "loss": 0.2899, + "step": 24626 + }, + { + "epoch": 1.9950583279325989, + "grad_norm": 0.058103177696466446, + "learning_rate": 0.00011138665106440434, + "loss": 0.3021, + "step": 24627 + }, + { + "epoch": 1.9951393389500973, + "grad_norm": 0.059910792857408524, + "learning_rate": 0.00011138215041180973, + "loss": 0.2864, + "step": 24628 + }, + { + "epoch": 1.9952203499675956, + "grad_norm": 0.061971426010131836, + "learning_rate": 0.0001113776497592151, + "loss": 0.2642, + "step": 24629 + }, + { + "epoch": 1.9953013609850938, + "grad_norm": 0.05655450001358986, + "learning_rate": 0.00011137314910662046, + "loss": 0.3023, + "step": 24630 + }, + { + "epoch": 1.9953823720025925, + "grad_norm": 0.06409027427434921, + "learning_rate": 0.00011136864845402584, + "loss": 0.2386, + "step": 24631 + }, + { + "epoch": 1.9954633830200907, + "grad_norm": 0.06355836242437363, + "learning_rate": 0.00011136414780143121, + "loss": 0.2617, + "step": 24632 + }, + { + "epoch": 1.995544394037589, + "grad_norm": 0.056723617017269135, + "learning_rate": 0.00011135964714883658, + "loss": 0.2586, + "step": 24633 + }, + { + "epoch": 1.9956254050550875, + "grad_norm": 0.06595166772603989, + "learning_rate": 0.00011135514649624197, + "loss": 0.2874, + "step": 24634 + }, + { + "epoch": 1.995706416072586, + "grad_norm": 0.06310024857521057, + "learning_rate": 0.00011135064584364734, + "loss": 0.2506, + "step": 24635 + }, + { + "epoch": 1.9957874270900842, + "grad_norm": 0.0638793483376503, + "learning_rate": 0.0001113461451910527, + "loss": 0.266, + "step": 24636 + }, + { + "epoch": 1.9958684381075826, + "grad_norm": 0.05258272960782051, + "learning_rate": 0.00011134164453845808, + "loss": 0.2736, + "step": 24637 + }, + { + "epoch": 
1.995949449125081, + "grad_norm": 0.0778241902589798, + "learning_rate": 0.00011133714388586345, + "loss": 0.3137, + "step": 24638 + }, + { + "epoch": 1.9960304601425793, + "grad_norm": 0.06420960277318954, + "learning_rate": 0.00011133264323326882, + "loss": 0.2783, + "step": 24639 + }, + { + "epoch": 1.9961114711600778, + "grad_norm": 0.06127495691180229, + "learning_rate": 0.00011132814258067421, + "loss": 0.2688, + "step": 24640 + }, + { + "epoch": 1.9961924821775763, + "grad_norm": 0.06163187325000763, + "learning_rate": 0.00011132364192807959, + "loss": 0.2524, + "step": 24641 + }, + { + "epoch": 1.9962734931950745, + "grad_norm": 0.06623512506484985, + "learning_rate": 0.00011131914127548495, + "loss": 0.2536, + "step": 24642 + }, + { + "epoch": 1.9963545042125728, + "grad_norm": 0.05344029515981674, + "learning_rate": 0.00011131464062289032, + "loss": 0.2577, + "step": 24643 + }, + { + "epoch": 1.9964355152300715, + "grad_norm": 0.05709357187151909, + "learning_rate": 0.00011131013997029569, + "loss": 0.2525, + "step": 24644 + }, + { + "epoch": 1.9965165262475697, + "grad_norm": 0.05957780033349991, + "learning_rate": 0.00011130563931770108, + "loss": 0.2777, + "step": 24645 + }, + { + "epoch": 1.996597537265068, + "grad_norm": 0.06133737415075302, + "learning_rate": 0.00011130113866510645, + "loss": 0.2333, + "step": 24646 + }, + { + "epoch": 1.9966785482825664, + "grad_norm": 0.06680312752723694, + "learning_rate": 0.00011129663801251183, + "loss": 0.2544, + "step": 24647 + }, + { + "epoch": 1.9967595593000649, + "grad_norm": 0.04961247742176056, + "learning_rate": 0.00011129213735991719, + "loss": 0.2551, + "step": 24648 + }, + { + "epoch": 1.9968405703175631, + "grad_norm": 0.05387548357248306, + "learning_rate": 0.00011128763670732256, + "loss": 0.2646, + "step": 24649 + }, + { + "epoch": 1.9969215813350616, + "grad_norm": 0.07167568057775497, + "learning_rate": 0.00011128313605472793, + "loss": 0.2806, + "step": 24650 + }, + { + "epoch": 1.99700259235256, + "grad_norm": 0.05495288595557213, + "learning_rate": 0.00011127863540213332, + "loss": 0.2527, + "step": 24651 + }, + { + "epoch": 1.9970836033700583, + "grad_norm": 0.07860712707042694, + "learning_rate": 0.0001112741347495387, + "loss": 0.3519, + "step": 24652 + }, + { + "epoch": 1.9971646143875565, + "grad_norm": 0.05974472314119339, + "learning_rate": 0.00011126963409694407, + "loss": 0.2416, + "step": 24653 + }, + { + "epoch": 1.9972456254050552, + "grad_norm": 0.0708412453532219, + "learning_rate": 0.00011126513344434943, + "loss": 0.3165, + "step": 24654 + }, + { + "epoch": 1.9973266364225535, + "grad_norm": 0.04328044131398201, + "learning_rate": 0.0001112606327917548, + "loss": 0.2463, + "step": 24655 + }, + { + "epoch": 1.9974076474400517, + "grad_norm": 0.054907847195863724, + "learning_rate": 0.00011125613213916018, + "loss": 0.2841, + "step": 24656 + }, + { + "epoch": 1.9974886584575502, + "grad_norm": 0.05406655743718147, + "learning_rate": 0.00011125163148656556, + "loss": 0.2728, + "step": 24657 + }, + { + "epoch": 1.9975696694750487, + "grad_norm": 0.05375692993402481, + "learning_rate": 0.00011124713083397094, + "loss": 0.3063, + "step": 24658 + }, + { + "epoch": 1.997650680492547, + "grad_norm": 0.05375174060463905, + "learning_rate": 0.00011124263018137631, + "loss": 0.259, + "step": 24659 + }, + { + "epoch": 1.9977316915100454, + "grad_norm": 0.05399360880255699, + "learning_rate": 0.00011123812952878167, + "loss": 0.2399, + "step": 24660 + }, + { + "epoch": 1.9978127025275438, + "grad_norm": 
0.0648183673620224, + "learning_rate": 0.00011123362887618704, + "loss": 0.2597, + "step": 24661 + }, + { + "epoch": 1.997893713545042, + "grad_norm": 0.054348211735486984, + "learning_rate": 0.00011122912822359242, + "loss": 0.2603, + "step": 24662 + }, + { + "epoch": 1.9979747245625405, + "grad_norm": 0.07106921076774597, + "learning_rate": 0.0001112246275709978, + "loss": 0.3093, + "step": 24663 + }, + { + "epoch": 1.998055735580039, + "grad_norm": 0.05705713853240013, + "learning_rate": 0.00011122012691840318, + "loss": 0.2544, + "step": 24664 + }, + { + "epoch": 1.9981367465975373, + "grad_norm": 0.05977337807416916, + "learning_rate": 0.00011121562626580855, + "loss": 0.2715, + "step": 24665 + }, + { + "epoch": 1.9982177576150355, + "grad_norm": 0.05743882805109024, + "learning_rate": 0.00011121112561321391, + "loss": 0.2842, + "step": 24666 + }, + { + "epoch": 1.998298768632534, + "grad_norm": 0.056074030697345734, + "learning_rate": 0.00011120662496061929, + "loss": 0.2544, + "step": 24667 + }, + { + "epoch": 1.9983797796500324, + "grad_norm": 0.05134023725986481, + "learning_rate": 0.00011120212430802469, + "loss": 0.2757, + "step": 24668 + }, + { + "epoch": 1.9984607906675307, + "grad_norm": 0.0737508237361908, + "learning_rate": 0.00011119762365543005, + "loss": 0.2564, + "step": 24669 + }, + { + "epoch": 1.9985418016850292, + "grad_norm": 0.06467597186565399, + "learning_rate": 0.00011119312300283542, + "loss": 0.3089, + "step": 24670 + }, + { + "epoch": 1.9986228127025276, + "grad_norm": 0.06818606704473495, + "learning_rate": 0.0001111886223502408, + "loss": 0.256, + "step": 24671 + }, + { + "epoch": 1.9987038237200259, + "grad_norm": 0.06933887302875519, + "learning_rate": 0.00011118412169764615, + "loss": 0.2743, + "step": 24672 + }, + { + "epoch": 1.9987848347375243, + "grad_norm": 0.05665956437587738, + "learning_rate": 0.00011117962104505153, + "loss": 0.2242, + "step": 24673 + }, + { + "epoch": 1.9988658457550228, + "grad_norm": 0.05923176556825638, + "learning_rate": 0.00011117512039245693, + "loss": 0.301, + "step": 24674 + }, + { + "epoch": 1.998946856772521, + "grad_norm": 0.054322510957717896, + "learning_rate": 0.00011117061973986229, + "loss": 0.2436, + "step": 24675 + }, + { + "epoch": 1.9990278677900193, + "grad_norm": 0.05147448554635048, + "learning_rate": 0.00011116611908726766, + "loss": 0.229, + "step": 24676 + }, + { + "epoch": 1.999108878807518, + "grad_norm": 0.06228252127766609, + "learning_rate": 0.00011116161843467303, + "loss": 0.2484, + "step": 24677 + }, + { + "epoch": 1.9991898898250162, + "grad_norm": 0.04767598584294319, + "learning_rate": 0.0001111571177820784, + "loss": 0.2757, + "step": 24678 + }, + { + "epoch": 1.9992709008425145, + "grad_norm": 0.04789305478334427, + "learning_rate": 0.00011115261712948377, + "loss": 0.2455, + "step": 24679 + }, + { + "epoch": 1.999351911860013, + "grad_norm": 0.06482335180044174, + "learning_rate": 0.00011114811647688917, + "loss": 0.2898, + "step": 24680 + }, + { + "epoch": 1.9994329228775114, + "grad_norm": 0.057793837040662766, + "learning_rate": 0.00011114361582429453, + "loss": 0.2517, + "step": 24681 + }, + { + "epoch": 1.9995139338950096, + "grad_norm": 0.05205461010336876, + "learning_rate": 0.0001111391151716999, + "loss": 0.2323, + "step": 24682 + }, + { + "epoch": 1.999594944912508, + "grad_norm": 0.05734451487660408, + "learning_rate": 0.00011113461451910528, + "loss": 0.2746, + "step": 24683 + }, + { + "epoch": 1.9996759559300066, + "grad_norm": 0.062195006757974625, + "learning_rate": 
0.00011113011386651065, + "loss": 0.2669, + "step": 24684 + }, + { + "epoch": 1.9997569669475048, + "grad_norm": 0.05192793905735016, + "learning_rate": 0.00011112561321391601, + "loss": 0.2802, + "step": 24685 + }, + { + "epoch": 1.9998379779650033, + "grad_norm": 0.06143191456794739, + "learning_rate": 0.00011112111256132141, + "loss": 0.311, + "step": 24686 + }, + { + "epoch": 1.9999189889825018, + "grad_norm": 0.054254207760095596, + "learning_rate": 0.00011111661190872677, + "loss": 0.2842, + "step": 24687 + }, + { + "epoch": 2.0, + "grad_norm": 0.04564838111400604, + "learning_rate": 0.00011111211125613214, + "loss": 0.1993, + "step": 24688 + }, + { + "epoch": 2.0000810110174982, + "grad_norm": 0.05824628099799156, + "learning_rate": 0.00011110761060353752, + "loss": 0.2654, + "step": 24689 + }, + { + "epoch": 2.000162022034997, + "grad_norm": 0.06120525300502777, + "learning_rate": 0.00011110310995094289, + "loss": 0.2628, + "step": 24690 + }, + { + "epoch": 2.000243033052495, + "grad_norm": 0.05350266024470329, + "learning_rate": 0.00011109860929834825, + "loss": 0.2325, + "step": 24691 + }, + { + "epoch": 2.0003240440699934, + "grad_norm": 0.06434735655784607, + "learning_rate": 0.00011109410864575365, + "loss": 0.2442, + "step": 24692 + }, + { + "epoch": 2.000405055087492, + "grad_norm": 0.07097124308347702, + "learning_rate": 0.00011108960799315901, + "loss": 0.2748, + "step": 24693 + }, + { + "epoch": 2.0004860661049904, + "grad_norm": 0.06032516807317734, + "learning_rate": 0.00011108510734056439, + "loss": 0.2436, + "step": 24694 + }, + { + "epoch": 2.0005670771224886, + "grad_norm": 0.0629146546125412, + "learning_rate": 0.00011108060668796976, + "loss": 0.3101, + "step": 24695 + }, + { + "epoch": 2.000648088139987, + "grad_norm": 0.052501216530799866, + "learning_rate": 0.00011107610603537513, + "loss": 0.2384, + "step": 24696 + }, + { + "epoch": 2.0007290991574855, + "grad_norm": 0.06095361337065697, + "learning_rate": 0.00011107160538278052, + "loss": 0.2698, + "step": 24697 + }, + { + "epoch": 2.000810110174984, + "grad_norm": 0.05067559704184532, + "learning_rate": 0.0001110671047301859, + "loss": 0.2674, + "step": 24698 + }, + { + "epoch": 2.000891121192482, + "grad_norm": 0.06133396923542023, + "learning_rate": 0.00011106260407759125, + "loss": 0.2594, + "step": 24699 + }, + { + "epoch": 2.0009721322099807, + "grad_norm": 0.07269486784934998, + "learning_rate": 0.00011105810342499663, + "loss": 0.2731, + "step": 24700 + }, + { + "epoch": 2.001053143227479, + "grad_norm": 0.054098621010780334, + "learning_rate": 0.000111053602772402, + "loss": 0.2434, + "step": 24701 + }, + { + "epoch": 2.001134154244977, + "grad_norm": 0.048134684562683105, + "learning_rate": 0.00011104910211980737, + "loss": 0.2278, + "step": 24702 + }, + { + "epoch": 2.001215165262476, + "grad_norm": 0.05443552881479263, + "learning_rate": 0.00011104460146721276, + "loss": 0.2295, + "step": 24703 + }, + { + "epoch": 2.001296176279974, + "grad_norm": 0.0662059560418129, + "learning_rate": 0.00011104010081461814, + "loss": 0.2592, + "step": 24704 + }, + { + "epoch": 2.0013771872974724, + "grad_norm": 0.052790913730859756, + "learning_rate": 0.0001110356001620235, + "loss": 0.2463, + "step": 24705 + }, + { + "epoch": 2.0014581983149706, + "grad_norm": 0.05461428686976433, + "learning_rate": 0.00011103109950942887, + "loss": 0.2431, + "step": 24706 + }, + { + "epoch": 2.0015392093324693, + "grad_norm": 0.05621863156557083, + "learning_rate": 0.00011102659885683424, + "loss": 0.2484, + "step": 24707 + }, 
+ { + "epoch": 2.0016202203499676, + "grad_norm": 0.06086277589201927, + "learning_rate": 0.00011102209820423962, + "loss": 0.2532, + "step": 24708 + }, + { + "epoch": 2.001701231367466, + "grad_norm": 0.06324207782745361, + "learning_rate": 0.000111017597551645, + "loss": 0.2349, + "step": 24709 + }, + { + "epoch": 2.0017822423849645, + "grad_norm": 0.057401739060878754, + "learning_rate": 0.00011101309689905038, + "loss": 0.2737, + "step": 24710 + }, + { + "epoch": 2.0018632534024627, + "grad_norm": 0.07243891805410385, + "learning_rate": 0.00011100859624645574, + "loss": 0.2514, + "step": 24711 + }, + { + "epoch": 2.001944264419961, + "grad_norm": 0.06032774597406387, + "learning_rate": 0.00011100409559386111, + "loss": 0.2515, + "step": 24712 + }, + { + "epoch": 2.0020252754374597, + "grad_norm": 0.06539171189069748, + "learning_rate": 0.00011099959494126648, + "loss": 0.2429, + "step": 24713 + }, + { + "epoch": 2.002106286454958, + "grad_norm": 0.06766407936811447, + "learning_rate": 0.00011099509428867186, + "loss": 0.2121, + "step": 24714 + }, + { + "epoch": 2.002187297472456, + "grad_norm": 0.05758608505129814, + "learning_rate": 0.00011099059363607725, + "loss": 0.267, + "step": 24715 + }, + { + "epoch": 2.002268308489955, + "grad_norm": 0.061857450753450394, + "learning_rate": 0.00011098609298348262, + "loss": 0.2626, + "step": 24716 + }, + { + "epoch": 2.002349319507453, + "grad_norm": 0.06130329146981239, + "learning_rate": 0.00011098159233088798, + "loss": 0.2481, + "step": 24717 + }, + { + "epoch": 2.0024303305249513, + "grad_norm": 0.0638541579246521, + "learning_rate": 0.00011097709167829335, + "loss": 0.2778, + "step": 24718 + }, + { + "epoch": 2.0025113415424496, + "grad_norm": 0.05980760231614113, + "learning_rate": 0.00011097259102569873, + "loss": 0.2796, + "step": 24719 + }, + { + "epoch": 2.0025923525599483, + "grad_norm": 0.06339839845895767, + "learning_rate": 0.0001109680903731041, + "loss": 0.2772, + "step": 24720 + }, + { + "epoch": 2.0026733635774465, + "grad_norm": 0.0550805926322937, + "learning_rate": 0.00011096358972050949, + "loss": 0.2452, + "step": 24721 + }, + { + "epoch": 2.0027543745949448, + "grad_norm": 0.06922931969165802, + "learning_rate": 0.00011095908906791486, + "loss": 0.2714, + "step": 24722 + }, + { + "epoch": 2.0028353856124435, + "grad_norm": 0.05844055488705635, + "learning_rate": 0.00011095458841532022, + "loss": 0.2589, + "step": 24723 + }, + { + "epoch": 2.0029163966299417, + "grad_norm": 0.0715651586651802, + "learning_rate": 0.0001109500877627256, + "loss": 0.2519, + "step": 24724 + }, + { + "epoch": 2.00299740764744, + "grad_norm": 0.056815255433321, + "learning_rate": 0.00011094558711013097, + "loss": 0.2375, + "step": 24725 + }, + { + "epoch": 2.0030784186649386, + "grad_norm": 0.07449610531330109, + "learning_rate": 0.00011094108645753636, + "loss": 0.2598, + "step": 24726 + }, + { + "epoch": 2.003159429682437, + "grad_norm": 0.059494923800230026, + "learning_rate": 0.00011093658580494173, + "loss": 0.2419, + "step": 24727 + }, + { + "epoch": 2.003240440699935, + "grad_norm": 0.06169473007321358, + "learning_rate": 0.0001109320851523471, + "loss": 0.3016, + "step": 24728 + }, + { + "epoch": 2.0033214517174334, + "grad_norm": 0.05887840688228607, + "learning_rate": 0.00011092758449975246, + "loss": 0.2653, + "step": 24729 + }, + { + "epoch": 2.003402462734932, + "grad_norm": 0.05909749120473862, + "learning_rate": 0.00011092308384715784, + "loss": 0.2754, + "step": 24730 + }, + { + "epoch": 2.0034834737524303, + "grad_norm": 
0.05189096927642822, + "learning_rate": 0.00011091858319456321, + "loss": 0.2508, + "step": 24731 + }, + { + "epoch": 2.0035644847699285, + "grad_norm": 0.0566328763961792, + "learning_rate": 0.0001109140825419686, + "loss": 0.2704, + "step": 24732 + }, + { + "epoch": 2.0036454957874272, + "grad_norm": 0.060474324971437454, + "learning_rate": 0.00011090958188937397, + "loss": 0.2717, + "step": 24733 + }, + { + "epoch": 2.0037265068049255, + "grad_norm": 0.07984836399555206, + "learning_rate": 0.00011090508123677934, + "loss": 0.3446, + "step": 24734 + }, + { + "epoch": 2.0038075178224237, + "grad_norm": 0.056504566222429276, + "learning_rate": 0.0001109005805841847, + "loss": 0.2756, + "step": 24735 + }, + { + "epoch": 2.0038885288399224, + "grad_norm": 0.05933867767453194, + "learning_rate": 0.00011089607993159008, + "loss": 0.2581, + "step": 24736 + }, + { + "epoch": 2.0039695398574207, + "grad_norm": 0.06576526910066605, + "learning_rate": 0.00011089157927899545, + "loss": 0.2791, + "step": 24737 + }, + { + "epoch": 2.004050550874919, + "grad_norm": 0.06752409785985947, + "learning_rate": 0.00011088707862640084, + "loss": 0.2283, + "step": 24738 + }, + { + "epoch": 2.0041315618924176, + "grad_norm": 0.06522194296121597, + "learning_rate": 0.00011088257797380621, + "loss": 0.2802, + "step": 24739 + }, + { + "epoch": 2.004212572909916, + "grad_norm": 0.05219874158501625, + "learning_rate": 0.00011087807732121159, + "loss": 0.2693, + "step": 24740 + }, + { + "epoch": 2.004293583927414, + "grad_norm": 0.06683573126792908, + "learning_rate": 0.00011087357666861695, + "loss": 0.2921, + "step": 24741 + }, + { + "epoch": 2.0043745949449123, + "grad_norm": 0.04907495528459549, + "learning_rate": 0.00011086907601602232, + "loss": 0.2185, + "step": 24742 + }, + { + "epoch": 2.004455605962411, + "grad_norm": 0.06387689709663391, + "learning_rate": 0.00011086457536342769, + "loss": 0.2467, + "step": 24743 + }, + { + "epoch": 2.0045366169799093, + "grad_norm": 0.0585172064602375, + "learning_rate": 0.00011086007471083308, + "loss": 0.2231, + "step": 24744 + }, + { + "epoch": 2.0046176279974075, + "grad_norm": 0.059628698974847794, + "learning_rate": 0.00011085557405823845, + "loss": 0.2674, + "step": 24745 + }, + { + "epoch": 2.004698639014906, + "grad_norm": 0.05630394071340561, + "learning_rate": 0.00011085107340564383, + "loss": 0.2594, + "step": 24746 + }, + { + "epoch": 2.0047796500324044, + "grad_norm": 0.06501825153827667, + "learning_rate": 0.00011084657275304919, + "loss": 0.2527, + "step": 24747 + }, + { + "epoch": 2.0048606610499027, + "grad_norm": 0.07488704472780228, + "learning_rate": 0.00011084207210045456, + "loss": 0.2559, + "step": 24748 + }, + { + "epoch": 2.0049416720674014, + "grad_norm": 0.06376083940267563, + "learning_rate": 0.00011083757144785996, + "loss": 0.2598, + "step": 24749 + }, + { + "epoch": 2.0050226830848996, + "grad_norm": 0.057747989892959595, + "learning_rate": 0.00011083307079526532, + "loss": 0.22, + "step": 24750 + }, + { + "epoch": 2.005103694102398, + "grad_norm": 0.07557959854602814, + "learning_rate": 0.0001108285701426707, + "loss": 0.2561, + "step": 24751 + }, + { + "epoch": 2.005184705119896, + "grad_norm": 0.07208617031574249, + "learning_rate": 0.00011082406949007607, + "loss": 0.2967, + "step": 24752 + }, + { + "epoch": 2.005265716137395, + "grad_norm": 0.058110445737838745, + "learning_rate": 0.00011081956883748144, + "loss": 0.2892, + "step": 24753 + }, + { + "epoch": 2.005346727154893, + "grad_norm": 0.05751382187008858, + "learning_rate": 
0.0001108150681848868, + "loss": 0.2578, + "step": 24754 + }, + { + "epoch": 2.0054277381723913, + "grad_norm": 0.051433827728033066, + "learning_rate": 0.0001108105675322922, + "loss": 0.2281, + "step": 24755 + }, + { + "epoch": 2.00550874918989, + "grad_norm": 0.06940347701311111, + "learning_rate": 0.00011080606687969756, + "loss": 0.2757, + "step": 24756 + }, + { + "epoch": 2.005589760207388, + "grad_norm": 0.05864989757537842, + "learning_rate": 0.00011080156622710294, + "loss": 0.244, + "step": 24757 + }, + { + "epoch": 2.0056707712248865, + "grad_norm": 0.06956269592046738, + "learning_rate": 0.00011079706557450831, + "loss": 0.256, + "step": 24758 + }, + { + "epoch": 2.005751782242385, + "grad_norm": 0.06806553900241852, + "learning_rate": 0.00011079256492191368, + "loss": 0.2792, + "step": 24759 + }, + { + "epoch": 2.0058327932598834, + "grad_norm": 0.05740509182214737, + "learning_rate": 0.00011078806426931904, + "loss": 0.2826, + "step": 24760 + }, + { + "epoch": 2.0059138042773816, + "grad_norm": 0.0648796334862709, + "learning_rate": 0.00011078356361672444, + "loss": 0.2743, + "step": 24761 + }, + { + "epoch": 2.00599481529488, + "grad_norm": 0.06351775676012039, + "learning_rate": 0.0001107790629641298, + "loss": 0.2894, + "step": 24762 + }, + { + "epoch": 2.0060758263123786, + "grad_norm": 0.07129117846488953, + "learning_rate": 0.00011077456231153518, + "loss": 0.2419, + "step": 24763 + }, + { + "epoch": 2.006156837329877, + "grad_norm": 0.05522260442376137, + "learning_rate": 0.00011077006165894055, + "loss": 0.2666, + "step": 24764 + }, + { + "epoch": 2.006237848347375, + "grad_norm": 0.07258357107639313, + "learning_rate": 0.00011076556100634593, + "loss": 0.2418, + "step": 24765 + }, + { + "epoch": 2.0063188593648738, + "grad_norm": 0.0612257681787014, + "learning_rate": 0.00011076106035375129, + "loss": 0.2624, + "step": 24766 + }, + { + "epoch": 2.006399870382372, + "grad_norm": 0.056269869208335876, + "learning_rate": 0.00011075655970115669, + "loss": 0.2558, + "step": 24767 + }, + { + "epoch": 2.0064808813998702, + "grad_norm": 0.058652397245168686, + "learning_rate": 0.00011075205904856205, + "loss": 0.2731, + "step": 24768 + }, + { + "epoch": 2.006561892417369, + "grad_norm": 0.05787859112024307, + "learning_rate": 0.00011074755839596742, + "loss": 0.259, + "step": 24769 + }, + { + "epoch": 2.006642903434867, + "grad_norm": 0.07301168143749237, + "learning_rate": 0.0001107430577433728, + "loss": 0.2736, + "step": 24770 + }, + { + "epoch": 2.0067239144523654, + "grad_norm": 0.05876472592353821, + "learning_rate": 0.00011073855709077817, + "loss": 0.2436, + "step": 24771 + }, + { + "epoch": 2.006804925469864, + "grad_norm": 0.06376785039901733, + "learning_rate": 0.00011073405643818353, + "loss": 0.2371, + "step": 24772 + }, + { + "epoch": 2.0068859364873624, + "grad_norm": 0.05607694014906883, + "learning_rate": 0.00011072955578558893, + "loss": 0.2826, + "step": 24773 + }, + { + "epoch": 2.0069669475048606, + "grad_norm": 0.06324949115514755, + "learning_rate": 0.00011072505513299429, + "loss": 0.2269, + "step": 24774 + }, + { + "epoch": 2.007047958522359, + "grad_norm": 0.06031335890293121, + "learning_rate": 0.00011072055448039966, + "loss": 0.2686, + "step": 24775 + }, + { + "epoch": 2.0071289695398575, + "grad_norm": 0.04615411534905434, + "learning_rate": 0.00011071605382780504, + "loss": 0.2397, + "step": 24776 + }, + { + "epoch": 2.0072099805573558, + "grad_norm": 0.06102414056658745, + "learning_rate": 0.00011071155317521041, + "loss": 0.305, + "step": 
24777 + }, + { + "epoch": 2.007290991574854, + "grad_norm": 0.05530553683638573, + "learning_rate": 0.0001107070525226158, + "loss": 0.2567, + "step": 24778 + }, + { + "epoch": 2.0073720025923527, + "grad_norm": 0.05917354300618172, + "learning_rate": 0.00011070255187002117, + "loss": 0.2733, + "step": 24779 + }, + { + "epoch": 2.007453013609851, + "grad_norm": 0.06167829409241676, + "learning_rate": 0.00011069805121742653, + "loss": 0.2323, + "step": 24780 + }, + { + "epoch": 2.007534024627349, + "grad_norm": 0.06296923011541367, + "learning_rate": 0.0001106935505648319, + "loss": 0.2097, + "step": 24781 + }, + { + "epoch": 2.007615035644848, + "grad_norm": 0.05603337287902832, + "learning_rate": 0.00011068904991223728, + "loss": 0.2307, + "step": 24782 + }, + { + "epoch": 2.007696046662346, + "grad_norm": 0.0774034634232521, + "learning_rate": 0.00011068454925964265, + "loss": 0.2663, + "step": 24783 + }, + { + "epoch": 2.0077770576798444, + "grad_norm": 0.06689009070396423, + "learning_rate": 0.00011068004860704804, + "loss": 0.2797, + "step": 24784 + }, + { + "epoch": 2.0078580686973426, + "grad_norm": 0.05564473941922188, + "learning_rate": 0.00011067554795445341, + "loss": 0.2234, + "step": 24785 + }, + { + "epoch": 2.0079390797148413, + "grad_norm": 0.06797027587890625, + "learning_rate": 0.00011067104730185877, + "loss": 0.3027, + "step": 24786 + }, + { + "epoch": 2.0080200907323396, + "grad_norm": 0.061519429087638855, + "learning_rate": 0.00011066654664926414, + "loss": 0.2575, + "step": 24787 + }, + { + "epoch": 2.008101101749838, + "grad_norm": 0.06355039775371552, + "learning_rate": 0.00011066204599666952, + "loss": 0.2481, + "step": 24788 + }, + { + "epoch": 2.0081821127673365, + "grad_norm": 0.05140797793865204, + "learning_rate": 0.00011065754534407489, + "loss": 0.2388, + "step": 24789 + }, + { + "epoch": 2.0082631237848347, + "grad_norm": 0.0660591721534729, + "learning_rate": 0.00011065304469148028, + "loss": 0.2364, + "step": 24790 + }, + { + "epoch": 2.008344134802333, + "grad_norm": 0.062273040413856506, + "learning_rate": 0.00011064854403888565, + "loss": 0.2608, + "step": 24791 + }, + { + "epoch": 2.0084251458198317, + "grad_norm": 0.059555307030677795, + "learning_rate": 0.00011064404338629101, + "loss": 0.2682, + "step": 24792 + }, + { + "epoch": 2.00850615683733, + "grad_norm": 0.06148289144039154, + "learning_rate": 0.00011063954273369639, + "loss": 0.2812, + "step": 24793 + }, + { + "epoch": 2.008587167854828, + "grad_norm": 0.057390399277210236, + "learning_rate": 0.00011063504208110176, + "loss": 0.2341, + "step": 24794 + }, + { + "epoch": 2.008668178872327, + "grad_norm": 0.06243424862623215, + "learning_rate": 0.00011063054142850713, + "loss": 0.2291, + "step": 24795 + }, + { + "epoch": 2.008749189889825, + "grad_norm": 0.055271074175834656, + "learning_rate": 0.00011062604077591252, + "loss": 0.2808, + "step": 24796 + }, + { + "epoch": 2.0088302009073233, + "grad_norm": 0.05574141442775726, + "learning_rate": 0.0001106215401233179, + "loss": 0.2327, + "step": 24797 + }, + { + "epoch": 2.0089112119248216, + "grad_norm": 0.055914100259542465, + "learning_rate": 0.00011061703947072325, + "loss": 0.2444, + "step": 24798 + }, + { + "epoch": 2.0089922229423203, + "grad_norm": 0.0710621103644371, + "learning_rate": 0.00011061253881812863, + "loss": 0.2514, + "step": 24799 + }, + { + "epoch": 2.0090732339598185, + "grad_norm": 0.06601139158010483, + "learning_rate": 0.000110608038165534, + "loss": 0.2553, + "step": 24800 + }, + { + "epoch": 2.0091542449773168, + 
"grad_norm": 0.052085116505622864, + "learning_rate": 0.00011060353751293939, + "loss": 0.2459, + "step": 24801 + }, + { + "epoch": 2.0092352559948155, + "grad_norm": 0.060282688587903976, + "learning_rate": 0.00011059903686034476, + "loss": 0.2479, + "step": 24802 + }, + { + "epoch": 2.0093162670123137, + "grad_norm": 0.05750880390405655, + "learning_rate": 0.00011059453620775014, + "loss": 0.294, + "step": 24803 + }, + { + "epoch": 2.009397278029812, + "grad_norm": 0.0854443684220314, + "learning_rate": 0.0001105900355551555, + "loss": 0.2245, + "step": 24804 + }, + { + "epoch": 2.0094782890473106, + "grad_norm": 0.05768865346908569, + "learning_rate": 0.00011058553490256087, + "loss": 0.2389, + "step": 24805 + }, + { + "epoch": 2.009559300064809, + "grad_norm": 0.06694091856479645, + "learning_rate": 0.00011058103424996624, + "loss": 0.2886, + "step": 24806 + }, + { + "epoch": 2.009640311082307, + "grad_norm": 0.06841512769460678, + "learning_rate": 0.00011057653359737163, + "loss": 0.2861, + "step": 24807 + }, + { + "epoch": 2.0097213220998054, + "grad_norm": 0.051087573170661926, + "learning_rate": 0.000110572032944777, + "loss": 0.2722, + "step": 24808 + }, + { + "epoch": 2.009802333117304, + "grad_norm": 0.05654599517583847, + "learning_rate": 0.00011056753229218238, + "loss": 0.2624, + "step": 24809 + }, + { + "epoch": 2.0098833441348023, + "grad_norm": 0.06250681728124619, + "learning_rate": 0.00011056303163958774, + "loss": 0.2876, + "step": 24810 + }, + { + "epoch": 2.0099643551523005, + "grad_norm": 0.06365267187356949, + "learning_rate": 0.00011055853098699311, + "loss": 0.2541, + "step": 24811 + }, + { + "epoch": 2.0100453661697992, + "grad_norm": 0.06928987056016922, + "learning_rate": 0.00011055403033439848, + "loss": 0.2938, + "step": 24812 + }, + { + "epoch": 2.0101263771872975, + "grad_norm": 0.05956394597887993, + "learning_rate": 0.00011054952968180387, + "loss": 0.2443, + "step": 24813 + }, + { + "epoch": 2.0102073882047957, + "grad_norm": 0.06563542038202286, + "learning_rate": 0.00011054502902920925, + "loss": 0.2706, + "step": 24814 + }, + { + "epoch": 2.0102883992222944, + "grad_norm": 0.07303499430418015, + "learning_rate": 0.00011054052837661462, + "loss": 0.2691, + "step": 24815 + }, + { + "epoch": 2.0103694102397927, + "grad_norm": 0.05468272790312767, + "learning_rate": 0.00011053602772401998, + "loss": 0.2536, + "step": 24816 + }, + { + "epoch": 2.010450421257291, + "grad_norm": 0.059382714331150055, + "learning_rate": 0.00011053152707142535, + "loss": 0.2385, + "step": 24817 + }, + { + "epoch": 2.0105314322747896, + "grad_norm": 0.06693161278963089, + "learning_rate": 0.00011052702641883073, + "loss": 0.2505, + "step": 24818 + }, + { + "epoch": 2.010612443292288, + "grad_norm": 0.05936438590288162, + "learning_rate": 0.00011052252576623611, + "loss": 0.2067, + "step": 24819 + }, + { + "epoch": 2.010693454309786, + "grad_norm": 0.07057078927755356, + "learning_rate": 0.00011051802511364149, + "loss": 0.2758, + "step": 24820 + }, + { + "epoch": 2.0107744653272843, + "grad_norm": 0.06983327120542526, + "learning_rate": 0.00011051352446104686, + "loss": 0.2795, + "step": 24821 + }, + { + "epoch": 2.010855476344783, + "grad_norm": 0.052694715559482574, + "learning_rate": 0.00011050902380845223, + "loss": 0.2565, + "step": 24822 + }, + { + "epoch": 2.0109364873622813, + "grad_norm": 0.059741996228694916, + "learning_rate": 0.0001105045231558576, + "loss": 0.2474, + "step": 24823 + }, + { + "epoch": 2.0110174983797795, + "grad_norm": 0.06866106390953064, + 
"learning_rate": 0.00011050002250326297, + "loss": 0.2669, + "step": 24824 + }, + { + "epoch": 2.011098509397278, + "grad_norm": 0.06021895259618759, + "learning_rate": 0.00011049552185066836, + "loss": 0.2585, + "step": 24825 + }, + { + "epoch": 2.0111795204147764, + "grad_norm": 0.05985042080283165, + "learning_rate": 0.00011049102119807373, + "loss": 0.2671, + "step": 24826 + }, + { + "epoch": 2.0112605314322747, + "grad_norm": 0.05347612127661705, + "learning_rate": 0.0001104865205454791, + "loss": 0.2408, + "step": 24827 + }, + { + "epoch": 2.0113415424497734, + "grad_norm": 0.07350436598062515, + "learning_rate": 0.00011048201989288448, + "loss": 0.2614, + "step": 24828 + }, + { + "epoch": 2.0114225534672716, + "grad_norm": 0.05584220588207245, + "learning_rate": 0.00011047751924028984, + "loss": 0.2454, + "step": 24829 + }, + { + "epoch": 2.01150356448477, + "grad_norm": 0.05780201032757759, + "learning_rate": 0.00011047301858769524, + "loss": 0.2406, + "step": 24830 + }, + { + "epoch": 2.011584575502268, + "grad_norm": 0.06727975606918335, + "learning_rate": 0.0001104685179351006, + "loss": 0.2837, + "step": 24831 + }, + { + "epoch": 2.011665586519767, + "grad_norm": 0.050393763929605484, + "learning_rate": 0.00011046401728250597, + "loss": 0.2185, + "step": 24832 + }, + { + "epoch": 2.011746597537265, + "grad_norm": 0.05321752279996872, + "learning_rate": 0.00011045951662991134, + "loss": 0.2333, + "step": 24833 + }, + { + "epoch": 2.0118276085547633, + "grad_norm": 0.058083198964595795, + "learning_rate": 0.00011045501597731672, + "loss": 0.2934, + "step": 24834 + }, + { + "epoch": 2.011908619572262, + "grad_norm": 0.05851643532514572, + "learning_rate": 0.00011045051532472208, + "loss": 0.2604, + "step": 24835 + }, + { + "epoch": 2.01198963058976, + "grad_norm": 0.07460993528366089, + "learning_rate": 0.00011044601467212748, + "loss": 0.2684, + "step": 24836 + }, + { + "epoch": 2.0120706416072585, + "grad_norm": 0.05338436737656593, + "learning_rate": 0.00011044151401953284, + "loss": 0.2467, + "step": 24837 + }, + { + "epoch": 2.012151652624757, + "grad_norm": 0.07232926785945892, + "learning_rate": 0.00011043701336693821, + "loss": 0.2798, + "step": 24838 + }, + { + "epoch": 2.0122326636422554, + "grad_norm": 0.06704601645469666, + "learning_rate": 0.00011043251271434359, + "loss": 0.2967, + "step": 24839 + }, + { + "epoch": 2.0123136746597536, + "grad_norm": 0.04931149259209633, + "learning_rate": 0.00011042801206174896, + "loss": 0.2283, + "step": 24840 + }, + { + "epoch": 2.0123946856772523, + "grad_norm": 0.07212699204683304, + "learning_rate": 0.00011042351140915432, + "loss": 0.2677, + "step": 24841 + }, + { + "epoch": 2.0124756966947506, + "grad_norm": 0.06866638362407684, + "learning_rate": 0.00011041901075655972, + "loss": 0.2856, + "step": 24842 + }, + { + "epoch": 2.012556707712249, + "grad_norm": 0.060009896755218506, + "learning_rate": 0.00011041451010396508, + "loss": 0.2556, + "step": 24843 + }, + { + "epoch": 2.012637718729747, + "grad_norm": 0.05222009867429733, + "learning_rate": 0.00011041000945137045, + "loss": 0.2415, + "step": 24844 + }, + { + "epoch": 2.0127187297472457, + "grad_norm": 0.0565890334546566, + "learning_rate": 0.00011040550879877583, + "loss": 0.225, + "step": 24845 + }, + { + "epoch": 2.012799740764744, + "grad_norm": 0.05756883695721626, + "learning_rate": 0.0001104010081461812, + "loss": 0.2198, + "step": 24846 + }, + { + "epoch": 2.0128807517822422, + "grad_norm": 0.08347923308610916, + "learning_rate": 0.00011039650749358656, + "loss": 
0.2808, + "step": 24847 + }, + { + "epoch": 2.012961762799741, + "grad_norm": 0.06518597900867462, + "learning_rate": 0.00011039200684099196, + "loss": 0.2887, + "step": 24848 + }, + { + "epoch": 2.013042773817239, + "grad_norm": 0.05346430093050003, + "learning_rate": 0.00011038750618839732, + "loss": 0.2629, + "step": 24849 + }, + { + "epoch": 2.0131237848347374, + "grad_norm": 0.060417596250772476, + "learning_rate": 0.0001103830055358027, + "loss": 0.2502, + "step": 24850 + }, + { + "epoch": 2.013204795852236, + "grad_norm": 0.05704808607697487, + "learning_rate": 0.00011037850488320807, + "loss": 0.2798, + "step": 24851 + }, + { + "epoch": 2.0132858068697344, + "grad_norm": 0.07693468779325485, + "learning_rate": 0.00011037400423061344, + "loss": 0.2504, + "step": 24852 + }, + { + "epoch": 2.0133668178872326, + "grad_norm": 0.06132509186863899, + "learning_rate": 0.0001103695035780188, + "loss": 0.2554, + "step": 24853 + }, + { + "epoch": 2.013447828904731, + "grad_norm": 0.06517904996871948, + "learning_rate": 0.0001103650029254242, + "loss": 0.2751, + "step": 24854 + }, + { + "epoch": 2.0135288399222295, + "grad_norm": 0.0628628209233284, + "learning_rate": 0.00011036050227282956, + "loss": 0.2956, + "step": 24855 + }, + { + "epoch": 2.0136098509397278, + "grad_norm": 0.06579403579235077, + "learning_rate": 0.00011035600162023494, + "loss": 0.2664, + "step": 24856 + }, + { + "epoch": 2.013690861957226, + "grad_norm": 0.052854299545288086, + "learning_rate": 0.00011035150096764031, + "loss": 0.2618, + "step": 24857 + }, + { + "epoch": 2.0137718729747247, + "grad_norm": 0.055037494748830795, + "learning_rate": 0.00011034700031504568, + "loss": 0.2117, + "step": 24858 + }, + { + "epoch": 2.013852883992223, + "grad_norm": 0.07010837644338608, + "learning_rate": 0.00011034249966245107, + "loss": 0.2582, + "step": 24859 + }, + { + "epoch": 2.013933895009721, + "grad_norm": 0.06331303715705872, + "learning_rate": 0.00011033799900985644, + "loss": 0.2429, + "step": 24860 + }, + { + "epoch": 2.01401490602722, + "grad_norm": 0.06630232185125351, + "learning_rate": 0.0001103334983572618, + "loss": 0.2881, + "step": 24861 + }, + { + "epoch": 2.014095917044718, + "grad_norm": 0.07510954141616821, + "learning_rate": 0.00011032899770466718, + "loss": 0.2941, + "step": 24862 + }, + { + "epoch": 2.0141769280622164, + "grad_norm": 0.06895710527896881, + "learning_rate": 0.00011032449705207255, + "loss": 0.2736, + "step": 24863 + }, + { + "epoch": 2.0142579390797146, + "grad_norm": 0.054260727018117905, + "learning_rate": 0.00011031999639947793, + "loss": 0.233, + "step": 24864 + }, + { + "epoch": 2.0143389500972133, + "grad_norm": 0.07401958107948303, + "learning_rate": 0.00011031549574688331, + "loss": 0.289, + "step": 24865 + }, + { + "epoch": 2.0144199611147116, + "grad_norm": 0.06323473900556564, + "learning_rate": 0.00011031099509428869, + "loss": 0.2472, + "step": 24866 + }, + { + "epoch": 2.01450097213221, + "grad_norm": 0.07643520087003708, + "learning_rate": 0.00011030649444169405, + "loss": 0.2777, + "step": 24867 + }, + { + "epoch": 2.0145819831497085, + "grad_norm": 0.06905188411474228, + "learning_rate": 0.00011030199378909942, + "loss": 0.2359, + "step": 24868 + }, + { + "epoch": 2.0146629941672067, + "grad_norm": 0.06276547908782959, + "learning_rate": 0.0001102974931365048, + "loss": 0.2813, + "step": 24869 + }, + { + "epoch": 2.014744005184705, + "grad_norm": 0.06353870034217834, + "learning_rate": 0.00011029299248391017, + "loss": 0.2542, + "step": 24870 + }, + { + "epoch": 
2.0148250162022037, + "grad_norm": 0.0546317994594574, + "learning_rate": 0.00011028849183131555, + "loss": 0.2887, + "step": 24871 + }, + { + "epoch": 2.014906027219702, + "grad_norm": 0.06614603102207184, + "learning_rate": 0.00011028399117872093, + "loss": 0.2699, + "step": 24872 + }, + { + "epoch": 2.0149870382372, + "grad_norm": 0.048714540898799896, + "learning_rate": 0.00011027949052612629, + "loss": 0.2503, + "step": 24873 + }, + { + "epoch": 2.015068049254699, + "grad_norm": 0.06008073687553406, + "learning_rate": 0.00011027498987353166, + "loss": 0.2622, + "step": 24874 + }, + { + "epoch": 2.015149060272197, + "grad_norm": 0.07386930286884308, + "learning_rate": 0.00011027048922093704, + "loss": 0.2689, + "step": 24875 + }, + { + "epoch": 2.0152300712896953, + "grad_norm": 0.060236118733882904, + "learning_rate": 0.00011026598856834241, + "loss": 0.2504, + "step": 24876 + }, + { + "epoch": 2.0153110823071936, + "grad_norm": 0.0673976019024849, + "learning_rate": 0.0001102614879157478, + "loss": 0.2707, + "step": 24877 + }, + { + "epoch": 2.0153920933246923, + "grad_norm": 0.060310475528240204, + "learning_rate": 0.00011025698726315317, + "loss": 0.2727, + "step": 24878 + }, + { + "epoch": 2.0154731043421905, + "grad_norm": 0.06076750531792641, + "learning_rate": 0.00011025248661055853, + "loss": 0.2529, + "step": 24879 + }, + { + "epoch": 2.0155541153596888, + "grad_norm": 0.06271596997976303, + "learning_rate": 0.0001102479859579639, + "loss": 0.2426, + "step": 24880 + }, + { + "epoch": 2.0156351263771874, + "grad_norm": 0.062388744205236435, + "learning_rate": 0.00011024348530536928, + "loss": 0.2621, + "step": 24881 + }, + { + "epoch": 2.0157161373946857, + "grad_norm": 0.051883265376091, + "learning_rate": 0.00011023898465277466, + "loss": 0.2425, + "step": 24882 + }, + { + "epoch": 2.015797148412184, + "grad_norm": 0.0597045011818409, + "learning_rate": 0.00011023448400018004, + "loss": 0.2831, + "step": 24883 + }, + { + "epoch": 2.0158781594296826, + "grad_norm": 0.06017323583364487, + "learning_rate": 0.00011022998334758541, + "loss": 0.2742, + "step": 24884 + }, + { + "epoch": 2.015959170447181, + "grad_norm": 0.05535883083939552, + "learning_rate": 0.00011022548269499077, + "loss": 0.3075, + "step": 24885 + }, + { + "epoch": 2.016040181464679, + "grad_norm": 0.06221333518624306, + "learning_rate": 0.00011022098204239615, + "loss": 0.2893, + "step": 24886 + }, + { + "epoch": 2.0161211924821774, + "grad_norm": 0.06521249562501907, + "learning_rate": 0.00011021648138980152, + "loss": 0.2463, + "step": 24887 + }, + { + "epoch": 2.016202203499676, + "grad_norm": 0.07503092288970947, + "learning_rate": 0.0001102119807372069, + "loss": 0.2656, + "step": 24888 + }, + { + "epoch": 2.0162832145171743, + "grad_norm": 0.07238157838582993, + "learning_rate": 0.00011020748008461228, + "loss": 0.2772, + "step": 24889 + }, + { + "epoch": 2.0163642255346725, + "grad_norm": 0.057459212839603424, + "learning_rate": 0.00011020297943201765, + "loss": 0.2264, + "step": 24890 + }, + { + "epoch": 2.0164452365521712, + "grad_norm": 0.07599131017923355, + "learning_rate": 0.00011019847877942303, + "loss": 0.2923, + "step": 24891 + }, + { + "epoch": 2.0165262475696695, + "grad_norm": 0.053455375134944916, + "learning_rate": 0.00011019397812682839, + "loss": 0.2342, + "step": 24892 + }, + { + "epoch": 2.0166072585871677, + "grad_norm": 0.08646205067634583, + "learning_rate": 0.00011018947747423376, + "loss": 0.3353, + "step": 24893 + }, + { + "epoch": 2.0166882696046664, + "grad_norm": 
0.04716218635439873, + "learning_rate": 0.00011018497682163915, + "loss": 0.2437, + "step": 24894 + }, + { + "epoch": 2.0167692806221647, + "grad_norm": 0.06640686839818954, + "learning_rate": 0.00011018047616904452, + "loss": 0.2363, + "step": 24895 + }, + { + "epoch": 2.016850291639663, + "grad_norm": 0.055529430508613586, + "learning_rate": 0.0001101759755164499, + "loss": 0.2743, + "step": 24896 + }, + { + "epoch": 2.0169313026571616, + "grad_norm": 0.06658606231212616, + "learning_rate": 0.00011017147486385527, + "loss": 0.2619, + "step": 24897 + }, + { + "epoch": 2.01701231367466, + "grad_norm": 0.0485866405069828, + "learning_rate": 0.00011016697421126063, + "loss": 0.2371, + "step": 24898 + }, + { + "epoch": 2.017093324692158, + "grad_norm": 0.062041107565164566, + "learning_rate": 0.000110162473558666, + "loss": 0.2701, + "step": 24899 + }, + { + "epoch": 2.0171743357096563, + "grad_norm": 0.06547149270772934, + "learning_rate": 0.00011015797290607139, + "loss": 0.2669, + "step": 24900 + }, + { + "epoch": 2.017255346727155, + "grad_norm": 0.05686812102794647, + "learning_rate": 0.00011015347225347676, + "loss": 0.2694, + "step": 24901 + }, + { + "epoch": 2.0173363577446533, + "grad_norm": 0.06172237545251846, + "learning_rate": 0.00011014897160088214, + "loss": 0.241, + "step": 24902 + }, + { + "epoch": 2.0174173687621515, + "grad_norm": 0.05941611900925636, + "learning_rate": 0.00011014447094828751, + "loss": 0.2788, + "step": 24903 + }, + { + "epoch": 2.01749837977965, + "grad_norm": 0.08111029118299484, + "learning_rate": 0.00011013997029569287, + "loss": 0.3124, + "step": 24904 + }, + { + "epoch": 2.0175793907971484, + "grad_norm": 0.04909438639879227, + "learning_rate": 0.00011013546964309824, + "loss": 0.2268, + "step": 24905 + }, + { + "epoch": 2.0176604018146467, + "grad_norm": 0.06682118028402328, + "learning_rate": 0.00011013096899050363, + "loss": 0.2574, + "step": 24906 + }, + { + "epoch": 2.0177414128321454, + "grad_norm": 0.06406114250421524, + "learning_rate": 0.000110126468337909, + "loss": 0.2469, + "step": 24907 + }, + { + "epoch": 2.0178224238496436, + "grad_norm": 0.059632740914821625, + "learning_rate": 0.00011012196768531438, + "loss": 0.2702, + "step": 24908 + }, + { + "epoch": 2.017903434867142, + "grad_norm": 0.06329367309808731, + "learning_rate": 0.00011011746703271975, + "loss": 0.2823, + "step": 24909 + }, + { + "epoch": 2.01798444588464, + "grad_norm": 0.06427815556526184, + "learning_rate": 0.00011011296638012511, + "loss": 0.2771, + "step": 24910 + }, + { + "epoch": 2.018065456902139, + "grad_norm": 0.06586452573537827, + "learning_rate": 0.00011010846572753051, + "loss": 0.25, + "step": 24911 + }, + { + "epoch": 2.018146467919637, + "grad_norm": 0.05930725485086441, + "learning_rate": 0.00011010396507493587, + "loss": 0.2281, + "step": 24912 + }, + { + "epoch": 2.0182274789371353, + "grad_norm": 0.062091536819934845, + "learning_rate": 0.00011009946442234125, + "loss": 0.2609, + "step": 24913 + }, + { + "epoch": 2.018308489954634, + "grad_norm": 0.048801861703395844, + "learning_rate": 0.00011009496376974662, + "loss": 0.2532, + "step": 24914 + }, + { + "epoch": 2.018389500972132, + "grad_norm": 0.0713617280125618, + "learning_rate": 0.00011009046311715199, + "loss": 0.281, + "step": 24915 + }, + { + "epoch": 2.0184705119896305, + "grad_norm": 0.05809679627418518, + "learning_rate": 0.00011008596246455735, + "loss": 0.2809, + "step": 24916 + }, + { + "epoch": 2.018551523007129, + "grad_norm": 0.0762714073061943, + "learning_rate": 
0.00011008146181196275, + "loss": 0.2538, + "step": 24917 + }, + { + "epoch": 2.0186325340246274, + "grad_norm": 0.06147345155477524, + "learning_rate": 0.00011007696115936811, + "loss": 0.2437, + "step": 24918 + }, + { + "epoch": 2.0187135450421256, + "grad_norm": 0.0637492686510086, + "learning_rate": 0.00011007246050677349, + "loss": 0.2619, + "step": 24919 + }, + { + "epoch": 2.0187945560596243, + "grad_norm": 0.06866779178380966, + "learning_rate": 0.00011006795985417886, + "loss": 0.2692, + "step": 24920 + }, + { + "epoch": 2.0188755670771226, + "grad_norm": 0.05443578585982323, + "learning_rate": 0.00011006345920158423, + "loss": 0.2322, + "step": 24921 + }, + { + "epoch": 2.018956578094621, + "grad_norm": 0.0455310195684433, + "learning_rate": 0.0001100589585489896, + "loss": 0.2692, + "step": 24922 + }, + { + "epoch": 2.019037589112119, + "grad_norm": 0.0551089383661747, + "learning_rate": 0.000110054457896395, + "loss": 0.2893, + "step": 24923 + }, + { + "epoch": 2.0191186001296177, + "grad_norm": 0.07012448459863663, + "learning_rate": 0.00011004995724380036, + "loss": 0.2355, + "step": 24924 + }, + { + "epoch": 2.019199611147116, + "grad_norm": 0.062949538230896, + "learning_rate": 0.00011004545659120573, + "loss": 0.2785, + "step": 24925 + }, + { + "epoch": 2.0192806221646142, + "grad_norm": 0.06228519603610039, + "learning_rate": 0.0001100409559386111, + "loss": 0.2644, + "step": 24926 + }, + { + "epoch": 2.019361633182113, + "grad_norm": 0.05176496133208275, + "learning_rate": 0.00011003645528601648, + "loss": 0.2491, + "step": 24927 + }, + { + "epoch": 2.019442644199611, + "grad_norm": 0.05500460043549538, + "learning_rate": 0.00011003195463342184, + "loss": 0.2675, + "step": 24928 + }, + { + "epoch": 2.0195236552171094, + "grad_norm": 0.07030055671930313, + "learning_rate": 0.00011002745398082724, + "loss": 0.2412, + "step": 24929 + }, + { + "epoch": 2.019604666234608, + "grad_norm": 0.056598152965307236, + "learning_rate": 0.0001100229533282326, + "loss": 0.2636, + "step": 24930 + }, + { + "epoch": 2.0196856772521063, + "grad_norm": 0.05294608697295189, + "learning_rate": 0.00011001845267563797, + "loss": 0.2312, + "step": 24931 + }, + { + "epoch": 2.0197666882696046, + "grad_norm": 0.0632098987698555, + "learning_rate": 0.00011001395202304334, + "loss": 0.2519, + "step": 24932 + }, + { + "epoch": 2.019847699287103, + "grad_norm": 0.06364619731903076, + "learning_rate": 0.00011000945137044872, + "loss": 0.2579, + "step": 24933 + }, + { + "epoch": 2.0199287103046015, + "grad_norm": 0.06215987354516983, + "learning_rate": 0.0001100049507178541, + "loss": 0.2517, + "step": 24934 + }, + { + "epoch": 2.0200097213220998, + "grad_norm": 0.0633561834692955, + "learning_rate": 0.00011000045006525948, + "loss": 0.2062, + "step": 24935 + }, + { + "epoch": 2.020090732339598, + "grad_norm": 0.050913676619529724, + "learning_rate": 0.00010999594941266484, + "loss": 0.2084, + "step": 24936 + }, + { + "epoch": 2.0201717433570967, + "grad_norm": 0.06375525146722794, + "learning_rate": 0.00010999144876007021, + "loss": 0.3028, + "step": 24937 + }, + { + "epoch": 2.020252754374595, + "grad_norm": 0.062853142619133, + "learning_rate": 0.00010998694810747559, + "loss": 0.2372, + "step": 24938 + }, + { + "epoch": 2.020333765392093, + "grad_norm": 0.0629112720489502, + "learning_rate": 0.00010998244745488096, + "loss": 0.2601, + "step": 24939 + }, + { + "epoch": 2.020414776409592, + "grad_norm": 0.06098567321896553, + "learning_rate": 0.00010997794680228635, + "loss": 0.2289, + "step": 24940 + 
}, + { + "epoch": 2.02049578742709, + "grad_norm": 0.058595672249794006, + "learning_rate": 0.00010997344614969172, + "loss": 0.2181, + "step": 24941 + }, + { + "epoch": 2.0205767984445884, + "grad_norm": 0.06882533431053162, + "learning_rate": 0.00010996894549709708, + "loss": 0.2263, + "step": 24942 + }, + { + "epoch": 2.020657809462087, + "grad_norm": 0.06504590064287186, + "learning_rate": 0.00010996444484450245, + "loss": 0.2492, + "step": 24943 + }, + { + "epoch": 2.0207388204795853, + "grad_norm": 0.07631085067987442, + "learning_rate": 0.00010995994419190783, + "loss": 0.2563, + "step": 24944 + }, + { + "epoch": 2.0208198314970836, + "grad_norm": 0.06929519772529602, + "learning_rate": 0.0001099554435393132, + "loss": 0.2693, + "step": 24945 + }, + { + "epoch": 2.020900842514582, + "grad_norm": 0.07654745876789093, + "learning_rate": 0.00010995094288671859, + "loss": 0.2764, + "step": 24946 + }, + { + "epoch": 2.0209818535320805, + "grad_norm": 0.06594858318567276, + "learning_rate": 0.00010994644223412396, + "loss": 0.297, + "step": 24947 + }, + { + "epoch": 2.0210628645495787, + "grad_norm": 0.06843897700309753, + "learning_rate": 0.00010994194158152932, + "loss": 0.2662, + "step": 24948 + }, + { + "epoch": 2.021143875567077, + "grad_norm": 0.060284316539764404, + "learning_rate": 0.0001099374409289347, + "loss": 0.2381, + "step": 24949 + }, + { + "epoch": 2.0212248865845757, + "grad_norm": 0.06309103965759277, + "learning_rate": 0.00010993294027634007, + "loss": 0.2796, + "step": 24950 + }, + { + "epoch": 2.021305897602074, + "grad_norm": 0.051141221076250076, + "learning_rate": 0.00010992843962374544, + "loss": 0.2192, + "step": 24951 + }, + { + "epoch": 2.021386908619572, + "grad_norm": 0.05918668955564499, + "learning_rate": 0.00010992393897115083, + "loss": 0.276, + "step": 24952 + }, + { + "epoch": 2.021467919637071, + "grad_norm": 0.0494406521320343, + "learning_rate": 0.0001099194383185562, + "loss": 0.2209, + "step": 24953 + }, + { + "epoch": 2.021548930654569, + "grad_norm": 0.06384733319282532, + "learning_rate": 0.00010991493766596156, + "loss": 0.2434, + "step": 24954 + }, + { + "epoch": 2.0216299416720673, + "grad_norm": 0.06195070967078209, + "learning_rate": 0.00010991043701336694, + "loss": 0.2492, + "step": 24955 + }, + { + "epoch": 2.0217109526895656, + "grad_norm": 0.05020172521471977, + "learning_rate": 0.00010990593636077231, + "loss": 0.2447, + "step": 24956 + }, + { + "epoch": 2.0217919637070643, + "grad_norm": 0.06323754787445068, + "learning_rate": 0.00010990143570817768, + "loss": 0.274, + "step": 24957 + }, + { + "epoch": 2.0218729747245625, + "grad_norm": 0.05106744542717934, + "learning_rate": 0.00010989693505558307, + "loss": 0.2456, + "step": 24958 + }, + { + "epoch": 2.0219539857420608, + "grad_norm": 0.06775747984647751, + "learning_rate": 0.00010989243440298845, + "loss": 0.2219, + "step": 24959 + }, + { + "epoch": 2.0220349967595594, + "grad_norm": 0.05914744734764099, + "learning_rate": 0.00010988793375039382, + "loss": 0.2463, + "step": 24960 + }, + { + "epoch": 2.0221160077770577, + "grad_norm": 0.06987558305263519, + "learning_rate": 0.00010988343309779918, + "loss": 0.2643, + "step": 24961 + }, + { + "epoch": 2.022197018794556, + "grad_norm": 0.06686465442180634, + "learning_rate": 0.00010987893244520455, + "loss": 0.2868, + "step": 24962 + }, + { + "epoch": 2.0222780298120546, + "grad_norm": 0.05918348953127861, + "learning_rate": 0.00010987443179260994, + "loss": 0.2565, + "step": 24963 + }, + { + "epoch": 2.022359040829553, + 
"grad_norm": 0.05445210263133049, + "learning_rate": 0.00010986993114001531, + "loss": 0.2242, + "step": 24964 + }, + { + "epoch": 2.022440051847051, + "grad_norm": 0.062487054616212845, + "learning_rate": 0.00010986543048742069, + "loss": 0.2482, + "step": 24965 + }, + { + "epoch": 2.0225210628645494, + "grad_norm": 0.053661637008190155, + "learning_rate": 0.00010986092983482606, + "loss": 0.2275, + "step": 24966 + }, + { + "epoch": 2.022602073882048, + "grad_norm": 0.05276666209101677, + "learning_rate": 0.00010985642918223142, + "loss": 0.206, + "step": 24967 + }, + { + "epoch": 2.0226830848995463, + "grad_norm": 0.053006209433078766, + "learning_rate": 0.0001098519285296368, + "loss": 0.2452, + "step": 24968 + }, + { + "epoch": 2.0227640959170445, + "grad_norm": 0.06032721698284149, + "learning_rate": 0.00010984742787704218, + "loss": 0.2698, + "step": 24969 + }, + { + "epoch": 2.0228451069345432, + "grad_norm": 0.06984737515449524, + "learning_rate": 0.00010984292722444755, + "loss": 0.3158, + "step": 24970 + }, + { + "epoch": 2.0229261179520415, + "grad_norm": 0.07318402081727982, + "learning_rate": 0.00010983842657185293, + "loss": 0.2687, + "step": 24971 + }, + { + "epoch": 2.0230071289695397, + "grad_norm": 0.05291476845741272, + "learning_rate": 0.0001098339259192583, + "loss": 0.2432, + "step": 24972 + }, + { + "epoch": 2.0230881399870384, + "grad_norm": 0.051952771842479706, + "learning_rate": 0.00010982942526666366, + "loss": 0.223, + "step": 24973 + }, + { + "epoch": 2.0231691510045366, + "grad_norm": 0.05664176493883133, + "learning_rate": 0.00010982492461406904, + "loss": 0.2493, + "step": 24974 + }, + { + "epoch": 2.023250162022035, + "grad_norm": 0.05760600045323372, + "learning_rate": 0.00010982042396147442, + "loss": 0.256, + "step": 24975 + }, + { + "epoch": 2.0233311730395336, + "grad_norm": 0.0646364763379097, + "learning_rate": 0.0001098159233088798, + "loss": 0.2838, + "step": 24976 + }, + { + "epoch": 2.023412184057032, + "grad_norm": 0.05765828117728233, + "learning_rate": 0.00010981142265628517, + "loss": 0.2238, + "step": 24977 + }, + { + "epoch": 2.02349319507453, + "grad_norm": 0.05723297595977783, + "learning_rate": 0.00010980692200369054, + "loss": 0.2353, + "step": 24978 + }, + { + "epoch": 2.0235742060920283, + "grad_norm": 0.052987512201070786, + "learning_rate": 0.0001098024213510959, + "loss": 0.249, + "step": 24979 + }, + { + "epoch": 2.023655217109527, + "grad_norm": 0.0625142753124237, + "learning_rate": 0.00010979792069850128, + "loss": 0.2637, + "step": 24980 + }, + { + "epoch": 2.0237362281270252, + "grad_norm": 0.056187573820352554, + "learning_rate": 0.00010979342004590666, + "loss": 0.2558, + "step": 24981 + }, + { + "epoch": 2.0238172391445235, + "grad_norm": 0.054129134863615036, + "learning_rate": 0.00010978891939331204, + "loss": 0.2429, + "step": 24982 + }, + { + "epoch": 2.023898250162022, + "grad_norm": 0.062188971787691116, + "learning_rate": 0.00010978441874071741, + "loss": 0.2804, + "step": 24983 + }, + { + "epoch": 2.0239792611795204, + "grad_norm": 0.0643451139330864, + "learning_rate": 0.00010977991808812279, + "loss": 0.2458, + "step": 24984 + }, + { + "epoch": 2.0240602721970187, + "grad_norm": 0.06608925759792328, + "learning_rate": 0.00010977541743552815, + "loss": 0.2601, + "step": 24985 + }, + { + "epoch": 2.0241412832145174, + "grad_norm": 0.0593901164829731, + "learning_rate": 0.00010977091678293355, + "loss": 0.2873, + "step": 24986 + }, + { + "epoch": 2.0242222942320156, + "grad_norm": 0.059159524738788605, + 
"learning_rate": 0.0001097664161303389, + "loss": 0.2723, + "step": 24987 + }, + { + "epoch": 2.024303305249514, + "grad_norm": 0.056645944714546204, + "learning_rate": 0.00010976191547774428, + "loss": 0.2177, + "step": 24988 + }, + { + "epoch": 2.024384316267012, + "grad_norm": 0.0677858516573906, + "learning_rate": 0.00010975741482514965, + "loss": 0.2809, + "step": 24989 + }, + { + "epoch": 2.024465327284511, + "grad_norm": 0.06798025220632553, + "learning_rate": 0.00010975291417255503, + "loss": 0.2814, + "step": 24990 + }, + { + "epoch": 2.024546338302009, + "grad_norm": 0.04821959137916565, + "learning_rate": 0.00010974841351996039, + "loss": 0.2121, + "step": 24991 + }, + { + "epoch": 2.0246273493195073, + "grad_norm": 0.07496128231287003, + "learning_rate": 0.00010974391286736579, + "loss": 0.2722, + "step": 24992 + }, + { + "epoch": 2.024708360337006, + "grad_norm": 0.05031698942184448, + "learning_rate": 0.00010973941221477115, + "loss": 0.2518, + "step": 24993 + }, + { + "epoch": 2.024789371354504, + "grad_norm": 0.05619891732931137, + "learning_rate": 0.00010973491156217652, + "loss": 0.2405, + "step": 24994 + }, + { + "epoch": 2.0248703823720025, + "grad_norm": 0.06708116084337234, + "learning_rate": 0.0001097304109095819, + "loss": 0.2418, + "step": 24995 + }, + { + "epoch": 2.024951393389501, + "grad_norm": 0.0597895085811615, + "learning_rate": 0.00010972591025698727, + "loss": 0.2578, + "step": 24996 + }, + { + "epoch": 2.0250324044069994, + "grad_norm": 0.06456056237220764, + "learning_rate": 0.00010972140960439263, + "loss": 0.2667, + "step": 24997 + }, + { + "epoch": 2.0251134154244976, + "grad_norm": 0.06623676419258118, + "learning_rate": 0.00010971690895179803, + "loss": 0.2774, + "step": 24998 + }, + { + "epoch": 2.0251944264419963, + "grad_norm": 0.06833084672689438, + "learning_rate": 0.00010971240829920339, + "loss": 0.2838, + "step": 24999 + }, + { + "epoch": 2.0252754374594946, + "grad_norm": 0.05819803103804588, + "learning_rate": 0.00010970790764660876, + "loss": 0.2698, + "step": 25000 + }, + { + "epoch": 2.025356448476993, + "grad_norm": 0.07516606152057648, + "learning_rate": 0.00010970340699401414, + "loss": 0.2881, + "step": 25001 + }, + { + "epoch": 2.025437459494491, + "grad_norm": 0.0686899870634079, + "learning_rate": 0.00010969890634141951, + "loss": 0.2546, + "step": 25002 + }, + { + "epoch": 2.0255184705119897, + "grad_norm": 0.05174567550420761, + "learning_rate": 0.00010969440568882487, + "loss": 0.262, + "step": 25003 + }, + { + "epoch": 2.025599481529488, + "grad_norm": 0.06987392902374268, + "learning_rate": 0.00010968990503623027, + "loss": 0.282, + "step": 25004 + }, + { + "epoch": 2.0256804925469862, + "grad_norm": 0.058612413704395294, + "learning_rate": 0.00010968540438363563, + "loss": 0.2509, + "step": 25005 + }, + { + "epoch": 2.025761503564485, + "grad_norm": 0.06494861841201782, + "learning_rate": 0.000109680903731041, + "loss": 0.2522, + "step": 25006 + }, + { + "epoch": 2.025842514581983, + "grad_norm": 0.058371901512145996, + "learning_rate": 0.00010967640307844638, + "loss": 0.2689, + "step": 25007 + }, + { + "epoch": 2.0259235255994814, + "grad_norm": 0.0662706196308136, + "learning_rate": 0.00010967190242585175, + "loss": 0.297, + "step": 25008 + }, + { + "epoch": 2.02600453661698, + "grad_norm": 0.06714751571416855, + "learning_rate": 0.00010966740177325711, + "loss": 0.2632, + "step": 25009 + }, + { + "epoch": 2.0260855476344783, + "grad_norm": 0.06655032187700272, + "learning_rate": 0.00010966290112066251, + "loss": 
0.2735, + "step": 25010 + }, + { + "epoch": 2.0261665586519766, + "grad_norm": 0.06318897753953934, + "learning_rate": 0.00010965840046806787, + "loss": 0.2675, + "step": 25011 + }, + { + "epoch": 2.026247569669475, + "grad_norm": 0.056749485433101654, + "learning_rate": 0.00010965389981547325, + "loss": 0.2227, + "step": 25012 + }, + { + "epoch": 2.0263285806869735, + "grad_norm": 0.07162638008594513, + "learning_rate": 0.00010964939916287862, + "loss": 0.2556, + "step": 25013 + }, + { + "epoch": 2.0264095917044718, + "grad_norm": 0.06857409328222275, + "learning_rate": 0.000109644898510284, + "loss": 0.273, + "step": 25014 + }, + { + "epoch": 2.02649060272197, + "grad_norm": 0.056812405586242676, + "learning_rate": 0.00010964039785768938, + "loss": 0.2441, + "step": 25015 + }, + { + "epoch": 2.0265716137394687, + "grad_norm": 0.0918152704834938, + "learning_rate": 0.00010963589720509475, + "loss": 0.271, + "step": 25016 + }, + { + "epoch": 2.026652624756967, + "grad_norm": 0.06377450376749039, + "learning_rate": 0.00010963139655250011, + "loss": 0.2599, + "step": 25017 + }, + { + "epoch": 2.026733635774465, + "grad_norm": 0.05704493075609207, + "learning_rate": 0.00010962689589990549, + "loss": 0.2729, + "step": 25018 + }, + { + "epoch": 2.026814646791964, + "grad_norm": 0.05449501425027847, + "learning_rate": 0.00010962239524731086, + "loss": 0.2323, + "step": 25019 + }, + { + "epoch": 2.026895657809462, + "grad_norm": 0.06370042264461517, + "learning_rate": 0.00010961789459471624, + "loss": 0.2465, + "step": 25020 + }, + { + "epoch": 2.0269766688269604, + "grad_norm": 0.06381337344646454, + "learning_rate": 0.00010961339394212162, + "loss": 0.2108, + "step": 25021 + }, + { + "epoch": 2.027057679844459, + "grad_norm": 0.055754173547029495, + "learning_rate": 0.000109608893289527, + "loss": 0.228, + "step": 25022 + }, + { + "epoch": 2.0271386908619573, + "grad_norm": 0.0694638341665268, + "learning_rate": 0.00010960439263693236, + "loss": 0.2338, + "step": 25023 + }, + { + "epoch": 2.0272197018794555, + "grad_norm": 0.06296098977327347, + "learning_rate": 0.00010959989198433773, + "loss": 0.2104, + "step": 25024 + }, + { + "epoch": 2.027300712896954, + "grad_norm": 0.04989820346236229, + "learning_rate": 0.0001095953913317431, + "loss": 0.2395, + "step": 25025 + }, + { + "epoch": 2.0273817239144525, + "grad_norm": 0.05545510724186897, + "learning_rate": 0.00010959089067914848, + "loss": 0.243, + "step": 25026 + }, + { + "epoch": 2.0274627349319507, + "grad_norm": 0.06581075489521027, + "learning_rate": 0.00010958639002655386, + "loss": 0.273, + "step": 25027 + }, + { + "epoch": 2.027543745949449, + "grad_norm": 0.06940918415784836, + "learning_rate": 0.00010958188937395924, + "loss": 0.2525, + "step": 25028 + }, + { + "epoch": 2.0276247569669477, + "grad_norm": 0.05674520507454872, + "learning_rate": 0.00010957738872136461, + "loss": 0.2343, + "step": 25029 + }, + { + "epoch": 2.027705767984446, + "grad_norm": 0.04720667749643326, + "learning_rate": 0.00010957288806876997, + "loss": 0.2484, + "step": 25030 + }, + { + "epoch": 2.027786779001944, + "grad_norm": 0.06545969098806381, + "learning_rate": 0.00010956838741617534, + "loss": 0.239, + "step": 25031 + }, + { + "epoch": 2.027867790019443, + "grad_norm": 0.0600646436214447, + "learning_rate": 0.00010956388676358072, + "loss": 0.2443, + "step": 25032 + }, + { + "epoch": 2.027948801036941, + "grad_norm": 0.051953334361314774, + "learning_rate": 0.0001095593861109861, + "loss": 0.3033, + "step": 25033 + }, + { + "epoch": 
2.0280298120544393, + "grad_norm": 0.056979961693286896, + "learning_rate": 0.00010955488545839148, + "loss": 0.2704, + "step": 25034 + }, + { + "epoch": 2.0281108230719376, + "grad_norm": 0.0636250451207161, + "learning_rate": 0.00010955038480579685, + "loss": 0.2829, + "step": 25035 + }, + { + "epoch": 2.0281918340894363, + "grad_norm": 0.06014517694711685, + "learning_rate": 0.00010954588415320221, + "loss": 0.2934, + "step": 25036 + }, + { + "epoch": 2.0282728451069345, + "grad_norm": 0.05575292930006981, + "learning_rate": 0.00010954138350060759, + "loss": 0.2503, + "step": 25037 + }, + { + "epoch": 2.0283538561244328, + "grad_norm": 0.06069019436836243, + "learning_rate": 0.00010953688284801296, + "loss": 0.2644, + "step": 25038 + }, + { + "epoch": 2.0284348671419314, + "grad_norm": 0.055340562015771866, + "learning_rate": 0.00010953238219541835, + "loss": 0.2574, + "step": 25039 + }, + { + "epoch": 2.0285158781594297, + "grad_norm": 0.05364044010639191, + "learning_rate": 0.00010952788154282372, + "loss": 0.2622, + "step": 25040 + }, + { + "epoch": 2.028596889176928, + "grad_norm": 0.057188618928194046, + "learning_rate": 0.0001095233808902291, + "loss": 0.2789, + "step": 25041 + }, + { + "epoch": 2.0286779001944266, + "grad_norm": 0.06309512257575989, + "learning_rate": 0.00010951888023763445, + "loss": 0.2699, + "step": 25042 + }, + { + "epoch": 2.028758911211925, + "grad_norm": 0.05525052919983864, + "learning_rate": 0.00010951437958503983, + "loss": 0.2631, + "step": 25043 + }, + { + "epoch": 2.028839922229423, + "grad_norm": 0.0631512701511383, + "learning_rate": 0.00010950987893244522, + "loss": 0.2757, + "step": 25044 + }, + { + "epoch": 2.028920933246922, + "grad_norm": 0.05379737541079521, + "learning_rate": 0.00010950537827985059, + "loss": 0.2569, + "step": 25045 + }, + { + "epoch": 2.02900194426442, + "grad_norm": 0.05144206061959267, + "learning_rate": 0.00010950087762725596, + "loss": 0.2527, + "step": 25046 + }, + { + "epoch": 2.0290829552819183, + "grad_norm": 0.06055891141295433, + "learning_rate": 0.00010949637697466134, + "loss": 0.2479, + "step": 25047 + }, + { + "epoch": 2.0291639662994165, + "grad_norm": 0.06275729089975357, + "learning_rate": 0.0001094918763220667, + "loss": 0.2867, + "step": 25048 + }, + { + "epoch": 2.029244977316915, + "grad_norm": 0.05858127027750015, + "learning_rate": 0.00010948737566947207, + "loss": 0.2511, + "step": 25049 + }, + { + "epoch": 2.0293259883344135, + "grad_norm": 0.07683297991752625, + "learning_rate": 0.00010948287501687746, + "loss": 0.2954, + "step": 25050 + }, + { + "epoch": 2.0294069993519117, + "grad_norm": 0.05623006075620651, + "learning_rate": 0.00010947837436428283, + "loss": 0.3148, + "step": 25051 + }, + { + "epoch": 2.0294880103694104, + "grad_norm": 0.06020371988415718, + "learning_rate": 0.0001094738737116882, + "loss": 0.2865, + "step": 25052 + }, + { + "epoch": 2.0295690213869086, + "grad_norm": 0.057044848799705505, + "learning_rate": 0.00010946937305909358, + "loss": 0.2943, + "step": 25053 + }, + { + "epoch": 2.029650032404407, + "grad_norm": 0.05336422100663185, + "learning_rate": 0.00010946487240649894, + "loss": 0.2309, + "step": 25054 + }, + { + "epoch": 2.0297310434219056, + "grad_norm": 0.05944908782839775, + "learning_rate": 0.00010946037175390431, + "loss": 0.2163, + "step": 25055 + }, + { + "epoch": 2.029812054439404, + "grad_norm": 0.06958787888288498, + "learning_rate": 0.0001094558711013097, + "loss": 0.2916, + "step": 25056 + }, + { + "epoch": 2.029893065456902, + "grad_norm": 
0.07049243152141571, + "learning_rate": 0.00010945137044871507, + "loss": 0.2844, + "step": 25057 + }, + { + "epoch": 2.0299740764744003, + "grad_norm": 0.048220280557870865, + "learning_rate": 0.00010944686979612045, + "loss": 0.2358, + "step": 25058 + }, + { + "epoch": 2.030055087491899, + "grad_norm": 0.06429468840360641, + "learning_rate": 0.00010944236914352582, + "loss": 0.2563, + "step": 25059 + }, + { + "epoch": 2.0301360985093972, + "grad_norm": 0.06154334545135498, + "learning_rate": 0.00010943786849093118, + "loss": 0.2583, + "step": 25060 + }, + { + "epoch": 2.0302171095268955, + "grad_norm": 0.06173671409487724, + "learning_rate": 0.00010943336783833655, + "loss": 0.271, + "step": 25061 + }, + { + "epoch": 2.030298120544394, + "grad_norm": 0.06672929227352142, + "learning_rate": 0.00010942886718574194, + "loss": 0.2636, + "step": 25062 + }, + { + "epoch": 2.0303791315618924, + "grad_norm": 0.06729565560817719, + "learning_rate": 0.00010942436653314731, + "loss": 0.2563, + "step": 25063 + }, + { + "epoch": 2.0304601425793907, + "grad_norm": 0.06372225284576416, + "learning_rate": 0.00010941986588055269, + "loss": 0.2503, + "step": 25064 + }, + { + "epoch": 2.0305411535968894, + "grad_norm": 0.05387897044420242, + "learning_rate": 0.00010941536522795806, + "loss": 0.2554, + "step": 25065 + }, + { + "epoch": 2.0306221646143876, + "grad_norm": 0.050809603184461594, + "learning_rate": 0.00010941086457536342, + "loss": 0.2344, + "step": 25066 + }, + { + "epoch": 2.030703175631886, + "grad_norm": 0.06151817366480827, + "learning_rate": 0.00010940636392276882, + "loss": 0.2516, + "step": 25067 + }, + { + "epoch": 2.0307841866493845, + "grad_norm": 0.06497801095247269, + "learning_rate": 0.00010940186327017418, + "loss": 0.2661, + "step": 25068 + }, + { + "epoch": 2.030865197666883, + "grad_norm": 0.057573284953832626, + "learning_rate": 0.00010939736261757956, + "loss": 0.2439, + "step": 25069 + }, + { + "epoch": 2.030946208684381, + "grad_norm": 0.06750375032424927, + "learning_rate": 0.00010939286196498493, + "loss": 0.2727, + "step": 25070 + }, + { + "epoch": 2.0310272197018793, + "grad_norm": 0.06054481491446495, + "learning_rate": 0.0001093883613123903, + "loss": 0.2383, + "step": 25071 + }, + { + "epoch": 2.031108230719378, + "grad_norm": 0.06974566727876663, + "learning_rate": 0.00010938386065979566, + "loss": 0.298, + "step": 25072 + }, + { + "epoch": 2.031189241736876, + "grad_norm": 0.06394176185131073, + "learning_rate": 0.00010937936000720106, + "loss": 0.2388, + "step": 25073 + }, + { + "epoch": 2.0312702527543745, + "grad_norm": 0.050677455961704254, + "learning_rate": 0.00010937485935460642, + "loss": 0.2417, + "step": 25074 + }, + { + "epoch": 2.031351263771873, + "grad_norm": 0.06450121849775314, + "learning_rate": 0.0001093703587020118, + "loss": 0.2397, + "step": 25075 + }, + { + "epoch": 2.0314322747893714, + "grad_norm": 0.06229517608880997, + "learning_rate": 0.00010936585804941717, + "loss": 0.2726, + "step": 25076 + }, + { + "epoch": 2.0315132858068696, + "grad_norm": 0.06502380222082138, + "learning_rate": 0.00010936135739682254, + "loss": 0.2809, + "step": 25077 + }, + { + "epoch": 2.0315942968243683, + "grad_norm": 0.05273763835430145, + "learning_rate": 0.0001093568567442279, + "loss": 0.2218, + "step": 25078 + }, + { + "epoch": 2.0316753078418666, + "grad_norm": 0.059138379991054535, + "learning_rate": 0.0001093523560916333, + "loss": 0.259, + "step": 25079 + }, + { + "epoch": 2.031756318859365, + "grad_norm": 0.054885562509298325, + "learning_rate": 
0.00010934785543903866, + "loss": 0.2615, + "step": 25080 + }, + { + "epoch": 2.031837329876863, + "grad_norm": 0.04939741641283035, + "learning_rate": 0.00010934335478644404, + "loss": 0.2444, + "step": 25081 + }, + { + "epoch": 2.0319183408943617, + "grad_norm": 0.05478831008076668, + "learning_rate": 0.00010933885413384941, + "loss": 0.2389, + "step": 25082 + }, + { + "epoch": 2.03199935191186, + "grad_norm": 0.06696939468383789, + "learning_rate": 0.00010933435348125479, + "loss": 0.305, + "step": 25083 + }, + { + "epoch": 2.0320803629293582, + "grad_norm": 0.051382653415203094, + "learning_rate": 0.00010932985282866015, + "loss": 0.2306, + "step": 25084 + }, + { + "epoch": 2.032161373946857, + "grad_norm": 0.07961443811655045, + "learning_rate": 0.00010932535217606555, + "loss": 0.2348, + "step": 25085 + }, + { + "epoch": 2.032242384964355, + "grad_norm": 0.0601271316409111, + "learning_rate": 0.0001093208515234709, + "loss": 0.2833, + "step": 25086 + }, + { + "epoch": 2.0323233959818534, + "grad_norm": 0.062180083245038986, + "learning_rate": 0.00010931635087087628, + "loss": 0.2535, + "step": 25087 + }, + { + "epoch": 2.032404406999352, + "grad_norm": 0.07181721180677414, + "learning_rate": 0.00010931185021828165, + "loss": 0.2996, + "step": 25088 + }, + { + "epoch": 2.0324854180168503, + "grad_norm": 0.04999459907412529, + "learning_rate": 0.00010930734956568703, + "loss": 0.2413, + "step": 25089 + }, + { + "epoch": 2.0325664290343486, + "grad_norm": 0.05797059088945389, + "learning_rate": 0.00010930284891309239, + "loss": 0.2754, + "step": 25090 + }, + { + "epoch": 2.0326474400518473, + "grad_norm": 0.0662585198879242, + "learning_rate": 0.00010929834826049779, + "loss": 0.2687, + "step": 25091 + }, + { + "epoch": 2.0327284510693455, + "grad_norm": 0.06757596880197525, + "learning_rate": 0.00010929384760790316, + "loss": 0.2941, + "step": 25092 + }, + { + "epoch": 2.0328094620868438, + "grad_norm": 0.05894165858626366, + "learning_rate": 0.00010928934695530852, + "loss": 0.3183, + "step": 25093 + }, + { + "epoch": 2.032890473104342, + "grad_norm": 0.0708579570055008, + "learning_rate": 0.0001092848463027139, + "loss": 0.2966, + "step": 25094 + }, + { + "epoch": 2.0329714841218407, + "grad_norm": 0.06730002164840698, + "learning_rate": 0.00010928034565011927, + "loss": 0.239, + "step": 25095 + }, + { + "epoch": 2.033052495139339, + "grad_norm": 0.04710034653544426, + "learning_rate": 0.00010927584499752466, + "loss": 0.2072, + "step": 25096 + }, + { + "epoch": 2.033133506156837, + "grad_norm": 0.08717244863510132, + "learning_rate": 0.00010927134434493003, + "loss": 0.3157, + "step": 25097 + }, + { + "epoch": 2.033214517174336, + "grad_norm": 0.060938701033592224, + "learning_rate": 0.0001092668436923354, + "loss": 0.3116, + "step": 25098 + }, + { + "epoch": 2.033295528191834, + "grad_norm": 0.062359604984521866, + "learning_rate": 0.00010926234303974076, + "loss": 0.2838, + "step": 25099 + }, + { + "epoch": 2.0333765392093324, + "grad_norm": 0.06159916892647743, + "learning_rate": 0.00010925784238714614, + "loss": 0.2623, + "step": 25100 + }, + { + "epoch": 2.033457550226831, + "grad_norm": 0.06426838785409927, + "learning_rate": 0.00010925334173455151, + "loss": 0.2926, + "step": 25101 + }, + { + "epoch": 2.0335385612443293, + "grad_norm": 0.06911225616931915, + "learning_rate": 0.0001092488410819569, + "loss": 0.2972, + "step": 25102 + }, + { + "epoch": 2.0336195722618275, + "grad_norm": 0.057620491832494736, + "learning_rate": 0.00010924434042936227, + "loss": 0.2462, + "step": 
25103 + }, + { + "epoch": 2.033700583279326, + "grad_norm": 0.058398373425006866, + "learning_rate": 0.00010923983977676764, + "loss": 0.2825, + "step": 25104 + }, + { + "epoch": 2.0337815942968245, + "grad_norm": 0.06570584326982498, + "learning_rate": 0.000109235339124173, + "loss": 0.2713, + "step": 25105 + }, + { + "epoch": 2.0338626053143227, + "grad_norm": 0.0867527574300766, + "learning_rate": 0.00010923083847157838, + "loss": 0.2968, + "step": 25106 + }, + { + "epoch": 2.033943616331821, + "grad_norm": 0.05803163722157478, + "learning_rate": 0.00010922633781898375, + "loss": 0.2257, + "step": 25107 + }, + { + "epoch": 2.0340246273493197, + "grad_norm": 0.06863657385110855, + "learning_rate": 0.00010922183716638914, + "loss": 0.2413, + "step": 25108 + }, + { + "epoch": 2.034105638366818, + "grad_norm": 0.05453773960471153, + "learning_rate": 0.00010921733651379451, + "loss": 0.2746, + "step": 25109 + }, + { + "epoch": 2.034186649384316, + "grad_norm": 0.05881081521511078, + "learning_rate": 0.00010921283586119989, + "loss": 0.2647, + "step": 25110 + }, + { + "epoch": 2.034267660401815, + "grad_norm": 0.054627347737550735, + "learning_rate": 0.00010920833520860525, + "loss": 0.2671, + "step": 25111 + }, + { + "epoch": 2.034348671419313, + "grad_norm": 0.0621161125600338, + "learning_rate": 0.00010920383455601062, + "loss": 0.2506, + "step": 25112 + }, + { + "epoch": 2.0344296824368113, + "grad_norm": 0.0610932894051075, + "learning_rate": 0.000109199333903416, + "loss": 0.2802, + "step": 25113 + }, + { + "epoch": 2.0345106934543096, + "grad_norm": 0.062395934015512466, + "learning_rate": 0.00010919483325082138, + "loss": 0.2289, + "step": 25114 + }, + { + "epoch": 2.0345917044718083, + "grad_norm": 0.06225151568651199, + "learning_rate": 0.00010919033259822675, + "loss": 0.3058, + "step": 25115 + }, + { + "epoch": 2.0346727154893065, + "grad_norm": 0.05295949801802635, + "learning_rate": 0.00010918583194563213, + "loss": 0.2118, + "step": 25116 + }, + { + "epoch": 2.0347537265068047, + "grad_norm": 0.06223113089799881, + "learning_rate": 0.00010918133129303749, + "loss": 0.2409, + "step": 25117 + }, + { + "epoch": 2.0348347375243034, + "grad_norm": 0.0673019140958786, + "learning_rate": 0.00010917683064044286, + "loss": 0.2742, + "step": 25118 + }, + { + "epoch": 2.0349157485418017, + "grad_norm": 0.06154569610953331, + "learning_rate": 0.00010917232998784825, + "loss": 0.278, + "step": 25119 + }, + { + "epoch": 2.0349967595593, + "grad_norm": 0.06451249122619629, + "learning_rate": 0.00010916782933525362, + "loss": 0.2448, + "step": 25120 + }, + { + "epoch": 2.0350777705767986, + "grad_norm": 0.05225265398621559, + "learning_rate": 0.000109163328682659, + "loss": 0.2114, + "step": 25121 + }, + { + "epoch": 2.035158781594297, + "grad_norm": 0.06646935641765594, + "learning_rate": 0.00010915882803006437, + "loss": 0.251, + "step": 25122 + }, + { + "epoch": 2.035239792611795, + "grad_norm": 0.05266328155994415, + "learning_rate": 0.00010915432737746973, + "loss": 0.2568, + "step": 25123 + }, + { + "epoch": 2.035320803629294, + "grad_norm": 0.06037535145878792, + "learning_rate": 0.0001091498267248751, + "loss": 0.2778, + "step": 25124 + }, + { + "epoch": 2.035401814646792, + "grad_norm": 0.05661199986934662, + "learning_rate": 0.00010914532607228049, + "loss": 0.265, + "step": 25125 + }, + { + "epoch": 2.0354828256642903, + "grad_norm": 0.06349622458219528, + "learning_rate": 0.00010914082541968586, + "loss": 0.2316, + "step": 25126 + }, + { + "epoch": 2.0355638366817885, + 
"grad_norm": 0.06441009044647217, + "learning_rate": 0.00010913632476709124, + "loss": 0.2619, + "step": 25127 + }, + { + "epoch": 2.035644847699287, + "grad_norm": 0.057916779071092606, + "learning_rate": 0.00010913182411449661, + "loss": 0.2373, + "step": 25128 + }, + { + "epoch": 2.0357258587167855, + "grad_norm": 0.07067728787660599, + "learning_rate": 0.00010912732346190197, + "loss": 0.269, + "step": 25129 + }, + { + "epoch": 2.0358068697342837, + "grad_norm": 0.05133752524852753, + "learning_rate": 0.00010912282280930734, + "loss": 0.2378, + "step": 25130 + }, + { + "epoch": 2.0358878807517824, + "grad_norm": 0.053712695837020874, + "learning_rate": 0.00010911832215671273, + "loss": 0.2077, + "step": 25131 + }, + { + "epoch": 2.0359688917692806, + "grad_norm": 0.058011818677186966, + "learning_rate": 0.0001091138215041181, + "loss": 0.2455, + "step": 25132 + }, + { + "epoch": 2.036049902786779, + "grad_norm": 0.07174625247716904, + "learning_rate": 0.00010910932085152348, + "loss": 0.3063, + "step": 25133 + }, + { + "epoch": 2.0361309138042776, + "grad_norm": 0.05659336969256401, + "learning_rate": 0.00010910482019892885, + "loss": 0.236, + "step": 25134 + }, + { + "epoch": 2.036211924821776, + "grad_norm": 0.06388983875513077, + "learning_rate": 0.00010910031954633421, + "loss": 0.2843, + "step": 25135 + }, + { + "epoch": 2.036292935839274, + "grad_norm": 0.07428158819675446, + "learning_rate": 0.00010909581889373959, + "loss": 0.2687, + "step": 25136 + }, + { + "epoch": 2.0363739468567723, + "grad_norm": 0.06257401406764984, + "learning_rate": 0.00010909131824114497, + "loss": 0.2659, + "step": 25137 + }, + { + "epoch": 2.036454957874271, + "grad_norm": 0.06060203164815903, + "learning_rate": 0.00010908681758855035, + "loss": 0.2578, + "step": 25138 + }, + { + "epoch": 2.0365359688917692, + "grad_norm": 0.08558185398578644, + "learning_rate": 0.00010908231693595572, + "loss": 0.2567, + "step": 25139 + }, + { + "epoch": 2.0366169799092675, + "grad_norm": 0.08406521379947662, + "learning_rate": 0.0001090778162833611, + "loss": 0.2774, + "step": 25140 + }, + { + "epoch": 2.036697990926766, + "grad_norm": 0.054151203483343124, + "learning_rate": 0.00010907331563076645, + "loss": 0.2609, + "step": 25141 + }, + { + "epoch": 2.0367790019442644, + "grad_norm": 0.06468921899795532, + "learning_rate": 0.00010906881497817183, + "loss": 0.2595, + "step": 25142 + }, + { + "epoch": 2.0368600129617627, + "grad_norm": 0.055418115109205246, + "learning_rate": 0.00010906431432557722, + "loss": 0.2319, + "step": 25143 + }, + { + "epoch": 2.0369410239792614, + "grad_norm": 0.05805583298206329, + "learning_rate": 0.00010905981367298259, + "loss": 0.2811, + "step": 25144 + }, + { + "epoch": 2.0370220349967596, + "grad_norm": 0.05657727271318436, + "learning_rate": 0.00010905531302038796, + "loss": 0.2416, + "step": 25145 + }, + { + "epoch": 2.037103046014258, + "grad_norm": 0.05590350925922394, + "learning_rate": 0.00010905081236779334, + "loss": 0.2429, + "step": 25146 + }, + { + "epoch": 2.0371840570317565, + "grad_norm": 0.06622657924890518, + "learning_rate": 0.0001090463117151987, + "loss": 0.2972, + "step": 25147 + }, + { + "epoch": 2.037265068049255, + "grad_norm": 0.06123768165707588, + "learning_rate": 0.0001090418110626041, + "loss": 0.2762, + "step": 25148 + }, + { + "epoch": 2.037346079066753, + "grad_norm": 0.062109388411045074, + "learning_rate": 0.00010903731041000946, + "loss": 0.2418, + "step": 25149 + }, + { + "epoch": 2.0374270900842513, + "grad_norm": 0.06358703225851059, + 
"learning_rate": 0.00010903280975741483, + "loss": 0.2675, + "step": 25150 + }, + { + "epoch": 2.03750810110175, + "grad_norm": 0.05100923404097557, + "learning_rate": 0.0001090283091048202, + "loss": 0.2346, + "step": 25151 + }, + { + "epoch": 2.037589112119248, + "grad_norm": 0.06406904757022858, + "learning_rate": 0.00010902380845222558, + "loss": 0.2638, + "step": 25152 + }, + { + "epoch": 2.0376701231367464, + "grad_norm": 0.06896286457777023, + "learning_rate": 0.00010901930779963094, + "loss": 0.2405, + "step": 25153 + }, + { + "epoch": 2.037751134154245, + "grad_norm": 0.05823047086596489, + "learning_rate": 0.00010901480714703634, + "loss": 0.2461, + "step": 25154 + }, + { + "epoch": 2.0378321451717434, + "grad_norm": 0.07670606672763824, + "learning_rate": 0.0001090103064944417, + "loss": 0.2884, + "step": 25155 + }, + { + "epoch": 2.0379131561892416, + "grad_norm": 0.1217048391699791, + "learning_rate": 0.00010900580584184707, + "loss": 0.3086, + "step": 25156 + }, + { + "epoch": 2.0379941672067403, + "grad_norm": 0.05431569740176201, + "learning_rate": 0.00010900130518925245, + "loss": 0.2592, + "step": 25157 + }, + { + "epoch": 2.0380751782242386, + "grad_norm": 0.0681622326374054, + "learning_rate": 0.00010899680453665782, + "loss": 0.2489, + "step": 25158 + }, + { + "epoch": 2.038156189241737, + "grad_norm": 0.05610613897442818, + "learning_rate": 0.00010899230388406318, + "loss": 0.2317, + "step": 25159 + }, + { + "epoch": 2.038237200259235, + "grad_norm": 0.06325603276491165, + "learning_rate": 0.00010898780323146858, + "loss": 0.2827, + "step": 25160 + }, + { + "epoch": 2.0383182112767337, + "grad_norm": 0.069940485060215, + "learning_rate": 0.00010898330257887395, + "loss": 0.249, + "step": 25161 + }, + { + "epoch": 2.038399222294232, + "grad_norm": 0.059809450060129166, + "learning_rate": 0.00010897880192627931, + "loss": 0.2967, + "step": 25162 + }, + { + "epoch": 2.0384802333117302, + "grad_norm": 0.06071152910590172, + "learning_rate": 0.00010897430127368469, + "loss": 0.254, + "step": 25163 + }, + { + "epoch": 2.038561244329229, + "grad_norm": 0.06979705393314362, + "learning_rate": 0.00010896980062109006, + "loss": 0.2616, + "step": 25164 + }, + { + "epoch": 2.038642255346727, + "grad_norm": 0.06330650299787521, + "learning_rate": 0.00010896529996849542, + "loss": 0.2712, + "step": 25165 + }, + { + "epoch": 2.0387232663642254, + "grad_norm": 0.05924196541309357, + "learning_rate": 0.00010896079931590082, + "loss": 0.2434, + "step": 25166 + }, + { + "epoch": 2.038804277381724, + "grad_norm": 0.05195455253124237, + "learning_rate": 0.0001089562986633062, + "loss": 0.2472, + "step": 25167 + }, + { + "epoch": 2.0388852883992223, + "grad_norm": 0.0717695876955986, + "learning_rate": 0.00010895179801071156, + "loss": 0.2771, + "step": 25168 + }, + { + "epoch": 2.0389662994167206, + "grad_norm": 0.05342751368880272, + "learning_rate": 0.00010894729735811693, + "loss": 0.2439, + "step": 25169 + }, + { + "epoch": 2.039047310434219, + "grad_norm": 0.05817456170916557, + "learning_rate": 0.0001089427967055223, + "loss": 0.2545, + "step": 25170 + }, + { + "epoch": 2.0391283214517175, + "grad_norm": 0.06112273782491684, + "learning_rate": 0.00010893829605292769, + "loss": 0.2924, + "step": 25171 + }, + { + "epoch": 2.0392093324692158, + "grad_norm": 0.05255928635597229, + "learning_rate": 0.00010893379540033306, + "loss": 0.2109, + "step": 25172 + }, + { + "epoch": 2.039290343486714, + "grad_norm": 0.051350079476833344, + "learning_rate": 0.00010892929474773844, + "loss": 
0.2313, + "step": 25173 + }, + { + "epoch": 2.0393713545042127, + "grad_norm": 0.06925446540117264, + "learning_rate": 0.0001089247940951438, + "loss": 0.2773, + "step": 25174 + }, + { + "epoch": 2.039452365521711, + "grad_norm": 0.061324674636125565, + "learning_rate": 0.00010892029344254917, + "loss": 0.2867, + "step": 25175 + }, + { + "epoch": 2.039533376539209, + "grad_norm": 0.06071063503623009, + "learning_rate": 0.00010891579278995454, + "loss": 0.2643, + "step": 25176 + }, + { + "epoch": 2.039614387556708, + "grad_norm": 0.06241742521524429, + "learning_rate": 0.00010891129213735993, + "loss": 0.2664, + "step": 25177 + }, + { + "epoch": 2.039695398574206, + "grad_norm": 0.06371390074491501, + "learning_rate": 0.0001089067914847653, + "loss": 0.258, + "step": 25178 + }, + { + "epoch": 2.0397764095917044, + "grad_norm": 0.047560915350914, + "learning_rate": 0.00010890229083217068, + "loss": 0.2607, + "step": 25179 + }, + { + "epoch": 2.039857420609203, + "grad_norm": 0.0704430490732193, + "learning_rate": 0.00010889779017957604, + "loss": 0.2488, + "step": 25180 + }, + { + "epoch": 2.0399384316267013, + "grad_norm": 0.06133200600743294, + "learning_rate": 0.00010889328952698141, + "loss": 0.299, + "step": 25181 + }, + { + "epoch": 2.0400194426441995, + "grad_norm": 0.06216515600681305, + "learning_rate": 0.00010888878887438679, + "loss": 0.2495, + "step": 25182 + }, + { + "epoch": 2.040100453661698, + "grad_norm": 0.08171521127223969, + "learning_rate": 0.00010888428822179217, + "loss": 0.2662, + "step": 25183 + }, + { + "epoch": 2.0401814646791965, + "grad_norm": 0.06202945485711098, + "learning_rate": 0.00010887978756919755, + "loss": 0.2791, + "step": 25184 + }, + { + "epoch": 2.0402624756966947, + "grad_norm": 0.05931111425161362, + "learning_rate": 0.00010887528691660292, + "loss": 0.2811, + "step": 25185 + }, + { + "epoch": 2.040343486714193, + "grad_norm": 0.06533900648355484, + "learning_rate": 0.00010887078626400828, + "loss": 0.2494, + "step": 25186 + }, + { + "epoch": 2.0404244977316917, + "grad_norm": 0.04683497175574303, + "learning_rate": 0.00010886628561141365, + "loss": 0.2489, + "step": 25187 + }, + { + "epoch": 2.04050550874919, + "grad_norm": 0.06653185933828354, + "learning_rate": 0.00010886178495881903, + "loss": 0.2598, + "step": 25188 + }, + { + "epoch": 2.040586519766688, + "grad_norm": 0.06457366049289703, + "learning_rate": 0.00010885728430622441, + "loss": 0.2452, + "step": 25189 + }, + { + "epoch": 2.040667530784187, + "grad_norm": 0.046025507152080536, + "learning_rate": 0.00010885278365362979, + "loss": 0.2694, + "step": 25190 + }, + { + "epoch": 2.040748541801685, + "grad_norm": 0.05456798896193504, + "learning_rate": 0.00010884828300103516, + "loss": 0.2243, + "step": 25191 + }, + { + "epoch": 2.0408295528191833, + "grad_norm": 0.07598184794187546, + "learning_rate": 0.00010884378234844052, + "loss": 0.26, + "step": 25192 + }, + { + "epoch": 2.0409105638366816, + "grad_norm": 0.05668919160962105, + "learning_rate": 0.0001088392816958459, + "loss": 0.2743, + "step": 25193 + }, + { + "epoch": 2.0409915748541803, + "grad_norm": 0.06732729822397232, + "learning_rate": 0.00010883478104325127, + "loss": 0.2668, + "step": 25194 + }, + { + "epoch": 2.0410725858716785, + "grad_norm": 0.06389015913009644, + "learning_rate": 0.00010883028039065666, + "loss": 0.2835, + "step": 25195 + }, + { + "epoch": 2.0411535968891767, + "grad_norm": 0.06920386850833893, + "learning_rate": 0.00010882577973806203, + "loss": 0.2426, + "step": 25196 + }, + { + "epoch": 
2.0412346079066754, + "grad_norm": 0.06971029192209244, + "learning_rate": 0.0001088212790854674, + "loss": 0.2717, + "step": 25197 + }, + { + "epoch": 2.0413156189241737, + "grad_norm": 0.06330526620149612, + "learning_rate": 0.00010881677843287276, + "loss": 0.2365, + "step": 25198 + }, + { + "epoch": 2.041396629941672, + "grad_norm": 0.0685361996293068, + "learning_rate": 0.00010881227778027814, + "loss": 0.279, + "step": 25199 + }, + { + "epoch": 2.0414776409591706, + "grad_norm": 0.05792514234781265, + "learning_rate": 0.00010880777712768352, + "loss": 0.2366, + "step": 25200 + }, + { + "epoch": 2.041558651976669, + "grad_norm": 0.06563431024551392, + "learning_rate": 0.0001088032764750889, + "loss": 0.266, + "step": 25201 + }, + { + "epoch": 2.041639662994167, + "grad_norm": 0.05215870961546898, + "learning_rate": 0.00010879877582249427, + "loss": 0.2613, + "step": 25202 + }, + { + "epoch": 2.041720674011666, + "grad_norm": 0.06881966441869736, + "learning_rate": 0.00010879427516989965, + "loss": 0.2992, + "step": 25203 + }, + { + "epoch": 2.041801685029164, + "grad_norm": 0.054352544248104095, + "learning_rate": 0.000108789774517305, + "loss": 0.2296, + "step": 25204 + }, + { + "epoch": 2.0418826960466623, + "grad_norm": 0.08665680140256882, + "learning_rate": 0.00010878527386471038, + "loss": 0.2642, + "step": 25205 + }, + { + "epoch": 2.0419637070641605, + "grad_norm": 0.06520480662584305, + "learning_rate": 0.00010878077321211577, + "loss": 0.2686, + "step": 25206 + }, + { + "epoch": 2.042044718081659, + "grad_norm": 0.057679660618305206, + "learning_rate": 0.00010877627255952114, + "loss": 0.218, + "step": 25207 + }, + { + "epoch": 2.0421257290991575, + "grad_norm": 0.06190623342990875, + "learning_rate": 0.00010877177190692651, + "loss": 0.2329, + "step": 25208 + }, + { + "epoch": 2.0422067401166557, + "grad_norm": 0.07380854338407516, + "learning_rate": 0.00010876727125433189, + "loss": 0.2816, + "step": 25209 + }, + { + "epoch": 2.0422877511341544, + "grad_norm": 0.059125471860170364, + "learning_rate": 0.00010876277060173725, + "loss": 0.2596, + "step": 25210 + }, + { + "epoch": 2.0423687621516526, + "grad_norm": 0.05417705699801445, + "learning_rate": 0.00010875826994914262, + "loss": 0.2192, + "step": 25211 + }, + { + "epoch": 2.042449773169151, + "grad_norm": 0.054412972182035446, + "learning_rate": 0.00010875376929654801, + "loss": 0.2798, + "step": 25212 + }, + { + "epoch": 2.0425307841866496, + "grad_norm": 0.05146175995469093, + "learning_rate": 0.00010874926864395338, + "loss": 0.234, + "step": 25213 + }, + { + "epoch": 2.042611795204148, + "grad_norm": 0.0554482527077198, + "learning_rate": 0.00010874476799135875, + "loss": 0.2545, + "step": 25214 + }, + { + "epoch": 2.042692806221646, + "grad_norm": 0.06421638280153275, + "learning_rate": 0.00010874026733876413, + "loss": 0.2538, + "step": 25215 + }, + { + "epoch": 2.0427738172391443, + "grad_norm": 0.06408846378326416, + "learning_rate": 0.00010873576668616949, + "loss": 0.2672, + "step": 25216 + }, + { + "epoch": 2.042854828256643, + "grad_norm": 0.06903623789548874, + "learning_rate": 0.00010873126603357486, + "loss": 0.2385, + "step": 25217 + }, + { + "epoch": 2.0429358392741412, + "grad_norm": 0.06604668498039246, + "learning_rate": 0.00010872676538098025, + "loss": 0.247, + "step": 25218 + }, + { + "epoch": 2.0430168502916395, + "grad_norm": 0.06986042857170105, + "learning_rate": 0.00010872226472838562, + "loss": 0.2529, + "step": 25219 + }, + { + "epoch": 2.043097861309138, + "grad_norm": 
0.05973782390356064, + "learning_rate": 0.000108717764075791, + "loss": 0.2432, + "step": 25220 + }, + { + "epoch": 2.0431788723266364, + "grad_norm": 0.06510750204324722, + "learning_rate": 0.00010871326342319637, + "loss": 0.2568, + "step": 25221 + }, + { + "epoch": 2.0432598833441347, + "grad_norm": 0.057255763560533524, + "learning_rate": 0.00010870876277060173, + "loss": 0.2666, + "step": 25222 + }, + { + "epoch": 2.0433408943616334, + "grad_norm": 0.061788126826286316, + "learning_rate": 0.0001087042621180071, + "loss": 0.2459, + "step": 25223 + }, + { + "epoch": 2.0434219053791316, + "grad_norm": 0.058773137629032135, + "learning_rate": 0.00010869976146541249, + "loss": 0.2736, + "step": 25224 + }, + { + "epoch": 2.04350291639663, + "grad_norm": 0.0588141568005085, + "learning_rate": 0.00010869526081281786, + "loss": 0.2935, + "step": 25225 + }, + { + "epoch": 2.0435839274141285, + "grad_norm": 0.06576879322528839, + "learning_rate": 0.00010869076016022324, + "loss": 0.2801, + "step": 25226 + }, + { + "epoch": 2.0436649384316268, + "grad_norm": 0.07062683999538422, + "learning_rate": 0.00010868625950762861, + "loss": 0.26, + "step": 25227 + }, + { + "epoch": 2.043745949449125, + "grad_norm": 0.07221104204654694, + "learning_rate": 0.00010868175885503397, + "loss": 0.2598, + "step": 25228 + }, + { + "epoch": 2.0438269604666233, + "grad_norm": 0.05775179713964462, + "learning_rate": 0.00010867725820243937, + "loss": 0.2565, + "step": 25229 + }, + { + "epoch": 2.043907971484122, + "grad_norm": 0.06143064796924591, + "learning_rate": 0.00010867275754984475, + "loss": 0.2412, + "step": 25230 + }, + { + "epoch": 2.04398898250162, + "grad_norm": 0.0679447203874588, + "learning_rate": 0.0001086682568972501, + "loss": 0.2682, + "step": 25231 + }, + { + "epoch": 2.0440699935191184, + "grad_norm": 0.05769611522555351, + "learning_rate": 0.00010866375624465548, + "loss": 0.2428, + "step": 25232 + }, + { + "epoch": 2.044151004536617, + "grad_norm": 0.05339081957936287, + "learning_rate": 0.00010865925559206085, + "loss": 0.2342, + "step": 25233 + }, + { + "epoch": 2.0442320155541154, + "grad_norm": 0.05419541150331497, + "learning_rate": 0.00010865475493946621, + "loss": 0.2811, + "step": 25234 + }, + { + "epoch": 2.0443130265716136, + "grad_norm": 0.06672076135873795, + "learning_rate": 0.00010865025428687161, + "loss": 0.2646, + "step": 25235 + }, + { + "epoch": 2.0443940375891123, + "grad_norm": 0.06640081852674484, + "learning_rate": 0.00010864575363427699, + "loss": 0.2206, + "step": 25236 + }, + { + "epoch": 2.0444750486066106, + "grad_norm": 0.06232764199376106, + "learning_rate": 0.00010864125298168235, + "loss": 0.2563, + "step": 25237 + }, + { + "epoch": 2.044556059624109, + "grad_norm": 0.08199611306190491, + "learning_rate": 0.00010863675232908772, + "loss": 0.2992, + "step": 25238 + }, + { + "epoch": 2.044637070641607, + "grad_norm": 0.06076132878661156, + "learning_rate": 0.0001086322516764931, + "loss": 0.2602, + "step": 25239 + }, + { + "epoch": 2.0447180816591057, + "grad_norm": 0.06518393754959106, + "learning_rate": 0.00010862775102389845, + "loss": 0.2383, + "step": 25240 + }, + { + "epoch": 2.044799092676604, + "grad_norm": 0.05123493820428848, + "learning_rate": 0.00010862325037130386, + "loss": 0.2327, + "step": 25241 + }, + { + "epoch": 2.0448801036941022, + "grad_norm": 0.05127443000674248, + "learning_rate": 0.00010861874971870923, + "loss": 0.2443, + "step": 25242 + }, + { + "epoch": 2.044961114711601, + "grad_norm": 0.05903277546167374, + "learning_rate": 
0.00010861424906611459, + "loss": 0.2386, + "step": 25243 + }, + { + "epoch": 2.045042125729099, + "grad_norm": 0.06020485237240791, + "learning_rate": 0.00010860974841351996, + "loss": 0.2262, + "step": 25244 + }, + { + "epoch": 2.0451231367465974, + "grad_norm": 0.05409131199121475, + "learning_rate": 0.00010860524776092534, + "loss": 0.2589, + "step": 25245 + }, + { + "epoch": 2.045204147764096, + "grad_norm": 0.060740187764167786, + "learning_rate": 0.0001086007471083307, + "loss": 0.2329, + "step": 25246 + }, + { + "epoch": 2.0452851587815943, + "grad_norm": 0.06752003729343414, + "learning_rate": 0.0001085962464557361, + "loss": 0.2297, + "step": 25247 + }, + { + "epoch": 2.0453661697990926, + "grad_norm": 0.06496375054121017, + "learning_rate": 0.00010859174580314147, + "loss": 0.2775, + "step": 25248 + }, + { + "epoch": 2.0454471808165913, + "grad_norm": 0.059181131422519684, + "learning_rate": 0.00010858724515054683, + "loss": 0.2275, + "step": 25249 + }, + { + "epoch": 2.0455281918340895, + "grad_norm": 0.05792650207877159, + "learning_rate": 0.0001085827444979522, + "loss": 0.2853, + "step": 25250 + }, + { + "epoch": 2.0456092028515878, + "grad_norm": 0.059535253793001175, + "learning_rate": 0.00010857824384535758, + "loss": 0.2278, + "step": 25251 + }, + { + "epoch": 2.045690213869086, + "grad_norm": 0.0699746236205101, + "learning_rate": 0.00010857374319276297, + "loss": 0.2716, + "step": 25252 + }, + { + "epoch": 2.0457712248865847, + "grad_norm": 0.060232095420360565, + "learning_rate": 0.00010856924254016834, + "loss": 0.2602, + "step": 25253 + }, + { + "epoch": 2.045852235904083, + "grad_norm": 0.08279857039451599, + "learning_rate": 0.00010856474188757371, + "loss": 0.28, + "step": 25254 + }, + { + "epoch": 2.045933246921581, + "grad_norm": 0.07259318977594376, + "learning_rate": 0.00010856024123497907, + "loss": 0.2621, + "step": 25255 + }, + { + "epoch": 2.04601425793908, + "grad_norm": 0.05575643107295036, + "learning_rate": 0.00010855574058238445, + "loss": 0.2717, + "step": 25256 + }, + { + "epoch": 2.046095268956578, + "grad_norm": 0.06759722530841827, + "learning_rate": 0.00010855123992978982, + "loss": 0.2664, + "step": 25257 + }, + { + "epoch": 2.0461762799740764, + "grad_norm": 0.06458771973848343, + "learning_rate": 0.00010854673927719521, + "loss": 0.2659, + "step": 25258 + }, + { + "epoch": 2.046257290991575, + "grad_norm": 0.06146334111690521, + "learning_rate": 0.00010854223862460058, + "loss": 0.2536, + "step": 25259 + }, + { + "epoch": 2.0463383020090733, + "grad_norm": 0.054808396846055984, + "learning_rate": 0.00010853773797200595, + "loss": 0.2369, + "step": 25260 + }, + { + "epoch": 2.0464193130265715, + "grad_norm": 0.05751333385705948, + "learning_rate": 0.00010853323731941131, + "loss": 0.2482, + "step": 25261 + }, + { + "epoch": 2.04650032404407, + "grad_norm": 0.0674423947930336, + "learning_rate": 0.00010852873666681669, + "loss": 0.2647, + "step": 25262 + }, + { + "epoch": 2.0465813350615685, + "grad_norm": 0.0590125173330307, + "learning_rate": 0.00010852423601422206, + "loss": 0.2987, + "step": 25263 + }, + { + "epoch": 2.0466623460790667, + "grad_norm": 0.07027654349803925, + "learning_rate": 0.00010851973536162745, + "loss": 0.3088, + "step": 25264 + }, + { + "epoch": 2.046743357096565, + "grad_norm": 0.05695608630776405, + "learning_rate": 0.00010851523470903282, + "loss": 0.2663, + "step": 25265 + }, + { + "epoch": 2.0468243681140637, + "grad_norm": 0.06685222685337067, + "learning_rate": 0.0001085107340564382, + "loss": 0.3287, + "step": 
25266 + }, + { + "epoch": 2.046905379131562, + "grad_norm": 0.055971141904592514, + "learning_rate": 0.00010850623340384356, + "loss": 0.2208, + "step": 25267 + }, + { + "epoch": 2.04698639014906, + "grad_norm": 0.05768276005983353, + "learning_rate": 0.00010850173275124893, + "loss": 0.2425, + "step": 25268 + }, + { + "epoch": 2.047067401166559, + "grad_norm": 0.06055925786495209, + "learning_rate": 0.0001084972320986543, + "loss": 0.2441, + "step": 25269 + }, + { + "epoch": 2.047148412184057, + "grad_norm": 0.06709778308868408, + "learning_rate": 0.00010849273144605969, + "loss": 0.2709, + "step": 25270 + }, + { + "epoch": 2.0472294232015553, + "grad_norm": 0.07295244932174683, + "learning_rate": 0.00010848823079346506, + "loss": 0.2556, + "step": 25271 + }, + { + "epoch": 2.047310434219054, + "grad_norm": 0.0473698154091835, + "learning_rate": 0.00010848373014087044, + "loss": 0.2631, + "step": 25272 + }, + { + "epoch": 2.0473914452365523, + "grad_norm": 0.06414885818958282, + "learning_rate": 0.0001084792294882758, + "loss": 0.2333, + "step": 25273 + }, + { + "epoch": 2.0474724562540505, + "grad_norm": 0.06567434221506119, + "learning_rate": 0.00010847472883568117, + "loss": 0.2742, + "step": 25274 + }, + { + "epoch": 2.0475534672715487, + "grad_norm": 0.0706758126616478, + "learning_rate": 0.00010847022818308654, + "loss": 0.2413, + "step": 25275 + }, + { + "epoch": 2.0476344782890474, + "grad_norm": 0.05288252234458923, + "learning_rate": 0.00010846572753049193, + "loss": 0.217, + "step": 25276 + }, + { + "epoch": 2.0477154893065457, + "grad_norm": 0.06759516894817352, + "learning_rate": 0.0001084612268778973, + "loss": 0.2626, + "step": 25277 + }, + { + "epoch": 2.047796500324044, + "grad_norm": 0.05509433150291443, + "learning_rate": 0.00010845672622530268, + "loss": 0.2362, + "step": 25278 + }, + { + "epoch": 2.0478775113415426, + "grad_norm": 0.06873355805873871, + "learning_rate": 0.00010845222557270804, + "loss": 0.2722, + "step": 25279 + }, + { + "epoch": 2.047958522359041, + "grad_norm": 0.07308941334486008, + "learning_rate": 0.00010844772492011341, + "loss": 0.2349, + "step": 25280 + }, + { + "epoch": 2.048039533376539, + "grad_norm": 0.06773709505796432, + "learning_rate": 0.0001084432242675188, + "loss": 0.2699, + "step": 25281 + }, + { + "epoch": 2.048120544394038, + "grad_norm": 0.052150875329971313, + "learning_rate": 0.00010843872361492417, + "loss": 0.2775, + "step": 25282 + }, + { + "epoch": 2.048201555411536, + "grad_norm": 0.06018095463514328, + "learning_rate": 0.00010843422296232955, + "loss": 0.2788, + "step": 25283 + }, + { + "epoch": 2.0482825664290343, + "grad_norm": 0.06471269577741623, + "learning_rate": 0.00010842972230973492, + "loss": 0.2316, + "step": 25284 + }, + { + "epoch": 2.0483635774465325, + "grad_norm": 0.06936302036046982, + "learning_rate": 0.00010842522165714028, + "loss": 0.3311, + "step": 25285 + }, + { + "epoch": 2.048444588464031, + "grad_norm": 0.06158831715583801, + "learning_rate": 0.00010842072100454565, + "loss": 0.2261, + "step": 25286 + }, + { + "epoch": 2.0485255994815295, + "grad_norm": 0.061135925352573395, + "learning_rate": 0.00010841622035195104, + "loss": 0.2825, + "step": 25287 + }, + { + "epoch": 2.0486066104990277, + "grad_norm": 0.06801503151655197, + "learning_rate": 0.00010841171969935641, + "loss": 0.2833, + "step": 25288 + }, + { + "epoch": 2.0486876215165264, + "grad_norm": 0.052315711975097656, + "learning_rate": 0.00010840721904676179, + "loss": 0.235, + "step": 25289 + }, + { + "epoch": 2.0487686325340246, + 
"grad_norm": 0.0969030037522316, + "learning_rate": 0.00010840271839416716, + "loss": 0.259, + "step": 25290 + }, + { + "epoch": 2.048849643551523, + "grad_norm": 0.07182536274194717, + "learning_rate": 0.00010839821774157252, + "loss": 0.2531, + "step": 25291 + }, + { + "epoch": 2.0489306545690216, + "grad_norm": 0.0598326101899147, + "learning_rate": 0.0001083937170889779, + "loss": 0.251, + "step": 25292 + }, + { + "epoch": 2.04901166558652, + "grad_norm": 0.060341332107782364, + "learning_rate": 0.00010838921643638328, + "loss": 0.2806, + "step": 25293 + }, + { + "epoch": 2.049092676604018, + "grad_norm": 0.049504201859235764, + "learning_rate": 0.00010838471578378866, + "loss": 0.297, + "step": 25294 + }, + { + "epoch": 2.0491736876215167, + "grad_norm": 0.06888049840927124, + "learning_rate": 0.00010838021513119403, + "loss": 0.2524, + "step": 25295 + }, + { + "epoch": 2.049254698639015, + "grad_norm": 0.0596277117729187, + "learning_rate": 0.0001083757144785994, + "loss": 0.2837, + "step": 25296 + }, + { + "epoch": 2.0493357096565132, + "grad_norm": 0.05721274018287659, + "learning_rate": 0.00010837121382600476, + "loss": 0.2543, + "step": 25297 + }, + { + "epoch": 2.0494167206740115, + "grad_norm": 0.08060150593519211, + "learning_rate": 0.00010836671317341014, + "loss": 0.2573, + "step": 25298 + }, + { + "epoch": 2.04949773169151, + "grad_norm": 0.05760250240564346, + "learning_rate": 0.00010836221252081554, + "loss": 0.2385, + "step": 25299 + }, + { + "epoch": 2.0495787427090084, + "grad_norm": 0.07224278897047043, + "learning_rate": 0.0001083577118682209, + "loss": 0.2988, + "step": 25300 + }, + { + "epoch": 2.0496597537265067, + "grad_norm": 0.06838857382535934, + "learning_rate": 0.00010835321121562627, + "loss": 0.2279, + "step": 25301 + }, + { + "epoch": 2.0497407647440054, + "grad_norm": 0.07297328114509583, + "learning_rate": 0.00010834871056303165, + "loss": 0.3082, + "step": 25302 + }, + { + "epoch": 2.0498217757615036, + "grad_norm": 0.07161819189786911, + "learning_rate": 0.000108344209910437, + "loss": 0.2788, + "step": 25303 + }, + { + "epoch": 2.049902786779002, + "grad_norm": 0.057904768735170364, + "learning_rate": 0.0001083397092578424, + "loss": 0.2154, + "step": 25304 + }, + { + "epoch": 2.0499837977965005, + "grad_norm": 0.06379827857017517, + "learning_rate": 0.00010833520860524778, + "loss": 0.2477, + "step": 25305 + }, + { + "epoch": 2.0500648088139988, + "grad_norm": 0.05735756456851959, + "learning_rate": 0.00010833070795265314, + "loss": 0.2626, + "step": 25306 + }, + { + "epoch": 2.050145819831497, + "grad_norm": 0.06780066341161728, + "learning_rate": 0.00010832620730005851, + "loss": 0.2818, + "step": 25307 + }, + { + "epoch": 2.0502268308489953, + "grad_norm": 0.06374137103557587, + "learning_rate": 0.00010832170664746389, + "loss": 0.2399, + "step": 25308 + }, + { + "epoch": 2.050307841866494, + "grad_norm": 0.06231587007641792, + "learning_rate": 0.00010831720599486925, + "loss": 0.2412, + "step": 25309 + }, + { + "epoch": 2.050388852883992, + "grad_norm": 0.06339266896247864, + "learning_rate": 0.00010831270534227465, + "loss": 0.2633, + "step": 25310 + }, + { + "epoch": 2.0504698639014904, + "grad_norm": 0.059382013976573944, + "learning_rate": 0.00010830820468968002, + "loss": 0.249, + "step": 25311 + }, + { + "epoch": 2.050550874918989, + "grad_norm": 0.05026280879974365, + "learning_rate": 0.00010830370403708538, + "loss": 0.261, + "step": 25312 + }, + { + "epoch": 2.0506318859364874, + "grad_norm": 0.05445680394768715, + "learning_rate": 
0.00010829920338449076, + "loss": 0.2356, + "step": 25313 + }, + { + "epoch": 2.0507128969539856, + "grad_norm": 0.047062940895557404, + "learning_rate": 0.00010829470273189613, + "loss": 0.2241, + "step": 25314 + }, + { + "epoch": 2.0507939079714843, + "grad_norm": 0.06622743606567383, + "learning_rate": 0.00010829020207930149, + "loss": 0.2578, + "step": 25315 + }, + { + "epoch": 2.0508749189889826, + "grad_norm": 0.055635739117860794, + "learning_rate": 0.00010828570142670689, + "loss": 0.2606, + "step": 25316 + }, + { + "epoch": 2.050955930006481, + "grad_norm": 0.054714541882276535, + "learning_rate": 0.00010828120077411226, + "loss": 0.2839, + "step": 25317 + }, + { + "epoch": 2.051036941023979, + "grad_norm": 0.056158315390348434, + "learning_rate": 0.00010827670012151762, + "loss": 0.2433, + "step": 25318 + }, + { + "epoch": 2.0511179520414777, + "grad_norm": 0.04764556884765625, + "learning_rate": 0.000108272199468923, + "loss": 0.2449, + "step": 25319 + }, + { + "epoch": 2.051198963058976, + "grad_norm": 0.061281781643629074, + "learning_rate": 0.00010826769881632837, + "loss": 0.3081, + "step": 25320 + }, + { + "epoch": 2.051279974076474, + "grad_norm": 0.05917587876319885, + "learning_rate": 0.00010826319816373373, + "loss": 0.2426, + "step": 25321 + }, + { + "epoch": 2.051360985093973, + "grad_norm": 0.05930584669113159, + "learning_rate": 0.00010825869751113913, + "loss": 0.2238, + "step": 25322 + }, + { + "epoch": 2.051441996111471, + "grad_norm": 0.06175884231925011, + "learning_rate": 0.0001082541968585445, + "loss": 0.2367, + "step": 25323 + }, + { + "epoch": 2.0515230071289694, + "grad_norm": 0.06388114392757416, + "learning_rate": 0.00010824969620594986, + "loss": 0.2432, + "step": 25324 + }, + { + "epoch": 2.051604018146468, + "grad_norm": 0.05988280847668648, + "learning_rate": 0.00010824519555335524, + "loss": 0.2363, + "step": 25325 + }, + { + "epoch": 2.0516850291639663, + "grad_norm": 0.07179353386163712, + "learning_rate": 0.00010824069490076061, + "loss": 0.2732, + "step": 25326 + }, + { + "epoch": 2.0517660401814646, + "grad_norm": 0.07074914127588272, + "learning_rate": 0.00010823619424816597, + "loss": 0.212, + "step": 25327 + }, + { + "epoch": 2.0518470511989633, + "grad_norm": 0.07107607275247574, + "learning_rate": 0.00010823169359557137, + "loss": 0.2568, + "step": 25328 + }, + { + "epoch": 2.0519280622164615, + "grad_norm": 0.05105834826827049, + "learning_rate": 0.00010822719294297675, + "loss": 0.2546, + "step": 25329 + }, + { + "epoch": 2.0520090732339598, + "grad_norm": 0.06509406864643097, + "learning_rate": 0.0001082226922903821, + "loss": 0.288, + "step": 25330 + }, + { + "epoch": 2.052090084251458, + "grad_norm": 0.0698138028383255, + "learning_rate": 0.00010821819163778748, + "loss": 0.2601, + "step": 25331 + }, + { + "epoch": 2.0521710952689567, + "grad_norm": 0.06088358536362648, + "learning_rate": 0.00010821369098519285, + "loss": 0.2543, + "step": 25332 + }, + { + "epoch": 2.052252106286455, + "grad_norm": 0.06391238421201706, + "learning_rate": 0.00010820919033259824, + "loss": 0.2437, + "step": 25333 + }, + { + "epoch": 2.052333117303953, + "grad_norm": 0.05070818215608597, + "learning_rate": 0.00010820468968000361, + "loss": 0.2322, + "step": 25334 + }, + { + "epoch": 2.052414128321452, + "grad_norm": 0.06086651608347893, + "learning_rate": 0.00010820018902740899, + "loss": 0.2405, + "step": 25335 + }, + { + "epoch": 2.05249513933895, + "grad_norm": 0.06370527297258377, + "learning_rate": 0.00010819568837481435, + "loss": 0.2719, + 
"step": 25336 + }, + { + "epoch": 2.0525761503564484, + "grad_norm": 0.08593538403511047, + "learning_rate": 0.00010819118772221972, + "loss": 0.2623, + "step": 25337 + }, + { + "epoch": 2.052657161373947, + "grad_norm": 0.06916448473930359, + "learning_rate": 0.0001081866870696251, + "loss": 0.3053, + "step": 25338 + }, + { + "epoch": 2.0527381723914453, + "grad_norm": 0.06832996010780334, + "learning_rate": 0.00010818218641703048, + "loss": 0.2716, + "step": 25339 + }, + { + "epoch": 2.0528191834089435, + "grad_norm": 0.052606839686632156, + "learning_rate": 0.00010817768576443586, + "loss": 0.2287, + "step": 25340 + }, + { + "epoch": 2.052900194426442, + "grad_norm": 0.07673200219869614, + "learning_rate": 0.00010817318511184123, + "loss": 0.2445, + "step": 25341 + }, + { + "epoch": 2.0529812054439405, + "grad_norm": 0.0654953345656395, + "learning_rate": 0.00010816868445924659, + "loss": 0.2664, + "step": 25342 + }, + { + "epoch": 2.0530622164614387, + "grad_norm": 0.06384981423616409, + "learning_rate": 0.00010816418380665196, + "loss": 0.2609, + "step": 25343 + }, + { + "epoch": 2.053143227478937, + "grad_norm": 0.0635632798075676, + "learning_rate": 0.00010815968315405734, + "loss": 0.2574, + "step": 25344 + }, + { + "epoch": 2.0532242384964356, + "grad_norm": 0.06317467987537384, + "learning_rate": 0.00010815518250146272, + "loss": 0.2562, + "step": 25345 + }, + { + "epoch": 2.053305249513934, + "grad_norm": 0.061597634106874466, + "learning_rate": 0.0001081506818488681, + "loss": 0.2435, + "step": 25346 + }, + { + "epoch": 2.053386260531432, + "grad_norm": 0.06339189410209656, + "learning_rate": 0.00010814618119627347, + "loss": 0.2619, + "step": 25347 + }, + { + "epoch": 2.053467271548931, + "grad_norm": 0.06991313397884369, + "learning_rate": 0.00010814168054367883, + "loss": 0.2435, + "step": 25348 + }, + { + "epoch": 2.053548282566429, + "grad_norm": 0.06686621904373169, + "learning_rate": 0.0001081371798910842, + "loss": 0.274, + "step": 25349 + }, + { + "epoch": 2.0536292935839273, + "grad_norm": 0.07232963293790817, + "learning_rate": 0.00010813267923848958, + "loss": 0.2884, + "step": 25350 + }, + { + "epoch": 2.053710304601426, + "grad_norm": 0.05980679392814636, + "learning_rate": 0.00010812817858589497, + "loss": 0.2728, + "step": 25351 + }, + { + "epoch": 2.0537913156189243, + "grad_norm": 0.0768938660621643, + "learning_rate": 0.00010812367793330034, + "loss": 0.3206, + "step": 25352 + }, + { + "epoch": 2.0538723266364225, + "grad_norm": 0.051484767347574234, + "learning_rate": 0.00010811917728070571, + "loss": 0.2502, + "step": 25353 + }, + { + "epoch": 2.0539533376539207, + "grad_norm": 0.04832427576184273, + "learning_rate": 0.00010811467662811107, + "loss": 0.2379, + "step": 25354 + }, + { + "epoch": 2.0540343486714194, + "grad_norm": 0.0567401684820652, + "learning_rate": 0.00010811017597551645, + "loss": 0.282, + "step": 25355 + }, + { + "epoch": 2.0541153596889177, + "grad_norm": 0.07354261726140976, + "learning_rate": 0.00010810567532292182, + "loss": 0.2407, + "step": 25356 + }, + { + "epoch": 2.054196370706416, + "grad_norm": 0.06890858709812164, + "learning_rate": 0.00010810117467032721, + "loss": 0.2768, + "step": 25357 + }, + { + "epoch": 2.0542773817239146, + "grad_norm": 0.06346815824508667, + "learning_rate": 0.00010809667401773258, + "loss": 0.2608, + "step": 25358 + }, + { + "epoch": 2.054358392741413, + "grad_norm": 0.05928133428096771, + "learning_rate": 0.00010809217336513795, + "loss": 0.2616, + "step": 25359 + }, + { + "epoch": 
2.054439403758911, + "grad_norm": 0.05636308342218399, + "learning_rate": 0.00010808767271254331, + "loss": 0.2779, + "step": 25360 + }, + { + "epoch": 2.05452041477641, + "grad_norm": 0.0737721249461174, + "learning_rate": 0.00010808317205994869, + "loss": 0.317, + "step": 25361 + }, + { + "epoch": 2.054601425793908, + "grad_norm": 0.06959801912307739, + "learning_rate": 0.00010807867140735408, + "loss": 0.2139, + "step": 25362 + }, + { + "epoch": 2.0546824368114063, + "grad_norm": 0.06847614049911499, + "learning_rate": 0.00010807417075475945, + "loss": 0.2532, + "step": 25363 + }, + { + "epoch": 2.0547634478289045, + "grad_norm": 0.055373258888721466, + "learning_rate": 0.00010806967010216482, + "loss": 0.2381, + "step": 25364 + }, + { + "epoch": 2.054844458846403, + "grad_norm": 0.05136518180370331, + "learning_rate": 0.0001080651694495702, + "loss": 0.2581, + "step": 25365 + }, + { + "epoch": 2.0549254698639015, + "grad_norm": 0.0626882016658783, + "learning_rate": 0.00010806066879697556, + "loss": 0.2773, + "step": 25366 + }, + { + "epoch": 2.0550064808813997, + "grad_norm": 0.051707156002521515, + "learning_rate": 0.00010805616814438093, + "loss": 0.2434, + "step": 25367 + }, + { + "epoch": 2.0550874918988984, + "grad_norm": 0.06586476415395737, + "learning_rate": 0.00010805166749178633, + "loss": 0.2737, + "step": 25368 + }, + { + "epoch": 2.0551685029163966, + "grad_norm": 0.06009049341082573, + "learning_rate": 0.00010804716683919169, + "loss": 0.2667, + "step": 25369 + }, + { + "epoch": 2.055249513933895, + "grad_norm": 0.07964500039815903, + "learning_rate": 0.00010804266618659706, + "loss": 0.2736, + "step": 25370 + }, + { + "epoch": 2.0553305249513936, + "grad_norm": 0.06829434633255005, + "learning_rate": 0.00010803816553400244, + "loss": 0.2558, + "step": 25371 + }, + { + "epoch": 2.055411535968892, + "grad_norm": 0.0760689228773117, + "learning_rate": 0.0001080336648814078, + "loss": 0.2657, + "step": 25372 + }, + { + "epoch": 2.05549254698639, + "grad_norm": 0.07334722578525543, + "learning_rate": 0.00010802916422881317, + "loss": 0.28, + "step": 25373 + }, + { + "epoch": 2.0555735580038887, + "grad_norm": 0.05663280189037323, + "learning_rate": 0.00010802466357621857, + "loss": 0.2564, + "step": 25374 + }, + { + "epoch": 2.055654569021387, + "grad_norm": 0.07035217434167862, + "learning_rate": 0.00010802016292362393, + "loss": 0.2701, + "step": 25375 + }, + { + "epoch": 2.0557355800388852, + "grad_norm": 0.07113281637430191, + "learning_rate": 0.0001080156622710293, + "loss": 0.2341, + "step": 25376 + }, + { + "epoch": 2.0558165910563835, + "grad_norm": 0.06113379821181297, + "learning_rate": 0.00010801116161843468, + "loss": 0.2442, + "step": 25377 + }, + { + "epoch": 2.055897602073882, + "grad_norm": 0.06307367980480194, + "learning_rate": 0.00010800666096584004, + "loss": 0.293, + "step": 25378 + }, + { + "epoch": 2.0559786130913804, + "grad_norm": 0.05947719141840935, + "learning_rate": 0.00010800216031324541, + "loss": 0.2455, + "step": 25379 + }, + { + "epoch": 2.0560596241088787, + "grad_norm": 0.06589431315660477, + "learning_rate": 0.00010799765966065081, + "loss": 0.275, + "step": 25380 + }, + { + "epoch": 2.0561406351263773, + "grad_norm": 0.061434660106897354, + "learning_rate": 0.00010799315900805617, + "loss": 0.2388, + "step": 25381 + }, + { + "epoch": 2.0562216461438756, + "grad_norm": 0.06392492353916168, + "learning_rate": 0.00010798865835546155, + "loss": 0.2626, + "step": 25382 + }, + { + "epoch": 2.056302657161374, + "grad_norm": 
0.062192004173994064, + "learning_rate": 0.00010798415770286692, + "loss": 0.2075, + "step": 25383 + }, + { + "epoch": 2.0563836681788725, + "grad_norm": 0.0811699777841568, + "learning_rate": 0.00010797965705027228, + "loss": 0.3011, + "step": 25384 + }, + { + "epoch": 2.0564646791963708, + "grad_norm": 0.05931561812758446, + "learning_rate": 0.00010797515639767768, + "loss": 0.2761, + "step": 25385 + }, + { + "epoch": 2.056545690213869, + "grad_norm": 0.06587688624858856, + "learning_rate": 0.00010797065574508306, + "loss": 0.2879, + "step": 25386 + }, + { + "epoch": 2.0566267012313673, + "grad_norm": 0.05588224530220032, + "learning_rate": 0.00010796615509248842, + "loss": 0.2513, + "step": 25387 + }, + { + "epoch": 2.056707712248866, + "grad_norm": 0.06199725717306137, + "learning_rate": 0.00010796165443989379, + "loss": 0.2752, + "step": 25388 + }, + { + "epoch": 2.056788723266364, + "grad_norm": 0.04725608974695206, + "learning_rate": 0.00010795715378729916, + "loss": 0.2246, + "step": 25389 + }, + { + "epoch": 2.0568697342838624, + "grad_norm": 0.06411755084991455, + "learning_rate": 0.00010795265313470452, + "loss": 0.2443, + "step": 25390 + }, + { + "epoch": 2.056950745301361, + "grad_norm": 0.06652890145778656, + "learning_rate": 0.00010794815248210992, + "loss": 0.2282, + "step": 25391 + }, + { + "epoch": 2.0570317563188594, + "grad_norm": 0.05945013090968132, + "learning_rate": 0.0001079436518295153, + "loss": 0.3013, + "step": 25392 + }, + { + "epoch": 2.0571127673363576, + "grad_norm": 0.07318027317523956, + "learning_rate": 0.00010793915117692066, + "loss": 0.2679, + "step": 25393 + }, + { + "epoch": 2.0571937783538563, + "grad_norm": 0.05756182223558426, + "learning_rate": 0.00010793465052432603, + "loss": 0.2471, + "step": 25394 + }, + { + "epoch": 2.0572747893713546, + "grad_norm": 0.0655335783958435, + "learning_rate": 0.0001079301498717314, + "loss": 0.2832, + "step": 25395 + }, + { + "epoch": 2.057355800388853, + "grad_norm": 0.07269290834665298, + "learning_rate": 0.00010792564921913676, + "loss": 0.2433, + "step": 25396 + }, + { + "epoch": 2.057436811406351, + "grad_norm": 0.0656035766005516, + "learning_rate": 0.00010792114856654216, + "loss": 0.2351, + "step": 25397 + }, + { + "epoch": 2.0575178224238497, + "grad_norm": 0.06749686598777771, + "learning_rate": 0.00010791664791394754, + "loss": 0.2705, + "step": 25398 + }, + { + "epoch": 2.057598833441348, + "grad_norm": 0.06477170437574387, + "learning_rate": 0.0001079121472613529, + "loss": 0.2732, + "step": 25399 + }, + { + "epoch": 2.057679844458846, + "grad_norm": 0.05468861386179924, + "learning_rate": 0.00010790764660875827, + "loss": 0.2596, + "step": 25400 + }, + { + "epoch": 2.057760855476345, + "grad_norm": 0.059450436383485794, + "learning_rate": 0.00010790314595616365, + "loss": 0.2385, + "step": 25401 + }, + { + "epoch": 2.057841866493843, + "grad_norm": 0.05819116160273552, + "learning_rate": 0.00010789864530356902, + "loss": 0.2732, + "step": 25402 + }, + { + "epoch": 2.0579228775113414, + "grad_norm": 0.06588525325059891, + "learning_rate": 0.0001078941446509744, + "loss": 0.2763, + "step": 25403 + }, + { + "epoch": 2.05800388852884, + "grad_norm": 0.059603944420814514, + "learning_rate": 0.00010788964399837978, + "loss": 0.2468, + "step": 25404 + }, + { + "epoch": 2.0580848995463383, + "grad_norm": 0.0704275518655777, + "learning_rate": 0.00010788514334578514, + "loss": 0.2963, + "step": 25405 + }, + { + "epoch": 2.0581659105638366, + "grad_norm": 0.053840700536966324, + "learning_rate": 
0.00010788064269319051, + "loss": 0.2356, + "step": 25406 + }, + { + "epoch": 2.0582469215813353, + "grad_norm": 0.05670013278722763, + "learning_rate": 0.00010787614204059589, + "loss": 0.2554, + "step": 25407 + }, + { + "epoch": 2.0583279325988335, + "grad_norm": 0.05271613597869873, + "learning_rate": 0.00010787164138800126, + "loss": 0.2384, + "step": 25408 + }, + { + "epoch": 2.0584089436163318, + "grad_norm": 0.05470237880945206, + "learning_rate": 0.00010786714073540665, + "loss": 0.2539, + "step": 25409 + }, + { + "epoch": 2.05848995463383, + "grad_norm": 0.06058771535754204, + "learning_rate": 0.00010786264008281202, + "loss": 0.2643, + "step": 25410 + }, + { + "epoch": 2.0585709656513287, + "grad_norm": 0.061354998499155045, + "learning_rate": 0.00010785813943021738, + "loss": 0.267, + "step": 25411 + }, + { + "epoch": 2.058651976668827, + "grad_norm": 0.07212673872709274, + "learning_rate": 0.00010785363877762276, + "loss": 0.2658, + "step": 25412 + }, + { + "epoch": 2.058732987686325, + "grad_norm": 0.05706482008099556, + "learning_rate": 0.00010784913812502813, + "loss": 0.2526, + "step": 25413 + }, + { + "epoch": 2.058813998703824, + "grad_norm": 0.06004820019006729, + "learning_rate": 0.00010784463747243352, + "loss": 0.2527, + "step": 25414 + }, + { + "epoch": 2.058895009721322, + "grad_norm": 0.06409589946269989, + "learning_rate": 0.00010784013681983889, + "loss": 0.2244, + "step": 25415 + }, + { + "epoch": 2.0589760207388204, + "grad_norm": 0.05893712118268013, + "learning_rate": 0.00010783563616724426, + "loss": 0.2805, + "step": 25416 + }, + { + "epoch": 2.059057031756319, + "grad_norm": 0.06911461800336838, + "learning_rate": 0.00010783113551464962, + "loss": 0.2239, + "step": 25417 + }, + { + "epoch": 2.0591380427738173, + "grad_norm": 0.07301256060600281, + "learning_rate": 0.000107826634862055, + "loss": 0.2678, + "step": 25418 + }, + { + "epoch": 2.0592190537913155, + "grad_norm": 0.06696818768978119, + "learning_rate": 0.00010782213420946037, + "loss": 0.2502, + "step": 25419 + }, + { + "epoch": 2.059300064808814, + "grad_norm": 0.07134930789470673, + "learning_rate": 0.00010781763355686576, + "loss": 0.2564, + "step": 25420 + }, + { + "epoch": 2.0593810758263125, + "grad_norm": 0.061927393078804016, + "learning_rate": 0.00010781313290427113, + "loss": 0.234, + "step": 25421 + }, + { + "epoch": 2.0594620868438107, + "grad_norm": 0.05794256180524826, + "learning_rate": 0.0001078086322516765, + "loss": 0.2276, + "step": 25422 + }, + { + "epoch": 2.059543097861309, + "grad_norm": 0.061672911047935486, + "learning_rate": 0.00010780413159908186, + "loss": 0.2486, + "step": 25423 + }, + { + "epoch": 2.0596241088788076, + "grad_norm": 0.06663712114095688, + "learning_rate": 0.00010779963094648724, + "loss": 0.2562, + "step": 25424 + }, + { + "epoch": 2.059705119896306, + "grad_norm": 0.06700772792100906, + "learning_rate": 0.00010779513029389261, + "loss": 0.2657, + "step": 25425 + }, + { + "epoch": 2.059786130913804, + "grad_norm": 0.051759883761405945, + "learning_rate": 0.000107790629641298, + "loss": 0.2297, + "step": 25426 + }, + { + "epoch": 2.059867141931303, + "grad_norm": 0.06241053715348244, + "learning_rate": 0.00010778612898870337, + "loss": 0.2642, + "step": 25427 + }, + { + "epoch": 2.059948152948801, + "grad_norm": 0.05102437734603882, + "learning_rate": 0.00010778162833610875, + "loss": 0.2665, + "step": 25428 + }, + { + "epoch": 2.0600291639662993, + "grad_norm": 0.08531544357538223, + "learning_rate": 0.0001077771276835141, + "loss": 0.2539, + "step": 
25429 + }, + { + "epoch": 2.060110174983798, + "grad_norm": 0.06146989390254021, + "learning_rate": 0.00010777262703091948, + "loss": 0.2673, + "step": 25430 + }, + { + "epoch": 2.0601911860012962, + "grad_norm": 0.07017390429973602, + "learning_rate": 0.00010776812637832485, + "loss": 0.271, + "step": 25431 + }, + { + "epoch": 2.0602721970187945, + "grad_norm": 0.056769225746393204, + "learning_rate": 0.00010776362572573024, + "loss": 0.2696, + "step": 25432 + }, + { + "epoch": 2.0603532080362927, + "grad_norm": 0.061329200863838196, + "learning_rate": 0.00010775912507313561, + "loss": 0.3056, + "step": 25433 + }, + { + "epoch": 2.0604342190537914, + "grad_norm": 0.05566471815109253, + "learning_rate": 0.00010775462442054099, + "loss": 0.236, + "step": 25434 + }, + { + "epoch": 2.0605152300712897, + "grad_norm": 0.04820633307099342, + "learning_rate": 0.00010775012376794635, + "loss": 0.2328, + "step": 25435 + }, + { + "epoch": 2.060596241088788, + "grad_norm": 0.06556723266839981, + "learning_rate": 0.00010774562311535172, + "loss": 0.26, + "step": 25436 + }, + { + "epoch": 2.0606772521062866, + "grad_norm": 0.05982845649123192, + "learning_rate": 0.00010774112246275712, + "loss": 0.2371, + "step": 25437 + }, + { + "epoch": 2.060758263123785, + "grad_norm": 0.0625552088022232, + "learning_rate": 0.00010773662181016248, + "loss": 0.2744, + "step": 25438 + }, + { + "epoch": 2.060839274141283, + "grad_norm": 0.07399687170982361, + "learning_rate": 0.00010773212115756786, + "loss": 0.251, + "step": 25439 + }, + { + "epoch": 2.060920285158782, + "grad_norm": 0.057092901319265366, + "learning_rate": 0.00010772762050497323, + "loss": 0.2365, + "step": 25440 + }, + { + "epoch": 2.06100129617628, + "grad_norm": 0.05995013564825058, + "learning_rate": 0.00010772311985237859, + "loss": 0.2596, + "step": 25441 + }, + { + "epoch": 2.0610823071937783, + "grad_norm": 0.08690828084945679, + "learning_rate": 0.00010771861919978396, + "loss": 0.286, + "step": 25442 + }, + { + "epoch": 2.0611633182112765, + "grad_norm": 0.07108601182699203, + "learning_rate": 0.00010771411854718936, + "loss": 0.2676, + "step": 25443 + }, + { + "epoch": 2.061244329228775, + "grad_norm": 0.07897786796092987, + "learning_rate": 0.00010770961789459472, + "loss": 0.2668, + "step": 25444 + }, + { + "epoch": 2.0613253402462735, + "grad_norm": 0.05581003800034523, + "learning_rate": 0.0001077051172420001, + "loss": 0.2658, + "step": 25445 + }, + { + "epoch": 2.0614063512637717, + "grad_norm": 0.05923276022076607, + "learning_rate": 0.00010770061658940547, + "loss": 0.2794, + "step": 25446 + }, + { + "epoch": 2.0614873622812704, + "grad_norm": 0.057664401829242706, + "learning_rate": 0.00010769611593681083, + "loss": 0.2442, + "step": 25447 + }, + { + "epoch": 2.0615683732987686, + "grad_norm": 0.060754094272851944, + "learning_rate": 0.0001076916152842162, + "loss": 0.2401, + "step": 25448 + }, + { + "epoch": 2.061649384316267, + "grad_norm": 0.05699896439909935, + "learning_rate": 0.0001076871146316216, + "loss": 0.2309, + "step": 25449 + }, + { + "epoch": 2.0617303953337656, + "grad_norm": 0.062410902231931686, + "learning_rate": 0.00010768261397902697, + "loss": 0.2446, + "step": 25450 + }, + { + "epoch": 2.061811406351264, + "grad_norm": 0.08376600593328476, + "learning_rate": 0.00010767811332643234, + "loss": 0.2463, + "step": 25451 + }, + { + "epoch": 2.061892417368762, + "grad_norm": 0.05989118665456772, + "learning_rate": 0.00010767361267383771, + "loss": 0.2559, + "step": 25452 + }, + { + "epoch": 2.0619734283862607, + 
"grad_norm": 0.05913263559341431, + "learning_rate": 0.00010766911202124307, + "loss": 0.2597, + "step": 25453 + }, + { + "epoch": 2.062054439403759, + "grad_norm": 0.0431220605969429, + "learning_rate": 0.00010766461136864845, + "loss": 0.1989, + "step": 25454 + }, + { + "epoch": 2.0621354504212572, + "grad_norm": 0.06682056933641434, + "learning_rate": 0.00010766011071605385, + "loss": 0.2754, + "step": 25455 + }, + { + "epoch": 2.0622164614387555, + "grad_norm": 0.07611300051212311, + "learning_rate": 0.00010765561006345921, + "loss": 0.2821, + "step": 25456 + }, + { + "epoch": 2.062297472456254, + "grad_norm": 0.06988724321126938, + "learning_rate": 0.00010765110941086458, + "loss": 0.2711, + "step": 25457 + }, + { + "epoch": 2.0623784834737524, + "grad_norm": 0.061311472207307816, + "learning_rate": 0.00010764660875826995, + "loss": 0.238, + "step": 25458 + }, + { + "epoch": 2.0624594944912507, + "grad_norm": 0.05116477981209755, + "learning_rate": 0.00010764210810567531, + "loss": 0.247, + "step": 25459 + }, + { + "epoch": 2.0625405055087493, + "grad_norm": 0.06642013043165207, + "learning_rate": 0.00010763760745308069, + "loss": 0.2614, + "step": 25460 + }, + { + "epoch": 2.0626215165262476, + "grad_norm": 0.05731287598609924, + "learning_rate": 0.00010763310680048609, + "loss": 0.2508, + "step": 25461 + }, + { + "epoch": 2.062702527543746, + "grad_norm": 0.06660475581884384, + "learning_rate": 0.00010762860614789145, + "loss": 0.249, + "step": 25462 + }, + { + "epoch": 2.0627835385612445, + "grad_norm": 0.07254832983016968, + "learning_rate": 0.00010762410549529682, + "loss": 0.3014, + "step": 25463 + }, + { + "epoch": 2.0628645495787428, + "grad_norm": 0.06310251355171204, + "learning_rate": 0.0001076196048427022, + "loss": 0.2488, + "step": 25464 + }, + { + "epoch": 2.062945560596241, + "grad_norm": 0.06924467533826828, + "learning_rate": 0.00010761510419010756, + "loss": 0.2594, + "step": 25465 + }, + { + "epoch": 2.0630265716137393, + "grad_norm": 0.06101961433887482, + "learning_rate": 0.00010761060353751296, + "loss": 0.2605, + "step": 25466 + }, + { + "epoch": 2.063107582631238, + "grad_norm": 0.06226739287376404, + "learning_rate": 0.00010760610288491833, + "loss": 0.2286, + "step": 25467 + }, + { + "epoch": 2.063188593648736, + "grad_norm": 0.05867013707756996, + "learning_rate": 0.00010760160223232369, + "loss": 0.2512, + "step": 25468 + }, + { + "epoch": 2.0632696046662344, + "grad_norm": 0.05932064726948738, + "learning_rate": 0.00010759710157972906, + "loss": 0.2728, + "step": 25469 + }, + { + "epoch": 2.063350615683733, + "grad_norm": 0.07166090607643127, + "learning_rate": 0.00010759260092713444, + "loss": 0.2621, + "step": 25470 + }, + { + "epoch": 2.0634316267012314, + "grad_norm": 0.05555148422718048, + "learning_rate": 0.00010758810027453981, + "loss": 0.2402, + "step": 25471 + }, + { + "epoch": 2.0635126377187296, + "grad_norm": 0.05288715660572052, + "learning_rate": 0.0001075835996219452, + "loss": 0.2352, + "step": 25472 + }, + { + "epoch": 2.0635936487362283, + "grad_norm": 0.062381304800510406, + "learning_rate": 0.00010757909896935057, + "loss": 0.2925, + "step": 25473 + }, + { + "epoch": 2.0636746597537265, + "grad_norm": 0.06400690227746964, + "learning_rate": 0.00010757459831675593, + "loss": 0.2709, + "step": 25474 + }, + { + "epoch": 2.063755670771225, + "grad_norm": 0.06255996227264404, + "learning_rate": 0.0001075700976641613, + "loss": 0.2678, + "step": 25475 + }, + { + "epoch": 2.0638366817887235, + "grad_norm": 0.059242524206638336, + 
"learning_rate": 0.00010756559701156668, + "loss": 0.2982, + "step": 25476 + }, + { + "epoch": 2.0639176928062217, + "grad_norm": 0.06539078801870346, + "learning_rate": 0.00010756109635897205, + "loss": 0.2829, + "step": 25477 + }, + { + "epoch": 2.06399870382372, + "grad_norm": 0.05289076268672943, + "learning_rate": 0.00010755659570637744, + "loss": 0.2096, + "step": 25478 + }, + { + "epoch": 2.064079714841218, + "grad_norm": 0.07497277110815048, + "learning_rate": 0.00010755209505378281, + "loss": 0.263, + "step": 25479 + }, + { + "epoch": 2.064160725858717, + "grad_norm": 0.0586920790374279, + "learning_rate": 0.00010754759440118817, + "loss": 0.2514, + "step": 25480 + }, + { + "epoch": 2.064241736876215, + "grad_norm": 0.07065203785896301, + "learning_rate": 0.00010754309374859355, + "loss": 0.2484, + "step": 25481 + }, + { + "epoch": 2.0643227478937134, + "grad_norm": 0.0774066299200058, + "learning_rate": 0.00010753859309599892, + "loss": 0.2964, + "step": 25482 + }, + { + "epoch": 2.064403758911212, + "grad_norm": 0.06121392548084259, + "learning_rate": 0.0001075340924434043, + "loss": 0.2298, + "step": 25483 + }, + { + "epoch": 2.0644847699287103, + "grad_norm": 0.08433547616004944, + "learning_rate": 0.00010752959179080968, + "loss": 0.3256, + "step": 25484 + }, + { + "epoch": 2.0645657809462086, + "grad_norm": 0.06554676592350006, + "learning_rate": 0.00010752509113821506, + "loss": 0.2843, + "step": 25485 + }, + { + "epoch": 2.0646467919637073, + "grad_norm": 0.0544719398021698, + "learning_rate": 0.00010752059048562042, + "loss": 0.2767, + "step": 25486 + }, + { + "epoch": 2.0647278029812055, + "grad_norm": 0.06069955229759216, + "learning_rate": 0.00010751608983302579, + "loss": 0.2681, + "step": 25487 + }, + { + "epoch": 2.0648088139987038, + "grad_norm": 0.0706348568201065, + "learning_rate": 0.00010751158918043116, + "loss": 0.2734, + "step": 25488 + }, + { + "epoch": 2.064889825016202, + "grad_norm": 0.0692746564745903, + "learning_rate": 0.00010750708852783655, + "loss": 0.2547, + "step": 25489 + }, + { + "epoch": 2.0649708360337007, + "grad_norm": 0.06689253449440002, + "learning_rate": 0.00010750258787524192, + "loss": 0.2882, + "step": 25490 + }, + { + "epoch": 2.065051847051199, + "grad_norm": 0.0553397610783577, + "learning_rate": 0.0001074980872226473, + "loss": 0.2507, + "step": 25491 + }, + { + "epoch": 2.065132858068697, + "grad_norm": 0.07342147827148438, + "learning_rate": 0.00010749358657005266, + "loss": 0.2894, + "step": 25492 + }, + { + "epoch": 2.065213869086196, + "grad_norm": 0.06011483445763588, + "learning_rate": 0.00010748908591745803, + "loss": 0.2383, + "step": 25493 + }, + { + "epoch": 2.065294880103694, + "grad_norm": 0.06471290439367294, + "learning_rate": 0.0001074845852648634, + "loss": 0.2363, + "step": 25494 + }, + { + "epoch": 2.0653758911211924, + "grad_norm": 0.058766886591911316, + "learning_rate": 0.00010748008461226879, + "loss": 0.2708, + "step": 25495 + }, + { + "epoch": 2.065456902138691, + "grad_norm": 0.04799140617251396, + "learning_rate": 0.00010747558395967417, + "loss": 0.2145, + "step": 25496 + }, + { + "epoch": 2.0655379131561893, + "grad_norm": 0.05512750893831253, + "learning_rate": 0.00010747108330707954, + "loss": 0.2369, + "step": 25497 + }, + { + "epoch": 2.0656189241736875, + "grad_norm": 0.0533377043902874, + "learning_rate": 0.0001074665826544849, + "loss": 0.2161, + "step": 25498 + }, + { + "epoch": 2.065699935191186, + "grad_norm": 0.06390156596899033, + "learning_rate": 0.00010746208200189027, + "loss": 0.2746, 
+ "step": 25499 + }, + { + "epoch": 2.0657809462086845, + "grad_norm": 0.052978046238422394, + "learning_rate": 0.00010745758134929565, + "loss": 0.2455, + "step": 25500 + }, + { + "epoch": 2.0658619572261827, + "grad_norm": 0.059611186385154724, + "learning_rate": 0.00010745308069670103, + "loss": 0.2146, + "step": 25501 + }, + { + "epoch": 2.065942968243681, + "grad_norm": 0.07775352895259857, + "learning_rate": 0.0001074485800441064, + "loss": 0.2481, + "step": 25502 + }, + { + "epoch": 2.0660239792611796, + "grad_norm": 0.05068066716194153, + "learning_rate": 0.00010744407939151178, + "loss": 0.2753, + "step": 25503 + }, + { + "epoch": 2.066104990278678, + "grad_norm": 0.0600021667778492, + "learning_rate": 0.00010743957873891714, + "loss": 0.2884, + "step": 25504 + }, + { + "epoch": 2.066186001296176, + "grad_norm": 0.05596313253045082, + "learning_rate": 0.00010743507808632251, + "loss": 0.2536, + "step": 25505 + }, + { + "epoch": 2.066267012313675, + "grad_norm": 0.05262834578752518, + "learning_rate": 0.00010743057743372789, + "loss": 0.2505, + "step": 25506 + }, + { + "epoch": 2.066348023331173, + "grad_norm": 0.059846874326467514, + "learning_rate": 0.00010742607678113327, + "loss": 0.2275, + "step": 25507 + }, + { + "epoch": 2.0664290343486713, + "grad_norm": 0.05899396911263466, + "learning_rate": 0.00010742157612853865, + "loss": 0.2734, + "step": 25508 + }, + { + "epoch": 2.06651004536617, + "grad_norm": 0.05113719776272774, + "learning_rate": 0.00010741707547594402, + "loss": 0.2496, + "step": 25509 + }, + { + "epoch": 2.0665910563836682, + "grad_norm": 0.05043940991163254, + "learning_rate": 0.00010741257482334938, + "loss": 0.2103, + "step": 25510 + }, + { + "epoch": 2.0666720674011665, + "grad_norm": 0.06341768801212311, + "learning_rate": 0.00010740807417075476, + "loss": 0.2821, + "step": 25511 + }, + { + "epoch": 2.0667530784186647, + "grad_norm": 0.056847795844078064, + "learning_rate": 0.00010740357351816013, + "loss": 0.229, + "step": 25512 + }, + { + "epoch": 2.0668340894361634, + "grad_norm": 0.07382318377494812, + "learning_rate": 0.00010739907286556552, + "loss": 0.2489, + "step": 25513 + }, + { + "epoch": 2.0669151004536617, + "grad_norm": 0.07611710578203201, + "learning_rate": 0.00010739457221297089, + "loss": 0.2719, + "step": 25514 + }, + { + "epoch": 2.06699611147116, + "grad_norm": 0.060138873755931854, + "learning_rate": 0.00010739007156037626, + "loss": 0.2736, + "step": 25515 + }, + { + "epoch": 2.0670771224886586, + "grad_norm": 0.06036220118403435, + "learning_rate": 0.00010738557090778162, + "loss": 0.2438, + "step": 25516 + }, + { + "epoch": 2.067158133506157, + "grad_norm": 0.06158443167805672, + "learning_rate": 0.000107381070255187, + "loss": 0.2693, + "step": 25517 + }, + { + "epoch": 2.067239144523655, + "grad_norm": 0.07605554163455963, + "learning_rate": 0.0001073765696025924, + "loss": 0.2527, + "step": 25518 + }, + { + "epoch": 2.067320155541154, + "grad_norm": 0.07047504931688309, + "learning_rate": 0.00010737206894999776, + "loss": 0.2989, + "step": 25519 + }, + { + "epoch": 2.067401166558652, + "grad_norm": 0.06754288822412491, + "learning_rate": 0.00010736756829740313, + "loss": 0.2966, + "step": 25520 + }, + { + "epoch": 2.0674821775761503, + "grad_norm": 0.05999528989195824, + "learning_rate": 0.0001073630676448085, + "loss": 0.248, + "step": 25521 + }, + { + "epoch": 2.067563188593649, + "grad_norm": 0.059076789766550064, + "learning_rate": 0.00010735856699221387, + "loss": 0.2683, + "step": 25522 + }, + { + "epoch": 
2.067644199611147, + "grad_norm": 0.058501314371824265, + "learning_rate": 0.00010735406633961924, + "loss": 0.2538, + "step": 25523 + }, + { + "epoch": 2.0677252106286454, + "grad_norm": 0.06582152098417282, + "learning_rate": 0.00010734956568702464, + "loss": 0.2636, + "step": 25524 + }, + { + "epoch": 2.0678062216461437, + "grad_norm": 0.05614101141691208, + "learning_rate": 0.00010734506503443, + "loss": 0.2706, + "step": 25525 + }, + { + "epoch": 2.0678872326636424, + "grad_norm": 0.05233749374747276, + "learning_rate": 0.00010734056438183537, + "loss": 0.2506, + "step": 25526 + }, + { + "epoch": 2.0679682436811406, + "grad_norm": 0.058415673673152924, + "learning_rate": 0.00010733606372924075, + "loss": 0.2374, + "step": 25527 + }, + { + "epoch": 2.068049254698639, + "grad_norm": 0.05963871255517006, + "learning_rate": 0.00010733156307664611, + "loss": 0.2552, + "step": 25528 + }, + { + "epoch": 2.0681302657161376, + "grad_norm": 0.05692227929830551, + "learning_rate": 0.00010732706242405148, + "loss": 0.2423, + "step": 25529 + }, + { + "epoch": 2.068211276733636, + "grad_norm": 0.06456737965345383, + "learning_rate": 0.00010732256177145688, + "loss": 0.273, + "step": 25530 + }, + { + "epoch": 2.068292287751134, + "grad_norm": 0.0603487491607666, + "learning_rate": 0.00010731806111886224, + "loss": 0.2559, + "step": 25531 + }, + { + "epoch": 2.0683732987686327, + "grad_norm": 0.07074466347694397, + "learning_rate": 0.00010731356046626761, + "loss": 0.2903, + "step": 25532 + }, + { + "epoch": 2.068454309786131, + "grad_norm": 0.07579421997070312, + "learning_rate": 0.00010730905981367299, + "loss": 0.2499, + "step": 25533 + }, + { + "epoch": 2.0685353208036292, + "grad_norm": 0.06210765987634659, + "learning_rate": 0.00010730455916107835, + "loss": 0.2876, + "step": 25534 + }, + { + "epoch": 2.0686163318211275, + "grad_norm": 0.050779640674591064, + "learning_rate": 0.00010730005850848372, + "loss": 0.2516, + "step": 25535 + }, + { + "epoch": 2.068697342838626, + "grad_norm": 0.05742275342345238, + "learning_rate": 0.00010729555785588912, + "loss": 0.2579, + "step": 25536 + }, + { + "epoch": 2.0687783538561244, + "grad_norm": 0.06202203035354614, + "learning_rate": 0.00010729105720329448, + "loss": 0.256, + "step": 25537 + }, + { + "epoch": 2.0688593648736227, + "grad_norm": 0.07720314711332321, + "learning_rate": 0.00010728655655069986, + "loss": 0.2485, + "step": 25538 + }, + { + "epoch": 2.0689403758911213, + "grad_norm": 0.06743809580802917, + "learning_rate": 0.00010728205589810523, + "loss": 0.2238, + "step": 25539 + }, + { + "epoch": 2.0690213869086196, + "grad_norm": 0.06037648394703865, + "learning_rate": 0.0001072775552455106, + "loss": 0.2561, + "step": 25540 + }, + { + "epoch": 2.069102397926118, + "grad_norm": 0.0584062859416008, + "learning_rate": 0.00010727305459291596, + "loss": 0.2945, + "step": 25541 + }, + { + "epoch": 2.0691834089436165, + "grad_norm": 0.06143295392394066, + "learning_rate": 0.00010726855394032136, + "loss": 0.2384, + "step": 25542 + }, + { + "epoch": 2.0692644199611148, + "grad_norm": 0.06712021678686142, + "learning_rate": 0.00010726405328772672, + "loss": 0.2639, + "step": 25543 + }, + { + "epoch": 2.069345430978613, + "grad_norm": 0.05841357633471489, + "learning_rate": 0.0001072595526351321, + "loss": 0.2739, + "step": 25544 + }, + { + "epoch": 2.0694264419961117, + "grad_norm": 0.06780307739973068, + "learning_rate": 0.00010725505198253747, + "loss": 0.2708, + "step": 25545 + }, + { + "epoch": 2.06950745301361, + "grad_norm": 
0.07603610306978226, + "learning_rate": 0.00010725055132994285, + "loss": 0.3202, + "step": 25546 + }, + { + "epoch": 2.069588464031108, + "grad_norm": 0.07834131270647049, + "learning_rate": 0.00010724605067734823, + "loss": 0.302, + "step": 25547 + }, + { + "epoch": 2.0696694750486064, + "grad_norm": 0.06293410062789917, + "learning_rate": 0.0001072415500247536, + "loss": 0.2331, + "step": 25548 + }, + { + "epoch": 2.069750486066105, + "grad_norm": 0.05901750549674034, + "learning_rate": 0.00010723704937215897, + "loss": 0.2451, + "step": 25549 + }, + { + "epoch": 2.0698314970836034, + "grad_norm": 0.061935652047395706, + "learning_rate": 0.00010723254871956434, + "loss": 0.2806, + "step": 25550 + }, + { + "epoch": 2.0699125081011016, + "grad_norm": 0.05839787423610687, + "learning_rate": 0.00010722804806696971, + "loss": 0.308, + "step": 25551 + }, + { + "epoch": 2.0699935191186003, + "grad_norm": 0.061520643532276154, + "learning_rate": 0.00010722354741437509, + "loss": 0.2504, + "step": 25552 + }, + { + "epoch": 2.0700745301360985, + "grad_norm": 0.06616219133138657, + "learning_rate": 0.00010721904676178047, + "loss": 0.3015, + "step": 25553 + }, + { + "epoch": 2.070155541153597, + "grad_norm": 0.0596001073718071, + "learning_rate": 0.00010721454610918585, + "loss": 0.2737, + "step": 25554 + }, + { + "epoch": 2.0702365521710955, + "grad_norm": 0.052469369024038315, + "learning_rate": 0.00010721004545659121, + "loss": 0.2847, + "step": 25555 + }, + { + "epoch": 2.0703175631885937, + "grad_norm": 0.05862971395254135, + "learning_rate": 0.00010720554480399658, + "loss": 0.2468, + "step": 25556 + }, + { + "epoch": 2.070398574206092, + "grad_norm": 0.05308549106121063, + "learning_rate": 0.00010720104415140195, + "loss": 0.2428, + "step": 25557 + }, + { + "epoch": 2.07047958522359, + "grad_norm": 0.04823100194334984, + "learning_rate": 0.00010719654349880733, + "loss": 0.2398, + "step": 25558 + }, + { + "epoch": 2.070560596241089, + "grad_norm": 0.04787187650799751, + "learning_rate": 0.00010719204284621272, + "loss": 0.2616, + "step": 25559 + }, + { + "epoch": 2.070641607258587, + "grad_norm": 0.06318645179271698, + "learning_rate": 0.00010718754219361809, + "loss": 0.2735, + "step": 25560 + }, + { + "epoch": 2.0707226182760854, + "grad_norm": 0.060162078589200974, + "learning_rate": 0.00010718304154102345, + "loss": 0.2522, + "step": 25561 + }, + { + "epoch": 2.070803629293584, + "grad_norm": 0.06665724515914917, + "learning_rate": 0.00010717854088842882, + "loss": 0.2239, + "step": 25562 + }, + { + "epoch": 2.0708846403110823, + "grad_norm": 0.06121491640806198, + "learning_rate": 0.0001071740402358342, + "loss": 0.2561, + "step": 25563 + }, + { + "epoch": 2.0709656513285806, + "grad_norm": 0.06794225424528122, + "learning_rate": 0.00010716953958323957, + "loss": 0.2715, + "step": 25564 + }, + { + "epoch": 2.0710466623460793, + "grad_norm": 0.05323499068617821, + "learning_rate": 0.00010716503893064496, + "loss": 0.269, + "step": 25565 + }, + { + "epoch": 2.0711276733635775, + "grad_norm": 0.06246719881892204, + "learning_rate": 0.00010716053827805033, + "loss": 0.2422, + "step": 25566 + }, + { + "epoch": 2.0712086843810757, + "grad_norm": 0.05139967426657677, + "learning_rate": 0.00010715603762545569, + "loss": 0.2417, + "step": 25567 + }, + { + "epoch": 2.071289695398574, + "grad_norm": 0.06880932301282883, + "learning_rate": 0.00010715153697286106, + "loss": 0.2313, + "step": 25568 + }, + { + "epoch": 2.0713707064160727, + "grad_norm": 0.06802390515804291, + "learning_rate": 
0.00010714703632026644, + "loss": 0.2592, + "step": 25569 + }, + { + "epoch": 2.071451717433571, + "grad_norm": 0.060402750968933105, + "learning_rate": 0.00010714253566767183, + "loss": 0.2621, + "step": 25570 + }, + { + "epoch": 2.071532728451069, + "grad_norm": 0.05299397557973862, + "learning_rate": 0.0001071380350150772, + "loss": 0.2308, + "step": 25571 + }, + { + "epoch": 2.071613739468568, + "grad_norm": 0.05250426381826401, + "learning_rate": 0.00010713353436248257, + "loss": 0.245, + "step": 25572 + }, + { + "epoch": 2.071694750486066, + "grad_norm": 0.06468871980905533, + "learning_rate": 0.00010712903370988793, + "loss": 0.2577, + "step": 25573 + }, + { + "epoch": 2.0717757615035644, + "grad_norm": 0.05726084113121033, + "learning_rate": 0.0001071245330572933, + "loss": 0.2585, + "step": 25574 + }, + { + "epoch": 2.071856772521063, + "grad_norm": 0.06499053537845612, + "learning_rate": 0.00010712003240469868, + "loss": 0.2271, + "step": 25575 + }, + { + "epoch": 2.0719377835385613, + "grad_norm": 0.05974118039011955, + "learning_rate": 0.00010711553175210407, + "loss": 0.2672, + "step": 25576 + }, + { + "epoch": 2.0720187945560595, + "grad_norm": 0.07319014519453049, + "learning_rate": 0.00010711103109950944, + "loss": 0.2568, + "step": 25577 + }, + { + "epoch": 2.072099805573558, + "grad_norm": 0.056026265025138855, + "learning_rate": 0.00010710653044691481, + "loss": 0.2507, + "step": 25578 + }, + { + "epoch": 2.0721808165910565, + "grad_norm": 0.06923699378967285, + "learning_rate": 0.00010710202979432017, + "loss": 0.2544, + "step": 25579 + }, + { + "epoch": 2.0722618276085547, + "grad_norm": 0.0588361993432045, + "learning_rate": 0.00010709752914172555, + "loss": 0.2436, + "step": 25580 + }, + { + "epoch": 2.072342838626053, + "grad_norm": 0.06266608089208603, + "learning_rate": 0.00010709302848913092, + "loss": 0.2777, + "step": 25581 + }, + { + "epoch": 2.0724238496435516, + "grad_norm": 0.050658125430345535, + "learning_rate": 0.00010708852783653631, + "loss": 0.2363, + "step": 25582 + }, + { + "epoch": 2.07250486066105, + "grad_norm": 0.05195537209510803, + "learning_rate": 0.00010708402718394168, + "loss": 0.2793, + "step": 25583 + }, + { + "epoch": 2.072585871678548, + "grad_norm": 0.07279521226882935, + "learning_rate": 0.00010707952653134706, + "loss": 0.2929, + "step": 25584 + }, + { + "epoch": 2.072666882696047, + "grad_norm": 0.06828270107507706, + "learning_rate": 0.00010707502587875242, + "loss": 0.2234, + "step": 25585 + }, + { + "epoch": 2.072747893713545, + "grad_norm": 0.06727327406406403, + "learning_rate": 0.00010707052522615779, + "loss": 0.2902, + "step": 25586 + }, + { + "epoch": 2.0728289047310433, + "grad_norm": 0.05688133463263512, + "learning_rate": 0.00010706602457356316, + "loss": 0.2401, + "step": 25587 + }, + { + "epoch": 2.072909915748542, + "grad_norm": 0.0660371407866478, + "learning_rate": 0.00010706152392096855, + "loss": 0.2907, + "step": 25588 + }, + { + "epoch": 2.0729909267660402, + "grad_norm": 0.06644482910633087, + "learning_rate": 0.00010705702326837392, + "loss": 0.2801, + "step": 25589 + }, + { + "epoch": 2.0730719377835385, + "grad_norm": 0.06793033331632614, + "learning_rate": 0.0001070525226157793, + "loss": 0.2611, + "step": 25590 + }, + { + "epoch": 2.0731529488010367, + "grad_norm": 0.05119938403367996, + "learning_rate": 0.00010704802196318466, + "loss": 0.2204, + "step": 25591 + }, + { + "epoch": 2.0732339598185354, + "grad_norm": 0.06441966444253922, + "learning_rate": 0.00010704352131059003, + "loss": 0.2236, + "step": 
25592 + }, + { + "epoch": 2.0733149708360337, + "grad_norm": 0.0572500079870224, + "learning_rate": 0.0001070390206579954, + "loss": 0.2268, + "step": 25593 + }, + { + "epoch": 2.073395981853532, + "grad_norm": 0.08293695002794266, + "learning_rate": 0.00010703452000540079, + "loss": 0.2815, + "step": 25594 + }, + { + "epoch": 2.0734769928710306, + "grad_norm": 0.0702480673789978, + "learning_rate": 0.00010703001935280617, + "loss": 0.302, + "step": 25595 + }, + { + "epoch": 2.073558003888529, + "grad_norm": 0.058082662522792816, + "learning_rate": 0.00010702551870021154, + "loss": 0.2337, + "step": 25596 + }, + { + "epoch": 2.073639014906027, + "grad_norm": 0.06347770988941193, + "learning_rate": 0.0001070210180476169, + "loss": 0.2441, + "step": 25597 + }, + { + "epoch": 2.073720025923526, + "grad_norm": 0.059085577726364136, + "learning_rate": 0.00010701651739502227, + "loss": 0.2552, + "step": 25598 + }, + { + "epoch": 2.073801036941024, + "grad_norm": 0.0577191598713398, + "learning_rate": 0.00010701201674242767, + "loss": 0.2575, + "step": 25599 + }, + { + "epoch": 2.0738820479585223, + "grad_norm": 0.054545432329177856, + "learning_rate": 0.00010700751608983303, + "loss": 0.2327, + "step": 25600 + }, + { + "epoch": 2.0739630589760205, + "grad_norm": 0.05656864121556282, + "learning_rate": 0.00010700301543723841, + "loss": 0.2511, + "step": 25601 + }, + { + "epoch": 2.074044069993519, + "grad_norm": 0.05607932060956955, + "learning_rate": 0.00010699851478464378, + "loss": 0.2355, + "step": 25602 + }, + { + "epoch": 2.0741250810110174, + "grad_norm": 0.07496161758899689, + "learning_rate": 0.00010699401413204914, + "loss": 0.3078, + "step": 25603 + }, + { + "epoch": 2.0742060920285157, + "grad_norm": 0.06039506942033768, + "learning_rate": 0.00010698951347945451, + "loss": 0.2421, + "step": 25604 + }, + { + "epoch": 2.0742871030460144, + "grad_norm": 0.06327684223651886, + "learning_rate": 0.00010698501282685991, + "loss": 0.2545, + "step": 25605 + }, + { + "epoch": 2.0743681140635126, + "grad_norm": 0.06102275848388672, + "learning_rate": 0.00010698051217426527, + "loss": 0.2782, + "step": 25606 + }, + { + "epoch": 2.074449125081011, + "grad_norm": 0.06765256077051163, + "learning_rate": 0.00010697601152167065, + "loss": 0.2567, + "step": 25607 + }, + { + "epoch": 2.0745301360985096, + "grad_norm": 0.0560624822974205, + "learning_rate": 0.00010697151086907602, + "loss": 0.2401, + "step": 25608 + }, + { + "epoch": 2.074611147116008, + "grad_norm": 0.05952421948313713, + "learning_rate": 0.0001069670102164814, + "loss": 0.2765, + "step": 25609 + }, + { + "epoch": 2.074692158133506, + "grad_norm": 0.07524368911981583, + "learning_rate": 0.00010696250956388676, + "loss": 0.286, + "step": 25610 + }, + { + "epoch": 2.0747731691510047, + "grad_norm": 0.0632515475153923, + "learning_rate": 0.00010695800891129216, + "loss": 0.319, + "step": 25611 + }, + { + "epoch": 2.074854180168503, + "grad_norm": 0.053480733186006546, + "learning_rate": 0.00010695350825869752, + "loss": 0.245, + "step": 25612 + }, + { + "epoch": 2.0749351911860012, + "grad_norm": 0.07301179319620132, + "learning_rate": 0.00010694900760610289, + "loss": 0.2887, + "step": 25613 + }, + { + "epoch": 2.0750162022034995, + "grad_norm": 0.05944663658738136, + "learning_rate": 0.00010694450695350826, + "loss": 0.3042, + "step": 25614 + }, + { + "epoch": 2.075097213220998, + "grad_norm": 0.06371688097715378, + "learning_rate": 0.00010694000630091364, + "loss": 0.2952, + "step": 25615 + }, + { + "epoch": 2.0751782242384964, + 
"grad_norm": 0.05884646624326706, + "learning_rate": 0.000106935505648319, + "loss": 0.2649, + "step": 25616 + }, + { + "epoch": 2.0752592352559946, + "grad_norm": 0.057729847729206085, + "learning_rate": 0.0001069310049957244, + "loss": 0.2633, + "step": 25617 + }, + { + "epoch": 2.0753402462734933, + "grad_norm": 0.06070605665445328, + "learning_rate": 0.00010692650434312976, + "loss": 0.2183, + "step": 25618 + }, + { + "epoch": 2.0754212572909916, + "grad_norm": 0.04957975447177887, + "learning_rate": 0.00010692200369053513, + "loss": 0.2647, + "step": 25619 + }, + { + "epoch": 2.07550226830849, + "grad_norm": 0.05769617483019829, + "learning_rate": 0.0001069175030379405, + "loss": 0.2364, + "step": 25620 + }, + { + "epoch": 2.0755832793259885, + "grad_norm": 0.06463346630334854, + "learning_rate": 0.00010691300238534588, + "loss": 0.2877, + "step": 25621 + }, + { + "epoch": 2.0756642903434868, + "grad_norm": 0.05660693347454071, + "learning_rate": 0.00010690850173275127, + "loss": 0.2918, + "step": 25622 + }, + { + "epoch": 2.075745301360985, + "grad_norm": 0.06576941907405853, + "learning_rate": 0.00010690400108015664, + "loss": 0.2448, + "step": 25623 + }, + { + "epoch": 2.0758263123784833, + "grad_norm": 0.06314044445753098, + "learning_rate": 0.000106899500427562, + "loss": 0.2815, + "step": 25624 + }, + { + "epoch": 2.075907323395982, + "grad_norm": 0.06122468411922455, + "learning_rate": 0.00010689499977496737, + "loss": 0.2712, + "step": 25625 + }, + { + "epoch": 2.07598833441348, + "grad_norm": 0.06045297533273697, + "learning_rate": 0.00010689049912237275, + "loss": 0.2386, + "step": 25626 + }, + { + "epoch": 2.0760693454309784, + "grad_norm": 0.06016145274043083, + "learning_rate": 0.00010688599846977812, + "loss": 0.2585, + "step": 25627 + }, + { + "epoch": 2.076150356448477, + "grad_norm": 0.07210994511842728, + "learning_rate": 0.00010688149781718351, + "loss": 0.2581, + "step": 25628 + }, + { + "epoch": 2.0762313674659754, + "grad_norm": 0.06515554338693619, + "learning_rate": 0.00010687699716458888, + "loss": 0.2634, + "step": 25629 + }, + { + "epoch": 2.0763123784834736, + "grad_norm": 0.08918245881795883, + "learning_rate": 0.00010687249651199424, + "loss": 0.3076, + "step": 25630 + }, + { + "epoch": 2.0763933895009723, + "grad_norm": 0.06386683881282806, + "learning_rate": 0.00010686799585939962, + "loss": 0.2638, + "step": 25631 + }, + { + "epoch": 2.0764744005184705, + "grad_norm": 0.050748348236083984, + "learning_rate": 0.00010686349520680499, + "loss": 0.2224, + "step": 25632 + }, + { + "epoch": 2.076555411535969, + "grad_norm": 0.07010557502508163, + "learning_rate": 0.00010685899455421036, + "loss": 0.2524, + "step": 25633 + }, + { + "epoch": 2.0766364225534675, + "grad_norm": 0.05773705244064331, + "learning_rate": 0.00010685449390161575, + "loss": 0.2583, + "step": 25634 + }, + { + "epoch": 2.0767174335709657, + "grad_norm": 0.06482109427452087, + "learning_rate": 0.00010684999324902112, + "loss": 0.2794, + "step": 25635 + }, + { + "epoch": 2.076798444588464, + "grad_norm": 0.07043759524822235, + "learning_rate": 0.00010684549259642648, + "loss": 0.2956, + "step": 25636 + }, + { + "epoch": 2.076879455605962, + "grad_norm": 0.06465146690607071, + "learning_rate": 0.00010684099194383186, + "loss": 0.2775, + "step": 25637 + }, + { + "epoch": 2.076960466623461, + "grad_norm": 0.06037595868110657, + "learning_rate": 0.00010683649129123723, + "loss": 0.2441, + "step": 25638 + }, + { + "epoch": 2.077041477640959, + "grad_norm": 0.08674988150596619, + 
"learning_rate": 0.0001068319906386426, + "loss": 0.3322, + "step": 25639 + }, + { + "epoch": 2.0771224886584574, + "grad_norm": 0.05551932752132416, + "learning_rate": 0.00010682748998604799, + "loss": 0.261, + "step": 25640 + }, + { + "epoch": 2.077203499675956, + "grad_norm": 0.07131566852331161, + "learning_rate": 0.00010682298933345336, + "loss": 0.232, + "step": 25641 + }, + { + "epoch": 2.0772845106934543, + "grad_norm": 0.05972439795732498, + "learning_rate": 0.00010681848868085872, + "loss": 0.2835, + "step": 25642 + }, + { + "epoch": 2.0773655217109526, + "grad_norm": 0.05565524473786354, + "learning_rate": 0.0001068139880282641, + "loss": 0.23, + "step": 25643 + }, + { + "epoch": 2.0774465327284513, + "grad_norm": 0.05611024051904678, + "learning_rate": 0.00010680948737566947, + "loss": 0.3103, + "step": 25644 + }, + { + "epoch": 2.0775275437459495, + "grad_norm": 0.07952877879142761, + "learning_rate": 0.00010680498672307485, + "loss": 0.2595, + "step": 25645 + }, + { + "epoch": 2.0776085547634477, + "grad_norm": 0.06529795378446579, + "learning_rate": 0.00010680048607048023, + "loss": 0.2697, + "step": 25646 + }, + { + "epoch": 2.077689565780946, + "grad_norm": 0.06786473840475082, + "learning_rate": 0.0001067959854178856, + "loss": 0.2843, + "step": 25647 + }, + { + "epoch": 2.0777705767984447, + "grad_norm": 0.06259346753358841, + "learning_rate": 0.00010679148476529097, + "loss": 0.2382, + "step": 25648 + }, + { + "epoch": 2.077851587815943, + "grad_norm": 0.057985417544841766, + "learning_rate": 0.00010678698411269634, + "loss": 0.2807, + "step": 25649 + }, + { + "epoch": 2.077932598833441, + "grad_norm": 0.048762246966362, + "learning_rate": 0.00010678248346010171, + "loss": 0.2399, + "step": 25650 + }, + { + "epoch": 2.07801360985094, + "grad_norm": 0.05488719791173935, + "learning_rate": 0.0001067779828075071, + "loss": 0.2443, + "step": 25651 + }, + { + "epoch": 2.078094620868438, + "grad_norm": 0.05270203948020935, + "learning_rate": 0.00010677348215491247, + "loss": 0.226, + "step": 25652 + }, + { + "epoch": 2.0781756318859363, + "grad_norm": 0.07405373454093933, + "learning_rate": 0.00010676898150231785, + "loss": 0.2567, + "step": 25653 + }, + { + "epoch": 2.078256642903435, + "grad_norm": 0.07551823556423187, + "learning_rate": 0.00010676448084972321, + "loss": 0.2704, + "step": 25654 + }, + { + "epoch": 2.0783376539209333, + "grad_norm": 0.05943058803677559, + "learning_rate": 0.00010675998019712858, + "loss": 0.2553, + "step": 25655 + }, + { + "epoch": 2.0784186649384315, + "grad_norm": 0.05561874434351921, + "learning_rate": 0.00010675547954453396, + "loss": 0.2388, + "step": 25656 + }, + { + "epoch": 2.07849967595593, + "grad_norm": 0.04733414202928543, + "learning_rate": 0.00010675097889193934, + "loss": 0.2326, + "step": 25657 + }, + { + "epoch": 2.0785806869734285, + "grad_norm": 0.07790904492139816, + "learning_rate": 0.00010674647823934472, + "loss": 0.279, + "step": 25658 + }, + { + "epoch": 2.0786616979909267, + "grad_norm": 0.06400874257087708, + "learning_rate": 0.00010674197758675009, + "loss": 0.2887, + "step": 25659 + }, + { + "epoch": 2.078742709008425, + "grad_norm": 0.06504980474710464, + "learning_rate": 0.00010673747693415545, + "loss": 0.264, + "step": 25660 + }, + { + "epoch": 2.0788237200259236, + "grad_norm": 0.06921865046024323, + "learning_rate": 0.00010673297628156082, + "loss": 0.2837, + "step": 25661 + }, + { + "epoch": 2.078904731043422, + "grad_norm": 0.057739365845918655, + "learning_rate": 0.0001067284756289662, + "loss": 0.2548, 
+ "step": 25662 + }, + { + "epoch": 2.07898574206092, + "grad_norm": 0.058383893221616745, + "learning_rate": 0.00010672397497637158, + "loss": 0.2579, + "step": 25663 + }, + { + "epoch": 2.079066753078419, + "grad_norm": 0.06038293242454529, + "learning_rate": 0.00010671947432377696, + "loss": 0.2434, + "step": 25664 + }, + { + "epoch": 2.079147764095917, + "grad_norm": 0.0538647286593914, + "learning_rate": 0.00010671497367118233, + "loss": 0.2383, + "step": 25665 + }, + { + "epoch": 2.0792287751134153, + "grad_norm": 0.05161396414041519, + "learning_rate": 0.00010671047301858769, + "loss": 0.2746, + "step": 25666 + }, + { + "epoch": 2.079309786130914, + "grad_norm": 0.06023260951042175, + "learning_rate": 0.00010670597236599306, + "loss": 0.2567, + "step": 25667 + }, + { + "epoch": 2.0793907971484122, + "grad_norm": 0.05432547256350517, + "learning_rate": 0.00010670147171339844, + "loss": 0.227, + "step": 25668 + }, + { + "epoch": 2.0794718081659105, + "grad_norm": 0.05337492749094963, + "learning_rate": 0.00010669697106080383, + "loss": 0.2281, + "step": 25669 + }, + { + "epoch": 2.0795528191834087, + "grad_norm": 0.06227840110659599, + "learning_rate": 0.0001066924704082092, + "loss": 0.225, + "step": 25670 + }, + { + "epoch": 2.0796338302009074, + "grad_norm": 0.0642867460846901, + "learning_rate": 0.00010668796975561457, + "loss": 0.2542, + "step": 25671 + }, + { + "epoch": 2.0797148412184057, + "grad_norm": 0.07576468586921692, + "learning_rate": 0.00010668346910301993, + "loss": 0.3283, + "step": 25672 + }, + { + "epoch": 2.079795852235904, + "grad_norm": 0.05812416598200798, + "learning_rate": 0.0001066789684504253, + "loss": 0.2565, + "step": 25673 + }, + { + "epoch": 2.0798768632534026, + "grad_norm": 0.05353740602731705, + "learning_rate": 0.00010667446779783071, + "loss": 0.2655, + "step": 25674 + }, + { + "epoch": 2.079957874270901, + "grad_norm": 0.052923765033483505, + "learning_rate": 0.00010666996714523607, + "loss": 0.2334, + "step": 25675 + }, + { + "epoch": 2.080038885288399, + "grad_norm": 0.07438129186630249, + "learning_rate": 0.00010666546649264144, + "loss": 0.2633, + "step": 25676 + }, + { + "epoch": 2.0801198963058978, + "grad_norm": 0.06333016604185104, + "learning_rate": 0.00010666096584004681, + "loss": 0.2701, + "step": 25677 + }, + { + "epoch": 2.080200907323396, + "grad_norm": 0.061056092381477356, + "learning_rate": 0.00010665646518745219, + "loss": 0.2527, + "step": 25678 + }, + { + "epoch": 2.0802819183408943, + "grad_norm": 0.0625620037317276, + "learning_rate": 0.00010665196453485755, + "loss": 0.2912, + "step": 25679 + }, + { + "epoch": 2.080362929358393, + "grad_norm": 0.05852917581796646, + "learning_rate": 0.00010664746388226295, + "loss": 0.2371, + "step": 25680 + }, + { + "epoch": 2.080443940375891, + "grad_norm": 0.05300210788846016, + "learning_rate": 0.00010664296322966831, + "loss": 0.238, + "step": 25681 + }, + { + "epoch": 2.0805249513933894, + "grad_norm": 0.051059871912002563, + "learning_rate": 0.00010663846257707368, + "loss": 0.2545, + "step": 25682 + }, + { + "epoch": 2.0806059624108877, + "grad_norm": 0.05365686118602753, + "learning_rate": 0.00010663396192447906, + "loss": 0.2292, + "step": 25683 + }, + { + "epoch": 2.0806869734283864, + "grad_norm": 0.056775450706481934, + "learning_rate": 0.00010662946127188443, + "loss": 0.2782, + "step": 25684 + }, + { + "epoch": 2.0807679844458846, + "grad_norm": 0.06060144305229187, + "learning_rate": 0.00010662496061928979, + "loss": 0.2625, + "step": 25685 + }, + { + "epoch": 
2.080848995463383, + "grad_norm": 0.06818179786205292, + "learning_rate": 0.00010662045996669519, + "loss": 0.2896, + "step": 25686 + }, + { + "epoch": 2.0809300064808816, + "grad_norm": 0.05880607292056084, + "learning_rate": 0.00010661595931410055, + "loss": 0.2755, + "step": 25687 + }, + { + "epoch": 2.08101101749838, + "grad_norm": 0.06834668666124344, + "learning_rate": 0.00010661145866150592, + "loss": 0.2539, + "step": 25688 + }, + { + "epoch": 2.081092028515878, + "grad_norm": 0.09048371016979218, + "learning_rate": 0.0001066069580089113, + "loss": 0.2587, + "step": 25689 + }, + { + "epoch": 2.0811730395333767, + "grad_norm": 0.07518291473388672, + "learning_rate": 0.00010660245735631667, + "loss": 0.2522, + "step": 25690 + }, + { + "epoch": 2.081254050550875, + "grad_norm": 0.0637117400765419, + "learning_rate": 0.00010659795670372203, + "loss": 0.2945, + "step": 25691 + }, + { + "epoch": 2.0813350615683732, + "grad_norm": 0.06945264339447021, + "learning_rate": 0.00010659345605112743, + "loss": 0.2858, + "step": 25692 + }, + { + "epoch": 2.0814160725858715, + "grad_norm": 0.06889203935861588, + "learning_rate": 0.00010658895539853279, + "loss": 0.2662, + "step": 25693 + }, + { + "epoch": 2.08149708360337, + "grad_norm": 0.06387284398078918, + "learning_rate": 0.00010658445474593817, + "loss": 0.2433, + "step": 25694 + }, + { + "epoch": 2.0815780946208684, + "grad_norm": 0.05560588836669922, + "learning_rate": 0.00010657995409334354, + "loss": 0.2042, + "step": 25695 + }, + { + "epoch": 2.0816591056383666, + "grad_norm": 0.052960895001888275, + "learning_rate": 0.00010657545344074891, + "loss": 0.2754, + "step": 25696 + }, + { + "epoch": 2.0817401166558653, + "grad_norm": 0.07859915494918823, + "learning_rate": 0.00010657095278815427, + "loss": 0.3211, + "step": 25697 + }, + { + "epoch": 2.0818211276733636, + "grad_norm": 0.05939163640141487, + "learning_rate": 0.00010656645213555967, + "loss": 0.2674, + "step": 25698 + }, + { + "epoch": 2.081902138690862, + "grad_norm": 0.06102471798658371, + "learning_rate": 0.00010656195148296503, + "loss": 0.2112, + "step": 25699 + }, + { + "epoch": 2.0819831497083605, + "grad_norm": 0.07584518939256668, + "learning_rate": 0.00010655745083037041, + "loss": 0.2417, + "step": 25700 + }, + { + "epoch": 2.0820641607258588, + "grad_norm": 0.06054631620645523, + "learning_rate": 0.00010655295017777578, + "loss": 0.2998, + "step": 25701 + }, + { + "epoch": 2.082145171743357, + "grad_norm": 0.06277702748775482, + "learning_rate": 0.00010654844952518115, + "loss": 0.2694, + "step": 25702 + }, + { + "epoch": 2.0822261827608557, + "grad_norm": 0.06722848862409592, + "learning_rate": 0.00010654394887258654, + "loss": 0.281, + "step": 25703 + }, + { + "epoch": 2.082307193778354, + "grad_norm": 0.0671129897236824, + "learning_rate": 0.00010653944821999192, + "loss": 0.2553, + "step": 25704 + }, + { + "epoch": 2.082388204795852, + "grad_norm": 0.06236971542239189, + "learning_rate": 0.00010653494756739728, + "loss": 0.2847, + "step": 25705 + }, + { + "epoch": 2.0824692158133504, + "grad_norm": 0.06291084736585617, + "learning_rate": 0.00010653044691480265, + "loss": 0.215, + "step": 25706 + }, + { + "epoch": 2.082550226830849, + "grad_norm": 0.061905037611722946, + "learning_rate": 0.00010652594626220802, + "loss": 0.2296, + "step": 25707 + }, + { + "epoch": 2.0826312378483474, + "grad_norm": 0.0679488480091095, + "learning_rate": 0.0001065214456096134, + "loss": 0.24, + "step": 25708 + }, + { + "epoch": 2.0827122488658456, + "grad_norm": 
0.05547121539711952, + "learning_rate": 0.00010651694495701878, + "loss": 0.2332, + "step": 25709 + }, + { + "epoch": 2.0827932598833443, + "grad_norm": 0.06207665428519249, + "learning_rate": 0.00010651244430442416, + "loss": 0.2399, + "step": 25710 + }, + { + "epoch": 2.0828742709008425, + "grad_norm": 0.060923006385564804, + "learning_rate": 0.00010650794365182952, + "loss": 0.2793, + "step": 25711 + }, + { + "epoch": 2.082955281918341, + "grad_norm": 0.05969534441828728, + "learning_rate": 0.00010650344299923489, + "loss": 0.2456, + "step": 25712 + }, + { + "epoch": 2.0830362929358395, + "grad_norm": 0.060140207409858704, + "learning_rate": 0.00010649894234664026, + "loss": 0.2417, + "step": 25713 + }, + { + "epoch": 2.0831173039533377, + "grad_norm": 0.06350401043891907, + "learning_rate": 0.00010649444169404564, + "loss": 0.2725, + "step": 25714 + }, + { + "epoch": 2.083198314970836, + "grad_norm": 0.05730599910020828, + "learning_rate": 0.00010648994104145102, + "loss": 0.2471, + "step": 25715 + }, + { + "epoch": 2.083279325988334, + "grad_norm": 0.06925246119499207, + "learning_rate": 0.0001064854403888564, + "loss": 0.2924, + "step": 25716 + }, + { + "epoch": 2.083360337005833, + "grad_norm": 0.05671600624918938, + "learning_rate": 0.00010648093973626176, + "loss": 0.2313, + "step": 25717 + }, + { + "epoch": 2.083441348023331, + "grad_norm": 0.05397858843207359, + "learning_rate": 0.00010647643908366713, + "loss": 0.2426, + "step": 25718 + }, + { + "epoch": 2.0835223590408294, + "grad_norm": 0.06132481247186661, + "learning_rate": 0.0001064719384310725, + "loss": 0.2614, + "step": 25719 + }, + { + "epoch": 2.083603370058328, + "grad_norm": 0.07008907943964005, + "learning_rate": 0.00010646743777847788, + "loss": 0.2507, + "step": 25720 + }, + { + "epoch": 2.0836843810758263, + "grad_norm": 0.05325290188193321, + "learning_rate": 0.00010646293712588327, + "loss": 0.2513, + "step": 25721 + }, + { + "epoch": 2.0837653920933246, + "grad_norm": 0.06077274680137634, + "learning_rate": 0.00010645843647328864, + "loss": 0.2353, + "step": 25722 + }, + { + "epoch": 2.0838464031108233, + "grad_norm": 0.05821505934000015, + "learning_rate": 0.000106453935820694, + "loss": 0.252, + "step": 25723 + }, + { + "epoch": 2.0839274141283215, + "grad_norm": 0.05630730465054512, + "learning_rate": 0.00010644943516809937, + "loss": 0.2255, + "step": 25724 + }, + { + "epoch": 2.0840084251458197, + "grad_norm": 0.0677599087357521, + "learning_rate": 0.00010644493451550475, + "loss": 0.3002, + "step": 25725 + }, + { + "epoch": 2.0840894361633184, + "grad_norm": 0.06731487065553665, + "learning_rate": 0.00010644043386291012, + "loss": 0.2607, + "step": 25726 + }, + { + "epoch": 2.0841704471808167, + "grad_norm": 0.07227706909179688, + "learning_rate": 0.00010643593321031551, + "loss": 0.2606, + "step": 25727 + }, + { + "epoch": 2.084251458198315, + "grad_norm": 0.07242441922426224, + "learning_rate": 0.00010643143255772088, + "loss": 0.2694, + "step": 25728 + }, + { + "epoch": 2.084332469215813, + "grad_norm": 0.06362912803888321, + "learning_rate": 0.00010642693190512624, + "loss": 0.2793, + "step": 25729 + }, + { + "epoch": 2.084413480233312, + "grad_norm": 0.0482899434864521, + "learning_rate": 0.00010642243125253162, + "loss": 0.2201, + "step": 25730 + }, + { + "epoch": 2.08449449125081, + "grad_norm": 0.059406179934740067, + "learning_rate": 0.00010641793059993699, + "loss": 0.2579, + "step": 25731 + }, + { + "epoch": 2.0845755022683083, + "grad_norm": 0.059811271727085114, + "learning_rate": 
0.00010641342994734238, + "loss": 0.2488, + "step": 25732 + }, + { + "epoch": 2.084656513285807, + "grad_norm": 0.06633289903402328, + "learning_rate": 0.00010640892929474775, + "loss": 0.2932, + "step": 25733 + }, + { + "epoch": 2.0847375243033053, + "grad_norm": 0.05347701534628868, + "learning_rate": 0.00010640442864215312, + "loss": 0.2601, + "step": 25734 + }, + { + "epoch": 2.0848185353208035, + "grad_norm": 0.060605239123106, + "learning_rate": 0.00010639992798955848, + "loss": 0.2603, + "step": 25735 + }, + { + "epoch": 2.084899546338302, + "grad_norm": 0.05953764170408249, + "learning_rate": 0.00010639542733696386, + "loss": 0.2675, + "step": 25736 + }, + { + "epoch": 2.0849805573558005, + "grad_norm": 0.052090004086494446, + "learning_rate": 0.00010639092668436923, + "loss": 0.2545, + "step": 25737 + }, + { + "epoch": 2.0850615683732987, + "grad_norm": 0.054392725229263306, + "learning_rate": 0.00010638642603177462, + "loss": 0.2421, + "step": 25738 + }, + { + "epoch": 2.085142579390797, + "grad_norm": 0.05666566640138626, + "learning_rate": 0.00010638192537917999, + "loss": 0.211, + "step": 25739 + }, + { + "epoch": 2.0852235904082956, + "grad_norm": 0.057736024260520935, + "learning_rate": 0.00010637742472658536, + "loss": 0.2907, + "step": 25740 + }, + { + "epoch": 2.085304601425794, + "grad_norm": 0.055861927568912506, + "learning_rate": 0.00010637292407399074, + "loss": 0.2927, + "step": 25741 + }, + { + "epoch": 2.085385612443292, + "grad_norm": 0.060406699776649475, + "learning_rate": 0.0001063684234213961, + "loss": 0.2586, + "step": 25742 + }, + { + "epoch": 2.085466623460791, + "grad_norm": 0.0583062581717968, + "learning_rate": 0.00010636392276880147, + "loss": 0.2507, + "step": 25743 + }, + { + "epoch": 2.085547634478289, + "grad_norm": 0.06983792036771774, + "learning_rate": 0.00010635942211620686, + "loss": 0.2803, + "step": 25744 + }, + { + "epoch": 2.0856286454957873, + "grad_norm": 0.06031275913119316, + "learning_rate": 0.00010635492146361223, + "loss": 0.274, + "step": 25745 + }, + { + "epoch": 2.085709656513286, + "grad_norm": 0.06896448880434036, + "learning_rate": 0.0001063504208110176, + "loss": 0.2534, + "step": 25746 + }, + { + "epoch": 2.0857906675307842, + "grad_norm": 0.08314532041549683, + "learning_rate": 0.00010634592015842298, + "loss": 0.2933, + "step": 25747 + }, + { + "epoch": 2.0858716785482825, + "grad_norm": 0.06117968261241913, + "learning_rate": 0.00010634141950582834, + "loss": 0.244, + "step": 25748 + }, + { + "epoch": 2.085952689565781, + "grad_norm": 0.05076151341199875, + "learning_rate": 0.00010633691885323371, + "loss": 0.2418, + "step": 25749 + }, + { + "epoch": 2.0860337005832794, + "grad_norm": 0.05884158983826637, + "learning_rate": 0.0001063324182006391, + "loss": 0.2552, + "step": 25750 + }, + { + "epoch": 2.0861147116007777, + "grad_norm": 0.06510408967733383, + "learning_rate": 0.00010632791754804447, + "loss": 0.2432, + "step": 25751 + }, + { + "epoch": 2.086195722618276, + "grad_norm": 0.05114440992474556, + "learning_rate": 0.00010632341689544985, + "loss": 0.263, + "step": 25752 + }, + { + "epoch": 2.0862767336357746, + "grad_norm": 0.05631138011813164, + "learning_rate": 0.00010631891624285522, + "loss": 0.2466, + "step": 25753 + }, + { + "epoch": 2.086357744653273, + "grad_norm": 0.06776197999715805, + "learning_rate": 0.00010631441559026058, + "loss": 0.287, + "step": 25754 + }, + { + "epoch": 2.086438755670771, + "grad_norm": 0.05723719298839569, + "learning_rate": 0.00010630991493766598, + "loss": 0.2534, + "step": 
25755 + }, + { + "epoch": 2.0865197666882698, + "grad_norm": 0.07237262278795242, + "learning_rate": 0.00010630541428507134, + "loss": 0.2933, + "step": 25756 + }, + { + "epoch": 2.086600777705768, + "grad_norm": 0.061718061566352844, + "learning_rate": 0.00010630091363247672, + "loss": 0.2711, + "step": 25757 + }, + { + "epoch": 2.0866817887232663, + "grad_norm": 0.06035630404949188, + "learning_rate": 0.00010629641297988209, + "loss": 0.235, + "step": 25758 + }, + { + "epoch": 2.086762799740765, + "grad_norm": 0.06308078020811081, + "learning_rate": 0.00010629191232728746, + "loss": 0.2505, + "step": 25759 + }, + { + "epoch": 2.086843810758263, + "grad_norm": 0.05552982538938522, + "learning_rate": 0.00010628741167469282, + "loss": 0.2482, + "step": 25760 + }, + { + "epoch": 2.0869248217757614, + "grad_norm": 0.06342519819736481, + "learning_rate": 0.00010628291102209822, + "loss": 0.2535, + "step": 25761 + }, + { + "epoch": 2.0870058327932597, + "grad_norm": 0.04495114088058472, + "learning_rate": 0.00010627841036950358, + "loss": 0.2632, + "step": 25762 + }, + { + "epoch": 2.0870868438107584, + "grad_norm": 0.06390406936407089, + "learning_rate": 0.00010627390971690896, + "loss": 0.2626, + "step": 25763 + }, + { + "epoch": 2.0871678548282566, + "grad_norm": 0.06474427878856659, + "learning_rate": 0.00010626940906431433, + "loss": 0.2821, + "step": 25764 + }, + { + "epoch": 2.087248865845755, + "grad_norm": 0.05594582483172417, + "learning_rate": 0.0001062649084117197, + "loss": 0.285, + "step": 25765 + }, + { + "epoch": 2.0873298768632536, + "grad_norm": 0.061284903436899185, + "learning_rate": 0.00010626040775912507, + "loss": 0.2327, + "step": 25766 + }, + { + "epoch": 2.087410887880752, + "grad_norm": 0.058083221316337585, + "learning_rate": 0.00010625590710653047, + "loss": 0.2555, + "step": 25767 + }, + { + "epoch": 2.08749189889825, + "grad_norm": 0.0643569752573967, + "learning_rate": 0.00010625140645393583, + "loss": 0.2676, + "step": 25768 + }, + { + "epoch": 2.0875729099157487, + "grad_norm": 0.05668030306696892, + "learning_rate": 0.0001062469058013412, + "loss": 0.263, + "step": 25769 + }, + { + "epoch": 2.087653920933247, + "grad_norm": 0.05691717192530632, + "learning_rate": 0.00010624240514874657, + "loss": 0.285, + "step": 25770 + }, + { + "epoch": 2.087734931950745, + "grad_norm": 0.05709190294146538, + "learning_rate": 0.00010623790449615195, + "loss": 0.2534, + "step": 25771 + }, + { + "epoch": 2.087815942968244, + "grad_norm": 0.05560784786939621, + "learning_rate": 0.0001062334038435573, + "loss": 0.2386, + "step": 25772 + }, + { + "epoch": 2.087896953985742, + "grad_norm": 0.062379688024520874, + "learning_rate": 0.00010622890319096271, + "loss": 0.2553, + "step": 25773 + }, + { + "epoch": 2.0879779650032404, + "grad_norm": 0.060460880398750305, + "learning_rate": 0.00010622440253836807, + "loss": 0.239, + "step": 25774 + }, + { + "epoch": 2.0880589760207386, + "grad_norm": 0.06349531561136246, + "learning_rate": 0.00010621990188577344, + "loss": 0.2695, + "step": 25775 + }, + { + "epoch": 2.0881399870382373, + "grad_norm": 0.07084312289953232, + "learning_rate": 0.00010621540123317881, + "loss": 0.2661, + "step": 25776 + }, + { + "epoch": 2.0882209980557356, + "grad_norm": 0.06351789832115173, + "learning_rate": 0.00010621090058058419, + "loss": 0.2794, + "step": 25777 + }, + { + "epoch": 2.088302009073234, + "grad_norm": 0.06865677982568741, + "learning_rate": 0.00010620639992798955, + "loss": 0.2506, + "step": 25778 + }, + { + "epoch": 2.0883830200907325, + 
"grad_norm": 0.05225326120853424, + "learning_rate": 0.00010620189927539495, + "loss": 0.2465, + "step": 25779 + }, + { + "epoch": 2.0884640311082308, + "grad_norm": 0.06418361514806747, + "learning_rate": 0.00010619739862280031, + "loss": 0.2706, + "step": 25780 + }, + { + "epoch": 2.088545042125729, + "grad_norm": 0.057067323476076126, + "learning_rate": 0.00010619289797020568, + "loss": 0.2642, + "step": 25781 + }, + { + "epoch": 2.0886260531432277, + "grad_norm": 0.06191162019968033, + "learning_rate": 0.00010618839731761106, + "loss": 0.2264, + "step": 25782 + }, + { + "epoch": 2.088707064160726, + "grad_norm": 0.10706423968076706, + "learning_rate": 0.00010618389666501643, + "loss": 0.259, + "step": 25783 + }, + { + "epoch": 2.088788075178224, + "grad_norm": 0.07185043394565582, + "learning_rate": 0.00010617939601242182, + "loss": 0.249, + "step": 25784 + }, + { + "epoch": 2.0888690861957224, + "grad_norm": 0.056017689406871796, + "learning_rate": 0.00010617489535982719, + "loss": 0.282, + "step": 25785 + }, + { + "epoch": 2.088950097213221, + "grad_norm": 0.0657619833946228, + "learning_rate": 0.00010617039470723255, + "loss": 0.2481, + "step": 25786 + }, + { + "epoch": 2.0890311082307194, + "grad_norm": 0.05671881511807442, + "learning_rate": 0.00010616589405463792, + "loss": 0.2431, + "step": 25787 + }, + { + "epoch": 2.0891121192482176, + "grad_norm": 0.061993759125471115, + "learning_rate": 0.0001061613934020433, + "loss": 0.2715, + "step": 25788 + }, + { + "epoch": 2.0891931302657163, + "grad_norm": 0.05531548708677292, + "learning_rate": 0.00010615689274944867, + "loss": 0.212, + "step": 25789 + }, + { + "epoch": 2.0892741412832145, + "grad_norm": 0.05547591298818588, + "learning_rate": 0.00010615239209685406, + "loss": 0.2111, + "step": 25790 + }, + { + "epoch": 2.089355152300713, + "grad_norm": 0.07032620906829834, + "learning_rate": 0.00010614789144425943, + "loss": 0.2928, + "step": 25791 + }, + { + "epoch": 2.0894361633182115, + "grad_norm": 0.06776494532823563, + "learning_rate": 0.00010614339079166479, + "loss": 0.227, + "step": 25792 + }, + { + "epoch": 2.0895171743357097, + "grad_norm": 0.06672436743974686, + "learning_rate": 0.00010613889013907017, + "loss": 0.3022, + "step": 25793 + }, + { + "epoch": 2.089598185353208, + "grad_norm": 0.07442519813776016, + "learning_rate": 0.00010613438948647554, + "loss": 0.2315, + "step": 25794 + }, + { + "epoch": 2.089679196370706, + "grad_norm": 0.07650196552276611, + "learning_rate": 0.00010612988883388091, + "loss": 0.2704, + "step": 25795 + }, + { + "epoch": 2.089760207388205, + "grad_norm": 0.07087470591068268, + "learning_rate": 0.0001061253881812863, + "loss": 0.2499, + "step": 25796 + }, + { + "epoch": 2.089841218405703, + "grad_norm": 0.06081029400229454, + "learning_rate": 0.00010612088752869167, + "loss": 0.2102, + "step": 25797 + }, + { + "epoch": 2.0899222294232014, + "grad_norm": 0.0629265084862709, + "learning_rate": 0.00010611638687609703, + "loss": 0.2666, + "step": 25798 + }, + { + "epoch": 2.0900032404407, + "grad_norm": 0.06669928878545761, + "learning_rate": 0.00010611188622350241, + "loss": 0.2466, + "step": 25799 + }, + { + "epoch": 2.0900842514581983, + "grad_norm": 0.053711552172899246, + "learning_rate": 0.00010610738557090778, + "loss": 0.2246, + "step": 25800 + }, + { + "epoch": 2.0901652624756966, + "grad_norm": 0.05141836032271385, + "learning_rate": 0.00010610288491831315, + "loss": 0.2227, + "step": 25801 + }, + { + "epoch": 2.0902462734931953, + "grad_norm": 0.05613867565989494, + "learning_rate": 
0.00010609838426571854, + "loss": 0.223, + "step": 25802 + }, + { + "epoch": 2.0903272845106935, + "grad_norm": 0.0661654844880104, + "learning_rate": 0.00010609388361312392, + "loss": 0.2581, + "step": 25803 + }, + { + "epoch": 2.0904082955281917, + "grad_norm": 0.04869081825017929, + "learning_rate": 0.00010608938296052928, + "loss": 0.2325, + "step": 25804 + }, + { + "epoch": 2.0904893065456904, + "grad_norm": 0.07372672110795975, + "learning_rate": 0.00010608488230793465, + "loss": 0.3026, + "step": 25805 + }, + { + "epoch": 2.0905703175631887, + "grad_norm": 0.05311240255832672, + "learning_rate": 0.00010608038165534002, + "loss": 0.2785, + "step": 25806 + }, + { + "epoch": 2.090651328580687, + "grad_norm": 0.06160484999418259, + "learning_rate": 0.00010607588100274541, + "loss": 0.2661, + "step": 25807 + }, + { + "epoch": 2.090732339598185, + "grad_norm": 0.053490664809942245, + "learning_rate": 0.00010607138035015078, + "loss": 0.2464, + "step": 25808 + }, + { + "epoch": 2.090813350615684, + "grad_norm": 0.06441934406757355, + "learning_rate": 0.00010606687969755616, + "loss": 0.2287, + "step": 25809 + }, + { + "epoch": 2.090894361633182, + "grad_norm": 0.06719013303518295, + "learning_rate": 0.00010606237904496153, + "loss": 0.2507, + "step": 25810 + }, + { + "epoch": 2.0909753726506803, + "grad_norm": 0.06515239924192429, + "learning_rate": 0.00010605787839236689, + "loss": 0.2787, + "step": 25811 + }, + { + "epoch": 2.091056383668179, + "grad_norm": 0.058618079870939255, + "learning_rate": 0.00010605337773977226, + "loss": 0.2328, + "step": 25812 + }, + { + "epoch": 2.0911373946856773, + "grad_norm": 0.06700004637241364, + "learning_rate": 0.00010604887708717765, + "loss": 0.2504, + "step": 25813 + }, + { + "epoch": 2.0912184057031755, + "grad_norm": 0.06595467776060104, + "learning_rate": 0.00010604437643458303, + "loss": 0.2316, + "step": 25814 + }, + { + "epoch": 2.091299416720674, + "grad_norm": 0.07155590504407883, + "learning_rate": 0.0001060398757819884, + "loss": 0.2837, + "step": 25815 + }, + { + "epoch": 2.0913804277381725, + "grad_norm": 0.06447424739599228, + "learning_rate": 0.00010603537512939377, + "loss": 0.2741, + "step": 25816 + }, + { + "epoch": 2.0914614387556707, + "grad_norm": 0.06656340509653091, + "learning_rate": 0.00010603087447679913, + "loss": 0.2896, + "step": 25817 + }, + { + "epoch": 2.091542449773169, + "grad_norm": 0.06265214085578918, + "learning_rate": 0.0001060263738242045, + "loss": 0.2384, + "step": 25818 + }, + { + "epoch": 2.0916234607906676, + "grad_norm": 0.06865495443344116, + "learning_rate": 0.00010602187317160989, + "loss": 0.3143, + "step": 25819 + }, + { + "epoch": 2.091704471808166, + "grad_norm": 0.06335429847240448, + "learning_rate": 0.00010601737251901527, + "loss": 0.2424, + "step": 25820 + }, + { + "epoch": 2.091785482825664, + "grad_norm": 0.06055642291903496, + "learning_rate": 0.00010601287186642064, + "loss": 0.2754, + "step": 25821 + }, + { + "epoch": 2.091866493843163, + "grad_norm": 0.06168762221932411, + "learning_rate": 0.00010600837121382601, + "loss": 0.2886, + "step": 25822 + }, + { + "epoch": 2.091947504860661, + "grad_norm": 0.0600765235722065, + "learning_rate": 0.00010600387056123137, + "loss": 0.2488, + "step": 25823 + }, + { + "epoch": 2.0920285158781593, + "grad_norm": 0.057513538748025894, + "learning_rate": 0.00010599936990863675, + "loss": 0.2352, + "step": 25824 + }, + { + "epoch": 2.092109526895658, + "grad_norm": 0.061202578246593475, + "learning_rate": 0.00010599486925604213, + "loss": 0.2575, + 
"step": 25825 + }, + { + "epoch": 2.0921905379131562, + "grad_norm": 0.06461769342422485, + "learning_rate": 0.00010599036860344751, + "loss": 0.2584, + "step": 25826 + }, + { + "epoch": 2.0922715489306545, + "grad_norm": 0.08148052543401718, + "learning_rate": 0.00010598586795085288, + "loss": 0.2752, + "step": 25827 + }, + { + "epoch": 2.0923525599481527, + "grad_norm": 0.05636552721261978, + "learning_rate": 0.00010598136729825826, + "loss": 0.2453, + "step": 25828 + }, + { + "epoch": 2.0924335709656514, + "grad_norm": 0.055334556847810745, + "learning_rate": 0.00010597686664566362, + "loss": 0.2603, + "step": 25829 + }, + { + "epoch": 2.0925145819831497, + "grad_norm": 0.05396450310945511, + "learning_rate": 0.00010597236599306899, + "loss": 0.2434, + "step": 25830 + }, + { + "epoch": 2.092595593000648, + "grad_norm": 0.06471575051546097, + "learning_rate": 0.00010596786534047438, + "loss": 0.2984, + "step": 25831 + }, + { + "epoch": 2.0926766040181466, + "grad_norm": 0.05728616937994957, + "learning_rate": 0.00010596336468787975, + "loss": 0.272, + "step": 25832 + }, + { + "epoch": 2.092757615035645, + "grad_norm": 0.0556185282766819, + "learning_rate": 0.00010595886403528512, + "loss": 0.2365, + "step": 25833 + }, + { + "epoch": 2.092838626053143, + "grad_norm": 0.04983407258987427, + "learning_rate": 0.0001059543633826905, + "loss": 0.2521, + "step": 25834 + }, + { + "epoch": 2.0929196370706418, + "grad_norm": 0.07000332325696945, + "learning_rate": 0.00010594986273009586, + "loss": 0.2536, + "step": 25835 + }, + { + "epoch": 2.09300064808814, + "grad_norm": 0.05987836793065071, + "learning_rate": 0.00010594536207750126, + "loss": 0.2305, + "step": 25836 + }, + { + "epoch": 2.0930816591056383, + "grad_norm": 0.0611238107085228, + "learning_rate": 0.00010594086142490662, + "loss": 0.2421, + "step": 25837 + }, + { + "epoch": 2.093162670123137, + "grad_norm": 0.0788748636841774, + "learning_rate": 0.00010593636077231199, + "loss": 0.2708, + "step": 25838 + }, + { + "epoch": 2.093243681140635, + "grad_norm": 0.05313669890165329, + "learning_rate": 0.00010593186011971737, + "loss": 0.246, + "step": 25839 + }, + { + "epoch": 2.0933246921581334, + "grad_norm": 0.06658217310905457, + "learning_rate": 0.00010592735946712274, + "loss": 0.27, + "step": 25840 + }, + { + "epoch": 2.0934057031756317, + "grad_norm": 0.06660610437393188, + "learning_rate": 0.0001059228588145281, + "loss": 0.3048, + "step": 25841 + }, + { + "epoch": 2.0934867141931304, + "grad_norm": 0.06018728017807007, + "learning_rate": 0.0001059183581619335, + "loss": 0.2395, + "step": 25842 + }, + { + "epoch": 2.0935677252106286, + "grad_norm": 0.056028787046670914, + "learning_rate": 0.00010591385750933886, + "loss": 0.263, + "step": 25843 + }, + { + "epoch": 2.093648736228127, + "grad_norm": 0.06321057677268982, + "learning_rate": 0.00010590935685674423, + "loss": 0.2256, + "step": 25844 + }, + { + "epoch": 2.0937297472456255, + "grad_norm": 0.08071082830429077, + "learning_rate": 0.00010590485620414961, + "loss": 0.3121, + "step": 25845 + }, + { + "epoch": 2.093810758263124, + "grad_norm": 0.06411097198724747, + "learning_rate": 0.00010590035555155498, + "loss": 0.2786, + "step": 25846 + }, + { + "epoch": 2.093891769280622, + "grad_norm": 0.059210870414972305, + "learning_rate": 0.00010589585489896034, + "loss": 0.2746, + "step": 25847 + }, + { + "epoch": 2.0939727802981207, + "grad_norm": 0.07945404201745987, + "learning_rate": 0.00010589135424636574, + "loss": 0.2947, + "step": 25848 + }, + { + "epoch": 2.094053791315619, 
+ "grad_norm": 0.05297846719622612, + "learning_rate": 0.0001058868535937711, + "loss": 0.2591, + "step": 25849 + }, + { + "epoch": 2.094134802333117, + "grad_norm": 0.07699709385633469, + "learning_rate": 0.00010588235294117647, + "loss": 0.2897, + "step": 25850 + }, + { + "epoch": 2.0942158133506155, + "grad_norm": 0.07000952959060669, + "learning_rate": 0.00010587785228858185, + "loss": 0.2837, + "step": 25851 + }, + { + "epoch": 2.094296824368114, + "grad_norm": 0.05717483162879944, + "learning_rate": 0.00010587335163598722, + "loss": 0.2496, + "step": 25852 + }, + { + "epoch": 2.0943778353856124, + "grad_norm": 0.058601513504981995, + "learning_rate": 0.00010586885098339258, + "loss": 0.26, + "step": 25853 + }, + { + "epoch": 2.0944588464031106, + "grad_norm": 0.07447142153978348, + "learning_rate": 0.00010586435033079798, + "loss": 0.2429, + "step": 25854 + }, + { + "epoch": 2.0945398574206093, + "grad_norm": 0.05477110669016838, + "learning_rate": 0.00010585984967820334, + "loss": 0.2115, + "step": 25855 + }, + { + "epoch": 2.0946208684381076, + "grad_norm": 0.061628323048353195, + "learning_rate": 0.00010585534902560872, + "loss": 0.3228, + "step": 25856 + }, + { + "epoch": 2.094701879455606, + "grad_norm": 0.05916190892457962, + "learning_rate": 0.00010585084837301409, + "loss": 0.2674, + "step": 25857 + }, + { + "epoch": 2.0947828904731045, + "grad_norm": 0.058797094970941544, + "learning_rate": 0.00010584634772041946, + "loss": 0.2531, + "step": 25858 + }, + { + "epoch": 2.0948639014906028, + "grad_norm": 0.050141118466854095, + "learning_rate": 0.00010584184706782482, + "loss": 0.2569, + "step": 25859 + }, + { + "epoch": 2.094944912508101, + "grad_norm": 0.07009284943342209, + "learning_rate": 0.00010583734641523022, + "loss": 0.2686, + "step": 25860 + }, + { + "epoch": 2.0950259235255997, + "grad_norm": 0.06598873436450958, + "learning_rate": 0.00010583284576263558, + "loss": 0.2529, + "step": 25861 + }, + { + "epoch": 2.095106934543098, + "grad_norm": 0.05313203111290932, + "learning_rate": 0.00010582834511004096, + "loss": 0.2521, + "step": 25862 + }, + { + "epoch": 2.095187945560596, + "grad_norm": 0.053829628974199295, + "learning_rate": 0.00010582384445744633, + "loss": 0.2272, + "step": 25863 + }, + { + "epoch": 2.0952689565780944, + "grad_norm": 0.0683264285326004, + "learning_rate": 0.0001058193438048517, + "loss": 0.2467, + "step": 25864 + }, + { + "epoch": 2.095349967595593, + "grad_norm": 0.07350464165210724, + "learning_rate": 0.00010581484315225709, + "loss": 0.3027, + "step": 25865 + }, + { + "epoch": 2.0954309786130914, + "grad_norm": 0.07851356267929077, + "learning_rate": 0.00010581034249966247, + "loss": 0.3086, + "step": 25866 + }, + { + "epoch": 2.0955119896305896, + "grad_norm": 0.06328796595335007, + "learning_rate": 0.00010580584184706783, + "loss": 0.2429, + "step": 25867 + }, + { + "epoch": 2.0955930006480883, + "grad_norm": 0.06569570302963257, + "learning_rate": 0.0001058013411944732, + "loss": 0.2965, + "step": 25868 + }, + { + "epoch": 2.0956740116655865, + "grad_norm": 0.05470426380634308, + "learning_rate": 0.00010579684054187857, + "loss": 0.2537, + "step": 25869 + }, + { + "epoch": 2.095755022683085, + "grad_norm": 0.06650625169277191, + "learning_rate": 0.00010579233988928395, + "loss": 0.2746, + "step": 25870 + }, + { + "epoch": 2.0958360337005835, + "grad_norm": 0.05982498824596405, + "learning_rate": 0.00010578783923668933, + "loss": 0.2607, + "step": 25871 + }, + { + "epoch": 2.0959170447180817, + "grad_norm": 0.05677129328250885, + 
"learning_rate": 0.00010578333858409471, + "loss": 0.2816, + "step": 25872 + }, + { + "epoch": 2.09599805573558, + "grad_norm": 0.060105811804533005, + "learning_rate": 0.00010577883793150007, + "loss": 0.2755, + "step": 25873 + }, + { + "epoch": 2.096079066753078, + "grad_norm": 0.06248025223612785, + "learning_rate": 0.00010577433727890544, + "loss": 0.2758, + "step": 25874 + }, + { + "epoch": 2.096160077770577, + "grad_norm": 0.0530896931886673, + "learning_rate": 0.00010576983662631081, + "loss": 0.2461, + "step": 25875 + }, + { + "epoch": 2.096241088788075, + "grad_norm": 0.05905615910887718, + "learning_rate": 0.00010576533597371619, + "loss": 0.2714, + "step": 25876 + }, + { + "epoch": 2.0963220998055734, + "grad_norm": 0.06350836902856827, + "learning_rate": 0.00010576083532112158, + "loss": 0.2623, + "step": 25877 + }, + { + "epoch": 2.096403110823072, + "grad_norm": 0.07517584413290024, + "learning_rate": 0.00010575633466852695, + "loss": 0.2645, + "step": 25878 + }, + { + "epoch": 2.0964841218405703, + "grad_norm": 0.0731901228427887, + "learning_rate": 0.00010575183401593232, + "loss": 0.2511, + "step": 25879 + }, + { + "epoch": 2.0965651328580686, + "grad_norm": 0.06542081385850906, + "learning_rate": 0.00010574733336333768, + "loss": 0.2262, + "step": 25880 + }, + { + "epoch": 2.0966461438755672, + "grad_norm": 0.0583370216190815, + "learning_rate": 0.00010574283271074306, + "loss": 0.2004, + "step": 25881 + }, + { + "epoch": 2.0967271548930655, + "grad_norm": 0.0566578283905983, + "learning_rate": 0.00010573833205814843, + "loss": 0.2464, + "step": 25882 + }, + { + "epoch": 2.0968081659105637, + "grad_norm": 0.06367211788892746, + "learning_rate": 0.00010573383140555382, + "loss": 0.2435, + "step": 25883 + }, + { + "epoch": 2.0968891769280624, + "grad_norm": 0.07052043080329895, + "learning_rate": 0.00010572933075295919, + "loss": 0.2647, + "step": 25884 + }, + { + "epoch": 2.0969701879455607, + "grad_norm": 0.06257165223360062, + "learning_rate": 0.00010572483010036456, + "loss": 0.2382, + "step": 25885 + }, + { + "epoch": 2.097051198963059, + "grad_norm": 0.07620043307542801, + "learning_rate": 0.00010572032944776992, + "loss": 0.2354, + "step": 25886 + }, + { + "epoch": 2.097132209980557, + "grad_norm": 0.06310175359249115, + "learning_rate": 0.0001057158287951753, + "loss": 0.2355, + "step": 25887 + }, + { + "epoch": 2.097213220998056, + "grad_norm": 0.06670668721199036, + "learning_rate": 0.00010571132814258069, + "loss": 0.2478, + "step": 25888 + }, + { + "epoch": 2.097294232015554, + "grad_norm": 0.061407558619976044, + "learning_rate": 0.00010570682748998606, + "loss": 0.2442, + "step": 25889 + }, + { + "epoch": 2.0973752430330523, + "grad_norm": 0.07137828320264816, + "learning_rate": 0.00010570232683739143, + "loss": 0.2681, + "step": 25890 + }, + { + "epoch": 2.097456254050551, + "grad_norm": 0.06494369357824326, + "learning_rate": 0.0001056978261847968, + "loss": 0.2408, + "step": 25891 + }, + { + "epoch": 2.0975372650680493, + "grad_norm": 0.06404042989015579, + "learning_rate": 0.00010569332553220217, + "loss": 0.2647, + "step": 25892 + }, + { + "epoch": 2.0976182760855475, + "grad_norm": 0.0751018077135086, + "learning_rate": 0.00010568882487960754, + "loss": 0.2569, + "step": 25893 + }, + { + "epoch": 2.097699287103046, + "grad_norm": 0.05569750815629959, + "learning_rate": 0.00010568432422701293, + "loss": 0.2303, + "step": 25894 + }, + { + "epoch": 2.0977802981205445, + "grad_norm": 0.06106451153755188, + "learning_rate": 0.0001056798235744183, + "loss": 
0.2732, + "step": 25895 + }, + { + "epoch": 2.0978613091380427, + "grad_norm": 0.06857486069202423, + "learning_rate": 0.00010567532292182367, + "loss": 0.2592, + "step": 25896 + }, + { + "epoch": 2.097942320155541, + "grad_norm": 0.0606396459043026, + "learning_rate": 0.00010567082226922905, + "loss": 0.2216, + "step": 25897 + }, + { + "epoch": 2.0980233311730396, + "grad_norm": 0.05696889013051987, + "learning_rate": 0.00010566632161663441, + "loss": 0.2482, + "step": 25898 + }, + { + "epoch": 2.098104342190538, + "grad_norm": 0.05583791434764862, + "learning_rate": 0.00010566182096403978, + "loss": 0.2342, + "step": 25899 + }, + { + "epoch": 2.098185353208036, + "grad_norm": 0.061783235520124435, + "learning_rate": 0.00010565732031144517, + "loss": 0.2278, + "step": 25900 + }, + { + "epoch": 2.098266364225535, + "grad_norm": 0.06409650295972824, + "learning_rate": 0.00010565281965885054, + "loss": 0.2559, + "step": 25901 + }, + { + "epoch": 2.098347375243033, + "grad_norm": 0.05956130847334862, + "learning_rate": 0.00010564831900625592, + "loss": 0.2202, + "step": 25902 + }, + { + "epoch": 2.0984283862605313, + "grad_norm": 0.04956075921654701, + "learning_rate": 0.00010564381835366129, + "loss": 0.2025, + "step": 25903 + }, + { + "epoch": 2.09850939727803, + "grad_norm": 0.06383761018514633, + "learning_rate": 0.00010563931770106665, + "loss": 0.2671, + "step": 25904 + }, + { + "epoch": 2.0985904082955282, + "grad_norm": 0.07448650151491165, + "learning_rate": 0.00010563481704847202, + "loss": 0.2972, + "step": 25905 + }, + { + "epoch": 2.0986714193130265, + "grad_norm": 0.06987703591585159, + "learning_rate": 0.00010563031639587741, + "loss": 0.2953, + "step": 25906 + }, + { + "epoch": 2.098752430330525, + "grad_norm": 0.06196024641394615, + "learning_rate": 0.00010562581574328278, + "loss": 0.2541, + "step": 25907 + }, + { + "epoch": 2.0988334413480234, + "grad_norm": 0.06016739085316658, + "learning_rate": 0.00010562131509068816, + "loss": 0.2748, + "step": 25908 + }, + { + "epoch": 2.0989144523655217, + "grad_norm": 0.05822562053799629, + "learning_rate": 0.00010561681443809353, + "loss": 0.2236, + "step": 25909 + }, + { + "epoch": 2.09899546338302, + "grad_norm": 0.05755983665585518, + "learning_rate": 0.00010561231378549889, + "loss": 0.2493, + "step": 25910 + }, + { + "epoch": 2.0990764744005186, + "grad_norm": 0.056454241275787354, + "learning_rate": 0.00010560781313290426, + "loss": 0.2218, + "step": 25911 + }, + { + "epoch": 2.099157485418017, + "grad_norm": 0.07454843074083328, + "learning_rate": 0.00010560331248030965, + "loss": 0.2426, + "step": 25912 + }, + { + "epoch": 2.099238496435515, + "grad_norm": 0.06113918125629425, + "learning_rate": 0.00010559881182771503, + "loss": 0.2593, + "step": 25913 + }, + { + "epoch": 2.0993195074530138, + "grad_norm": 0.06242956966161728, + "learning_rate": 0.0001055943111751204, + "loss": 0.2403, + "step": 25914 + }, + { + "epoch": 2.099400518470512, + "grad_norm": 0.06737563759088516, + "learning_rate": 0.00010558981052252577, + "loss": 0.2492, + "step": 25915 + }, + { + "epoch": 2.0994815294880103, + "grad_norm": 0.05659928917884827, + "learning_rate": 0.00010558530986993113, + "loss": 0.2394, + "step": 25916 + }, + { + "epoch": 2.099562540505509, + "grad_norm": 0.06783798336982727, + "learning_rate": 0.00010558080921733653, + "loss": 0.2747, + "step": 25917 + }, + { + "epoch": 2.099643551523007, + "grad_norm": 0.06404232233762741, + "learning_rate": 0.0001055763085647419, + "loss": 0.2721, + "step": 25918 + }, + { + "epoch": 
2.0997245625405054, + "grad_norm": 0.06794023513793945, + "learning_rate": 0.00010557180791214727, + "loss": 0.2454, + "step": 25919 + }, + { + "epoch": 2.0998055735580037, + "grad_norm": 0.06422477215528488, + "learning_rate": 0.00010556730725955264, + "loss": 0.292, + "step": 25920 + }, + { + "epoch": 2.0998865845755024, + "grad_norm": 0.05315267667174339, + "learning_rate": 0.00010556280660695801, + "loss": 0.2484, + "step": 25921 + }, + { + "epoch": 2.0999675955930006, + "grad_norm": 0.053334176540374756, + "learning_rate": 0.00010555830595436337, + "loss": 0.3048, + "step": 25922 + }, + { + "epoch": 2.100048606610499, + "grad_norm": 0.06307677179574966, + "learning_rate": 0.00010555380530176878, + "loss": 0.2724, + "step": 25923 + }, + { + "epoch": 2.1001296176279975, + "grad_norm": 0.06282954663038254, + "learning_rate": 0.00010554930464917414, + "loss": 0.2642, + "step": 25924 + }, + { + "epoch": 2.100210628645496, + "grad_norm": 0.0542440190911293, + "learning_rate": 0.00010554480399657951, + "loss": 0.2537, + "step": 25925 + }, + { + "epoch": 2.100291639662994, + "grad_norm": 0.0643736720085144, + "learning_rate": 0.00010554030334398488, + "loss": 0.2486, + "step": 25926 + }, + { + "epoch": 2.1003726506804927, + "grad_norm": 0.06453826278448105, + "learning_rate": 0.00010553580269139026, + "loss": 0.2951, + "step": 25927 + }, + { + "epoch": 2.100453661697991, + "grad_norm": 0.06805769354104996, + "learning_rate": 0.00010553130203879562, + "loss": 0.2796, + "step": 25928 + }, + { + "epoch": 2.100534672715489, + "grad_norm": 0.05347653478384018, + "learning_rate": 0.00010552680138620102, + "loss": 0.2501, + "step": 25929 + }, + { + "epoch": 2.100615683732988, + "grad_norm": 0.06046787276864052, + "learning_rate": 0.00010552230073360638, + "loss": 0.2579, + "step": 25930 + }, + { + "epoch": 2.100696694750486, + "grad_norm": 0.057562969624996185, + "learning_rate": 0.00010551780008101175, + "loss": 0.2422, + "step": 25931 + }, + { + "epoch": 2.1007777057679844, + "grad_norm": 0.0631248950958252, + "learning_rate": 0.00010551329942841712, + "loss": 0.274, + "step": 25932 + }, + { + "epoch": 2.1008587167854826, + "grad_norm": 0.06080873683094978, + "learning_rate": 0.0001055087987758225, + "loss": 0.2253, + "step": 25933 + }, + { + "epoch": 2.1009397278029813, + "grad_norm": 0.06965567171573639, + "learning_rate": 0.00010550429812322786, + "loss": 0.2816, + "step": 25934 + }, + { + "epoch": 2.1010207388204796, + "grad_norm": 0.07264664769172668, + "learning_rate": 0.00010549979747063326, + "loss": 0.26, + "step": 25935 + }, + { + "epoch": 2.101101749837978, + "grad_norm": 0.0596703365445137, + "learning_rate": 0.00010549529681803862, + "loss": 0.2154, + "step": 25936 + }, + { + "epoch": 2.1011827608554765, + "grad_norm": 0.06607351452112198, + "learning_rate": 0.00010549079616544399, + "loss": 0.2913, + "step": 25937 + }, + { + "epoch": 2.1012637718729748, + "grad_norm": 0.054247647523880005, + "learning_rate": 0.00010548629551284937, + "loss": 0.2655, + "step": 25938 + }, + { + "epoch": 2.101344782890473, + "grad_norm": 0.07472404092550278, + "learning_rate": 0.00010548179486025474, + "loss": 0.2869, + "step": 25939 + }, + { + "epoch": 2.1014257939079717, + "grad_norm": 0.05823759734630585, + "learning_rate": 0.00010547729420766013, + "loss": 0.227, + "step": 25940 + }, + { + "epoch": 2.10150680492547, + "grad_norm": 0.06206611543893814, + "learning_rate": 0.0001054727935550655, + "loss": 0.2557, + "step": 25941 + }, + { + "epoch": 2.101587815942968, + "grad_norm": 0.06391151249408722, 
+ "learning_rate": 0.00010546829290247086, + "loss": 0.2689, + "step": 25942 + }, + { + "epoch": 2.1016688269604664, + "grad_norm": 0.04861528426408768, + "learning_rate": 0.00010546379224987623, + "loss": 0.2413, + "step": 25943 + }, + { + "epoch": 2.101749837977965, + "grad_norm": 0.06416679918766022, + "learning_rate": 0.00010545929159728161, + "loss": 0.2517, + "step": 25944 + }, + { + "epoch": 2.1018308489954634, + "grad_norm": 0.05032241344451904, + "learning_rate": 0.00010545479094468698, + "loss": 0.2621, + "step": 25945 + }, + { + "epoch": 2.1019118600129616, + "grad_norm": 0.06672210991382599, + "learning_rate": 0.00010545029029209237, + "loss": 0.2304, + "step": 25946 + }, + { + "epoch": 2.1019928710304603, + "grad_norm": 0.05387113615870476, + "learning_rate": 0.00010544578963949774, + "loss": 0.2592, + "step": 25947 + }, + { + "epoch": 2.1020738820479585, + "grad_norm": 0.05849631503224373, + "learning_rate": 0.00010544128898690312, + "loss": 0.2468, + "step": 25948 + }, + { + "epoch": 2.1021548930654568, + "grad_norm": 0.07469534128904343, + "learning_rate": 0.00010543678833430848, + "loss": 0.2787, + "step": 25949 + }, + { + "epoch": 2.1022359040829555, + "grad_norm": 0.06344032287597656, + "learning_rate": 0.00010543228768171385, + "loss": 0.2459, + "step": 25950 + }, + { + "epoch": 2.1023169151004537, + "grad_norm": 0.06518208235502243, + "learning_rate": 0.00010542778702911922, + "loss": 0.2265, + "step": 25951 + }, + { + "epoch": 2.102397926117952, + "grad_norm": 0.054074712097644806, + "learning_rate": 0.00010542328637652461, + "loss": 0.2395, + "step": 25952 + }, + { + "epoch": 2.1024789371354506, + "grad_norm": 0.0657474473118782, + "learning_rate": 0.00010541878572392998, + "loss": 0.3097, + "step": 25953 + }, + { + "epoch": 2.102559948152949, + "grad_norm": 0.06637993454933167, + "learning_rate": 0.00010541428507133536, + "loss": 0.286, + "step": 25954 + }, + { + "epoch": 2.102640959170447, + "grad_norm": 0.06281334161758423, + "learning_rate": 0.00010540978441874072, + "loss": 0.2775, + "step": 25955 + }, + { + "epoch": 2.1027219701879454, + "grad_norm": 0.060356464236974716, + "learning_rate": 0.00010540528376614609, + "loss": 0.2912, + "step": 25956 + }, + { + "epoch": 2.102802981205444, + "grad_norm": 0.07013566046953201, + "learning_rate": 0.00010540078311355146, + "loss": 0.3275, + "step": 25957 + }, + { + "epoch": 2.1028839922229423, + "grad_norm": 0.05440182611346245, + "learning_rate": 0.00010539628246095685, + "loss": 0.2983, + "step": 25958 + }, + { + "epoch": 2.1029650032404406, + "grad_norm": 0.05539843067526817, + "learning_rate": 0.00010539178180836222, + "loss": 0.2443, + "step": 25959 + }, + { + "epoch": 2.1030460142579392, + "grad_norm": 0.05578560009598732, + "learning_rate": 0.0001053872811557676, + "loss": 0.2418, + "step": 25960 + }, + { + "epoch": 2.1031270252754375, + "grad_norm": 0.0695006400346756, + "learning_rate": 0.00010538278050317296, + "loss": 0.2742, + "step": 25961 + }, + { + "epoch": 2.1032080362929357, + "grad_norm": 0.0572974868118763, + "learning_rate": 0.00010537827985057833, + "loss": 0.2737, + "step": 25962 + }, + { + "epoch": 2.1032890473104344, + "grad_norm": 0.07014113664627075, + "learning_rate": 0.0001053737791979837, + "loss": 0.2304, + "step": 25963 + }, + { + "epoch": 2.1033700583279327, + "grad_norm": 0.07061361521482468, + "learning_rate": 0.00010536927854538909, + "loss": 0.2436, + "step": 25964 + }, + { + "epoch": 2.103451069345431, + "grad_norm": 0.079985611140728, + "learning_rate": 0.00010536477789279447, + 
"loss": 0.3183, + "step": 25965 + }, + { + "epoch": 2.103532080362929, + "grad_norm": 0.07024233043193817, + "learning_rate": 0.00010536027724019984, + "loss": 0.257, + "step": 25966 + }, + { + "epoch": 2.103613091380428, + "grad_norm": 0.055939681828022, + "learning_rate": 0.0001053557765876052, + "loss": 0.2686, + "step": 25967 + }, + { + "epoch": 2.103694102397926, + "grad_norm": 0.06374896317720413, + "learning_rate": 0.00010535127593501057, + "loss": 0.2437, + "step": 25968 + }, + { + "epoch": 2.1037751134154243, + "grad_norm": 0.060883134603500366, + "learning_rate": 0.00010534677528241596, + "loss": 0.2739, + "step": 25969 + }, + { + "epoch": 2.103856124432923, + "grad_norm": 0.06968491524457932, + "learning_rate": 0.00010534227462982133, + "loss": 0.243, + "step": 25970 + }, + { + "epoch": 2.1039371354504213, + "grad_norm": 0.06380768120288849, + "learning_rate": 0.00010533777397722671, + "loss": 0.2462, + "step": 25971 + }, + { + "epoch": 2.1040181464679195, + "grad_norm": 0.0704575926065445, + "learning_rate": 0.00010533327332463208, + "loss": 0.259, + "step": 25972 + }, + { + "epoch": 2.104099157485418, + "grad_norm": 0.05365743860602379, + "learning_rate": 0.00010532877267203744, + "loss": 0.2534, + "step": 25973 + }, + { + "epoch": 2.1041801685029164, + "grad_norm": 0.06499110907316208, + "learning_rate": 0.00010532427201944282, + "loss": 0.2802, + "step": 25974 + }, + { + "epoch": 2.1042611795204147, + "grad_norm": 0.06151876598596573, + "learning_rate": 0.0001053197713668482, + "loss": 0.2415, + "step": 25975 + }, + { + "epoch": 2.1043421905379134, + "grad_norm": 0.06648625433444977, + "learning_rate": 0.00010531527071425358, + "loss": 0.2563, + "step": 25976 + }, + { + "epoch": 2.1044232015554116, + "grad_norm": 0.057899028062820435, + "learning_rate": 0.00010531077006165895, + "loss": 0.2355, + "step": 25977 + }, + { + "epoch": 2.10450421257291, + "grad_norm": 0.07268321514129639, + "learning_rate": 0.00010530626940906432, + "loss": 0.2558, + "step": 25978 + }, + { + "epoch": 2.104585223590408, + "grad_norm": 0.05207452178001404, + "learning_rate": 0.00010530176875646968, + "loss": 0.2317, + "step": 25979 + }, + { + "epoch": 2.104666234607907, + "grad_norm": 0.052503883838653564, + "learning_rate": 0.00010529726810387506, + "loss": 0.2387, + "step": 25980 + }, + { + "epoch": 2.104747245625405, + "grad_norm": 0.05151834338903427, + "learning_rate": 0.00010529276745128044, + "loss": 0.2316, + "step": 25981 + }, + { + "epoch": 2.1048282566429033, + "grad_norm": 0.07107538729906082, + "learning_rate": 0.00010528826679868582, + "loss": 0.3077, + "step": 25982 + }, + { + "epoch": 2.104909267660402, + "grad_norm": 0.05292705073952675, + "learning_rate": 0.00010528376614609119, + "loss": 0.2599, + "step": 25983 + }, + { + "epoch": 2.1049902786779002, + "grad_norm": 0.07342039048671722, + "learning_rate": 0.00010527926549349656, + "loss": 0.2586, + "step": 25984 + }, + { + "epoch": 2.1050712896953985, + "grad_norm": 0.05398553982377052, + "learning_rate": 0.00010527476484090192, + "loss": 0.2687, + "step": 25985 + }, + { + "epoch": 2.105152300712897, + "grad_norm": 0.0721733421087265, + "learning_rate": 0.0001052702641883073, + "loss": 0.2903, + "step": 25986 + }, + { + "epoch": 2.1052333117303954, + "grad_norm": 0.07198905199766159, + "learning_rate": 0.00010526576353571269, + "loss": 0.2692, + "step": 25987 + }, + { + "epoch": 2.1053143227478937, + "grad_norm": 0.06857843697071075, + "learning_rate": 0.00010526126288311806, + "loss": 0.2703, + "step": 25988 + }, + { + "epoch": 
2.105395333765392, + "grad_norm": 0.0674414411187172, + "learning_rate": 0.00010525676223052343, + "loss": 0.2731, + "step": 25989 + }, + { + "epoch": 2.1054763447828906, + "grad_norm": 0.05459675192832947, + "learning_rate": 0.0001052522615779288, + "loss": 0.2306, + "step": 25990 + }, + { + "epoch": 2.105557355800389, + "grad_norm": 0.05658693239092827, + "learning_rate": 0.00010524776092533417, + "loss": 0.2499, + "step": 25991 + }, + { + "epoch": 2.105638366817887, + "grad_norm": 0.05841512978076935, + "learning_rate": 0.00010524326027273957, + "loss": 0.2647, + "step": 25992 + }, + { + "epoch": 2.1057193778353858, + "grad_norm": 0.06220696493983269, + "learning_rate": 0.00010523875962014493, + "loss": 0.2774, + "step": 25993 + }, + { + "epoch": 2.105800388852884, + "grad_norm": 0.06404992938041687, + "learning_rate": 0.0001052342589675503, + "loss": 0.227, + "step": 25994 + }, + { + "epoch": 2.1058813998703823, + "grad_norm": 0.08344966918230057, + "learning_rate": 0.00010522975831495567, + "loss": 0.3065, + "step": 25995 + }, + { + "epoch": 2.105962410887881, + "grad_norm": 0.04662399739027023, + "learning_rate": 0.00010522525766236105, + "loss": 0.2443, + "step": 25996 + }, + { + "epoch": 2.106043421905379, + "grad_norm": 0.07650208473205566, + "learning_rate": 0.00010522075700976641, + "loss": 0.2978, + "step": 25997 + }, + { + "epoch": 2.1061244329228774, + "grad_norm": 0.06667305529117584, + "learning_rate": 0.00010521625635717181, + "loss": 0.2549, + "step": 25998 + }, + { + "epoch": 2.1062054439403757, + "grad_norm": 0.05364314094185829, + "learning_rate": 0.00010521175570457717, + "loss": 0.2474, + "step": 25999 + }, + { + "epoch": 2.1062864549578744, + "grad_norm": 0.07898110151290894, + "learning_rate": 0.00010520725505198254, + "loss": 0.2559, + "step": 26000 + }, + { + "epoch": 2.1063674659753726, + "grad_norm": 0.0522674061357975, + "learning_rate": 0.00010520275439938792, + "loss": 0.2427, + "step": 26001 + }, + { + "epoch": 2.106448476992871, + "grad_norm": 0.06294900923967361, + "learning_rate": 0.00010519825374679329, + "loss": 0.2572, + "step": 26002 + }, + { + "epoch": 2.1065294880103695, + "grad_norm": 0.06288789212703705, + "learning_rate": 0.00010519375309419865, + "loss": 0.2602, + "step": 26003 + }, + { + "epoch": 2.106610499027868, + "grad_norm": 0.05611900985240936, + "learning_rate": 0.00010518925244160405, + "loss": 0.2704, + "step": 26004 + }, + { + "epoch": 2.106691510045366, + "grad_norm": 0.06647303700447083, + "learning_rate": 0.00010518475178900941, + "loss": 0.2875, + "step": 26005 + }, + { + "epoch": 2.1067725210628647, + "grad_norm": 0.05431298166513443, + "learning_rate": 0.00010518025113641478, + "loss": 0.2415, + "step": 26006 + }, + { + "epoch": 2.106853532080363, + "grad_norm": 0.07187354564666748, + "learning_rate": 0.00010517575048382016, + "loss": 0.3086, + "step": 26007 + }, + { + "epoch": 2.106934543097861, + "grad_norm": 0.06193733587861061, + "learning_rate": 0.00010517124983122553, + "loss": 0.2618, + "step": 26008 + }, + { + "epoch": 2.10701555411536, + "grad_norm": 0.07977601140737534, + "learning_rate": 0.00010516674917863089, + "loss": 0.2584, + "step": 26009 + }, + { + "epoch": 2.107096565132858, + "grad_norm": 0.07653692364692688, + "learning_rate": 0.00010516224852603629, + "loss": 0.2764, + "step": 26010 + }, + { + "epoch": 2.1071775761503564, + "grad_norm": 0.05683164671063423, + "learning_rate": 0.00010515774787344165, + "loss": 0.2385, + "step": 26011 + }, + { + "epoch": 2.1072585871678546, + "grad_norm": 
0.05809834226965904, + "learning_rate": 0.00010515324722084703, + "loss": 0.2453, + "step": 26012 + }, + { + "epoch": 2.1073395981853533, + "grad_norm": 0.06329745799303055, + "learning_rate": 0.0001051487465682524, + "loss": 0.2541, + "step": 26013 + }, + { + "epoch": 2.1074206092028516, + "grad_norm": 0.06667368859052658, + "learning_rate": 0.00010514424591565777, + "loss": 0.2378, + "step": 26014 + }, + { + "epoch": 2.10750162022035, + "grad_norm": 0.07000331580638885, + "learning_rate": 0.00010513974526306313, + "loss": 0.3256, + "step": 26015 + }, + { + "epoch": 2.1075826312378485, + "grad_norm": 0.056671325117349625, + "learning_rate": 0.00010513524461046853, + "loss": 0.2131, + "step": 26016 + }, + { + "epoch": 2.1076636422553467, + "grad_norm": 0.05919632315635681, + "learning_rate": 0.00010513074395787391, + "loss": 0.2216, + "step": 26017 + }, + { + "epoch": 2.107744653272845, + "grad_norm": 0.06502486020326614, + "learning_rate": 0.00010512624330527927, + "loss": 0.2587, + "step": 26018 + }, + { + "epoch": 2.1078256642903437, + "grad_norm": 0.06771452724933624, + "learning_rate": 0.00010512174265268464, + "loss": 0.2489, + "step": 26019 + }, + { + "epoch": 2.107906675307842, + "grad_norm": 0.07498066127300262, + "learning_rate": 0.00010511724200009001, + "loss": 0.2853, + "step": 26020 + }, + { + "epoch": 2.10798768632534, + "grad_norm": 0.05339035019278526, + "learning_rate": 0.0001051127413474954, + "loss": 0.2318, + "step": 26021 + }, + { + "epoch": 2.1080686973428384, + "grad_norm": 0.06364471465349197, + "learning_rate": 0.00010510824069490078, + "loss": 0.3141, + "step": 26022 + }, + { + "epoch": 2.108149708360337, + "grad_norm": 0.06906301528215408, + "learning_rate": 0.00010510374004230615, + "loss": 0.2675, + "step": 26023 + }, + { + "epoch": 2.1082307193778353, + "grad_norm": 0.05944638326764107, + "learning_rate": 0.00010509923938971151, + "loss": 0.2374, + "step": 26024 + }, + { + "epoch": 2.1083117303953336, + "grad_norm": 0.06184754893183708, + "learning_rate": 0.00010509473873711688, + "loss": 0.2586, + "step": 26025 + }, + { + "epoch": 2.1083927414128323, + "grad_norm": 0.06255876272916794, + "learning_rate": 0.00010509023808452226, + "loss": 0.2445, + "step": 26026 + }, + { + "epoch": 2.1084737524303305, + "grad_norm": 0.06641737371683121, + "learning_rate": 0.00010508573743192764, + "loss": 0.2396, + "step": 26027 + }, + { + "epoch": 2.1085547634478288, + "grad_norm": 0.05015082284808159, + "learning_rate": 0.00010508123677933302, + "loss": 0.2394, + "step": 26028 + }, + { + "epoch": 2.1086357744653275, + "grad_norm": 0.09021834284067154, + "learning_rate": 0.00010507673612673839, + "loss": 0.2919, + "step": 26029 + }, + { + "epoch": 2.1087167854828257, + "grad_norm": 0.06203924119472504, + "learning_rate": 0.00010507223547414375, + "loss": 0.2747, + "step": 26030 + }, + { + "epoch": 2.108797796500324, + "grad_norm": 0.048285387456417084, + "learning_rate": 0.00010506773482154912, + "loss": 0.2308, + "step": 26031 + }, + { + "epoch": 2.108878807517822, + "grad_norm": 0.05346948280930519, + "learning_rate": 0.0001050632341689545, + "loss": 0.2085, + "step": 26032 + }, + { + "epoch": 2.108959818535321, + "grad_norm": 0.054928600788116455, + "learning_rate": 0.00010505873351635988, + "loss": 0.2359, + "step": 26033 + }, + { + "epoch": 2.109040829552819, + "grad_norm": 0.05738217011094093, + "learning_rate": 0.00010505423286376526, + "loss": 0.205, + "step": 26034 + }, + { + "epoch": 2.1091218405703174, + "grad_norm": 0.06716038286685944, + "learning_rate": 
0.00010504973221117063, + "loss": 0.2708, + "step": 26035 + }, + { + "epoch": 2.109202851587816, + "grad_norm": 0.0686250627040863, + "learning_rate": 0.00010504523155857599, + "loss": 0.258, + "step": 26036 + }, + { + "epoch": 2.1092838626053143, + "grad_norm": 0.058074306696653366, + "learning_rate": 0.00010504073090598137, + "loss": 0.2406, + "step": 26037 + }, + { + "epoch": 2.1093648736228126, + "grad_norm": 0.06594202667474747, + "learning_rate": 0.00010503623025338674, + "loss": 0.2912, + "step": 26038 + }, + { + "epoch": 2.1094458846403112, + "grad_norm": 0.0583692230284214, + "learning_rate": 0.00010503172960079213, + "loss": 0.2266, + "step": 26039 + }, + { + "epoch": 2.1095268956578095, + "grad_norm": 0.05876157432794571, + "learning_rate": 0.0001050272289481975, + "loss": 0.2621, + "step": 26040 + }, + { + "epoch": 2.1096079066753077, + "grad_norm": 0.057510919868946075, + "learning_rate": 0.00010502272829560287, + "loss": 0.2414, + "step": 26041 + }, + { + "epoch": 2.1096889176928064, + "grad_norm": 0.06362812966108322, + "learning_rate": 0.00010501822764300823, + "loss": 0.2567, + "step": 26042 + }, + { + "epoch": 2.1097699287103047, + "grad_norm": 0.06669417023658752, + "learning_rate": 0.00010501372699041361, + "loss": 0.2412, + "step": 26043 + }, + { + "epoch": 2.109850939727803, + "grad_norm": 0.0634775161743164, + "learning_rate": 0.00010500922633781898, + "loss": 0.2767, + "step": 26044 + }, + { + "epoch": 2.109931950745301, + "grad_norm": 0.07396277785301208, + "learning_rate": 0.00010500472568522437, + "loss": 0.2799, + "step": 26045 + }, + { + "epoch": 2.1100129617628, + "grad_norm": 0.056821681559085846, + "learning_rate": 0.00010500022503262974, + "loss": 0.2229, + "step": 26046 + }, + { + "epoch": 2.110093972780298, + "grad_norm": 0.05848122015595436, + "learning_rate": 0.00010499572438003512, + "loss": 0.2295, + "step": 26047 + }, + { + "epoch": 2.1101749837977963, + "grad_norm": 0.08277922123670578, + "learning_rate": 0.00010499122372744048, + "loss": 0.2667, + "step": 26048 + }, + { + "epoch": 2.110255994815295, + "grad_norm": 0.06810778379440308, + "learning_rate": 0.00010498672307484585, + "loss": 0.274, + "step": 26049 + }, + { + "epoch": 2.1103370058327933, + "grad_norm": 0.07550406455993652, + "learning_rate": 0.00010498222242225124, + "loss": 0.2594, + "step": 26050 + }, + { + "epoch": 2.1104180168502915, + "grad_norm": 0.06279616802930832, + "learning_rate": 0.00010497772176965661, + "loss": 0.28, + "step": 26051 + }, + { + "epoch": 2.11049902786779, + "grad_norm": 0.060095228254795074, + "learning_rate": 0.00010497322111706198, + "loss": 0.238, + "step": 26052 + }, + { + "epoch": 2.1105800388852884, + "grad_norm": 0.061981070786714554, + "learning_rate": 0.00010496872046446736, + "loss": 0.2696, + "step": 26053 + }, + { + "epoch": 2.1106610499027867, + "grad_norm": 0.06327757984399796, + "learning_rate": 0.00010496421981187272, + "loss": 0.2771, + "step": 26054 + }, + { + "epoch": 2.110742060920285, + "grad_norm": 0.07830698788166046, + "learning_rate": 0.00010495971915927809, + "loss": 0.292, + "step": 26055 + }, + { + "epoch": 2.1108230719377836, + "grad_norm": 0.06010693684220314, + "learning_rate": 0.00010495521850668348, + "loss": 0.2472, + "step": 26056 + }, + { + "epoch": 2.110904082955282, + "grad_norm": 0.0585097037255764, + "learning_rate": 0.00010495071785408885, + "loss": 0.2756, + "step": 26057 + }, + { + "epoch": 2.11098509397278, + "grad_norm": 0.059424206614494324, + "learning_rate": 0.00010494621720149423, + "loss": 0.2788, + "step": 
26058 + }, + { + "epoch": 2.111066104990279, + "grad_norm": 0.05133242905139923, + "learning_rate": 0.0001049417165488996, + "loss": 0.2625, + "step": 26059 + }, + { + "epoch": 2.111147116007777, + "grad_norm": 0.055822599679231644, + "learning_rate": 0.00010493721589630496, + "loss": 0.2834, + "step": 26060 + }, + { + "epoch": 2.1112281270252753, + "grad_norm": 0.053742486983537674, + "learning_rate": 0.00010493271524371033, + "loss": 0.2375, + "step": 26061 + }, + { + "epoch": 2.111309138042774, + "grad_norm": 0.06400348991155624, + "learning_rate": 0.00010492821459111572, + "loss": 0.2367, + "step": 26062 + }, + { + "epoch": 2.1113901490602722, + "grad_norm": 0.06669709831476212, + "learning_rate": 0.00010492371393852109, + "loss": 0.2442, + "step": 26063 + }, + { + "epoch": 2.1114711600777705, + "grad_norm": 0.06966737657785416, + "learning_rate": 0.00010491921328592647, + "loss": 0.2724, + "step": 26064 + }, + { + "epoch": 2.111552171095269, + "grad_norm": 0.06068720296025276, + "learning_rate": 0.00010491471263333184, + "loss": 0.2577, + "step": 26065 + }, + { + "epoch": 2.1116331821127674, + "grad_norm": 0.05518393963575363, + "learning_rate": 0.0001049102119807372, + "loss": 0.2351, + "step": 26066 + }, + { + "epoch": 2.1117141931302656, + "grad_norm": 0.0546581968665123, + "learning_rate": 0.00010490571132814257, + "loss": 0.2113, + "step": 26067 + }, + { + "epoch": 2.111795204147764, + "grad_norm": 0.07191743701696396, + "learning_rate": 0.00010490121067554796, + "loss": 0.2787, + "step": 26068 + }, + { + "epoch": 2.1118762151652626, + "grad_norm": 0.0725640207529068, + "learning_rate": 0.00010489671002295333, + "loss": 0.2762, + "step": 26069 + }, + { + "epoch": 2.111957226182761, + "grad_norm": 0.06637110561132431, + "learning_rate": 0.00010489220937035871, + "loss": 0.2152, + "step": 26070 + }, + { + "epoch": 2.112038237200259, + "grad_norm": 0.06367357075214386, + "learning_rate": 0.00010488770871776408, + "loss": 0.2789, + "step": 26071 + }, + { + "epoch": 2.1121192482177578, + "grad_norm": 0.052471041679382324, + "learning_rate": 0.00010488320806516944, + "loss": 0.2458, + "step": 26072 + }, + { + "epoch": 2.112200259235256, + "grad_norm": 0.06254670768976212, + "learning_rate": 0.00010487870741257484, + "loss": 0.2575, + "step": 26073 + }, + { + "epoch": 2.1122812702527543, + "grad_norm": 0.06931617856025696, + "learning_rate": 0.0001048742067599802, + "loss": 0.2263, + "step": 26074 + }, + { + "epoch": 2.112362281270253, + "grad_norm": 0.06655744463205338, + "learning_rate": 0.00010486970610738558, + "loss": 0.2686, + "step": 26075 + }, + { + "epoch": 2.112443292287751, + "grad_norm": 0.054537370800971985, + "learning_rate": 0.00010486520545479095, + "loss": 0.2822, + "step": 26076 + }, + { + "epoch": 2.1125243033052494, + "grad_norm": 0.06522393971681595, + "learning_rate": 0.00010486070480219632, + "loss": 0.2968, + "step": 26077 + }, + { + "epoch": 2.1126053143227477, + "grad_norm": 0.06392154097557068, + "learning_rate": 0.00010485620414960168, + "loss": 0.2307, + "step": 26078 + }, + { + "epoch": 2.1126863253402464, + "grad_norm": 0.057777177542448044, + "learning_rate": 0.00010485170349700708, + "loss": 0.2331, + "step": 26079 + }, + { + "epoch": 2.1127673363577446, + "grad_norm": 0.059251341968774796, + "learning_rate": 0.00010484720284441244, + "loss": 0.2868, + "step": 26080 + }, + { + "epoch": 2.112848347375243, + "grad_norm": 0.063652902841568, + "learning_rate": 0.00010484270219181782, + "loss": 0.2458, + "step": 26081 + }, + { + "epoch": 2.1129293583927415, + 
"grad_norm": 0.05617694556713104, + "learning_rate": 0.00010483820153922319, + "loss": 0.2302, + "step": 26082 + }, + { + "epoch": 2.11301036941024, + "grad_norm": 0.05732857063412666, + "learning_rate": 0.00010483370088662857, + "loss": 0.2406, + "step": 26083 + }, + { + "epoch": 2.113091380427738, + "grad_norm": 0.05721890181303024, + "learning_rate": 0.00010482920023403393, + "loss": 0.2408, + "step": 26084 + }, + { + "epoch": 2.1131723914452367, + "grad_norm": 0.053211111575365067, + "learning_rate": 0.00010482469958143933, + "loss": 0.2469, + "step": 26085 + }, + { + "epoch": 2.113253402462735, + "grad_norm": 0.06854691356420517, + "learning_rate": 0.0001048201989288447, + "loss": 0.2964, + "step": 26086 + }, + { + "epoch": 2.113334413480233, + "grad_norm": 0.059306904673576355, + "learning_rate": 0.00010481569827625006, + "loss": 0.2599, + "step": 26087 + }, + { + "epoch": 2.113415424497732, + "grad_norm": 0.057720642536878586, + "learning_rate": 0.00010481119762365543, + "loss": 0.2641, + "step": 26088 + }, + { + "epoch": 2.11349643551523, + "grad_norm": 0.06803397089242935, + "learning_rate": 0.0001048066969710608, + "loss": 0.2846, + "step": 26089 + }, + { + "epoch": 2.1135774465327284, + "grad_norm": 0.054278306663036346, + "learning_rate": 0.00010480219631846617, + "loss": 0.2366, + "step": 26090 + }, + { + "epoch": 2.1136584575502266, + "grad_norm": 0.07058935612440109, + "learning_rate": 0.00010479769566587157, + "loss": 0.2422, + "step": 26091 + }, + { + "epoch": 2.1137394685677253, + "grad_norm": 0.049645014107227325, + "learning_rate": 0.00010479319501327694, + "loss": 0.2387, + "step": 26092 + }, + { + "epoch": 2.1138204795852236, + "grad_norm": 0.06945136189460754, + "learning_rate": 0.0001047886943606823, + "loss": 0.2778, + "step": 26093 + }, + { + "epoch": 2.113901490602722, + "grad_norm": 0.058754272758960724, + "learning_rate": 0.00010478419370808767, + "loss": 0.2448, + "step": 26094 + }, + { + "epoch": 2.1139825016202205, + "grad_norm": 0.06239371746778488, + "learning_rate": 0.00010477969305549305, + "loss": 0.2497, + "step": 26095 + }, + { + "epoch": 2.1140635126377187, + "grad_norm": 0.07894030213356018, + "learning_rate": 0.00010477519240289841, + "loss": 0.2891, + "step": 26096 + }, + { + "epoch": 2.114144523655217, + "grad_norm": 0.0756504163146019, + "learning_rate": 0.00010477069175030381, + "loss": 0.2929, + "step": 26097 + }, + { + "epoch": 2.1142255346727157, + "grad_norm": 0.054733000695705414, + "learning_rate": 0.00010476619109770918, + "loss": 0.2162, + "step": 26098 + }, + { + "epoch": 2.114306545690214, + "grad_norm": 0.05940738692879677, + "learning_rate": 0.00010476169044511454, + "loss": 0.2872, + "step": 26099 + }, + { + "epoch": 2.114387556707712, + "grad_norm": 0.05741294100880623, + "learning_rate": 0.00010475718979251992, + "loss": 0.2484, + "step": 26100 + }, + { + "epoch": 2.1144685677252104, + "grad_norm": 0.06838394701480865, + "learning_rate": 0.00010475268913992529, + "loss": 0.2606, + "step": 26101 + }, + { + "epoch": 2.114549578742709, + "grad_norm": 0.07081933319568634, + "learning_rate": 0.00010474818848733068, + "loss": 0.2782, + "step": 26102 + }, + { + "epoch": 2.1146305897602073, + "grad_norm": 0.058358822017908096, + "learning_rate": 0.00010474368783473605, + "loss": 0.2384, + "step": 26103 + }, + { + "epoch": 2.1147116007777056, + "grad_norm": 0.05638371407985687, + "learning_rate": 0.00010473918718214142, + "loss": 0.2525, + "step": 26104 + }, + { + "epoch": 2.1147926117952043, + "grad_norm": 0.06408551335334778, + 
"learning_rate": 0.00010473468652954678, + "loss": 0.2752, + "step": 26105 + }, + { + "epoch": 2.1148736228127025, + "grad_norm": 0.06037361919879913, + "learning_rate": 0.00010473018587695216, + "loss": 0.2626, + "step": 26106 + }, + { + "epoch": 2.1149546338302008, + "grad_norm": 0.05822877958416939, + "learning_rate": 0.00010472568522435753, + "loss": 0.2408, + "step": 26107 + }, + { + "epoch": 2.1150356448476995, + "grad_norm": 0.052096933126449585, + "learning_rate": 0.00010472118457176292, + "loss": 0.2619, + "step": 26108 + }, + { + "epoch": 2.1151166558651977, + "grad_norm": 0.06133052706718445, + "learning_rate": 0.00010471668391916829, + "loss": 0.2439, + "step": 26109 + }, + { + "epoch": 2.115197666882696, + "grad_norm": 0.06262310594320297, + "learning_rate": 0.00010471218326657367, + "loss": 0.2709, + "step": 26110 + }, + { + "epoch": 2.1152786779001946, + "grad_norm": 0.05901063233613968, + "learning_rate": 0.00010470768261397903, + "loss": 0.2492, + "step": 26111 + }, + { + "epoch": 2.115359688917693, + "grad_norm": 0.06124577298760414, + "learning_rate": 0.0001047031819613844, + "loss": 0.2355, + "step": 26112 + }, + { + "epoch": 2.115440699935191, + "grad_norm": 0.057176556438207626, + "learning_rate": 0.00010469868130878977, + "loss": 0.2454, + "step": 26113 + }, + { + "epoch": 2.1155217109526894, + "grad_norm": 0.0686374306678772, + "learning_rate": 0.00010469418065619516, + "loss": 0.2547, + "step": 26114 + }, + { + "epoch": 2.115602721970188, + "grad_norm": 0.05528261885046959, + "learning_rate": 0.00010468968000360053, + "loss": 0.2413, + "step": 26115 + }, + { + "epoch": 2.1156837329876863, + "grad_norm": 0.07165534049272537, + "learning_rate": 0.00010468517935100591, + "loss": 0.2451, + "step": 26116 + }, + { + "epoch": 2.1157647440051845, + "grad_norm": 0.07650242745876312, + "learning_rate": 0.00010468067869841127, + "loss": 0.2724, + "step": 26117 + }, + { + "epoch": 2.1158457550226832, + "grad_norm": 0.06085258349776268, + "learning_rate": 0.00010467617804581664, + "loss": 0.2926, + "step": 26118 + }, + { + "epoch": 2.1159267660401815, + "grad_norm": 0.07131532579660416, + "learning_rate": 0.00010467167739322201, + "loss": 0.2394, + "step": 26119 + }, + { + "epoch": 2.1160077770576797, + "grad_norm": 0.07328178733587265, + "learning_rate": 0.0001046671767406274, + "loss": 0.2732, + "step": 26120 + }, + { + "epoch": 2.1160887880751784, + "grad_norm": 0.06203152611851692, + "learning_rate": 0.00010466267608803278, + "loss": 0.2863, + "step": 26121 + }, + { + "epoch": 2.1161697990926767, + "grad_norm": 0.07757362723350525, + "learning_rate": 0.00010465817543543815, + "loss": 0.2655, + "step": 26122 + }, + { + "epoch": 2.116250810110175, + "grad_norm": 0.07243662327528, + "learning_rate": 0.00010465367478284351, + "loss": 0.2313, + "step": 26123 + }, + { + "epoch": 2.116331821127673, + "grad_norm": 0.07049787789583206, + "learning_rate": 0.00010464917413024888, + "loss": 0.2567, + "step": 26124 + }, + { + "epoch": 2.116412832145172, + "grad_norm": 0.06684257090091705, + "learning_rate": 0.00010464467347765427, + "loss": 0.2406, + "step": 26125 + }, + { + "epoch": 2.11649384316267, + "grad_norm": 0.052143748849630356, + "learning_rate": 0.00010464017282505964, + "loss": 0.23, + "step": 26126 + }, + { + "epoch": 2.1165748541801683, + "grad_norm": 0.05962395295500755, + "learning_rate": 0.00010463567217246502, + "loss": 0.256, + "step": 26127 + }, + { + "epoch": 2.116655865197667, + "grad_norm": 0.061811357736587524, + "learning_rate": 0.00010463117151987039, + "loss": 
0.2421, + "step": 26128 + }, + { + "epoch": 2.1167368762151653, + "grad_norm": 0.0682341679930687, + "learning_rate": 0.00010462667086727575, + "loss": 0.2453, + "step": 26129 + }, + { + "epoch": 2.1168178872326635, + "grad_norm": 0.07151232659816742, + "learning_rate": 0.00010462217021468112, + "loss": 0.2643, + "step": 26130 + }, + { + "epoch": 2.116898898250162, + "grad_norm": 0.053989555686712265, + "learning_rate": 0.00010461766956208651, + "loss": 0.2658, + "step": 26131 + }, + { + "epoch": 2.1169799092676604, + "grad_norm": 0.06623726338148117, + "learning_rate": 0.00010461316890949189, + "loss": 0.2811, + "step": 26132 + }, + { + "epoch": 2.1170609202851587, + "grad_norm": 0.05381413549184799, + "learning_rate": 0.00010460866825689726, + "loss": 0.2685, + "step": 26133 + }, + { + "epoch": 2.1171419313026574, + "grad_norm": 0.056920427829027176, + "learning_rate": 0.00010460416760430263, + "loss": 0.2591, + "step": 26134 + }, + { + "epoch": 2.1172229423201556, + "grad_norm": 0.06302447617053986, + "learning_rate": 0.00010459966695170799, + "loss": 0.2217, + "step": 26135 + }, + { + "epoch": 2.117303953337654, + "grad_norm": 0.06110900640487671, + "learning_rate": 0.00010459516629911337, + "loss": 0.2377, + "step": 26136 + }, + { + "epoch": 2.117384964355152, + "grad_norm": 0.059887710958719254, + "learning_rate": 0.00010459066564651875, + "loss": 0.2437, + "step": 26137 + }, + { + "epoch": 2.117465975372651, + "grad_norm": 0.04854189604520798, + "learning_rate": 0.00010458616499392413, + "loss": 0.2316, + "step": 26138 + }, + { + "epoch": 2.117546986390149, + "grad_norm": 0.06568967550992966, + "learning_rate": 0.0001045816643413295, + "loss": 0.2684, + "step": 26139 + }, + { + "epoch": 2.1176279974076473, + "grad_norm": 0.04694654792547226, + "learning_rate": 0.00010457716368873487, + "loss": 0.2543, + "step": 26140 + }, + { + "epoch": 2.117709008425146, + "grad_norm": 0.07281840592622757, + "learning_rate": 0.00010457266303614023, + "loss": 0.2672, + "step": 26141 + }, + { + "epoch": 2.1177900194426442, + "grad_norm": 0.0669359415769577, + "learning_rate": 0.00010456816238354561, + "loss": 0.2436, + "step": 26142 + }, + { + "epoch": 2.1178710304601425, + "grad_norm": 0.07672449201345444, + "learning_rate": 0.000104563661730951, + "loss": 0.2949, + "step": 26143 + }, + { + "epoch": 2.117952041477641, + "grad_norm": 0.06504993140697479, + "learning_rate": 0.00010455916107835637, + "loss": 0.2751, + "step": 26144 + }, + { + "epoch": 2.1180330524951394, + "grad_norm": 0.07261329889297485, + "learning_rate": 0.00010455466042576174, + "loss": 0.2494, + "step": 26145 + }, + { + "epoch": 2.1181140635126376, + "grad_norm": 0.05860583484172821, + "learning_rate": 0.00010455015977316712, + "loss": 0.2514, + "step": 26146 + }, + { + "epoch": 2.118195074530136, + "grad_norm": 0.09110474586486816, + "learning_rate": 0.00010454565912057248, + "loss": 0.293, + "step": 26147 + }, + { + "epoch": 2.1182760855476346, + "grad_norm": 0.05644802376627922, + "learning_rate": 0.00010454115846797785, + "loss": 0.263, + "step": 26148 + }, + { + "epoch": 2.118357096565133, + "grad_norm": 0.05598912760615349, + "learning_rate": 0.00010453665781538324, + "loss": 0.2473, + "step": 26149 + }, + { + "epoch": 2.118438107582631, + "grad_norm": 0.05859972536563873, + "learning_rate": 0.00010453215716278861, + "loss": 0.232, + "step": 26150 + }, + { + "epoch": 2.1185191186001298, + "grad_norm": 0.057381488382816315, + "learning_rate": 0.00010452765651019398, + "loss": 0.2124, + "step": 26151 + }, + { + "epoch": 
2.118600129617628, + "grad_norm": 0.05538659542798996, + "learning_rate": 0.00010452315585759936, + "loss": 0.2685, + "step": 26152 + }, + { + "epoch": 2.1186811406351262, + "grad_norm": 0.0663476511836052, + "learning_rate": 0.00010451865520500472, + "loss": 0.2414, + "step": 26153 + }, + { + "epoch": 2.118762151652625, + "grad_norm": 0.05742928013205528, + "learning_rate": 0.00010451415455241012, + "loss": 0.2559, + "step": 26154 + }, + { + "epoch": 2.118843162670123, + "grad_norm": 0.08387189358472824, + "learning_rate": 0.00010450965389981549, + "loss": 0.2854, + "step": 26155 + }, + { + "epoch": 2.1189241736876214, + "grad_norm": 0.059729333966970444, + "learning_rate": 0.00010450515324722085, + "loss": 0.2309, + "step": 26156 + }, + { + "epoch": 2.11900518470512, + "grad_norm": 0.06769514083862305, + "learning_rate": 0.00010450065259462623, + "loss": 0.2841, + "step": 26157 + }, + { + "epoch": 2.1190861957226184, + "grad_norm": 0.07828256487846375, + "learning_rate": 0.0001044961519420316, + "loss": 0.2738, + "step": 26158 + }, + { + "epoch": 2.1191672067401166, + "grad_norm": 0.06256436556577682, + "learning_rate": 0.00010449165128943696, + "loss": 0.2503, + "step": 26159 + }, + { + "epoch": 2.119248217757615, + "grad_norm": 0.07057522982358932, + "learning_rate": 0.00010448715063684236, + "loss": 0.2277, + "step": 26160 + }, + { + "epoch": 2.1193292287751135, + "grad_norm": 0.062220748513936996, + "learning_rate": 0.00010448264998424773, + "loss": 0.268, + "step": 26161 + }, + { + "epoch": 2.119410239792612, + "grad_norm": 0.055413730442523956, + "learning_rate": 0.0001044781493316531, + "loss": 0.2235, + "step": 26162 + }, + { + "epoch": 2.11949125081011, + "grad_norm": 0.05254589021205902, + "learning_rate": 0.00010447364867905847, + "loss": 0.2702, + "step": 26163 + }, + { + "epoch": 2.1195722618276087, + "grad_norm": 0.061898522078990936, + "learning_rate": 0.00010446914802646384, + "loss": 0.2119, + "step": 26164 + }, + { + "epoch": 2.119653272845107, + "grad_norm": 0.0660005584359169, + "learning_rate": 0.0001044646473738692, + "loss": 0.27, + "step": 26165 + }, + { + "epoch": 2.119734283862605, + "grad_norm": 0.058570023626089096, + "learning_rate": 0.0001044601467212746, + "loss": 0.2225, + "step": 26166 + }, + { + "epoch": 2.119815294880104, + "grad_norm": 0.06641636788845062, + "learning_rate": 0.00010445564606867997, + "loss": 0.2676, + "step": 26167 + }, + { + "epoch": 2.119896305897602, + "grad_norm": 0.06420730799436569, + "learning_rate": 0.00010445114541608533, + "loss": 0.2481, + "step": 26168 + }, + { + "epoch": 2.1199773169151004, + "grad_norm": 0.06847498565912247, + "learning_rate": 0.00010444664476349071, + "loss": 0.2607, + "step": 26169 + }, + { + "epoch": 2.1200583279325986, + "grad_norm": 0.055883947759866714, + "learning_rate": 0.00010444214411089608, + "loss": 0.2322, + "step": 26170 + }, + { + "epoch": 2.1201393389500973, + "grad_norm": 0.06870889663696289, + "learning_rate": 0.00010443764345830144, + "loss": 0.2445, + "step": 26171 + }, + { + "epoch": 2.1202203499675956, + "grad_norm": 0.06205878406763077, + "learning_rate": 0.00010443314280570684, + "loss": 0.2635, + "step": 26172 + }, + { + "epoch": 2.120301360985094, + "grad_norm": 0.06938192248344421, + "learning_rate": 0.00010442864215311222, + "loss": 0.2607, + "step": 26173 + }, + { + "epoch": 2.1203823720025925, + "grad_norm": 0.07056978344917297, + "learning_rate": 0.00010442414150051758, + "loss": 0.2683, + "step": 26174 + }, + { + "epoch": 2.1204633830200907, + "grad_norm": 
0.055816810578107834, + "learning_rate": 0.00010441964084792295, + "loss": 0.2517, + "step": 26175 + }, + { + "epoch": 2.120544394037589, + "grad_norm": 0.06050346419215202, + "learning_rate": 0.00010441514019532832, + "loss": 0.2518, + "step": 26176 + }, + { + "epoch": 2.1206254050550877, + "grad_norm": 0.06322181224822998, + "learning_rate": 0.00010441063954273368, + "loss": 0.288, + "step": 26177 + }, + { + "epoch": 2.120706416072586, + "grad_norm": 0.06231006234884262, + "learning_rate": 0.00010440613889013908, + "loss": 0.2527, + "step": 26178 + }, + { + "epoch": 2.120787427090084, + "grad_norm": 0.06672997772693634, + "learning_rate": 0.00010440163823754446, + "loss": 0.2725, + "step": 26179 + }, + { + "epoch": 2.120868438107583, + "grad_norm": 0.05536544322967529, + "learning_rate": 0.00010439713758494982, + "loss": 0.2328, + "step": 26180 + }, + { + "epoch": 2.120949449125081, + "grad_norm": 0.06709985435009003, + "learning_rate": 0.00010439263693235519, + "loss": 0.2527, + "step": 26181 + }, + { + "epoch": 2.1210304601425793, + "grad_norm": 0.05647808685898781, + "learning_rate": 0.00010438813627976057, + "loss": 0.2834, + "step": 26182 + }, + { + "epoch": 2.1211114711600776, + "grad_norm": 0.050501611083745956, + "learning_rate": 0.00010438363562716595, + "loss": 0.2427, + "step": 26183 + }, + { + "epoch": 2.1211924821775763, + "grad_norm": 0.06943561136722565, + "learning_rate": 0.00010437913497457133, + "loss": 0.243, + "step": 26184 + }, + { + "epoch": 2.1212734931950745, + "grad_norm": 0.0644908994436264, + "learning_rate": 0.0001043746343219767, + "loss": 0.2669, + "step": 26185 + }, + { + "epoch": 2.1213545042125728, + "grad_norm": 0.07270579040050507, + "learning_rate": 0.00010437013366938206, + "loss": 0.2585, + "step": 26186 + }, + { + "epoch": 2.1214355152300715, + "grad_norm": 0.07066863030195236, + "learning_rate": 0.00010436563301678743, + "loss": 0.2864, + "step": 26187 + }, + { + "epoch": 2.1215165262475697, + "grad_norm": 0.06685689091682434, + "learning_rate": 0.00010436113236419281, + "loss": 0.2619, + "step": 26188 + }, + { + "epoch": 2.121597537265068, + "grad_norm": 0.07248367369174957, + "learning_rate": 0.0001043566317115982, + "loss": 0.2502, + "step": 26189 + }, + { + "epoch": 2.1216785482825666, + "grad_norm": 0.05484854802489281, + "learning_rate": 0.00010435213105900357, + "loss": 0.2461, + "step": 26190 + }, + { + "epoch": 2.121759559300065, + "grad_norm": 0.05028130114078522, + "learning_rate": 0.00010434763040640894, + "loss": 0.2659, + "step": 26191 + }, + { + "epoch": 2.121840570317563, + "grad_norm": 0.06200327351689339, + "learning_rate": 0.0001043431297538143, + "loss": 0.232, + "step": 26192 + }, + { + "epoch": 2.1219215813350614, + "grad_norm": 0.048312537372112274, + "learning_rate": 0.00010433862910121968, + "loss": 0.242, + "step": 26193 + }, + { + "epoch": 2.12200259235256, + "grad_norm": 0.058204833418130875, + "learning_rate": 0.00010433412844862505, + "loss": 0.241, + "step": 26194 + }, + { + "epoch": 2.1220836033700583, + "grad_norm": 0.06451279670000076, + "learning_rate": 0.00010432962779603044, + "loss": 0.2668, + "step": 26195 + }, + { + "epoch": 2.1221646143875565, + "grad_norm": 0.0593167208135128, + "learning_rate": 0.00010432512714343581, + "loss": 0.2549, + "step": 26196 + }, + { + "epoch": 2.1222456254050552, + "grad_norm": 0.060698408633470535, + "learning_rate": 0.00010432062649084118, + "loss": 0.2381, + "step": 26197 + }, + { + "epoch": 2.1223266364225535, + "grad_norm": 0.07551072537899017, + "learning_rate": 
0.00010431612583824654, + "loss": 0.2797, + "step": 26198 + }, + { + "epoch": 2.1224076474400517, + "grad_norm": 0.06287830322980881, + "learning_rate": 0.00010431162518565192, + "loss": 0.2611, + "step": 26199 + }, + { + "epoch": 2.1224886584575504, + "grad_norm": 0.0578264556825161, + "learning_rate": 0.00010430712453305729, + "loss": 0.2523, + "step": 26200 + }, + { + "epoch": 2.1225696694750487, + "grad_norm": 0.053504422307014465, + "learning_rate": 0.00010430262388046268, + "loss": 0.2402, + "step": 26201 + }, + { + "epoch": 2.122650680492547, + "grad_norm": 0.06288152933120728, + "learning_rate": 0.00010429812322786805, + "loss": 0.2552, + "step": 26202 + }, + { + "epoch": 2.1227316915100456, + "grad_norm": 0.07255303859710693, + "learning_rate": 0.00010429362257527342, + "loss": 0.2479, + "step": 26203 + }, + { + "epoch": 2.122812702527544, + "grad_norm": 0.06279697269201279, + "learning_rate": 0.00010428912192267878, + "loss": 0.2606, + "step": 26204 + }, + { + "epoch": 2.122893713545042, + "grad_norm": 0.07724709808826447, + "learning_rate": 0.00010428462127008416, + "loss": 0.3127, + "step": 26205 + }, + { + "epoch": 2.1229747245625403, + "grad_norm": 0.06904693692922592, + "learning_rate": 0.00010428012061748955, + "loss": 0.2998, + "step": 26206 + }, + { + "epoch": 2.123055735580039, + "grad_norm": 0.07110625505447388, + "learning_rate": 0.00010427561996489492, + "loss": 0.2679, + "step": 26207 + }, + { + "epoch": 2.1231367465975373, + "grad_norm": 0.055152129381895065, + "learning_rate": 0.00010427111931230029, + "loss": 0.2452, + "step": 26208 + }, + { + "epoch": 2.1232177576150355, + "grad_norm": 0.050709083676338196, + "learning_rate": 0.00010426661865970567, + "loss": 0.2758, + "step": 26209 + }, + { + "epoch": 2.123298768632534, + "grad_norm": 0.057936035096645355, + "learning_rate": 0.00010426211800711103, + "loss": 0.2816, + "step": 26210 + }, + { + "epoch": 2.1233797796500324, + "grad_norm": 0.05752767622470856, + "learning_rate": 0.0001042576173545164, + "loss": 0.2379, + "step": 26211 + }, + { + "epoch": 2.1234607906675307, + "grad_norm": 0.05384643375873566, + "learning_rate": 0.00010425311670192179, + "loss": 0.2312, + "step": 26212 + }, + { + "epoch": 2.1235418016850294, + "grad_norm": 0.07233273983001709, + "learning_rate": 0.00010424861604932716, + "loss": 0.2908, + "step": 26213 + }, + { + "epoch": 2.1236228127025276, + "grad_norm": 0.06371590495109558, + "learning_rate": 0.00010424411539673253, + "loss": 0.2986, + "step": 26214 + }, + { + "epoch": 2.123703823720026, + "grad_norm": 0.054069891571998596, + "learning_rate": 0.00010423961474413791, + "loss": 0.2706, + "step": 26215 + }, + { + "epoch": 2.123784834737524, + "grad_norm": 0.06553568691015244, + "learning_rate": 0.00010423511409154327, + "loss": 0.213, + "step": 26216 + }, + { + "epoch": 2.123865845755023, + "grad_norm": 0.05562140792608261, + "learning_rate": 0.00010423061343894864, + "loss": 0.2606, + "step": 26217 + }, + { + "epoch": 2.123946856772521, + "grad_norm": 0.06978686153888702, + "learning_rate": 0.00010422611278635403, + "loss": 0.2908, + "step": 26218 + }, + { + "epoch": 2.1240278677900193, + "grad_norm": 0.06078553944826126, + "learning_rate": 0.0001042216121337594, + "loss": 0.2235, + "step": 26219 + }, + { + "epoch": 2.124108878807518, + "grad_norm": 0.05368654802441597, + "learning_rate": 0.00010421711148116478, + "loss": 0.218, + "step": 26220 + }, + { + "epoch": 2.124189889825016, + "grad_norm": 0.07140675187110901, + "learning_rate": 0.00010421261082857015, + "loss": 0.2807, + 
"step": 26221 + }, + { + "epoch": 2.1242709008425145, + "grad_norm": 0.06928906589746475, + "learning_rate": 0.00010420811017597551, + "loss": 0.2496, + "step": 26222 + }, + { + "epoch": 2.124351911860013, + "grad_norm": 0.07112964987754822, + "learning_rate": 0.00010420360952338088, + "loss": 0.258, + "step": 26223 + }, + { + "epoch": 2.1244329228775114, + "grad_norm": 0.06662682443857193, + "learning_rate": 0.00010419910887078628, + "loss": 0.2908, + "step": 26224 + }, + { + "epoch": 2.1245139338950096, + "grad_norm": 0.05349923297762871, + "learning_rate": 0.00010419460821819164, + "loss": 0.2349, + "step": 26225 + }, + { + "epoch": 2.124594944912508, + "grad_norm": 0.06059380993247032, + "learning_rate": 0.00010419010756559702, + "loss": 0.2223, + "step": 26226 + }, + { + "epoch": 2.1246759559300066, + "grad_norm": 0.06186490133404732, + "learning_rate": 0.00010418560691300239, + "loss": 0.2835, + "step": 26227 + }, + { + "epoch": 2.124756966947505, + "grad_norm": 0.059035640209913254, + "learning_rate": 0.00010418110626040775, + "loss": 0.2732, + "step": 26228 + }, + { + "epoch": 2.124837977965003, + "grad_norm": 0.06877464801073074, + "learning_rate": 0.00010417660560781312, + "loss": 0.2323, + "step": 26229 + }, + { + "epoch": 2.1249189889825018, + "grad_norm": 0.06301309168338776, + "learning_rate": 0.00010417210495521853, + "loss": 0.2623, + "step": 26230 + }, + { + "epoch": 2.125, + "grad_norm": 0.07027006894350052, + "learning_rate": 0.00010416760430262389, + "loss": 0.2478, + "step": 26231 + }, + { + "epoch": 2.1250810110174982, + "grad_norm": 0.05357455834746361, + "learning_rate": 0.00010416310365002926, + "loss": 0.2156, + "step": 26232 + }, + { + "epoch": 2.125162022034997, + "grad_norm": 0.06986691057682037, + "learning_rate": 0.00010415860299743463, + "loss": 0.2421, + "step": 26233 + }, + { + "epoch": 2.125243033052495, + "grad_norm": 0.06922683119773865, + "learning_rate": 0.00010415410234483999, + "loss": 0.2455, + "step": 26234 + }, + { + "epoch": 2.1253240440699934, + "grad_norm": 0.054529983550310135, + "learning_rate": 0.0001041496016922454, + "loss": 0.2331, + "step": 26235 + }, + { + "epoch": 2.1254050550874917, + "grad_norm": 0.07196197658777237, + "learning_rate": 0.00010414510103965077, + "loss": 0.2474, + "step": 26236 + }, + { + "epoch": 2.1254860661049904, + "grad_norm": 0.07445516437292099, + "learning_rate": 0.00010414060038705613, + "loss": 0.2361, + "step": 26237 + }, + { + "epoch": 2.1255670771224886, + "grad_norm": 0.07319855690002441, + "learning_rate": 0.0001041360997344615, + "loss": 0.256, + "step": 26238 + }, + { + "epoch": 2.125648088139987, + "grad_norm": 0.05994647741317749, + "learning_rate": 0.00010413159908186687, + "loss": 0.2262, + "step": 26239 + }, + { + "epoch": 2.1257290991574855, + "grad_norm": 0.07112723588943481, + "learning_rate": 0.00010412709842927223, + "loss": 0.2444, + "step": 26240 + }, + { + "epoch": 2.125810110174984, + "grad_norm": 0.06451784819364548, + "learning_rate": 0.00010412259777667764, + "loss": 0.2477, + "step": 26241 + }, + { + "epoch": 2.125891121192482, + "grad_norm": 0.05500245466828346, + "learning_rate": 0.00010411809712408301, + "loss": 0.2422, + "step": 26242 + }, + { + "epoch": 2.1259721322099807, + "grad_norm": 0.06472550332546234, + "learning_rate": 0.00010411359647148837, + "loss": 0.2837, + "step": 26243 + }, + { + "epoch": 2.126053143227479, + "grad_norm": 0.07072708755731583, + "learning_rate": 0.00010410909581889374, + "loss": 0.2748, + "step": 26244 + }, + { + "epoch": 2.126134154244977, + 
"grad_norm": 0.07283692806959152, + "learning_rate": 0.00010410459516629912, + "loss": 0.2817, + "step": 26245 + }, + { + "epoch": 2.126215165262476, + "grad_norm": 0.07446455210447311, + "learning_rate": 0.00010410009451370448, + "loss": 0.2939, + "step": 26246 + }, + { + "epoch": 2.126296176279974, + "grad_norm": 0.066004179418087, + "learning_rate": 0.00010409559386110988, + "loss": 0.2706, + "step": 26247 + }, + { + "epoch": 2.1263771872974724, + "grad_norm": 0.05694718658924103, + "learning_rate": 0.00010409109320851525, + "loss": 0.2355, + "step": 26248 + }, + { + "epoch": 2.126458198314971, + "grad_norm": 0.05692675709724426, + "learning_rate": 0.00010408659255592061, + "loss": 0.242, + "step": 26249 + }, + { + "epoch": 2.1265392093324693, + "grad_norm": 0.0675569698214531, + "learning_rate": 0.00010408209190332598, + "loss": 0.2595, + "step": 26250 + }, + { + "epoch": 2.1266202203499676, + "grad_norm": 0.06404047459363937, + "learning_rate": 0.00010407759125073136, + "loss": 0.2409, + "step": 26251 + }, + { + "epoch": 2.126701231367466, + "grad_norm": 0.0745696946978569, + "learning_rate": 0.00010407309059813672, + "loss": 0.2876, + "step": 26252 + }, + { + "epoch": 2.1267822423849645, + "grad_norm": 0.06286709755659103, + "learning_rate": 0.00010406858994554212, + "loss": 0.2785, + "step": 26253 + }, + { + "epoch": 2.1268632534024627, + "grad_norm": 0.05989371985197067, + "learning_rate": 0.00010406408929294749, + "loss": 0.2816, + "step": 26254 + }, + { + "epoch": 2.126944264419961, + "grad_norm": 0.0647851824760437, + "learning_rate": 0.00010405958864035285, + "loss": 0.2196, + "step": 26255 + }, + { + "epoch": 2.1270252754374597, + "grad_norm": 0.0552176833152771, + "learning_rate": 0.00010405508798775823, + "loss": 0.2191, + "step": 26256 + }, + { + "epoch": 2.127106286454958, + "grad_norm": 0.06426537781953812, + "learning_rate": 0.0001040505873351636, + "loss": 0.2801, + "step": 26257 + }, + { + "epoch": 2.127187297472456, + "grad_norm": 0.06369642913341522, + "learning_rate": 0.00010404608668256899, + "loss": 0.2588, + "step": 26258 + }, + { + "epoch": 2.1272683084899544, + "grad_norm": 0.059055306017398834, + "learning_rate": 0.00010404158602997436, + "loss": 0.2524, + "step": 26259 + }, + { + "epoch": 2.127349319507453, + "grad_norm": 0.06316355615854263, + "learning_rate": 0.00010403708537737973, + "loss": 0.2154, + "step": 26260 + }, + { + "epoch": 2.1274303305249513, + "grad_norm": 0.08105630427598953, + "learning_rate": 0.0001040325847247851, + "loss": 0.2739, + "step": 26261 + }, + { + "epoch": 2.1275113415424496, + "grad_norm": 0.07276780158281326, + "learning_rate": 0.00010402808407219047, + "loss": 0.267, + "step": 26262 + }, + { + "epoch": 2.1275923525599483, + "grad_norm": 0.05796151980757713, + "learning_rate": 0.00010402358341959584, + "loss": 0.2863, + "step": 26263 + }, + { + "epoch": 2.1276733635774465, + "grad_norm": 0.07009650766849518, + "learning_rate": 0.00010401908276700123, + "loss": 0.2627, + "step": 26264 + }, + { + "epoch": 2.1277543745949448, + "grad_norm": 0.06027712672948837, + "learning_rate": 0.0001040145821144066, + "loss": 0.2333, + "step": 26265 + }, + { + "epoch": 2.1278353856124435, + "grad_norm": 0.06609676033258438, + "learning_rate": 0.00010401008146181198, + "loss": 0.2538, + "step": 26266 + }, + { + "epoch": 2.1279163966299417, + "grad_norm": 0.07370631396770477, + "learning_rate": 0.00010400558080921734, + "loss": 0.2589, + "step": 26267 + }, + { + "epoch": 2.12799740764744, + "grad_norm": 0.05804192274808884, + "learning_rate": 
0.00010400108015662271, + "loss": 0.2441, + "step": 26268 + }, + { + "epoch": 2.1280784186649386, + "grad_norm": 0.05638861283659935, + "learning_rate": 0.00010399657950402808, + "loss": 0.2698, + "step": 26269 + }, + { + "epoch": 2.128159429682437, + "grad_norm": 0.06556017696857452, + "learning_rate": 0.00010399207885143347, + "loss": 0.2474, + "step": 26270 + }, + { + "epoch": 2.128240440699935, + "grad_norm": 0.05903168395161629, + "learning_rate": 0.00010398757819883884, + "loss": 0.2387, + "step": 26271 + }, + { + "epoch": 2.1283214517174334, + "grad_norm": 0.056488338857889175, + "learning_rate": 0.00010398307754624422, + "loss": 0.2302, + "step": 26272 + }, + { + "epoch": 2.128402462734932, + "grad_norm": 0.05794563144445419, + "learning_rate": 0.00010397857689364958, + "loss": 0.2536, + "step": 26273 + }, + { + "epoch": 2.1284834737524303, + "grad_norm": 0.06688067317008972, + "learning_rate": 0.00010397407624105495, + "loss": 0.2834, + "step": 26274 + }, + { + "epoch": 2.1285644847699285, + "grad_norm": 0.049689728766679764, + "learning_rate": 0.00010396957558846032, + "loss": 0.219, + "step": 26275 + }, + { + "epoch": 2.1286454957874272, + "grad_norm": 0.06290605664253235, + "learning_rate": 0.00010396507493586571, + "loss": 0.2924, + "step": 26276 + }, + { + "epoch": 2.1287265068049255, + "grad_norm": 0.0553937628865242, + "learning_rate": 0.00010396057428327108, + "loss": 0.2459, + "step": 26277 + }, + { + "epoch": 2.1288075178224237, + "grad_norm": 0.06402228772640228, + "learning_rate": 0.00010395607363067646, + "loss": 0.2613, + "step": 26278 + }, + { + "epoch": 2.1288885288399224, + "grad_norm": 0.07002311199903488, + "learning_rate": 0.00010395157297808182, + "loss": 0.2424, + "step": 26279 + }, + { + "epoch": 2.1289695398574207, + "grad_norm": 0.0628318265080452, + "learning_rate": 0.00010394707232548719, + "loss": 0.2463, + "step": 26280 + }, + { + "epoch": 2.129050550874919, + "grad_norm": 0.062282539904117584, + "learning_rate": 0.00010394257167289257, + "loss": 0.2638, + "step": 26281 + }, + { + "epoch": 2.129131561892417, + "grad_norm": 0.05883893743157387, + "learning_rate": 0.00010393807102029795, + "loss": 0.2542, + "step": 26282 + }, + { + "epoch": 2.129212572909916, + "grad_norm": 0.07414300739765167, + "learning_rate": 0.00010393357036770333, + "loss": 0.2334, + "step": 26283 + }, + { + "epoch": 2.129293583927414, + "grad_norm": 0.06983643025159836, + "learning_rate": 0.0001039290697151087, + "loss": 0.267, + "step": 26284 + }, + { + "epoch": 2.1293745949449123, + "grad_norm": 0.0707906112074852, + "learning_rate": 0.00010392456906251406, + "loss": 0.2389, + "step": 26285 + }, + { + "epoch": 2.129455605962411, + "grad_norm": 0.06597426533699036, + "learning_rate": 0.00010392006840991943, + "loss": 0.252, + "step": 26286 + }, + { + "epoch": 2.1295366169799093, + "grad_norm": 0.0692160502076149, + "learning_rate": 0.00010391556775732483, + "loss": 0.2618, + "step": 26287 + }, + { + "epoch": 2.1296176279974075, + "grad_norm": 0.05935715511441231, + "learning_rate": 0.0001039110671047302, + "loss": 0.2613, + "step": 26288 + }, + { + "epoch": 2.129698639014906, + "grad_norm": 0.0638524740934372, + "learning_rate": 0.00010390656645213557, + "loss": 0.2415, + "step": 26289 + }, + { + "epoch": 2.1297796500324044, + "grad_norm": 0.06546127796173096, + "learning_rate": 0.00010390206579954094, + "loss": 0.225, + "step": 26290 + }, + { + "epoch": 2.1298606610499027, + "grad_norm": 0.05805578827857971, + "learning_rate": 0.0001038975651469463, + "loss": 0.2147, + "step": 
26291 + }, + { + "epoch": 2.1299416720674014, + "grad_norm": 0.050522420555353165, + "learning_rate": 0.00010389306449435168, + "loss": 0.2434, + "step": 26292 + }, + { + "epoch": 2.1300226830848996, + "grad_norm": 0.06944924592971802, + "learning_rate": 0.00010388856384175708, + "loss": 0.284, + "step": 26293 + }, + { + "epoch": 2.130103694102398, + "grad_norm": 0.05426494777202606, + "learning_rate": 0.00010388406318916244, + "loss": 0.2662, + "step": 26294 + }, + { + "epoch": 2.130184705119896, + "grad_norm": 0.06014052778482437, + "learning_rate": 0.00010387956253656781, + "loss": 0.2193, + "step": 26295 + }, + { + "epoch": 2.130265716137395, + "grad_norm": 0.06177065894007683, + "learning_rate": 0.00010387506188397318, + "loss": 0.2814, + "step": 26296 + }, + { + "epoch": 2.130346727154893, + "grad_norm": 0.054523248225450516, + "learning_rate": 0.00010387056123137854, + "loss": 0.2851, + "step": 26297 + }, + { + "epoch": 2.1304277381723913, + "grad_norm": 0.062403611838817596, + "learning_rate": 0.00010386606057878392, + "loss": 0.2655, + "step": 26298 + }, + { + "epoch": 2.13050874918989, + "grad_norm": 0.06154339760541916, + "learning_rate": 0.00010386155992618932, + "loss": 0.2696, + "step": 26299 + }, + { + "epoch": 2.130589760207388, + "grad_norm": 0.05249398201704025, + "learning_rate": 0.00010385705927359468, + "loss": 0.243, + "step": 26300 + }, + { + "epoch": 2.1306707712248865, + "grad_norm": 0.06125812232494354, + "learning_rate": 0.00010385255862100005, + "loss": 0.2581, + "step": 26301 + }, + { + "epoch": 2.130751782242385, + "grad_norm": 0.06812266260385513, + "learning_rate": 0.00010384805796840542, + "loss": 0.2485, + "step": 26302 + }, + { + "epoch": 2.1308327932598834, + "grad_norm": 0.0666508674621582, + "learning_rate": 0.00010384355731581078, + "loss": 0.2815, + "step": 26303 + }, + { + "epoch": 2.1309138042773816, + "grad_norm": 0.07783432304859161, + "learning_rate": 0.00010383905666321616, + "loss": 0.277, + "step": 26304 + }, + { + "epoch": 2.13099481529488, + "grad_norm": 0.06238381192088127, + "learning_rate": 0.00010383455601062156, + "loss": 0.2191, + "step": 26305 + }, + { + "epoch": 2.1310758263123786, + "grad_norm": 0.08316400647163391, + "learning_rate": 0.00010383005535802692, + "loss": 0.281, + "step": 26306 + }, + { + "epoch": 2.131156837329877, + "grad_norm": 0.07105521857738495, + "learning_rate": 0.00010382555470543229, + "loss": 0.2566, + "step": 26307 + }, + { + "epoch": 2.131237848347375, + "grad_norm": 0.06777141988277435, + "learning_rate": 0.00010382105405283767, + "loss": 0.2886, + "step": 26308 + }, + { + "epoch": 2.1313188593648738, + "grad_norm": 0.0814325287938118, + "learning_rate": 0.00010381655340024303, + "loss": 0.2652, + "step": 26309 + }, + { + "epoch": 2.131399870382372, + "grad_norm": 0.05332833155989647, + "learning_rate": 0.00010381205274764843, + "loss": 0.2244, + "step": 26310 + }, + { + "epoch": 2.1314808813998702, + "grad_norm": 0.051932573318481445, + "learning_rate": 0.0001038075520950538, + "loss": 0.2449, + "step": 26311 + }, + { + "epoch": 2.131561892417369, + "grad_norm": 0.06615007668733597, + "learning_rate": 0.00010380305144245916, + "loss": 0.2693, + "step": 26312 + }, + { + "epoch": 2.131642903434867, + "grad_norm": 0.05709698051214218, + "learning_rate": 0.00010379855078986453, + "loss": 0.261, + "step": 26313 + }, + { + "epoch": 2.1317239144523654, + "grad_norm": 0.06457649916410446, + "learning_rate": 0.00010379405013726991, + "loss": 0.2664, + "step": 26314 + }, + { + "epoch": 2.131804925469864, + 
"grad_norm": 0.061860498040914536, + "learning_rate": 0.00010378954948467527, + "loss": 0.2273, + "step": 26315 + }, + { + "epoch": 2.1318859364873624, + "grad_norm": 0.05811541900038719, + "learning_rate": 0.00010378504883208067, + "loss": 0.2317, + "step": 26316 + }, + { + "epoch": 2.1319669475048606, + "grad_norm": 0.08073913305997849, + "learning_rate": 0.00010378054817948604, + "loss": 0.2872, + "step": 26317 + }, + { + "epoch": 2.132047958522359, + "grad_norm": 0.059323426336050034, + "learning_rate": 0.0001037760475268914, + "loss": 0.2742, + "step": 26318 + }, + { + "epoch": 2.1321289695398575, + "grad_norm": 0.06322403997182846, + "learning_rate": 0.00010377154687429678, + "loss": 0.2432, + "step": 26319 + }, + { + "epoch": 2.1322099805573558, + "grad_norm": 0.08462004363536835, + "learning_rate": 0.00010376704622170215, + "loss": 0.265, + "step": 26320 + }, + { + "epoch": 2.132290991574854, + "grad_norm": 0.05855248123407364, + "learning_rate": 0.00010376254556910751, + "loss": 0.2519, + "step": 26321 + }, + { + "epoch": 2.1323720025923527, + "grad_norm": 0.06013753265142441, + "learning_rate": 0.00010375804491651291, + "loss": 0.2431, + "step": 26322 + }, + { + "epoch": 2.132453013609851, + "grad_norm": 0.06801541149616241, + "learning_rate": 0.00010375354426391828, + "loss": 0.2304, + "step": 26323 + }, + { + "epoch": 2.132534024627349, + "grad_norm": 0.0554489828646183, + "learning_rate": 0.00010374904361132364, + "loss": 0.2435, + "step": 26324 + }, + { + "epoch": 2.132615035644848, + "grad_norm": 0.059247396886348724, + "learning_rate": 0.00010374454295872902, + "loss": 0.25, + "step": 26325 + }, + { + "epoch": 2.132696046662346, + "grad_norm": 0.05381538346409798, + "learning_rate": 0.00010374004230613439, + "loss": 0.2666, + "step": 26326 + }, + { + "epoch": 2.1327770576798444, + "grad_norm": 0.053969722241163254, + "learning_rate": 0.00010373554165353976, + "loss": 0.2994, + "step": 26327 + }, + { + "epoch": 2.1328580686973426, + "grad_norm": 0.062413427978754044, + "learning_rate": 0.00010373104100094515, + "loss": 0.2737, + "step": 26328 + }, + { + "epoch": 2.1329390797148413, + "grad_norm": 0.06641319394111633, + "learning_rate": 0.00010372654034835053, + "loss": 0.2615, + "step": 26329 + }, + { + "epoch": 2.1330200907323396, + "grad_norm": 0.05698978528380394, + "learning_rate": 0.00010372203969575589, + "loss": 0.2741, + "step": 26330 + }, + { + "epoch": 2.133101101749838, + "grad_norm": 0.05135650932788849, + "learning_rate": 0.00010371753904316126, + "loss": 0.2124, + "step": 26331 + }, + { + "epoch": 2.1331821127673365, + "grad_norm": 0.0743556097149849, + "learning_rate": 0.00010371303839056663, + "loss": 0.2611, + "step": 26332 + }, + { + "epoch": 2.1332631237848347, + "grad_norm": 0.06397853791713715, + "learning_rate": 0.000103708537737972, + "loss": 0.239, + "step": 26333 + }, + { + "epoch": 2.133344134802333, + "grad_norm": 0.08030462265014648, + "learning_rate": 0.0001037040370853774, + "loss": 0.2777, + "step": 26334 + }, + { + "epoch": 2.1334251458198317, + "grad_norm": 0.06808465719223022, + "learning_rate": 0.00010369953643278277, + "loss": 0.2438, + "step": 26335 + }, + { + "epoch": 2.13350615683733, + "grad_norm": 0.05307475104928017, + "learning_rate": 0.00010369503578018813, + "loss": 0.2798, + "step": 26336 + }, + { + "epoch": 2.133587167854828, + "grad_norm": 0.05824761092662811, + "learning_rate": 0.0001036905351275935, + "loss": 0.2436, + "step": 26337 + }, + { + "epoch": 2.133668178872327, + "grad_norm": 0.07136199623346329, + "learning_rate": 
0.00010368603447499887, + "loss": 0.2344, + "step": 26338 + }, + { + "epoch": 2.133749189889825, + "grad_norm": 0.06164703890681267, + "learning_rate": 0.00010368153382240426, + "loss": 0.2324, + "step": 26339 + }, + { + "epoch": 2.1338302009073233, + "grad_norm": 0.0646100714802742, + "learning_rate": 0.00010367703316980964, + "loss": 0.2902, + "step": 26340 + }, + { + "epoch": 2.1339112119248216, + "grad_norm": 0.08409897983074188, + "learning_rate": 0.00010367253251721501, + "loss": 0.2522, + "step": 26341 + }, + { + "epoch": 2.1339922229423203, + "grad_norm": 0.06278625130653381, + "learning_rate": 0.00010366803186462037, + "loss": 0.208, + "step": 26342 + }, + { + "epoch": 2.1340732339598185, + "grad_norm": 0.07819729298353195, + "learning_rate": 0.00010366353121202574, + "loss": 0.2776, + "step": 26343 + }, + { + "epoch": 2.1341542449773168, + "grad_norm": 0.05907128378748894, + "learning_rate": 0.00010365903055943112, + "loss": 0.2685, + "step": 26344 + }, + { + "epoch": 2.1342352559948155, + "grad_norm": 0.06722066551446915, + "learning_rate": 0.0001036545299068365, + "loss": 0.2745, + "step": 26345 + }, + { + "epoch": 2.1343162670123137, + "grad_norm": 0.06968337297439575, + "learning_rate": 0.00010365002925424188, + "loss": 0.2304, + "step": 26346 + }, + { + "epoch": 2.134397278029812, + "grad_norm": 0.05220748856663704, + "learning_rate": 0.00010364552860164725, + "loss": 0.2533, + "step": 26347 + }, + { + "epoch": 2.1344782890473106, + "grad_norm": 0.06192225590348244, + "learning_rate": 0.00010364102794905261, + "loss": 0.2705, + "step": 26348 + }, + { + "epoch": 2.134559300064809, + "grad_norm": 0.06906786561012268, + "learning_rate": 0.00010363652729645798, + "loss": 0.2649, + "step": 26349 + }, + { + "epoch": 2.134640311082307, + "grad_norm": 0.06037280336022377, + "learning_rate": 0.00010363202664386336, + "loss": 0.2351, + "step": 26350 + }, + { + "epoch": 2.1347213220998054, + "grad_norm": 0.06934493780136108, + "learning_rate": 0.00010362752599126874, + "loss": 0.2877, + "step": 26351 + }, + { + "epoch": 2.134802333117304, + "grad_norm": 0.07149045169353485, + "learning_rate": 0.00010362302533867412, + "loss": 0.2611, + "step": 26352 + }, + { + "epoch": 2.1348833441348023, + "grad_norm": 0.0638730451464653, + "learning_rate": 0.00010361852468607949, + "loss": 0.2668, + "step": 26353 + }, + { + "epoch": 2.1349643551523005, + "grad_norm": 0.05846642702817917, + "learning_rate": 0.00010361402403348485, + "loss": 0.2729, + "step": 26354 + }, + { + "epoch": 2.1350453661697992, + "grad_norm": 0.061692554503679276, + "learning_rate": 0.00010360952338089023, + "loss": 0.2465, + "step": 26355 + }, + { + "epoch": 2.1351263771872975, + "grad_norm": 0.06015016511082649, + "learning_rate": 0.0001036050227282956, + "loss": 0.2682, + "step": 26356 + }, + { + "epoch": 2.1352073882047957, + "grad_norm": 0.07217731326818466, + "learning_rate": 0.00010360052207570099, + "loss": 0.3123, + "step": 26357 + }, + { + "epoch": 2.1352883992222944, + "grad_norm": 0.05961654335260391, + "learning_rate": 0.00010359602142310636, + "loss": 0.2348, + "step": 26358 + }, + { + "epoch": 2.1353694102397927, + "grad_norm": 0.05991566553711891, + "learning_rate": 0.00010359152077051173, + "loss": 0.2568, + "step": 26359 + }, + { + "epoch": 2.135450421257291, + "grad_norm": 0.05187077075242996, + "learning_rate": 0.0001035870201179171, + "loss": 0.234, + "step": 26360 + }, + { + "epoch": 2.1355314322747896, + "grad_norm": 0.07228851318359375, + "learning_rate": 0.00010358251946532247, + "loss": 0.2917, + 
"step": 26361 + }, + { + "epoch": 2.135612443292288, + "grad_norm": 0.05363132804632187, + "learning_rate": 0.00010357801881272784, + "loss": 0.2507, + "step": 26362 + }, + { + "epoch": 2.135693454309786, + "grad_norm": 0.06615161150693893, + "learning_rate": 0.00010357351816013323, + "loss": 0.2376, + "step": 26363 + }, + { + "epoch": 2.1357744653272843, + "grad_norm": 0.059696223586797714, + "learning_rate": 0.0001035690175075386, + "loss": 0.2358, + "step": 26364 + }, + { + "epoch": 2.135855476344783, + "grad_norm": 0.05982410907745361, + "learning_rate": 0.00010356451685494398, + "loss": 0.2444, + "step": 26365 + }, + { + "epoch": 2.1359364873622813, + "grad_norm": 0.06730233877897263, + "learning_rate": 0.00010356001620234934, + "loss": 0.2819, + "step": 26366 + }, + { + "epoch": 2.1360174983797795, + "grad_norm": 0.061217255890369415, + "learning_rate": 0.00010355551554975471, + "loss": 0.2586, + "step": 26367 + }, + { + "epoch": 2.136098509397278, + "grad_norm": 0.05739770084619522, + "learning_rate": 0.00010355101489716011, + "loss": 0.2267, + "step": 26368 + }, + { + "epoch": 2.1361795204147764, + "grad_norm": 0.07194606959819794, + "learning_rate": 0.00010354651424456547, + "loss": 0.2953, + "step": 26369 + }, + { + "epoch": 2.1362605314322747, + "grad_norm": 0.06404095888137817, + "learning_rate": 0.00010354201359197084, + "loss": 0.2578, + "step": 26370 + }, + { + "epoch": 2.1363415424497734, + "grad_norm": 0.09129726886749268, + "learning_rate": 0.00010353751293937622, + "loss": 0.2832, + "step": 26371 + }, + { + "epoch": 2.1364225534672716, + "grad_norm": 0.061405278742313385, + "learning_rate": 0.00010353301228678158, + "loss": 0.229, + "step": 26372 + }, + { + "epoch": 2.13650356448477, + "grad_norm": 0.05377738177776337, + "learning_rate": 0.00010352851163418695, + "loss": 0.2135, + "step": 26373 + }, + { + "epoch": 2.136584575502268, + "grad_norm": 0.05743473395705223, + "learning_rate": 0.00010352401098159235, + "loss": 0.2366, + "step": 26374 + }, + { + "epoch": 2.136665586519767, + "grad_norm": 0.05833156779408455, + "learning_rate": 0.00010351951032899771, + "loss": 0.2676, + "step": 26375 + }, + { + "epoch": 2.136746597537265, + "grad_norm": 0.06809478253126144, + "learning_rate": 0.00010351500967640309, + "loss": 0.2674, + "step": 26376 + }, + { + "epoch": 2.1368276085547633, + "grad_norm": 0.05865863710641861, + "learning_rate": 0.00010351050902380846, + "loss": 0.2034, + "step": 26377 + }, + { + "epoch": 2.136908619572262, + "grad_norm": 0.05914255976676941, + "learning_rate": 0.00010350600837121382, + "loss": 0.2658, + "step": 26378 + }, + { + "epoch": 2.13698963058976, + "grad_norm": 0.055052950978279114, + "learning_rate": 0.00010350150771861919, + "loss": 0.2838, + "step": 26379 + }, + { + "epoch": 2.1370706416072585, + "grad_norm": 0.0790739506483078, + "learning_rate": 0.00010349700706602459, + "loss": 0.3052, + "step": 26380 + }, + { + "epoch": 2.137151652624757, + "grad_norm": 0.06399786472320557, + "learning_rate": 0.00010349250641342995, + "loss": 0.253, + "step": 26381 + }, + { + "epoch": 2.1372326636422554, + "grad_norm": 0.06380538642406464, + "learning_rate": 0.00010348800576083533, + "loss": 0.2286, + "step": 26382 + }, + { + "epoch": 2.1373136746597536, + "grad_norm": 0.06525228917598724, + "learning_rate": 0.0001034835051082407, + "loss": 0.2567, + "step": 26383 + }, + { + "epoch": 2.1373946856772523, + "grad_norm": 0.05566810816526413, + "learning_rate": 0.00010347900445564606, + "loss": 0.2635, + "step": 26384 + }, + { + "epoch": 
2.1374756966947506, + "grad_norm": 0.06300102174282074, + "learning_rate": 0.00010347450380305143, + "loss": 0.2824, + "step": 26385 + }, + { + "epoch": 2.137556707712249, + "grad_norm": 0.05361280217766762, + "learning_rate": 0.00010347000315045683, + "loss": 0.2763, + "step": 26386 + }, + { + "epoch": 2.137637718729747, + "grad_norm": 0.04924359917640686, + "learning_rate": 0.0001034655024978622, + "loss": 0.224, + "step": 26387 + }, + { + "epoch": 2.1377187297472457, + "grad_norm": 0.05773067846894264, + "learning_rate": 0.00010346100184526757, + "loss": 0.2382, + "step": 26388 + }, + { + "epoch": 2.137799740764744, + "grad_norm": 0.07036109268665314, + "learning_rate": 0.00010345650119267294, + "loss": 0.2489, + "step": 26389 + }, + { + "epoch": 2.1378807517822422, + "grad_norm": 0.06863632053136826, + "learning_rate": 0.00010345200054007832, + "loss": 0.2514, + "step": 26390 + }, + { + "epoch": 2.137961762799741, + "grad_norm": 0.07008032500743866, + "learning_rate": 0.0001034474998874837, + "loss": 0.2615, + "step": 26391 + }, + { + "epoch": 2.138042773817239, + "grad_norm": 0.05886336788535118, + "learning_rate": 0.00010344299923488908, + "loss": 0.2667, + "step": 26392 + }, + { + "epoch": 2.1381237848347374, + "grad_norm": 0.06166012957692146, + "learning_rate": 0.00010343849858229444, + "loss": 0.2622, + "step": 26393 + }, + { + "epoch": 2.138204795852236, + "grad_norm": 0.06474199146032333, + "learning_rate": 0.00010343399792969981, + "loss": 0.2691, + "step": 26394 + }, + { + "epoch": 2.1382858068697344, + "grad_norm": 0.06455071270465851, + "learning_rate": 0.00010342949727710518, + "loss": 0.2353, + "step": 26395 + }, + { + "epoch": 2.1383668178872326, + "grad_norm": 0.06218364089727402, + "learning_rate": 0.00010342499662451056, + "loss": 0.2311, + "step": 26396 + }, + { + "epoch": 2.138447828904731, + "grad_norm": 0.06683358550071716, + "learning_rate": 0.00010342049597191594, + "loss": 0.2628, + "step": 26397 + }, + { + "epoch": 2.1385288399222295, + "grad_norm": 0.061113521456718445, + "learning_rate": 0.00010341599531932132, + "loss": 0.2618, + "step": 26398 + }, + { + "epoch": 2.1386098509397278, + "grad_norm": 0.047484900802373886, + "learning_rate": 0.00010341149466672668, + "loss": 0.2075, + "step": 26399 + }, + { + "epoch": 2.138690861957226, + "grad_norm": 0.07419335097074509, + "learning_rate": 0.00010340699401413205, + "loss": 0.2428, + "step": 26400 + }, + { + "epoch": 2.1387718729747247, + "grad_norm": 0.0658346489071846, + "learning_rate": 0.00010340249336153743, + "loss": 0.256, + "step": 26401 + }, + { + "epoch": 2.138852883992223, + "grad_norm": 0.061048831790685654, + "learning_rate": 0.0001033979927089428, + "loss": 0.2456, + "step": 26402 + }, + { + "epoch": 2.138933895009721, + "grad_norm": 0.06434381008148193, + "learning_rate": 0.00010339349205634819, + "loss": 0.2481, + "step": 26403 + }, + { + "epoch": 2.13901490602722, + "grad_norm": 0.07088811695575714, + "learning_rate": 0.00010338899140375356, + "loss": 0.2486, + "step": 26404 + }, + { + "epoch": 2.139095917044718, + "grad_norm": 0.06179327517747879, + "learning_rate": 0.00010338449075115892, + "loss": 0.2761, + "step": 26405 + }, + { + "epoch": 2.1391769280622164, + "grad_norm": 0.07120385766029358, + "learning_rate": 0.00010337999009856429, + "loss": 0.2839, + "step": 26406 + }, + { + "epoch": 2.139257939079715, + "grad_norm": 0.05768730491399765, + "learning_rate": 0.00010337548944596967, + "loss": 0.248, + "step": 26407 + }, + { + "epoch": 2.1393389500972133, + "grad_norm": 
0.06576817482709885, + "learning_rate": 0.00010337098879337504, + "loss": 0.2627, + "step": 26408 + }, + { + "epoch": 2.1394199611147116, + "grad_norm": 0.056955333799123764, + "learning_rate": 0.00010336648814078043, + "loss": 0.2858, + "step": 26409 + }, + { + "epoch": 2.13950097213221, + "grad_norm": 0.0679904893040657, + "learning_rate": 0.0001033619874881858, + "loss": 0.2467, + "step": 26410 + }, + { + "epoch": 2.1395819831497085, + "grad_norm": 0.0797104611992836, + "learning_rate": 0.00010335748683559116, + "loss": 0.2559, + "step": 26411 + }, + { + "epoch": 2.1396629941672067, + "grad_norm": 0.05416597053408623, + "learning_rate": 0.00010335298618299653, + "loss": 0.232, + "step": 26412 + }, + { + "epoch": 2.139744005184705, + "grad_norm": 0.05993930995464325, + "learning_rate": 0.00010334848553040191, + "loss": 0.2279, + "step": 26413 + }, + { + "epoch": 2.1398250162022037, + "grad_norm": 0.05330348759889603, + "learning_rate": 0.00010334398487780728, + "loss": 0.2933, + "step": 26414 + }, + { + "epoch": 2.139906027219702, + "grad_norm": 0.06839060038328171, + "learning_rate": 0.00010333948422521267, + "loss": 0.2748, + "step": 26415 + }, + { + "epoch": 2.1399870382372, + "grad_norm": 0.0675363838672638, + "learning_rate": 0.00010333498357261804, + "loss": 0.2523, + "step": 26416 + }, + { + "epoch": 2.140068049254699, + "grad_norm": 0.07417375594377518, + "learning_rate": 0.0001033304829200234, + "loss": 0.2739, + "step": 26417 + }, + { + "epoch": 2.140149060272197, + "grad_norm": 0.0644654631614685, + "learning_rate": 0.00010332598226742878, + "loss": 0.2708, + "step": 26418 + }, + { + "epoch": 2.1402300712896953, + "grad_norm": 0.061722222715616226, + "learning_rate": 0.00010332148161483415, + "loss": 0.2747, + "step": 26419 + }, + { + "epoch": 2.1403110823071936, + "grad_norm": 0.05808354541659355, + "learning_rate": 0.00010331698096223954, + "loss": 0.2263, + "step": 26420 + }, + { + "epoch": 2.1403920933246923, + "grad_norm": 0.04819166660308838, + "learning_rate": 0.00010331248030964491, + "loss": 0.2457, + "step": 26421 + }, + { + "epoch": 2.1404731043421905, + "grad_norm": 0.059980083256959915, + "learning_rate": 0.00010330797965705028, + "loss": 0.2476, + "step": 26422 + }, + { + "epoch": 2.1405541153596888, + "grad_norm": 0.05877969041466713, + "learning_rate": 0.00010330347900445564, + "loss": 0.2365, + "step": 26423 + }, + { + "epoch": 2.1406351263771874, + "grad_norm": 0.055677689611911774, + "learning_rate": 0.00010329897835186102, + "loss": 0.1986, + "step": 26424 + }, + { + "epoch": 2.1407161373946857, + "grad_norm": 0.08751609921455383, + "learning_rate": 0.00010329447769926639, + "loss": 0.2867, + "step": 26425 + }, + { + "epoch": 2.140797148412184, + "grad_norm": 0.06934158504009247, + "learning_rate": 0.00010328997704667178, + "loss": 0.2596, + "step": 26426 + }, + { + "epoch": 2.1408781594296826, + "grad_norm": 0.06514386087656021, + "learning_rate": 0.00010328547639407715, + "loss": 0.2662, + "step": 26427 + }, + { + "epoch": 2.140959170447181, + "grad_norm": 0.06309270858764648, + "learning_rate": 0.00010328097574148253, + "loss": 0.2795, + "step": 26428 + }, + { + "epoch": 2.141040181464679, + "grad_norm": 0.0683743879199028, + "learning_rate": 0.00010327647508888789, + "loss": 0.3018, + "step": 26429 + }, + { + "epoch": 2.141121192482178, + "grad_norm": 0.04728134348988533, + "learning_rate": 0.00010327197443629326, + "loss": 0.2146, + "step": 26430 + }, + { + "epoch": 2.141202203499676, + "grad_norm": 0.05591541528701782, + "learning_rate": 
0.00010326747378369863, + "loss": 0.2513, + "step": 26431 + }, + { + "epoch": 2.1412832145171743, + "grad_norm": 0.04973398149013519, + "learning_rate": 0.00010326297313110402, + "loss": 0.2232, + "step": 26432 + }, + { + "epoch": 2.1413642255346725, + "grad_norm": 0.05986110493540764, + "learning_rate": 0.0001032584724785094, + "loss": 0.254, + "step": 26433 + }, + { + "epoch": 2.1414452365521712, + "grad_norm": 0.06611759215593338, + "learning_rate": 0.00010325397182591477, + "loss": 0.2275, + "step": 26434 + }, + { + "epoch": 2.1415262475696695, + "grad_norm": 0.06158333644270897, + "learning_rate": 0.00010324947117332013, + "loss": 0.2348, + "step": 26435 + }, + { + "epoch": 2.1416072585871677, + "grad_norm": 0.06677713990211487, + "learning_rate": 0.0001032449705207255, + "loss": 0.253, + "step": 26436 + }, + { + "epoch": 2.1416882696046664, + "grad_norm": 0.05785349756479263, + "learning_rate": 0.00010324046986813087, + "loss": 0.252, + "step": 26437 + }, + { + "epoch": 2.1417692806221647, + "grad_norm": 0.0526701845228672, + "learning_rate": 0.00010323596921553626, + "loss": 0.2633, + "step": 26438 + }, + { + "epoch": 2.141850291639663, + "grad_norm": 0.06457501649856567, + "learning_rate": 0.00010323146856294164, + "loss": 0.2287, + "step": 26439 + }, + { + "epoch": 2.141931302657161, + "grad_norm": 0.06768237799406052, + "learning_rate": 0.00010322696791034701, + "loss": 0.2505, + "step": 26440 + }, + { + "epoch": 2.14201231367466, + "grad_norm": 0.05725770443677902, + "learning_rate": 0.00010322246725775237, + "loss": 0.275, + "step": 26441 + }, + { + "epoch": 2.142093324692158, + "grad_norm": 0.06118274852633476, + "learning_rate": 0.00010321796660515774, + "loss": 0.2306, + "step": 26442 + }, + { + "epoch": 2.1421743357096563, + "grad_norm": 0.07188738137483597, + "learning_rate": 0.00010321346595256314, + "loss": 0.3041, + "step": 26443 + }, + { + "epoch": 2.142255346727155, + "grad_norm": 0.07297833263874054, + "learning_rate": 0.0001032089652999685, + "loss": 0.2598, + "step": 26444 + }, + { + "epoch": 2.1423363577446533, + "grad_norm": 0.0646425187587738, + "learning_rate": 0.00010320446464737388, + "loss": 0.2422, + "step": 26445 + }, + { + "epoch": 2.1424173687621515, + "grad_norm": 0.05819808691740036, + "learning_rate": 0.00010319996399477925, + "loss": 0.2433, + "step": 26446 + }, + { + "epoch": 2.14249837977965, + "grad_norm": 0.060123011469841, + "learning_rate": 0.00010319546334218461, + "loss": 0.22, + "step": 26447 + }, + { + "epoch": 2.1425793907971484, + "grad_norm": 0.054031722247600555, + "learning_rate": 0.00010319096268958998, + "loss": 0.2528, + "step": 26448 + }, + { + "epoch": 2.1426604018146467, + "grad_norm": 0.05365290492773056, + "learning_rate": 0.00010318646203699539, + "loss": 0.2528, + "step": 26449 + }, + { + "epoch": 2.1427414128321454, + "grad_norm": 0.06101183220744133, + "learning_rate": 0.00010318196138440075, + "loss": 0.2365, + "step": 26450 + }, + { + "epoch": 2.1428224238496436, + "grad_norm": 0.06819365918636322, + "learning_rate": 0.00010317746073180612, + "loss": 0.2122, + "step": 26451 + }, + { + "epoch": 2.142903434867142, + "grad_norm": 0.05841772258281708, + "learning_rate": 0.00010317296007921149, + "loss": 0.2207, + "step": 26452 + }, + { + "epoch": 2.1429844458846405, + "grad_norm": 0.05575397238135338, + "learning_rate": 0.00010316845942661685, + "loss": 0.2278, + "step": 26453 + }, + { + "epoch": 2.143065456902139, + "grad_norm": 0.0675593689084053, + "learning_rate": 0.00010316395877402223, + "loss": 0.241, + "step": 26454 + 
}, + { + "epoch": 2.143146467919637, + "grad_norm": 0.06064525246620178, + "learning_rate": 0.00010315945812142763, + "loss": 0.2345, + "step": 26455 + }, + { + "epoch": 2.1432274789371353, + "grad_norm": 0.06661902368068695, + "learning_rate": 0.00010315495746883299, + "loss": 0.2721, + "step": 26456 + }, + { + "epoch": 2.143308489954634, + "grad_norm": 0.05384773388504982, + "learning_rate": 0.00010315045681623836, + "loss": 0.2424, + "step": 26457 + }, + { + "epoch": 2.143389500972132, + "grad_norm": 0.0716564953327179, + "learning_rate": 0.00010314595616364373, + "loss": 0.2887, + "step": 26458 + }, + { + "epoch": 2.1434705119896305, + "grad_norm": 0.06959453225135803, + "learning_rate": 0.00010314145551104911, + "loss": 0.2666, + "step": 26459 + }, + { + "epoch": 2.143551523007129, + "grad_norm": 0.06140555441379547, + "learning_rate": 0.00010313695485845447, + "loss": 0.2526, + "step": 26460 + }, + { + "epoch": 2.1436325340246274, + "grad_norm": 0.06028158217668533, + "learning_rate": 0.00010313245420585987, + "loss": 0.2576, + "step": 26461 + }, + { + "epoch": 2.1437135450421256, + "grad_norm": 0.11063975095748901, + "learning_rate": 0.00010312795355326523, + "loss": 0.2897, + "step": 26462 + }, + { + "epoch": 2.143794556059624, + "grad_norm": 0.06059414893388748, + "learning_rate": 0.0001031234529006706, + "loss": 0.2898, + "step": 26463 + }, + { + "epoch": 2.1438755670771226, + "grad_norm": 0.06326915323734283, + "learning_rate": 0.00010311895224807598, + "loss": 0.2594, + "step": 26464 + }, + { + "epoch": 2.143956578094621, + "grad_norm": 0.053890589624643326, + "learning_rate": 0.00010311445159548135, + "loss": 0.2303, + "step": 26465 + }, + { + "epoch": 2.144037589112119, + "grad_norm": 0.0587117001414299, + "learning_rate": 0.00010310995094288671, + "loss": 0.3032, + "step": 26466 + }, + { + "epoch": 2.1441186001296177, + "grad_norm": 0.06277912855148315, + "learning_rate": 0.00010310545029029211, + "loss": 0.2732, + "step": 26467 + }, + { + "epoch": 2.144199611147116, + "grad_norm": 0.06795115768909454, + "learning_rate": 0.00010310094963769747, + "loss": 0.2623, + "step": 26468 + }, + { + "epoch": 2.1442806221646142, + "grad_norm": 0.05775567889213562, + "learning_rate": 0.00010309644898510284, + "loss": 0.2579, + "step": 26469 + }, + { + "epoch": 2.144361633182113, + "grad_norm": 0.07728058099746704, + "learning_rate": 0.00010309194833250822, + "loss": 0.2639, + "step": 26470 + }, + { + "epoch": 2.144442644199611, + "grad_norm": 0.06779685616493225, + "learning_rate": 0.00010308744767991359, + "loss": 0.2606, + "step": 26471 + }, + { + "epoch": 2.1445236552171094, + "grad_norm": 0.058312151581048965, + "learning_rate": 0.00010308294702731898, + "loss": 0.2612, + "step": 26472 + }, + { + "epoch": 2.144604666234608, + "grad_norm": 0.07041696459054947, + "learning_rate": 0.00010307844637472435, + "loss": 0.2735, + "step": 26473 + }, + { + "epoch": 2.1446856772521063, + "grad_norm": 0.06928315758705139, + "learning_rate": 0.00010307394572212971, + "loss": 0.2664, + "step": 26474 + }, + { + "epoch": 2.1447666882696046, + "grad_norm": 0.061312656849622726, + "learning_rate": 0.00010306944506953509, + "loss": 0.3004, + "step": 26475 + }, + { + "epoch": 2.144847699287103, + "grad_norm": 0.0498967207968235, + "learning_rate": 0.00010306494441694046, + "loss": 0.2628, + "step": 26476 + }, + { + "epoch": 2.1449287103046015, + "grad_norm": 0.05547580495476723, + "learning_rate": 0.00010306044376434583, + "loss": 0.2206, + "step": 26477 + }, + { + "epoch": 2.1450097213220998, + 
"grad_norm": 0.06494994461536407, + "learning_rate": 0.00010305594311175122, + "loss": 0.2589, + "step": 26478 + }, + { + "epoch": 2.145090732339598, + "grad_norm": 0.09350940585136414, + "learning_rate": 0.0001030514424591566, + "loss": 0.2807, + "step": 26479 + }, + { + "epoch": 2.1451717433570967, + "grad_norm": 0.07217366248369217, + "learning_rate": 0.00010304694180656195, + "loss": 0.2563, + "step": 26480 + }, + { + "epoch": 2.145252754374595, + "grad_norm": 0.06400761753320694, + "learning_rate": 0.00010304244115396733, + "loss": 0.2625, + "step": 26481 + }, + { + "epoch": 2.145333765392093, + "grad_norm": 0.07066665589809418, + "learning_rate": 0.0001030379405013727, + "loss": 0.2737, + "step": 26482 + }, + { + "epoch": 2.145414776409592, + "grad_norm": 0.07438008487224579, + "learning_rate": 0.00010303343984877807, + "loss": 0.2685, + "step": 26483 + }, + { + "epoch": 2.14549578742709, + "grad_norm": 0.07420582324266434, + "learning_rate": 0.00010302893919618346, + "loss": 0.2386, + "step": 26484 + }, + { + "epoch": 2.1455767984445884, + "grad_norm": 0.06442619115114212, + "learning_rate": 0.00010302443854358883, + "loss": 0.2382, + "step": 26485 + }, + { + "epoch": 2.1456578094620866, + "grad_norm": 0.05589266121387482, + "learning_rate": 0.0001030199378909942, + "loss": 0.2886, + "step": 26486 + }, + { + "epoch": 2.1457388204795853, + "grad_norm": 0.05380919948220253, + "learning_rate": 0.00010301543723839957, + "loss": 0.2513, + "step": 26487 + }, + { + "epoch": 2.1458198314970836, + "grad_norm": 0.07141850143671036, + "learning_rate": 0.00010301093658580494, + "loss": 0.2867, + "step": 26488 + }, + { + "epoch": 2.145900842514582, + "grad_norm": 0.074470154941082, + "learning_rate": 0.00010300643593321032, + "loss": 0.2819, + "step": 26489 + }, + { + "epoch": 2.1459818535320805, + "grad_norm": 0.05928831547498703, + "learning_rate": 0.0001030019352806157, + "loss": 0.2699, + "step": 26490 + }, + { + "epoch": 2.1460628645495787, + "grad_norm": 0.06617609411478043, + "learning_rate": 0.00010299743462802108, + "loss": 0.229, + "step": 26491 + }, + { + "epoch": 2.146143875567077, + "grad_norm": 0.0654638484120369, + "learning_rate": 0.00010299293397542644, + "loss": 0.258, + "step": 26492 + }, + { + "epoch": 2.1462248865845757, + "grad_norm": 0.06313467770814896, + "learning_rate": 0.00010298843332283181, + "loss": 0.2635, + "step": 26493 + }, + { + "epoch": 2.146305897602074, + "grad_norm": 0.061352573335170746, + "learning_rate": 0.00010298393267023718, + "loss": 0.2785, + "step": 26494 + }, + { + "epoch": 2.146386908619572, + "grad_norm": 0.05656629800796509, + "learning_rate": 0.00010297943201764257, + "loss": 0.2786, + "step": 26495 + }, + { + "epoch": 2.146467919637071, + "grad_norm": 0.0668196976184845, + "learning_rate": 0.00010297493136504794, + "loss": 0.2822, + "step": 26496 + }, + { + "epoch": 2.146548930654569, + "grad_norm": 0.051473237574100494, + "learning_rate": 0.00010297043071245332, + "loss": 0.2299, + "step": 26497 + }, + { + "epoch": 2.1466299416720673, + "grad_norm": 0.0558832623064518, + "learning_rate": 0.00010296593005985868, + "loss": 0.2648, + "step": 26498 + }, + { + "epoch": 2.1467109526895656, + "grad_norm": 0.05708617717027664, + "learning_rate": 0.00010296142940726405, + "loss": 0.2563, + "step": 26499 + }, + { + "epoch": 2.1467919637070643, + "grad_norm": 0.06447821855545044, + "learning_rate": 0.00010295692875466943, + "loss": 0.2614, + "step": 26500 + }, + { + "epoch": 2.1468729747245625, + "grad_norm": 0.07675790041685104, + "learning_rate": 
0.00010295242810207481, + "loss": 0.2717, + "step": 26501 + }, + { + "epoch": 2.1469539857420608, + "grad_norm": 0.05448411777615547, + "learning_rate": 0.00010294792744948019, + "loss": 0.2388, + "step": 26502 + }, + { + "epoch": 2.1470349967595594, + "grad_norm": 0.07981395721435547, + "learning_rate": 0.00010294342679688556, + "loss": 0.2609, + "step": 26503 + }, + { + "epoch": 2.1471160077770577, + "grad_norm": 0.06348752975463867, + "learning_rate": 0.00010293892614429092, + "loss": 0.263, + "step": 26504 + }, + { + "epoch": 2.147197018794556, + "grad_norm": 0.055345065891742706, + "learning_rate": 0.0001029344254916963, + "loss": 0.2106, + "step": 26505 + }, + { + "epoch": 2.1472780298120546, + "grad_norm": 0.05992121621966362, + "learning_rate": 0.00010292992483910167, + "loss": 0.2421, + "step": 26506 + }, + { + "epoch": 2.147359040829553, + "grad_norm": 0.05819815397262573, + "learning_rate": 0.00010292542418650705, + "loss": 0.2634, + "step": 26507 + }, + { + "epoch": 2.147440051847051, + "grad_norm": 0.06262335926294327, + "learning_rate": 0.00010292092353391243, + "loss": 0.2093, + "step": 26508 + }, + { + "epoch": 2.1475210628645494, + "grad_norm": 0.053963132202625275, + "learning_rate": 0.0001029164228813178, + "loss": 0.261, + "step": 26509 + }, + { + "epoch": 2.147602073882048, + "grad_norm": 0.058370329439640045, + "learning_rate": 0.00010291192222872316, + "loss": 0.2638, + "step": 26510 + }, + { + "epoch": 2.1476830848995463, + "grad_norm": 0.04938644543290138, + "learning_rate": 0.00010290742157612854, + "loss": 0.2664, + "step": 26511 + }, + { + "epoch": 2.1477640959170445, + "grad_norm": 0.05646153539419174, + "learning_rate": 0.00010290292092353391, + "loss": 0.223, + "step": 26512 + }, + { + "epoch": 2.1478451069345432, + "grad_norm": 0.06261970847845078, + "learning_rate": 0.0001028984202709393, + "loss": 0.2702, + "step": 26513 + }, + { + "epoch": 2.1479261179520415, + "grad_norm": 0.05951191857457161, + "learning_rate": 0.00010289391961834467, + "loss": 0.2573, + "step": 26514 + }, + { + "epoch": 2.1480071289695397, + "grad_norm": 0.06979864835739136, + "learning_rate": 0.00010288941896575004, + "loss": 0.3194, + "step": 26515 + }, + { + "epoch": 2.1480881399870384, + "grad_norm": 0.05955822020769119, + "learning_rate": 0.0001028849183131554, + "loss": 0.2649, + "step": 26516 + }, + { + "epoch": 2.1481691510045366, + "grad_norm": 0.08112508803606033, + "learning_rate": 0.00010288041766056078, + "loss": 0.309, + "step": 26517 + }, + { + "epoch": 2.148250162022035, + "grad_norm": 0.06836628168821335, + "learning_rate": 0.00010287591700796615, + "loss": 0.2426, + "step": 26518 + }, + { + "epoch": 2.1483311730395336, + "grad_norm": 0.07359662652015686, + "learning_rate": 0.00010287141635537154, + "loss": 0.2346, + "step": 26519 + }, + { + "epoch": 2.148412184057032, + "grad_norm": 0.07387182116508484, + "learning_rate": 0.00010286691570277691, + "loss": 0.2542, + "step": 26520 + }, + { + "epoch": 2.14849319507453, + "grad_norm": 0.07881849259138107, + "learning_rate": 0.00010286241505018228, + "loss": 0.3046, + "step": 26521 + }, + { + "epoch": 2.1485742060920283, + "grad_norm": 0.07013516873121262, + "learning_rate": 0.00010285791439758764, + "loss": 0.2478, + "step": 26522 + }, + { + "epoch": 2.148655217109527, + "grad_norm": 0.07745005935430527, + "learning_rate": 0.00010285341374499302, + "loss": 0.2765, + "step": 26523 + }, + { + "epoch": 2.1487362281270252, + "grad_norm": 0.0691734030842781, + "learning_rate": 0.00010284891309239842, + "loss": 0.2633, + 
"step": 26524 + }, + { + "epoch": 2.1488172391445235, + "grad_norm": 0.05485551059246063, + "learning_rate": 0.00010284441243980378, + "loss": 0.225, + "step": 26525 + }, + { + "epoch": 2.148898250162022, + "grad_norm": 0.065140500664711, + "learning_rate": 0.00010283991178720915, + "loss": 0.2564, + "step": 26526 + }, + { + "epoch": 2.1489792611795204, + "grad_norm": 0.06324749439954758, + "learning_rate": 0.00010283541113461453, + "loss": 0.2274, + "step": 26527 + }, + { + "epoch": 2.1490602721970187, + "grad_norm": 0.05943019315600395, + "learning_rate": 0.0001028309104820199, + "loss": 0.247, + "step": 26528 + }, + { + "epoch": 2.1491412832145174, + "grad_norm": 0.05910013988614082, + "learning_rate": 0.00010282640982942526, + "loss": 0.2751, + "step": 26529 + }, + { + "epoch": 2.1492222942320156, + "grad_norm": 0.06628038734197617, + "learning_rate": 0.00010282190917683066, + "loss": 0.2657, + "step": 26530 + }, + { + "epoch": 2.149303305249514, + "grad_norm": 0.0645565614104271, + "learning_rate": 0.00010281740852423602, + "loss": 0.2534, + "step": 26531 + }, + { + "epoch": 2.149384316267012, + "grad_norm": 0.059102196246385574, + "learning_rate": 0.0001028129078716414, + "loss": 0.2709, + "step": 26532 + }, + { + "epoch": 2.149465327284511, + "grad_norm": 0.08080757409334183, + "learning_rate": 0.00010280840721904677, + "loss": 0.2303, + "step": 26533 + }, + { + "epoch": 2.149546338302009, + "grad_norm": 0.07256803661584854, + "learning_rate": 0.00010280390656645214, + "loss": 0.2381, + "step": 26534 + }, + { + "epoch": 2.1496273493195073, + "grad_norm": 0.06296215951442719, + "learning_rate": 0.0001027994059138575, + "loss": 0.2396, + "step": 26535 + }, + { + "epoch": 2.149708360337006, + "grad_norm": 0.06908629089593887, + "learning_rate": 0.0001027949052612629, + "loss": 0.2381, + "step": 26536 + }, + { + "epoch": 2.149789371354504, + "grad_norm": 0.0535476990044117, + "learning_rate": 0.00010279040460866826, + "loss": 0.2161, + "step": 26537 + }, + { + "epoch": 2.1498703823720025, + "grad_norm": 0.07507485896348953, + "learning_rate": 0.00010278590395607364, + "loss": 0.3161, + "step": 26538 + }, + { + "epoch": 2.149951393389501, + "grad_norm": 0.08611593395471573, + "learning_rate": 0.00010278140330347901, + "loss": 0.252, + "step": 26539 + }, + { + "epoch": 2.1500324044069994, + "grad_norm": 0.06031523272395134, + "learning_rate": 0.00010277690265088438, + "loss": 0.2271, + "step": 26540 + }, + { + "epoch": 2.1501134154244976, + "grad_norm": 0.08460874110460281, + "learning_rate": 0.00010277240199828974, + "loss": 0.2427, + "step": 26541 + }, + { + "epoch": 2.1501944264419963, + "grad_norm": 0.06741653382778168, + "learning_rate": 0.00010276790134569514, + "loss": 0.2594, + "step": 26542 + }, + { + "epoch": 2.1502754374594946, + "grad_norm": 0.05998913198709488, + "learning_rate": 0.0001027634006931005, + "loss": 0.2479, + "step": 26543 + }, + { + "epoch": 2.150356448476993, + "grad_norm": 0.05123647302389145, + "learning_rate": 0.00010275890004050588, + "loss": 0.2636, + "step": 26544 + }, + { + "epoch": 2.150437459494491, + "grad_norm": 0.062051501125097275, + "learning_rate": 0.00010275439938791125, + "loss": 0.2531, + "step": 26545 + }, + { + "epoch": 2.1505184705119897, + "grad_norm": 0.06481350213289261, + "learning_rate": 0.00010274989873531662, + "loss": 0.2322, + "step": 26546 + }, + { + "epoch": 2.150599481529488, + "grad_norm": 0.06562846153974533, + "learning_rate": 0.00010274539808272198, + "loss": 0.2733, + "step": 26547 + }, + { + "epoch": 2.1506804925469862, + 
"grad_norm": 0.0631428137421608, + "learning_rate": 0.00010274089743012739, + "loss": 0.2787, + "step": 26548 + }, + { + "epoch": 2.150761503564485, + "grad_norm": 0.07828080654144287, + "learning_rate": 0.00010273639677753275, + "loss": 0.2352, + "step": 26549 + }, + { + "epoch": 2.150842514581983, + "grad_norm": 0.06805264949798584, + "learning_rate": 0.00010273189612493812, + "loss": 0.2428, + "step": 26550 + }, + { + "epoch": 2.1509235255994814, + "grad_norm": 0.06137997657060623, + "learning_rate": 0.00010272739547234349, + "loss": 0.255, + "step": 26551 + }, + { + "epoch": 2.15100453661698, + "grad_norm": 0.07119157165288925, + "learning_rate": 0.00010272289481974887, + "loss": 0.3016, + "step": 26552 + }, + { + "epoch": 2.1510855476344783, + "grad_norm": 0.06857890635728836, + "learning_rate": 0.00010271839416715425, + "loss": 0.2899, + "step": 26553 + }, + { + "epoch": 2.1511665586519766, + "grad_norm": 0.048014085739851, + "learning_rate": 0.00010271389351455963, + "loss": 0.2641, + "step": 26554 + }, + { + "epoch": 2.151247569669475, + "grad_norm": 0.07205279171466827, + "learning_rate": 0.00010270939286196499, + "loss": 0.2434, + "step": 26555 + }, + { + "epoch": 2.1513285806869735, + "grad_norm": 0.07112793624401093, + "learning_rate": 0.00010270489220937036, + "loss": 0.3006, + "step": 26556 + }, + { + "epoch": 2.1514095917044718, + "grad_norm": 0.06290469318628311, + "learning_rate": 0.00010270039155677573, + "loss": 0.2626, + "step": 26557 + }, + { + "epoch": 2.15149060272197, + "grad_norm": 0.055432796478271484, + "learning_rate": 0.00010269589090418111, + "loss": 0.255, + "step": 26558 + }, + { + "epoch": 2.1515716137394687, + "grad_norm": 0.05841575935482979, + "learning_rate": 0.0001026913902515865, + "loss": 0.285, + "step": 26559 + }, + { + "epoch": 2.151652624756967, + "grad_norm": 0.06193108111619949, + "learning_rate": 0.00010268688959899187, + "loss": 0.2864, + "step": 26560 + }, + { + "epoch": 2.151733635774465, + "grad_norm": 0.0693509429693222, + "learning_rate": 0.00010268238894639723, + "loss": 0.2604, + "step": 26561 + }, + { + "epoch": 2.151814646791964, + "grad_norm": 0.061149440705776215, + "learning_rate": 0.0001026778882938026, + "loss": 0.2691, + "step": 26562 + }, + { + "epoch": 2.151895657809462, + "grad_norm": 0.06805266439914703, + "learning_rate": 0.00010267338764120798, + "loss": 0.3228, + "step": 26563 + }, + { + "epoch": 2.1519766688269604, + "grad_norm": 0.05401428043842316, + "learning_rate": 0.00010266888698861335, + "loss": 0.2867, + "step": 26564 + }, + { + "epoch": 2.152057679844459, + "grad_norm": 0.06017407029867172, + "learning_rate": 0.00010266438633601874, + "loss": 0.2303, + "step": 26565 + }, + { + "epoch": 2.1521386908619573, + "grad_norm": 0.06361527740955353, + "learning_rate": 0.00010265988568342411, + "loss": 0.2976, + "step": 26566 + }, + { + "epoch": 2.1522197018794555, + "grad_norm": 0.06338279694318771, + "learning_rate": 0.00010265538503082947, + "loss": 0.2583, + "step": 26567 + }, + { + "epoch": 2.152300712896954, + "grad_norm": 0.0681900903582573, + "learning_rate": 0.00010265088437823484, + "loss": 0.2604, + "step": 26568 + }, + { + "epoch": 2.1523817239144525, + "grad_norm": 0.06779904663562775, + "learning_rate": 0.00010264638372564022, + "loss": 0.2554, + "step": 26569 + }, + { + "epoch": 2.1524627349319507, + "grad_norm": 0.0530104897916317, + "learning_rate": 0.00010264188307304559, + "loss": 0.2325, + "step": 26570 + }, + { + "epoch": 2.152543745949449, + "grad_norm": 0.07330182939767838, + "learning_rate": 
0.00010263738242045098, + "loss": 0.2599, + "step": 26571 + }, + { + "epoch": 2.1526247569669477, + "grad_norm": 0.06863976269960403, + "learning_rate": 0.00010263288176785635, + "loss": 0.2425, + "step": 26572 + }, + { + "epoch": 2.152705767984446, + "grad_norm": 0.0714733898639679, + "learning_rate": 0.00010262838111526171, + "loss": 0.2421, + "step": 26573 + }, + { + "epoch": 2.152786779001944, + "grad_norm": 0.06109316647052765, + "learning_rate": 0.00010262388046266709, + "loss": 0.2359, + "step": 26574 + }, + { + "epoch": 2.152867790019443, + "grad_norm": 0.06053869053721428, + "learning_rate": 0.00010261937981007246, + "loss": 0.2656, + "step": 26575 + }, + { + "epoch": 2.152948801036941, + "grad_norm": 0.06158290058374405, + "learning_rate": 0.00010261487915747785, + "loss": 0.2442, + "step": 26576 + }, + { + "epoch": 2.1530298120544393, + "grad_norm": 0.05543677508831024, + "learning_rate": 0.00010261037850488322, + "loss": 0.215, + "step": 26577 + }, + { + "epoch": 2.1531108230719376, + "grad_norm": 0.05953080579638481, + "learning_rate": 0.0001026058778522886, + "loss": 0.2935, + "step": 26578 + }, + { + "epoch": 2.1531918340894363, + "grad_norm": 0.06894619017839432, + "learning_rate": 0.00010260137719969395, + "loss": 0.2454, + "step": 26579 + }, + { + "epoch": 2.1532728451069345, + "grad_norm": 0.06742026656866074, + "learning_rate": 0.00010259687654709933, + "loss": 0.231, + "step": 26580 + }, + { + "epoch": 2.1533538561244328, + "grad_norm": 0.056998420506715775, + "learning_rate": 0.0001025923758945047, + "loss": 0.2384, + "step": 26581 + }, + { + "epoch": 2.1534348671419314, + "grad_norm": 0.06900626420974731, + "learning_rate": 0.00010258787524191009, + "loss": 0.2387, + "step": 26582 + }, + { + "epoch": 2.1535158781594297, + "grad_norm": 0.05155860260128975, + "learning_rate": 0.00010258337458931546, + "loss": 0.2394, + "step": 26583 + }, + { + "epoch": 2.153596889176928, + "grad_norm": 0.07202799618244171, + "learning_rate": 0.00010257887393672084, + "loss": 0.2856, + "step": 26584 + }, + { + "epoch": 2.1536779001944266, + "grad_norm": 0.05389600247144699, + "learning_rate": 0.0001025743732841262, + "loss": 0.2319, + "step": 26585 + }, + { + "epoch": 2.153758911211925, + "grad_norm": 0.06073429808020592, + "learning_rate": 0.00010256987263153157, + "loss": 0.2352, + "step": 26586 + }, + { + "epoch": 2.153839922229423, + "grad_norm": 0.07475744187831879, + "learning_rate": 0.00010256537197893694, + "loss": 0.263, + "step": 26587 + }, + { + "epoch": 2.153920933246922, + "grad_norm": 0.056003883481025696, + "learning_rate": 0.00010256087132634233, + "loss": 0.2163, + "step": 26588 + }, + { + "epoch": 2.15400194426442, + "grad_norm": 0.07948049902915955, + "learning_rate": 0.0001025563706737477, + "loss": 0.2713, + "step": 26589 + }, + { + "epoch": 2.1540829552819183, + "grad_norm": 0.05557125434279442, + "learning_rate": 0.00010255187002115308, + "loss": 0.2532, + "step": 26590 + }, + { + "epoch": 2.1541639662994165, + "grad_norm": 0.07005944103002548, + "learning_rate": 0.00010254736936855844, + "loss": 0.2673, + "step": 26591 + }, + { + "epoch": 2.154244977316915, + "grad_norm": 0.06999240815639496, + "learning_rate": 0.00010254286871596381, + "loss": 0.2479, + "step": 26592 + }, + { + "epoch": 2.1543259883344135, + "grad_norm": 0.061392951756715775, + "learning_rate": 0.00010253836806336918, + "loss": 0.2824, + "step": 26593 + }, + { + "epoch": 2.1544069993519117, + "grad_norm": 0.05862107500433922, + "learning_rate": 0.00010253386741077457, + "loss": 0.254, + "step": 
26594 + }, + { + "epoch": 2.1544880103694104, + "grad_norm": 0.055120065808296204, + "learning_rate": 0.00010252936675817994, + "loss": 0.2826, + "step": 26595 + }, + { + "epoch": 2.1545690213869086, + "grad_norm": 0.05726516246795654, + "learning_rate": 0.00010252486610558532, + "loss": 0.258, + "step": 26596 + }, + { + "epoch": 2.154650032404407, + "grad_norm": 0.06028033047914505, + "learning_rate": 0.00010252036545299069, + "loss": 0.2468, + "step": 26597 + }, + { + "epoch": 2.1547310434219056, + "grad_norm": 0.06699507683515549, + "learning_rate": 0.00010251586480039605, + "loss": 0.2881, + "step": 26598 + }, + { + "epoch": 2.154812054439404, + "grad_norm": 0.056645482778549194, + "learning_rate": 0.00010251136414780143, + "loss": 0.245, + "step": 26599 + }, + { + "epoch": 2.154893065456902, + "grad_norm": 0.06294909864664078, + "learning_rate": 0.00010250686349520681, + "loss": 0.2676, + "step": 26600 + }, + { + "epoch": 2.1549740764744003, + "grad_norm": 0.05808530002832413, + "learning_rate": 0.00010250236284261219, + "loss": 0.2408, + "step": 26601 + }, + { + "epoch": 2.155055087491899, + "grad_norm": 0.056637249886989594, + "learning_rate": 0.00010249786219001756, + "loss": 0.2518, + "step": 26602 + }, + { + "epoch": 2.1551360985093972, + "grad_norm": 0.056968703866004944, + "learning_rate": 0.00010249336153742293, + "loss": 0.2357, + "step": 26603 + }, + { + "epoch": 2.1552171095268955, + "grad_norm": 0.06403225660324097, + "learning_rate": 0.0001024888608848283, + "loss": 0.2996, + "step": 26604 + }, + { + "epoch": 2.155298120544394, + "grad_norm": 0.05986631661653519, + "learning_rate": 0.0001024843602322337, + "loss": 0.247, + "step": 26605 + }, + { + "epoch": 2.1553791315618924, + "grad_norm": 0.06378137320280075, + "learning_rate": 0.00010247985957963905, + "loss": 0.2541, + "step": 26606 + }, + { + "epoch": 2.1554601425793907, + "grad_norm": 0.053147122263908386, + "learning_rate": 0.00010247535892704443, + "loss": 0.2254, + "step": 26607 + }, + { + "epoch": 2.1555411535968894, + "grad_norm": 0.06405290216207504, + "learning_rate": 0.0001024708582744498, + "loss": 0.2534, + "step": 26608 + }, + { + "epoch": 2.1556221646143876, + "grad_norm": 0.06678581237792969, + "learning_rate": 0.00010246635762185518, + "loss": 0.2812, + "step": 26609 + }, + { + "epoch": 2.155703175631886, + "grad_norm": 0.08605819195508957, + "learning_rate": 0.00010246185696926054, + "loss": 0.2717, + "step": 26610 + }, + { + "epoch": 2.1557841866493845, + "grad_norm": 0.07971778512001038, + "learning_rate": 0.00010245735631666594, + "loss": 0.2643, + "step": 26611 + }, + { + "epoch": 2.155865197666883, + "grad_norm": 0.06603200733661652, + "learning_rate": 0.0001024528556640713, + "loss": 0.2387, + "step": 26612 + }, + { + "epoch": 2.155946208684381, + "grad_norm": 0.060673121362924576, + "learning_rate": 0.00010244835501147667, + "loss": 0.2346, + "step": 26613 + }, + { + "epoch": 2.1560272197018793, + "grad_norm": 0.060722626745700836, + "learning_rate": 0.00010244385435888204, + "loss": 0.2713, + "step": 26614 + }, + { + "epoch": 2.156108230719378, + "grad_norm": 0.06248776987195015, + "learning_rate": 0.00010243935370628742, + "loss": 0.2609, + "step": 26615 + }, + { + "epoch": 2.156189241736876, + "grad_norm": 0.05779466778039932, + "learning_rate": 0.00010243485305369278, + "loss": 0.2544, + "step": 26616 + }, + { + "epoch": 2.1562702527543745, + "grad_norm": 0.07016555219888687, + "learning_rate": 0.00010243035240109818, + "loss": 0.2636, + "step": 26617 + }, + { + "epoch": 2.156351263771873, 
+ "grad_norm": 0.08269957453012466, + "learning_rate": 0.00010242585174850354, + "loss": 0.2469, + "step": 26618 + }, + { + "epoch": 2.1564322747893714, + "grad_norm": 0.05582574009895325, + "learning_rate": 0.00010242135109590891, + "loss": 0.2708, + "step": 26619 + }, + { + "epoch": 2.1565132858068696, + "grad_norm": 0.0636686459183693, + "learning_rate": 0.00010241685044331428, + "loss": 0.2627, + "step": 26620 + }, + { + "epoch": 2.1565942968243683, + "grad_norm": 0.06832555681467056, + "learning_rate": 0.00010241234979071966, + "loss": 0.2596, + "step": 26621 + }, + { + "epoch": 2.1566753078418666, + "grad_norm": 0.06222445145249367, + "learning_rate": 0.00010240784913812502, + "loss": 0.2496, + "step": 26622 + }, + { + "epoch": 2.156756318859365, + "grad_norm": 0.07057000696659088, + "learning_rate": 0.00010240334848553042, + "loss": 0.2579, + "step": 26623 + }, + { + "epoch": 2.156837329876863, + "grad_norm": 0.057806190103292465, + "learning_rate": 0.00010239884783293578, + "loss": 0.2485, + "step": 26624 + }, + { + "epoch": 2.1569183408943617, + "grad_norm": 0.05468752607703209, + "learning_rate": 0.00010239434718034115, + "loss": 0.2532, + "step": 26625 + }, + { + "epoch": 2.15699935191186, + "grad_norm": 0.05896284431219101, + "learning_rate": 0.00010238984652774653, + "loss": 0.2606, + "step": 26626 + }, + { + "epoch": 2.1570803629293582, + "grad_norm": 0.062181517481803894, + "learning_rate": 0.0001023853458751519, + "loss": 0.2601, + "step": 26627 + }, + { + "epoch": 2.157161373946857, + "grad_norm": 0.0685560554265976, + "learning_rate": 0.00010238084522255729, + "loss": 0.274, + "step": 26628 + }, + { + "epoch": 2.157242384964355, + "grad_norm": 0.0639854297041893, + "learning_rate": 0.00010237634456996266, + "loss": 0.229, + "step": 26629 + }, + { + "epoch": 2.1573233959818534, + "grad_norm": 0.06651315093040466, + "learning_rate": 0.00010237184391736802, + "loss": 0.2389, + "step": 26630 + }, + { + "epoch": 2.157404406999352, + "grad_norm": 0.06652846187353134, + "learning_rate": 0.0001023673432647734, + "loss": 0.2525, + "step": 26631 + }, + { + "epoch": 2.1574854180168503, + "grad_norm": 0.06786283105611801, + "learning_rate": 0.00010236284261217877, + "loss": 0.2838, + "step": 26632 + }, + { + "epoch": 2.1575664290343486, + "grad_norm": 0.06288962066173553, + "learning_rate": 0.00010235834195958414, + "loss": 0.2379, + "step": 26633 + }, + { + "epoch": 2.1576474400518473, + "grad_norm": 0.07239990681409836, + "learning_rate": 0.00010235384130698953, + "loss": 0.2779, + "step": 26634 + }, + { + "epoch": 2.1577284510693455, + "grad_norm": 0.07054921239614487, + "learning_rate": 0.0001023493406543949, + "loss": 0.2498, + "step": 26635 + }, + { + "epoch": 2.1578094620868438, + "grad_norm": 0.07556222379207611, + "learning_rate": 0.00010234484000180026, + "loss": 0.2712, + "step": 26636 + }, + { + "epoch": 2.157890473104342, + "grad_norm": 0.055823683738708496, + "learning_rate": 0.00010234033934920564, + "loss": 0.2631, + "step": 26637 + }, + { + "epoch": 2.1579714841218407, + "grad_norm": 0.061262547969818115, + "learning_rate": 0.00010233583869661101, + "loss": 0.27, + "step": 26638 + }, + { + "epoch": 2.158052495139339, + "grad_norm": 0.07095042616128922, + "learning_rate": 0.00010233133804401638, + "loss": 0.2901, + "step": 26639 + }, + { + "epoch": 2.158133506156837, + "grad_norm": 0.07894306629896164, + "learning_rate": 0.00010232683739142177, + "loss": 0.256, + "step": 26640 + }, + { + "epoch": 2.158214517174336, + "grad_norm": 0.05407718941569328, + 
"learning_rate": 0.00010232233673882714, + "loss": 0.2349, + "step": 26641 + }, + { + "epoch": 2.158295528191834, + "grad_norm": 0.07037380337715149, + "learning_rate": 0.0001023178360862325, + "loss": 0.2731, + "step": 26642 + }, + { + "epoch": 2.1583765392093324, + "grad_norm": 0.04452992230653763, + "learning_rate": 0.00010231333543363788, + "loss": 0.2351, + "step": 26643 + }, + { + "epoch": 2.158457550226831, + "grad_norm": 0.06478511542081833, + "learning_rate": 0.00010230883478104325, + "loss": 0.2517, + "step": 26644 + }, + { + "epoch": 2.1585385612443293, + "grad_norm": 0.061454348266124725, + "learning_rate": 0.00010230433412844863, + "loss": 0.2674, + "step": 26645 + }, + { + "epoch": 2.1586195722618275, + "grad_norm": 0.056620292365550995, + "learning_rate": 0.00010229983347585401, + "loss": 0.239, + "step": 26646 + }, + { + "epoch": 2.158700583279326, + "grad_norm": 0.05957835167646408, + "learning_rate": 0.00010229533282325939, + "loss": 0.2663, + "step": 26647 + }, + { + "epoch": 2.1587815942968245, + "grad_norm": 0.06666881591081619, + "learning_rate": 0.00010229083217066475, + "loss": 0.2544, + "step": 26648 + }, + { + "epoch": 2.1588626053143227, + "grad_norm": 0.06860926747322083, + "learning_rate": 0.00010228633151807012, + "loss": 0.2814, + "step": 26649 + }, + { + "epoch": 2.158943616331821, + "grad_norm": 0.05659700185060501, + "learning_rate": 0.00010228183086547549, + "loss": 0.2478, + "step": 26650 + }, + { + "epoch": 2.1590246273493197, + "grad_norm": 0.04990334063768387, + "learning_rate": 0.00010227733021288087, + "loss": 0.2532, + "step": 26651 + }, + { + "epoch": 2.159105638366818, + "grad_norm": 0.07146997004747391, + "learning_rate": 0.00010227282956028625, + "loss": 0.2496, + "step": 26652 + }, + { + "epoch": 2.159186649384316, + "grad_norm": 0.06514699757099152, + "learning_rate": 0.00010226832890769163, + "loss": 0.255, + "step": 26653 + }, + { + "epoch": 2.159267660401815, + "grad_norm": 0.06386931985616684, + "learning_rate": 0.00010226382825509699, + "loss": 0.2656, + "step": 26654 + }, + { + "epoch": 2.159348671419313, + "grad_norm": 0.05146828293800354, + "learning_rate": 0.00010225932760250236, + "loss": 0.2349, + "step": 26655 + }, + { + "epoch": 2.1594296824368113, + "grad_norm": 0.059927668422460556, + "learning_rate": 0.00010225482694990773, + "loss": 0.2559, + "step": 26656 + }, + { + "epoch": 2.15951069345431, + "grad_norm": 0.05467910319566727, + "learning_rate": 0.00010225032629731312, + "loss": 0.2622, + "step": 26657 + }, + { + "epoch": 2.1595917044718083, + "grad_norm": 0.062487825751304626, + "learning_rate": 0.0001022458256447185, + "loss": 0.2629, + "step": 26658 + }, + { + "epoch": 2.1596727154893065, + "grad_norm": 0.0732332095503807, + "learning_rate": 0.00010224132499212387, + "loss": 0.2707, + "step": 26659 + }, + { + "epoch": 2.1597537265068047, + "grad_norm": 0.06669419258832932, + "learning_rate": 0.00010223682433952923, + "loss": 0.2927, + "step": 26660 + }, + { + "epoch": 2.1598347375243034, + "grad_norm": 0.06436692923307419, + "learning_rate": 0.0001022323236869346, + "loss": 0.2877, + "step": 26661 + }, + { + "epoch": 2.1599157485418017, + "grad_norm": 0.05647365748882294, + "learning_rate": 0.00010222782303433998, + "loss": 0.2812, + "step": 26662 + }, + { + "epoch": 2.1599967595593, + "grad_norm": 0.07081833481788635, + "learning_rate": 0.00010222332238174536, + "loss": 0.2701, + "step": 26663 + }, + { + "epoch": 2.1600777705767986, + "grad_norm": 0.0558331124484539, + "learning_rate": 0.00010221882172915074, + "loss": 
0.2585, + "step": 26664 + }, + { + "epoch": 2.160158781594297, + "grad_norm": 0.07021911442279816, + "learning_rate": 0.00010221432107655611, + "loss": 0.2821, + "step": 26665 + }, + { + "epoch": 2.160239792611795, + "grad_norm": 0.062387652695178986, + "learning_rate": 0.00010220982042396148, + "loss": 0.2167, + "step": 26666 + }, + { + "epoch": 2.1603208036292934, + "grad_norm": 0.0702107697725296, + "learning_rate": 0.00010220531977136684, + "loss": 0.239, + "step": 26667 + }, + { + "epoch": 2.160401814646792, + "grad_norm": 0.0710628479719162, + "learning_rate": 0.00010220081911877222, + "loss": 0.2947, + "step": 26668 + }, + { + "epoch": 2.1604828256642903, + "grad_norm": 0.05518188700079918, + "learning_rate": 0.0001021963184661776, + "loss": 0.2594, + "step": 26669 + }, + { + "epoch": 2.1605638366817885, + "grad_norm": 0.07188651710748672, + "learning_rate": 0.00010219181781358298, + "loss": 0.2536, + "step": 26670 + }, + { + "epoch": 2.160644847699287, + "grad_norm": 0.06636640429496765, + "learning_rate": 0.00010218731716098835, + "loss": 0.2418, + "step": 26671 + }, + { + "epoch": 2.1607258587167855, + "grad_norm": 0.07105717062950134, + "learning_rate": 0.00010218281650839373, + "loss": 0.2735, + "step": 26672 + }, + { + "epoch": 2.1608068697342837, + "grad_norm": 0.061248186975717545, + "learning_rate": 0.00010217831585579909, + "loss": 0.267, + "step": 26673 + }, + { + "epoch": 2.1608878807517824, + "grad_norm": 0.06057823449373245, + "learning_rate": 0.00010217381520320446, + "loss": 0.2574, + "step": 26674 + }, + { + "epoch": 2.1609688917692806, + "grad_norm": 0.056544676423072815, + "learning_rate": 0.00010216931455060985, + "loss": 0.2459, + "step": 26675 + }, + { + "epoch": 2.161049902786779, + "grad_norm": 0.06037278473377228, + "learning_rate": 0.00010216481389801522, + "loss": 0.259, + "step": 26676 + }, + { + "epoch": 2.1611309138042776, + "grad_norm": 0.07936231791973114, + "learning_rate": 0.0001021603132454206, + "loss": 0.2663, + "step": 26677 + }, + { + "epoch": 2.161211924821776, + "grad_norm": 0.060614656656980515, + "learning_rate": 0.00010215581259282597, + "loss": 0.2603, + "step": 26678 + }, + { + "epoch": 2.161292935839274, + "grad_norm": 0.0534573532640934, + "learning_rate": 0.00010215131194023133, + "loss": 0.2638, + "step": 26679 + }, + { + "epoch": 2.1613739468567728, + "grad_norm": 0.060797303915023804, + "learning_rate": 0.0001021468112876367, + "loss": 0.2402, + "step": 26680 + }, + { + "epoch": 2.161454957874271, + "grad_norm": 0.06404197216033936, + "learning_rate": 0.00010214231063504209, + "loss": 0.2485, + "step": 26681 + }, + { + "epoch": 2.1615359688917692, + "grad_norm": 0.05765843018889427, + "learning_rate": 0.00010213780998244746, + "loss": 0.2736, + "step": 26682 + }, + { + "epoch": 2.1616169799092675, + "grad_norm": 0.05763285979628563, + "learning_rate": 0.00010213330932985284, + "loss": 0.2434, + "step": 26683 + }, + { + "epoch": 2.161697990926766, + "grad_norm": 0.05568533390760422, + "learning_rate": 0.00010212880867725821, + "loss": 0.2232, + "step": 26684 + }, + { + "epoch": 2.1617790019442644, + "grad_norm": 0.06619726121425629, + "learning_rate": 0.00010212430802466357, + "loss": 0.2955, + "step": 26685 + }, + { + "epoch": 2.1618600129617627, + "grad_norm": 0.07268057763576508, + "learning_rate": 0.00010211980737206897, + "loss": 0.2706, + "step": 26686 + }, + { + "epoch": 2.1619410239792614, + "grad_norm": 0.06154467910528183, + "learning_rate": 0.00010211530671947433, + "loss": 0.24, + "step": 26687 + }, + { + "epoch": 
2.1620220349967596, + "grad_norm": 0.0544455386698246, + "learning_rate": 0.0001021108060668797, + "loss": 0.2601, + "step": 26688 + }, + { + "epoch": 2.162103046014258, + "grad_norm": 0.06915867328643799, + "learning_rate": 0.00010210630541428508, + "loss": 0.2544, + "step": 26689 + }, + { + "epoch": 2.162184057031756, + "grad_norm": 0.0737709030508995, + "learning_rate": 0.00010210180476169045, + "loss": 0.2471, + "step": 26690 + }, + { + "epoch": 2.162265068049255, + "grad_norm": 0.05942685529589653, + "learning_rate": 0.00010209730410909581, + "loss": 0.2324, + "step": 26691 + }, + { + "epoch": 2.162346079066753, + "grad_norm": 0.05648018419742584, + "learning_rate": 0.00010209280345650121, + "loss": 0.2247, + "step": 26692 + }, + { + "epoch": 2.1624270900842513, + "grad_norm": 0.0682801902294159, + "learning_rate": 0.00010208830280390657, + "loss": 0.2884, + "step": 26693 + }, + { + "epoch": 2.16250810110175, + "grad_norm": 0.07194321602582932, + "learning_rate": 0.00010208380215131195, + "loss": 0.2652, + "step": 26694 + }, + { + "epoch": 2.162589112119248, + "grad_norm": 0.08287494629621506, + "learning_rate": 0.00010207930149871732, + "loss": 0.2525, + "step": 26695 + }, + { + "epoch": 2.1626701231367464, + "grad_norm": 0.07747865468263626, + "learning_rate": 0.00010207480084612269, + "loss": 0.271, + "step": 26696 + }, + { + "epoch": 2.162751134154245, + "grad_norm": 0.07304595410823822, + "learning_rate": 0.00010207030019352805, + "loss": 0.3076, + "step": 26697 + }, + { + "epoch": 2.1628321451717434, + "grad_norm": 0.07419362664222717, + "learning_rate": 0.00010206579954093345, + "loss": 0.2364, + "step": 26698 + }, + { + "epoch": 2.1629131561892416, + "grad_norm": 0.07199463248252869, + "learning_rate": 0.00010206129888833881, + "loss": 0.2931, + "step": 26699 + }, + { + "epoch": 2.1629941672067403, + "grad_norm": 0.05875088647007942, + "learning_rate": 0.00010205679823574419, + "loss": 0.2558, + "step": 26700 + }, + { + "epoch": 2.1630751782242386, + "grad_norm": 0.057817574590444565, + "learning_rate": 0.00010205229758314956, + "loss": 0.2728, + "step": 26701 + }, + { + "epoch": 2.163156189241737, + "grad_norm": 0.060741499066352844, + "learning_rate": 0.00010204779693055493, + "loss": 0.2873, + "step": 26702 + }, + { + "epoch": 2.163237200259235, + "grad_norm": 0.0565064400434494, + "learning_rate": 0.0001020432962779603, + "loss": 0.2225, + "step": 26703 + }, + { + "epoch": 2.1633182112767337, + "grad_norm": 0.06156710162758827, + "learning_rate": 0.0001020387956253657, + "loss": 0.2445, + "step": 26704 + }, + { + "epoch": 2.163399222294232, + "grad_norm": 0.05642209202051163, + "learning_rate": 0.00010203429497277105, + "loss": 0.2315, + "step": 26705 + }, + { + "epoch": 2.1634802333117302, + "grad_norm": 0.06932426989078522, + "learning_rate": 0.00010202979432017643, + "loss": 0.2718, + "step": 26706 + }, + { + "epoch": 2.163561244329229, + "grad_norm": 0.061338771134614944, + "learning_rate": 0.0001020252936675818, + "loss": 0.2471, + "step": 26707 + }, + { + "epoch": 2.163642255346727, + "grad_norm": 0.053529005497694016, + "learning_rate": 0.00010202079301498718, + "loss": 0.2258, + "step": 26708 + }, + { + "epoch": 2.1637232663642254, + "grad_norm": 0.062487855553627014, + "learning_rate": 0.00010201629236239256, + "loss": 0.2294, + "step": 26709 + }, + { + "epoch": 2.163804277381724, + "grad_norm": 0.0688663199543953, + "learning_rate": 0.00010201179170979794, + "loss": 0.2623, + "step": 26710 + }, + { + "epoch": 2.1638852883992223, + "grad_norm": 
0.05936722457408905, + "learning_rate": 0.0001020072910572033, + "loss": 0.2663, + "step": 26711 + }, + { + "epoch": 2.1639662994167206, + "grad_norm": 0.056306082755327225, + "learning_rate": 0.00010200279040460867, + "loss": 0.2768, + "step": 26712 + }, + { + "epoch": 2.164047310434219, + "grad_norm": 0.07249502837657928, + "learning_rate": 0.00010199828975201404, + "loss": 0.2538, + "step": 26713 + }, + { + "epoch": 2.1641283214517175, + "grad_norm": 0.06649959087371826, + "learning_rate": 0.00010199378909941942, + "loss": 0.2547, + "step": 26714 + }, + { + "epoch": 2.1642093324692158, + "grad_norm": 0.06089214235544205, + "learning_rate": 0.0001019892884468248, + "loss": 0.2736, + "step": 26715 + }, + { + "epoch": 2.164290343486714, + "grad_norm": 0.06362010538578033, + "learning_rate": 0.00010198478779423018, + "loss": 0.2643, + "step": 26716 + }, + { + "epoch": 2.1643713545042127, + "grad_norm": 0.061047982424497604, + "learning_rate": 0.00010198028714163554, + "loss": 0.2762, + "step": 26717 + }, + { + "epoch": 2.164452365521711, + "grad_norm": 0.06476712971925735, + "learning_rate": 0.00010197578648904091, + "loss": 0.2731, + "step": 26718 + }, + { + "epoch": 2.164533376539209, + "grad_norm": 0.051583848893642426, + "learning_rate": 0.00010197128583644629, + "loss": 0.2737, + "step": 26719 + }, + { + "epoch": 2.164614387556708, + "grad_norm": 0.062449660152196884, + "learning_rate": 0.00010196678518385166, + "loss": 0.2785, + "step": 26720 + }, + { + "epoch": 2.164695398574206, + "grad_norm": 0.06525503098964691, + "learning_rate": 0.00010196228453125705, + "loss": 0.2442, + "step": 26721 + }, + { + "epoch": 2.1647764095917044, + "grad_norm": 0.0679815486073494, + "learning_rate": 0.00010195778387866242, + "loss": 0.2464, + "step": 26722 + }, + { + "epoch": 2.164857420609203, + "grad_norm": 0.06120334565639496, + "learning_rate": 0.00010195328322606778, + "loss": 0.2351, + "step": 26723 + }, + { + "epoch": 2.1649384316267013, + "grad_norm": 0.057069793343544006, + "learning_rate": 0.00010194878257347315, + "loss": 0.2769, + "step": 26724 + }, + { + "epoch": 2.1650194426441995, + "grad_norm": 0.05186355486512184, + "learning_rate": 0.00010194428192087853, + "loss": 0.255, + "step": 26725 + }, + { + "epoch": 2.165100453661698, + "grad_norm": 0.05202582851052284, + "learning_rate": 0.0001019397812682839, + "loss": 0.2577, + "step": 26726 + }, + { + "epoch": 2.1651814646791965, + "grad_norm": 0.05064481124281883, + "learning_rate": 0.00010193528061568929, + "loss": 0.2548, + "step": 26727 + }, + { + "epoch": 2.1652624756966947, + "grad_norm": 0.06526156514883041, + "learning_rate": 0.00010193077996309466, + "loss": 0.276, + "step": 26728 + }, + { + "epoch": 2.165343486714193, + "grad_norm": 0.049982719123363495, + "learning_rate": 0.00010192627931050002, + "loss": 0.2681, + "step": 26729 + }, + { + "epoch": 2.1654244977316917, + "grad_norm": 0.050517067313194275, + "learning_rate": 0.0001019217786579054, + "loss": 0.2587, + "step": 26730 + }, + { + "epoch": 2.16550550874919, + "grad_norm": 0.0650363489985466, + "learning_rate": 0.00010191727800531077, + "loss": 0.2565, + "step": 26731 + }, + { + "epoch": 2.165586519766688, + "grad_norm": 0.060763679444789886, + "learning_rate": 0.00010191277735271614, + "loss": 0.2328, + "step": 26732 + }, + { + "epoch": 2.165667530784187, + "grad_norm": 0.0616152361035347, + "learning_rate": 0.00010190827670012153, + "loss": 0.2042, + "step": 26733 + }, + { + "epoch": 2.165748541801685, + "grad_norm": 0.07217466086149216, + "learning_rate": 
0.0001019037760475269, + "loss": 0.2598, + "step": 26734 + }, + { + "epoch": 2.1658295528191833, + "grad_norm": 0.07178352773189545, + "learning_rate": 0.00010189927539493228, + "loss": 0.2348, + "step": 26735 + }, + { + "epoch": 2.1659105638366816, + "grad_norm": 0.07062013447284698, + "learning_rate": 0.00010189477474233764, + "loss": 0.2333, + "step": 26736 + }, + { + "epoch": 2.1659915748541803, + "grad_norm": 0.07211899012327194, + "learning_rate": 0.00010189027408974301, + "loss": 0.2592, + "step": 26737 + }, + { + "epoch": 2.1660725858716785, + "grad_norm": 0.07860163599252701, + "learning_rate": 0.0001018857734371484, + "loss": 0.2504, + "step": 26738 + }, + { + "epoch": 2.1661535968891767, + "grad_norm": 0.0584687739610672, + "learning_rate": 0.00010188127278455377, + "loss": 0.2546, + "step": 26739 + }, + { + "epoch": 2.1662346079066754, + "grad_norm": 0.058649592101573944, + "learning_rate": 0.00010187677213195914, + "loss": 0.2068, + "step": 26740 + }, + { + "epoch": 2.1663156189241737, + "grad_norm": 0.07066672295331955, + "learning_rate": 0.00010187227147936452, + "loss": 0.2642, + "step": 26741 + }, + { + "epoch": 2.166396629941672, + "grad_norm": 0.06761866807937622, + "learning_rate": 0.00010186777082676988, + "loss": 0.2859, + "step": 26742 + }, + { + "epoch": 2.1664776409591706, + "grad_norm": 0.06311914324760437, + "learning_rate": 0.00010186327017417525, + "loss": 0.2429, + "step": 26743 + }, + { + "epoch": 2.166558651976669, + "grad_norm": 0.04951076582074165, + "learning_rate": 0.00010185876952158064, + "loss": 0.2263, + "step": 26744 + }, + { + "epoch": 2.166639662994167, + "grad_norm": 0.0637192651629448, + "learning_rate": 0.00010185426886898601, + "loss": 0.2625, + "step": 26745 + }, + { + "epoch": 2.166720674011666, + "grad_norm": 0.06900296360254288, + "learning_rate": 0.00010184976821639139, + "loss": 0.2377, + "step": 26746 + }, + { + "epoch": 2.166801685029164, + "grad_norm": 0.07371290773153305, + "learning_rate": 0.00010184526756379676, + "loss": 0.246, + "step": 26747 + }, + { + "epoch": 2.1668826960466623, + "grad_norm": 0.062216177582740784, + "learning_rate": 0.00010184076691120212, + "loss": 0.2369, + "step": 26748 + }, + { + "epoch": 2.1669637070641605, + "grad_norm": 0.06387155503034592, + "learning_rate": 0.0001018362662586075, + "loss": 0.2689, + "step": 26749 + }, + { + "epoch": 2.167044718081659, + "grad_norm": 0.058942124247550964, + "learning_rate": 0.00010183176560601288, + "loss": 0.2371, + "step": 26750 + }, + { + "epoch": 2.1671257290991575, + "grad_norm": 0.06074969097971916, + "learning_rate": 0.00010182726495341825, + "loss": 0.23, + "step": 26751 + }, + { + "epoch": 2.1672067401166557, + "grad_norm": 0.07480621337890625, + "learning_rate": 0.00010182276430082363, + "loss": 0.287, + "step": 26752 + }, + { + "epoch": 2.1672877511341544, + "grad_norm": 0.08909303694963455, + "learning_rate": 0.000101818263648229, + "loss": 0.2427, + "step": 26753 + }, + { + "epoch": 2.1673687621516526, + "grad_norm": 0.05299947410821915, + "learning_rate": 0.00010181376299563436, + "loss": 0.2399, + "step": 26754 + }, + { + "epoch": 2.167449773169151, + "grad_norm": 0.06072693318128586, + "learning_rate": 0.00010180926234303973, + "loss": 0.2634, + "step": 26755 + }, + { + "epoch": 2.1675307841866496, + "grad_norm": 0.0829189270734787, + "learning_rate": 0.00010180476169044512, + "loss": 0.3127, + "step": 26756 + }, + { + "epoch": 2.167611795204148, + "grad_norm": 0.06256073713302612, + "learning_rate": 0.0001018002610378505, + "loss": 0.2423, + "step": 
26757 + }, + { + "epoch": 2.167692806221646, + "grad_norm": 0.06656386703252792, + "learning_rate": 0.00010179576038525587, + "loss": 0.2767, + "step": 26758 + }, + { + "epoch": 2.1677738172391443, + "grad_norm": 0.0756622776389122, + "learning_rate": 0.00010179125973266124, + "loss": 0.2534, + "step": 26759 + }, + { + "epoch": 2.167854828256643, + "grad_norm": 0.06560155004262924, + "learning_rate": 0.0001017867590800666, + "loss": 0.2627, + "step": 26760 + }, + { + "epoch": 2.1679358392741412, + "grad_norm": 0.07325062900781631, + "learning_rate": 0.000101782258427472, + "loss": 0.2672, + "step": 26761 + }, + { + "epoch": 2.1680168502916395, + "grad_norm": 0.08070877194404602, + "learning_rate": 0.00010177775777487736, + "loss": 0.2794, + "step": 26762 + }, + { + "epoch": 2.168097861309138, + "grad_norm": 0.06608166545629501, + "learning_rate": 0.00010177325712228274, + "loss": 0.2571, + "step": 26763 + }, + { + "epoch": 2.1681788723266364, + "grad_norm": 0.05721350386738777, + "learning_rate": 0.00010176875646968811, + "loss": 0.2518, + "step": 26764 + }, + { + "epoch": 2.1682598833441347, + "grad_norm": 0.08121379464864731, + "learning_rate": 0.00010176425581709348, + "loss": 0.3085, + "step": 26765 + }, + { + "epoch": 2.1683408943616334, + "grad_norm": 0.06703774631023407, + "learning_rate": 0.00010175975516449884, + "loss": 0.2756, + "step": 26766 + }, + { + "epoch": 2.1684219053791316, + "grad_norm": 0.07273641973733902, + "learning_rate": 0.00010175525451190425, + "loss": 0.2832, + "step": 26767 + }, + { + "epoch": 2.16850291639663, + "grad_norm": 0.05412622541189194, + "learning_rate": 0.0001017507538593096, + "loss": 0.2947, + "step": 26768 + }, + { + "epoch": 2.1685839274141285, + "grad_norm": 0.06769771873950958, + "learning_rate": 0.00010174625320671498, + "loss": 0.247, + "step": 26769 + }, + { + "epoch": 2.1686649384316268, + "grad_norm": 0.06173847243189812, + "learning_rate": 0.00010174175255412035, + "loss": 0.2371, + "step": 26770 + }, + { + "epoch": 2.168745949449125, + "grad_norm": 0.048327669501304626, + "learning_rate": 0.00010173725190152573, + "loss": 0.1902, + "step": 26771 + }, + { + "epoch": 2.1688269604666233, + "grad_norm": 0.071600541472435, + "learning_rate": 0.00010173275124893109, + "loss": 0.2731, + "step": 26772 + }, + { + "epoch": 2.168907971484122, + "grad_norm": 0.05988006293773651, + "learning_rate": 0.00010172825059633649, + "loss": 0.2513, + "step": 26773 + }, + { + "epoch": 2.16898898250162, + "grad_norm": 0.07233826816082001, + "learning_rate": 0.00010172374994374185, + "loss": 0.2387, + "step": 26774 + }, + { + "epoch": 2.1690699935191184, + "grad_norm": 0.05992598831653595, + "learning_rate": 0.00010171924929114722, + "loss": 0.2615, + "step": 26775 + }, + { + "epoch": 2.169151004536617, + "grad_norm": 0.0667218416929245, + "learning_rate": 0.0001017147486385526, + "loss": 0.2468, + "step": 26776 + }, + { + "epoch": 2.1692320155541154, + "grad_norm": 0.06006920710206032, + "learning_rate": 0.00010171024798595797, + "loss": 0.2693, + "step": 26777 + }, + { + "epoch": 2.1693130265716136, + "grad_norm": 0.06924033164978027, + "learning_rate": 0.00010170574733336333, + "loss": 0.2469, + "step": 26778 + }, + { + "epoch": 2.1693940375891123, + "grad_norm": 0.05584564805030823, + "learning_rate": 0.00010170124668076873, + "loss": 0.2532, + "step": 26779 + }, + { + "epoch": 2.1694750486066106, + "grad_norm": 0.05546014755964279, + "learning_rate": 0.00010169674602817409, + "loss": 0.2259, + "step": 26780 + }, + { + "epoch": 2.169556059624109, + 
"grad_norm": 0.058247484266757965, + "learning_rate": 0.00010169224537557946, + "loss": 0.2787, + "step": 26781 + }, + { + "epoch": 2.169637070641607, + "grad_norm": 0.055568281561136246, + "learning_rate": 0.00010168774472298484, + "loss": 0.2211, + "step": 26782 + }, + { + "epoch": 2.1697180816591057, + "grad_norm": 0.06250690668821335, + "learning_rate": 0.00010168324407039021, + "loss": 0.2913, + "step": 26783 + }, + { + "epoch": 2.169799092676604, + "grad_norm": 0.05972811579704285, + "learning_rate": 0.00010167874341779557, + "loss": 0.263, + "step": 26784 + }, + { + "epoch": 2.1698801036941022, + "grad_norm": 0.06468886882066727, + "learning_rate": 0.00010167424276520097, + "loss": 0.2459, + "step": 26785 + }, + { + "epoch": 2.169961114711601, + "grad_norm": 0.06843791902065277, + "learning_rate": 0.00010166974211260633, + "loss": 0.2314, + "step": 26786 + }, + { + "epoch": 2.170042125729099, + "grad_norm": 0.06479120254516602, + "learning_rate": 0.0001016652414600117, + "loss": 0.2212, + "step": 26787 + }, + { + "epoch": 2.1701231367465974, + "grad_norm": 0.057134464383125305, + "learning_rate": 0.00010166074080741708, + "loss": 0.2177, + "step": 26788 + }, + { + "epoch": 2.170204147764096, + "grad_norm": 0.06482430547475815, + "learning_rate": 0.00010165624015482245, + "loss": 0.2535, + "step": 26789 + }, + { + "epoch": 2.1702851587815943, + "grad_norm": 0.06957831978797913, + "learning_rate": 0.00010165173950222784, + "loss": 0.2535, + "step": 26790 + }, + { + "epoch": 2.1703661697990926, + "grad_norm": 0.09038234502077103, + "learning_rate": 0.00010164723884963321, + "loss": 0.2401, + "step": 26791 + }, + { + "epoch": 2.1704471808165913, + "grad_norm": 0.07424237579107285, + "learning_rate": 0.00010164273819703857, + "loss": 0.2865, + "step": 26792 + }, + { + "epoch": 2.1705281918340895, + "grad_norm": 0.06953853368759155, + "learning_rate": 0.00010163823754444395, + "loss": 0.271, + "step": 26793 + }, + { + "epoch": 2.1706092028515878, + "grad_norm": 0.06888557970523834, + "learning_rate": 0.00010163373689184932, + "loss": 0.2437, + "step": 26794 + }, + { + "epoch": 2.170690213869086, + "grad_norm": 0.05482298508286476, + "learning_rate": 0.00010162923623925469, + "loss": 0.2175, + "step": 26795 + }, + { + "epoch": 2.1707712248865847, + "grad_norm": 0.06336444616317749, + "learning_rate": 0.00010162473558666008, + "loss": 0.23, + "step": 26796 + }, + { + "epoch": 2.170852235904083, + "grad_norm": 0.07812932878732681, + "learning_rate": 0.00010162023493406545, + "loss": 0.2428, + "step": 26797 + }, + { + "epoch": 2.170933246921581, + "grad_norm": 0.07318337261676788, + "learning_rate": 0.00010161573428147081, + "loss": 0.2747, + "step": 26798 + }, + { + "epoch": 2.17101425793908, + "grad_norm": 0.07459474354982376, + "learning_rate": 0.00010161123362887619, + "loss": 0.2651, + "step": 26799 + }, + { + "epoch": 2.171095268956578, + "grad_norm": 0.06458953768014908, + "learning_rate": 0.00010160673297628156, + "loss": 0.3091, + "step": 26800 + }, + { + "epoch": 2.1711762799740764, + "grad_norm": 0.06517677754163742, + "learning_rate": 0.00010160223232368693, + "loss": 0.273, + "step": 26801 + }, + { + "epoch": 2.171257290991575, + "grad_norm": 0.06921321898698807, + "learning_rate": 0.00010159773167109232, + "loss": 0.2543, + "step": 26802 + }, + { + "epoch": 2.1713383020090733, + "grad_norm": 0.06701141595840454, + "learning_rate": 0.0001015932310184977, + "loss": 0.2652, + "step": 26803 + }, + { + "epoch": 2.1714193130265715, + "grad_norm": 0.058195166289806366, + 
"learning_rate": 0.00010158873036590307, + "loss": 0.2553, + "step": 26804 + }, + { + "epoch": 2.17150032404407, + "grad_norm": 0.0681111291050911, + "learning_rate": 0.00010158422971330843, + "loss": 0.238, + "step": 26805 + }, + { + "epoch": 2.1715813350615685, + "grad_norm": 0.07244133204221725, + "learning_rate": 0.0001015797290607138, + "loss": 0.243, + "step": 26806 + }, + { + "epoch": 2.1716623460790667, + "grad_norm": 0.05482422932982445, + "learning_rate": 0.00010157522840811918, + "loss": 0.2243, + "step": 26807 + }, + { + "epoch": 2.171743357096565, + "grad_norm": 0.057814229279756546, + "learning_rate": 0.00010157072775552456, + "loss": 0.2618, + "step": 26808 + }, + { + "epoch": 2.1718243681140637, + "grad_norm": 0.05317820981144905, + "learning_rate": 0.00010156622710292994, + "loss": 0.2305, + "step": 26809 + }, + { + "epoch": 2.171905379131562, + "grad_norm": 0.07410687953233719, + "learning_rate": 0.00010156172645033531, + "loss": 0.2517, + "step": 26810 + }, + { + "epoch": 2.17198639014906, + "grad_norm": 0.07304941117763519, + "learning_rate": 0.00010155722579774067, + "loss": 0.3064, + "step": 26811 + }, + { + "epoch": 2.172067401166559, + "grad_norm": 0.05160785838961601, + "learning_rate": 0.00010155272514514604, + "loss": 0.258, + "step": 26812 + }, + { + "epoch": 2.172148412184057, + "grad_norm": 0.06912660598754883, + "learning_rate": 0.00010154822449255143, + "loss": 0.3077, + "step": 26813 + }, + { + "epoch": 2.1722294232015553, + "grad_norm": 0.057028722018003464, + "learning_rate": 0.0001015437238399568, + "loss": 0.2669, + "step": 26814 + }, + { + "epoch": 2.172310434219054, + "grad_norm": 0.06860620528459549, + "learning_rate": 0.00010153922318736218, + "loss": 0.2796, + "step": 26815 + }, + { + "epoch": 2.1723914452365523, + "grad_norm": 0.06248742341995239, + "learning_rate": 0.00010153472253476755, + "loss": 0.2643, + "step": 26816 + }, + { + "epoch": 2.1724724562540505, + "grad_norm": 0.06286834180355072, + "learning_rate": 0.00010153022188217291, + "loss": 0.2312, + "step": 26817 + }, + { + "epoch": 2.1725534672715487, + "grad_norm": 0.052289243787527084, + "learning_rate": 0.00010152572122957829, + "loss": 0.2168, + "step": 26818 + }, + { + "epoch": 2.1726344782890474, + "grad_norm": 0.06501159071922302, + "learning_rate": 0.00010152122057698367, + "loss": 0.2233, + "step": 26819 + }, + { + "epoch": 2.1727154893065457, + "grad_norm": 0.06584373116493225, + "learning_rate": 0.00010151671992438905, + "loss": 0.302, + "step": 26820 + }, + { + "epoch": 2.172796500324044, + "grad_norm": 0.06395610421895981, + "learning_rate": 0.00010151221927179442, + "loss": 0.2661, + "step": 26821 + }, + { + "epoch": 2.1728775113415426, + "grad_norm": 0.06655091792345047, + "learning_rate": 0.0001015077186191998, + "loss": 0.2748, + "step": 26822 + }, + { + "epoch": 2.172958522359041, + "grad_norm": 0.05467076599597931, + "learning_rate": 0.00010150321796660515, + "loss": 0.2563, + "step": 26823 + }, + { + "epoch": 2.173039533376539, + "grad_norm": 0.053810540586709976, + "learning_rate": 0.00010149871731401053, + "loss": 0.2178, + "step": 26824 + }, + { + "epoch": 2.173120544394038, + "grad_norm": 0.06725815683603287, + "learning_rate": 0.00010149421666141591, + "loss": 0.2688, + "step": 26825 + }, + { + "epoch": 2.173201555411536, + "grad_norm": 0.0681646317243576, + "learning_rate": 0.00010148971600882129, + "loss": 0.2879, + "step": 26826 + }, + { + "epoch": 2.1732825664290343, + "grad_norm": 0.0565604604780674, + "learning_rate": 0.00010148521535622666, + "loss": 
0.2636, + "step": 26827 + }, + { + "epoch": 2.1733635774465325, + "grad_norm": 0.062234435230493546, + "learning_rate": 0.00010148071470363204, + "loss": 0.2619, + "step": 26828 + }, + { + "epoch": 2.173444588464031, + "grad_norm": 0.055433180183172226, + "learning_rate": 0.0001014762140510374, + "loss": 0.2548, + "step": 26829 + }, + { + "epoch": 2.1735255994815295, + "grad_norm": 0.06828644126653671, + "learning_rate": 0.00010147171339844277, + "loss": 0.2876, + "step": 26830 + }, + { + "epoch": 2.1736066104990277, + "grad_norm": 0.05612456053495407, + "learning_rate": 0.00010146721274584816, + "loss": 0.2097, + "step": 26831 + }, + { + "epoch": 2.1736876215165264, + "grad_norm": 0.07091177254915237, + "learning_rate": 0.00010146271209325353, + "loss": 0.2632, + "step": 26832 + }, + { + "epoch": 2.1737686325340246, + "grad_norm": 0.06432201713323593, + "learning_rate": 0.0001014582114406589, + "loss": 0.2493, + "step": 26833 + }, + { + "epoch": 2.173849643551523, + "grad_norm": 0.08751298487186432, + "learning_rate": 0.00010145371078806428, + "loss": 0.2594, + "step": 26834 + }, + { + "epoch": 2.1739306545690216, + "grad_norm": 0.06486582010984421, + "learning_rate": 0.00010144921013546964, + "loss": 0.2603, + "step": 26835 + }, + { + "epoch": 2.17401166558652, + "grad_norm": 0.07561185956001282, + "learning_rate": 0.00010144470948287501, + "loss": 0.2503, + "step": 26836 + }, + { + "epoch": 2.174092676604018, + "grad_norm": 0.05659014731645584, + "learning_rate": 0.0001014402088302804, + "loss": 0.2091, + "step": 26837 + }, + { + "epoch": 2.1741736876215167, + "grad_norm": 0.049686819314956665, + "learning_rate": 0.00010143570817768577, + "loss": 0.2458, + "step": 26838 + }, + { + "epoch": 2.174254698639015, + "grad_norm": 0.06719785183668137, + "learning_rate": 0.00010143120752509114, + "loss": 0.2321, + "step": 26839 + }, + { + "epoch": 2.1743357096565132, + "grad_norm": 0.06648898869752884, + "learning_rate": 0.00010142670687249652, + "loss": 0.2551, + "step": 26840 + }, + { + "epoch": 2.1744167206740115, + "grad_norm": 0.07859358936548233, + "learning_rate": 0.00010142220621990188, + "loss": 0.2387, + "step": 26841 + }, + { + "epoch": 2.17449773169151, + "grad_norm": 0.06366816908121109, + "learning_rate": 0.00010141770556730728, + "loss": 0.2409, + "step": 26842 + }, + { + "epoch": 2.1745787427090084, + "grad_norm": 0.06784055382013321, + "learning_rate": 0.00010141320491471264, + "loss": 0.247, + "step": 26843 + }, + { + "epoch": 2.1746597537265067, + "grad_norm": 0.08164727687835693, + "learning_rate": 0.00010140870426211801, + "loss": 0.2682, + "step": 26844 + }, + { + "epoch": 2.1747407647440054, + "grad_norm": 0.06338001042604446, + "learning_rate": 0.00010140420360952339, + "loss": 0.276, + "step": 26845 + }, + { + "epoch": 2.1748217757615036, + "grad_norm": 0.061157673597335815, + "learning_rate": 0.00010139970295692876, + "loss": 0.2441, + "step": 26846 + }, + { + "epoch": 2.174902786779002, + "grad_norm": 0.06020011752843857, + "learning_rate": 0.00010139520230433412, + "loss": 0.2292, + "step": 26847 + }, + { + "epoch": 2.1749837977965005, + "grad_norm": 0.060947537422180176, + "learning_rate": 0.00010139070165173952, + "loss": 0.2535, + "step": 26848 + }, + { + "epoch": 2.1750648088139988, + "grad_norm": 0.05085349455475807, + "learning_rate": 0.00010138620099914488, + "loss": 0.2459, + "step": 26849 + }, + { + "epoch": 2.175145819831497, + "grad_norm": 0.07931619882583618, + "learning_rate": 0.00010138170034655025, + "loss": 0.2339, + "step": 26850 + }, + { + "epoch": 
2.1752268308489953, + "grad_norm": 0.05213142931461334, + "learning_rate": 0.00010137719969395563, + "loss": 0.2326, + "step": 26851 + }, + { + "epoch": 2.175307841866494, + "grad_norm": 0.07295794039964676, + "learning_rate": 0.000101372699041361, + "loss": 0.2556, + "step": 26852 + }, + { + "epoch": 2.175388852883992, + "grad_norm": 0.06054900586605072, + "learning_rate": 0.00010136819838876636, + "loss": 0.2651, + "step": 26853 + }, + { + "epoch": 2.1754698639014904, + "grad_norm": 0.08561225980520248, + "learning_rate": 0.00010136369773617176, + "loss": 0.2647, + "step": 26854 + }, + { + "epoch": 2.175550874918989, + "grad_norm": 0.05409996584057808, + "learning_rate": 0.00010135919708357712, + "loss": 0.2526, + "step": 26855 + }, + { + "epoch": 2.1756318859364874, + "grad_norm": 0.058983881026506424, + "learning_rate": 0.0001013546964309825, + "loss": 0.2587, + "step": 26856 + }, + { + "epoch": 2.1757128969539856, + "grad_norm": 0.0726015567779541, + "learning_rate": 0.00010135019577838787, + "loss": 0.2552, + "step": 26857 + }, + { + "epoch": 2.1757939079714843, + "grad_norm": 0.06838826835155487, + "learning_rate": 0.00010134569512579324, + "loss": 0.2544, + "step": 26858 + }, + { + "epoch": 2.1758749189889826, + "grad_norm": 0.07187411189079285, + "learning_rate": 0.0001013411944731986, + "loss": 0.2318, + "step": 26859 + }, + { + "epoch": 2.175955930006481, + "grad_norm": 0.06443783640861511, + "learning_rate": 0.000101336693820604, + "loss": 0.2641, + "step": 26860 + }, + { + "epoch": 2.1760369410239795, + "grad_norm": 0.07598260790109634, + "learning_rate": 0.00010133219316800936, + "loss": 0.2823, + "step": 26861 + }, + { + "epoch": 2.1761179520414777, + "grad_norm": 0.05119853839278221, + "learning_rate": 0.00010132769251541474, + "loss": 0.2304, + "step": 26862 + }, + { + "epoch": 2.176198963058976, + "grad_norm": 0.07492173463106155, + "learning_rate": 0.00010132319186282011, + "loss": 0.2622, + "step": 26863 + }, + { + "epoch": 2.176279974076474, + "grad_norm": 0.05547771230340004, + "learning_rate": 0.00010131869121022548, + "loss": 0.2158, + "step": 26864 + }, + { + "epoch": 2.176360985093973, + "grad_norm": 0.06932628899812698, + "learning_rate": 0.00010131419055763084, + "loss": 0.2798, + "step": 26865 + }, + { + "epoch": 2.176441996111471, + "grad_norm": 0.06388963013887405, + "learning_rate": 0.00010130968990503625, + "loss": 0.2632, + "step": 26866 + }, + { + "epoch": 2.1765230071289694, + "grad_norm": 0.05636812746524811, + "learning_rate": 0.00010130518925244162, + "loss": 0.2714, + "step": 26867 + }, + { + "epoch": 2.176604018146468, + "grad_norm": 0.06506742537021637, + "learning_rate": 0.00010130068859984698, + "loss": 0.2738, + "step": 26868 + }, + { + "epoch": 2.1766850291639663, + "grad_norm": 0.060381386429071426, + "learning_rate": 0.00010129618794725235, + "loss": 0.2743, + "step": 26869 + }, + { + "epoch": 2.1767660401814646, + "grad_norm": 0.05970916897058487, + "learning_rate": 0.00010129168729465773, + "loss": 0.2559, + "step": 26870 + }, + { + "epoch": 2.176847051198963, + "grad_norm": 0.06593108922243118, + "learning_rate": 0.00010128718664206311, + "loss": 0.2611, + "step": 26871 + }, + { + "epoch": 2.1769280622164615, + "grad_norm": 0.05910542234778404, + "learning_rate": 0.00010128268598946849, + "loss": 0.2718, + "step": 26872 + }, + { + "epoch": 2.1770090732339598, + "grad_norm": 0.06308293342590332, + "learning_rate": 0.00010127818533687386, + "loss": 0.2707, + "step": 26873 + }, + { + "epoch": 2.177090084251458, + "grad_norm": 
0.06285148113965988, + "learning_rate": 0.00010127368468427922, + "loss": 0.2661, + "step": 26874 + }, + { + "epoch": 2.1771710952689567, + "grad_norm": 0.059369564056396484, + "learning_rate": 0.0001012691840316846, + "loss": 0.222, + "step": 26875 + }, + { + "epoch": 2.177252106286455, + "grad_norm": 0.05995746701955795, + "learning_rate": 0.00010126468337908997, + "loss": 0.3101, + "step": 26876 + }, + { + "epoch": 2.177333117303953, + "grad_norm": 0.08611472696065903, + "learning_rate": 0.00010126018272649536, + "loss": 0.276, + "step": 26877 + }, + { + "epoch": 2.177414128321452, + "grad_norm": 0.06998145580291748, + "learning_rate": 0.00010125568207390073, + "loss": 0.2923, + "step": 26878 + }, + { + "epoch": 2.17749513933895, + "grad_norm": 0.05503055825829506, + "learning_rate": 0.0001012511814213061, + "loss": 0.2521, + "step": 26879 + }, + { + "epoch": 2.1775761503564484, + "grad_norm": 0.06742962449789047, + "learning_rate": 0.00010124668076871146, + "loss": 0.292, + "step": 26880 + }, + { + "epoch": 2.177657161373947, + "grad_norm": 0.061561357229948044, + "learning_rate": 0.00010124218011611684, + "loss": 0.2354, + "step": 26881 + }, + { + "epoch": 2.1777381723914453, + "grad_norm": 0.05672166496515274, + "learning_rate": 0.00010123767946352221, + "loss": 0.242, + "step": 26882 + }, + { + "epoch": 2.1778191834089435, + "grad_norm": 0.07537026703357697, + "learning_rate": 0.0001012331788109276, + "loss": 0.2796, + "step": 26883 + }, + { + "epoch": 2.1779001944264422, + "grad_norm": 0.058555323630571365, + "learning_rate": 0.00010122867815833297, + "loss": 0.2534, + "step": 26884 + }, + { + "epoch": 2.1779812054439405, + "grad_norm": 0.06248362362384796, + "learning_rate": 0.00010122417750573834, + "loss": 0.2499, + "step": 26885 + }, + { + "epoch": 2.1780622164614387, + "grad_norm": 0.06366802006959915, + "learning_rate": 0.0001012196768531437, + "loss": 0.2357, + "step": 26886 + }, + { + "epoch": 2.178143227478937, + "grad_norm": 0.07255370169878006, + "learning_rate": 0.00010121517620054908, + "loss": 0.2567, + "step": 26887 + }, + { + "epoch": 2.1782242384964356, + "grad_norm": 0.07287876307964325, + "learning_rate": 0.00010121067554795445, + "loss": 0.3164, + "step": 26888 + }, + { + "epoch": 2.178305249513934, + "grad_norm": 0.0844060480594635, + "learning_rate": 0.00010120617489535984, + "loss": 0.2519, + "step": 26889 + }, + { + "epoch": 2.178386260531432, + "grad_norm": 0.08379963785409927, + "learning_rate": 0.00010120167424276521, + "loss": 0.2488, + "step": 26890 + }, + { + "epoch": 2.178467271548931, + "grad_norm": 0.058833617717027664, + "learning_rate": 0.00010119717359017059, + "loss": 0.2436, + "step": 26891 + }, + { + "epoch": 2.178548282566429, + "grad_norm": 0.07550203055143356, + "learning_rate": 0.00010119267293757595, + "loss": 0.2592, + "step": 26892 + }, + { + "epoch": 2.1786292935839273, + "grad_norm": 0.0695260539650917, + "learning_rate": 0.00010118817228498132, + "loss": 0.2653, + "step": 26893 + }, + { + "epoch": 2.1787103046014256, + "grad_norm": 0.054173555225133896, + "learning_rate": 0.0001011836716323867, + "loss": 0.2421, + "step": 26894 + }, + { + "epoch": 2.1787913156189243, + "grad_norm": 0.05898955464363098, + "learning_rate": 0.00010117917097979208, + "loss": 0.2291, + "step": 26895 + }, + { + "epoch": 2.1788723266364225, + "grad_norm": 0.05412711948156357, + "learning_rate": 0.00010117467032719745, + "loss": 0.2196, + "step": 26896 + }, + { + "epoch": 2.1789533376539207, + "grad_norm": 0.053595926612615585, + "learning_rate": 
0.00010117016967460283, + "loss": 0.2341, + "step": 26897 + }, + { + "epoch": 2.1790343486714194, + "grad_norm": 0.06263613700866699, + "learning_rate": 0.00010116566902200819, + "loss": 0.2385, + "step": 26898 + }, + { + "epoch": 2.1791153596889177, + "grad_norm": 0.07698112726211548, + "learning_rate": 0.00010116116836941356, + "loss": 0.3146, + "step": 26899 + }, + { + "epoch": 2.179196370706416, + "grad_norm": 0.06529154628515244, + "learning_rate": 0.00010115666771681895, + "loss": 0.2266, + "step": 26900 + }, + { + "epoch": 2.1792773817239146, + "grad_norm": 0.0627814456820488, + "learning_rate": 0.00010115216706422432, + "loss": 0.2646, + "step": 26901 + }, + { + "epoch": 2.179358392741413, + "grad_norm": 0.06005362793803215, + "learning_rate": 0.0001011476664116297, + "loss": 0.2475, + "step": 26902 + }, + { + "epoch": 2.179439403758911, + "grad_norm": 0.062046971172094345, + "learning_rate": 0.00010114316575903507, + "loss": 0.2527, + "step": 26903 + }, + { + "epoch": 2.17952041477641, + "grad_norm": 0.06619498878717422, + "learning_rate": 0.00010113866510644043, + "loss": 0.2766, + "step": 26904 + }, + { + "epoch": 2.179601425793908, + "grad_norm": 0.06332775950431824, + "learning_rate": 0.0001011341644538458, + "loss": 0.2353, + "step": 26905 + }, + { + "epoch": 2.1796824368114063, + "grad_norm": 0.06274640560150146, + "learning_rate": 0.00010112966380125119, + "loss": 0.2526, + "step": 26906 + }, + { + "epoch": 2.179763447828905, + "grad_norm": 0.060782160609960556, + "learning_rate": 0.00010112516314865656, + "loss": 0.2578, + "step": 26907 + }, + { + "epoch": 2.179844458846403, + "grad_norm": 0.062455467879772186, + "learning_rate": 0.00010112066249606194, + "loss": 0.2314, + "step": 26908 + }, + { + "epoch": 2.1799254698639015, + "grad_norm": 0.06808603554964066, + "learning_rate": 0.00010111616184346731, + "loss": 0.2572, + "step": 26909 + }, + { + "epoch": 2.1800064808813997, + "grad_norm": 0.07538673281669617, + "learning_rate": 0.00010111166119087267, + "loss": 0.2653, + "step": 26910 + }, + { + "epoch": 2.1800874918988984, + "grad_norm": 0.05819778889417648, + "learning_rate": 0.00010110716053827804, + "loss": 0.2352, + "step": 26911 + }, + { + "epoch": 2.1801685029163966, + "grad_norm": 0.07164320349693298, + "learning_rate": 0.00010110265988568343, + "loss": 0.2437, + "step": 26912 + }, + { + "epoch": 2.180249513933895, + "grad_norm": 0.0552498884499073, + "learning_rate": 0.0001010981592330888, + "loss": 0.2135, + "step": 26913 + }, + { + "epoch": 2.1803305249513936, + "grad_norm": 0.0703219398856163, + "learning_rate": 0.00010109365858049418, + "loss": 0.2735, + "step": 26914 + }, + { + "epoch": 2.180411535968892, + "grad_norm": 0.0603179857134819, + "learning_rate": 0.00010108915792789955, + "loss": 0.2237, + "step": 26915 + }, + { + "epoch": 2.18049254698639, + "grad_norm": 0.07395555824041367, + "learning_rate": 0.00010108465727530491, + "loss": 0.2294, + "step": 26916 + }, + { + "epoch": 2.1805735580038883, + "grad_norm": 0.06649938225746155, + "learning_rate": 0.00010108015662271029, + "loss": 0.2314, + "step": 26917 + }, + { + "epoch": 2.180654569021387, + "grad_norm": 0.05904200300574303, + "learning_rate": 0.00010107565597011567, + "loss": 0.2757, + "step": 26918 + }, + { + "epoch": 2.1807355800388852, + "grad_norm": 0.0554424412548542, + "learning_rate": 0.00010107115531752105, + "loss": 0.2022, + "step": 26919 + }, + { + "epoch": 2.1808165910563835, + "grad_norm": 0.07284878194332123, + "learning_rate": 0.00010106665466492642, + "loss": 0.2521, + "step": 
26920 + }, + { + "epoch": 2.180897602073882, + "grad_norm": 0.06328210979700089, + "learning_rate": 0.0001010621540123318, + "loss": 0.2335, + "step": 26921 + }, + { + "epoch": 2.1809786130913804, + "grad_norm": 0.07258543372154236, + "learning_rate": 0.00010105765335973715, + "loss": 0.2862, + "step": 26922 + }, + { + "epoch": 2.1810596241088787, + "grad_norm": 0.07839885354042053, + "learning_rate": 0.00010105315270714255, + "loss": 0.3048, + "step": 26923 + }, + { + "epoch": 2.1811406351263773, + "grad_norm": 0.06599098443984985, + "learning_rate": 0.00010104865205454791, + "loss": 0.3021, + "step": 26924 + }, + { + "epoch": 2.1812216461438756, + "grad_norm": 0.059078000485897064, + "learning_rate": 0.00010104415140195329, + "loss": 0.2275, + "step": 26925 + }, + { + "epoch": 2.181302657161374, + "grad_norm": 0.0659744143486023, + "learning_rate": 0.00010103965074935866, + "loss": 0.2426, + "step": 26926 + }, + { + "epoch": 2.1813836681788725, + "grad_norm": 0.05550341308116913, + "learning_rate": 0.00010103515009676404, + "loss": 0.237, + "step": 26927 + }, + { + "epoch": 2.1814646791963708, + "grad_norm": 0.061645619571208954, + "learning_rate": 0.0001010306494441694, + "loss": 0.2622, + "step": 26928 + }, + { + "epoch": 2.181545690213869, + "grad_norm": 0.06350360810756683, + "learning_rate": 0.0001010261487915748, + "loss": 0.2376, + "step": 26929 + }, + { + "epoch": 2.1816267012313673, + "grad_norm": 0.05759890004992485, + "learning_rate": 0.00010102164813898016, + "loss": 0.272, + "step": 26930 + }, + { + "epoch": 2.181707712248866, + "grad_norm": 0.06146299093961716, + "learning_rate": 0.00010101714748638553, + "loss": 0.2552, + "step": 26931 + }, + { + "epoch": 2.181788723266364, + "grad_norm": 0.06417529284954071, + "learning_rate": 0.0001010126468337909, + "loss": 0.2554, + "step": 26932 + }, + { + "epoch": 2.1818697342838624, + "grad_norm": 0.06104740872979164, + "learning_rate": 0.00010100814618119628, + "loss": 0.2642, + "step": 26933 + }, + { + "epoch": 2.181950745301361, + "grad_norm": 0.05828656256198883, + "learning_rate": 0.00010100364552860164, + "loss": 0.2439, + "step": 26934 + }, + { + "epoch": 2.1820317563188594, + "grad_norm": 0.05927203223109245, + "learning_rate": 0.00010099914487600704, + "loss": 0.2732, + "step": 26935 + }, + { + "epoch": 2.1821127673363576, + "grad_norm": 0.07756493240594864, + "learning_rate": 0.00010099464422341241, + "loss": 0.2433, + "step": 26936 + }, + { + "epoch": 2.1821937783538563, + "grad_norm": 0.06863519549369812, + "learning_rate": 0.00010099014357081777, + "loss": 0.2594, + "step": 26937 + }, + { + "epoch": 2.1822747893713546, + "grad_norm": 0.06425057351589203, + "learning_rate": 0.00010098564291822315, + "loss": 0.2223, + "step": 26938 + }, + { + "epoch": 2.182355800388853, + "grad_norm": 0.057839956134557724, + "learning_rate": 0.00010098114226562852, + "loss": 0.2562, + "step": 26939 + }, + { + "epoch": 2.182436811406351, + "grad_norm": 0.06706058979034424, + "learning_rate": 0.00010097664161303388, + "loss": 0.2575, + "step": 26940 + }, + { + "epoch": 2.1825178224238497, + "grad_norm": 0.07520833611488342, + "learning_rate": 0.00010097214096043928, + "loss": 0.2859, + "step": 26941 + }, + { + "epoch": 2.182598833441348, + "grad_norm": 0.06069038808345795, + "learning_rate": 0.00010096764030784465, + "loss": 0.2359, + "step": 26942 + }, + { + "epoch": 2.182679844458846, + "grad_norm": 0.06446312367916107, + "learning_rate": 0.00010096313965525001, + "loss": 0.2374, + "step": 26943 + }, + { + "epoch": 2.182760855476345, + 
"grad_norm": 0.07859501242637634, + "learning_rate": 0.00010095863900265539, + "loss": 0.2502, + "step": 26944 + }, + { + "epoch": 2.182841866493843, + "grad_norm": 0.06446570158004761, + "learning_rate": 0.00010095413835006076, + "loss": 0.2924, + "step": 26945 + }, + { + "epoch": 2.1829228775113414, + "grad_norm": 0.06464308500289917, + "learning_rate": 0.00010094963769746615, + "loss": 0.262, + "step": 26946 + }, + { + "epoch": 2.18300388852884, + "grad_norm": 0.04921901226043701, + "learning_rate": 0.00010094513704487152, + "loss": 0.2536, + "step": 26947 + }, + { + "epoch": 2.1830848995463383, + "grad_norm": 0.06221909075975418, + "learning_rate": 0.0001009406363922769, + "loss": 0.3152, + "step": 26948 + }, + { + "epoch": 2.1831659105638366, + "grad_norm": 0.06305009126663208, + "learning_rate": 0.00010093613573968225, + "loss": 0.2528, + "step": 26949 + }, + { + "epoch": 2.1832469215813353, + "grad_norm": 0.08612064272165298, + "learning_rate": 0.00010093163508708763, + "loss": 0.2447, + "step": 26950 + }, + { + "epoch": 2.1833279325988335, + "grad_norm": 0.07131386548280716, + "learning_rate": 0.000100927134434493, + "loss": 0.2517, + "step": 26951 + }, + { + "epoch": 2.1834089436163318, + "grad_norm": 0.08630380779504776, + "learning_rate": 0.00010092263378189839, + "loss": 0.3009, + "step": 26952 + }, + { + "epoch": 2.18348995463383, + "grad_norm": 0.05582493916153908, + "learning_rate": 0.00010091813312930376, + "loss": 0.2636, + "step": 26953 + }, + { + "epoch": 2.1835709656513287, + "grad_norm": 0.07412777841091156, + "learning_rate": 0.00010091363247670914, + "loss": 0.2333, + "step": 26954 + }, + { + "epoch": 2.183651976668827, + "grad_norm": 0.06575404107570648, + "learning_rate": 0.0001009091318241145, + "loss": 0.2573, + "step": 26955 + }, + { + "epoch": 2.183732987686325, + "grad_norm": 0.0642695501446724, + "learning_rate": 0.00010090463117151987, + "loss": 0.2624, + "step": 26956 + }, + { + "epoch": 2.183813998703824, + "grad_norm": 0.0616472102701664, + "learning_rate": 0.00010090013051892524, + "loss": 0.2231, + "step": 26957 + }, + { + "epoch": 2.183895009721322, + "grad_norm": 0.06302770972251892, + "learning_rate": 0.00010089562986633063, + "loss": 0.2798, + "step": 26958 + }, + { + "epoch": 2.1839760207388204, + "grad_norm": 0.05955367907881737, + "learning_rate": 0.000100891129213736, + "loss": 0.2604, + "step": 26959 + }, + { + "epoch": 2.184057031756319, + "grad_norm": 0.06482961773872375, + "learning_rate": 0.00010088662856114138, + "loss": 0.2375, + "step": 26960 + }, + { + "epoch": 2.1841380427738173, + "grad_norm": 0.0561881847679615, + "learning_rate": 0.00010088212790854674, + "loss": 0.226, + "step": 26961 + }, + { + "epoch": 2.1842190537913155, + "grad_norm": 0.07170061022043228, + "learning_rate": 0.00010087762725595211, + "loss": 0.2623, + "step": 26962 + }, + { + "epoch": 2.184300064808814, + "grad_norm": 0.06463984400033951, + "learning_rate": 0.00010087312660335749, + "loss": 0.2756, + "step": 26963 + }, + { + "epoch": 2.1843810758263125, + "grad_norm": 0.061898235231637955, + "learning_rate": 0.00010086862595076287, + "loss": 0.2383, + "step": 26964 + }, + { + "epoch": 2.1844620868438107, + "grad_norm": 0.06135169789195061, + "learning_rate": 0.00010086412529816825, + "loss": 0.2642, + "step": 26965 + }, + { + "epoch": 2.184543097861309, + "grad_norm": 0.08380525559186935, + "learning_rate": 0.00010085962464557362, + "loss": 0.2849, + "step": 26966 + }, + { + "epoch": 2.1846241088788076, + "grad_norm": 0.07640878856182098, + "learning_rate": 
0.00010085512399297898, + "loss": 0.3056, + "step": 26967 + }, + { + "epoch": 2.184705119896306, + "grad_norm": 0.0755113810300827, + "learning_rate": 0.00010085062334038435, + "loss": 0.2957, + "step": 26968 + }, + { + "epoch": 2.184786130913804, + "grad_norm": 0.07308080792427063, + "learning_rate": 0.00010084612268778973, + "loss": 0.2689, + "step": 26969 + }, + { + "epoch": 2.184867141931303, + "grad_norm": 0.07572430372238159, + "learning_rate": 0.00010084162203519511, + "loss": 0.2492, + "step": 26970 + }, + { + "epoch": 2.184948152948801, + "grad_norm": 0.07837171852588654, + "learning_rate": 0.00010083712138260049, + "loss": 0.3113, + "step": 26971 + }, + { + "epoch": 2.1850291639662993, + "grad_norm": 0.06892222166061401, + "learning_rate": 0.00010083262073000586, + "loss": 0.2123, + "step": 26972 + }, + { + "epoch": 2.185110174983798, + "grad_norm": 0.0713641420006752, + "learning_rate": 0.00010082812007741122, + "loss": 0.2821, + "step": 26973 + }, + { + "epoch": 2.1851911860012962, + "grad_norm": 0.05223554000258446, + "learning_rate": 0.0001008236194248166, + "loss": 0.2282, + "step": 26974 + }, + { + "epoch": 2.1852721970187945, + "grad_norm": 0.060114096850156784, + "learning_rate": 0.00010081911877222198, + "loss": 0.2458, + "step": 26975 + }, + { + "epoch": 2.1853532080362927, + "grad_norm": 0.06315486133098602, + "learning_rate": 0.00010081461811962736, + "loss": 0.2526, + "step": 26976 + }, + { + "epoch": 2.1854342190537914, + "grad_norm": 0.05718789994716644, + "learning_rate": 0.00010081011746703273, + "loss": 0.2548, + "step": 26977 + }, + { + "epoch": 2.1855152300712897, + "grad_norm": 0.0645727887749672, + "learning_rate": 0.0001008056168144381, + "loss": 0.2194, + "step": 26978 + }, + { + "epoch": 2.185596241088788, + "grad_norm": 0.05673626437783241, + "learning_rate": 0.00010080111616184346, + "loss": 0.2776, + "step": 26979 + }, + { + "epoch": 2.1856772521062866, + "grad_norm": 0.0616806261241436, + "learning_rate": 0.00010079661550924884, + "loss": 0.2394, + "step": 26980 + }, + { + "epoch": 2.185758263123785, + "grad_norm": 0.08814667910337448, + "learning_rate": 0.00010079211485665422, + "loss": 0.2503, + "step": 26981 + }, + { + "epoch": 2.185839274141283, + "grad_norm": 0.06610722839832306, + "learning_rate": 0.0001007876142040596, + "loss": 0.2726, + "step": 26982 + }, + { + "epoch": 2.185920285158782, + "grad_norm": 0.0646798387169838, + "learning_rate": 0.00010078311355146497, + "loss": 0.2755, + "step": 26983 + }, + { + "epoch": 2.18600129617628, + "grad_norm": 0.06937199085950851, + "learning_rate": 0.00010077861289887034, + "loss": 0.2723, + "step": 26984 + }, + { + "epoch": 2.1860823071937783, + "grad_norm": 0.0682591125369072, + "learning_rate": 0.0001007741122462757, + "loss": 0.2559, + "step": 26985 + }, + { + "epoch": 2.1861633182112765, + "grad_norm": 0.0681900903582573, + "learning_rate": 0.00010076961159368108, + "loss": 0.2232, + "step": 26986 + }, + { + "epoch": 2.186244329228775, + "grad_norm": 0.06384847313165665, + "learning_rate": 0.00010076511094108647, + "loss": 0.2501, + "step": 26987 + }, + { + "epoch": 2.1863253402462735, + "grad_norm": 0.06840259581804276, + "learning_rate": 0.00010076061028849184, + "loss": 0.2717, + "step": 26988 + }, + { + "epoch": 2.1864063512637717, + "grad_norm": 0.06057314947247505, + "learning_rate": 0.00010075610963589721, + "loss": 0.2268, + "step": 26989 + }, + { + "epoch": 2.1864873622812704, + "grad_norm": 0.058428164571523666, + "learning_rate": 0.00010075160898330259, + "loss": 0.2459, + "step": 
26990 + }, + { + "epoch": 2.1865683732987686, + "grad_norm": 0.07158659398555756, + "learning_rate": 0.00010074710833070795, + "loss": 0.2702, + "step": 26991 + }, + { + "epoch": 2.186649384316267, + "grad_norm": 0.06685875356197357, + "learning_rate": 0.00010074260767811332, + "loss": 0.2488, + "step": 26992 + }, + { + "epoch": 2.1867303953337656, + "grad_norm": 0.049234382808208466, + "learning_rate": 0.0001007381070255187, + "loss": 0.2249, + "step": 26993 + }, + { + "epoch": 2.186811406351264, + "grad_norm": 0.057531118392944336, + "learning_rate": 0.00010073360637292408, + "loss": 0.2399, + "step": 26994 + }, + { + "epoch": 2.186892417368762, + "grad_norm": 0.06531118601560593, + "learning_rate": 0.00010072910572032945, + "loss": 0.2352, + "step": 26995 + }, + { + "epoch": 2.1869734283862607, + "grad_norm": 0.05908123403787613, + "learning_rate": 0.00010072460506773483, + "loss": 0.2365, + "step": 26996 + }, + { + "epoch": 2.187054439403759, + "grad_norm": 0.07252193987369537, + "learning_rate": 0.00010072010441514019, + "loss": 0.2443, + "step": 26997 + }, + { + "epoch": 2.1871354504212572, + "grad_norm": 0.06428375095129013, + "learning_rate": 0.00010071560376254556, + "loss": 0.2711, + "step": 26998 + }, + { + "epoch": 2.1872164614387555, + "grad_norm": 0.06661681085824966, + "learning_rate": 0.00010071110310995095, + "loss": 0.2677, + "step": 26999 + }, + { + "epoch": 2.187297472456254, + "grad_norm": 0.06688234955072403, + "learning_rate": 0.00010070660245735632, + "loss": 0.2124, + "step": 27000 + }, + { + "epoch": 2.1873784834737524, + "grad_norm": 0.062010444700717926, + "learning_rate": 0.0001007021018047617, + "loss": 0.2186, + "step": 27001 + }, + { + "epoch": 2.1874594944912507, + "grad_norm": 0.06785605847835541, + "learning_rate": 0.00010069760115216707, + "loss": 0.2512, + "step": 27002 + }, + { + "epoch": 2.1875405055087493, + "grad_norm": 0.06811809539794922, + "learning_rate": 0.00010069310049957243, + "loss": 0.2352, + "step": 27003 + }, + { + "epoch": 2.1876215165262476, + "grad_norm": 0.06498055905103683, + "learning_rate": 0.00010068859984697783, + "loss": 0.2579, + "step": 27004 + }, + { + "epoch": 2.187702527543746, + "grad_norm": 0.07715179026126862, + "learning_rate": 0.0001006840991943832, + "loss": 0.2328, + "step": 27005 + }, + { + "epoch": 2.1877835385612445, + "grad_norm": 0.07081539183855057, + "learning_rate": 0.00010067959854178856, + "loss": 0.2599, + "step": 27006 + }, + { + "epoch": 2.1878645495787428, + "grad_norm": 0.082499660551548, + "learning_rate": 0.00010067509788919394, + "loss": 0.288, + "step": 27007 + }, + { + "epoch": 2.187945560596241, + "grad_norm": 0.06907915323972702, + "learning_rate": 0.00010067059723659931, + "loss": 0.2821, + "step": 27008 + }, + { + "epoch": 2.1880265716137393, + "grad_norm": 0.0711050033569336, + "learning_rate": 0.00010066609658400467, + "loss": 0.2782, + "step": 27009 + }, + { + "epoch": 2.188107582631238, + "grad_norm": 0.06753864884376526, + "learning_rate": 0.00010066159593141007, + "loss": 0.2313, + "step": 27010 + }, + { + "epoch": 2.188188593648736, + "grad_norm": 0.059835441410541534, + "learning_rate": 0.00010065709527881545, + "loss": 0.2399, + "step": 27011 + }, + { + "epoch": 2.1882696046662344, + "grad_norm": 0.07964125275611877, + "learning_rate": 0.0001006525946262208, + "loss": 0.2661, + "step": 27012 + }, + { + "epoch": 2.188350615683733, + "grad_norm": 0.06409208476543427, + "learning_rate": 0.00010064809397362618, + "loss": 0.2652, + "step": 27013 + }, + { + "epoch": 2.1884316267012314, + 
"grad_norm": 0.05935799330472946, + "learning_rate": 0.00010064359332103155, + "loss": 0.244, + "step": 27014 + }, + { + "epoch": 2.1885126377187296, + "grad_norm": 0.08206460624933243, + "learning_rate": 0.00010063909266843691, + "loss": 0.2023, + "step": 27015 + }, + { + "epoch": 2.1885936487362283, + "grad_norm": 0.07063757628202438, + "learning_rate": 0.00010063459201584231, + "loss": 0.2893, + "step": 27016 + }, + { + "epoch": 2.1886746597537265, + "grad_norm": 0.06272878497838974, + "learning_rate": 0.00010063009136324769, + "loss": 0.2369, + "step": 27017 + }, + { + "epoch": 2.188755670771225, + "grad_norm": 0.060590989887714386, + "learning_rate": 0.00010062559071065305, + "loss": 0.2661, + "step": 27018 + }, + { + "epoch": 2.1888366817887235, + "grad_norm": 0.06756321340799332, + "learning_rate": 0.00010062109005805842, + "loss": 0.2799, + "step": 27019 + }, + { + "epoch": 2.1889176928062217, + "grad_norm": 0.06753882765769958, + "learning_rate": 0.0001006165894054638, + "loss": 0.2786, + "step": 27020 + }, + { + "epoch": 2.18899870382372, + "grad_norm": 0.0644640401005745, + "learning_rate": 0.00010061208875286915, + "loss": 0.2291, + "step": 27021 + }, + { + "epoch": 2.189079714841218, + "grad_norm": 0.06838696449995041, + "learning_rate": 0.00010060758810027455, + "loss": 0.2785, + "step": 27022 + }, + { + "epoch": 2.189160725858717, + "grad_norm": 0.06403999775648117, + "learning_rate": 0.00010060308744767993, + "loss": 0.2289, + "step": 27023 + }, + { + "epoch": 2.189241736876215, + "grad_norm": 0.06139326095581055, + "learning_rate": 0.00010059858679508529, + "loss": 0.2291, + "step": 27024 + }, + { + "epoch": 2.1893227478937134, + "grad_norm": 0.0574423111975193, + "learning_rate": 0.00010059408614249066, + "loss": 0.259, + "step": 27025 + }, + { + "epoch": 2.189403758911212, + "grad_norm": 0.07539302110671997, + "learning_rate": 0.00010058958548989604, + "loss": 0.2837, + "step": 27026 + }, + { + "epoch": 2.1894847699287103, + "grad_norm": 0.06395363807678223, + "learning_rate": 0.00010058508483730142, + "loss": 0.2634, + "step": 27027 + }, + { + "epoch": 2.1895657809462086, + "grad_norm": 0.05907757952809334, + "learning_rate": 0.0001005805841847068, + "loss": 0.2663, + "step": 27028 + }, + { + "epoch": 2.1896467919637073, + "grad_norm": 0.06327487528324127, + "learning_rate": 0.00010057608353211217, + "loss": 0.2362, + "step": 27029 + }, + { + "epoch": 2.1897278029812055, + "grad_norm": 0.06699670106172562, + "learning_rate": 0.00010057158287951753, + "loss": 0.2709, + "step": 27030 + }, + { + "epoch": 2.1898088139987038, + "grad_norm": 0.051833443343639374, + "learning_rate": 0.0001005670822269229, + "loss": 0.2482, + "step": 27031 + }, + { + "epoch": 2.189889825016202, + "grad_norm": 0.07217784970998764, + "learning_rate": 0.00010056258157432828, + "loss": 0.2804, + "step": 27032 + }, + { + "epoch": 2.1899708360337007, + "grad_norm": 0.06261934340000153, + "learning_rate": 0.00010055808092173366, + "loss": 0.2548, + "step": 27033 + }, + { + "epoch": 2.190051847051199, + "grad_norm": 0.06124917045235634, + "learning_rate": 0.00010055358026913904, + "loss": 0.218, + "step": 27034 + }, + { + "epoch": 2.190132858068697, + "grad_norm": 0.06161245331168175, + "learning_rate": 0.00010054907961654441, + "loss": 0.229, + "step": 27035 + }, + { + "epoch": 2.190213869086196, + "grad_norm": 0.07126471400260925, + "learning_rate": 0.00010054457896394977, + "loss": 0.2559, + "step": 27036 + }, + { + "epoch": 2.190294880103694, + "grad_norm": 0.05679122358560562, + "learning_rate": 
0.00010054007831135515, + "loss": 0.2001, + "step": 27037 + }, + { + "epoch": 2.1903758911211924, + "grad_norm": 0.07383640855550766, + "learning_rate": 0.00010053557765876052, + "loss": 0.2519, + "step": 27038 + }, + { + "epoch": 2.190456902138691, + "grad_norm": 0.06887379288673401, + "learning_rate": 0.0001005310770061659, + "loss": 0.2363, + "step": 27039 + }, + { + "epoch": 2.1905379131561893, + "grad_norm": 0.06768579035997391, + "learning_rate": 0.00010052657635357128, + "loss": 0.241, + "step": 27040 + }, + { + "epoch": 2.1906189241736875, + "grad_norm": 0.061511702835559845, + "learning_rate": 0.00010052207570097665, + "loss": 0.2445, + "step": 27041 + }, + { + "epoch": 2.190699935191186, + "grad_norm": 0.06590621918439865, + "learning_rate": 0.00010051757504838201, + "loss": 0.2734, + "step": 27042 + }, + { + "epoch": 2.1907809462086845, + "grad_norm": 0.060971327126026154, + "learning_rate": 0.00010051307439578739, + "loss": 0.2402, + "step": 27043 + }, + { + "epoch": 2.1908619572261827, + "grad_norm": 0.05624224618077278, + "learning_rate": 0.00010050857374319276, + "loss": 0.2245, + "step": 27044 + }, + { + "epoch": 2.190942968243681, + "grad_norm": 0.06078394129872322, + "learning_rate": 0.00010050407309059815, + "loss": 0.3057, + "step": 27045 + }, + { + "epoch": 2.1910239792611796, + "grad_norm": 0.06620259582996368, + "learning_rate": 0.00010049957243800352, + "loss": 0.2642, + "step": 27046 + }, + { + "epoch": 2.191104990278678, + "grad_norm": 0.05764997750520706, + "learning_rate": 0.0001004950717854089, + "loss": 0.2416, + "step": 27047 + }, + { + "epoch": 2.191186001296176, + "grad_norm": 0.05957689881324768, + "learning_rate": 0.00010049057113281425, + "loss": 0.2355, + "step": 27048 + }, + { + "epoch": 2.191267012313675, + "grad_norm": 0.06990189105272293, + "learning_rate": 0.00010048607048021963, + "loss": 0.2298, + "step": 27049 + }, + { + "epoch": 2.191348023331173, + "grad_norm": 0.061472970992326736, + "learning_rate": 0.000100481569827625, + "loss": 0.2542, + "step": 27050 + }, + { + "epoch": 2.1914290343486713, + "grad_norm": 0.05452711880207062, + "learning_rate": 0.00010047706917503039, + "loss": 0.2594, + "step": 27051 + }, + { + "epoch": 2.19151004536617, + "grad_norm": 0.05978472903370857, + "learning_rate": 0.00010047256852243576, + "loss": 0.2424, + "step": 27052 + }, + { + "epoch": 2.1915910563836682, + "grad_norm": 0.06296546757221222, + "learning_rate": 0.00010046806786984114, + "loss": 0.2645, + "step": 27053 + }, + { + "epoch": 2.1916720674011665, + "grad_norm": 0.06470096856355667, + "learning_rate": 0.0001004635672172465, + "loss": 0.2586, + "step": 27054 + }, + { + "epoch": 2.1917530784186647, + "grad_norm": 0.06551109999418259, + "learning_rate": 0.00010045906656465187, + "loss": 0.2369, + "step": 27055 + }, + { + "epoch": 2.1918340894361634, + "grad_norm": 0.06295715272426605, + "learning_rate": 0.00010045456591205726, + "loss": 0.2507, + "step": 27056 + }, + { + "epoch": 2.1919151004536617, + "grad_norm": 0.06285136938095093, + "learning_rate": 0.00010045006525946263, + "loss": 0.2484, + "step": 27057 + }, + { + "epoch": 2.19199611147116, + "grad_norm": 0.04650372266769409, + "learning_rate": 0.000100445564606868, + "loss": 0.2413, + "step": 27058 + }, + { + "epoch": 2.1920771224886586, + "grad_norm": 0.06401270627975464, + "learning_rate": 0.00010044106395427338, + "loss": 0.2889, + "step": 27059 + }, + { + "epoch": 2.192158133506157, + "grad_norm": 0.0678238719701767, + "learning_rate": 0.00010043656330167874, + "loss": 0.2224, + "step": 
27060 + }, + { + "epoch": 2.192239144523655, + "grad_norm": 0.06454212963581085, + "learning_rate": 0.00010043206264908411, + "loss": 0.2326, + "step": 27061 + }, + { + "epoch": 2.192320155541154, + "grad_norm": 0.0558546744287014, + "learning_rate": 0.0001004275619964895, + "loss": 0.2824, + "step": 27062 + }, + { + "epoch": 2.192401166558652, + "grad_norm": 0.06207578256726265, + "learning_rate": 0.00010042306134389487, + "loss": 0.272, + "step": 27063 + }, + { + "epoch": 2.1924821775761503, + "grad_norm": 0.05711657181382179, + "learning_rate": 0.00010041856069130025, + "loss": 0.2588, + "step": 27064 + }, + { + "epoch": 2.192563188593649, + "grad_norm": 0.07172786444425583, + "learning_rate": 0.00010041406003870562, + "loss": 0.2402, + "step": 27065 + }, + { + "epoch": 2.192644199611147, + "grad_norm": 0.07652819156646729, + "learning_rate": 0.00010040955938611098, + "loss": 0.233, + "step": 27066 + }, + { + "epoch": 2.1927252106286454, + "grad_norm": 0.062125325202941895, + "learning_rate": 0.00010040505873351635, + "loss": 0.2643, + "step": 27067 + }, + { + "epoch": 2.1928062216461437, + "grad_norm": 0.05422678962349892, + "learning_rate": 0.00010040055808092174, + "loss": 0.2316, + "step": 27068 + }, + { + "epoch": 2.1928872326636424, + "grad_norm": 0.05607717111706734, + "learning_rate": 0.00010039605742832711, + "loss": 0.2473, + "step": 27069 + }, + { + "epoch": 2.1929682436811406, + "grad_norm": 0.06470224261283875, + "learning_rate": 0.00010039155677573249, + "loss": 0.2499, + "step": 27070 + }, + { + "epoch": 2.193049254698639, + "grad_norm": 0.08244304358959198, + "learning_rate": 0.00010038705612313786, + "loss": 0.2361, + "step": 27071 + }, + { + "epoch": 2.1931302657161376, + "grad_norm": 0.058195363730192184, + "learning_rate": 0.00010038255547054322, + "loss": 0.2303, + "step": 27072 + }, + { + "epoch": 2.193211276733636, + "grad_norm": 0.06642767786979675, + "learning_rate": 0.0001003780548179486, + "loss": 0.2225, + "step": 27073 + }, + { + "epoch": 2.193292287751134, + "grad_norm": 0.06660183519124985, + "learning_rate": 0.000100373554165354, + "loss": 0.267, + "step": 27074 + }, + { + "epoch": 2.1933732987686327, + "grad_norm": 0.06869763880968094, + "learning_rate": 0.00010036905351275936, + "loss": 0.2598, + "step": 27075 + }, + { + "epoch": 2.193454309786131, + "grad_norm": 0.06465927511453629, + "learning_rate": 0.00010036455286016473, + "loss": 0.269, + "step": 27076 + }, + { + "epoch": 2.1935353208036292, + "grad_norm": 0.06094600260257721, + "learning_rate": 0.0001003600522075701, + "loss": 0.2514, + "step": 27077 + }, + { + "epoch": 2.1936163318211275, + "grad_norm": 0.06704872101545334, + "learning_rate": 0.00010035555155497546, + "loss": 0.2857, + "step": 27078 + }, + { + "epoch": 2.193697342838626, + "grad_norm": 0.0678897276520729, + "learning_rate": 0.00010035105090238086, + "loss": 0.2392, + "step": 27079 + }, + { + "epoch": 2.1937783538561244, + "grad_norm": 0.06632528454065323, + "learning_rate": 0.00010034655024978624, + "loss": 0.2516, + "step": 27080 + }, + { + "epoch": 2.1938593648736227, + "grad_norm": 0.05018361285328865, + "learning_rate": 0.0001003420495971916, + "loss": 0.2556, + "step": 27081 + }, + { + "epoch": 2.1939403758911213, + "grad_norm": 0.06408238410949707, + "learning_rate": 0.00010033754894459697, + "loss": 0.2101, + "step": 27082 + }, + { + "epoch": 2.1940213869086196, + "grad_norm": 0.054987452924251556, + "learning_rate": 0.00010033304829200234, + "loss": 0.2328, + "step": 27083 + }, + { + "epoch": 2.194102397926118, + 
"grad_norm": 0.08715245872735977, + "learning_rate": 0.0001003285476394077, + "loss": 0.2756, + "step": 27084 + }, + { + "epoch": 2.1941834089436165, + "grad_norm": 0.0645839124917984, + "learning_rate": 0.0001003240469868131, + "loss": 0.2502, + "step": 27085 + }, + { + "epoch": 2.1942644199611148, + "grad_norm": 0.07756000757217407, + "learning_rate": 0.00010031954633421848, + "loss": 0.2455, + "step": 27086 + }, + { + "epoch": 2.194345430978613, + "grad_norm": 0.07709761708974838, + "learning_rate": 0.00010031504568162384, + "loss": 0.2763, + "step": 27087 + }, + { + "epoch": 2.1944264419961117, + "grad_norm": 0.06762171536684036, + "learning_rate": 0.00010031054502902921, + "loss": 0.2682, + "step": 27088 + }, + { + "epoch": 2.19450745301361, + "grad_norm": 0.07915575802326202, + "learning_rate": 0.00010030604437643459, + "loss": 0.2576, + "step": 27089 + }, + { + "epoch": 2.194588464031108, + "grad_norm": 0.0666109248995781, + "learning_rate": 0.00010030154372383995, + "loss": 0.2592, + "step": 27090 + }, + { + "epoch": 2.1946694750486064, + "grad_norm": 0.065487340092659, + "learning_rate": 0.00010029704307124535, + "loss": 0.2819, + "step": 27091 + }, + { + "epoch": 2.194750486066105, + "grad_norm": 0.04944925010204315, + "learning_rate": 0.00010029254241865072, + "loss": 0.2711, + "step": 27092 + }, + { + "epoch": 2.1948314970836034, + "grad_norm": 0.05818904563784599, + "learning_rate": 0.00010028804176605608, + "loss": 0.2692, + "step": 27093 + }, + { + "epoch": 2.1949125081011016, + "grad_norm": 0.07648874819278717, + "learning_rate": 0.00010028354111346145, + "loss": 0.2549, + "step": 27094 + }, + { + "epoch": 2.1949935191186003, + "grad_norm": 0.06092251464724541, + "learning_rate": 0.00010027904046086683, + "loss": 0.2752, + "step": 27095 + }, + { + "epoch": 2.1950745301360985, + "grad_norm": 0.05122043564915657, + "learning_rate": 0.00010027453980827219, + "loss": 0.2425, + "step": 27096 + }, + { + "epoch": 2.195155541153597, + "grad_norm": 0.06031843647360802, + "learning_rate": 0.00010027003915567759, + "loss": 0.2439, + "step": 27097 + }, + { + "epoch": 2.195236552171095, + "grad_norm": 0.07118486613035202, + "learning_rate": 0.00010026553850308296, + "loss": 0.2973, + "step": 27098 + }, + { + "epoch": 2.1953175631885937, + "grad_norm": 0.0504891462624073, + "learning_rate": 0.00010026103785048832, + "loss": 0.2472, + "step": 27099 + }, + { + "epoch": 2.195398574206092, + "grad_norm": 0.060217734426259995, + "learning_rate": 0.0001002565371978937, + "loss": 0.2556, + "step": 27100 + }, + { + "epoch": 2.19547958522359, + "grad_norm": 0.057529326528310776, + "learning_rate": 0.00010025203654529907, + "loss": 0.2159, + "step": 27101 + }, + { + "epoch": 2.195560596241089, + "grad_norm": 0.06307288259267807, + "learning_rate": 0.00010024753589270443, + "loss": 0.2522, + "step": 27102 + }, + { + "epoch": 2.195641607258587, + "grad_norm": 0.0709809735417366, + "learning_rate": 0.00010024303524010983, + "loss": 0.2566, + "step": 27103 + }, + { + "epoch": 2.1957226182760854, + "grad_norm": 0.058112580329179764, + "learning_rate": 0.0001002385345875152, + "loss": 0.2735, + "step": 27104 + }, + { + "epoch": 2.195803629293584, + "grad_norm": 0.051618389785289764, + "learning_rate": 0.00010023403393492056, + "loss": 0.2369, + "step": 27105 + }, + { + "epoch": 2.1958846403110823, + "grad_norm": 0.051942378282547, + "learning_rate": 0.00010022953328232594, + "loss": 0.2598, + "step": 27106 + }, + { + "epoch": 2.1959656513285806, + "grad_norm": 0.06882523000240326, + "learning_rate": 
0.00010022503262973131, + "loss": 0.283, + "step": 27107 + }, + { + "epoch": 2.1960466623460793, + "grad_norm": 0.07351597398519516, + "learning_rate": 0.0001002205319771367, + "loss": 0.2697, + "step": 27108 + }, + { + "epoch": 2.1961276733635775, + "grad_norm": 0.06842011213302612, + "learning_rate": 0.00010021603132454207, + "loss": 0.2583, + "step": 27109 + }, + { + "epoch": 2.1962086843810757, + "grad_norm": 0.06987638026475906, + "learning_rate": 0.00010021153067194745, + "loss": 0.2067, + "step": 27110 + }, + { + "epoch": 2.1962896953985744, + "grad_norm": 0.0682644248008728, + "learning_rate": 0.0001002070300193528, + "loss": 0.2765, + "step": 27111 + }, + { + "epoch": 2.1963707064160727, + "grad_norm": 0.0675097405910492, + "learning_rate": 0.00010020252936675818, + "loss": 0.2952, + "step": 27112 + }, + { + "epoch": 2.196451717433571, + "grad_norm": 0.06232113763689995, + "learning_rate": 0.00010019802871416355, + "loss": 0.2379, + "step": 27113 + }, + { + "epoch": 2.196532728451069, + "grad_norm": 0.06394306570291519, + "learning_rate": 0.00010019352806156894, + "loss": 0.2296, + "step": 27114 + }, + { + "epoch": 2.196613739468568, + "grad_norm": 0.06672637909650803, + "learning_rate": 0.00010018902740897431, + "loss": 0.2604, + "step": 27115 + }, + { + "epoch": 2.196694750486066, + "grad_norm": 0.07373014092445374, + "learning_rate": 0.00010018452675637969, + "loss": 0.2931, + "step": 27116 + }, + { + "epoch": 2.1967757615035644, + "grad_norm": 0.06646038591861725, + "learning_rate": 0.00010018002610378505, + "loss": 0.245, + "step": 27117 + }, + { + "epoch": 2.196856772521063, + "grad_norm": 0.050996072590351105, + "learning_rate": 0.00010017552545119042, + "loss": 0.2759, + "step": 27118 + }, + { + "epoch": 2.1969377835385613, + "grad_norm": 0.04928220435976982, + "learning_rate": 0.0001001710247985958, + "loss": 0.2405, + "step": 27119 + }, + { + "epoch": 2.1970187945560595, + "grad_norm": 0.07236079126596451, + "learning_rate": 0.00010016652414600118, + "loss": 0.3173, + "step": 27120 + }, + { + "epoch": 2.1970998055735578, + "grad_norm": 0.06007302552461624, + "learning_rate": 0.00010016202349340656, + "loss": 0.2147, + "step": 27121 + }, + { + "epoch": 2.1971808165910565, + "grad_norm": 0.058224182575941086, + "learning_rate": 0.00010015752284081193, + "loss": 0.282, + "step": 27122 + }, + { + "epoch": 2.1972618276085547, + "grad_norm": 0.07079780101776123, + "learning_rate": 0.00010015302218821729, + "loss": 0.2344, + "step": 27123 + }, + { + "epoch": 2.197342838626053, + "grad_norm": 0.06322164833545685, + "learning_rate": 0.00010014852153562266, + "loss": 0.2089, + "step": 27124 + }, + { + "epoch": 2.1974238496435516, + "grad_norm": 0.05888306349515915, + "learning_rate": 0.00010014402088302804, + "loss": 0.2324, + "step": 27125 + }, + { + "epoch": 2.19750486066105, + "grad_norm": 0.055346712470054626, + "learning_rate": 0.00010013952023043342, + "loss": 0.2585, + "step": 27126 + }, + { + "epoch": 2.197585871678548, + "grad_norm": 0.06587319076061249, + "learning_rate": 0.0001001350195778388, + "loss": 0.253, + "step": 27127 + }, + { + "epoch": 2.197666882696047, + "grad_norm": 0.056305259466171265, + "learning_rate": 0.00010013051892524417, + "loss": 0.2195, + "step": 27128 + }, + { + "epoch": 2.197747893713545, + "grad_norm": 0.04610638692975044, + "learning_rate": 0.00010012601827264953, + "loss": 0.1896, + "step": 27129 + }, + { + "epoch": 2.1978289047310433, + "grad_norm": 0.06477444618940353, + "learning_rate": 0.0001001215176200549, + "loss": 0.2673, + "step": 
27130 + }, + { + "epoch": 2.197909915748542, + "grad_norm": 0.0813470110297203, + "learning_rate": 0.00010011701696746029, + "loss": 0.279, + "step": 27131 + }, + { + "epoch": 2.1979909267660402, + "grad_norm": 0.07328961789608002, + "learning_rate": 0.00010011251631486566, + "loss": 0.2411, + "step": 27132 + }, + { + "epoch": 2.1980719377835385, + "grad_norm": 0.06270834803581238, + "learning_rate": 0.00010010801566227104, + "loss": 0.2682, + "step": 27133 + }, + { + "epoch": 2.198152948801037, + "grad_norm": 0.05361814424395561, + "learning_rate": 0.00010010351500967641, + "loss": 0.2963, + "step": 27134 + }, + { + "epoch": 2.1982339598185354, + "grad_norm": 0.08425524830818176, + "learning_rate": 0.00010009901435708177, + "loss": 0.2577, + "step": 27135 + }, + { + "epoch": 2.1983149708360337, + "grad_norm": 0.07660902291536331, + "learning_rate": 0.00010009451370448715, + "loss": 0.234, + "step": 27136 + }, + { + "epoch": 2.198395981853532, + "grad_norm": 0.06730036437511444, + "learning_rate": 0.00010009001305189253, + "loss": 0.2721, + "step": 27137 + }, + { + "epoch": 2.1984769928710306, + "grad_norm": 0.08619198948144913, + "learning_rate": 0.0001000855123992979, + "loss": 0.3167, + "step": 27138 + }, + { + "epoch": 2.198558003888529, + "grad_norm": 0.05942687392234802, + "learning_rate": 0.00010008101174670328, + "loss": 0.2137, + "step": 27139 + }, + { + "epoch": 2.198639014906027, + "grad_norm": 0.05431392416357994, + "learning_rate": 0.00010007651109410865, + "loss": 0.2292, + "step": 27140 + }, + { + "epoch": 2.198720025923526, + "grad_norm": 0.0699324905872345, + "learning_rate": 0.00010007201044151401, + "loss": 0.2661, + "step": 27141 + }, + { + "epoch": 2.198801036941024, + "grad_norm": 0.05979772284626961, + "learning_rate": 0.00010006750978891939, + "loss": 0.2514, + "step": 27142 + }, + { + "epoch": 2.1988820479585223, + "grad_norm": 0.059384338557720184, + "learning_rate": 0.00010006300913632479, + "loss": 0.2559, + "step": 27143 + }, + { + "epoch": 2.1989630589760205, + "grad_norm": 0.06850460916757584, + "learning_rate": 0.00010005850848373015, + "loss": 0.266, + "step": 27144 + }, + { + "epoch": 2.199044069993519, + "grad_norm": 0.06401151418685913, + "learning_rate": 0.00010005400783113552, + "loss": 0.2512, + "step": 27145 + }, + { + "epoch": 2.1991250810110174, + "grad_norm": 0.07683590054512024, + "learning_rate": 0.0001000495071785409, + "loss": 0.292, + "step": 27146 + }, + { + "epoch": 2.1992060920285157, + "grad_norm": 0.06709449738264084, + "learning_rate": 0.00010004500652594626, + "loss": 0.2352, + "step": 27147 + }, + { + "epoch": 2.1992871030460144, + "grad_norm": 0.06725570559501648, + "learning_rate": 0.00010004050587335163, + "loss": 0.2849, + "step": 27148 + }, + { + "epoch": 2.1993681140635126, + "grad_norm": 0.07019000500440598, + "learning_rate": 0.00010003600522075703, + "loss": 0.2795, + "step": 27149 + }, + { + "epoch": 2.199449125081011, + "grad_norm": 0.09164316207170486, + "learning_rate": 0.00010003150456816239, + "loss": 0.2621, + "step": 27150 + }, + { + "epoch": 2.1995301360985096, + "grad_norm": 0.06879639625549316, + "learning_rate": 0.00010002700391556776, + "loss": 0.2867, + "step": 27151 + }, + { + "epoch": 2.199611147116008, + "grad_norm": 0.07062327116727829, + "learning_rate": 0.00010002250326297314, + "loss": 0.243, + "step": 27152 + }, + { + "epoch": 2.199692158133506, + "grad_norm": 0.07149609923362732, + "learning_rate": 0.0001000180026103785, + "loss": 0.2377, + "step": 27153 + }, + { + "epoch": 2.1997731691510047, + 
"grad_norm": 0.06835110485553741, + "learning_rate": 0.00010001350195778387, + "loss": 0.2424, + "step": 27154 + }, + { + "epoch": 2.199854180168503, + "grad_norm": 0.05907417833805084, + "learning_rate": 0.00010000900130518927, + "loss": 0.2585, + "step": 27155 + }, + { + "epoch": 2.1999351911860012, + "grad_norm": 0.08102352917194366, + "learning_rate": 0.00010000450065259463, + "loss": 0.2646, + "step": 27156 + }, + { + "epoch": 2.2000162022034995, + "grad_norm": 0.06644205749034882, + "learning_rate": 0.0001, + "loss": 0.2181, + "step": 27157 + }, + { + "epoch": 2.200097213220998, + "grad_norm": 0.05316384136676788, + "learning_rate": 9.999549934740538e-05, + "loss": 0.2329, + "step": 27158 + }, + { + "epoch": 2.2001782242384964, + "grad_norm": 0.06541083753108978, + "learning_rate": 9.999099869481075e-05, + "loss": 0.269, + "step": 27159 + }, + { + "epoch": 2.2002592352559946, + "grad_norm": 0.07427337765693665, + "learning_rate": 9.998649804221613e-05, + "loss": 0.2808, + "step": 27160 + }, + { + "epoch": 2.2003402462734933, + "grad_norm": 0.06966466456651688, + "learning_rate": 9.99819973896215e-05, + "loss": 0.2675, + "step": 27161 + }, + { + "epoch": 2.2004212572909916, + "grad_norm": 0.061215851455926895, + "learning_rate": 9.997749673702687e-05, + "loss": 0.2489, + "step": 27162 + }, + { + "epoch": 2.20050226830849, + "grad_norm": 0.051639918237924576, + "learning_rate": 9.997299608443225e-05, + "loss": 0.2527, + "step": 27163 + }, + { + "epoch": 2.2005832793259885, + "grad_norm": 0.07078103721141815, + "learning_rate": 9.996849543183762e-05, + "loss": 0.2507, + "step": 27164 + }, + { + "epoch": 2.2006642903434868, + "grad_norm": 0.06260479241609573, + "learning_rate": 9.9963994779243e-05, + "loss": 0.2756, + "step": 27165 + }, + { + "epoch": 2.200745301360985, + "grad_norm": 0.07474082708358765, + "learning_rate": 9.995949412664837e-05, + "loss": 0.267, + "step": 27166 + }, + { + "epoch": 2.2008263123784833, + "grad_norm": 0.07497971504926682, + "learning_rate": 9.995499347405374e-05, + "loss": 0.3132, + "step": 27167 + }, + { + "epoch": 2.200907323395982, + "grad_norm": 0.06425240635871887, + "learning_rate": 9.995049282145911e-05, + "loss": 0.2429, + "step": 27168 + }, + { + "epoch": 2.20098833441348, + "grad_norm": 0.056325603276491165, + "learning_rate": 9.994599216886449e-05, + "loss": 0.2481, + "step": 27169 + }, + { + "epoch": 2.2010693454309784, + "grad_norm": 0.05998290330171585, + "learning_rate": 9.994149151626986e-05, + "loss": 0.2565, + "step": 27170 + }, + { + "epoch": 2.201150356448477, + "grad_norm": 0.05483241379261017, + "learning_rate": 9.993699086367524e-05, + "loss": 0.2498, + "step": 27171 + }, + { + "epoch": 2.2012313674659754, + "grad_norm": 0.06303826719522476, + "learning_rate": 9.993249021108061e-05, + "loss": 0.2494, + "step": 27172 + }, + { + "epoch": 2.2013123784834736, + "grad_norm": 0.06558068096637726, + "learning_rate": 9.9927989558486e-05, + "loss": 0.247, + "step": 27173 + }, + { + "epoch": 2.2013933895009723, + "grad_norm": 0.06100596487522125, + "learning_rate": 9.992348890589136e-05, + "loss": 0.228, + "step": 27174 + }, + { + "epoch": 2.2014744005184705, + "grad_norm": 0.06812256574630737, + "learning_rate": 9.991898825329673e-05, + "loss": 0.2565, + "step": 27175 + }, + { + "epoch": 2.201555411535969, + "grad_norm": 0.05909280478954315, + "learning_rate": 9.991448760070212e-05, + "loss": 0.2197, + "step": 27176 + }, + { + "epoch": 2.2016364225534675, + "grad_norm": 0.06603255122900009, + "learning_rate": 9.990998694810748e-05, + "loss": 
0.2717, + "step": 27177 + }, + { + "epoch": 2.2017174335709657, + "grad_norm": 0.06001312658190727, + "learning_rate": 9.990548629551285e-05, + "loss": 0.2446, + "step": 27178 + }, + { + "epoch": 2.201798444588464, + "grad_norm": 0.06539706885814667, + "learning_rate": 9.990098564291824e-05, + "loss": 0.2538, + "step": 27179 + }, + { + "epoch": 2.201879455605962, + "grad_norm": 0.06196155771613121, + "learning_rate": 9.98964849903236e-05, + "loss": 0.2476, + "step": 27180 + }, + { + "epoch": 2.201960466623461, + "grad_norm": 0.06107961758971214, + "learning_rate": 9.989198433772897e-05, + "loss": 0.2705, + "step": 27181 + }, + { + "epoch": 2.202041477640959, + "grad_norm": 0.05701644718647003, + "learning_rate": 9.988748368513436e-05, + "loss": 0.2582, + "step": 27182 + }, + { + "epoch": 2.2021224886584574, + "grad_norm": 0.05959983542561531, + "learning_rate": 9.988298303253972e-05, + "loss": 0.2449, + "step": 27183 + }, + { + "epoch": 2.202203499675956, + "grad_norm": 0.06808173656463623, + "learning_rate": 9.987848237994509e-05, + "loss": 0.2961, + "step": 27184 + }, + { + "epoch": 2.2022845106934543, + "grad_norm": 0.06098674610257149, + "learning_rate": 9.987398172735048e-05, + "loss": 0.2377, + "step": 27185 + }, + { + "epoch": 2.2023655217109526, + "grad_norm": 0.0937223955988884, + "learning_rate": 9.986948107475584e-05, + "loss": 0.2819, + "step": 27186 + }, + { + "epoch": 2.2024465327284513, + "grad_norm": 0.07166877388954163, + "learning_rate": 9.986498042216121e-05, + "loss": 0.2414, + "step": 27187 + }, + { + "epoch": 2.2025275437459495, + "grad_norm": 0.057484161108732224, + "learning_rate": 9.98604797695666e-05, + "loss": 0.2384, + "step": 27188 + }, + { + "epoch": 2.2026085547634477, + "grad_norm": 0.08165212720632553, + "learning_rate": 9.985597911697196e-05, + "loss": 0.2655, + "step": 27189 + }, + { + "epoch": 2.202689565780946, + "grad_norm": 0.07205326855182648, + "learning_rate": 9.985147846437733e-05, + "loss": 0.2405, + "step": 27190 + }, + { + "epoch": 2.2027705767984447, + "grad_norm": 0.06088333949446678, + "learning_rate": 9.984697781178272e-05, + "loss": 0.2745, + "step": 27191 + }, + { + "epoch": 2.202851587815943, + "grad_norm": 0.07322680205106735, + "learning_rate": 9.984247715918808e-05, + "loss": 0.2722, + "step": 27192 + }, + { + "epoch": 2.202932598833441, + "grad_norm": 0.06500561535358429, + "learning_rate": 9.983797650659345e-05, + "loss": 0.278, + "step": 27193 + }, + { + "epoch": 2.20301360985094, + "grad_norm": 0.05702653154730797, + "learning_rate": 9.983347585399884e-05, + "loss": 0.2436, + "step": 27194 + }, + { + "epoch": 2.203094620868438, + "grad_norm": 0.06561413407325745, + "learning_rate": 9.98289752014042e-05, + "loss": 0.2873, + "step": 27195 + }, + { + "epoch": 2.2031756318859363, + "grad_norm": 0.07284931093454361, + "learning_rate": 9.982447454880958e-05, + "loss": 0.2479, + "step": 27196 + }, + { + "epoch": 2.203256642903435, + "grad_norm": 0.06644266098737717, + "learning_rate": 9.981997389621496e-05, + "loss": 0.2779, + "step": 27197 + }, + { + "epoch": 2.2033376539209333, + "grad_norm": 0.06733689457178116, + "learning_rate": 9.981547324362032e-05, + "loss": 0.2625, + "step": 27198 + }, + { + "epoch": 2.2034186649384315, + "grad_norm": 0.06203522905707359, + "learning_rate": 9.981097259102571e-05, + "loss": 0.2689, + "step": 27199 + }, + { + "epoch": 2.20349967595593, + "grad_norm": 0.05635721981525421, + "learning_rate": 9.980647193843108e-05, + "loss": 0.2755, + "step": 27200 + }, + { + "epoch": 2.2035806869734285, + 
"grad_norm": 0.05323106423020363, + "learning_rate": 9.980197128583644e-05, + "loss": 0.2448, + "step": 27201 + }, + { + "epoch": 2.2036616979909267, + "grad_norm": 0.0679623931646347, + "learning_rate": 9.979747063324183e-05, + "loss": 0.2517, + "step": 27202 + }, + { + "epoch": 2.203742709008425, + "grad_norm": 0.06785213947296143, + "learning_rate": 9.97929699806472e-05, + "loss": 0.251, + "step": 27203 + }, + { + "epoch": 2.2038237200259236, + "grad_norm": 0.06036762520670891, + "learning_rate": 9.978846932805256e-05, + "loss": 0.2078, + "step": 27204 + }, + { + "epoch": 2.203904731043422, + "grad_norm": 0.05910918489098549, + "learning_rate": 9.978396867545795e-05, + "loss": 0.237, + "step": 27205 + }, + { + "epoch": 2.20398574206092, + "grad_norm": 0.0643678605556488, + "learning_rate": 9.977946802286332e-05, + "loss": 0.2414, + "step": 27206 + }, + { + "epoch": 2.204066753078419, + "grad_norm": 0.07331311702728271, + "learning_rate": 9.977496737026868e-05, + "loss": 0.2394, + "step": 27207 + }, + { + "epoch": 2.204147764095917, + "grad_norm": 0.05993986874818802, + "learning_rate": 9.977046671767407e-05, + "loss": 0.2783, + "step": 27208 + }, + { + "epoch": 2.2042287751134153, + "grad_norm": 0.0725242868065834, + "learning_rate": 9.976596606507945e-05, + "loss": 0.2711, + "step": 27209 + }, + { + "epoch": 2.204309786130914, + "grad_norm": 0.07055281102657318, + "learning_rate": 9.97614654124848e-05, + "loss": 0.2692, + "step": 27210 + }, + { + "epoch": 2.2043907971484122, + "grad_norm": 0.06300518661737442, + "learning_rate": 9.975696475989019e-05, + "loss": 0.2196, + "step": 27211 + }, + { + "epoch": 2.2044718081659105, + "grad_norm": 0.10356733202934265, + "learning_rate": 9.975246410729557e-05, + "loss": 0.2921, + "step": 27212 + }, + { + "epoch": 2.2045528191834087, + "grad_norm": 0.06462245434522629, + "learning_rate": 9.974796345470093e-05, + "loss": 0.1995, + "step": 27213 + }, + { + "epoch": 2.2046338302009074, + "grad_norm": 0.0734039694070816, + "learning_rate": 9.974346280210631e-05, + "loss": 0.2784, + "step": 27214 + }, + { + "epoch": 2.2047148412184057, + "grad_norm": 0.053666383028030396, + "learning_rate": 9.973896214951169e-05, + "loss": 0.2901, + "step": 27215 + }, + { + "epoch": 2.204795852235904, + "grad_norm": 0.06547022610902786, + "learning_rate": 9.973446149691705e-05, + "loss": 0.2601, + "step": 27216 + }, + { + "epoch": 2.2048768632534026, + "grad_norm": 0.07122816890478134, + "learning_rate": 9.972996084432243e-05, + "loss": 0.2685, + "step": 27217 + }, + { + "epoch": 2.204957874270901, + "grad_norm": 0.056516796350479126, + "learning_rate": 9.972546019172781e-05, + "loss": 0.3253, + "step": 27218 + }, + { + "epoch": 2.205038885288399, + "grad_norm": 0.06432358175516129, + "learning_rate": 9.972095953913317e-05, + "loss": 0.2563, + "step": 27219 + }, + { + "epoch": 2.2051198963058978, + "grad_norm": 0.045835137367248535, + "learning_rate": 9.971645888653856e-05, + "loss": 0.2529, + "step": 27220 + }, + { + "epoch": 2.205200907323396, + "grad_norm": 0.0580122210085392, + "learning_rate": 9.971195823394393e-05, + "loss": 0.2571, + "step": 27221 + }, + { + "epoch": 2.2052819183408943, + "grad_norm": 0.05672190338373184, + "learning_rate": 9.970745758134929e-05, + "loss": 0.2699, + "step": 27222 + }, + { + "epoch": 2.205362929358393, + "grad_norm": 0.04954679682850838, + "learning_rate": 9.970295692875468e-05, + "loss": 0.2287, + "step": 27223 + }, + { + "epoch": 2.205443940375891, + "grad_norm": 0.06200401112437248, + "learning_rate": 9.969845627616005e-05, + 
"loss": 0.2672, + "step": 27224 + }, + { + "epoch": 2.2055249513933894, + "grad_norm": 0.05223114416003227, + "learning_rate": 9.969395562356542e-05, + "loss": 0.2534, + "step": 27225 + }, + { + "epoch": 2.2056059624108877, + "grad_norm": 0.061142902821302414, + "learning_rate": 9.96894549709708e-05, + "loss": 0.2874, + "step": 27226 + }, + { + "epoch": 2.2056869734283864, + "grad_norm": 0.05851171910762787, + "learning_rate": 9.968495431837617e-05, + "loss": 0.2247, + "step": 27227 + }, + { + "epoch": 2.2057679844458846, + "grad_norm": 0.07332703471183777, + "learning_rate": 9.968045366578154e-05, + "loss": 0.2797, + "step": 27228 + }, + { + "epoch": 2.205848995463383, + "grad_norm": 0.0516182966530323, + "learning_rate": 9.967595301318692e-05, + "loss": 0.2202, + "step": 27229 + }, + { + "epoch": 2.2059300064808816, + "grad_norm": 0.0503121018409729, + "learning_rate": 9.967145236059229e-05, + "loss": 0.1759, + "step": 27230 + }, + { + "epoch": 2.20601101749838, + "grad_norm": 0.05455978214740753, + "learning_rate": 9.966695170799766e-05, + "loss": 0.2481, + "step": 27231 + }, + { + "epoch": 2.206092028515878, + "grad_norm": 0.06131187826395035, + "learning_rate": 9.966245105540304e-05, + "loss": 0.248, + "step": 27232 + }, + { + "epoch": 2.2061730395333767, + "grad_norm": 0.07432816177606583, + "learning_rate": 9.965795040280841e-05, + "loss": 0.2314, + "step": 27233 + }, + { + "epoch": 2.206254050550875, + "grad_norm": 0.060853440314531326, + "learning_rate": 9.965344975021379e-05, + "loss": 0.2422, + "step": 27234 + }, + { + "epoch": 2.2063350615683732, + "grad_norm": 0.0634138435125351, + "learning_rate": 9.964894909761916e-05, + "loss": 0.2392, + "step": 27235 + }, + { + "epoch": 2.2064160725858715, + "grad_norm": 0.08180546015501022, + "learning_rate": 9.964444844502453e-05, + "loss": 0.2537, + "step": 27236 + }, + { + "epoch": 2.20649708360337, + "grad_norm": 0.059457991272211075, + "learning_rate": 9.96399477924299e-05, + "loss": 0.2641, + "step": 27237 + }, + { + "epoch": 2.2065780946208684, + "grad_norm": 0.06653142720460892, + "learning_rate": 9.963544713983528e-05, + "loss": 0.2427, + "step": 27238 + }, + { + "epoch": 2.2066591056383666, + "grad_norm": 0.07568779587745667, + "learning_rate": 9.963094648724065e-05, + "loss": 0.2923, + "step": 27239 + }, + { + "epoch": 2.2067401166558653, + "grad_norm": 0.06351905316114426, + "learning_rate": 9.962644583464603e-05, + "loss": 0.2854, + "step": 27240 + }, + { + "epoch": 2.2068211276733636, + "grad_norm": 0.0615183487534523, + "learning_rate": 9.96219451820514e-05, + "loss": 0.234, + "step": 27241 + }, + { + "epoch": 2.206902138690862, + "grad_norm": 0.05222557857632637, + "learning_rate": 9.961744452945677e-05, + "loss": 0.2578, + "step": 27242 + }, + { + "epoch": 2.2069831497083605, + "grad_norm": 0.05467645451426506, + "learning_rate": 9.961294387686215e-05, + "loss": 0.2315, + "step": 27243 + }, + { + "epoch": 2.2070641607258588, + "grad_norm": 0.06070820987224579, + "learning_rate": 9.960844322426752e-05, + "loss": 0.2568, + "step": 27244 + }, + { + "epoch": 2.207145171743357, + "grad_norm": 0.05687892436981201, + "learning_rate": 9.96039425716729e-05, + "loss": 0.241, + "step": 27245 + }, + { + "epoch": 2.2072261827608557, + "grad_norm": 0.07000940293073654, + "learning_rate": 9.959944191907827e-05, + "loss": 0.2835, + "step": 27246 + }, + { + "epoch": 2.207307193778354, + "grad_norm": 0.05552801117300987, + "learning_rate": 9.959494126648364e-05, + "loss": 0.2798, + "step": 27247 + }, + { + "epoch": 2.207388204795852, + 
"grad_norm": 0.06713631749153137, + "learning_rate": 9.959044061388902e-05, + "loss": 0.2782, + "step": 27248 + }, + { + "epoch": 2.2074692158133504, + "grad_norm": 0.05187486484646797, + "learning_rate": 9.958593996129439e-05, + "loss": 0.2324, + "step": 27249 + }, + { + "epoch": 2.207550226830849, + "grad_norm": 0.07198222726583481, + "learning_rate": 9.958143930869976e-05, + "loss": 0.2797, + "step": 27250 + }, + { + "epoch": 2.2076312378483474, + "grad_norm": 0.06188987195491791, + "learning_rate": 9.957693865610515e-05, + "loss": 0.2489, + "step": 27251 + }, + { + "epoch": 2.2077122488658456, + "grad_norm": 0.07027272880077362, + "learning_rate": 9.957243800351051e-05, + "loss": 0.2898, + "step": 27252 + }, + { + "epoch": 2.2077932598833443, + "grad_norm": 0.05598944053053856, + "learning_rate": 9.956793735091588e-05, + "loss": 0.2326, + "step": 27253 + }, + { + "epoch": 2.2078742709008425, + "grad_norm": 0.06060578301548958, + "learning_rate": 9.956343669832127e-05, + "loss": 0.2571, + "step": 27254 + }, + { + "epoch": 2.207955281918341, + "grad_norm": 0.05788060650229454, + "learning_rate": 9.955893604572663e-05, + "loss": 0.213, + "step": 27255 + }, + { + "epoch": 2.2080362929358395, + "grad_norm": 0.07043232768774033, + "learning_rate": 9.9554435393132e-05, + "loss": 0.263, + "step": 27256 + }, + { + "epoch": 2.2081173039533377, + "grad_norm": 0.06341779977083206, + "learning_rate": 9.954993474053739e-05, + "loss": 0.2358, + "step": 27257 + }, + { + "epoch": 2.208198314970836, + "grad_norm": 0.054107457399368286, + "learning_rate": 9.954543408794275e-05, + "loss": 0.2264, + "step": 27258 + }, + { + "epoch": 2.208279325988334, + "grad_norm": 0.06598135083913803, + "learning_rate": 9.954093343534813e-05, + "loss": 0.2559, + "step": 27259 + }, + { + "epoch": 2.208360337005833, + "grad_norm": 0.0615556575357914, + "learning_rate": 9.953643278275351e-05, + "loss": 0.264, + "step": 27260 + }, + { + "epoch": 2.208441348023331, + "grad_norm": 0.07525072991847992, + "learning_rate": 9.953193213015887e-05, + "loss": 0.2897, + "step": 27261 + }, + { + "epoch": 2.2085223590408294, + "grad_norm": 0.0570087768137455, + "learning_rate": 9.952743147756425e-05, + "loss": 0.2455, + "step": 27262 + }, + { + "epoch": 2.208603370058328, + "grad_norm": 0.08910997956991196, + "learning_rate": 9.952293082496963e-05, + "loss": 0.2998, + "step": 27263 + }, + { + "epoch": 2.2086843810758263, + "grad_norm": 0.07079474627971649, + "learning_rate": 9.9518430172375e-05, + "loss": 0.2236, + "step": 27264 + }, + { + "epoch": 2.2087653920933246, + "grad_norm": 0.06641038507223129, + "learning_rate": 9.951392951978037e-05, + "loss": 0.2413, + "step": 27265 + }, + { + "epoch": 2.2088464031108233, + "grad_norm": 0.07263396680355072, + "learning_rate": 9.950942886718575e-05, + "loss": 0.2327, + "step": 27266 + }, + { + "epoch": 2.2089274141283215, + "grad_norm": 0.0786370038986206, + "learning_rate": 9.950492821459111e-05, + "loss": 0.2585, + "step": 27267 + }, + { + "epoch": 2.2090084251458197, + "grad_norm": 0.06409087777137756, + "learning_rate": 9.950042756199649e-05, + "loss": 0.2572, + "step": 27268 + }, + { + "epoch": 2.2090894361633184, + "grad_norm": 0.07662307471036911, + "learning_rate": 9.949592690940188e-05, + "loss": 0.3218, + "step": 27269 + }, + { + "epoch": 2.2091704471808167, + "grad_norm": 0.0556035116314888, + "learning_rate": 9.949142625680724e-05, + "loss": 0.2315, + "step": 27270 + }, + { + "epoch": 2.209251458198315, + "grad_norm": 0.07545039057731628, + "learning_rate": 9.948692560421261e-05, 
+ "loss": 0.2455, + "step": 27271 + }, + { + "epoch": 2.209332469215813, + "grad_norm": 0.07402218133211136, + "learning_rate": 9.9482424951618e-05, + "loss": 0.2466, + "step": 27272 + }, + { + "epoch": 2.209413480233312, + "grad_norm": 0.07804799824953079, + "learning_rate": 9.947792429902336e-05, + "loss": 0.2561, + "step": 27273 + }, + { + "epoch": 2.20949449125081, + "grad_norm": 0.07206982374191284, + "learning_rate": 9.947342364642873e-05, + "loss": 0.268, + "step": 27274 + }, + { + "epoch": 2.2095755022683083, + "grad_norm": 0.06558424234390259, + "learning_rate": 9.946892299383412e-05, + "loss": 0.2505, + "step": 27275 + }, + { + "epoch": 2.209656513285807, + "grad_norm": 0.05041855201125145, + "learning_rate": 9.946442234123948e-05, + "loss": 0.2461, + "step": 27276 + }, + { + "epoch": 2.2097375243033053, + "grad_norm": 0.06105949729681015, + "learning_rate": 9.945992168864486e-05, + "loss": 0.2645, + "step": 27277 + }, + { + "epoch": 2.2098185353208035, + "grad_norm": 0.05644926801323891, + "learning_rate": 9.945542103605024e-05, + "loss": 0.2377, + "step": 27278 + }, + { + "epoch": 2.209899546338302, + "grad_norm": 0.06584417074918747, + "learning_rate": 9.94509203834556e-05, + "loss": 0.2987, + "step": 27279 + }, + { + "epoch": 2.2099805573558005, + "grad_norm": 0.06372004747390747, + "learning_rate": 9.944641973086099e-05, + "loss": 0.2823, + "step": 27280 + }, + { + "epoch": 2.2100615683732987, + "grad_norm": 0.05927109718322754, + "learning_rate": 9.944191907826636e-05, + "loss": 0.2528, + "step": 27281 + }, + { + "epoch": 2.210142579390797, + "grad_norm": 0.06508370488882065, + "learning_rate": 9.943741842567172e-05, + "loss": 0.2439, + "step": 27282 + }, + { + "epoch": 2.2102235904082956, + "grad_norm": 0.0654415488243103, + "learning_rate": 9.94329177730771e-05, + "loss": 0.278, + "step": 27283 + }, + { + "epoch": 2.210304601425794, + "grad_norm": 0.06012353301048279, + "learning_rate": 9.942841712048248e-05, + "loss": 0.2598, + "step": 27284 + }, + { + "epoch": 2.210385612443292, + "grad_norm": 0.06264343857765198, + "learning_rate": 9.942391646788784e-05, + "loss": 0.2586, + "step": 27285 + }, + { + "epoch": 2.210466623460791, + "grad_norm": 0.05635780468583107, + "learning_rate": 9.941941581529323e-05, + "loss": 0.2515, + "step": 27286 + }, + { + "epoch": 2.210547634478289, + "grad_norm": 0.07047024369239807, + "learning_rate": 9.94149151626986e-05, + "loss": 0.2785, + "step": 27287 + }, + { + "epoch": 2.2106286454957873, + "grad_norm": 0.07421385496854782, + "learning_rate": 9.941041451010396e-05, + "loss": 0.2461, + "step": 27288 + }, + { + "epoch": 2.210709656513286, + "grad_norm": 0.07711216062307358, + "learning_rate": 9.940591385750935e-05, + "loss": 0.2494, + "step": 27289 + }, + { + "epoch": 2.2107906675307842, + "grad_norm": 0.06072551757097244, + "learning_rate": 9.940141320491472e-05, + "loss": 0.3183, + "step": 27290 + }, + { + "epoch": 2.2108716785482825, + "grad_norm": 0.08102339506149292, + "learning_rate": 9.939691255232008e-05, + "loss": 0.2514, + "step": 27291 + }, + { + "epoch": 2.210952689565781, + "grad_norm": 0.06397506594657898, + "learning_rate": 9.939241189972547e-05, + "loss": 0.2289, + "step": 27292 + }, + { + "epoch": 2.2110337005832794, + "grad_norm": 0.05820344015955925, + "learning_rate": 9.938791124713084e-05, + "loss": 0.2186, + "step": 27293 + }, + { + "epoch": 2.2111147116007777, + "grad_norm": 0.06543104350566864, + "learning_rate": 9.93834105945362e-05, + "loss": 0.2407, + "step": 27294 + }, + { + "epoch": 2.211195722618276, + 
"grad_norm": 0.07835323363542557, + "learning_rate": 9.937890994194159e-05, + "loss": 0.2444, + "step": 27295 + }, + { + "epoch": 2.2112767336357746, + "grad_norm": 0.06635398417711258, + "learning_rate": 9.937440928934696e-05, + "loss": 0.2334, + "step": 27296 + }, + { + "epoch": 2.211357744653273, + "grad_norm": 0.05356477573513985, + "learning_rate": 9.936990863675232e-05, + "loss": 0.2232, + "step": 27297 + }, + { + "epoch": 2.211438755670771, + "grad_norm": 0.05896617844700813, + "learning_rate": 9.936540798415771e-05, + "loss": 0.2636, + "step": 27298 + }, + { + "epoch": 2.2115197666882698, + "grad_norm": 0.05496169626712799, + "learning_rate": 9.936090733156308e-05, + "loss": 0.2313, + "step": 27299 + }, + { + "epoch": 2.211600777705768, + "grad_norm": 0.06341741979122162, + "learning_rate": 9.935640667896844e-05, + "loss": 0.2541, + "step": 27300 + }, + { + "epoch": 2.2116817887232663, + "grad_norm": 0.05799972638487816, + "learning_rate": 9.935190602637383e-05, + "loss": 0.2355, + "step": 27301 + }, + { + "epoch": 2.211762799740765, + "grad_norm": 0.06351993978023529, + "learning_rate": 9.93474053737792e-05, + "loss": 0.2545, + "step": 27302 + }, + { + "epoch": 2.211843810758263, + "grad_norm": 0.05903366208076477, + "learning_rate": 9.934290472118458e-05, + "loss": 0.2037, + "step": 27303 + }, + { + "epoch": 2.2119248217757614, + "grad_norm": 0.07522643357515335, + "learning_rate": 9.933840406858995e-05, + "loss": 0.2483, + "step": 27304 + }, + { + "epoch": 2.2120058327932597, + "grad_norm": 0.06297983974218369, + "learning_rate": 9.933390341599533e-05, + "loss": 0.2967, + "step": 27305 + }, + { + "epoch": 2.2120868438107584, + "grad_norm": 0.08709455281496048, + "learning_rate": 9.93294027634007e-05, + "loss": 0.2715, + "step": 27306 + }, + { + "epoch": 2.2121678548282566, + "grad_norm": 0.07213051617145538, + "learning_rate": 9.932490211080607e-05, + "loss": 0.2528, + "step": 27307 + }, + { + "epoch": 2.212248865845755, + "grad_norm": 0.06085379421710968, + "learning_rate": 9.932040145821145e-05, + "loss": 0.2293, + "step": 27308 + }, + { + "epoch": 2.2123298768632536, + "grad_norm": 0.05907389521598816, + "learning_rate": 9.931590080561682e-05, + "loss": 0.2358, + "step": 27309 + }, + { + "epoch": 2.212410887880752, + "grad_norm": 0.0674804225564003, + "learning_rate": 9.931140015302219e-05, + "loss": 0.2447, + "step": 27310 + }, + { + "epoch": 2.21249189889825, + "grad_norm": 0.059147339314222336, + "learning_rate": 9.930689950042757e-05, + "loss": 0.2597, + "step": 27311 + }, + { + "epoch": 2.2125729099157487, + "grad_norm": 0.05762263014912605, + "learning_rate": 9.930239884783294e-05, + "loss": 0.2347, + "step": 27312 + }, + { + "epoch": 2.212653920933247, + "grad_norm": 0.0638045072555542, + "learning_rate": 9.929789819523831e-05, + "loss": 0.2646, + "step": 27313 + }, + { + "epoch": 2.212734931950745, + "grad_norm": 0.07611501216888428, + "learning_rate": 9.929339754264369e-05, + "loss": 0.2533, + "step": 27314 + }, + { + "epoch": 2.212815942968244, + "grad_norm": 0.06777425855398178, + "learning_rate": 9.928889689004906e-05, + "loss": 0.2812, + "step": 27315 + }, + { + "epoch": 2.212896953985742, + "grad_norm": 0.05217338353395462, + "learning_rate": 9.928439623745443e-05, + "loss": 0.2586, + "step": 27316 + }, + { + "epoch": 2.2129779650032404, + "grad_norm": 0.06507042050361633, + "learning_rate": 9.927989558485981e-05, + "loss": 0.2517, + "step": 27317 + }, + { + "epoch": 2.2130589760207386, + "grad_norm": 0.06591640412807465, + "learning_rate": 
9.927539493226518e-05, + "loss": 0.2339, + "step": 27318 + }, + { + "epoch": 2.2131399870382373, + "grad_norm": 0.06380046159029007, + "learning_rate": 9.927089427967056e-05, + "loss": 0.2303, + "step": 27319 + }, + { + "epoch": 2.2132209980557356, + "grad_norm": 0.0698883980512619, + "learning_rate": 9.926639362707593e-05, + "loss": 0.2441, + "step": 27320 + }, + { + "epoch": 2.213302009073234, + "grad_norm": 0.04417555406689644, + "learning_rate": 9.92618929744813e-05, + "loss": 0.2252, + "step": 27321 + }, + { + "epoch": 2.2133830200907325, + "grad_norm": 0.07910863310098648, + "learning_rate": 9.925739232188668e-05, + "loss": 0.2467, + "step": 27322 + }, + { + "epoch": 2.2134640311082308, + "grad_norm": 0.06234428659081459, + "learning_rate": 9.925289166929205e-05, + "loss": 0.2396, + "step": 27323 + }, + { + "epoch": 2.213545042125729, + "grad_norm": 0.0615205354988575, + "learning_rate": 9.924839101669742e-05, + "loss": 0.2425, + "step": 27324 + }, + { + "epoch": 2.2136260531432272, + "grad_norm": 0.07571331411600113, + "learning_rate": 9.92438903641028e-05, + "loss": 0.257, + "step": 27325 + }, + { + "epoch": 2.213707064160726, + "grad_norm": 0.07765693962574005, + "learning_rate": 9.923938971150817e-05, + "loss": 0.2795, + "step": 27326 + }, + { + "epoch": 2.213788075178224, + "grad_norm": 0.07145484536886215, + "learning_rate": 9.923488905891354e-05, + "loss": 0.2542, + "step": 27327 + }, + { + "epoch": 2.2138690861957224, + "grad_norm": 0.07751671224832535, + "learning_rate": 9.923038840631892e-05, + "loss": 0.259, + "step": 27328 + }, + { + "epoch": 2.213950097213221, + "grad_norm": 0.05293998867273331, + "learning_rate": 9.922588775372429e-05, + "loss": 0.2228, + "step": 27329 + }, + { + "epoch": 2.2140311082307194, + "grad_norm": 0.06756523996591568, + "learning_rate": 9.922138710112967e-05, + "loss": 0.2897, + "step": 27330 + }, + { + "epoch": 2.2141121192482176, + "grad_norm": 0.05320659652352333, + "learning_rate": 9.921688644853504e-05, + "loss": 0.2577, + "step": 27331 + }, + { + "epoch": 2.2141931302657163, + "grad_norm": 0.0681634396314621, + "learning_rate": 9.921238579594043e-05, + "loss": 0.257, + "step": 27332 + }, + { + "epoch": 2.2142741412832145, + "grad_norm": 0.06318969279527664, + "learning_rate": 9.920788514334579e-05, + "loss": 0.2767, + "step": 27333 + }, + { + "epoch": 2.214355152300713, + "grad_norm": 0.06955531239509583, + "learning_rate": 9.920338449075116e-05, + "loss": 0.299, + "step": 27334 + }, + { + "epoch": 2.2144361633182115, + "grad_norm": 0.0692557767033577, + "learning_rate": 9.919888383815655e-05, + "loss": 0.2761, + "step": 27335 + }, + { + "epoch": 2.2145171743357097, + "grad_norm": 0.06259065866470337, + "learning_rate": 9.919438318556191e-05, + "loss": 0.2358, + "step": 27336 + }, + { + "epoch": 2.214598185353208, + "grad_norm": 0.06407694518566132, + "learning_rate": 9.918988253296728e-05, + "loss": 0.2416, + "step": 27337 + }, + { + "epoch": 2.2146791963707066, + "grad_norm": 0.06017732620239258, + "learning_rate": 9.918538188037267e-05, + "loss": 0.2327, + "step": 27338 + }, + { + "epoch": 2.214760207388205, + "grad_norm": 0.05848823860287666, + "learning_rate": 9.918088122777803e-05, + "loss": 0.2397, + "step": 27339 + }, + { + "epoch": 2.214841218405703, + "grad_norm": 0.0606422983109951, + "learning_rate": 9.91763805751834e-05, + "loss": 0.2453, + "step": 27340 + }, + { + "epoch": 2.2149222294232014, + "grad_norm": 0.07600191980600357, + "learning_rate": 9.917187992258879e-05, + "loss": 0.2595, + "step": 27341 + }, + { + "epoch": 
2.2150032404407, + "grad_norm": 0.05821105092763901, + "learning_rate": 9.916737926999415e-05, + "loss": 0.2641, + "step": 27342 + }, + { + "epoch": 2.2150842514581983, + "grad_norm": 0.04956561699509621, + "learning_rate": 9.916287861739952e-05, + "loss": 0.185, + "step": 27343 + }, + { + "epoch": 2.2151652624756966, + "grad_norm": 0.06335799396038055, + "learning_rate": 9.915837796480491e-05, + "loss": 0.2326, + "step": 27344 + }, + { + "epoch": 2.2152462734931953, + "grad_norm": 0.06417136639356613, + "learning_rate": 9.915387731221027e-05, + "loss": 0.2495, + "step": 27345 + }, + { + "epoch": 2.2153272845106935, + "grad_norm": 0.08220578730106354, + "learning_rate": 9.914937665961564e-05, + "loss": 0.2699, + "step": 27346 + }, + { + "epoch": 2.2154082955281917, + "grad_norm": 0.06456286460161209, + "learning_rate": 9.914487600702103e-05, + "loss": 0.2291, + "step": 27347 + }, + { + "epoch": 2.21548930654569, + "grad_norm": 0.06221301853656769, + "learning_rate": 9.914037535442639e-05, + "loss": 0.2272, + "step": 27348 + }, + { + "epoch": 2.2155703175631887, + "grad_norm": 0.05606995150446892, + "learning_rate": 9.913587470183176e-05, + "loss": 0.2354, + "step": 27349 + }, + { + "epoch": 2.215651328580687, + "grad_norm": 0.07109422236680984, + "learning_rate": 9.913137404923715e-05, + "loss": 0.3113, + "step": 27350 + }, + { + "epoch": 2.215732339598185, + "grad_norm": 0.06941894441843033, + "learning_rate": 9.912687339664251e-05, + "loss": 0.2866, + "step": 27351 + }, + { + "epoch": 2.215813350615684, + "grad_norm": 0.06637705862522125, + "learning_rate": 9.912237274404788e-05, + "loss": 0.304, + "step": 27352 + }, + { + "epoch": 2.215894361633182, + "grad_norm": 0.06498729437589645, + "learning_rate": 9.911787209145327e-05, + "loss": 0.3121, + "step": 27353 + }, + { + "epoch": 2.2159753726506803, + "grad_norm": 0.08340992778539658, + "learning_rate": 9.911337143885863e-05, + "loss": 0.302, + "step": 27354 + }, + { + "epoch": 2.216056383668179, + "grad_norm": 0.06651787459850311, + "learning_rate": 9.9108870786264e-05, + "loss": 0.2322, + "step": 27355 + }, + { + "epoch": 2.2161373946856773, + "grad_norm": 0.08446913957595825, + "learning_rate": 9.910437013366939e-05, + "loss": 0.2637, + "step": 27356 + }, + { + "epoch": 2.2162184057031755, + "grad_norm": 0.05738092213869095, + "learning_rate": 9.909986948107475e-05, + "loss": 0.2603, + "step": 27357 + }, + { + "epoch": 2.216299416720674, + "grad_norm": 0.08143822848796844, + "learning_rate": 9.909536882848014e-05, + "loss": 0.2694, + "step": 27358 + }, + { + "epoch": 2.2163804277381725, + "grad_norm": 0.04862968251109123, + "learning_rate": 9.909086817588551e-05, + "loss": 0.226, + "step": 27359 + }, + { + "epoch": 2.2164614387556707, + "grad_norm": 0.05859667807817459, + "learning_rate": 9.908636752329087e-05, + "loss": 0.2551, + "step": 27360 + }, + { + "epoch": 2.216542449773169, + "grad_norm": 0.07120684534311295, + "learning_rate": 9.908186687069626e-05, + "loss": 0.2648, + "step": 27361 + }, + { + "epoch": 2.2166234607906676, + "grad_norm": 0.06827737390995026, + "learning_rate": 9.907736621810163e-05, + "loss": 0.264, + "step": 27362 + }, + { + "epoch": 2.216704471808166, + "grad_norm": 0.05470007658004761, + "learning_rate": 9.9072865565507e-05, + "loss": 0.25, + "step": 27363 + }, + { + "epoch": 2.216785482825664, + "grad_norm": 0.06258236616849899, + "learning_rate": 9.906836491291238e-05, + "loss": 0.274, + "step": 27364 + }, + { + "epoch": 2.216866493843163, + "grad_norm": 0.06155962124466896, + "learning_rate": 
9.906386426031775e-05, + "loss": 0.2329, + "step": 27365 + }, + { + "epoch": 2.216947504860661, + "grad_norm": 0.06398579478263855, + "learning_rate": 9.905936360772311e-05, + "loss": 0.2401, + "step": 27366 + }, + { + "epoch": 2.2170285158781593, + "grad_norm": 0.0638839453458786, + "learning_rate": 9.90548629551285e-05, + "loss": 0.2635, + "step": 27367 + }, + { + "epoch": 2.217109526895658, + "grad_norm": 0.06554199010133743, + "learning_rate": 9.905036230253388e-05, + "loss": 0.2445, + "step": 27368 + }, + { + "epoch": 2.2171905379131562, + "grad_norm": 0.06670667231082916, + "learning_rate": 9.904586164993924e-05, + "loss": 0.336, + "step": 27369 + }, + { + "epoch": 2.2172715489306545, + "grad_norm": 0.06505695730447769, + "learning_rate": 9.904136099734462e-05, + "loss": 0.277, + "step": 27370 + }, + { + "epoch": 2.2173525599481527, + "grad_norm": 0.05942712724208832, + "learning_rate": 9.903686034475e-05, + "loss": 0.2439, + "step": 27371 + }, + { + "epoch": 2.2174335709656514, + "grad_norm": 0.05172451585531235, + "learning_rate": 9.903235969215536e-05, + "loss": 0.2348, + "step": 27372 + }, + { + "epoch": 2.2175145819831497, + "grad_norm": 0.0634874477982521, + "learning_rate": 9.902785903956074e-05, + "loss": 0.2198, + "step": 27373 + }, + { + "epoch": 2.217595593000648, + "grad_norm": 0.06077420711517334, + "learning_rate": 9.902335838696612e-05, + "loss": 0.2598, + "step": 27374 + }, + { + "epoch": 2.2176766040181466, + "grad_norm": 0.06746877729892731, + "learning_rate": 9.901885773437148e-05, + "loss": 0.2783, + "step": 27375 + }, + { + "epoch": 2.217757615035645, + "grad_norm": 0.06087268888950348, + "learning_rate": 9.901435708177686e-05, + "loss": 0.2618, + "step": 27376 + }, + { + "epoch": 2.217838626053143, + "grad_norm": 0.07225014269351959, + "learning_rate": 9.900985642918224e-05, + "loss": 0.2837, + "step": 27377 + }, + { + "epoch": 2.2179196370706418, + "grad_norm": 0.0711088478565216, + "learning_rate": 9.90053557765876e-05, + "loss": 0.2293, + "step": 27378 + }, + { + "epoch": 2.21800064808814, + "grad_norm": 0.05558910593390465, + "learning_rate": 9.900085512399299e-05, + "loss": 0.2422, + "step": 27379 + }, + { + "epoch": 2.2180816591056383, + "grad_norm": 0.09979903697967529, + "learning_rate": 9.899635447139836e-05, + "loss": 0.2926, + "step": 27380 + }, + { + "epoch": 2.218162670123137, + "grad_norm": 0.060716476291418076, + "learning_rate": 9.899185381880373e-05, + "loss": 0.2442, + "step": 27381 + }, + { + "epoch": 2.218243681140635, + "grad_norm": 0.06644859910011292, + "learning_rate": 9.89873531662091e-05, + "loss": 0.2583, + "step": 27382 + }, + { + "epoch": 2.2183246921581334, + "grad_norm": 0.05265781655907631, + "learning_rate": 9.898285251361448e-05, + "loss": 0.2632, + "step": 27383 + }, + { + "epoch": 2.2184057031756317, + "grad_norm": 0.07375629991292953, + "learning_rate": 9.897835186101985e-05, + "loss": 0.2509, + "step": 27384 + }, + { + "epoch": 2.2184867141931304, + "grad_norm": 0.06024035066366196, + "learning_rate": 9.897385120842523e-05, + "loss": 0.2678, + "step": 27385 + }, + { + "epoch": 2.2185677252106286, + "grad_norm": 0.06680899113416672, + "learning_rate": 9.89693505558306e-05, + "loss": 0.2442, + "step": 27386 + }, + { + "epoch": 2.218648736228127, + "grad_norm": 0.05207567289471626, + "learning_rate": 9.896484990323597e-05, + "loss": 0.2211, + "step": 27387 + }, + { + "epoch": 2.2187297472456255, + "grad_norm": 0.059763479977846146, + "learning_rate": 9.896034925064135e-05, + "loss": 0.2822, + "step": 27388 + }, + { + "epoch": 
2.218810758263124, + "grad_norm": 0.07277268916368484, + "learning_rate": 9.895584859804672e-05, + "loss": 0.2677, + "step": 27389 + }, + { + "epoch": 2.218891769280622, + "grad_norm": 0.06010603532195091, + "learning_rate": 9.89513479454521e-05, + "loss": 0.2264, + "step": 27390 + }, + { + "epoch": 2.2189727802981207, + "grad_norm": 0.06812576949596405, + "learning_rate": 9.894684729285747e-05, + "loss": 0.2438, + "step": 27391 + }, + { + "epoch": 2.219053791315619, + "grad_norm": 0.05678996816277504, + "learning_rate": 9.894234664026284e-05, + "loss": 0.2705, + "step": 27392 + }, + { + "epoch": 2.219134802333117, + "grad_norm": 0.06939080357551575, + "learning_rate": 9.893784598766822e-05, + "loss": 0.256, + "step": 27393 + }, + { + "epoch": 2.2192158133506155, + "grad_norm": 0.0556022971868515, + "learning_rate": 9.893334533507359e-05, + "loss": 0.2282, + "step": 27394 + }, + { + "epoch": 2.219296824368114, + "grad_norm": 0.07095302641391754, + "learning_rate": 9.892884468247896e-05, + "loss": 0.2738, + "step": 27395 + }, + { + "epoch": 2.2193778353856124, + "grad_norm": 0.07031233608722687, + "learning_rate": 9.892434402988434e-05, + "loss": 0.279, + "step": 27396 + }, + { + "epoch": 2.2194588464031106, + "grad_norm": 0.05634433776140213, + "learning_rate": 9.891984337728971e-05, + "loss": 0.2655, + "step": 27397 + }, + { + "epoch": 2.2195398574206093, + "grad_norm": 0.056388407945632935, + "learning_rate": 9.891534272469508e-05, + "loss": 0.2669, + "step": 27398 + }, + { + "epoch": 2.2196208684381076, + "grad_norm": 0.06369081884622574, + "learning_rate": 9.891084207210046e-05, + "loss": 0.2805, + "step": 27399 + }, + { + "epoch": 2.219701879455606, + "grad_norm": 0.0784493088722229, + "learning_rate": 9.890634141950583e-05, + "loss": 0.2655, + "step": 27400 + }, + { + "epoch": 2.2197828904731045, + "grad_norm": 0.052695855498313904, + "learning_rate": 9.89018407669112e-05, + "loss": 0.2547, + "step": 27401 + }, + { + "epoch": 2.2198639014906028, + "grad_norm": 0.06510698050260544, + "learning_rate": 9.889734011431658e-05, + "loss": 0.2539, + "step": 27402 + }, + { + "epoch": 2.219944912508101, + "grad_norm": 0.06631234288215637, + "learning_rate": 9.889283946172195e-05, + "loss": 0.2554, + "step": 27403 + }, + { + "epoch": 2.2200259235255997, + "grad_norm": 0.06391224265098572, + "learning_rate": 9.888833880912733e-05, + "loss": 0.2828, + "step": 27404 + }, + { + "epoch": 2.220106934543098, + "grad_norm": 0.0637073889374733, + "learning_rate": 9.88838381565327e-05, + "loss": 0.2219, + "step": 27405 + }, + { + "epoch": 2.220187945560596, + "grad_norm": 0.05338175967335701, + "learning_rate": 9.887933750393807e-05, + "loss": 0.229, + "step": 27406 + }, + { + "epoch": 2.2202689565780944, + "grad_norm": 0.06317490339279175, + "learning_rate": 9.887483685134345e-05, + "loss": 0.2624, + "step": 27407 + }, + { + "epoch": 2.220349967595593, + "grad_norm": 0.06853865832090378, + "learning_rate": 9.887033619874882e-05, + "loss": 0.2574, + "step": 27408 + }, + { + "epoch": 2.2204309786130914, + "grad_norm": 0.06886468827724457, + "learning_rate": 9.88658355461542e-05, + "loss": 0.2514, + "step": 27409 + }, + { + "epoch": 2.2205119896305896, + "grad_norm": 0.07354399561882019, + "learning_rate": 9.886133489355958e-05, + "loss": 0.2396, + "step": 27410 + }, + { + "epoch": 2.2205930006480883, + "grad_norm": 0.06268005073070526, + "learning_rate": 9.885683424096494e-05, + "loss": 0.254, + "step": 27411 + }, + { + "epoch": 2.2206740116655865, + "grad_norm": 0.09697388857603073, + "learning_rate": 
9.885233358837031e-05, + "loss": 0.2285, + "step": 27412 + }, + { + "epoch": 2.220755022683085, + "grad_norm": 0.06654613465070724, + "learning_rate": 9.88478329357757e-05, + "loss": 0.2374, + "step": 27413 + }, + { + "epoch": 2.2208360337005835, + "grad_norm": 0.07476381957530975, + "learning_rate": 9.884333228318106e-05, + "loss": 0.2601, + "step": 27414 + }, + { + "epoch": 2.2209170447180817, + "grad_norm": 0.05062644183635712, + "learning_rate": 9.883883163058644e-05, + "loss": 0.2397, + "step": 27415 + }, + { + "epoch": 2.22099805573558, + "grad_norm": 0.06820233911275864, + "learning_rate": 9.883433097799182e-05, + "loss": 0.2549, + "step": 27416 + }, + { + "epoch": 2.221079066753078, + "grad_norm": 0.05485299229621887, + "learning_rate": 9.882983032539718e-05, + "loss": 0.2242, + "step": 27417 + }, + { + "epoch": 2.221160077770577, + "grad_norm": 0.06426907330751419, + "learning_rate": 9.882532967280256e-05, + "loss": 0.2561, + "step": 27418 + }, + { + "epoch": 2.221241088788075, + "grad_norm": 0.06557059288024902, + "learning_rate": 9.882082902020794e-05, + "loss": 0.2679, + "step": 27419 + }, + { + "epoch": 2.2213220998055734, + "grad_norm": 0.06241839751601219, + "learning_rate": 9.88163283676133e-05, + "loss": 0.2643, + "step": 27420 + }, + { + "epoch": 2.221403110823072, + "grad_norm": 0.06659674644470215, + "learning_rate": 9.881182771501868e-05, + "loss": 0.2709, + "step": 27421 + }, + { + "epoch": 2.2214841218405703, + "grad_norm": 0.06463085860013962, + "learning_rate": 9.880732706242406e-05, + "loss": 0.2544, + "step": 27422 + }, + { + "epoch": 2.2215651328580686, + "grad_norm": 0.06982456147670746, + "learning_rate": 9.880282640982942e-05, + "loss": 0.2625, + "step": 27423 + }, + { + "epoch": 2.2216461438755672, + "grad_norm": 0.05242660269141197, + "learning_rate": 9.87983257572348e-05, + "loss": 0.2588, + "step": 27424 + }, + { + "epoch": 2.2217271548930655, + "grad_norm": 0.06410206854343414, + "learning_rate": 9.879382510464018e-05, + "loss": 0.2542, + "step": 27425 + }, + { + "epoch": 2.2218081659105637, + "grad_norm": 0.0681510865688324, + "learning_rate": 9.878932445204554e-05, + "loss": 0.2598, + "step": 27426 + }, + { + "epoch": 2.2218891769280624, + "grad_norm": 0.061042800545692444, + "learning_rate": 9.878482379945092e-05, + "loss": 0.2472, + "step": 27427 + }, + { + "epoch": 2.2219701879455607, + "grad_norm": 0.06885142624378204, + "learning_rate": 9.87803231468563e-05, + "loss": 0.2414, + "step": 27428 + }, + { + "epoch": 2.222051198963059, + "grad_norm": 0.06637189537286758, + "learning_rate": 9.877582249426167e-05, + "loss": 0.2563, + "step": 27429 + }, + { + "epoch": 2.222132209980557, + "grad_norm": 0.07090841978788376, + "learning_rate": 9.877132184166704e-05, + "loss": 0.2827, + "step": 27430 + }, + { + "epoch": 2.222213220998056, + "grad_norm": 0.08006473630666733, + "learning_rate": 9.876682118907243e-05, + "loss": 0.2639, + "step": 27431 + }, + { + "epoch": 2.222294232015554, + "grad_norm": 0.05476365610957146, + "learning_rate": 9.876232053647779e-05, + "loss": 0.2275, + "step": 27432 + }, + { + "epoch": 2.2223752430330523, + "grad_norm": 0.07430217415094376, + "learning_rate": 9.875781988388316e-05, + "loss": 0.2414, + "step": 27433 + }, + { + "epoch": 2.222456254050551, + "grad_norm": 0.09188614785671234, + "learning_rate": 9.875331923128855e-05, + "loss": 0.2447, + "step": 27434 + }, + { + "epoch": 2.2225372650680493, + "grad_norm": 0.052723485976457596, + "learning_rate": 9.874881857869391e-05, + "loss": 0.2271, + "step": 27435 + }, + { + 
"epoch": 2.2226182760855475, + "grad_norm": 0.07605164498090744, + "learning_rate": 9.87443179260993e-05, + "loss": 0.2764, + "step": 27436 + }, + { + "epoch": 2.222699287103046, + "grad_norm": 0.06630595028400421, + "learning_rate": 9.873981727350467e-05, + "loss": 0.2323, + "step": 27437 + }, + { + "epoch": 2.2227802981205445, + "grad_norm": 0.049888212233781815, + "learning_rate": 9.873531662091003e-05, + "loss": 0.1975, + "step": 27438 + }, + { + "epoch": 2.2228613091380427, + "grad_norm": 0.049289241433143616, + "learning_rate": 9.873081596831542e-05, + "loss": 0.2091, + "step": 27439 + }, + { + "epoch": 2.222942320155541, + "grad_norm": 0.0582793727517128, + "learning_rate": 9.872631531572079e-05, + "loss": 0.222, + "step": 27440 + }, + { + "epoch": 2.2230233311730396, + "grad_norm": 0.06828349083662033, + "learning_rate": 9.872181466312615e-05, + "loss": 0.2356, + "step": 27441 + }, + { + "epoch": 2.223104342190538, + "grad_norm": 0.0737132653594017, + "learning_rate": 9.871731401053154e-05, + "loss": 0.2409, + "step": 27442 + }, + { + "epoch": 2.223185353208036, + "grad_norm": 0.07881966233253479, + "learning_rate": 9.871281335793691e-05, + "loss": 0.2731, + "step": 27443 + }, + { + "epoch": 2.223266364225535, + "grad_norm": 0.07030628621578217, + "learning_rate": 9.870831270534227e-05, + "loss": 0.2873, + "step": 27444 + }, + { + "epoch": 2.223347375243033, + "grad_norm": 0.06227487698197365, + "learning_rate": 9.870381205274766e-05, + "loss": 0.254, + "step": 27445 + }, + { + "epoch": 2.2234283862605313, + "grad_norm": 0.07559426873922348, + "learning_rate": 9.869931140015303e-05, + "loss": 0.2863, + "step": 27446 + }, + { + "epoch": 2.22350939727803, + "grad_norm": 0.058403171598911285, + "learning_rate": 9.869481074755839e-05, + "loss": 0.2524, + "step": 27447 + }, + { + "epoch": 2.2235904082955282, + "grad_norm": 0.059054918587207794, + "learning_rate": 9.869031009496378e-05, + "loss": 0.2425, + "step": 27448 + }, + { + "epoch": 2.2236714193130265, + "grad_norm": 0.05502532050013542, + "learning_rate": 9.868580944236915e-05, + "loss": 0.2413, + "step": 27449 + }, + { + "epoch": 2.223752430330525, + "grad_norm": 0.058676645159721375, + "learning_rate": 9.868130878977452e-05, + "loss": 0.2485, + "step": 27450 + }, + { + "epoch": 2.2238334413480234, + "grad_norm": 0.07493289560079575, + "learning_rate": 9.86768081371799e-05, + "loss": 0.2898, + "step": 27451 + }, + { + "epoch": 2.2239144523655217, + "grad_norm": 0.060452915728092194, + "learning_rate": 9.867230748458527e-05, + "loss": 0.2362, + "step": 27452 + }, + { + "epoch": 2.22399546338302, + "grad_norm": 0.058643221855163574, + "learning_rate": 9.866780683199065e-05, + "loss": 0.2515, + "step": 27453 + }, + { + "epoch": 2.2240764744005186, + "grad_norm": 0.06971300393342972, + "learning_rate": 9.866330617939602e-05, + "loss": 0.2339, + "step": 27454 + }, + { + "epoch": 2.224157485418017, + "grad_norm": 0.0665970891714096, + "learning_rate": 9.865880552680139e-05, + "loss": 0.2689, + "step": 27455 + }, + { + "epoch": 2.224238496435515, + "grad_norm": 0.06566937267780304, + "learning_rate": 9.865430487420677e-05, + "loss": 0.2652, + "step": 27456 + }, + { + "epoch": 2.2243195074530138, + "grad_norm": 0.07684820145368576, + "learning_rate": 9.864980422161214e-05, + "loss": 0.2953, + "step": 27457 + }, + { + "epoch": 2.224400518470512, + "grad_norm": 0.06417267769575119, + "learning_rate": 9.864530356901751e-05, + "loss": 0.2415, + "step": 27458 + }, + { + "epoch": 2.2244815294880103, + "grad_norm": 0.0706862136721611, + 
"learning_rate": 9.864080291642289e-05, + "loss": 0.3013, + "step": 27459 + }, + { + "epoch": 2.224562540505509, + "grad_norm": 0.07747501134872437, + "learning_rate": 9.863630226382826e-05, + "loss": 0.285, + "step": 27460 + }, + { + "epoch": 2.224643551523007, + "grad_norm": 0.0710633248090744, + "learning_rate": 9.863180161123363e-05, + "loss": 0.2647, + "step": 27461 + }, + { + "epoch": 2.2247245625405054, + "grad_norm": 0.07837072759866714, + "learning_rate": 9.862730095863901e-05, + "loss": 0.3105, + "step": 27462 + }, + { + "epoch": 2.2248055735580037, + "grad_norm": 0.07310599088668823, + "learning_rate": 9.862280030604438e-05, + "loss": 0.2734, + "step": 27463 + }, + { + "epoch": 2.2248865845755024, + "grad_norm": 0.06395061314105988, + "learning_rate": 9.861829965344976e-05, + "loss": 0.242, + "step": 27464 + }, + { + "epoch": 2.2249675955930006, + "grad_norm": 0.06322309374809265, + "learning_rate": 9.861379900085513e-05, + "loss": 0.2324, + "step": 27465 + }, + { + "epoch": 2.225048606610499, + "grad_norm": 0.05373113974928856, + "learning_rate": 9.86092983482605e-05, + "loss": 0.2504, + "step": 27466 + }, + { + "epoch": 2.2251296176279975, + "grad_norm": 0.06073353439569473, + "learning_rate": 9.860479769566588e-05, + "loss": 0.24, + "step": 27467 + }, + { + "epoch": 2.225210628645496, + "grad_norm": 0.06355416029691696, + "learning_rate": 9.860029704307125e-05, + "loss": 0.2263, + "step": 27468 + }, + { + "epoch": 2.225291639662994, + "grad_norm": 0.07005083560943604, + "learning_rate": 9.859579639047662e-05, + "loss": 0.2319, + "step": 27469 + }, + { + "epoch": 2.2253726506804927, + "grad_norm": 0.06226866692304611, + "learning_rate": 9.8591295737882e-05, + "loss": 0.2657, + "step": 27470 + }, + { + "epoch": 2.225453661697991, + "grad_norm": 0.061929695308208466, + "learning_rate": 9.858679508528737e-05, + "loss": 0.2895, + "step": 27471 + }, + { + "epoch": 2.225534672715489, + "grad_norm": 0.047602199018001556, + "learning_rate": 9.858229443269274e-05, + "loss": 0.2445, + "step": 27472 + }, + { + "epoch": 2.225615683732988, + "grad_norm": 0.05019146203994751, + "learning_rate": 9.857779378009812e-05, + "loss": 0.2185, + "step": 27473 + }, + { + "epoch": 2.225696694750486, + "grad_norm": 0.07254036515951157, + "learning_rate": 9.857329312750349e-05, + "loss": 0.2716, + "step": 27474 + }, + { + "epoch": 2.2257777057679844, + "grad_norm": 0.062929667532444, + "learning_rate": 9.856879247490886e-05, + "loss": 0.2515, + "step": 27475 + }, + { + "epoch": 2.2258587167854826, + "grad_norm": 0.05786556378006935, + "learning_rate": 9.856429182231424e-05, + "loss": 0.2367, + "step": 27476 + }, + { + "epoch": 2.2259397278029813, + "grad_norm": 0.05418018996715546, + "learning_rate": 9.855979116971961e-05, + "loss": 0.2743, + "step": 27477 + }, + { + "epoch": 2.2260207388204796, + "grad_norm": 0.05989348143339157, + "learning_rate": 9.855529051712499e-05, + "loss": 0.2373, + "step": 27478 + }, + { + "epoch": 2.226101749837978, + "grad_norm": 0.05833243206143379, + "learning_rate": 9.855078986453036e-05, + "loss": 0.214, + "step": 27479 + }, + { + "epoch": 2.2261827608554765, + "grad_norm": 0.06846433132886887, + "learning_rate": 9.854628921193573e-05, + "loss": 0.2518, + "step": 27480 + }, + { + "epoch": 2.2262637718729748, + "grad_norm": 0.07357719540596008, + "learning_rate": 9.85417885593411e-05, + "loss": 0.236, + "step": 27481 + }, + { + "epoch": 2.226344782890473, + "grad_norm": 0.06799498945474625, + "learning_rate": 9.853728790674648e-05, + "loss": 0.2835, + "step": 27482 + }, + 
{ + "epoch": 2.2264257939079717, + "grad_norm": 0.07681022584438324, + "learning_rate": 9.853278725415185e-05, + "loss": 0.2576, + "step": 27483 + }, + { + "epoch": 2.22650680492547, + "grad_norm": 0.09169048070907593, + "learning_rate": 9.852828660155723e-05, + "loss": 0.2802, + "step": 27484 + }, + { + "epoch": 2.226587815942968, + "grad_norm": 0.07720378786325455, + "learning_rate": 9.85237859489626e-05, + "loss": 0.2744, + "step": 27485 + }, + { + "epoch": 2.2266688269604664, + "grad_norm": 0.06049828231334686, + "learning_rate": 9.851928529636797e-05, + "loss": 0.2339, + "step": 27486 + }, + { + "epoch": 2.226749837977965, + "grad_norm": 0.06331600993871689, + "learning_rate": 9.851478464377335e-05, + "loss": 0.2062, + "step": 27487 + }, + { + "epoch": 2.2268308489954634, + "grad_norm": 0.0648995041847229, + "learning_rate": 9.851028399117872e-05, + "loss": 0.272, + "step": 27488 + }, + { + "epoch": 2.2269118600129616, + "grad_norm": 0.05869756639003754, + "learning_rate": 9.85057833385841e-05, + "loss": 0.2155, + "step": 27489 + }, + { + "epoch": 2.2269928710304603, + "grad_norm": 0.07214734703302383, + "learning_rate": 9.850128268598947e-05, + "loss": 0.259, + "step": 27490 + }, + { + "epoch": 2.2270738820479585, + "grad_norm": 0.07368501275777817, + "learning_rate": 9.849678203339486e-05, + "loss": 0.2668, + "step": 27491 + }, + { + "epoch": 2.2271548930654568, + "grad_norm": 0.057600319385528564, + "learning_rate": 9.849228138080022e-05, + "loss": 0.2817, + "step": 27492 + }, + { + "epoch": 2.2272359040829555, + "grad_norm": 0.07728826254606247, + "learning_rate": 9.848778072820559e-05, + "loss": 0.2332, + "step": 27493 + }, + { + "epoch": 2.2273169151004537, + "grad_norm": 0.08698121458292007, + "learning_rate": 9.848328007561098e-05, + "loss": 0.2613, + "step": 27494 + }, + { + "epoch": 2.227397926117952, + "grad_norm": 0.06995168328285217, + "learning_rate": 9.847877942301634e-05, + "loss": 0.2448, + "step": 27495 + }, + { + "epoch": 2.2274789371354506, + "grad_norm": 0.07552751153707504, + "learning_rate": 9.847427877042171e-05, + "loss": 0.2467, + "step": 27496 + }, + { + "epoch": 2.227559948152949, + "grad_norm": 0.05158974602818489, + "learning_rate": 9.84697781178271e-05, + "loss": 0.2392, + "step": 27497 + }, + { + "epoch": 2.227640959170447, + "grad_norm": 0.06887788325548172, + "learning_rate": 9.846527746523246e-05, + "loss": 0.2429, + "step": 27498 + }, + { + "epoch": 2.2277219701879454, + "grad_norm": 0.05913383141160011, + "learning_rate": 9.846077681263783e-05, + "loss": 0.251, + "step": 27499 + }, + { + "epoch": 2.227802981205444, + "grad_norm": 0.06455465406179428, + "learning_rate": 9.845627616004322e-05, + "loss": 0.2602, + "step": 27500 + }, + { + "epoch": 2.2278839922229423, + "grad_norm": 0.06063240021467209, + "learning_rate": 9.845177550744858e-05, + "loss": 0.2513, + "step": 27501 + }, + { + "epoch": 2.2279650032404406, + "grad_norm": 0.0720057561993599, + "learning_rate": 9.844727485485395e-05, + "loss": 0.2618, + "step": 27502 + }, + { + "epoch": 2.2280460142579392, + "grad_norm": 0.06722862273454666, + "learning_rate": 9.844277420225934e-05, + "loss": 0.2426, + "step": 27503 + }, + { + "epoch": 2.2281270252754375, + "grad_norm": 0.06625615060329437, + "learning_rate": 9.84382735496647e-05, + "loss": 0.2413, + "step": 27504 + }, + { + "epoch": 2.2282080362929357, + "grad_norm": 0.06222971901297569, + "learning_rate": 9.843377289707007e-05, + "loss": 0.2936, + "step": 27505 + }, + { + "epoch": 2.2282890473104344, + "grad_norm": 0.07540034502744675, + 
"learning_rate": 9.842927224447546e-05, + "loss": 0.2774, + "step": 27506 + }, + { + "epoch": 2.2283700583279327, + "grad_norm": 0.05772213637828827, + "learning_rate": 9.842477159188082e-05, + "loss": 0.2926, + "step": 27507 + }, + { + "epoch": 2.228451069345431, + "grad_norm": 0.05750125274062157, + "learning_rate": 9.84202709392862e-05, + "loss": 0.2172, + "step": 27508 + }, + { + "epoch": 2.228532080362929, + "grad_norm": 0.06508267670869827, + "learning_rate": 9.841577028669158e-05, + "loss": 0.2431, + "step": 27509 + }, + { + "epoch": 2.228613091380428, + "grad_norm": 0.05775724723935127, + "learning_rate": 9.841126963409694e-05, + "loss": 0.2197, + "step": 27510 + }, + { + "epoch": 2.228694102397926, + "grad_norm": 0.06315132230520248, + "learning_rate": 9.840676898150231e-05, + "loss": 0.2528, + "step": 27511 + }, + { + "epoch": 2.2287751134154243, + "grad_norm": 0.07274419069290161, + "learning_rate": 9.84022683289077e-05, + "loss": 0.2502, + "step": 27512 + }, + { + "epoch": 2.228856124432923, + "grad_norm": 0.059047311544418335, + "learning_rate": 9.839776767631306e-05, + "loss": 0.2592, + "step": 27513 + }, + { + "epoch": 2.2289371354504213, + "grad_norm": 0.07797567546367645, + "learning_rate": 9.839326702371844e-05, + "loss": 0.2427, + "step": 27514 + }, + { + "epoch": 2.2290181464679195, + "grad_norm": 0.06294585019350052, + "learning_rate": 9.838876637112382e-05, + "loss": 0.3364, + "step": 27515 + }, + { + "epoch": 2.229099157485418, + "grad_norm": 0.07711624354124069, + "learning_rate": 9.83842657185292e-05, + "loss": 0.282, + "step": 27516 + }, + { + "epoch": 2.2291801685029164, + "grad_norm": 0.06658951193094254, + "learning_rate": 9.837976506593457e-05, + "loss": 0.2528, + "step": 27517 + }, + { + "epoch": 2.2292611795204147, + "grad_norm": 0.06800119578838348, + "learning_rate": 9.837526441333994e-05, + "loss": 0.3019, + "step": 27518 + }, + { + "epoch": 2.2293421905379134, + "grad_norm": 0.06257172673940659, + "learning_rate": 9.837076376074532e-05, + "loss": 0.2488, + "step": 27519 + }, + { + "epoch": 2.2294232015554116, + "grad_norm": 0.06106441840529442, + "learning_rate": 9.836626310815069e-05, + "loss": 0.2568, + "step": 27520 + }, + { + "epoch": 2.22950421257291, + "grad_norm": 0.061102159321308136, + "learning_rate": 9.836176245555606e-05, + "loss": 0.2469, + "step": 27521 + }, + { + "epoch": 2.229585223590408, + "grad_norm": 0.07021883875131607, + "learning_rate": 9.835726180296144e-05, + "loss": 0.2892, + "step": 27522 + }, + { + "epoch": 2.229666234607907, + "grad_norm": 0.07805996388196945, + "learning_rate": 9.835276115036681e-05, + "loss": 0.2457, + "step": 27523 + }, + { + "epoch": 2.229747245625405, + "grad_norm": 0.05466349795460701, + "learning_rate": 9.834826049777218e-05, + "loss": 0.2625, + "step": 27524 + }, + { + "epoch": 2.2298282566429033, + "grad_norm": 0.06738977134227753, + "learning_rate": 9.834375984517756e-05, + "loss": 0.2104, + "step": 27525 + }, + { + "epoch": 2.229909267660402, + "grad_norm": 0.06767590343952179, + "learning_rate": 9.833925919258293e-05, + "loss": 0.2371, + "step": 27526 + }, + { + "epoch": 2.2299902786779002, + "grad_norm": 0.06312504410743713, + "learning_rate": 9.83347585399883e-05, + "loss": 0.2782, + "step": 27527 + }, + { + "epoch": 2.2300712896953985, + "grad_norm": 0.07312658429145813, + "learning_rate": 9.833025788739368e-05, + "loss": 0.2412, + "step": 27528 + }, + { + "epoch": 2.230152300712897, + "grad_norm": 0.07023181021213531, + "learning_rate": 9.832575723479905e-05, + "loss": 0.2241, + "step": 27529 
+ }, + { + "epoch": 2.2302333117303954, + "grad_norm": 0.06490954011678696, + "learning_rate": 9.832125658220443e-05, + "loss": 0.2657, + "step": 27530 + }, + { + "epoch": 2.2303143227478937, + "grad_norm": 0.07052972167730331, + "learning_rate": 9.83167559296098e-05, + "loss": 0.2798, + "step": 27531 + }, + { + "epoch": 2.230395333765392, + "grad_norm": 0.07023556530475616, + "learning_rate": 9.831225527701517e-05, + "loss": 0.239, + "step": 27532 + }, + { + "epoch": 2.2304763447828906, + "grad_norm": 0.06125883013010025, + "learning_rate": 9.830775462442055e-05, + "loss": 0.2651, + "step": 27533 + }, + { + "epoch": 2.230557355800389, + "grad_norm": 0.060255266726017, + "learning_rate": 9.830325397182592e-05, + "loss": 0.2869, + "step": 27534 + }, + { + "epoch": 2.230638366817887, + "grad_norm": 0.07880748063325882, + "learning_rate": 9.82987533192313e-05, + "loss": 0.2452, + "step": 27535 + }, + { + "epoch": 2.2307193778353858, + "grad_norm": 0.07296689599752426, + "learning_rate": 9.829425266663667e-05, + "loss": 0.277, + "step": 27536 + }, + { + "epoch": 2.230800388852884, + "grad_norm": 0.05730430781841278, + "learning_rate": 9.828975201404204e-05, + "loss": 0.2232, + "step": 27537 + }, + { + "epoch": 2.2308813998703823, + "grad_norm": 0.06154127046465874, + "learning_rate": 9.828525136144742e-05, + "loss": 0.2694, + "step": 27538 + }, + { + "epoch": 2.230962410887881, + "grad_norm": 0.058270592242479324, + "learning_rate": 9.828075070885279e-05, + "loss": 0.2411, + "step": 27539 + }, + { + "epoch": 2.231043421905379, + "grad_norm": 0.061224162578582764, + "learning_rate": 9.827625005625816e-05, + "loss": 0.2387, + "step": 27540 + }, + { + "epoch": 2.2311244329228774, + "grad_norm": 0.05655451491475105, + "learning_rate": 9.827174940366354e-05, + "loss": 0.2532, + "step": 27541 + }, + { + "epoch": 2.231205443940376, + "grad_norm": 0.06237894669175148, + "learning_rate": 9.826724875106891e-05, + "loss": 0.2996, + "step": 27542 + }, + { + "epoch": 2.2312864549578744, + "grad_norm": 0.0705811008810997, + "learning_rate": 9.826274809847428e-05, + "loss": 0.2582, + "step": 27543 + }, + { + "epoch": 2.2313674659753726, + "grad_norm": 0.06242735683917999, + "learning_rate": 9.825824744587966e-05, + "loss": 0.2207, + "step": 27544 + }, + { + "epoch": 2.231448476992871, + "grad_norm": 0.07497147470712662, + "learning_rate": 9.825374679328503e-05, + "loss": 0.2633, + "step": 27545 + }, + { + "epoch": 2.2315294880103695, + "grad_norm": 0.05885780230164528, + "learning_rate": 9.82492461406904e-05, + "loss": 0.2292, + "step": 27546 + }, + { + "epoch": 2.231610499027868, + "grad_norm": 0.07651273906230927, + "learning_rate": 9.824474548809578e-05, + "loss": 0.2238, + "step": 27547 + }, + { + "epoch": 2.231691510045366, + "grad_norm": 0.07533611357212067, + "learning_rate": 9.824024483550115e-05, + "loss": 0.2695, + "step": 27548 + }, + { + "epoch": 2.2317725210628647, + "grad_norm": 0.06582427769899368, + "learning_rate": 9.823574418290653e-05, + "loss": 0.2439, + "step": 27549 + }, + { + "epoch": 2.231853532080363, + "grad_norm": 0.07081213593482971, + "learning_rate": 9.82312435303119e-05, + "loss": 0.2457, + "step": 27550 + }, + { + "epoch": 2.231934543097861, + "grad_norm": 0.054331351071596146, + "learning_rate": 9.822674287771727e-05, + "loss": 0.2193, + "step": 27551 + }, + { + "epoch": 2.2320155541153595, + "grad_norm": 0.06326715648174286, + "learning_rate": 9.822224222512265e-05, + "loss": 0.276, + "step": 27552 + }, + { + "epoch": 2.232096565132858, + "grad_norm": 0.0661514475941658, + 
"learning_rate": 9.821774157252802e-05, + "loss": 0.2201, + "step": 27553 + }, + { + "epoch": 2.2321775761503564, + "grad_norm": 0.07344749569892883, + "learning_rate": 9.821324091993339e-05, + "loss": 0.2613, + "step": 27554 + }, + { + "epoch": 2.2322585871678546, + "grad_norm": 0.06575877219438553, + "learning_rate": 9.820874026733877e-05, + "loss": 0.2533, + "step": 27555 + }, + { + "epoch": 2.2323395981853533, + "grad_norm": 0.06327194720506668, + "learning_rate": 9.820423961474414e-05, + "loss": 0.2836, + "step": 27556 + }, + { + "epoch": 2.2324206092028516, + "grad_norm": 0.07980243861675262, + "learning_rate": 9.819973896214951e-05, + "loss": 0.3185, + "step": 27557 + }, + { + "epoch": 2.23250162022035, + "grad_norm": 0.06968654692173004, + "learning_rate": 9.819523830955489e-05, + "loss": 0.2597, + "step": 27558 + }, + { + "epoch": 2.2325826312378485, + "grad_norm": 0.06582485139369965, + "learning_rate": 9.819073765696026e-05, + "loss": 0.2474, + "step": 27559 + }, + { + "epoch": 2.2326636422553467, + "grad_norm": 0.059361279010772705, + "learning_rate": 9.818623700436563e-05, + "loss": 0.2138, + "step": 27560 + }, + { + "epoch": 2.232744653272845, + "grad_norm": 0.05606989562511444, + "learning_rate": 9.818173635177101e-05, + "loss": 0.192, + "step": 27561 + }, + { + "epoch": 2.2328256642903437, + "grad_norm": 0.06947729736566544, + "learning_rate": 9.817723569917638e-05, + "loss": 0.3042, + "step": 27562 + }, + { + "epoch": 2.232906675307842, + "grad_norm": 0.05743652954697609, + "learning_rate": 9.817273504658176e-05, + "loss": 0.2408, + "step": 27563 + }, + { + "epoch": 2.23298768632534, + "grad_norm": 0.06133139878511429, + "learning_rate": 9.816823439398713e-05, + "loss": 0.2494, + "step": 27564 + }, + { + "epoch": 2.233068697342839, + "grad_norm": 0.06576412916183472, + "learning_rate": 9.81637337413925e-05, + "loss": 0.293, + "step": 27565 + }, + { + "epoch": 2.233149708360337, + "grad_norm": 0.07262366265058517, + "learning_rate": 9.815923308879788e-05, + "loss": 0.2593, + "step": 27566 + }, + { + "epoch": 2.2332307193778353, + "grad_norm": 0.06177317723631859, + "learning_rate": 9.815473243620325e-05, + "loss": 0.2956, + "step": 27567 + }, + { + "epoch": 2.2333117303953336, + "grad_norm": 0.06126299500465393, + "learning_rate": 9.815023178360862e-05, + "loss": 0.2228, + "step": 27568 + }, + { + "epoch": 2.2333927414128323, + "grad_norm": 0.05007245019078255, + "learning_rate": 9.814573113101401e-05, + "loss": 0.2662, + "step": 27569 + }, + { + "epoch": 2.2334737524303305, + "grad_norm": 0.07485800236463547, + "learning_rate": 9.814123047841937e-05, + "loss": 0.2344, + "step": 27570 + }, + { + "epoch": 2.2335547634478288, + "grad_norm": 0.057362813502550125, + "learning_rate": 9.813672982582474e-05, + "loss": 0.2246, + "step": 27571 + }, + { + "epoch": 2.2336357744653275, + "grad_norm": 0.05725299194455147, + "learning_rate": 9.813222917323013e-05, + "loss": 0.2566, + "step": 27572 + }, + { + "epoch": 2.2337167854828257, + "grad_norm": 0.06267392635345459, + "learning_rate": 9.812772852063549e-05, + "loss": 0.2466, + "step": 27573 + }, + { + "epoch": 2.233797796500324, + "grad_norm": 0.07378648221492767, + "learning_rate": 9.812322786804087e-05, + "loss": 0.2414, + "step": 27574 + }, + { + "epoch": 2.233878807517822, + "grad_norm": 0.07484953105449677, + "learning_rate": 9.811872721544625e-05, + "loss": 0.2581, + "step": 27575 + }, + { + "epoch": 2.233959818535321, + "grad_norm": 0.07462868839502335, + "learning_rate": 9.811422656285161e-05, + "loss": 0.2483, + "step": 
27576 + }, + { + "epoch": 2.234040829552819, + "grad_norm": 0.056544914841651917, + "learning_rate": 9.810972591025699e-05, + "loss": 0.298, + "step": 27577 + }, + { + "epoch": 2.2341218405703174, + "grad_norm": 0.05818667635321617, + "learning_rate": 9.810522525766237e-05, + "loss": 0.2323, + "step": 27578 + }, + { + "epoch": 2.234202851587816, + "grad_norm": 0.06373794376850128, + "learning_rate": 9.810072460506773e-05, + "loss": 0.2654, + "step": 27579 + }, + { + "epoch": 2.2342838626053143, + "grad_norm": 0.09751275926828384, + "learning_rate": 9.80962239524731e-05, + "loss": 0.2999, + "step": 27580 + }, + { + "epoch": 2.2343648736228126, + "grad_norm": 0.059716999530792236, + "learning_rate": 9.80917232998785e-05, + "loss": 0.2333, + "step": 27581 + }, + { + "epoch": 2.2344458846403112, + "grad_norm": 0.049951471388339996, + "learning_rate": 9.808722264728385e-05, + "loss": 0.221, + "step": 27582 + }, + { + "epoch": 2.2345268956578095, + "grad_norm": 0.07606692612171173, + "learning_rate": 9.808272199468923e-05, + "loss": 0.2508, + "step": 27583 + }, + { + "epoch": 2.2346079066753077, + "grad_norm": 0.053282156586647034, + "learning_rate": 9.807822134209461e-05, + "loss": 0.2406, + "step": 27584 + }, + { + "epoch": 2.2346889176928064, + "grad_norm": 0.05666787177324295, + "learning_rate": 9.807372068949999e-05, + "loss": 0.235, + "step": 27585 + }, + { + "epoch": 2.2347699287103047, + "grad_norm": 0.06908533722162247, + "learning_rate": 9.806922003690535e-05, + "loss": 0.3004, + "step": 27586 + }, + { + "epoch": 2.234850939727803, + "grad_norm": 0.05455467104911804, + "learning_rate": 9.806471938431074e-05, + "loss": 0.2432, + "step": 27587 + }, + { + "epoch": 2.234931950745301, + "grad_norm": 0.061505746096372604, + "learning_rate": 9.806021873171611e-05, + "loss": 0.2074, + "step": 27588 + }, + { + "epoch": 2.2350129617628, + "grad_norm": 0.07434146851301193, + "learning_rate": 9.805571807912147e-05, + "loss": 0.2753, + "step": 27589 + }, + { + "epoch": 2.235093972780298, + "grad_norm": 0.06101938709616661, + "learning_rate": 9.805121742652686e-05, + "loss": 0.2577, + "step": 27590 + }, + { + "epoch": 2.2351749837977963, + "grad_norm": 0.05936296284198761, + "learning_rate": 9.804671677393223e-05, + "loss": 0.2461, + "step": 27591 + }, + { + "epoch": 2.235255994815295, + "grad_norm": 0.060692984610795975, + "learning_rate": 9.804221612133759e-05, + "loss": 0.2119, + "step": 27592 + }, + { + "epoch": 2.2353370058327933, + "grad_norm": 0.0680263414978981, + "learning_rate": 9.803771546874298e-05, + "loss": 0.2368, + "step": 27593 + }, + { + "epoch": 2.2354180168502915, + "grad_norm": 0.06867021322250366, + "learning_rate": 9.803321481614835e-05, + "loss": 0.2842, + "step": 27594 + }, + { + "epoch": 2.23549902786779, + "grad_norm": 0.06476103514432907, + "learning_rate": 9.802871416355372e-05, + "loss": 0.2524, + "step": 27595 + }, + { + "epoch": 2.2355800388852884, + "grad_norm": 0.0553499199450016, + "learning_rate": 9.80242135109591e-05, + "loss": 0.2183, + "step": 27596 + }, + { + "epoch": 2.2356610499027867, + "grad_norm": 0.08324495702981949, + "learning_rate": 9.801971285836447e-05, + "loss": 0.2827, + "step": 27597 + }, + { + "epoch": 2.235742060920285, + "grad_norm": 0.06461543589830399, + "learning_rate": 9.801521220576985e-05, + "loss": 0.2366, + "step": 27598 + }, + { + "epoch": 2.2358230719377836, + "grad_norm": 0.06602394580841064, + "learning_rate": 9.801071155317522e-05, + "loss": 0.2393, + "step": 27599 + }, + { + "epoch": 2.235904082955282, + "grad_norm": 
0.06189282611012459, + "learning_rate": 9.800621090058059e-05, + "loss": 0.2199, + "step": 27600 + }, + { + "epoch": 2.23598509397278, + "grad_norm": 0.06203988194465637, + "learning_rate": 9.800171024798597e-05, + "loss": 0.2333, + "step": 27601 + }, + { + "epoch": 2.236066104990279, + "grad_norm": 0.05741807445883751, + "learning_rate": 9.799720959539134e-05, + "loss": 0.2591, + "step": 27602 + }, + { + "epoch": 2.236147116007777, + "grad_norm": 0.06251756101846695, + "learning_rate": 9.799270894279671e-05, + "loss": 0.2568, + "step": 27603 + }, + { + "epoch": 2.2362281270252753, + "grad_norm": 0.06924989074468613, + "learning_rate": 9.798820829020209e-05, + "loss": 0.2319, + "step": 27604 + }, + { + "epoch": 2.236309138042774, + "grad_norm": 0.05958033725619316, + "learning_rate": 9.798370763760746e-05, + "loss": 0.285, + "step": 27605 + }, + { + "epoch": 2.2363901490602722, + "grad_norm": 0.07292255014181137, + "learning_rate": 9.797920698501283e-05, + "loss": 0.2757, + "step": 27606 + }, + { + "epoch": 2.2364711600777705, + "grad_norm": 0.06331977993249893, + "learning_rate": 9.797470633241821e-05, + "loss": 0.2391, + "step": 27607 + }, + { + "epoch": 2.236552171095269, + "grad_norm": 0.06359085440635681, + "learning_rate": 9.797020567982358e-05, + "loss": 0.2788, + "step": 27608 + }, + { + "epoch": 2.2366331821127674, + "grad_norm": 0.04755968600511551, + "learning_rate": 9.796570502722895e-05, + "loss": 0.2137, + "step": 27609 + }, + { + "epoch": 2.2367141931302656, + "grad_norm": 0.068677619099617, + "learning_rate": 9.796120437463433e-05, + "loss": 0.2455, + "step": 27610 + }, + { + "epoch": 2.236795204147764, + "grad_norm": 0.05319945141673088, + "learning_rate": 9.79567037220397e-05, + "loss": 0.23, + "step": 27611 + }, + { + "epoch": 2.2368762151652626, + "grad_norm": 0.07198475301265717, + "learning_rate": 9.795220306944508e-05, + "loss": 0.323, + "step": 27612 + }, + { + "epoch": 2.236957226182761, + "grad_norm": 0.07129145413637161, + "learning_rate": 9.794770241685045e-05, + "loss": 0.2547, + "step": 27613 + }, + { + "epoch": 2.237038237200259, + "grad_norm": 0.08250509947538376, + "learning_rate": 9.794320176425582e-05, + "loss": 0.2533, + "step": 27614 + }, + { + "epoch": 2.2371192482177578, + "grad_norm": 0.06255879998207092, + "learning_rate": 9.79387011116612e-05, + "loss": 0.2776, + "step": 27615 + }, + { + "epoch": 2.237200259235256, + "grad_norm": 0.056715380400419235, + "learning_rate": 9.793420045906657e-05, + "loss": 0.2633, + "step": 27616 + }, + { + "epoch": 2.2372812702527543, + "grad_norm": 0.08910910040140152, + "learning_rate": 9.792969980647194e-05, + "loss": 0.2812, + "step": 27617 + }, + { + "epoch": 2.237362281270253, + "grad_norm": 0.05875033885240555, + "learning_rate": 9.792519915387732e-05, + "loss": 0.2527, + "step": 27618 + }, + { + "epoch": 2.237443292287751, + "grad_norm": 0.054330844432115555, + "learning_rate": 9.792069850128269e-05, + "loss": 0.234, + "step": 27619 + }, + { + "epoch": 2.2375243033052494, + "grad_norm": 0.06929908692836761, + "learning_rate": 9.791619784868806e-05, + "loss": 0.2287, + "step": 27620 + }, + { + "epoch": 2.2376053143227477, + "grad_norm": 0.05904196947813034, + "learning_rate": 9.791169719609344e-05, + "loss": 0.2536, + "step": 27621 + }, + { + "epoch": 2.2376863253402464, + "grad_norm": 0.06485608965158463, + "learning_rate": 9.790719654349881e-05, + "loss": 0.2793, + "step": 27622 + }, + { + "epoch": 2.2377673363577446, + "grad_norm": 0.05853230133652687, + "learning_rate": 9.790269589090419e-05, + "loss": 
0.2733, + "step": 27623 + }, + { + "epoch": 2.237848347375243, + "grad_norm": 0.05344027280807495, + "learning_rate": 9.789819523830956e-05, + "loss": 0.2305, + "step": 27624 + }, + { + "epoch": 2.2379293583927415, + "grad_norm": 0.057308122515678406, + "learning_rate": 9.789369458571493e-05, + "loss": 0.216, + "step": 27625 + }, + { + "epoch": 2.23801036941024, + "grad_norm": 0.08146602660417557, + "learning_rate": 9.78891939331203e-05, + "loss": 0.2624, + "step": 27626 + }, + { + "epoch": 2.238091380427738, + "grad_norm": 0.07540880888700485, + "learning_rate": 9.788469328052568e-05, + "loss": 0.2634, + "step": 27627 + }, + { + "epoch": 2.2381723914452367, + "grad_norm": 0.07709179073572159, + "learning_rate": 9.788019262793105e-05, + "loss": 0.2665, + "step": 27628 + }, + { + "epoch": 2.238253402462735, + "grad_norm": 0.07061771303415298, + "learning_rate": 9.787569197533643e-05, + "loss": 0.2396, + "step": 27629 + }, + { + "epoch": 2.238334413480233, + "grad_norm": 0.06282792240381241, + "learning_rate": 9.78711913227418e-05, + "loss": 0.2519, + "step": 27630 + }, + { + "epoch": 2.238415424497732, + "grad_norm": 0.06897687166929245, + "learning_rate": 9.786669067014717e-05, + "loss": 0.3148, + "step": 27631 + }, + { + "epoch": 2.23849643551523, + "grad_norm": 0.06308693438768387, + "learning_rate": 9.786219001755255e-05, + "loss": 0.2562, + "step": 27632 + }, + { + "epoch": 2.2385774465327284, + "grad_norm": 0.053178559988737106, + "learning_rate": 9.785768936495792e-05, + "loss": 0.2282, + "step": 27633 + }, + { + "epoch": 2.2386584575502266, + "grad_norm": 0.06457486748695374, + "learning_rate": 9.78531887123633e-05, + "loss": 0.2539, + "step": 27634 + }, + { + "epoch": 2.2387394685677253, + "grad_norm": 0.06926770508289337, + "learning_rate": 9.784868805976867e-05, + "loss": 0.3016, + "step": 27635 + }, + { + "epoch": 2.2388204795852236, + "grad_norm": 0.07494411617517471, + "learning_rate": 9.784418740717404e-05, + "loss": 0.2625, + "step": 27636 + }, + { + "epoch": 2.238901490602722, + "grad_norm": 0.05255121737718582, + "learning_rate": 9.783968675457942e-05, + "loss": 0.2445, + "step": 27637 + }, + { + "epoch": 2.2389825016202205, + "grad_norm": 0.058950025588274, + "learning_rate": 9.783518610198479e-05, + "loss": 0.2513, + "step": 27638 + }, + { + "epoch": 2.2390635126377187, + "grad_norm": 0.06239871680736542, + "learning_rate": 9.783068544939016e-05, + "loss": 0.2302, + "step": 27639 + }, + { + "epoch": 2.239144523655217, + "grad_norm": 0.059939995408058167, + "learning_rate": 9.782618479679554e-05, + "loss": 0.2669, + "step": 27640 + }, + { + "epoch": 2.2392255346727157, + "grad_norm": 0.06433939933776855, + "learning_rate": 9.782168414420091e-05, + "loss": 0.2489, + "step": 27641 + }, + { + "epoch": 2.239306545690214, + "grad_norm": 0.060069452971220016, + "learning_rate": 9.781718349160628e-05, + "loss": 0.2463, + "step": 27642 + }, + { + "epoch": 2.239387556707712, + "grad_norm": 0.07081073522567749, + "learning_rate": 9.781268283901166e-05, + "loss": 0.2563, + "step": 27643 + }, + { + "epoch": 2.2394685677252104, + "grad_norm": 0.07024306058883667, + "learning_rate": 9.780818218641703e-05, + "loss": 0.2692, + "step": 27644 + }, + { + "epoch": 2.239549578742709, + "grad_norm": 0.06197810545563698, + "learning_rate": 9.78036815338224e-05, + "loss": 0.2678, + "step": 27645 + }, + { + "epoch": 2.2396305897602073, + "grad_norm": 0.05187183618545532, + "learning_rate": 9.779918088122778e-05, + "loss": 0.2451, + "step": 27646 + }, + { + "epoch": 2.2397116007777056, + 
"grad_norm": 0.06668131798505783, + "learning_rate": 9.779468022863317e-05, + "loss": 0.2693, + "step": 27647 + }, + { + "epoch": 2.2397926117952043, + "grad_norm": 0.07856709510087967, + "learning_rate": 9.779017957603853e-05, + "loss": 0.2779, + "step": 27648 + }, + { + "epoch": 2.2398736228127025, + "grad_norm": 0.07700806856155396, + "learning_rate": 9.77856789234439e-05, + "loss": 0.2508, + "step": 27649 + }, + { + "epoch": 2.2399546338302008, + "grad_norm": 0.06017092615365982, + "learning_rate": 9.778117827084929e-05, + "loss": 0.2599, + "step": 27650 + }, + { + "epoch": 2.2400356448476995, + "grad_norm": 0.06719563156366348, + "learning_rate": 9.777667761825465e-05, + "loss": 0.2658, + "step": 27651 + }, + { + "epoch": 2.2401166558651977, + "grad_norm": 0.061362117528915405, + "learning_rate": 9.777217696566002e-05, + "loss": 0.2284, + "step": 27652 + }, + { + "epoch": 2.240197666882696, + "grad_norm": 0.06896430253982544, + "learning_rate": 9.776767631306541e-05, + "loss": 0.2376, + "step": 27653 + }, + { + "epoch": 2.2402786779001946, + "grad_norm": 0.06479042023420334, + "learning_rate": 9.776317566047078e-05, + "loss": 0.2307, + "step": 27654 + }, + { + "epoch": 2.240359688917693, + "grad_norm": 0.06881794333457947, + "learning_rate": 9.775867500787614e-05, + "loss": 0.2324, + "step": 27655 + }, + { + "epoch": 2.240440699935191, + "grad_norm": 0.06108381599187851, + "learning_rate": 9.775417435528153e-05, + "loss": 0.2415, + "step": 27656 + }, + { + "epoch": 2.2405217109526894, + "grad_norm": 0.06953638792037964, + "learning_rate": 9.77496737026869e-05, + "loss": 0.2839, + "step": 27657 + }, + { + "epoch": 2.240602721970188, + "grad_norm": 0.06614869832992554, + "learning_rate": 9.774517305009226e-05, + "loss": 0.2442, + "step": 27658 + }, + { + "epoch": 2.2406837329876863, + "grad_norm": 0.1045251414179802, + "learning_rate": 9.774067239749765e-05, + "loss": 0.2695, + "step": 27659 + }, + { + "epoch": 2.2407647440051845, + "grad_norm": 0.0695669949054718, + "learning_rate": 9.773617174490302e-05, + "loss": 0.2693, + "step": 27660 + }, + { + "epoch": 2.2408457550226832, + "grad_norm": 0.06217452138662338, + "learning_rate": 9.773167109230838e-05, + "loss": 0.2216, + "step": 27661 + }, + { + "epoch": 2.2409267660401815, + "grad_norm": 0.05973656475543976, + "learning_rate": 9.772717043971377e-05, + "loss": 0.2175, + "step": 27662 + }, + { + "epoch": 2.2410077770576797, + "grad_norm": 0.06636808067560196, + "learning_rate": 9.772266978711914e-05, + "loss": 0.267, + "step": 27663 + }, + { + "epoch": 2.2410887880751784, + "grad_norm": 0.07036703824996948, + "learning_rate": 9.77181691345245e-05, + "loss": 0.2256, + "step": 27664 + }, + { + "epoch": 2.2411697990926767, + "grad_norm": 0.06427907943725586, + "learning_rate": 9.771366848192989e-05, + "loss": 0.2263, + "step": 27665 + }, + { + "epoch": 2.241250810110175, + "grad_norm": 0.06415683031082153, + "learning_rate": 9.770916782933526e-05, + "loss": 0.2586, + "step": 27666 + }, + { + "epoch": 2.241331821127673, + "grad_norm": 0.05050278455018997, + "learning_rate": 9.770466717674062e-05, + "loss": 0.208, + "step": 27667 + }, + { + "epoch": 2.241412832145172, + "grad_norm": 0.07526952028274536, + "learning_rate": 9.770016652414601e-05, + "loss": 0.2852, + "step": 27668 + }, + { + "epoch": 2.24149384316267, + "grad_norm": 0.051428694278001785, + "learning_rate": 9.769566587155138e-05, + "loss": 0.2508, + "step": 27669 + }, + { + "epoch": 2.2415748541801683, + "grad_norm": 0.06677006930112839, + "learning_rate": 
9.769116521895674e-05, + "loss": 0.2466, + "step": 27670 + }, + { + "epoch": 2.241655865197667, + "grad_norm": 0.06048748269677162, + "learning_rate": 9.768666456636213e-05, + "loss": 0.2282, + "step": 27671 + }, + { + "epoch": 2.2417368762151653, + "grad_norm": 0.06114950403571129, + "learning_rate": 9.76821639137675e-05, + "loss": 0.2255, + "step": 27672 + }, + { + "epoch": 2.2418178872326635, + "grad_norm": 0.07318947464227676, + "learning_rate": 9.767766326117287e-05, + "loss": 0.2637, + "step": 27673 + }, + { + "epoch": 2.241898898250162, + "grad_norm": 0.06158239394426346, + "learning_rate": 9.767316260857825e-05, + "loss": 0.2701, + "step": 27674 + }, + { + "epoch": 2.2419799092676604, + "grad_norm": 0.058391667902469635, + "learning_rate": 9.766866195598363e-05, + "loss": 0.2541, + "step": 27675 + }, + { + "epoch": 2.2420609202851587, + "grad_norm": 0.06646616011857986, + "learning_rate": 9.7664161303389e-05, + "loss": 0.2838, + "step": 27676 + }, + { + "epoch": 2.2421419313026574, + "grad_norm": 0.06191658973693848, + "learning_rate": 9.765966065079437e-05, + "loss": 0.2369, + "step": 27677 + }, + { + "epoch": 2.2422229423201556, + "grad_norm": 0.07929068803787231, + "learning_rate": 9.765515999819975e-05, + "loss": 0.2425, + "step": 27678 + }, + { + "epoch": 2.242303953337654, + "grad_norm": 0.06208163872361183, + "learning_rate": 9.765065934560512e-05, + "loss": 0.2378, + "step": 27679 + }, + { + "epoch": 2.242384964355152, + "grad_norm": 0.09012886881828308, + "learning_rate": 9.76461586930105e-05, + "loss": 0.2697, + "step": 27680 + }, + { + "epoch": 2.242465975372651, + "grad_norm": 0.07423432171344757, + "learning_rate": 9.764165804041587e-05, + "loss": 0.2782, + "step": 27681 + }, + { + "epoch": 2.242546986390149, + "grad_norm": 0.06949017196893692, + "learning_rate": 9.763715738782124e-05, + "loss": 0.2715, + "step": 27682 + }, + { + "epoch": 2.2426279974076473, + "grad_norm": 0.0629178062081337, + "learning_rate": 9.763265673522662e-05, + "loss": 0.2575, + "step": 27683 + }, + { + "epoch": 2.242709008425146, + "grad_norm": 0.0800800770521164, + "learning_rate": 9.762815608263199e-05, + "loss": 0.2946, + "step": 27684 + }, + { + "epoch": 2.2427900194426442, + "grad_norm": 0.0748923197388649, + "learning_rate": 9.762365543003736e-05, + "loss": 0.2647, + "step": 27685 + }, + { + "epoch": 2.2428710304601425, + "grad_norm": 0.06776215136051178, + "learning_rate": 9.761915477744274e-05, + "loss": 0.2396, + "step": 27686 + }, + { + "epoch": 2.242952041477641, + "grad_norm": 0.06588950008153915, + "learning_rate": 9.761465412484811e-05, + "loss": 0.2286, + "step": 27687 + }, + { + "epoch": 2.2430330524951394, + "grad_norm": 0.06949547678232193, + "learning_rate": 9.761015347225348e-05, + "loss": 0.2577, + "step": 27688 + }, + { + "epoch": 2.2431140635126376, + "grad_norm": 0.058526843786239624, + "learning_rate": 9.760565281965886e-05, + "loss": 0.2679, + "step": 27689 + }, + { + "epoch": 2.243195074530136, + "grad_norm": 0.06525886803865433, + "learning_rate": 9.760115216706423e-05, + "loss": 0.2511, + "step": 27690 + }, + { + "epoch": 2.2432760855476346, + "grad_norm": 0.06647324562072754, + "learning_rate": 9.75966515144696e-05, + "loss": 0.2755, + "step": 27691 + }, + { + "epoch": 2.243357096565133, + "grad_norm": 0.06325287371873856, + "learning_rate": 9.759215086187498e-05, + "loss": 0.2338, + "step": 27692 + }, + { + "epoch": 2.243438107582631, + "grad_norm": 0.09324182569980621, + "learning_rate": 9.758765020928035e-05, + "loss": 0.2508, + "step": 27693 + }, + { + 
"epoch": 2.2435191186001298, + "grad_norm": 0.06293612718582153, + "learning_rate": 9.758314955668572e-05, + "loss": 0.2608, + "step": 27694 + }, + { + "epoch": 2.243600129617628, + "grad_norm": 0.06111268326640129, + "learning_rate": 9.75786489040911e-05, + "loss": 0.244, + "step": 27695 + }, + { + "epoch": 2.2436811406351262, + "grad_norm": 0.05974910780787468, + "learning_rate": 9.757414825149647e-05, + "loss": 0.2458, + "step": 27696 + }, + { + "epoch": 2.243762151652625, + "grad_norm": 0.07850635796785355, + "learning_rate": 9.756964759890185e-05, + "loss": 0.2657, + "step": 27697 + }, + { + "epoch": 2.243843162670123, + "grad_norm": 0.06100136414170265, + "learning_rate": 9.756514694630722e-05, + "loss": 0.2265, + "step": 27698 + }, + { + "epoch": 2.2439241736876214, + "grad_norm": 0.07412116229534149, + "learning_rate": 9.756064629371259e-05, + "loss": 0.3002, + "step": 27699 + }, + { + "epoch": 2.24400518470512, + "grad_norm": 0.06798642128705978, + "learning_rate": 9.755614564111797e-05, + "loss": 0.2751, + "step": 27700 + }, + { + "epoch": 2.2440861957226184, + "grad_norm": 0.07323481142520905, + "learning_rate": 9.755164498852334e-05, + "loss": 0.2393, + "step": 27701 + }, + { + "epoch": 2.2441672067401166, + "grad_norm": 0.07551179081201553, + "learning_rate": 9.754714433592871e-05, + "loss": 0.2398, + "step": 27702 + }, + { + "epoch": 2.244248217757615, + "grad_norm": 0.06301606446504593, + "learning_rate": 9.754264368333409e-05, + "loss": 0.2451, + "step": 27703 + }, + { + "epoch": 2.2443292287751135, + "grad_norm": 0.060894306749105453, + "learning_rate": 9.753814303073946e-05, + "loss": 0.2531, + "step": 27704 + }, + { + "epoch": 2.244410239792612, + "grad_norm": 0.06674228608608246, + "learning_rate": 9.753364237814483e-05, + "loss": 0.2386, + "step": 27705 + }, + { + "epoch": 2.24449125081011, + "grad_norm": 0.06235579773783684, + "learning_rate": 9.752914172555021e-05, + "loss": 0.243, + "step": 27706 + }, + { + "epoch": 2.2445722618276087, + "grad_norm": 0.07511220872402191, + "learning_rate": 9.752464107295558e-05, + "loss": 0.2352, + "step": 27707 + }, + { + "epoch": 2.244653272845107, + "grad_norm": 0.07888006418943405, + "learning_rate": 9.752014042036096e-05, + "loss": 0.2753, + "step": 27708 + }, + { + "epoch": 2.244734283862605, + "grad_norm": 0.0688016340136528, + "learning_rate": 9.751563976776633e-05, + "loss": 0.2296, + "step": 27709 + }, + { + "epoch": 2.244815294880104, + "grad_norm": 0.07113447785377502, + "learning_rate": 9.75111391151717e-05, + "loss": 0.2865, + "step": 27710 + }, + { + "epoch": 2.244896305897602, + "grad_norm": 0.06750037521123886, + "learning_rate": 9.750663846257708e-05, + "loss": 0.2499, + "step": 27711 + }, + { + "epoch": 2.2449773169151004, + "grad_norm": 0.050355907529592514, + "learning_rate": 9.750213780998245e-05, + "loss": 0.2083, + "step": 27712 + }, + { + "epoch": 2.2450583279325986, + "grad_norm": 0.06752597540616989, + "learning_rate": 9.749763715738782e-05, + "loss": 0.286, + "step": 27713 + }, + { + "epoch": 2.2451393389500973, + "grad_norm": 0.05738743022084236, + "learning_rate": 9.74931365047932e-05, + "loss": 0.267, + "step": 27714 + }, + { + "epoch": 2.2452203499675956, + "grad_norm": 0.08378364890813828, + "learning_rate": 9.748863585219857e-05, + "loss": 0.263, + "step": 27715 + }, + { + "epoch": 2.245301360985094, + "grad_norm": 0.06476253271102905, + "learning_rate": 9.748413519960394e-05, + "loss": 0.2629, + "step": 27716 + }, + { + "epoch": 2.2453823720025925, + "grad_norm": 0.06573726236820221, + 
"learning_rate": 9.747963454700932e-05, + "loss": 0.2835, + "step": 27717 + }, + { + "epoch": 2.2454633830200907, + "grad_norm": 0.05313044786453247, + "learning_rate": 9.747513389441469e-05, + "loss": 0.2232, + "step": 27718 + }, + { + "epoch": 2.245544394037589, + "grad_norm": 0.06361450999975204, + "learning_rate": 9.747063324182006e-05, + "loss": 0.2716, + "step": 27719 + }, + { + "epoch": 2.2456254050550877, + "grad_norm": 0.06137660890817642, + "learning_rate": 9.746613258922544e-05, + "loss": 0.2451, + "step": 27720 + }, + { + "epoch": 2.245706416072586, + "grad_norm": 0.060190510004758835, + "learning_rate": 9.746163193663081e-05, + "loss": 0.2378, + "step": 27721 + }, + { + "epoch": 2.245787427090084, + "grad_norm": 0.05988527089357376, + "learning_rate": 9.745713128403619e-05, + "loss": 0.2593, + "step": 27722 + }, + { + "epoch": 2.245868438107583, + "grad_norm": 0.06177292391657829, + "learning_rate": 9.745263063144157e-05, + "loss": 0.2269, + "step": 27723 + }, + { + "epoch": 2.245949449125081, + "grad_norm": 0.05540267378091812, + "learning_rate": 9.744812997884693e-05, + "loss": 0.2415, + "step": 27724 + }, + { + "epoch": 2.2460304601425793, + "grad_norm": 0.07458721846342087, + "learning_rate": 9.74436293262523e-05, + "loss": 0.228, + "step": 27725 + }, + { + "epoch": 2.2461114711600776, + "grad_norm": 0.0722312182188034, + "learning_rate": 9.74391286736577e-05, + "loss": 0.2973, + "step": 27726 + }, + { + "epoch": 2.2461924821775763, + "grad_norm": 0.06115707382559776, + "learning_rate": 9.743462802106305e-05, + "loss": 0.2307, + "step": 27727 + }, + { + "epoch": 2.2462734931950745, + "grad_norm": 0.06007055193185806, + "learning_rate": 9.743012736846844e-05, + "loss": 0.2421, + "step": 27728 + }, + { + "epoch": 2.2463545042125728, + "grad_norm": 0.0731889083981514, + "learning_rate": 9.742562671587381e-05, + "loss": 0.2972, + "step": 27729 + }, + { + "epoch": 2.2464355152300715, + "grad_norm": 0.0577794648706913, + "learning_rate": 9.742112606327917e-05, + "loss": 0.2105, + "step": 27730 + }, + { + "epoch": 2.2465165262475697, + "grad_norm": 0.06230419501662254, + "learning_rate": 9.741662541068456e-05, + "loss": 0.2752, + "step": 27731 + }, + { + "epoch": 2.246597537265068, + "grad_norm": 0.07523570954799652, + "learning_rate": 9.741212475808994e-05, + "loss": 0.2258, + "step": 27732 + }, + { + "epoch": 2.2466785482825666, + "grad_norm": 0.07175968587398529, + "learning_rate": 9.74076241054953e-05, + "loss": 0.275, + "step": 27733 + }, + { + "epoch": 2.246759559300065, + "grad_norm": 0.07299905270338058, + "learning_rate": 9.740312345290068e-05, + "loss": 0.2665, + "step": 27734 + }, + { + "epoch": 2.246840570317563, + "grad_norm": 0.0867857038974762, + "learning_rate": 9.739862280030606e-05, + "loss": 0.2812, + "step": 27735 + }, + { + "epoch": 2.2469215813350614, + "grad_norm": 0.06806836277246475, + "learning_rate": 9.739412214771142e-05, + "loss": 0.2524, + "step": 27736 + }, + { + "epoch": 2.24700259235256, + "grad_norm": 0.07593537122011185, + "learning_rate": 9.73896214951168e-05, + "loss": 0.2734, + "step": 27737 + }, + { + "epoch": 2.2470836033700583, + "grad_norm": 0.07448563724756241, + "learning_rate": 9.738512084252218e-05, + "loss": 0.2473, + "step": 27738 + }, + { + "epoch": 2.2471646143875565, + "grad_norm": 0.06869513541460037, + "learning_rate": 9.738062018992754e-05, + "loss": 0.2335, + "step": 27739 + }, + { + "epoch": 2.2472456254050552, + "grad_norm": 0.05957813560962677, + "learning_rate": 9.737611953733292e-05, + "loss": 0.2562, + "step": 27740 + 
}, + { + "epoch": 2.2473266364225535, + "grad_norm": 0.06571602821350098, + "learning_rate": 9.73716188847383e-05, + "loss": 0.2691, + "step": 27741 + }, + { + "epoch": 2.2474076474400517, + "grad_norm": 0.06896911561489105, + "learning_rate": 9.736711823214366e-05, + "loss": 0.2493, + "step": 27742 + }, + { + "epoch": 2.2474886584575504, + "grad_norm": 0.0788666233420372, + "learning_rate": 9.736261757954904e-05, + "loss": 0.2619, + "step": 27743 + }, + { + "epoch": 2.2475696694750487, + "grad_norm": 0.08098050951957703, + "learning_rate": 9.735811692695442e-05, + "loss": 0.2563, + "step": 27744 + }, + { + "epoch": 2.247650680492547, + "grad_norm": 0.061166420578956604, + "learning_rate": 9.735361627435978e-05, + "loss": 0.2636, + "step": 27745 + }, + { + "epoch": 2.2477316915100456, + "grad_norm": 0.07056838274002075, + "learning_rate": 9.734911562176517e-05, + "loss": 0.2773, + "step": 27746 + }, + { + "epoch": 2.247812702527544, + "grad_norm": 0.06341052800416946, + "learning_rate": 9.734461496917054e-05, + "loss": 0.2722, + "step": 27747 + }, + { + "epoch": 2.247893713545042, + "grad_norm": 0.08864749222993851, + "learning_rate": 9.73401143165759e-05, + "loss": 0.2749, + "step": 27748 + }, + { + "epoch": 2.2479747245625403, + "grad_norm": 0.06531281769275665, + "learning_rate": 9.733561366398129e-05, + "loss": 0.2412, + "step": 27749 + }, + { + "epoch": 2.248055735580039, + "grad_norm": 0.05753309279680252, + "learning_rate": 9.733111301138666e-05, + "loss": 0.2593, + "step": 27750 + }, + { + "epoch": 2.2481367465975373, + "grad_norm": 0.06826810538768768, + "learning_rate": 9.732661235879202e-05, + "loss": 0.2313, + "step": 27751 + }, + { + "epoch": 2.2482177576150355, + "grad_norm": 0.06804418563842773, + "learning_rate": 9.732211170619741e-05, + "loss": 0.2549, + "step": 27752 + }, + { + "epoch": 2.248298768632534, + "grad_norm": 0.07234552502632141, + "learning_rate": 9.731761105360278e-05, + "loss": 0.2672, + "step": 27753 + }, + { + "epoch": 2.2483797796500324, + "grad_norm": 0.0556609183549881, + "learning_rate": 9.731311040100815e-05, + "loss": 0.2281, + "step": 27754 + }, + { + "epoch": 2.2484607906675307, + "grad_norm": 0.07722504436969757, + "learning_rate": 9.730860974841353e-05, + "loss": 0.2918, + "step": 27755 + }, + { + "epoch": 2.248541801685029, + "grad_norm": 0.060491111129522324, + "learning_rate": 9.73041090958189e-05, + "loss": 0.2375, + "step": 27756 + }, + { + "epoch": 2.2486228127025276, + "grad_norm": 0.05507529154419899, + "learning_rate": 9.729960844322428e-05, + "loss": 0.2338, + "step": 27757 + }, + { + "epoch": 2.248703823720026, + "grad_norm": 0.07224796712398529, + "learning_rate": 9.729510779062965e-05, + "loss": 0.2423, + "step": 27758 + }, + { + "epoch": 2.248784834737524, + "grad_norm": 0.06265775114297867, + "learning_rate": 9.729060713803502e-05, + "loss": 0.2582, + "step": 27759 + }, + { + "epoch": 2.248865845755023, + "grad_norm": 0.08026780188083649, + "learning_rate": 9.72861064854404e-05, + "loss": 0.2753, + "step": 27760 + }, + { + "epoch": 2.248946856772521, + "grad_norm": 0.051603544503450394, + "learning_rate": 9.728160583284577e-05, + "loss": 0.2458, + "step": 27761 + }, + { + "epoch": 2.2490278677900193, + "grad_norm": 0.07035865634679794, + "learning_rate": 9.727710518025114e-05, + "loss": 0.2556, + "step": 27762 + }, + { + "epoch": 2.249108878807518, + "grad_norm": 0.05800047889351845, + "learning_rate": 9.727260452765652e-05, + "loss": 0.2377, + "step": 27763 + }, + { + "epoch": 2.249189889825016, + "grad_norm": 0.06574030965566635, 
+ "learning_rate": 9.726810387506189e-05, + "loss": 0.2641, + "step": 27764 + }, + { + "epoch": 2.2492709008425145, + "grad_norm": 0.06430261582136154, + "learning_rate": 9.726360322246726e-05, + "loss": 0.2426, + "step": 27765 + }, + { + "epoch": 2.249351911860013, + "grad_norm": 0.06490317732095718, + "learning_rate": 9.725910256987264e-05, + "loss": 0.2361, + "step": 27766 + }, + { + "epoch": 2.2494329228775114, + "grad_norm": 0.0709247961640358, + "learning_rate": 9.725460191727801e-05, + "loss": 0.2505, + "step": 27767 + }, + { + "epoch": 2.2495139338950096, + "grad_norm": 0.05828109383583069, + "learning_rate": 9.725010126468338e-05, + "loss": 0.2168, + "step": 27768 + }, + { + "epoch": 2.2495949449125083, + "grad_norm": 0.0659540668129921, + "learning_rate": 9.724560061208876e-05, + "loss": 0.2212, + "step": 27769 + }, + { + "epoch": 2.2496759559300066, + "grad_norm": 0.07301966100931168, + "learning_rate": 9.724109995949413e-05, + "loss": 0.2457, + "step": 27770 + }, + { + "epoch": 2.249756966947505, + "grad_norm": 0.06525655835866928, + "learning_rate": 9.72365993068995e-05, + "loss": 0.2453, + "step": 27771 + }, + { + "epoch": 2.249837977965003, + "grad_norm": 0.0701277032494545, + "learning_rate": 9.723209865430488e-05, + "loss": 0.2612, + "step": 27772 + }, + { + "epoch": 2.2499189889825018, + "grad_norm": 0.05551356077194214, + "learning_rate": 9.722759800171025e-05, + "loss": 0.2751, + "step": 27773 + }, + { + "epoch": 2.25, + "grad_norm": 0.07594873756170273, + "learning_rate": 9.722309734911563e-05, + "loss": 0.2741, + "step": 27774 + }, + { + "epoch": 2.2500810110174982, + "grad_norm": 0.07346559315919876, + "learning_rate": 9.7218596696521e-05, + "loss": 0.2472, + "step": 27775 + }, + { + "epoch": 2.250162022034997, + "grad_norm": 0.06441164016723633, + "learning_rate": 9.721409604392637e-05, + "loss": 0.2858, + "step": 27776 + }, + { + "epoch": 2.250243033052495, + "grad_norm": 0.07367736846208572, + "learning_rate": 9.720959539133175e-05, + "loss": 0.2576, + "step": 27777 + }, + { + "epoch": 2.2503240440699934, + "grad_norm": 0.07037994265556335, + "learning_rate": 9.720509473873712e-05, + "loss": 0.2483, + "step": 27778 + }, + { + "epoch": 2.2504050550874917, + "grad_norm": 0.062301624566316605, + "learning_rate": 9.72005940861425e-05, + "loss": 0.2298, + "step": 27779 + }, + { + "epoch": 2.2504860661049904, + "grad_norm": 0.06465239077806473, + "learning_rate": 9.719609343354787e-05, + "loss": 0.2167, + "step": 27780 + }, + { + "epoch": 2.2505670771224886, + "grad_norm": 0.06765540689229965, + "learning_rate": 9.719159278095324e-05, + "loss": 0.2624, + "step": 27781 + }, + { + "epoch": 2.250648088139987, + "grad_norm": 0.07314081490039825, + "learning_rate": 9.718709212835862e-05, + "loss": 0.2735, + "step": 27782 + }, + { + "epoch": 2.2507290991574855, + "grad_norm": 0.056773893535137177, + "learning_rate": 9.718259147576399e-05, + "loss": 0.2679, + "step": 27783 + }, + { + "epoch": 2.250810110174984, + "grad_norm": 0.07499083131551743, + "learning_rate": 9.717809082316936e-05, + "loss": 0.2742, + "step": 27784 + }, + { + "epoch": 2.250891121192482, + "grad_norm": 0.055659789592027664, + "learning_rate": 9.717359017057474e-05, + "loss": 0.2459, + "step": 27785 + }, + { + "epoch": 2.2509721322099807, + "grad_norm": 0.057748425751924515, + "learning_rate": 9.716908951798011e-05, + "loss": 0.2191, + "step": 27786 + }, + { + "epoch": 2.251053143227479, + "grad_norm": 0.06498988717794418, + "learning_rate": 9.716458886538548e-05, + "loss": 0.2336, + "step": 27787 + }, + { 
+ "epoch": 2.251134154244977, + "grad_norm": 0.05748360604047775, + "learning_rate": 9.716008821279086e-05, + "loss": 0.2329, + "step": 27788 + }, + { + "epoch": 2.251215165262476, + "grad_norm": 0.05783957615494728, + "learning_rate": 9.715558756019624e-05, + "loss": 0.2266, + "step": 27789 + }, + { + "epoch": 2.251296176279974, + "grad_norm": 0.05689896643161774, + "learning_rate": 9.71510869076016e-05, + "loss": 0.2332, + "step": 27790 + }, + { + "epoch": 2.2513771872974724, + "grad_norm": 0.05369093269109726, + "learning_rate": 9.714658625500698e-05, + "loss": 0.2255, + "step": 27791 + }, + { + "epoch": 2.251458198314971, + "grad_norm": 0.061454132199287415, + "learning_rate": 9.714208560241236e-05, + "loss": 0.2584, + "step": 27792 + }, + { + "epoch": 2.2515392093324693, + "grad_norm": 0.05191675201058388, + "learning_rate": 9.713758494981772e-05, + "loss": 0.2653, + "step": 27793 + }, + { + "epoch": 2.2516202203499676, + "grad_norm": 0.05647498741745949, + "learning_rate": 9.71330842972231e-05, + "loss": 0.2598, + "step": 27794 + }, + { + "epoch": 2.251701231367466, + "grad_norm": 0.05284734070301056, + "learning_rate": 9.712858364462849e-05, + "loss": 0.244, + "step": 27795 + }, + { + "epoch": 2.2517822423849645, + "grad_norm": 0.05807924270629883, + "learning_rate": 9.712408299203385e-05, + "loss": 0.2768, + "step": 27796 + }, + { + "epoch": 2.2518632534024627, + "grad_norm": 0.06968452781438828, + "learning_rate": 9.711958233943922e-05, + "loss": 0.2916, + "step": 27797 + }, + { + "epoch": 2.251944264419961, + "grad_norm": 0.05544663220643997, + "learning_rate": 9.71150816868446e-05, + "loss": 0.2459, + "step": 27798 + }, + { + "epoch": 2.2520252754374597, + "grad_norm": 0.05861636623740196, + "learning_rate": 9.711058103424997e-05, + "loss": 0.2588, + "step": 27799 + }, + { + "epoch": 2.252106286454958, + "grad_norm": 0.06427206844091415, + "learning_rate": 9.710608038165534e-05, + "loss": 0.229, + "step": 27800 + }, + { + "epoch": 2.252187297472456, + "grad_norm": 0.06308040022850037, + "learning_rate": 9.710157972906073e-05, + "loss": 0.2487, + "step": 27801 + }, + { + "epoch": 2.2522683084899544, + "grad_norm": 0.060957130044698715, + "learning_rate": 9.709707907646609e-05, + "loss": 0.2518, + "step": 27802 + }, + { + "epoch": 2.252349319507453, + "grad_norm": 0.048829443752765656, + "learning_rate": 9.709257842387146e-05, + "loss": 0.2253, + "step": 27803 + }, + { + "epoch": 2.2524303305249513, + "grad_norm": 0.05918252468109131, + "learning_rate": 9.708807777127685e-05, + "loss": 0.255, + "step": 27804 + }, + { + "epoch": 2.2525113415424496, + "grad_norm": 0.0695018544793129, + "learning_rate": 9.708357711868221e-05, + "loss": 0.256, + "step": 27805 + }, + { + "epoch": 2.2525923525599483, + "grad_norm": 0.07991789281368256, + "learning_rate": 9.70790764660876e-05, + "loss": 0.2932, + "step": 27806 + }, + { + "epoch": 2.2526733635774465, + "grad_norm": 0.06261714547872543, + "learning_rate": 9.707457581349297e-05, + "loss": 0.2423, + "step": 27807 + }, + { + "epoch": 2.2527543745949448, + "grad_norm": 0.0648646205663681, + "learning_rate": 9.707007516089833e-05, + "loss": 0.2338, + "step": 27808 + }, + { + "epoch": 2.2528353856124435, + "grad_norm": 0.07085174322128296, + "learning_rate": 9.706557450830372e-05, + "loss": 0.2525, + "step": 27809 + }, + { + "epoch": 2.2529163966299417, + "grad_norm": 0.0629393607378006, + "learning_rate": 9.706107385570909e-05, + "loss": 0.2756, + "step": 27810 + }, + { + "epoch": 2.25299740764744, + "grad_norm": 0.06812289357185364, + 
"learning_rate": 9.705657320311445e-05, + "loss": 0.2666, + "step": 27811 + }, + { + "epoch": 2.2530784186649386, + "grad_norm": 0.06094437837600708, + "learning_rate": 9.705207255051984e-05, + "loss": 0.2545, + "step": 27812 + }, + { + "epoch": 2.253159429682437, + "grad_norm": 0.06829962879419327, + "learning_rate": 9.704757189792521e-05, + "loss": 0.2416, + "step": 27813 + }, + { + "epoch": 2.253240440699935, + "grad_norm": 0.058218564838171005, + "learning_rate": 9.704307124533057e-05, + "loss": 0.2694, + "step": 27814 + }, + { + "epoch": 2.253321451717434, + "grad_norm": 0.05303672328591347, + "learning_rate": 9.703857059273596e-05, + "loss": 0.2362, + "step": 27815 + }, + { + "epoch": 2.253402462734932, + "grad_norm": 0.06274978071451187, + "learning_rate": 9.703406994014133e-05, + "loss": 0.2829, + "step": 27816 + }, + { + "epoch": 2.2534834737524303, + "grad_norm": 0.06936366856098175, + "learning_rate": 9.702956928754669e-05, + "loss": 0.2609, + "step": 27817 + }, + { + "epoch": 2.2535644847699285, + "grad_norm": 0.06225275993347168, + "learning_rate": 9.702506863495208e-05, + "loss": 0.2297, + "step": 27818 + }, + { + "epoch": 2.2536454957874272, + "grad_norm": 0.07567765563726425, + "learning_rate": 9.702056798235745e-05, + "loss": 0.3011, + "step": 27819 + }, + { + "epoch": 2.2537265068049255, + "grad_norm": 0.07268765568733215, + "learning_rate": 9.701606732976281e-05, + "loss": 0.2752, + "step": 27820 + }, + { + "epoch": 2.2538075178224237, + "grad_norm": 0.06192336603999138, + "learning_rate": 9.70115666771682e-05, + "loss": 0.255, + "step": 27821 + }, + { + "epoch": 2.2538885288399224, + "grad_norm": 0.06162412464618683, + "learning_rate": 9.700706602457357e-05, + "loss": 0.2628, + "step": 27822 + }, + { + "epoch": 2.2539695398574207, + "grad_norm": 0.07518964260816574, + "learning_rate": 9.700256537197893e-05, + "loss": 0.2581, + "step": 27823 + }, + { + "epoch": 2.254050550874919, + "grad_norm": 0.07366792857646942, + "learning_rate": 9.699806471938432e-05, + "loss": 0.281, + "step": 27824 + }, + { + "epoch": 2.254131561892417, + "grad_norm": 0.06474350392818451, + "learning_rate": 9.69935640667897e-05, + "loss": 0.2567, + "step": 27825 + }, + { + "epoch": 2.254212572909916, + "grad_norm": 0.07112695276737213, + "learning_rate": 9.698906341419505e-05, + "loss": 0.279, + "step": 27826 + }, + { + "epoch": 2.254293583927414, + "grad_norm": 0.06454798579216003, + "learning_rate": 9.698456276160044e-05, + "loss": 0.2568, + "step": 27827 + }, + { + "epoch": 2.2543745949449123, + "grad_norm": 0.07216446101665497, + "learning_rate": 9.698006210900581e-05, + "loss": 0.2637, + "step": 27828 + }, + { + "epoch": 2.254455605962411, + "grad_norm": 0.07267016917467117, + "learning_rate": 9.697556145641117e-05, + "loss": 0.2624, + "step": 27829 + }, + { + "epoch": 2.2545366169799093, + "grad_norm": 0.0549045130610466, + "learning_rate": 9.697106080381656e-05, + "loss": 0.2394, + "step": 27830 + }, + { + "epoch": 2.2546176279974075, + "grad_norm": 0.0676216334104538, + "learning_rate": 9.696656015122194e-05, + "loss": 0.3046, + "step": 27831 + }, + { + "epoch": 2.254698639014906, + "grad_norm": 0.05932784825563431, + "learning_rate": 9.69620594986273e-05, + "loss": 0.2855, + "step": 27832 + }, + { + "epoch": 2.2547796500324044, + "grad_norm": 0.06297644227743149, + "learning_rate": 9.695755884603268e-05, + "loss": 0.2341, + "step": 27833 + }, + { + "epoch": 2.2548606610499027, + "grad_norm": 0.06666389107704163, + "learning_rate": 9.695305819343806e-05, + "loss": 0.247, + "step": 27834 + 
}, + { + "epoch": 2.2549416720674014, + "grad_norm": 0.06561585515737534, + "learning_rate": 9.694855754084343e-05, + "loss": 0.2727, + "step": 27835 + }, + { + "epoch": 2.2550226830848996, + "grad_norm": 0.06304679065942764, + "learning_rate": 9.69440568882488e-05, + "loss": 0.261, + "step": 27836 + }, + { + "epoch": 2.255103694102398, + "grad_norm": 0.057998958975076675, + "learning_rate": 9.693955623565418e-05, + "loss": 0.2634, + "step": 27837 + }, + { + "epoch": 2.2551847051198965, + "grad_norm": 0.061838455498218536, + "learning_rate": 9.693505558305955e-05, + "loss": 0.2558, + "step": 27838 + }, + { + "epoch": 2.255265716137395, + "grad_norm": 0.06982436776161194, + "learning_rate": 9.693055493046492e-05, + "loss": 0.2729, + "step": 27839 + }, + { + "epoch": 2.255346727154893, + "grad_norm": 0.06714750081300735, + "learning_rate": 9.69260542778703e-05, + "loss": 0.2463, + "step": 27840 + }, + { + "epoch": 2.2554277381723913, + "grad_norm": 0.05839761346578598, + "learning_rate": 9.692155362527567e-05, + "loss": 0.224, + "step": 27841 + }, + { + "epoch": 2.25550874918989, + "grad_norm": 0.062319573014974594, + "learning_rate": 9.691705297268105e-05, + "loss": 0.2349, + "step": 27842 + }, + { + "epoch": 2.255589760207388, + "grad_norm": 0.0658549889922142, + "learning_rate": 9.691255232008642e-05, + "loss": 0.233, + "step": 27843 + }, + { + "epoch": 2.2556707712248865, + "grad_norm": 0.06968716531991959, + "learning_rate": 9.690805166749179e-05, + "loss": 0.2594, + "step": 27844 + }, + { + "epoch": 2.255751782242385, + "grad_norm": 0.062022265046834946, + "learning_rate": 9.690355101489717e-05, + "loss": 0.2612, + "step": 27845 + }, + { + "epoch": 2.2558327932598834, + "grad_norm": 0.06448287516832352, + "learning_rate": 9.689905036230254e-05, + "loss": 0.2406, + "step": 27846 + }, + { + "epoch": 2.2559138042773816, + "grad_norm": 0.04885780066251755, + "learning_rate": 9.689454970970791e-05, + "loss": 0.2206, + "step": 27847 + }, + { + "epoch": 2.25599481529488, + "grad_norm": 0.06826422363519669, + "learning_rate": 9.689004905711329e-05, + "loss": 0.24, + "step": 27848 + }, + { + "epoch": 2.2560758263123786, + "grad_norm": 0.054555803537368774, + "learning_rate": 9.688554840451866e-05, + "loss": 0.2357, + "step": 27849 + }, + { + "epoch": 2.256156837329877, + "grad_norm": 0.05679121986031532, + "learning_rate": 9.688104775192403e-05, + "loss": 0.2509, + "step": 27850 + }, + { + "epoch": 2.256237848347375, + "grad_norm": 0.07215198874473572, + "learning_rate": 9.687654709932941e-05, + "loss": 0.2869, + "step": 27851 + }, + { + "epoch": 2.2563188593648738, + "grad_norm": 0.06503754109144211, + "learning_rate": 9.687204644673478e-05, + "loss": 0.2314, + "step": 27852 + }, + { + "epoch": 2.256399870382372, + "grad_norm": 0.08150571584701538, + "learning_rate": 9.686754579414015e-05, + "loss": 0.2287, + "step": 27853 + }, + { + "epoch": 2.2564808813998702, + "grad_norm": 0.06819403171539307, + "learning_rate": 9.686304514154553e-05, + "loss": 0.272, + "step": 27854 + }, + { + "epoch": 2.256561892417369, + "grad_norm": 0.06893903017044067, + "learning_rate": 9.68585444889509e-05, + "loss": 0.2516, + "step": 27855 + }, + { + "epoch": 2.256642903434867, + "grad_norm": 0.07099400460720062, + "learning_rate": 9.685404383635628e-05, + "loss": 0.2557, + "step": 27856 + }, + { + "epoch": 2.2567239144523654, + "grad_norm": 0.05509239435195923, + "learning_rate": 9.684954318376165e-05, + "loss": 0.2763, + "step": 27857 + }, + { + "epoch": 2.256804925469864, + "grad_norm": 0.059497859328985214, + 
"learning_rate": 9.684504253116702e-05, + "loss": 0.2944, + "step": 27858 + }, + { + "epoch": 2.2568859364873624, + "grad_norm": 0.06433116644620895, + "learning_rate": 9.68405418785724e-05, + "loss": 0.227, + "step": 27859 + }, + { + "epoch": 2.2569669475048606, + "grad_norm": 0.062018293887376785, + "learning_rate": 9.683604122597777e-05, + "loss": 0.1944, + "step": 27860 + }, + { + "epoch": 2.2570479585223593, + "grad_norm": 0.07556930929422379, + "learning_rate": 9.683154057338316e-05, + "loss": 0.2528, + "step": 27861 + }, + { + "epoch": 2.2571289695398575, + "grad_norm": 0.06311879307031631, + "learning_rate": 9.682703992078852e-05, + "loss": 0.2752, + "step": 27862 + }, + { + "epoch": 2.2572099805573558, + "grad_norm": 0.06782593578100204, + "learning_rate": 9.682253926819389e-05, + "loss": 0.2638, + "step": 27863 + }, + { + "epoch": 2.257290991574854, + "grad_norm": 0.064161017537117, + "learning_rate": 9.681803861559928e-05, + "loss": 0.2636, + "step": 27864 + }, + { + "epoch": 2.2573720025923527, + "grad_norm": 0.08791672438383102, + "learning_rate": 9.681353796300464e-05, + "loss": 0.277, + "step": 27865 + }, + { + "epoch": 2.257453013609851, + "grad_norm": 0.07432711869478226, + "learning_rate": 9.680903731041001e-05, + "loss": 0.2761, + "step": 27866 + }, + { + "epoch": 2.257534024627349, + "grad_norm": 0.061404597014188766, + "learning_rate": 9.68045366578154e-05, + "loss": 0.2678, + "step": 27867 + }, + { + "epoch": 2.257615035644848, + "grad_norm": 0.0511782243847847, + "learning_rate": 9.680003600522076e-05, + "loss": 0.2169, + "step": 27868 + }, + { + "epoch": 2.257696046662346, + "grad_norm": 0.05862230435013771, + "learning_rate": 9.679553535262613e-05, + "loss": 0.2222, + "step": 27869 + }, + { + "epoch": 2.2577770576798444, + "grad_norm": 0.058833081275224686, + "learning_rate": 9.679103470003152e-05, + "loss": 0.2323, + "step": 27870 + }, + { + "epoch": 2.2578580686973426, + "grad_norm": 0.05861777067184448, + "learning_rate": 9.678653404743688e-05, + "loss": 0.2489, + "step": 27871 + }, + { + "epoch": 2.2579390797148413, + "grad_norm": 0.06305963546037674, + "learning_rate": 9.678203339484225e-05, + "loss": 0.2298, + "step": 27872 + }, + { + "epoch": 2.2580200907323396, + "grad_norm": 0.07361917197704315, + "learning_rate": 9.677753274224764e-05, + "loss": 0.2926, + "step": 27873 + }, + { + "epoch": 2.258101101749838, + "grad_norm": 0.07810232043266296, + "learning_rate": 9.6773032089653e-05, + "loss": 0.2722, + "step": 27874 + }, + { + "epoch": 2.2581821127673365, + "grad_norm": 0.07439389079809189, + "learning_rate": 9.676853143705837e-05, + "loss": 0.278, + "step": 27875 + }, + { + "epoch": 2.2582631237848347, + "grad_norm": 0.060669757425785065, + "learning_rate": 9.676403078446376e-05, + "loss": 0.2413, + "step": 27876 + }, + { + "epoch": 2.258344134802333, + "grad_norm": 0.051836349070072174, + "learning_rate": 9.675953013186912e-05, + "loss": 0.2208, + "step": 27877 + }, + { + "epoch": 2.2584251458198317, + "grad_norm": 0.06817534565925598, + "learning_rate": 9.67550294792745e-05, + "loss": 0.2213, + "step": 27878 + }, + { + "epoch": 2.25850615683733, + "grad_norm": 0.0720968171954155, + "learning_rate": 9.675052882667988e-05, + "loss": 0.2481, + "step": 27879 + }, + { + "epoch": 2.258587167854828, + "grad_norm": 0.05986291170120239, + "learning_rate": 9.674602817408524e-05, + "loss": 0.2508, + "step": 27880 + }, + { + "epoch": 2.258668178872327, + "grad_norm": 0.0641389712691307, + "learning_rate": 9.674152752149062e-05, + "loss": 0.2973, + "step": 27881 + 
}, + { + "epoch": 2.258749189889825, + "grad_norm": 0.06277509033679962, + "learning_rate": 9.6737026868896e-05, + "loss": 0.2439, + "step": 27882 + }, + { + "epoch": 2.2588302009073233, + "grad_norm": 0.060060031712055206, + "learning_rate": 9.673252621630136e-05, + "loss": 0.243, + "step": 27883 + }, + { + "epoch": 2.2589112119248216, + "grad_norm": 0.0587458573281765, + "learning_rate": 9.672802556370674e-05, + "loss": 0.2252, + "step": 27884 + }, + { + "epoch": 2.2589922229423203, + "grad_norm": 0.056831326335668564, + "learning_rate": 9.672352491111212e-05, + "loss": 0.2175, + "step": 27885 + }, + { + "epoch": 2.2590732339598185, + "grad_norm": 0.06255652010440826, + "learning_rate": 9.671902425851748e-05, + "loss": 0.219, + "step": 27886 + }, + { + "epoch": 2.2591542449773168, + "grad_norm": 0.06168430671095848, + "learning_rate": 9.671452360592287e-05, + "loss": 0.2411, + "step": 27887 + }, + { + "epoch": 2.2592352559948155, + "grad_norm": 0.06198149546980858, + "learning_rate": 9.671002295332824e-05, + "loss": 0.2581, + "step": 27888 + }, + { + "epoch": 2.2593162670123137, + "grad_norm": 0.06613697111606598, + "learning_rate": 9.67055223007336e-05, + "loss": 0.258, + "step": 27889 + }, + { + "epoch": 2.259397278029812, + "grad_norm": 0.059314556419849396, + "learning_rate": 9.670102164813899e-05, + "loss": 0.2882, + "step": 27890 + }, + { + "epoch": 2.2594782890473106, + "grad_norm": 0.04973381385207176, + "learning_rate": 9.669652099554437e-05, + "loss": 0.2104, + "step": 27891 + }, + { + "epoch": 2.259559300064809, + "grad_norm": 0.07927535474300385, + "learning_rate": 9.669202034294973e-05, + "loss": 0.2664, + "step": 27892 + }, + { + "epoch": 2.259640311082307, + "grad_norm": 0.06386194378137589, + "learning_rate": 9.668751969035511e-05, + "loss": 0.2277, + "step": 27893 + }, + { + "epoch": 2.2597213220998054, + "grad_norm": 0.07194879651069641, + "learning_rate": 9.668301903776049e-05, + "loss": 0.2376, + "step": 27894 + }, + { + "epoch": 2.259802333117304, + "grad_norm": 0.057715609669685364, + "learning_rate": 9.667851838516585e-05, + "loss": 0.212, + "step": 27895 + }, + { + "epoch": 2.2598833441348023, + "grad_norm": 0.0678049772977829, + "learning_rate": 9.667401773257123e-05, + "loss": 0.2616, + "step": 27896 + }, + { + "epoch": 2.2599643551523005, + "grad_norm": 0.06508383899927139, + "learning_rate": 9.66695170799766e-05, + "loss": 0.2513, + "step": 27897 + }, + { + "epoch": 2.2600453661697992, + "grad_norm": 0.06817775964736938, + "learning_rate": 9.666501642738197e-05, + "loss": 0.2594, + "step": 27898 + }, + { + "epoch": 2.2601263771872975, + "grad_norm": 0.04622168466448784, + "learning_rate": 9.666051577478735e-05, + "loss": 0.1943, + "step": 27899 + }, + { + "epoch": 2.2602073882047957, + "grad_norm": 0.06753421574831009, + "learning_rate": 9.665601512219273e-05, + "loss": 0.2419, + "step": 27900 + }, + { + "epoch": 2.2602883992222944, + "grad_norm": 0.05535978451371193, + "learning_rate": 9.665151446959809e-05, + "loss": 0.2371, + "step": 27901 + }, + { + "epoch": 2.2603694102397927, + "grad_norm": 0.05608188360929489, + "learning_rate": 9.664701381700347e-05, + "loss": 0.2404, + "step": 27902 + }, + { + "epoch": 2.260450421257291, + "grad_norm": 0.07442081719636917, + "learning_rate": 9.664251316440885e-05, + "loss": 0.2929, + "step": 27903 + }, + { + "epoch": 2.2605314322747896, + "grad_norm": 0.06551259756088257, + "learning_rate": 9.663801251181421e-05, + "loss": 0.2504, + "step": 27904 + }, + { + "epoch": 2.260612443292288, + "grad_norm": 
0.06035178527235985, + "learning_rate": 9.66335118592196e-05, + "loss": 0.2239, + "step": 27905 + }, + { + "epoch": 2.260693454309786, + "grad_norm": 0.05210309103131294, + "learning_rate": 9.662901120662497e-05, + "loss": 0.2309, + "step": 27906 + }, + { + "epoch": 2.2607744653272843, + "grad_norm": 0.05578669160604477, + "learning_rate": 9.662451055403033e-05, + "loss": 0.2555, + "step": 27907 + }, + { + "epoch": 2.260855476344783, + "grad_norm": 0.059826821088790894, + "learning_rate": 9.662000990143572e-05, + "loss": 0.2674, + "step": 27908 + }, + { + "epoch": 2.2609364873622813, + "grad_norm": 0.055484119802713394, + "learning_rate": 9.661550924884109e-05, + "loss": 0.2559, + "step": 27909 + }, + { + "epoch": 2.2610174983797795, + "grad_norm": 0.06662512570619583, + "learning_rate": 9.661100859624645e-05, + "loss": 0.2428, + "step": 27910 + }, + { + "epoch": 2.261098509397278, + "grad_norm": 0.06647738069295883, + "learning_rate": 9.660650794365184e-05, + "loss": 0.2705, + "step": 27911 + }, + { + "epoch": 2.2611795204147764, + "grad_norm": 0.07568782567977905, + "learning_rate": 9.660200729105721e-05, + "loss": 0.2397, + "step": 27912 + }, + { + "epoch": 2.2612605314322747, + "grad_norm": 0.05346573889255524, + "learning_rate": 9.659750663846258e-05, + "loss": 0.2155, + "step": 27913 + }, + { + "epoch": 2.261341542449773, + "grad_norm": 0.05152289569377899, + "learning_rate": 9.659300598586796e-05, + "loss": 0.2347, + "step": 27914 + }, + { + "epoch": 2.2614225534672716, + "grad_norm": 0.06577225774526596, + "learning_rate": 9.658850533327333e-05, + "loss": 0.2841, + "step": 27915 + }, + { + "epoch": 2.26150356448477, + "grad_norm": 0.061457157135009766, + "learning_rate": 9.65840046806787e-05, + "loss": 0.2433, + "step": 27916 + }, + { + "epoch": 2.261584575502268, + "grad_norm": 0.053514160215854645, + "learning_rate": 9.657950402808408e-05, + "loss": 0.2233, + "step": 27917 + }, + { + "epoch": 2.261665586519767, + "grad_norm": 0.06306682527065277, + "learning_rate": 9.657500337548945e-05, + "loss": 0.286, + "step": 27918 + }, + { + "epoch": 2.261746597537265, + "grad_norm": 0.06268003582954407, + "learning_rate": 9.657050272289483e-05, + "loss": 0.2605, + "step": 27919 + }, + { + "epoch": 2.2618276085547633, + "grad_norm": 0.0637531504034996, + "learning_rate": 9.65660020703002e-05, + "loss": 0.2675, + "step": 27920 + }, + { + "epoch": 2.261908619572262, + "grad_norm": 0.06470780819654465, + "learning_rate": 9.656150141770557e-05, + "loss": 0.2344, + "step": 27921 + }, + { + "epoch": 2.26198963058976, + "grad_norm": 0.06864067912101746, + "learning_rate": 9.655700076511095e-05, + "loss": 0.2639, + "step": 27922 + }, + { + "epoch": 2.2620706416072585, + "grad_norm": 0.06445959955453873, + "learning_rate": 9.655250011251632e-05, + "loss": 0.2367, + "step": 27923 + }, + { + "epoch": 2.262151652624757, + "grad_norm": 0.0648827999830246, + "learning_rate": 9.65479994599217e-05, + "loss": 0.2133, + "step": 27924 + }, + { + "epoch": 2.2622326636422554, + "grad_norm": 0.06934642791748047, + "learning_rate": 9.654349880732707e-05, + "loss": 0.2511, + "step": 27925 + }, + { + "epoch": 2.2623136746597536, + "grad_norm": 0.06981291621923447, + "learning_rate": 9.653899815473244e-05, + "loss": 0.2763, + "step": 27926 + }, + { + "epoch": 2.2623946856772523, + "grad_norm": 0.07649930566549301, + "learning_rate": 9.653449750213781e-05, + "loss": 0.245, + "step": 27927 + }, + { + "epoch": 2.2624756966947506, + "grad_norm": 0.06518231332302094, + "learning_rate": 9.652999684954319e-05, + "loss": 
0.2537, + "step": 27928 + }, + { + "epoch": 2.262556707712249, + "grad_norm": 0.051750268787145615, + "learning_rate": 9.652549619694856e-05, + "loss": 0.225, + "step": 27929 + }, + { + "epoch": 2.262637718729747, + "grad_norm": 0.07412702590227127, + "learning_rate": 9.652099554435394e-05, + "loss": 0.2511, + "step": 27930 + }, + { + "epoch": 2.2627187297472457, + "grad_norm": 0.057747166603803635, + "learning_rate": 9.651649489175931e-05, + "loss": 0.2292, + "step": 27931 + }, + { + "epoch": 2.262799740764744, + "grad_norm": 0.07066566497087479, + "learning_rate": 9.651199423916468e-05, + "loss": 0.2743, + "step": 27932 + }, + { + "epoch": 2.2628807517822422, + "grad_norm": 0.06702201068401337, + "learning_rate": 9.650749358657006e-05, + "loss": 0.2583, + "step": 27933 + }, + { + "epoch": 2.262961762799741, + "grad_norm": 0.0659802109003067, + "learning_rate": 9.650299293397543e-05, + "loss": 0.2497, + "step": 27934 + }, + { + "epoch": 2.263042773817239, + "grad_norm": 0.06113113835453987, + "learning_rate": 9.64984922813808e-05, + "loss": 0.2161, + "step": 27935 + }, + { + "epoch": 2.2631237848347374, + "grad_norm": 0.07334668189287186, + "learning_rate": 9.649399162878618e-05, + "loss": 0.2582, + "step": 27936 + }, + { + "epoch": 2.2632047958522357, + "grad_norm": 0.057873696088790894, + "learning_rate": 9.648949097619155e-05, + "loss": 0.249, + "step": 27937 + }, + { + "epoch": 2.2632858068697344, + "grad_norm": 0.05299917235970497, + "learning_rate": 9.648499032359692e-05, + "loss": 0.2305, + "step": 27938 + }, + { + "epoch": 2.2633668178872326, + "grad_norm": 0.06792672723531723, + "learning_rate": 9.648048967100231e-05, + "loss": 0.2545, + "step": 27939 + }, + { + "epoch": 2.263447828904731, + "grad_norm": 0.06435631215572357, + "learning_rate": 9.647598901840767e-05, + "loss": 0.2659, + "step": 27940 + }, + { + "epoch": 2.2635288399222295, + "grad_norm": 0.06368466466665268, + "learning_rate": 9.647148836581305e-05, + "loss": 0.2745, + "step": 27941 + }, + { + "epoch": 2.2636098509397278, + "grad_norm": 0.061373911798000336, + "learning_rate": 9.646698771321843e-05, + "loss": 0.2489, + "step": 27942 + }, + { + "epoch": 2.263690861957226, + "grad_norm": 0.06631971150636673, + "learning_rate": 9.646248706062379e-05, + "loss": 0.2395, + "step": 27943 + }, + { + "epoch": 2.2637718729747247, + "grad_norm": 0.058606911450624466, + "learning_rate": 9.645798640802917e-05, + "loss": 0.2348, + "step": 27944 + }, + { + "epoch": 2.263852883992223, + "grad_norm": 0.0627407506108284, + "learning_rate": 9.645348575543455e-05, + "loss": 0.2781, + "step": 27945 + }, + { + "epoch": 2.263933895009721, + "grad_norm": 0.05077546089887619, + "learning_rate": 9.644898510283991e-05, + "loss": 0.2521, + "step": 27946 + }, + { + "epoch": 2.26401490602722, + "grad_norm": 0.07406048476696014, + "learning_rate": 9.644448445024529e-05, + "loss": 0.2675, + "step": 27947 + }, + { + "epoch": 2.264095917044718, + "grad_norm": 0.057797353714704514, + "learning_rate": 9.643998379765067e-05, + "loss": 0.1935, + "step": 27948 + }, + { + "epoch": 2.2641769280622164, + "grad_norm": 0.05993032827973366, + "learning_rate": 9.643548314505603e-05, + "loss": 0.2157, + "step": 27949 + }, + { + "epoch": 2.264257939079715, + "grad_norm": 0.059989865869283676, + "learning_rate": 9.643098249246141e-05, + "loss": 0.2519, + "step": 27950 + }, + { + "epoch": 2.2643389500972133, + "grad_norm": 0.07011564821004868, + "learning_rate": 9.64264818398668e-05, + "loss": 0.2762, + "step": 27951 + }, + { + "epoch": 2.2644199611147116, + 
"grad_norm": 0.0700843557715416, + "learning_rate": 9.642198118727215e-05, + "loss": 0.2355, + "step": 27952 + }, + { + "epoch": 2.26450097213221, + "grad_norm": 0.06506705284118652, + "learning_rate": 9.641748053467753e-05, + "loss": 0.2306, + "step": 27953 + }, + { + "epoch": 2.2645819831497085, + "grad_norm": 0.06078203022480011, + "learning_rate": 9.641297988208292e-05, + "loss": 0.2461, + "step": 27954 + }, + { + "epoch": 2.2646629941672067, + "grad_norm": 0.04927067831158638, + "learning_rate": 9.640847922948828e-05, + "loss": 0.2106, + "step": 27955 + }, + { + "epoch": 2.264744005184705, + "grad_norm": 0.05114800110459328, + "learning_rate": 9.640397857689365e-05, + "loss": 0.2107, + "step": 27956 + }, + { + "epoch": 2.2648250162022037, + "grad_norm": 0.06491793692111969, + "learning_rate": 9.639947792429904e-05, + "loss": 0.2448, + "step": 27957 + }, + { + "epoch": 2.264906027219702, + "grad_norm": 0.07158529758453369, + "learning_rate": 9.63949772717044e-05, + "loss": 0.2538, + "step": 27958 + }, + { + "epoch": 2.2649870382372, + "grad_norm": 0.07513157278299332, + "learning_rate": 9.639047661910977e-05, + "loss": 0.2702, + "step": 27959 + }, + { + "epoch": 2.2650680492546984, + "grad_norm": 0.06532614678144455, + "learning_rate": 9.638597596651516e-05, + "loss": 0.2587, + "step": 27960 + }, + { + "epoch": 2.265149060272197, + "grad_norm": 0.06393644958734512, + "learning_rate": 9.638147531392052e-05, + "loss": 0.2593, + "step": 27961 + }, + { + "epoch": 2.2652300712896953, + "grad_norm": 0.058122579008340836, + "learning_rate": 9.637697466132589e-05, + "loss": 0.2569, + "step": 27962 + }, + { + "epoch": 2.2653110823071936, + "grad_norm": 0.05314813181757927, + "learning_rate": 9.637247400873128e-05, + "loss": 0.2269, + "step": 27963 + }, + { + "epoch": 2.2653920933246923, + "grad_norm": 0.06928487122058868, + "learning_rate": 9.636797335613664e-05, + "loss": 0.2635, + "step": 27964 + }, + { + "epoch": 2.2654731043421905, + "grad_norm": 0.05637259781360626, + "learning_rate": 9.636347270354203e-05, + "loss": 0.2504, + "step": 27965 + }, + { + "epoch": 2.2655541153596888, + "grad_norm": 0.07404623925685883, + "learning_rate": 9.63589720509474e-05, + "loss": 0.2725, + "step": 27966 + }, + { + "epoch": 2.2656351263771874, + "grad_norm": 0.059635065495967865, + "learning_rate": 9.635447139835276e-05, + "loss": 0.2214, + "step": 27967 + }, + { + "epoch": 2.2657161373946857, + "grad_norm": 0.07493023574352264, + "learning_rate": 9.634997074575815e-05, + "loss": 0.2995, + "step": 27968 + }, + { + "epoch": 2.265797148412184, + "grad_norm": 0.07327574491500854, + "learning_rate": 9.634547009316352e-05, + "loss": 0.2793, + "step": 27969 + }, + { + "epoch": 2.2658781594296826, + "grad_norm": 0.07068467885255814, + "learning_rate": 9.634096944056888e-05, + "loss": 0.2383, + "step": 27970 + }, + { + "epoch": 2.265959170447181, + "grad_norm": 0.07366050779819489, + "learning_rate": 9.633646878797427e-05, + "loss": 0.2369, + "step": 27971 + }, + { + "epoch": 2.266040181464679, + "grad_norm": 0.05504010245203972, + "learning_rate": 9.633196813537964e-05, + "loss": 0.2442, + "step": 27972 + }, + { + "epoch": 2.266121192482178, + "grad_norm": 0.05307780206203461, + "learning_rate": 9.6327467482785e-05, + "loss": 0.2648, + "step": 27973 + }, + { + "epoch": 2.266202203499676, + "grad_norm": 0.058790065348148346, + "learning_rate": 9.632296683019039e-05, + "loss": 0.2295, + "step": 27974 + }, + { + "epoch": 2.2662832145171743, + "grad_norm": 0.07024440169334412, + "learning_rate": 
9.631846617759576e-05, + "loss": 0.2316, + "step": 27975 + }, + { + "epoch": 2.2663642255346725, + "grad_norm": 0.07419854402542114, + "learning_rate": 9.631396552500112e-05, + "loss": 0.2755, + "step": 27976 + }, + { + "epoch": 2.2664452365521712, + "grad_norm": 0.0637488067150116, + "learning_rate": 9.630946487240651e-05, + "loss": 0.264, + "step": 27977 + }, + { + "epoch": 2.2665262475696695, + "grad_norm": 0.08120803534984589, + "learning_rate": 9.630496421981188e-05, + "loss": 0.2495, + "step": 27978 + }, + { + "epoch": 2.2666072585871677, + "grad_norm": 0.06741491705179214, + "learning_rate": 9.630046356721724e-05, + "loss": 0.2777, + "step": 27979 + }, + { + "epoch": 2.2666882696046664, + "grad_norm": 0.06208684667944908, + "learning_rate": 9.629596291462263e-05, + "loss": 0.2689, + "step": 27980 + }, + { + "epoch": 2.2667692806221647, + "grad_norm": 0.07637951523065567, + "learning_rate": 9.6291462262028e-05, + "loss": 0.3162, + "step": 27981 + }, + { + "epoch": 2.266850291639663, + "grad_norm": 0.0612499862909317, + "learning_rate": 9.628696160943336e-05, + "loss": 0.26, + "step": 27982 + }, + { + "epoch": 2.266931302657161, + "grad_norm": 0.07130424678325653, + "learning_rate": 9.628246095683875e-05, + "loss": 0.2524, + "step": 27983 + }, + { + "epoch": 2.26701231367466, + "grad_norm": 0.07146918028593063, + "learning_rate": 9.627796030424412e-05, + "loss": 0.2477, + "step": 27984 + }, + { + "epoch": 2.267093324692158, + "grad_norm": 0.06923989951610565, + "learning_rate": 9.627345965164948e-05, + "loss": 0.2463, + "step": 27985 + }, + { + "epoch": 2.2671743357096563, + "grad_norm": 0.05341016501188278, + "learning_rate": 9.626895899905487e-05, + "loss": 0.2214, + "step": 27986 + }, + { + "epoch": 2.267255346727155, + "grad_norm": 0.07137572765350342, + "learning_rate": 9.626445834646024e-05, + "loss": 0.2466, + "step": 27987 + }, + { + "epoch": 2.2673363577446533, + "grad_norm": 0.060765862464904785, + "learning_rate": 9.62599576938656e-05, + "loss": 0.2313, + "step": 27988 + }, + { + "epoch": 2.2674173687621515, + "grad_norm": 0.06152436509728432, + "learning_rate": 9.625545704127099e-05, + "loss": 0.2466, + "step": 27989 + }, + { + "epoch": 2.26749837977965, + "grad_norm": 0.05408678948879242, + "learning_rate": 9.625095638867637e-05, + "loss": 0.2439, + "step": 27990 + }, + { + "epoch": 2.2675793907971484, + "grad_norm": 0.053834687918424606, + "learning_rate": 9.624645573608173e-05, + "loss": 0.246, + "step": 27991 + }, + { + "epoch": 2.2676604018146467, + "grad_norm": 0.06949444115161896, + "learning_rate": 9.624195508348711e-05, + "loss": 0.2376, + "step": 27992 + }, + { + "epoch": 2.2677414128321454, + "grad_norm": 0.060450479388237, + "learning_rate": 9.623745443089249e-05, + "loss": 0.2117, + "step": 27993 + }, + { + "epoch": 2.2678224238496436, + "grad_norm": 0.05990859866142273, + "learning_rate": 9.623295377829786e-05, + "loss": 0.274, + "step": 27994 + }, + { + "epoch": 2.267903434867142, + "grad_norm": 0.0631098821759224, + "learning_rate": 9.622845312570323e-05, + "loss": 0.2882, + "step": 27995 + }, + { + "epoch": 2.2679844458846405, + "grad_norm": 0.06892747431993484, + "learning_rate": 9.622395247310861e-05, + "loss": 0.227, + "step": 27996 + }, + { + "epoch": 2.268065456902139, + "grad_norm": 0.05815494433045387, + "learning_rate": 9.621945182051398e-05, + "loss": 0.2431, + "step": 27997 + }, + { + "epoch": 2.268146467919637, + "grad_norm": 0.051642052829265594, + "learning_rate": 9.621495116791935e-05, + "loss": 0.2257, + "step": 27998 + }, + { + "epoch": 
2.2682274789371353, + "grad_norm": 0.05603569746017456, + "learning_rate": 9.621045051532473e-05, + "loss": 0.26, + "step": 27999 + }, + { + "epoch": 2.268308489954634, + "grad_norm": 0.0718226209282875, + "learning_rate": 9.62059498627301e-05, + "loss": 0.2461, + "step": 28000 + }, + { + "epoch": 2.268389500972132, + "grad_norm": 0.0677584558725357, + "learning_rate": 9.620144921013548e-05, + "loss": 0.2434, + "step": 28001 + }, + { + "epoch": 2.2684705119896305, + "grad_norm": 0.05520008131861687, + "learning_rate": 9.619694855754085e-05, + "loss": 0.2681, + "step": 28002 + }, + { + "epoch": 2.268551523007129, + "grad_norm": 0.06791157275438309, + "learning_rate": 9.619244790494622e-05, + "loss": 0.2695, + "step": 28003 + }, + { + "epoch": 2.2686325340246274, + "grad_norm": 0.05662940442562103, + "learning_rate": 9.61879472523516e-05, + "loss": 0.2663, + "step": 28004 + }, + { + "epoch": 2.2687135450421256, + "grad_norm": 0.06527923047542572, + "learning_rate": 9.618344659975697e-05, + "loss": 0.2916, + "step": 28005 + }, + { + "epoch": 2.268794556059624, + "grad_norm": 0.05786988511681557, + "learning_rate": 9.617894594716234e-05, + "loss": 0.231, + "step": 28006 + }, + { + "epoch": 2.2688755670771226, + "grad_norm": 0.06730439513921738, + "learning_rate": 9.617444529456772e-05, + "loss": 0.2724, + "step": 28007 + }, + { + "epoch": 2.268956578094621, + "grad_norm": 0.05449024215340614, + "learning_rate": 9.616994464197309e-05, + "loss": 0.2368, + "step": 28008 + }, + { + "epoch": 2.269037589112119, + "grad_norm": 0.0632796511054039, + "learning_rate": 9.616544398937846e-05, + "loss": 0.2399, + "step": 28009 + }, + { + "epoch": 2.2691186001296177, + "grad_norm": 0.06255123019218445, + "learning_rate": 9.616094333678384e-05, + "loss": 0.2599, + "step": 28010 + }, + { + "epoch": 2.269199611147116, + "grad_norm": 0.060231465846300125, + "learning_rate": 9.615644268418921e-05, + "loss": 0.2241, + "step": 28011 + }, + { + "epoch": 2.2692806221646142, + "grad_norm": 0.060310233384370804, + "learning_rate": 9.615194203159458e-05, + "loss": 0.2308, + "step": 28012 + }, + { + "epoch": 2.269361633182113, + "grad_norm": 0.08699499070644379, + "learning_rate": 9.614744137899996e-05, + "loss": 0.2961, + "step": 28013 + }, + { + "epoch": 2.269442644199611, + "grad_norm": 0.06907127052545547, + "learning_rate": 9.614294072640533e-05, + "loss": 0.239, + "step": 28014 + }, + { + "epoch": 2.2695236552171094, + "grad_norm": 0.06631629168987274, + "learning_rate": 9.61384400738107e-05, + "loss": 0.2425, + "step": 28015 + }, + { + "epoch": 2.269604666234608, + "grad_norm": 0.07456733286380768, + "learning_rate": 9.613393942121608e-05, + "loss": 0.2669, + "step": 28016 + }, + { + "epoch": 2.2696856772521063, + "grad_norm": 0.07179507613182068, + "learning_rate": 9.612943876862145e-05, + "loss": 0.2641, + "step": 28017 + }, + { + "epoch": 2.2697666882696046, + "grad_norm": 0.06662650406360626, + "learning_rate": 9.612493811602683e-05, + "loss": 0.2811, + "step": 28018 + }, + { + "epoch": 2.2698476992871033, + "grad_norm": 0.05604487657546997, + "learning_rate": 9.61204374634322e-05, + "loss": 0.2491, + "step": 28019 + }, + { + "epoch": 2.2699287103046015, + "grad_norm": 0.0543643943965435, + "learning_rate": 9.611593681083759e-05, + "loss": 0.2541, + "step": 28020 + }, + { + "epoch": 2.2700097213220998, + "grad_norm": 0.06231965124607086, + "learning_rate": 9.611143615824295e-05, + "loss": 0.196, + "step": 28021 + }, + { + "epoch": 2.270090732339598, + "grad_norm": 0.06550896167755127, + "learning_rate": 
9.610693550564832e-05, + "loss": 0.2541, + "step": 28022 + }, + { + "epoch": 2.2701717433570967, + "grad_norm": 0.06826600432395935, + "learning_rate": 9.610243485305371e-05, + "loss": 0.2487, + "step": 28023 + }, + { + "epoch": 2.270252754374595, + "grad_norm": 0.06269895285367966, + "learning_rate": 9.609793420045907e-05, + "loss": 0.2342, + "step": 28024 + }, + { + "epoch": 2.270333765392093, + "grad_norm": 0.05146685615181923, + "learning_rate": 9.609343354786444e-05, + "loss": 0.2753, + "step": 28025 + }, + { + "epoch": 2.270414776409592, + "grad_norm": 0.059559378772974014, + "learning_rate": 9.608893289526983e-05, + "loss": 0.2508, + "step": 28026 + }, + { + "epoch": 2.27049578742709, + "grad_norm": 0.06171615794301033, + "learning_rate": 9.608443224267519e-05, + "loss": 0.2971, + "step": 28027 + }, + { + "epoch": 2.2705767984445884, + "grad_norm": 0.05737600475549698, + "learning_rate": 9.607993159008056e-05, + "loss": 0.2523, + "step": 28028 + }, + { + "epoch": 2.2706578094620866, + "grad_norm": 0.061808522790670395, + "learning_rate": 9.607543093748595e-05, + "loss": 0.231, + "step": 28029 + }, + { + "epoch": 2.2707388204795853, + "grad_norm": 0.061562590301036835, + "learning_rate": 9.607093028489131e-05, + "loss": 0.2664, + "step": 28030 + }, + { + "epoch": 2.2708198314970836, + "grad_norm": 0.06974106281995773, + "learning_rate": 9.606642963229668e-05, + "loss": 0.2272, + "step": 28031 + }, + { + "epoch": 2.270900842514582, + "grad_norm": 0.05686337500810623, + "learning_rate": 9.606192897970207e-05, + "loss": 0.2348, + "step": 28032 + }, + { + "epoch": 2.2709818535320805, + "grad_norm": 0.05605282634496689, + "learning_rate": 9.605742832710743e-05, + "loss": 0.2539, + "step": 28033 + }, + { + "epoch": 2.2710628645495787, + "grad_norm": 0.07936391979455948, + "learning_rate": 9.60529276745128e-05, + "loss": 0.2369, + "step": 28034 + }, + { + "epoch": 2.271143875567077, + "grad_norm": 0.06194789707660675, + "learning_rate": 9.604842702191819e-05, + "loss": 0.2124, + "step": 28035 + }, + { + "epoch": 2.2712248865845757, + "grad_norm": 0.07164730876684189, + "learning_rate": 9.604392636932355e-05, + "loss": 0.2329, + "step": 28036 + }, + { + "epoch": 2.271305897602074, + "grad_norm": 0.0761098638176918, + "learning_rate": 9.603942571672892e-05, + "loss": 0.2965, + "step": 28037 + }, + { + "epoch": 2.271386908619572, + "grad_norm": 0.07668202370405197, + "learning_rate": 9.603492506413431e-05, + "loss": 0.2525, + "step": 28038 + }, + { + "epoch": 2.271467919637071, + "grad_norm": 0.07283965498209, + "learning_rate": 9.603042441153967e-05, + "loss": 0.2561, + "step": 28039 + }, + { + "epoch": 2.271548930654569, + "grad_norm": 0.07585301995277405, + "learning_rate": 9.602592375894505e-05, + "loss": 0.2746, + "step": 28040 + }, + { + "epoch": 2.2716299416720673, + "grad_norm": 0.07666393369436264, + "learning_rate": 9.602142310635043e-05, + "loss": 0.2471, + "step": 28041 + }, + { + "epoch": 2.271710952689566, + "grad_norm": 0.06626211851835251, + "learning_rate": 9.601692245375579e-05, + "loss": 0.274, + "step": 28042 + }, + { + "epoch": 2.2717919637070643, + "grad_norm": 0.06695680320262909, + "learning_rate": 9.601242180116117e-05, + "loss": 0.2673, + "step": 28043 + }, + { + "epoch": 2.2718729747245625, + "grad_norm": 0.07495789974927902, + "learning_rate": 9.600792114856655e-05, + "loss": 0.2563, + "step": 28044 + }, + { + "epoch": 2.2719539857420608, + "grad_norm": 0.06226911023259163, + "learning_rate": 9.600342049597191e-05, + "loss": 0.2282, + "step": 28045 + }, + { + 
"epoch": 2.2720349967595594, + "grad_norm": 0.0586124062538147, + "learning_rate": 9.59989198433773e-05, + "loss": 0.2222, + "step": 28046 + }, + { + "epoch": 2.2721160077770577, + "grad_norm": 0.0961061641573906, + "learning_rate": 9.599441919078267e-05, + "loss": 0.2865, + "step": 28047 + }, + { + "epoch": 2.272197018794556, + "grad_norm": 0.06482996046543121, + "learning_rate": 9.598991853818803e-05, + "loss": 0.2704, + "step": 28048 + }, + { + "epoch": 2.2722780298120546, + "grad_norm": 0.07463010400533676, + "learning_rate": 9.598541788559342e-05, + "loss": 0.2539, + "step": 28049 + }, + { + "epoch": 2.272359040829553, + "grad_norm": 0.05233949422836304, + "learning_rate": 9.59809172329988e-05, + "loss": 0.2534, + "step": 28050 + }, + { + "epoch": 2.272440051847051, + "grad_norm": 0.07969961315393448, + "learning_rate": 9.597641658040416e-05, + "loss": 0.3144, + "step": 28051 + }, + { + "epoch": 2.2725210628645494, + "grad_norm": 0.07647403329610825, + "learning_rate": 9.597191592780954e-05, + "loss": 0.2552, + "step": 28052 + }, + { + "epoch": 2.272602073882048, + "grad_norm": 0.09753167629241943, + "learning_rate": 9.596741527521492e-05, + "loss": 0.2545, + "step": 28053 + }, + { + "epoch": 2.2726830848995463, + "grad_norm": 0.06515941023826599, + "learning_rate": 9.596291462262028e-05, + "loss": 0.2693, + "step": 28054 + }, + { + "epoch": 2.2727640959170445, + "grad_norm": 0.07915733009576797, + "learning_rate": 9.595841397002566e-05, + "loss": 0.2925, + "step": 28055 + }, + { + "epoch": 2.2728451069345432, + "grad_norm": 0.0683445930480957, + "learning_rate": 9.595391331743104e-05, + "loss": 0.2546, + "step": 28056 + }, + { + "epoch": 2.2729261179520415, + "grad_norm": 0.07178857177495956, + "learning_rate": 9.59494126648364e-05, + "loss": 0.2179, + "step": 28057 + }, + { + "epoch": 2.2730071289695397, + "grad_norm": 0.05506483465433121, + "learning_rate": 9.594491201224178e-05, + "loss": 0.2233, + "step": 28058 + }, + { + "epoch": 2.2730881399870384, + "grad_norm": 0.06703373789787292, + "learning_rate": 9.594041135964716e-05, + "loss": 0.2706, + "step": 28059 + }, + { + "epoch": 2.2731691510045366, + "grad_norm": 0.06285927444696426, + "learning_rate": 9.593591070705252e-05, + "loss": 0.2609, + "step": 28060 + }, + { + "epoch": 2.273250162022035, + "grad_norm": 0.056138258427381516, + "learning_rate": 9.59314100544579e-05, + "loss": 0.2039, + "step": 28061 + }, + { + "epoch": 2.2733311730395336, + "grad_norm": 0.06609399616718292, + "learning_rate": 9.592690940186328e-05, + "loss": 0.2393, + "step": 28062 + }, + { + "epoch": 2.273412184057032, + "grad_norm": 0.06169790029525757, + "learning_rate": 9.592240874926864e-05, + "loss": 0.2802, + "step": 28063 + }, + { + "epoch": 2.27349319507453, + "grad_norm": 0.06889687478542328, + "learning_rate": 9.591790809667403e-05, + "loss": 0.2713, + "step": 28064 + }, + { + "epoch": 2.2735742060920288, + "grad_norm": 0.0658617615699768, + "learning_rate": 9.59134074440794e-05, + "loss": 0.2538, + "step": 28065 + }, + { + "epoch": 2.273655217109527, + "grad_norm": 0.0711151510477066, + "learning_rate": 9.590890679148476e-05, + "loss": 0.2761, + "step": 28066 + }, + { + "epoch": 2.2737362281270252, + "grad_norm": 0.07032036036252975, + "learning_rate": 9.590440613889015e-05, + "loss": 0.285, + "step": 28067 + }, + { + "epoch": 2.2738172391445235, + "grad_norm": 0.06317279487848282, + "learning_rate": 9.589990548629552e-05, + "loss": 0.231, + "step": 28068 + }, + { + "epoch": 2.273898250162022, + "grad_norm": 0.06784936785697937, + 
"learning_rate": 9.589540483370088e-05, + "loss": 0.259, + "step": 28069 + }, + { + "epoch": 2.2739792611795204, + "grad_norm": 0.06801003217697144, + "learning_rate": 9.589090418110627e-05, + "loss": 0.2417, + "step": 28070 + }, + { + "epoch": 2.2740602721970187, + "grad_norm": 0.08054570853710175, + "learning_rate": 9.588640352851164e-05, + "loss": 0.248, + "step": 28071 + }, + { + "epoch": 2.2741412832145174, + "grad_norm": 0.05918966606259346, + "learning_rate": 9.588190287591701e-05, + "loss": 0.2452, + "step": 28072 + }, + { + "epoch": 2.2742222942320156, + "grad_norm": 0.06380657851696014, + "learning_rate": 9.587740222332239e-05, + "loss": 0.2177, + "step": 28073 + }, + { + "epoch": 2.274303305249514, + "grad_norm": 0.07852265983819962, + "learning_rate": 9.587290157072776e-05, + "loss": 0.2489, + "step": 28074 + }, + { + "epoch": 2.274384316267012, + "grad_norm": 0.07023780047893524, + "learning_rate": 9.586840091813314e-05, + "loss": 0.2442, + "step": 28075 + }, + { + "epoch": 2.274465327284511, + "grad_norm": 0.08056506514549255, + "learning_rate": 9.586390026553851e-05, + "loss": 0.269, + "step": 28076 + }, + { + "epoch": 2.274546338302009, + "grad_norm": 0.07773345708847046, + "learning_rate": 9.585939961294388e-05, + "loss": 0.2805, + "step": 28077 + }, + { + "epoch": 2.2746273493195073, + "grad_norm": 0.06304016709327698, + "learning_rate": 9.585489896034926e-05, + "loss": 0.2316, + "step": 28078 + }, + { + "epoch": 2.274708360337006, + "grad_norm": 0.07088416814804077, + "learning_rate": 9.585039830775463e-05, + "loss": 0.2349, + "step": 28079 + }, + { + "epoch": 2.274789371354504, + "grad_norm": 0.061816342175006866, + "learning_rate": 9.584589765516e-05, + "loss": 0.2104, + "step": 28080 + }, + { + "epoch": 2.2748703823720025, + "grad_norm": 0.062373705208301544, + "learning_rate": 9.584139700256538e-05, + "loss": 0.235, + "step": 28081 + }, + { + "epoch": 2.274951393389501, + "grad_norm": 0.06598381698131561, + "learning_rate": 9.583689634997075e-05, + "loss": 0.2672, + "step": 28082 + }, + { + "epoch": 2.2750324044069994, + "grad_norm": 0.06689947098493576, + "learning_rate": 9.583239569737612e-05, + "loss": 0.2961, + "step": 28083 + }, + { + "epoch": 2.2751134154244976, + "grad_norm": 0.07751460373401642, + "learning_rate": 9.58278950447815e-05, + "loss": 0.2693, + "step": 28084 + }, + { + "epoch": 2.2751944264419963, + "grad_norm": 0.0560307651758194, + "learning_rate": 9.582339439218687e-05, + "loss": 0.2455, + "step": 28085 + }, + { + "epoch": 2.2752754374594946, + "grad_norm": 0.0636034682393074, + "learning_rate": 9.581889373959224e-05, + "loss": 0.2222, + "step": 28086 + }, + { + "epoch": 2.275356448476993, + "grad_norm": 0.07131937891244888, + "learning_rate": 9.581439308699762e-05, + "loss": 0.2621, + "step": 28087 + }, + { + "epoch": 2.2754374594944915, + "grad_norm": 0.0636332780122757, + "learning_rate": 9.580989243440299e-05, + "loss": 0.2331, + "step": 28088 + }, + { + "epoch": 2.2755184705119897, + "grad_norm": 0.04460746794939041, + "learning_rate": 9.580539178180837e-05, + "loss": 0.2375, + "step": 28089 + }, + { + "epoch": 2.275599481529488, + "grad_norm": 0.06167895346879959, + "learning_rate": 9.580089112921374e-05, + "loss": 0.2246, + "step": 28090 + }, + { + "epoch": 2.2756804925469862, + "grad_norm": 0.06896967440843582, + "learning_rate": 9.579639047661911e-05, + "loss": 0.2652, + "step": 28091 + }, + { + "epoch": 2.275761503564485, + "grad_norm": 0.06739029288291931, + "learning_rate": 9.579188982402449e-05, + "loss": 0.2471, + "step": 28092 + 
}, + { + "epoch": 2.275842514581983, + "grad_norm": 0.05659691244363785, + "learning_rate": 9.578738917142986e-05, + "loss": 0.2575, + "step": 28093 + }, + { + "epoch": 2.2759235255994814, + "grad_norm": 0.07226850092411041, + "learning_rate": 9.578288851883523e-05, + "loss": 0.2619, + "step": 28094 + }, + { + "epoch": 2.27600453661698, + "grad_norm": 0.056813836097717285, + "learning_rate": 9.577838786624061e-05, + "loss": 0.2418, + "step": 28095 + }, + { + "epoch": 2.2760855476344783, + "grad_norm": 0.05825258418917656, + "learning_rate": 9.577388721364598e-05, + "loss": 0.2707, + "step": 28096 + }, + { + "epoch": 2.2761665586519766, + "grad_norm": 0.07265456020832062, + "learning_rate": 9.576938656105135e-05, + "loss": 0.2919, + "step": 28097 + }, + { + "epoch": 2.276247569669475, + "grad_norm": 0.07091055065393448, + "learning_rate": 9.576488590845674e-05, + "loss": 0.2699, + "step": 28098 + }, + { + "epoch": 2.2763285806869735, + "grad_norm": 0.05324326083064079, + "learning_rate": 9.57603852558621e-05, + "loss": 0.224, + "step": 28099 + }, + { + "epoch": 2.2764095917044718, + "grad_norm": 0.05966367945075035, + "learning_rate": 9.575588460326748e-05, + "loss": 0.2656, + "step": 28100 + }, + { + "epoch": 2.27649060272197, + "grad_norm": 0.06723299622535706, + "learning_rate": 9.575138395067286e-05, + "loss": 0.2725, + "step": 28101 + }, + { + "epoch": 2.2765716137394687, + "grad_norm": 0.07250206917524338, + "learning_rate": 9.574688329807822e-05, + "loss": 0.253, + "step": 28102 + }, + { + "epoch": 2.276652624756967, + "grad_norm": 0.05841446295380592, + "learning_rate": 9.57423826454836e-05, + "loss": 0.2519, + "step": 28103 + }, + { + "epoch": 2.276733635774465, + "grad_norm": 0.06128285080194473, + "learning_rate": 9.573788199288898e-05, + "loss": 0.2297, + "step": 28104 + }, + { + "epoch": 2.276814646791964, + "grad_norm": 0.05345291271805763, + "learning_rate": 9.573338134029434e-05, + "loss": 0.2033, + "step": 28105 + }, + { + "epoch": 2.276895657809462, + "grad_norm": 0.0680374950170517, + "learning_rate": 9.572888068769972e-05, + "loss": 0.2673, + "step": 28106 + }, + { + "epoch": 2.2769766688269604, + "grad_norm": 0.06612028181552887, + "learning_rate": 9.57243800351051e-05, + "loss": 0.2735, + "step": 28107 + }, + { + "epoch": 2.277057679844459, + "grad_norm": 0.06396949291229248, + "learning_rate": 9.571987938251046e-05, + "loss": 0.2196, + "step": 28108 + }, + { + "epoch": 2.2771386908619573, + "grad_norm": 0.0603230744600296, + "learning_rate": 9.571537872991584e-05, + "loss": 0.2457, + "step": 28109 + }, + { + "epoch": 2.2772197018794555, + "grad_norm": 0.05682147294282913, + "learning_rate": 9.571087807732122e-05, + "loss": 0.2556, + "step": 28110 + }, + { + "epoch": 2.277300712896954, + "grad_norm": 0.06449034810066223, + "learning_rate": 9.570637742472658e-05, + "loss": 0.2305, + "step": 28111 + }, + { + "epoch": 2.2773817239144525, + "grad_norm": 0.06559954583644867, + "learning_rate": 9.570187677213196e-05, + "loss": 0.238, + "step": 28112 + }, + { + "epoch": 2.2774627349319507, + "grad_norm": 0.07063081860542297, + "learning_rate": 9.569737611953735e-05, + "loss": 0.2515, + "step": 28113 + }, + { + "epoch": 2.277543745949449, + "grad_norm": 0.06556065380573273, + "learning_rate": 9.56928754669427e-05, + "loss": 0.2922, + "step": 28114 + }, + { + "epoch": 2.2776247569669477, + "grad_norm": 0.06445152312517166, + "learning_rate": 9.568837481434808e-05, + "loss": 0.2591, + "step": 28115 + }, + { + "epoch": 2.277705767984446, + "grad_norm": 0.07554656267166138, + 
"learning_rate": 9.568387416175347e-05, + "loss": 0.2867, + "step": 28116 + }, + { + "epoch": 2.277786779001944, + "grad_norm": 0.06792528927326202, + "learning_rate": 9.567937350915883e-05, + "loss": 0.2787, + "step": 28117 + }, + { + "epoch": 2.2778677900194424, + "grad_norm": 0.06126142665743828, + "learning_rate": 9.56748728565642e-05, + "loss": 0.2588, + "step": 28118 + }, + { + "epoch": 2.277948801036941, + "grad_norm": 0.06154099851846695, + "learning_rate": 9.567037220396959e-05, + "loss": 0.2529, + "step": 28119 + }, + { + "epoch": 2.2780298120544393, + "grad_norm": 0.06487050652503967, + "learning_rate": 9.566587155137495e-05, + "loss": 0.2491, + "step": 28120 + }, + { + "epoch": 2.2781108230719376, + "grad_norm": 0.06991656869649887, + "learning_rate": 9.566137089878032e-05, + "loss": 0.2596, + "step": 28121 + }, + { + "epoch": 2.2781918340894363, + "grad_norm": 0.07080622762441635, + "learning_rate": 9.565687024618571e-05, + "loss": 0.2844, + "step": 28122 + }, + { + "epoch": 2.2782728451069345, + "grad_norm": 0.07612073421478271, + "learning_rate": 9.565236959359107e-05, + "loss": 0.2624, + "step": 28123 + }, + { + "epoch": 2.2783538561244328, + "grad_norm": 0.06403160840272903, + "learning_rate": 9.564786894099646e-05, + "loss": 0.2432, + "step": 28124 + }, + { + "epoch": 2.2784348671419314, + "grad_norm": 0.06373529881238937, + "learning_rate": 9.564336828840183e-05, + "loss": 0.2473, + "step": 28125 + }, + { + "epoch": 2.2785158781594297, + "grad_norm": 0.05918858200311661, + "learning_rate": 9.563886763580719e-05, + "loss": 0.2741, + "step": 28126 + }, + { + "epoch": 2.278596889176928, + "grad_norm": 0.07889010012149811, + "learning_rate": 9.563436698321258e-05, + "loss": 0.2177, + "step": 28127 + }, + { + "epoch": 2.2786779001944266, + "grad_norm": 0.06980336457490921, + "learning_rate": 9.562986633061795e-05, + "loss": 0.2733, + "step": 28128 + }, + { + "epoch": 2.278758911211925, + "grad_norm": 0.08205079287290573, + "learning_rate": 9.562536567802331e-05, + "loss": 0.2339, + "step": 28129 + }, + { + "epoch": 2.278839922229423, + "grad_norm": 0.07007057219743729, + "learning_rate": 9.56208650254287e-05, + "loss": 0.2609, + "step": 28130 + }, + { + "epoch": 2.278920933246922, + "grad_norm": 0.061237528920173645, + "learning_rate": 9.561636437283407e-05, + "loss": 0.2355, + "step": 28131 + }, + { + "epoch": 2.27900194426442, + "grad_norm": 0.05059129372239113, + "learning_rate": 9.561186372023943e-05, + "loss": 0.2554, + "step": 28132 + }, + { + "epoch": 2.2790829552819183, + "grad_norm": 0.06367185711860657, + "learning_rate": 9.560736306764482e-05, + "loss": 0.258, + "step": 28133 + }, + { + "epoch": 2.2791639662994165, + "grad_norm": 0.061538007110357285, + "learning_rate": 9.560286241505019e-05, + "loss": 0.2436, + "step": 28134 + }, + { + "epoch": 2.279244977316915, + "grad_norm": 0.06117791682481766, + "learning_rate": 9.559836176245555e-05, + "loss": 0.2555, + "step": 28135 + }, + { + "epoch": 2.2793259883344135, + "grad_norm": 0.06512202322483063, + "learning_rate": 9.559386110986094e-05, + "loss": 0.2767, + "step": 28136 + }, + { + "epoch": 2.2794069993519117, + "grad_norm": 0.054416246712207794, + "learning_rate": 9.558936045726631e-05, + "loss": 0.246, + "step": 28137 + }, + { + "epoch": 2.2794880103694104, + "grad_norm": 0.07490819692611694, + "learning_rate": 9.558485980467167e-05, + "loss": 0.2663, + "step": 28138 + }, + { + "epoch": 2.2795690213869086, + "grad_norm": 0.055716339498758316, + "learning_rate": 9.558035915207706e-05, + "loss": 0.2518, + "step": 
28139 + }, + { + "epoch": 2.279650032404407, + "grad_norm": 0.07080546766519547, + "learning_rate": 9.557585849948243e-05, + "loss": 0.2555, + "step": 28140 + }, + { + "epoch": 2.279731043421905, + "grad_norm": 0.07642080634832382, + "learning_rate": 9.557135784688779e-05, + "loss": 0.3134, + "step": 28141 + }, + { + "epoch": 2.279812054439404, + "grad_norm": 0.0733831450343132, + "learning_rate": 9.556685719429318e-05, + "loss": 0.2263, + "step": 28142 + }, + { + "epoch": 2.279893065456902, + "grad_norm": 0.08651655912399292, + "learning_rate": 9.556235654169855e-05, + "loss": 0.2357, + "step": 28143 + }, + { + "epoch": 2.2799740764744003, + "grad_norm": 0.06302259117364883, + "learning_rate": 9.555785588910391e-05, + "loss": 0.2157, + "step": 28144 + }, + { + "epoch": 2.280055087491899, + "grad_norm": 0.06950745731592178, + "learning_rate": 9.55533552365093e-05, + "loss": 0.2786, + "step": 28145 + }, + { + "epoch": 2.2801360985093972, + "grad_norm": 0.06577300280332565, + "learning_rate": 9.554885458391467e-05, + "loss": 0.2228, + "step": 28146 + }, + { + "epoch": 2.2802171095268955, + "grad_norm": 0.07339700311422348, + "learning_rate": 9.554435393132003e-05, + "loss": 0.2488, + "step": 28147 + }, + { + "epoch": 2.280298120544394, + "grad_norm": 0.06289852410554886, + "learning_rate": 9.553985327872542e-05, + "loss": 0.2438, + "step": 28148 + }, + { + "epoch": 2.2803791315618924, + "grad_norm": 0.05764570087194443, + "learning_rate": 9.55353526261308e-05, + "loss": 0.2099, + "step": 28149 + }, + { + "epoch": 2.2804601425793907, + "grad_norm": 0.04218946769833565, + "learning_rate": 9.553085197353616e-05, + "loss": 0.2068, + "step": 28150 + }, + { + "epoch": 2.2805411535968894, + "grad_norm": 0.07371071726083755, + "learning_rate": 9.552635132094154e-05, + "loss": 0.2473, + "step": 28151 + }, + { + "epoch": 2.2806221646143876, + "grad_norm": 0.06671245396137238, + "learning_rate": 9.552185066834692e-05, + "loss": 0.2579, + "step": 28152 + }, + { + "epoch": 2.280703175631886, + "grad_norm": 0.06467222422361374, + "learning_rate": 9.551735001575229e-05, + "loss": 0.2633, + "step": 28153 + }, + { + "epoch": 2.2807841866493845, + "grad_norm": 0.06984186172485352, + "learning_rate": 9.551284936315766e-05, + "loss": 0.2834, + "step": 28154 + }, + { + "epoch": 2.280865197666883, + "grad_norm": 0.057516057044267654, + "learning_rate": 9.550834871056304e-05, + "loss": 0.2485, + "step": 28155 + }, + { + "epoch": 2.280946208684381, + "grad_norm": 0.0576837882399559, + "learning_rate": 9.550384805796841e-05, + "loss": 0.2331, + "step": 28156 + }, + { + "epoch": 2.2810272197018793, + "grad_norm": 0.06400992721319199, + "learning_rate": 9.549934740537378e-05, + "loss": 0.2409, + "step": 28157 + }, + { + "epoch": 2.281108230719378, + "grad_norm": 0.07133117318153381, + "learning_rate": 9.549484675277916e-05, + "loss": 0.2394, + "step": 28158 + }, + { + "epoch": 2.281189241736876, + "grad_norm": 0.05950386822223663, + "learning_rate": 9.549034610018453e-05, + "loss": 0.2629, + "step": 28159 + }, + { + "epoch": 2.2812702527543745, + "grad_norm": 0.06598322093486786, + "learning_rate": 9.54858454475899e-05, + "loss": 0.2732, + "step": 28160 + }, + { + "epoch": 2.281351263771873, + "grad_norm": 0.052976034581661224, + "learning_rate": 9.548134479499528e-05, + "loss": 0.246, + "step": 28161 + }, + { + "epoch": 2.2814322747893714, + "grad_norm": 0.05692631006240845, + "learning_rate": 9.547684414240065e-05, + "loss": 0.2429, + "step": 28162 + }, + { + "epoch": 2.2815132858068696, + "grad_norm": 
0.06387508660554886, + "learning_rate": 9.547234348980603e-05, + "loss": 0.2653, + "step": 28163 + }, + { + "epoch": 2.281594296824368, + "grad_norm": 0.06577061116695404, + "learning_rate": 9.54678428372114e-05, + "loss": 0.2142, + "step": 28164 + }, + { + "epoch": 2.2816753078418666, + "grad_norm": 0.06224307790398598, + "learning_rate": 9.546334218461677e-05, + "loss": 0.2492, + "step": 28165 + }, + { + "epoch": 2.281756318859365, + "grad_norm": 0.06197723001241684, + "learning_rate": 9.545884153202215e-05, + "loss": 0.2428, + "step": 28166 + }, + { + "epoch": 2.281837329876863, + "grad_norm": 0.061254385858774185, + "learning_rate": 9.545434087942752e-05, + "loss": 0.2553, + "step": 28167 + }, + { + "epoch": 2.2819183408943617, + "grad_norm": 0.06933100521564484, + "learning_rate": 9.54498402268329e-05, + "loss": 0.2643, + "step": 28168 + }, + { + "epoch": 2.28199935191186, + "grad_norm": 0.0723806843161583, + "learning_rate": 9.544533957423827e-05, + "loss": 0.2492, + "step": 28169 + }, + { + "epoch": 2.2820803629293582, + "grad_norm": 0.05900033935904503, + "learning_rate": 9.544083892164364e-05, + "loss": 0.2738, + "step": 28170 + }, + { + "epoch": 2.282161373946857, + "grad_norm": 0.07795938849449158, + "learning_rate": 9.543633826904901e-05, + "loss": 0.2242, + "step": 28171 + }, + { + "epoch": 2.282242384964355, + "grad_norm": 0.060051240026950836, + "learning_rate": 9.543183761645439e-05, + "loss": 0.2521, + "step": 28172 + }, + { + "epoch": 2.2823233959818534, + "grad_norm": 0.061081644147634506, + "learning_rate": 9.542733696385976e-05, + "loss": 0.2866, + "step": 28173 + }, + { + "epoch": 2.282404406999352, + "grad_norm": 0.07270535826683044, + "learning_rate": 9.542283631126514e-05, + "loss": 0.2235, + "step": 28174 + }, + { + "epoch": 2.2824854180168503, + "grad_norm": 0.07448476552963257, + "learning_rate": 9.541833565867051e-05, + "loss": 0.2199, + "step": 28175 + }, + { + "epoch": 2.2825664290343486, + "grad_norm": 0.06277775019407272, + "learning_rate": 9.541383500607588e-05, + "loss": 0.2548, + "step": 28176 + }, + { + "epoch": 2.2826474400518473, + "grad_norm": 0.06676080077886581, + "learning_rate": 9.540933435348126e-05, + "loss": 0.2542, + "step": 28177 + }, + { + "epoch": 2.2827284510693455, + "grad_norm": 0.05792508274316788, + "learning_rate": 9.540483370088663e-05, + "loss": 0.2852, + "step": 28178 + }, + { + "epoch": 2.2828094620868438, + "grad_norm": 0.05737094581127167, + "learning_rate": 9.540033304829202e-05, + "loss": 0.2581, + "step": 28179 + }, + { + "epoch": 2.282890473104342, + "grad_norm": 0.06211812049150467, + "learning_rate": 9.539583239569738e-05, + "loss": 0.2292, + "step": 28180 + }, + { + "epoch": 2.2829714841218407, + "grad_norm": 0.07637868076562881, + "learning_rate": 9.539133174310275e-05, + "loss": 0.2691, + "step": 28181 + }, + { + "epoch": 2.283052495139339, + "grad_norm": 0.06960336118936539, + "learning_rate": 9.538683109050814e-05, + "loss": 0.2326, + "step": 28182 + }, + { + "epoch": 2.283133506156837, + "grad_norm": 0.06159220635890961, + "learning_rate": 9.53823304379135e-05, + "loss": 0.2531, + "step": 28183 + }, + { + "epoch": 2.283214517174336, + "grad_norm": 0.062390245497226715, + "learning_rate": 9.537782978531887e-05, + "loss": 0.3095, + "step": 28184 + }, + { + "epoch": 2.283295528191834, + "grad_norm": 0.06799671053886414, + "learning_rate": 9.537332913272426e-05, + "loss": 0.2446, + "step": 28185 + }, + { + "epoch": 2.2833765392093324, + "grad_norm": 0.050637729465961456, + "learning_rate": 9.536882848012962e-05, + 
"loss": 0.2581, + "step": 28186 + }, + { + "epoch": 2.2834575502268306, + "grad_norm": 0.08036299794912338, + "learning_rate": 9.536432782753499e-05, + "loss": 0.2413, + "step": 28187 + }, + { + "epoch": 2.2835385612443293, + "grad_norm": 0.06844053417444229, + "learning_rate": 9.535982717494038e-05, + "loss": 0.2028, + "step": 28188 + }, + { + "epoch": 2.2836195722618275, + "grad_norm": 0.07212093472480774, + "learning_rate": 9.535532652234574e-05, + "loss": 0.2397, + "step": 28189 + }, + { + "epoch": 2.283700583279326, + "grad_norm": 0.0825839415192604, + "learning_rate": 9.535082586975111e-05, + "loss": 0.2579, + "step": 28190 + }, + { + "epoch": 2.2837815942968245, + "grad_norm": 0.07898513227701187, + "learning_rate": 9.53463252171565e-05, + "loss": 0.2711, + "step": 28191 + }, + { + "epoch": 2.2838626053143227, + "grad_norm": 0.08032089471817017, + "learning_rate": 9.534182456456186e-05, + "loss": 0.2724, + "step": 28192 + }, + { + "epoch": 2.283943616331821, + "grad_norm": 0.07727602124214172, + "learning_rate": 9.533732391196723e-05, + "loss": 0.2241, + "step": 28193 + }, + { + "epoch": 2.2840246273493197, + "grad_norm": 0.07366763800382614, + "learning_rate": 9.533282325937262e-05, + "loss": 0.2354, + "step": 28194 + }, + { + "epoch": 2.284105638366818, + "grad_norm": 0.07446452230215073, + "learning_rate": 9.532832260677798e-05, + "loss": 0.2805, + "step": 28195 + }, + { + "epoch": 2.284186649384316, + "grad_norm": 0.06326040625572205, + "learning_rate": 9.532382195418335e-05, + "loss": 0.2287, + "step": 28196 + }, + { + "epoch": 2.284267660401815, + "grad_norm": 0.05394669622182846, + "learning_rate": 9.531932130158874e-05, + "loss": 0.2708, + "step": 28197 + }, + { + "epoch": 2.284348671419313, + "grad_norm": 0.054610323160886765, + "learning_rate": 9.53148206489941e-05, + "loss": 0.2596, + "step": 28198 + }, + { + "epoch": 2.2844296824368113, + "grad_norm": 0.047331418842077255, + "learning_rate": 9.531031999639948e-05, + "loss": 0.2438, + "step": 28199 + }, + { + "epoch": 2.28451069345431, + "grad_norm": 0.07602977007627487, + "learning_rate": 9.530581934380486e-05, + "loss": 0.2742, + "step": 28200 + }, + { + "epoch": 2.2845917044718083, + "grad_norm": 0.06329728662967682, + "learning_rate": 9.530131869121022e-05, + "loss": 0.2407, + "step": 28201 + }, + { + "epoch": 2.2846727154893065, + "grad_norm": 0.05749182775616646, + "learning_rate": 9.52968180386156e-05, + "loss": 0.2423, + "step": 28202 + }, + { + "epoch": 2.2847537265068047, + "grad_norm": 0.062329839915037155, + "learning_rate": 9.529231738602098e-05, + "loss": 0.2623, + "step": 28203 + }, + { + "epoch": 2.2848347375243034, + "grad_norm": 0.056943412870168686, + "learning_rate": 9.528781673342634e-05, + "loss": 0.255, + "step": 28204 + }, + { + "epoch": 2.2849157485418017, + "grad_norm": 0.06783603876829147, + "learning_rate": 9.528331608083173e-05, + "loss": 0.2757, + "step": 28205 + }, + { + "epoch": 2.2849967595593, + "grad_norm": 0.0673983246088028, + "learning_rate": 9.52788154282371e-05, + "loss": 0.2527, + "step": 28206 + }, + { + "epoch": 2.2850777705767986, + "grad_norm": 0.054460301995277405, + "learning_rate": 9.527431477564246e-05, + "loss": 0.2595, + "step": 28207 + }, + { + "epoch": 2.285158781594297, + "grad_norm": 0.05514821410179138, + "learning_rate": 9.526981412304785e-05, + "loss": 0.2557, + "step": 28208 + }, + { + "epoch": 2.285239792611795, + "grad_norm": 0.07168824970722198, + "learning_rate": 9.526531347045323e-05, + "loss": 0.2503, + "step": 28209 + }, + { + "epoch": 2.2853208036292934, + 
"grad_norm": 0.07106754928827286, + "learning_rate": 9.526081281785859e-05, + "loss": 0.266, + "step": 28210 + }, + { + "epoch": 2.285401814646792, + "grad_norm": 0.06122040003538132, + "learning_rate": 9.525631216526397e-05, + "loss": 0.285, + "step": 28211 + }, + { + "epoch": 2.2854828256642903, + "grad_norm": 0.055722106248140335, + "learning_rate": 9.525181151266935e-05, + "loss": 0.2126, + "step": 28212 + }, + { + "epoch": 2.2855638366817885, + "grad_norm": 0.0730934888124466, + "learning_rate": 9.52473108600747e-05, + "loss": 0.2513, + "step": 28213 + }, + { + "epoch": 2.285644847699287, + "grad_norm": 0.06926199793815613, + "learning_rate": 9.52428102074801e-05, + "loss": 0.2303, + "step": 28214 + }, + { + "epoch": 2.2857258587167855, + "grad_norm": 0.057966891676187515, + "learning_rate": 9.523830955488547e-05, + "loss": 0.2631, + "step": 28215 + }, + { + "epoch": 2.2858068697342837, + "grad_norm": 0.07610571384429932, + "learning_rate": 9.523380890229083e-05, + "loss": 0.2419, + "step": 28216 + }, + { + "epoch": 2.2858878807517824, + "grad_norm": 0.04987300932407379, + "learning_rate": 9.522930824969621e-05, + "loss": 0.2507, + "step": 28217 + }, + { + "epoch": 2.2859688917692806, + "grad_norm": 0.06637983024120331, + "learning_rate": 9.522480759710159e-05, + "loss": 0.2303, + "step": 28218 + }, + { + "epoch": 2.286049902786779, + "grad_norm": 0.07258900254964828, + "learning_rate": 9.522030694450695e-05, + "loss": 0.2512, + "step": 28219 + }, + { + "epoch": 2.2861309138042776, + "grad_norm": 0.06396087259054184, + "learning_rate": 9.521580629191233e-05, + "loss": 0.2433, + "step": 28220 + }, + { + "epoch": 2.286211924821776, + "grad_norm": 0.05232375115156174, + "learning_rate": 9.521130563931771e-05, + "loss": 0.2265, + "step": 28221 + }, + { + "epoch": 2.286292935839274, + "grad_norm": 0.07666315883398056, + "learning_rate": 9.520680498672307e-05, + "loss": 0.229, + "step": 28222 + }, + { + "epoch": 2.2863739468567728, + "grad_norm": 0.06945835053920746, + "learning_rate": 9.520230433412846e-05, + "loss": 0.2227, + "step": 28223 + }, + { + "epoch": 2.286454957874271, + "grad_norm": 0.0774323046207428, + "learning_rate": 9.519780368153383e-05, + "loss": 0.2956, + "step": 28224 + }, + { + "epoch": 2.2865359688917692, + "grad_norm": 0.06579787284135818, + "learning_rate": 9.519330302893919e-05, + "loss": 0.2426, + "step": 28225 + }, + { + "epoch": 2.2866169799092675, + "grad_norm": 0.07013032585382462, + "learning_rate": 9.518880237634458e-05, + "loss": 0.2767, + "step": 28226 + }, + { + "epoch": 2.286697990926766, + "grad_norm": 0.07726210355758667, + "learning_rate": 9.518430172374995e-05, + "loss": 0.2645, + "step": 28227 + }, + { + "epoch": 2.2867790019442644, + "grad_norm": 0.07686841487884521, + "learning_rate": 9.517980107115531e-05, + "loss": 0.2483, + "step": 28228 + }, + { + "epoch": 2.2868600129617627, + "grad_norm": 0.06310471892356873, + "learning_rate": 9.51753004185607e-05, + "loss": 0.2663, + "step": 28229 + }, + { + "epoch": 2.2869410239792614, + "grad_norm": 0.05296747758984566, + "learning_rate": 9.517079976596607e-05, + "loss": 0.2524, + "step": 28230 + }, + { + "epoch": 2.2870220349967596, + "grad_norm": 0.06394476443529129, + "learning_rate": 9.516629911337144e-05, + "loss": 0.2327, + "step": 28231 + }, + { + "epoch": 2.287103046014258, + "grad_norm": 0.08596345782279968, + "learning_rate": 9.516179846077682e-05, + "loss": 0.229, + "step": 28232 + }, + { + "epoch": 2.287184057031756, + "grad_norm": 0.06403294205665588, + "learning_rate": 
9.515729780818219e-05, + "loss": 0.2249, + "step": 28233 + }, + { + "epoch": 2.287265068049255, + "grad_norm": 0.073976069688797, + "learning_rate": 9.515279715558757e-05, + "loss": 0.2389, + "step": 28234 + }, + { + "epoch": 2.287346079066753, + "grad_norm": 0.06861808896064758, + "learning_rate": 9.514829650299294e-05, + "loss": 0.2483, + "step": 28235 + }, + { + "epoch": 2.2874270900842513, + "grad_norm": 0.06976199895143509, + "learning_rate": 9.514379585039831e-05, + "loss": 0.2794, + "step": 28236 + }, + { + "epoch": 2.28750810110175, + "grad_norm": 0.0783432349562645, + "learning_rate": 9.513929519780369e-05, + "loss": 0.2481, + "step": 28237 + }, + { + "epoch": 2.287589112119248, + "grad_norm": 0.05985863134264946, + "learning_rate": 9.513479454520906e-05, + "loss": 0.2146, + "step": 28238 + }, + { + "epoch": 2.2876701231367464, + "grad_norm": 0.07286644726991653, + "learning_rate": 9.513029389261443e-05, + "loss": 0.2254, + "step": 28239 + }, + { + "epoch": 2.287751134154245, + "grad_norm": 0.06654609739780426, + "learning_rate": 9.512579324001981e-05, + "loss": 0.2668, + "step": 28240 + }, + { + "epoch": 2.2878321451717434, + "grad_norm": 0.056361664086580276, + "learning_rate": 9.512129258742518e-05, + "loss": 0.2595, + "step": 28241 + }, + { + "epoch": 2.2879131561892416, + "grad_norm": 0.07212407886981964, + "learning_rate": 9.511679193483055e-05, + "loss": 0.2908, + "step": 28242 + }, + { + "epoch": 2.2879941672067403, + "grad_norm": 0.05984533578157425, + "learning_rate": 9.511229128223593e-05, + "loss": 0.2865, + "step": 28243 + }, + { + "epoch": 2.2880751782242386, + "grad_norm": 0.06211211159825325, + "learning_rate": 9.51077906296413e-05, + "loss": 0.2501, + "step": 28244 + }, + { + "epoch": 2.288156189241737, + "grad_norm": 0.0646166056394577, + "learning_rate": 9.510328997704667e-05, + "loss": 0.2579, + "step": 28245 + }, + { + "epoch": 2.2882372002592355, + "grad_norm": 0.0673956647515297, + "learning_rate": 9.509878932445205e-05, + "loss": 0.2683, + "step": 28246 + }, + { + "epoch": 2.2883182112767337, + "grad_norm": 0.05403958633542061, + "learning_rate": 9.509428867185742e-05, + "loss": 0.2417, + "step": 28247 + }, + { + "epoch": 2.288399222294232, + "grad_norm": 0.05626343563199043, + "learning_rate": 9.50897880192628e-05, + "loss": 0.2173, + "step": 28248 + }, + { + "epoch": 2.2884802333117302, + "grad_norm": 0.06387647986412048, + "learning_rate": 9.508528736666817e-05, + "loss": 0.245, + "step": 28249 + }, + { + "epoch": 2.288561244329229, + "grad_norm": 0.06680309772491455, + "learning_rate": 9.508078671407354e-05, + "loss": 0.2748, + "step": 28250 + }, + { + "epoch": 2.288642255346727, + "grad_norm": 0.05708785727620125, + "learning_rate": 9.507628606147892e-05, + "loss": 0.2285, + "step": 28251 + }, + { + "epoch": 2.2887232663642254, + "grad_norm": 0.06534324586391449, + "learning_rate": 9.507178540888429e-05, + "loss": 0.2364, + "step": 28252 + }, + { + "epoch": 2.288804277381724, + "grad_norm": 0.07644810527563095, + "learning_rate": 9.506728475628966e-05, + "loss": 0.2334, + "step": 28253 + }, + { + "epoch": 2.2888852883992223, + "grad_norm": 0.07262365520000458, + "learning_rate": 9.506278410369504e-05, + "loss": 0.2624, + "step": 28254 + }, + { + "epoch": 2.2889662994167206, + "grad_norm": 0.061442889273166656, + "learning_rate": 9.505828345110041e-05, + "loss": 0.2362, + "step": 28255 + }, + { + "epoch": 2.289047310434219, + "grad_norm": 0.07143153995275497, + "learning_rate": 9.505378279850578e-05, + "loss": 0.2874, + "step": 28256 + }, + { + "epoch": 
2.2891283214517175, + "grad_norm": 0.0650113970041275, + "learning_rate": 9.504928214591117e-05, + "loss": 0.1863, + "step": 28257 + }, + { + "epoch": 2.2892093324692158, + "grad_norm": 0.06786942481994629, + "learning_rate": 9.504478149331653e-05, + "loss": 0.2347, + "step": 28258 + }, + { + "epoch": 2.289290343486714, + "grad_norm": 0.06598436832427979, + "learning_rate": 9.50402808407219e-05, + "loss": 0.2598, + "step": 28259 + }, + { + "epoch": 2.2893713545042127, + "grad_norm": 0.06490588188171387, + "learning_rate": 9.503578018812729e-05, + "loss": 0.2493, + "step": 28260 + }, + { + "epoch": 2.289452365521711, + "grad_norm": 0.05049898847937584, + "learning_rate": 9.503127953553265e-05, + "loss": 0.2423, + "step": 28261 + }, + { + "epoch": 2.289533376539209, + "grad_norm": 0.05463642627000809, + "learning_rate": 9.502677888293803e-05, + "loss": 0.2613, + "step": 28262 + }, + { + "epoch": 2.289614387556708, + "grad_norm": 0.08326191455125809, + "learning_rate": 9.502227823034341e-05, + "loss": 0.2508, + "step": 28263 + }, + { + "epoch": 2.289695398574206, + "grad_norm": 0.07564669102430344, + "learning_rate": 9.501777757774877e-05, + "loss": 0.27, + "step": 28264 + }, + { + "epoch": 2.2897764095917044, + "grad_norm": 0.06074196472764015, + "learning_rate": 9.501327692515415e-05, + "loss": 0.2657, + "step": 28265 + }, + { + "epoch": 2.289857420609203, + "grad_norm": 0.06326223164796829, + "learning_rate": 9.500877627255953e-05, + "loss": 0.2301, + "step": 28266 + }, + { + "epoch": 2.2899384316267013, + "grad_norm": 0.05562509596347809, + "learning_rate": 9.50042756199649e-05, + "loss": 0.2538, + "step": 28267 + }, + { + "epoch": 2.2900194426441995, + "grad_norm": 0.059947285801172256, + "learning_rate": 9.499977496737027e-05, + "loss": 0.274, + "step": 28268 + }, + { + "epoch": 2.2901004536616982, + "grad_norm": 0.051273610442876816, + "learning_rate": 9.499527431477565e-05, + "loss": 0.2301, + "step": 28269 + }, + { + "epoch": 2.2901814646791965, + "grad_norm": 0.0706385150551796, + "learning_rate": 9.499077366218102e-05, + "loss": 0.2673, + "step": 28270 + }, + { + "epoch": 2.2902624756966947, + "grad_norm": 0.06693287938833237, + "learning_rate": 9.498627300958639e-05, + "loss": 0.2535, + "step": 28271 + }, + { + "epoch": 2.290343486714193, + "grad_norm": 0.07116030901670456, + "learning_rate": 9.498177235699178e-05, + "loss": 0.2951, + "step": 28272 + }, + { + "epoch": 2.2904244977316917, + "grad_norm": 0.08194348961114883, + "learning_rate": 9.497727170439714e-05, + "loss": 0.2604, + "step": 28273 + }, + { + "epoch": 2.29050550874919, + "grad_norm": 0.05736947059631348, + "learning_rate": 9.497277105180251e-05, + "loss": 0.2223, + "step": 28274 + }, + { + "epoch": 2.290586519766688, + "grad_norm": 0.0609368234872818, + "learning_rate": 9.49682703992079e-05, + "loss": 0.2197, + "step": 28275 + }, + { + "epoch": 2.290667530784187, + "grad_norm": 0.061983298510313034, + "learning_rate": 9.496376974661326e-05, + "loss": 0.2552, + "step": 28276 + }, + { + "epoch": 2.290748541801685, + "grad_norm": 0.06865274906158447, + "learning_rate": 9.495926909401863e-05, + "loss": 0.2611, + "step": 28277 + }, + { + "epoch": 2.2908295528191833, + "grad_norm": 0.07397310435771942, + "learning_rate": 9.495476844142402e-05, + "loss": 0.2649, + "step": 28278 + }, + { + "epoch": 2.2909105638366816, + "grad_norm": 0.06135142594575882, + "learning_rate": 9.495026778882938e-05, + "loss": 0.2785, + "step": 28279 + }, + { + "epoch": 2.2909915748541803, + "grad_norm": 0.05850803852081299, + "learning_rate": 
9.494576713623475e-05, + "loss": 0.2482, + "step": 28280 + }, + { + "epoch": 2.2910725858716785, + "grad_norm": 0.06148507446050644, + "learning_rate": 9.494126648364014e-05, + "loss": 0.2317, + "step": 28281 + }, + { + "epoch": 2.2911535968891767, + "grad_norm": 0.06344686448574066, + "learning_rate": 9.49367658310455e-05, + "loss": 0.239, + "step": 28282 + }, + { + "epoch": 2.2912346079066754, + "grad_norm": 0.0795753002166748, + "learning_rate": 9.493226517845089e-05, + "loss": 0.2462, + "step": 28283 + }, + { + "epoch": 2.2913156189241737, + "grad_norm": 0.061491359025239944, + "learning_rate": 9.492776452585626e-05, + "loss": 0.228, + "step": 28284 + }, + { + "epoch": 2.291396629941672, + "grad_norm": 0.07499266415834427, + "learning_rate": 9.492326387326162e-05, + "loss": 0.2701, + "step": 28285 + }, + { + "epoch": 2.2914776409591706, + "grad_norm": 0.06818591058254242, + "learning_rate": 9.4918763220667e-05, + "loss": 0.2532, + "step": 28286 + }, + { + "epoch": 2.291558651976669, + "grad_norm": 0.08009780943393707, + "learning_rate": 9.491426256807238e-05, + "loss": 0.3119, + "step": 28287 + }, + { + "epoch": 2.291639662994167, + "grad_norm": 0.056352753192186356, + "learning_rate": 9.490976191547774e-05, + "loss": 0.2219, + "step": 28288 + }, + { + "epoch": 2.291720674011666, + "grad_norm": 0.080259308218956, + "learning_rate": 9.490526126288313e-05, + "loss": 0.2622, + "step": 28289 + }, + { + "epoch": 2.291801685029164, + "grad_norm": 0.06937295198440552, + "learning_rate": 9.49007606102885e-05, + "loss": 0.2491, + "step": 28290 + }, + { + "epoch": 2.2918826960466623, + "grad_norm": 0.06505081802606583, + "learning_rate": 9.489625995769386e-05, + "loss": 0.2546, + "step": 28291 + }, + { + "epoch": 2.291963707064161, + "grad_norm": 0.06334318220615387, + "learning_rate": 9.489175930509925e-05, + "loss": 0.2564, + "step": 28292 + }, + { + "epoch": 2.292044718081659, + "grad_norm": 0.0651654452085495, + "learning_rate": 9.488725865250462e-05, + "loss": 0.2688, + "step": 28293 + }, + { + "epoch": 2.2921257290991575, + "grad_norm": 0.06208321824669838, + "learning_rate": 9.488275799990998e-05, + "loss": 0.2233, + "step": 28294 + }, + { + "epoch": 2.2922067401166557, + "grad_norm": 0.056360118091106415, + "learning_rate": 9.487825734731537e-05, + "loss": 0.2241, + "step": 28295 + }, + { + "epoch": 2.2922877511341544, + "grad_norm": 0.07632552832365036, + "learning_rate": 9.487375669472074e-05, + "loss": 0.2594, + "step": 28296 + }, + { + "epoch": 2.2923687621516526, + "grad_norm": 0.07288607954978943, + "learning_rate": 9.48692560421261e-05, + "loss": 0.2737, + "step": 28297 + }, + { + "epoch": 2.292449773169151, + "grad_norm": 0.0723167434334755, + "learning_rate": 9.486475538953149e-05, + "loss": 0.2366, + "step": 28298 + }, + { + "epoch": 2.2925307841866496, + "grad_norm": 0.05326563119888306, + "learning_rate": 9.486025473693686e-05, + "loss": 0.2355, + "step": 28299 + }, + { + "epoch": 2.292611795204148, + "grad_norm": 0.06141812726855278, + "learning_rate": 9.485575408434222e-05, + "loss": 0.2516, + "step": 28300 + }, + { + "epoch": 2.292692806221646, + "grad_norm": 0.06692104041576385, + "learning_rate": 9.485125343174761e-05, + "loss": 0.2937, + "step": 28301 + }, + { + "epoch": 2.2927738172391443, + "grad_norm": 0.06172855570912361, + "learning_rate": 9.484675277915298e-05, + "loss": 0.2417, + "step": 28302 + }, + { + "epoch": 2.292854828256643, + "grad_norm": 0.06784968078136444, + "learning_rate": 9.484225212655836e-05, + "loss": 0.2704, + "step": 28303 + }, + { + "epoch": 
2.2929358392741412, + "grad_norm": 0.07524605095386505, + "learning_rate": 9.483775147396373e-05, + "loss": 0.2703, + "step": 28304 + }, + { + "epoch": 2.2930168502916395, + "grad_norm": 0.061262935400009155, + "learning_rate": 9.48332508213691e-05, + "loss": 0.2608, + "step": 28305 + }, + { + "epoch": 2.293097861309138, + "grad_norm": 0.07269617915153503, + "learning_rate": 9.482875016877448e-05, + "loss": 0.24, + "step": 28306 + }, + { + "epoch": 2.2931788723266364, + "grad_norm": 0.06610900163650513, + "learning_rate": 9.482424951617985e-05, + "loss": 0.2466, + "step": 28307 + }, + { + "epoch": 2.2932598833441347, + "grad_norm": 0.08361209183931351, + "learning_rate": 9.481974886358523e-05, + "loss": 0.2459, + "step": 28308 + }, + { + "epoch": 2.2933408943616334, + "grad_norm": 0.06436758488416672, + "learning_rate": 9.48152482109906e-05, + "loss": 0.2156, + "step": 28309 + }, + { + "epoch": 2.2934219053791316, + "grad_norm": 0.07810865342617035, + "learning_rate": 9.481074755839597e-05, + "loss": 0.2417, + "step": 28310 + }, + { + "epoch": 2.29350291639663, + "grad_norm": 0.07908753305673599, + "learning_rate": 9.480624690580135e-05, + "loss": 0.2408, + "step": 28311 + }, + { + "epoch": 2.2935839274141285, + "grad_norm": 0.05391676723957062, + "learning_rate": 9.480174625320672e-05, + "loss": 0.2365, + "step": 28312 + }, + { + "epoch": 2.2936649384316268, + "grad_norm": 0.0659322664141655, + "learning_rate": 9.47972456006121e-05, + "loss": 0.2425, + "step": 28313 + }, + { + "epoch": 2.293745949449125, + "grad_norm": 0.07384074479341507, + "learning_rate": 9.479274494801747e-05, + "loss": 0.2852, + "step": 28314 + }, + { + "epoch": 2.2938269604666237, + "grad_norm": 0.07077052444219589, + "learning_rate": 9.478824429542284e-05, + "loss": 0.2672, + "step": 28315 + }, + { + "epoch": 2.293907971484122, + "grad_norm": 0.06094752997159958, + "learning_rate": 9.478374364282821e-05, + "loss": 0.225, + "step": 28316 + }, + { + "epoch": 2.29398898250162, + "grad_norm": 0.056600797921419144, + "learning_rate": 9.477924299023359e-05, + "loss": 0.2478, + "step": 28317 + }, + { + "epoch": 2.2940699935191184, + "grad_norm": 0.0702267587184906, + "learning_rate": 9.477474233763896e-05, + "loss": 0.2666, + "step": 28318 + }, + { + "epoch": 2.294151004536617, + "grad_norm": 0.06771890819072723, + "learning_rate": 9.477024168504434e-05, + "loss": 0.273, + "step": 28319 + }, + { + "epoch": 2.2942320155541154, + "grad_norm": 0.067947618663311, + "learning_rate": 9.476574103244971e-05, + "loss": 0.2272, + "step": 28320 + }, + { + "epoch": 2.2943130265716136, + "grad_norm": 0.0776883214712143, + "learning_rate": 9.476124037985508e-05, + "loss": 0.3166, + "step": 28321 + }, + { + "epoch": 2.2943940375891123, + "grad_norm": 0.05880747735500336, + "learning_rate": 9.475673972726046e-05, + "loss": 0.2402, + "step": 28322 + }, + { + "epoch": 2.2944750486066106, + "grad_norm": 0.05730128660798073, + "learning_rate": 9.475223907466583e-05, + "loss": 0.2259, + "step": 28323 + }, + { + "epoch": 2.294556059624109, + "grad_norm": 0.06512884050607681, + "learning_rate": 9.47477384220712e-05, + "loss": 0.2672, + "step": 28324 + }, + { + "epoch": 2.294637070641607, + "grad_norm": 0.06250195950269699, + "learning_rate": 9.474323776947658e-05, + "loss": 0.2561, + "step": 28325 + }, + { + "epoch": 2.2947180816591057, + "grad_norm": 0.061930492520332336, + "learning_rate": 9.473873711688195e-05, + "loss": 0.2613, + "step": 28326 + }, + { + "epoch": 2.294799092676604, + "grad_norm": 0.06593555212020874, + "learning_rate": 
9.473423646428732e-05, + "loss": 0.2862, + "step": 28327 + }, + { + "epoch": 2.2948801036941022, + "grad_norm": 0.056815922260284424, + "learning_rate": 9.47297358116927e-05, + "loss": 0.2174, + "step": 28328 + }, + { + "epoch": 2.294961114711601, + "grad_norm": 0.07273375242948532, + "learning_rate": 9.472523515909807e-05, + "loss": 0.2806, + "step": 28329 + }, + { + "epoch": 2.295042125729099, + "grad_norm": 0.0795697495341301, + "learning_rate": 9.472073450650344e-05, + "loss": 0.2545, + "step": 28330 + }, + { + "epoch": 2.2951231367465974, + "grad_norm": 0.05804990231990814, + "learning_rate": 9.471623385390882e-05, + "loss": 0.2681, + "step": 28331 + }, + { + "epoch": 2.295204147764096, + "grad_norm": 0.05056268721818924, + "learning_rate": 9.471173320131419e-05, + "loss": 0.2425, + "step": 28332 + }, + { + "epoch": 2.2952851587815943, + "grad_norm": 0.06456629186868668, + "learning_rate": 9.470723254871957e-05, + "loss": 0.2263, + "step": 28333 + }, + { + "epoch": 2.2953661697990926, + "grad_norm": 0.06838972866535187, + "learning_rate": 9.470273189612494e-05, + "loss": 0.2526, + "step": 28334 + }, + { + "epoch": 2.2954471808165913, + "grad_norm": 0.06177784502506256, + "learning_rate": 9.469823124353031e-05, + "loss": 0.2227, + "step": 28335 + }, + { + "epoch": 2.2955281918340895, + "grad_norm": 0.08078238368034363, + "learning_rate": 9.469373059093569e-05, + "loss": 0.2449, + "step": 28336 + }, + { + "epoch": 2.2956092028515878, + "grad_norm": 0.06638062745332718, + "learning_rate": 9.468922993834106e-05, + "loss": 0.2641, + "step": 28337 + }, + { + "epoch": 2.295690213869086, + "grad_norm": 0.08003167808055878, + "learning_rate": 9.468472928574645e-05, + "loss": 0.2807, + "step": 28338 + }, + { + "epoch": 2.2957712248865847, + "grad_norm": 0.08642017841339111, + "learning_rate": 9.468022863315181e-05, + "loss": 0.2712, + "step": 28339 + }, + { + "epoch": 2.295852235904083, + "grad_norm": 0.059965621680021286, + "learning_rate": 9.467572798055718e-05, + "loss": 0.2274, + "step": 28340 + }, + { + "epoch": 2.295933246921581, + "grad_norm": 0.05939400941133499, + "learning_rate": 9.467122732796257e-05, + "loss": 0.2365, + "step": 28341 + }, + { + "epoch": 2.29601425793908, + "grad_norm": 0.05748443305492401, + "learning_rate": 9.466672667536793e-05, + "loss": 0.2375, + "step": 28342 + }, + { + "epoch": 2.296095268956578, + "grad_norm": 0.07314951717853546, + "learning_rate": 9.46622260227733e-05, + "loss": 0.2715, + "step": 28343 + }, + { + "epoch": 2.2961762799740764, + "grad_norm": 0.06428799778223038, + "learning_rate": 9.465772537017869e-05, + "loss": 0.2464, + "step": 28344 + }, + { + "epoch": 2.2962572909915746, + "grad_norm": 0.06721749901771545, + "learning_rate": 9.465322471758405e-05, + "loss": 0.243, + "step": 28345 + }, + { + "epoch": 2.2963383020090733, + "grad_norm": 0.05531509593129158, + "learning_rate": 9.464872406498942e-05, + "loss": 0.2744, + "step": 28346 + }, + { + "epoch": 2.2964193130265715, + "grad_norm": 0.07265246659517288, + "learning_rate": 9.464422341239481e-05, + "loss": 0.2686, + "step": 28347 + }, + { + "epoch": 2.29650032404407, + "grad_norm": 0.06863284111022949, + "learning_rate": 9.463972275980017e-05, + "loss": 0.2606, + "step": 28348 + }, + { + "epoch": 2.2965813350615685, + "grad_norm": 0.05782085284590721, + "learning_rate": 9.463522210720554e-05, + "loss": 0.2256, + "step": 28349 + }, + { + "epoch": 2.2966623460790667, + "grad_norm": 0.05759461969137192, + "learning_rate": 9.463072145461093e-05, + "loss": 0.2229, + "step": 28350 + }, + { + 
"epoch": 2.296743357096565, + "grad_norm": 0.07360438257455826, + "learning_rate": 9.462622080201629e-05, + "loss": 0.2503, + "step": 28351 + }, + { + "epoch": 2.2968243681140637, + "grad_norm": 0.06497979909181595, + "learning_rate": 9.462172014942166e-05, + "loss": 0.2661, + "step": 28352 + }, + { + "epoch": 2.296905379131562, + "grad_norm": 0.05980612710118294, + "learning_rate": 9.461721949682705e-05, + "loss": 0.2292, + "step": 28353 + }, + { + "epoch": 2.29698639014906, + "grad_norm": 0.04446947202086449, + "learning_rate": 9.461271884423241e-05, + "loss": 0.2002, + "step": 28354 + }, + { + "epoch": 2.297067401166559, + "grad_norm": 0.06264941394329071, + "learning_rate": 9.460821819163778e-05, + "loss": 0.2511, + "step": 28355 + }, + { + "epoch": 2.297148412184057, + "grad_norm": 0.07036852091550827, + "learning_rate": 9.460371753904317e-05, + "loss": 0.2535, + "step": 28356 + }, + { + "epoch": 2.2972294232015553, + "grad_norm": 0.060618314892053604, + "learning_rate": 9.459921688644853e-05, + "loss": 0.2722, + "step": 28357 + }, + { + "epoch": 2.297310434219054, + "grad_norm": 0.06120682880282402, + "learning_rate": 9.45947162338539e-05, + "loss": 0.2608, + "step": 28358 + }, + { + "epoch": 2.2973914452365523, + "grad_norm": 0.05045586824417114, + "learning_rate": 9.459021558125929e-05, + "loss": 0.2056, + "step": 28359 + }, + { + "epoch": 2.2974724562540505, + "grad_norm": 0.0593399740755558, + "learning_rate": 9.458571492866465e-05, + "loss": 0.2436, + "step": 28360 + }, + { + "epoch": 2.2975534672715487, + "grad_norm": 0.05917546525597572, + "learning_rate": 9.458121427607003e-05, + "loss": 0.2518, + "step": 28361 + }, + { + "epoch": 2.2976344782890474, + "grad_norm": 0.07110293209552765, + "learning_rate": 9.457671362347541e-05, + "loss": 0.2729, + "step": 28362 + }, + { + "epoch": 2.2977154893065457, + "grad_norm": 0.07229780405759811, + "learning_rate": 9.457221297088077e-05, + "loss": 0.269, + "step": 28363 + }, + { + "epoch": 2.297796500324044, + "grad_norm": 0.05696237087249756, + "learning_rate": 9.456771231828616e-05, + "loss": 0.2289, + "step": 28364 + }, + { + "epoch": 2.2978775113415426, + "grad_norm": 0.06969776749610901, + "learning_rate": 9.456321166569153e-05, + "loss": 0.247, + "step": 28365 + }, + { + "epoch": 2.297958522359041, + "grad_norm": 0.06379079818725586, + "learning_rate": 9.45587110130969e-05, + "loss": 0.2436, + "step": 28366 + }, + { + "epoch": 2.298039533376539, + "grad_norm": 0.06603731960058212, + "learning_rate": 9.455421036050228e-05, + "loss": 0.2661, + "step": 28367 + }, + { + "epoch": 2.2981205443940373, + "grad_norm": 0.0677296444773674, + "learning_rate": 9.454970970790766e-05, + "loss": 0.2285, + "step": 28368 + }, + { + "epoch": 2.298201555411536, + "grad_norm": 0.0610167570412159, + "learning_rate": 9.454520905531303e-05, + "loss": 0.2561, + "step": 28369 + }, + { + "epoch": 2.2982825664290343, + "grad_norm": 0.060097601264715195, + "learning_rate": 9.45407084027184e-05, + "loss": 0.2386, + "step": 28370 + }, + { + "epoch": 2.2983635774465325, + "grad_norm": 0.09854019433259964, + "learning_rate": 9.453620775012378e-05, + "loss": 0.259, + "step": 28371 + }, + { + "epoch": 2.298444588464031, + "grad_norm": 0.052267104387283325, + "learning_rate": 9.453170709752915e-05, + "loss": 0.2207, + "step": 28372 + }, + { + "epoch": 2.2985255994815295, + "grad_norm": 0.06957710534334183, + "learning_rate": 9.452720644493452e-05, + "loss": 0.2621, + "step": 28373 + }, + { + "epoch": 2.2986066104990277, + "grad_norm": 0.05953522399067879, + 
"learning_rate": 9.45227057923399e-05, + "loss": 0.2669, + "step": 28374 + }, + { + "epoch": 2.2986876215165264, + "grad_norm": 0.06640417128801346, + "learning_rate": 9.451820513974527e-05, + "loss": 0.2461, + "step": 28375 + }, + { + "epoch": 2.2987686325340246, + "grad_norm": 0.07142982631921768, + "learning_rate": 9.451370448715064e-05, + "loss": 0.2503, + "step": 28376 + }, + { + "epoch": 2.298849643551523, + "grad_norm": 0.07983957976102829, + "learning_rate": 9.450920383455602e-05, + "loss": 0.2676, + "step": 28377 + }, + { + "epoch": 2.2989306545690216, + "grad_norm": 0.059071604162454605, + "learning_rate": 9.450470318196139e-05, + "loss": 0.2624, + "step": 28378 + }, + { + "epoch": 2.29901166558652, + "grad_norm": 0.06270880997180939, + "learning_rate": 9.450020252936676e-05, + "loss": 0.2306, + "step": 28379 + }, + { + "epoch": 2.299092676604018, + "grad_norm": 0.06676393002271652, + "learning_rate": 9.449570187677214e-05, + "loss": 0.2477, + "step": 28380 + }, + { + "epoch": 2.2991736876215167, + "grad_norm": 0.07599867135286331, + "learning_rate": 9.449120122417751e-05, + "loss": 0.2911, + "step": 28381 + }, + { + "epoch": 2.299254698639015, + "grad_norm": 0.061391886323690414, + "learning_rate": 9.448670057158289e-05, + "loss": 0.2364, + "step": 28382 + }, + { + "epoch": 2.2993357096565132, + "grad_norm": 0.05860184505581856, + "learning_rate": 9.448219991898826e-05, + "loss": 0.2293, + "step": 28383 + }, + { + "epoch": 2.2994167206740115, + "grad_norm": 0.0665908008813858, + "learning_rate": 9.447769926639363e-05, + "loss": 0.2416, + "step": 28384 + }, + { + "epoch": 2.29949773169151, + "grad_norm": 0.06436271965503693, + "learning_rate": 9.4473198613799e-05, + "loss": 0.2713, + "step": 28385 + }, + { + "epoch": 2.2995787427090084, + "grad_norm": 0.060904085636138916, + "learning_rate": 9.446869796120438e-05, + "loss": 0.2974, + "step": 28386 + }, + { + "epoch": 2.2996597537265067, + "grad_norm": 0.05490853264927864, + "learning_rate": 9.446419730860975e-05, + "loss": 0.2115, + "step": 28387 + }, + { + "epoch": 2.2997407647440054, + "grad_norm": 0.0666164755821228, + "learning_rate": 9.445969665601513e-05, + "loss": 0.2551, + "step": 28388 + }, + { + "epoch": 2.2998217757615036, + "grad_norm": 0.04948906973004341, + "learning_rate": 9.44551960034205e-05, + "loss": 0.2575, + "step": 28389 + }, + { + "epoch": 2.299902786779002, + "grad_norm": 0.06865771114826202, + "learning_rate": 9.445069535082587e-05, + "loss": 0.2477, + "step": 28390 + }, + { + "epoch": 2.2999837977965, + "grad_norm": 0.0601445734500885, + "learning_rate": 9.444619469823125e-05, + "loss": 0.2679, + "step": 28391 + }, + { + "epoch": 2.3000648088139988, + "grad_norm": 0.05781553313136101, + "learning_rate": 9.444169404563662e-05, + "loss": 0.2276, + "step": 28392 + }, + { + "epoch": 2.300145819831497, + "grad_norm": 0.0677238255739212, + "learning_rate": 9.4437193393042e-05, + "loss": 0.2261, + "step": 28393 + }, + { + "epoch": 2.3002268308489953, + "grad_norm": 0.0595778189599514, + "learning_rate": 9.443269274044737e-05, + "loss": 0.2752, + "step": 28394 + }, + { + "epoch": 2.300307841866494, + "grad_norm": 0.0635710060596466, + "learning_rate": 9.442819208785274e-05, + "loss": 0.2567, + "step": 28395 + }, + { + "epoch": 2.300388852883992, + "grad_norm": 0.06570877879858017, + "learning_rate": 9.442369143525812e-05, + "loss": 0.2933, + "step": 28396 + }, + { + "epoch": 2.3004698639014904, + "grad_norm": 0.06711771339178085, + "learning_rate": 9.441919078266349e-05, + "loss": 0.2205, + "step": 28397 + }, + 
{ + "epoch": 2.300550874918989, + "grad_norm": 0.07368925213813782, + "learning_rate": 9.441469013006886e-05, + "loss": 0.2361, + "step": 28398 + }, + { + "epoch": 2.3006318859364874, + "grad_norm": 0.06781308352947235, + "learning_rate": 9.441018947747424e-05, + "loss": 0.261, + "step": 28399 + }, + { + "epoch": 2.3007128969539856, + "grad_norm": 0.06204356253147125, + "learning_rate": 9.440568882487961e-05, + "loss": 0.2319, + "step": 28400 + }, + { + "epoch": 2.3007939079714843, + "grad_norm": 0.061793096363544464, + "learning_rate": 9.440118817228498e-05, + "loss": 0.2038, + "step": 28401 + }, + { + "epoch": 2.3008749189889826, + "grad_norm": 0.05806412175297737, + "learning_rate": 9.439668751969036e-05, + "loss": 0.2435, + "step": 28402 + }, + { + "epoch": 2.300955930006481, + "grad_norm": 0.06467214971780777, + "learning_rate": 9.439218686709573e-05, + "loss": 0.2193, + "step": 28403 + }, + { + "epoch": 2.3010369410239795, + "grad_norm": 0.05494023486971855, + "learning_rate": 9.43876862145011e-05, + "loss": 0.2562, + "step": 28404 + }, + { + "epoch": 2.3011179520414777, + "grad_norm": 0.06859727948904037, + "learning_rate": 9.438318556190648e-05, + "loss": 0.2541, + "step": 28405 + }, + { + "epoch": 2.301198963058976, + "grad_norm": 0.07257232815027237, + "learning_rate": 9.437868490931185e-05, + "loss": 0.2783, + "step": 28406 + }, + { + "epoch": 2.301279974076474, + "grad_norm": 0.07005695253610611, + "learning_rate": 9.437418425671723e-05, + "loss": 0.2998, + "step": 28407 + }, + { + "epoch": 2.301360985093973, + "grad_norm": 0.06895466893911362, + "learning_rate": 9.43696836041226e-05, + "loss": 0.2509, + "step": 28408 + }, + { + "epoch": 2.301441996111471, + "grad_norm": 0.07160642743110657, + "learning_rate": 9.436518295152797e-05, + "loss": 0.2389, + "step": 28409 + }, + { + "epoch": 2.3015230071289694, + "grad_norm": 0.06630747020244598, + "learning_rate": 9.436068229893335e-05, + "loss": 0.2737, + "step": 28410 + }, + { + "epoch": 2.301604018146468, + "grad_norm": 0.06067004054784775, + "learning_rate": 9.435618164633872e-05, + "loss": 0.2827, + "step": 28411 + }, + { + "epoch": 2.3016850291639663, + "grad_norm": 0.06799201667308807, + "learning_rate": 9.43516809937441e-05, + "loss": 0.21, + "step": 28412 + }, + { + "epoch": 2.3017660401814646, + "grad_norm": 0.06540130823850632, + "learning_rate": 9.434718034114947e-05, + "loss": 0.2194, + "step": 28413 + }, + { + "epoch": 2.301847051198963, + "grad_norm": 0.06734874099493027, + "learning_rate": 9.434267968855484e-05, + "loss": 0.2604, + "step": 28414 + }, + { + "epoch": 2.3019280622164615, + "grad_norm": 0.05423181131482124, + "learning_rate": 9.433817903596021e-05, + "loss": 0.2558, + "step": 28415 + }, + { + "epoch": 2.3020090732339598, + "grad_norm": 0.06706136465072632, + "learning_rate": 9.43336783833656e-05, + "loss": 0.2677, + "step": 28416 + }, + { + "epoch": 2.302090084251458, + "grad_norm": 0.07376489043235779, + "learning_rate": 9.432917773077096e-05, + "loss": 0.2563, + "step": 28417 + }, + { + "epoch": 2.3021710952689567, + "grad_norm": 0.0608864426612854, + "learning_rate": 9.432467707817634e-05, + "loss": 0.2718, + "step": 28418 + }, + { + "epoch": 2.302252106286455, + "grad_norm": 0.07492481172084808, + "learning_rate": 9.432017642558172e-05, + "loss": 0.2959, + "step": 28419 + }, + { + "epoch": 2.302333117303953, + "grad_norm": 0.05481376126408577, + "learning_rate": 9.431567577298708e-05, + "loss": 0.2881, + "step": 28420 + }, + { + "epoch": 2.302414128321452, + "grad_norm": 0.05414591357111931, + 
"learning_rate": 9.431117512039246e-05, + "loss": 0.2368, + "step": 28421 + }, + { + "epoch": 2.30249513933895, + "grad_norm": 0.06244802847504616, + "learning_rate": 9.430667446779784e-05, + "loss": 0.2707, + "step": 28422 + }, + { + "epoch": 2.3025761503564484, + "grad_norm": 0.07095088809728622, + "learning_rate": 9.43021738152032e-05, + "loss": 0.262, + "step": 28423 + }, + { + "epoch": 2.302657161373947, + "grad_norm": 0.07430743426084518, + "learning_rate": 9.429767316260858e-05, + "loss": 0.273, + "step": 28424 + }, + { + "epoch": 2.3027381723914453, + "grad_norm": 0.07228193432092667, + "learning_rate": 9.429317251001396e-05, + "loss": 0.2152, + "step": 28425 + }, + { + "epoch": 2.3028191834089435, + "grad_norm": 0.06242399662733078, + "learning_rate": 9.428867185741932e-05, + "loss": 0.2448, + "step": 28426 + }, + { + "epoch": 2.3029001944264422, + "grad_norm": 0.0664132609963417, + "learning_rate": 9.42841712048247e-05, + "loss": 0.2344, + "step": 28427 + }, + { + "epoch": 2.3029812054439405, + "grad_norm": 0.07687770575284958, + "learning_rate": 9.427967055223008e-05, + "loss": 0.2753, + "step": 28428 + }, + { + "epoch": 2.3030622164614387, + "grad_norm": 0.08215631544589996, + "learning_rate": 9.427516989963545e-05, + "loss": 0.2519, + "step": 28429 + }, + { + "epoch": 2.303143227478937, + "grad_norm": 0.07950203120708466, + "learning_rate": 9.427066924704082e-05, + "loss": 0.2495, + "step": 28430 + }, + { + "epoch": 2.3032242384964356, + "grad_norm": 0.06067657843232155, + "learning_rate": 9.42661685944462e-05, + "loss": 0.2453, + "step": 28431 + }, + { + "epoch": 2.303305249513934, + "grad_norm": 0.06874459236860275, + "learning_rate": 9.426166794185157e-05, + "loss": 0.2695, + "step": 28432 + }, + { + "epoch": 2.303386260531432, + "grad_norm": 0.06850822269916534, + "learning_rate": 9.425716728925694e-05, + "loss": 0.2321, + "step": 28433 + }, + { + "epoch": 2.303467271548931, + "grad_norm": 0.08743181824684143, + "learning_rate": 9.425266663666233e-05, + "loss": 0.265, + "step": 28434 + }, + { + "epoch": 2.303548282566429, + "grad_norm": 0.055900510400533676, + "learning_rate": 9.424816598406769e-05, + "loss": 0.2272, + "step": 28435 + }, + { + "epoch": 2.3036292935839273, + "grad_norm": 0.07269109040498734, + "learning_rate": 9.424366533147306e-05, + "loss": 0.284, + "step": 28436 + }, + { + "epoch": 2.3037103046014256, + "grad_norm": 0.06864180415868759, + "learning_rate": 9.423916467887845e-05, + "loss": 0.2528, + "step": 28437 + }, + { + "epoch": 2.3037913156189243, + "grad_norm": 0.06924668699502945, + "learning_rate": 9.423466402628382e-05, + "loss": 0.2543, + "step": 28438 + }, + { + "epoch": 2.3038723266364225, + "grad_norm": 0.07466299086809158, + "learning_rate": 9.423016337368918e-05, + "loss": 0.2687, + "step": 28439 + }, + { + "epoch": 2.3039533376539207, + "grad_norm": 0.07248183339834213, + "learning_rate": 9.422566272109457e-05, + "loss": 0.267, + "step": 28440 + }, + { + "epoch": 2.3040343486714194, + "grad_norm": 0.06602407246828079, + "learning_rate": 9.422116206849994e-05, + "loss": 0.2441, + "step": 28441 + }, + { + "epoch": 2.3041153596889177, + "grad_norm": 0.07264547795057297, + "learning_rate": 9.421666141590532e-05, + "loss": 0.2265, + "step": 28442 + }, + { + "epoch": 2.304196370706416, + "grad_norm": 0.06024101749062538, + "learning_rate": 9.421216076331069e-05, + "loss": 0.2226, + "step": 28443 + }, + { + "epoch": 2.3042773817239146, + "grad_norm": 0.06888773292303085, + "learning_rate": 9.420766011071606e-05, + "loss": 0.2748, + "step": 28444 + 
}, + { + "epoch": 2.304358392741413, + "grad_norm": 0.06901773065328598, + "learning_rate": 9.420315945812144e-05, + "loss": 0.2888, + "step": 28445 + }, + { + "epoch": 2.304439403758911, + "grad_norm": 0.07405581325292587, + "learning_rate": 9.419865880552681e-05, + "loss": 0.2954, + "step": 28446 + }, + { + "epoch": 2.30452041477641, + "grad_norm": 0.053265344351530075, + "learning_rate": 9.419415815293218e-05, + "loss": 0.2623, + "step": 28447 + }, + { + "epoch": 2.304601425793908, + "grad_norm": 0.05269251763820648, + "learning_rate": 9.418965750033756e-05, + "loss": 0.2577, + "step": 28448 + }, + { + "epoch": 2.3046824368114063, + "grad_norm": 0.06693107634782791, + "learning_rate": 9.418515684774293e-05, + "loss": 0.2662, + "step": 28449 + }, + { + "epoch": 2.304763447828905, + "grad_norm": 0.06313629448413849, + "learning_rate": 9.41806561951483e-05, + "loss": 0.226, + "step": 28450 + }, + { + "epoch": 2.304844458846403, + "grad_norm": 0.0705796480178833, + "learning_rate": 9.417615554255368e-05, + "loss": 0.2718, + "step": 28451 + }, + { + "epoch": 2.3049254698639015, + "grad_norm": 0.05864161252975464, + "learning_rate": 9.417165488995905e-05, + "loss": 0.2517, + "step": 28452 + }, + { + "epoch": 2.3050064808813997, + "grad_norm": 0.060567956417798996, + "learning_rate": 9.416715423736443e-05, + "loss": 0.2667, + "step": 28453 + }, + { + "epoch": 2.3050874918988984, + "grad_norm": 0.0734984502196312, + "learning_rate": 9.41626535847698e-05, + "loss": 0.2583, + "step": 28454 + }, + { + "epoch": 2.3051685029163966, + "grad_norm": 0.06271208077669144, + "learning_rate": 9.415815293217517e-05, + "loss": 0.241, + "step": 28455 + }, + { + "epoch": 2.305249513933895, + "grad_norm": 0.07750144600868225, + "learning_rate": 9.415365227958055e-05, + "loss": 0.273, + "step": 28456 + }, + { + "epoch": 2.3053305249513936, + "grad_norm": 0.057265881448984146, + "learning_rate": 9.414915162698592e-05, + "loss": 0.233, + "step": 28457 + }, + { + "epoch": 2.305411535968892, + "grad_norm": 0.0688074454665184, + "learning_rate": 9.414465097439129e-05, + "loss": 0.2528, + "step": 28458 + }, + { + "epoch": 2.30549254698639, + "grad_norm": 0.08131054043769836, + "learning_rate": 9.414015032179667e-05, + "loss": 0.2691, + "step": 28459 + }, + { + "epoch": 2.3055735580038883, + "grad_norm": 0.056380532681941986, + "learning_rate": 9.413564966920204e-05, + "loss": 0.2488, + "step": 28460 + }, + { + "epoch": 2.305654569021387, + "grad_norm": 0.06920839846134186, + "learning_rate": 9.413114901660741e-05, + "loss": 0.2732, + "step": 28461 + }, + { + "epoch": 2.3057355800388852, + "grad_norm": 0.06537441164255142, + "learning_rate": 9.412664836401279e-05, + "loss": 0.2125, + "step": 28462 + }, + { + "epoch": 2.3058165910563835, + "grad_norm": 0.0647144690155983, + "learning_rate": 9.412214771141816e-05, + "loss": 0.2727, + "step": 28463 + }, + { + "epoch": 2.305897602073882, + "grad_norm": 0.05751211941242218, + "learning_rate": 9.411764705882353e-05, + "loss": 0.2527, + "step": 28464 + }, + { + "epoch": 2.3059786130913804, + "grad_norm": 0.058670178055763245, + "learning_rate": 9.411314640622891e-05, + "loss": 0.2362, + "step": 28465 + }, + { + "epoch": 2.3060596241088787, + "grad_norm": 0.0812852680683136, + "learning_rate": 9.410864575363428e-05, + "loss": 0.2728, + "step": 28466 + }, + { + "epoch": 2.3061406351263773, + "grad_norm": 0.06273907423019409, + "learning_rate": 9.410414510103966e-05, + "loss": 0.276, + "step": 28467 + }, + { + "epoch": 2.3062216461438756, + "grad_norm": 0.06311316788196564, + 
"learning_rate": 9.409964444844503e-05, + "loss": 0.2381, + "step": 28468 + }, + { + "epoch": 2.306302657161374, + "grad_norm": 0.08670295029878616, + "learning_rate": 9.40951437958504e-05, + "loss": 0.2965, + "step": 28469 + }, + { + "epoch": 2.3063836681788725, + "grad_norm": 0.06879959255456924, + "learning_rate": 9.409064314325578e-05, + "loss": 0.2687, + "step": 28470 + }, + { + "epoch": 2.3064646791963708, + "grad_norm": 0.0669282004237175, + "learning_rate": 9.408614249066115e-05, + "loss": 0.2588, + "step": 28471 + }, + { + "epoch": 2.306545690213869, + "grad_norm": 0.06189112365245819, + "learning_rate": 9.408164183806652e-05, + "loss": 0.2502, + "step": 28472 + }, + { + "epoch": 2.3066267012313677, + "grad_norm": 0.07936570793390274, + "learning_rate": 9.40771411854719e-05, + "loss": 0.2687, + "step": 28473 + }, + { + "epoch": 2.306707712248866, + "grad_norm": 0.08174500614404678, + "learning_rate": 9.407264053287727e-05, + "loss": 0.2332, + "step": 28474 + }, + { + "epoch": 2.306788723266364, + "grad_norm": 0.06000255048274994, + "learning_rate": 9.406813988028264e-05, + "loss": 0.2489, + "step": 28475 + }, + { + "epoch": 2.3068697342838624, + "grad_norm": 0.08085722476243973, + "learning_rate": 9.406363922768802e-05, + "loss": 0.2647, + "step": 28476 + }, + { + "epoch": 2.306950745301361, + "grad_norm": 0.06643027812242508, + "learning_rate": 9.405913857509339e-05, + "loss": 0.227, + "step": 28477 + }, + { + "epoch": 2.3070317563188594, + "grad_norm": 0.06923480331897736, + "learning_rate": 9.405463792249877e-05, + "loss": 0.2638, + "step": 28478 + }, + { + "epoch": 2.3071127673363576, + "grad_norm": 0.058105580508708954, + "learning_rate": 9.405013726990414e-05, + "loss": 0.2462, + "step": 28479 + }, + { + "epoch": 2.3071937783538563, + "grad_norm": 0.06581859290599823, + "learning_rate": 9.404563661730951e-05, + "loss": 0.2355, + "step": 28480 + }, + { + "epoch": 2.3072747893713546, + "grad_norm": 0.0653432309627533, + "learning_rate": 9.404113596471489e-05, + "loss": 0.2852, + "step": 28481 + }, + { + "epoch": 2.307355800388853, + "grad_norm": 0.06128329411149025, + "learning_rate": 9.403663531212026e-05, + "loss": 0.2594, + "step": 28482 + }, + { + "epoch": 2.307436811406351, + "grad_norm": 0.06882143020629883, + "learning_rate": 9.403213465952563e-05, + "loss": 0.2442, + "step": 28483 + }, + { + "epoch": 2.3075178224238497, + "grad_norm": 0.060445886105298996, + "learning_rate": 9.402763400693101e-05, + "loss": 0.2295, + "step": 28484 + }, + { + "epoch": 2.307598833441348, + "grad_norm": 0.10045522451400757, + "learning_rate": 9.402313335433638e-05, + "loss": 0.2667, + "step": 28485 + }, + { + "epoch": 2.307679844458846, + "grad_norm": 0.06006335839629173, + "learning_rate": 9.401863270174175e-05, + "loss": 0.2093, + "step": 28486 + }, + { + "epoch": 2.307760855476345, + "grad_norm": 0.06454713642597198, + "learning_rate": 9.401413204914713e-05, + "loss": 0.2232, + "step": 28487 + }, + { + "epoch": 2.307841866493843, + "grad_norm": 0.061691079288721085, + "learning_rate": 9.40096313965525e-05, + "loss": 0.288, + "step": 28488 + }, + { + "epoch": 2.3079228775113414, + "grad_norm": 0.05934334546327591, + "learning_rate": 9.400513074395787e-05, + "loss": 0.2301, + "step": 28489 + }, + { + "epoch": 2.30800388852884, + "grad_norm": 0.07355938851833344, + "learning_rate": 9.400063009136325e-05, + "loss": 0.2563, + "step": 28490 + }, + { + "epoch": 2.3080848995463383, + "grad_norm": 0.051536675542593, + "learning_rate": 9.399612943876862e-05, + "loss": 0.2287, + "step": 28491 + 
}, + { + "epoch": 2.3081659105638366, + "grad_norm": 0.06216194108128548, + "learning_rate": 9.3991628786174e-05, + "loss": 0.2532, + "step": 28492 + }, + { + "epoch": 2.3082469215813353, + "grad_norm": 0.07674593478441238, + "learning_rate": 9.398712813357937e-05, + "loss": 0.2364, + "step": 28493 + }, + { + "epoch": 2.3083279325988335, + "grad_norm": 0.0691610649228096, + "learning_rate": 9.398262748098474e-05, + "loss": 0.2353, + "step": 28494 + }, + { + "epoch": 2.3084089436163318, + "grad_norm": 0.05119806528091431, + "learning_rate": 9.397812682839012e-05, + "loss": 0.2663, + "step": 28495 + }, + { + "epoch": 2.3084899546338304, + "grad_norm": 0.061490509659051895, + "learning_rate": 9.397362617579549e-05, + "loss": 0.2408, + "step": 28496 + }, + { + "epoch": 2.3085709656513287, + "grad_norm": 0.06719590723514557, + "learning_rate": 9.396912552320088e-05, + "loss": 0.2429, + "step": 28497 + }, + { + "epoch": 2.308651976668827, + "grad_norm": 0.0660691112279892, + "learning_rate": 9.396462487060624e-05, + "loss": 0.2449, + "step": 28498 + }, + { + "epoch": 2.308732987686325, + "grad_norm": 0.08293015509843826, + "learning_rate": 9.396012421801161e-05, + "loss": 0.2525, + "step": 28499 + }, + { + "epoch": 2.308813998703824, + "grad_norm": 0.07510707527399063, + "learning_rate": 9.3955623565417e-05, + "loss": 0.2675, + "step": 28500 + }, + { + "epoch": 2.308895009721322, + "grad_norm": 0.06140323728322983, + "learning_rate": 9.395112291282236e-05, + "loss": 0.2454, + "step": 28501 + }, + { + "epoch": 2.3089760207388204, + "grad_norm": 0.07450032234191895, + "learning_rate": 9.394662226022773e-05, + "loss": 0.2273, + "step": 28502 + }, + { + "epoch": 2.309057031756319, + "grad_norm": 0.08496350795030594, + "learning_rate": 9.394212160763312e-05, + "loss": 0.238, + "step": 28503 + }, + { + "epoch": 2.3091380427738173, + "grad_norm": 0.052981045097112656, + "learning_rate": 9.393762095503848e-05, + "loss": 0.1997, + "step": 28504 + }, + { + "epoch": 2.3092190537913155, + "grad_norm": 0.05658174306154251, + "learning_rate": 9.393312030244385e-05, + "loss": 0.2151, + "step": 28505 + }, + { + "epoch": 2.309300064808814, + "grad_norm": 0.0606054812669754, + "learning_rate": 9.392861964984924e-05, + "loss": 0.2466, + "step": 28506 + }, + { + "epoch": 2.3093810758263125, + "grad_norm": 0.054300349205732346, + "learning_rate": 9.392411899725461e-05, + "loss": 0.2442, + "step": 28507 + }, + { + "epoch": 2.3094620868438107, + "grad_norm": 0.06346888840198517, + "learning_rate": 9.391961834465997e-05, + "loss": 0.2301, + "step": 28508 + }, + { + "epoch": 2.309543097861309, + "grad_norm": 0.060989703983068466, + "learning_rate": 9.391511769206536e-05, + "loss": 0.257, + "step": 28509 + }, + { + "epoch": 2.3096241088788076, + "grad_norm": 0.05664192885160446, + "learning_rate": 9.391061703947073e-05, + "loss": 0.25, + "step": 28510 + }, + { + "epoch": 2.309705119896306, + "grad_norm": 0.06328573077917099, + "learning_rate": 9.39061163868761e-05, + "loss": 0.2381, + "step": 28511 + }, + { + "epoch": 2.309786130913804, + "grad_norm": 0.052768316119909286, + "learning_rate": 9.390161573428148e-05, + "loss": 0.2359, + "step": 28512 + }, + { + "epoch": 2.309867141931303, + "grad_norm": 0.06358731538057327, + "learning_rate": 9.389711508168685e-05, + "loss": 0.2454, + "step": 28513 + }, + { + "epoch": 2.309948152948801, + "grad_norm": 0.056090958416461945, + "learning_rate": 9.389261442909221e-05, + "loss": 0.244, + "step": 28514 + }, + { + "epoch": 2.3100291639662993, + "grad_norm": 0.07474549859762192, + 
"learning_rate": 9.38881137764976e-05, + "loss": 0.2323, + "step": 28515 + }, + { + "epoch": 2.310110174983798, + "grad_norm": 0.058619070798158646, + "learning_rate": 9.388361312390298e-05, + "loss": 0.2427, + "step": 28516 + }, + { + "epoch": 2.3101911860012962, + "grad_norm": 0.0642256885766983, + "learning_rate": 9.387911247130834e-05, + "loss": 0.2488, + "step": 28517 + }, + { + "epoch": 2.3102721970187945, + "grad_norm": 0.07105040550231934, + "learning_rate": 9.387461181871372e-05, + "loss": 0.2636, + "step": 28518 + }, + { + "epoch": 2.310353208036293, + "grad_norm": 0.07639192044734955, + "learning_rate": 9.38701111661191e-05, + "loss": 0.2961, + "step": 28519 + }, + { + "epoch": 2.3104342190537914, + "grad_norm": 0.06323409825563431, + "learning_rate": 9.386561051352446e-05, + "loss": 0.2579, + "step": 28520 + }, + { + "epoch": 2.3105152300712897, + "grad_norm": 0.07321860641241074, + "learning_rate": 9.386110986092984e-05, + "loss": 0.2771, + "step": 28521 + }, + { + "epoch": 2.310596241088788, + "grad_norm": 0.0523768812417984, + "learning_rate": 9.385660920833522e-05, + "loss": 0.2457, + "step": 28522 + }, + { + "epoch": 2.3106772521062866, + "grad_norm": 0.05895464867353439, + "learning_rate": 9.385210855574059e-05, + "loss": 0.2525, + "step": 28523 + }, + { + "epoch": 2.310758263123785, + "grad_norm": 0.06829524785280228, + "learning_rate": 9.384760790314596e-05, + "loss": 0.2227, + "step": 28524 + }, + { + "epoch": 2.310839274141283, + "grad_norm": 0.06126768887042999, + "learning_rate": 9.384310725055134e-05, + "loss": 0.2504, + "step": 28525 + }, + { + "epoch": 2.310920285158782, + "grad_norm": 0.06391113251447678, + "learning_rate": 9.383860659795671e-05, + "loss": 0.2241, + "step": 28526 + }, + { + "epoch": 2.31100129617628, + "grad_norm": 0.07911943644285202, + "learning_rate": 9.383410594536209e-05, + "loss": 0.2581, + "step": 28527 + }, + { + "epoch": 2.3110823071937783, + "grad_norm": 0.05488322302699089, + "learning_rate": 9.382960529276746e-05, + "loss": 0.2314, + "step": 28528 + }, + { + "epoch": 2.3111633182112765, + "grad_norm": 0.050786539912223816, + "learning_rate": 9.382510464017283e-05, + "loss": 0.2464, + "step": 28529 + }, + { + "epoch": 2.311244329228775, + "grad_norm": 0.07747553288936615, + "learning_rate": 9.38206039875782e-05, + "loss": 0.255, + "step": 28530 + }, + { + "epoch": 2.3113253402462735, + "grad_norm": 0.06674326956272125, + "learning_rate": 9.381610333498358e-05, + "loss": 0.2351, + "step": 28531 + }, + { + "epoch": 2.3114063512637717, + "grad_norm": 0.07949559390544891, + "learning_rate": 9.381160268238895e-05, + "loss": 0.2757, + "step": 28532 + }, + { + "epoch": 2.3114873622812704, + "grad_norm": 0.06439467519521713, + "learning_rate": 9.380710202979433e-05, + "loss": 0.2233, + "step": 28533 + }, + { + "epoch": 2.3115683732987686, + "grad_norm": 0.06354320049285889, + "learning_rate": 9.38026013771997e-05, + "loss": 0.2357, + "step": 28534 + }, + { + "epoch": 2.311649384316267, + "grad_norm": 0.06903377175331116, + "learning_rate": 9.379810072460507e-05, + "loss": 0.2546, + "step": 28535 + }, + { + "epoch": 2.3117303953337656, + "grad_norm": 0.05698674917221069, + "learning_rate": 9.379360007201045e-05, + "loss": 0.2392, + "step": 28536 + }, + { + "epoch": 2.311811406351264, + "grad_norm": 0.07843542098999023, + "learning_rate": 9.378909941941582e-05, + "loss": 0.3068, + "step": 28537 + }, + { + "epoch": 2.311892417368762, + "grad_norm": 0.06648211926221848, + "learning_rate": 9.37845987668212e-05, + "loss": 0.2375, + "step": 28538 + 
}, + { + "epoch": 2.3119734283862607, + "grad_norm": 0.05423009395599365, + "learning_rate": 9.378009811422657e-05, + "loss": 0.2255, + "step": 28539 + }, + { + "epoch": 2.312054439403759, + "grad_norm": 0.054902784526348114, + "learning_rate": 9.377559746163194e-05, + "loss": 0.2453, + "step": 28540 + }, + { + "epoch": 2.3121354504212572, + "grad_norm": 0.060616981238126755, + "learning_rate": 9.377109680903732e-05, + "loss": 0.2776, + "step": 28541 + }, + { + "epoch": 2.3122164614387555, + "grad_norm": 0.06343099474906921, + "learning_rate": 9.376659615644269e-05, + "loss": 0.2778, + "step": 28542 + }, + { + "epoch": 2.312297472456254, + "grad_norm": 0.06430594623088837, + "learning_rate": 9.376209550384806e-05, + "loss": 0.2446, + "step": 28543 + }, + { + "epoch": 2.3123784834737524, + "grad_norm": 0.07691385596990585, + "learning_rate": 9.375759485125344e-05, + "loss": 0.254, + "step": 28544 + }, + { + "epoch": 2.3124594944912507, + "grad_norm": 0.06930742412805557, + "learning_rate": 9.375309419865881e-05, + "loss": 0.2614, + "step": 28545 + }, + { + "epoch": 2.3125405055087493, + "grad_norm": 0.07686679810285568, + "learning_rate": 9.374859354606418e-05, + "loss": 0.2806, + "step": 28546 + }, + { + "epoch": 2.3126215165262476, + "grad_norm": 0.05961774289608002, + "learning_rate": 9.374409289346956e-05, + "loss": 0.2176, + "step": 28547 + }, + { + "epoch": 2.312702527543746, + "grad_norm": 0.05253405496478081, + "learning_rate": 9.373959224087493e-05, + "loss": 0.2275, + "step": 28548 + }, + { + "epoch": 2.3127835385612445, + "grad_norm": 0.055466219782829285, + "learning_rate": 9.37350915882803e-05, + "loss": 0.3152, + "step": 28549 + }, + { + "epoch": 2.3128645495787428, + "grad_norm": 0.06095070019364357, + "learning_rate": 9.373059093568568e-05, + "loss": 0.3214, + "step": 28550 + }, + { + "epoch": 2.312945560596241, + "grad_norm": 0.06326496601104736, + "learning_rate": 9.372609028309105e-05, + "loss": 0.2667, + "step": 28551 + }, + { + "epoch": 2.3130265716137393, + "grad_norm": 0.06290464848279953, + "learning_rate": 9.372158963049643e-05, + "loss": 0.2538, + "step": 28552 + }, + { + "epoch": 2.313107582631238, + "grad_norm": 0.061679188162088394, + "learning_rate": 9.37170889779018e-05, + "loss": 0.2405, + "step": 28553 + }, + { + "epoch": 2.313188593648736, + "grad_norm": 0.07740036398172379, + "learning_rate": 9.371258832530717e-05, + "loss": 0.2171, + "step": 28554 + }, + { + "epoch": 2.3132696046662344, + "grad_norm": 0.06478272378444672, + "learning_rate": 9.370808767271255e-05, + "loss": 0.2422, + "step": 28555 + }, + { + "epoch": 2.313350615683733, + "grad_norm": 0.0558452382683754, + "learning_rate": 9.370358702011792e-05, + "loss": 0.2632, + "step": 28556 + }, + { + "epoch": 2.3134316267012314, + "grad_norm": 0.05222080647945404, + "learning_rate": 9.36990863675233e-05, + "loss": 0.2371, + "step": 28557 + }, + { + "epoch": 2.3135126377187296, + "grad_norm": 0.06738220900297165, + "learning_rate": 9.369458571492867e-05, + "loss": 0.2556, + "step": 28558 + }, + { + "epoch": 2.3135936487362283, + "grad_norm": 0.0675593838095665, + "learning_rate": 9.369008506233404e-05, + "loss": 0.2543, + "step": 28559 + }, + { + "epoch": 2.3136746597537265, + "grad_norm": 0.06700984388589859, + "learning_rate": 9.368558440973941e-05, + "loss": 0.217, + "step": 28560 + }, + { + "epoch": 2.313755670771225, + "grad_norm": 0.07644146680831909, + "learning_rate": 9.368108375714479e-05, + "loss": 0.2359, + "step": 28561 + }, + { + "epoch": 2.3138366817887235, + "grad_norm": 
0.055144716054201126, + "learning_rate": 9.367658310455016e-05, + "loss": 0.2421, + "step": 28562 + }, + { + "epoch": 2.3139176928062217, + "grad_norm": 0.06547948718070984, + "learning_rate": 9.367208245195554e-05, + "loss": 0.218, + "step": 28563 + }, + { + "epoch": 2.31399870382372, + "grad_norm": 0.06000036746263504, + "learning_rate": 9.366758179936091e-05, + "loss": 0.2309, + "step": 28564 + }, + { + "epoch": 2.314079714841218, + "grad_norm": 0.04735422134399414, + "learning_rate": 9.366308114676628e-05, + "loss": 0.2643, + "step": 28565 + }, + { + "epoch": 2.314160725858717, + "grad_norm": 0.07013414055109024, + "learning_rate": 9.365858049417166e-05, + "loss": 0.2308, + "step": 28566 + }, + { + "epoch": 2.314241736876215, + "grad_norm": 0.05629046633839607, + "learning_rate": 9.365407984157703e-05, + "loss": 0.2114, + "step": 28567 + }, + { + "epoch": 2.3143227478937134, + "grad_norm": 0.06310821324586868, + "learning_rate": 9.36495791889824e-05, + "loss": 0.2241, + "step": 28568 + }, + { + "epoch": 2.314403758911212, + "grad_norm": 0.06148114427924156, + "learning_rate": 9.364507853638778e-05, + "loss": 0.2802, + "step": 28569 + }, + { + "epoch": 2.3144847699287103, + "grad_norm": 0.07152937352657318, + "learning_rate": 9.364057788379315e-05, + "loss": 0.2365, + "step": 28570 + }, + { + "epoch": 2.3145657809462086, + "grad_norm": 0.07520348578691483, + "learning_rate": 9.363607723119852e-05, + "loss": 0.2384, + "step": 28571 + }, + { + "epoch": 2.314646791963707, + "grad_norm": 0.07106326520442963, + "learning_rate": 9.36315765786039e-05, + "loss": 0.2678, + "step": 28572 + }, + { + "epoch": 2.3147278029812055, + "grad_norm": 0.08119279891252518, + "learning_rate": 9.362707592600927e-05, + "loss": 0.2913, + "step": 28573 + }, + { + "epoch": 2.3148088139987038, + "grad_norm": 0.07053650170564651, + "learning_rate": 9.362257527341464e-05, + "loss": 0.2389, + "step": 28574 + }, + { + "epoch": 2.314889825016202, + "grad_norm": 0.050931304693222046, + "learning_rate": 9.361807462082003e-05, + "loss": 0.2577, + "step": 28575 + }, + { + "epoch": 2.3149708360337007, + "grad_norm": 0.07329791784286499, + "learning_rate": 9.36135739682254e-05, + "loss": 0.207, + "step": 28576 + }, + { + "epoch": 2.315051847051199, + "grad_norm": 0.07313059270381927, + "learning_rate": 9.360907331563077e-05, + "loss": 0.2609, + "step": 28577 + }, + { + "epoch": 2.315132858068697, + "grad_norm": 0.04392017424106598, + "learning_rate": 9.360457266303615e-05, + "loss": 0.2144, + "step": 28578 + }, + { + "epoch": 2.315213869086196, + "grad_norm": 0.07469665259122849, + "learning_rate": 9.360007201044153e-05, + "loss": 0.2913, + "step": 28579 + }, + { + "epoch": 2.315294880103694, + "grad_norm": 0.08206086605787277, + "learning_rate": 9.359557135784689e-05, + "loss": 0.2466, + "step": 28580 + }, + { + "epoch": 2.3153758911211924, + "grad_norm": 0.06451928615570068, + "learning_rate": 9.359107070525227e-05, + "loss": 0.2784, + "step": 28581 + }, + { + "epoch": 2.315456902138691, + "grad_norm": 0.06024431437253952, + "learning_rate": 9.358657005265765e-05, + "loss": 0.2635, + "step": 28582 + }, + { + "epoch": 2.3155379131561893, + "grad_norm": 0.053838253021240234, + "learning_rate": 9.358206940006301e-05, + "loss": 0.2289, + "step": 28583 + }, + { + "epoch": 2.3156189241736875, + "grad_norm": 0.08351702243089676, + "learning_rate": 9.35775687474684e-05, + "loss": 0.272, + "step": 28584 + }, + { + "epoch": 2.315699935191186, + "grad_norm": 0.06420436501502991, + "learning_rate": 9.357306809487377e-05, + "loss": 
0.2401, + "step": 28585 + }, + { + "epoch": 2.3157809462086845, + "grad_norm": 0.07795728743076324, + "learning_rate": 9.356856744227913e-05, + "loss": 0.2637, + "step": 28586 + }, + { + "epoch": 2.3158619572261827, + "grad_norm": 0.057210199534893036, + "learning_rate": 9.356406678968452e-05, + "loss": 0.2675, + "step": 28587 + }, + { + "epoch": 2.315942968243681, + "grad_norm": 0.05689553543925285, + "learning_rate": 9.355956613708989e-05, + "loss": 0.2559, + "step": 28588 + }, + { + "epoch": 2.3160239792611796, + "grad_norm": 0.08148650825023651, + "learning_rate": 9.355506548449525e-05, + "loss": 0.2419, + "step": 28589 + }, + { + "epoch": 2.316104990278678, + "grad_norm": 0.08246307820081711, + "learning_rate": 9.355056483190064e-05, + "loss": 0.2181, + "step": 28590 + }, + { + "epoch": 2.316186001296176, + "grad_norm": 0.07857725024223328, + "learning_rate": 9.354606417930601e-05, + "loss": 0.2313, + "step": 28591 + }, + { + "epoch": 2.316267012313675, + "grad_norm": 0.05609336495399475, + "learning_rate": 9.354156352671137e-05, + "loss": 0.2738, + "step": 28592 + }, + { + "epoch": 2.316348023331173, + "grad_norm": 0.06364389508962631, + "learning_rate": 9.353706287411676e-05, + "loss": 0.278, + "step": 28593 + }, + { + "epoch": 2.3164290343486713, + "grad_norm": 0.07805577665567398, + "learning_rate": 9.353256222152213e-05, + "loss": 0.2652, + "step": 28594 + }, + { + "epoch": 2.3165100453661696, + "grad_norm": 0.07056976109743118, + "learning_rate": 9.352806156892749e-05, + "loss": 0.2265, + "step": 28595 + }, + { + "epoch": 2.3165910563836682, + "grad_norm": 0.06576458364725113, + "learning_rate": 9.352356091633288e-05, + "loss": 0.2315, + "step": 28596 + }, + { + "epoch": 2.3166720674011665, + "grad_norm": 0.06922286748886108, + "learning_rate": 9.351906026373825e-05, + "loss": 0.275, + "step": 28597 + }, + { + "epoch": 2.3167530784186647, + "grad_norm": 0.0591658353805542, + "learning_rate": 9.351455961114361e-05, + "loss": 0.2421, + "step": 28598 + }, + { + "epoch": 2.3168340894361634, + "grad_norm": 0.0667690709233284, + "learning_rate": 9.3510058958549e-05, + "loss": 0.2788, + "step": 28599 + }, + { + "epoch": 2.3169151004536617, + "grad_norm": 0.0834415853023529, + "learning_rate": 9.350555830595437e-05, + "loss": 0.2597, + "step": 28600 + }, + { + "epoch": 2.31699611147116, + "grad_norm": 0.06573021411895752, + "learning_rate": 9.350105765335975e-05, + "loss": 0.2709, + "step": 28601 + }, + { + "epoch": 2.3170771224886586, + "grad_norm": 0.06178218126296997, + "learning_rate": 9.349655700076512e-05, + "loss": 0.2495, + "step": 28602 + }, + { + "epoch": 2.317158133506157, + "grad_norm": 0.061735160648822784, + "learning_rate": 9.349205634817049e-05, + "loss": 0.2539, + "step": 28603 + }, + { + "epoch": 2.317239144523655, + "grad_norm": 0.09074624627828598, + "learning_rate": 9.348755569557587e-05, + "loss": 0.3201, + "step": 28604 + }, + { + "epoch": 2.317320155541154, + "grad_norm": 0.06366878747940063, + "learning_rate": 9.348305504298124e-05, + "loss": 0.2549, + "step": 28605 + }, + { + "epoch": 2.317401166558652, + "grad_norm": 0.06706615537405014, + "learning_rate": 9.347855439038661e-05, + "loss": 0.2593, + "step": 28606 + }, + { + "epoch": 2.3174821775761503, + "grad_norm": 0.058599237352609634, + "learning_rate": 9.347405373779199e-05, + "loss": 0.2496, + "step": 28607 + }, + { + "epoch": 2.317563188593649, + "grad_norm": 0.07354261726140976, + "learning_rate": 9.346955308519736e-05, + "loss": 0.2655, + "step": 28608 + }, + { + "epoch": 2.317644199611147, + 
"grad_norm": 0.05688546970486641, + "learning_rate": 9.346505243260273e-05, + "loss": 0.2277, + "step": 28609 + }, + { + "epoch": 2.3177252106286454, + "grad_norm": 0.06739480793476105, + "learning_rate": 9.346055178000811e-05, + "loss": 0.2541, + "step": 28610 + }, + { + "epoch": 2.3178062216461437, + "grad_norm": 0.06607834249734879, + "learning_rate": 9.345605112741348e-05, + "loss": 0.2723, + "step": 28611 + }, + { + "epoch": 2.3178872326636424, + "grad_norm": 0.06151716411113739, + "learning_rate": 9.345155047481886e-05, + "loss": 0.2445, + "step": 28612 + }, + { + "epoch": 2.3179682436811406, + "grad_norm": 0.058197587728500366, + "learning_rate": 9.344704982222423e-05, + "loss": 0.2327, + "step": 28613 + }, + { + "epoch": 2.318049254698639, + "grad_norm": 0.06791552901268005, + "learning_rate": 9.34425491696296e-05, + "loss": 0.2655, + "step": 28614 + }, + { + "epoch": 2.3181302657161376, + "grad_norm": 0.0670228824019432, + "learning_rate": 9.343804851703498e-05, + "loss": 0.2915, + "step": 28615 + }, + { + "epoch": 2.318211276733636, + "grad_norm": 0.07326885312795639, + "learning_rate": 9.343354786444035e-05, + "loss": 0.3023, + "step": 28616 + }, + { + "epoch": 2.318292287751134, + "grad_norm": 0.05599506199359894, + "learning_rate": 9.342904721184572e-05, + "loss": 0.2733, + "step": 28617 + }, + { + "epoch": 2.3183732987686323, + "grad_norm": 0.06617167592048645, + "learning_rate": 9.34245465592511e-05, + "loss": 0.2475, + "step": 28618 + }, + { + "epoch": 2.318454309786131, + "grad_norm": 0.05749017000198364, + "learning_rate": 9.342004590665647e-05, + "loss": 0.2628, + "step": 28619 + }, + { + "epoch": 2.3185353208036292, + "grad_norm": 0.07162825018167496, + "learning_rate": 9.341554525406184e-05, + "loss": 0.2922, + "step": 28620 + }, + { + "epoch": 2.3186163318211275, + "grad_norm": 0.06103399768471718, + "learning_rate": 9.341104460146722e-05, + "loss": 0.2815, + "step": 28621 + }, + { + "epoch": 2.318697342838626, + "grad_norm": 0.05452466011047363, + "learning_rate": 9.340654394887259e-05, + "loss": 0.2352, + "step": 28622 + }, + { + "epoch": 2.3187783538561244, + "grad_norm": 0.06113947182893753, + "learning_rate": 9.340204329627796e-05, + "loss": 0.254, + "step": 28623 + }, + { + "epoch": 2.3188593648736227, + "grad_norm": 0.0528695210814476, + "learning_rate": 9.339754264368334e-05, + "loss": 0.2403, + "step": 28624 + }, + { + "epoch": 2.3189403758911213, + "grad_norm": 0.07002796232700348, + "learning_rate": 9.339304199108871e-05, + "loss": 0.233, + "step": 28625 + }, + { + "epoch": 2.3190213869086196, + "grad_norm": 0.059731412678956985, + "learning_rate": 9.338854133849409e-05, + "loss": 0.259, + "step": 28626 + }, + { + "epoch": 2.319102397926118, + "grad_norm": 0.06515996903181076, + "learning_rate": 9.338404068589946e-05, + "loss": 0.2598, + "step": 28627 + }, + { + "epoch": 2.3191834089436165, + "grad_norm": 0.05637449026107788, + "learning_rate": 9.337954003330483e-05, + "loss": 0.2327, + "step": 28628 + }, + { + "epoch": 2.3192644199611148, + "grad_norm": 0.05539031699299812, + "learning_rate": 9.33750393807102e-05, + "loss": 0.235, + "step": 28629 + }, + { + "epoch": 2.319345430978613, + "grad_norm": 0.0662870705127716, + "learning_rate": 9.337053872811558e-05, + "loss": 0.2741, + "step": 28630 + }, + { + "epoch": 2.3194264419961117, + "grad_norm": 0.06665057688951492, + "learning_rate": 9.336603807552095e-05, + "loss": 0.2643, + "step": 28631 + }, + { + "epoch": 2.31950745301361, + "grad_norm": 0.05518549680709839, + "learning_rate": 9.336153742292633e-05, 
+ "loss": 0.2225, + "step": 28632 + }, + { + "epoch": 2.319588464031108, + "grad_norm": 0.0644635409116745, + "learning_rate": 9.33570367703317e-05, + "loss": 0.2178, + "step": 28633 + }, + { + "epoch": 2.3196694750486064, + "grad_norm": 0.060210924595594406, + "learning_rate": 9.335253611773707e-05, + "loss": 0.3026, + "step": 28634 + }, + { + "epoch": 2.319750486066105, + "grad_norm": 0.06755194813013077, + "learning_rate": 9.334803546514245e-05, + "loss": 0.2671, + "step": 28635 + }, + { + "epoch": 2.3198314970836034, + "grad_norm": 0.06527794897556305, + "learning_rate": 9.334353481254782e-05, + "loss": 0.2682, + "step": 28636 + }, + { + "epoch": 2.3199125081011016, + "grad_norm": 0.059565238654613495, + "learning_rate": 9.33390341599532e-05, + "loss": 0.2281, + "step": 28637 + }, + { + "epoch": 2.3199935191186003, + "grad_norm": 0.07178928703069687, + "learning_rate": 9.333453350735857e-05, + "loss": 0.2453, + "step": 28638 + }, + { + "epoch": 2.3200745301360985, + "grad_norm": 0.05469311401247978, + "learning_rate": 9.333003285476394e-05, + "loss": 0.2421, + "step": 28639 + }, + { + "epoch": 2.320155541153597, + "grad_norm": 0.06045643240213394, + "learning_rate": 9.332553220216932e-05, + "loss": 0.2452, + "step": 28640 + }, + { + "epoch": 2.320236552171095, + "grad_norm": 0.056732453405857086, + "learning_rate": 9.332103154957469e-05, + "loss": 0.2053, + "step": 28641 + }, + { + "epoch": 2.3203175631885937, + "grad_norm": 0.06491853296756744, + "learning_rate": 9.331653089698008e-05, + "loss": 0.2578, + "step": 28642 + }, + { + "epoch": 2.320398574206092, + "grad_norm": 0.06337980180978775, + "learning_rate": 9.331203024438544e-05, + "loss": 0.2679, + "step": 28643 + }, + { + "epoch": 2.32047958522359, + "grad_norm": 0.06434284150600433, + "learning_rate": 9.330752959179081e-05, + "loss": 0.249, + "step": 28644 + }, + { + "epoch": 2.320560596241089, + "grad_norm": 0.06914211064577103, + "learning_rate": 9.33030289391962e-05, + "loss": 0.2393, + "step": 28645 + }, + { + "epoch": 2.320641607258587, + "grad_norm": 0.08020402491092682, + "learning_rate": 9.329852828660156e-05, + "loss": 0.2562, + "step": 28646 + }, + { + "epoch": 2.3207226182760854, + "grad_norm": 0.061425480991601944, + "learning_rate": 9.329402763400693e-05, + "loss": 0.2476, + "step": 28647 + }, + { + "epoch": 2.320803629293584, + "grad_norm": 0.06337305903434753, + "learning_rate": 9.328952698141232e-05, + "loss": 0.2547, + "step": 28648 + }, + { + "epoch": 2.3208846403110823, + "grad_norm": 0.06028088927268982, + "learning_rate": 9.328502632881768e-05, + "loss": 0.2206, + "step": 28649 + }, + { + "epoch": 2.3209656513285806, + "grad_norm": 0.06460175663232803, + "learning_rate": 9.328052567622305e-05, + "loss": 0.2312, + "step": 28650 + }, + { + "epoch": 2.3210466623460793, + "grad_norm": 0.08044122904539108, + "learning_rate": 9.327602502362844e-05, + "loss": 0.2623, + "step": 28651 + }, + { + "epoch": 2.3211276733635775, + "grad_norm": 0.08063766360282898, + "learning_rate": 9.32715243710338e-05, + "loss": 0.2854, + "step": 28652 + }, + { + "epoch": 2.3212086843810757, + "grad_norm": 0.059449952095746994, + "learning_rate": 9.326702371843917e-05, + "loss": 0.2184, + "step": 28653 + }, + { + "epoch": 2.3212896953985744, + "grad_norm": 0.061719294637441635, + "learning_rate": 9.326252306584456e-05, + "loss": 0.2218, + "step": 28654 + }, + { + "epoch": 2.3213707064160727, + "grad_norm": 0.06940759718418121, + "learning_rate": 9.325802241324992e-05, + "loss": 0.2229, + "step": 28655 + }, + { + "epoch": 
2.321451717433571, + "grad_norm": 0.0604344978928566, + "learning_rate": 9.325352176065531e-05, + "loss": 0.2039, + "step": 28656 + }, + { + "epoch": 2.321532728451069, + "grad_norm": 0.06056111678481102, + "learning_rate": 9.324902110806068e-05, + "loss": 0.2522, + "step": 28657 + }, + { + "epoch": 2.321613739468568, + "grad_norm": 0.07482937723398209, + "learning_rate": 9.324452045546604e-05, + "loss": 0.2472, + "step": 28658 + }, + { + "epoch": 2.321694750486066, + "grad_norm": 0.055157896131277084, + "learning_rate": 9.324001980287143e-05, + "loss": 0.2505, + "step": 28659 + }, + { + "epoch": 2.3217757615035644, + "grad_norm": 0.062443807721138, + "learning_rate": 9.32355191502768e-05, + "loss": 0.2007, + "step": 28660 + }, + { + "epoch": 2.321856772521063, + "grad_norm": 0.06030425429344177, + "learning_rate": 9.323101849768216e-05, + "loss": 0.2203, + "step": 28661 + }, + { + "epoch": 2.3219377835385613, + "grad_norm": 0.07104106992483139, + "learning_rate": 9.322651784508755e-05, + "loss": 0.2457, + "step": 28662 + }, + { + "epoch": 2.3220187945560595, + "grad_norm": 0.06790819764137268, + "learning_rate": 9.322201719249292e-05, + "loss": 0.28, + "step": 28663 + }, + { + "epoch": 2.3220998055735578, + "grad_norm": 0.06834875047206879, + "learning_rate": 9.321751653989828e-05, + "loss": 0.2454, + "step": 28664 + }, + { + "epoch": 2.3221808165910565, + "grad_norm": 0.05928588658571243, + "learning_rate": 9.321301588730367e-05, + "loss": 0.2552, + "step": 28665 + }, + { + "epoch": 2.3222618276085547, + "grad_norm": 0.07721876353025436, + "learning_rate": 9.320851523470904e-05, + "loss": 0.2618, + "step": 28666 + }, + { + "epoch": 2.322342838626053, + "grad_norm": 0.0791599452495575, + "learning_rate": 9.32040145821144e-05, + "loss": 0.2464, + "step": 28667 + }, + { + "epoch": 2.3224238496435516, + "grad_norm": 0.06511277705430984, + "learning_rate": 9.319951392951979e-05, + "loss": 0.2918, + "step": 28668 + }, + { + "epoch": 2.32250486066105, + "grad_norm": 0.06426123529672623, + "learning_rate": 9.319501327692516e-05, + "loss": 0.223, + "step": 28669 + }, + { + "epoch": 2.322585871678548, + "grad_norm": 0.05680019408464432, + "learning_rate": 9.319051262433052e-05, + "loss": 0.2295, + "step": 28670 + }, + { + "epoch": 2.322666882696047, + "grad_norm": 0.07702506333589554, + "learning_rate": 9.318601197173591e-05, + "loss": 0.2666, + "step": 28671 + }, + { + "epoch": 2.322747893713545, + "grad_norm": 0.08701331168413162, + "learning_rate": 9.318151131914128e-05, + "loss": 0.2492, + "step": 28672 + }, + { + "epoch": 2.3228289047310433, + "grad_norm": 0.08785002678632736, + "learning_rate": 9.317701066654664e-05, + "loss": 0.3024, + "step": 28673 + }, + { + "epoch": 2.322909915748542, + "grad_norm": 0.06806372106075287, + "learning_rate": 9.317251001395203e-05, + "loss": 0.2763, + "step": 28674 + }, + { + "epoch": 2.3229909267660402, + "grad_norm": 0.08264884352684021, + "learning_rate": 9.31680093613574e-05, + "loss": 0.2541, + "step": 28675 + }, + { + "epoch": 2.3230719377835385, + "grad_norm": 0.05051686242222786, + "learning_rate": 9.316350870876277e-05, + "loss": 0.2237, + "step": 28676 + }, + { + "epoch": 2.323152948801037, + "grad_norm": 0.05707293376326561, + "learning_rate": 9.315900805616815e-05, + "loss": 0.2311, + "step": 28677 + }, + { + "epoch": 2.3232339598185354, + "grad_norm": 0.07153897732496262, + "learning_rate": 9.315450740357353e-05, + "loss": 0.2794, + "step": 28678 + }, + { + "epoch": 2.3233149708360337, + "grad_norm": 0.08252691477537155, + "learning_rate": 
9.315000675097889e-05, + "loss": 0.293, + "step": 28679 + }, + { + "epoch": 2.323395981853532, + "grad_norm": 0.078824482858181, + "learning_rate": 9.314550609838427e-05, + "loss": 0.227, + "step": 28680 + }, + { + "epoch": 2.3234769928710306, + "grad_norm": 0.057428814470767975, + "learning_rate": 9.314100544578965e-05, + "loss": 0.2732, + "step": 28681 + }, + { + "epoch": 2.323558003888529, + "grad_norm": 0.061816342175006866, + "learning_rate": 9.313650479319502e-05, + "loss": 0.2177, + "step": 28682 + }, + { + "epoch": 2.323639014906027, + "grad_norm": 0.07356321811676025, + "learning_rate": 9.31320041406004e-05, + "loss": 0.3128, + "step": 28683 + }, + { + "epoch": 2.323720025923526, + "grad_norm": 0.07679431885480881, + "learning_rate": 9.312750348800577e-05, + "loss": 0.2415, + "step": 28684 + }, + { + "epoch": 2.323801036941024, + "grad_norm": 0.060885075479745865, + "learning_rate": 9.312300283541114e-05, + "loss": 0.2542, + "step": 28685 + }, + { + "epoch": 2.3238820479585223, + "grad_norm": 0.06792385131120682, + "learning_rate": 9.311850218281652e-05, + "loss": 0.2264, + "step": 28686 + }, + { + "epoch": 2.3239630589760205, + "grad_norm": 0.05504778027534485, + "learning_rate": 9.311400153022189e-05, + "loss": 0.2534, + "step": 28687 + }, + { + "epoch": 2.324044069993519, + "grad_norm": 0.07655138522386551, + "learning_rate": 9.310950087762726e-05, + "loss": 0.2711, + "step": 28688 + }, + { + "epoch": 2.3241250810110174, + "grad_norm": 0.06925234943628311, + "learning_rate": 9.310500022503264e-05, + "loss": 0.2786, + "step": 28689 + }, + { + "epoch": 2.3242060920285157, + "grad_norm": 0.0667187049984932, + "learning_rate": 9.310049957243801e-05, + "loss": 0.2541, + "step": 28690 + }, + { + "epoch": 2.3242871030460144, + "grad_norm": 0.06552990525960922, + "learning_rate": 9.309599891984338e-05, + "loss": 0.3223, + "step": 28691 + }, + { + "epoch": 2.3243681140635126, + "grad_norm": 0.060230664908885956, + "learning_rate": 9.309149826724876e-05, + "loss": 0.2444, + "step": 28692 + }, + { + "epoch": 2.324449125081011, + "grad_norm": 0.07626669108867645, + "learning_rate": 9.308699761465413e-05, + "loss": 0.2414, + "step": 28693 + }, + { + "epoch": 2.3245301360985096, + "grad_norm": 0.05662994831800461, + "learning_rate": 9.30824969620595e-05, + "loss": 0.215, + "step": 28694 + }, + { + "epoch": 2.324611147116008, + "grad_norm": 0.059049200266599655, + "learning_rate": 9.307799630946488e-05, + "loss": 0.2491, + "step": 28695 + }, + { + "epoch": 2.324692158133506, + "grad_norm": 0.05884696543216705, + "learning_rate": 9.307349565687025e-05, + "loss": 0.233, + "step": 28696 + }, + { + "epoch": 2.3247731691510047, + "grad_norm": 0.06978729367256165, + "learning_rate": 9.306899500427562e-05, + "loss": 0.2214, + "step": 28697 + }, + { + "epoch": 2.324854180168503, + "grad_norm": 0.06161429360508919, + "learning_rate": 9.3064494351681e-05, + "loss": 0.2235, + "step": 28698 + }, + { + "epoch": 2.3249351911860012, + "grad_norm": 0.06919985264539719, + "learning_rate": 9.305999369908637e-05, + "loss": 0.2727, + "step": 28699 + }, + { + "epoch": 2.3250162022035, + "grad_norm": 0.06399379670619965, + "learning_rate": 9.305549304649175e-05, + "loss": 0.248, + "step": 28700 + }, + { + "epoch": 2.325097213220998, + "grad_norm": 0.05922725051641464, + "learning_rate": 9.305099239389712e-05, + "loss": 0.2056, + "step": 28701 + }, + { + "epoch": 2.3251782242384964, + "grad_norm": 0.057275738567113876, + "learning_rate": 9.304649174130249e-05, + "loss": 0.2456, + "step": 28702 + }, + { + "epoch": 
2.3252592352559946, + "grad_norm": 0.06780439615249634, + "learning_rate": 9.304199108870787e-05, + "loss": 0.2467, + "step": 28703 + }, + { + "epoch": 2.3253402462734933, + "grad_norm": 0.061287131160497665, + "learning_rate": 9.303749043611324e-05, + "loss": 0.3068, + "step": 28704 + }, + { + "epoch": 2.3254212572909916, + "grad_norm": 0.06363238394260406, + "learning_rate": 9.303298978351861e-05, + "loss": 0.2691, + "step": 28705 + }, + { + "epoch": 2.32550226830849, + "grad_norm": 0.07219129055738449, + "learning_rate": 9.302848913092399e-05, + "loss": 0.2758, + "step": 28706 + }, + { + "epoch": 2.3255832793259885, + "grad_norm": 0.07437929511070251, + "learning_rate": 9.302398847832936e-05, + "loss": 0.2233, + "step": 28707 + }, + { + "epoch": 2.3256642903434868, + "grad_norm": 0.055756907910108566, + "learning_rate": 9.301948782573473e-05, + "loss": 0.2347, + "step": 28708 + }, + { + "epoch": 2.325745301360985, + "grad_norm": 0.07122205197811127, + "learning_rate": 9.301498717314011e-05, + "loss": 0.2544, + "step": 28709 + }, + { + "epoch": 2.3258263123784833, + "grad_norm": 0.065572090446949, + "learning_rate": 9.301048652054548e-05, + "loss": 0.22, + "step": 28710 + }, + { + "epoch": 2.325907323395982, + "grad_norm": 0.06668426096439362, + "learning_rate": 9.300598586795087e-05, + "loss": 0.2722, + "step": 28711 + }, + { + "epoch": 2.32598833441348, + "grad_norm": 0.08833864331245422, + "learning_rate": 9.300148521535623e-05, + "loss": 0.2419, + "step": 28712 + }, + { + "epoch": 2.3260693454309784, + "grad_norm": 0.06704901158809662, + "learning_rate": 9.29969845627616e-05, + "loss": 0.2561, + "step": 28713 + }, + { + "epoch": 2.326150356448477, + "grad_norm": 0.06984715163707733, + "learning_rate": 9.299248391016699e-05, + "loss": 0.2398, + "step": 28714 + }, + { + "epoch": 2.3262313674659754, + "grad_norm": 0.06736070662736893, + "learning_rate": 9.298798325757235e-05, + "loss": 0.266, + "step": 28715 + }, + { + "epoch": 2.3263123784834736, + "grad_norm": 0.06093626841902733, + "learning_rate": 9.298348260497772e-05, + "loss": 0.2351, + "step": 28716 + }, + { + "epoch": 2.3263933895009723, + "grad_norm": 0.06312654167413712, + "learning_rate": 9.297898195238311e-05, + "loss": 0.2475, + "step": 28717 + }, + { + "epoch": 2.3264744005184705, + "grad_norm": 0.061158306896686554, + "learning_rate": 9.297448129978847e-05, + "loss": 0.2674, + "step": 28718 + }, + { + "epoch": 2.326555411535969, + "grad_norm": 0.07095766812562943, + "learning_rate": 9.296998064719384e-05, + "loss": 0.2411, + "step": 28719 + }, + { + "epoch": 2.3266364225534675, + "grad_norm": 0.06944864988327026, + "learning_rate": 9.296547999459923e-05, + "loss": 0.2305, + "step": 28720 + }, + { + "epoch": 2.3267174335709657, + "grad_norm": 0.06484808772802353, + "learning_rate": 9.296097934200459e-05, + "loss": 0.246, + "step": 28721 + }, + { + "epoch": 2.326798444588464, + "grad_norm": 0.06880709528923035, + "learning_rate": 9.295647868940997e-05, + "loss": 0.2772, + "step": 28722 + }, + { + "epoch": 2.3268794556059627, + "grad_norm": 0.05179981887340546, + "learning_rate": 9.295197803681535e-05, + "loss": 0.2215, + "step": 28723 + }, + { + "epoch": 2.326960466623461, + "grad_norm": 0.06509382277727127, + "learning_rate": 9.294747738422071e-05, + "loss": 0.2636, + "step": 28724 + }, + { + "epoch": 2.327041477640959, + "grad_norm": 0.06014722213149071, + "learning_rate": 9.294297673162609e-05, + "loss": 0.251, + "step": 28725 + }, + { + "epoch": 2.3271224886584574, + "grad_norm": 0.07093410938978195, + "learning_rate": 
9.293847607903147e-05, + "loss": 0.2665, + "step": 28726 + }, + { + "epoch": 2.327203499675956, + "grad_norm": 0.054343074560165405, + "learning_rate": 9.293397542643683e-05, + "loss": 0.264, + "step": 28727 + }, + { + "epoch": 2.3272845106934543, + "grad_norm": 0.07552266865968704, + "learning_rate": 9.29294747738422e-05, + "loss": 0.2363, + "step": 28728 + }, + { + "epoch": 2.3273655217109526, + "grad_norm": 0.06929947435855865, + "learning_rate": 9.29249741212476e-05, + "loss": 0.2754, + "step": 28729 + }, + { + "epoch": 2.3274465327284513, + "grad_norm": 0.0776057094335556, + "learning_rate": 9.292047346865295e-05, + "loss": 0.2586, + "step": 28730 + }, + { + "epoch": 2.3275275437459495, + "grad_norm": 0.06017788499593735, + "learning_rate": 9.291597281605833e-05, + "loss": 0.243, + "step": 28731 + }, + { + "epoch": 2.3276085547634477, + "grad_norm": 0.06438402831554413, + "learning_rate": 9.291147216346371e-05, + "loss": 0.266, + "step": 28732 + }, + { + "epoch": 2.327689565780946, + "grad_norm": 0.06785589456558228, + "learning_rate": 9.290697151086907e-05, + "loss": 0.2935, + "step": 28733 + }, + { + "epoch": 2.3277705767984447, + "grad_norm": 0.06721623986959457, + "learning_rate": 9.290247085827446e-05, + "loss": 0.2814, + "step": 28734 + }, + { + "epoch": 2.327851587815943, + "grad_norm": 0.053944192826747894, + "learning_rate": 9.289797020567984e-05, + "loss": 0.2258, + "step": 28735 + }, + { + "epoch": 2.327932598833441, + "grad_norm": 0.059720806777477264, + "learning_rate": 9.28934695530852e-05, + "loss": 0.2323, + "step": 28736 + }, + { + "epoch": 2.32801360985094, + "grad_norm": 0.05844448506832123, + "learning_rate": 9.288896890049058e-05, + "loss": 0.2292, + "step": 28737 + }, + { + "epoch": 2.328094620868438, + "grad_norm": 0.06499190628528595, + "learning_rate": 9.288446824789596e-05, + "loss": 0.2655, + "step": 28738 + }, + { + "epoch": 2.3281756318859363, + "grad_norm": 0.0609806589782238, + "learning_rate": 9.287996759530132e-05, + "loss": 0.2413, + "step": 28739 + }, + { + "epoch": 2.328256642903435, + "grad_norm": 0.06576729565858841, + "learning_rate": 9.28754669427067e-05, + "loss": 0.2281, + "step": 28740 + }, + { + "epoch": 2.3283376539209333, + "grad_norm": 0.06825067102909088, + "learning_rate": 9.287096629011208e-05, + "loss": 0.2359, + "step": 28741 + }, + { + "epoch": 2.3284186649384315, + "grad_norm": 0.05548768490552902, + "learning_rate": 9.286646563751744e-05, + "loss": 0.2472, + "step": 28742 + }, + { + "epoch": 2.32849967595593, + "grad_norm": 0.057399872690439224, + "learning_rate": 9.286196498492282e-05, + "loss": 0.2318, + "step": 28743 + }, + { + "epoch": 2.3285806869734285, + "grad_norm": 0.06823092699050903, + "learning_rate": 9.28574643323282e-05, + "loss": 0.2614, + "step": 28744 + }, + { + "epoch": 2.3286616979909267, + "grad_norm": 0.06344280391931534, + "learning_rate": 9.285296367973356e-05, + "loss": 0.2802, + "step": 28745 + }, + { + "epoch": 2.3287427090084254, + "grad_norm": 0.06706608086824417, + "learning_rate": 9.284846302713895e-05, + "loss": 0.2509, + "step": 28746 + }, + { + "epoch": 2.3288237200259236, + "grad_norm": 0.07321401685476303, + "learning_rate": 9.284396237454432e-05, + "loss": 0.2632, + "step": 28747 + }, + { + "epoch": 2.328904731043422, + "grad_norm": 0.07312627881765366, + "learning_rate": 9.283946172194968e-05, + "loss": 0.284, + "step": 28748 + }, + { + "epoch": 2.32898574206092, + "grad_norm": 0.07281111925840378, + "learning_rate": 9.283496106935507e-05, + "loss": 0.2649, + "step": 28749 + }, + { + "epoch": 
2.329066753078419, + "grad_norm": 0.07355530560016632, + "learning_rate": 9.283046041676044e-05, + "loss": 0.2446, + "step": 28750 + }, + { + "epoch": 2.329147764095917, + "grad_norm": 0.07079508900642395, + "learning_rate": 9.28259597641658e-05, + "loss": 0.2546, + "step": 28751 + }, + { + "epoch": 2.3292287751134153, + "grad_norm": 0.07261772453784943, + "learning_rate": 9.282145911157119e-05, + "loss": 0.2631, + "step": 28752 + }, + { + "epoch": 2.329309786130914, + "grad_norm": 0.06484165787696838, + "learning_rate": 9.281695845897656e-05, + "loss": 0.2186, + "step": 28753 + }, + { + "epoch": 2.3293907971484122, + "grad_norm": 0.05773812532424927, + "learning_rate": 9.281245780638192e-05, + "loss": 0.206, + "step": 28754 + }, + { + "epoch": 2.3294718081659105, + "grad_norm": 0.062474749982357025, + "learning_rate": 9.280795715378731e-05, + "loss": 0.2643, + "step": 28755 + }, + { + "epoch": 2.3295528191834087, + "grad_norm": 0.06529214978218079, + "learning_rate": 9.280345650119268e-05, + "loss": 0.2855, + "step": 28756 + }, + { + "epoch": 2.3296338302009074, + "grad_norm": 0.06362579017877579, + "learning_rate": 9.279895584859804e-05, + "loss": 0.2055, + "step": 28757 + }, + { + "epoch": 2.3297148412184057, + "grad_norm": 0.07225839793682098, + "learning_rate": 9.279445519600343e-05, + "loss": 0.2555, + "step": 28758 + }, + { + "epoch": 2.329795852235904, + "grad_norm": 0.059502191841602325, + "learning_rate": 9.27899545434088e-05, + "loss": 0.2313, + "step": 28759 + }, + { + "epoch": 2.3298768632534026, + "grad_norm": 0.07234331220388412, + "learning_rate": 9.278545389081418e-05, + "loss": 0.2666, + "step": 28760 + }, + { + "epoch": 2.329957874270901, + "grad_norm": 0.06535720825195312, + "learning_rate": 9.278095323821955e-05, + "loss": 0.2418, + "step": 28761 + }, + { + "epoch": 2.330038885288399, + "grad_norm": 0.07945813983678818, + "learning_rate": 9.277645258562492e-05, + "loss": 0.2588, + "step": 28762 + }, + { + "epoch": 2.3301198963058978, + "grad_norm": 0.06043941527605057, + "learning_rate": 9.27719519330303e-05, + "loss": 0.2362, + "step": 28763 + }, + { + "epoch": 2.330200907323396, + "grad_norm": 0.06322724372148514, + "learning_rate": 9.276745128043567e-05, + "loss": 0.2405, + "step": 28764 + }, + { + "epoch": 2.3302819183408943, + "grad_norm": 0.0783979520201683, + "learning_rate": 9.276295062784104e-05, + "loss": 0.295, + "step": 28765 + }, + { + "epoch": 2.330362929358393, + "grad_norm": 0.06260982155799866, + "learning_rate": 9.275844997524642e-05, + "loss": 0.2632, + "step": 28766 + }, + { + "epoch": 2.330443940375891, + "grad_norm": 0.0643438994884491, + "learning_rate": 9.275394932265179e-05, + "loss": 0.2328, + "step": 28767 + }, + { + "epoch": 2.3305249513933894, + "grad_norm": 0.07150939851999283, + "learning_rate": 9.274944867005716e-05, + "loss": 0.2515, + "step": 28768 + }, + { + "epoch": 2.3306059624108877, + "grad_norm": 0.07017625123262405, + "learning_rate": 9.274494801746254e-05, + "loss": 0.2381, + "step": 28769 + }, + { + "epoch": 2.3306869734283864, + "grad_norm": 0.07019396871328354, + "learning_rate": 9.274044736486791e-05, + "loss": 0.2661, + "step": 28770 + }, + { + "epoch": 2.3307679844458846, + "grad_norm": 0.06480173021554947, + "learning_rate": 9.273594671227329e-05, + "loss": 0.261, + "step": 28771 + }, + { + "epoch": 2.330848995463383, + "grad_norm": 0.07203859835863113, + "learning_rate": 9.273144605967866e-05, + "loss": 0.2598, + "step": 28772 + }, + { + "epoch": 2.3309300064808816, + "grad_norm": 0.055770982056856155, + 
"learning_rate": 9.272694540708403e-05, + "loss": 0.2475, + "step": 28773 + }, + { + "epoch": 2.33101101749838, + "grad_norm": 0.07176729291677475, + "learning_rate": 9.27224447544894e-05, + "loss": 0.3045, + "step": 28774 + }, + { + "epoch": 2.331092028515878, + "grad_norm": 0.07138893753290176, + "learning_rate": 9.271794410189478e-05, + "loss": 0.2374, + "step": 28775 + }, + { + "epoch": 2.3311730395333763, + "grad_norm": 0.07597170025110245, + "learning_rate": 9.271344344930015e-05, + "loss": 0.244, + "step": 28776 + }, + { + "epoch": 2.331254050550875, + "grad_norm": 0.06030385568737984, + "learning_rate": 9.270894279670553e-05, + "loss": 0.309, + "step": 28777 + }, + { + "epoch": 2.3313350615683732, + "grad_norm": 0.054813820868730545, + "learning_rate": 9.27044421441109e-05, + "loss": 0.2555, + "step": 28778 + }, + { + "epoch": 2.3314160725858715, + "grad_norm": 0.06930273026227951, + "learning_rate": 9.269994149151627e-05, + "loss": 0.2483, + "step": 28779 + }, + { + "epoch": 2.33149708360337, + "grad_norm": 0.07116469740867615, + "learning_rate": 9.269544083892165e-05, + "loss": 0.2432, + "step": 28780 + }, + { + "epoch": 2.3315780946208684, + "grad_norm": 0.06757555902004242, + "learning_rate": 9.269094018632702e-05, + "loss": 0.3001, + "step": 28781 + }, + { + "epoch": 2.3316591056383666, + "grad_norm": 0.06617242097854614, + "learning_rate": 9.26864395337324e-05, + "loss": 0.242, + "step": 28782 + }, + { + "epoch": 2.3317401166558653, + "grad_norm": 0.06303367763757706, + "learning_rate": 9.268193888113777e-05, + "loss": 0.2311, + "step": 28783 + }, + { + "epoch": 2.3318211276733636, + "grad_norm": 0.06040335074067116, + "learning_rate": 9.267743822854314e-05, + "loss": 0.2501, + "step": 28784 + }, + { + "epoch": 2.331902138690862, + "grad_norm": 0.07035958021879196, + "learning_rate": 9.267293757594852e-05, + "loss": 0.272, + "step": 28785 + }, + { + "epoch": 2.3319831497083605, + "grad_norm": 0.07982959598302841, + "learning_rate": 9.26684369233539e-05, + "loss": 0.2289, + "step": 28786 + }, + { + "epoch": 2.3320641607258588, + "grad_norm": 0.060289278626441956, + "learning_rate": 9.266393627075926e-05, + "loss": 0.2461, + "step": 28787 + }, + { + "epoch": 2.332145171743357, + "grad_norm": 0.06692136079072952, + "learning_rate": 9.265943561816464e-05, + "loss": 0.2686, + "step": 28788 + }, + { + "epoch": 2.3322261827608557, + "grad_norm": 0.07542910426855087, + "learning_rate": 9.265493496557002e-05, + "loss": 0.2549, + "step": 28789 + }, + { + "epoch": 2.332307193778354, + "grad_norm": 0.06903022527694702, + "learning_rate": 9.265043431297538e-05, + "loss": 0.2434, + "step": 28790 + }, + { + "epoch": 2.332388204795852, + "grad_norm": 0.07558293640613556, + "learning_rate": 9.264593366038076e-05, + "loss": 0.2725, + "step": 28791 + }, + { + "epoch": 2.3324692158133504, + "grad_norm": 0.06524661928415298, + "learning_rate": 9.264143300778614e-05, + "loss": 0.3046, + "step": 28792 + }, + { + "epoch": 2.332550226830849, + "grad_norm": 0.06829063594341278, + "learning_rate": 9.26369323551915e-05, + "loss": 0.2546, + "step": 28793 + }, + { + "epoch": 2.3326312378483474, + "grad_norm": 0.060153085738420486, + "learning_rate": 9.263243170259688e-05, + "loss": 0.2406, + "step": 28794 + }, + { + "epoch": 2.3327122488658456, + "grad_norm": 0.06505537033081055, + "learning_rate": 9.262793105000227e-05, + "loss": 0.2498, + "step": 28795 + }, + { + "epoch": 2.3327932598833443, + "grad_norm": 0.06308627128601074, + "learning_rate": 9.262343039740763e-05, + "loss": 0.2413, + "step": 28796 + 
}, + { + "epoch": 2.3328742709008425, + "grad_norm": 0.07692531496286392, + "learning_rate": 9.2618929744813e-05, + "loss": 0.223, + "step": 28797 + }, + { + "epoch": 2.332955281918341, + "grad_norm": 0.07463797926902771, + "learning_rate": 9.261442909221839e-05, + "loss": 0.2878, + "step": 28798 + }, + { + "epoch": 2.333036292935839, + "grad_norm": 0.06551359593868256, + "learning_rate": 9.260992843962375e-05, + "loss": 0.2374, + "step": 28799 + }, + { + "epoch": 2.3331173039533377, + "grad_norm": 0.0611150823533535, + "learning_rate": 9.260542778702912e-05, + "loss": 0.2136, + "step": 28800 + }, + { + "epoch": 2.333198314970836, + "grad_norm": 0.04823988303542137, + "learning_rate": 9.260092713443451e-05, + "loss": 0.2037, + "step": 28801 + }, + { + "epoch": 2.333279325988334, + "grad_norm": 0.05834527313709259, + "learning_rate": 9.259642648183987e-05, + "loss": 0.2723, + "step": 28802 + }, + { + "epoch": 2.333360337005833, + "grad_norm": 0.0750277191400528, + "learning_rate": 9.259192582924524e-05, + "loss": 0.2637, + "step": 28803 + }, + { + "epoch": 2.333441348023331, + "grad_norm": 0.06466560810804367, + "learning_rate": 9.258742517665063e-05, + "loss": 0.2512, + "step": 28804 + }, + { + "epoch": 2.3335223590408294, + "grad_norm": 0.06680560111999512, + "learning_rate": 9.258292452405599e-05, + "loss": 0.2453, + "step": 28805 + }, + { + "epoch": 2.333603370058328, + "grad_norm": 0.07900147140026093, + "learning_rate": 9.257842387146136e-05, + "loss": 0.2408, + "step": 28806 + }, + { + "epoch": 2.3336843810758263, + "grad_norm": 0.08736459165811539, + "learning_rate": 9.257392321886675e-05, + "loss": 0.2717, + "step": 28807 + }, + { + "epoch": 2.3337653920933246, + "grad_norm": 0.06984545290470123, + "learning_rate": 9.256942256627211e-05, + "loss": 0.2761, + "step": 28808 + }, + { + "epoch": 2.3338464031108233, + "grad_norm": 0.05604099482297897, + "learning_rate": 9.256492191367748e-05, + "loss": 0.224, + "step": 28809 + }, + { + "epoch": 2.3339274141283215, + "grad_norm": 0.07051841169595718, + "learning_rate": 9.256042126108287e-05, + "loss": 0.2637, + "step": 28810 + }, + { + "epoch": 2.3340084251458197, + "grad_norm": 0.07440926134586334, + "learning_rate": 9.255592060848823e-05, + "loss": 0.2557, + "step": 28811 + }, + { + "epoch": 2.3340894361633184, + "grad_norm": 0.07619215548038483, + "learning_rate": 9.25514199558936e-05, + "loss": 0.2939, + "step": 28812 + }, + { + "epoch": 2.3341704471808167, + "grad_norm": 0.06554830074310303, + "learning_rate": 9.254691930329899e-05, + "loss": 0.2635, + "step": 28813 + }, + { + "epoch": 2.334251458198315, + "grad_norm": 0.05629369616508484, + "learning_rate": 9.254241865070435e-05, + "loss": 0.2119, + "step": 28814 + }, + { + "epoch": 2.334332469215813, + "grad_norm": 0.05875800922513008, + "learning_rate": 9.253791799810974e-05, + "loss": 0.2599, + "step": 28815 + }, + { + "epoch": 2.334413480233312, + "grad_norm": 0.06345994770526886, + "learning_rate": 9.253341734551511e-05, + "loss": 0.2424, + "step": 28816 + }, + { + "epoch": 2.33449449125081, + "grad_norm": 0.079732745885849, + "learning_rate": 9.252891669292047e-05, + "loss": 0.2379, + "step": 28817 + }, + { + "epoch": 2.3345755022683083, + "grad_norm": 0.07231254875659943, + "learning_rate": 9.252441604032586e-05, + "loss": 0.2322, + "step": 28818 + }, + { + "epoch": 2.334656513285807, + "grad_norm": 0.061505191028118134, + "learning_rate": 9.251991538773123e-05, + "loss": 0.2371, + "step": 28819 + }, + { + "epoch": 2.3347375243033053, + "grad_norm": 0.07146601378917694, + 
"learning_rate": 9.251541473513659e-05, + "loss": 0.2746, + "step": 28820 + }, + { + "epoch": 2.3348185353208035, + "grad_norm": 0.05789024010300636, + "learning_rate": 9.251091408254198e-05, + "loss": 0.2869, + "step": 28821 + }, + { + "epoch": 2.3348995463383018, + "grad_norm": 0.07208845764398575, + "learning_rate": 9.250641342994735e-05, + "loss": 0.228, + "step": 28822 + }, + { + "epoch": 2.3349805573558005, + "grad_norm": 0.05617007613182068, + "learning_rate": 9.250191277735271e-05, + "loss": 0.2676, + "step": 28823 + }, + { + "epoch": 2.3350615683732987, + "grad_norm": 0.07315897941589355, + "learning_rate": 9.24974121247581e-05, + "loss": 0.2697, + "step": 28824 + }, + { + "epoch": 2.335142579390797, + "grad_norm": 0.07798895984888077, + "learning_rate": 9.249291147216347e-05, + "loss": 0.2535, + "step": 28825 + }, + { + "epoch": 2.3352235904082956, + "grad_norm": 0.07685133814811707, + "learning_rate": 9.248841081956883e-05, + "loss": 0.3038, + "step": 28826 + }, + { + "epoch": 2.335304601425794, + "grad_norm": 0.06907043606042862, + "learning_rate": 9.248391016697422e-05, + "loss": 0.2836, + "step": 28827 + }, + { + "epoch": 2.335385612443292, + "grad_norm": 0.06489613652229309, + "learning_rate": 9.24794095143796e-05, + "loss": 0.2511, + "step": 28828 + }, + { + "epoch": 2.335466623460791, + "grad_norm": 0.07132221758365631, + "learning_rate": 9.247490886178495e-05, + "loss": 0.2657, + "step": 28829 + }, + { + "epoch": 2.335547634478289, + "grad_norm": 0.05750245600938797, + "learning_rate": 9.247040820919034e-05, + "loss": 0.2246, + "step": 28830 + }, + { + "epoch": 2.3356286454957873, + "grad_norm": 0.0631583034992218, + "learning_rate": 9.246590755659571e-05, + "loss": 0.2372, + "step": 28831 + }, + { + "epoch": 2.335709656513286, + "grad_norm": 0.06106153503060341, + "learning_rate": 9.246140690400107e-05, + "loss": 0.2485, + "step": 28832 + }, + { + "epoch": 2.3357906675307842, + "grad_norm": 0.0786064937710762, + "learning_rate": 9.245690625140646e-05, + "loss": 0.2482, + "step": 28833 + }, + { + "epoch": 2.3358716785482825, + "grad_norm": 0.0773613229393959, + "learning_rate": 9.245240559881184e-05, + "loss": 0.281, + "step": 28834 + }, + { + "epoch": 2.335952689565781, + "grad_norm": 0.08510950952768326, + "learning_rate": 9.24479049462172e-05, + "loss": 0.2649, + "step": 28835 + }, + { + "epoch": 2.3360337005832794, + "grad_norm": 0.0630519837141037, + "learning_rate": 9.244340429362258e-05, + "loss": 0.2635, + "step": 28836 + }, + { + "epoch": 2.3361147116007777, + "grad_norm": 0.06939224153757095, + "learning_rate": 9.243890364102796e-05, + "loss": 0.282, + "step": 28837 + }, + { + "epoch": 2.336195722618276, + "grad_norm": 0.06594755500555038, + "learning_rate": 9.243440298843332e-05, + "loss": 0.2541, + "step": 28838 + }, + { + "epoch": 2.3362767336357746, + "grad_norm": 0.0586162731051445, + "learning_rate": 9.24299023358387e-05, + "loss": 0.2118, + "step": 28839 + }, + { + "epoch": 2.336357744653273, + "grad_norm": 0.06778659671545029, + "learning_rate": 9.242540168324408e-05, + "loss": 0.2217, + "step": 28840 + }, + { + "epoch": 2.336438755670771, + "grad_norm": 0.07562971115112305, + "learning_rate": 9.242090103064945e-05, + "loss": 0.2907, + "step": 28841 + }, + { + "epoch": 2.3365197666882698, + "grad_norm": 0.06536746025085449, + "learning_rate": 9.241640037805482e-05, + "loss": 0.2519, + "step": 28842 + }, + { + "epoch": 2.336600777705768, + "grad_norm": 0.07464933395385742, + "learning_rate": 9.24118997254602e-05, + "loss": 0.2423, + "step": 28843 + }, + { 
+ "epoch": 2.3366817887232663, + "grad_norm": 0.07020731270313263, + "learning_rate": 9.240739907286557e-05, + "loss": 0.2366, + "step": 28844 + }, + { + "epoch": 2.3367627997407645, + "grad_norm": 0.06497664004564285, + "learning_rate": 9.240289842027095e-05, + "loss": 0.2522, + "step": 28845 + }, + { + "epoch": 2.336843810758263, + "grad_norm": 0.05622991919517517, + "learning_rate": 9.239839776767632e-05, + "loss": 0.2252, + "step": 28846 + }, + { + "epoch": 2.3369248217757614, + "grad_norm": 0.06055394187569618, + "learning_rate": 9.239389711508169e-05, + "loss": 0.2235, + "step": 28847 + }, + { + "epoch": 2.3370058327932597, + "grad_norm": 0.06988414376974106, + "learning_rate": 9.238939646248707e-05, + "loss": 0.2199, + "step": 28848 + }, + { + "epoch": 2.3370868438107584, + "grad_norm": 0.05594763904809952, + "learning_rate": 9.238489580989244e-05, + "loss": 0.262, + "step": 28849 + }, + { + "epoch": 2.3371678548282566, + "grad_norm": 0.07687854021787643, + "learning_rate": 9.238039515729781e-05, + "loss": 0.2651, + "step": 28850 + }, + { + "epoch": 2.337248865845755, + "grad_norm": 0.0715331956744194, + "learning_rate": 9.237589450470319e-05, + "loss": 0.2874, + "step": 28851 + }, + { + "epoch": 2.3373298768632536, + "grad_norm": 0.05961179360747337, + "learning_rate": 9.237139385210856e-05, + "loss": 0.2296, + "step": 28852 + }, + { + "epoch": 2.337410887880752, + "grad_norm": 0.05749481916427612, + "learning_rate": 9.236689319951393e-05, + "loss": 0.2327, + "step": 28853 + }, + { + "epoch": 2.33749189889825, + "grad_norm": 0.0786358043551445, + "learning_rate": 9.236239254691931e-05, + "loss": 0.2288, + "step": 28854 + }, + { + "epoch": 2.3375729099157487, + "grad_norm": 0.0550704151391983, + "learning_rate": 9.235789189432468e-05, + "loss": 0.2517, + "step": 28855 + }, + { + "epoch": 2.337653920933247, + "grad_norm": 0.05813801661133766, + "learning_rate": 9.235339124173005e-05, + "loss": 0.2494, + "step": 28856 + }, + { + "epoch": 2.337734931950745, + "grad_norm": 0.05280022695660591, + "learning_rate": 9.234889058913543e-05, + "loss": 0.2242, + "step": 28857 + }, + { + "epoch": 2.337815942968244, + "grad_norm": 0.07368249446153641, + "learning_rate": 9.23443899365408e-05, + "loss": 0.2554, + "step": 28858 + }, + { + "epoch": 2.337896953985742, + "grad_norm": 0.07378704845905304, + "learning_rate": 9.233988928394618e-05, + "loss": 0.2355, + "step": 28859 + }, + { + "epoch": 2.3379779650032404, + "grad_norm": 0.06204526498913765, + "learning_rate": 9.233538863135155e-05, + "loss": 0.2243, + "step": 28860 + }, + { + "epoch": 2.3380589760207386, + "grad_norm": 0.06535429507493973, + "learning_rate": 9.233088797875692e-05, + "loss": 0.2564, + "step": 28861 + }, + { + "epoch": 2.3381399870382373, + "grad_norm": 0.0819898247718811, + "learning_rate": 9.23263873261623e-05, + "loss": 0.2373, + "step": 28862 + }, + { + "epoch": 2.3382209980557356, + "grad_norm": 0.0712568461894989, + "learning_rate": 9.232188667356767e-05, + "loss": 0.285, + "step": 28863 + }, + { + "epoch": 2.338302009073234, + "grad_norm": 0.060597095638513565, + "learning_rate": 9.231738602097304e-05, + "loss": 0.2217, + "step": 28864 + }, + { + "epoch": 2.3383830200907325, + "grad_norm": 0.08113036304712296, + "learning_rate": 9.231288536837842e-05, + "loss": 0.2632, + "step": 28865 + }, + { + "epoch": 2.3384640311082308, + "grad_norm": 0.06157940253615379, + "learning_rate": 9.230838471578379e-05, + "loss": 0.2237, + "step": 28866 + }, + { + "epoch": 2.338545042125729, + "grad_norm": 0.06427460163831711, + 
"learning_rate": 9.230388406318918e-05, + "loss": 0.2304, + "step": 28867 + }, + { + "epoch": 2.3386260531432272, + "grad_norm": 0.07019753754138947, + "learning_rate": 9.229938341059454e-05, + "loss": 0.2394, + "step": 28868 + }, + { + "epoch": 2.338707064160726, + "grad_norm": 0.062393609434366226, + "learning_rate": 9.229488275799991e-05, + "loss": 0.2255, + "step": 28869 + }, + { + "epoch": 2.338788075178224, + "grad_norm": 0.07597274333238602, + "learning_rate": 9.22903821054053e-05, + "loss": 0.262, + "step": 28870 + }, + { + "epoch": 2.3388690861957224, + "grad_norm": 0.06904326379299164, + "learning_rate": 9.228588145281066e-05, + "loss": 0.2396, + "step": 28871 + }, + { + "epoch": 2.338950097213221, + "grad_norm": 0.06872831284999847, + "learning_rate": 9.228138080021603e-05, + "loss": 0.2541, + "step": 28872 + }, + { + "epoch": 2.3390311082307194, + "grad_norm": 0.08009593188762665, + "learning_rate": 9.227688014762142e-05, + "loss": 0.2615, + "step": 28873 + }, + { + "epoch": 2.3391121192482176, + "grad_norm": 0.07586513459682465, + "learning_rate": 9.227237949502678e-05, + "loss": 0.2718, + "step": 28874 + }, + { + "epoch": 2.3391931302657163, + "grad_norm": 0.058345962315797806, + "learning_rate": 9.226787884243215e-05, + "loss": 0.2233, + "step": 28875 + }, + { + "epoch": 2.3392741412832145, + "grad_norm": 0.06970062851905823, + "learning_rate": 9.226337818983754e-05, + "loss": 0.2233, + "step": 28876 + }, + { + "epoch": 2.339355152300713, + "grad_norm": 0.08210455626249313, + "learning_rate": 9.22588775372429e-05, + "loss": 0.266, + "step": 28877 + }, + { + "epoch": 2.3394361633182115, + "grad_norm": 0.08800669759511948, + "learning_rate": 9.225437688464827e-05, + "loss": 0.2611, + "step": 28878 + }, + { + "epoch": 2.3395171743357097, + "grad_norm": 0.06274203956127167, + "learning_rate": 9.224987623205366e-05, + "loss": 0.2301, + "step": 28879 + }, + { + "epoch": 2.339598185353208, + "grad_norm": 0.05375487357378006, + "learning_rate": 9.224537557945902e-05, + "loss": 0.2326, + "step": 28880 + }, + { + "epoch": 2.3396791963707066, + "grad_norm": 0.06286244839429855, + "learning_rate": 9.22408749268644e-05, + "loss": 0.2318, + "step": 28881 + }, + { + "epoch": 2.339760207388205, + "grad_norm": 0.07534752041101456, + "learning_rate": 9.223637427426978e-05, + "loss": 0.2476, + "step": 28882 + }, + { + "epoch": 2.339841218405703, + "grad_norm": 0.0626584067940712, + "learning_rate": 9.223187362167514e-05, + "loss": 0.2728, + "step": 28883 + }, + { + "epoch": 2.3399222294232014, + "grad_norm": 0.05673712119460106, + "learning_rate": 9.222737296908052e-05, + "loss": 0.2632, + "step": 28884 + }, + { + "epoch": 2.3400032404407, + "grad_norm": 0.055177003145217896, + "learning_rate": 9.22228723164859e-05, + "loss": 0.2225, + "step": 28885 + }, + { + "epoch": 2.3400842514581983, + "grad_norm": 0.06031114235520363, + "learning_rate": 9.221837166389126e-05, + "loss": 0.2577, + "step": 28886 + }, + { + "epoch": 2.3401652624756966, + "grad_norm": 0.062008291482925415, + "learning_rate": 9.221387101129664e-05, + "loss": 0.2615, + "step": 28887 + }, + { + "epoch": 2.3402462734931953, + "grad_norm": 0.05988309159874916, + "learning_rate": 9.220937035870202e-05, + "loss": 0.2788, + "step": 28888 + }, + { + "epoch": 2.3403272845106935, + "grad_norm": 0.08174771070480347, + "learning_rate": 9.220486970610738e-05, + "loss": 0.2649, + "step": 28889 + }, + { + "epoch": 2.3404082955281917, + "grad_norm": 0.07654719054698944, + "learning_rate": 9.220036905351276e-05, + "loss": 0.2373, + "step": 
28890 + }, + { + "epoch": 2.34048930654569, + "grad_norm": 0.05224039405584335, + "learning_rate": 9.219586840091814e-05, + "loss": 0.1982, + "step": 28891 + }, + { + "epoch": 2.3405703175631887, + "grad_norm": 0.06260054558515549, + "learning_rate": 9.21913677483235e-05, + "loss": 0.2508, + "step": 28892 + }, + { + "epoch": 2.340651328580687, + "grad_norm": 0.05155821144580841, + "learning_rate": 9.218686709572889e-05, + "loss": 0.2476, + "step": 28893 + }, + { + "epoch": 2.340732339598185, + "grad_norm": 0.06139184907078743, + "learning_rate": 9.218236644313427e-05, + "loss": 0.2475, + "step": 28894 + }, + { + "epoch": 2.340813350615684, + "grad_norm": 0.0666014552116394, + "learning_rate": 9.217786579053963e-05, + "loss": 0.2513, + "step": 28895 + }, + { + "epoch": 2.340894361633182, + "grad_norm": 0.06419499218463898, + "learning_rate": 9.217336513794501e-05, + "loss": 0.2276, + "step": 28896 + }, + { + "epoch": 2.3409753726506803, + "grad_norm": 0.08260070532560349, + "learning_rate": 9.216886448535039e-05, + "loss": 0.2799, + "step": 28897 + }, + { + "epoch": 2.341056383668179, + "grad_norm": 0.061513788998126984, + "learning_rate": 9.216436383275575e-05, + "loss": 0.2532, + "step": 28898 + }, + { + "epoch": 2.3411373946856773, + "grad_norm": 0.06428606063127518, + "learning_rate": 9.215986318016113e-05, + "loss": 0.2677, + "step": 28899 + }, + { + "epoch": 2.3412184057031755, + "grad_norm": 0.07828311622142792, + "learning_rate": 9.215536252756651e-05, + "loss": 0.247, + "step": 28900 + }, + { + "epoch": 2.341299416720674, + "grad_norm": 0.08125907927751541, + "learning_rate": 9.215086187497187e-05, + "loss": 0.2534, + "step": 28901 + }, + { + "epoch": 2.3413804277381725, + "grad_norm": 0.0653858631849289, + "learning_rate": 9.214636122237725e-05, + "loss": 0.251, + "step": 28902 + }, + { + "epoch": 2.3414614387556707, + "grad_norm": 0.06650615483522415, + "learning_rate": 9.214186056978263e-05, + "loss": 0.2561, + "step": 28903 + }, + { + "epoch": 2.3415424497731694, + "grad_norm": 0.0694642886519432, + "learning_rate": 9.213735991718799e-05, + "loss": 0.2732, + "step": 28904 + }, + { + "epoch": 2.3416234607906676, + "grad_norm": 0.07338881492614746, + "learning_rate": 9.213285926459338e-05, + "loss": 0.263, + "step": 28905 + }, + { + "epoch": 2.341704471808166, + "grad_norm": 0.05784222483634949, + "learning_rate": 9.212835861199875e-05, + "loss": 0.2523, + "step": 28906 + }, + { + "epoch": 2.341785482825664, + "grad_norm": 0.06689013540744781, + "learning_rate": 9.212385795940411e-05, + "loss": 0.242, + "step": 28907 + }, + { + "epoch": 2.341866493843163, + "grad_norm": 0.0690750703215599, + "learning_rate": 9.21193573068095e-05, + "loss": 0.2491, + "step": 28908 + }, + { + "epoch": 2.341947504860661, + "grad_norm": 0.0644659698009491, + "learning_rate": 9.211485665421487e-05, + "loss": 0.2304, + "step": 28909 + }, + { + "epoch": 2.3420285158781593, + "grad_norm": 0.06285371631383896, + "learning_rate": 9.211035600162023e-05, + "loss": 0.2224, + "step": 28910 + }, + { + "epoch": 2.342109526895658, + "grad_norm": 0.06495443731546402, + "learning_rate": 9.210585534902562e-05, + "loss": 0.2385, + "step": 28911 + }, + { + "epoch": 2.3421905379131562, + "grad_norm": 0.07020930200815201, + "learning_rate": 9.210135469643099e-05, + "loss": 0.2388, + "step": 28912 + }, + { + "epoch": 2.3422715489306545, + "grad_norm": 0.07302417606115341, + "learning_rate": 9.209685404383635e-05, + "loss": 0.219, + "step": 28913 + }, + { + "epoch": 2.3423525599481527, + "grad_norm": 0.05842958390712738, 
+ "learning_rate": 9.209235339124174e-05, + "loss": 0.2071, + "step": 28914 + }, + { + "epoch": 2.3424335709656514, + "grad_norm": 0.05464782565832138, + "learning_rate": 9.208785273864711e-05, + "loss": 0.2315, + "step": 28915 + }, + { + "epoch": 2.3425145819831497, + "grad_norm": 0.0690939649939537, + "learning_rate": 9.208335208605247e-05, + "loss": 0.2757, + "step": 28916 + }, + { + "epoch": 2.342595593000648, + "grad_norm": 0.08018796145915985, + "learning_rate": 9.207885143345786e-05, + "loss": 0.2232, + "step": 28917 + }, + { + "epoch": 2.3426766040181466, + "grad_norm": 0.06097112223505974, + "learning_rate": 9.207435078086323e-05, + "loss": 0.2281, + "step": 28918 + }, + { + "epoch": 2.342757615035645, + "grad_norm": 0.053888920694589615, + "learning_rate": 9.20698501282686e-05, + "loss": 0.2471, + "step": 28919 + }, + { + "epoch": 2.342838626053143, + "grad_norm": 0.07474564015865326, + "learning_rate": 9.206534947567398e-05, + "loss": 0.2867, + "step": 28920 + }, + { + "epoch": 2.3429196370706418, + "grad_norm": 0.07385596632957458, + "learning_rate": 9.206084882307935e-05, + "loss": 0.2633, + "step": 28921 + }, + { + "epoch": 2.34300064808814, + "grad_norm": 0.0691351518034935, + "learning_rate": 9.205634817048473e-05, + "loss": 0.2792, + "step": 28922 + }, + { + "epoch": 2.3430816591056383, + "grad_norm": 0.07424209266901016, + "learning_rate": 9.20518475178901e-05, + "loss": 0.2547, + "step": 28923 + }, + { + "epoch": 2.343162670123137, + "grad_norm": 0.06224573031067848, + "learning_rate": 9.204734686529547e-05, + "loss": 0.2473, + "step": 28924 + }, + { + "epoch": 2.343243681140635, + "grad_norm": 0.0734906867146492, + "learning_rate": 9.204284621270085e-05, + "loss": 0.2414, + "step": 28925 + }, + { + "epoch": 2.3433246921581334, + "grad_norm": 0.06674046814441681, + "learning_rate": 9.203834556010622e-05, + "loss": 0.2388, + "step": 28926 + }, + { + "epoch": 2.343405703175632, + "grad_norm": 0.08335613459348679, + "learning_rate": 9.20338449075116e-05, + "loss": 0.2423, + "step": 28927 + }, + { + "epoch": 2.3434867141931304, + "grad_norm": 0.06520678848028183, + "learning_rate": 9.202934425491697e-05, + "loss": 0.238, + "step": 28928 + }, + { + "epoch": 2.3435677252106286, + "grad_norm": 0.0749930590391159, + "learning_rate": 9.202484360232234e-05, + "loss": 0.2372, + "step": 28929 + }, + { + "epoch": 2.343648736228127, + "grad_norm": 0.06236148253083229, + "learning_rate": 9.202034294972772e-05, + "loss": 0.2388, + "step": 28930 + }, + { + "epoch": 2.3437297472456255, + "grad_norm": 0.0710134208202362, + "learning_rate": 9.201584229713309e-05, + "loss": 0.2383, + "step": 28931 + }, + { + "epoch": 2.343810758263124, + "grad_norm": 0.07559984922409058, + "learning_rate": 9.201134164453846e-05, + "loss": 0.2338, + "step": 28932 + }, + { + "epoch": 2.343891769280622, + "grad_norm": 0.06743871420621872, + "learning_rate": 9.200684099194384e-05, + "loss": 0.2642, + "step": 28933 + }, + { + "epoch": 2.3439727802981207, + "grad_norm": 0.06335540860891342, + "learning_rate": 9.200234033934921e-05, + "loss": 0.2315, + "step": 28934 + }, + { + "epoch": 2.344053791315619, + "grad_norm": 0.07250569760799408, + "learning_rate": 9.199783968675458e-05, + "loss": 0.2444, + "step": 28935 + }, + { + "epoch": 2.344134802333117, + "grad_norm": 0.07095897942781448, + "learning_rate": 9.199333903415996e-05, + "loss": 0.2593, + "step": 28936 + }, + { + "epoch": 2.3442158133506155, + "grad_norm": 0.057127442210912704, + "learning_rate": 9.198883838156533e-05, + "loss": 0.2046, + "step": 28937 + 
}, + { + "epoch": 2.344296824368114, + "grad_norm": 0.07967636734247208, + "learning_rate": 9.19843377289707e-05, + "loss": 0.2502, + "step": 28938 + }, + { + "epoch": 2.3443778353856124, + "grad_norm": 0.05165790766477585, + "learning_rate": 9.197983707637608e-05, + "loss": 0.2311, + "step": 28939 + }, + { + "epoch": 2.3444588464031106, + "grad_norm": 0.05036972835659981, + "learning_rate": 9.197533642378145e-05, + "loss": 0.2129, + "step": 28940 + }, + { + "epoch": 2.3445398574206093, + "grad_norm": 0.07671160250902176, + "learning_rate": 9.197083577118682e-05, + "loss": 0.2968, + "step": 28941 + }, + { + "epoch": 2.3446208684381076, + "grad_norm": 0.0694350153207779, + "learning_rate": 9.19663351185922e-05, + "loss": 0.2546, + "step": 28942 + }, + { + "epoch": 2.344701879455606, + "grad_norm": 0.0753200575709343, + "learning_rate": 9.196183446599757e-05, + "loss": 0.2516, + "step": 28943 + }, + { + "epoch": 2.3447828904731045, + "grad_norm": 0.06781428307294846, + "learning_rate": 9.195733381340295e-05, + "loss": 0.2385, + "step": 28944 + }, + { + "epoch": 2.3448639014906028, + "grad_norm": 0.06559989601373672, + "learning_rate": 9.195283316080833e-05, + "loss": 0.2582, + "step": 28945 + }, + { + "epoch": 2.344944912508101, + "grad_norm": 0.08580916374921799, + "learning_rate": 9.194833250821369e-05, + "loss": 0.2617, + "step": 28946 + }, + { + "epoch": 2.3450259235255997, + "grad_norm": 0.065764881670475, + "learning_rate": 9.194383185561907e-05, + "loss": 0.2238, + "step": 28947 + }, + { + "epoch": 2.345106934543098, + "grad_norm": 0.06031308323144913, + "learning_rate": 9.193933120302445e-05, + "loss": 0.2493, + "step": 28948 + }, + { + "epoch": 2.345187945560596, + "grad_norm": 0.0599418543279171, + "learning_rate": 9.193483055042981e-05, + "loss": 0.2477, + "step": 28949 + }, + { + "epoch": 2.345268956578095, + "grad_norm": 0.07239186018705368, + "learning_rate": 9.193032989783519e-05, + "loss": 0.2567, + "step": 28950 + }, + { + "epoch": 2.345349967595593, + "grad_norm": 0.08161863684654236, + "learning_rate": 9.192582924524057e-05, + "loss": 0.2713, + "step": 28951 + }, + { + "epoch": 2.3454309786130914, + "grad_norm": 0.0612737275660038, + "learning_rate": 9.192132859264593e-05, + "loss": 0.216, + "step": 28952 + }, + { + "epoch": 2.3455119896305896, + "grad_norm": 0.08176063001155853, + "learning_rate": 9.191682794005131e-05, + "loss": 0.2514, + "step": 28953 + }, + { + "epoch": 2.3455930006480883, + "grad_norm": 0.04953533038496971, + "learning_rate": 9.19123272874567e-05, + "loss": 0.2425, + "step": 28954 + }, + { + "epoch": 2.3456740116655865, + "grad_norm": 0.06790510565042496, + "learning_rate": 9.190782663486206e-05, + "loss": 0.2317, + "step": 28955 + }, + { + "epoch": 2.345755022683085, + "grad_norm": 0.07242914289236069, + "learning_rate": 9.190332598226743e-05, + "loss": 0.227, + "step": 28956 + }, + { + "epoch": 2.3458360337005835, + "grad_norm": 0.07935182005167007, + "learning_rate": 9.189882532967282e-05, + "loss": 0.266, + "step": 28957 + }, + { + "epoch": 2.3459170447180817, + "grad_norm": 0.06886610388755798, + "learning_rate": 9.189432467707818e-05, + "loss": 0.2192, + "step": 28958 + }, + { + "epoch": 2.34599805573558, + "grad_norm": 0.0698259025812149, + "learning_rate": 9.188982402448355e-05, + "loss": 0.2757, + "step": 28959 + }, + { + "epoch": 2.346079066753078, + "grad_norm": 0.06897472590208054, + "learning_rate": 9.188532337188894e-05, + "loss": 0.2608, + "step": 28960 + }, + { + "epoch": 2.346160077770577, + "grad_norm": 0.0625743418931961, + 
"learning_rate": 9.18808227192943e-05, + "loss": 0.2446, + "step": 28961 + }, + { + "epoch": 2.346241088788075, + "grad_norm": 0.04949932545423508, + "learning_rate": 9.187632206669967e-05, + "loss": 0.2161, + "step": 28962 + }, + { + "epoch": 2.3463220998055734, + "grad_norm": 0.06938916444778442, + "learning_rate": 9.187182141410506e-05, + "loss": 0.2491, + "step": 28963 + }, + { + "epoch": 2.346403110823072, + "grad_norm": 0.07660891115665436, + "learning_rate": 9.186732076151042e-05, + "loss": 0.2647, + "step": 28964 + }, + { + "epoch": 2.3464841218405703, + "grad_norm": 0.06504109501838684, + "learning_rate": 9.186282010891579e-05, + "loss": 0.2786, + "step": 28965 + }, + { + "epoch": 2.3465651328580686, + "grad_norm": 0.07999644428491592, + "learning_rate": 9.185831945632118e-05, + "loss": 0.2319, + "step": 28966 + }, + { + "epoch": 2.3466461438755672, + "grad_norm": 0.07580562680959702, + "learning_rate": 9.185381880372654e-05, + "loss": 0.2959, + "step": 28967 + }, + { + "epoch": 2.3467271548930655, + "grad_norm": 0.05977579206228256, + "learning_rate": 9.184931815113191e-05, + "loss": 0.2362, + "step": 28968 + }, + { + "epoch": 2.3468081659105637, + "grad_norm": 0.053713224828243256, + "learning_rate": 9.18448174985373e-05, + "loss": 0.2205, + "step": 28969 + }, + { + "epoch": 2.3468891769280624, + "grad_norm": 0.07156253606081009, + "learning_rate": 9.184031684594266e-05, + "loss": 0.2572, + "step": 28970 + }, + { + "epoch": 2.3469701879455607, + "grad_norm": 0.06587357074022293, + "learning_rate": 9.183581619334803e-05, + "loss": 0.2546, + "step": 28971 + }, + { + "epoch": 2.347051198963059, + "grad_norm": 0.052538491785526276, + "learning_rate": 9.183131554075342e-05, + "loss": 0.21, + "step": 28972 + }, + { + "epoch": 2.3471322099805576, + "grad_norm": 0.05909012258052826, + "learning_rate": 9.182681488815878e-05, + "loss": 0.2277, + "step": 28973 + }, + { + "epoch": 2.347213220998056, + "grad_norm": 0.054832495748996735, + "learning_rate": 9.182231423556417e-05, + "loss": 0.2304, + "step": 28974 + }, + { + "epoch": 2.347294232015554, + "grad_norm": 0.07047142833471298, + "learning_rate": 9.181781358296954e-05, + "loss": 0.262, + "step": 28975 + }, + { + "epoch": 2.3473752430330523, + "grad_norm": 0.06125551462173462, + "learning_rate": 9.18133129303749e-05, + "loss": 0.2335, + "step": 28976 + }, + { + "epoch": 2.347456254050551, + "grad_norm": 0.08382407575845718, + "learning_rate": 9.180881227778029e-05, + "loss": 0.3174, + "step": 28977 + }, + { + "epoch": 2.3475372650680493, + "grad_norm": 0.06475745886564255, + "learning_rate": 9.180431162518566e-05, + "loss": 0.2553, + "step": 28978 + }, + { + "epoch": 2.3476182760855475, + "grad_norm": 0.05636309087276459, + "learning_rate": 9.179981097259102e-05, + "loss": 0.2548, + "step": 28979 + }, + { + "epoch": 2.347699287103046, + "grad_norm": 0.06750059127807617, + "learning_rate": 9.179531031999641e-05, + "loss": 0.2676, + "step": 28980 + }, + { + "epoch": 2.3477802981205445, + "grad_norm": 0.05322468280792236, + "learning_rate": 9.179080966740178e-05, + "loss": 0.2366, + "step": 28981 + }, + { + "epoch": 2.3478613091380427, + "grad_norm": 0.06459199637174606, + "learning_rate": 9.178630901480714e-05, + "loss": 0.2448, + "step": 28982 + }, + { + "epoch": 2.347942320155541, + "grad_norm": 0.08525732904672623, + "learning_rate": 9.178180836221253e-05, + "loss": 0.255, + "step": 28983 + }, + { + "epoch": 2.3480233311730396, + "grad_norm": 0.06443566828966141, + "learning_rate": 9.17773077096179e-05, + "loss": 0.2518, + "step": 
28984 + }, + { + "epoch": 2.348104342190538, + "grad_norm": 0.06144661083817482, + "learning_rate": 9.177280705702326e-05, + "loss": 0.2246, + "step": 28985 + }, + { + "epoch": 2.348185353208036, + "grad_norm": 0.055623866617679596, + "learning_rate": 9.176830640442865e-05, + "loss": 0.2165, + "step": 28986 + }, + { + "epoch": 2.348266364225535, + "grad_norm": 0.054140083491802216, + "learning_rate": 9.176380575183402e-05, + "loss": 0.2077, + "step": 28987 + }, + { + "epoch": 2.348347375243033, + "grad_norm": 0.05536891147494316, + "learning_rate": 9.175930509923938e-05, + "loss": 0.1951, + "step": 28988 + }, + { + "epoch": 2.3484283862605313, + "grad_norm": 0.08356544375419617, + "learning_rate": 9.175480444664477e-05, + "loss": 0.2173, + "step": 28989 + }, + { + "epoch": 2.34850939727803, + "grad_norm": 0.07575877755880356, + "learning_rate": 9.175030379405014e-05, + "loss": 0.2238, + "step": 28990 + }, + { + "epoch": 2.3485904082955282, + "grad_norm": 0.07282985001802444, + "learning_rate": 9.17458031414555e-05, + "loss": 0.2359, + "step": 28991 + }, + { + "epoch": 2.3486714193130265, + "grad_norm": 0.06027929112315178, + "learning_rate": 9.174130248886089e-05, + "loss": 0.2358, + "step": 28992 + }, + { + "epoch": 2.348752430330525, + "grad_norm": 0.061999425292015076, + "learning_rate": 9.173680183626627e-05, + "loss": 0.1877, + "step": 28993 + }, + { + "epoch": 2.3488334413480234, + "grad_norm": 0.0746329203248024, + "learning_rate": 9.173230118367163e-05, + "loss": 0.2525, + "step": 28994 + }, + { + "epoch": 2.3489144523655217, + "grad_norm": 0.05842380225658417, + "learning_rate": 9.172780053107701e-05, + "loss": 0.2533, + "step": 28995 + }, + { + "epoch": 2.34899546338302, + "grad_norm": 0.05924517661333084, + "learning_rate": 9.172329987848239e-05, + "loss": 0.2229, + "step": 28996 + }, + { + "epoch": 2.3490764744005186, + "grad_norm": 0.06103135272860527, + "learning_rate": 9.171879922588775e-05, + "loss": 0.2527, + "step": 28997 + }, + { + "epoch": 2.349157485418017, + "grad_norm": 0.0787557065486908, + "learning_rate": 9.171429857329313e-05, + "loss": 0.2657, + "step": 28998 + }, + { + "epoch": 2.349238496435515, + "grad_norm": 0.07225020974874496, + "learning_rate": 9.170979792069851e-05, + "loss": 0.2664, + "step": 28999 + }, + { + "epoch": 2.3493195074530138, + "grad_norm": 0.06952983886003494, + "learning_rate": 9.170529726810388e-05, + "loss": 0.2139, + "step": 29000 + }, + { + "epoch": 2.349400518470512, + "grad_norm": 0.06747347116470337, + "learning_rate": 9.170079661550925e-05, + "loss": 0.2172, + "step": 29001 + }, + { + "epoch": 2.3494815294880103, + "grad_norm": 0.06495499610900879, + "learning_rate": 9.169629596291463e-05, + "loss": 0.2392, + "step": 29002 + }, + { + "epoch": 2.3495625405055085, + "grad_norm": 0.07105159759521484, + "learning_rate": 9.169179531032e-05, + "loss": 0.2398, + "step": 29003 + }, + { + "epoch": 2.349643551523007, + "grad_norm": 0.06550126522779465, + "learning_rate": 9.168729465772538e-05, + "loss": 0.2994, + "step": 29004 + }, + { + "epoch": 2.3497245625405054, + "grad_norm": 0.05634589120745659, + "learning_rate": 9.168279400513075e-05, + "loss": 0.2074, + "step": 29005 + }, + { + "epoch": 2.3498055735580037, + "grad_norm": 0.06591413170099258, + "learning_rate": 9.167829335253612e-05, + "loss": 0.2813, + "step": 29006 + }, + { + "epoch": 2.3498865845755024, + "grad_norm": 0.06812646985054016, + "learning_rate": 9.16737926999415e-05, + "loss": 0.2792, + "step": 29007 + }, + { + "epoch": 2.3499675955930006, + "grad_norm": 
0.06828898936510086, + "learning_rate": 9.166929204734687e-05, + "loss": 0.2605, + "step": 29008 + }, + { + "epoch": 2.350048606610499, + "grad_norm": 0.06219886243343353, + "learning_rate": 9.166479139475224e-05, + "loss": 0.2731, + "step": 29009 + }, + { + "epoch": 2.3501296176279975, + "grad_norm": 0.0656910315155983, + "learning_rate": 9.166029074215762e-05, + "loss": 0.258, + "step": 29010 + }, + { + "epoch": 2.350210628645496, + "grad_norm": 0.06982869654893875, + "learning_rate": 9.165579008956299e-05, + "loss": 0.2512, + "step": 29011 + }, + { + "epoch": 2.350291639662994, + "grad_norm": 0.047029316425323486, + "learning_rate": 9.165128943696836e-05, + "loss": 0.2259, + "step": 29012 + }, + { + "epoch": 2.3503726506804927, + "grad_norm": 0.07702232897281647, + "learning_rate": 9.164678878437374e-05, + "loss": 0.2386, + "step": 29013 + }, + { + "epoch": 2.350453661697991, + "grad_norm": 0.052338164299726486, + "learning_rate": 9.164228813177911e-05, + "loss": 0.2337, + "step": 29014 + }, + { + "epoch": 2.350534672715489, + "grad_norm": 0.07687724381685257, + "learning_rate": 9.163778747918449e-05, + "loss": 0.2648, + "step": 29015 + }, + { + "epoch": 2.350615683732988, + "grad_norm": 0.06665743142366409, + "learning_rate": 9.163328682658986e-05, + "loss": 0.2785, + "step": 29016 + }, + { + "epoch": 2.350696694750486, + "grad_norm": 0.06634911149740219, + "learning_rate": 9.162878617399523e-05, + "loss": 0.2277, + "step": 29017 + }, + { + "epoch": 2.3507777057679844, + "grad_norm": 0.07159882038831711, + "learning_rate": 9.16242855214006e-05, + "loss": 0.2722, + "step": 29018 + }, + { + "epoch": 2.3508587167854826, + "grad_norm": 0.05206114426255226, + "learning_rate": 9.161978486880598e-05, + "loss": 0.2074, + "step": 29019 + }, + { + "epoch": 2.3509397278029813, + "grad_norm": 0.06335695832967758, + "learning_rate": 9.161528421621135e-05, + "loss": 0.2361, + "step": 29020 + }, + { + "epoch": 2.3510207388204796, + "grad_norm": 0.07118600606918335, + "learning_rate": 9.161078356361673e-05, + "loss": 0.2682, + "step": 29021 + }, + { + "epoch": 2.351101749837978, + "grad_norm": 0.06725834310054779, + "learning_rate": 9.16062829110221e-05, + "loss": 0.2615, + "step": 29022 + }, + { + "epoch": 2.3511827608554765, + "grad_norm": 0.0714595764875412, + "learning_rate": 9.160178225842747e-05, + "loss": 0.2619, + "step": 29023 + }, + { + "epoch": 2.3512637718729748, + "grad_norm": 0.0766262337565422, + "learning_rate": 9.159728160583285e-05, + "loss": 0.2714, + "step": 29024 + }, + { + "epoch": 2.351344782890473, + "grad_norm": 0.07190114259719849, + "learning_rate": 9.159278095323822e-05, + "loss": 0.2599, + "step": 29025 + }, + { + "epoch": 2.3514257939079712, + "grad_norm": 0.06509433686733246, + "learning_rate": 9.158828030064361e-05, + "loss": 0.255, + "step": 29026 + }, + { + "epoch": 2.35150680492547, + "grad_norm": 0.06340566277503967, + "learning_rate": 9.158377964804897e-05, + "loss": 0.2432, + "step": 29027 + }, + { + "epoch": 2.351587815942968, + "grad_norm": 0.07813110947608948, + "learning_rate": 9.157927899545434e-05, + "loss": 0.2729, + "step": 29028 + }, + { + "epoch": 2.3516688269604664, + "grad_norm": 0.07088219374418259, + "learning_rate": 9.157477834285973e-05, + "loss": 0.2391, + "step": 29029 + }, + { + "epoch": 2.351749837977965, + "grad_norm": 0.0655277669429779, + "learning_rate": 9.157027769026509e-05, + "loss": 0.256, + "step": 29030 + }, + { + "epoch": 2.3518308489954634, + "grad_norm": 0.07638170570135117, + "learning_rate": 9.156577703767046e-05, + "loss": 
0.3067, + "step": 29031 + }, + { + "epoch": 2.3519118600129616, + "grad_norm": 0.05520417168736458, + "learning_rate": 9.156127638507585e-05, + "loss": 0.2231, + "step": 29032 + }, + { + "epoch": 2.3519928710304603, + "grad_norm": 0.061939746141433716, + "learning_rate": 9.155677573248121e-05, + "loss": 0.1929, + "step": 29033 + }, + { + "epoch": 2.3520738820479585, + "grad_norm": 0.06050903722643852, + "learning_rate": 9.155227507988658e-05, + "loss": 0.2748, + "step": 29034 + }, + { + "epoch": 2.3521548930654568, + "grad_norm": 0.06427094340324402, + "learning_rate": 9.154777442729197e-05, + "loss": 0.2544, + "step": 29035 + }, + { + "epoch": 2.3522359040829555, + "grad_norm": 0.06105329468846321, + "learning_rate": 9.154327377469733e-05, + "loss": 0.219, + "step": 29036 + }, + { + "epoch": 2.3523169151004537, + "grad_norm": 0.07421961426734924, + "learning_rate": 9.15387731221027e-05, + "loss": 0.2403, + "step": 29037 + }, + { + "epoch": 2.352397926117952, + "grad_norm": 0.0582902729511261, + "learning_rate": 9.153427246950809e-05, + "loss": 0.26, + "step": 29038 + }, + { + "epoch": 2.3524789371354506, + "grad_norm": 0.06710220873355865, + "learning_rate": 9.152977181691345e-05, + "loss": 0.26, + "step": 29039 + }, + { + "epoch": 2.352559948152949, + "grad_norm": 0.054716769605875015, + "learning_rate": 9.152527116431883e-05, + "loss": 0.2301, + "step": 29040 + }, + { + "epoch": 2.352640959170447, + "grad_norm": 0.06258168071508408, + "learning_rate": 9.152077051172421e-05, + "loss": 0.2367, + "step": 29041 + }, + { + "epoch": 2.3527219701879454, + "grad_norm": 0.0666784793138504, + "learning_rate": 9.151626985912957e-05, + "loss": 0.2377, + "step": 29042 + }, + { + "epoch": 2.352802981205444, + "grad_norm": 0.062231969088315964, + "learning_rate": 9.151176920653495e-05, + "loss": 0.2311, + "step": 29043 + }, + { + "epoch": 2.3528839922229423, + "grad_norm": 0.05620197951793671, + "learning_rate": 9.150726855394033e-05, + "loss": 0.2493, + "step": 29044 + }, + { + "epoch": 2.3529650032404406, + "grad_norm": 0.06929001957178116, + "learning_rate": 9.150276790134569e-05, + "loss": 0.2569, + "step": 29045 + }, + { + "epoch": 2.3530460142579392, + "grad_norm": 0.07018900662660599, + "learning_rate": 9.149826724875107e-05, + "loss": 0.2287, + "step": 29046 + }, + { + "epoch": 2.3531270252754375, + "grad_norm": 0.07081607729196548, + "learning_rate": 9.149376659615645e-05, + "loss": 0.2215, + "step": 29047 + }, + { + "epoch": 2.3532080362929357, + "grad_norm": 0.062088314443826675, + "learning_rate": 9.148926594356181e-05, + "loss": 0.2282, + "step": 29048 + }, + { + "epoch": 2.353289047310434, + "grad_norm": 0.06053098291158676, + "learning_rate": 9.148476529096719e-05, + "loss": 0.2198, + "step": 29049 + }, + { + "epoch": 2.3533700583279327, + "grad_norm": 0.07323571294546127, + "learning_rate": 9.148026463837257e-05, + "loss": 0.2468, + "step": 29050 + }, + { + "epoch": 2.353451069345431, + "grad_norm": 0.06491614133119583, + "learning_rate": 9.147576398577793e-05, + "loss": 0.2259, + "step": 29051 + }, + { + "epoch": 2.353532080362929, + "grad_norm": 0.06910113245248795, + "learning_rate": 9.147126333318332e-05, + "loss": 0.2277, + "step": 29052 + }, + { + "epoch": 2.353613091380428, + "grad_norm": 0.06695449352264404, + "learning_rate": 9.14667626805887e-05, + "loss": 0.2676, + "step": 29053 + }, + { + "epoch": 2.353694102397926, + "grad_norm": 0.07094959169626236, + "learning_rate": 9.146226202799406e-05, + "loss": 0.2747, + "step": 29054 + }, + { + "epoch": 2.3537751134154243, + 
"grad_norm": 0.0783795416355133, + "learning_rate": 9.145776137539944e-05, + "loss": 0.2644, + "step": 29055 + }, + { + "epoch": 2.353856124432923, + "grad_norm": 0.09245872497558594, + "learning_rate": 9.145326072280482e-05, + "loss": 0.3017, + "step": 29056 + }, + { + "epoch": 2.3539371354504213, + "grad_norm": 0.06651142239570618, + "learning_rate": 9.144876007021018e-05, + "loss": 0.2586, + "step": 29057 + }, + { + "epoch": 2.3540181464679195, + "grad_norm": 0.0733952522277832, + "learning_rate": 9.144425941761556e-05, + "loss": 0.2572, + "step": 29058 + }, + { + "epoch": 2.354099157485418, + "grad_norm": 0.0771905779838562, + "learning_rate": 9.143975876502094e-05, + "loss": 0.2467, + "step": 29059 + }, + { + "epoch": 2.3541801685029164, + "grad_norm": 0.06089000403881073, + "learning_rate": 9.14352581124263e-05, + "loss": 0.2183, + "step": 29060 + }, + { + "epoch": 2.3542611795204147, + "grad_norm": 0.06993361562490463, + "learning_rate": 9.143075745983168e-05, + "loss": 0.2635, + "step": 29061 + }, + { + "epoch": 2.3543421905379134, + "grad_norm": 0.06053172051906586, + "learning_rate": 9.142625680723706e-05, + "loss": 0.2675, + "step": 29062 + }, + { + "epoch": 2.3544232015554116, + "grad_norm": 0.06449250131845474, + "learning_rate": 9.142175615464242e-05, + "loss": 0.2575, + "step": 29063 + }, + { + "epoch": 2.35450421257291, + "grad_norm": 0.07320661842823029, + "learning_rate": 9.14172555020478e-05, + "loss": 0.2607, + "step": 29064 + }, + { + "epoch": 2.354585223590408, + "grad_norm": 0.0640730932354927, + "learning_rate": 9.141275484945318e-05, + "loss": 0.2475, + "step": 29065 + }, + { + "epoch": 2.354666234607907, + "grad_norm": 0.055678799748420715, + "learning_rate": 9.140825419685854e-05, + "loss": 0.1937, + "step": 29066 + }, + { + "epoch": 2.354747245625405, + "grad_norm": 0.060588981956243515, + "learning_rate": 9.140375354426393e-05, + "loss": 0.2503, + "step": 29067 + }, + { + "epoch": 2.3548282566429033, + "grad_norm": 0.06185297295451164, + "learning_rate": 9.13992528916693e-05, + "loss": 0.2874, + "step": 29068 + }, + { + "epoch": 2.354909267660402, + "grad_norm": 0.0669855922460556, + "learning_rate": 9.139475223907466e-05, + "loss": 0.2355, + "step": 29069 + }, + { + "epoch": 2.3549902786779002, + "grad_norm": 0.0630691796541214, + "learning_rate": 9.139025158648005e-05, + "loss": 0.2245, + "step": 29070 + }, + { + "epoch": 2.3550712896953985, + "grad_norm": 0.05715397745370865, + "learning_rate": 9.138575093388542e-05, + "loss": 0.2487, + "step": 29071 + }, + { + "epoch": 2.3551523007128967, + "grad_norm": 0.0638887882232666, + "learning_rate": 9.138125028129078e-05, + "loss": 0.287, + "step": 29072 + }, + { + "epoch": 2.3552333117303954, + "grad_norm": 0.05640844628214836, + "learning_rate": 9.137674962869617e-05, + "loss": 0.228, + "step": 29073 + }, + { + "epoch": 2.3553143227478937, + "grad_norm": 0.06533593684434891, + "learning_rate": 9.137224897610154e-05, + "loss": 0.2561, + "step": 29074 + }, + { + "epoch": 2.355395333765392, + "grad_norm": 0.056167490780353546, + "learning_rate": 9.13677483235069e-05, + "loss": 0.2184, + "step": 29075 + }, + { + "epoch": 2.3554763447828906, + "grad_norm": 0.0543614998459816, + "learning_rate": 9.136324767091229e-05, + "loss": 0.2405, + "step": 29076 + }, + { + "epoch": 2.355557355800389, + "grad_norm": 0.05727843940258026, + "learning_rate": 9.135874701831766e-05, + "loss": 0.2459, + "step": 29077 + }, + { + "epoch": 2.355638366817887, + "grad_norm": 0.06158679351210594, + "learning_rate": 9.135424636572304e-05, + 
"loss": 0.2419, + "step": 29078 + }, + { + "epoch": 2.3557193778353858, + "grad_norm": 0.06343264132738113, + "learning_rate": 9.134974571312841e-05, + "loss": 0.2282, + "step": 29079 + }, + { + "epoch": 2.355800388852884, + "grad_norm": 0.07503470033407211, + "learning_rate": 9.134524506053378e-05, + "loss": 0.2314, + "step": 29080 + }, + { + "epoch": 2.3558813998703823, + "grad_norm": 0.0750756710767746, + "learning_rate": 9.134074440793916e-05, + "loss": 0.2241, + "step": 29081 + }, + { + "epoch": 2.355962410887881, + "grad_norm": 0.06549500674009323, + "learning_rate": 9.133624375534453e-05, + "loss": 0.2455, + "step": 29082 + }, + { + "epoch": 2.356043421905379, + "grad_norm": 0.06845001876354218, + "learning_rate": 9.13317431027499e-05, + "loss": 0.2723, + "step": 29083 + }, + { + "epoch": 2.3561244329228774, + "grad_norm": 0.10937629640102386, + "learning_rate": 9.132724245015528e-05, + "loss": 0.2576, + "step": 29084 + }, + { + "epoch": 2.356205443940376, + "grad_norm": 0.07276338338851929, + "learning_rate": 9.132274179756065e-05, + "loss": 0.2463, + "step": 29085 + }, + { + "epoch": 2.3562864549578744, + "grad_norm": 0.08041878789663315, + "learning_rate": 9.131824114496602e-05, + "loss": 0.2571, + "step": 29086 + }, + { + "epoch": 2.3563674659753726, + "grad_norm": 0.0719827190041542, + "learning_rate": 9.13137404923714e-05, + "loss": 0.2842, + "step": 29087 + }, + { + "epoch": 2.356448476992871, + "grad_norm": 0.07114730775356293, + "learning_rate": 9.130923983977677e-05, + "loss": 0.2465, + "step": 29088 + }, + { + "epoch": 2.3565294880103695, + "grad_norm": 0.11425285041332245, + "learning_rate": 9.130473918718215e-05, + "loss": 0.2278, + "step": 29089 + }, + { + "epoch": 2.356610499027868, + "grad_norm": 0.07294203341007233, + "learning_rate": 9.130023853458752e-05, + "loss": 0.2497, + "step": 29090 + }, + { + "epoch": 2.356691510045366, + "grad_norm": 0.0667933002114296, + "learning_rate": 9.129573788199289e-05, + "loss": 0.2678, + "step": 29091 + }, + { + "epoch": 2.3567725210628647, + "grad_norm": 0.07403317093849182, + "learning_rate": 9.129123722939827e-05, + "loss": 0.2561, + "step": 29092 + }, + { + "epoch": 2.356853532080363, + "grad_norm": 0.06869540363550186, + "learning_rate": 9.128673657680364e-05, + "loss": 0.3012, + "step": 29093 + }, + { + "epoch": 2.356934543097861, + "grad_norm": 0.06836505979299545, + "learning_rate": 9.128223592420901e-05, + "loss": 0.2439, + "step": 29094 + }, + { + "epoch": 2.3570155541153595, + "grad_norm": 0.05557122081518173, + "learning_rate": 9.127773527161439e-05, + "loss": 0.2789, + "step": 29095 + }, + { + "epoch": 2.357096565132858, + "grad_norm": 0.07525942474603653, + "learning_rate": 9.127323461901976e-05, + "loss": 0.2753, + "step": 29096 + }, + { + "epoch": 2.3571775761503564, + "grad_norm": 0.049367401748895645, + "learning_rate": 9.126873396642513e-05, + "loss": 0.228, + "step": 29097 + }, + { + "epoch": 2.3572585871678546, + "grad_norm": 0.06840921193361282, + "learning_rate": 9.126423331383051e-05, + "loss": 0.2433, + "step": 29098 + }, + { + "epoch": 2.3573395981853533, + "grad_norm": 0.06897839158773422, + "learning_rate": 9.125973266123588e-05, + "loss": 0.2622, + "step": 29099 + }, + { + "epoch": 2.3574206092028516, + "grad_norm": 0.06702248752117157, + "learning_rate": 9.125523200864125e-05, + "loss": 0.2476, + "step": 29100 + }, + { + "epoch": 2.35750162022035, + "grad_norm": 0.06632895767688751, + "learning_rate": 9.125073135604663e-05, + "loss": 0.2635, + "step": 29101 + }, + { + "epoch": 2.3575826312378485, + 
"grad_norm": 0.06883935630321503, + "learning_rate": 9.1246230703452e-05, + "loss": 0.3266, + "step": 29102 + }, + { + "epoch": 2.3576636422553467, + "grad_norm": 0.058567121624946594, + "learning_rate": 9.124173005085738e-05, + "loss": 0.2546, + "step": 29103 + }, + { + "epoch": 2.357744653272845, + "grad_norm": 0.06617163121700287, + "learning_rate": 9.123722939826276e-05, + "loss": 0.3136, + "step": 29104 + }, + { + "epoch": 2.3578256642903437, + "grad_norm": 0.05383819714188576, + "learning_rate": 9.123272874566812e-05, + "loss": 0.2334, + "step": 29105 + }, + { + "epoch": 2.357906675307842, + "grad_norm": 0.06604919582605362, + "learning_rate": 9.12282280930735e-05, + "loss": 0.2537, + "step": 29106 + }, + { + "epoch": 2.35798768632534, + "grad_norm": 0.06112247705459595, + "learning_rate": 9.122372744047888e-05, + "loss": 0.2292, + "step": 29107 + }, + { + "epoch": 2.358068697342839, + "grad_norm": 0.07379915565252304, + "learning_rate": 9.121922678788424e-05, + "loss": 0.2603, + "step": 29108 + }, + { + "epoch": 2.358149708360337, + "grad_norm": 0.07685612142086029, + "learning_rate": 9.121472613528962e-05, + "loss": 0.2514, + "step": 29109 + }, + { + "epoch": 2.3582307193778353, + "grad_norm": 0.08886110037565231, + "learning_rate": 9.1210225482695e-05, + "loss": 0.2416, + "step": 29110 + }, + { + "epoch": 2.3583117303953336, + "grad_norm": 0.0855654776096344, + "learning_rate": 9.120572483010036e-05, + "loss": 0.2798, + "step": 29111 + }, + { + "epoch": 2.3583927414128323, + "grad_norm": 0.058703597635030746, + "learning_rate": 9.120122417750574e-05, + "loss": 0.263, + "step": 29112 + }, + { + "epoch": 2.3584737524303305, + "grad_norm": 0.08543696999549866, + "learning_rate": 9.119672352491113e-05, + "loss": 0.2868, + "step": 29113 + }, + { + "epoch": 2.3585547634478288, + "grad_norm": 0.08278053253889084, + "learning_rate": 9.119222287231649e-05, + "loss": 0.2598, + "step": 29114 + }, + { + "epoch": 2.3586357744653275, + "grad_norm": 0.06764932721853256, + "learning_rate": 9.118772221972186e-05, + "loss": 0.2557, + "step": 29115 + }, + { + "epoch": 2.3587167854828257, + "grad_norm": 0.07048583030700684, + "learning_rate": 9.118322156712725e-05, + "loss": 0.2823, + "step": 29116 + }, + { + "epoch": 2.358797796500324, + "grad_norm": 0.06134413182735443, + "learning_rate": 9.11787209145326e-05, + "loss": 0.2635, + "step": 29117 + }, + { + "epoch": 2.358878807517822, + "grad_norm": 0.07762916386127472, + "learning_rate": 9.117422026193798e-05, + "loss": 0.2568, + "step": 29118 + }, + { + "epoch": 2.358959818535321, + "grad_norm": 0.04822731763124466, + "learning_rate": 9.116971960934337e-05, + "loss": 0.2413, + "step": 29119 + }, + { + "epoch": 2.359040829552819, + "grad_norm": 0.06413828581571579, + "learning_rate": 9.116521895674873e-05, + "loss": 0.2573, + "step": 29120 + }, + { + "epoch": 2.3591218405703174, + "grad_norm": 0.060196202248334885, + "learning_rate": 9.11607183041541e-05, + "loss": 0.2772, + "step": 29121 + }, + { + "epoch": 2.359202851587816, + "grad_norm": 0.07289192825555801, + "learning_rate": 9.115621765155949e-05, + "loss": 0.2606, + "step": 29122 + }, + { + "epoch": 2.3592838626053143, + "grad_norm": 0.07562807947397232, + "learning_rate": 9.115171699896485e-05, + "loss": 0.2779, + "step": 29123 + }, + { + "epoch": 2.3593648736228126, + "grad_norm": 0.06476518511772156, + "learning_rate": 9.114721634637022e-05, + "loss": 0.2334, + "step": 29124 + }, + { + "epoch": 2.3594458846403112, + "grad_norm": 0.06918711960315704, + "learning_rate": 9.114271569377561e-05, 
+ "loss": 0.2648, + "step": 29125 + }, + { + "epoch": 2.3595268956578095, + "grad_norm": 0.07326087355613708, + "learning_rate": 9.113821504118097e-05, + "loss": 0.2548, + "step": 29126 + }, + { + "epoch": 2.3596079066753077, + "grad_norm": 0.054968882352113724, + "learning_rate": 9.113371438858634e-05, + "loss": 0.2403, + "step": 29127 + }, + { + "epoch": 2.3596889176928064, + "grad_norm": 0.08446795493364334, + "learning_rate": 9.112921373599173e-05, + "loss": 0.2453, + "step": 29128 + }, + { + "epoch": 2.3597699287103047, + "grad_norm": 0.06379998475313187, + "learning_rate": 9.112471308339709e-05, + "loss": 0.2709, + "step": 29129 + }, + { + "epoch": 2.359850939727803, + "grad_norm": 0.054361291229724884, + "learning_rate": 9.112021243080248e-05, + "loss": 0.2494, + "step": 29130 + }, + { + "epoch": 2.3599319507453016, + "grad_norm": 0.06530174612998962, + "learning_rate": 9.111571177820785e-05, + "loss": 0.234, + "step": 29131 + }, + { + "epoch": 2.3600129617628, + "grad_norm": 0.07407192140817642, + "learning_rate": 9.111121112561321e-05, + "loss": 0.2432, + "step": 29132 + }, + { + "epoch": 2.360093972780298, + "grad_norm": 0.06312862038612366, + "learning_rate": 9.11067104730186e-05, + "loss": 0.2423, + "step": 29133 + }, + { + "epoch": 2.3601749837977963, + "grad_norm": 0.06313052028417587, + "learning_rate": 9.110220982042397e-05, + "loss": 0.2631, + "step": 29134 + }, + { + "epoch": 2.360255994815295, + "grad_norm": 0.0686829537153244, + "learning_rate": 9.109770916782933e-05, + "loss": 0.2375, + "step": 29135 + }, + { + "epoch": 2.3603370058327933, + "grad_norm": 0.05823444202542305, + "learning_rate": 9.109320851523472e-05, + "loss": 0.2578, + "step": 29136 + }, + { + "epoch": 2.3604180168502915, + "grad_norm": 0.06658496707677841, + "learning_rate": 9.108870786264009e-05, + "loss": 0.256, + "step": 29137 + }, + { + "epoch": 2.36049902786779, + "grad_norm": 0.06323231011629105, + "learning_rate": 9.108420721004545e-05, + "loss": 0.2386, + "step": 29138 + }, + { + "epoch": 2.3605800388852884, + "grad_norm": 0.06542006134986877, + "learning_rate": 9.107970655745084e-05, + "loss": 0.2323, + "step": 29139 + }, + { + "epoch": 2.3606610499027867, + "grad_norm": 0.06671994924545288, + "learning_rate": 9.107520590485621e-05, + "loss": 0.2718, + "step": 29140 + }, + { + "epoch": 2.360742060920285, + "grad_norm": 0.06911414861679077, + "learning_rate": 9.107070525226157e-05, + "loss": 0.2941, + "step": 29141 + }, + { + "epoch": 2.3608230719377836, + "grad_norm": 0.0645342469215393, + "learning_rate": 9.106620459966696e-05, + "loss": 0.243, + "step": 29142 + }, + { + "epoch": 2.360904082955282, + "grad_norm": 0.06701602786779404, + "learning_rate": 9.106170394707233e-05, + "loss": 0.2663, + "step": 29143 + }, + { + "epoch": 2.36098509397278, + "grad_norm": 0.06413000077009201, + "learning_rate": 9.10572032944777e-05, + "loss": 0.2423, + "step": 29144 + }, + { + "epoch": 2.361066104990279, + "grad_norm": 0.06362126022577286, + "learning_rate": 9.105270264188308e-05, + "loss": 0.2118, + "step": 29145 + }, + { + "epoch": 2.361147116007777, + "grad_norm": 0.05643783137202263, + "learning_rate": 9.104820198928845e-05, + "loss": 0.2042, + "step": 29146 + }, + { + "epoch": 2.3612281270252753, + "grad_norm": 0.07649555057287216, + "learning_rate": 9.104370133669381e-05, + "loss": 0.2315, + "step": 29147 + }, + { + "epoch": 2.361309138042774, + "grad_norm": 0.06502323597669601, + "learning_rate": 9.10392006840992e-05, + "loss": 0.2491, + "step": 29148 + }, + { + "epoch": 2.3613901490602722, + 
"grad_norm": 0.09323930740356445, + "learning_rate": 9.103470003150457e-05, + "loss": 0.249, + "step": 29149 + }, + { + "epoch": 2.3614711600777705, + "grad_norm": 0.06336724758148193, + "learning_rate": 9.103019937890994e-05, + "loss": 0.2039, + "step": 29150 + }, + { + "epoch": 2.361552171095269, + "grad_norm": 0.07176464796066284, + "learning_rate": 9.102569872631532e-05, + "loss": 0.2359, + "step": 29151 + }, + { + "epoch": 2.3616331821127674, + "grad_norm": 0.07099702209234238, + "learning_rate": 9.10211980737207e-05, + "loss": 0.2616, + "step": 29152 + }, + { + "epoch": 2.3617141931302656, + "grad_norm": 0.06924501806497574, + "learning_rate": 9.101669742112606e-05, + "loss": 0.2431, + "step": 29153 + }, + { + "epoch": 2.3617952041477643, + "grad_norm": 0.05669789761304855, + "learning_rate": 9.101219676853144e-05, + "loss": 0.2405, + "step": 29154 + }, + { + "epoch": 2.3618762151652626, + "grad_norm": 0.06776037812232971, + "learning_rate": 9.100769611593682e-05, + "loss": 0.248, + "step": 29155 + }, + { + "epoch": 2.361957226182761, + "grad_norm": 0.061502985656261444, + "learning_rate": 9.100319546334219e-05, + "loss": 0.2553, + "step": 29156 + }, + { + "epoch": 2.362038237200259, + "grad_norm": 0.061214275658130646, + "learning_rate": 9.099869481074756e-05, + "loss": 0.2245, + "step": 29157 + }, + { + "epoch": 2.3621192482177578, + "grad_norm": 0.07771072536706924, + "learning_rate": 9.099419415815294e-05, + "loss": 0.2295, + "step": 29158 + }, + { + "epoch": 2.362200259235256, + "grad_norm": 0.06995867937803268, + "learning_rate": 9.098969350555831e-05, + "loss": 0.2534, + "step": 29159 + }, + { + "epoch": 2.3622812702527543, + "grad_norm": 0.06866510212421417, + "learning_rate": 9.098519285296368e-05, + "loss": 0.2699, + "step": 29160 + }, + { + "epoch": 2.362362281270253, + "grad_norm": 0.06447815895080566, + "learning_rate": 9.098069220036906e-05, + "loss": 0.2618, + "step": 29161 + }, + { + "epoch": 2.362443292287751, + "grad_norm": 0.08535295724868774, + "learning_rate": 9.097619154777443e-05, + "loss": 0.296, + "step": 29162 + }, + { + "epoch": 2.3625243033052494, + "grad_norm": 0.06989932805299759, + "learning_rate": 9.09716908951798e-05, + "loss": 0.2501, + "step": 29163 + }, + { + "epoch": 2.3626053143227477, + "grad_norm": 0.06145579367876053, + "learning_rate": 9.096719024258518e-05, + "loss": 0.239, + "step": 29164 + }, + { + "epoch": 2.3626863253402464, + "grad_norm": 0.07149982452392578, + "learning_rate": 9.096268958999055e-05, + "loss": 0.2636, + "step": 29165 + }, + { + "epoch": 2.3627673363577446, + "grad_norm": 0.06792671233415604, + "learning_rate": 9.095818893739593e-05, + "loss": 0.27, + "step": 29166 + }, + { + "epoch": 2.362848347375243, + "grad_norm": 0.06381487846374512, + "learning_rate": 9.09536882848013e-05, + "loss": 0.2677, + "step": 29167 + }, + { + "epoch": 2.3629293583927415, + "grad_norm": 0.055963076651096344, + "learning_rate": 9.094918763220667e-05, + "loss": 0.2372, + "step": 29168 + }, + { + "epoch": 2.36301036941024, + "grad_norm": 0.07309937477111816, + "learning_rate": 9.094468697961205e-05, + "loss": 0.2895, + "step": 29169 + }, + { + "epoch": 2.363091380427738, + "grad_norm": 0.06073899194598198, + "learning_rate": 9.094018632701742e-05, + "loss": 0.2794, + "step": 29170 + }, + { + "epoch": 2.3631723914452367, + "grad_norm": 0.06594845652580261, + "learning_rate": 9.09356856744228e-05, + "loss": 0.2368, + "step": 29171 + }, + { + "epoch": 2.363253402462735, + "grad_norm": 0.06607092171907425, + "learning_rate": 9.093118502182817e-05, 
+ "loss": 0.2608, + "step": 29172 + }, + { + "epoch": 2.363334413480233, + "grad_norm": 0.0516083724796772, + "learning_rate": 9.092668436923354e-05, + "loss": 0.2241, + "step": 29173 + }, + { + "epoch": 2.363415424497732, + "grad_norm": 0.05938537046313286, + "learning_rate": 9.092218371663892e-05, + "loss": 0.2413, + "step": 29174 + }, + { + "epoch": 2.36349643551523, + "grad_norm": 0.057529933750629425, + "learning_rate": 9.091768306404429e-05, + "loss": 0.2434, + "step": 29175 + }, + { + "epoch": 2.3635774465327284, + "grad_norm": 0.06252843886613846, + "learning_rate": 9.091318241144966e-05, + "loss": 0.2634, + "step": 29176 + }, + { + "epoch": 2.363658457550227, + "grad_norm": 0.08230286836624146, + "learning_rate": 9.090868175885504e-05, + "loss": 0.2736, + "step": 29177 + }, + { + "epoch": 2.3637394685677253, + "grad_norm": 0.05570271983742714, + "learning_rate": 9.090418110626041e-05, + "loss": 0.1984, + "step": 29178 + }, + { + "epoch": 2.3638204795852236, + "grad_norm": 0.05833020433783531, + "learning_rate": 9.089968045366578e-05, + "loss": 0.232, + "step": 29179 + }, + { + "epoch": 2.363901490602722, + "grad_norm": 0.06215290352702141, + "learning_rate": 9.089517980107116e-05, + "loss": 0.2629, + "step": 29180 + }, + { + "epoch": 2.3639825016202205, + "grad_norm": 0.07116110622882843, + "learning_rate": 9.089067914847653e-05, + "loss": 0.2618, + "step": 29181 + }, + { + "epoch": 2.3640635126377187, + "grad_norm": 0.07417536526918411, + "learning_rate": 9.08861784958819e-05, + "loss": 0.2508, + "step": 29182 + }, + { + "epoch": 2.364144523655217, + "grad_norm": 0.0574500672519207, + "learning_rate": 9.088167784328728e-05, + "loss": 0.2698, + "step": 29183 + }, + { + "epoch": 2.3642255346727157, + "grad_norm": 0.06579186022281647, + "learning_rate": 9.087717719069265e-05, + "loss": 0.2372, + "step": 29184 + }, + { + "epoch": 2.364306545690214, + "grad_norm": 0.06995394080877304, + "learning_rate": 9.087267653809804e-05, + "loss": 0.2239, + "step": 29185 + }, + { + "epoch": 2.364387556707712, + "grad_norm": 0.06749846041202545, + "learning_rate": 9.08681758855034e-05, + "loss": 0.2503, + "step": 29186 + }, + { + "epoch": 2.3644685677252104, + "grad_norm": 0.0807734802365303, + "learning_rate": 9.086367523290877e-05, + "loss": 0.2808, + "step": 29187 + }, + { + "epoch": 2.364549578742709, + "grad_norm": 0.060010310262441635, + "learning_rate": 9.085917458031416e-05, + "loss": 0.2503, + "step": 29188 + }, + { + "epoch": 2.3646305897602073, + "grad_norm": 0.0891261026263237, + "learning_rate": 9.085467392771952e-05, + "loss": 0.2796, + "step": 29189 + }, + { + "epoch": 2.3647116007777056, + "grad_norm": 0.06593820452690125, + "learning_rate": 9.085017327512489e-05, + "loss": 0.2598, + "step": 29190 + }, + { + "epoch": 2.3647926117952043, + "grad_norm": 0.06441017240285873, + "learning_rate": 9.084567262253028e-05, + "loss": 0.2758, + "step": 29191 + }, + { + "epoch": 2.3648736228127025, + "grad_norm": 0.05712772160768509, + "learning_rate": 9.084117196993564e-05, + "loss": 0.2509, + "step": 29192 + }, + { + "epoch": 2.3649546338302008, + "grad_norm": 0.053244173526763916, + "learning_rate": 9.083667131734101e-05, + "loss": 0.2186, + "step": 29193 + }, + { + "epoch": 2.3650356448476995, + "grad_norm": 0.06399260461330414, + "learning_rate": 9.08321706647464e-05, + "loss": 0.2382, + "step": 29194 + }, + { + "epoch": 2.3651166558651977, + "grad_norm": 0.10175412148237228, + "learning_rate": 9.082767001215176e-05, + "loss": 0.2739, + "step": 29195 + }, + { + "epoch": 2.365197666882696, 
+ "grad_norm": 0.09233570098876953, + "learning_rate": 9.082316935955713e-05, + "loss": 0.2991, + "step": 29196 + }, + { + "epoch": 2.3652786779001946, + "grad_norm": 0.08154605329036713, + "learning_rate": 9.081866870696252e-05, + "loss": 0.2443, + "step": 29197 + }, + { + "epoch": 2.365359688917693, + "grad_norm": 0.07981499284505844, + "learning_rate": 9.081416805436788e-05, + "loss": 0.2662, + "step": 29198 + }, + { + "epoch": 2.365440699935191, + "grad_norm": 0.060174666345119476, + "learning_rate": 9.080966740177326e-05, + "loss": 0.2389, + "step": 29199 + }, + { + "epoch": 2.3655217109526894, + "grad_norm": 0.06928128004074097, + "learning_rate": 9.080516674917864e-05, + "loss": 0.259, + "step": 29200 + }, + { + "epoch": 2.365602721970188, + "grad_norm": 0.0812605544924736, + "learning_rate": 9.0800666096584e-05, + "loss": 0.2458, + "step": 29201 + }, + { + "epoch": 2.3656837329876863, + "grad_norm": 0.08370108902454376, + "learning_rate": 9.079616544398938e-05, + "loss": 0.2776, + "step": 29202 + }, + { + "epoch": 2.3657647440051845, + "grad_norm": 0.058878593146800995, + "learning_rate": 9.079166479139476e-05, + "loss": 0.2219, + "step": 29203 + }, + { + "epoch": 2.3658457550226832, + "grad_norm": 0.08872153609991074, + "learning_rate": 9.078716413880012e-05, + "loss": 0.2776, + "step": 29204 + }, + { + "epoch": 2.3659267660401815, + "grad_norm": 0.06256221234798431, + "learning_rate": 9.07826634862055e-05, + "loss": 0.2237, + "step": 29205 + }, + { + "epoch": 2.3660077770576797, + "grad_norm": 0.06201305240392685, + "learning_rate": 9.077816283361088e-05, + "loss": 0.2855, + "step": 29206 + }, + { + "epoch": 2.3660887880751784, + "grad_norm": 0.06656473875045776, + "learning_rate": 9.077366218101624e-05, + "loss": 0.2318, + "step": 29207 + }, + { + "epoch": 2.3661697990926767, + "grad_norm": 0.06485893577337265, + "learning_rate": 9.076916152842162e-05, + "loss": 0.2734, + "step": 29208 + }, + { + "epoch": 2.366250810110175, + "grad_norm": 0.06904721260070801, + "learning_rate": 9.0764660875827e-05, + "loss": 0.2874, + "step": 29209 + }, + { + "epoch": 2.366331821127673, + "grad_norm": 0.0712614580988884, + "learning_rate": 9.076016022323236e-05, + "loss": 0.2972, + "step": 29210 + }, + { + "epoch": 2.366412832145172, + "grad_norm": 0.06572310626506805, + "learning_rate": 9.075565957063775e-05, + "loss": 0.2869, + "step": 29211 + }, + { + "epoch": 2.36649384316267, + "grad_norm": 0.055767737329006195, + "learning_rate": 9.075115891804313e-05, + "loss": 0.2331, + "step": 29212 + }, + { + "epoch": 2.3665748541801683, + "grad_norm": 0.06114116311073303, + "learning_rate": 9.074665826544849e-05, + "loss": 0.2281, + "step": 29213 + }, + { + "epoch": 2.366655865197667, + "grad_norm": 0.06554603576660156, + "learning_rate": 9.074215761285387e-05, + "loss": 0.2319, + "step": 29214 + }, + { + "epoch": 2.3667368762151653, + "grad_norm": 0.06667567044496536, + "learning_rate": 9.073765696025925e-05, + "loss": 0.2464, + "step": 29215 + }, + { + "epoch": 2.3668178872326635, + "grad_norm": 0.0630006194114685, + "learning_rate": 9.07331563076646e-05, + "loss": 0.2647, + "step": 29216 + }, + { + "epoch": 2.366898898250162, + "grad_norm": 0.0625307708978653, + "learning_rate": 9.072865565507e-05, + "loss": 0.2146, + "step": 29217 + }, + { + "epoch": 2.3669799092676604, + "grad_norm": 0.06067518889904022, + "learning_rate": 9.072415500247537e-05, + "loss": 0.2151, + "step": 29218 + }, + { + "epoch": 2.3670609202851587, + "grad_norm": 0.06341242045164108, + "learning_rate": 9.071965434988073e-05, + 
"loss": 0.2586, + "step": 29219 + }, + { + "epoch": 2.3671419313026574, + "grad_norm": 0.06968440860509872, + "learning_rate": 9.071515369728611e-05, + "loss": 0.234, + "step": 29220 + }, + { + "epoch": 2.3672229423201556, + "grad_norm": 0.07393652945756912, + "learning_rate": 9.071065304469149e-05, + "loss": 0.223, + "step": 29221 + }, + { + "epoch": 2.367303953337654, + "grad_norm": 0.06258944422006607, + "learning_rate": 9.070615239209685e-05, + "loss": 0.2252, + "step": 29222 + }, + { + "epoch": 2.367384964355152, + "grad_norm": 0.0730939507484436, + "learning_rate": 9.070165173950224e-05, + "loss": 0.2548, + "step": 29223 + }, + { + "epoch": 2.367465975372651, + "grad_norm": 0.05863727629184723, + "learning_rate": 9.069715108690761e-05, + "loss": 0.2166, + "step": 29224 + }, + { + "epoch": 2.367546986390149, + "grad_norm": 0.08163779973983765, + "learning_rate": 9.069265043431298e-05, + "loss": 0.267, + "step": 29225 + }, + { + "epoch": 2.3676279974076473, + "grad_norm": 0.07284118235111237, + "learning_rate": 9.068814978171836e-05, + "loss": 0.2513, + "step": 29226 + }, + { + "epoch": 2.367709008425146, + "grad_norm": 0.061009738594293594, + "learning_rate": 9.068364912912373e-05, + "loss": 0.2427, + "step": 29227 + }, + { + "epoch": 2.3677900194426442, + "grad_norm": 0.06732402741909027, + "learning_rate": 9.06791484765291e-05, + "loss": 0.2178, + "step": 29228 + }, + { + "epoch": 2.3678710304601425, + "grad_norm": 0.07382349669933319, + "learning_rate": 9.067464782393448e-05, + "loss": 0.2322, + "step": 29229 + }, + { + "epoch": 2.3679520414776407, + "grad_norm": 0.06112572178244591, + "learning_rate": 9.067014717133985e-05, + "loss": 0.213, + "step": 29230 + }, + { + "epoch": 2.3680330524951394, + "grad_norm": 0.07919428497552872, + "learning_rate": 9.066564651874522e-05, + "loss": 0.2283, + "step": 29231 + }, + { + "epoch": 2.3681140635126376, + "grad_norm": 0.07739866524934769, + "learning_rate": 9.06611458661506e-05, + "loss": 0.2545, + "step": 29232 + }, + { + "epoch": 2.368195074530136, + "grad_norm": 0.07174039632081985, + "learning_rate": 9.065664521355597e-05, + "loss": 0.2392, + "step": 29233 + }, + { + "epoch": 2.3682760855476346, + "grad_norm": 0.08616352826356888, + "learning_rate": 9.065214456096134e-05, + "loss": 0.2912, + "step": 29234 + }, + { + "epoch": 2.368357096565133, + "grad_norm": 0.07922189682722092, + "learning_rate": 9.064764390836672e-05, + "loss": 0.2886, + "step": 29235 + }, + { + "epoch": 2.368438107582631, + "grad_norm": 0.07131379842758179, + "learning_rate": 9.064314325577209e-05, + "loss": 0.2466, + "step": 29236 + }, + { + "epoch": 2.3685191186001298, + "grad_norm": 0.053437042981386185, + "learning_rate": 9.063864260317747e-05, + "loss": 0.2572, + "step": 29237 + }, + { + "epoch": 2.368600129617628, + "grad_norm": 0.08782981336116791, + "learning_rate": 9.063414195058284e-05, + "loss": 0.2602, + "step": 29238 + }, + { + "epoch": 2.3686811406351262, + "grad_norm": 0.061420753598213196, + "learning_rate": 9.062964129798821e-05, + "loss": 0.2428, + "step": 29239 + }, + { + "epoch": 2.368762151652625, + "grad_norm": 0.053099263459444046, + "learning_rate": 9.062514064539359e-05, + "loss": 0.2624, + "step": 29240 + }, + { + "epoch": 2.368843162670123, + "grad_norm": 0.055860213935375214, + "learning_rate": 9.062063999279896e-05, + "loss": 0.2477, + "step": 29241 + }, + { + "epoch": 2.3689241736876214, + "grad_norm": 0.057424698024988174, + "learning_rate": 9.061613934020433e-05, + "loss": 0.2316, + "step": 29242 + }, + { + "epoch": 2.36900518470512, 
+ "grad_norm": 0.059896305203437805, + "learning_rate": 9.061163868760971e-05, + "loss": 0.2346, + "step": 29243 + }, + { + "epoch": 2.3690861957226184, + "grad_norm": 0.060097161680459976, + "learning_rate": 9.060713803501508e-05, + "loss": 0.2623, + "step": 29244 + }, + { + "epoch": 2.3691672067401166, + "grad_norm": 0.06034714728593826, + "learning_rate": 9.060263738242045e-05, + "loss": 0.2934, + "step": 29245 + }, + { + "epoch": 2.369248217757615, + "grad_norm": 0.07298918068408966, + "learning_rate": 9.059813672982583e-05, + "loss": 0.2696, + "step": 29246 + }, + { + "epoch": 2.3693292287751135, + "grad_norm": 0.060322027653455734, + "learning_rate": 9.05936360772312e-05, + "loss": 0.2534, + "step": 29247 + }, + { + "epoch": 2.369410239792612, + "grad_norm": 0.08148866891860962, + "learning_rate": 9.058913542463658e-05, + "loss": 0.2583, + "step": 29248 + }, + { + "epoch": 2.36949125081011, + "grad_norm": 0.08276345580816269, + "learning_rate": 9.058463477204195e-05, + "loss": 0.2633, + "step": 29249 + }, + { + "epoch": 2.3695722618276087, + "grad_norm": 0.059174928814172745, + "learning_rate": 9.058013411944732e-05, + "loss": 0.2334, + "step": 29250 + }, + { + "epoch": 2.369653272845107, + "grad_norm": 0.06756450980901718, + "learning_rate": 9.05756334668527e-05, + "loss": 0.2424, + "step": 29251 + }, + { + "epoch": 2.369734283862605, + "grad_norm": 0.06810939311981201, + "learning_rate": 9.057113281425807e-05, + "loss": 0.2365, + "step": 29252 + }, + { + "epoch": 2.3698152948801035, + "grad_norm": 0.06994233280420303, + "learning_rate": 9.056663216166344e-05, + "loss": 0.2347, + "step": 29253 + }, + { + "epoch": 2.369896305897602, + "grad_norm": 0.07477480173110962, + "learning_rate": 9.056213150906882e-05, + "loss": 0.2462, + "step": 29254 + }, + { + "epoch": 2.3699773169151004, + "grad_norm": 0.056406669318675995, + "learning_rate": 9.055763085647419e-05, + "loss": 0.2442, + "step": 29255 + }, + { + "epoch": 2.3700583279325986, + "grad_norm": 0.06577787548303604, + "learning_rate": 9.055313020387956e-05, + "loss": 0.2065, + "step": 29256 + }, + { + "epoch": 2.3701393389500973, + "grad_norm": 0.08167983591556549, + "learning_rate": 9.054862955128494e-05, + "loss": 0.2722, + "step": 29257 + }, + { + "epoch": 2.3702203499675956, + "grad_norm": 0.06138281151652336, + "learning_rate": 9.054412889869031e-05, + "loss": 0.2421, + "step": 29258 + }, + { + "epoch": 2.370301360985094, + "grad_norm": 0.08680165559053421, + "learning_rate": 9.053962824609568e-05, + "loss": 0.2529, + "step": 29259 + }, + { + "epoch": 2.3703823720025925, + "grad_norm": 0.05767164006829262, + "learning_rate": 9.053512759350106e-05, + "loss": 0.2414, + "step": 29260 + }, + { + "epoch": 2.3704633830200907, + "grad_norm": 0.06463748961687088, + "learning_rate": 9.053062694090643e-05, + "loss": 0.2336, + "step": 29261 + }, + { + "epoch": 2.370544394037589, + "grad_norm": 0.060328882187604904, + "learning_rate": 9.05261262883118e-05, + "loss": 0.2762, + "step": 29262 + }, + { + "epoch": 2.3706254050550877, + "grad_norm": 0.06523432582616806, + "learning_rate": 9.052162563571719e-05, + "loss": 0.2519, + "step": 29263 + }, + { + "epoch": 2.370706416072586, + "grad_norm": 0.06499111652374268, + "learning_rate": 9.051712498312255e-05, + "loss": 0.2491, + "step": 29264 + }, + { + "epoch": 2.370787427090084, + "grad_norm": 0.06619423627853394, + "learning_rate": 9.051262433052793e-05, + "loss": 0.2648, + "step": 29265 + }, + { + "epoch": 2.370868438107583, + "grad_norm": 0.06158056482672691, + "learning_rate": 
9.050812367793331e-05, + "loss": 0.2727, + "step": 29266 + }, + { + "epoch": 2.370949449125081, + "grad_norm": 0.08038074523210526, + "learning_rate": 9.050362302533867e-05, + "loss": 0.2144, + "step": 29267 + }, + { + "epoch": 2.3710304601425793, + "grad_norm": 0.06463596224784851, + "learning_rate": 9.049912237274405e-05, + "loss": 0.2685, + "step": 29268 + }, + { + "epoch": 2.3711114711600776, + "grad_norm": 0.0551106333732605, + "learning_rate": 9.049462172014943e-05, + "loss": 0.2355, + "step": 29269 + }, + { + "epoch": 2.3711924821775763, + "grad_norm": 0.053852569311857224, + "learning_rate": 9.04901210675548e-05, + "loss": 0.2639, + "step": 29270 + }, + { + "epoch": 2.3712734931950745, + "grad_norm": 0.06700147688388824, + "learning_rate": 9.048562041496017e-05, + "loss": 0.2603, + "step": 29271 + }, + { + "epoch": 2.3713545042125728, + "grad_norm": 0.054714132100343704, + "learning_rate": 9.048111976236556e-05, + "loss": 0.1949, + "step": 29272 + }, + { + "epoch": 2.3714355152300715, + "grad_norm": 0.059138987213373184, + "learning_rate": 9.047661910977092e-05, + "loss": 0.2275, + "step": 29273 + }, + { + "epoch": 2.3715165262475697, + "grad_norm": 0.08097271621227264, + "learning_rate": 9.047211845717629e-05, + "loss": 0.2626, + "step": 29274 + }, + { + "epoch": 2.371597537265068, + "grad_norm": 0.08481959998607635, + "learning_rate": 9.046761780458168e-05, + "loss": 0.2558, + "step": 29275 + }, + { + "epoch": 2.371678548282566, + "grad_norm": 0.06269409507513046, + "learning_rate": 9.046311715198704e-05, + "loss": 0.2561, + "step": 29276 + }, + { + "epoch": 2.371759559300065, + "grad_norm": 0.06228793412446976, + "learning_rate": 9.045861649939241e-05, + "loss": 0.2448, + "step": 29277 + }, + { + "epoch": 2.371840570317563, + "grad_norm": 0.06148279458284378, + "learning_rate": 9.04541158467978e-05, + "loss": 0.299, + "step": 29278 + }, + { + "epoch": 2.3719215813350614, + "grad_norm": 0.06152509152889252, + "learning_rate": 9.044961519420316e-05, + "loss": 0.2714, + "step": 29279 + }, + { + "epoch": 2.37200259235256, + "grad_norm": 0.07238215953111649, + "learning_rate": 9.044511454160853e-05, + "loss": 0.237, + "step": 29280 + }, + { + "epoch": 2.3720836033700583, + "grad_norm": 0.0703226625919342, + "learning_rate": 9.044061388901392e-05, + "loss": 0.2399, + "step": 29281 + }, + { + "epoch": 2.3721646143875565, + "grad_norm": 0.06950215995311737, + "learning_rate": 9.043611323641928e-05, + "loss": 0.2455, + "step": 29282 + }, + { + "epoch": 2.3722456254050552, + "grad_norm": 0.06561096012592316, + "learning_rate": 9.043161258382465e-05, + "loss": 0.2642, + "step": 29283 + }, + { + "epoch": 2.3723266364225535, + "grad_norm": 0.05323941260576248, + "learning_rate": 9.042711193123004e-05, + "loss": 0.2313, + "step": 29284 + }, + { + "epoch": 2.3724076474400517, + "grad_norm": 0.057839252054691315, + "learning_rate": 9.04226112786354e-05, + "loss": 0.2092, + "step": 29285 + }, + { + "epoch": 2.3724886584575504, + "grad_norm": 0.06279532611370087, + "learning_rate": 9.041811062604077e-05, + "loss": 0.2515, + "step": 29286 + }, + { + "epoch": 2.3725696694750487, + "grad_norm": 0.059042852371931076, + "learning_rate": 9.041360997344616e-05, + "loss": 0.2575, + "step": 29287 + }, + { + "epoch": 2.372650680492547, + "grad_norm": 0.0693732351064682, + "learning_rate": 9.040910932085152e-05, + "loss": 0.2682, + "step": 29288 + }, + { + "epoch": 2.3727316915100456, + "grad_norm": 0.06384388357400894, + "learning_rate": 9.04046086682569e-05, + "loss": 0.247, + "step": 29289 + }, + { + 
"epoch": 2.372812702527544, + "grad_norm": 0.07595692574977875, + "learning_rate": 9.040010801566228e-05, + "loss": 0.2578, + "step": 29290 + }, + { + "epoch": 2.372893713545042, + "grad_norm": 0.07165265083312988, + "learning_rate": 9.039560736306765e-05, + "loss": 0.2329, + "step": 29291 + }, + { + "epoch": 2.3729747245625403, + "grad_norm": 0.06206320971250534, + "learning_rate": 9.039110671047303e-05, + "loss": 0.2453, + "step": 29292 + }, + { + "epoch": 2.373055735580039, + "grad_norm": 0.05709023401141167, + "learning_rate": 9.03866060578784e-05, + "loss": 0.2483, + "step": 29293 + }, + { + "epoch": 2.3731367465975373, + "grad_norm": 0.06202362850308418, + "learning_rate": 9.038210540528377e-05, + "loss": 0.2261, + "step": 29294 + }, + { + "epoch": 2.3732177576150355, + "grad_norm": 0.0633508712053299, + "learning_rate": 9.037760475268915e-05, + "loss": 0.2277, + "step": 29295 + }, + { + "epoch": 2.373298768632534, + "grad_norm": 0.08329858630895615, + "learning_rate": 9.037310410009452e-05, + "loss": 0.3046, + "step": 29296 + }, + { + "epoch": 2.3733797796500324, + "grad_norm": 0.07049525529146194, + "learning_rate": 9.03686034474999e-05, + "loss": 0.2704, + "step": 29297 + }, + { + "epoch": 2.3734607906675307, + "grad_norm": 0.06773684173822403, + "learning_rate": 9.036410279490527e-05, + "loss": 0.2407, + "step": 29298 + }, + { + "epoch": 2.373541801685029, + "grad_norm": 0.0773211419582367, + "learning_rate": 9.035960214231064e-05, + "loss": 0.236, + "step": 29299 + }, + { + "epoch": 2.3736228127025276, + "grad_norm": 0.07259319722652435, + "learning_rate": 9.035510148971602e-05, + "loss": 0.2451, + "step": 29300 + }, + { + "epoch": 2.373703823720026, + "grad_norm": 0.07149739563465118, + "learning_rate": 9.035060083712139e-05, + "loss": 0.2715, + "step": 29301 + }, + { + "epoch": 2.373784834737524, + "grad_norm": 0.07947015017271042, + "learning_rate": 9.034610018452676e-05, + "loss": 0.2534, + "step": 29302 + }, + { + "epoch": 2.373865845755023, + "grad_norm": 0.05928175151348114, + "learning_rate": 9.034159953193214e-05, + "loss": 0.2417, + "step": 29303 + }, + { + "epoch": 2.373946856772521, + "grad_norm": 0.07301197201013565, + "learning_rate": 9.033709887933751e-05, + "loss": 0.2505, + "step": 29304 + }, + { + "epoch": 2.3740278677900193, + "grad_norm": 0.06998045742511749, + "learning_rate": 9.033259822674288e-05, + "loss": 0.2652, + "step": 29305 + }, + { + "epoch": 2.374108878807518, + "grad_norm": 0.055942460894584656, + "learning_rate": 9.032809757414826e-05, + "loss": 0.2293, + "step": 29306 + }, + { + "epoch": 2.374189889825016, + "grad_norm": 0.05982231721282005, + "learning_rate": 9.032359692155363e-05, + "loss": 0.2461, + "step": 29307 + }, + { + "epoch": 2.3742709008425145, + "grad_norm": 0.05536472797393799, + "learning_rate": 9.0319096268959e-05, + "loss": 0.2613, + "step": 29308 + }, + { + "epoch": 2.374351911860013, + "grad_norm": 0.06701016426086426, + "learning_rate": 9.031459561636438e-05, + "loss": 0.2521, + "step": 29309 + }, + { + "epoch": 2.3744329228775114, + "grad_norm": 0.059674832969903946, + "learning_rate": 9.031009496376975e-05, + "loss": 0.2505, + "step": 29310 + }, + { + "epoch": 2.3745139338950096, + "grad_norm": 0.06509581953287125, + "learning_rate": 9.030559431117513e-05, + "loss": 0.2343, + "step": 29311 + }, + { + "epoch": 2.3745949449125083, + "grad_norm": 0.06403180956840515, + "learning_rate": 9.03010936585805e-05, + "loss": 0.2505, + "step": 29312 + }, + { + "epoch": 2.3746759559300066, + "grad_norm": 0.06330661475658417, + 
"learning_rate": 9.029659300598587e-05, + "loss": 0.2172, + "step": 29313 + }, + { + "epoch": 2.374756966947505, + "grad_norm": 0.060208771377801895, + "learning_rate": 9.029209235339125e-05, + "loss": 0.2502, + "step": 29314 + }, + { + "epoch": 2.374837977965003, + "grad_norm": 0.07508249580860138, + "learning_rate": 9.028759170079662e-05, + "loss": 0.2881, + "step": 29315 + }, + { + "epoch": 2.3749189889825018, + "grad_norm": 0.0637403130531311, + "learning_rate": 9.0283091048202e-05, + "loss": 0.2305, + "step": 29316 + }, + { + "epoch": 2.375, + "grad_norm": 0.06241225078701973, + "learning_rate": 9.027859039560737e-05, + "loss": 0.2242, + "step": 29317 + }, + { + "epoch": 2.3750810110174982, + "grad_norm": 0.05353011190891266, + "learning_rate": 9.027408974301274e-05, + "loss": 0.2316, + "step": 29318 + }, + { + "epoch": 2.375162022034997, + "grad_norm": 0.0804276168346405, + "learning_rate": 9.026958909041811e-05, + "loss": 0.2389, + "step": 29319 + }, + { + "epoch": 2.375243033052495, + "grad_norm": 0.07396351546049118, + "learning_rate": 9.026508843782349e-05, + "loss": 0.2833, + "step": 29320 + }, + { + "epoch": 2.3753240440699934, + "grad_norm": 0.07646553963422775, + "learning_rate": 9.026058778522886e-05, + "loss": 0.2643, + "step": 29321 + }, + { + "epoch": 2.3754050550874917, + "grad_norm": 0.06344413757324219, + "learning_rate": 9.025608713263424e-05, + "loss": 0.2716, + "step": 29322 + }, + { + "epoch": 2.3754860661049904, + "grad_norm": 0.07764876633882523, + "learning_rate": 9.025158648003961e-05, + "loss": 0.2762, + "step": 29323 + }, + { + "epoch": 2.3755670771224886, + "grad_norm": 0.07426711171865463, + "learning_rate": 9.024708582744498e-05, + "loss": 0.2448, + "step": 29324 + }, + { + "epoch": 2.375648088139987, + "grad_norm": 0.06773090362548828, + "learning_rate": 9.024258517485036e-05, + "loss": 0.2829, + "step": 29325 + }, + { + "epoch": 2.3757290991574855, + "grad_norm": 0.06279227882623672, + "learning_rate": 9.023808452225573e-05, + "loss": 0.2658, + "step": 29326 + }, + { + "epoch": 2.375810110174984, + "grad_norm": 0.06827504187822342, + "learning_rate": 9.02335838696611e-05, + "loss": 0.2172, + "step": 29327 + }, + { + "epoch": 2.375891121192482, + "grad_norm": 0.06267635524272919, + "learning_rate": 9.022908321706648e-05, + "loss": 0.2675, + "step": 29328 + }, + { + "epoch": 2.3759721322099807, + "grad_norm": 0.05714039504528046, + "learning_rate": 9.022458256447185e-05, + "loss": 0.2372, + "step": 29329 + }, + { + "epoch": 2.376053143227479, + "grad_norm": 0.07521743327379227, + "learning_rate": 9.022008191187722e-05, + "loss": 0.2625, + "step": 29330 + }, + { + "epoch": 2.376134154244977, + "grad_norm": 0.07218578457832336, + "learning_rate": 9.02155812592826e-05, + "loss": 0.2601, + "step": 29331 + }, + { + "epoch": 2.376215165262476, + "grad_norm": 0.061242103576660156, + "learning_rate": 9.021108060668797e-05, + "loss": 0.2439, + "step": 29332 + }, + { + "epoch": 2.376296176279974, + "grad_norm": 0.05862688645720482, + "learning_rate": 9.020657995409335e-05, + "loss": 0.2327, + "step": 29333 + }, + { + "epoch": 2.3763771872974724, + "grad_norm": 0.065977081656456, + "learning_rate": 9.020207930149872e-05, + "loss": 0.2434, + "step": 29334 + }, + { + "epoch": 2.376458198314971, + "grad_norm": 0.0634816437959671, + "learning_rate": 9.019757864890409e-05, + "loss": 0.2493, + "step": 29335 + }, + { + "epoch": 2.3765392093324693, + "grad_norm": 0.06377308815717697, + "learning_rate": 9.019307799630947e-05, + "loss": 0.2441, + "step": 29336 + }, + { + 
"epoch": 2.3766202203499676, + "grad_norm": 0.06866897642612457, + "learning_rate": 9.018857734371484e-05, + "loss": 0.2655, + "step": 29337 + }, + { + "epoch": 2.376701231367466, + "grad_norm": 0.06453913450241089, + "learning_rate": 9.018407669112021e-05, + "loss": 0.2355, + "step": 29338 + }, + { + "epoch": 2.3767822423849645, + "grad_norm": 0.07311535626649857, + "learning_rate": 9.017957603852559e-05, + "loss": 0.2829, + "step": 29339 + }, + { + "epoch": 2.3768632534024627, + "grad_norm": 0.08807942271232605, + "learning_rate": 9.017507538593096e-05, + "loss": 0.2346, + "step": 29340 + }, + { + "epoch": 2.376944264419961, + "grad_norm": 0.056518666446208954, + "learning_rate": 9.017057473333633e-05, + "loss": 0.2741, + "step": 29341 + }, + { + "epoch": 2.3770252754374597, + "grad_norm": 0.07224880158901215, + "learning_rate": 9.016607408074171e-05, + "loss": 0.2188, + "step": 29342 + }, + { + "epoch": 2.377106286454958, + "grad_norm": 0.0775294154882431, + "learning_rate": 9.016157342814708e-05, + "loss": 0.2412, + "step": 29343 + }, + { + "epoch": 2.377187297472456, + "grad_norm": 0.06410830467939377, + "learning_rate": 9.015707277555247e-05, + "loss": 0.2718, + "step": 29344 + }, + { + "epoch": 2.3772683084899544, + "grad_norm": 0.0727565661072731, + "learning_rate": 9.015257212295783e-05, + "loss": 0.2492, + "step": 29345 + }, + { + "epoch": 2.377349319507453, + "grad_norm": 0.05172773078083992, + "learning_rate": 9.01480714703632e-05, + "loss": 0.1947, + "step": 29346 + }, + { + "epoch": 2.3774303305249513, + "grad_norm": 0.05713996663689613, + "learning_rate": 9.014357081776859e-05, + "loss": 0.2295, + "step": 29347 + }, + { + "epoch": 2.3775113415424496, + "grad_norm": 0.07344935834407806, + "learning_rate": 9.013907016517395e-05, + "loss": 0.2317, + "step": 29348 + }, + { + "epoch": 2.3775923525599483, + "grad_norm": 0.050090860575437546, + "learning_rate": 9.013456951257932e-05, + "loss": 0.2414, + "step": 29349 + }, + { + "epoch": 2.3776733635774465, + "grad_norm": 0.05050138384103775, + "learning_rate": 9.013006885998471e-05, + "loss": 0.2231, + "step": 29350 + }, + { + "epoch": 2.3777543745949448, + "grad_norm": 0.06192393973469734, + "learning_rate": 9.012556820739007e-05, + "loss": 0.2396, + "step": 29351 + }, + { + "epoch": 2.3778353856124435, + "grad_norm": 0.07744492590427399, + "learning_rate": 9.012106755479544e-05, + "loss": 0.2311, + "step": 29352 + }, + { + "epoch": 2.3779163966299417, + "grad_norm": 0.06321647018194199, + "learning_rate": 9.011656690220083e-05, + "loss": 0.2532, + "step": 29353 + }, + { + "epoch": 2.37799740764744, + "grad_norm": 0.06907698512077332, + "learning_rate": 9.011206624960619e-05, + "loss": 0.2545, + "step": 29354 + }, + { + "epoch": 2.3780784186649386, + "grad_norm": 0.06946361809968948, + "learning_rate": 9.010756559701156e-05, + "loss": 0.2538, + "step": 29355 + }, + { + "epoch": 2.378159429682437, + "grad_norm": 0.0634828582406044, + "learning_rate": 9.010306494441695e-05, + "loss": 0.2617, + "step": 29356 + }, + { + "epoch": 2.378240440699935, + "grad_norm": 0.06888262182474136, + "learning_rate": 9.009856429182231e-05, + "loss": 0.2616, + "step": 29357 + }, + { + "epoch": 2.378321451717434, + "grad_norm": 0.07091489434242249, + "learning_rate": 9.009406363922769e-05, + "loss": 0.2833, + "step": 29358 + }, + { + "epoch": 2.378402462734932, + "grad_norm": 0.06463092565536499, + "learning_rate": 9.008956298663307e-05, + "loss": 0.262, + "step": 29359 + }, + { + "epoch": 2.3784834737524303, + "grad_norm": 0.06307054311037064, + 
"learning_rate": 9.008506233403845e-05, + "loss": 0.2975, + "step": 29360 + }, + { + "epoch": 2.3785644847699285, + "grad_norm": 0.07395637035369873, + "learning_rate": 9.00805616814438e-05, + "loss": 0.2408, + "step": 29361 + }, + { + "epoch": 2.3786454957874272, + "grad_norm": 0.0689404159784317, + "learning_rate": 9.007606102884919e-05, + "loss": 0.265, + "step": 29362 + }, + { + "epoch": 2.3787265068049255, + "grad_norm": 0.060062237083911896, + "learning_rate": 9.007156037625457e-05, + "loss": 0.2371, + "step": 29363 + }, + { + "epoch": 2.3788075178224237, + "grad_norm": 0.06880932301282883, + "learning_rate": 9.006705972365993e-05, + "loss": 0.2392, + "step": 29364 + }, + { + "epoch": 2.3788885288399224, + "grad_norm": 0.05119600147008896, + "learning_rate": 9.006255907106531e-05, + "loss": 0.2123, + "step": 29365 + }, + { + "epoch": 2.3789695398574207, + "grad_norm": 0.053568657487630844, + "learning_rate": 9.005805841847069e-05, + "loss": 0.258, + "step": 29366 + }, + { + "epoch": 2.379050550874919, + "grad_norm": 0.05414640158414841, + "learning_rate": 9.005355776587605e-05, + "loss": 0.1898, + "step": 29367 + }, + { + "epoch": 2.379131561892417, + "grad_norm": 0.057076968252658844, + "learning_rate": 9.004905711328143e-05, + "loss": 0.2524, + "step": 29368 + }, + { + "epoch": 2.379212572909916, + "grad_norm": 0.07572975754737854, + "learning_rate": 9.004455646068681e-05, + "loss": 0.2415, + "step": 29369 + }, + { + "epoch": 2.379293583927414, + "grad_norm": 0.05044267699122429, + "learning_rate": 9.004005580809218e-05, + "loss": 0.2259, + "step": 29370 + }, + { + "epoch": 2.3793745949449123, + "grad_norm": 0.0783015713095665, + "learning_rate": 9.003555515549756e-05, + "loss": 0.2868, + "step": 29371 + }, + { + "epoch": 2.379455605962411, + "grad_norm": 0.06224671006202698, + "learning_rate": 9.003105450290293e-05, + "loss": 0.2468, + "step": 29372 + }, + { + "epoch": 2.3795366169799093, + "grad_norm": 0.06955622136592865, + "learning_rate": 9.00265538503083e-05, + "loss": 0.2552, + "step": 29373 + }, + { + "epoch": 2.3796176279974075, + "grad_norm": 0.05678120627999306, + "learning_rate": 9.002205319771368e-05, + "loss": 0.1996, + "step": 29374 + }, + { + "epoch": 2.379698639014906, + "grad_norm": 0.07291990518569946, + "learning_rate": 9.001755254511905e-05, + "loss": 0.241, + "step": 29375 + }, + { + "epoch": 2.3797796500324044, + "grad_norm": 0.06487166881561279, + "learning_rate": 9.001305189252442e-05, + "loss": 0.184, + "step": 29376 + }, + { + "epoch": 2.3798606610499027, + "grad_norm": 0.06429188698530197, + "learning_rate": 9.00085512399298e-05, + "loss": 0.2441, + "step": 29377 + }, + { + "epoch": 2.3799416720674014, + "grad_norm": 0.06392151862382889, + "learning_rate": 9.000405058733517e-05, + "loss": 0.2663, + "step": 29378 + }, + { + "epoch": 2.3800226830848996, + "grad_norm": 0.07104162126779556, + "learning_rate": 8.999954993474054e-05, + "loss": 0.2556, + "step": 29379 + }, + { + "epoch": 2.380103694102398, + "grad_norm": 0.07497918605804443, + "learning_rate": 8.999504928214592e-05, + "loss": 0.2527, + "step": 29380 + }, + { + "epoch": 2.3801847051198965, + "grad_norm": 0.06797675788402557, + "learning_rate": 8.999054862955129e-05, + "loss": 0.2693, + "step": 29381 + }, + { + "epoch": 2.380265716137395, + "grad_norm": 0.06973263621330261, + "learning_rate": 8.998604797695667e-05, + "loss": 0.235, + "step": 29382 + }, + { + "epoch": 2.380346727154893, + "grad_norm": 0.06908416748046875, + "learning_rate": 8.998154732436204e-05, + "loss": 0.2702, + "step": 29383 
+ }, + { + "epoch": 2.3804277381723913, + "grad_norm": 0.06926834583282471, + "learning_rate": 8.997704667176741e-05, + "loss": 0.2228, + "step": 29384 + }, + { + "epoch": 2.38050874918989, + "grad_norm": 0.06293617933988571, + "learning_rate": 8.997254601917279e-05, + "loss": 0.2318, + "step": 29385 + }, + { + "epoch": 2.380589760207388, + "grad_norm": 0.07725179195404053, + "learning_rate": 8.996804536657816e-05, + "loss": 0.2875, + "step": 29386 + }, + { + "epoch": 2.3806707712248865, + "grad_norm": 0.08843903988599777, + "learning_rate": 8.996354471398353e-05, + "loss": 0.2792, + "step": 29387 + }, + { + "epoch": 2.380751782242385, + "grad_norm": 0.05114304646849632, + "learning_rate": 8.995904406138891e-05, + "loss": 0.2401, + "step": 29388 + }, + { + "epoch": 2.3808327932598834, + "grad_norm": 0.06192674860358238, + "learning_rate": 8.995454340879428e-05, + "loss": 0.2321, + "step": 29389 + }, + { + "epoch": 2.3809138042773816, + "grad_norm": 0.07055914402008057, + "learning_rate": 8.995004275619965e-05, + "loss": 0.244, + "step": 29390 + }, + { + "epoch": 2.38099481529488, + "grad_norm": 0.06864523887634277, + "learning_rate": 8.994554210360503e-05, + "loss": 0.2536, + "step": 29391 + }, + { + "epoch": 2.3810758263123786, + "grad_norm": 0.06931130588054657, + "learning_rate": 8.99410414510104e-05, + "loss": 0.2185, + "step": 29392 + }, + { + "epoch": 2.381156837329877, + "grad_norm": 0.06413775682449341, + "learning_rate": 8.993654079841577e-05, + "loss": 0.26, + "step": 29393 + }, + { + "epoch": 2.381237848347375, + "grad_norm": 0.06281787902116776, + "learning_rate": 8.993204014582115e-05, + "loss": 0.2374, + "step": 29394 + }, + { + "epoch": 2.3813188593648738, + "grad_norm": 0.06532049179077148, + "learning_rate": 8.992753949322652e-05, + "loss": 0.2814, + "step": 29395 + }, + { + "epoch": 2.381399870382372, + "grad_norm": 0.04853471741080284, + "learning_rate": 8.99230388406319e-05, + "loss": 0.2335, + "step": 29396 + }, + { + "epoch": 2.3814808813998702, + "grad_norm": 0.04921332374215126, + "learning_rate": 8.991853818803727e-05, + "loss": 0.2296, + "step": 29397 + }, + { + "epoch": 2.381561892417369, + "grad_norm": 0.07588033378124237, + "learning_rate": 8.991403753544264e-05, + "loss": 0.2316, + "step": 29398 + }, + { + "epoch": 2.381642903434867, + "grad_norm": 0.06211639940738678, + "learning_rate": 8.990953688284802e-05, + "loss": 0.2186, + "step": 29399 + }, + { + "epoch": 2.3817239144523654, + "grad_norm": 0.05988191440701485, + "learning_rate": 8.990503623025339e-05, + "loss": 0.2049, + "step": 29400 + }, + { + "epoch": 2.381804925469864, + "grad_norm": 0.06821976602077484, + "learning_rate": 8.990053557765876e-05, + "loss": 0.2521, + "step": 29401 + }, + { + "epoch": 2.3818859364873624, + "grad_norm": 0.06516029685735703, + "learning_rate": 8.989603492506414e-05, + "loss": 0.2673, + "step": 29402 + }, + { + "epoch": 2.3819669475048606, + "grad_norm": 0.09090472757816315, + "learning_rate": 8.989153427246951e-05, + "loss": 0.2308, + "step": 29403 + }, + { + "epoch": 2.3820479585223593, + "grad_norm": 0.0731615200638771, + "learning_rate": 8.988703361987488e-05, + "loss": 0.2317, + "step": 29404 + }, + { + "epoch": 2.3821289695398575, + "grad_norm": 0.06726046651601791, + "learning_rate": 8.988253296728026e-05, + "loss": 0.2476, + "step": 29405 + }, + { + "epoch": 2.3822099805573558, + "grad_norm": 0.07715233415365219, + "learning_rate": 8.987803231468563e-05, + "loss": 0.2815, + "step": 29406 + }, + { + "epoch": 2.382290991574854, + "grad_norm": 0.07051915675401688, + 
"learning_rate": 8.9873531662091e-05, + "loss": 0.247, + "step": 29407 + }, + { + "epoch": 2.3823720025923527, + "grad_norm": 0.06349935382604599, + "learning_rate": 8.986903100949638e-05, + "loss": 0.2381, + "step": 29408 + }, + { + "epoch": 2.382453013609851, + "grad_norm": 0.07029050588607788, + "learning_rate": 8.986453035690175e-05, + "loss": 0.2344, + "step": 29409 + }, + { + "epoch": 2.382534024627349, + "grad_norm": 0.06303397566080093, + "learning_rate": 8.986002970430713e-05, + "loss": 0.2225, + "step": 29410 + }, + { + "epoch": 2.382615035644848, + "grad_norm": 0.07148943841457367, + "learning_rate": 8.98555290517125e-05, + "loss": 0.2517, + "step": 29411 + }, + { + "epoch": 2.382696046662346, + "grad_norm": 0.0819324180483818, + "learning_rate": 8.985102839911787e-05, + "loss": 0.2755, + "step": 29412 + }, + { + "epoch": 2.3827770576798444, + "grad_norm": 0.06606269627809525, + "learning_rate": 8.984652774652325e-05, + "loss": 0.2423, + "step": 29413 + }, + { + "epoch": 2.3828580686973426, + "grad_norm": 0.06293286383152008, + "learning_rate": 8.984202709392862e-05, + "loss": 0.2624, + "step": 29414 + }, + { + "epoch": 2.3829390797148413, + "grad_norm": 0.06545112282037735, + "learning_rate": 8.9837526441334e-05, + "loss": 0.2283, + "step": 29415 + }, + { + "epoch": 2.3830200907323396, + "grad_norm": 0.06773924827575684, + "learning_rate": 8.983302578873937e-05, + "loss": 0.2265, + "step": 29416 + }, + { + "epoch": 2.383101101749838, + "grad_norm": 0.07627654820680618, + "learning_rate": 8.982852513614474e-05, + "loss": 0.2721, + "step": 29417 + }, + { + "epoch": 2.3831821127673365, + "grad_norm": 0.0715307965874672, + "learning_rate": 8.982402448355011e-05, + "loss": 0.2365, + "step": 29418 + }, + { + "epoch": 2.3832631237848347, + "grad_norm": 0.06103930249810219, + "learning_rate": 8.981952383095549e-05, + "loss": 0.231, + "step": 29419 + }, + { + "epoch": 2.383344134802333, + "grad_norm": 0.06282036751508713, + "learning_rate": 8.981502317836086e-05, + "loss": 0.2191, + "step": 29420 + }, + { + "epoch": 2.3834251458198317, + "grad_norm": 0.07668137550354004, + "learning_rate": 8.981052252576624e-05, + "loss": 0.2885, + "step": 29421 + }, + { + "epoch": 2.38350615683733, + "grad_norm": 0.06577254086732864, + "learning_rate": 8.980602187317162e-05, + "loss": 0.2641, + "step": 29422 + }, + { + "epoch": 2.383587167854828, + "grad_norm": 0.06056416779756546, + "learning_rate": 8.980152122057698e-05, + "loss": 0.2767, + "step": 29423 + }, + { + "epoch": 2.383668178872327, + "grad_norm": 0.06798427551984787, + "learning_rate": 8.979702056798236e-05, + "loss": 0.2606, + "step": 29424 + }, + { + "epoch": 2.383749189889825, + "grad_norm": 0.06081007421016693, + "learning_rate": 8.979251991538774e-05, + "loss": 0.2633, + "step": 29425 + }, + { + "epoch": 2.3838302009073233, + "grad_norm": 0.06475000828504562, + "learning_rate": 8.97880192627931e-05, + "loss": 0.2315, + "step": 29426 + }, + { + "epoch": 2.3839112119248216, + "grad_norm": 0.06365825980901718, + "learning_rate": 8.978351861019848e-05, + "loss": 0.2564, + "step": 29427 + }, + { + "epoch": 2.3839922229423203, + "grad_norm": 0.06745237857103348, + "learning_rate": 8.977901795760386e-05, + "loss": 0.2596, + "step": 29428 + }, + { + "epoch": 2.3840732339598185, + "grad_norm": 0.06658170372247696, + "learning_rate": 8.977451730500924e-05, + "loss": 0.2437, + "step": 29429 + }, + { + "epoch": 2.3841542449773168, + "grad_norm": 0.06342948228120804, + "learning_rate": 8.97700166524146e-05, + "loss": 0.2361, + "step": 29430 + }, + 
{ + "epoch": 2.3842352559948155, + "grad_norm": 0.07605592161417007, + "learning_rate": 8.976551599981999e-05, + "loss": 0.2685, + "step": 29431 + }, + { + "epoch": 2.3843162670123137, + "grad_norm": 0.06304483115673065, + "learning_rate": 8.976101534722536e-05, + "loss": 0.2481, + "step": 29432 + }, + { + "epoch": 2.384397278029812, + "grad_norm": 0.06299123167991638, + "learning_rate": 8.975651469463072e-05, + "loss": 0.2569, + "step": 29433 + }, + { + "epoch": 2.3844782890473106, + "grad_norm": 0.07267063856124878, + "learning_rate": 8.97520140420361e-05, + "loss": 0.2488, + "step": 29434 + }, + { + "epoch": 2.384559300064809, + "grad_norm": 0.0659085214138031, + "learning_rate": 8.974751338944148e-05, + "loss": 0.2288, + "step": 29435 + }, + { + "epoch": 2.384640311082307, + "grad_norm": 0.07585114985704422, + "learning_rate": 8.974301273684684e-05, + "loss": 0.2565, + "step": 29436 + }, + { + "epoch": 2.3847213220998054, + "grad_norm": 0.08091212064027786, + "learning_rate": 8.973851208425223e-05, + "loss": 0.2633, + "step": 29437 + }, + { + "epoch": 2.384802333117304, + "grad_norm": 0.06598428636789322, + "learning_rate": 8.97340114316576e-05, + "loss": 0.2664, + "step": 29438 + }, + { + "epoch": 2.3848833441348023, + "grad_norm": 0.06010466068983078, + "learning_rate": 8.972951077906296e-05, + "loss": 0.2495, + "step": 29439 + }, + { + "epoch": 2.3849643551523005, + "grad_norm": 0.05948279798030853, + "learning_rate": 8.972501012646835e-05, + "loss": 0.215, + "step": 29440 + }, + { + "epoch": 2.3850453661697992, + "grad_norm": 0.07091670483350754, + "learning_rate": 8.972050947387372e-05, + "loss": 0.2684, + "step": 29441 + }, + { + "epoch": 2.3851263771872975, + "grad_norm": 0.07026311755180359, + "learning_rate": 8.971600882127908e-05, + "loss": 0.2313, + "step": 29442 + }, + { + "epoch": 2.3852073882047957, + "grad_norm": 0.06763871759176254, + "learning_rate": 8.971150816868447e-05, + "loss": 0.2378, + "step": 29443 + }, + { + "epoch": 2.3852883992222944, + "grad_norm": 0.06841225922107697, + "learning_rate": 8.970700751608984e-05, + "loss": 0.2332, + "step": 29444 + }, + { + "epoch": 2.3853694102397927, + "grad_norm": 0.06431113183498383, + "learning_rate": 8.97025068634952e-05, + "loss": 0.2752, + "step": 29445 + }, + { + "epoch": 2.385450421257291, + "grad_norm": 0.06591040641069412, + "learning_rate": 8.969800621090059e-05, + "loss": 0.2531, + "step": 29446 + }, + { + "epoch": 2.3855314322747896, + "grad_norm": 0.062982939183712, + "learning_rate": 8.969350555830596e-05, + "loss": 0.1959, + "step": 29447 + }, + { + "epoch": 2.385612443292288, + "grad_norm": 0.05950835719704628, + "learning_rate": 8.968900490571134e-05, + "loss": 0.2678, + "step": 29448 + }, + { + "epoch": 2.385693454309786, + "grad_norm": 0.05962624028325081, + "learning_rate": 8.968450425311671e-05, + "loss": 0.2357, + "step": 29449 + }, + { + "epoch": 2.3857744653272843, + "grad_norm": 0.05592918023467064, + "learning_rate": 8.968000360052208e-05, + "loss": 0.2017, + "step": 29450 + }, + { + "epoch": 2.385855476344783, + "grad_norm": 0.06242687255144119, + "learning_rate": 8.967550294792746e-05, + "loss": 0.2649, + "step": 29451 + }, + { + "epoch": 2.3859364873622813, + "grad_norm": 0.05544115602970123, + "learning_rate": 8.967100229533283e-05, + "loss": 0.2417, + "step": 29452 + }, + { + "epoch": 2.3860174983797795, + "grad_norm": 0.08726091682910919, + "learning_rate": 8.96665016427382e-05, + "loss": 0.2654, + "step": 29453 + }, + { + "epoch": 2.386098509397278, + "grad_norm": 0.06033741310238838, + 
"learning_rate": 8.966200099014358e-05, + "loss": 0.2248, + "step": 29454 + }, + { + "epoch": 2.3861795204147764, + "grad_norm": 0.0769038274884224, + "learning_rate": 8.965750033754895e-05, + "loss": 0.2655, + "step": 29455 + }, + { + "epoch": 2.3862605314322747, + "grad_norm": 0.06940066814422607, + "learning_rate": 8.965299968495433e-05, + "loss": 0.2582, + "step": 29456 + }, + { + "epoch": 2.386341542449773, + "grad_norm": 0.07113378494977951, + "learning_rate": 8.96484990323597e-05, + "loss": 0.2424, + "step": 29457 + }, + { + "epoch": 2.3864225534672716, + "grad_norm": 0.07034889608621597, + "learning_rate": 8.964399837976507e-05, + "loss": 0.2595, + "step": 29458 + }, + { + "epoch": 2.38650356448477, + "grad_norm": 0.0719676986336708, + "learning_rate": 8.963949772717045e-05, + "loss": 0.2594, + "step": 29459 + }, + { + "epoch": 2.386584575502268, + "grad_norm": 0.06455852091312408, + "learning_rate": 8.963499707457582e-05, + "loss": 0.22, + "step": 29460 + }, + { + "epoch": 2.386665586519767, + "grad_norm": 0.06308633089065552, + "learning_rate": 8.96304964219812e-05, + "loss": 0.2601, + "step": 29461 + }, + { + "epoch": 2.386746597537265, + "grad_norm": 0.05884365737438202, + "learning_rate": 8.962599576938657e-05, + "loss": 0.2338, + "step": 29462 + }, + { + "epoch": 2.3868276085547633, + "grad_norm": 0.07012617588043213, + "learning_rate": 8.962149511679194e-05, + "loss": 0.2558, + "step": 29463 + }, + { + "epoch": 2.386908619572262, + "grad_norm": 0.0692358985543251, + "learning_rate": 8.961699446419731e-05, + "loss": 0.239, + "step": 29464 + }, + { + "epoch": 2.38698963058976, + "grad_norm": 0.06754320114850998, + "learning_rate": 8.961249381160269e-05, + "loss": 0.2061, + "step": 29465 + }, + { + "epoch": 2.3870706416072585, + "grad_norm": 0.060342174023389816, + "learning_rate": 8.960799315900806e-05, + "loss": 0.2169, + "step": 29466 + }, + { + "epoch": 2.387151652624757, + "grad_norm": 0.06610507518053055, + "learning_rate": 8.960349250641344e-05, + "loss": 0.2707, + "step": 29467 + }, + { + "epoch": 2.3872326636422554, + "grad_norm": 0.07763776183128357, + "learning_rate": 8.959899185381881e-05, + "loss": 0.2592, + "step": 29468 + }, + { + "epoch": 2.3873136746597536, + "grad_norm": 0.05693863704800606, + "learning_rate": 8.959449120122418e-05, + "loss": 0.2363, + "step": 29469 + }, + { + "epoch": 2.3873946856772523, + "grad_norm": 0.06809014081954956, + "learning_rate": 8.958999054862956e-05, + "loss": 0.2543, + "step": 29470 + }, + { + "epoch": 2.3874756966947506, + "grad_norm": 0.06256530433893204, + "learning_rate": 8.958548989603493e-05, + "loss": 0.2253, + "step": 29471 + }, + { + "epoch": 2.387556707712249, + "grad_norm": 0.07401026040315628, + "learning_rate": 8.95809892434403e-05, + "loss": 0.2619, + "step": 29472 + }, + { + "epoch": 2.387637718729747, + "grad_norm": 0.06931627541780472, + "learning_rate": 8.957648859084568e-05, + "loss": 0.2977, + "step": 29473 + }, + { + "epoch": 2.3877187297472457, + "grad_norm": 0.0846065878868103, + "learning_rate": 8.957198793825105e-05, + "loss": 0.3067, + "step": 29474 + }, + { + "epoch": 2.387799740764744, + "grad_norm": 0.0711844339966774, + "learning_rate": 8.956748728565642e-05, + "loss": 0.2294, + "step": 29475 + }, + { + "epoch": 2.3878807517822422, + "grad_norm": 0.06755898892879486, + "learning_rate": 8.95629866330618e-05, + "loss": 0.2515, + "step": 29476 + }, + { + "epoch": 2.387961762799741, + "grad_norm": 0.07480302453041077, + "learning_rate": 8.955848598046717e-05, + "loss": 0.2045, + "step": 29477 + }, + { 
+ "epoch": 2.388042773817239, + "grad_norm": 0.0641959086060524, + "learning_rate": 8.955398532787254e-05, + "loss": 0.236, + "step": 29478 + }, + { + "epoch": 2.3881237848347374, + "grad_norm": 0.06681734323501587, + "learning_rate": 8.954948467527792e-05, + "loss": 0.2513, + "step": 29479 + }, + { + "epoch": 2.3882047958522357, + "grad_norm": 0.0504288449883461, + "learning_rate": 8.954498402268329e-05, + "loss": 0.2491, + "step": 29480 + }, + { + "epoch": 2.3882858068697344, + "grad_norm": 0.05784103274345398, + "learning_rate": 8.954048337008867e-05, + "loss": 0.2398, + "step": 29481 + }, + { + "epoch": 2.3883668178872326, + "grad_norm": 0.07770927250385284, + "learning_rate": 8.953598271749404e-05, + "loss": 0.2731, + "step": 29482 + }, + { + "epoch": 2.388447828904731, + "grad_norm": 0.06906847655773163, + "learning_rate": 8.953148206489941e-05, + "loss": 0.2525, + "step": 29483 + }, + { + "epoch": 2.3885288399222295, + "grad_norm": 0.07368722558021545, + "learning_rate": 8.952698141230479e-05, + "loss": 0.2416, + "step": 29484 + }, + { + "epoch": 2.3886098509397278, + "grad_norm": 0.04947913438081741, + "learning_rate": 8.952248075971016e-05, + "loss": 0.2461, + "step": 29485 + }, + { + "epoch": 2.388690861957226, + "grad_norm": 0.06703958660364151, + "learning_rate": 8.951798010711553e-05, + "loss": 0.2476, + "step": 29486 + }, + { + "epoch": 2.3887718729747247, + "grad_norm": 0.07003334909677505, + "learning_rate": 8.951347945452091e-05, + "loss": 0.2733, + "step": 29487 + }, + { + "epoch": 2.388852883992223, + "grad_norm": 0.07241364568471909, + "learning_rate": 8.950897880192628e-05, + "loss": 0.2788, + "step": 29488 + }, + { + "epoch": 2.388933895009721, + "grad_norm": 0.07211482524871826, + "learning_rate": 8.950447814933165e-05, + "loss": 0.2698, + "step": 29489 + }, + { + "epoch": 2.38901490602722, + "grad_norm": 0.0676577165722847, + "learning_rate": 8.949997749673703e-05, + "loss": 0.278, + "step": 29490 + }, + { + "epoch": 2.389095917044718, + "grad_norm": 0.07245694100856781, + "learning_rate": 8.94954768441424e-05, + "loss": 0.263, + "step": 29491 + }, + { + "epoch": 2.3891769280622164, + "grad_norm": 0.06439948081970215, + "learning_rate": 8.949097619154778e-05, + "loss": 0.2363, + "step": 29492 + }, + { + "epoch": 2.389257939079715, + "grad_norm": 0.06424013525247574, + "learning_rate": 8.948647553895315e-05, + "loss": 0.2454, + "step": 29493 + }, + { + "epoch": 2.3893389500972133, + "grad_norm": 0.08479436486959457, + "learning_rate": 8.948197488635852e-05, + "loss": 0.257, + "step": 29494 + }, + { + "epoch": 2.3894199611147116, + "grad_norm": 0.06991950422525406, + "learning_rate": 8.947747423376391e-05, + "loss": 0.2861, + "step": 29495 + }, + { + "epoch": 2.38950097213221, + "grad_norm": 0.05634448677301407, + "learning_rate": 8.947297358116927e-05, + "loss": 0.2191, + "step": 29496 + }, + { + "epoch": 2.3895819831497085, + "grad_norm": 0.06699156761169434, + "learning_rate": 8.946847292857464e-05, + "loss": 0.2619, + "step": 29497 + }, + { + "epoch": 2.3896629941672067, + "grad_norm": 0.05900770053267479, + "learning_rate": 8.946397227598003e-05, + "loss": 0.2327, + "step": 29498 + }, + { + "epoch": 2.389744005184705, + "grad_norm": 0.05870789662003517, + "learning_rate": 8.945947162338539e-05, + "loss": 0.2415, + "step": 29499 + }, + { + "epoch": 2.3898250162022037, + "grad_norm": 0.07382352650165558, + "learning_rate": 8.945497097079076e-05, + "loss": 0.2541, + "step": 29500 + }, + { + "epoch": 2.389906027219702, + "grad_norm": 0.06369208544492722, + 
"learning_rate": 8.945047031819615e-05, + "loss": 0.2456, + "step": 29501 + }, + { + "epoch": 2.3899870382372, + "grad_norm": 0.07548237591981888, + "learning_rate": 8.944596966560151e-05, + "loss": 0.2469, + "step": 29502 + }, + { + "epoch": 2.3900680492546984, + "grad_norm": 0.06972216069698334, + "learning_rate": 8.94414690130069e-05, + "loss": 0.2522, + "step": 29503 + }, + { + "epoch": 2.390149060272197, + "grad_norm": 0.05822772905230522, + "learning_rate": 8.943696836041227e-05, + "loss": 0.2461, + "step": 29504 + }, + { + "epoch": 2.3902300712896953, + "grad_norm": 0.07330431044101715, + "learning_rate": 8.943246770781763e-05, + "loss": 0.2647, + "step": 29505 + }, + { + "epoch": 2.3903110823071936, + "grad_norm": 0.07856698334217072, + "learning_rate": 8.942796705522302e-05, + "loss": 0.235, + "step": 29506 + }, + { + "epoch": 2.3903920933246923, + "grad_norm": 0.07406508922576904, + "learning_rate": 8.942346640262839e-05, + "loss": 0.2767, + "step": 29507 + }, + { + "epoch": 2.3904731043421905, + "grad_norm": 0.06292467564344406, + "learning_rate": 8.941896575003375e-05, + "loss": 0.2303, + "step": 29508 + }, + { + "epoch": 2.3905541153596888, + "grad_norm": 0.07412441074848175, + "learning_rate": 8.941446509743914e-05, + "loss": 0.263, + "step": 29509 + }, + { + "epoch": 2.3906351263771874, + "grad_norm": 0.06904534995555878, + "learning_rate": 8.940996444484451e-05, + "loss": 0.267, + "step": 29510 + }, + { + "epoch": 2.3907161373946857, + "grad_norm": 0.07869131863117218, + "learning_rate": 8.940546379224987e-05, + "loss": 0.2491, + "step": 29511 + }, + { + "epoch": 2.390797148412184, + "grad_norm": 0.06752260774374008, + "learning_rate": 8.940096313965526e-05, + "loss": 0.2257, + "step": 29512 + }, + { + "epoch": 2.3908781594296826, + "grad_norm": 0.0686509758234024, + "learning_rate": 8.939646248706063e-05, + "loss": 0.2307, + "step": 29513 + }, + { + "epoch": 2.390959170447181, + "grad_norm": 0.06670700013637543, + "learning_rate": 8.9391961834466e-05, + "loss": 0.227, + "step": 29514 + }, + { + "epoch": 2.391040181464679, + "grad_norm": 0.07350020855665207, + "learning_rate": 8.938746118187138e-05, + "loss": 0.2595, + "step": 29515 + }, + { + "epoch": 2.391121192482178, + "grad_norm": 0.062225863337516785, + "learning_rate": 8.938296052927676e-05, + "loss": 0.2215, + "step": 29516 + }, + { + "epoch": 2.391202203499676, + "grad_norm": 0.05617249011993408, + "learning_rate": 8.937845987668212e-05, + "loss": 0.2478, + "step": 29517 + }, + { + "epoch": 2.3912832145171743, + "grad_norm": 0.06018560007214546, + "learning_rate": 8.93739592240875e-05, + "loss": 0.2521, + "step": 29518 + }, + { + "epoch": 2.3913642255346725, + "grad_norm": 0.06831783801317215, + "learning_rate": 8.936945857149288e-05, + "loss": 0.2441, + "step": 29519 + }, + { + "epoch": 2.3914452365521712, + "grad_norm": 0.05382615327835083, + "learning_rate": 8.936495791889824e-05, + "loss": 0.2117, + "step": 29520 + }, + { + "epoch": 2.3915262475696695, + "grad_norm": 0.05994303151965141, + "learning_rate": 8.936045726630362e-05, + "loss": 0.2316, + "step": 29521 + }, + { + "epoch": 2.3916072585871677, + "grad_norm": 0.06629019975662231, + "learning_rate": 8.9355956613709e-05, + "loss": 0.2224, + "step": 29522 + }, + { + "epoch": 2.3916882696046664, + "grad_norm": 0.06589581072330475, + "learning_rate": 8.935145596111436e-05, + "loss": 0.2921, + "step": 29523 + }, + { + "epoch": 2.3917692806221647, + "grad_norm": 0.06992766261100769, + "learning_rate": 8.934695530851974e-05, + "loss": 0.2422, + "step": 29524 + 
}, + { + "epoch": 2.391850291639663, + "grad_norm": 0.06090115010738373, + "learning_rate": 8.934245465592512e-05, + "loss": 0.2305, + "step": 29525 + }, + { + "epoch": 2.391931302657161, + "grad_norm": 0.06497127562761307, + "learning_rate": 8.933795400333048e-05, + "loss": 0.2307, + "step": 29526 + }, + { + "epoch": 2.39201231367466, + "grad_norm": 0.06542658805847168, + "learning_rate": 8.933345335073586e-05, + "loss": 0.2518, + "step": 29527 + }, + { + "epoch": 2.392093324692158, + "grad_norm": 0.07996457070112228, + "learning_rate": 8.932895269814124e-05, + "loss": 0.2767, + "step": 29528 + }, + { + "epoch": 2.3921743357096563, + "grad_norm": 0.0691191554069519, + "learning_rate": 8.932445204554661e-05, + "loss": 0.2719, + "step": 29529 + }, + { + "epoch": 2.392255346727155, + "grad_norm": 0.06836870312690735, + "learning_rate": 8.931995139295199e-05, + "loss": 0.2424, + "step": 29530 + }, + { + "epoch": 2.3923363577446533, + "grad_norm": 0.06277644634246826, + "learning_rate": 8.931545074035736e-05, + "loss": 0.3112, + "step": 29531 + }, + { + "epoch": 2.3924173687621515, + "grad_norm": 0.083866186439991, + "learning_rate": 8.931095008776273e-05, + "loss": 0.245, + "step": 29532 + }, + { + "epoch": 2.39249837977965, + "grad_norm": 0.08739671856164932, + "learning_rate": 8.93064494351681e-05, + "loss": 0.2828, + "step": 29533 + }, + { + "epoch": 2.3925793907971484, + "grad_norm": 0.07633476704359055, + "learning_rate": 8.930194878257348e-05, + "loss": 0.2661, + "step": 29534 + }, + { + "epoch": 2.3926604018146467, + "grad_norm": 0.06199926510453224, + "learning_rate": 8.929744812997885e-05, + "loss": 0.2305, + "step": 29535 + }, + { + "epoch": 2.3927414128321454, + "grad_norm": 0.05972784385085106, + "learning_rate": 8.929294747738423e-05, + "loss": 0.2733, + "step": 29536 + }, + { + "epoch": 2.3928224238496436, + "grad_norm": 0.060678012669086456, + "learning_rate": 8.92884468247896e-05, + "loss": 0.2294, + "step": 29537 + }, + { + "epoch": 2.392903434867142, + "grad_norm": 0.0721907690167427, + "learning_rate": 8.928394617219497e-05, + "loss": 0.2564, + "step": 29538 + }, + { + "epoch": 2.3929844458846405, + "grad_norm": 0.06307584047317505, + "learning_rate": 8.927944551960035e-05, + "loss": 0.2649, + "step": 29539 + }, + { + "epoch": 2.393065456902139, + "grad_norm": 0.0750347301363945, + "learning_rate": 8.927494486700572e-05, + "loss": 0.2259, + "step": 29540 + }, + { + "epoch": 2.393146467919637, + "grad_norm": 0.06097326800227165, + "learning_rate": 8.92704442144111e-05, + "loss": 0.2481, + "step": 29541 + }, + { + "epoch": 2.3932274789371353, + "grad_norm": 0.060833755880594254, + "learning_rate": 8.926594356181647e-05, + "loss": 0.206, + "step": 29542 + }, + { + "epoch": 2.393308489954634, + "grad_norm": 0.06569301337003708, + "learning_rate": 8.926144290922184e-05, + "loss": 0.2317, + "step": 29543 + }, + { + "epoch": 2.393389500972132, + "grad_norm": 0.05607564002275467, + "learning_rate": 8.925694225662722e-05, + "loss": 0.2252, + "step": 29544 + }, + { + "epoch": 2.3934705119896305, + "grad_norm": 0.06047516316175461, + "learning_rate": 8.925244160403259e-05, + "loss": 0.2683, + "step": 29545 + }, + { + "epoch": 2.393551523007129, + "grad_norm": 0.054401516914367676, + "learning_rate": 8.924794095143796e-05, + "loss": 0.2488, + "step": 29546 + }, + { + "epoch": 2.3936325340246274, + "grad_norm": 0.04817594587802887, + "learning_rate": 8.924344029884334e-05, + "loss": 0.2257, + "step": 29547 + }, + { + "epoch": 2.3937135450421256, + "grad_norm": 0.05992850661277771, + 
"learning_rate": 8.923893964624871e-05, + "loss": 0.2092, + "step": 29548 + }, + { + "epoch": 2.393794556059624, + "grad_norm": 0.07102970033884048, + "learning_rate": 8.923443899365408e-05, + "loss": 0.2858, + "step": 29549 + }, + { + "epoch": 2.3938755670771226, + "grad_norm": 0.05211573839187622, + "learning_rate": 8.922993834105946e-05, + "loss": 0.2312, + "step": 29550 + }, + { + "epoch": 2.393956578094621, + "grad_norm": 0.07465586066246033, + "learning_rate": 8.922543768846483e-05, + "loss": 0.2826, + "step": 29551 + }, + { + "epoch": 2.394037589112119, + "grad_norm": 0.07565885782241821, + "learning_rate": 8.92209370358702e-05, + "loss": 0.2216, + "step": 29552 + }, + { + "epoch": 2.3941186001296177, + "grad_norm": 0.07079378515481949, + "learning_rate": 8.921643638327558e-05, + "loss": 0.2301, + "step": 29553 + }, + { + "epoch": 2.394199611147116, + "grad_norm": 0.06497667729854584, + "learning_rate": 8.921193573068095e-05, + "loss": 0.2282, + "step": 29554 + }, + { + "epoch": 2.3942806221646142, + "grad_norm": 0.06706354767084122, + "learning_rate": 8.920743507808633e-05, + "loss": 0.2796, + "step": 29555 + }, + { + "epoch": 2.394361633182113, + "grad_norm": 0.06051041558384895, + "learning_rate": 8.92029344254917e-05, + "loss": 0.2497, + "step": 29556 + }, + { + "epoch": 2.394442644199611, + "grad_norm": 0.061869047582149506, + "learning_rate": 8.919843377289707e-05, + "loss": 0.2302, + "step": 29557 + }, + { + "epoch": 2.3945236552171094, + "grad_norm": 0.06803011149168015, + "learning_rate": 8.919393312030245e-05, + "loss": 0.248, + "step": 29558 + }, + { + "epoch": 2.394604666234608, + "grad_norm": 0.06455831974744797, + "learning_rate": 8.918943246770782e-05, + "loss": 0.2371, + "step": 29559 + }, + { + "epoch": 2.3946856772521063, + "grad_norm": 0.07764575630426407, + "learning_rate": 8.91849318151132e-05, + "loss": 0.2781, + "step": 29560 + }, + { + "epoch": 2.3947666882696046, + "grad_norm": 0.06719639152288437, + "learning_rate": 8.918043116251857e-05, + "loss": 0.2158, + "step": 29561 + }, + { + "epoch": 2.3948476992871033, + "grad_norm": 0.07174685597419739, + "learning_rate": 8.917593050992394e-05, + "loss": 0.2641, + "step": 29562 + }, + { + "epoch": 2.3949287103046015, + "grad_norm": 0.07454746961593628, + "learning_rate": 8.917142985732931e-05, + "loss": 0.2645, + "step": 29563 + }, + { + "epoch": 2.3950097213220998, + "grad_norm": 0.07032407075166702, + "learning_rate": 8.91669292047347e-05, + "loss": 0.2503, + "step": 29564 + }, + { + "epoch": 2.395090732339598, + "grad_norm": 0.07723000645637512, + "learning_rate": 8.916242855214006e-05, + "loss": 0.2716, + "step": 29565 + }, + { + "epoch": 2.3951717433570967, + "grad_norm": 0.06460478901863098, + "learning_rate": 8.915792789954544e-05, + "loss": 0.2652, + "step": 29566 + }, + { + "epoch": 2.395252754374595, + "grad_norm": 0.05906296521425247, + "learning_rate": 8.915342724695082e-05, + "loss": 0.2557, + "step": 29567 + }, + { + "epoch": 2.395333765392093, + "grad_norm": 0.05999612808227539, + "learning_rate": 8.914892659435618e-05, + "loss": 0.2319, + "step": 29568 + }, + { + "epoch": 2.395414776409592, + "grad_norm": 0.07182128727436066, + "learning_rate": 8.914442594176156e-05, + "loss": 0.2204, + "step": 29569 + }, + { + "epoch": 2.39549578742709, + "grad_norm": 0.0702458918094635, + "learning_rate": 8.913992528916694e-05, + "loss": 0.2589, + "step": 29570 + }, + { + "epoch": 2.3955767984445884, + "grad_norm": 0.05754755809903145, + "learning_rate": 8.91354246365723e-05, + "loss": 0.2285, + "step": 29571 + 
}, + { + "epoch": 2.3956578094620866, + "grad_norm": 0.07615985721349716, + "learning_rate": 8.913092398397768e-05, + "loss": 0.2494, + "step": 29572 + }, + { + "epoch": 2.3957388204795853, + "grad_norm": 0.06916381418704987, + "learning_rate": 8.912642333138306e-05, + "loss": 0.2825, + "step": 29573 + }, + { + "epoch": 2.3958198314970836, + "grad_norm": 0.08532532304525375, + "learning_rate": 8.912192267878842e-05, + "loss": 0.2174, + "step": 29574 + }, + { + "epoch": 2.395900842514582, + "grad_norm": 0.06773477047681808, + "learning_rate": 8.91174220261938e-05, + "loss": 0.2418, + "step": 29575 + }, + { + "epoch": 2.3959818535320805, + "grad_norm": 0.06349464505910873, + "learning_rate": 8.911292137359918e-05, + "loss": 0.2202, + "step": 29576 + }, + { + "epoch": 2.3960628645495787, + "grad_norm": 0.06422010809183121, + "learning_rate": 8.910842072100454e-05, + "loss": 0.2895, + "step": 29577 + }, + { + "epoch": 2.396143875567077, + "grad_norm": 0.060994140803813934, + "learning_rate": 8.910392006840992e-05, + "loss": 0.2572, + "step": 29578 + }, + { + "epoch": 2.3962248865845757, + "grad_norm": 0.07673156261444092, + "learning_rate": 8.90994194158153e-05, + "loss": 0.2491, + "step": 29579 + }, + { + "epoch": 2.396305897602074, + "grad_norm": 0.06521592289209366, + "learning_rate": 8.909491876322067e-05, + "loss": 0.2379, + "step": 29580 + }, + { + "epoch": 2.396386908619572, + "grad_norm": 0.05804380401968956, + "learning_rate": 8.909041811062605e-05, + "loss": 0.2108, + "step": 29581 + }, + { + "epoch": 2.396467919637071, + "grad_norm": 0.05636740103363991, + "learning_rate": 8.908591745803143e-05, + "loss": 0.2424, + "step": 29582 + }, + { + "epoch": 2.396548930654569, + "grad_norm": 0.08155636489391327, + "learning_rate": 8.908141680543679e-05, + "loss": 0.2704, + "step": 29583 + }, + { + "epoch": 2.3966299416720673, + "grad_norm": 0.05432563275098801, + "learning_rate": 8.907691615284217e-05, + "loss": 0.2141, + "step": 29584 + }, + { + "epoch": 2.396710952689566, + "grad_norm": 0.06512283533811569, + "learning_rate": 8.907241550024755e-05, + "loss": 0.246, + "step": 29585 + }, + { + "epoch": 2.3967919637070643, + "grad_norm": 0.06527567654848099, + "learning_rate": 8.906791484765291e-05, + "loss": 0.2362, + "step": 29586 + }, + { + "epoch": 2.3968729747245625, + "grad_norm": 0.06097620725631714, + "learning_rate": 8.90634141950583e-05, + "loss": 0.2412, + "step": 29587 + }, + { + "epoch": 2.3969539857420608, + "grad_norm": 0.06241489201784134, + "learning_rate": 8.905891354246367e-05, + "loss": 0.2231, + "step": 29588 + }, + { + "epoch": 2.3970349967595594, + "grad_norm": 0.06722144782543182, + "learning_rate": 8.905441288986903e-05, + "loss": 0.2468, + "step": 29589 + }, + { + "epoch": 2.3971160077770577, + "grad_norm": 0.08474922925233841, + "learning_rate": 8.904991223727442e-05, + "loss": 0.2148, + "step": 29590 + }, + { + "epoch": 2.397197018794556, + "grad_norm": 0.05398935079574585, + "learning_rate": 8.904541158467979e-05, + "loss": 0.2572, + "step": 29591 + }, + { + "epoch": 2.3972780298120546, + "grad_norm": 0.05842433497309685, + "learning_rate": 8.904091093208515e-05, + "loss": 0.2377, + "step": 29592 + }, + { + "epoch": 2.397359040829553, + "grad_norm": 0.06882943958044052, + "learning_rate": 8.903641027949054e-05, + "loss": 0.2476, + "step": 29593 + }, + { + "epoch": 2.397440051847051, + "grad_norm": 0.049048058688640594, + "learning_rate": 8.903190962689591e-05, + "loss": 0.1895, + "step": 29594 + }, + { + "epoch": 2.3975210628645494, + "grad_norm": 
0.06414130330085754, + "learning_rate": 8.902740897430127e-05, + "loss": 0.2652, + "step": 29595 + }, + { + "epoch": 2.397602073882048, + "grad_norm": 0.08015193790197372, + "learning_rate": 8.902290832170666e-05, + "loss": 0.2534, + "step": 29596 + }, + { + "epoch": 2.3976830848995463, + "grad_norm": 0.05423025041818619, + "learning_rate": 8.901840766911203e-05, + "loss": 0.2412, + "step": 29597 + }, + { + "epoch": 2.3977640959170445, + "grad_norm": 0.06724569946527481, + "learning_rate": 8.901390701651739e-05, + "loss": 0.2432, + "step": 29598 + }, + { + "epoch": 2.3978451069345432, + "grad_norm": 0.07872132956981659, + "learning_rate": 8.900940636392278e-05, + "loss": 0.2623, + "step": 29599 + }, + { + "epoch": 2.3979261179520415, + "grad_norm": 0.06910988688468933, + "learning_rate": 8.900490571132815e-05, + "loss": 0.2595, + "step": 29600 + }, + { + "epoch": 2.3980071289695397, + "grad_norm": 0.06886013597249985, + "learning_rate": 8.900040505873351e-05, + "loss": 0.2688, + "step": 29601 + }, + { + "epoch": 2.3980881399870384, + "grad_norm": 0.07101655006408691, + "learning_rate": 8.89959044061389e-05, + "loss": 0.2469, + "step": 29602 + }, + { + "epoch": 2.3981691510045366, + "grad_norm": 0.07381045818328857, + "learning_rate": 8.899140375354427e-05, + "loss": 0.2643, + "step": 29603 + }, + { + "epoch": 2.398250162022035, + "grad_norm": 0.053292810916900635, + "learning_rate": 8.898690310094963e-05, + "loss": 0.1849, + "step": 29604 + }, + { + "epoch": 2.3983311730395336, + "grad_norm": 0.05561814457178116, + "learning_rate": 8.898240244835502e-05, + "loss": 0.2459, + "step": 29605 + }, + { + "epoch": 2.398412184057032, + "grad_norm": 0.061810776591300964, + "learning_rate": 8.897790179576039e-05, + "loss": 0.2249, + "step": 29606 + }, + { + "epoch": 2.39849319507453, + "grad_norm": 0.07361455261707306, + "learning_rate": 8.897340114316577e-05, + "loss": 0.2647, + "step": 29607 + }, + { + "epoch": 2.3985742060920288, + "grad_norm": 0.060726381838321686, + "learning_rate": 8.896890049057114e-05, + "loss": 0.2486, + "step": 29608 + }, + { + "epoch": 2.398655217109527, + "grad_norm": 0.07952304184436798, + "learning_rate": 8.896439983797651e-05, + "loss": 0.2455, + "step": 29609 + }, + { + "epoch": 2.3987362281270252, + "grad_norm": 0.07039670646190643, + "learning_rate": 8.895989918538189e-05, + "loss": 0.2579, + "step": 29610 + }, + { + "epoch": 2.3988172391445235, + "grad_norm": 0.06562759727239609, + "learning_rate": 8.895539853278726e-05, + "loss": 0.2359, + "step": 29611 + }, + { + "epoch": 2.398898250162022, + "grad_norm": 0.07553057372570038, + "learning_rate": 8.895089788019263e-05, + "loss": 0.251, + "step": 29612 + }, + { + "epoch": 2.3989792611795204, + "grad_norm": 0.05753480643033981, + "learning_rate": 8.894639722759801e-05, + "loss": 0.2121, + "step": 29613 + }, + { + "epoch": 2.3990602721970187, + "grad_norm": 0.06267315149307251, + "learning_rate": 8.894189657500338e-05, + "loss": 0.2448, + "step": 29614 + }, + { + "epoch": 2.3991412832145174, + "grad_norm": 0.07307960838079453, + "learning_rate": 8.893739592240876e-05, + "loss": 0.2624, + "step": 29615 + }, + { + "epoch": 2.3992222942320156, + "grad_norm": 0.06411339342594147, + "learning_rate": 8.893289526981413e-05, + "loss": 0.2217, + "step": 29616 + }, + { + "epoch": 2.399303305249514, + "grad_norm": 0.06430069357156754, + "learning_rate": 8.89283946172195e-05, + "loss": 0.2259, + "step": 29617 + }, + { + "epoch": 2.399384316267012, + "grad_norm": 0.06761736422777176, + "learning_rate": 8.892389396462488e-05, + 
"loss": 0.2493, + "step": 29618 + }, + { + "epoch": 2.399465327284511, + "grad_norm": 0.06652569770812988, + "learning_rate": 8.891939331203025e-05, + "loss": 0.2445, + "step": 29619 + }, + { + "epoch": 2.399546338302009, + "grad_norm": 0.04514552652835846, + "learning_rate": 8.891489265943562e-05, + "loss": 0.2013, + "step": 29620 + }, + { + "epoch": 2.3996273493195073, + "grad_norm": 0.08357077091932297, + "learning_rate": 8.8910392006841e-05, + "loss": 0.2911, + "step": 29621 + }, + { + "epoch": 2.399708360337006, + "grad_norm": 0.0534617118537426, + "learning_rate": 8.890589135424637e-05, + "loss": 0.1895, + "step": 29622 + }, + { + "epoch": 2.399789371354504, + "grad_norm": 0.06501682847738266, + "learning_rate": 8.890139070165174e-05, + "loss": 0.2394, + "step": 29623 + }, + { + "epoch": 2.3998703823720025, + "grad_norm": 0.059162236750125885, + "learning_rate": 8.889689004905712e-05, + "loss": 0.2821, + "step": 29624 + }, + { + "epoch": 2.399951393389501, + "grad_norm": 0.06915201246738434, + "learning_rate": 8.889238939646249e-05, + "loss": 0.2513, + "step": 29625 + }, + { + "epoch": 2.4000324044069994, + "grad_norm": 0.07056642323732376, + "learning_rate": 8.888788874386787e-05, + "loss": 0.2652, + "step": 29626 + }, + { + "epoch": 2.4001134154244976, + "grad_norm": 0.053473327308893204, + "learning_rate": 8.888338809127324e-05, + "loss": 0.2427, + "step": 29627 + }, + { + "epoch": 2.4001944264419963, + "grad_norm": 0.09125243127346039, + "learning_rate": 8.887888743867861e-05, + "loss": 0.2423, + "step": 29628 + }, + { + "epoch": 2.4002754374594946, + "grad_norm": 0.08848083019256592, + "learning_rate": 8.887438678608399e-05, + "loss": 0.2736, + "step": 29629 + }, + { + "epoch": 2.400356448476993, + "grad_norm": 0.0683230310678482, + "learning_rate": 8.886988613348936e-05, + "loss": 0.2493, + "step": 29630 + }, + { + "epoch": 2.4004374594944915, + "grad_norm": 0.07020305097103119, + "learning_rate": 8.886538548089473e-05, + "loss": 0.2545, + "step": 29631 + }, + { + "epoch": 2.4005184705119897, + "grad_norm": 0.07480403780937195, + "learning_rate": 8.88608848283001e-05, + "loss": 0.2506, + "step": 29632 + }, + { + "epoch": 2.400599481529488, + "grad_norm": 0.0668230876326561, + "learning_rate": 8.885638417570548e-05, + "loss": 0.2722, + "step": 29633 + }, + { + "epoch": 2.4006804925469862, + "grad_norm": 0.06247445195913315, + "learning_rate": 8.885188352311085e-05, + "loss": 0.2566, + "step": 29634 + }, + { + "epoch": 2.400761503564485, + "grad_norm": 0.06823485344648361, + "learning_rate": 8.884738287051623e-05, + "loss": 0.2498, + "step": 29635 + }, + { + "epoch": 2.400842514581983, + "grad_norm": 0.07839343696832657, + "learning_rate": 8.884288221792161e-05, + "loss": 0.2733, + "step": 29636 + }, + { + "epoch": 2.4009235255994814, + "grad_norm": 0.08385621011257172, + "learning_rate": 8.883838156532697e-05, + "loss": 0.2964, + "step": 29637 + }, + { + "epoch": 2.40100453661698, + "grad_norm": 0.06754658371210098, + "learning_rate": 8.883388091273235e-05, + "loss": 0.2416, + "step": 29638 + }, + { + "epoch": 2.4010855476344783, + "grad_norm": 0.060111649334430695, + "learning_rate": 8.882938026013774e-05, + "loss": 0.2204, + "step": 29639 + }, + { + "epoch": 2.4011665586519766, + "grad_norm": 0.07113023102283478, + "learning_rate": 8.88248796075431e-05, + "loss": 0.2438, + "step": 29640 + }, + { + "epoch": 2.401247569669475, + "grad_norm": 0.07107888907194138, + "learning_rate": 8.882037895494847e-05, + "loss": 0.2741, + "step": 29641 + }, + { + "epoch": 2.4013285806869735, + 
"grad_norm": 0.06887800246477127, + "learning_rate": 8.881587830235386e-05, + "loss": 0.2886, + "step": 29642 + }, + { + "epoch": 2.4014095917044718, + "grad_norm": 0.055432695895433426, + "learning_rate": 8.881137764975922e-05, + "loss": 0.2222, + "step": 29643 + }, + { + "epoch": 2.40149060272197, + "grad_norm": 0.06920438259840012, + "learning_rate": 8.880687699716459e-05, + "loss": 0.2484, + "step": 29644 + }, + { + "epoch": 2.4015716137394687, + "grad_norm": 0.06635645776987076, + "learning_rate": 8.880237634456998e-05, + "loss": 0.2444, + "step": 29645 + }, + { + "epoch": 2.401652624756967, + "grad_norm": 0.06889855861663818, + "learning_rate": 8.879787569197534e-05, + "loss": 0.2806, + "step": 29646 + }, + { + "epoch": 2.401733635774465, + "grad_norm": 0.06344221532344818, + "learning_rate": 8.879337503938071e-05, + "loss": 0.2375, + "step": 29647 + }, + { + "epoch": 2.401814646791964, + "grad_norm": 0.06579340249300003, + "learning_rate": 8.87888743867861e-05, + "loss": 0.2372, + "step": 29648 + }, + { + "epoch": 2.401895657809462, + "grad_norm": 0.05817051976919174, + "learning_rate": 8.878437373419146e-05, + "loss": 0.2487, + "step": 29649 + }, + { + "epoch": 2.4019766688269604, + "grad_norm": 0.08828700333833694, + "learning_rate": 8.877987308159683e-05, + "loss": 0.3105, + "step": 29650 + }, + { + "epoch": 2.402057679844459, + "grad_norm": 0.05760222673416138, + "learning_rate": 8.877537242900222e-05, + "loss": 0.2585, + "step": 29651 + }, + { + "epoch": 2.4021386908619573, + "grad_norm": 0.05954901874065399, + "learning_rate": 8.877087177640758e-05, + "loss": 0.2504, + "step": 29652 + }, + { + "epoch": 2.4022197018794555, + "grad_norm": 0.05687323212623596, + "learning_rate": 8.876637112381295e-05, + "loss": 0.2478, + "step": 29653 + }, + { + "epoch": 2.402300712896954, + "grad_norm": 0.07466956973075867, + "learning_rate": 8.876187047121834e-05, + "loss": 0.2437, + "step": 29654 + }, + { + "epoch": 2.4023817239144525, + "grad_norm": 0.05892636254429817, + "learning_rate": 8.87573698186237e-05, + "loss": 0.252, + "step": 29655 + }, + { + "epoch": 2.4024627349319507, + "grad_norm": 0.06808330118656158, + "learning_rate": 8.875286916602907e-05, + "loss": 0.2319, + "step": 29656 + }, + { + "epoch": 2.402543745949449, + "grad_norm": 0.061993733048439026, + "learning_rate": 8.874836851343446e-05, + "loss": 0.2637, + "step": 29657 + }, + { + "epoch": 2.4026247569669477, + "grad_norm": 0.056053124368190765, + "learning_rate": 8.874386786083982e-05, + "loss": 0.272, + "step": 29658 + }, + { + "epoch": 2.402705767984446, + "grad_norm": 0.06183644384145737, + "learning_rate": 8.87393672082452e-05, + "loss": 0.2559, + "step": 29659 + }, + { + "epoch": 2.402786779001944, + "grad_norm": 0.06335576623678207, + "learning_rate": 8.873486655565058e-05, + "loss": 0.2429, + "step": 29660 + }, + { + "epoch": 2.4028677900194424, + "grad_norm": 0.06689286977052689, + "learning_rate": 8.873036590305594e-05, + "loss": 0.2428, + "step": 29661 + }, + { + "epoch": 2.402948801036941, + "grad_norm": 0.06543515622615814, + "learning_rate": 8.872586525046133e-05, + "loss": 0.243, + "step": 29662 + }, + { + "epoch": 2.4030298120544393, + "grad_norm": 0.07345867902040482, + "learning_rate": 8.87213645978667e-05, + "loss": 0.2533, + "step": 29663 + }, + { + "epoch": 2.4031108230719376, + "grad_norm": 0.07059627771377563, + "learning_rate": 8.871686394527206e-05, + "loss": 0.2348, + "step": 29664 + }, + { + "epoch": 2.4031918340894363, + "grad_norm": 0.07763314247131348, + "learning_rate": 
8.871236329267745e-05, + "loss": 0.2539, + "step": 29665 + }, + { + "epoch": 2.4032728451069345, + "grad_norm": 0.06865006685256958, + "learning_rate": 8.870786264008282e-05, + "loss": 0.2579, + "step": 29666 + }, + { + "epoch": 2.4033538561244328, + "grad_norm": 0.061918459832668304, + "learning_rate": 8.870336198748818e-05, + "loss": 0.2472, + "step": 29667 + }, + { + "epoch": 2.4034348671419314, + "grad_norm": 0.061768610030412674, + "learning_rate": 8.869886133489357e-05, + "loss": 0.1955, + "step": 29668 + }, + { + "epoch": 2.4035158781594297, + "grad_norm": 0.06935855001211166, + "learning_rate": 8.869436068229894e-05, + "loss": 0.2317, + "step": 29669 + }, + { + "epoch": 2.403596889176928, + "grad_norm": 0.0667807012796402, + "learning_rate": 8.86898600297043e-05, + "loss": 0.251, + "step": 29670 + }, + { + "epoch": 2.4036779001944266, + "grad_norm": 0.06723842024803162, + "learning_rate": 8.868535937710969e-05, + "loss": 0.226, + "step": 29671 + }, + { + "epoch": 2.403758911211925, + "grad_norm": 0.06217675283551216, + "learning_rate": 8.868085872451506e-05, + "loss": 0.2232, + "step": 29672 + }, + { + "epoch": 2.403839922229423, + "grad_norm": 0.06494613736867905, + "learning_rate": 8.867635807192042e-05, + "loss": 0.2547, + "step": 29673 + }, + { + "epoch": 2.403920933246922, + "grad_norm": 0.06875234097242355, + "learning_rate": 8.867185741932581e-05, + "loss": 0.2292, + "step": 29674 + }, + { + "epoch": 2.40400194426442, + "grad_norm": 0.07220776379108429, + "learning_rate": 8.866735676673119e-05, + "loss": 0.3206, + "step": 29675 + }, + { + "epoch": 2.4040829552819183, + "grad_norm": 0.07017524540424347, + "learning_rate": 8.866285611413655e-05, + "loss": 0.2571, + "step": 29676 + }, + { + "epoch": 2.4041639662994165, + "grad_norm": 0.0610513836145401, + "learning_rate": 8.865835546154193e-05, + "loss": 0.229, + "step": 29677 + }, + { + "epoch": 2.404244977316915, + "grad_norm": 0.06113404035568237, + "learning_rate": 8.86538548089473e-05, + "loss": 0.287, + "step": 29678 + }, + { + "epoch": 2.4043259883344135, + "grad_norm": 0.06622372567653656, + "learning_rate": 8.864935415635267e-05, + "loss": 0.2213, + "step": 29679 + }, + { + "epoch": 2.4044069993519117, + "grad_norm": 0.06459666043519974, + "learning_rate": 8.864485350375805e-05, + "loss": 0.2076, + "step": 29680 + }, + { + "epoch": 2.4044880103694104, + "grad_norm": 0.06556376069784164, + "learning_rate": 8.864035285116343e-05, + "loss": 0.2137, + "step": 29681 + }, + { + "epoch": 2.4045690213869086, + "grad_norm": 0.0706576257944107, + "learning_rate": 8.863585219856879e-05, + "loss": 0.2516, + "step": 29682 + }, + { + "epoch": 2.404650032404407, + "grad_norm": 0.06727118790149689, + "learning_rate": 8.863135154597417e-05, + "loss": 0.2497, + "step": 29683 + }, + { + "epoch": 2.404731043421905, + "grad_norm": 0.06485434621572495, + "learning_rate": 8.862685089337955e-05, + "loss": 0.2405, + "step": 29684 + }, + { + "epoch": 2.404812054439404, + "grad_norm": 0.06530654430389404, + "learning_rate": 8.862235024078491e-05, + "loss": 0.2682, + "step": 29685 + }, + { + "epoch": 2.404893065456902, + "grad_norm": 0.07048763334751129, + "learning_rate": 8.86178495881903e-05, + "loss": 0.2775, + "step": 29686 + }, + { + "epoch": 2.4049740764744003, + "grad_norm": 0.06269804388284683, + "learning_rate": 8.861334893559567e-05, + "loss": 0.2426, + "step": 29687 + }, + { + "epoch": 2.405055087491899, + "grad_norm": 0.06605499982833862, + "learning_rate": 8.860884828300104e-05, + "loss": 0.2528, + "step": 29688 + }, + { + "epoch": 
2.4051360985093972, + "grad_norm": 0.06122152879834175, + "learning_rate": 8.860434763040642e-05, + "loss": 0.2521, + "step": 29689 + }, + { + "epoch": 2.4052171095268955, + "grad_norm": 0.06740374118089676, + "learning_rate": 8.859984697781179e-05, + "loss": 0.2256, + "step": 29690 + }, + { + "epoch": 2.405298120544394, + "grad_norm": 0.06832475960254669, + "learning_rate": 8.859534632521716e-05, + "loss": 0.2571, + "step": 29691 + }, + { + "epoch": 2.4053791315618924, + "grad_norm": 0.05728984251618385, + "learning_rate": 8.859084567262254e-05, + "loss": 0.2393, + "step": 29692 + }, + { + "epoch": 2.4054601425793907, + "grad_norm": 0.05768602341413498, + "learning_rate": 8.858634502002791e-05, + "loss": 0.182, + "step": 29693 + }, + { + "epoch": 2.4055411535968894, + "grad_norm": 0.06498179584741592, + "learning_rate": 8.858184436743328e-05, + "loss": 0.2197, + "step": 29694 + }, + { + "epoch": 2.4056221646143876, + "grad_norm": 0.06262174248695374, + "learning_rate": 8.857734371483866e-05, + "loss": 0.2531, + "step": 29695 + }, + { + "epoch": 2.405703175631886, + "grad_norm": 0.08773565292358398, + "learning_rate": 8.857284306224403e-05, + "loss": 0.2652, + "step": 29696 + }, + { + "epoch": 2.4057841866493845, + "grad_norm": 0.09450256079435349, + "learning_rate": 8.85683424096494e-05, + "loss": 0.2816, + "step": 29697 + }, + { + "epoch": 2.405865197666883, + "grad_norm": 0.07148297131061554, + "learning_rate": 8.856384175705478e-05, + "loss": 0.2585, + "step": 29698 + }, + { + "epoch": 2.405946208684381, + "grad_norm": 0.06335129588842392, + "learning_rate": 8.855934110446015e-05, + "loss": 0.2381, + "step": 29699 + }, + { + "epoch": 2.4060272197018793, + "grad_norm": 0.07238379865884781, + "learning_rate": 8.855484045186553e-05, + "loss": 0.2496, + "step": 29700 + }, + { + "epoch": 2.406108230719378, + "grad_norm": 0.06429867446422577, + "learning_rate": 8.85503397992709e-05, + "loss": 0.2236, + "step": 29701 + }, + { + "epoch": 2.406189241736876, + "grad_norm": 0.07388942688703537, + "learning_rate": 8.854583914667627e-05, + "loss": 0.2766, + "step": 29702 + }, + { + "epoch": 2.4062702527543745, + "grad_norm": 0.06903047859668732, + "learning_rate": 8.854133849408165e-05, + "loss": 0.2415, + "step": 29703 + }, + { + "epoch": 2.406351263771873, + "grad_norm": 0.05265869200229645, + "learning_rate": 8.853683784148702e-05, + "loss": 0.2548, + "step": 29704 + }, + { + "epoch": 2.4064322747893714, + "grad_norm": 0.0546029731631279, + "learning_rate": 8.85323371888924e-05, + "loss": 0.2263, + "step": 29705 + }, + { + "epoch": 2.4065132858068696, + "grad_norm": 0.07559093087911606, + "learning_rate": 8.852783653629777e-05, + "loss": 0.2457, + "step": 29706 + }, + { + "epoch": 2.406594296824368, + "grad_norm": 0.07392976433038712, + "learning_rate": 8.852333588370314e-05, + "loss": 0.2449, + "step": 29707 + }, + { + "epoch": 2.4066753078418666, + "grad_norm": 0.06068427115678787, + "learning_rate": 8.851883523110851e-05, + "loss": 0.2252, + "step": 29708 + }, + { + "epoch": 2.406756318859365, + "grad_norm": 0.06823750585317612, + "learning_rate": 8.851433457851389e-05, + "loss": 0.3009, + "step": 29709 + }, + { + "epoch": 2.406837329876863, + "grad_norm": 0.06476844102144241, + "learning_rate": 8.850983392591926e-05, + "loss": 0.2708, + "step": 29710 + }, + { + "epoch": 2.4069183408943617, + "grad_norm": 0.07834062725305557, + "learning_rate": 8.850533327332463e-05, + "loss": 0.2802, + "step": 29711 + }, + { + "epoch": 2.40699935191186, + "grad_norm": 0.3444421589374542, + "learning_rate": 
8.850083262073001e-05, + "loss": 0.2757, + "step": 29712 + }, + { + "epoch": 2.4070803629293582, + "grad_norm": 0.060476649552583694, + "learning_rate": 8.849633196813538e-05, + "loss": 0.3, + "step": 29713 + }, + { + "epoch": 2.407161373946857, + "grad_norm": 0.05499740317463875, + "learning_rate": 8.849183131554077e-05, + "loss": 0.2457, + "step": 29714 + }, + { + "epoch": 2.407242384964355, + "grad_norm": 0.06884583830833435, + "learning_rate": 8.848733066294613e-05, + "loss": 0.2529, + "step": 29715 + }, + { + "epoch": 2.4073233959818534, + "grad_norm": 0.0695691630244255, + "learning_rate": 8.84828300103515e-05, + "loss": 0.2408, + "step": 29716 + }, + { + "epoch": 2.407404406999352, + "grad_norm": 0.06407498568296432, + "learning_rate": 8.847832935775689e-05, + "loss": 0.2632, + "step": 29717 + }, + { + "epoch": 2.4074854180168503, + "grad_norm": 0.060239940881729126, + "learning_rate": 8.847382870516225e-05, + "loss": 0.2313, + "step": 29718 + }, + { + "epoch": 2.4075664290343486, + "grad_norm": 0.054388102144002914, + "learning_rate": 8.846932805256762e-05, + "loss": 0.2618, + "step": 29719 + }, + { + "epoch": 2.4076474400518473, + "grad_norm": 0.06771720200777054, + "learning_rate": 8.846482739997301e-05, + "loss": 0.2367, + "step": 29720 + }, + { + "epoch": 2.4077284510693455, + "grad_norm": 0.07251696288585663, + "learning_rate": 8.846032674737837e-05, + "loss": 0.234, + "step": 29721 + }, + { + "epoch": 2.4078094620868438, + "grad_norm": 0.06434344500303268, + "learning_rate": 8.845582609478374e-05, + "loss": 0.2908, + "step": 29722 + }, + { + "epoch": 2.407890473104342, + "grad_norm": 0.08624786138534546, + "learning_rate": 8.845132544218913e-05, + "loss": 0.2447, + "step": 29723 + }, + { + "epoch": 2.4079714841218407, + "grad_norm": 0.06329482048749924, + "learning_rate": 8.844682478959449e-05, + "loss": 0.2652, + "step": 29724 + }, + { + "epoch": 2.408052495139339, + "grad_norm": 0.07422767579555511, + "learning_rate": 8.844232413699987e-05, + "loss": 0.2649, + "step": 29725 + }, + { + "epoch": 2.408133506156837, + "grad_norm": 0.06247050687670708, + "learning_rate": 8.843782348440525e-05, + "loss": 0.2408, + "step": 29726 + }, + { + "epoch": 2.408214517174336, + "grad_norm": 0.06806286424398422, + "learning_rate": 8.843332283181061e-05, + "loss": 0.2658, + "step": 29727 + }, + { + "epoch": 2.408295528191834, + "grad_norm": 0.06933243572711945, + "learning_rate": 8.842882217921599e-05, + "loss": 0.2318, + "step": 29728 + }, + { + "epoch": 2.4083765392093324, + "grad_norm": 0.09632189571857452, + "learning_rate": 8.842432152662137e-05, + "loss": 0.2846, + "step": 29729 + }, + { + "epoch": 2.4084575502268306, + "grad_norm": 0.05814709514379501, + "learning_rate": 8.841982087402673e-05, + "loss": 0.2546, + "step": 29730 + }, + { + "epoch": 2.4085385612443293, + "grad_norm": 0.07406198978424072, + "learning_rate": 8.841532022143211e-05, + "loss": 0.2681, + "step": 29731 + }, + { + "epoch": 2.4086195722618275, + "grad_norm": 0.07493565231561661, + "learning_rate": 8.84108195688375e-05, + "loss": 0.2497, + "step": 29732 + }, + { + "epoch": 2.408700583279326, + "grad_norm": 0.06633651256561279, + "learning_rate": 8.840631891624285e-05, + "loss": 0.2923, + "step": 29733 + }, + { + "epoch": 2.4087815942968245, + "grad_norm": 0.0569579154253006, + "learning_rate": 8.840181826364823e-05, + "loss": 0.2019, + "step": 29734 + }, + { + "epoch": 2.4088626053143227, + "grad_norm": 0.05882423743605614, + "learning_rate": 8.839731761105361e-05, + "loss": 0.2373, + "step": 29735 + }, + { + 
"epoch": 2.408943616331821, + "grad_norm": 0.0590720996260643, + "learning_rate": 8.839281695845897e-05, + "loss": 0.2308, + "step": 29736 + }, + { + "epoch": 2.4090246273493197, + "grad_norm": 0.0773407518863678, + "learning_rate": 8.838831630586435e-05, + "loss": 0.2613, + "step": 29737 + }, + { + "epoch": 2.409105638366818, + "grad_norm": 0.06300017982721329, + "learning_rate": 8.838381565326974e-05, + "loss": 0.1976, + "step": 29738 + }, + { + "epoch": 2.409186649384316, + "grad_norm": 0.07593004405498505, + "learning_rate": 8.83793150006751e-05, + "loss": 0.2488, + "step": 29739 + }, + { + "epoch": 2.409267660401815, + "grad_norm": 0.06119083985686302, + "learning_rate": 8.837481434808048e-05, + "loss": 0.2288, + "step": 29740 + }, + { + "epoch": 2.409348671419313, + "grad_norm": 0.07041435688734055, + "learning_rate": 8.837031369548586e-05, + "loss": 0.2664, + "step": 29741 + }, + { + "epoch": 2.4094296824368113, + "grad_norm": 0.07043371349573135, + "learning_rate": 8.836581304289122e-05, + "loss": 0.2784, + "step": 29742 + }, + { + "epoch": 2.40951069345431, + "grad_norm": 0.07403253018856049, + "learning_rate": 8.83613123902966e-05, + "loss": 0.249, + "step": 29743 + }, + { + "epoch": 2.4095917044718083, + "grad_norm": 0.07712215930223465, + "learning_rate": 8.835681173770198e-05, + "loss": 0.2421, + "step": 29744 + }, + { + "epoch": 2.4096727154893065, + "grad_norm": 0.07213019579648972, + "learning_rate": 8.835231108510734e-05, + "loss": 0.2471, + "step": 29745 + }, + { + "epoch": 2.4097537265068047, + "grad_norm": 0.06117299944162369, + "learning_rate": 8.834781043251272e-05, + "loss": 0.2534, + "step": 29746 + }, + { + "epoch": 2.4098347375243034, + "grad_norm": 0.06271757185459137, + "learning_rate": 8.83433097799181e-05, + "loss": 0.2347, + "step": 29747 + }, + { + "epoch": 2.4099157485418017, + "grad_norm": 0.060313113033771515, + "learning_rate": 8.833880912732346e-05, + "loss": 0.2103, + "step": 29748 + }, + { + "epoch": 2.4099967595593, + "grad_norm": 0.0679052546620369, + "learning_rate": 8.833430847472885e-05, + "loss": 0.2536, + "step": 29749 + }, + { + "epoch": 2.4100777705767986, + "grad_norm": 0.06643978506326675, + "learning_rate": 8.832980782213422e-05, + "loss": 0.2498, + "step": 29750 + }, + { + "epoch": 2.410158781594297, + "grad_norm": 0.08351900428533554, + "learning_rate": 8.832530716953958e-05, + "loss": 0.3062, + "step": 29751 + }, + { + "epoch": 2.410239792611795, + "grad_norm": 0.07076345384120941, + "learning_rate": 8.832080651694497e-05, + "loss": 0.2302, + "step": 29752 + }, + { + "epoch": 2.4103208036292934, + "grad_norm": 0.06369180232286453, + "learning_rate": 8.831630586435034e-05, + "loss": 0.2396, + "step": 29753 + }, + { + "epoch": 2.410401814646792, + "grad_norm": 0.07251781970262527, + "learning_rate": 8.83118052117557e-05, + "loss": 0.2806, + "step": 29754 + }, + { + "epoch": 2.4104828256642903, + "grad_norm": 0.0614294707775116, + "learning_rate": 8.830730455916109e-05, + "loss": 0.2489, + "step": 29755 + }, + { + "epoch": 2.4105638366817885, + "grad_norm": 0.060916539281606674, + "learning_rate": 8.830280390656646e-05, + "loss": 0.2722, + "step": 29756 + }, + { + "epoch": 2.410644847699287, + "grad_norm": 0.05728588253259659, + "learning_rate": 8.829830325397182e-05, + "loss": 0.2451, + "step": 29757 + }, + { + "epoch": 2.4107258587167855, + "grad_norm": 0.05679110437631607, + "learning_rate": 8.829380260137721e-05, + "loss": 0.2239, + "step": 29758 + }, + { + "epoch": 2.4108068697342837, + "grad_norm": 0.0824250653386116, + 
"learning_rate": 8.828930194878258e-05, + "loss": 0.2978, + "step": 29759 + }, + { + "epoch": 2.4108878807517824, + "grad_norm": 0.06265558302402496, + "learning_rate": 8.828480129618794e-05, + "loss": 0.2662, + "step": 29760 + }, + { + "epoch": 2.4109688917692806, + "grad_norm": 0.058735158294439316, + "learning_rate": 8.828030064359333e-05, + "loss": 0.2146, + "step": 29761 + }, + { + "epoch": 2.411049902786779, + "grad_norm": 0.06004401296377182, + "learning_rate": 8.82757999909987e-05, + "loss": 0.2509, + "step": 29762 + }, + { + "epoch": 2.4111309138042776, + "grad_norm": 0.07412339746952057, + "learning_rate": 8.827129933840406e-05, + "loss": 0.327, + "step": 29763 + }, + { + "epoch": 2.411211924821776, + "grad_norm": 0.0679435208439827, + "learning_rate": 8.826679868580945e-05, + "loss": 0.2932, + "step": 29764 + }, + { + "epoch": 2.411292935839274, + "grad_norm": 0.07169094681739807, + "learning_rate": 8.826229803321482e-05, + "loss": 0.2314, + "step": 29765 + }, + { + "epoch": 2.4113739468567728, + "grad_norm": 0.06179240345954895, + "learning_rate": 8.82577973806202e-05, + "loss": 0.2281, + "step": 29766 + }, + { + "epoch": 2.411454957874271, + "grad_norm": 0.07233671098947525, + "learning_rate": 8.825329672802557e-05, + "loss": 0.2076, + "step": 29767 + }, + { + "epoch": 2.4115359688917692, + "grad_norm": 0.0739409476518631, + "learning_rate": 8.824879607543094e-05, + "loss": 0.246, + "step": 29768 + }, + { + "epoch": 2.4116169799092675, + "grad_norm": 0.06868897378444672, + "learning_rate": 8.824429542283632e-05, + "loss": 0.2497, + "step": 29769 + }, + { + "epoch": 2.411697990926766, + "grad_norm": 0.06777679175138474, + "learning_rate": 8.823979477024169e-05, + "loss": 0.2261, + "step": 29770 + }, + { + "epoch": 2.4117790019442644, + "grad_norm": 0.0738830640912056, + "learning_rate": 8.823529411764706e-05, + "loss": 0.2361, + "step": 29771 + }, + { + "epoch": 2.4118600129617627, + "grad_norm": 0.06694016605615616, + "learning_rate": 8.823079346505244e-05, + "loss": 0.2583, + "step": 29772 + }, + { + "epoch": 2.4119410239792614, + "grad_norm": 0.06757953763008118, + "learning_rate": 8.822629281245781e-05, + "loss": 0.2597, + "step": 29773 + }, + { + "epoch": 2.4120220349967596, + "grad_norm": 0.07212909311056137, + "learning_rate": 8.822179215986319e-05, + "loss": 0.2619, + "step": 29774 + }, + { + "epoch": 2.412103046014258, + "grad_norm": 0.06142732873558998, + "learning_rate": 8.821729150726856e-05, + "loss": 0.2064, + "step": 29775 + }, + { + "epoch": 2.412184057031756, + "grad_norm": 0.06122400611639023, + "learning_rate": 8.821279085467393e-05, + "loss": 0.2589, + "step": 29776 + }, + { + "epoch": 2.412265068049255, + "grad_norm": 0.06232326477766037, + "learning_rate": 8.82082902020793e-05, + "loss": 0.2328, + "step": 29777 + }, + { + "epoch": 2.412346079066753, + "grad_norm": 0.07228163629770279, + "learning_rate": 8.820378954948468e-05, + "loss": 0.3051, + "step": 29778 + }, + { + "epoch": 2.4124270900842513, + "grad_norm": 0.06454326212406158, + "learning_rate": 8.819928889689005e-05, + "loss": 0.2439, + "step": 29779 + }, + { + "epoch": 2.41250810110175, + "grad_norm": 0.07410193979740143, + "learning_rate": 8.819478824429543e-05, + "loss": 0.3022, + "step": 29780 + }, + { + "epoch": 2.412589112119248, + "grad_norm": 0.07694942504167557, + "learning_rate": 8.81902875917008e-05, + "loss": 0.2377, + "step": 29781 + }, + { + "epoch": 2.4126701231367464, + "grad_norm": 0.07964512705802917, + "learning_rate": 8.818578693910617e-05, + "loss": 0.2537, + "step": 29782 + }, 
+ { + "epoch": 2.412751134154245, + "grad_norm": 0.054239723831415176, + "learning_rate": 8.818128628651155e-05, + "loss": 0.2466, + "step": 29783 + }, + { + "epoch": 2.4128321451717434, + "grad_norm": 0.06153694540262222, + "learning_rate": 8.817678563391692e-05, + "loss": 0.2287, + "step": 29784 + }, + { + "epoch": 2.4129131561892416, + "grad_norm": 0.07154665142297745, + "learning_rate": 8.81722849813223e-05, + "loss": 0.2932, + "step": 29785 + }, + { + "epoch": 2.4129941672067403, + "grad_norm": 0.06164253503084183, + "learning_rate": 8.816778432872767e-05, + "loss": 0.2298, + "step": 29786 + }, + { + "epoch": 2.4130751782242386, + "grad_norm": 0.05929383262991905, + "learning_rate": 8.816328367613304e-05, + "loss": 0.2465, + "step": 29787 + }, + { + "epoch": 2.413156189241737, + "grad_norm": 0.05722380056977272, + "learning_rate": 8.815878302353842e-05, + "loss": 0.2338, + "step": 29788 + }, + { + "epoch": 2.4132372002592355, + "grad_norm": 0.07813999801874161, + "learning_rate": 8.815428237094379e-05, + "loss": 0.2222, + "step": 29789 + }, + { + "epoch": 2.4133182112767337, + "grad_norm": 0.061567552387714386, + "learning_rate": 8.814978171834916e-05, + "loss": 0.2316, + "step": 29790 + }, + { + "epoch": 2.413399222294232, + "grad_norm": 0.06083657220005989, + "learning_rate": 8.814528106575454e-05, + "loss": 0.2198, + "step": 29791 + }, + { + "epoch": 2.4134802333117302, + "grad_norm": 0.06548894196748734, + "learning_rate": 8.814078041315991e-05, + "loss": 0.2775, + "step": 29792 + }, + { + "epoch": 2.413561244329229, + "grad_norm": 0.08378948271274567, + "learning_rate": 8.813627976056528e-05, + "loss": 0.3195, + "step": 29793 + }, + { + "epoch": 2.413642255346727, + "grad_norm": 0.06670024245977402, + "learning_rate": 8.813177910797066e-05, + "loss": 0.2197, + "step": 29794 + }, + { + "epoch": 2.4137232663642254, + "grad_norm": 0.05539132282137871, + "learning_rate": 8.812727845537604e-05, + "loss": 0.2427, + "step": 29795 + }, + { + "epoch": 2.413804277381724, + "grad_norm": 0.07522717118263245, + "learning_rate": 8.81227778027814e-05, + "loss": 0.2701, + "step": 29796 + }, + { + "epoch": 2.4138852883992223, + "grad_norm": 0.06489524245262146, + "learning_rate": 8.811827715018678e-05, + "loss": 0.2401, + "step": 29797 + }, + { + "epoch": 2.4139662994167206, + "grad_norm": 0.06208113953471184, + "learning_rate": 8.811377649759217e-05, + "loss": 0.2158, + "step": 29798 + }, + { + "epoch": 2.414047310434219, + "grad_norm": 0.06468240171670914, + "learning_rate": 8.810927584499753e-05, + "loss": 0.2597, + "step": 29799 + }, + { + "epoch": 2.4141283214517175, + "grad_norm": 0.06483402848243713, + "learning_rate": 8.81047751924029e-05, + "loss": 0.2367, + "step": 29800 + }, + { + "epoch": 2.4142093324692158, + "grad_norm": 0.07432875037193298, + "learning_rate": 8.810027453980829e-05, + "loss": 0.23, + "step": 29801 + }, + { + "epoch": 2.414290343486714, + "grad_norm": 0.0824606642127037, + "learning_rate": 8.809577388721365e-05, + "loss": 0.2402, + "step": 29802 + }, + { + "epoch": 2.4143713545042127, + "grad_norm": 0.061389729380607605, + "learning_rate": 8.809127323461902e-05, + "loss": 0.2312, + "step": 29803 + }, + { + "epoch": 2.414452365521711, + "grad_norm": 0.06574756652116776, + "learning_rate": 8.808677258202441e-05, + "loss": 0.2729, + "step": 29804 + }, + { + "epoch": 2.414533376539209, + "grad_norm": 0.06415622681379318, + "learning_rate": 8.808227192942977e-05, + "loss": 0.2467, + "step": 29805 + }, + { + "epoch": 2.414614387556708, + "grad_norm": 0.06910507380962372, + 
"learning_rate": 8.807777127683514e-05, + "loss": 0.2529, + "step": 29806 + }, + { + "epoch": 2.414695398574206, + "grad_norm": 0.07220354676246643, + "learning_rate": 8.807327062424053e-05, + "loss": 0.2353, + "step": 29807 + }, + { + "epoch": 2.4147764095917044, + "grad_norm": 0.0628422200679779, + "learning_rate": 8.806876997164589e-05, + "loss": 0.2537, + "step": 29808 + }, + { + "epoch": 2.414857420609203, + "grad_norm": 0.06969404965639114, + "learning_rate": 8.806426931905126e-05, + "loss": 0.2692, + "step": 29809 + }, + { + "epoch": 2.4149384316267013, + "grad_norm": 0.061693429946899414, + "learning_rate": 8.805976866645665e-05, + "loss": 0.2453, + "step": 29810 + }, + { + "epoch": 2.4150194426441995, + "grad_norm": 0.0579097718000412, + "learning_rate": 8.805526801386201e-05, + "loss": 0.2477, + "step": 29811 + }, + { + "epoch": 2.4151004536616982, + "grad_norm": 0.07249568402767181, + "learning_rate": 8.805076736126738e-05, + "loss": 0.2195, + "step": 29812 + }, + { + "epoch": 2.4151814646791965, + "grad_norm": 0.0828808918595314, + "learning_rate": 8.804626670867277e-05, + "loss": 0.2826, + "step": 29813 + }, + { + "epoch": 2.4152624756966947, + "grad_norm": 0.07269842177629471, + "learning_rate": 8.804176605607813e-05, + "loss": 0.2963, + "step": 29814 + }, + { + "epoch": 2.415343486714193, + "grad_norm": 0.06632715463638306, + "learning_rate": 8.80372654034835e-05, + "loss": 0.2514, + "step": 29815 + }, + { + "epoch": 2.4154244977316917, + "grad_norm": 0.07053852081298828, + "learning_rate": 8.803276475088889e-05, + "loss": 0.2155, + "step": 29816 + }, + { + "epoch": 2.41550550874919, + "grad_norm": 0.06813622266054153, + "learning_rate": 8.802826409829425e-05, + "loss": 0.2546, + "step": 29817 + }, + { + "epoch": 2.415586519766688, + "grad_norm": 0.05975549668073654, + "learning_rate": 8.802376344569962e-05, + "loss": 0.2262, + "step": 29818 + }, + { + "epoch": 2.415667530784187, + "grad_norm": 0.05883145332336426, + "learning_rate": 8.801926279310501e-05, + "loss": 0.2362, + "step": 29819 + }, + { + "epoch": 2.415748541801685, + "grad_norm": 0.07348956912755966, + "learning_rate": 8.801476214051037e-05, + "loss": 0.2625, + "step": 29820 + }, + { + "epoch": 2.4158295528191833, + "grad_norm": 0.06862062960863113, + "learning_rate": 8.801026148791576e-05, + "loss": 0.2435, + "step": 29821 + }, + { + "epoch": 2.4159105638366816, + "grad_norm": 0.060579124838113785, + "learning_rate": 8.800576083532113e-05, + "loss": 0.2218, + "step": 29822 + }, + { + "epoch": 2.4159915748541803, + "grad_norm": 0.06969986855983734, + "learning_rate": 8.800126018272649e-05, + "loss": 0.2312, + "step": 29823 + }, + { + "epoch": 2.4160725858716785, + "grad_norm": 0.06263470649719238, + "learning_rate": 8.799675953013188e-05, + "loss": 0.2526, + "step": 29824 + }, + { + "epoch": 2.4161535968891767, + "grad_norm": 0.0679418295621872, + "learning_rate": 8.799225887753725e-05, + "loss": 0.2384, + "step": 29825 + }, + { + "epoch": 2.4162346079066754, + "grad_norm": 0.07143159955739975, + "learning_rate": 8.798775822494261e-05, + "loss": 0.236, + "step": 29826 + }, + { + "epoch": 2.4163156189241737, + "grad_norm": 0.07788621634244919, + "learning_rate": 8.7983257572348e-05, + "loss": 0.2508, + "step": 29827 + }, + { + "epoch": 2.416396629941672, + "grad_norm": 0.055590007454156876, + "learning_rate": 8.797875691975337e-05, + "loss": 0.2412, + "step": 29828 + }, + { + "epoch": 2.4164776409591706, + "grad_norm": 0.07236363738775253, + "learning_rate": 8.797425626715873e-05, + "loss": 0.2574, + "step": 
29829 + }, + { + "epoch": 2.416558651976669, + "grad_norm": 0.05652404949069023, + "learning_rate": 8.796975561456412e-05, + "loss": 0.2436, + "step": 29830 + }, + { + "epoch": 2.416639662994167, + "grad_norm": 0.06469497084617615, + "learning_rate": 8.79652549619695e-05, + "loss": 0.2439, + "step": 29831 + }, + { + "epoch": 2.416720674011666, + "grad_norm": 0.0865524411201477, + "learning_rate": 8.796075430937485e-05, + "loss": 0.2599, + "step": 29832 + }, + { + "epoch": 2.416801685029164, + "grad_norm": 0.06282461434602737, + "learning_rate": 8.795625365678024e-05, + "loss": 0.2414, + "step": 29833 + }, + { + "epoch": 2.4168826960466623, + "grad_norm": 0.07091455161571503, + "learning_rate": 8.795175300418562e-05, + "loss": 0.2388, + "step": 29834 + }, + { + "epoch": 2.416963707064161, + "grad_norm": 0.06994987279176712, + "learning_rate": 8.794725235159098e-05, + "loss": 0.2325, + "step": 29835 + }, + { + "epoch": 2.417044718081659, + "grad_norm": 0.0654347687959671, + "learning_rate": 8.794275169899636e-05, + "loss": 0.2666, + "step": 29836 + }, + { + "epoch": 2.4171257290991575, + "grad_norm": 0.06742554903030396, + "learning_rate": 8.793825104640174e-05, + "loss": 0.2468, + "step": 29837 + }, + { + "epoch": 2.4172067401166557, + "grad_norm": 0.06697697937488556, + "learning_rate": 8.79337503938071e-05, + "loss": 0.2861, + "step": 29838 + }, + { + "epoch": 2.4172877511341544, + "grad_norm": 0.06529314070940018, + "learning_rate": 8.792924974121248e-05, + "loss": 0.2546, + "step": 29839 + }, + { + "epoch": 2.4173687621516526, + "grad_norm": 0.05987263470888138, + "learning_rate": 8.792474908861786e-05, + "loss": 0.2414, + "step": 29840 + }, + { + "epoch": 2.417449773169151, + "grad_norm": 0.06584987789392471, + "learning_rate": 8.792024843602322e-05, + "loss": 0.2776, + "step": 29841 + }, + { + "epoch": 2.4175307841866496, + "grad_norm": 0.048123326152563095, + "learning_rate": 8.79157477834286e-05, + "loss": 0.2081, + "step": 29842 + }, + { + "epoch": 2.417611795204148, + "grad_norm": 0.06386002898216248, + "learning_rate": 8.791124713083398e-05, + "loss": 0.2184, + "step": 29843 + }, + { + "epoch": 2.417692806221646, + "grad_norm": 0.08389604091644287, + "learning_rate": 8.790674647823934e-05, + "loss": 0.264, + "step": 29844 + }, + { + "epoch": 2.4177738172391443, + "grad_norm": 0.07809042185544968, + "learning_rate": 8.790224582564472e-05, + "loss": 0.2656, + "step": 29845 + }, + { + "epoch": 2.417854828256643, + "grad_norm": 0.05657443404197693, + "learning_rate": 8.78977451730501e-05, + "loss": 0.1997, + "step": 29846 + }, + { + "epoch": 2.4179358392741412, + "grad_norm": 0.05614442378282547, + "learning_rate": 8.789324452045547e-05, + "loss": 0.2572, + "step": 29847 + }, + { + "epoch": 2.4180168502916395, + "grad_norm": 0.07657313346862793, + "learning_rate": 8.788874386786085e-05, + "loss": 0.256, + "step": 29848 + }, + { + "epoch": 2.418097861309138, + "grad_norm": 0.07188082486391068, + "learning_rate": 8.788424321526622e-05, + "loss": 0.2519, + "step": 29849 + }, + { + "epoch": 2.4181788723266364, + "grad_norm": 0.06149272248148918, + "learning_rate": 8.787974256267159e-05, + "loss": 0.2172, + "step": 29850 + }, + { + "epoch": 2.4182598833441347, + "grad_norm": 0.08759336918592453, + "learning_rate": 8.787524191007697e-05, + "loss": 0.2777, + "step": 29851 + }, + { + "epoch": 2.4183408943616334, + "grad_norm": 0.08040926605463028, + "learning_rate": 8.787074125748234e-05, + "loss": 0.2624, + "step": 29852 + }, + { + "epoch": 2.4184219053791316, + "grad_norm": 
0.06956743448972702, + "learning_rate": 8.786624060488771e-05, + "loss": 0.2582, + "step": 29853 + }, + { + "epoch": 2.41850291639663, + "grad_norm": 0.07439576834440231, + "learning_rate": 8.786173995229309e-05, + "loss": 0.2626, + "step": 29854 + }, + { + "epoch": 2.4185839274141285, + "grad_norm": 0.06943316757678986, + "learning_rate": 8.785723929969846e-05, + "loss": 0.2056, + "step": 29855 + }, + { + "epoch": 2.4186649384316268, + "grad_norm": 0.06287401914596558, + "learning_rate": 8.785273864710383e-05, + "loss": 0.2195, + "step": 29856 + }, + { + "epoch": 2.418745949449125, + "grad_norm": 0.062373608350753784, + "learning_rate": 8.784823799450921e-05, + "loss": 0.216, + "step": 29857 + }, + { + "epoch": 2.4188269604666237, + "grad_norm": 0.05923473834991455, + "learning_rate": 8.784373734191458e-05, + "loss": 0.2179, + "step": 29858 + }, + { + "epoch": 2.418907971484122, + "grad_norm": 0.06392458826303482, + "learning_rate": 8.783923668931996e-05, + "loss": 0.2229, + "step": 29859 + }, + { + "epoch": 2.41898898250162, + "grad_norm": 0.06252685189247131, + "learning_rate": 8.783473603672533e-05, + "loss": 0.2176, + "step": 29860 + }, + { + "epoch": 2.4190699935191184, + "grad_norm": 0.06143161281943321, + "learning_rate": 8.78302353841307e-05, + "loss": 0.2343, + "step": 29861 + }, + { + "epoch": 2.419151004536617, + "grad_norm": 0.06481669098138809, + "learning_rate": 8.782573473153608e-05, + "loss": 0.2392, + "step": 29862 + }, + { + "epoch": 2.4192320155541154, + "grad_norm": 0.07333081215620041, + "learning_rate": 8.782123407894145e-05, + "loss": 0.2721, + "step": 29863 + }, + { + "epoch": 2.4193130265716136, + "grad_norm": 0.06324837356805801, + "learning_rate": 8.781673342634682e-05, + "loss": 0.2464, + "step": 29864 + }, + { + "epoch": 2.4193940375891123, + "grad_norm": 0.07487185299396515, + "learning_rate": 8.78122327737522e-05, + "loss": 0.2536, + "step": 29865 + }, + { + "epoch": 2.4194750486066106, + "grad_norm": 0.06082228943705559, + "learning_rate": 8.780773212115757e-05, + "loss": 0.2173, + "step": 29866 + }, + { + "epoch": 2.419556059624109, + "grad_norm": 0.07401525974273682, + "learning_rate": 8.780323146856294e-05, + "loss": 0.2594, + "step": 29867 + }, + { + "epoch": 2.419637070641607, + "grad_norm": 0.07237707078456879, + "learning_rate": 8.779873081596832e-05, + "loss": 0.2409, + "step": 29868 + }, + { + "epoch": 2.4197180816591057, + "grad_norm": 0.06320221722126007, + "learning_rate": 8.779423016337369e-05, + "loss": 0.2408, + "step": 29869 + }, + { + "epoch": 2.419799092676604, + "grad_norm": 0.07496561110019684, + "learning_rate": 8.778972951077906e-05, + "loss": 0.2456, + "step": 29870 + }, + { + "epoch": 2.4198801036941022, + "grad_norm": 0.07235972583293915, + "learning_rate": 8.778522885818444e-05, + "loss": 0.245, + "step": 29871 + }, + { + "epoch": 2.419961114711601, + "grad_norm": 0.0759521946310997, + "learning_rate": 8.778072820558981e-05, + "loss": 0.2589, + "step": 29872 + }, + { + "epoch": 2.420042125729099, + "grad_norm": 0.07334791123867035, + "learning_rate": 8.77762275529952e-05, + "loss": 0.3019, + "step": 29873 + }, + { + "epoch": 2.4201231367465974, + "grad_norm": 0.06675311177968979, + "learning_rate": 8.777172690040056e-05, + "loss": 0.2646, + "step": 29874 + }, + { + "epoch": 2.420204147764096, + "grad_norm": 0.0692124143242836, + "learning_rate": 8.776722624780593e-05, + "loss": 0.2403, + "step": 29875 + }, + { + "epoch": 2.4202851587815943, + "grad_norm": 0.06959296762943268, + "learning_rate": 8.776272559521132e-05, + "loss": 
0.2203, + "step": 29876 + }, + { + "epoch": 2.4203661697990926, + "grad_norm": 0.07304505258798599, + "learning_rate": 8.775822494261668e-05, + "loss": 0.2476, + "step": 29877 + }, + { + "epoch": 2.4204471808165913, + "grad_norm": 0.07309460639953613, + "learning_rate": 8.775372429002205e-05, + "loss": 0.2791, + "step": 29878 + }, + { + "epoch": 2.4205281918340895, + "grad_norm": 0.06610136479139328, + "learning_rate": 8.774922363742744e-05, + "loss": 0.282, + "step": 29879 + }, + { + "epoch": 2.4206092028515878, + "grad_norm": 0.07905296981334686, + "learning_rate": 8.77447229848328e-05, + "loss": 0.2449, + "step": 29880 + }, + { + "epoch": 2.420690213869086, + "grad_norm": 0.06679189205169678, + "learning_rate": 8.774022233223817e-05, + "loss": 0.2395, + "step": 29881 + }, + { + "epoch": 2.4207712248865847, + "grad_norm": 0.07395980507135391, + "learning_rate": 8.773572167964356e-05, + "loss": 0.3036, + "step": 29882 + }, + { + "epoch": 2.420852235904083, + "grad_norm": 0.07605738937854767, + "learning_rate": 8.773122102704892e-05, + "loss": 0.2484, + "step": 29883 + }, + { + "epoch": 2.420933246921581, + "grad_norm": 0.05466524511575699, + "learning_rate": 8.77267203744543e-05, + "loss": 0.2249, + "step": 29884 + }, + { + "epoch": 2.42101425793908, + "grad_norm": 0.058822643011808395, + "learning_rate": 8.772221972185968e-05, + "loss": 0.2559, + "step": 29885 + }, + { + "epoch": 2.421095268956578, + "grad_norm": 0.07392793893814087, + "learning_rate": 8.771771906926504e-05, + "loss": 0.2545, + "step": 29886 + }, + { + "epoch": 2.4211762799740764, + "grad_norm": 0.05999179184436798, + "learning_rate": 8.771321841667042e-05, + "loss": 0.2357, + "step": 29887 + }, + { + "epoch": 2.4212572909915746, + "grad_norm": 0.06972895562648773, + "learning_rate": 8.77087177640758e-05, + "loss": 0.2264, + "step": 29888 + }, + { + "epoch": 2.4213383020090733, + "grad_norm": 0.06538040190935135, + "learning_rate": 8.770421711148116e-05, + "loss": 0.1909, + "step": 29889 + }, + { + "epoch": 2.4214193130265715, + "grad_norm": 0.05858279764652252, + "learning_rate": 8.769971645888654e-05, + "loss": 0.2409, + "step": 29890 + }, + { + "epoch": 2.42150032404407, + "grad_norm": 0.06701955199241638, + "learning_rate": 8.769521580629192e-05, + "loss": 0.268, + "step": 29891 + }, + { + "epoch": 2.4215813350615685, + "grad_norm": 0.06543616205453873, + "learning_rate": 8.769071515369728e-05, + "loss": 0.2356, + "step": 29892 + }, + { + "epoch": 2.4216623460790667, + "grad_norm": 0.08259650319814682, + "learning_rate": 8.768621450110266e-05, + "loss": 0.2182, + "step": 29893 + }, + { + "epoch": 2.421743357096565, + "grad_norm": 0.067936010658741, + "learning_rate": 8.768171384850804e-05, + "loss": 0.2435, + "step": 29894 + }, + { + "epoch": 2.4218243681140637, + "grad_norm": 0.0737324133515358, + "learning_rate": 8.76772131959134e-05, + "loss": 0.2154, + "step": 29895 + }, + { + "epoch": 2.421905379131562, + "grad_norm": 0.05419429764151573, + "learning_rate": 8.767271254331878e-05, + "loss": 0.2825, + "step": 29896 + }, + { + "epoch": 2.42198639014906, + "grad_norm": 0.07466893643140793, + "learning_rate": 8.766821189072417e-05, + "loss": 0.2347, + "step": 29897 + }, + { + "epoch": 2.422067401166559, + "grad_norm": 0.06078220158815384, + "learning_rate": 8.766371123812953e-05, + "loss": 0.2399, + "step": 29898 + }, + { + "epoch": 2.422148412184057, + "grad_norm": 0.06347761303186417, + "learning_rate": 8.765921058553491e-05, + "loss": 0.2375, + "step": 29899 + }, + { + "epoch": 2.4222294232015553, + "grad_norm": 
0.07001287490129471, + "learning_rate": 8.765470993294029e-05, + "loss": 0.2417, + "step": 29900 + }, + { + "epoch": 2.422310434219054, + "grad_norm": 0.05894186720252037, + "learning_rate": 8.765020928034565e-05, + "loss": 0.2496, + "step": 29901 + }, + { + "epoch": 2.4223914452365523, + "grad_norm": 0.05822170153260231, + "learning_rate": 8.764570862775103e-05, + "loss": 0.2584, + "step": 29902 + }, + { + "epoch": 2.4224724562540505, + "grad_norm": 0.060905635356903076, + "learning_rate": 8.764120797515641e-05, + "loss": 0.2282, + "step": 29903 + }, + { + "epoch": 2.4225534672715487, + "grad_norm": 0.06173749268054962, + "learning_rate": 8.763670732256177e-05, + "loss": 0.2173, + "step": 29904 + }, + { + "epoch": 2.4226344782890474, + "grad_norm": 0.07445482164621353, + "learning_rate": 8.763220666996715e-05, + "loss": 0.2503, + "step": 29905 + }, + { + "epoch": 2.4227154893065457, + "grad_norm": 0.07381244748830795, + "learning_rate": 8.762770601737253e-05, + "loss": 0.2464, + "step": 29906 + }, + { + "epoch": 2.422796500324044, + "grad_norm": 0.06338288635015488, + "learning_rate": 8.762320536477789e-05, + "loss": 0.2549, + "step": 29907 + }, + { + "epoch": 2.4228775113415426, + "grad_norm": 0.0717942863702774, + "learning_rate": 8.761870471218328e-05, + "loss": 0.2664, + "step": 29908 + }, + { + "epoch": 2.422958522359041, + "grad_norm": 0.0670522078871727, + "learning_rate": 8.761420405958865e-05, + "loss": 0.2832, + "step": 29909 + }, + { + "epoch": 2.423039533376539, + "grad_norm": 0.07442935556173325, + "learning_rate": 8.760970340699401e-05, + "loss": 0.2276, + "step": 29910 + }, + { + "epoch": 2.4231205443940373, + "grad_norm": 0.07393119484186172, + "learning_rate": 8.76052027543994e-05, + "loss": 0.244, + "step": 29911 + }, + { + "epoch": 2.423201555411536, + "grad_norm": 0.06407365202903748, + "learning_rate": 8.760070210180477e-05, + "loss": 0.229, + "step": 29912 + }, + { + "epoch": 2.4232825664290343, + "grad_norm": 0.07595523446798325, + "learning_rate": 8.759620144921013e-05, + "loss": 0.2782, + "step": 29913 + }, + { + "epoch": 2.4233635774465325, + "grad_norm": 0.06496904790401459, + "learning_rate": 8.759170079661552e-05, + "loss": 0.2289, + "step": 29914 + }, + { + "epoch": 2.423444588464031, + "grad_norm": 0.07434480637311935, + "learning_rate": 8.758720014402089e-05, + "loss": 0.2359, + "step": 29915 + }, + { + "epoch": 2.4235255994815295, + "grad_norm": 0.07025157660245895, + "learning_rate": 8.758269949142625e-05, + "loss": 0.2308, + "step": 29916 + }, + { + "epoch": 2.4236066104990277, + "grad_norm": 0.07805941253900528, + "learning_rate": 8.757819883883164e-05, + "loss": 0.2323, + "step": 29917 + }, + { + "epoch": 2.4236876215165264, + "grad_norm": 0.06584879755973816, + "learning_rate": 8.757369818623701e-05, + "loss": 0.2444, + "step": 29918 + }, + { + "epoch": 2.4237686325340246, + "grad_norm": 0.0628671944141388, + "learning_rate": 8.756919753364237e-05, + "loss": 0.2402, + "step": 29919 + }, + { + "epoch": 2.423849643551523, + "grad_norm": 0.07205840945243835, + "learning_rate": 8.756469688104776e-05, + "loss": 0.2556, + "step": 29920 + }, + { + "epoch": 2.4239306545690216, + "grad_norm": 0.06968038529157639, + "learning_rate": 8.756019622845313e-05, + "loss": 0.2782, + "step": 29921 + }, + { + "epoch": 2.42401166558652, + "grad_norm": 0.077177993953228, + "learning_rate": 8.755569557585849e-05, + "loss": 0.2601, + "step": 29922 + }, + { + "epoch": 2.424092676604018, + "grad_norm": 0.06498174369335175, + "learning_rate": 8.755119492326388e-05, + "loss": 
0.2192, + "step": 29923 + }, + { + "epoch": 2.4241736876215167, + "grad_norm": 0.06896430999040604, + "learning_rate": 8.754669427066925e-05, + "loss": 0.2517, + "step": 29924 + }, + { + "epoch": 2.424254698639015, + "grad_norm": 0.07223816961050034, + "learning_rate": 8.754219361807463e-05, + "loss": 0.2318, + "step": 29925 + }, + { + "epoch": 2.4243357096565132, + "grad_norm": 0.06818022578954697, + "learning_rate": 8.753769296548e-05, + "loss": 0.2928, + "step": 29926 + }, + { + "epoch": 2.4244167206740115, + "grad_norm": 0.06415341049432755, + "learning_rate": 8.753319231288537e-05, + "loss": 0.2132, + "step": 29927 + }, + { + "epoch": 2.42449773169151, + "grad_norm": 0.07759521901607513, + "learning_rate": 8.752869166029075e-05, + "loss": 0.2623, + "step": 29928 + }, + { + "epoch": 2.4245787427090084, + "grad_norm": 0.06272358447313309, + "learning_rate": 8.752419100769612e-05, + "loss": 0.25, + "step": 29929 + }, + { + "epoch": 2.4246597537265067, + "grad_norm": 0.060498110949993134, + "learning_rate": 8.75196903551015e-05, + "loss": 0.2399, + "step": 29930 + }, + { + "epoch": 2.4247407647440054, + "grad_norm": 0.07703352719545364, + "learning_rate": 8.751518970250687e-05, + "loss": 0.2812, + "step": 29931 + }, + { + "epoch": 2.4248217757615036, + "grad_norm": 0.06761782616376877, + "learning_rate": 8.751068904991224e-05, + "loss": 0.283, + "step": 29932 + }, + { + "epoch": 2.424902786779002, + "grad_norm": 0.06330965459346771, + "learning_rate": 8.750618839731762e-05, + "loss": 0.2472, + "step": 29933 + }, + { + "epoch": 2.4249837977965, + "grad_norm": 0.056882016360759735, + "learning_rate": 8.750168774472299e-05, + "loss": 0.2416, + "step": 29934 + }, + { + "epoch": 2.4250648088139988, + "grad_norm": 0.06899707019329071, + "learning_rate": 8.749718709212836e-05, + "loss": 0.2348, + "step": 29935 + }, + { + "epoch": 2.425145819831497, + "grad_norm": 0.07171101123094559, + "learning_rate": 8.749268643953374e-05, + "loss": 0.2216, + "step": 29936 + }, + { + "epoch": 2.4252268308489953, + "grad_norm": 0.06139335408806801, + "learning_rate": 8.748818578693911e-05, + "loss": 0.2235, + "step": 29937 + }, + { + "epoch": 2.425307841866494, + "grad_norm": 0.07158593833446503, + "learning_rate": 8.748368513434448e-05, + "loss": 0.2672, + "step": 29938 + }, + { + "epoch": 2.425388852883992, + "grad_norm": 0.055078309029340744, + "learning_rate": 8.747918448174986e-05, + "loss": 0.2161, + "step": 29939 + }, + { + "epoch": 2.4254698639014904, + "grad_norm": 0.05247804895043373, + "learning_rate": 8.747468382915523e-05, + "loss": 0.2266, + "step": 29940 + }, + { + "epoch": 2.425550874918989, + "grad_norm": 0.06628144532442093, + "learning_rate": 8.74701831765606e-05, + "loss": 0.2305, + "step": 29941 + }, + { + "epoch": 2.4256318859364874, + "grad_norm": 0.07619509100914001, + "learning_rate": 8.746568252396598e-05, + "loss": 0.2846, + "step": 29942 + }, + { + "epoch": 2.4257128969539856, + "grad_norm": 0.06378401070833206, + "learning_rate": 8.746118187137135e-05, + "loss": 0.2399, + "step": 29943 + }, + { + "epoch": 2.4257939079714843, + "grad_norm": 0.0780458077788353, + "learning_rate": 8.745668121877673e-05, + "loss": 0.266, + "step": 29944 + }, + { + "epoch": 2.4258749189889826, + "grad_norm": 0.06926210969686508, + "learning_rate": 8.74521805661821e-05, + "loss": 0.2581, + "step": 29945 + }, + { + "epoch": 2.425955930006481, + "grad_norm": 0.06663671135902405, + "learning_rate": 8.744767991358747e-05, + "loss": 0.2038, + "step": 29946 + }, + { + "epoch": 2.4260369410239795, + "grad_norm": 
0.06778295338153839, + "learning_rate": 8.744317926099285e-05, + "loss": 0.2474, + "step": 29947 + }, + { + "epoch": 2.4261179520414777, + "grad_norm": 0.08452882617712021, + "learning_rate": 8.743867860839822e-05, + "loss": 0.2713, + "step": 29948 + }, + { + "epoch": 2.426198963058976, + "grad_norm": 0.07399019598960876, + "learning_rate": 8.743417795580359e-05, + "loss": 0.2588, + "step": 29949 + }, + { + "epoch": 2.426279974076474, + "grad_norm": 0.06832026690244675, + "learning_rate": 8.742967730320897e-05, + "loss": 0.245, + "step": 29950 + }, + { + "epoch": 2.426360985093973, + "grad_norm": 0.06566408276557922, + "learning_rate": 8.742517665061435e-05, + "loss": 0.2204, + "step": 29951 + }, + { + "epoch": 2.426441996111471, + "grad_norm": 0.06541307270526886, + "learning_rate": 8.742067599801971e-05, + "loss": 0.308, + "step": 29952 + }, + { + "epoch": 2.4265230071289694, + "grad_norm": 0.07913939654827118, + "learning_rate": 8.741617534542509e-05, + "loss": 0.2746, + "step": 29953 + }, + { + "epoch": 2.426604018146468, + "grad_norm": 0.07623070478439331, + "learning_rate": 8.741167469283047e-05, + "loss": 0.2766, + "step": 29954 + }, + { + "epoch": 2.4266850291639663, + "grad_norm": 0.06193844974040985, + "learning_rate": 8.740717404023583e-05, + "loss": 0.2231, + "step": 29955 + }, + { + "epoch": 2.4267660401814646, + "grad_norm": 0.07710251212120056, + "learning_rate": 8.740267338764121e-05, + "loss": 0.2649, + "step": 29956 + }, + { + "epoch": 2.426847051198963, + "grad_norm": 0.06773968040943146, + "learning_rate": 8.73981727350466e-05, + "loss": 0.2572, + "step": 29957 + }, + { + "epoch": 2.4269280622164615, + "grad_norm": 0.05227114260196686, + "learning_rate": 8.739367208245196e-05, + "loss": 0.2524, + "step": 29958 + }, + { + "epoch": 2.4270090732339598, + "grad_norm": 0.06641446053981781, + "learning_rate": 8.738917142985733e-05, + "loss": 0.2518, + "step": 29959 + }, + { + "epoch": 2.427090084251458, + "grad_norm": 0.07579101622104645, + "learning_rate": 8.738467077726272e-05, + "loss": 0.2628, + "step": 29960 + }, + { + "epoch": 2.4271710952689567, + "grad_norm": 0.057052165269851685, + "learning_rate": 8.738017012466808e-05, + "loss": 0.1979, + "step": 29961 + }, + { + "epoch": 2.427252106286455, + "grad_norm": 0.063960500061512, + "learning_rate": 8.737566947207345e-05, + "loss": 0.2349, + "step": 29962 + }, + { + "epoch": 2.427333117303953, + "grad_norm": 0.056935809552669525, + "learning_rate": 8.737116881947884e-05, + "loss": 0.2307, + "step": 29963 + }, + { + "epoch": 2.427414128321452, + "grad_norm": 0.08629359304904938, + "learning_rate": 8.73666681668842e-05, + "loss": 0.2795, + "step": 29964 + }, + { + "epoch": 2.42749513933895, + "grad_norm": 0.06306321918964386, + "learning_rate": 8.736216751428957e-05, + "loss": 0.2237, + "step": 29965 + }, + { + "epoch": 2.4275761503564484, + "grad_norm": 0.06393168121576309, + "learning_rate": 8.735766686169496e-05, + "loss": 0.2537, + "step": 29966 + }, + { + "epoch": 2.427657161373947, + "grad_norm": 0.06578171998262405, + "learning_rate": 8.735316620910032e-05, + "loss": 0.2718, + "step": 29967 + }, + { + "epoch": 2.4277381723914453, + "grad_norm": 0.07443420588970184, + "learning_rate": 8.734866555650569e-05, + "loss": 0.2344, + "step": 29968 + }, + { + "epoch": 2.4278191834089435, + "grad_norm": 0.07351133972406387, + "learning_rate": 8.734416490391108e-05, + "loss": 0.2356, + "step": 29969 + }, + { + "epoch": 2.4279001944264422, + "grad_norm": 0.07358096539974213, + "learning_rate": 8.733966425131644e-05, + "loss": 
0.2264, + "step": 29970 + }, + { + "epoch": 2.4279812054439405, + "grad_norm": 0.07105661183595657, + "learning_rate": 8.733516359872181e-05, + "loss": 0.2398, + "step": 29971 + }, + { + "epoch": 2.4280622164614387, + "grad_norm": 0.0635269358754158, + "learning_rate": 8.73306629461272e-05, + "loss": 0.2536, + "step": 29972 + }, + { + "epoch": 2.428143227478937, + "grad_norm": 0.07085049897432327, + "learning_rate": 8.732616229353256e-05, + "loss": 0.2625, + "step": 29973 + }, + { + "epoch": 2.4282242384964356, + "grad_norm": 0.06909903883934021, + "learning_rate": 8.732166164093793e-05, + "loss": 0.2326, + "step": 29974 + }, + { + "epoch": 2.428305249513934, + "grad_norm": 0.06062382087111473, + "learning_rate": 8.731716098834332e-05, + "loss": 0.2277, + "step": 29975 + }, + { + "epoch": 2.428386260531432, + "grad_norm": 0.07726830244064331, + "learning_rate": 8.731266033574868e-05, + "loss": 0.248, + "step": 29976 + }, + { + "epoch": 2.428467271548931, + "grad_norm": 0.07095228880643845, + "learning_rate": 8.730815968315405e-05, + "loss": 0.2711, + "step": 29977 + }, + { + "epoch": 2.428548282566429, + "grad_norm": 0.07358419895172119, + "learning_rate": 8.730365903055944e-05, + "loss": 0.2577, + "step": 29978 + }, + { + "epoch": 2.4286292935839273, + "grad_norm": 0.0734160989522934, + "learning_rate": 8.72991583779648e-05, + "loss": 0.2313, + "step": 29979 + }, + { + "epoch": 2.4287103046014256, + "grad_norm": 0.07221377640962601, + "learning_rate": 8.729465772537019e-05, + "loss": 0.2388, + "step": 29980 + }, + { + "epoch": 2.4287913156189243, + "grad_norm": 0.07227525860071182, + "learning_rate": 8.729015707277556e-05, + "loss": 0.2619, + "step": 29981 + }, + { + "epoch": 2.4288723266364225, + "grad_norm": 0.06419933587312698, + "learning_rate": 8.728565642018092e-05, + "loss": 0.2373, + "step": 29982 + }, + { + "epoch": 2.4289533376539207, + "grad_norm": 0.07368076592683792, + "learning_rate": 8.728115576758631e-05, + "loss": 0.2541, + "step": 29983 + }, + { + "epoch": 2.4290343486714194, + "grad_norm": 0.06805281341075897, + "learning_rate": 8.727665511499168e-05, + "loss": 0.2301, + "step": 29984 + }, + { + "epoch": 2.4291153596889177, + "grad_norm": 0.0686119869351387, + "learning_rate": 8.727215446239704e-05, + "loss": 0.2327, + "step": 29985 + }, + { + "epoch": 2.429196370706416, + "grad_norm": 0.073693186044693, + "learning_rate": 8.726765380980243e-05, + "loss": 0.2879, + "step": 29986 + }, + { + "epoch": 2.4292773817239146, + "grad_norm": 0.07163406908512115, + "learning_rate": 8.72631531572078e-05, + "loss": 0.2568, + "step": 29987 + }, + { + "epoch": 2.429358392741413, + "grad_norm": 0.08275744318962097, + "learning_rate": 8.725865250461316e-05, + "loss": 0.2245, + "step": 29988 + }, + { + "epoch": 2.429439403758911, + "grad_norm": 0.06841978430747986, + "learning_rate": 8.725415185201855e-05, + "loss": 0.2633, + "step": 29989 + }, + { + "epoch": 2.42952041477641, + "grad_norm": 0.05534449964761734, + "learning_rate": 8.724965119942392e-05, + "loss": 0.2196, + "step": 29990 + }, + { + "epoch": 2.429601425793908, + "grad_norm": 0.05714098736643791, + "learning_rate": 8.724515054682928e-05, + "loss": 0.2595, + "step": 29991 + }, + { + "epoch": 2.4296824368114063, + "grad_norm": 0.07040819525718689, + "learning_rate": 8.724064989423467e-05, + "loss": 0.2937, + "step": 29992 + }, + { + "epoch": 2.429763447828905, + "grad_norm": 0.05960967391729355, + "learning_rate": 8.723614924164005e-05, + "loss": 0.2393, + "step": 29993 + }, + { + "epoch": 2.429844458846403, + "grad_norm": 
0.05933735892176628, + "learning_rate": 8.72316485890454e-05, + "loss": 0.2062, + "step": 29994 + }, + { + "epoch": 2.4299254698639015, + "grad_norm": 0.06407492607831955, + "learning_rate": 8.722714793645079e-05, + "loss": 0.2333, + "step": 29995 + }, + { + "epoch": 2.4300064808813997, + "grad_norm": 0.0534355528652668, + "learning_rate": 8.722264728385617e-05, + "loss": 0.2148, + "step": 29996 + }, + { + "epoch": 2.4300874918988984, + "grad_norm": 0.06425287574529648, + "learning_rate": 8.721814663126153e-05, + "loss": 0.2585, + "step": 29997 + }, + { + "epoch": 2.4301685029163966, + "grad_norm": 0.0652540922164917, + "learning_rate": 8.721364597866691e-05, + "loss": 0.2127, + "step": 29998 + }, + { + "epoch": 2.430249513933895, + "grad_norm": 0.06726215779781342, + "learning_rate": 8.720914532607229e-05, + "loss": 0.232, + "step": 29999 + }, + { + "epoch": 2.4303305249513936, + "grad_norm": 0.06891316175460815, + "learning_rate": 8.720464467347765e-05, + "loss": 0.2624, + "step": 30000 + }, + { + "epoch": 2.430411535968892, + "grad_norm": 0.0784035250544548, + "learning_rate": 8.720014402088303e-05, + "loss": 0.2833, + "step": 30001 + }, + { + "epoch": 2.43049254698639, + "grad_norm": 0.07090424746274948, + "learning_rate": 8.719564336828841e-05, + "loss": 0.2373, + "step": 30002 + }, + { + "epoch": 2.4305735580038883, + "grad_norm": 0.07066990435123444, + "learning_rate": 8.719114271569377e-05, + "loss": 0.2364, + "step": 30003 + }, + { + "epoch": 2.430654569021387, + "grad_norm": 0.07360489666461945, + "learning_rate": 8.718664206309915e-05, + "loss": 0.2622, + "step": 30004 + }, + { + "epoch": 2.4307355800388852, + "grad_norm": 0.07157068699598312, + "learning_rate": 8.718214141050453e-05, + "loss": 0.251, + "step": 30005 + }, + { + "epoch": 2.4308165910563835, + "grad_norm": 0.06780479848384857, + "learning_rate": 8.71776407579099e-05, + "loss": 0.2558, + "step": 30006 + }, + { + "epoch": 2.430897602073882, + "grad_norm": 0.06623739749193192, + "learning_rate": 8.717314010531528e-05, + "loss": 0.2593, + "step": 30007 + }, + { + "epoch": 2.4309786130913804, + "grad_norm": 0.06277523189783096, + "learning_rate": 8.716863945272065e-05, + "loss": 0.229, + "step": 30008 + }, + { + "epoch": 2.4310596241088787, + "grad_norm": 0.0608900710940361, + "learning_rate": 8.716413880012602e-05, + "loss": 0.2153, + "step": 30009 + }, + { + "epoch": 2.4311406351263773, + "grad_norm": 0.07124532759189606, + "learning_rate": 8.71596381475314e-05, + "loss": 0.2302, + "step": 30010 + }, + { + "epoch": 2.4312216461438756, + "grad_norm": 0.07467015087604523, + "learning_rate": 8.715513749493677e-05, + "loss": 0.2531, + "step": 30011 + }, + { + "epoch": 2.431302657161374, + "grad_norm": 0.06628242880105972, + "learning_rate": 8.715063684234214e-05, + "loss": 0.2381, + "step": 30012 + }, + { + "epoch": 2.4313836681788725, + "grad_norm": 0.07032094150781631, + "learning_rate": 8.714613618974752e-05, + "loss": 0.279, + "step": 30013 + }, + { + "epoch": 2.4314646791963708, + "grad_norm": 0.07590150833129883, + "learning_rate": 8.714163553715289e-05, + "loss": 0.2673, + "step": 30014 + }, + { + "epoch": 2.431545690213869, + "grad_norm": 0.08777482807636261, + "learning_rate": 8.713713488455826e-05, + "loss": 0.2624, + "step": 30015 + }, + { + "epoch": 2.4316267012313677, + "grad_norm": 0.07249364256858826, + "learning_rate": 8.713263423196364e-05, + "loss": 0.3, + "step": 30016 + }, + { + "epoch": 2.431707712248866, + "grad_norm": 0.07197359204292297, + "learning_rate": 8.712813357936901e-05, + "loss": 0.2389, 
+ "step": 30017 + }, + { + "epoch": 2.431788723266364, + "grad_norm": 0.07103599607944489, + "learning_rate": 8.712363292677439e-05, + "loss": 0.2419, + "step": 30018 + }, + { + "epoch": 2.4318697342838624, + "grad_norm": 0.057174891233444214, + "learning_rate": 8.711913227417976e-05, + "loss": 0.2458, + "step": 30019 + }, + { + "epoch": 2.431950745301361, + "grad_norm": 0.08173391968011856, + "learning_rate": 8.711463162158513e-05, + "loss": 0.246, + "step": 30020 + }, + { + "epoch": 2.4320317563188594, + "grad_norm": 0.08192049711942673, + "learning_rate": 8.71101309689905e-05, + "loss": 0.263, + "step": 30021 + }, + { + "epoch": 2.4321127673363576, + "grad_norm": 0.07564476132392883, + "learning_rate": 8.710563031639588e-05, + "loss": 0.2261, + "step": 30022 + }, + { + "epoch": 2.4321937783538563, + "grad_norm": 0.07592171430587769, + "learning_rate": 8.710112966380125e-05, + "loss": 0.287, + "step": 30023 + }, + { + "epoch": 2.4322747893713546, + "grad_norm": 0.05942939221858978, + "learning_rate": 8.709662901120663e-05, + "loss": 0.2677, + "step": 30024 + }, + { + "epoch": 2.432355800388853, + "grad_norm": 0.07345996797084808, + "learning_rate": 8.7092128358612e-05, + "loss": 0.259, + "step": 30025 + }, + { + "epoch": 2.432436811406351, + "grad_norm": 0.0635913833975792, + "learning_rate": 8.708762770601737e-05, + "loss": 0.2466, + "step": 30026 + }, + { + "epoch": 2.4325178224238497, + "grad_norm": 0.06319551914930344, + "learning_rate": 8.708312705342275e-05, + "loss": 0.263, + "step": 30027 + }, + { + "epoch": 2.432598833441348, + "grad_norm": 0.06326335668563843, + "learning_rate": 8.707862640082812e-05, + "loss": 0.2751, + "step": 30028 + }, + { + "epoch": 2.432679844458846, + "grad_norm": 0.061082255095243454, + "learning_rate": 8.70741257482335e-05, + "loss": 0.2286, + "step": 30029 + }, + { + "epoch": 2.432760855476345, + "grad_norm": 0.0597209595143795, + "learning_rate": 8.706962509563887e-05, + "loss": 0.2292, + "step": 30030 + }, + { + "epoch": 2.432841866493843, + "grad_norm": 0.0566350482404232, + "learning_rate": 8.706512444304424e-05, + "loss": 0.262, + "step": 30031 + }, + { + "epoch": 2.4329228775113414, + "grad_norm": 0.06501947343349457, + "learning_rate": 8.706062379044963e-05, + "loss": 0.2317, + "step": 30032 + }, + { + "epoch": 2.43300388852884, + "grad_norm": 0.07468343526124954, + "learning_rate": 8.705612313785499e-05, + "loss": 0.2716, + "step": 30033 + }, + { + "epoch": 2.4330848995463383, + "grad_norm": 0.06104396656155586, + "learning_rate": 8.705162248526036e-05, + "loss": 0.3045, + "step": 30034 + }, + { + "epoch": 2.4331659105638366, + "grad_norm": 0.06956176459789276, + "learning_rate": 8.704712183266575e-05, + "loss": 0.257, + "step": 30035 + }, + { + "epoch": 2.4332469215813353, + "grad_norm": 0.05189737305045128, + "learning_rate": 8.704262118007111e-05, + "loss": 0.2166, + "step": 30036 + }, + { + "epoch": 2.4333279325988335, + "grad_norm": 0.0675140991806984, + "learning_rate": 8.703812052747648e-05, + "loss": 0.2814, + "step": 30037 + }, + { + "epoch": 2.4334089436163318, + "grad_norm": 0.04953811690211296, + "learning_rate": 8.703361987488187e-05, + "loss": 0.2595, + "step": 30038 + }, + { + "epoch": 2.4334899546338304, + "grad_norm": 0.07347162067890167, + "learning_rate": 8.702911922228723e-05, + "loss": 0.2455, + "step": 30039 + }, + { + "epoch": 2.4335709656513287, + "grad_norm": 0.053286418318748474, + "learning_rate": 8.70246185696926e-05, + "loss": 0.2425, + "step": 30040 + }, + { + "epoch": 2.433651976668827, + "grad_norm": 
0.06523050367832184, + "learning_rate": 8.702011791709799e-05, + "loss": 0.2209, + "step": 30041 + }, + { + "epoch": 2.433732987686325, + "grad_norm": 0.05891433730721474, + "learning_rate": 8.701561726450335e-05, + "loss": 0.2575, + "step": 30042 + }, + { + "epoch": 2.433813998703824, + "grad_norm": 0.08087577670812607, + "learning_rate": 8.701111661190873e-05, + "loss": 0.2569, + "step": 30043 + }, + { + "epoch": 2.433895009721322, + "grad_norm": 0.06858525425195694, + "learning_rate": 8.700661595931411e-05, + "loss": 0.2464, + "step": 30044 + }, + { + "epoch": 2.4339760207388204, + "grad_norm": 0.06655458360910416, + "learning_rate": 8.700211530671947e-05, + "loss": 0.2438, + "step": 30045 + }, + { + "epoch": 2.434057031756319, + "grad_norm": 0.0642879530787468, + "learning_rate": 8.699761465412485e-05, + "loss": 0.2355, + "step": 30046 + }, + { + "epoch": 2.4341380427738173, + "grad_norm": 0.05205295979976654, + "learning_rate": 8.699311400153023e-05, + "loss": 0.225, + "step": 30047 + }, + { + "epoch": 2.4342190537913155, + "grad_norm": 0.07050012052059174, + "learning_rate": 8.69886133489356e-05, + "loss": 0.2291, + "step": 30048 + }, + { + "epoch": 2.434300064808814, + "grad_norm": 0.0668644905090332, + "learning_rate": 8.698411269634097e-05, + "loss": 0.2689, + "step": 30049 + }, + { + "epoch": 2.4343810758263125, + "grad_norm": 0.0587209090590477, + "learning_rate": 8.697961204374635e-05, + "loss": 0.244, + "step": 30050 + }, + { + "epoch": 2.4344620868438107, + "grad_norm": 0.07448132336139679, + "learning_rate": 8.697511139115171e-05, + "loss": 0.2395, + "step": 30051 + }, + { + "epoch": 2.434543097861309, + "grad_norm": 0.0627812072634697, + "learning_rate": 8.697061073855709e-05, + "loss": 0.2505, + "step": 30052 + }, + { + "epoch": 2.4346241088788076, + "grad_norm": 0.07397947460412979, + "learning_rate": 8.696611008596247e-05, + "loss": 0.2638, + "step": 30053 + }, + { + "epoch": 2.434705119896306, + "grad_norm": 0.08410845696926117, + "learning_rate": 8.696160943336784e-05, + "loss": 0.3002, + "step": 30054 + }, + { + "epoch": 2.434786130913804, + "grad_norm": 0.05857682228088379, + "learning_rate": 8.695710878077321e-05, + "loss": 0.2265, + "step": 30055 + }, + { + "epoch": 2.434867141931303, + "grad_norm": 0.08480028063058853, + "learning_rate": 8.69526081281786e-05, + "loss": 0.2682, + "step": 30056 + }, + { + "epoch": 2.434948152948801, + "grad_norm": 0.0604703351855278, + "learning_rate": 8.694810747558396e-05, + "loss": 0.2503, + "step": 30057 + }, + { + "epoch": 2.4350291639662993, + "grad_norm": 0.07384306192398071, + "learning_rate": 8.694360682298934e-05, + "loss": 0.2213, + "step": 30058 + }, + { + "epoch": 2.435110174983798, + "grad_norm": 0.06075995787978172, + "learning_rate": 8.693910617039472e-05, + "loss": 0.2487, + "step": 30059 + }, + { + "epoch": 2.4351911860012962, + "grad_norm": 0.06557806581258774, + "learning_rate": 8.693460551780008e-05, + "loss": 0.2346, + "step": 30060 + }, + { + "epoch": 2.4352721970187945, + "grad_norm": 0.0574188195168972, + "learning_rate": 8.693010486520546e-05, + "loss": 0.2619, + "step": 30061 + }, + { + "epoch": 2.435353208036293, + "grad_norm": 0.06319770961999893, + "learning_rate": 8.692560421261084e-05, + "loss": 0.2364, + "step": 30062 + }, + { + "epoch": 2.4354342190537914, + "grad_norm": 0.06702440977096558, + "learning_rate": 8.69211035600162e-05, + "loss": 0.2398, + "step": 30063 + }, + { + "epoch": 2.4355152300712897, + "grad_norm": 0.06472543627023697, + "learning_rate": 8.691660290742158e-05, + "loss": 0.2607, 
+ "step": 30064 + }, + { + "epoch": 2.435596241088788, + "grad_norm": 0.07147222012281418, + "learning_rate": 8.691210225482696e-05, + "loss": 0.2307, + "step": 30065 + }, + { + "epoch": 2.4356772521062866, + "grad_norm": 0.0654640719294548, + "learning_rate": 8.690760160223232e-05, + "loss": 0.2552, + "step": 30066 + }, + { + "epoch": 2.435758263123785, + "grad_norm": 0.06752464920282364, + "learning_rate": 8.69031009496377e-05, + "loss": 0.2487, + "step": 30067 + }, + { + "epoch": 2.435839274141283, + "grad_norm": 0.07302305847406387, + "learning_rate": 8.689860029704308e-05, + "loss": 0.233, + "step": 30068 + }, + { + "epoch": 2.435920285158782, + "grad_norm": 0.06815741211175919, + "learning_rate": 8.689409964444844e-05, + "loss": 0.2269, + "step": 30069 + }, + { + "epoch": 2.43600129617628, + "grad_norm": 0.06292562186717987, + "learning_rate": 8.688959899185383e-05, + "loss": 0.225, + "step": 30070 + }, + { + "epoch": 2.4360823071937783, + "grad_norm": 0.07034356892108917, + "learning_rate": 8.68850983392592e-05, + "loss": 0.2582, + "step": 30071 + }, + { + "epoch": 2.4361633182112765, + "grad_norm": 0.06996940821409225, + "learning_rate": 8.688059768666456e-05, + "loss": 0.2468, + "step": 30072 + }, + { + "epoch": 2.436244329228775, + "grad_norm": 0.08299688994884491, + "learning_rate": 8.687609703406995e-05, + "loss": 0.3314, + "step": 30073 + }, + { + "epoch": 2.4363253402462735, + "grad_norm": 0.06599246710538864, + "learning_rate": 8.687159638147532e-05, + "loss": 0.2411, + "step": 30074 + }, + { + "epoch": 2.4364063512637717, + "grad_norm": 0.06469972431659698, + "learning_rate": 8.686709572888068e-05, + "loss": 0.2836, + "step": 30075 + }, + { + "epoch": 2.4364873622812704, + "grad_norm": 0.069060780107975, + "learning_rate": 8.686259507628607e-05, + "loss": 0.2874, + "step": 30076 + }, + { + "epoch": 2.4365683732987686, + "grad_norm": 0.06383948773145676, + "learning_rate": 8.685809442369144e-05, + "loss": 0.2221, + "step": 30077 + }, + { + "epoch": 2.436649384316267, + "grad_norm": 0.08201758563518524, + "learning_rate": 8.685359377109682e-05, + "loss": 0.2834, + "step": 30078 + }, + { + "epoch": 2.4367303953337656, + "grad_norm": 0.06945295631885529, + "learning_rate": 8.684909311850219e-05, + "loss": 0.2383, + "step": 30079 + }, + { + "epoch": 2.436811406351264, + "grad_norm": 0.06499353796243668, + "learning_rate": 8.684459246590756e-05, + "loss": 0.2813, + "step": 30080 + }, + { + "epoch": 2.436892417368762, + "grad_norm": 0.0747925415635109, + "learning_rate": 8.684009181331294e-05, + "loss": 0.2787, + "step": 30081 + }, + { + "epoch": 2.4369734283862607, + "grad_norm": 0.07593086361885071, + "learning_rate": 8.683559116071831e-05, + "loss": 0.2714, + "step": 30082 + }, + { + "epoch": 2.437054439403759, + "grad_norm": 0.06988240033388138, + "learning_rate": 8.683109050812368e-05, + "loss": 0.2704, + "step": 30083 + }, + { + "epoch": 2.4371354504212572, + "grad_norm": 0.06397940218448639, + "learning_rate": 8.682658985552906e-05, + "loss": 0.2307, + "step": 30084 + }, + { + "epoch": 2.4372164614387555, + "grad_norm": 0.0741710364818573, + "learning_rate": 8.682208920293443e-05, + "loss": 0.2703, + "step": 30085 + }, + { + "epoch": 2.437297472456254, + "grad_norm": 0.05661537125706673, + "learning_rate": 8.68175885503398e-05, + "loss": 0.2515, + "step": 30086 + }, + { + "epoch": 2.4373784834737524, + "grad_norm": 0.070607990026474, + "learning_rate": 8.681308789774518e-05, + "loss": 0.2371, + "step": 30087 + }, + { + "epoch": 2.4374594944912507, + "grad_norm": 
0.07335694134235382, + "learning_rate": 8.680858724515055e-05, + "loss": 0.2893, + "step": 30088 + }, + { + "epoch": 2.4375405055087493, + "grad_norm": 0.05634382739663124, + "learning_rate": 8.680408659255592e-05, + "loss": 0.2443, + "step": 30089 + }, + { + "epoch": 2.4376215165262476, + "grad_norm": 0.06172090023756027, + "learning_rate": 8.67995859399613e-05, + "loss": 0.2265, + "step": 30090 + }, + { + "epoch": 2.437702527543746, + "grad_norm": 0.07646841555833817, + "learning_rate": 8.679508528736667e-05, + "loss": 0.2593, + "step": 30091 + }, + { + "epoch": 2.4377835385612445, + "grad_norm": 0.07664379477500916, + "learning_rate": 8.679058463477205e-05, + "loss": 0.2795, + "step": 30092 + }, + { + "epoch": 2.4378645495787428, + "grad_norm": 0.10637167096138, + "learning_rate": 8.678608398217742e-05, + "loss": 0.3214, + "step": 30093 + }, + { + "epoch": 2.437945560596241, + "grad_norm": 0.0653686448931694, + "learning_rate": 8.678158332958279e-05, + "loss": 0.2566, + "step": 30094 + }, + { + "epoch": 2.4380265716137393, + "grad_norm": 0.06110738590359688, + "learning_rate": 8.677708267698817e-05, + "loss": 0.2642, + "step": 30095 + }, + { + "epoch": 2.438107582631238, + "grad_norm": 0.06815928965806961, + "learning_rate": 8.677258202439354e-05, + "loss": 0.2397, + "step": 30096 + }, + { + "epoch": 2.438188593648736, + "grad_norm": 0.07178793847560883, + "learning_rate": 8.676808137179891e-05, + "loss": 0.2464, + "step": 30097 + }, + { + "epoch": 2.4382696046662344, + "grad_norm": 0.06039717420935631, + "learning_rate": 8.676358071920429e-05, + "loss": 0.2454, + "step": 30098 + }, + { + "epoch": 2.438350615683733, + "grad_norm": 0.06946100294589996, + "learning_rate": 8.675908006660966e-05, + "loss": 0.2279, + "step": 30099 + }, + { + "epoch": 2.4384316267012314, + "grad_norm": 0.04718714952468872, + "learning_rate": 8.675457941401503e-05, + "loss": 0.2418, + "step": 30100 + }, + { + "epoch": 2.4385126377187296, + "grad_norm": 0.06509923934936523, + "learning_rate": 8.675007876142041e-05, + "loss": 0.2436, + "step": 30101 + }, + { + "epoch": 2.4385936487362283, + "grad_norm": 0.06489907205104828, + "learning_rate": 8.674557810882578e-05, + "loss": 0.236, + "step": 30102 + }, + { + "epoch": 2.4386746597537265, + "grad_norm": 0.06927445530891418, + "learning_rate": 8.674107745623116e-05, + "loss": 0.2661, + "step": 30103 + }, + { + "epoch": 2.438755670771225, + "grad_norm": 0.07982465624809265, + "learning_rate": 8.673657680363653e-05, + "loss": 0.2622, + "step": 30104 + }, + { + "epoch": 2.4388366817887235, + "grad_norm": 0.06970755010843277, + "learning_rate": 8.67320761510419e-05, + "loss": 0.2519, + "step": 30105 + }, + { + "epoch": 2.4389176928062217, + "grad_norm": 0.0699467882514, + "learning_rate": 8.672757549844728e-05, + "loss": 0.2591, + "step": 30106 + }, + { + "epoch": 2.43899870382372, + "grad_norm": 0.07506277412176132, + "learning_rate": 8.672307484585265e-05, + "loss": 0.2524, + "step": 30107 + }, + { + "epoch": 2.439079714841218, + "grad_norm": 0.06829731166362762, + "learning_rate": 8.671857419325802e-05, + "loss": 0.2967, + "step": 30108 + }, + { + "epoch": 2.439160725858717, + "grad_norm": 0.05720601603388786, + "learning_rate": 8.67140735406634e-05, + "loss": 0.1938, + "step": 30109 + }, + { + "epoch": 2.439241736876215, + "grad_norm": 0.07455245405435562, + "learning_rate": 8.670957288806878e-05, + "loss": 0.2327, + "step": 30110 + }, + { + "epoch": 2.4393227478937134, + "grad_norm": 0.07823212444782257, + "learning_rate": 8.670507223547414e-05, + "loss": 0.256, + 
"step": 30111 + }, + { + "epoch": 2.439403758911212, + "grad_norm": 0.06638111919164658, + "learning_rate": 8.670057158287952e-05, + "loss": 0.1916, + "step": 30112 + }, + { + "epoch": 2.4394847699287103, + "grad_norm": 0.07046499103307724, + "learning_rate": 8.66960709302849e-05, + "loss": 0.2316, + "step": 30113 + }, + { + "epoch": 2.4395657809462086, + "grad_norm": 0.06909290701150894, + "learning_rate": 8.669157027769026e-05, + "loss": 0.234, + "step": 30114 + }, + { + "epoch": 2.439646791963707, + "grad_norm": 0.0655369684100151, + "learning_rate": 8.668706962509564e-05, + "loss": 0.2252, + "step": 30115 + }, + { + "epoch": 2.4397278029812055, + "grad_norm": 0.06978704035282135, + "learning_rate": 8.668256897250103e-05, + "loss": 0.2338, + "step": 30116 + }, + { + "epoch": 2.4398088139987038, + "grad_norm": 0.07861284911632538, + "learning_rate": 8.667806831990639e-05, + "loss": 0.2589, + "step": 30117 + }, + { + "epoch": 2.439889825016202, + "grad_norm": 0.07819448411464691, + "learning_rate": 8.667356766731176e-05, + "loss": 0.2219, + "step": 30118 + }, + { + "epoch": 2.4399708360337007, + "grad_norm": 0.06307553499937057, + "learning_rate": 8.666906701471715e-05, + "loss": 0.2715, + "step": 30119 + }, + { + "epoch": 2.440051847051199, + "grad_norm": 0.07494818419218063, + "learning_rate": 8.66645663621225e-05, + "loss": 0.2309, + "step": 30120 + }, + { + "epoch": 2.440132858068697, + "grad_norm": 0.06351149082183838, + "learning_rate": 8.666006570952788e-05, + "loss": 0.251, + "step": 30121 + }, + { + "epoch": 2.440213869086196, + "grad_norm": 0.04857697710394859, + "learning_rate": 8.665556505693327e-05, + "loss": 0.2666, + "step": 30122 + }, + { + "epoch": 2.440294880103694, + "grad_norm": 0.06089930236339569, + "learning_rate": 8.665106440433863e-05, + "loss": 0.2118, + "step": 30123 + }, + { + "epoch": 2.4403758911211924, + "grad_norm": 0.061717960983514786, + "learning_rate": 8.6646563751744e-05, + "loss": 0.246, + "step": 30124 + }, + { + "epoch": 2.440456902138691, + "grad_norm": 0.05882474035024643, + "learning_rate": 8.664206309914939e-05, + "loss": 0.2653, + "step": 30125 + }, + { + "epoch": 2.4405379131561893, + "grad_norm": 0.06826498359441757, + "learning_rate": 8.663756244655475e-05, + "loss": 0.2209, + "step": 30126 + }, + { + "epoch": 2.4406189241736875, + "grad_norm": 0.066226065158844, + "learning_rate": 8.663306179396012e-05, + "loss": 0.2289, + "step": 30127 + }, + { + "epoch": 2.440699935191186, + "grad_norm": 0.06775106489658356, + "learning_rate": 8.662856114136551e-05, + "loss": 0.2569, + "step": 30128 + }, + { + "epoch": 2.4407809462086845, + "grad_norm": 0.058902762830257416, + "learning_rate": 8.662406048877087e-05, + "loss": 0.2281, + "step": 30129 + }, + { + "epoch": 2.4408619572261827, + "grad_norm": 0.0739506185054779, + "learning_rate": 8.661955983617624e-05, + "loss": 0.2965, + "step": 30130 + }, + { + "epoch": 2.440942968243681, + "grad_norm": 0.06172848120331764, + "learning_rate": 8.661505918358163e-05, + "loss": 0.2381, + "step": 30131 + }, + { + "epoch": 2.4410239792611796, + "grad_norm": 0.07081998139619827, + "learning_rate": 8.661055853098699e-05, + "loss": 0.2702, + "step": 30132 + }, + { + "epoch": 2.441104990278678, + "grad_norm": 0.07338794320821762, + "learning_rate": 8.660605787839236e-05, + "loss": 0.2521, + "step": 30133 + }, + { + "epoch": 2.441186001296176, + "grad_norm": 0.06638986617326736, + "learning_rate": 8.660155722579775e-05, + "loss": 0.2489, + "step": 30134 + }, + { + "epoch": 2.441267012313675, + "grad_norm": 
0.07379947602748871, + "learning_rate": 8.659705657320311e-05, + "loss": 0.2803, + "step": 30135 + }, + { + "epoch": 2.441348023331173, + "grad_norm": 0.0717678815126419, + "learning_rate": 8.659255592060848e-05, + "loss": 0.2377, + "step": 30136 + }, + { + "epoch": 2.4414290343486713, + "grad_norm": 0.07228371500968933, + "learning_rate": 8.658805526801387e-05, + "loss": 0.2236, + "step": 30137 + }, + { + "epoch": 2.4415100453661696, + "grad_norm": 0.06211762875318527, + "learning_rate": 8.658355461541923e-05, + "loss": 0.1953, + "step": 30138 + }, + { + "epoch": 2.4415910563836682, + "grad_norm": 0.06627662479877472, + "learning_rate": 8.657905396282462e-05, + "loss": 0.2584, + "step": 30139 + }, + { + "epoch": 2.4416720674011665, + "grad_norm": 0.06256860494613647, + "learning_rate": 8.657455331022999e-05, + "loss": 0.2335, + "step": 30140 + }, + { + "epoch": 2.4417530784186647, + "grad_norm": 0.06448493897914886, + "learning_rate": 8.657005265763535e-05, + "loss": 0.2528, + "step": 30141 + }, + { + "epoch": 2.4418340894361634, + "grad_norm": 0.06773277372121811, + "learning_rate": 8.656555200504074e-05, + "loss": 0.2519, + "step": 30142 + }, + { + "epoch": 2.4419151004536617, + "grad_norm": 0.06808852404356003, + "learning_rate": 8.656105135244611e-05, + "loss": 0.2425, + "step": 30143 + }, + { + "epoch": 2.44199611147116, + "grad_norm": 0.07154618948698044, + "learning_rate": 8.655655069985149e-05, + "loss": 0.2675, + "step": 30144 + }, + { + "epoch": 2.4420771224886586, + "grad_norm": 0.05622822791337967, + "learning_rate": 8.655205004725686e-05, + "loss": 0.2169, + "step": 30145 + }, + { + "epoch": 2.442158133506157, + "grad_norm": 0.06128344312310219, + "learning_rate": 8.654754939466223e-05, + "loss": 0.2408, + "step": 30146 + }, + { + "epoch": 2.442239144523655, + "grad_norm": 0.061993323266506195, + "learning_rate": 8.654304874206761e-05, + "loss": 0.2565, + "step": 30147 + }, + { + "epoch": 2.442320155541154, + "grad_norm": 0.06531644612550735, + "learning_rate": 8.653854808947298e-05, + "loss": 0.2509, + "step": 30148 + }, + { + "epoch": 2.442401166558652, + "grad_norm": 0.048911985009908676, + "learning_rate": 8.653404743687835e-05, + "loss": 0.2278, + "step": 30149 + }, + { + "epoch": 2.4424821775761503, + "grad_norm": 0.07343631982803345, + "learning_rate": 8.652954678428373e-05, + "loss": 0.2444, + "step": 30150 + }, + { + "epoch": 2.442563188593649, + "grad_norm": 0.0558130107820034, + "learning_rate": 8.65250461316891e-05, + "loss": 0.2607, + "step": 30151 + }, + { + "epoch": 2.442644199611147, + "grad_norm": 0.07380655407905579, + "learning_rate": 8.652054547909448e-05, + "loss": 0.223, + "step": 30152 + }, + { + "epoch": 2.4427252106286454, + "grad_norm": 0.08603765815496445, + "learning_rate": 8.651604482649985e-05, + "loss": 0.2589, + "step": 30153 + }, + { + "epoch": 2.4428062216461437, + "grad_norm": 0.08148400485515594, + "learning_rate": 8.651154417390522e-05, + "loss": 0.2296, + "step": 30154 + }, + { + "epoch": 2.4428872326636424, + "grad_norm": 0.07049152255058289, + "learning_rate": 8.65070435213106e-05, + "loss": 0.2623, + "step": 30155 + }, + { + "epoch": 2.4429682436811406, + "grad_norm": 0.06398821622133255, + "learning_rate": 8.650254286871597e-05, + "loss": 0.2785, + "step": 30156 + }, + { + "epoch": 2.443049254698639, + "grad_norm": 0.05324990302324295, + "learning_rate": 8.649804221612134e-05, + "loss": 0.2106, + "step": 30157 + }, + { + "epoch": 2.4431302657161376, + "grad_norm": 0.06434676051139832, + "learning_rate": 8.649354156352672e-05, + 
"loss": 0.235, + "step": 30158 + }, + { + "epoch": 2.443211276733636, + "grad_norm": 0.06905324012041092, + "learning_rate": 8.648904091093209e-05, + "loss": 0.3151, + "step": 30159 + }, + { + "epoch": 2.443292287751134, + "grad_norm": 0.06296160072088242, + "learning_rate": 8.648454025833746e-05, + "loss": 0.2356, + "step": 30160 + }, + { + "epoch": 2.4433732987686323, + "grad_norm": 0.07445219904184341, + "learning_rate": 8.648003960574284e-05, + "loss": 0.2383, + "step": 30161 + }, + { + "epoch": 2.443454309786131, + "grad_norm": 0.06814183294773102, + "learning_rate": 8.647553895314821e-05, + "loss": 0.249, + "step": 30162 + }, + { + "epoch": 2.4435353208036292, + "grad_norm": 0.07182428240776062, + "learning_rate": 8.647103830055358e-05, + "loss": 0.2982, + "step": 30163 + }, + { + "epoch": 2.4436163318211275, + "grad_norm": 0.05431900545954704, + "learning_rate": 8.646653764795896e-05, + "loss": 0.2317, + "step": 30164 + }, + { + "epoch": 2.443697342838626, + "grad_norm": 0.06252321600914001, + "learning_rate": 8.646203699536433e-05, + "loss": 0.2462, + "step": 30165 + }, + { + "epoch": 2.4437783538561244, + "grad_norm": 0.05793005973100662, + "learning_rate": 8.64575363427697e-05, + "loss": 0.2459, + "step": 30166 + }, + { + "epoch": 2.4438593648736227, + "grad_norm": 0.06284428387880325, + "learning_rate": 8.645303569017508e-05, + "loss": 0.2341, + "step": 30167 + }, + { + "epoch": 2.4439403758911213, + "grad_norm": 0.06669968366622925, + "learning_rate": 8.644853503758045e-05, + "loss": 0.2574, + "step": 30168 + }, + { + "epoch": 2.4440213869086196, + "grad_norm": 0.06979219615459442, + "learning_rate": 8.644403438498583e-05, + "loss": 0.2617, + "step": 30169 + }, + { + "epoch": 2.444102397926118, + "grad_norm": 0.07064683735370636, + "learning_rate": 8.64395337323912e-05, + "loss": 0.2584, + "step": 30170 + }, + { + "epoch": 2.4441834089436165, + "grad_norm": 0.05474438518285751, + "learning_rate": 8.643503307979657e-05, + "loss": 0.2299, + "step": 30171 + }, + { + "epoch": 2.4442644199611148, + "grad_norm": 0.08040313422679901, + "learning_rate": 8.643053242720195e-05, + "loss": 0.2606, + "step": 30172 + }, + { + "epoch": 2.444345430978613, + "grad_norm": 0.062317855656147, + "learning_rate": 8.642603177460732e-05, + "loss": 0.2458, + "step": 30173 + }, + { + "epoch": 2.4444264419961117, + "grad_norm": 0.060882315039634705, + "learning_rate": 8.64215311220127e-05, + "loss": 0.2343, + "step": 30174 + }, + { + "epoch": 2.44450745301361, + "grad_norm": 0.05670086294412613, + "learning_rate": 8.641703046941807e-05, + "loss": 0.2096, + "step": 30175 + }, + { + "epoch": 2.444588464031108, + "grad_norm": 0.06691998243331909, + "learning_rate": 8.641252981682344e-05, + "loss": 0.2181, + "step": 30176 + }, + { + "epoch": 2.4446694750486064, + "grad_norm": 0.07000657916069031, + "learning_rate": 8.640802916422882e-05, + "loss": 0.2579, + "step": 30177 + }, + { + "epoch": 2.444750486066105, + "grad_norm": 0.06200079247355461, + "learning_rate": 8.640352851163419e-05, + "loss": 0.2235, + "step": 30178 + }, + { + "epoch": 2.4448314970836034, + "grad_norm": 0.08357629179954529, + "learning_rate": 8.639902785903956e-05, + "loss": 0.2811, + "step": 30179 + }, + { + "epoch": 2.4449125081011016, + "grad_norm": 0.07436035573482513, + "learning_rate": 8.639452720644494e-05, + "loss": 0.262, + "step": 30180 + }, + { + "epoch": 2.4449935191186003, + "grad_norm": 0.05238322168588638, + "learning_rate": 8.639002655385031e-05, + "loss": 0.2377, + "step": 30181 + }, + { + "epoch": 2.4450745301360985, + 
"grad_norm": 0.06278589367866516, + "learning_rate": 8.638552590125568e-05, + "loss": 0.2956, + "step": 30182 + }, + { + "epoch": 2.445155541153597, + "grad_norm": 0.0722695142030716, + "learning_rate": 8.638102524866106e-05, + "loss": 0.2302, + "step": 30183 + }, + { + "epoch": 2.445236552171095, + "grad_norm": 0.05430447682738304, + "learning_rate": 8.637652459606643e-05, + "loss": 0.2463, + "step": 30184 + }, + { + "epoch": 2.4453175631885937, + "grad_norm": 0.062322165817022324, + "learning_rate": 8.63720239434718e-05, + "loss": 0.2584, + "step": 30185 + }, + { + "epoch": 2.445398574206092, + "grad_norm": 0.0586535707116127, + "learning_rate": 8.636752329087718e-05, + "loss": 0.2408, + "step": 30186 + }, + { + "epoch": 2.44547958522359, + "grad_norm": 0.05980996787548065, + "learning_rate": 8.636302263828255e-05, + "loss": 0.244, + "step": 30187 + }, + { + "epoch": 2.445560596241089, + "grad_norm": 0.06888397783041, + "learning_rate": 8.635852198568792e-05, + "loss": 0.2519, + "step": 30188 + }, + { + "epoch": 2.445641607258587, + "grad_norm": 0.0636942908167839, + "learning_rate": 8.63540213330933e-05, + "loss": 0.2485, + "step": 30189 + }, + { + "epoch": 2.4457226182760854, + "grad_norm": 0.07013783603906631, + "learning_rate": 8.634952068049867e-05, + "loss": 0.2381, + "step": 30190 + }, + { + "epoch": 2.445803629293584, + "grad_norm": 0.06727758795022964, + "learning_rate": 8.634502002790406e-05, + "loss": 0.2452, + "step": 30191 + }, + { + "epoch": 2.4458846403110823, + "grad_norm": 0.07199087738990784, + "learning_rate": 8.634051937530942e-05, + "loss": 0.2598, + "step": 30192 + }, + { + "epoch": 2.4459656513285806, + "grad_norm": 0.06623843312263489, + "learning_rate": 8.633601872271479e-05, + "loss": 0.237, + "step": 30193 + }, + { + "epoch": 2.4460466623460793, + "grad_norm": 0.05432756245136261, + "learning_rate": 8.633151807012018e-05, + "loss": 0.2641, + "step": 30194 + }, + { + "epoch": 2.4461276733635775, + "grad_norm": 0.083702951669693, + "learning_rate": 8.632701741752554e-05, + "loss": 0.219, + "step": 30195 + }, + { + "epoch": 2.4462086843810757, + "grad_norm": 0.06516814231872559, + "learning_rate": 8.632251676493091e-05, + "loss": 0.219, + "step": 30196 + }, + { + "epoch": 2.4462896953985744, + "grad_norm": 0.0688856914639473, + "learning_rate": 8.63180161123363e-05, + "loss": 0.2319, + "step": 30197 + }, + { + "epoch": 2.4463707064160727, + "grad_norm": 0.05996042490005493, + "learning_rate": 8.631351545974166e-05, + "loss": 0.2344, + "step": 30198 + }, + { + "epoch": 2.446451717433571, + "grad_norm": 0.05216207727789879, + "learning_rate": 8.630901480714703e-05, + "loss": 0.211, + "step": 30199 + }, + { + "epoch": 2.446532728451069, + "grad_norm": 0.0762057974934578, + "learning_rate": 8.630451415455242e-05, + "loss": 0.2654, + "step": 30200 + }, + { + "epoch": 2.446613739468568, + "grad_norm": 0.07425080984830856, + "learning_rate": 8.630001350195778e-05, + "loss": 0.2564, + "step": 30201 + }, + { + "epoch": 2.446694750486066, + "grad_norm": 0.0731613039970398, + "learning_rate": 8.629551284936316e-05, + "loss": 0.263, + "step": 30202 + }, + { + "epoch": 2.4467757615035644, + "grad_norm": 0.06292743235826492, + "learning_rate": 8.629101219676854e-05, + "loss": 0.2536, + "step": 30203 + }, + { + "epoch": 2.446856772521063, + "grad_norm": 0.05863060802221298, + "learning_rate": 8.62865115441739e-05, + "loss": 0.2441, + "step": 30204 + }, + { + "epoch": 2.4469377835385613, + "grad_norm": 0.07905661314725876, + "learning_rate": 8.628201089157928e-05, + "loss": 
0.2976, + "step": 30205 + }, + { + "epoch": 2.4470187945560595, + "grad_norm": 0.07654749602079391, + "learning_rate": 8.627751023898466e-05, + "loss": 0.2482, + "step": 30206 + }, + { + "epoch": 2.4470998055735578, + "grad_norm": 0.06803172081708908, + "learning_rate": 8.627300958639002e-05, + "loss": 0.259, + "step": 30207 + }, + { + "epoch": 2.4471808165910565, + "grad_norm": 0.0701146125793457, + "learning_rate": 8.62685089337954e-05, + "loss": 0.281, + "step": 30208 + }, + { + "epoch": 2.4472618276085547, + "grad_norm": 0.06699921190738678, + "learning_rate": 8.626400828120078e-05, + "loss": 0.2412, + "step": 30209 + }, + { + "epoch": 2.447342838626053, + "grad_norm": 0.05488741770386696, + "learning_rate": 8.625950762860614e-05, + "loss": 0.2227, + "step": 30210 + }, + { + "epoch": 2.4474238496435516, + "grad_norm": 0.06483423709869385, + "learning_rate": 8.625500697601152e-05, + "loss": 0.2543, + "step": 30211 + }, + { + "epoch": 2.44750486066105, + "grad_norm": 0.07855366915464401, + "learning_rate": 8.62505063234169e-05, + "loss": 0.2256, + "step": 30212 + }, + { + "epoch": 2.447585871678548, + "grad_norm": 0.06719925999641418, + "learning_rate": 8.624600567082228e-05, + "loss": 0.2345, + "step": 30213 + }, + { + "epoch": 2.447666882696047, + "grad_norm": 0.07370990514755249, + "learning_rate": 8.624150501822764e-05, + "loss": 0.2294, + "step": 30214 + }, + { + "epoch": 2.447747893713545, + "grad_norm": 0.06388029456138611, + "learning_rate": 8.623700436563303e-05, + "loss": 0.2191, + "step": 30215 + }, + { + "epoch": 2.4478289047310433, + "grad_norm": 0.05369741469621658, + "learning_rate": 8.62325037130384e-05, + "loss": 0.1817, + "step": 30216 + }, + { + "epoch": 2.447909915748542, + "grad_norm": 0.07407825440168381, + "learning_rate": 8.622800306044377e-05, + "loss": 0.2759, + "step": 30217 + }, + { + "epoch": 2.4479909267660402, + "grad_norm": 0.07580500096082687, + "learning_rate": 8.622350240784915e-05, + "loss": 0.2373, + "step": 30218 + }, + { + "epoch": 2.4480719377835385, + "grad_norm": 0.05918845906853676, + "learning_rate": 8.621900175525452e-05, + "loss": 0.253, + "step": 30219 + }, + { + "epoch": 2.448152948801037, + "grad_norm": 0.05765198916196823, + "learning_rate": 8.62145011026599e-05, + "loss": 0.2461, + "step": 30220 + }, + { + "epoch": 2.4482339598185354, + "grad_norm": 0.058827124536037445, + "learning_rate": 8.621000045006527e-05, + "loss": 0.2439, + "step": 30221 + }, + { + "epoch": 2.4483149708360337, + "grad_norm": 0.06166047975420952, + "learning_rate": 8.620549979747064e-05, + "loss": 0.2563, + "step": 30222 + }, + { + "epoch": 2.448395981853532, + "grad_norm": 0.05963588505983353, + "learning_rate": 8.620099914487601e-05, + "loss": 0.2635, + "step": 30223 + }, + { + "epoch": 2.4484769928710306, + "grad_norm": 0.0705462247133255, + "learning_rate": 8.619649849228139e-05, + "loss": 0.2458, + "step": 30224 + }, + { + "epoch": 2.448558003888529, + "grad_norm": 0.06184877082705498, + "learning_rate": 8.619199783968676e-05, + "loss": 0.27, + "step": 30225 + }, + { + "epoch": 2.448639014906027, + "grad_norm": 0.07148279994726181, + "learning_rate": 8.618749718709214e-05, + "loss": 0.2715, + "step": 30226 + }, + { + "epoch": 2.448720025923526, + "grad_norm": 0.08536798506975174, + "learning_rate": 8.618299653449751e-05, + "loss": 0.2776, + "step": 30227 + }, + { + "epoch": 2.448801036941024, + "grad_norm": 0.06759648770093918, + "learning_rate": 8.617849588190288e-05, + "loss": 0.2646, + "step": 30228 + }, + { + "epoch": 2.4488820479585223, + "grad_norm": 
0.06979471445083618, + "learning_rate": 8.617399522930826e-05, + "loss": 0.2466, + "step": 30229 + }, + { + "epoch": 2.4489630589760205, + "grad_norm": 0.06026214361190796, + "learning_rate": 8.616949457671363e-05, + "loss": 0.2331, + "step": 30230 + }, + { + "epoch": 2.449044069993519, + "grad_norm": 0.05924506485462189, + "learning_rate": 8.6164993924119e-05, + "loss": 0.2982, + "step": 30231 + }, + { + "epoch": 2.4491250810110174, + "grad_norm": 0.06539326161146164, + "learning_rate": 8.616049327152438e-05, + "loss": 0.2333, + "step": 30232 + }, + { + "epoch": 2.4492060920285157, + "grad_norm": 0.07669103890657425, + "learning_rate": 8.615599261892975e-05, + "loss": 0.2756, + "step": 30233 + }, + { + "epoch": 2.4492871030460144, + "grad_norm": 0.06633754819631577, + "learning_rate": 8.615149196633512e-05, + "loss": 0.2472, + "step": 30234 + }, + { + "epoch": 2.4493681140635126, + "grad_norm": 0.06898613274097443, + "learning_rate": 8.61469913137405e-05, + "loss": 0.2201, + "step": 30235 + }, + { + "epoch": 2.449449125081011, + "grad_norm": 0.06048338860273361, + "learning_rate": 8.614249066114587e-05, + "loss": 0.2409, + "step": 30236 + }, + { + "epoch": 2.4495301360985096, + "grad_norm": 0.05309195816516876, + "learning_rate": 8.613799000855125e-05, + "loss": 0.235, + "step": 30237 + }, + { + "epoch": 2.449611147116008, + "grad_norm": 0.0583578422665596, + "learning_rate": 8.613348935595662e-05, + "loss": 0.2428, + "step": 30238 + }, + { + "epoch": 2.449692158133506, + "grad_norm": 0.056156620383262634, + "learning_rate": 8.612898870336199e-05, + "loss": 0.253, + "step": 30239 + }, + { + "epoch": 2.4497731691510047, + "grad_norm": 0.06680503487586975, + "learning_rate": 8.612448805076737e-05, + "loss": 0.2071, + "step": 30240 + }, + { + "epoch": 2.449854180168503, + "grad_norm": 0.07724934816360474, + "learning_rate": 8.611998739817274e-05, + "loss": 0.2515, + "step": 30241 + }, + { + "epoch": 2.4499351911860012, + "grad_norm": 0.0604155957698822, + "learning_rate": 8.611548674557811e-05, + "loss": 0.218, + "step": 30242 + }, + { + "epoch": 2.4500162022035, + "grad_norm": 0.06490961462259293, + "learning_rate": 8.611098609298349e-05, + "loss": 0.2586, + "step": 30243 + }, + { + "epoch": 2.450097213220998, + "grad_norm": 0.05839584767818451, + "learning_rate": 8.610648544038886e-05, + "loss": 0.2598, + "step": 30244 + }, + { + "epoch": 2.4501782242384964, + "grad_norm": 0.08250536024570465, + "learning_rate": 8.610198478779423e-05, + "loss": 0.2916, + "step": 30245 + }, + { + "epoch": 2.4502592352559946, + "grad_norm": 0.06101342290639877, + "learning_rate": 8.609748413519961e-05, + "loss": 0.2361, + "step": 30246 + }, + { + "epoch": 2.4503402462734933, + "grad_norm": 0.06912539154291153, + "learning_rate": 8.609298348260498e-05, + "loss": 0.251, + "step": 30247 + }, + { + "epoch": 2.4504212572909916, + "grad_norm": 0.0605338029563427, + "learning_rate": 8.608848283001035e-05, + "loss": 0.2406, + "step": 30248 + }, + { + "epoch": 2.45050226830849, + "grad_norm": 0.06752576678991318, + "learning_rate": 8.608398217741573e-05, + "loss": 0.2248, + "step": 30249 + }, + { + "epoch": 2.4505832793259885, + "grad_norm": 0.07446485757827759, + "learning_rate": 8.60794815248211e-05, + "loss": 0.2459, + "step": 30250 + }, + { + "epoch": 2.4506642903434868, + "grad_norm": 0.07084500789642334, + "learning_rate": 8.607498087222648e-05, + "loss": 0.2724, + "step": 30251 + }, + { + "epoch": 2.450745301360985, + "grad_norm": 0.05843502655625343, + "learning_rate": 8.607048021963185e-05, + "loss": 0.231, 
+ "step": 30252 + }, + { + "epoch": 2.4508263123784833, + "grad_norm": 0.07239796966314316, + "learning_rate": 8.606597956703722e-05, + "loss": 0.2943, + "step": 30253 + }, + { + "epoch": 2.450907323395982, + "grad_norm": 0.05716480687260628, + "learning_rate": 8.60614789144426e-05, + "loss": 0.2787, + "step": 30254 + }, + { + "epoch": 2.45098833441348, + "grad_norm": 0.07082024216651917, + "learning_rate": 8.605697826184797e-05, + "loss": 0.2477, + "step": 30255 + }, + { + "epoch": 2.4510693454309784, + "grad_norm": 0.06596090644598007, + "learning_rate": 8.605247760925334e-05, + "loss": 0.2573, + "step": 30256 + }, + { + "epoch": 2.451150356448477, + "grad_norm": 0.06249532848596573, + "learning_rate": 8.604797695665872e-05, + "loss": 0.2237, + "step": 30257 + }, + { + "epoch": 2.4512313674659754, + "grad_norm": 0.05652106925845146, + "learning_rate": 8.604347630406409e-05, + "loss": 0.2822, + "step": 30258 + }, + { + "epoch": 2.4513123784834736, + "grad_norm": 0.07296479493379593, + "learning_rate": 8.603897565146946e-05, + "loss": 0.2955, + "step": 30259 + }, + { + "epoch": 2.4513933895009723, + "grad_norm": 0.07375498861074448, + "learning_rate": 8.603447499887484e-05, + "loss": 0.1986, + "step": 30260 + }, + { + "epoch": 2.4514744005184705, + "grad_norm": 0.07448673248291016, + "learning_rate": 8.602997434628021e-05, + "loss": 0.2583, + "step": 30261 + }, + { + "epoch": 2.451555411535969, + "grad_norm": 0.061081256717443466, + "learning_rate": 8.602547369368559e-05, + "loss": 0.2175, + "step": 30262 + }, + { + "epoch": 2.4516364225534675, + "grad_norm": 0.08052729070186615, + "learning_rate": 8.602097304109096e-05, + "loss": 0.2595, + "step": 30263 + }, + { + "epoch": 2.4517174335709657, + "grad_norm": 0.058177463710308075, + "learning_rate": 8.601647238849633e-05, + "loss": 0.2324, + "step": 30264 + }, + { + "epoch": 2.451798444588464, + "grad_norm": 0.06863915175199509, + "learning_rate": 8.60119717359017e-05, + "loss": 0.2182, + "step": 30265 + }, + { + "epoch": 2.4518794556059627, + "grad_norm": 0.0763634443283081, + "learning_rate": 8.600747108330708e-05, + "loss": 0.2431, + "step": 30266 + }, + { + "epoch": 2.451960466623461, + "grad_norm": 0.06735827773809433, + "learning_rate": 8.600297043071245e-05, + "loss": 0.2313, + "step": 30267 + }, + { + "epoch": 2.452041477640959, + "grad_norm": 0.06448716670274734, + "learning_rate": 8.599846977811783e-05, + "loss": 0.2683, + "step": 30268 + }, + { + "epoch": 2.4521224886584574, + "grad_norm": 0.07303851842880249, + "learning_rate": 8.599396912552321e-05, + "loss": 0.2663, + "step": 30269 + }, + { + "epoch": 2.452203499675956, + "grad_norm": 0.07002940028905869, + "learning_rate": 8.598946847292857e-05, + "loss": 0.2348, + "step": 30270 + }, + { + "epoch": 2.4522845106934543, + "grad_norm": 0.05738309770822525, + "learning_rate": 8.598496782033395e-05, + "loss": 0.2549, + "step": 30271 + }, + { + "epoch": 2.4523655217109526, + "grad_norm": 0.06919324398040771, + "learning_rate": 8.598046716773933e-05, + "loss": 0.2567, + "step": 30272 + }, + { + "epoch": 2.4524465327284513, + "grad_norm": 0.06908387690782547, + "learning_rate": 8.59759665151447e-05, + "loss": 0.3042, + "step": 30273 + }, + { + "epoch": 2.4525275437459495, + "grad_norm": 0.08584818243980408, + "learning_rate": 8.597146586255007e-05, + "loss": 0.26, + "step": 30274 + }, + { + "epoch": 2.4526085547634477, + "grad_norm": 0.05924270302057266, + "learning_rate": 8.596696520995546e-05, + "loss": 0.2441, + "step": 30275 + }, + { + "epoch": 2.452689565780946, + "grad_norm": 
0.05678810179233551, + "learning_rate": 8.596246455736082e-05, + "loss": 0.2191, + "step": 30276 + }, + { + "epoch": 2.4527705767984447, + "grad_norm": 0.07264811545610428, + "learning_rate": 8.595796390476619e-05, + "loss": 0.2639, + "step": 30277 + }, + { + "epoch": 2.452851587815943, + "grad_norm": 0.07443877309560776, + "learning_rate": 8.595346325217158e-05, + "loss": 0.2369, + "step": 30278 + }, + { + "epoch": 2.452932598833441, + "grad_norm": 0.06062542647123337, + "learning_rate": 8.594896259957694e-05, + "loss": 0.2448, + "step": 30279 + }, + { + "epoch": 2.45301360985094, + "grad_norm": 0.06636027246713638, + "learning_rate": 8.594446194698231e-05, + "loss": 0.2481, + "step": 30280 + }, + { + "epoch": 2.453094620868438, + "grad_norm": 0.06837310642004013, + "learning_rate": 8.59399612943877e-05, + "loss": 0.2091, + "step": 30281 + }, + { + "epoch": 2.4531756318859363, + "grad_norm": 0.061930716037750244, + "learning_rate": 8.593546064179307e-05, + "loss": 0.233, + "step": 30282 + }, + { + "epoch": 2.453256642903435, + "grad_norm": 0.06425345689058304, + "learning_rate": 8.593095998919843e-05, + "loss": 0.2712, + "step": 30283 + }, + { + "epoch": 2.4533376539209333, + "grad_norm": 0.05221385881304741, + "learning_rate": 8.592645933660382e-05, + "loss": 0.1921, + "step": 30284 + }, + { + "epoch": 2.4534186649384315, + "grad_norm": 0.06649923324584961, + "learning_rate": 8.592195868400919e-05, + "loss": 0.2396, + "step": 30285 + }, + { + "epoch": 2.45349967595593, + "grad_norm": 0.05886203050613403, + "learning_rate": 8.591745803141455e-05, + "loss": 0.2474, + "step": 30286 + }, + { + "epoch": 2.4535806869734285, + "grad_norm": 0.08170748502016068, + "learning_rate": 8.591295737881994e-05, + "loss": 0.1982, + "step": 30287 + }, + { + "epoch": 2.4536616979909267, + "grad_norm": 0.06259898841381073, + "learning_rate": 8.590845672622531e-05, + "loss": 0.2768, + "step": 30288 + }, + { + "epoch": 2.4537427090084254, + "grad_norm": 0.05977996066212654, + "learning_rate": 8.590395607363067e-05, + "loss": 0.263, + "step": 30289 + }, + { + "epoch": 2.4538237200259236, + "grad_norm": 0.07319679111242294, + "learning_rate": 8.589945542103606e-05, + "loss": 0.2685, + "step": 30290 + }, + { + "epoch": 2.453904731043422, + "grad_norm": 0.07402021437883377, + "learning_rate": 8.589495476844143e-05, + "loss": 0.2638, + "step": 30291 + }, + { + "epoch": 2.45398574206092, + "grad_norm": 0.0682920292019844, + "learning_rate": 8.58904541158468e-05, + "loss": 0.2361, + "step": 30292 + }, + { + "epoch": 2.454066753078419, + "grad_norm": 0.08681609481573105, + "learning_rate": 8.588595346325218e-05, + "loss": 0.2554, + "step": 30293 + }, + { + "epoch": 2.454147764095917, + "grad_norm": 0.07418433576822281, + "learning_rate": 8.588145281065755e-05, + "loss": 0.2437, + "step": 30294 + }, + { + "epoch": 2.4542287751134153, + "grad_norm": 0.08345085382461548, + "learning_rate": 8.587695215806291e-05, + "loss": 0.2521, + "step": 30295 + }, + { + "epoch": 2.454309786130914, + "grad_norm": 0.06327337771654129, + "learning_rate": 8.58724515054683e-05, + "loss": 0.222, + "step": 30296 + }, + { + "epoch": 2.4543907971484122, + "grad_norm": 0.060608334839344025, + "learning_rate": 8.586795085287367e-05, + "loss": 0.222, + "step": 30297 + }, + { + "epoch": 2.4544718081659105, + "grad_norm": 0.0566270612180233, + "learning_rate": 8.586345020027905e-05, + "loss": 0.2147, + "step": 30298 + }, + { + "epoch": 2.4545528191834087, + "grad_norm": 0.06820667535066605, + "learning_rate": 8.585894954768442e-05, + "loss": 
0.2288, + "step": 30299 + }, + { + "epoch": 2.4546338302009074, + "grad_norm": 0.06401017308235168, + "learning_rate": 8.58544488950898e-05, + "loss": 0.2591, + "step": 30300 + }, + { + "epoch": 2.4547148412184057, + "grad_norm": 0.06722749769687653, + "learning_rate": 8.584994824249517e-05, + "loss": 0.2487, + "step": 30301 + }, + { + "epoch": 2.454795852235904, + "grad_norm": 0.07767395675182343, + "learning_rate": 8.584544758990054e-05, + "loss": 0.2766, + "step": 30302 + }, + { + "epoch": 2.4548768632534026, + "grad_norm": 0.06633684784173965, + "learning_rate": 8.584094693730592e-05, + "loss": 0.2748, + "step": 30303 + }, + { + "epoch": 2.454957874270901, + "grad_norm": 0.05974660813808441, + "learning_rate": 8.583644628471129e-05, + "loss": 0.2275, + "step": 30304 + }, + { + "epoch": 2.455038885288399, + "grad_norm": 0.07270497828722, + "learning_rate": 8.583194563211666e-05, + "loss": 0.2186, + "step": 30305 + }, + { + "epoch": 2.4551198963058978, + "grad_norm": 0.09797348827123642, + "learning_rate": 8.582744497952204e-05, + "loss": 0.2735, + "step": 30306 + }, + { + "epoch": 2.455200907323396, + "grad_norm": 0.07703609019517899, + "learning_rate": 8.582294432692741e-05, + "loss": 0.2192, + "step": 30307 + }, + { + "epoch": 2.4552819183408943, + "grad_norm": 0.06129564717411995, + "learning_rate": 8.581844367433278e-05, + "loss": 0.2369, + "step": 30308 + }, + { + "epoch": 2.455362929358393, + "grad_norm": 0.06443587690591812, + "learning_rate": 8.581394302173816e-05, + "loss": 0.2501, + "step": 30309 + }, + { + "epoch": 2.455443940375891, + "grad_norm": 0.07890526205301285, + "learning_rate": 8.580944236914353e-05, + "loss": 0.2453, + "step": 30310 + }, + { + "epoch": 2.4555249513933894, + "grad_norm": 0.06493376940488815, + "learning_rate": 8.58049417165489e-05, + "loss": 0.2391, + "step": 30311 + }, + { + "epoch": 2.4556059624108877, + "grad_norm": 0.06649234890937805, + "learning_rate": 8.580044106395428e-05, + "loss": 0.2606, + "step": 30312 + }, + { + "epoch": 2.4556869734283864, + "grad_norm": 0.05301273614168167, + "learning_rate": 8.579594041135965e-05, + "loss": 0.2416, + "step": 30313 + }, + { + "epoch": 2.4557679844458846, + "grad_norm": 0.07830186933279037, + "learning_rate": 8.579143975876503e-05, + "loss": 0.2389, + "step": 30314 + }, + { + "epoch": 2.455848995463383, + "grad_norm": 0.07225922495126724, + "learning_rate": 8.57869391061704e-05, + "loss": 0.2436, + "step": 30315 + }, + { + "epoch": 2.4559300064808816, + "grad_norm": 0.062381599098443985, + "learning_rate": 8.578243845357577e-05, + "loss": 0.2313, + "step": 30316 + }, + { + "epoch": 2.45601101749838, + "grad_norm": 0.06470402330160141, + "learning_rate": 8.577793780098115e-05, + "loss": 0.2578, + "step": 30317 + }, + { + "epoch": 2.456092028515878, + "grad_norm": 0.06753360480070114, + "learning_rate": 8.577343714838652e-05, + "loss": 0.2984, + "step": 30318 + }, + { + "epoch": 2.4561730395333763, + "grad_norm": 0.051338646560907364, + "learning_rate": 8.57689364957919e-05, + "loss": 0.2128, + "step": 30319 + }, + { + "epoch": 2.456254050550875, + "grad_norm": 0.0592113696038723, + "learning_rate": 8.576443584319727e-05, + "loss": 0.236, + "step": 30320 + }, + { + "epoch": 2.4563350615683732, + "grad_norm": 0.05667173117399216, + "learning_rate": 8.575993519060264e-05, + "loss": 0.2514, + "step": 30321 + }, + { + "epoch": 2.4564160725858715, + "grad_norm": 0.07165750861167908, + "learning_rate": 8.575543453800801e-05, + "loss": 0.2425, + "step": 30322 + }, + { + "epoch": 2.45649708360337, + "grad_norm": 
0.06985411047935486, + "learning_rate": 8.575093388541339e-05, + "loss": 0.29, + "step": 30323 + }, + { + "epoch": 2.4565780946208684, + "grad_norm": 0.06525350362062454, + "learning_rate": 8.574643323281876e-05, + "loss": 0.2371, + "step": 30324 + }, + { + "epoch": 2.4566591056383666, + "grad_norm": 0.0659409761428833, + "learning_rate": 8.574193258022414e-05, + "loss": 0.2527, + "step": 30325 + }, + { + "epoch": 2.4567401166558653, + "grad_norm": 0.055850714445114136, + "learning_rate": 8.573743192762951e-05, + "loss": 0.2211, + "step": 30326 + }, + { + "epoch": 2.4568211276733636, + "grad_norm": 0.05591071397066116, + "learning_rate": 8.573293127503488e-05, + "loss": 0.2212, + "step": 30327 + }, + { + "epoch": 2.456902138690862, + "grad_norm": 0.06260688602924347, + "learning_rate": 8.572843062244026e-05, + "loss": 0.2319, + "step": 30328 + }, + { + "epoch": 2.4569831497083605, + "grad_norm": 0.07620882242918015, + "learning_rate": 8.572392996984563e-05, + "loss": 0.3032, + "step": 30329 + }, + { + "epoch": 2.4570641607258588, + "grad_norm": 0.0697203055024147, + "learning_rate": 8.5719429317251e-05, + "loss": 0.2622, + "step": 30330 + }, + { + "epoch": 2.457145171743357, + "grad_norm": 0.05793391913175583, + "learning_rate": 8.571492866465638e-05, + "loss": 0.2152, + "step": 30331 + }, + { + "epoch": 2.4572261827608557, + "grad_norm": 0.07787619531154633, + "learning_rate": 8.571042801206175e-05, + "loss": 0.2662, + "step": 30332 + }, + { + "epoch": 2.457307193778354, + "grad_norm": 0.0645591989159584, + "learning_rate": 8.570592735946712e-05, + "loss": 0.232, + "step": 30333 + }, + { + "epoch": 2.457388204795852, + "grad_norm": 0.07003186643123627, + "learning_rate": 8.57014267068725e-05, + "loss": 0.2611, + "step": 30334 + }, + { + "epoch": 2.4574692158133504, + "grad_norm": 0.0585886612534523, + "learning_rate": 8.569692605427787e-05, + "loss": 0.2741, + "step": 30335 + }, + { + "epoch": 2.457550226830849, + "grad_norm": 0.059686798602342606, + "learning_rate": 8.569242540168325e-05, + "loss": 0.1979, + "step": 30336 + }, + { + "epoch": 2.4576312378483474, + "grad_norm": 0.07068637758493423, + "learning_rate": 8.568792474908862e-05, + "loss": 0.2559, + "step": 30337 + }, + { + "epoch": 2.4577122488658456, + "grad_norm": 0.07276032119989395, + "learning_rate": 8.568342409649399e-05, + "loss": 0.2432, + "step": 30338 + }, + { + "epoch": 2.4577932598833443, + "grad_norm": 0.06920722872018814, + "learning_rate": 8.567892344389937e-05, + "loss": 0.2193, + "step": 30339 + }, + { + "epoch": 2.4578742709008425, + "grad_norm": 0.08199632167816162, + "learning_rate": 8.567442279130474e-05, + "loss": 0.275, + "step": 30340 + }, + { + "epoch": 2.457955281918341, + "grad_norm": 0.06501300632953644, + "learning_rate": 8.566992213871011e-05, + "loss": 0.2413, + "step": 30341 + }, + { + "epoch": 2.458036292935839, + "grad_norm": 0.07494282722473145, + "learning_rate": 8.566542148611549e-05, + "loss": 0.277, + "step": 30342 + }, + { + "epoch": 2.4581173039533377, + "grad_norm": 0.07412391155958176, + "learning_rate": 8.566092083352086e-05, + "loss": 0.253, + "step": 30343 + }, + { + "epoch": 2.458198314970836, + "grad_norm": 0.07492408156394958, + "learning_rate": 8.565642018092623e-05, + "loss": 0.2307, + "step": 30344 + }, + { + "epoch": 2.458279325988334, + "grad_norm": 0.05356239527463913, + "learning_rate": 8.565191952833161e-05, + "loss": 0.2513, + "step": 30345 + }, + { + "epoch": 2.458360337005833, + "grad_norm": 0.06325045973062515, + "learning_rate": 8.564741887573698e-05, + "loss": 
0.2453, + "step": 30346 + }, + { + "epoch": 2.458441348023331, + "grad_norm": 0.06481393426656723, + "learning_rate": 8.564291822314236e-05, + "loss": 0.2395, + "step": 30347 + }, + { + "epoch": 2.4585223590408294, + "grad_norm": 0.06986995786428452, + "learning_rate": 8.563841757054773e-05, + "loss": 0.268, + "step": 30348 + }, + { + "epoch": 2.458603370058328, + "grad_norm": 0.07024464011192322, + "learning_rate": 8.56339169179531e-05, + "loss": 0.2651, + "step": 30349 + }, + { + "epoch": 2.4586843810758263, + "grad_norm": 0.0805504098534584, + "learning_rate": 8.562941626535849e-05, + "loss": 0.2235, + "step": 30350 + }, + { + "epoch": 2.4587653920933246, + "grad_norm": 0.07116348296403885, + "learning_rate": 8.562491561276386e-05, + "loss": 0.2771, + "step": 30351 + }, + { + "epoch": 2.4588464031108233, + "grad_norm": 0.06739845871925354, + "learning_rate": 8.562041496016922e-05, + "loss": 0.2338, + "step": 30352 + }, + { + "epoch": 2.4589274141283215, + "grad_norm": 0.08376084268093109, + "learning_rate": 8.561591430757461e-05, + "loss": 0.2715, + "step": 30353 + }, + { + "epoch": 2.4590084251458197, + "grad_norm": 0.06276308000087738, + "learning_rate": 8.561141365497998e-05, + "loss": 0.2489, + "step": 30354 + }, + { + "epoch": 2.4590894361633184, + "grad_norm": 0.07075909525156021, + "learning_rate": 8.560691300238534e-05, + "loss": 0.2909, + "step": 30355 + }, + { + "epoch": 2.4591704471808167, + "grad_norm": 0.061290279030799866, + "learning_rate": 8.560241234979073e-05, + "loss": 0.2094, + "step": 30356 + }, + { + "epoch": 2.459251458198315, + "grad_norm": 0.05792885273694992, + "learning_rate": 8.55979116971961e-05, + "loss": 0.2233, + "step": 30357 + }, + { + "epoch": 2.459332469215813, + "grad_norm": 0.07260933518409729, + "learning_rate": 8.559341104460146e-05, + "loss": 0.2944, + "step": 30358 + }, + { + "epoch": 2.459413480233312, + "grad_norm": 0.06975215673446655, + "learning_rate": 8.558891039200685e-05, + "loss": 0.2188, + "step": 30359 + }, + { + "epoch": 2.45949449125081, + "grad_norm": 0.07033288478851318, + "learning_rate": 8.558440973941223e-05, + "loss": 0.2467, + "step": 30360 + }, + { + "epoch": 2.4595755022683083, + "grad_norm": 0.07207024097442627, + "learning_rate": 8.557990908681759e-05, + "loss": 0.2752, + "step": 30361 + }, + { + "epoch": 2.459656513285807, + "grad_norm": 0.0666518434882164, + "learning_rate": 8.557540843422297e-05, + "loss": 0.2163, + "step": 30362 + }, + { + "epoch": 2.4597375243033053, + "grad_norm": 0.0729021430015564, + "learning_rate": 8.557090778162835e-05, + "loss": 0.2594, + "step": 30363 + }, + { + "epoch": 2.4598185353208035, + "grad_norm": 0.0761818215250969, + "learning_rate": 8.55664071290337e-05, + "loss": 0.2738, + "step": 30364 + }, + { + "epoch": 2.4598995463383018, + "grad_norm": 0.07403729110956192, + "learning_rate": 8.55619064764391e-05, + "loss": 0.2168, + "step": 30365 + }, + { + "epoch": 2.4599805573558005, + "grad_norm": 0.06657052785158157, + "learning_rate": 8.555740582384447e-05, + "loss": 0.2402, + "step": 30366 + }, + { + "epoch": 2.4600615683732987, + "grad_norm": 0.07630985975265503, + "learning_rate": 8.555290517124983e-05, + "loss": 0.2508, + "step": 30367 + }, + { + "epoch": 2.460142579390797, + "grad_norm": 0.06758099794387817, + "learning_rate": 8.554840451865521e-05, + "loss": 0.2263, + "step": 30368 + }, + { + "epoch": 2.4602235904082956, + "grad_norm": 0.06532974541187286, + "learning_rate": 8.554390386606059e-05, + "loss": 0.2811, + "step": 30369 + }, + { + "epoch": 2.460304601425794, + 
"grad_norm": 0.06683867424726486, + "learning_rate": 8.553940321346595e-05, + "loss": 0.2445, + "step": 30370 + }, + { + "epoch": 2.460385612443292, + "grad_norm": 0.06784668564796448, + "learning_rate": 8.553490256087134e-05, + "loss": 0.2371, + "step": 30371 + }, + { + "epoch": 2.460466623460791, + "grad_norm": 0.07643197476863861, + "learning_rate": 8.553040190827671e-05, + "loss": 0.2522, + "step": 30372 + }, + { + "epoch": 2.460547634478289, + "grad_norm": 0.08601364493370056, + "learning_rate": 8.552590125568207e-05, + "loss": 0.2702, + "step": 30373 + }, + { + "epoch": 2.4606286454957873, + "grad_norm": 0.06803518533706665, + "learning_rate": 8.552140060308746e-05, + "loss": 0.2846, + "step": 30374 + }, + { + "epoch": 2.460709656513286, + "grad_norm": 0.07338423281908035, + "learning_rate": 8.551689995049283e-05, + "loss": 0.2245, + "step": 30375 + }, + { + "epoch": 2.4607906675307842, + "grad_norm": 0.05104734003543854, + "learning_rate": 8.55123992978982e-05, + "loss": 0.2611, + "step": 30376 + }, + { + "epoch": 2.4608716785482825, + "grad_norm": 0.07619819790124893, + "learning_rate": 8.550789864530358e-05, + "loss": 0.2321, + "step": 30377 + }, + { + "epoch": 2.460952689565781, + "grad_norm": 0.06153459474444389, + "learning_rate": 8.550339799270895e-05, + "loss": 0.2389, + "step": 30378 + }, + { + "epoch": 2.4610337005832794, + "grad_norm": 0.0647321343421936, + "learning_rate": 8.549889734011432e-05, + "loss": 0.2187, + "step": 30379 + }, + { + "epoch": 2.4611147116007777, + "grad_norm": 0.06784353405237198, + "learning_rate": 8.54943966875197e-05, + "loss": 0.2167, + "step": 30380 + }, + { + "epoch": 2.461195722618276, + "grad_norm": 0.06753828376531601, + "learning_rate": 8.548989603492507e-05, + "loss": 0.2649, + "step": 30381 + }, + { + "epoch": 2.4612767336357746, + "grad_norm": 0.07720842957496643, + "learning_rate": 8.548539538233044e-05, + "loss": 0.2522, + "step": 30382 + }, + { + "epoch": 2.461357744653273, + "grad_norm": 0.07862772792577744, + "learning_rate": 8.548089472973582e-05, + "loss": 0.2818, + "step": 30383 + }, + { + "epoch": 2.461438755670771, + "grad_norm": 0.08115944266319275, + "learning_rate": 8.547639407714119e-05, + "loss": 0.2641, + "step": 30384 + }, + { + "epoch": 2.4615197666882698, + "grad_norm": 0.07155396789312363, + "learning_rate": 8.547189342454657e-05, + "loss": 0.2993, + "step": 30385 + }, + { + "epoch": 2.461600777705768, + "grad_norm": 0.07185786217451096, + "learning_rate": 8.546739277195194e-05, + "loss": 0.2796, + "step": 30386 + }, + { + "epoch": 2.4616817887232663, + "grad_norm": 0.06836527585983276, + "learning_rate": 8.546289211935731e-05, + "loss": 0.2409, + "step": 30387 + }, + { + "epoch": 2.4617627997407645, + "grad_norm": 0.07136380672454834, + "learning_rate": 8.545839146676269e-05, + "loss": 0.25, + "step": 30388 + }, + { + "epoch": 2.461843810758263, + "grad_norm": 0.07146602123975754, + "learning_rate": 8.545389081416806e-05, + "loss": 0.2616, + "step": 30389 + }, + { + "epoch": 2.4619248217757614, + "grad_norm": 0.06611467897891998, + "learning_rate": 8.544939016157343e-05, + "loss": 0.2365, + "step": 30390 + }, + { + "epoch": 2.4620058327932597, + "grad_norm": 0.06571775674819946, + "learning_rate": 8.544488950897881e-05, + "loss": 0.2619, + "step": 30391 + }, + { + "epoch": 2.4620868438107584, + "grad_norm": 0.06344595551490784, + "learning_rate": 8.544038885638418e-05, + "loss": 0.2376, + "step": 30392 + }, + { + "epoch": 2.4621678548282566, + "grad_norm": 0.06370608508586884, + "learning_rate": 
8.543588820378955e-05, + "loss": 0.215, + "step": 30393 + }, + { + "epoch": 2.462248865845755, + "grad_norm": 0.07206301391124725, + "learning_rate": 8.543138755119493e-05, + "loss": 0.2498, + "step": 30394 + }, + { + "epoch": 2.4623298768632536, + "grad_norm": 0.06672500818967819, + "learning_rate": 8.54268868986003e-05, + "loss": 0.1975, + "step": 30395 + }, + { + "epoch": 2.462410887880752, + "grad_norm": 0.05213450640439987, + "learning_rate": 8.542238624600568e-05, + "loss": 0.2409, + "step": 30396 + }, + { + "epoch": 2.46249189889825, + "grad_norm": 0.07119280099868774, + "learning_rate": 8.541788559341105e-05, + "loss": 0.2497, + "step": 30397 + }, + { + "epoch": 2.4625729099157487, + "grad_norm": 0.06520720571279526, + "learning_rate": 8.541338494081642e-05, + "loss": 0.2301, + "step": 30398 + }, + { + "epoch": 2.462653920933247, + "grad_norm": 0.08692757040262222, + "learning_rate": 8.54088842882218e-05, + "loss": 0.2663, + "step": 30399 + }, + { + "epoch": 2.462734931950745, + "grad_norm": 0.07218527793884277, + "learning_rate": 8.540438363562717e-05, + "loss": 0.241, + "step": 30400 + }, + { + "epoch": 2.462815942968244, + "grad_norm": 0.07891545444726944, + "learning_rate": 8.539988298303254e-05, + "loss": 0.242, + "step": 30401 + }, + { + "epoch": 2.462896953985742, + "grad_norm": 0.07669158279895782, + "learning_rate": 8.539538233043792e-05, + "loss": 0.2612, + "step": 30402 + }, + { + "epoch": 2.4629779650032404, + "grad_norm": 0.06553692370653152, + "learning_rate": 8.539088167784329e-05, + "loss": 0.2627, + "step": 30403 + }, + { + "epoch": 2.4630589760207386, + "grad_norm": 0.07745185494422913, + "learning_rate": 8.538638102524866e-05, + "loss": 0.2248, + "step": 30404 + }, + { + "epoch": 2.4631399870382373, + "grad_norm": 0.07231428474187851, + "learning_rate": 8.538188037265404e-05, + "loss": 0.2218, + "step": 30405 + }, + { + "epoch": 2.4632209980557356, + "grad_norm": 0.06869354099035263, + "learning_rate": 8.537737972005941e-05, + "loss": 0.2896, + "step": 30406 + }, + { + "epoch": 2.463302009073234, + "grad_norm": 0.07032286375761032, + "learning_rate": 8.537287906746478e-05, + "loss": 0.2081, + "step": 30407 + }, + { + "epoch": 2.4633830200907325, + "grad_norm": 0.06324692070484161, + "learning_rate": 8.536837841487016e-05, + "loss": 0.2233, + "step": 30408 + }, + { + "epoch": 2.4634640311082308, + "grad_norm": 0.07106756418943405, + "learning_rate": 8.536387776227553e-05, + "loss": 0.2105, + "step": 30409 + }, + { + "epoch": 2.463545042125729, + "grad_norm": 0.05563431605696678, + "learning_rate": 8.53593771096809e-05, + "loss": 0.1939, + "step": 30410 + }, + { + "epoch": 2.4636260531432272, + "grad_norm": 0.06630561500787735, + "learning_rate": 8.535487645708628e-05, + "loss": 0.2427, + "step": 30411 + }, + { + "epoch": 2.463707064160726, + "grad_norm": 0.06674076616764069, + "learning_rate": 8.535037580449165e-05, + "loss": 0.2506, + "step": 30412 + }, + { + "epoch": 2.463788075178224, + "grad_norm": 0.07224856317043304, + "learning_rate": 8.534587515189703e-05, + "loss": 0.282, + "step": 30413 + }, + { + "epoch": 2.4638690861957224, + "grad_norm": 0.05965695530176163, + "learning_rate": 8.53413744993024e-05, + "loss": 0.2581, + "step": 30414 + }, + { + "epoch": 2.463950097213221, + "grad_norm": 0.057730019092559814, + "learning_rate": 8.533687384670777e-05, + "loss": 0.2325, + "step": 30415 + }, + { + "epoch": 2.4640311082307194, + "grad_norm": 0.0637093186378479, + "learning_rate": 8.533237319411315e-05, + "loss": 0.2286, + "step": 30416 + }, + { + "epoch": 
2.4641121192482176, + "grad_norm": 0.06882121413946152, + "learning_rate": 8.532787254151853e-05, + "loss": 0.2783, + "step": 30417 + }, + { + "epoch": 2.4641931302657163, + "grad_norm": 0.07381433993577957, + "learning_rate": 8.53233718889239e-05, + "loss": 0.2349, + "step": 30418 + }, + { + "epoch": 2.4642741412832145, + "grad_norm": 0.058938298374414444, + "learning_rate": 8.531887123632927e-05, + "loss": 0.2514, + "step": 30419 + }, + { + "epoch": 2.464355152300713, + "grad_norm": 0.07231079787015915, + "learning_rate": 8.531437058373466e-05, + "loss": 0.2605, + "step": 30420 + }, + { + "epoch": 2.4644361633182115, + "grad_norm": 0.06277959793806076, + "learning_rate": 8.530986993114002e-05, + "loss": 0.2499, + "step": 30421 + }, + { + "epoch": 2.4645171743357097, + "grad_norm": 0.07568095624446869, + "learning_rate": 8.530536927854539e-05, + "loss": 0.2548, + "step": 30422 + }, + { + "epoch": 2.464598185353208, + "grad_norm": 0.06656137108802795, + "learning_rate": 8.530086862595078e-05, + "loss": 0.2307, + "step": 30423 + }, + { + "epoch": 2.4646791963707066, + "grad_norm": 0.05642401799559593, + "learning_rate": 8.529636797335614e-05, + "loss": 0.228, + "step": 30424 + }, + { + "epoch": 2.464760207388205, + "grad_norm": 0.07163457572460175, + "learning_rate": 8.529186732076151e-05, + "loss": 0.2378, + "step": 30425 + }, + { + "epoch": 2.464841218405703, + "grad_norm": 0.06835125386714935, + "learning_rate": 8.52873666681669e-05, + "loss": 0.2362, + "step": 30426 + }, + { + "epoch": 2.4649222294232014, + "grad_norm": 0.07255390286445618, + "learning_rate": 8.528286601557226e-05, + "loss": 0.2377, + "step": 30427 + }, + { + "epoch": 2.4650032404407, + "grad_norm": 0.07115132361650467, + "learning_rate": 8.527836536297764e-05, + "loss": 0.2631, + "step": 30428 + }, + { + "epoch": 2.4650842514581983, + "grad_norm": 0.05883652716875076, + "learning_rate": 8.527386471038302e-05, + "loss": 0.2244, + "step": 30429 + }, + { + "epoch": 2.4651652624756966, + "grad_norm": 0.08018367737531662, + "learning_rate": 8.526936405778838e-05, + "loss": 0.2834, + "step": 30430 + }, + { + "epoch": 2.4652462734931953, + "grad_norm": 0.07445010542869568, + "learning_rate": 8.526486340519376e-05, + "loss": 0.2497, + "step": 30431 + }, + { + "epoch": 2.4653272845106935, + "grad_norm": 0.07131835073232651, + "learning_rate": 8.526036275259914e-05, + "loss": 0.2285, + "step": 30432 + }, + { + "epoch": 2.4654082955281917, + "grad_norm": 0.07377203553915024, + "learning_rate": 8.52558621000045e-05, + "loss": 0.2694, + "step": 30433 + }, + { + "epoch": 2.46548930654569, + "grad_norm": 0.059419889003038406, + "learning_rate": 8.525136144740989e-05, + "loss": 0.2168, + "step": 30434 + }, + { + "epoch": 2.4655703175631887, + "grad_norm": 0.052533701062202454, + "learning_rate": 8.524686079481526e-05, + "loss": 0.2151, + "step": 30435 + }, + { + "epoch": 2.465651328580687, + "grad_norm": 0.0678594559431076, + "learning_rate": 8.524236014222062e-05, + "loss": 0.2407, + "step": 30436 + }, + { + "epoch": 2.465732339598185, + "grad_norm": 0.07487330585718155, + "learning_rate": 8.5237859489626e-05, + "loss": 0.2665, + "step": 30437 + }, + { + "epoch": 2.465813350615684, + "grad_norm": 0.06742486357688904, + "learning_rate": 8.523335883703138e-05, + "loss": 0.2655, + "step": 30438 + }, + { + "epoch": 2.465894361633182, + "grad_norm": 0.05089341476559639, + "learning_rate": 8.522885818443674e-05, + "loss": 0.2299, + "step": 30439 + }, + { + "epoch": 2.4659753726506803, + "grad_norm": 0.05498534068465233, + "learning_rate": 
8.522435753184213e-05, + "loss": 0.2017, + "step": 30440 + }, + { + "epoch": 2.466056383668179, + "grad_norm": 0.06885070353746414, + "learning_rate": 8.52198568792475e-05, + "loss": 0.2298, + "step": 30441 + }, + { + "epoch": 2.4661373946856773, + "grad_norm": 0.06844142079353333, + "learning_rate": 8.521535622665286e-05, + "loss": 0.2448, + "step": 30442 + }, + { + "epoch": 2.4662184057031755, + "grad_norm": 0.08285916596651077, + "learning_rate": 8.521085557405825e-05, + "loss": 0.2514, + "step": 30443 + }, + { + "epoch": 2.466299416720674, + "grad_norm": 0.06653977930545807, + "learning_rate": 8.520635492146362e-05, + "loss": 0.236, + "step": 30444 + }, + { + "epoch": 2.4663804277381725, + "grad_norm": 0.06271621584892273, + "learning_rate": 8.520185426886898e-05, + "loss": 0.2182, + "step": 30445 + }, + { + "epoch": 2.4664614387556707, + "grad_norm": 0.08903393894433975, + "learning_rate": 8.519735361627437e-05, + "loss": 0.2353, + "step": 30446 + }, + { + "epoch": 2.4665424497731694, + "grad_norm": 0.06892696768045425, + "learning_rate": 8.519285296367974e-05, + "loss": 0.242, + "step": 30447 + }, + { + "epoch": 2.4666234607906676, + "grad_norm": 0.06523766368627548, + "learning_rate": 8.51883523110851e-05, + "loss": 0.2128, + "step": 30448 + }, + { + "epoch": 2.466704471808166, + "grad_norm": 0.06940148770809174, + "learning_rate": 8.518385165849049e-05, + "loss": 0.2851, + "step": 30449 + }, + { + "epoch": 2.466785482825664, + "grad_norm": 0.07958599925041199, + "learning_rate": 8.517935100589586e-05, + "loss": 0.2569, + "step": 30450 + }, + { + "epoch": 2.466866493843163, + "grad_norm": 0.080347441136837, + "learning_rate": 8.517485035330122e-05, + "loss": 0.2485, + "step": 30451 + }, + { + "epoch": 2.466947504860661, + "grad_norm": 0.0917719230055809, + "learning_rate": 8.517034970070661e-05, + "loss": 0.2446, + "step": 30452 + }, + { + "epoch": 2.4670285158781593, + "grad_norm": 0.07323457300662994, + "learning_rate": 8.516584904811198e-05, + "loss": 0.2448, + "step": 30453 + }, + { + "epoch": 2.467109526895658, + "grad_norm": 0.08020184934139252, + "learning_rate": 8.516134839551734e-05, + "loss": 0.2403, + "step": 30454 + }, + { + "epoch": 2.4671905379131562, + "grad_norm": 0.06991557776927948, + "learning_rate": 8.515684774292273e-05, + "loss": 0.2151, + "step": 30455 + }, + { + "epoch": 2.4672715489306545, + "grad_norm": 0.0720125064253807, + "learning_rate": 8.51523470903281e-05, + "loss": 0.2741, + "step": 30456 + }, + { + "epoch": 2.4673525599481527, + "grad_norm": 0.06054284796118736, + "learning_rate": 8.514784643773348e-05, + "loss": 0.2589, + "step": 30457 + }, + { + "epoch": 2.4674335709656514, + "grad_norm": 0.061874501407146454, + "learning_rate": 8.514334578513885e-05, + "loss": 0.2394, + "step": 30458 + }, + { + "epoch": 2.4675145819831497, + "grad_norm": 0.07197708636522293, + "learning_rate": 8.513884513254423e-05, + "loss": 0.2428, + "step": 30459 + }, + { + "epoch": 2.467595593000648, + "grad_norm": 0.08322641998529434, + "learning_rate": 8.51343444799496e-05, + "loss": 0.2442, + "step": 30460 + }, + { + "epoch": 2.4676766040181466, + "grad_norm": 0.06238508224487305, + "learning_rate": 8.512984382735497e-05, + "loss": 0.27, + "step": 30461 + }, + { + "epoch": 2.467757615035645, + "grad_norm": 0.055403050035238266, + "learning_rate": 8.512534317476035e-05, + "loss": 0.2421, + "step": 30462 + }, + { + "epoch": 2.467838626053143, + "grad_norm": 0.07716569304466248, + "learning_rate": 8.512084252216572e-05, + "loss": 0.2726, + "step": 30463 + }, + { + "epoch": 
2.4679196370706418, + "grad_norm": 0.0678950846195221, + "learning_rate": 8.51163418695711e-05, + "loss": 0.2568, + "step": 30464 + }, + { + "epoch": 2.46800064808814, + "grad_norm": 0.07370582222938538, + "learning_rate": 8.511184121697647e-05, + "loss": 0.2735, + "step": 30465 + }, + { + "epoch": 2.4680816591056383, + "grad_norm": 0.06295979768037796, + "learning_rate": 8.510734056438184e-05, + "loss": 0.239, + "step": 30466 + }, + { + "epoch": 2.468162670123137, + "grad_norm": 0.07194853574037552, + "learning_rate": 8.510283991178721e-05, + "loss": 0.24, + "step": 30467 + }, + { + "epoch": 2.468243681140635, + "grad_norm": 0.0788399875164032, + "learning_rate": 8.509833925919259e-05, + "loss": 0.2444, + "step": 30468 + }, + { + "epoch": 2.4683246921581334, + "grad_norm": 0.06429265439510345, + "learning_rate": 8.509383860659796e-05, + "loss": 0.241, + "step": 30469 + }, + { + "epoch": 2.468405703175632, + "grad_norm": 0.05816531553864479, + "learning_rate": 8.508933795400334e-05, + "loss": 0.2041, + "step": 30470 + }, + { + "epoch": 2.4684867141931304, + "grad_norm": 0.058793339878320694, + "learning_rate": 8.508483730140871e-05, + "loss": 0.2095, + "step": 30471 + }, + { + "epoch": 2.4685677252106286, + "grad_norm": 0.07979535311460495, + "learning_rate": 8.508033664881408e-05, + "loss": 0.2869, + "step": 30472 + }, + { + "epoch": 2.468648736228127, + "grad_norm": 0.060868628323078156, + "learning_rate": 8.507583599621946e-05, + "loss": 0.2525, + "step": 30473 + }, + { + "epoch": 2.4687297472456255, + "grad_norm": 0.0653449222445488, + "learning_rate": 8.507133534362483e-05, + "loss": 0.2202, + "step": 30474 + }, + { + "epoch": 2.468810758263124, + "grad_norm": 0.07391634583473206, + "learning_rate": 8.50668346910302e-05, + "loss": 0.2511, + "step": 30475 + }, + { + "epoch": 2.468891769280622, + "grad_norm": 0.06315775960683823, + "learning_rate": 8.506233403843558e-05, + "loss": 0.2647, + "step": 30476 + }, + { + "epoch": 2.4689727802981207, + "grad_norm": 0.08127032220363617, + "learning_rate": 8.505783338584095e-05, + "loss": 0.2974, + "step": 30477 + }, + { + "epoch": 2.469053791315619, + "grad_norm": 0.07029463350772858, + "learning_rate": 8.505333273324632e-05, + "loss": 0.234, + "step": 30478 + }, + { + "epoch": 2.469134802333117, + "grad_norm": 0.08047143369913101, + "learning_rate": 8.50488320806517e-05, + "loss": 0.2738, + "step": 30479 + }, + { + "epoch": 2.4692158133506155, + "grad_norm": 0.07719200104475021, + "learning_rate": 8.504433142805707e-05, + "loss": 0.2413, + "step": 30480 + }, + { + "epoch": 2.469296824368114, + "grad_norm": 0.08738135546445847, + "learning_rate": 8.503983077546244e-05, + "loss": 0.2801, + "step": 30481 + }, + { + "epoch": 2.4693778353856124, + "grad_norm": 0.06579606235027313, + "learning_rate": 8.503533012286782e-05, + "loss": 0.2395, + "step": 30482 + }, + { + "epoch": 2.4694588464031106, + "grad_norm": 0.06997605413198471, + "learning_rate": 8.503082947027319e-05, + "loss": 0.2696, + "step": 30483 + }, + { + "epoch": 2.4695398574206093, + "grad_norm": 0.07098864018917084, + "learning_rate": 8.502632881767857e-05, + "loss": 0.2248, + "step": 30484 + }, + { + "epoch": 2.4696208684381076, + "grad_norm": 0.06740034371614456, + "learning_rate": 8.502182816508394e-05, + "loss": 0.2787, + "step": 30485 + }, + { + "epoch": 2.469701879455606, + "grad_norm": 0.0740785151720047, + "learning_rate": 8.501732751248933e-05, + "loss": 0.2315, + "step": 30486 + }, + { + "epoch": 2.4697828904731045, + "grad_norm": 0.09051207453012466, + "learning_rate": 
8.501282685989469e-05, + "loss": 0.277, + "step": 30487 + }, + { + "epoch": 2.4698639014906028, + "grad_norm": 0.06422457098960876, + "learning_rate": 8.500832620730006e-05, + "loss": 0.232, + "step": 30488 + }, + { + "epoch": 2.469944912508101, + "grad_norm": 0.0801544114947319, + "learning_rate": 8.500382555470545e-05, + "loss": 0.3011, + "step": 30489 + }, + { + "epoch": 2.4700259235255997, + "grad_norm": 0.055484507232904434, + "learning_rate": 8.499932490211081e-05, + "loss": 0.2941, + "step": 30490 + }, + { + "epoch": 2.470106934543098, + "grad_norm": 0.06433779746294022, + "learning_rate": 8.499482424951618e-05, + "loss": 0.1931, + "step": 30491 + }, + { + "epoch": 2.470187945560596, + "grad_norm": 0.06803855299949646, + "learning_rate": 8.499032359692157e-05, + "loss": 0.217, + "step": 30492 + }, + { + "epoch": 2.470268956578095, + "grad_norm": 0.0670522153377533, + "learning_rate": 8.498582294432693e-05, + "loss": 0.2807, + "step": 30493 + }, + { + "epoch": 2.470349967595593, + "grad_norm": 0.07295812666416168, + "learning_rate": 8.49813222917323e-05, + "loss": 0.2891, + "step": 30494 + }, + { + "epoch": 2.4704309786130914, + "grad_norm": 0.06234823167324066, + "learning_rate": 8.497682163913769e-05, + "loss": 0.252, + "step": 30495 + }, + { + "epoch": 2.4705119896305896, + "grad_norm": 0.06830089539289474, + "learning_rate": 8.497232098654305e-05, + "loss": 0.2569, + "step": 30496 + }, + { + "epoch": 2.4705930006480883, + "grad_norm": 0.06779050827026367, + "learning_rate": 8.496782033394842e-05, + "loss": 0.2216, + "step": 30497 + }, + { + "epoch": 2.4706740116655865, + "grad_norm": 0.0621131956577301, + "learning_rate": 8.496331968135381e-05, + "loss": 0.2374, + "step": 30498 + }, + { + "epoch": 2.470755022683085, + "grad_norm": 0.06830302625894547, + "learning_rate": 8.495881902875917e-05, + "loss": 0.2565, + "step": 30499 + }, + { + "epoch": 2.4708360337005835, + "grad_norm": 0.05653122067451477, + "learning_rate": 8.495431837616454e-05, + "loss": 0.2369, + "step": 30500 + }, + { + "epoch": 2.4709170447180817, + "grad_norm": 0.05332133173942566, + "learning_rate": 8.494981772356993e-05, + "loss": 0.2493, + "step": 30501 + }, + { + "epoch": 2.47099805573558, + "grad_norm": 0.06842204183340073, + "learning_rate": 8.494531707097529e-05, + "loss": 0.2659, + "step": 30502 + }, + { + "epoch": 2.471079066753078, + "grad_norm": 0.061891455203294754, + "learning_rate": 8.494081641838066e-05, + "loss": 0.2596, + "step": 30503 + }, + { + "epoch": 2.471160077770577, + "grad_norm": 0.048567235469818115, + "learning_rate": 8.493631576578605e-05, + "loss": 0.203, + "step": 30504 + }, + { + "epoch": 2.471241088788075, + "grad_norm": 0.061536628752946854, + "learning_rate": 8.493181511319141e-05, + "loss": 0.2194, + "step": 30505 + }, + { + "epoch": 2.4713220998055734, + "grad_norm": 0.056444212794303894, + "learning_rate": 8.492731446059679e-05, + "loss": 0.2044, + "step": 30506 + }, + { + "epoch": 2.471403110823072, + "grad_norm": 0.06512252241373062, + "learning_rate": 8.492281380800217e-05, + "loss": 0.2332, + "step": 30507 + }, + { + "epoch": 2.4714841218405703, + "grad_norm": 0.05856655165553093, + "learning_rate": 8.491831315540753e-05, + "loss": 0.2205, + "step": 30508 + }, + { + "epoch": 2.4715651328580686, + "grad_norm": 0.07192254066467285, + "learning_rate": 8.491381250281292e-05, + "loss": 0.2276, + "step": 30509 + }, + { + "epoch": 2.4716461438755672, + "grad_norm": 0.07527060806751251, + "learning_rate": 8.490931185021829e-05, + "loss": 0.2525, + "step": 30510 + }, + { + 
"epoch": 2.4717271548930655, + "grad_norm": 0.06652418524026871, + "learning_rate": 8.490481119762365e-05, + "loss": 0.2377, + "step": 30511 + }, + { + "epoch": 2.4718081659105637, + "grad_norm": 0.0621594674885273, + "learning_rate": 8.490031054502904e-05, + "loss": 0.2076, + "step": 30512 + }, + { + "epoch": 2.4718891769280624, + "grad_norm": 0.07410325109958649, + "learning_rate": 8.489580989243441e-05, + "loss": 0.2559, + "step": 30513 + }, + { + "epoch": 2.4719701879455607, + "grad_norm": 0.06433682143688202, + "learning_rate": 8.489130923983977e-05, + "loss": 0.2071, + "step": 30514 + }, + { + "epoch": 2.472051198963059, + "grad_norm": 0.06185179203748703, + "learning_rate": 8.488680858724516e-05, + "loss": 0.2434, + "step": 30515 + }, + { + "epoch": 2.4721322099805576, + "grad_norm": 0.06650276482105255, + "learning_rate": 8.488230793465053e-05, + "loss": 0.253, + "step": 30516 + }, + { + "epoch": 2.472213220998056, + "grad_norm": 0.05839819461107254, + "learning_rate": 8.48778072820559e-05, + "loss": 0.2209, + "step": 30517 + }, + { + "epoch": 2.472294232015554, + "grad_norm": 0.05731014162302017, + "learning_rate": 8.487330662946128e-05, + "loss": 0.2107, + "step": 30518 + }, + { + "epoch": 2.4723752430330523, + "grad_norm": 0.08695816248655319, + "learning_rate": 8.486880597686666e-05, + "loss": 0.2402, + "step": 30519 + }, + { + "epoch": 2.472456254050551, + "grad_norm": 0.060433726757764816, + "learning_rate": 8.486430532427202e-05, + "loss": 0.236, + "step": 30520 + }, + { + "epoch": 2.4725372650680493, + "grad_norm": 0.060748256742954254, + "learning_rate": 8.48598046716774e-05, + "loss": 0.2119, + "step": 30521 + }, + { + "epoch": 2.4726182760855475, + "grad_norm": 0.07171925157308578, + "learning_rate": 8.485530401908278e-05, + "loss": 0.286, + "step": 30522 + }, + { + "epoch": 2.472699287103046, + "grad_norm": 0.053237102925777435, + "learning_rate": 8.485080336648814e-05, + "loss": 0.1828, + "step": 30523 + }, + { + "epoch": 2.4727802981205445, + "grad_norm": 0.06715618073940277, + "learning_rate": 8.484630271389352e-05, + "loss": 0.2562, + "step": 30524 + }, + { + "epoch": 2.4728613091380427, + "grad_norm": 0.06880155205726624, + "learning_rate": 8.48418020612989e-05, + "loss": 0.2556, + "step": 30525 + }, + { + "epoch": 2.472942320155541, + "grad_norm": 0.06980689615011215, + "learning_rate": 8.483730140870426e-05, + "loss": 0.2443, + "step": 30526 + }, + { + "epoch": 2.4730233311730396, + "grad_norm": 0.06468836963176727, + "learning_rate": 8.483280075610964e-05, + "loss": 0.2377, + "step": 30527 + }, + { + "epoch": 2.473104342190538, + "grad_norm": 0.07451368868350983, + "learning_rate": 8.482830010351502e-05, + "loss": 0.2622, + "step": 30528 + }, + { + "epoch": 2.473185353208036, + "grad_norm": 0.07759378850460052, + "learning_rate": 8.482379945092038e-05, + "loss": 0.2815, + "step": 30529 + }, + { + "epoch": 2.473266364225535, + "grad_norm": 0.0744905173778534, + "learning_rate": 8.481929879832577e-05, + "loss": 0.2696, + "step": 30530 + }, + { + "epoch": 2.473347375243033, + "grad_norm": 0.06338359415531158, + "learning_rate": 8.481479814573114e-05, + "loss": 0.2335, + "step": 30531 + }, + { + "epoch": 2.4734283862605313, + "grad_norm": 0.061391785740852356, + "learning_rate": 8.48102974931365e-05, + "loss": 0.2507, + "step": 30532 + }, + { + "epoch": 2.47350939727803, + "grad_norm": 0.05529424548149109, + "learning_rate": 8.480579684054189e-05, + "loss": 0.2195, + "step": 30533 + }, + { + "epoch": 2.4735904082955282, + "grad_norm": 0.06360072642564774, + 
"learning_rate": 8.480129618794726e-05, + "loss": 0.2163, + "step": 30534 + }, + { + "epoch": 2.4736714193130265, + "grad_norm": 0.08186694234609604, + "learning_rate": 8.479679553535263e-05, + "loss": 0.2304, + "step": 30535 + }, + { + "epoch": 2.473752430330525, + "grad_norm": 0.06562604010105133, + "learning_rate": 8.4792294882758e-05, + "loss": 0.2797, + "step": 30536 + }, + { + "epoch": 2.4738334413480234, + "grad_norm": 0.07563050836324692, + "learning_rate": 8.478779423016338e-05, + "loss": 0.2963, + "step": 30537 + }, + { + "epoch": 2.4739144523655217, + "grad_norm": 0.0658164918422699, + "learning_rate": 8.478329357756875e-05, + "loss": 0.2575, + "step": 30538 + }, + { + "epoch": 2.47399546338302, + "grad_norm": 0.07443150132894516, + "learning_rate": 8.477879292497413e-05, + "loss": 0.2333, + "step": 30539 + }, + { + "epoch": 2.4740764744005186, + "grad_norm": 0.06861604005098343, + "learning_rate": 8.47742922723795e-05, + "loss": 0.2373, + "step": 30540 + }, + { + "epoch": 2.474157485418017, + "grad_norm": 0.07418636232614517, + "learning_rate": 8.476979161978487e-05, + "loss": 0.2265, + "step": 30541 + }, + { + "epoch": 2.474238496435515, + "grad_norm": 0.06815627962350845, + "learning_rate": 8.476529096719025e-05, + "loss": 0.2377, + "step": 30542 + }, + { + "epoch": 2.4743195074530138, + "grad_norm": 0.06877239048480988, + "learning_rate": 8.476079031459562e-05, + "loss": 0.2037, + "step": 30543 + }, + { + "epoch": 2.474400518470512, + "grad_norm": 0.0606442354619503, + "learning_rate": 8.4756289662001e-05, + "loss": 0.2412, + "step": 30544 + }, + { + "epoch": 2.4744815294880103, + "grad_norm": 0.0652085393667221, + "learning_rate": 8.475178900940637e-05, + "loss": 0.2045, + "step": 30545 + }, + { + "epoch": 2.4745625405055085, + "grad_norm": 0.07037603855133057, + "learning_rate": 8.474728835681174e-05, + "loss": 0.2213, + "step": 30546 + }, + { + "epoch": 2.474643551523007, + "grad_norm": 0.06558571755886078, + "learning_rate": 8.474278770421712e-05, + "loss": 0.2443, + "step": 30547 + }, + { + "epoch": 2.4747245625405054, + "grad_norm": 0.07204144448041916, + "learning_rate": 8.473828705162249e-05, + "loss": 0.2457, + "step": 30548 + }, + { + "epoch": 2.4748055735580037, + "grad_norm": 0.08189809322357178, + "learning_rate": 8.473378639902786e-05, + "loss": 0.2156, + "step": 30549 + }, + { + "epoch": 2.4748865845755024, + "grad_norm": 0.0684322714805603, + "learning_rate": 8.472928574643324e-05, + "loss": 0.2378, + "step": 30550 + }, + { + "epoch": 2.4749675955930006, + "grad_norm": 0.07303576916456223, + "learning_rate": 8.472478509383861e-05, + "loss": 0.2454, + "step": 30551 + }, + { + "epoch": 2.475048606610499, + "grad_norm": 0.07292795181274414, + "learning_rate": 8.472028444124398e-05, + "loss": 0.2673, + "step": 30552 + }, + { + "epoch": 2.4751296176279975, + "grad_norm": 0.06463218480348587, + "learning_rate": 8.471578378864936e-05, + "loss": 0.2719, + "step": 30553 + }, + { + "epoch": 2.475210628645496, + "grad_norm": 0.08307449519634247, + "learning_rate": 8.471128313605473e-05, + "loss": 0.256, + "step": 30554 + }, + { + "epoch": 2.475291639662994, + "grad_norm": 0.05798949673771858, + "learning_rate": 8.47067824834601e-05, + "loss": 0.2702, + "step": 30555 + }, + { + "epoch": 2.4753726506804927, + "grad_norm": 0.07065913081169128, + "learning_rate": 8.470228183086548e-05, + "loss": 0.2602, + "step": 30556 + }, + { + "epoch": 2.475453661697991, + "grad_norm": 0.08354199677705765, + "learning_rate": 8.469778117827085e-05, + "loss": 0.2517, + "step": 30557 + }, + 
{ + "epoch": 2.475534672715489, + "grad_norm": 0.05542146787047386, + "learning_rate": 8.469328052567623e-05, + "loss": 0.2171, + "step": 30558 + }, + { + "epoch": 2.475615683732988, + "grad_norm": 0.07074569165706635, + "learning_rate": 8.46887798730816e-05, + "loss": 0.2834, + "step": 30559 + }, + { + "epoch": 2.475696694750486, + "grad_norm": 0.07820092886686325, + "learning_rate": 8.468427922048697e-05, + "loss": 0.2285, + "step": 30560 + }, + { + "epoch": 2.4757777057679844, + "grad_norm": 0.059474945068359375, + "learning_rate": 8.467977856789236e-05, + "loss": 0.2126, + "step": 30561 + }, + { + "epoch": 2.4758587167854826, + "grad_norm": 0.06978403031826019, + "learning_rate": 8.467527791529772e-05, + "loss": 0.2501, + "step": 30562 + }, + { + "epoch": 2.4759397278029813, + "grad_norm": 0.07842637598514557, + "learning_rate": 8.46707772627031e-05, + "loss": 0.2466, + "step": 30563 + }, + { + "epoch": 2.4760207388204796, + "grad_norm": 0.0689956545829773, + "learning_rate": 8.466627661010848e-05, + "loss": 0.2547, + "step": 30564 + }, + { + "epoch": 2.476101749837978, + "grad_norm": 0.07924441993236542, + "learning_rate": 8.466177595751384e-05, + "loss": 0.2389, + "step": 30565 + }, + { + "epoch": 2.4761827608554765, + "grad_norm": 0.06799956411123276, + "learning_rate": 8.465727530491921e-05, + "loss": 0.236, + "step": 30566 + }, + { + "epoch": 2.4762637718729748, + "grad_norm": 0.07037723064422607, + "learning_rate": 8.46527746523246e-05, + "loss": 0.2611, + "step": 30567 + }, + { + "epoch": 2.476344782890473, + "grad_norm": 0.05896992236375809, + "learning_rate": 8.464827399972996e-05, + "loss": 0.2478, + "step": 30568 + }, + { + "epoch": 2.4764257939079712, + "grad_norm": 0.06081313639879227, + "learning_rate": 8.464377334713534e-05, + "loss": 0.2164, + "step": 30569 + }, + { + "epoch": 2.47650680492547, + "grad_norm": 0.06605610251426697, + "learning_rate": 8.463927269454072e-05, + "loss": 0.2315, + "step": 30570 + }, + { + "epoch": 2.476587815942968, + "grad_norm": 0.06241413950920105, + "learning_rate": 8.463477204194608e-05, + "loss": 0.2517, + "step": 30571 + }, + { + "epoch": 2.4766688269604664, + "grad_norm": 0.0740523412823677, + "learning_rate": 8.463027138935146e-05, + "loss": 0.2226, + "step": 30572 + }, + { + "epoch": 2.476749837977965, + "grad_norm": 0.06345130503177643, + "learning_rate": 8.462577073675684e-05, + "loss": 0.2541, + "step": 30573 + }, + { + "epoch": 2.4768308489954634, + "grad_norm": 0.07574643939733505, + "learning_rate": 8.46212700841622e-05, + "loss": 0.2631, + "step": 30574 + }, + { + "epoch": 2.4769118600129616, + "grad_norm": 0.06299690157175064, + "learning_rate": 8.461676943156758e-05, + "loss": 0.2358, + "step": 30575 + }, + { + "epoch": 2.4769928710304603, + "grad_norm": 0.07480761408805847, + "learning_rate": 8.461226877897296e-05, + "loss": 0.2674, + "step": 30576 + }, + { + "epoch": 2.4770738820479585, + "grad_norm": 0.0540340431034565, + "learning_rate": 8.460776812637832e-05, + "loss": 0.227, + "step": 30577 + }, + { + "epoch": 2.4771548930654568, + "grad_norm": 0.07527433335781097, + "learning_rate": 8.46032674737837e-05, + "loss": 0.2709, + "step": 30578 + }, + { + "epoch": 2.4772359040829555, + "grad_norm": 0.06662973016500473, + "learning_rate": 8.459876682118909e-05, + "loss": 0.2107, + "step": 30579 + }, + { + "epoch": 2.4773169151004537, + "grad_norm": 0.052600983530282974, + "learning_rate": 8.459426616859445e-05, + "loss": 0.2453, + "step": 30580 + }, + { + "epoch": 2.477397926117952, + "grad_norm": 0.06754674017429352, + 
"learning_rate": 8.458976551599982e-05, + "loss": 0.2646, + "step": 30581 + }, + { + "epoch": 2.4774789371354506, + "grad_norm": 0.07289335131645203, + "learning_rate": 8.45852648634052e-05, + "loss": 0.2732, + "step": 30582 + }, + { + "epoch": 2.477559948152949, + "grad_norm": 0.061072248965501785, + "learning_rate": 8.458076421081057e-05, + "loss": 0.1947, + "step": 30583 + }, + { + "epoch": 2.477640959170447, + "grad_norm": 0.07370416820049286, + "learning_rate": 8.457626355821594e-05, + "loss": 0.2897, + "step": 30584 + }, + { + "epoch": 2.4777219701879454, + "grad_norm": 0.06480906158685684, + "learning_rate": 8.457176290562133e-05, + "loss": 0.2679, + "step": 30585 + }, + { + "epoch": 2.477802981205444, + "grad_norm": 0.05878300592303276, + "learning_rate": 8.456726225302669e-05, + "loss": 0.2373, + "step": 30586 + }, + { + "epoch": 2.4778839922229423, + "grad_norm": 0.07398363202810287, + "learning_rate": 8.456276160043207e-05, + "loss": 0.2447, + "step": 30587 + }, + { + "epoch": 2.4779650032404406, + "grad_norm": 0.07146541774272919, + "learning_rate": 8.455826094783745e-05, + "loss": 0.2513, + "step": 30588 + }, + { + "epoch": 2.4780460142579392, + "grad_norm": 0.07220860570669174, + "learning_rate": 8.455376029524281e-05, + "loss": 0.2173, + "step": 30589 + }, + { + "epoch": 2.4781270252754375, + "grad_norm": 0.08894750475883484, + "learning_rate": 8.45492596426482e-05, + "loss": 0.2895, + "step": 30590 + }, + { + "epoch": 2.4782080362929357, + "grad_norm": 0.06148972734808922, + "learning_rate": 8.454475899005357e-05, + "loss": 0.2457, + "step": 30591 + }, + { + "epoch": 2.478289047310434, + "grad_norm": 0.07154235243797302, + "learning_rate": 8.454025833745893e-05, + "loss": 0.2492, + "step": 30592 + }, + { + "epoch": 2.4783700583279327, + "grad_norm": 0.06511364877223969, + "learning_rate": 8.453575768486432e-05, + "loss": 0.2275, + "step": 30593 + }, + { + "epoch": 2.478451069345431, + "grad_norm": 0.051312949508428574, + "learning_rate": 8.453125703226969e-05, + "loss": 0.2409, + "step": 30594 + }, + { + "epoch": 2.478532080362929, + "grad_norm": 0.06297353655099869, + "learning_rate": 8.452675637967505e-05, + "loss": 0.214, + "step": 30595 + }, + { + "epoch": 2.478613091380428, + "grad_norm": 0.059804175049066544, + "learning_rate": 8.452225572708044e-05, + "loss": 0.212, + "step": 30596 + }, + { + "epoch": 2.478694102397926, + "grad_norm": 0.06705871969461441, + "learning_rate": 8.451775507448581e-05, + "loss": 0.244, + "step": 30597 + }, + { + "epoch": 2.4787751134154243, + "grad_norm": 0.06258346140384674, + "learning_rate": 8.451325442189117e-05, + "loss": 0.2943, + "step": 30598 + }, + { + "epoch": 2.478856124432923, + "grad_norm": 0.0627899020910263, + "learning_rate": 8.450875376929656e-05, + "loss": 0.232, + "step": 30599 + }, + { + "epoch": 2.4789371354504213, + "grad_norm": 0.0678204819560051, + "learning_rate": 8.450425311670193e-05, + "loss": 0.229, + "step": 30600 + }, + { + "epoch": 2.4790181464679195, + "grad_norm": 0.05785902217030525, + "learning_rate": 8.449975246410729e-05, + "loss": 0.2413, + "step": 30601 + }, + { + "epoch": 2.479099157485418, + "grad_norm": 0.058222945779561996, + "learning_rate": 8.449525181151268e-05, + "loss": 0.2707, + "step": 30602 + }, + { + "epoch": 2.4791801685029164, + "grad_norm": 0.06798720359802246, + "learning_rate": 8.449075115891805e-05, + "loss": 0.2509, + "step": 30603 + }, + { + "epoch": 2.4792611795204147, + "grad_norm": 0.061136022210121155, + "learning_rate": 8.448625050632341e-05, + "loss": 0.2303, + "step": 
30604 + }, + { + "epoch": 2.4793421905379134, + "grad_norm": 0.07287042587995529, + "learning_rate": 8.44817498537288e-05, + "loss": 0.2456, + "step": 30605 + }, + { + "epoch": 2.4794232015554116, + "grad_norm": 0.07112374156713486, + "learning_rate": 8.447724920113417e-05, + "loss": 0.2289, + "step": 30606 + }, + { + "epoch": 2.47950421257291, + "grad_norm": 0.06856828182935715, + "learning_rate": 8.447274854853953e-05, + "loss": 0.3085, + "step": 30607 + }, + { + "epoch": 2.479585223590408, + "grad_norm": 0.05687296390533447, + "learning_rate": 8.446824789594492e-05, + "loss": 0.2255, + "step": 30608 + }, + { + "epoch": 2.479666234607907, + "grad_norm": 0.06885094195604324, + "learning_rate": 8.44637472433503e-05, + "loss": 0.2761, + "step": 30609 + }, + { + "epoch": 2.479747245625405, + "grad_norm": 0.05651099607348442, + "learning_rate": 8.445924659075565e-05, + "loss": 0.233, + "step": 30610 + }, + { + "epoch": 2.4798282566429033, + "grad_norm": 0.08474230021238327, + "learning_rate": 8.445474593816104e-05, + "loss": 0.2556, + "step": 30611 + }, + { + "epoch": 2.479909267660402, + "grad_norm": 0.06992295384407043, + "learning_rate": 8.445024528556641e-05, + "loss": 0.2496, + "step": 30612 + }, + { + "epoch": 2.4799902786779002, + "grad_norm": 0.05965316668152809, + "learning_rate": 8.444574463297179e-05, + "loss": 0.2151, + "step": 30613 + }, + { + "epoch": 2.4800712896953985, + "grad_norm": 0.07484708726406097, + "learning_rate": 8.444124398037716e-05, + "loss": 0.2451, + "step": 30614 + }, + { + "epoch": 2.4801523007128967, + "grad_norm": 0.07535053789615631, + "learning_rate": 8.443674332778253e-05, + "loss": 0.2582, + "step": 30615 + }, + { + "epoch": 2.4802333117303954, + "grad_norm": 0.0784064307808876, + "learning_rate": 8.443224267518791e-05, + "loss": 0.2701, + "step": 30616 + }, + { + "epoch": 2.4803143227478937, + "grad_norm": 0.06548595428466797, + "learning_rate": 8.442774202259328e-05, + "loss": 0.2358, + "step": 30617 + }, + { + "epoch": 2.480395333765392, + "grad_norm": 0.08438055962324142, + "learning_rate": 8.442324136999866e-05, + "loss": 0.2528, + "step": 30618 + }, + { + "epoch": 2.4804763447828906, + "grad_norm": 0.06295493245124817, + "learning_rate": 8.441874071740403e-05, + "loss": 0.2414, + "step": 30619 + }, + { + "epoch": 2.480557355800389, + "grad_norm": 0.06215086951851845, + "learning_rate": 8.44142400648094e-05, + "loss": 0.2201, + "step": 30620 + }, + { + "epoch": 2.480638366817887, + "grad_norm": 0.04356473311781883, + "learning_rate": 8.440973941221478e-05, + "loss": 0.2121, + "step": 30621 + }, + { + "epoch": 2.4807193778353858, + "grad_norm": 0.06062763184309006, + "learning_rate": 8.440523875962015e-05, + "loss": 0.2442, + "step": 30622 + }, + { + "epoch": 2.480800388852884, + "grad_norm": 0.07249461859464645, + "learning_rate": 8.440073810702552e-05, + "loss": 0.2531, + "step": 30623 + }, + { + "epoch": 2.4808813998703823, + "grad_norm": 0.0566757470369339, + "learning_rate": 8.43962374544309e-05, + "loss": 0.2855, + "step": 30624 + }, + { + "epoch": 2.480962410887881, + "grad_norm": 0.05951603129506111, + "learning_rate": 8.439173680183627e-05, + "loss": 0.2121, + "step": 30625 + }, + { + "epoch": 2.481043421905379, + "grad_norm": 0.06192256510257721, + "learning_rate": 8.438723614924164e-05, + "loss": 0.1961, + "step": 30626 + }, + { + "epoch": 2.4811244329228774, + "grad_norm": 0.06632743030786514, + "learning_rate": 8.438273549664702e-05, + "loss": 0.2504, + "step": 30627 + }, + { + "epoch": 2.481205443940376, + "grad_norm": 
0.05739384517073631, + "learning_rate": 8.437823484405239e-05, + "loss": 0.2301, + "step": 30628 + }, + { + "epoch": 2.4812864549578744, + "grad_norm": 0.052155740559101105, + "learning_rate": 8.437373419145777e-05, + "loss": 0.2313, + "step": 30629 + }, + { + "epoch": 2.4813674659753726, + "grad_norm": 0.07894574850797653, + "learning_rate": 8.436923353886314e-05, + "loss": 0.2343, + "step": 30630 + }, + { + "epoch": 2.481448476992871, + "grad_norm": 0.07149939984083176, + "learning_rate": 8.436473288626851e-05, + "loss": 0.2414, + "step": 30631 + }, + { + "epoch": 2.4815294880103695, + "grad_norm": 0.0677579790353775, + "learning_rate": 8.436023223367389e-05, + "loss": 0.2401, + "step": 30632 + }, + { + "epoch": 2.481610499027868, + "grad_norm": 0.07865419238805771, + "learning_rate": 8.435573158107926e-05, + "loss": 0.265, + "step": 30633 + }, + { + "epoch": 2.481691510045366, + "grad_norm": 0.07887757569551468, + "learning_rate": 8.435123092848463e-05, + "loss": 0.2649, + "step": 30634 + }, + { + "epoch": 2.4817725210628647, + "grad_norm": 0.06781040132045746, + "learning_rate": 8.434673027589001e-05, + "loss": 0.245, + "step": 30635 + }, + { + "epoch": 2.481853532080363, + "grad_norm": 0.07102234661579132, + "learning_rate": 8.434222962329538e-05, + "loss": 0.2769, + "step": 30636 + }, + { + "epoch": 2.481934543097861, + "grad_norm": 0.05997258052229881, + "learning_rate": 8.433772897070075e-05, + "loss": 0.2127, + "step": 30637 + }, + { + "epoch": 2.4820155541153595, + "grad_norm": 0.06905551999807358, + "learning_rate": 8.433322831810613e-05, + "loss": 0.2399, + "step": 30638 + }, + { + "epoch": 2.482096565132858, + "grad_norm": 0.0653391182422638, + "learning_rate": 8.43287276655115e-05, + "loss": 0.2442, + "step": 30639 + }, + { + "epoch": 2.4821775761503564, + "grad_norm": 0.0679607167840004, + "learning_rate": 8.432422701291688e-05, + "loss": 0.2498, + "step": 30640 + }, + { + "epoch": 2.4822585871678546, + "grad_norm": 0.07271024584770203, + "learning_rate": 8.431972636032225e-05, + "loss": 0.2263, + "step": 30641 + }, + { + "epoch": 2.4823395981853533, + "grad_norm": 0.0782843753695488, + "learning_rate": 8.431522570772764e-05, + "loss": 0.2853, + "step": 30642 + }, + { + "epoch": 2.4824206092028516, + "grad_norm": 0.06645394116640091, + "learning_rate": 8.4310725055133e-05, + "loss": 0.2078, + "step": 30643 + }, + { + "epoch": 2.48250162022035, + "grad_norm": 0.06782057881355286, + "learning_rate": 8.430622440253837e-05, + "loss": 0.2438, + "step": 30644 + }, + { + "epoch": 2.4825826312378485, + "grad_norm": 0.06983155757188797, + "learning_rate": 8.430172374994376e-05, + "loss": 0.2171, + "step": 30645 + }, + { + "epoch": 2.4826636422553467, + "grad_norm": 0.06148098036646843, + "learning_rate": 8.429722309734912e-05, + "loss": 0.2539, + "step": 30646 + }, + { + "epoch": 2.482744653272845, + "grad_norm": 0.062350235879421234, + "learning_rate": 8.429272244475449e-05, + "loss": 0.253, + "step": 30647 + }, + { + "epoch": 2.4828256642903437, + "grad_norm": 0.08074339479207993, + "learning_rate": 8.428822179215988e-05, + "loss": 0.2768, + "step": 30648 + }, + { + "epoch": 2.482906675307842, + "grad_norm": 0.061865705996751785, + "learning_rate": 8.428372113956524e-05, + "loss": 0.2295, + "step": 30649 + }, + { + "epoch": 2.48298768632534, + "grad_norm": 0.06636541336774826, + "learning_rate": 8.427922048697061e-05, + "loss": 0.2492, + "step": 30650 + }, + { + "epoch": 2.483068697342839, + "grad_norm": 0.05307980999350548, + "learning_rate": 8.4274719834376e-05, + "loss": 0.1934, 
+ "step": 30651 + }, + { + "epoch": 2.483149708360337, + "grad_norm": 0.06762948632240295, + "learning_rate": 8.427021918178136e-05, + "loss": 0.2117, + "step": 30652 + }, + { + "epoch": 2.4832307193778353, + "grad_norm": 0.05977749824523926, + "learning_rate": 8.426571852918673e-05, + "loss": 0.242, + "step": 30653 + }, + { + "epoch": 2.4833117303953336, + "grad_norm": 0.07046031951904297, + "learning_rate": 8.426121787659212e-05, + "loss": 0.2485, + "step": 30654 + }, + { + "epoch": 2.4833927414128323, + "grad_norm": 0.06229454278945923, + "learning_rate": 8.425671722399748e-05, + "loss": 0.2353, + "step": 30655 + }, + { + "epoch": 2.4834737524303305, + "grad_norm": 0.07501602172851562, + "learning_rate": 8.425221657140285e-05, + "loss": 0.2711, + "step": 30656 + }, + { + "epoch": 2.4835547634478288, + "grad_norm": 0.0901978611946106, + "learning_rate": 8.424771591880824e-05, + "loss": 0.2729, + "step": 30657 + }, + { + "epoch": 2.4836357744653275, + "grad_norm": 0.07065119594335556, + "learning_rate": 8.42432152662136e-05, + "loss": 0.2416, + "step": 30658 + }, + { + "epoch": 2.4837167854828257, + "grad_norm": 0.07169196754693985, + "learning_rate": 8.423871461361897e-05, + "loss": 0.2981, + "step": 30659 + }, + { + "epoch": 2.483797796500324, + "grad_norm": 0.06765072047710419, + "learning_rate": 8.423421396102436e-05, + "loss": 0.2378, + "step": 30660 + }, + { + "epoch": 2.483878807517822, + "grad_norm": 0.07858853787183762, + "learning_rate": 8.422971330842972e-05, + "loss": 0.2106, + "step": 30661 + }, + { + "epoch": 2.483959818535321, + "grad_norm": 0.06337659806013107, + "learning_rate": 8.42252126558351e-05, + "loss": 0.271, + "step": 30662 + }, + { + "epoch": 2.484040829552819, + "grad_norm": 0.059507403522729874, + "learning_rate": 8.422071200324048e-05, + "loss": 0.2453, + "step": 30663 + }, + { + "epoch": 2.4841218405703174, + "grad_norm": 0.06476711481809616, + "learning_rate": 8.421621135064584e-05, + "loss": 0.2628, + "step": 30664 + }, + { + "epoch": 2.484202851587816, + "grad_norm": 0.060637783259153366, + "learning_rate": 8.421171069805122e-05, + "loss": 0.2652, + "step": 30665 + }, + { + "epoch": 2.4842838626053143, + "grad_norm": 0.07929336279630661, + "learning_rate": 8.42072100454566e-05, + "loss": 0.2567, + "step": 30666 + }, + { + "epoch": 2.4843648736228126, + "grad_norm": 0.07002262771129608, + "learning_rate": 8.420270939286196e-05, + "loss": 0.2426, + "step": 30667 + }, + { + "epoch": 2.4844458846403112, + "grad_norm": 0.0657939612865448, + "learning_rate": 8.419820874026735e-05, + "loss": 0.2328, + "step": 30668 + }, + { + "epoch": 2.4845268956578095, + "grad_norm": 0.06577887386083603, + "learning_rate": 8.419370808767272e-05, + "loss": 0.2103, + "step": 30669 + }, + { + "epoch": 2.4846079066753077, + "grad_norm": 0.057862211018800735, + "learning_rate": 8.418920743507808e-05, + "loss": 0.2646, + "step": 30670 + }, + { + "epoch": 2.4846889176928064, + "grad_norm": 0.06161754950881004, + "learning_rate": 8.418470678248347e-05, + "loss": 0.2235, + "step": 30671 + }, + { + "epoch": 2.4847699287103047, + "grad_norm": 0.0700254738330841, + "learning_rate": 8.418020612988884e-05, + "loss": 0.2535, + "step": 30672 + }, + { + "epoch": 2.484850939727803, + "grad_norm": 0.07041961699724197, + "learning_rate": 8.41757054772942e-05, + "loss": 0.2669, + "step": 30673 + }, + { + "epoch": 2.4849319507453016, + "grad_norm": 0.0570821575820446, + "learning_rate": 8.417120482469959e-05, + "loss": 0.1884, + "step": 30674 + }, + { + "epoch": 2.4850129617628, + "grad_norm": 
0.0660528689622879, + "learning_rate": 8.416670417210496e-05, + "loss": 0.2748, + "step": 30675 + }, + { + "epoch": 2.485093972780298, + "grad_norm": 0.077247254550457, + "learning_rate": 8.416220351951032e-05, + "loss": 0.267, + "step": 30676 + }, + { + "epoch": 2.4851749837977963, + "grad_norm": 0.06788933277130127, + "learning_rate": 8.415770286691571e-05, + "loss": 0.2301, + "step": 30677 + }, + { + "epoch": 2.485255994815295, + "grad_norm": 0.07161764055490494, + "learning_rate": 8.415320221432109e-05, + "loss": 0.2332, + "step": 30678 + }, + { + "epoch": 2.4853370058327933, + "grad_norm": 0.08423119783401489, + "learning_rate": 8.414870156172645e-05, + "loss": 0.2964, + "step": 30679 + }, + { + "epoch": 2.4854180168502915, + "grad_norm": 0.06369981914758682, + "learning_rate": 8.414420090913183e-05, + "loss": 0.2404, + "step": 30680 + }, + { + "epoch": 2.48549902786779, + "grad_norm": 0.06630921363830566, + "learning_rate": 8.41397002565372e-05, + "loss": 0.2425, + "step": 30681 + }, + { + "epoch": 2.4855800388852884, + "grad_norm": 0.06150691956281662, + "learning_rate": 8.413519960394257e-05, + "loss": 0.1941, + "step": 30682 + }, + { + "epoch": 2.4856610499027867, + "grad_norm": 0.058932892978191376, + "learning_rate": 8.413069895134795e-05, + "loss": 0.238, + "step": 30683 + }, + { + "epoch": 2.485742060920285, + "grad_norm": 0.1102815568447113, + "learning_rate": 8.412619829875333e-05, + "loss": 0.2678, + "step": 30684 + }, + { + "epoch": 2.4858230719377836, + "grad_norm": 0.06793038547039032, + "learning_rate": 8.412169764615869e-05, + "loss": 0.262, + "step": 30685 + }, + { + "epoch": 2.485904082955282, + "grad_norm": 0.05985938012599945, + "learning_rate": 8.411719699356407e-05, + "loss": 0.2546, + "step": 30686 + }, + { + "epoch": 2.48598509397278, + "grad_norm": 0.060996122658252716, + "learning_rate": 8.411269634096945e-05, + "loss": 0.2451, + "step": 30687 + }, + { + "epoch": 2.486066104990279, + "grad_norm": 0.06225128471851349, + "learning_rate": 8.410819568837481e-05, + "loss": 0.2376, + "step": 30688 + }, + { + "epoch": 2.486147116007777, + "grad_norm": 0.07075046002864838, + "learning_rate": 8.41036950357802e-05, + "loss": 0.2201, + "step": 30689 + }, + { + "epoch": 2.4862281270252753, + "grad_norm": 0.07719450443983078, + "learning_rate": 8.409919438318557e-05, + "loss": 0.2591, + "step": 30690 + }, + { + "epoch": 2.486309138042774, + "grad_norm": 0.07757656276226044, + "learning_rate": 8.409469373059093e-05, + "loss": 0.3049, + "step": 30691 + }, + { + "epoch": 2.4863901490602722, + "grad_norm": 0.06839792430400848, + "learning_rate": 8.409019307799632e-05, + "loss": 0.2417, + "step": 30692 + }, + { + "epoch": 2.4864711600777705, + "grad_norm": 0.07726556062698364, + "learning_rate": 8.408569242540169e-05, + "loss": 0.2267, + "step": 30693 + }, + { + "epoch": 2.486552171095269, + "grad_norm": 0.0668526440858841, + "learning_rate": 8.408119177280706e-05, + "loss": 0.232, + "step": 30694 + }, + { + "epoch": 2.4866331821127674, + "grad_norm": 0.05855938419699669, + "learning_rate": 8.407669112021244e-05, + "loss": 0.2572, + "step": 30695 + }, + { + "epoch": 2.4867141931302656, + "grad_norm": 0.07574159651994705, + "learning_rate": 8.407219046761781e-05, + "loss": 0.2189, + "step": 30696 + }, + { + "epoch": 2.4867952041477643, + "grad_norm": 0.07984759658575058, + "learning_rate": 8.406768981502318e-05, + "loss": 0.2623, + "step": 30697 + }, + { + "epoch": 2.4868762151652626, + "grad_norm": 0.06058523803949356, + "learning_rate": 8.406318916242856e-05, + "loss": 
0.2509, + "step": 30698 + }, + { + "epoch": 2.486957226182761, + "grad_norm": 0.07417916506528854, + "learning_rate": 8.405868850983393e-05, + "loss": 0.2528, + "step": 30699 + }, + { + "epoch": 2.487038237200259, + "grad_norm": 0.07533387839794159, + "learning_rate": 8.40541878572393e-05, + "loss": 0.2253, + "step": 30700 + }, + { + "epoch": 2.4871192482177578, + "grad_norm": 0.06526943296194077, + "learning_rate": 8.404968720464468e-05, + "loss": 0.2292, + "step": 30701 + }, + { + "epoch": 2.487200259235256, + "grad_norm": 0.08730722218751907, + "learning_rate": 8.404518655205005e-05, + "loss": 0.2835, + "step": 30702 + }, + { + "epoch": 2.4872812702527543, + "grad_norm": 0.061956096440553665, + "learning_rate": 8.404068589945543e-05, + "loss": 0.2447, + "step": 30703 + }, + { + "epoch": 2.487362281270253, + "grad_norm": 0.05448000505566597, + "learning_rate": 8.40361852468608e-05, + "loss": 0.2064, + "step": 30704 + }, + { + "epoch": 2.487443292287751, + "grad_norm": 0.061836034059524536, + "learning_rate": 8.403168459426617e-05, + "loss": 0.2299, + "step": 30705 + }, + { + "epoch": 2.4875243033052494, + "grad_norm": 0.04638923332095146, + "learning_rate": 8.402718394167155e-05, + "loss": 0.2057, + "step": 30706 + }, + { + "epoch": 2.4876053143227477, + "grad_norm": 0.0720670148730278, + "learning_rate": 8.402268328907692e-05, + "loss": 0.2222, + "step": 30707 + }, + { + "epoch": 2.4876863253402464, + "grad_norm": 0.06310757249593735, + "learning_rate": 8.40181826364823e-05, + "loss": 0.2407, + "step": 30708 + }, + { + "epoch": 2.4877673363577446, + "grad_norm": 0.07061705738306046, + "learning_rate": 8.401368198388767e-05, + "loss": 0.2506, + "step": 30709 + }, + { + "epoch": 2.487848347375243, + "grad_norm": 0.07268223911523819, + "learning_rate": 8.400918133129304e-05, + "loss": 0.2372, + "step": 30710 + }, + { + "epoch": 2.4879293583927415, + "grad_norm": 0.06775526702404022, + "learning_rate": 8.400468067869841e-05, + "loss": 0.2453, + "step": 30711 + }, + { + "epoch": 2.48801036941024, + "grad_norm": 0.06613241136074066, + "learning_rate": 8.400018002610379e-05, + "loss": 0.2715, + "step": 30712 + }, + { + "epoch": 2.488091380427738, + "grad_norm": 0.05558057501912117, + "learning_rate": 8.399567937350916e-05, + "loss": 0.2134, + "step": 30713 + }, + { + "epoch": 2.4881723914452367, + "grad_norm": 0.06692616641521454, + "learning_rate": 8.399117872091454e-05, + "loss": 0.2437, + "step": 30714 + }, + { + "epoch": 2.488253402462735, + "grad_norm": 0.05806124210357666, + "learning_rate": 8.398667806831991e-05, + "loss": 0.2347, + "step": 30715 + }, + { + "epoch": 2.488334413480233, + "grad_norm": 0.0691765546798706, + "learning_rate": 8.398217741572528e-05, + "loss": 0.2374, + "step": 30716 + }, + { + "epoch": 2.488415424497732, + "grad_norm": 0.08031977713108063, + "learning_rate": 8.397767676313066e-05, + "loss": 0.2315, + "step": 30717 + }, + { + "epoch": 2.48849643551523, + "grad_norm": 0.06622499972581863, + "learning_rate": 8.397317611053603e-05, + "loss": 0.2299, + "step": 30718 + }, + { + "epoch": 2.4885774465327284, + "grad_norm": 0.07046611607074738, + "learning_rate": 8.39686754579414e-05, + "loss": 0.2726, + "step": 30719 + }, + { + "epoch": 2.488658457550227, + "grad_norm": 0.0632859617471695, + "learning_rate": 8.396417480534679e-05, + "loss": 0.2554, + "step": 30720 + }, + { + "epoch": 2.4887394685677253, + "grad_norm": 0.06783681362867355, + "learning_rate": 8.395967415275215e-05, + "loss": 0.253, + "step": 30721 + }, + { + "epoch": 2.4888204795852236, + "grad_norm": 
0.06444837898015976, + "learning_rate": 8.395517350015752e-05, + "loss": 0.2713, + "step": 30722 + }, + { + "epoch": 2.488901490602722, + "grad_norm": 0.06186733394861221, + "learning_rate": 8.395067284756291e-05, + "loss": 0.246, + "step": 30723 + }, + { + "epoch": 2.4889825016202205, + "grad_norm": 0.06374796479940414, + "learning_rate": 8.394617219496827e-05, + "loss": 0.2104, + "step": 30724 + }, + { + "epoch": 2.4890635126377187, + "grad_norm": 0.05994171276688576, + "learning_rate": 8.394167154237364e-05, + "loss": 0.2266, + "step": 30725 + }, + { + "epoch": 2.489144523655217, + "grad_norm": 0.06336595863103867, + "learning_rate": 8.393717088977903e-05, + "loss": 0.2484, + "step": 30726 + }, + { + "epoch": 2.4892255346727157, + "grad_norm": 0.07264426350593567, + "learning_rate": 8.393267023718439e-05, + "loss": 0.2569, + "step": 30727 + }, + { + "epoch": 2.489306545690214, + "grad_norm": 0.05867601931095123, + "learning_rate": 8.392816958458977e-05, + "loss": 0.2649, + "step": 30728 + }, + { + "epoch": 2.489387556707712, + "grad_norm": 0.06587252020835876, + "learning_rate": 8.392366893199515e-05, + "loss": 0.24, + "step": 30729 + }, + { + "epoch": 2.4894685677252104, + "grad_norm": 0.0639888271689415, + "learning_rate": 8.391916827940051e-05, + "loss": 0.2455, + "step": 30730 + }, + { + "epoch": 2.489549578742709, + "grad_norm": 0.05863109230995178, + "learning_rate": 8.391466762680589e-05, + "loss": 0.2364, + "step": 30731 + }, + { + "epoch": 2.4896305897602073, + "grad_norm": 0.05232294648885727, + "learning_rate": 8.391016697421127e-05, + "loss": 0.2337, + "step": 30732 + }, + { + "epoch": 2.4897116007777056, + "grad_norm": 0.07216206192970276, + "learning_rate": 8.390566632161663e-05, + "loss": 0.2358, + "step": 30733 + }, + { + "epoch": 2.4897926117952043, + "grad_norm": 0.06885205954313278, + "learning_rate": 8.390116566902201e-05, + "loss": 0.2556, + "step": 30734 + }, + { + "epoch": 2.4898736228127025, + "grad_norm": 0.08514875918626785, + "learning_rate": 8.38966650164274e-05, + "loss": 0.2399, + "step": 30735 + }, + { + "epoch": 2.4899546338302008, + "grad_norm": 0.0638003721833229, + "learning_rate": 8.389216436383275e-05, + "loss": 0.2225, + "step": 30736 + }, + { + "epoch": 2.4900356448476995, + "grad_norm": 0.05131521448493004, + "learning_rate": 8.388766371123813e-05, + "loss": 0.2403, + "step": 30737 + }, + { + "epoch": 2.4901166558651977, + "grad_norm": 0.06556036323308945, + "learning_rate": 8.388316305864352e-05, + "loss": 0.2427, + "step": 30738 + }, + { + "epoch": 2.490197666882696, + "grad_norm": 0.06367666274309158, + "learning_rate": 8.387866240604888e-05, + "loss": 0.2795, + "step": 30739 + }, + { + "epoch": 2.4902786779001946, + "grad_norm": 0.07163514196872711, + "learning_rate": 8.387416175345425e-05, + "loss": 0.285, + "step": 30740 + }, + { + "epoch": 2.490359688917693, + "grad_norm": 0.07182058691978455, + "learning_rate": 8.386966110085964e-05, + "loss": 0.2071, + "step": 30741 + }, + { + "epoch": 2.490440699935191, + "grad_norm": 0.07572603970766068, + "learning_rate": 8.3865160448265e-05, + "loss": 0.2247, + "step": 30742 + }, + { + "epoch": 2.4905217109526894, + "grad_norm": 0.07965695858001709, + "learning_rate": 8.386065979567037e-05, + "loss": 0.2605, + "step": 30743 + }, + { + "epoch": 2.490602721970188, + "grad_norm": 0.06812844425439835, + "learning_rate": 8.385615914307576e-05, + "loss": 0.2173, + "step": 30744 + }, + { + "epoch": 2.4906837329876863, + "grad_norm": 0.059363484382629395, + "learning_rate": 8.385165849048112e-05, + "loss": 
0.213, + "step": 30745 + }, + { + "epoch": 2.4907647440051845, + "grad_norm": 0.05940215289592743, + "learning_rate": 8.38471578378865e-05, + "loss": 0.2231, + "step": 30746 + }, + { + "epoch": 2.4908457550226832, + "grad_norm": 0.0675797238945961, + "learning_rate": 8.384265718529188e-05, + "loss": 0.2375, + "step": 30747 + }, + { + "epoch": 2.4909267660401815, + "grad_norm": 0.07645593583583832, + "learning_rate": 8.383815653269724e-05, + "loss": 0.2662, + "step": 30748 + }, + { + "epoch": 2.4910077770576797, + "grad_norm": 0.06828924268484116, + "learning_rate": 8.383365588010262e-05, + "loss": 0.2422, + "step": 30749 + }, + { + "epoch": 2.4910887880751784, + "grad_norm": 0.05796194076538086, + "learning_rate": 8.3829155227508e-05, + "loss": 0.2714, + "step": 30750 + }, + { + "epoch": 2.4911697990926767, + "grad_norm": 0.05657875910401344, + "learning_rate": 8.382465457491336e-05, + "loss": 0.2778, + "step": 30751 + }, + { + "epoch": 2.491250810110175, + "grad_norm": 0.06976671516895294, + "learning_rate": 8.382015392231875e-05, + "loss": 0.2602, + "step": 30752 + }, + { + "epoch": 2.491331821127673, + "grad_norm": 0.0654015988111496, + "learning_rate": 8.381565326972412e-05, + "loss": 0.2255, + "step": 30753 + }, + { + "epoch": 2.491412832145172, + "grad_norm": 0.07128993421792984, + "learning_rate": 8.381115261712948e-05, + "loss": 0.2695, + "step": 30754 + }, + { + "epoch": 2.49149384316267, + "grad_norm": 0.06543438136577606, + "learning_rate": 8.380665196453487e-05, + "loss": 0.2221, + "step": 30755 + }, + { + "epoch": 2.4915748541801683, + "grad_norm": 0.06640952825546265, + "learning_rate": 8.380215131194024e-05, + "loss": 0.2306, + "step": 30756 + }, + { + "epoch": 2.491655865197667, + "grad_norm": 0.07250026613473892, + "learning_rate": 8.37976506593456e-05, + "loss": 0.2375, + "step": 30757 + }, + { + "epoch": 2.4917368762151653, + "grad_norm": 0.07018820941448212, + "learning_rate": 8.379315000675099e-05, + "loss": 0.2903, + "step": 30758 + }, + { + "epoch": 2.4918178872326635, + "grad_norm": 0.08030076324939728, + "learning_rate": 8.378864935415636e-05, + "loss": 0.2615, + "step": 30759 + }, + { + "epoch": 2.491898898250162, + "grad_norm": 0.08757913112640381, + "learning_rate": 8.378414870156172e-05, + "loss": 0.2731, + "step": 30760 + }, + { + "epoch": 2.4919799092676604, + "grad_norm": 0.08045205473899841, + "learning_rate": 8.377964804896711e-05, + "loss": 0.2832, + "step": 30761 + }, + { + "epoch": 2.4920609202851587, + "grad_norm": 0.06057002395391464, + "learning_rate": 8.377514739637248e-05, + "loss": 0.2432, + "step": 30762 + }, + { + "epoch": 2.4921419313026574, + "grad_norm": 0.05765476077795029, + "learning_rate": 8.377064674377784e-05, + "loss": 0.247, + "step": 30763 + }, + { + "epoch": 2.4922229423201556, + "grad_norm": 0.056977640837430954, + "learning_rate": 8.376614609118323e-05, + "loss": 0.2014, + "step": 30764 + }, + { + "epoch": 2.492303953337654, + "grad_norm": 0.07265925407409668, + "learning_rate": 8.37616454385886e-05, + "loss": 0.2569, + "step": 30765 + }, + { + "epoch": 2.492384964355152, + "grad_norm": 0.06928717344999313, + "learning_rate": 8.375714478599396e-05, + "loss": 0.2234, + "step": 30766 + }, + { + "epoch": 2.492465975372651, + "grad_norm": 0.06462343037128448, + "learning_rate": 8.375264413339935e-05, + "loss": 0.2669, + "step": 30767 + }, + { + "epoch": 2.492546986390149, + "grad_norm": 0.08052459359169006, + "learning_rate": 8.374814348080472e-05, + "loss": 0.236, + "step": 30768 + }, + { + "epoch": 2.4926279974076473, + "grad_norm": 
0.06301391124725342, + "learning_rate": 8.374364282821008e-05, + "loss": 0.2311, + "step": 30769 + }, + { + "epoch": 2.492709008425146, + "grad_norm": 0.077354796230793, + "learning_rate": 8.373914217561547e-05, + "loss": 0.2357, + "step": 30770 + }, + { + "epoch": 2.4927900194426442, + "grad_norm": 0.06379301846027374, + "learning_rate": 8.373464152302084e-05, + "loss": 0.2072, + "step": 30771 + }, + { + "epoch": 2.4928710304601425, + "grad_norm": 0.06881890445947647, + "learning_rate": 8.373014087042622e-05, + "loss": 0.248, + "step": 30772 + }, + { + "epoch": 2.4929520414776407, + "grad_norm": 0.0689517930150032, + "learning_rate": 8.372564021783159e-05, + "loss": 0.2441, + "step": 30773 + }, + { + "epoch": 2.4930330524951394, + "grad_norm": 0.05295158922672272, + "learning_rate": 8.372113956523696e-05, + "loss": 0.2357, + "step": 30774 + }, + { + "epoch": 2.4931140635126376, + "grad_norm": 0.06985612958669662, + "learning_rate": 8.371663891264234e-05, + "loss": 0.2318, + "step": 30775 + }, + { + "epoch": 2.493195074530136, + "grad_norm": 0.0848032534122467, + "learning_rate": 8.371213826004771e-05, + "loss": 0.2192, + "step": 30776 + }, + { + "epoch": 2.4932760855476346, + "grad_norm": 0.06986959278583527, + "learning_rate": 8.370763760745309e-05, + "loss": 0.2328, + "step": 30777 + }, + { + "epoch": 2.493357096565133, + "grad_norm": 0.07506789267063141, + "learning_rate": 8.370313695485846e-05, + "loss": 0.2663, + "step": 30778 + }, + { + "epoch": 2.493438107582631, + "grad_norm": 0.07147037237882614, + "learning_rate": 8.369863630226383e-05, + "loss": 0.2346, + "step": 30779 + }, + { + "epoch": 2.4935191186001298, + "grad_norm": 0.06762287020683289, + "learning_rate": 8.36941356496692e-05, + "loss": 0.2223, + "step": 30780 + }, + { + "epoch": 2.493600129617628, + "grad_norm": 0.0584288164973259, + "learning_rate": 8.368963499707458e-05, + "loss": 0.2497, + "step": 30781 + }, + { + "epoch": 2.4936811406351262, + "grad_norm": 0.07436443120241165, + "learning_rate": 8.368513434447995e-05, + "loss": 0.2495, + "step": 30782 + }, + { + "epoch": 2.493762151652625, + "grad_norm": 0.06659824401140213, + "learning_rate": 8.368063369188533e-05, + "loss": 0.2361, + "step": 30783 + }, + { + "epoch": 2.493843162670123, + "grad_norm": 0.07718595117330551, + "learning_rate": 8.36761330392907e-05, + "loss": 0.2793, + "step": 30784 + }, + { + "epoch": 2.4939241736876214, + "grad_norm": 0.06469152122735977, + "learning_rate": 8.367163238669607e-05, + "loss": 0.2164, + "step": 30785 + }, + { + "epoch": 2.49400518470512, + "grad_norm": 0.07271499186754227, + "learning_rate": 8.366713173410145e-05, + "loss": 0.2249, + "step": 30786 + }, + { + "epoch": 2.4940861957226184, + "grad_norm": 0.06968480348587036, + "learning_rate": 8.366263108150682e-05, + "loss": 0.1998, + "step": 30787 + }, + { + "epoch": 2.4941672067401166, + "grad_norm": 0.05411296710371971, + "learning_rate": 8.36581304289122e-05, + "loss": 0.2202, + "step": 30788 + }, + { + "epoch": 2.494248217757615, + "grad_norm": 0.08066213130950928, + "learning_rate": 8.365362977631757e-05, + "loss": 0.2692, + "step": 30789 + }, + { + "epoch": 2.4943292287751135, + "grad_norm": 0.07861744612455368, + "learning_rate": 8.364912912372294e-05, + "loss": 0.2448, + "step": 30790 + }, + { + "epoch": 2.494410239792612, + "grad_norm": 0.06976334750652313, + "learning_rate": 8.364462847112832e-05, + "loss": 0.2366, + "step": 30791 + }, + { + "epoch": 2.49449125081011, + "grad_norm": 0.06461894512176514, + "learning_rate": 8.364012781853369e-05, + "loss": 0.2481, 
+ "step": 30792 + }, + { + "epoch": 2.4945722618276087, + "grad_norm": 0.08025755733251572, + "learning_rate": 8.363562716593906e-05, + "loss": 0.2452, + "step": 30793 + }, + { + "epoch": 2.494653272845107, + "grad_norm": 0.061761967837810516, + "learning_rate": 8.363112651334444e-05, + "loss": 0.2833, + "step": 30794 + }, + { + "epoch": 2.494734283862605, + "grad_norm": 0.0678744688630104, + "learning_rate": 8.362662586074981e-05, + "loss": 0.2691, + "step": 30795 + }, + { + "epoch": 2.4948152948801035, + "grad_norm": 0.05866160988807678, + "learning_rate": 8.362212520815518e-05, + "loss": 0.2171, + "step": 30796 + }, + { + "epoch": 2.494896305897602, + "grad_norm": 0.04981419816613197, + "learning_rate": 8.361762455556056e-05, + "loss": 0.2361, + "step": 30797 + }, + { + "epoch": 2.4949773169151004, + "grad_norm": 0.059023331850767136, + "learning_rate": 8.361312390296593e-05, + "loss": 0.2385, + "step": 30798 + }, + { + "epoch": 2.4950583279325986, + "grad_norm": 0.0639590322971344, + "learning_rate": 8.36086232503713e-05, + "loss": 0.236, + "step": 30799 + }, + { + "epoch": 2.4951393389500973, + "grad_norm": 0.06808973848819733, + "learning_rate": 8.360412259777668e-05, + "loss": 0.2893, + "step": 30800 + }, + { + "epoch": 2.4952203499675956, + "grad_norm": 0.07416072487831116, + "learning_rate": 8.359962194518207e-05, + "loss": 0.2832, + "step": 30801 + }, + { + "epoch": 2.495301360985094, + "grad_norm": 0.0650864839553833, + "learning_rate": 8.359512129258743e-05, + "loss": 0.2546, + "step": 30802 + }, + { + "epoch": 2.4953823720025925, + "grad_norm": 0.08762753009796143, + "learning_rate": 8.35906206399928e-05, + "loss": 0.2987, + "step": 30803 + }, + { + "epoch": 2.4954633830200907, + "grad_norm": 0.061661556363105774, + "learning_rate": 8.358611998739819e-05, + "loss": 0.2546, + "step": 30804 + }, + { + "epoch": 2.495544394037589, + "grad_norm": 0.06270502507686615, + "learning_rate": 8.358161933480355e-05, + "loss": 0.2302, + "step": 30805 + }, + { + "epoch": 2.4956254050550877, + "grad_norm": 0.07116597890853882, + "learning_rate": 8.357711868220892e-05, + "loss": 0.2538, + "step": 30806 + }, + { + "epoch": 2.495706416072586, + "grad_norm": 0.06109967827796936, + "learning_rate": 8.357261802961431e-05, + "loss": 0.2781, + "step": 30807 + }, + { + "epoch": 2.495787427090084, + "grad_norm": 0.0619681179523468, + "learning_rate": 8.356811737701967e-05, + "loss": 0.2463, + "step": 30808 + }, + { + "epoch": 2.495868438107583, + "grad_norm": 0.051027651876211166, + "learning_rate": 8.356361672442504e-05, + "loss": 0.2279, + "step": 30809 + }, + { + "epoch": 2.495949449125081, + "grad_norm": 0.07194460928440094, + "learning_rate": 8.355911607183043e-05, + "loss": 0.2074, + "step": 30810 + }, + { + "epoch": 2.4960304601425793, + "grad_norm": 0.051879867911338806, + "learning_rate": 8.355461541923579e-05, + "loss": 0.2447, + "step": 30811 + }, + { + "epoch": 2.4961114711600776, + "grad_norm": 0.06579706072807312, + "learning_rate": 8.355011476664116e-05, + "loss": 0.293, + "step": 30812 + }, + { + "epoch": 2.4961924821775763, + "grad_norm": 0.06795860826969147, + "learning_rate": 8.354561411404655e-05, + "loss": 0.2648, + "step": 30813 + }, + { + "epoch": 2.4962734931950745, + "grad_norm": 0.06433531641960144, + "learning_rate": 8.354111346145191e-05, + "loss": 0.2001, + "step": 30814 + }, + { + "epoch": 2.4963545042125728, + "grad_norm": 0.07233475148677826, + "learning_rate": 8.353661280885728e-05, + "loss": 0.284, + "step": 30815 + }, + { + "epoch": 2.4964355152300715, + "grad_norm": 
0.06656422466039658, + "learning_rate": 8.353211215626267e-05, + "loss": 0.2203, + "step": 30816 + }, + { + "epoch": 2.4965165262475697, + "grad_norm": 0.07233376801013947, + "learning_rate": 8.352761150366803e-05, + "loss": 0.2488, + "step": 30817 + }, + { + "epoch": 2.496597537265068, + "grad_norm": 0.05535171553492546, + "learning_rate": 8.35231108510734e-05, + "loss": 0.2426, + "step": 30818 + }, + { + "epoch": 2.496678548282566, + "grad_norm": 0.06830519437789917, + "learning_rate": 8.351861019847879e-05, + "loss": 0.2119, + "step": 30819 + }, + { + "epoch": 2.496759559300065, + "grad_norm": 0.05747649446129799, + "learning_rate": 8.351410954588415e-05, + "loss": 0.1943, + "step": 30820 + }, + { + "epoch": 2.496840570317563, + "grad_norm": 0.06299077719449997, + "learning_rate": 8.350960889328952e-05, + "loss": 0.2415, + "step": 30821 + }, + { + "epoch": 2.4969215813350614, + "grad_norm": 0.06807470321655273, + "learning_rate": 8.350510824069491e-05, + "loss": 0.2317, + "step": 30822 + }, + { + "epoch": 2.49700259235256, + "grad_norm": 0.07347835600376129, + "learning_rate": 8.350060758810027e-05, + "loss": 0.2706, + "step": 30823 + }, + { + "epoch": 2.4970836033700583, + "grad_norm": 0.06623311340808868, + "learning_rate": 8.349610693550565e-05, + "loss": 0.2308, + "step": 30824 + }, + { + "epoch": 2.4971646143875565, + "grad_norm": 0.065008744597435, + "learning_rate": 8.349160628291103e-05, + "loss": 0.243, + "step": 30825 + }, + { + "epoch": 2.4972456254050552, + "grad_norm": 0.0647854432463646, + "learning_rate": 8.348710563031639e-05, + "loss": 0.2314, + "step": 30826 + }, + { + "epoch": 2.4973266364225535, + "grad_norm": 0.07481033354997635, + "learning_rate": 8.348260497772178e-05, + "loss": 0.2432, + "step": 30827 + }, + { + "epoch": 2.4974076474400517, + "grad_norm": 0.06997375190258026, + "learning_rate": 8.347810432512715e-05, + "loss": 0.2522, + "step": 30828 + }, + { + "epoch": 2.4974886584575504, + "grad_norm": 0.06547009944915771, + "learning_rate": 8.347360367253251e-05, + "loss": 0.2632, + "step": 30829 + }, + { + "epoch": 2.4975696694750487, + "grad_norm": 0.06076023727655411, + "learning_rate": 8.34691030199379e-05, + "loss": 0.2494, + "step": 30830 + }, + { + "epoch": 2.497650680492547, + "grad_norm": 0.07652776688337326, + "learning_rate": 8.346460236734327e-05, + "loss": 0.261, + "step": 30831 + }, + { + "epoch": 2.4977316915100456, + "grad_norm": 0.05655470862984657, + "learning_rate": 8.346010171474863e-05, + "loss": 0.2281, + "step": 30832 + }, + { + "epoch": 2.497812702527544, + "grad_norm": 0.06015842780470848, + "learning_rate": 8.345560106215402e-05, + "loss": 0.2345, + "step": 30833 + }, + { + "epoch": 2.497893713545042, + "grad_norm": 0.06424501538276672, + "learning_rate": 8.34511004095594e-05, + "loss": 0.2638, + "step": 30834 + }, + { + "epoch": 2.4979747245625403, + "grad_norm": 0.07210192084312439, + "learning_rate": 8.344659975696475e-05, + "loss": 0.252, + "step": 30835 + }, + { + "epoch": 2.498055735580039, + "grad_norm": 0.09136870503425598, + "learning_rate": 8.344209910437014e-05, + "loss": 0.2684, + "step": 30836 + }, + { + "epoch": 2.4981367465975373, + "grad_norm": 0.05969814956188202, + "learning_rate": 8.343759845177552e-05, + "loss": 0.2264, + "step": 30837 + }, + { + "epoch": 2.4982177576150355, + "grad_norm": 0.07652722299098969, + "learning_rate": 8.343309779918088e-05, + "loss": 0.2453, + "step": 30838 + }, + { + "epoch": 2.498298768632534, + "grad_norm": 0.05736469849944115, + "learning_rate": 8.342859714658626e-05, + "loss": 
0.243, + "step": 30839 + }, + { + "epoch": 2.4983797796500324, + "grad_norm": 0.061904434114694595, + "learning_rate": 8.342409649399164e-05, + "loss": 0.2392, + "step": 30840 + }, + { + "epoch": 2.4984607906675307, + "grad_norm": 0.07190416753292084, + "learning_rate": 8.3419595841397e-05, + "loss": 0.2707, + "step": 30841 + }, + { + "epoch": 2.498541801685029, + "grad_norm": 0.08621451258659363, + "learning_rate": 8.341509518880238e-05, + "loss": 0.248, + "step": 30842 + }, + { + "epoch": 2.4986228127025276, + "grad_norm": 0.0821828618645668, + "learning_rate": 8.341059453620776e-05, + "loss": 0.2222, + "step": 30843 + }, + { + "epoch": 2.498703823720026, + "grad_norm": 0.05787357687950134, + "learning_rate": 8.340609388361312e-05, + "loss": 0.2421, + "step": 30844 + }, + { + "epoch": 2.498784834737524, + "grad_norm": 0.05896326154470444, + "learning_rate": 8.34015932310185e-05, + "loss": 0.2403, + "step": 30845 + }, + { + "epoch": 2.498865845755023, + "grad_norm": 0.059107761830091476, + "learning_rate": 8.339709257842388e-05, + "loss": 0.2261, + "step": 30846 + }, + { + "epoch": 2.498946856772521, + "grad_norm": 0.06375333666801453, + "learning_rate": 8.339259192582924e-05, + "loss": 0.2294, + "step": 30847 + }, + { + "epoch": 2.4990278677900193, + "grad_norm": 0.07359378039836884, + "learning_rate": 8.338809127323463e-05, + "loss": 0.2555, + "step": 30848 + }, + { + "epoch": 2.499108878807518, + "grad_norm": 0.06330002844333649, + "learning_rate": 8.338359062064e-05, + "loss": 0.2664, + "step": 30849 + }, + { + "epoch": 2.499189889825016, + "grad_norm": 0.06739085167646408, + "learning_rate": 8.337908996804536e-05, + "loss": 0.2217, + "step": 30850 + }, + { + "epoch": 2.4992709008425145, + "grad_norm": 0.07190203666687012, + "learning_rate": 8.337458931545075e-05, + "loss": 0.2355, + "step": 30851 + }, + { + "epoch": 2.499351911860013, + "grad_norm": 0.06133474037051201, + "learning_rate": 8.337008866285612e-05, + "loss": 0.1959, + "step": 30852 + }, + { + "epoch": 2.4994329228775114, + "grad_norm": 0.0660034790635109, + "learning_rate": 8.336558801026149e-05, + "loss": 0.2614, + "step": 30853 + }, + { + "epoch": 2.4995139338950096, + "grad_norm": 0.06458307802677155, + "learning_rate": 8.336108735766687e-05, + "loss": 0.262, + "step": 30854 + }, + { + "epoch": 2.4995949449125083, + "grad_norm": 0.06402486562728882, + "learning_rate": 8.335658670507224e-05, + "loss": 0.2258, + "step": 30855 + }, + { + "epoch": 2.4996759559300066, + "grad_norm": 0.07437600940465927, + "learning_rate": 8.335208605247761e-05, + "loss": 0.2691, + "step": 30856 + }, + { + "epoch": 2.499756966947505, + "grad_norm": 0.08359698206186295, + "learning_rate": 8.334758539988299e-05, + "loss": 0.2653, + "step": 30857 + }, + { + "epoch": 2.499837977965003, + "grad_norm": 0.07374858856201172, + "learning_rate": 8.334308474728836e-05, + "loss": 0.2155, + "step": 30858 + }, + { + "epoch": 2.4999189889825018, + "grad_norm": 0.06222660467028618, + "learning_rate": 8.333858409469373e-05, + "loss": 0.207, + "step": 30859 + }, + { + "epoch": 2.5, + "grad_norm": 0.05293847993016243, + "learning_rate": 8.333408344209911e-05, + "loss": 0.2201, + "step": 30860 + }, + { + "epoch": 2.5000810110174982, + "grad_norm": 0.07100159674882889, + "learning_rate": 8.332958278950448e-05, + "loss": 0.2418, + "step": 30861 + }, + { + "epoch": 2.5001620220349965, + "grad_norm": 0.06915083527565002, + "learning_rate": 8.332508213690986e-05, + "loss": 0.2505, + "step": 30862 + }, + { + "epoch": 2.500243033052495, + "grad_norm": 
0.07485393434762955, + "learning_rate": 8.332058148431523e-05, + "loss": 0.2725, + "step": 30863 + }, + { + "epoch": 2.5003240440699934, + "grad_norm": 0.06454658508300781, + "learning_rate": 8.33160808317206e-05, + "loss": 0.241, + "step": 30864 + }, + { + "epoch": 2.5004050550874917, + "grad_norm": 0.07296113669872284, + "learning_rate": 8.331158017912598e-05, + "loss": 0.2803, + "step": 30865 + }, + { + "epoch": 2.5004860661049904, + "grad_norm": 0.0719156339764595, + "learning_rate": 8.330707952653135e-05, + "loss": 0.2439, + "step": 30866 + }, + { + "epoch": 2.5005670771224886, + "grad_norm": 0.06684891879558563, + "learning_rate": 8.330257887393672e-05, + "loss": 0.2381, + "step": 30867 + }, + { + "epoch": 2.500648088139987, + "grad_norm": 0.06705359369516373, + "learning_rate": 8.32980782213421e-05, + "loss": 0.2667, + "step": 30868 + }, + { + "epoch": 2.5007290991574855, + "grad_norm": 0.06898030638694763, + "learning_rate": 8.329357756874747e-05, + "loss": 0.238, + "step": 30869 + }, + { + "epoch": 2.500810110174984, + "grad_norm": 0.07460542023181915, + "learning_rate": 8.328907691615284e-05, + "loss": 0.2264, + "step": 30870 + }, + { + "epoch": 2.500891121192482, + "grad_norm": 0.06566330790519714, + "learning_rate": 8.328457626355822e-05, + "loss": 0.2231, + "step": 30871 + }, + { + "epoch": 2.5009721322099807, + "grad_norm": 0.06869935244321823, + "learning_rate": 8.328007561096359e-05, + "loss": 0.2458, + "step": 30872 + }, + { + "epoch": 2.501053143227479, + "grad_norm": 0.08810488879680634, + "learning_rate": 8.327557495836897e-05, + "loss": 0.2617, + "step": 30873 + }, + { + "epoch": 2.501134154244977, + "grad_norm": 0.06041054055094719, + "learning_rate": 8.327107430577434e-05, + "loss": 0.2252, + "step": 30874 + }, + { + "epoch": 2.501215165262476, + "grad_norm": 0.0637449398636818, + "learning_rate": 8.326657365317971e-05, + "loss": 0.2395, + "step": 30875 + }, + { + "epoch": 2.501296176279974, + "grad_norm": 0.0678592324256897, + "learning_rate": 8.326207300058509e-05, + "loss": 0.2409, + "step": 30876 + }, + { + "epoch": 2.5013771872974724, + "grad_norm": 0.06725373864173889, + "learning_rate": 8.325757234799046e-05, + "loss": 0.2309, + "step": 30877 + }, + { + "epoch": 2.501458198314971, + "grad_norm": 0.050262413918972015, + "learning_rate": 8.325307169539583e-05, + "loss": 0.2047, + "step": 30878 + }, + { + "epoch": 2.5015392093324693, + "grad_norm": 0.07753513753414154, + "learning_rate": 8.324857104280122e-05, + "loss": 0.2693, + "step": 30879 + }, + { + "epoch": 2.5016202203499676, + "grad_norm": 0.05689910426735878, + "learning_rate": 8.324407039020658e-05, + "loss": 0.2371, + "step": 30880 + }, + { + "epoch": 2.5017012313674662, + "grad_norm": 0.06461703777313232, + "learning_rate": 8.323956973761195e-05, + "loss": 0.1844, + "step": 30881 + }, + { + "epoch": 2.5017822423849645, + "grad_norm": 0.07810285687446594, + "learning_rate": 8.323506908501734e-05, + "loss": 0.2965, + "step": 30882 + }, + { + "epoch": 2.5018632534024627, + "grad_norm": 0.06737931817770004, + "learning_rate": 8.32305684324227e-05, + "loss": 0.2273, + "step": 30883 + }, + { + "epoch": 2.501944264419961, + "grad_norm": 0.06584341824054718, + "learning_rate": 8.322606777982807e-05, + "loss": 0.243, + "step": 30884 + }, + { + "epoch": 2.5020252754374592, + "grad_norm": 0.061913296580314636, + "learning_rate": 8.322156712723346e-05, + "loss": 0.215, + "step": 30885 + }, + { + "epoch": 2.502106286454958, + "grad_norm": 0.08903798460960388, + "learning_rate": 8.321706647463882e-05, + "loss": 
0.2401, + "step": 30886 + }, + { + "epoch": 2.502187297472456, + "grad_norm": 0.05823826417326927, + "learning_rate": 8.32125658220442e-05, + "loss": 0.2282, + "step": 30887 + }, + { + "epoch": 2.5022683084899544, + "grad_norm": 0.06540390849113464, + "learning_rate": 8.320806516944958e-05, + "loss": 0.2232, + "step": 30888 + }, + { + "epoch": 2.502349319507453, + "grad_norm": 0.07079609483480453, + "learning_rate": 8.320356451685494e-05, + "loss": 0.273, + "step": 30889 + }, + { + "epoch": 2.5024303305249513, + "grad_norm": 0.06377361714839935, + "learning_rate": 8.319906386426032e-05, + "loss": 0.2508, + "step": 30890 + }, + { + "epoch": 2.5025113415424496, + "grad_norm": 0.05838244780898094, + "learning_rate": 8.31945632116657e-05, + "loss": 0.243, + "step": 30891 + }, + { + "epoch": 2.5025923525599483, + "grad_norm": 0.0708763375878334, + "learning_rate": 8.319006255907106e-05, + "loss": 0.2257, + "step": 30892 + }, + { + "epoch": 2.5026733635774465, + "grad_norm": 0.05726628750562668, + "learning_rate": 8.318556190647644e-05, + "loss": 0.233, + "step": 30893 + }, + { + "epoch": 2.5027543745949448, + "grad_norm": 0.08269777148962021, + "learning_rate": 8.318106125388182e-05, + "loss": 0.2664, + "step": 30894 + }, + { + "epoch": 2.5028353856124435, + "grad_norm": 0.0771472156047821, + "learning_rate": 8.317656060128718e-05, + "loss": 0.2415, + "step": 30895 + }, + { + "epoch": 2.5029163966299417, + "grad_norm": 0.0826282948255539, + "learning_rate": 8.317205994869256e-05, + "loss": 0.2691, + "step": 30896 + }, + { + "epoch": 2.50299740764744, + "grad_norm": 0.05811542645096779, + "learning_rate": 8.316755929609795e-05, + "loss": 0.2405, + "step": 30897 + }, + { + "epoch": 2.5030784186649386, + "grad_norm": 0.06644046306610107, + "learning_rate": 8.31630586435033e-05, + "loss": 0.2587, + "step": 30898 + }, + { + "epoch": 2.503159429682437, + "grad_norm": 0.06532420963048935, + "learning_rate": 8.315855799090868e-05, + "loss": 0.2284, + "step": 30899 + }, + { + "epoch": 2.503240440699935, + "grad_norm": 0.08071818947792053, + "learning_rate": 8.315405733831407e-05, + "loss": 0.2254, + "step": 30900 + }, + { + "epoch": 2.503321451717434, + "grad_norm": 0.06762257218360901, + "learning_rate": 8.314955668571943e-05, + "loss": 0.2662, + "step": 30901 + }, + { + "epoch": 2.503402462734932, + "grad_norm": 0.07281740754842758, + "learning_rate": 8.31450560331248e-05, + "loss": 0.2385, + "step": 30902 + }, + { + "epoch": 2.5034834737524303, + "grad_norm": 0.05427378788590431, + "learning_rate": 8.314055538053019e-05, + "loss": 0.2264, + "step": 30903 + }, + { + "epoch": 2.5035644847699285, + "grad_norm": 0.07130347937345505, + "learning_rate": 8.313605472793555e-05, + "loss": 0.2604, + "step": 30904 + }, + { + "epoch": 2.5036454957874272, + "grad_norm": 0.062126901000738144, + "learning_rate": 8.313155407534093e-05, + "loss": 0.2492, + "step": 30905 + }, + { + "epoch": 2.5037265068049255, + "grad_norm": 0.061227038502693176, + "learning_rate": 8.312705342274631e-05, + "loss": 0.2367, + "step": 30906 + }, + { + "epoch": 2.5038075178224237, + "grad_norm": 0.06333401799201965, + "learning_rate": 8.312255277015167e-05, + "loss": 0.2348, + "step": 30907 + }, + { + "epoch": 2.503888528839922, + "grad_norm": 0.05559328943490982, + "learning_rate": 8.311805211755705e-05, + "loss": 0.2244, + "step": 30908 + }, + { + "epoch": 2.5039695398574207, + "grad_norm": 0.055634625256061554, + "learning_rate": 8.311355146496243e-05, + "loss": 0.2136, + "step": 30909 + }, + { + "epoch": 2.504050550874919, + 
"grad_norm": 0.09396779537200928, + "learning_rate": 8.310905081236779e-05, + "loss": 0.2394, + "step": 30910 + }, + { + "epoch": 2.504131561892417, + "grad_norm": 0.06352783739566803, + "learning_rate": 8.310455015977318e-05, + "loss": 0.233, + "step": 30911 + }, + { + "epoch": 2.504212572909916, + "grad_norm": 0.06977412849664688, + "learning_rate": 8.310004950717855e-05, + "loss": 0.2355, + "step": 30912 + }, + { + "epoch": 2.504293583927414, + "grad_norm": 0.05852169543504715, + "learning_rate": 8.309554885458391e-05, + "loss": 0.242, + "step": 30913 + }, + { + "epoch": 2.5043745949449123, + "grad_norm": 0.0841292068362236, + "learning_rate": 8.30910482019893e-05, + "loss": 0.272, + "step": 30914 + }, + { + "epoch": 2.504455605962411, + "grad_norm": 0.07529615610837936, + "learning_rate": 8.308654754939467e-05, + "loss": 0.2463, + "step": 30915 + }, + { + "epoch": 2.5045366169799093, + "grad_norm": 0.06277451664209366, + "learning_rate": 8.308204689680003e-05, + "loss": 0.2405, + "step": 30916 + }, + { + "epoch": 2.5046176279974075, + "grad_norm": 0.06636457145214081, + "learning_rate": 8.307754624420542e-05, + "loss": 0.2437, + "step": 30917 + }, + { + "epoch": 2.504698639014906, + "grad_norm": 0.07447199523448944, + "learning_rate": 8.307304559161079e-05, + "loss": 0.2237, + "step": 30918 + }, + { + "epoch": 2.5047796500324044, + "grad_norm": 0.07748948782682419, + "learning_rate": 8.306854493901615e-05, + "loss": 0.2311, + "step": 30919 + }, + { + "epoch": 2.5048606610499027, + "grad_norm": 0.07205738872289658, + "learning_rate": 8.306404428642154e-05, + "loss": 0.2384, + "step": 30920 + }, + { + "epoch": 2.5049416720674014, + "grad_norm": 0.06956085562705994, + "learning_rate": 8.305954363382691e-05, + "loss": 0.2322, + "step": 30921 + }, + { + "epoch": 2.5050226830848996, + "grad_norm": 0.07650784403085709, + "learning_rate": 8.305504298123227e-05, + "loss": 0.2319, + "step": 30922 + }, + { + "epoch": 2.505103694102398, + "grad_norm": 0.06893320381641388, + "learning_rate": 8.305054232863766e-05, + "loss": 0.2731, + "step": 30923 + }, + { + "epoch": 2.5051847051198965, + "grad_norm": 0.1009536162018776, + "learning_rate": 8.304604167604303e-05, + "loss": 0.252, + "step": 30924 + }, + { + "epoch": 2.505265716137395, + "grad_norm": 0.07383900880813599, + "learning_rate": 8.304154102344839e-05, + "loss": 0.2708, + "step": 30925 + }, + { + "epoch": 2.505346727154893, + "grad_norm": 0.06586895883083344, + "learning_rate": 8.303704037085378e-05, + "loss": 0.2802, + "step": 30926 + }, + { + "epoch": 2.5054277381723913, + "grad_norm": 0.06558408588171005, + "learning_rate": 8.303253971825915e-05, + "loss": 0.2248, + "step": 30927 + }, + { + "epoch": 2.50550874918989, + "grad_norm": 0.059231966733932495, + "learning_rate": 8.302803906566451e-05, + "loss": 0.2249, + "step": 30928 + }, + { + "epoch": 2.505589760207388, + "grad_norm": 0.07415720075368881, + "learning_rate": 8.30235384130699e-05, + "loss": 0.2538, + "step": 30929 + }, + { + "epoch": 2.5056707712248865, + "grad_norm": 0.06834818422794342, + "learning_rate": 8.301903776047527e-05, + "loss": 0.2339, + "step": 30930 + }, + { + "epoch": 2.5057517822423847, + "grad_norm": 0.06618189066648483, + "learning_rate": 8.301453710788065e-05, + "loss": 0.2409, + "step": 30931 + }, + { + "epoch": 2.5058327932598834, + "grad_norm": 0.06760060787200928, + "learning_rate": 8.301003645528602e-05, + "loss": 0.2271, + "step": 30932 + }, + { + "epoch": 2.5059138042773816, + "grad_norm": 0.08346579223871231, + "learning_rate": 8.30055358026914e-05, + 
"loss": 0.2185, + "step": 30933 + }, + { + "epoch": 2.50599481529488, + "grad_norm": 0.059812482446432114, + "learning_rate": 8.300103515009677e-05, + "loss": 0.2391, + "step": 30934 + }, + { + "epoch": 2.5060758263123786, + "grad_norm": 0.1017281636595726, + "learning_rate": 8.299653449750214e-05, + "loss": 0.2935, + "step": 30935 + }, + { + "epoch": 2.506156837329877, + "grad_norm": 0.059621043503284454, + "learning_rate": 8.299203384490752e-05, + "loss": 0.2088, + "step": 30936 + }, + { + "epoch": 2.506237848347375, + "grad_norm": 0.0738162249326706, + "learning_rate": 8.298753319231289e-05, + "loss": 0.2646, + "step": 30937 + }, + { + "epoch": 2.5063188593648738, + "grad_norm": 0.07516893744468689, + "learning_rate": 8.298303253971826e-05, + "loss": 0.2495, + "step": 30938 + }, + { + "epoch": 2.506399870382372, + "grad_norm": 0.0615583173930645, + "learning_rate": 8.297853188712364e-05, + "loss": 0.2633, + "step": 30939 + }, + { + "epoch": 2.5064808813998702, + "grad_norm": 0.07958000898361206, + "learning_rate": 8.297403123452901e-05, + "loss": 0.261, + "step": 30940 + }, + { + "epoch": 2.506561892417369, + "grad_norm": 0.0821593627333641, + "learning_rate": 8.296953058193438e-05, + "loss": 0.2462, + "step": 30941 + }, + { + "epoch": 2.506642903434867, + "grad_norm": 0.0779246836900711, + "learning_rate": 8.296502992933976e-05, + "loss": 0.268, + "step": 30942 + }, + { + "epoch": 2.5067239144523654, + "grad_norm": 0.07051964849233627, + "learning_rate": 8.296052927674513e-05, + "loss": 0.2419, + "step": 30943 + }, + { + "epoch": 2.506804925469864, + "grad_norm": 0.06869851052761078, + "learning_rate": 8.29560286241505e-05, + "loss": 0.2333, + "step": 30944 + }, + { + "epoch": 2.5068859364873624, + "grad_norm": 0.06015220284461975, + "learning_rate": 8.295152797155588e-05, + "loss": 0.2614, + "step": 30945 + }, + { + "epoch": 2.5069669475048606, + "grad_norm": 0.07013320922851562, + "learning_rate": 8.294702731896125e-05, + "loss": 0.2796, + "step": 30946 + }, + { + "epoch": 2.5070479585223593, + "grad_norm": 0.07819762080907822, + "learning_rate": 8.294252666636663e-05, + "loss": 0.2289, + "step": 30947 + }, + { + "epoch": 2.5071289695398575, + "grad_norm": 0.0813426524400711, + "learning_rate": 8.2938026013772e-05, + "loss": 0.2402, + "step": 30948 + }, + { + "epoch": 2.5072099805573558, + "grad_norm": 0.055428456515073776, + "learning_rate": 8.293352536117737e-05, + "loss": 0.223, + "step": 30949 + }, + { + "epoch": 2.507290991574854, + "grad_norm": 0.06621409952640533, + "learning_rate": 8.292902470858275e-05, + "loss": 0.2153, + "step": 30950 + }, + { + "epoch": 2.5073720025923527, + "grad_norm": 0.06681319326162338, + "learning_rate": 8.292452405598812e-05, + "loss": 0.2816, + "step": 30951 + }, + { + "epoch": 2.507453013609851, + "grad_norm": 0.08249227702617645, + "learning_rate": 8.29200234033935e-05, + "loss": 0.2652, + "step": 30952 + }, + { + "epoch": 2.507534024627349, + "grad_norm": 0.06201348453760147, + "learning_rate": 8.291552275079887e-05, + "loss": 0.2267, + "step": 30953 + }, + { + "epoch": 2.5076150356448474, + "grad_norm": 0.07167745381593704, + "learning_rate": 8.291102209820424e-05, + "loss": 0.2452, + "step": 30954 + }, + { + "epoch": 2.507696046662346, + "grad_norm": 0.050998084247112274, + "learning_rate": 8.290652144560961e-05, + "loss": 0.2648, + "step": 30955 + }, + { + "epoch": 2.5077770576798444, + "grad_norm": 0.05879823490977287, + "learning_rate": 8.290202079301499e-05, + "loss": 0.1901, + "step": 30956 + }, + { + "epoch": 2.5078580686973426, + 
"grad_norm": 0.0664614737033844, + "learning_rate": 8.289752014042036e-05, + "loss": 0.2271, + "step": 30957 + }, + { + "epoch": 2.5079390797148413, + "grad_norm": 0.05864118039608002, + "learning_rate": 8.289301948782574e-05, + "loss": 0.2495, + "step": 30958 + }, + { + "epoch": 2.5080200907323396, + "grad_norm": 0.06172305345535278, + "learning_rate": 8.288851883523111e-05, + "loss": 0.2209, + "step": 30959 + }, + { + "epoch": 2.508101101749838, + "grad_norm": 0.07727573812007904, + "learning_rate": 8.28840181826365e-05, + "loss": 0.2551, + "step": 30960 + }, + { + "epoch": 2.5081821127673365, + "grad_norm": 0.05355662852525711, + "learning_rate": 8.287951753004186e-05, + "loss": 0.2044, + "step": 30961 + }, + { + "epoch": 2.5082631237848347, + "grad_norm": 0.06967973709106445, + "learning_rate": 8.287501687744723e-05, + "loss": 0.3012, + "step": 30962 + }, + { + "epoch": 2.508344134802333, + "grad_norm": 0.07389171421527863, + "learning_rate": 8.287051622485262e-05, + "loss": 0.2043, + "step": 30963 + }, + { + "epoch": 2.5084251458198317, + "grad_norm": 0.05707792565226555, + "learning_rate": 8.286601557225798e-05, + "loss": 0.2422, + "step": 30964 + }, + { + "epoch": 2.50850615683733, + "grad_norm": 0.06958410888910294, + "learning_rate": 8.286151491966335e-05, + "loss": 0.2354, + "step": 30965 + }, + { + "epoch": 2.508587167854828, + "grad_norm": 0.06012604758143425, + "learning_rate": 8.285701426706874e-05, + "loss": 0.198, + "step": 30966 + }, + { + "epoch": 2.508668178872327, + "grad_norm": 0.06455910205841064, + "learning_rate": 8.28525136144741e-05, + "loss": 0.242, + "step": 30967 + }, + { + "epoch": 2.508749189889825, + "grad_norm": 0.07406137138605118, + "learning_rate": 8.284801296187947e-05, + "loss": 0.2476, + "step": 30968 + }, + { + "epoch": 2.5088302009073233, + "grad_norm": 0.06192121282219887, + "learning_rate": 8.284351230928486e-05, + "loss": 0.2304, + "step": 30969 + }, + { + "epoch": 2.508911211924822, + "grad_norm": 0.0722472220659256, + "learning_rate": 8.283901165669022e-05, + "loss": 0.277, + "step": 30970 + }, + { + "epoch": 2.5089922229423203, + "grad_norm": 0.06029026210308075, + "learning_rate": 8.283451100409559e-05, + "loss": 0.2065, + "step": 30971 + }, + { + "epoch": 2.5090732339598185, + "grad_norm": 0.06601886451244354, + "learning_rate": 8.283001035150098e-05, + "loss": 0.2576, + "step": 30972 + }, + { + "epoch": 2.5091542449773168, + "grad_norm": 0.06260927021503448, + "learning_rate": 8.282550969890634e-05, + "loss": 0.2336, + "step": 30973 + }, + { + "epoch": 2.5092352559948155, + "grad_norm": 0.0685477927327156, + "learning_rate": 8.282100904631171e-05, + "loss": 0.2755, + "step": 30974 + }, + { + "epoch": 2.5093162670123137, + "grad_norm": 0.06632889062166214, + "learning_rate": 8.28165083937171e-05, + "loss": 0.2266, + "step": 30975 + }, + { + "epoch": 2.509397278029812, + "grad_norm": 0.06554222851991653, + "learning_rate": 8.281200774112246e-05, + "loss": 0.259, + "step": 30976 + }, + { + "epoch": 2.50947828904731, + "grad_norm": 0.08140553534030914, + "learning_rate": 8.280750708852783e-05, + "loss": 0.2513, + "step": 30977 + }, + { + "epoch": 2.509559300064809, + "grad_norm": 0.08227310329675674, + "learning_rate": 8.280300643593322e-05, + "loss": 0.2705, + "step": 30978 + }, + { + "epoch": 2.509640311082307, + "grad_norm": 0.06471066921949387, + "learning_rate": 8.279850578333858e-05, + "loss": 0.2253, + "step": 30979 + }, + { + "epoch": 2.5097213220998054, + "grad_norm": 0.06800235062837601, + "learning_rate": 8.279400513074395e-05, + 
"loss": 0.2571, + "step": 30980 + }, + { + "epoch": 2.509802333117304, + "grad_norm": 0.06845162063837051, + "learning_rate": 8.278950447814934e-05, + "loss": 0.26, + "step": 30981 + }, + { + "epoch": 2.5098833441348023, + "grad_norm": 0.06966293603181839, + "learning_rate": 8.27850038255547e-05, + "loss": 0.2661, + "step": 30982 + }, + { + "epoch": 2.5099643551523005, + "grad_norm": 0.06582781672477722, + "learning_rate": 8.278050317296008e-05, + "loss": 0.2724, + "step": 30983 + }, + { + "epoch": 2.5100453661697992, + "grad_norm": 0.049061696976423264, + "learning_rate": 8.277600252036546e-05, + "loss": 0.1902, + "step": 30984 + }, + { + "epoch": 2.5101263771872975, + "grad_norm": 0.06378032267093658, + "learning_rate": 8.277150186777082e-05, + "loss": 0.2258, + "step": 30985 + }, + { + "epoch": 2.5102073882047957, + "grad_norm": 0.07274179905653, + "learning_rate": 8.276700121517621e-05, + "loss": 0.2643, + "step": 30986 + }, + { + "epoch": 2.5102883992222944, + "grad_norm": 0.08791711926460266, + "learning_rate": 8.276250056258158e-05, + "loss": 0.2667, + "step": 30987 + }, + { + "epoch": 2.5103694102397927, + "grad_norm": 0.06939581036567688, + "learning_rate": 8.275799990998694e-05, + "loss": 0.2409, + "step": 30988 + }, + { + "epoch": 2.510450421257291, + "grad_norm": 0.06034819036722183, + "learning_rate": 8.275349925739233e-05, + "loss": 0.2484, + "step": 30989 + }, + { + "epoch": 2.5105314322747896, + "grad_norm": 0.06719058752059937, + "learning_rate": 8.27489986047977e-05, + "loss": 0.2362, + "step": 30990 + }, + { + "epoch": 2.510612443292288, + "grad_norm": 0.07439959049224854, + "learning_rate": 8.274449795220306e-05, + "loss": 0.2581, + "step": 30991 + }, + { + "epoch": 2.510693454309786, + "grad_norm": 0.09280843287706375, + "learning_rate": 8.273999729960845e-05, + "loss": 0.2783, + "step": 30992 + }, + { + "epoch": 2.5107744653272848, + "grad_norm": 0.07778385281562805, + "learning_rate": 8.273549664701382e-05, + "loss": 0.2226, + "step": 30993 + }, + { + "epoch": 2.510855476344783, + "grad_norm": 0.04843428358435631, + "learning_rate": 8.273099599441918e-05, + "loss": 0.256, + "step": 30994 + }, + { + "epoch": 2.5109364873622813, + "grad_norm": 0.07087177038192749, + "learning_rate": 8.272649534182457e-05, + "loss": 0.2884, + "step": 30995 + }, + { + "epoch": 2.5110174983797795, + "grad_norm": 0.07463837414979935, + "learning_rate": 8.272199468922995e-05, + "loss": 0.2664, + "step": 30996 + }, + { + "epoch": 2.511098509397278, + "grad_norm": 0.07141347974538803, + "learning_rate": 8.271749403663532e-05, + "loss": 0.26, + "step": 30997 + }, + { + "epoch": 2.5111795204147764, + "grad_norm": 0.06270520389080048, + "learning_rate": 8.271299338404069e-05, + "loss": 0.2217, + "step": 30998 + }, + { + "epoch": 2.5112605314322747, + "grad_norm": 0.06914729624986649, + "learning_rate": 8.270849273144607e-05, + "loss": 0.2532, + "step": 30999 + }, + { + "epoch": 2.511341542449773, + "grad_norm": 0.07662202417850494, + "learning_rate": 8.270399207885144e-05, + "loss": 0.259, + "step": 31000 + }, + { + "epoch": 2.5114225534672716, + "grad_norm": 0.05497424304485321, + "learning_rate": 8.269949142625681e-05, + "loss": 0.2289, + "step": 31001 + }, + { + "epoch": 2.51150356448477, + "grad_norm": 0.09671833366155624, + "learning_rate": 8.269499077366219e-05, + "loss": 0.3282, + "step": 31002 + }, + { + "epoch": 2.511584575502268, + "grad_norm": 0.06896227598190308, + "learning_rate": 8.269049012106756e-05, + "loss": 0.2577, + "step": 31003 + }, + { + "epoch": 2.511665586519767, + 
"grad_norm": 0.07261406630277634, + "learning_rate": 8.268598946847293e-05, + "loss": 0.2413, + "step": 31004 + }, + { + "epoch": 2.511746597537265, + "grad_norm": 0.06723308563232422, + "learning_rate": 8.268148881587831e-05, + "loss": 0.2717, + "step": 31005 + }, + { + "epoch": 2.5118276085547633, + "grad_norm": 0.07040799409151077, + "learning_rate": 8.267698816328368e-05, + "loss": 0.2719, + "step": 31006 + }, + { + "epoch": 2.511908619572262, + "grad_norm": 0.057618461549282074, + "learning_rate": 8.267248751068906e-05, + "loss": 0.2569, + "step": 31007 + }, + { + "epoch": 2.51198963058976, + "grad_norm": 0.061061207205057144, + "learning_rate": 8.266798685809443e-05, + "loss": 0.2659, + "step": 31008 + }, + { + "epoch": 2.5120706416072585, + "grad_norm": 0.06231067329645157, + "learning_rate": 8.26634862054998e-05, + "loss": 0.2425, + "step": 31009 + }, + { + "epoch": 2.512151652624757, + "grad_norm": 0.07808933407068253, + "learning_rate": 8.265898555290518e-05, + "loss": 0.2279, + "step": 31010 + }, + { + "epoch": 2.5122326636422554, + "grad_norm": 0.06471867859363556, + "learning_rate": 8.265448490031055e-05, + "loss": 0.238, + "step": 31011 + }, + { + "epoch": 2.5123136746597536, + "grad_norm": 0.0662679597735405, + "learning_rate": 8.264998424771592e-05, + "loss": 0.2606, + "step": 31012 + }, + { + "epoch": 2.5123946856772523, + "grad_norm": 0.07310327142477036, + "learning_rate": 8.26454835951213e-05, + "loss": 0.2359, + "step": 31013 + }, + { + "epoch": 2.5124756966947506, + "grad_norm": 0.06892668455839157, + "learning_rate": 8.264098294252667e-05, + "loss": 0.2511, + "step": 31014 + }, + { + "epoch": 2.512556707712249, + "grad_norm": 0.05978599190711975, + "learning_rate": 8.263648228993204e-05, + "loss": 0.2551, + "step": 31015 + }, + { + "epoch": 2.5126377187297475, + "grad_norm": 0.08446621149778366, + "learning_rate": 8.263198163733742e-05, + "loss": 0.2699, + "step": 31016 + }, + { + "epoch": 2.5127187297472457, + "grad_norm": 0.06874669343233109, + "learning_rate": 8.262748098474279e-05, + "loss": 0.2363, + "step": 31017 + }, + { + "epoch": 2.512799740764744, + "grad_norm": 0.08762843906879425, + "learning_rate": 8.262298033214816e-05, + "loss": 0.2743, + "step": 31018 + }, + { + "epoch": 2.5128807517822422, + "grad_norm": 0.07058103382587433, + "learning_rate": 8.261847967955354e-05, + "loss": 0.2776, + "step": 31019 + }, + { + "epoch": 2.512961762799741, + "grad_norm": 0.06566563248634338, + "learning_rate": 8.261397902695891e-05, + "loss": 0.2729, + "step": 31020 + }, + { + "epoch": 2.513042773817239, + "grad_norm": 0.06321921199560165, + "learning_rate": 8.260947837436429e-05, + "loss": 0.2211, + "step": 31021 + }, + { + "epoch": 2.5131237848347374, + "grad_norm": 0.08189821988344193, + "learning_rate": 8.260497772176966e-05, + "loss": 0.262, + "step": 31022 + }, + { + "epoch": 2.5132047958522357, + "grad_norm": 0.07887553423643112, + "learning_rate": 8.260047706917503e-05, + "loss": 0.2556, + "step": 31023 + }, + { + "epoch": 2.5132858068697344, + "grad_norm": 0.06533343344926834, + "learning_rate": 8.25959764165804e-05, + "loss": 0.2488, + "step": 31024 + }, + { + "epoch": 2.5133668178872326, + "grad_norm": 0.0702027752995491, + "learning_rate": 8.259147576398578e-05, + "loss": 0.2433, + "step": 31025 + }, + { + "epoch": 2.513447828904731, + "grad_norm": 0.05941007658839226, + "learning_rate": 8.258697511139115e-05, + "loss": 0.2066, + "step": 31026 + }, + { + "epoch": 2.5135288399222295, + "grad_norm": 0.06379848718643188, + "learning_rate": 
8.258247445879653e-05, + "loss": 0.2569, + "step": 31027 + }, + { + "epoch": 2.5136098509397278, + "grad_norm": 0.05726584047079086, + "learning_rate": 8.25779738062019e-05, + "loss": 0.2191, + "step": 31028 + }, + { + "epoch": 2.513690861957226, + "grad_norm": 0.05791817605495453, + "learning_rate": 8.257347315360727e-05, + "loss": 0.2129, + "step": 31029 + }, + { + "epoch": 2.5137718729747247, + "grad_norm": 0.0738334208726883, + "learning_rate": 8.256897250101265e-05, + "loss": 0.2542, + "step": 31030 + }, + { + "epoch": 2.513852883992223, + "grad_norm": 0.05998535081744194, + "learning_rate": 8.256447184841802e-05, + "loss": 0.2376, + "step": 31031 + }, + { + "epoch": 2.513933895009721, + "grad_norm": 0.07362499833106995, + "learning_rate": 8.25599711958234e-05, + "loss": 0.2627, + "step": 31032 + }, + { + "epoch": 2.51401490602722, + "grad_norm": 0.07652860134840012, + "learning_rate": 8.255547054322877e-05, + "loss": 0.2573, + "step": 31033 + }, + { + "epoch": 2.514095917044718, + "grad_norm": 0.07800864428281784, + "learning_rate": 8.255096989063414e-05, + "loss": 0.2786, + "step": 31034 + }, + { + "epoch": 2.5141769280622164, + "grad_norm": 0.05959049239754677, + "learning_rate": 8.254646923803952e-05, + "loss": 0.2756, + "step": 31035 + }, + { + "epoch": 2.514257939079715, + "grad_norm": 0.0774015411734581, + "learning_rate": 8.254196858544489e-05, + "loss": 0.2543, + "step": 31036 + }, + { + "epoch": 2.5143389500972133, + "grad_norm": 0.05201781168580055, + "learning_rate": 8.253746793285026e-05, + "loss": 0.2401, + "step": 31037 + }, + { + "epoch": 2.5144199611147116, + "grad_norm": 0.069788359105587, + "learning_rate": 8.253296728025565e-05, + "loss": 0.235, + "step": 31038 + }, + { + "epoch": 2.5145009721322102, + "grad_norm": 0.05659021437168121, + "learning_rate": 8.252846662766101e-05, + "loss": 0.2487, + "step": 31039 + }, + { + "epoch": 2.5145819831497085, + "grad_norm": 0.06626788526773453, + "learning_rate": 8.252396597506638e-05, + "loss": 0.2715, + "step": 31040 + }, + { + "epoch": 2.5146629941672067, + "grad_norm": 0.0640718936920166, + "learning_rate": 8.251946532247177e-05, + "loss": 0.2338, + "step": 31041 + }, + { + "epoch": 2.514744005184705, + "grad_norm": 0.06315992772579193, + "learning_rate": 8.251496466987713e-05, + "loss": 0.2602, + "step": 31042 + }, + { + "epoch": 2.5148250162022032, + "grad_norm": 0.07527586072683334, + "learning_rate": 8.25104640172825e-05, + "loss": 0.2437, + "step": 31043 + }, + { + "epoch": 2.514906027219702, + "grad_norm": 0.04863426089286804, + "learning_rate": 8.250596336468789e-05, + "loss": 0.2198, + "step": 31044 + }, + { + "epoch": 2.5149870382372, + "grad_norm": 0.06549611687660217, + "learning_rate": 8.250146271209325e-05, + "loss": 0.2217, + "step": 31045 + }, + { + "epoch": 2.5150680492546984, + "grad_norm": 0.07761254161596298, + "learning_rate": 8.249696205949863e-05, + "loss": 0.2536, + "step": 31046 + }, + { + "epoch": 2.515149060272197, + "grad_norm": 0.06986057013273239, + "learning_rate": 8.249246140690401e-05, + "loss": 0.2393, + "step": 31047 + }, + { + "epoch": 2.5152300712896953, + "grad_norm": 0.07064176350831985, + "learning_rate": 8.248796075430937e-05, + "loss": 0.2258, + "step": 31048 + }, + { + "epoch": 2.5153110823071936, + "grad_norm": 0.07709095627069473, + "learning_rate": 8.248346010171475e-05, + "loss": 0.28, + "step": 31049 + }, + { + "epoch": 2.5153920933246923, + "grad_norm": 0.07585789263248444, + "learning_rate": 8.247895944912013e-05, + "loss": 0.2327, + "step": 31050 + }, + { + "epoch": 
2.5154731043421905, + "grad_norm": 0.061996519565582275, + "learning_rate": 8.24744587965255e-05, + "loss": 0.2263, + "step": 31051 + }, + { + "epoch": 2.5155541153596888, + "grad_norm": 0.05735709145665169, + "learning_rate": 8.246995814393087e-05, + "loss": 0.2294, + "step": 31052 + }, + { + "epoch": 2.5156351263771874, + "grad_norm": 0.05866505205631256, + "learning_rate": 8.246545749133625e-05, + "loss": 0.2437, + "step": 31053 + }, + { + "epoch": 2.5157161373946857, + "grad_norm": 0.07577653229236603, + "learning_rate": 8.246095683874161e-05, + "loss": 0.2429, + "step": 31054 + }, + { + "epoch": 2.515797148412184, + "grad_norm": 0.06760087609291077, + "learning_rate": 8.245645618614699e-05, + "loss": 0.237, + "step": 31055 + }, + { + "epoch": 2.5158781594296826, + "grad_norm": 0.08195766806602478, + "learning_rate": 8.245195553355238e-05, + "loss": 0.2464, + "step": 31056 + }, + { + "epoch": 2.515959170447181, + "grad_norm": 0.07483825087547302, + "learning_rate": 8.244745488095774e-05, + "loss": 0.2503, + "step": 31057 + }, + { + "epoch": 2.516040181464679, + "grad_norm": 0.06859326362609863, + "learning_rate": 8.244295422836311e-05, + "loss": 0.2488, + "step": 31058 + }, + { + "epoch": 2.516121192482178, + "grad_norm": 0.056272827088832855, + "learning_rate": 8.24384535757685e-05, + "loss": 0.1995, + "step": 31059 + }, + { + "epoch": 2.516202203499676, + "grad_norm": 0.059765033423900604, + "learning_rate": 8.243395292317386e-05, + "loss": 0.2459, + "step": 31060 + }, + { + "epoch": 2.5162832145171743, + "grad_norm": 0.075907401740551, + "learning_rate": 8.242945227057923e-05, + "loss": 0.263, + "step": 31061 + }, + { + "epoch": 2.516364225534673, + "grad_norm": 0.06140841171145439, + "learning_rate": 8.242495161798462e-05, + "loss": 0.2322, + "step": 31062 + }, + { + "epoch": 2.5164452365521712, + "grad_norm": 0.07933636009693146, + "learning_rate": 8.242045096538998e-05, + "loss": 0.2132, + "step": 31063 + }, + { + "epoch": 2.5165262475696695, + "grad_norm": 0.08671210706233978, + "learning_rate": 8.241595031279536e-05, + "loss": 0.2591, + "step": 31064 + }, + { + "epoch": 2.5166072585871677, + "grad_norm": 0.06944069266319275, + "learning_rate": 8.241144966020074e-05, + "loss": 0.2335, + "step": 31065 + }, + { + "epoch": 2.516688269604666, + "grad_norm": 0.0640743300318718, + "learning_rate": 8.240694900760611e-05, + "loss": 0.272, + "step": 31066 + }, + { + "epoch": 2.5167692806221647, + "grad_norm": 0.05724559724330902, + "learning_rate": 8.240244835501148e-05, + "loss": 0.2613, + "step": 31067 + }, + { + "epoch": 2.516850291639663, + "grad_norm": 0.06746908277273178, + "learning_rate": 8.239794770241686e-05, + "loss": 0.2403, + "step": 31068 + }, + { + "epoch": 2.516931302657161, + "grad_norm": 0.0781325176358223, + "learning_rate": 8.239344704982223e-05, + "loss": 0.2326, + "step": 31069 + }, + { + "epoch": 2.51701231367466, + "grad_norm": 0.05279175937175751, + "learning_rate": 8.23889463972276e-05, + "loss": 0.2217, + "step": 31070 + }, + { + "epoch": 2.517093324692158, + "grad_norm": 0.06322243064641953, + "learning_rate": 8.238444574463298e-05, + "loss": 0.2494, + "step": 31071 + }, + { + "epoch": 2.5171743357096563, + "grad_norm": 0.0703403502702713, + "learning_rate": 8.237994509203835e-05, + "loss": 0.2432, + "step": 31072 + }, + { + "epoch": 2.517255346727155, + "grad_norm": 0.07747770845890045, + "learning_rate": 8.237544443944373e-05, + "loss": 0.2315, + "step": 31073 + }, + { + "epoch": 2.5173363577446533, + "grad_norm": 0.06204115226864815, + "learning_rate": 
8.23709437868491e-05, + "loss": 0.2381, + "step": 31074 + }, + { + "epoch": 2.5174173687621515, + "grad_norm": 0.06431187689304352, + "learning_rate": 8.236644313425447e-05, + "loss": 0.2485, + "step": 31075 + }, + { + "epoch": 2.51749837977965, + "grad_norm": 0.07739128917455673, + "learning_rate": 8.236194248165985e-05, + "loss": 0.1975, + "step": 31076 + }, + { + "epoch": 2.5175793907971484, + "grad_norm": 0.06386204063892365, + "learning_rate": 8.235744182906522e-05, + "loss": 0.2433, + "step": 31077 + }, + { + "epoch": 2.5176604018146467, + "grad_norm": 0.056900352239608765, + "learning_rate": 8.23529411764706e-05, + "loss": 0.2113, + "step": 31078 + }, + { + "epoch": 2.5177414128321454, + "grad_norm": 0.06782882660627365, + "learning_rate": 8.234844052387597e-05, + "loss": 0.2786, + "step": 31079 + }, + { + "epoch": 2.5178224238496436, + "grad_norm": 0.06768842786550522, + "learning_rate": 8.234393987128134e-05, + "loss": 0.2609, + "step": 31080 + }, + { + "epoch": 2.517903434867142, + "grad_norm": 0.05288982763886452, + "learning_rate": 8.233943921868672e-05, + "loss": 0.2281, + "step": 31081 + }, + { + "epoch": 2.5179844458846405, + "grad_norm": 0.07983200997114182, + "learning_rate": 8.233493856609209e-05, + "loss": 0.254, + "step": 31082 + }, + { + "epoch": 2.518065456902139, + "grad_norm": 0.054932545870542526, + "learning_rate": 8.233043791349746e-05, + "loss": 0.2325, + "step": 31083 + }, + { + "epoch": 2.518146467919637, + "grad_norm": 0.06276437640190125, + "learning_rate": 8.232593726090284e-05, + "loss": 0.2322, + "step": 31084 + }, + { + "epoch": 2.5182274789371357, + "grad_norm": 0.070334292948246, + "learning_rate": 8.232143660830821e-05, + "loss": 0.2695, + "step": 31085 + }, + { + "epoch": 2.518308489954634, + "grad_norm": 0.07306241244077682, + "learning_rate": 8.231693595571358e-05, + "loss": 0.2392, + "step": 31086 + }, + { + "epoch": 2.518389500972132, + "grad_norm": 0.06493571400642395, + "learning_rate": 8.231243530311896e-05, + "loss": 0.2222, + "step": 31087 + }, + { + "epoch": 2.5184705119896305, + "grad_norm": 0.07168751955032349, + "learning_rate": 8.230793465052433e-05, + "loss": 0.2696, + "step": 31088 + }, + { + "epoch": 2.5185515230071287, + "grad_norm": 0.06785114109516144, + "learning_rate": 8.23034339979297e-05, + "loss": 0.2299, + "step": 31089 + }, + { + "epoch": 2.5186325340246274, + "grad_norm": 0.07063904404640198, + "learning_rate": 8.229893334533508e-05, + "loss": 0.2868, + "step": 31090 + }, + { + "epoch": 2.5187135450421256, + "grad_norm": 0.06915218383073807, + "learning_rate": 8.229443269274045e-05, + "loss": 0.2387, + "step": 31091 + }, + { + "epoch": 2.518794556059624, + "grad_norm": 0.07183767110109329, + "learning_rate": 8.228993204014583e-05, + "loss": 0.2427, + "step": 31092 + }, + { + "epoch": 2.5188755670771226, + "grad_norm": 0.05309898406267166, + "learning_rate": 8.22854313875512e-05, + "loss": 0.2147, + "step": 31093 + }, + { + "epoch": 2.518956578094621, + "grad_norm": 0.06278495490550995, + "learning_rate": 8.228093073495657e-05, + "loss": 0.2275, + "step": 31094 + }, + { + "epoch": 2.519037589112119, + "grad_norm": 0.06439706683158875, + "learning_rate": 8.227643008236195e-05, + "loss": 0.2362, + "step": 31095 + }, + { + "epoch": 2.5191186001296177, + "grad_norm": 0.05649157240986824, + "learning_rate": 8.227192942976732e-05, + "loss": 0.1942, + "step": 31096 + }, + { + "epoch": 2.519199611147116, + "grad_norm": 0.07158772647380829, + "learning_rate": 8.226742877717269e-05, + "loss": 0.2734, + "step": 31097 + }, + { + 
"epoch": 2.5192806221646142, + "grad_norm": 0.06923554837703705, + "learning_rate": 8.226292812457807e-05, + "loss": 0.2339, + "step": 31098 + }, + { + "epoch": 2.519361633182113, + "grad_norm": 0.06337632983922958, + "learning_rate": 8.225842747198344e-05, + "loss": 0.2546, + "step": 31099 + }, + { + "epoch": 2.519442644199611, + "grad_norm": 0.06667693704366684, + "learning_rate": 8.225392681938881e-05, + "loss": 0.2225, + "step": 31100 + }, + { + "epoch": 2.5195236552171094, + "grad_norm": 0.06592801213264465, + "learning_rate": 8.224942616679419e-05, + "loss": 0.2211, + "step": 31101 + }, + { + "epoch": 2.519604666234608, + "grad_norm": 0.07447879761457443, + "learning_rate": 8.224492551419956e-05, + "loss": 0.294, + "step": 31102 + }, + { + "epoch": 2.5196856772521063, + "grad_norm": 0.06477392464876175, + "learning_rate": 8.224042486160493e-05, + "loss": 0.2443, + "step": 31103 + }, + { + "epoch": 2.5197666882696046, + "grad_norm": 0.06004251912236214, + "learning_rate": 8.223592420901031e-05, + "loss": 0.1946, + "step": 31104 + }, + { + "epoch": 2.5198476992871033, + "grad_norm": 0.06646063923835754, + "learning_rate": 8.223142355641568e-05, + "loss": 0.2496, + "step": 31105 + }, + { + "epoch": 2.5199287103046015, + "grad_norm": 0.06413585692644119, + "learning_rate": 8.222692290382106e-05, + "loss": 0.2262, + "step": 31106 + }, + { + "epoch": 2.5200097213220998, + "grad_norm": 0.0555780753493309, + "learning_rate": 8.222242225122643e-05, + "loss": 0.2569, + "step": 31107 + }, + { + "epoch": 2.5200907323395985, + "grad_norm": 0.07252832502126694, + "learning_rate": 8.22179215986318e-05, + "loss": 0.2627, + "step": 31108 + }, + { + "epoch": 2.5201717433570967, + "grad_norm": 0.061369750648736954, + "learning_rate": 8.221342094603718e-05, + "loss": 0.219, + "step": 31109 + }, + { + "epoch": 2.520252754374595, + "grad_norm": 0.07031913101673126, + "learning_rate": 8.220892029344255e-05, + "loss": 0.2617, + "step": 31110 + }, + { + "epoch": 2.520333765392093, + "grad_norm": 0.06914175301790237, + "learning_rate": 8.220441964084792e-05, + "loss": 0.2448, + "step": 31111 + }, + { + "epoch": 2.5204147764095914, + "grad_norm": 0.07155036926269531, + "learning_rate": 8.21999189882533e-05, + "loss": 0.2427, + "step": 31112 + }, + { + "epoch": 2.52049578742709, + "grad_norm": 0.06575379520654678, + "learning_rate": 8.219541833565867e-05, + "loss": 0.2573, + "step": 31113 + }, + { + "epoch": 2.5205767984445884, + "grad_norm": 0.06591765582561493, + "learning_rate": 8.219091768306404e-05, + "loss": 0.2327, + "step": 31114 + }, + { + "epoch": 2.5206578094620866, + "grad_norm": 0.08451759070158005, + "learning_rate": 8.218641703046942e-05, + "loss": 0.2686, + "step": 31115 + }, + { + "epoch": 2.5207388204795853, + "grad_norm": 0.058037735521793365, + "learning_rate": 8.218191637787479e-05, + "loss": 0.2307, + "step": 31116 + }, + { + "epoch": 2.5208198314970836, + "grad_norm": 0.07824068516492844, + "learning_rate": 8.217741572528017e-05, + "loss": 0.2556, + "step": 31117 + }, + { + "epoch": 2.520900842514582, + "grad_norm": 0.06366552412509918, + "learning_rate": 8.217291507268554e-05, + "loss": 0.2442, + "step": 31118 + }, + { + "epoch": 2.5209818535320805, + "grad_norm": 0.068834587931633, + "learning_rate": 8.216841442009093e-05, + "loss": 0.2743, + "step": 31119 + }, + { + "epoch": 2.5210628645495787, + "grad_norm": 0.07074061036109924, + "learning_rate": 8.216391376749629e-05, + "loss": 0.216, + "step": 31120 + }, + { + "epoch": 2.521143875567077, + "grad_norm": 0.048492182046175, + 
"learning_rate": 8.215941311490166e-05, + "loss": 0.2349, + "step": 31121 + }, + { + "epoch": 2.5212248865845757, + "grad_norm": 0.0563458614051342, + "learning_rate": 8.215491246230705e-05, + "loss": 0.226, + "step": 31122 + }, + { + "epoch": 2.521305897602074, + "grad_norm": 0.054789893329143524, + "learning_rate": 8.21504118097124e-05, + "loss": 0.2251, + "step": 31123 + }, + { + "epoch": 2.521386908619572, + "grad_norm": 0.0652405321598053, + "learning_rate": 8.214591115711778e-05, + "loss": 0.2357, + "step": 31124 + }, + { + "epoch": 2.521467919637071, + "grad_norm": 0.061516888439655304, + "learning_rate": 8.214141050452317e-05, + "loss": 0.2426, + "step": 31125 + }, + { + "epoch": 2.521548930654569, + "grad_norm": 0.07577957212924957, + "learning_rate": 8.213690985192853e-05, + "loss": 0.2577, + "step": 31126 + }, + { + "epoch": 2.5216299416720673, + "grad_norm": 0.07500039786100388, + "learning_rate": 8.21324091993339e-05, + "loss": 0.2392, + "step": 31127 + }, + { + "epoch": 2.521710952689566, + "grad_norm": 0.07097041606903076, + "learning_rate": 8.212790854673929e-05, + "loss": 0.244, + "step": 31128 + }, + { + "epoch": 2.5217919637070643, + "grad_norm": 0.06939812004566193, + "learning_rate": 8.212340789414465e-05, + "loss": 0.2284, + "step": 31129 + }, + { + "epoch": 2.5218729747245625, + "grad_norm": 0.05452917516231537, + "learning_rate": 8.211890724155002e-05, + "loss": 0.2235, + "step": 31130 + }, + { + "epoch": 2.5219539857420608, + "grad_norm": 0.062049634754657745, + "learning_rate": 8.211440658895541e-05, + "loss": 0.258, + "step": 31131 + }, + { + "epoch": 2.5220349967595594, + "grad_norm": 0.0639457032084465, + "learning_rate": 8.210990593636077e-05, + "loss": 0.2656, + "step": 31132 + }, + { + "epoch": 2.5221160077770577, + "grad_norm": 0.05421054735779762, + "learning_rate": 8.210540528376614e-05, + "loss": 0.2243, + "step": 31133 + }, + { + "epoch": 2.522197018794556, + "grad_norm": 0.07816123962402344, + "learning_rate": 8.210090463117153e-05, + "loss": 0.251, + "step": 31134 + }, + { + "epoch": 2.522278029812054, + "grad_norm": 0.06340120732784271, + "learning_rate": 8.20964039785769e-05, + "loss": 0.2402, + "step": 31135 + }, + { + "epoch": 2.522359040829553, + "grad_norm": 0.0676545798778534, + "learning_rate": 8.209190332598226e-05, + "loss": 0.241, + "step": 31136 + }, + { + "epoch": 2.522440051847051, + "grad_norm": 0.05805578827857971, + "learning_rate": 8.208740267338765e-05, + "loss": 0.2214, + "step": 31137 + }, + { + "epoch": 2.5225210628645494, + "grad_norm": 0.06669300049543381, + "learning_rate": 8.208290202079302e-05, + "loss": 0.2335, + "step": 31138 + }, + { + "epoch": 2.522602073882048, + "grad_norm": 0.06161652132868767, + "learning_rate": 8.207840136819838e-05, + "loss": 0.2354, + "step": 31139 + }, + { + "epoch": 2.5226830848995463, + "grad_norm": 0.07172819972038269, + "learning_rate": 8.207390071560377e-05, + "loss": 0.2715, + "step": 31140 + }, + { + "epoch": 2.5227640959170445, + "grad_norm": 0.051006119698286057, + "learning_rate": 8.206940006300915e-05, + "loss": 0.195, + "step": 31141 + }, + { + "epoch": 2.5228451069345432, + "grad_norm": 0.06547657400369644, + "learning_rate": 8.20648994104145e-05, + "loss": 0.2316, + "step": 31142 + }, + { + "epoch": 2.5229261179520415, + "grad_norm": 0.06320697069168091, + "learning_rate": 8.206039875781989e-05, + "loss": 0.2534, + "step": 31143 + }, + { + "epoch": 2.5230071289695397, + "grad_norm": 0.06975486874580383, + "learning_rate": 8.205589810522527e-05, + "loss": 0.2401, + "step": 31144 + }, 
+ { + "epoch": 2.5230881399870384, + "grad_norm": 0.06577564030885696, + "learning_rate": 8.205139745263064e-05, + "loss": 0.2054, + "step": 31145 + }, + { + "epoch": 2.5231691510045366, + "grad_norm": 0.08921880275011063, + "learning_rate": 8.204689680003601e-05, + "loss": 0.2514, + "step": 31146 + }, + { + "epoch": 2.523250162022035, + "grad_norm": 0.07652483880519867, + "learning_rate": 8.204239614744139e-05, + "loss": 0.2526, + "step": 31147 + }, + { + "epoch": 2.5233311730395336, + "grad_norm": 0.05995349958539009, + "learning_rate": 8.203789549484676e-05, + "loss": 0.244, + "step": 31148 + }, + { + "epoch": 2.523412184057032, + "grad_norm": 0.06425217539072037, + "learning_rate": 8.203339484225213e-05, + "loss": 0.2284, + "step": 31149 + }, + { + "epoch": 2.52349319507453, + "grad_norm": 0.07397456467151642, + "learning_rate": 8.202889418965751e-05, + "loss": 0.268, + "step": 31150 + }, + { + "epoch": 2.5235742060920288, + "grad_norm": 0.06542269885540009, + "learning_rate": 8.202439353706288e-05, + "loss": 0.2364, + "step": 31151 + }, + { + "epoch": 2.523655217109527, + "grad_norm": 0.05775555968284607, + "learning_rate": 8.201989288446825e-05, + "loss": 0.2386, + "step": 31152 + }, + { + "epoch": 2.5237362281270252, + "grad_norm": 0.06939782202243805, + "learning_rate": 8.201539223187363e-05, + "loss": 0.2728, + "step": 31153 + }, + { + "epoch": 2.5238172391445235, + "grad_norm": 0.07716054469347, + "learning_rate": 8.2010891579279e-05, + "loss": 0.2684, + "step": 31154 + }, + { + "epoch": 2.523898250162022, + "grad_norm": 0.049509044736623764, + "learning_rate": 8.200639092668438e-05, + "loss": 0.2559, + "step": 31155 + }, + { + "epoch": 2.5239792611795204, + "grad_norm": 0.08856166899204254, + "learning_rate": 8.200189027408975e-05, + "loss": 0.2449, + "step": 31156 + }, + { + "epoch": 2.5240602721970187, + "grad_norm": 0.06791945546865463, + "learning_rate": 8.199738962149512e-05, + "loss": 0.2783, + "step": 31157 + }, + { + "epoch": 2.524141283214517, + "grad_norm": 0.06625067442655563, + "learning_rate": 8.19928889689005e-05, + "loss": 0.2368, + "step": 31158 + }, + { + "epoch": 2.5242222942320156, + "grad_norm": 0.051140010356903076, + "learning_rate": 8.198838831630587e-05, + "loss": 0.2421, + "step": 31159 + }, + { + "epoch": 2.524303305249514, + "grad_norm": 0.06882493942975998, + "learning_rate": 8.198388766371124e-05, + "loss": 0.2504, + "step": 31160 + }, + { + "epoch": 2.524384316267012, + "grad_norm": 0.08425910770893097, + "learning_rate": 8.197938701111662e-05, + "loss": 0.242, + "step": 31161 + }, + { + "epoch": 2.524465327284511, + "grad_norm": 0.07307098060846329, + "learning_rate": 8.197488635852199e-05, + "loss": 0.2875, + "step": 31162 + }, + { + "epoch": 2.524546338302009, + "grad_norm": 0.06732004880905151, + "learning_rate": 8.197038570592736e-05, + "loss": 0.2407, + "step": 31163 + }, + { + "epoch": 2.5246273493195073, + "grad_norm": 0.05499308556318283, + "learning_rate": 8.196588505333274e-05, + "loss": 0.214, + "step": 31164 + }, + { + "epoch": 2.524708360337006, + "grad_norm": 0.07563404738903046, + "learning_rate": 8.196138440073811e-05, + "loss": 0.2255, + "step": 31165 + }, + { + "epoch": 2.524789371354504, + "grad_norm": 0.07380463182926178, + "learning_rate": 8.195688374814349e-05, + "loss": 0.2911, + "step": 31166 + }, + { + "epoch": 2.5248703823720025, + "grad_norm": 0.07666265964508057, + "learning_rate": 8.195238309554886e-05, + "loss": 0.218, + "step": 31167 + }, + { + "epoch": 2.524951393389501, + "grad_norm": 0.0652792826294899, + 
"learning_rate": 8.194788244295423e-05, + "loss": 0.2203, + "step": 31168 + }, + { + "epoch": 2.5250324044069994, + "grad_norm": 0.07028517872095108, + "learning_rate": 8.19433817903596e-05, + "loss": 0.2337, + "step": 31169 + }, + { + "epoch": 2.5251134154244976, + "grad_norm": 0.07915160804986954, + "learning_rate": 8.193888113776498e-05, + "loss": 0.2431, + "step": 31170 + }, + { + "epoch": 2.5251944264419963, + "grad_norm": 0.0773158147931099, + "learning_rate": 8.193438048517035e-05, + "loss": 0.2556, + "step": 31171 + }, + { + "epoch": 2.5252754374594946, + "grad_norm": 0.0707051157951355, + "learning_rate": 8.192987983257573e-05, + "loss": 0.2387, + "step": 31172 + }, + { + "epoch": 2.525356448476993, + "grad_norm": 0.09297461807727814, + "learning_rate": 8.19253791799811e-05, + "loss": 0.2542, + "step": 31173 + }, + { + "epoch": 2.5254374594944915, + "grad_norm": 0.07315745204687119, + "learning_rate": 8.192087852738647e-05, + "loss": 0.2251, + "step": 31174 + }, + { + "epoch": 2.5255184705119897, + "grad_norm": 0.0702429711818695, + "learning_rate": 8.191637787479185e-05, + "loss": 0.2634, + "step": 31175 + }, + { + "epoch": 2.525599481529488, + "grad_norm": 0.06600876897573471, + "learning_rate": 8.191187722219722e-05, + "loss": 0.2157, + "step": 31176 + }, + { + "epoch": 2.5256804925469862, + "grad_norm": 0.06941699236631393, + "learning_rate": 8.19073765696026e-05, + "loss": 0.2759, + "step": 31177 + }, + { + "epoch": 2.525761503564485, + "grad_norm": 0.08072786033153534, + "learning_rate": 8.190287591700797e-05, + "loss": 0.2418, + "step": 31178 + }, + { + "epoch": 2.525842514581983, + "grad_norm": 0.0683659017086029, + "learning_rate": 8.189837526441334e-05, + "loss": 0.2526, + "step": 31179 + }, + { + "epoch": 2.5259235255994814, + "grad_norm": 0.06554919481277466, + "learning_rate": 8.189387461181872e-05, + "loss": 0.2554, + "step": 31180 + }, + { + "epoch": 2.5260045366169797, + "grad_norm": 0.07267916202545166, + "learning_rate": 8.188937395922409e-05, + "loss": 0.2287, + "step": 31181 + }, + { + "epoch": 2.5260855476344783, + "grad_norm": 0.06501460075378418, + "learning_rate": 8.188487330662946e-05, + "loss": 0.2373, + "step": 31182 + }, + { + "epoch": 2.5261665586519766, + "grad_norm": 0.06084512174129486, + "learning_rate": 8.188037265403484e-05, + "loss": 0.2371, + "step": 31183 + }, + { + "epoch": 2.526247569669475, + "grad_norm": 0.06529213488101959, + "learning_rate": 8.187587200144021e-05, + "loss": 0.2729, + "step": 31184 + }, + { + "epoch": 2.5263285806869735, + "grad_norm": 0.07154843956232071, + "learning_rate": 8.187137134884558e-05, + "loss": 0.2429, + "step": 31185 + }, + { + "epoch": 2.5264095917044718, + "grad_norm": 0.07982178032398224, + "learning_rate": 8.186687069625096e-05, + "loss": 0.2964, + "step": 31186 + }, + { + "epoch": 2.52649060272197, + "grad_norm": 0.058192428201436996, + "learning_rate": 8.186237004365633e-05, + "loss": 0.2279, + "step": 31187 + }, + { + "epoch": 2.5265716137394687, + "grad_norm": 0.07291363179683685, + "learning_rate": 8.18578693910617e-05, + "loss": 0.2713, + "step": 31188 + }, + { + "epoch": 2.526652624756967, + "grad_norm": 0.062472179532051086, + "learning_rate": 8.185336873846708e-05, + "loss": 0.2577, + "step": 31189 + }, + { + "epoch": 2.526733635774465, + "grad_norm": 0.0711505115032196, + "learning_rate": 8.184886808587245e-05, + "loss": 0.2482, + "step": 31190 + }, + { + "epoch": 2.526814646791964, + "grad_norm": 0.07676628232002258, + "learning_rate": 8.184436743327783e-05, + "loss": 0.2605, + "step": 31191 + 
}, + { + "epoch": 2.526895657809462, + "grad_norm": 0.08312243968248367, + "learning_rate": 8.18398667806832e-05, + "loss": 0.2551, + "step": 31192 + }, + { + "epoch": 2.5269766688269604, + "grad_norm": 0.08073175698518753, + "learning_rate": 8.183536612808857e-05, + "loss": 0.2822, + "step": 31193 + }, + { + "epoch": 2.527057679844459, + "grad_norm": 0.06505302339792252, + "learning_rate": 8.183086547549395e-05, + "loss": 0.2396, + "step": 31194 + }, + { + "epoch": 2.5271386908619573, + "grad_norm": 0.06782861053943634, + "learning_rate": 8.182636482289932e-05, + "loss": 0.2482, + "step": 31195 + }, + { + "epoch": 2.5272197018794555, + "grad_norm": 0.06377576291561127, + "learning_rate": 8.18218641703047e-05, + "loss": 0.2243, + "step": 31196 + }, + { + "epoch": 2.5273007128969542, + "grad_norm": 0.07454057037830353, + "learning_rate": 8.181736351771008e-05, + "loss": 0.2649, + "step": 31197 + }, + { + "epoch": 2.5273817239144525, + "grad_norm": 0.0724162757396698, + "learning_rate": 8.181286286511544e-05, + "loss": 0.2363, + "step": 31198 + }, + { + "epoch": 2.5274627349319507, + "grad_norm": 0.07536423951387405, + "learning_rate": 8.180836221252081e-05, + "loss": 0.2535, + "step": 31199 + }, + { + "epoch": 2.527543745949449, + "grad_norm": 0.071848563849926, + "learning_rate": 8.18038615599262e-05, + "loss": 0.2259, + "step": 31200 + }, + { + "epoch": 2.5276247569669477, + "grad_norm": 0.05115946754813194, + "learning_rate": 8.179936090733156e-05, + "loss": 0.2048, + "step": 31201 + }, + { + "epoch": 2.527705767984446, + "grad_norm": 0.07633697241544724, + "learning_rate": 8.179486025473693e-05, + "loss": 0.2453, + "step": 31202 + }, + { + "epoch": 2.527786779001944, + "grad_norm": 0.07274796813726425, + "learning_rate": 8.179035960214232e-05, + "loss": 0.2779, + "step": 31203 + }, + { + "epoch": 2.5278677900194424, + "grad_norm": 0.06171036511659622, + "learning_rate": 8.17858589495477e-05, + "loss": 0.2316, + "step": 31204 + }, + { + "epoch": 2.527948801036941, + "grad_norm": 0.06370175629854202, + "learning_rate": 8.178135829695306e-05, + "loss": 0.229, + "step": 31205 + }, + { + "epoch": 2.5280298120544393, + "grad_norm": 0.06523820757865906, + "learning_rate": 8.177685764435844e-05, + "loss": 0.2706, + "step": 31206 + }, + { + "epoch": 2.5281108230719376, + "grad_norm": 0.06415421515703201, + "learning_rate": 8.177235699176382e-05, + "loss": 0.2251, + "step": 31207 + }, + { + "epoch": 2.5281918340894363, + "grad_norm": 0.06288018077611923, + "learning_rate": 8.176785633916918e-05, + "loss": 0.2352, + "step": 31208 + }, + { + "epoch": 2.5282728451069345, + "grad_norm": 0.0697561576962471, + "learning_rate": 8.176335568657456e-05, + "loss": 0.2235, + "step": 31209 + }, + { + "epoch": 2.5283538561244328, + "grad_norm": 0.05984310805797577, + "learning_rate": 8.175885503397994e-05, + "loss": 0.3137, + "step": 31210 + }, + { + "epoch": 2.5284348671419314, + "grad_norm": 0.06374064087867737, + "learning_rate": 8.17543543813853e-05, + "loss": 0.2261, + "step": 31211 + }, + { + "epoch": 2.5285158781594297, + "grad_norm": 0.07622829079627991, + "learning_rate": 8.174985372879068e-05, + "loss": 0.2648, + "step": 31212 + }, + { + "epoch": 2.528596889176928, + "grad_norm": 0.061640314757823944, + "learning_rate": 8.174535307619606e-05, + "loss": 0.231, + "step": 31213 + }, + { + "epoch": 2.5286779001944266, + "grad_norm": 0.048052046447992325, + "learning_rate": 8.174085242360142e-05, + "loss": 0.2346, + "step": 31214 + }, + { + "epoch": 2.528758911211925, + "grad_norm": 0.06793522834777832, + 
"learning_rate": 8.17363517710068e-05, + "loss": 0.2208, + "step": 31215 + }, + { + "epoch": 2.528839922229423, + "grad_norm": 0.07524669170379639, + "learning_rate": 8.173185111841218e-05, + "loss": 0.2506, + "step": 31216 + }, + { + "epoch": 2.528920933246922, + "grad_norm": 0.06395810842514038, + "learning_rate": 8.172735046581754e-05, + "loss": 0.2392, + "step": 31217 + }, + { + "epoch": 2.52900194426442, + "grad_norm": 0.066915363073349, + "learning_rate": 8.172284981322293e-05, + "loss": 0.2288, + "step": 31218 + }, + { + "epoch": 2.5290829552819183, + "grad_norm": 0.07390469312667847, + "learning_rate": 8.17183491606283e-05, + "loss": 0.2657, + "step": 31219 + }, + { + "epoch": 2.529163966299417, + "grad_norm": 0.06230680271983147, + "learning_rate": 8.171384850803366e-05, + "loss": 0.2053, + "step": 31220 + }, + { + "epoch": 2.529244977316915, + "grad_norm": 0.07825953513383865, + "learning_rate": 8.170934785543905e-05, + "loss": 0.2406, + "step": 31221 + }, + { + "epoch": 2.5293259883344135, + "grad_norm": 0.07075408846139908, + "learning_rate": 8.170484720284442e-05, + "loss": 0.2395, + "step": 31222 + }, + { + "epoch": 2.5294069993519117, + "grad_norm": 0.08722478896379471, + "learning_rate": 8.17003465502498e-05, + "loss": 0.2744, + "step": 31223 + }, + { + "epoch": 2.5294880103694104, + "grad_norm": 0.07645217329263687, + "learning_rate": 8.169584589765517e-05, + "loss": 0.2373, + "step": 31224 + }, + { + "epoch": 2.5295690213869086, + "grad_norm": 0.057986825704574585, + "learning_rate": 8.169134524506054e-05, + "loss": 0.2142, + "step": 31225 + }, + { + "epoch": 2.529650032404407, + "grad_norm": 0.07386277616024017, + "learning_rate": 8.168684459246591e-05, + "loss": 0.2463, + "step": 31226 + }, + { + "epoch": 2.529731043421905, + "grad_norm": 0.073249451816082, + "learning_rate": 8.168234393987129e-05, + "loss": 0.2204, + "step": 31227 + }, + { + "epoch": 2.529812054439404, + "grad_norm": 0.06574179977178574, + "learning_rate": 8.167784328727666e-05, + "loss": 0.2398, + "step": 31228 + }, + { + "epoch": 2.529893065456902, + "grad_norm": 0.06217941641807556, + "learning_rate": 8.167334263468204e-05, + "loss": 0.2265, + "step": 31229 + }, + { + "epoch": 2.5299740764744003, + "grad_norm": 0.07396439462900162, + "learning_rate": 8.166884198208741e-05, + "loss": 0.2724, + "step": 31230 + }, + { + "epoch": 2.530055087491899, + "grad_norm": 0.06049538031220436, + "learning_rate": 8.166434132949278e-05, + "loss": 0.2595, + "step": 31231 + }, + { + "epoch": 2.5301360985093972, + "grad_norm": 0.06809945404529572, + "learning_rate": 8.165984067689816e-05, + "loss": 0.2765, + "step": 31232 + }, + { + "epoch": 2.5302171095268955, + "grad_norm": 0.06943079829216003, + "learning_rate": 8.165534002430353e-05, + "loss": 0.2106, + "step": 31233 + }, + { + "epoch": 2.530298120544394, + "grad_norm": 0.07760677486658096, + "learning_rate": 8.16508393717089e-05, + "loss": 0.2546, + "step": 31234 + }, + { + "epoch": 2.5303791315618924, + "grad_norm": 0.0711454451084137, + "learning_rate": 8.164633871911428e-05, + "loss": 0.2497, + "step": 31235 + }, + { + "epoch": 2.5304601425793907, + "grad_norm": 0.06712127476930618, + "learning_rate": 8.164183806651965e-05, + "loss": 0.2484, + "step": 31236 + }, + { + "epoch": 2.5305411535968894, + "grad_norm": 0.07480525225400925, + "learning_rate": 8.163733741392502e-05, + "loss": 0.2369, + "step": 31237 + }, + { + "epoch": 2.5306221646143876, + "grad_norm": 0.06365400552749634, + "learning_rate": 8.16328367613304e-05, + "loss": 0.2914, + "step": 31238 + }, 
+ { + "epoch": 2.530703175631886, + "grad_norm": 0.05150923877954483, + "learning_rate": 8.162833610873577e-05, + "loss": 0.2473, + "step": 31239 + }, + { + "epoch": 2.5307841866493845, + "grad_norm": 0.06193334981799126, + "learning_rate": 8.162383545614115e-05, + "loss": 0.225, + "step": 31240 + }, + { + "epoch": 2.530865197666883, + "grad_norm": 0.07247815281152725, + "learning_rate": 8.161933480354652e-05, + "loss": 0.2574, + "step": 31241 + }, + { + "epoch": 2.530946208684381, + "grad_norm": 0.06654870510101318, + "learning_rate": 8.161483415095189e-05, + "loss": 0.2669, + "step": 31242 + }, + { + "epoch": 2.5310272197018797, + "grad_norm": 0.05943829193711281, + "learning_rate": 8.161033349835727e-05, + "loss": 0.2294, + "step": 31243 + }, + { + "epoch": 2.531108230719378, + "grad_norm": 0.07509196549654007, + "learning_rate": 8.160583284576264e-05, + "loss": 0.253, + "step": 31244 + }, + { + "epoch": 2.531189241736876, + "grad_norm": 0.0631127879023552, + "learning_rate": 8.160133219316801e-05, + "loss": 0.223, + "step": 31245 + }, + { + "epoch": 2.5312702527543745, + "grad_norm": 0.06481597572565079, + "learning_rate": 8.159683154057339e-05, + "loss": 0.2604, + "step": 31246 + }, + { + "epoch": 2.531351263771873, + "grad_norm": 0.058724142611026764, + "learning_rate": 8.159233088797876e-05, + "loss": 0.2142, + "step": 31247 + }, + { + "epoch": 2.5314322747893714, + "grad_norm": 0.05988123267889023, + "learning_rate": 8.158783023538413e-05, + "loss": 0.232, + "step": 31248 + }, + { + "epoch": 2.5315132858068696, + "grad_norm": 0.066890649497509, + "learning_rate": 8.158332958278951e-05, + "loss": 0.2558, + "step": 31249 + }, + { + "epoch": 2.531594296824368, + "grad_norm": 0.06956793367862701, + "learning_rate": 8.157882893019488e-05, + "loss": 0.2574, + "step": 31250 + }, + { + "epoch": 2.5316753078418666, + "grad_norm": 0.06504230201244354, + "learning_rate": 8.157432827760026e-05, + "loss": 0.2539, + "step": 31251 + }, + { + "epoch": 2.531756318859365, + "grad_norm": 0.07182765752077103, + "learning_rate": 8.156982762500563e-05, + "loss": 0.2228, + "step": 31252 + }, + { + "epoch": 2.531837329876863, + "grad_norm": 0.06693996489048004, + "learning_rate": 8.1565326972411e-05, + "loss": 0.2399, + "step": 31253 + }, + { + "epoch": 2.5319183408943617, + "grad_norm": 0.05704502761363983, + "learning_rate": 8.156082631981638e-05, + "loss": 0.2004, + "step": 31254 + }, + { + "epoch": 2.53199935191186, + "grad_norm": 0.08396997302770615, + "learning_rate": 8.155632566722175e-05, + "loss": 0.2521, + "step": 31255 + }, + { + "epoch": 2.5320803629293582, + "grad_norm": 0.0741053894162178, + "learning_rate": 8.155182501462712e-05, + "loss": 0.2688, + "step": 31256 + }, + { + "epoch": 2.532161373946857, + "grad_norm": 0.06670941412448883, + "learning_rate": 8.15473243620325e-05, + "loss": 0.237, + "step": 31257 + }, + { + "epoch": 2.532242384964355, + "grad_norm": 0.06339386105537415, + "learning_rate": 8.154282370943787e-05, + "loss": 0.2096, + "step": 31258 + }, + { + "epoch": 2.5323233959818534, + "grad_norm": 0.06393574923276901, + "learning_rate": 8.153832305684324e-05, + "loss": 0.2722, + "step": 31259 + }, + { + "epoch": 2.532404406999352, + "grad_norm": 0.05412076786160469, + "learning_rate": 8.153382240424862e-05, + "loss": 0.2805, + "step": 31260 + }, + { + "epoch": 2.5324854180168503, + "grad_norm": 0.06732047349214554, + "learning_rate": 8.152932175165399e-05, + "loss": 0.2738, + "step": 31261 + }, + { + "epoch": 2.5325664290343486, + "grad_norm": 0.05094984546303749, + 
"learning_rate": 8.152482109905936e-05, + "loss": 0.2153, + "step": 31262 + }, + { + "epoch": 2.5326474400518473, + "grad_norm": 0.07786114513874054, + "learning_rate": 8.152032044646474e-05, + "loss": 0.2511, + "step": 31263 + }, + { + "epoch": 2.5327284510693455, + "grad_norm": 0.07169172912836075, + "learning_rate": 8.151581979387011e-05, + "loss": 0.2491, + "step": 31264 + }, + { + "epoch": 2.5328094620868438, + "grad_norm": 0.0770837813615799, + "learning_rate": 8.151131914127549e-05, + "loss": 0.25, + "step": 31265 + }, + { + "epoch": 2.5328904731043425, + "grad_norm": 0.06381841748952866, + "learning_rate": 8.150681848868086e-05, + "loss": 0.271, + "step": 31266 + }, + { + "epoch": 2.5329714841218407, + "grad_norm": 0.06991208344697952, + "learning_rate": 8.150231783608623e-05, + "loss": 0.2584, + "step": 31267 + }, + { + "epoch": 2.533052495139339, + "grad_norm": 0.06889082491397858, + "learning_rate": 8.14978171834916e-05, + "loss": 0.2506, + "step": 31268 + }, + { + "epoch": 2.533133506156837, + "grad_norm": 0.05752260610461235, + "learning_rate": 8.149331653089698e-05, + "loss": 0.2336, + "step": 31269 + }, + { + "epoch": 2.5332145171743354, + "grad_norm": 0.05750564485788345, + "learning_rate": 8.148881587830237e-05, + "loss": 0.2201, + "step": 31270 + }, + { + "epoch": 2.533295528191834, + "grad_norm": 0.05817430838942528, + "learning_rate": 8.148431522570773e-05, + "loss": 0.2258, + "step": 31271 + }, + { + "epoch": 2.5333765392093324, + "grad_norm": 0.07425309717655182, + "learning_rate": 8.14798145731131e-05, + "loss": 0.2193, + "step": 31272 + }, + { + "epoch": 2.5334575502268306, + "grad_norm": 0.07449954748153687, + "learning_rate": 8.147531392051849e-05, + "loss": 0.2453, + "step": 31273 + }, + { + "epoch": 2.5335385612443293, + "grad_norm": 0.07094830274581909, + "learning_rate": 8.147081326792385e-05, + "loss": 0.2577, + "step": 31274 + }, + { + "epoch": 2.5336195722618275, + "grad_norm": 0.057979848235845566, + "learning_rate": 8.146631261532922e-05, + "loss": 0.2077, + "step": 31275 + }, + { + "epoch": 2.533700583279326, + "grad_norm": 0.08039218187332153, + "learning_rate": 8.146181196273461e-05, + "loss": 0.2589, + "step": 31276 + }, + { + "epoch": 2.5337815942968245, + "grad_norm": 0.06383368372917175, + "learning_rate": 8.145731131013997e-05, + "loss": 0.229, + "step": 31277 + }, + { + "epoch": 2.5338626053143227, + "grad_norm": 0.05893819406628609, + "learning_rate": 8.145281065754536e-05, + "loss": 0.2638, + "step": 31278 + }, + { + "epoch": 2.533943616331821, + "grad_norm": 0.07219822704792023, + "learning_rate": 8.144831000495073e-05, + "loss": 0.2814, + "step": 31279 + }, + { + "epoch": 2.5340246273493197, + "grad_norm": 0.06139072775840759, + "learning_rate": 8.144380935235609e-05, + "loss": 0.2013, + "step": 31280 + }, + { + "epoch": 2.534105638366818, + "grad_norm": 0.06618008762598038, + "learning_rate": 8.143930869976148e-05, + "loss": 0.2239, + "step": 31281 + }, + { + "epoch": 2.534186649384316, + "grad_norm": 0.07001540809869766, + "learning_rate": 8.143480804716685e-05, + "loss": 0.2598, + "step": 31282 + }, + { + "epoch": 2.534267660401815, + "grad_norm": 0.06306258589029312, + "learning_rate": 8.143030739457221e-05, + "loss": 0.2509, + "step": 31283 + }, + { + "epoch": 2.534348671419313, + "grad_norm": 0.08775768429040909, + "learning_rate": 8.14258067419776e-05, + "loss": 0.2411, + "step": 31284 + }, + { + "epoch": 2.5344296824368113, + "grad_norm": 0.07452242076396942, + "learning_rate": 8.142130608938297e-05, + "loss": 0.2495, + "step": 31285 
+ }, + { + "epoch": 2.53451069345431, + "grad_norm": 0.0733480229973793, + "learning_rate": 8.141680543678833e-05, + "loss": 0.2598, + "step": 31286 + }, + { + "epoch": 2.5345917044718083, + "grad_norm": 0.08361136168241501, + "learning_rate": 8.141230478419372e-05, + "loss": 0.2812, + "step": 31287 + }, + { + "epoch": 2.5346727154893065, + "grad_norm": 0.06638027727603912, + "learning_rate": 8.140780413159909e-05, + "loss": 0.232, + "step": 31288 + }, + { + "epoch": 2.534753726506805, + "grad_norm": 0.0772709921002388, + "learning_rate": 8.140330347900445e-05, + "loss": 0.2764, + "step": 31289 + }, + { + "epoch": 2.5348347375243034, + "grad_norm": 0.06880713999271393, + "learning_rate": 8.139880282640984e-05, + "loss": 0.2691, + "step": 31290 + }, + { + "epoch": 2.5349157485418017, + "grad_norm": 0.07806142419576645, + "learning_rate": 8.139430217381521e-05, + "loss": 0.2485, + "step": 31291 + }, + { + "epoch": 2.5349967595593, + "grad_norm": 0.07091590017080307, + "learning_rate": 8.138980152122057e-05, + "loss": 0.2637, + "step": 31292 + }, + { + "epoch": 2.535077770576798, + "grad_norm": 0.07089286297559738, + "learning_rate": 8.138530086862596e-05, + "loss": 0.2482, + "step": 31293 + }, + { + "epoch": 2.535158781594297, + "grad_norm": 0.06219044700264931, + "learning_rate": 8.138080021603133e-05, + "loss": 0.2416, + "step": 31294 + }, + { + "epoch": 2.535239792611795, + "grad_norm": 0.07427731901407242, + "learning_rate": 8.13762995634367e-05, + "loss": 0.2847, + "step": 31295 + }, + { + "epoch": 2.5353208036292934, + "grad_norm": 0.06974802166223526, + "learning_rate": 8.137179891084208e-05, + "loss": 0.2975, + "step": 31296 + }, + { + "epoch": 2.535401814646792, + "grad_norm": 0.0793389305472374, + "learning_rate": 8.136729825824745e-05, + "loss": 0.2627, + "step": 31297 + }, + { + "epoch": 2.5354828256642903, + "grad_norm": 0.07065420597791672, + "learning_rate": 8.136279760565281e-05, + "loss": 0.2782, + "step": 31298 + }, + { + "epoch": 2.5355638366817885, + "grad_norm": 0.07344506680965424, + "learning_rate": 8.13582969530582e-05, + "loss": 0.2681, + "step": 31299 + }, + { + "epoch": 2.535644847699287, + "grad_norm": 0.06105487421154976, + "learning_rate": 8.135379630046358e-05, + "loss": 0.2714, + "step": 31300 + }, + { + "epoch": 2.5357258587167855, + "grad_norm": 0.06935641914606094, + "learning_rate": 8.134929564786894e-05, + "loss": 0.221, + "step": 31301 + }, + { + "epoch": 2.5358068697342837, + "grad_norm": 0.07248562574386597, + "learning_rate": 8.134479499527432e-05, + "loss": 0.2825, + "step": 31302 + }, + { + "epoch": 2.5358878807517824, + "grad_norm": 0.0756208673119545, + "learning_rate": 8.13402943426797e-05, + "loss": 0.2494, + "step": 31303 + }, + { + "epoch": 2.5359688917692806, + "grad_norm": 0.0709727481007576, + "learning_rate": 8.133579369008507e-05, + "loss": 0.2962, + "step": 31304 + }, + { + "epoch": 2.536049902786779, + "grad_norm": 0.05900624394416809, + "learning_rate": 8.133129303749044e-05, + "loss": 0.2414, + "step": 31305 + }, + { + "epoch": 2.5361309138042776, + "grad_norm": 0.08494237810373306, + "learning_rate": 8.132679238489582e-05, + "loss": 0.2627, + "step": 31306 + }, + { + "epoch": 2.536211924821776, + "grad_norm": 0.07157905399799347, + "learning_rate": 8.132229173230119e-05, + "loss": 0.2756, + "step": 31307 + }, + { + "epoch": 2.536292935839274, + "grad_norm": 0.05654566362500191, + "learning_rate": 8.131779107970656e-05, + "loss": 0.2383, + "step": 31308 + }, + { + "epoch": 2.5363739468567728, + "grad_norm": 0.06779604405164719, + 
"learning_rate": 8.131329042711194e-05, + "loss": 0.2504, + "step": 31309 + }, + { + "epoch": 2.536454957874271, + "grad_norm": 0.059628915041685104, + "learning_rate": 8.130878977451731e-05, + "loss": 0.216, + "step": 31310 + }, + { + "epoch": 2.5365359688917692, + "grad_norm": 0.07072723656892776, + "learning_rate": 8.130428912192268e-05, + "loss": 0.2538, + "step": 31311 + }, + { + "epoch": 2.536616979909268, + "grad_norm": 0.0625431165099144, + "learning_rate": 8.129978846932806e-05, + "loss": 0.2461, + "step": 31312 + }, + { + "epoch": 2.536697990926766, + "grad_norm": 0.07024965435266495, + "learning_rate": 8.129528781673343e-05, + "loss": 0.2557, + "step": 31313 + }, + { + "epoch": 2.5367790019442644, + "grad_norm": 0.07092742621898651, + "learning_rate": 8.12907871641388e-05, + "loss": 0.2451, + "step": 31314 + }, + { + "epoch": 2.5368600129617627, + "grad_norm": 0.07966975122690201, + "learning_rate": 8.128628651154418e-05, + "loss": 0.2333, + "step": 31315 + }, + { + "epoch": 2.536941023979261, + "grad_norm": 0.06543375551700592, + "learning_rate": 8.128178585894955e-05, + "loss": 0.2375, + "step": 31316 + }, + { + "epoch": 2.5370220349967596, + "grad_norm": 0.0650889202952385, + "learning_rate": 8.127728520635493e-05, + "loss": 0.2669, + "step": 31317 + }, + { + "epoch": 2.537103046014258, + "grad_norm": 0.06884335726499557, + "learning_rate": 8.12727845537603e-05, + "loss": 0.2407, + "step": 31318 + }, + { + "epoch": 2.537184057031756, + "grad_norm": 0.07633572816848755, + "learning_rate": 8.126828390116567e-05, + "loss": 0.2261, + "step": 31319 + }, + { + "epoch": 2.537265068049255, + "grad_norm": 0.08338648080825806, + "learning_rate": 8.126378324857105e-05, + "loss": 0.2446, + "step": 31320 + }, + { + "epoch": 2.537346079066753, + "grad_norm": 0.06513289362192154, + "learning_rate": 8.125928259597642e-05, + "loss": 0.2519, + "step": 31321 + }, + { + "epoch": 2.5374270900842513, + "grad_norm": 0.07970602810382843, + "learning_rate": 8.12547819433818e-05, + "loss": 0.2615, + "step": 31322 + }, + { + "epoch": 2.53750810110175, + "grad_norm": 0.06919629871845245, + "learning_rate": 8.125028129078717e-05, + "loss": 0.2653, + "step": 31323 + }, + { + "epoch": 2.537589112119248, + "grad_norm": 0.06731939315795898, + "learning_rate": 8.124578063819254e-05, + "loss": 0.244, + "step": 31324 + }, + { + "epoch": 2.5376701231367464, + "grad_norm": 0.08713632822036743, + "learning_rate": 8.124127998559792e-05, + "loss": 0.2377, + "step": 31325 + }, + { + "epoch": 2.537751134154245, + "grad_norm": 0.07864696532487869, + "learning_rate": 8.123677933300329e-05, + "loss": 0.2489, + "step": 31326 + }, + { + "epoch": 2.5378321451717434, + "grad_norm": 0.07397564500570297, + "learning_rate": 8.123227868040866e-05, + "loss": 0.2627, + "step": 31327 + }, + { + "epoch": 2.5379131561892416, + "grad_norm": 0.05374839901924133, + "learning_rate": 8.122777802781404e-05, + "loss": 0.2231, + "step": 31328 + }, + { + "epoch": 2.5379941672067403, + "grad_norm": 0.06318262964487076, + "learning_rate": 8.122327737521941e-05, + "loss": 0.2434, + "step": 31329 + }, + { + "epoch": 2.5380751782242386, + "grad_norm": 0.07979314029216766, + "learning_rate": 8.121877672262478e-05, + "loss": 0.2595, + "step": 31330 + }, + { + "epoch": 2.538156189241737, + "grad_norm": 0.06891234219074249, + "learning_rate": 8.121427607003016e-05, + "loss": 0.2358, + "step": 31331 + }, + { + "epoch": 2.5382372002592355, + "grad_norm": 0.054671745747327805, + "learning_rate": 8.120977541743553e-05, + "loss": 0.234, + "step": 31332 + 
}, + { + "epoch": 2.5383182112767337, + "grad_norm": 0.06143256276845932, + "learning_rate": 8.12052747648409e-05, + "loss": 0.2299, + "step": 31333 + }, + { + "epoch": 2.538399222294232, + "grad_norm": 0.0636318251490593, + "learning_rate": 8.120077411224628e-05, + "loss": 0.2702, + "step": 31334 + }, + { + "epoch": 2.5384802333117307, + "grad_norm": 0.07338964939117432, + "learning_rate": 8.119627345965165e-05, + "loss": 0.2552, + "step": 31335 + }, + { + "epoch": 2.538561244329229, + "grad_norm": 0.07319425791501999, + "learning_rate": 8.119177280705702e-05, + "loss": 0.217, + "step": 31336 + }, + { + "epoch": 2.538642255346727, + "grad_norm": 0.06431514769792557, + "learning_rate": 8.11872721544624e-05, + "loss": 0.2385, + "step": 31337 + }, + { + "epoch": 2.5387232663642254, + "grad_norm": 0.06492746621370316, + "learning_rate": 8.118277150186777e-05, + "loss": 0.218, + "step": 31338 + }, + { + "epoch": 2.5388042773817237, + "grad_norm": 0.07999663054943085, + "learning_rate": 8.117827084927316e-05, + "loss": 0.2526, + "step": 31339 + }, + { + "epoch": 2.5388852883992223, + "grad_norm": 0.08836555480957031, + "learning_rate": 8.117377019667852e-05, + "loss": 0.2302, + "step": 31340 + }, + { + "epoch": 2.5389662994167206, + "grad_norm": 0.0734882652759552, + "learning_rate": 8.116926954408389e-05, + "loss": 0.2532, + "step": 31341 + }, + { + "epoch": 2.539047310434219, + "grad_norm": 0.07000671327114105, + "learning_rate": 8.116476889148928e-05, + "loss": 0.2284, + "step": 31342 + }, + { + "epoch": 2.5391283214517175, + "grad_norm": 0.07682245224714279, + "learning_rate": 8.116026823889464e-05, + "loss": 0.2607, + "step": 31343 + }, + { + "epoch": 2.5392093324692158, + "grad_norm": 0.07145370543003082, + "learning_rate": 8.115576758630001e-05, + "loss": 0.2871, + "step": 31344 + }, + { + "epoch": 2.539290343486714, + "grad_norm": 0.07194703817367554, + "learning_rate": 8.11512669337054e-05, + "loss": 0.2931, + "step": 31345 + }, + { + "epoch": 2.5393713545042127, + "grad_norm": 0.09231244027614594, + "learning_rate": 8.114676628111076e-05, + "loss": 0.2872, + "step": 31346 + }, + { + "epoch": 2.539452365521711, + "grad_norm": 0.06625858694314957, + "learning_rate": 8.114226562851613e-05, + "loss": 0.2093, + "step": 31347 + }, + { + "epoch": 2.539533376539209, + "grad_norm": 0.06716077029705048, + "learning_rate": 8.113776497592152e-05, + "loss": 0.2434, + "step": 31348 + }, + { + "epoch": 2.539614387556708, + "grad_norm": 0.05468598008155823, + "learning_rate": 8.113326432332688e-05, + "loss": 0.2153, + "step": 31349 + }, + { + "epoch": 2.539695398574206, + "grad_norm": 0.05044583976268768, + "learning_rate": 8.112876367073226e-05, + "loss": 0.2816, + "step": 31350 + }, + { + "epoch": 2.5397764095917044, + "grad_norm": 0.08922028541564941, + "learning_rate": 8.112426301813764e-05, + "loss": 0.3512, + "step": 31351 + }, + { + "epoch": 2.539857420609203, + "grad_norm": 0.07008825987577438, + "learning_rate": 8.1119762365543e-05, + "loss": 0.2424, + "step": 31352 + }, + { + "epoch": 2.5399384316267013, + "grad_norm": 0.07038585841655731, + "learning_rate": 8.111526171294838e-05, + "loss": 0.2347, + "step": 31353 + }, + { + "epoch": 2.5400194426441995, + "grad_norm": 0.06255844235420227, + "learning_rate": 8.111076106035376e-05, + "loss": 0.25, + "step": 31354 + }, + { + "epoch": 2.5401004536616982, + "grad_norm": 0.06451769173145294, + "learning_rate": 8.110626040775912e-05, + "loss": 0.2458, + "step": 31355 + }, + { + "epoch": 2.5401814646791965, + "grad_norm": 0.0686870664358139, + 
"learning_rate": 8.110175975516451e-05, + "loss": 0.2339, + "step": 31356 + }, + { + "epoch": 2.5402624756966947, + "grad_norm": 0.07512461394071579, + "learning_rate": 8.109725910256988e-05, + "loss": 0.2332, + "step": 31357 + }, + { + "epoch": 2.540343486714193, + "grad_norm": 0.07387032359838486, + "learning_rate": 8.109275844997524e-05, + "loss": 0.2731, + "step": 31358 + }, + { + "epoch": 2.5404244977316917, + "grad_norm": 0.07460983842611313, + "learning_rate": 8.108825779738063e-05, + "loss": 0.2667, + "step": 31359 + }, + { + "epoch": 2.54050550874919, + "grad_norm": 0.09056951850652695, + "learning_rate": 8.1083757144786e-05, + "loss": 0.2487, + "step": 31360 + }, + { + "epoch": 2.540586519766688, + "grad_norm": 0.0651530846953392, + "learning_rate": 8.107925649219136e-05, + "loss": 0.267, + "step": 31361 + }, + { + "epoch": 2.5406675307841864, + "grad_norm": 0.07405731827020645, + "learning_rate": 8.107475583959675e-05, + "loss": 0.2498, + "step": 31362 + }, + { + "epoch": 2.540748541801685, + "grad_norm": 0.08580704033374786, + "learning_rate": 8.107025518700213e-05, + "loss": 0.239, + "step": 31363 + }, + { + "epoch": 2.5408295528191833, + "grad_norm": 0.08362039923667908, + "learning_rate": 8.106575453440749e-05, + "loss": 0.2814, + "step": 31364 + }, + { + "epoch": 2.5409105638366816, + "grad_norm": 0.06828504055738449, + "learning_rate": 8.106125388181287e-05, + "loss": 0.2383, + "step": 31365 + }, + { + "epoch": 2.5409915748541803, + "grad_norm": 0.060805875808000565, + "learning_rate": 8.105675322921825e-05, + "loss": 0.2859, + "step": 31366 + }, + { + "epoch": 2.5410725858716785, + "grad_norm": 0.08148464560508728, + "learning_rate": 8.10522525766236e-05, + "loss": 0.2517, + "step": 31367 + }, + { + "epoch": 2.5411535968891767, + "grad_norm": 0.0642819032073021, + "learning_rate": 8.1047751924029e-05, + "loss": 0.2102, + "step": 31368 + }, + { + "epoch": 2.5412346079066754, + "grad_norm": 0.07097551971673965, + "learning_rate": 8.104325127143437e-05, + "loss": 0.2484, + "step": 31369 + }, + { + "epoch": 2.5413156189241737, + "grad_norm": 0.07399990409612656, + "learning_rate": 8.103875061883973e-05, + "loss": 0.2383, + "step": 31370 + }, + { + "epoch": 2.541396629941672, + "grad_norm": 0.0638495683670044, + "learning_rate": 8.103424996624511e-05, + "loss": 0.2475, + "step": 31371 + }, + { + "epoch": 2.5414776409591706, + "grad_norm": 0.06386625021696091, + "learning_rate": 8.102974931365049e-05, + "loss": 0.2494, + "step": 31372 + }, + { + "epoch": 2.541558651976669, + "grad_norm": 0.08529839664697647, + "learning_rate": 8.102524866105585e-05, + "loss": 0.2434, + "step": 31373 + }, + { + "epoch": 2.541639662994167, + "grad_norm": 0.07638844847679138, + "learning_rate": 8.102074800846124e-05, + "loss": 0.2781, + "step": 31374 + }, + { + "epoch": 2.541720674011666, + "grad_norm": 0.06507378816604614, + "learning_rate": 8.101624735586661e-05, + "loss": 0.2088, + "step": 31375 + }, + { + "epoch": 2.541801685029164, + "grad_norm": 0.062385451048612595, + "learning_rate": 8.101174670327197e-05, + "loss": 0.321, + "step": 31376 + }, + { + "epoch": 2.5418826960466623, + "grad_norm": 0.051668040454387665, + "learning_rate": 8.100724605067736e-05, + "loss": 0.2125, + "step": 31377 + }, + { + "epoch": 2.541963707064161, + "grad_norm": 0.06572634726762772, + "learning_rate": 8.100274539808273e-05, + "loss": 0.2803, + "step": 31378 + }, + { + "epoch": 2.542044718081659, + "grad_norm": 0.08419101685285568, + "learning_rate": 8.099824474548809e-05, + "loss": 0.2258, + "step": 31379 + }, 
+ { + "epoch": 2.5421257290991575, + "grad_norm": 0.05883840471506119, + "learning_rate": 8.099374409289348e-05, + "loss": 0.2255, + "step": 31380 + }, + { + "epoch": 2.5422067401166557, + "grad_norm": 0.0676005482673645, + "learning_rate": 8.098924344029885e-05, + "loss": 0.2646, + "step": 31381 + }, + { + "epoch": 2.5422877511341544, + "grad_norm": 0.09608455747365952, + "learning_rate": 8.098474278770422e-05, + "loss": 0.2543, + "step": 31382 + }, + { + "epoch": 2.5423687621516526, + "grad_norm": 0.07695041596889496, + "learning_rate": 8.09802421351096e-05, + "loss": 0.2484, + "step": 31383 + }, + { + "epoch": 2.542449773169151, + "grad_norm": 0.05805501341819763, + "learning_rate": 8.097574148251497e-05, + "loss": 0.2323, + "step": 31384 + }, + { + "epoch": 2.542530784186649, + "grad_norm": 0.06507986783981323, + "learning_rate": 8.097124082992035e-05, + "loss": 0.2528, + "step": 31385 + }, + { + "epoch": 2.542611795204148, + "grad_norm": 0.06332040578126907, + "learning_rate": 8.096674017732572e-05, + "loss": 0.2356, + "step": 31386 + }, + { + "epoch": 2.542692806221646, + "grad_norm": 0.06507432460784912, + "learning_rate": 8.096223952473109e-05, + "loss": 0.2551, + "step": 31387 + }, + { + "epoch": 2.5427738172391443, + "grad_norm": 0.08199954777956009, + "learning_rate": 8.095773887213647e-05, + "loss": 0.258, + "step": 31388 + }, + { + "epoch": 2.542854828256643, + "grad_norm": 0.0717313215136528, + "learning_rate": 8.095323821954184e-05, + "loss": 0.2619, + "step": 31389 + }, + { + "epoch": 2.5429358392741412, + "grad_norm": 0.056809939444065094, + "learning_rate": 8.094873756694721e-05, + "loss": 0.2235, + "step": 31390 + }, + { + "epoch": 2.5430168502916395, + "grad_norm": 0.06941332668066025, + "learning_rate": 8.094423691435259e-05, + "loss": 0.246, + "step": 31391 + }, + { + "epoch": 2.543097861309138, + "grad_norm": 0.07208379358053207, + "learning_rate": 8.093973626175796e-05, + "loss": 0.2508, + "step": 31392 + }, + { + "epoch": 2.5431788723266364, + "grad_norm": 0.06503705680370331, + "learning_rate": 8.093523560916333e-05, + "loss": 0.2538, + "step": 31393 + }, + { + "epoch": 2.5432598833441347, + "grad_norm": 0.07446480542421341, + "learning_rate": 8.093073495656871e-05, + "loss": 0.3158, + "step": 31394 + }, + { + "epoch": 2.5433408943616334, + "grad_norm": 0.06678412109613419, + "learning_rate": 8.092623430397408e-05, + "loss": 0.223, + "step": 31395 + }, + { + "epoch": 2.5434219053791316, + "grad_norm": 0.05912339687347412, + "learning_rate": 8.092173365137945e-05, + "loss": 0.2329, + "step": 31396 + }, + { + "epoch": 2.54350291639663, + "grad_norm": 0.06796559691429138, + "learning_rate": 8.091723299878483e-05, + "loss": 0.225, + "step": 31397 + }, + { + "epoch": 2.5435839274141285, + "grad_norm": 0.07572735846042633, + "learning_rate": 8.09127323461902e-05, + "loss": 0.2327, + "step": 31398 + }, + { + "epoch": 2.5436649384316268, + "grad_norm": 0.07548676431179047, + "learning_rate": 8.090823169359558e-05, + "loss": 0.2619, + "step": 31399 + }, + { + "epoch": 2.543745949449125, + "grad_norm": 0.05666543170809746, + "learning_rate": 8.090373104100095e-05, + "loss": 0.2124, + "step": 31400 + }, + { + "epoch": 2.5438269604666237, + "grad_norm": 0.0540505051612854, + "learning_rate": 8.089923038840632e-05, + "loss": 0.2521, + "step": 31401 + }, + { + "epoch": 2.543907971484122, + "grad_norm": 0.07738293707370758, + "learning_rate": 8.08947297358117e-05, + "loss": 0.239, + "step": 31402 + }, + { + "epoch": 2.54398898250162, + "grad_norm": 0.07248470187187195, + 
"learning_rate": 8.089022908321707e-05, + "loss": 0.2341, + "step": 31403 + }, + { + "epoch": 2.5440699935191184, + "grad_norm": 0.06621932983398438, + "learning_rate": 8.088572843062244e-05, + "loss": 0.2474, + "step": 31404 + }, + { + "epoch": 2.544151004536617, + "grad_norm": 0.06753819435834885, + "learning_rate": 8.088122777802782e-05, + "loss": 0.253, + "step": 31405 + }, + { + "epoch": 2.5442320155541154, + "grad_norm": 0.06610064208507538, + "learning_rate": 8.087672712543319e-05, + "loss": 0.2116, + "step": 31406 + }, + { + "epoch": 2.5443130265716136, + "grad_norm": 0.06519056856632233, + "learning_rate": 8.087222647283856e-05, + "loss": 0.298, + "step": 31407 + }, + { + "epoch": 2.544394037589112, + "grad_norm": 0.080140121281147, + "learning_rate": 8.086772582024395e-05, + "loss": 0.2725, + "step": 31408 + }, + { + "epoch": 2.5444750486066106, + "grad_norm": 0.06608147919178009, + "learning_rate": 8.086322516764931e-05, + "loss": 0.2641, + "step": 31409 + }, + { + "epoch": 2.544556059624109, + "grad_norm": 0.060456544160842896, + "learning_rate": 8.085872451505469e-05, + "loss": 0.2372, + "step": 31410 + }, + { + "epoch": 2.544637070641607, + "grad_norm": 0.06948632746934891, + "learning_rate": 8.085422386246007e-05, + "loss": 0.235, + "step": 31411 + }, + { + "epoch": 2.5447180816591057, + "grad_norm": 0.062045566737651825, + "learning_rate": 8.084972320986543e-05, + "loss": 0.2112, + "step": 31412 + }, + { + "epoch": 2.544799092676604, + "grad_norm": 0.05757555738091469, + "learning_rate": 8.08452225572708e-05, + "loss": 0.2134, + "step": 31413 + }, + { + "epoch": 2.5448801036941022, + "grad_norm": 0.06793248653411865, + "learning_rate": 8.084072190467619e-05, + "loss": 0.2696, + "step": 31414 + }, + { + "epoch": 2.544961114711601, + "grad_norm": 0.05913359671831131, + "learning_rate": 8.083622125208155e-05, + "loss": 0.2545, + "step": 31415 + }, + { + "epoch": 2.545042125729099, + "grad_norm": 0.061100032180547714, + "learning_rate": 8.083172059948693e-05, + "loss": 0.2445, + "step": 31416 + }, + { + "epoch": 2.5451231367465974, + "grad_norm": 0.07344558835029602, + "learning_rate": 8.082721994689231e-05, + "loss": 0.2253, + "step": 31417 + }, + { + "epoch": 2.545204147764096, + "grad_norm": 0.06750749051570892, + "learning_rate": 8.082271929429767e-05, + "loss": 0.288, + "step": 31418 + }, + { + "epoch": 2.5452851587815943, + "grad_norm": 0.059958647936582565, + "learning_rate": 8.081821864170305e-05, + "loss": 0.2649, + "step": 31419 + }, + { + "epoch": 2.5453661697990926, + "grad_norm": 0.0737132877111435, + "learning_rate": 8.081371798910843e-05, + "loss": 0.218, + "step": 31420 + }, + { + "epoch": 2.5454471808165913, + "grad_norm": 0.06931359320878983, + "learning_rate": 8.08092173365138e-05, + "loss": 0.229, + "step": 31421 + }, + { + "epoch": 2.5455281918340895, + "grad_norm": 0.07487521320581436, + "learning_rate": 8.080471668391917e-05, + "loss": 0.2505, + "step": 31422 + }, + { + "epoch": 2.5456092028515878, + "grad_norm": 0.05484943836927414, + "learning_rate": 8.080021603132456e-05, + "loss": 0.2027, + "step": 31423 + }, + { + "epoch": 2.5456902138690864, + "grad_norm": 0.07246629893779755, + "learning_rate": 8.079571537872992e-05, + "loss": 0.2567, + "step": 31424 + }, + { + "epoch": 2.5457712248865847, + "grad_norm": 0.07829077541828156, + "learning_rate": 8.079121472613529e-05, + "loss": 0.273, + "step": 31425 + }, + { + "epoch": 2.545852235904083, + "grad_norm": 0.05823566019535065, + "learning_rate": 8.078671407354068e-05, + "loss": 0.2361, + "step": 31426 + 
}, + { + "epoch": 2.545933246921581, + "grad_norm": 0.05854753777384758, + "learning_rate": 8.078221342094604e-05, + "loss": 0.2787, + "step": 31427 + }, + { + "epoch": 2.54601425793908, + "grad_norm": 0.0628264844417572, + "learning_rate": 8.077771276835141e-05, + "loss": 0.2208, + "step": 31428 + }, + { + "epoch": 2.546095268956578, + "grad_norm": 0.0791764110326767, + "learning_rate": 8.07732121157568e-05, + "loss": 0.2485, + "step": 31429 + }, + { + "epoch": 2.5461762799740764, + "grad_norm": 0.06644466519355774, + "learning_rate": 8.076871146316216e-05, + "loss": 0.2638, + "step": 31430 + }, + { + "epoch": 2.5462572909915746, + "grad_norm": 0.07715664803981781, + "learning_rate": 8.076421081056753e-05, + "loss": 0.2882, + "step": 31431 + }, + { + "epoch": 2.5463383020090733, + "grad_norm": 0.07745123654603958, + "learning_rate": 8.075971015797292e-05, + "loss": 0.2652, + "step": 31432 + }, + { + "epoch": 2.5464193130265715, + "grad_norm": 0.07673593610525131, + "learning_rate": 8.075520950537828e-05, + "loss": 0.2977, + "step": 31433 + }, + { + "epoch": 2.54650032404407, + "grad_norm": 0.06416471302509308, + "learning_rate": 8.075070885278367e-05, + "loss": 0.2543, + "step": 31434 + }, + { + "epoch": 2.5465813350615685, + "grad_norm": 0.0717669129371643, + "learning_rate": 8.074620820018904e-05, + "loss": 0.2554, + "step": 31435 + }, + { + "epoch": 2.5466623460790667, + "grad_norm": 0.06670583039522171, + "learning_rate": 8.07417075475944e-05, + "loss": 0.2304, + "step": 31436 + }, + { + "epoch": 2.546743357096565, + "grad_norm": 0.06272175908088684, + "learning_rate": 8.073720689499979e-05, + "loss": 0.2561, + "step": 31437 + }, + { + "epoch": 2.5468243681140637, + "grad_norm": 0.05772021785378456, + "learning_rate": 8.073270624240516e-05, + "loss": 0.2488, + "step": 31438 + }, + { + "epoch": 2.546905379131562, + "grad_norm": 0.07507701963186264, + "learning_rate": 8.072820558981052e-05, + "loss": 0.2547, + "step": 31439 + }, + { + "epoch": 2.54698639014906, + "grad_norm": 0.07226193696260452, + "learning_rate": 8.07237049372159e-05, + "loss": 0.2555, + "step": 31440 + }, + { + "epoch": 2.547067401166559, + "grad_norm": 0.04555051773786545, + "learning_rate": 8.071920428462128e-05, + "loss": 0.2192, + "step": 31441 + }, + { + "epoch": 2.547148412184057, + "grad_norm": 0.07527697086334229, + "learning_rate": 8.071470363202664e-05, + "loss": 0.2175, + "step": 31442 + }, + { + "epoch": 2.5472294232015553, + "grad_norm": 0.057383034378290176, + "learning_rate": 8.071020297943203e-05, + "loss": 0.2166, + "step": 31443 + }, + { + "epoch": 2.547310434219054, + "grad_norm": 0.07875962555408478, + "learning_rate": 8.07057023268374e-05, + "loss": 0.2251, + "step": 31444 + }, + { + "epoch": 2.5473914452365523, + "grad_norm": 0.05845116823911667, + "learning_rate": 8.070120167424276e-05, + "loss": 0.2163, + "step": 31445 + }, + { + "epoch": 2.5474724562540505, + "grad_norm": 0.062219440937042236, + "learning_rate": 8.069670102164815e-05, + "loss": 0.2429, + "step": 31446 + }, + { + "epoch": 2.547553467271549, + "grad_norm": 0.06835257261991501, + "learning_rate": 8.069220036905352e-05, + "loss": 0.2873, + "step": 31447 + }, + { + "epoch": 2.5476344782890474, + "grad_norm": 0.07006371021270752, + "learning_rate": 8.068769971645888e-05, + "loss": 0.2312, + "step": 31448 + }, + { + "epoch": 2.5477154893065457, + "grad_norm": 0.06568927317857742, + "learning_rate": 8.068319906386427e-05, + "loss": 0.2677, + "step": 31449 + }, + { + "epoch": 2.547796500324044, + "grad_norm": 0.07233678549528122, + 
"learning_rate": 8.067869841126964e-05, + "loss": 0.2782, + "step": 31450 + }, + { + "epoch": 2.5478775113415426, + "grad_norm": 0.05997056886553764, + "learning_rate": 8.0674197758675e-05, + "loss": 0.2002, + "step": 31451 + }, + { + "epoch": 2.547958522359041, + "grad_norm": 0.07329046726226807, + "learning_rate": 8.066969710608039e-05, + "loss": 0.2554, + "step": 31452 + }, + { + "epoch": 2.548039533376539, + "grad_norm": 0.06880810111761093, + "learning_rate": 8.066519645348576e-05, + "loss": 0.2187, + "step": 31453 + }, + { + "epoch": 2.5481205443940373, + "grad_norm": 0.07072120159864426, + "learning_rate": 8.066069580089112e-05, + "loss": 0.2397, + "step": 31454 + }, + { + "epoch": 2.548201555411536, + "grad_norm": 0.0745464563369751, + "learning_rate": 8.065619514829651e-05, + "loss": 0.2726, + "step": 31455 + }, + { + "epoch": 2.5482825664290343, + "grad_norm": 0.07428991794586182, + "learning_rate": 8.065169449570188e-05, + "loss": 0.2519, + "step": 31456 + }, + { + "epoch": 2.5483635774465325, + "grad_norm": 0.06004711240530014, + "learning_rate": 8.064719384310724e-05, + "loss": 0.2289, + "step": 31457 + }, + { + "epoch": 2.548444588464031, + "grad_norm": 0.07008999586105347, + "learning_rate": 8.064269319051263e-05, + "loss": 0.2509, + "step": 31458 + }, + { + "epoch": 2.5485255994815295, + "grad_norm": 0.06014377623796463, + "learning_rate": 8.0638192537918e-05, + "loss": 0.2232, + "step": 31459 + }, + { + "epoch": 2.5486066104990277, + "grad_norm": 0.07197581231594086, + "learning_rate": 8.063369188532337e-05, + "loss": 0.2214, + "step": 31460 + }, + { + "epoch": 2.5486876215165264, + "grad_norm": 0.07166114449501038, + "learning_rate": 8.062919123272875e-05, + "loss": 0.2319, + "step": 31461 + }, + { + "epoch": 2.5487686325340246, + "grad_norm": 0.06933080404996872, + "learning_rate": 8.062469058013413e-05, + "loss": 0.2551, + "step": 31462 + }, + { + "epoch": 2.548849643551523, + "grad_norm": 0.05811614543199539, + "learning_rate": 8.06201899275395e-05, + "loss": 0.2361, + "step": 31463 + }, + { + "epoch": 2.5489306545690216, + "grad_norm": 0.06434644013643265, + "learning_rate": 8.061568927494487e-05, + "loss": 0.2391, + "step": 31464 + }, + { + "epoch": 2.54901166558652, + "grad_norm": 0.07208694517612457, + "learning_rate": 8.061118862235025e-05, + "loss": 0.2571, + "step": 31465 + }, + { + "epoch": 2.549092676604018, + "grad_norm": 0.05893586575984955, + "learning_rate": 8.060668796975562e-05, + "loss": 0.2181, + "step": 31466 + }, + { + "epoch": 2.5491736876215167, + "grad_norm": 0.07099951803684235, + "learning_rate": 8.0602187317161e-05, + "loss": 0.2707, + "step": 31467 + }, + { + "epoch": 2.549254698639015, + "grad_norm": 0.0784766674041748, + "learning_rate": 8.059768666456637e-05, + "loss": 0.2883, + "step": 31468 + }, + { + "epoch": 2.5493357096565132, + "grad_norm": 0.07410445064306259, + "learning_rate": 8.059318601197174e-05, + "loss": 0.2503, + "step": 31469 + }, + { + "epoch": 2.549416720674012, + "grad_norm": 0.07164378464221954, + "learning_rate": 8.058868535937711e-05, + "loss": 0.246, + "step": 31470 + }, + { + "epoch": 2.54949773169151, + "grad_norm": 0.06276273727416992, + "learning_rate": 8.058418470678249e-05, + "loss": 0.2383, + "step": 31471 + }, + { + "epoch": 2.5495787427090084, + "grad_norm": 0.07285445928573608, + "learning_rate": 8.057968405418786e-05, + "loss": 0.2663, + "step": 31472 + }, + { + "epoch": 2.5496597537265067, + "grad_norm": 0.059940405189991, + "learning_rate": 8.057518340159324e-05, + "loss": 0.2534, + "step": 31473 + }, + { 
+ "epoch": 2.5497407647440054, + "grad_norm": 0.060176264494657516, + "learning_rate": 8.057068274899861e-05, + "loss": 0.2396, + "step": 31474 + }, + { + "epoch": 2.5498217757615036, + "grad_norm": 0.08137813955545425, + "learning_rate": 8.056618209640398e-05, + "loss": 0.2975, + "step": 31475 + }, + { + "epoch": 2.549902786779002, + "grad_norm": 0.0577470101416111, + "learning_rate": 8.056168144380936e-05, + "loss": 0.2355, + "step": 31476 + }, + { + "epoch": 2.5499837977965, + "grad_norm": 0.059146296232938766, + "learning_rate": 8.055718079121473e-05, + "loss": 0.1929, + "step": 31477 + }, + { + "epoch": 2.5500648088139988, + "grad_norm": 0.06168728321790695, + "learning_rate": 8.05526801386201e-05, + "loss": 0.2511, + "step": 31478 + }, + { + "epoch": 2.550145819831497, + "grad_norm": 0.06982030719518661, + "learning_rate": 8.054817948602548e-05, + "loss": 0.2568, + "step": 31479 + }, + { + "epoch": 2.5502268308489953, + "grad_norm": 0.07995269447565079, + "learning_rate": 8.054367883343085e-05, + "loss": 0.2949, + "step": 31480 + }, + { + "epoch": 2.550307841866494, + "grad_norm": 0.06170627847313881, + "learning_rate": 8.053917818083622e-05, + "loss": 0.2176, + "step": 31481 + }, + { + "epoch": 2.550388852883992, + "grad_norm": 0.05501281097531319, + "learning_rate": 8.05346775282416e-05, + "loss": 0.2013, + "step": 31482 + }, + { + "epoch": 2.5504698639014904, + "grad_norm": 0.0645870566368103, + "learning_rate": 8.053017687564697e-05, + "loss": 0.2536, + "step": 31483 + }, + { + "epoch": 2.550550874918989, + "grad_norm": 0.08232105523347855, + "learning_rate": 8.052567622305235e-05, + "loss": 0.2796, + "step": 31484 + }, + { + "epoch": 2.5506318859364874, + "grad_norm": 0.08395248651504517, + "learning_rate": 8.052117557045772e-05, + "loss": 0.2554, + "step": 31485 + }, + { + "epoch": 2.5507128969539856, + "grad_norm": 0.06629413366317749, + "learning_rate": 8.051667491786309e-05, + "loss": 0.2121, + "step": 31486 + }, + { + "epoch": 2.5507939079714843, + "grad_norm": 0.08047202974557877, + "learning_rate": 8.051217426526847e-05, + "loss": 0.2666, + "step": 31487 + }, + { + "epoch": 2.5508749189889826, + "grad_norm": 0.07313171774148941, + "learning_rate": 8.050767361267384e-05, + "loss": 0.2548, + "step": 31488 + }, + { + "epoch": 2.550955930006481, + "grad_norm": 0.07625266164541245, + "learning_rate": 8.050317296007923e-05, + "loss": 0.2876, + "step": 31489 + }, + { + "epoch": 2.5510369410239795, + "grad_norm": 0.060665592551231384, + "learning_rate": 8.049867230748459e-05, + "loss": 0.1688, + "step": 31490 + }, + { + "epoch": 2.5511179520414777, + "grad_norm": 0.052935559302568436, + "learning_rate": 8.049417165488996e-05, + "loss": 0.22, + "step": 31491 + }, + { + "epoch": 2.551198963058976, + "grad_norm": 0.07563591003417969, + "learning_rate": 8.048967100229535e-05, + "loss": 0.2976, + "step": 31492 + }, + { + "epoch": 2.5512799740764747, + "grad_norm": 0.0724901482462883, + "learning_rate": 8.048517034970071e-05, + "loss": 0.2973, + "step": 31493 + }, + { + "epoch": 2.551360985093973, + "grad_norm": 0.06520567089319229, + "learning_rate": 8.048066969710608e-05, + "loss": 0.2555, + "step": 31494 + }, + { + "epoch": 2.551441996111471, + "grad_norm": 0.05636593699455261, + "learning_rate": 8.047616904451147e-05, + "loss": 0.2236, + "step": 31495 + }, + { + "epoch": 2.5515230071289694, + "grad_norm": 0.06789355725049973, + "learning_rate": 8.047166839191683e-05, + "loss": 0.2481, + "step": 31496 + }, + { + "epoch": 2.5516040181464676, + "grad_norm": 0.06700514256954193, + 
"learning_rate": 8.04671677393222e-05, + "loss": 0.2253, + "step": 31497 + }, + { + "epoch": 2.5516850291639663, + "grad_norm": 0.06449780613183975, + "learning_rate": 8.046266708672759e-05, + "loss": 0.2459, + "step": 31498 + }, + { + "epoch": 2.5517660401814646, + "grad_norm": 0.07504129409790039, + "learning_rate": 8.045816643413295e-05, + "loss": 0.2684, + "step": 31499 + }, + { + "epoch": 2.551847051198963, + "grad_norm": 0.09547720849514008, + "learning_rate": 8.045366578153832e-05, + "loss": 0.2897, + "step": 31500 + }, + { + "epoch": 2.5519280622164615, + "grad_norm": 0.07837854325771332, + "learning_rate": 8.044916512894371e-05, + "loss": 0.2486, + "step": 31501 + }, + { + "epoch": 2.5520090732339598, + "grad_norm": 0.07050782442092896, + "learning_rate": 8.044466447634907e-05, + "loss": 0.2473, + "step": 31502 + }, + { + "epoch": 2.552090084251458, + "grad_norm": 0.0633266493678093, + "learning_rate": 8.044016382375444e-05, + "loss": 0.2212, + "step": 31503 + }, + { + "epoch": 2.5521710952689567, + "grad_norm": 0.06830959767103195, + "learning_rate": 8.043566317115983e-05, + "loss": 0.2966, + "step": 31504 + }, + { + "epoch": 2.552252106286455, + "grad_norm": 0.06610594689846039, + "learning_rate": 8.043116251856519e-05, + "loss": 0.2113, + "step": 31505 + }, + { + "epoch": 2.552333117303953, + "grad_norm": 0.06861544400453568, + "learning_rate": 8.042666186597056e-05, + "loss": 0.2738, + "step": 31506 + }, + { + "epoch": 2.552414128321452, + "grad_norm": 0.06505028903484344, + "learning_rate": 8.042216121337595e-05, + "loss": 0.2386, + "step": 31507 + }, + { + "epoch": 2.55249513933895, + "grad_norm": 0.05172327533364296, + "learning_rate": 8.041766056078131e-05, + "loss": 0.2403, + "step": 31508 + }, + { + "epoch": 2.5525761503564484, + "grad_norm": 0.07511154562234879, + "learning_rate": 8.041315990818669e-05, + "loss": 0.2425, + "step": 31509 + }, + { + "epoch": 2.552657161373947, + "grad_norm": 0.08115319162607193, + "learning_rate": 8.040865925559207e-05, + "loss": 0.2258, + "step": 31510 + }, + { + "epoch": 2.5527381723914453, + "grad_norm": 0.062161825597286224, + "learning_rate": 8.040415860299743e-05, + "loss": 0.283, + "step": 31511 + }, + { + "epoch": 2.5528191834089435, + "grad_norm": 0.07682914286851883, + "learning_rate": 8.03996579504028e-05, + "loss": 0.2659, + "step": 31512 + }, + { + "epoch": 2.5529001944264422, + "grad_norm": 0.07859036326408386, + "learning_rate": 8.03951572978082e-05, + "loss": 0.2941, + "step": 31513 + }, + { + "epoch": 2.5529812054439405, + "grad_norm": 0.06644420325756073, + "learning_rate": 8.039065664521355e-05, + "loss": 0.2299, + "step": 31514 + }, + { + "epoch": 2.5530622164614387, + "grad_norm": 0.07618606835603714, + "learning_rate": 8.038615599261894e-05, + "loss": 0.2805, + "step": 31515 + }, + { + "epoch": 2.5531432274789374, + "grad_norm": 0.05901041626930237, + "learning_rate": 8.038165534002431e-05, + "loss": 0.2437, + "step": 31516 + }, + { + "epoch": 2.5532242384964356, + "grad_norm": 0.05900729075074196, + "learning_rate": 8.037715468742967e-05, + "loss": 0.2205, + "step": 31517 + }, + { + "epoch": 2.553305249513934, + "grad_norm": 0.05804867297410965, + "learning_rate": 8.037265403483506e-05, + "loss": 0.2451, + "step": 31518 + }, + { + "epoch": 2.553386260531432, + "grad_norm": 0.08153724670410156, + "learning_rate": 8.036815338224043e-05, + "loss": 0.2562, + "step": 31519 + }, + { + "epoch": 2.5534672715489304, + "grad_norm": 0.08371604979038239, + "learning_rate": 8.03636527296458e-05, + "loss": 0.2581, + "step": 31520 
+ }, + { + "epoch": 2.553548282566429, + "grad_norm": 0.07773632556200027, + "learning_rate": 8.035915207705118e-05, + "loss": 0.2601, + "step": 31521 + }, + { + "epoch": 2.5536292935839273, + "grad_norm": 0.06940475851297379, + "learning_rate": 8.035465142445656e-05, + "loss": 0.2382, + "step": 31522 + }, + { + "epoch": 2.5537103046014256, + "grad_norm": 0.06401082873344421, + "learning_rate": 8.035015077186192e-05, + "loss": 0.2451, + "step": 31523 + }, + { + "epoch": 2.5537913156189243, + "grad_norm": 0.07718578726053238, + "learning_rate": 8.03456501192673e-05, + "loss": 0.3, + "step": 31524 + }, + { + "epoch": 2.5538723266364225, + "grad_norm": 0.06541422009468079, + "learning_rate": 8.034114946667268e-05, + "loss": 0.2555, + "step": 31525 + }, + { + "epoch": 2.5539533376539207, + "grad_norm": 0.06255664676427841, + "learning_rate": 8.033664881407804e-05, + "loss": 0.2131, + "step": 31526 + }, + { + "epoch": 2.5540343486714194, + "grad_norm": 0.07217202335596085, + "learning_rate": 8.033214816148342e-05, + "loss": 0.2385, + "step": 31527 + }, + { + "epoch": 2.5541153596889177, + "grad_norm": 0.08286501467227936, + "learning_rate": 8.03276475088888e-05, + "loss": 0.2232, + "step": 31528 + }, + { + "epoch": 2.554196370706416, + "grad_norm": 0.0661536231637001, + "learning_rate": 8.032314685629416e-05, + "loss": 0.2298, + "step": 31529 + }, + { + "epoch": 2.5542773817239146, + "grad_norm": 0.06117008253931999, + "learning_rate": 8.031864620369954e-05, + "loss": 0.234, + "step": 31530 + }, + { + "epoch": 2.554358392741413, + "grad_norm": 0.06352005898952484, + "learning_rate": 8.031414555110492e-05, + "loss": 0.2151, + "step": 31531 + }, + { + "epoch": 2.554439403758911, + "grad_norm": 0.0644054189324379, + "learning_rate": 8.030964489851028e-05, + "loss": 0.2567, + "step": 31532 + }, + { + "epoch": 2.55452041477641, + "grad_norm": 0.06056513637304306, + "learning_rate": 8.030514424591567e-05, + "loss": 0.2268, + "step": 31533 + }, + { + "epoch": 2.554601425793908, + "grad_norm": 0.058263856917619705, + "learning_rate": 8.030064359332104e-05, + "loss": 0.2508, + "step": 31534 + }, + { + "epoch": 2.5546824368114063, + "grad_norm": 0.08078998327255249, + "learning_rate": 8.02961429407264e-05, + "loss": 0.2539, + "step": 31535 + }, + { + "epoch": 2.554763447828905, + "grad_norm": 0.057538606226444244, + "learning_rate": 8.029164228813179e-05, + "loss": 0.2433, + "step": 31536 + }, + { + "epoch": 2.554844458846403, + "grad_norm": 0.05998895317316055, + "learning_rate": 8.028714163553716e-05, + "loss": 0.2392, + "step": 31537 + }, + { + "epoch": 2.5549254698639015, + "grad_norm": 0.062351860105991364, + "learning_rate": 8.028264098294252e-05, + "loss": 0.2477, + "step": 31538 + }, + { + "epoch": 2.5550064808814, + "grad_norm": 0.0619591660797596, + "learning_rate": 8.027814033034791e-05, + "loss": 0.2093, + "step": 31539 + }, + { + "epoch": 2.5550874918988984, + "grad_norm": 0.06385014206171036, + "learning_rate": 8.027363967775328e-05, + "loss": 0.2536, + "step": 31540 + }, + { + "epoch": 2.5551685029163966, + "grad_norm": 0.06226972118020058, + "learning_rate": 8.026913902515865e-05, + "loss": 0.244, + "step": 31541 + }, + { + "epoch": 2.555249513933895, + "grad_norm": 0.059874311089515686, + "learning_rate": 8.026463837256403e-05, + "loss": 0.2393, + "step": 31542 + }, + { + "epoch": 2.555330524951393, + "grad_norm": 0.06457412987947464, + "learning_rate": 8.02601377199694e-05, + "loss": 0.2551, + "step": 31543 + }, + { + "epoch": 2.555411535968892, + "grad_norm": 0.05957409739494324, + 
"learning_rate": 8.025563706737478e-05, + "loss": 0.2265, + "step": 31544 + }, + { + "epoch": 2.55549254698639, + "grad_norm": 0.06354091316461563, + "learning_rate": 8.025113641478015e-05, + "loss": 0.2137, + "step": 31545 + }, + { + "epoch": 2.5555735580038883, + "grad_norm": 0.06427013874053955, + "learning_rate": 8.024663576218552e-05, + "loss": 0.2358, + "step": 31546 + }, + { + "epoch": 2.555654569021387, + "grad_norm": 0.06286320835351944, + "learning_rate": 8.02421351095909e-05, + "loss": 0.2234, + "step": 31547 + }, + { + "epoch": 2.5557355800388852, + "grad_norm": 0.07858006656169891, + "learning_rate": 8.023763445699627e-05, + "loss": 0.24, + "step": 31548 + }, + { + "epoch": 2.5558165910563835, + "grad_norm": 0.07946208864450455, + "learning_rate": 8.023313380440164e-05, + "loss": 0.2449, + "step": 31549 + }, + { + "epoch": 2.555897602073882, + "grad_norm": 0.06396917253732681, + "learning_rate": 8.022863315180702e-05, + "loss": 0.2593, + "step": 31550 + }, + { + "epoch": 2.5559786130913804, + "grad_norm": 0.07191366702318192, + "learning_rate": 8.022413249921239e-05, + "loss": 0.2111, + "step": 31551 + }, + { + "epoch": 2.5560596241088787, + "grad_norm": 0.07205166667699814, + "learning_rate": 8.021963184661776e-05, + "loss": 0.2119, + "step": 31552 + }, + { + "epoch": 2.5561406351263773, + "grad_norm": 0.09519444406032562, + "learning_rate": 8.021513119402314e-05, + "loss": 0.2577, + "step": 31553 + }, + { + "epoch": 2.5562216461438756, + "grad_norm": 0.07464559376239777, + "learning_rate": 8.021063054142851e-05, + "loss": 0.2512, + "step": 31554 + }, + { + "epoch": 2.556302657161374, + "grad_norm": 0.07090155780315399, + "learning_rate": 8.020612988883388e-05, + "loss": 0.233, + "step": 31555 + }, + { + "epoch": 2.5563836681788725, + "grad_norm": 0.06708156317472458, + "learning_rate": 8.020162923623926e-05, + "loss": 0.2269, + "step": 31556 + }, + { + "epoch": 2.5564646791963708, + "grad_norm": 0.07535571604967117, + "learning_rate": 8.019712858364463e-05, + "loss": 0.2014, + "step": 31557 + }, + { + "epoch": 2.556545690213869, + "grad_norm": 0.07511401176452637, + "learning_rate": 8.019262793105e-05, + "loss": 0.2518, + "step": 31558 + }, + { + "epoch": 2.5566267012313677, + "grad_norm": 0.055385805666446686, + "learning_rate": 8.018812727845538e-05, + "loss": 0.213, + "step": 31559 + }, + { + "epoch": 2.556707712248866, + "grad_norm": 0.08293783664703369, + "learning_rate": 8.018362662586075e-05, + "loss": 0.2147, + "step": 31560 + }, + { + "epoch": 2.556788723266364, + "grad_norm": 0.08957002311944962, + "learning_rate": 8.017912597326613e-05, + "loss": 0.239, + "step": 31561 + }, + { + "epoch": 2.556869734283863, + "grad_norm": 0.07297007739543915, + "learning_rate": 8.01746253206715e-05, + "loss": 0.2445, + "step": 31562 + }, + { + "epoch": 2.556950745301361, + "grad_norm": 0.06064193323254585, + "learning_rate": 8.017012466807687e-05, + "loss": 0.2358, + "step": 31563 + }, + { + "epoch": 2.5570317563188594, + "grad_norm": 0.06830794364213943, + "learning_rate": 8.016562401548225e-05, + "loss": 0.258, + "step": 31564 + }, + { + "epoch": 2.5571127673363576, + "grad_norm": 0.06914442032575607, + "learning_rate": 8.016112336288762e-05, + "loss": 0.2373, + "step": 31565 + }, + { + "epoch": 2.557193778353856, + "grad_norm": 0.07195620238780975, + "learning_rate": 8.0156622710293e-05, + "loss": 0.247, + "step": 31566 + }, + { + "epoch": 2.5572747893713546, + "grad_norm": 0.06008646637201309, + "learning_rate": 8.015212205769838e-05, + "loss": 0.2111, + "step": 31567 + }, + { 
+ "epoch": 2.557355800388853, + "grad_norm": 0.0733952522277832, + "learning_rate": 8.014762140510374e-05, + "loss": 0.2395, + "step": 31568 + }, + { + "epoch": 2.557436811406351, + "grad_norm": 0.06406228989362717, + "learning_rate": 8.014312075250912e-05, + "loss": 0.2522, + "step": 31569 + }, + { + "epoch": 2.5575178224238497, + "grad_norm": 0.0706440731883049, + "learning_rate": 8.01386200999145e-05, + "loss": 0.251, + "step": 31570 + }, + { + "epoch": 2.557598833441348, + "grad_norm": 0.06202545017004013, + "learning_rate": 8.013411944731986e-05, + "loss": 0.2392, + "step": 31571 + }, + { + "epoch": 2.557679844458846, + "grad_norm": 0.06435450911521912, + "learning_rate": 8.012961879472524e-05, + "loss": 0.2225, + "step": 31572 + }, + { + "epoch": 2.557760855476345, + "grad_norm": 0.06245691329240799, + "learning_rate": 8.012511814213062e-05, + "loss": 0.2139, + "step": 31573 + }, + { + "epoch": 2.557841866493843, + "grad_norm": 0.07368949055671692, + "learning_rate": 8.012061748953598e-05, + "loss": 0.2669, + "step": 31574 + }, + { + "epoch": 2.5579228775113414, + "grad_norm": 0.09155303239822388, + "learning_rate": 8.011611683694136e-05, + "loss": 0.2254, + "step": 31575 + }, + { + "epoch": 2.55800388852884, + "grad_norm": 0.0641794353723526, + "learning_rate": 8.011161618434674e-05, + "loss": 0.2448, + "step": 31576 + }, + { + "epoch": 2.5580848995463383, + "grad_norm": 0.04910936951637268, + "learning_rate": 8.01071155317521e-05, + "loss": 0.2323, + "step": 31577 + }, + { + "epoch": 2.5581659105638366, + "grad_norm": 0.08007602393627167, + "learning_rate": 8.010261487915748e-05, + "loss": 0.2633, + "step": 31578 + }, + { + "epoch": 2.5582469215813353, + "grad_norm": 0.07051931321620941, + "learning_rate": 8.009811422656286e-05, + "loss": 0.2535, + "step": 31579 + }, + { + "epoch": 2.5583279325988335, + "grad_norm": 0.05478541553020477, + "learning_rate": 8.009361357396822e-05, + "loss": 0.2535, + "step": 31580 + }, + { + "epoch": 2.5584089436163318, + "grad_norm": 0.06377498805522919, + "learning_rate": 8.00891129213736e-05, + "loss": 0.2529, + "step": 31581 + }, + { + "epoch": 2.5584899546338304, + "grad_norm": 0.07668465375900269, + "learning_rate": 8.008461226877899e-05, + "loss": 0.2516, + "step": 31582 + }, + { + "epoch": 2.5585709656513287, + "grad_norm": 0.06877156347036362, + "learning_rate": 8.008011161618435e-05, + "loss": 0.2237, + "step": 31583 + }, + { + "epoch": 2.558651976668827, + "grad_norm": 0.07158958911895752, + "learning_rate": 8.007561096358972e-05, + "loss": 0.2451, + "step": 31584 + }, + { + "epoch": 2.558732987686325, + "grad_norm": 0.06303799152374268, + "learning_rate": 8.00711103109951e-05, + "loss": 0.238, + "step": 31585 + }, + { + "epoch": 2.558813998703824, + "grad_norm": 0.06521542370319366, + "learning_rate": 8.006660965840047e-05, + "loss": 0.2328, + "step": 31586 + }, + { + "epoch": 2.558895009721322, + "grad_norm": 0.08857200294733047, + "learning_rate": 8.006210900580584e-05, + "loss": 0.2863, + "step": 31587 + }, + { + "epoch": 2.5589760207388204, + "grad_norm": 0.06617053598165512, + "learning_rate": 8.005760835321123e-05, + "loss": 0.2446, + "step": 31588 + }, + { + "epoch": 2.5590570317563186, + "grad_norm": 0.07978159934282303, + "learning_rate": 8.005310770061659e-05, + "loss": 0.2124, + "step": 31589 + }, + { + "epoch": 2.5591380427738173, + "grad_norm": 0.060769423842430115, + "learning_rate": 8.004860704802196e-05, + "loss": 0.249, + "step": 31590 + }, + { + "epoch": 2.5592190537913155, + "grad_norm": 0.07907140254974365, + 
"learning_rate": 8.004410639542735e-05, + "loss": 0.2616, + "step": 31591 + }, + { + "epoch": 2.559300064808814, + "grad_norm": 0.0631987601518631, + "learning_rate": 8.003960574283271e-05, + "loss": 0.2509, + "step": 31592 + }, + { + "epoch": 2.5593810758263125, + "grad_norm": 0.0674910768866539, + "learning_rate": 8.00351050902381e-05, + "loss": 0.2322, + "step": 31593 + }, + { + "epoch": 2.5594620868438107, + "grad_norm": 0.06475397199392319, + "learning_rate": 8.003060443764347e-05, + "loss": 0.248, + "step": 31594 + }, + { + "epoch": 2.559543097861309, + "grad_norm": 0.06565103679895401, + "learning_rate": 8.002610378504883e-05, + "loss": 0.269, + "step": 31595 + }, + { + "epoch": 2.5596241088788076, + "grad_norm": 0.06982734799385071, + "learning_rate": 8.002160313245422e-05, + "loss": 0.2485, + "step": 31596 + }, + { + "epoch": 2.559705119896306, + "grad_norm": 0.0696418285369873, + "learning_rate": 8.001710247985959e-05, + "loss": 0.2317, + "step": 31597 + }, + { + "epoch": 2.559786130913804, + "grad_norm": 0.06628835201263428, + "learning_rate": 8.001260182726495e-05, + "loss": 0.2639, + "step": 31598 + }, + { + "epoch": 2.559867141931303, + "grad_norm": 0.08352109789848328, + "learning_rate": 8.000810117467034e-05, + "loss": 0.2375, + "step": 31599 + }, + { + "epoch": 2.559948152948801, + "grad_norm": 0.057371314615011215, + "learning_rate": 8.000360052207571e-05, + "loss": 0.1994, + "step": 31600 + }, + { + "epoch": 2.5600291639662993, + "grad_norm": 0.06860015541315079, + "learning_rate": 7.999909986948107e-05, + "loss": 0.2653, + "step": 31601 + }, + { + "epoch": 2.560110174983798, + "grad_norm": 0.07338287681341171, + "learning_rate": 7.999459921688646e-05, + "loss": 0.2302, + "step": 31602 + }, + { + "epoch": 2.5601911860012962, + "grad_norm": 0.08055268228054047, + "learning_rate": 7.999009856429183e-05, + "loss": 0.2321, + "step": 31603 + }, + { + "epoch": 2.5602721970187945, + "grad_norm": 0.06045558676123619, + "learning_rate": 7.998559791169719e-05, + "loss": 0.2268, + "step": 31604 + }, + { + "epoch": 2.560353208036293, + "grad_norm": 0.0702483206987381, + "learning_rate": 7.998109725910258e-05, + "loss": 0.2423, + "step": 31605 + }, + { + "epoch": 2.5604342190537914, + "grad_norm": 0.05823696404695511, + "learning_rate": 7.997659660650795e-05, + "loss": 0.2305, + "step": 31606 + }, + { + "epoch": 2.5605152300712897, + "grad_norm": 0.04120326414704323, + "learning_rate": 7.997209595391331e-05, + "loss": 0.1926, + "step": 31607 + }, + { + "epoch": 2.560596241088788, + "grad_norm": 0.07255415618419647, + "learning_rate": 7.99675953013187e-05, + "loss": 0.2321, + "step": 31608 + }, + { + "epoch": 2.5606772521062866, + "grad_norm": 0.06354595720767975, + "learning_rate": 7.996309464872407e-05, + "loss": 0.1984, + "step": 31609 + }, + { + "epoch": 2.560758263123785, + "grad_norm": 0.05864912271499634, + "learning_rate": 7.995859399612943e-05, + "loss": 0.2216, + "step": 31610 + }, + { + "epoch": 2.560839274141283, + "grad_norm": 0.07821990549564362, + "learning_rate": 7.995409334353482e-05, + "loss": 0.2606, + "step": 31611 + }, + { + "epoch": 2.5609202851587813, + "grad_norm": 0.08326715975999832, + "learning_rate": 7.99495926909402e-05, + "loss": 0.2755, + "step": 31612 + }, + { + "epoch": 2.56100129617628, + "grad_norm": 0.06333693861961365, + "learning_rate": 7.994509203834555e-05, + "loss": 0.2467, + "step": 31613 + }, + { + "epoch": 2.5610823071937783, + "grad_norm": 0.07545526325702667, + "learning_rate": 7.994059138575094e-05, + "loss": 0.2635, + "step": 31614 + }, 
+ { + "epoch": 2.5611633182112765, + "grad_norm": 0.07584195584058762, + "learning_rate": 7.993609073315631e-05, + "loss": 0.2271, + "step": 31615 + }, + { + "epoch": 2.561244329228775, + "grad_norm": 0.06313547492027283, + "learning_rate": 7.993159008056167e-05, + "loss": 0.2386, + "step": 31616 + }, + { + "epoch": 2.5613253402462735, + "grad_norm": 0.07154353708028793, + "learning_rate": 7.992708942796706e-05, + "loss": 0.2546, + "step": 31617 + }, + { + "epoch": 2.5614063512637717, + "grad_norm": 0.05730261653661728, + "learning_rate": 7.992258877537244e-05, + "loss": 0.2422, + "step": 31618 + }, + { + "epoch": 2.5614873622812704, + "grad_norm": 0.07545353472232819, + "learning_rate": 7.99180881227778e-05, + "loss": 0.2251, + "step": 31619 + }, + { + "epoch": 2.5615683732987686, + "grad_norm": 0.06900720298290253, + "learning_rate": 7.991358747018318e-05, + "loss": 0.2536, + "step": 31620 + }, + { + "epoch": 2.561649384316267, + "grad_norm": 0.08917799592018127, + "learning_rate": 7.990908681758856e-05, + "loss": 0.2861, + "step": 31621 + }, + { + "epoch": 2.5617303953337656, + "grad_norm": 0.07041896134614944, + "learning_rate": 7.990458616499393e-05, + "loss": 0.2607, + "step": 31622 + }, + { + "epoch": 2.561811406351264, + "grad_norm": 0.06320405751466751, + "learning_rate": 7.99000855123993e-05, + "loss": 0.2377, + "step": 31623 + }, + { + "epoch": 2.561892417368762, + "grad_norm": 0.07257578521966934, + "learning_rate": 7.989558485980468e-05, + "loss": 0.2455, + "step": 31624 + }, + { + "epoch": 2.5619734283862607, + "grad_norm": 0.05553307384252548, + "learning_rate": 7.989108420721005e-05, + "loss": 0.2239, + "step": 31625 + }, + { + "epoch": 2.562054439403759, + "grad_norm": 0.06904014199972153, + "learning_rate": 7.988658355461542e-05, + "loss": 0.2554, + "step": 31626 + }, + { + "epoch": 2.5621354504212572, + "grad_norm": 0.06648155301809311, + "learning_rate": 7.98820829020208e-05, + "loss": 0.2718, + "step": 31627 + }, + { + "epoch": 2.562216461438756, + "grad_norm": 0.06676594913005829, + "learning_rate": 7.987758224942617e-05, + "loss": 0.2238, + "step": 31628 + }, + { + "epoch": 2.562297472456254, + "grad_norm": 0.08011876046657562, + "learning_rate": 7.987308159683154e-05, + "loss": 0.2354, + "step": 31629 + }, + { + "epoch": 2.5623784834737524, + "grad_norm": 0.07950930297374725, + "learning_rate": 7.986858094423692e-05, + "loss": 0.2578, + "step": 31630 + }, + { + "epoch": 2.5624594944912507, + "grad_norm": 0.06657250970602036, + "learning_rate": 7.986408029164229e-05, + "loss": 0.2425, + "step": 31631 + }, + { + "epoch": 2.5625405055087493, + "grad_norm": 0.06820258498191833, + "learning_rate": 7.985957963904767e-05, + "loss": 0.2869, + "step": 31632 + }, + { + "epoch": 2.5626215165262476, + "grad_norm": 0.07058540731668472, + "learning_rate": 7.985507898645304e-05, + "loss": 0.2421, + "step": 31633 + }, + { + "epoch": 2.562702527543746, + "grad_norm": 0.082171231508255, + "learning_rate": 7.985057833385841e-05, + "loss": 0.2493, + "step": 31634 + }, + { + "epoch": 2.562783538561244, + "grad_norm": 0.07239805907011032, + "learning_rate": 7.984607768126379e-05, + "loss": 0.2511, + "step": 31635 + }, + { + "epoch": 2.5628645495787428, + "grad_norm": 0.08331374824047089, + "learning_rate": 7.984157702866916e-05, + "loss": 0.2364, + "step": 31636 + }, + { + "epoch": 2.562945560596241, + "grad_norm": 0.06185334920883179, + "learning_rate": 7.983707637607453e-05, + "loss": 0.2163, + "step": 31637 + }, + { + "epoch": 2.5630265716137393, + "grad_norm": 0.06657331436872482, + 
"learning_rate": 7.983257572347991e-05, + "loss": 0.2514, + "step": 31638 + }, + { + "epoch": 2.563107582631238, + "grad_norm": 0.07455500960350037, + "learning_rate": 7.982807507088528e-05, + "loss": 0.2303, + "step": 31639 + }, + { + "epoch": 2.563188593648736, + "grad_norm": 0.07059744745492935, + "learning_rate": 7.982357441829065e-05, + "loss": 0.2506, + "step": 31640 + }, + { + "epoch": 2.5632696046662344, + "grad_norm": 0.06850961595773697, + "learning_rate": 7.981907376569603e-05, + "loss": 0.2667, + "step": 31641 + }, + { + "epoch": 2.563350615683733, + "grad_norm": 0.061915162950754166, + "learning_rate": 7.98145731131014e-05, + "loss": 0.2525, + "step": 31642 + }, + { + "epoch": 2.5634316267012314, + "grad_norm": 0.056903161108493805, + "learning_rate": 7.981007246050678e-05, + "loss": 0.2664, + "step": 31643 + }, + { + "epoch": 2.5635126377187296, + "grad_norm": 0.06871285289525986, + "learning_rate": 7.980557180791215e-05, + "loss": 0.2468, + "step": 31644 + }, + { + "epoch": 2.5635936487362283, + "grad_norm": 0.06563066691160202, + "learning_rate": 7.980107115531752e-05, + "loss": 0.2289, + "step": 31645 + }, + { + "epoch": 2.5636746597537265, + "grad_norm": 0.07172146439552307, + "learning_rate": 7.97965705027229e-05, + "loss": 0.2563, + "step": 31646 + }, + { + "epoch": 2.563755670771225, + "grad_norm": 0.0756186842918396, + "learning_rate": 7.979206985012827e-05, + "loss": 0.2602, + "step": 31647 + }, + { + "epoch": 2.5638366817887235, + "grad_norm": 0.0699758529663086, + "learning_rate": 7.978756919753366e-05, + "loss": 0.2454, + "step": 31648 + }, + { + "epoch": 2.5639176928062217, + "grad_norm": 0.06742367148399353, + "learning_rate": 7.978306854493902e-05, + "loss": 0.2735, + "step": 31649 + }, + { + "epoch": 2.56399870382372, + "grad_norm": 0.07160858064889908, + "learning_rate": 7.977856789234439e-05, + "loss": 0.2511, + "step": 31650 + }, + { + "epoch": 2.5640797148412187, + "grad_norm": 0.06456317007541656, + "learning_rate": 7.977406723974978e-05, + "loss": 0.2287, + "step": 31651 + }, + { + "epoch": 2.564160725858717, + "grad_norm": 0.06363669037818909, + "learning_rate": 7.976956658715514e-05, + "loss": 0.2989, + "step": 31652 + }, + { + "epoch": 2.564241736876215, + "grad_norm": 0.07487460225820541, + "learning_rate": 7.976506593456051e-05, + "loss": 0.2616, + "step": 31653 + }, + { + "epoch": 2.5643227478937134, + "grad_norm": 0.056149642914533615, + "learning_rate": 7.97605652819659e-05, + "loss": 0.2086, + "step": 31654 + }, + { + "epoch": 2.564403758911212, + "grad_norm": 0.06197667494416237, + "learning_rate": 7.975606462937126e-05, + "loss": 0.2958, + "step": 31655 + }, + { + "epoch": 2.5644847699287103, + "grad_norm": 0.07219796627759933, + "learning_rate": 7.975156397677663e-05, + "loss": 0.2387, + "step": 31656 + }, + { + "epoch": 2.5645657809462086, + "grad_norm": 0.06966370344161987, + "learning_rate": 7.974706332418202e-05, + "loss": 0.2681, + "step": 31657 + }, + { + "epoch": 2.564646791963707, + "grad_norm": 0.07845804840326309, + "learning_rate": 7.974256267158738e-05, + "loss": 0.2618, + "step": 31658 + }, + { + "epoch": 2.5647278029812055, + "grad_norm": 0.07504191249608994, + "learning_rate": 7.973806201899275e-05, + "loss": 0.2152, + "step": 31659 + }, + { + "epoch": 2.5648088139987038, + "grad_norm": 0.0714363306760788, + "learning_rate": 7.973356136639814e-05, + "loss": 0.2462, + "step": 31660 + }, + { + "epoch": 2.564889825016202, + "grad_norm": 0.05532535910606384, + "learning_rate": 7.97290607138035e-05, + "loss": 0.2049, + "step": 31661 
+ }, + { + "epoch": 2.5649708360337007, + "grad_norm": 0.07604444026947021, + "learning_rate": 7.972456006120887e-05, + "loss": 0.2465, + "step": 31662 + }, + { + "epoch": 2.565051847051199, + "grad_norm": 0.06291288137435913, + "learning_rate": 7.972005940861426e-05, + "loss": 0.2371, + "step": 31663 + }, + { + "epoch": 2.565132858068697, + "grad_norm": 0.055569905787706375, + "learning_rate": 7.971555875601962e-05, + "loss": 0.2203, + "step": 31664 + }, + { + "epoch": 2.565213869086196, + "grad_norm": 0.07797441631555557, + "learning_rate": 7.9711058103425e-05, + "loss": 0.2219, + "step": 31665 + }, + { + "epoch": 2.565294880103694, + "grad_norm": 0.058151982724666595, + "learning_rate": 7.970655745083038e-05, + "loss": 0.2429, + "step": 31666 + }, + { + "epoch": 2.5653758911211924, + "grad_norm": 0.06993840634822845, + "learning_rate": 7.970205679823574e-05, + "loss": 0.2447, + "step": 31667 + }, + { + "epoch": 2.565456902138691, + "grad_norm": 0.08092262595891953, + "learning_rate": 7.969755614564112e-05, + "loss": 0.2304, + "step": 31668 + }, + { + "epoch": 2.5655379131561893, + "grad_norm": 0.060657061636447906, + "learning_rate": 7.96930554930465e-05, + "loss": 0.2387, + "step": 31669 + }, + { + "epoch": 2.5656189241736875, + "grad_norm": 0.0777408555150032, + "learning_rate": 7.968855484045186e-05, + "loss": 0.2553, + "step": 31670 + }, + { + "epoch": 2.565699935191186, + "grad_norm": 0.09145821630954742, + "learning_rate": 7.968405418785724e-05, + "loss": 0.2321, + "step": 31671 + }, + { + "epoch": 2.5657809462086845, + "grad_norm": 0.07050789892673492, + "learning_rate": 7.967955353526262e-05, + "loss": 0.2307, + "step": 31672 + }, + { + "epoch": 2.5658619572261827, + "grad_norm": 0.07583153247833252, + "learning_rate": 7.967505288266798e-05, + "loss": 0.2552, + "step": 31673 + }, + { + "epoch": 2.5659429682436814, + "grad_norm": 0.06492862850427628, + "learning_rate": 7.967055223007337e-05, + "loss": 0.2218, + "step": 31674 + }, + { + "epoch": 2.5660239792611796, + "grad_norm": 0.06800020486116409, + "learning_rate": 7.966605157747874e-05, + "loss": 0.2584, + "step": 31675 + }, + { + "epoch": 2.566104990278678, + "grad_norm": 0.06461351364850998, + "learning_rate": 7.96615509248841e-05, + "loss": 0.2393, + "step": 31676 + }, + { + "epoch": 2.566186001296176, + "grad_norm": 0.0789063423871994, + "learning_rate": 7.965705027228949e-05, + "loss": 0.329, + "step": 31677 + }, + { + "epoch": 2.566267012313675, + "grad_norm": 0.0644729882478714, + "learning_rate": 7.965254961969486e-05, + "loss": 0.2665, + "step": 31678 + }, + { + "epoch": 2.566348023331173, + "grad_norm": 0.07038956135511398, + "learning_rate": 7.964804896710023e-05, + "loss": 0.2604, + "step": 31679 + }, + { + "epoch": 2.5664290343486713, + "grad_norm": 0.06449288129806519, + "learning_rate": 7.964354831450561e-05, + "loss": 0.2496, + "step": 31680 + }, + { + "epoch": 2.5665100453661696, + "grad_norm": 0.0661557987332344, + "learning_rate": 7.963904766191099e-05, + "loss": 0.2896, + "step": 31681 + }, + { + "epoch": 2.5665910563836682, + "grad_norm": 0.06338050216436386, + "learning_rate": 7.963454700931635e-05, + "loss": 0.2271, + "step": 31682 + }, + { + "epoch": 2.5666720674011665, + "grad_norm": 0.07911043614149094, + "learning_rate": 7.963004635672173e-05, + "loss": 0.2658, + "step": 31683 + }, + { + "epoch": 2.5667530784186647, + "grad_norm": 0.06274065375328064, + "learning_rate": 7.96255457041271e-05, + "loss": 0.2752, + "step": 31684 + }, + { + "epoch": 2.5668340894361634, + "grad_norm": 0.06047653779387474, 
+ "learning_rate": 7.962104505153247e-05, + "loss": 0.2436, + "step": 31685 + }, + { + "epoch": 2.5669151004536617, + "grad_norm": 0.05865699425339699, + "learning_rate": 7.961654439893785e-05, + "loss": 0.2425, + "step": 31686 + }, + { + "epoch": 2.56699611147116, + "grad_norm": 0.05763997510075569, + "learning_rate": 7.961204374634323e-05, + "loss": 0.2008, + "step": 31687 + }, + { + "epoch": 2.5670771224886586, + "grad_norm": 0.06605460494756699, + "learning_rate": 7.960754309374859e-05, + "loss": 0.2653, + "step": 31688 + }, + { + "epoch": 2.567158133506157, + "grad_norm": 0.07231853157281876, + "learning_rate": 7.960304244115397e-05, + "loss": 0.2368, + "step": 31689 + }, + { + "epoch": 2.567239144523655, + "grad_norm": 0.059788014739751816, + "learning_rate": 7.959854178855935e-05, + "loss": 0.2558, + "step": 31690 + }, + { + "epoch": 2.567320155541154, + "grad_norm": 0.06757822632789612, + "learning_rate": 7.959404113596471e-05, + "loss": 0.2506, + "step": 31691 + }, + { + "epoch": 2.567401166558652, + "grad_norm": 0.06352180987596512, + "learning_rate": 7.95895404833701e-05, + "loss": 0.2464, + "step": 31692 + }, + { + "epoch": 2.5674821775761503, + "grad_norm": 0.0770125687122345, + "learning_rate": 7.958503983077547e-05, + "loss": 0.2519, + "step": 31693 + }, + { + "epoch": 2.567563188593649, + "grad_norm": 0.06670795381069183, + "learning_rate": 7.958053917818083e-05, + "loss": 0.2404, + "step": 31694 + }, + { + "epoch": 2.567644199611147, + "grad_norm": 0.07569408416748047, + "learning_rate": 7.957603852558622e-05, + "loss": 0.2846, + "step": 31695 + }, + { + "epoch": 2.5677252106286454, + "grad_norm": 0.08044213801622391, + "learning_rate": 7.957153787299159e-05, + "loss": 0.2473, + "step": 31696 + }, + { + "epoch": 2.567806221646144, + "grad_norm": 0.07201042026281357, + "learning_rate": 7.956703722039695e-05, + "loss": 0.2284, + "step": 31697 + }, + { + "epoch": 2.5678872326636424, + "grad_norm": 0.07905683666467667, + "learning_rate": 7.956253656780234e-05, + "loss": 0.2666, + "step": 31698 + }, + { + "epoch": 2.5679682436811406, + "grad_norm": 0.06858737021684647, + "learning_rate": 7.955803591520771e-05, + "loss": 0.2826, + "step": 31699 + }, + { + "epoch": 2.568049254698639, + "grad_norm": 0.06841877847909927, + "learning_rate": 7.955353526261308e-05, + "loss": 0.2319, + "step": 31700 + }, + { + "epoch": 2.568130265716137, + "grad_norm": 0.07550330460071564, + "learning_rate": 7.954903461001846e-05, + "loss": 0.2848, + "step": 31701 + }, + { + "epoch": 2.568211276733636, + "grad_norm": 0.07050354033708572, + "learning_rate": 7.954453395742383e-05, + "loss": 0.2248, + "step": 31702 + }, + { + "epoch": 2.568292287751134, + "grad_norm": 0.06879187375307083, + "learning_rate": 7.95400333048292e-05, + "loss": 0.2598, + "step": 31703 + }, + { + "epoch": 2.5683732987686323, + "grad_norm": 0.07464316487312317, + "learning_rate": 7.953553265223458e-05, + "loss": 0.2493, + "step": 31704 + }, + { + "epoch": 2.568454309786131, + "grad_norm": 0.08402939140796661, + "learning_rate": 7.953103199963995e-05, + "loss": 0.3024, + "step": 31705 + }, + { + "epoch": 2.5685353208036292, + "grad_norm": 0.08375585079193115, + "learning_rate": 7.952653134704533e-05, + "loss": 0.2586, + "step": 31706 + }, + { + "epoch": 2.5686163318211275, + "grad_norm": 0.07422322034835815, + "learning_rate": 7.95220306944507e-05, + "loss": 0.2421, + "step": 31707 + }, + { + "epoch": 2.568697342838626, + "grad_norm": 0.057322461158037186, + "learning_rate": 7.951753004185607e-05, + "loss": 0.2143, + "step": 31708 
+ }, + { + "epoch": 2.5687783538561244, + "grad_norm": 0.056815192103385925, + "learning_rate": 7.951302938926145e-05, + "loss": 0.2386, + "step": 31709 + }, + { + "epoch": 2.5688593648736227, + "grad_norm": 0.07299406826496124, + "learning_rate": 7.950852873666682e-05, + "loss": 0.2548, + "step": 31710 + }, + { + "epoch": 2.5689403758911213, + "grad_norm": 0.07193489372730255, + "learning_rate": 7.95040280840722e-05, + "loss": 0.2603, + "step": 31711 + }, + { + "epoch": 2.5690213869086196, + "grad_norm": 0.06784748286008835, + "learning_rate": 7.949952743147757e-05, + "loss": 0.2642, + "step": 31712 + }, + { + "epoch": 2.569102397926118, + "grad_norm": 0.07315580546855927, + "learning_rate": 7.949502677888294e-05, + "loss": 0.2688, + "step": 31713 + }, + { + "epoch": 2.5691834089436165, + "grad_norm": 0.07491733878850937, + "learning_rate": 7.949052612628831e-05, + "loss": 0.2509, + "step": 31714 + }, + { + "epoch": 2.5692644199611148, + "grad_norm": 0.061660926789045334, + "learning_rate": 7.948602547369369e-05, + "loss": 0.2521, + "step": 31715 + }, + { + "epoch": 2.569345430978613, + "grad_norm": 0.06637068837881088, + "learning_rate": 7.948152482109906e-05, + "loss": 0.2659, + "step": 31716 + }, + { + "epoch": 2.5694264419961117, + "grad_norm": 0.05649961158633232, + "learning_rate": 7.947702416850444e-05, + "loss": 0.2615, + "step": 31717 + }, + { + "epoch": 2.56950745301361, + "grad_norm": 0.07894111424684525, + "learning_rate": 7.947252351590981e-05, + "loss": 0.2268, + "step": 31718 + }, + { + "epoch": 2.569588464031108, + "grad_norm": 0.07335829734802246, + "learning_rate": 7.946802286331518e-05, + "loss": 0.2655, + "step": 31719 + }, + { + "epoch": 2.569669475048607, + "grad_norm": 0.06373408436775208, + "learning_rate": 7.946352221072056e-05, + "loss": 0.2919, + "step": 31720 + }, + { + "epoch": 2.569750486066105, + "grad_norm": 0.05747825652360916, + "learning_rate": 7.945902155812593e-05, + "loss": 0.2486, + "step": 31721 + }, + { + "epoch": 2.5698314970836034, + "grad_norm": 0.054663050919771194, + "learning_rate": 7.94545209055313e-05, + "loss": 0.244, + "step": 31722 + }, + { + "epoch": 2.5699125081011016, + "grad_norm": 0.049483031034469604, + "learning_rate": 7.945002025293668e-05, + "loss": 0.1919, + "step": 31723 + }, + { + "epoch": 2.5699935191186, + "grad_norm": 0.06937556713819504, + "learning_rate": 7.944551960034205e-05, + "loss": 0.2199, + "step": 31724 + }, + { + "epoch": 2.5700745301360985, + "grad_norm": 0.0625581368803978, + "learning_rate": 7.944101894774742e-05, + "loss": 0.212, + "step": 31725 + }, + { + "epoch": 2.570155541153597, + "grad_norm": 0.061997413635253906, + "learning_rate": 7.943651829515281e-05, + "loss": 0.1864, + "step": 31726 + }, + { + "epoch": 2.570236552171095, + "grad_norm": 0.05428750067949295, + "learning_rate": 7.943201764255817e-05, + "loss": 0.2125, + "step": 31727 + }, + { + "epoch": 2.5703175631885937, + "grad_norm": 0.0579020231962204, + "learning_rate": 7.942751698996355e-05, + "loss": 0.2402, + "step": 31728 + }, + { + "epoch": 2.570398574206092, + "grad_norm": 0.0654272735118866, + "learning_rate": 7.942301633736893e-05, + "loss": 0.2564, + "step": 31729 + }, + { + "epoch": 2.57047958522359, + "grad_norm": 0.06533244997262955, + "learning_rate": 7.941851568477429e-05, + "loss": 0.2245, + "step": 31730 + }, + { + "epoch": 2.570560596241089, + "grad_norm": 0.05718918517231941, + "learning_rate": 7.941401503217967e-05, + "loss": 0.2114, + "step": 31731 + }, + { + "epoch": 2.570641607258587, + "grad_norm": 0.07008680701255798, + 
"learning_rate": 7.940951437958505e-05, + "loss": 0.2426, + "step": 31732 + }, + { + "epoch": 2.5707226182760854, + "grad_norm": 0.09347264468669891, + "learning_rate": 7.940501372699041e-05, + "loss": 0.297, + "step": 31733 + }, + { + "epoch": 2.570803629293584, + "grad_norm": 0.0643296018242836, + "learning_rate": 7.940051307439579e-05, + "loss": 0.2487, + "step": 31734 + }, + { + "epoch": 2.5708846403110823, + "grad_norm": 0.06303554028272629, + "learning_rate": 7.939601242180117e-05, + "loss": 0.23, + "step": 31735 + }, + { + "epoch": 2.5709656513285806, + "grad_norm": 0.07969815284013748, + "learning_rate": 7.939151176920653e-05, + "loss": 0.279, + "step": 31736 + }, + { + "epoch": 2.5710466623460793, + "grad_norm": 0.0638255774974823, + "learning_rate": 7.938701111661191e-05, + "loss": 0.2303, + "step": 31737 + }, + { + "epoch": 2.5711276733635775, + "grad_norm": 0.07085902988910675, + "learning_rate": 7.93825104640173e-05, + "loss": 0.2291, + "step": 31738 + }, + { + "epoch": 2.5712086843810757, + "grad_norm": 0.07110630720853806, + "learning_rate": 7.937800981142265e-05, + "loss": 0.2587, + "step": 31739 + }, + { + "epoch": 2.5712896953985744, + "grad_norm": 0.06317129731178284, + "learning_rate": 7.937350915882803e-05, + "loss": 0.2378, + "step": 31740 + }, + { + "epoch": 2.5713707064160727, + "grad_norm": 0.06473881006240845, + "learning_rate": 7.936900850623342e-05, + "loss": 0.2282, + "step": 31741 + }, + { + "epoch": 2.571451717433571, + "grad_norm": 0.05658971518278122, + "learning_rate": 7.936450785363878e-05, + "loss": 0.2565, + "step": 31742 + }, + { + "epoch": 2.5715327284510696, + "grad_norm": 0.08154220134019852, + "learning_rate": 7.936000720104415e-05, + "loss": 0.2951, + "step": 31743 + }, + { + "epoch": 2.571613739468568, + "grad_norm": 0.06489664316177368, + "learning_rate": 7.935550654844954e-05, + "loss": 0.2579, + "step": 31744 + }, + { + "epoch": 2.571694750486066, + "grad_norm": 0.06681975722312927, + "learning_rate": 7.93510058958549e-05, + "loss": 0.2536, + "step": 31745 + }, + { + "epoch": 2.5717757615035644, + "grad_norm": 0.07080577313899994, + "learning_rate": 7.934650524326027e-05, + "loss": 0.2278, + "step": 31746 + }, + { + "epoch": 2.5718567725210626, + "grad_norm": 0.05441749840974808, + "learning_rate": 7.934200459066566e-05, + "loss": 0.2455, + "step": 31747 + }, + { + "epoch": 2.5719377835385613, + "grad_norm": 0.061398494988679886, + "learning_rate": 7.933750393807102e-05, + "loss": 0.2176, + "step": 31748 + }, + { + "epoch": 2.5720187945560595, + "grad_norm": 0.06527212262153625, + "learning_rate": 7.933300328547639e-05, + "loss": 0.2472, + "step": 31749 + }, + { + "epoch": 2.5720998055735578, + "grad_norm": 0.06973239779472351, + "learning_rate": 7.932850263288178e-05, + "loss": 0.2494, + "step": 31750 + }, + { + "epoch": 2.5721808165910565, + "grad_norm": 0.07312745600938797, + "learning_rate": 7.932400198028714e-05, + "loss": 0.2688, + "step": 31751 + }, + { + "epoch": 2.5722618276085547, + "grad_norm": 0.0772642195224762, + "learning_rate": 7.931950132769253e-05, + "loss": 0.2538, + "step": 31752 + }, + { + "epoch": 2.572342838626053, + "grad_norm": 0.08061595261096954, + "learning_rate": 7.93150006750979e-05, + "loss": 0.2651, + "step": 31753 + }, + { + "epoch": 2.5724238496435516, + "grad_norm": 0.06649156659841537, + "learning_rate": 7.931050002250326e-05, + "loss": 0.2767, + "step": 31754 + }, + { + "epoch": 2.57250486066105, + "grad_norm": 0.07221254706382751, + "learning_rate": 7.930599936990865e-05, + "loss": 0.2387, + "step": 31755 
+ }, + { + "epoch": 2.572585871678548, + "grad_norm": 0.06058627739548683, + "learning_rate": 7.930149871731402e-05, + "loss": 0.2548, + "step": 31756 + }, + { + "epoch": 2.572666882696047, + "grad_norm": 0.07540536671876907, + "learning_rate": 7.929699806471938e-05, + "loss": 0.232, + "step": 31757 + }, + { + "epoch": 2.572747893713545, + "grad_norm": 0.07115985453128815, + "learning_rate": 7.929249741212477e-05, + "loss": 0.2661, + "step": 31758 + }, + { + "epoch": 2.5728289047310433, + "grad_norm": 0.06167732924222946, + "learning_rate": 7.928799675953014e-05, + "loss": 0.2584, + "step": 31759 + }, + { + "epoch": 2.572909915748542, + "grad_norm": 0.07705485820770264, + "learning_rate": 7.92834961069355e-05, + "loss": 0.2734, + "step": 31760 + }, + { + "epoch": 2.5729909267660402, + "grad_norm": 0.08480952680110931, + "learning_rate": 7.927899545434089e-05, + "loss": 0.2601, + "step": 31761 + }, + { + "epoch": 2.5730719377835385, + "grad_norm": 0.06356513500213623, + "learning_rate": 7.927449480174626e-05, + "loss": 0.2332, + "step": 31762 + }, + { + "epoch": 2.573152948801037, + "grad_norm": 0.07468358427286148, + "learning_rate": 7.926999414915162e-05, + "loss": 0.266, + "step": 31763 + }, + { + "epoch": 2.5732339598185354, + "grad_norm": 0.07145358622074127, + "learning_rate": 7.926549349655701e-05, + "loss": 0.2509, + "step": 31764 + }, + { + "epoch": 2.5733149708360337, + "grad_norm": 0.059972260147333145, + "learning_rate": 7.926099284396238e-05, + "loss": 0.2798, + "step": 31765 + }, + { + "epoch": 2.5733959818535324, + "grad_norm": 0.0660482719540596, + "learning_rate": 7.925649219136774e-05, + "loss": 0.2027, + "step": 31766 + }, + { + "epoch": 2.5734769928710306, + "grad_norm": 0.06355316191911697, + "learning_rate": 7.925199153877313e-05, + "loss": 0.2612, + "step": 31767 + }, + { + "epoch": 2.573558003888529, + "grad_norm": 0.06815008074045181, + "learning_rate": 7.92474908861785e-05, + "loss": 0.2356, + "step": 31768 + }, + { + "epoch": 2.573639014906027, + "grad_norm": 0.06347496062517166, + "learning_rate": 7.924299023358386e-05, + "loss": 0.2563, + "step": 31769 + }, + { + "epoch": 2.5737200259235253, + "grad_norm": 0.06929799169301987, + "learning_rate": 7.923848958098925e-05, + "loss": 0.2598, + "step": 31770 + }, + { + "epoch": 2.573801036941024, + "grad_norm": 0.05718935653567314, + "learning_rate": 7.923398892839462e-05, + "loss": 0.2301, + "step": 31771 + }, + { + "epoch": 2.5738820479585223, + "grad_norm": 0.07201238721609116, + "learning_rate": 7.922948827579998e-05, + "loss": 0.2603, + "step": 31772 + }, + { + "epoch": 2.5739630589760205, + "grad_norm": 0.06836757063865662, + "learning_rate": 7.922498762320537e-05, + "loss": 0.2469, + "step": 31773 + }, + { + "epoch": 2.574044069993519, + "grad_norm": 0.06981032341718674, + "learning_rate": 7.922048697061074e-05, + "loss": 0.2563, + "step": 31774 + }, + { + "epoch": 2.5741250810110174, + "grad_norm": 0.0501057431101799, + "learning_rate": 7.92159863180161e-05, + "loss": 0.2203, + "step": 31775 + }, + { + "epoch": 2.5742060920285157, + "grad_norm": 0.06120969355106354, + "learning_rate": 7.921148566542149e-05, + "loss": 0.2058, + "step": 31776 + }, + { + "epoch": 2.5742871030460144, + "grad_norm": 0.058200716972351074, + "learning_rate": 7.920698501282687e-05, + "loss": 0.294, + "step": 31777 + }, + { + "epoch": 2.5743681140635126, + "grad_norm": 0.06421802192926407, + "learning_rate": 7.920248436023223e-05, + "loss": 0.2257, + "step": 31778 + }, + { + "epoch": 2.574449125081011, + "grad_norm": 
0.062095798552036285, + "learning_rate": 7.919798370763761e-05, + "loss": 0.2375, + "step": 31779 + }, + { + "epoch": 2.5745301360985096, + "grad_norm": 0.09116526693105698, + "learning_rate": 7.919348305504299e-05, + "loss": 0.2646, + "step": 31780 + }, + { + "epoch": 2.574611147116008, + "grad_norm": 0.06377584487199783, + "learning_rate": 7.918898240244836e-05, + "loss": 0.2194, + "step": 31781 + }, + { + "epoch": 2.574692158133506, + "grad_norm": 0.06614330410957336, + "learning_rate": 7.918448174985373e-05, + "loss": 0.2386, + "step": 31782 + }, + { + "epoch": 2.5747731691510047, + "grad_norm": 0.06941062211990356, + "learning_rate": 7.917998109725911e-05, + "loss": 0.2201, + "step": 31783 + }, + { + "epoch": 2.574854180168503, + "grad_norm": 0.07736022025346756, + "learning_rate": 7.917548044466448e-05, + "loss": 0.2627, + "step": 31784 + }, + { + "epoch": 2.5749351911860012, + "grad_norm": 0.07738102227449417, + "learning_rate": 7.917097979206985e-05, + "loss": 0.2485, + "step": 31785 + }, + { + "epoch": 2.5750162022035, + "grad_norm": 0.0658000260591507, + "learning_rate": 7.916647913947523e-05, + "loss": 0.2313, + "step": 31786 + }, + { + "epoch": 2.575097213220998, + "grad_norm": 0.05562092363834381, + "learning_rate": 7.91619784868806e-05, + "loss": 0.1899, + "step": 31787 + }, + { + "epoch": 2.5751782242384964, + "grad_norm": 0.07304248958826065, + "learning_rate": 7.915747783428597e-05, + "loss": 0.2122, + "step": 31788 + }, + { + "epoch": 2.5752592352559946, + "grad_norm": 0.06350831687450409, + "learning_rate": 7.915297718169135e-05, + "loss": 0.2167, + "step": 31789 + }, + { + "epoch": 2.5753402462734933, + "grad_norm": 0.07086034119129181, + "learning_rate": 7.914847652909672e-05, + "loss": 0.277, + "step": 31790 + }, + { + "epoch": 2.5754212572909916, + "grad_norm": 0.0860385000705719, + "learning_rate": 7.91439758765021e-05, + "loss": 0.2773, + "step": 31791 + }, + { + "epoch": 2.57550226830849, + "grad_norm": 0.08237428963184357, + "learning_rate": 7.913947522390747e-05, + "loss": 0.2728, + "step": 31792 + }, + { + "epoch": 2.575583279325988, + "grad_norm": 0.06451267749071121, + "learning_rate": 7.913497457131284e-05, + "loss": 0.23, + "step": 31793 + }, + { + "epoch": 2.5756642903434868, + "grad_norm": 0.07084333151578903, + "learning_rate": 7.913047391871822e-05, + "loss": 0.2147, + "step": 31794 + }, + { + "epoch": 2.575745301360985, + "grad_norm": 0.07405141741037369, + "learning_rate": 7.912597326612359e-05, + "loss": 0.2789, + "step": 31795 + }, + { + "epoch": 2.5758263123784833, + "grad_norm": 0.06787556409835815, + "learning_rate": 7.912147261352896e-05, + "loss": 0.2233, + "step": 31796 + }, + { + "epoch": 2.575907323395982, + "grad_norm": 0.0673149824142456, + "learning_rate": 7.911697196093434e-05, + "loss": 0.2395, + "step": 31797 + }, + { + "epoch": 2.57598833441348, + "grad_norm": 0.062474120408296585, + "learning_rate": 7.911247130833971e-05, + "loss": 0.2255, + "step": 31798 + }, + { + "epoch": 2.5760693454309784, + "grad_norm": 0.08614230901002884, + "learning_rate": 7.910797065574508e-05, + "loss": 0.2667, + "step": 31799 + }, + { + "epoch": 2.576150356448477, + "grad_norm": 0.0551866814494133, + "learning_rate": 7.910347000315046e-05, + "loss": 0.2323, + "step": 31800 + }, + { + "epoch": 2.5762313674659754, + "grad_norm": 0.06509862095117569, + "learning_rate": 7.909896935055583e-05, + "loss": 0.2328, + "step": 31801 + }, + { + "epoch": 2.5763123784834736, + "grad_norm": 0.06465253233909607, + "learning_rate": 7.90944686979612e-05, + "loss": 0.2326, 
+ "step": 31802 + }, + { + "epoch": 2.5763933895009723, + "grad_norm": 0.07347733527421951, + "learning_rate": 7.908996804536658e-05, + "loss": 0.2337, + "step": 31803 + }, + { + "epoch": 2.5764744005184705, + "grad_norm": 0.0698373094201088, + "learning_rate": 7.908546739277195e-05, + "loss": 0.2174, + "step": 31804 + }, + { + "epoch": 2.576555411535969, + "grad_norm": 0.06428837776184082, + "learning_rate": 7.908096674017733e-05, + "loss": 0.2271, + "step": 31805 + }, + { + "epoch": 2.5766364225534675, + "grad_norm": 0.08448978513479233, + "learning_rate": 7.90764660875827e-05, + "loss": 0.2484, + "step": 31806 + }, + { + "epoch": 2.5767174335709657, + "grad_norm": 0.07157791405916214, + "learning_rate": 7.907196543498809e-05, + "loss": 0.274, + "step": 31807 + }, + { + "epoch": 2.576798444588464, + "grad_norm": 0.06949556618928909, + "learning_rate": 7.906746478239345e-05, + "loss": 0.2483, + "step": 31808 + }, + { + "epoch": 2.5768794556059627, + "grad_norm": 0.0694814994931221, + "learning_rate": 7.906296412979882e-05, + "loss": 0.2675, + "step": 31809 + }, + { + "epoch": 2.576960466623461, + "grad_norm": 0.0681164488196373, + "learning_rate": 7.905846347720421e-05, + "loss": 0.2498, + "step": 31810 + }, + { + "epoch": 2.577041477640959, + "grad_norm": 0.06991052627563477, + "learning_rate": 7.905396282460957e-05, + "loss": 0.2577, + "step": 31811 + }, + { + "epoch": 2.5771224886584574, + "grad_norm": 0.07411139458417892, + "learning_rate": 7.904946217201494e-05, + "loss": 0.2511, + "step": 31812 + }, + { + "epoch": 2.577203499675956, + "grad_norm": 0.07247655838727951, + "learning_rate": 7.904496151942033e-05, + "loss": 0.2281, + "step": 31813 + }, + { + "epoch": 2.5772845106934543, + "grad_norm": 0.06791792064905167, + "learning_rate": 7.904046086682569e-05, + "loss": 0.1999, + "step": 31814 + }, + { + "epoch": 2.5773655217109526, + "grad_norm": 0.06338424980640411, + "learning_rate": 7.903596021423106e-05, + "loss": 0.2779, + "step": 31815 + }, + { + "epoch": 2.577446532728451, + "grad_norm": 0.06921472400426865, + "learning_rate": 7.903145956163645e-05, + "loss": 0.2648, + "step": 31816 + }, + { + "epoch": 2.5775275437459495, + "grad_norm": 0.06940491497516632, + "learning_rate": 7.902695890904181e-05, + "loss": 0.2565, + "step": 31817 + }, + { + "epoch": 2.5776085547634477, + "grad_norm": 0.09353849291801453, + "learning_rate": 7.902245825644718e-05, + "loss": 0.2863, + "step": 31818 + }, + { + "epoch": 2.577689565780946, + "grad_norm": 0.06500814110040665, + "learning_rate": 7.901795760385257e-05, + "loss": 0.225, + "step": 31819 + }, + { + "epoch": 2.5777705767984447, + "grad_norm": 0.07395117729902267, + "learning_rate": 7.901345695125793e-05, + "loss": 0.2565, + "step": 31820 + }, + { + "epoch": 2.577851587815943, + "grad_norm": 0.06356361508369446, + "learning_rate": 7.90089562986633e-05, + "loss": 0.2422, + "step": 31821 + }, + { + "epoch": 2.577932598833441, + "grad_norm": 0.0858735591173172, + "learning_rate": 7.900445564606869e-05, + "loss": 0.2295, + "step": 31822 + }, + { + "epoch": 2.57801360985094, + "grad_norm": 0.08137764781713486, + "learning_rate": 7.899995499347405e-05, + "loss": 0.2482, + "step": 31823 + }, + { + "epoch": 2.578094620868438, + "grad_norm": 0.06723980605602264, + "learning_rate": 7.899545434087942e-05, + "loss": 0.2327, + "step": 31824 + }, + { + "epoch": 2.5781756318859363, + "grad_norm": 0.07822005450725555, + "learning_rate": 7.899095368828481e-05, + "loss": 0.2167, + "step": 31825 + }, + { + "epoch": 2.578256642903435, + "grad_norm": 
0.06803352385759354, + "learning_rate": 7.898645303569017e-05, + "loss": 0.2758, + "step": 31826 + }, + { + "epoch": 2.5783376539209333, + "grad_norm": 0.06352008134126663, + "learning_rate": 7.898195238309555e-05, + "loss": 0.2449, + "step": 31827 + }, + { + "epoch": 2.5784186649384315, + "grad_norm": 0.05779225379228592, + "learning_rate": 7.897745173050093e-05, + "loss": 0.2469, + "step": 31828 + }, + { + "epoch": 2.57849967595593, + "grad_norm": 0.08133967220783234, + "learning_rate": 7.897295107790629e-05, + "loss": 0.264, + "step": 31829 + }, + { + "epoch": 2.5785806869734285, + "grad_norm": 0.06355690956115723, + "learning_rate": 7.896845042531167e-05, + "loss": 0.2246, + "step": 31830 + }, + { + "epoch": 2.5786616979909267, + "grad_norm": 0.0669471025466919, + "learning_rate": 7.896394977271705e-05, + "loss": 0.273, + "step": 31831 + }, + { + "epoch": 2.5787427090084254, + "grad_norm": 0.08634018898010254, + "learning_rate": 7.895944912012241e-05, + "loss": 0.2405, + "step": 31832 + }, + { + "epoch": 2.5788237200259236, + "grad_norm": 0.06180933117866516, + "learning_rate": 7.89549484675278e-05, + "loss": 0.2301, + "step": 31833 + }, + { + "epoch": 2.578904731043422, + "grad_norm": 0.0742286667227745, + "learning_rate": 7.895044781493317e-05, + "loss": 0.2425, + "step": 31834 + }, + { + "epoch": 2.57898574206092, + "grad_norm": 0.07804601639509201, + "learning_rate": 7.894594716233853e-05, + "loss": 0.259, + "step": 31835 + }, + { + "epoch": 2.579066753078419, + "grad_norm": 0.07568758726119995, + "learning_rate": 7.894144650974392e-05, + "loss": 0.2752, + "step": 31836 + }, + { + "epoch": 2.579147764095917, + "grad_norm": 0.08096685260534286, + "learning_rate": 7.89369458571493e-05, + "loss": 0.2559, + "step": 31837 + }, + { + "epoch": 2.5792287751134153, + "grad_norm": 0.06668811291456223, + "learning_rate": 7.893244520455466e-05, + "loss": 0.2306, + "step": 31838 + }, + { + "epoch": 2.5793097861309136, + "grad_norm": 0.06312041729688644, + "learning_rate": 7.892794455196004e-05, + "loss": 0.2368, + "step": 31839 + }, + { + "epoch": 2.5793907971484122, + "grad_norm": 0.06583622843027115, + "learning_rate": 7.892344389936542e-05, + "loss": 0.2461, + "step": 31840 + }, + { + "epoch": 2.5794718081659105, + "grad_norm": 0.07359354197978973, + "learning_rate": 7.891894324677078e-05, + "loss": 0.2411, + "step": 31841 + }, + { + "epoch": 2.5795528191834087, + "grad_norm": 0.07067452371120453, + "learning_rate": 7.891444259417616e-05, + "loss": 0.2187, + "step": 31842 + }, + { + "epoch": 2.5796338302009074, + "grad_norm": 0.07543587684631348, + "learning_rate": 7.890994194158154e-05, + "loss": 0.2515, + "step": 31843 + }, + { + "epoch": 2.5797148412184057, + "grad_norm": 0.07966103404760361, + "learning_rate": 7.89054412889869e-05, + "loss": 0.2217, + "step": 31844 + }, + { + "epoch": 2.579795852235904, + "grad_norm": 0.063313789665699, + "learning_rate": 7.890094063639228e-05, + "loss": 0.2173, + "step": 31845 + }, + { + "epoch": 2.5798768632534026, + "grad_norm": 0.06680615246295929, + "learning_rate": 7.889643998379766e-05, + "loss": 0.2391, + "step": 31846 + }, + { + "epoch": 2.579957874270901, + "grad_norm": 0.06952749937772751, + "learning_rate": 7.889193933120302e-05, + "loss": 0.2215, + "step": 31847 + }, + { + "epoch": 2.580038885288399, + "grad_norm": 0.06919938325881958, + "learning_rate": 7.88874386786084e-05, + "loss": 0.2328, + "step": 31848 + }, + { + "epoch": 2.5801198963058978, + "grad_norm": 0.057678766548633575, + "learning_rate": 7.888293802601378e-05, + "loss": 
0.2199, + "step": 31849 + }, + { + "epoch": 2.580200907323396, + "grad_norm": 0.06296581029891968, + "learning_rate": 7.887843737341914e-05, + "loss": 0.243, + "step": 31850 + }, + { + "epoch": 2.5802819183408943, + "grad_norm": 0.06534301489591599, + "learning_rate": 7.887393672082453e-05, + "loss": 0.2571, + "step": 31851 + }, + { + "epoch": 2.580362929358393, + "grad_norm": 0.06356888264417648, + "learning_rate": 7.88694360682299e-05, + "loss": 0.2826, + "step": 31852 + }, + { + "epoch": 2.580443940375891, + "grad_norm": 0.067490354180336, + "learning_rate": 7.886493541563527e-05, + "loss": 0.1816, + "step": 31853 + }, + { + "epoch": 2.5805249513933894, + "grad_norm": 0.07598926872015, + "learning_rate": 7.886043476304065e-05, + "loss": 0.2491, + "step": 31854 + }, + { + "epoch": 2.580605962410888, + "grad_norm": 0.08062969893217087, + "learning_rate": 7.885593411044602e-05, + "loss": 0.2401, + "step": 31855 + }, + { + "epoch": 2.5806869734283864, + "grad_norm": 0.06391061097383499, + "learning_rate": 7.88514334578514e-05, + "loss": 0.2205, + "step": 31856 + }, + { + "epoch": 2.5807679844458846, + "grad_norm": 0.06544674932956696, + "learning_rate": 7.884693280525677e-05, + "loss": 0.2358, + "step": 31857 + }, + { + "epoch": 2.580848995463383, + "grad_norm": 0.07806091010570526, + "learning_rate": 7.884243215266214e-05, + "loss": 0.2748, + "step": 31858 + }, + { + "epoch": 2.5809300064808816, + "grad_norm": 0.07057543098926544, + "learning_rate": 7.883793150006751e-05, + "loss": 0.2613, + "step": 31859 + }, + { + "epoch": 2.58101101749838, + "grad_norm": 0.07381962239742279, + "learning_rate": 7.883343084747289e-05, + "loss": 0.2217, + "step": 31860 + }, + { + "epoch": 2.581092028515878, + "grad_norm": 0.07270567864179611, + "learning_rate": 7.882893019487826e-05, + "loss": 0.2973, + "step": 31861 + }, + { + "epoch": 2.5811730395333763, + "grad_norm": 0.0710500106215477, + "learning_rate": 7.882442954228364e-05, + "loss": 0.2109, + "step": 31862 + }, + { + "epoch": 2.581254050550875, + "grad_norm": 0.06224672496318817, + "learning_rate": 7.881992888968901e-05, + "loss": 0.2362, + "step": 31863 + }, + { + "epoch": 2.5813350615683732, + "grad_norm": 0.08102069050073624, + "learning_rate": 7.881542823709438e-05, + "loss": 0.2663, + "step": 31864 + }, + { + "epoch": 2.5814160725858715, + "grad_norm": 0.07523306459188461, + "learning_rate": 7.881092758449976e-05, + "loss": 0.236, + "step": 31865 + }, + { + "epoch": 2.58149708360337, + "grad_norm": 0.059707846492528915, + "learning_rate": 7.880642693190513e-05, + "loss": 0.226, + "step": 31866 + }, + { + "epoch": 2.5815780946208684, + "grad_norm": 0.08537987619638443, + "learning_rate": 7.88019262793105e-05, + "loss": 0.2803, + "step": 31867 + }, + { + "epoch": 2.5816591056383666, + "grad_norm": 0.08499877899885178, + "learning_rate": 7.879742562671588e-05, + "loss": 0.2439, + "step": 31868 + }, + { + "epoch": 2.5817401166558653, + "grad_norm": 0.0631185919046402, + "learning_rate": 7.879292497412125e-05, + "loss": 0.2641, + "step": 31869 + }, + { + "epoch": 2.5818211276733636, + "grad_norm": 0.0609898567199707, + "learning_rate": 7.878842432152662e-05, + "loss": 0.2563, + "step": 31870 + }, + { + "epoch": 2.581902138690862, + "grad_norm": 0.06977617740631104, + "learning_rate": 7.8783923668932e-05, + "loss": 0.2555, + "step": 31871 + }, + { + "epoch": 2.5819831497083605, + "grad_norm": 0.08325894176959991, + "learning_rate": 7.877942301633737e-05, + "loss": 0.2246, + "step": 31872 + }, + { + "epoch": 2.5820641607258588, + "grad_norm": 
0.06733439862728119, + "learning_rate": 7.877492236374274e-05, + "loss": 0.2504, + "step": 31873 + }, + { + "epoch": 2.582145171743357, + "grad_norm": 0.06447796523571014, + "learning_rate": 7.877042171114812e-05, + "loss": 0.2509, + "step": 31874 + }, + { + "epoch": 2.5822261827608557, + "grad_norm": 0.06359328329563141, + "learning_rate": 7.876592105855349e-05, + "loss": 0.2577, + "step": 31875 + }, + { + "epoch": 2.582307193778354, + "grad_norm": 0.06793835759162903, + "learning_rate": 7.876142040595887e-05, + "loss": 0.2806, + "step": 31876 + }, + { + "epoch": 2.582388204795852, + "grad_norm": 0.06319888681173325, + "learning_rate": 7.875691975336424e-05, + "loss": 0.2502, + "step": 31877 + }, + { + "epoch": 2.582469215813351, + "grad_norm": 0.06786399334669113, + "learning_rate": 7.875241910076961e-05, + "loss": 0.2169, + "step": 31878 + }, + { + "epoch": 2.582550226830849, + "grad_norm": 0.06800121068954468, + "learning_rate": 7.874791844817499e-05, + "loss": 0.2228, + "step": 31879 + }, + { + "epoch": 2.5826312378483474, + "grad_norm": 0.06821072101593018, + "learning_rate": 7.874341779558036e-05, + "loss": 0.2414, + "step": 31880 + }, + { + "epoch": 2.5827122488658456, + "grad_norm": 0.072810597717762, + "learning_rate": 7.873891714298573e-05, + "loss": 0.2831, + "step": 31881 + }, + { + "epoch": 2.5827932598833443, + "grad_norm": 0.0673997551202774, + "learning_rate": 7.873441649039111e-05, + "loss": 0.2378, + "step": 31882 + }, + { + "epoch": 2.5828742709008425, + "grad_norm": 0.07142699509859085, + "learning_rate": 7.872991583779648e-05, + "loss": 0.2479, + "step": 31883 + }, + { + "epoch": 2.582955281918341, + "grad_norm": 0.07638320326805115, + "learning_rate": 7.872541518520185e-05, + "loss": 0.2707, + "step": 31884 + }, + { + "epoch": 2.583036292935839, + "grad_norm": 0.06295417994260788, + "learning_rate": 7.872091453260724e-05, + "loss": 0.2215, + "step": 31885 + }, + { + "epoch": 2.5831173039533377, + "grad_norm": 0.07948005944490433, + "learning_rate": 7.87164138800126e-05, + "loss": 0.2867, + "step": 31886 + }, + { + "epoch": 2.583198314970836, + "grad_norm": 0.07531192898750305, + "learning_rate": 7.871191322741798e-05, + "loss": 0.2373, + "step": 31887 + }, + { + "epoch": 2.583279325988334, + "grad_norm": 0.07652915269136429, + "learning_rate": 7.870741257482336e-05, + "loss": 0.2251, + "step": 31888 + }, + { + "epoch": 2.583360337005833, + "grad_norm": 0.06174972653388977, + "learning_rate": 7.870291192222872e-05, + "loss": 0.274, + "step": 31889 + }, + { + "epoch": 2.583441348023331, + "grad_norm": 0.06382008641958237, + "learning_rate": 7.86984112696341e-05, + "loss": 0.2568, + "step": 31890 + }, + { + "epoch": 2.5835223590408294, + "grad_norm": 0.07435295730829239, + "learning_rate": 7.869391061703948e-05, + "loss": 0.253, + "step": 31891 + }, + { + "epoch": 2.583603370058328, + "grad_norm": 0.05679628998041153, + "learning_rate": 7.868940996444484e-05, + "loss": 0.2062, + "step": 31892 + }, + { + "epoch": 2.5836843810758263, + "grad_norm": 0.07225190848112106, + "learning_rate": 7.868490931185022e-05, + "loss": 0.2477, + "step": 31893 + }, + { + "epoch": 2.5837653920933246, + "grad_norm": 0.07137248665094376, + "learning_rate": 7.86804086592556e-05, + "loss": 0.269, + "step": 31894 + }, + { + "epoch": 2.5838464031108233, + "grad_norm": 0.06835871934890747, + "learning_rate": 7.867590800666096e-05, + "loss": 0.2538, + "step": 31895 + }, + { + "epoch": 2.5839274141283215, + "grad_norm": 0.06776162981987, + "learning_rate": 7.867140735406634e-05, + "loss": 0.2528, + 
"step": 31896 + }, + { + "epoch": 2.5840084251458197, + "grad_norm": 0.08782105892896652, + "learning_rate": 7.866690670147172e-05, + "loss": 0.2711, + "step": 31897 + }, + { + "epoch": 2.5840894361633184, + "grad_norm": 0.07954996824264526, + "learning_rate": 7.866240604887708e-05, + "loss": 0.2217, + "step": 31898 + }, + { + "epoch": 2.5841704471808167, + "grad_norm": 0.07364766299724579, + "learning_rate": 7.865790539628246e-05, + "loss": 0.2444, + "step": 31899 + }, + { + "epoch": 2.584251458198315, + "grad_norm": 0.06456109136343002, + "learning_rate": 7.865340474368785e-05, + "loss": 0.2373, + "step": 31900 + }, + { + "epoch": 2.5843324692158136, + "grad_norm": 0.060708265751600266, + "learning_rate": 7.86489040910932e-05, + "loss": 0.2437, + "step": 31901 + }, + { + "epoch": 2.584413480233312, + "grad_norm": 0.06029626354575157, + "learning_rate": 7.864440343849858e-05, + "loss": 0.2127, + "step": 31902 + }, + { + "epoch": 2.58449449125081, + "grad_norm": 0.05488735809922218, + "learning_rate": 7.863990278590397e-05, + "loss": 0.1978, + "step": 31903 + }, + { + "epoch": 2.5845755022683083, + "grad_norm": 0.06499477475881577, + "learning_rate": 7.863540213330933e-05, + "loss": 0.2439, + "step": 31904 + }, + { + "epoch": 2.584656513285807, + "grad_norm": 0.06294959038496017, + "learning_rate": 7.86309014807147e-05, + "loss": 0.2378, + "step": 31905 + }, + { + "epoch": 2.5847375243033053, + "grad_norm": 0.06903290003538132, + "learning_rate": 7.862640082812009e-05, + "loss": 0.2506, + "step": 31906 + }, + { + "epoch": 2.5848185353208035, + "grad_norm": 0.07474994659423828, + "learning_rate": 7.862190017552545e-05, + "loss": 0.2359, + "step": 31907 + }, + { + "epoch": 2.5848995463383018, + "grad_norm": 0.05141305550932884, + "learning_rate": 7.861739952293082e-05, + "loss": 0.2331, + "step": 31908 + }, + { + "epoch": 2.5849805573558005, + "grad_norm": 0.07798498123884201, + "learning_rate": 7.861289887033621e-05, + "loss": 0.2547, + "step": 31909 + }, + { + "epoch": 2.5850615683732987, + "grad_norm": 0.06230713427066803, + "learning_rate": 7.860839821774157e-05, + "loss": 0.2093, + "step": 31910 + }, + { + "epoch": 2.585142579390797, + "grad_norm": 0.06651043146848679, + "learning_rate": 7.860389756514696e-05, + "loss": 0.2335, + "step": 31911 + }, + { + "epoch": 2.5852235904082956, + "grad_norm": 0.06720586866140366, + "learning_rate": 7.859939691255233e-05, + "loss": 0.2343, + "step": 31912 + }, + { + "epoch": 2.585304601425794, + "grad_norm": 0.06423775106668472, + "learning_rate": 7.859489625995769e-05, + "loss": 0.2247, + "step": 31913 + }, + { + "epoch": 2.585385612443292, + "grad_norm": 0.0718165785074234, + "learning_rate": 7.859039560736308e-05, + "loss": 0.2158, + "step": 31914 + }, + { + "epoch": 2.585466623460791, + "grad_norm": 0.07057687640190125, + "learning_rate": 7.858589495476845e-05, + "loss": 0.2108, + "step": 31915 + }, + { + "epoch": 2.585547634478289, + "grad_norm": 0.06897904723882675, + "learning_rate": 7.858139430217381e-05, + "loss": 0.3009, + "step": 31916 + }, + { + "epoch": 2.5856286454957873, + "grad_norm": 0.05689922347664833, + "learning_rate": 7.85768936495792e-05, + "loss": 0.2781, + "step": 31917 + }, + { + "epoch": 2.585709656513286, + "grad_norm": 0.061782244592905045, + "learning_rate": 7.857239299698457e-05, + "loss": 0.2151, + "step": 31918 + }, + { + "epoch": 2.5857906675307842, + "grad_norm": 0.06645800918340683, + "learning_rate": 7.856789234438994e-05, + "loss": 0.2151, + "step": 31919 + }, + { + "epoch": 2.5858716785482825, + "grad_norm": 
0.07219221442937851, + "learning_rate": 7.856339169179532e-05, + "loss": 0.2389, + "step": 31920 + }, + { + "epoch": 2.585952689565781, + "grad_norm": 0.07668904960155487, + "learning_rate": 7.855889103920069e-05, + "loss": 0.2608, + "step": 31921 + }, + { + "epoch": 2.5860337005832794, + "grad_norm": 0.0610949881374836, + "learning_rate": 7.855439038660606e-05, + "loss": 0.228, + "step": 31922 + }, + { + "epoch": 2.5861147116007777, + "grad_norm": 0.07142166048288345, + "learning_rate": 7.854988973401144e-05, + "loss": 0.2539, + "step": 31923 + }, + { + "epoch": 2.5861957226182763, + "grad_norm": 0.06144547089934349, + "learning_rate": 7.854538908141681e-05, + "loss": 0.225, + "step": 31924 + }, + { + "epoch": 2.5862767336357746, + "grad_norm": 0.06903857737779617, + "learning_rate": 7.854088842882219e-05, + "loss": 0.2198, + "step": 31925 + }, + { + "epoch": 2.586357744653273, + "grad_norm": 0.062392707914114, + "learning_rate": 7.853638777622756e-05, + "loss": 0.2668, + "step": 31926 + }, + { + "epoch": 2.586438755670771, + "grad_norm": 0.06246126815676689, + "learning_rate": 7.853188712363293e-05, + "loss": 0.2567, + "step": 31927 + }, + { + "epoch": 2.5865197666882693, + "grad_norm": 0.07953917980194092, + "learning_rate": 7.85273864710383e-05, + "loss": 0.2558, + "step": 31928 + }, + { + "epoch": 2.586600777705768, + "grad_norm": 0.07347556203603745, + "learning_rate": 7.852288581844368e-05, + "loss": 0.2663, + "step": 31929 + }, + { + "epoch": 2.5866817887232663, + "grad_norm": 0.06010118126869202, + "learning_rate": 7.851838516584905e-05, + "loss": 0.2645, + "step": 31930 + }, + { + "epoch": 2.5867627997407645, + "grad_norm": 0.06075914949178696, + "learning_rate": 7.851388451325443e-05, + "loss": 0.2351, + "step": 31931 + }, + { + "epoch": 2.586843810758263, + "grad_norm": 0.0672052800655365, + "learning_rate": 7.85093838606598e-05, + "loss": 0.2519, + "step": 31932 + }, + { + "epoch": 2.5869248217757614, + "grad_norm": 0.05604797974228859, + "learning_rate": 7.850488320806517e-05, + "loss": 0.225, + "step": 31933 + }, + { + "epoch": 2.5870058327932597, + "grad_norm": 0.055545855313539505, + "learning_rate": 7.850038255547055e-05, + "loss": 0.2383, + "step": 31934 + }, + { + "epoch": 2.5870868438107584, + "grad_norm": 0.058497052639722824, + "learning_rate": 7.849588190287592e-05, + "loss": 0.2571, + "step": 31935 + }, + { + "epoch": 2.5871678548282566, + "grad_norm": 0.07107438147068024, + "learning_rate": 7.84913812502813e-05, + "loss": 0.2375, + "step": 31936 + }, + { + "epoch": 2.587248865845755, + "grad_norm": 0.08413809537887573, + "learning_rate": 7.848688059768667e-05, + "loss": 0.3145, + "step": 31937 + }, + { + "epoch": 2.5873298768632536, + "grad_norm": 0.06129063665866852, + "learning_rate": 7.848237994509204e-05, + "loss": 0.2484, + "step": 31938 + }, + { + "epoch": 2.587410887880752, + "grad_norm": 0.06950299441814423, + "learning_rate": 7.847787929249742e-05, + "loss": 0.2527, + "step": 31939 + }, + { + "epoch": 2.58749189889825, + "grad_norm": 0.062319688498973846, + "learning_rate": 7.847337863990279e-05, + "loss": 0.2404, + "step": 31940 + }, + { + "epoch": 2.5875729099157487, + "grad_norm": 0.06427384912967682, + "learning_rate": 7.846887798730816e-05, + "loss": 0.2597, + "step": 31941 + }, + { + "epoch": 2.587653920933247, + "grad_norm": 0.055912766605615616, + "learning_rate": 7.846437733471354e-05, + "loss": 0.2475, + "step": 31942 + }, + { + "epoch": 2.587734931950745, + "grad_norm": 0.06777750700712204, + "learning_rate": 7.845987668211891e-05, + "loss": 
0.2068, + "step": 31943 + }, + { + "epoch": 2.587815942968244, + "grad_norm": 0.06045191362500191, + "learning_rate": 7.845537602952428e-05, + "loss": 0.1975, + "step": 31944 + }, + { + "epoch": 2.587896953985742, + "grad_norm": 0.05264495685696602, + "learning_rate": 7.845087537692966e-05, + "loss": 0.222, + "step": 31945 + }, + { + "epoch": 2.5879779650032404, + "grad_norm": 0.0727301612496376, + "learning_rate": 7.844637472433503e-05, + "loss": 0.229, + "step": 31946 + }, + { + "epoch": 2.588058976020739, + "grad_norm": 0.0756848007440567, + "learning_rate": 7.84418740717404e-05, + "loss": 0.2335, + "step": 31947 + }, + { + "epoch": 2.5881399870382373, + "grad_norm": 0.07050405442714691, + "learning_rate": 7.843737341914578e-05, + "loss": 0.2176, + "step": 31948 + }, + { + "epoch": 2.5882209980557356, + "grad_norm": 0.08286644518375397, + "learning_rate": 7.843287276655115e-05, + "loss": 0.2437, + "step": 31949 + }, + { + "epoch": 2.588302009073234, + "grad_norm": 0.06457363814115524, + "learning_rate": 7.842837211395653e-05, + "loss": 0.2737, + "step": 31950 + }, + { + "epoch": 2.588383020090732, + "grad_norm": 0.05261223763227463, + "learning_rate": 7.84238714613619e-05, + "loss": 0.2254, + "step": 31951 + }, + { + "epoch": 2.5884640311082308, + "grad_norm": 0.07144974172115326, + "learning_rate": 7.841937080876727e-05, + "loss": 0.2101, + "step": 31952 + }, + { + "epoch": 2.588545042125729, + "grad_norm": 0.07414579391479492, + "learning_rate": 7.841487015617265e-05, + "loss": 0.2218, + "step": 31953 + }, + { + "epoch": 2.5886260531432272, + "grad_norm": 0.06547979265451431, + "learning_rate": 7.841036950357802e-05, + "loss": 0.2299, + "step": 31954 + }, + { + "epoch": 2.588707064160726, + "grad_norm": 0.06448670476675034, + "learning_rate": 7.84058688509834e-05, + "loss": 0.2585, + "step": 31955 + }, + { + "epoch": 2.588788075178224, + "grad_norm": 0.08230631053447723, + "learning_rate": 7.840136819838877e-05, + "loss": 0.2804, + "step": 31956 + }, + { + "epoch": 2.5888690861957224, + "grad_norm": 0.057719308882951736, + "learning_rate": 7.839686754579414e-05, + "loss": 0.2039, + "step": 31957 + }, + { + "epoch": 2.588950097213221, + "grad_norm": 0.07325759530067444, + "learning_rate": 7.839236689319951e-05, + "loss": 0.2741, + "step": 31958 + }, + { + "epoch": 2.5890311082307194, + "grad_norm": 0.06162470951676369, + "learning_rate": 7.838786624060489e-05, + "loss": 0.243, + "step": 31959 + }, + { + "epoch": 2.5891121192482176, + "grad_norm": 0.07601647078990936, + "learning_rate": 7.838336558801026e-05, + "loss": 0.2172, + "step": 31960 + }, + { + "epoch": 2.5891931302657163, + "grad_norm": 0.06250499188899994, + "learning_rate": 7.837886493541564e-05, + "loss": 0.238, + "step": 31961 + }, + { + "epoch": 2.5892741412832145, + "grad_norm": 0.0644306018948555, + "learning_rate": 7.837436428282101e-05, + "loss": 0.2483, + "step": 31962 + }, + { + "epoch": 2.589355152300713, + "grad_norm": 0.07267862558364868, + "learning_rate": 7.836986363022638e-05, + "loss": 0.2606, + "step": 31963 + }, + { + "epoch": 2.5894361633182115, + "grad_norm": 0.08261629939079285, + "learning_rate": 7.836536297763176e-05, + "loss": 0.255, + "step": 31964 + }, + { + "epoch": 2.5895171743357097, + "grad_norm": 0.06600159406661987, + "learning_rate": 7.836086232503713e-05, + "loss": 0.2572, + "step": 31965 + }, + { + "epoch": 2.589598185353208, + "grad_norm": 0.06426016986370087, + "learning_rate": 7.835636167244252e-05, + "loss": 0.226, + "step": 31966 + }, + { + "epoch": 2.5896791963707066, + "grad_norm": 
0.0756409540772438, + "learning_rate": 7.835186101984788e-05, + "loss": 0.2416, + "step": 31967 + }, + { + "epoch": 2.589760207388205, + "grad_norm": 0.07189775258302689, + "learning_rate": 7.834736036725325e-05, + "loss": 0.2586, + "step": 31968 + }, + { + "epoch": 2.589841218405703, + "grad_norm": 0.06634508073329926, + "learning_rate": 7.834285971465864e-05, + "loss": 0.232, + "step": 31969 + }, + { + "epoch": 2.589922229423202, + "grad_norm": 0.07708480209112167, + "learning_rate": 7.8338359062064e-05, + "loss": 0.2601, + "step": 31970 + }, + { + "epoch": 2.5900032404407, + "grad_norm": 0.07109393179416656, + "learning_rate": 7.833385840946937e-05, + "loss": 0.2367, + "step": 31971 + }, + { + "epoch": 2.5900842514581983, + "grad_norm": 0.06452390551567078, + "learning_rate": 7.832935775687476e-05, + "loss": 0.2532, + "step": 31972 + }, + { + "epoch": 2.5901652624756966, + "grad_norm": 0.06601597368717194, + "learning_rate": 7.832485710428012e-05, + "loss": 0.2728, + "step": 31973 + }, + { + "epoch": 2.590246273493195, + "grad_norm": 0.06474656611680984, + "learning_rate": 7.832035645168549e-05, + "loss": 0.2459, + "step": 31974 + }, + { + "epoch": 2.5903272845106935, + "grad_norm": 0.07416459918022156, + "learning_rate": 7.831585579909088e-05, + "loss": 0.2702, + "step": 31975 + }, + { + "epoch": 2.5904082955281917, + "grad_norm": 0.07041539996862411, + "learning_rate": 7.831135514649624e-05, + "loss": 0.2545, + "step": 31976 + }, + { + "epoch": 2.59048930654569, + "grad_norm": 0.07663211226463318, + "learning_rate": 7.830685449390161e-05, + "loss": 0.2413, + "step": 31977 + }, + { + "epoch": 2.5905703175631887, + "grad_norm": 0.0726345106959343, + "learning_rate": 7.8302353841307e-05, + "loss": 0.2414, + "step": 31978 + }, + { + "epoch": 2.590651328580687, + "grad_norm": 0.06782495975494385, + "learning_rate": 7.829785318871236e-05, + "loss": 0.2362, + "step": 31979 + }, + { + "epoch": 2.590732339598185, + "grad_norm": 0.06722512096166611, + "learning_rate": 7.829335253611773e-05, + "loss": 0.2548, + "step": 31980 + }, + { + "epoch": 2.590813350615684, + "grad_norm": 0.07960183173418045, + "learning_rate": 7.828885188352312e-05, + "loss": 0.2321, + "step": 31981 + }, + { + "epoch": 2.590894361633182, + "grad_norm": 0.057806596159935, + "learning_rate": 7.828435123092848e-05, + "loss": 0.2318, + "step": 31982 + }, + { + "epoch": 2.5909753726506803, + "grad_norm": 0.07928095757961273, + "learning_rate": 7.827985057833385e-05, + "loss": 0.2425, + "step": 31983 + }, + { + "epoch": 2.591056383668179, + "grad_norm": 0.07918012142181396, + "learning_rate": 7.827534992573924e-05, + "loss": 0.2469, + "step": 31984 + }, + { + "epoch": 2.5911373946856773, + "grad_norm": 0.06785554438829422, + "learning_rate": 7.82708492731446e-05, + "loss": 0.2426, + "step": 31985 + }, + { + "epoch": 2.5912184057031755, + "grad_norm": 0.0647081658244133, + "learning_rate": 7.826634862054998e-05, + "loss": 0.2429, + "step": 31986 + }, + { + "epoch": 2.591299416720674, + "grad_norm": 0.07442610710859299, + "learning_rate": 7.826184796795536e-05, + "loss": 0.2838, + "step": 31987 + }, + { + "epoch": 2.5913804277381725, + "grad_norm": 0.08988859504461288, + "learning_rate": 7.825734731536074e-05, + "loss": 0.2178, + "step": 31988 + }, + { + "epoch": 2.5914614387556707, + "grad_norm": 0.07456997781991959, + "learning_rate": 7.82528466627661e-05, + "loss": 0.2431, + "step": 31989 + }, + { + "epoch": 2.5915424497731694, + "grad_norm": 0.054924655705690384, + "learning_rate": 7.824834601017148e-05, + "loss": 0.2429, + 
"step": 31990 + }, + { + "epoch": 2.5916234607906676, + "grad_norm": 0.06245535612106323, + "learning_rate": 7.824384535757686e-05, + "loss": 0.2261, + "step": 31991 + }, + { + "epoch": 2.591704471808166, + "grad_norm": 0.07505930215120316, + "learning_rate": 7.823934470498223e-05, + "loss": 0.2483, + "step": 31992 + }, + { + "epoch": 2.5917854828256646, + "grad_norm": 0.06292567402124405, + "learning_rate": 7.82348440523876e-05, + "loss": 0.2525, + "step": 31993 + }, + { + "epoch": 2.591866493843163, + "grad_norm": 0.06478261947631836, + "learning_rate": 7.823034339979298e-05, + "loss": 0.2212, + "step": 31994 + }, + { + "epoch": 2.591947504860661, + "grad_norm": 0.08286509662866592, + "learning_rate": 7.822584274719835e-05, + "loss": 0.2374, + "step": 31995 + }, + { + "epoch": 2.5920285158781593, + "grad_norm": 0.06472121179103851, + "learning_rate": 7.822134209460373e-05, + "loss": 0.2454, + "step": 31996 + }, + { + "epoch": 2.5921095268956575, + "grad_norm": 0.06049100682139397, + "learning_rate": 7.82168414420091e-05, + "loss": 0.2333, + "step": 31997 + }, + { + "epoch": 2.5921905379131562, + "grad_norm": 0.06915795803070068, + "learning_rate": 7.821234078941447e-05, + "loss": 0.2544, + "step": 31998 + }, + { + "epoch": 2.5922715489306545, + "grad_norm": 0.07713301479816437, + "learning_rate": 7.820784013681985e-05, + "loss": 0.2656, + "step": 31999 + }, + { + "epoch": 2.5923525599481527, + "grad_norm": 0.06822709739208221, + "learning_rate": 7.820333948422522e-05, + "loss": 0.2675, + "step": 32000 + }, + { + "epoch": 2.5924335709656514, + "grad_norm": 0.07031536102294922, + "learning_rate": 7.819883883163059e-05, + "loss": 0.2574, + "step": 32001 + }, + { + "epoch": 2.5925145819831497, + "grad_norm": 0.06541726738214493, + "learning_rate": 7.819433817903597e-05, + "loss": 0.2327, + "step": 32002 + }, + { + "epoch": 2.592595593000648, + "grad_norm": 0.07353468984365463, + "learning_rate": 7.818983752644134e-05, + "loss": 0.2186, + "step": 32003 + }, + { + "epoch": 2.5926766040181466, + "grad_norm": 0.057273805141448975, + "learning_rate": 7.818533687384671e-05, + "loss": 0.2287, + "step": 32004 + }, + { + "epoch": 2.592757615035645, + "grad_norm": 0.06168617680668831, + "learning_rate": 7.818083622125209e-05, + "loss": 0.2699, + "step": 32005 + }, + { + "epoch": 2.592838626053143, + "grad_norm": 0.07566682994365692, + "learning_rate": 7.817633556865746e-05, + "loss": 0.2402, + "step": 32006 + }, + { + "epoch": 2.5929196370706418, + "grad_norm": 0.06090695783495903, + "learning_rate": 7.817183491606283e-05, + "loss": 0.2508, + "step": 32007 + }, + { + "epoch": 2.59300064808814, + "grad_norm": 0.06652563810348511, + "learning_rate": 7.816733426346821e-05, + "loss": 0.2417, + "step": 32008 + }, + { + "epoch": 2.5930816591056383, + "grad_norm": 0.08565051853656769, + "learning_rate": 7.816283361087358e-05, + "loss": 0.2456, + "step": 32009 + }, + { + "epoch": 2.593162670123137, + "grad_norm": 0.06650824099779129, + "learning_rate": 7.815833295827896e-05, + "loss": 0.2313, + "step": 32010 + }, + { + "epoch": 2.593243681140635, + "grad_norm": 0.05159616097807884, + "learning_rate": 7.815383230568433e-05, + "loss": 0.221, + "step": 32011 + }, + { + "epoch": 2.5933246921581334, + "grad_norm": 0.07195013761520386, + "learning_rate": 7.81493316530897e-05, + "loss": 0.2957, + "step": 32012 + }, + { + "epoch": 2.593405703175632, + "grad_norm": 0.08130887150764465, + "learning_rate": 7.814483100049508e-05, + "loss": 0.2132, + "step": 32013 + }, + { + "epoch": 2.5934867141931304, + "grad_norm": 
0.0904105082154274, + "learning_rate": 7.814033034790045e-05, + "loss": 0.2671, + "step": 32014 + }, + { + "epoch": 2.5935677252106286, + "grad_norm": 0.06435638666152954, + "learning_rate": 7.813582969530582e-05, + "loss": 0.2366, + "step": 32015 + }, + { + "epoch": 2.593648736228127, + "grad_norm": 0.07310062646865845, + "learning_rate": 7.81313290427112e-05, + "loss": 0.2915, + "step": 32016 + }, + { + "epoch": 2.5937297472456255, + "grad_norm": 0.06183222308754921, + "learning_rate": 7.812682839011657e-05, + "loss": 0.2516, + "step": 32017 + }, + { + "epoch": 2.593810758263124, + "grad_norm": 0.06138157099485397, + "learning_rate": 7.812232773752194e-05, + "loss": 0.2402, + "step": 32018 + }, + { + "epoch": 2.593891769280622, + "grad_norm": 0.06564778089523315, + "learning_rate": 7.811782708492732e-05, + "loss": 0.2193, + "step": 32019 + }, + { + "epoch": 2.5939727802981203, + "grad_norm": 0.07527987658977509, + "learning_rate": 7.811332643233269e-05, + "loss": 0.2521, + "step": 32020 + }, + { + "epoch": 2.594053791315619, + "grad_norm": 0.07320604473352432, + "learning_rate": 7.810882577973807e-05, + "loss": 0.2271, + "step": 32021 + }, + { + "epoch": 2.594134802333117, + "grad_norm": 0.0609990619122982, + "learning_rate": 7.810432512714344e-05, + "loss": 0.2274, + "step": 32022 + }, + { + "epoch": 2.5942158133506155, + "grad_norm": 0.06894183903932571, + "learning_rate": 7.809982447454881e-05, + "loss": 0.2463, + "step": 32023 + }, + { + "epoch": 2.594296824368114, + "grad_norm": 0.09135408699512482, + "learning_rate": 7.809532382195419e-05, + "loss": 0.271, + "step": 32024 + }, + { + "epoch": 2.5943778353856124, + "grad_norm": 0.06804493069648743, + "learning_rate": 7.809082316935956e-05, + "loss": 0.2331, + "step": 32025 + }, + { + "epoch": 2.5944588464031106, + "grad_norm": 0.06523216515779495, + "learning_rate": 7.808632251676493e-05, + "loss": 0.2107, + "step": 32026 + }, + { + "epoch": 2.5945398574206093, + "grad_norm": 0.05876978114247322, + "learning_rate": 7.80818218641703e-05, + "loss": 0.2293, + "step": 32027 + }, + { + "epoch": 2.5946208684381076, + "grad_norm": 0.09167477488517761, + "learning_rate": 7.807732121157568e-05, + "loss": 0.2273, + "step": 32028 + }, + { + "epoch": 2.594701879455606, + "grad_norm": 0.06189775466918945, + "learning_rate": 7.807282055898105e-05, + "loss": 0.2415, + "step": 32029 + }, + { + "epoch": 2.5947828904731045, + "grad_norm": 0.06428928673267365, + "learning_rate": 7.806831990638643e-05, + "loss": 0.2318, + "step": 32030 + }, + { + "epoch": 2.5948639014906028, + "grad_norm": 0.05991830304265022, + "learning_rate": 7.80638192537918e-05, + "loss": 0.2147, + "step": 32031 + }, + { + "epoch": 2.594944912508101, + "grad_norm": 0.07664906978607178, + "learning_rate": 7.805931860119717e-05, + "loss": 0.2763, + "step": 32032 + }, + { + "epoch": 2.5950259235255997, + "grad_norm": 0.06697775423526764, + "learning_rate": 7.805481794860255e-05, + "loss": 0.233, + "step": 32033 + }, + { + "epoch": 2.595106934543098, + "grad_norm": 0.07400806248188019, + "learning_rate": 7.805031729600792e-05, + "loss": 0.2343, + "step": 32034 + }, + { + "epoch": 2.595187945560596, + "grad_norm": 0.06435059010982513, + "learning_rate": 7.80458166434133e-05, + "loss": 0.2256, + "step": 32035 + }, + { + "epoch": 2.595268956578095, + "grad_norm": 0.05965763330459595, + "learning_rate": 7.804131599081867e-05, + "loss": 0.2557, + "step": 32036 + }, + { + "epoch": 2.595349967595593, + "grad_norm": 0.05650469288229942, + "learning_rate": 7.803681533822404e-05, + "loss": 
0.2102, + "step": 32037 + }, + { + "epoch": 2.5954309786130914, + "grad_norm": 0.062130898237228394, + "learning_rate": 7.803231468562942e-05, + "loss": 0.2337, + "step": 32038 + }, + { + "epoch": 2.5955119896305896, + "grad_norm": 0.08088408410549164, + "learning_rate": 7.802781403303479e-05, + "loss": 0.2722, + "step": 32039 + }, + { + "epoch": 2.5955930006480883, + "grad_norm": 0.06567167490720749, + "learning_rate": 7.802331338044016e-05, + "loss": 0.2643, + "step": 32040 + }, + { + "epoch": 2.5956740116655865, + "grad_norm": 0.06135808676481247, + "learning_rate": 7.801881272784554e-05, + "loss": 0.1826, + "step": 32041 + }, + { + "epoch": 2.595755022683085, + "grad_norm": 0.07413794845342636, + "learning_rate": 7.801431207525091e-05, + "loss": 0.2329, + "step": 32042 + }, + { + "epoch": 2.595836033700583, + "grad_norm": 0.08833226561546326, + "learning_rate": 7.800981142265628e-05, + "loss": 0.2747, + "step": 32043 + }, + { + "epoch": 2.5959170447180817, + "grad_norm": 0.07479183375835419, + "learning_rate": 7.800531077006167e-05, + "loss": 0.2454, + "step": 32044 + }, + { + "epoch": 2.59599805573558, + "grad_norm": 0.07777579128742218, + "learning_rate": 7.800081011746703e-05, + "loss": 0.2246, + "step": 32045 + }, + { + "epoch": 2.596079066753078, + "grad_norm": 0.06699565798044205, + "learning_rate": 7.79963094648724e-05, + "loss": 0.2376, + "step": 32046 + }, + { + "epoch": 2.596160077770577, + "grad_norm": 0.07032589614391327, + "learning_rate": 7.799180881227779e-05, + "loss": 0.2038, + "step": 32047 + }, + { + "epoch": 2.596241088788075, + "grad_norm": 0.07530355453491211, + "learning_rate": 7.798730815968315e-05, + "loss": 0.2412, + "step": 32048 + }, + { + "epoch": 2.5963220998055734, + "grad_norm": 0.07182317227125168, + "learning_rate": 7.798280750708853e-05, + "loss": 0.2384, + "step": 32049 + }, + { + "epoch": 2.596403110823072, + "grad_norm": 0.07123768329620361, + "learning_rate": 7.797830685449391e-05, + "loss": 0.2564, + "step": 32050 + }, + { + "epoch": 2.5964841218405703, + "grad_norm": 0.06900478154420853, + "learning_rate": 7.797380620189927e-05, + "loss": 0.2412, + "step": 32051 + }, + { + "epoch": 2.5965651328580686, + "grad_norm": 0.07733968645334244, + "learning_rate": 7.796930554930465e-05, + "loss": 0.2846, + "step": 32052 + }, + { + "epoch": 2.5966461438755672, + "grad_norm": 0.06465481966733932, + "learning_rate": 7.796480489671003e-05, + "loss": 0.2019, + "step": 32053 + }, + { + "epoch": 2.5967271548930655, + "grad_norm": 0.06342798471450806, + "learning_rate": 7.79603042441154e-05, + "loss": 0.2099, + "step": 32054 + }, + { + "epoch": 2.5968081659105637, + "grad_norm": 0.07235822081565857, + "learning_rate": 7.795580359152077e-05, + "loss": 0.2274, + "step": 32055 + }, + { + "epoch": 2.5968891769280624, + "grad_norm": 0.06785944849252701, + "learning_rate": 7.795130293892615e-05, + "loss": 0.2421, + "step": 32056 + }, + { + "epoch": 2.5969701879455607, + "grad_norm": 0.08089176565408707, + "learning_rate": 7.794680228633153e-05, + "loss": 0.2329, + "step": 32057 + }, + { + "epoch": 2.597051198963059, + "grad_norm": 0.061501555144786835, + "learning_rate": 7.794230163373689e-05, + "loss": 0.2261, + "step": 32058 + }, + { + "epoch": 2.5971322099805576, + "grad_norm": 0.06012822687625885, + "learning_rate": 7.793780098114228e-05, + "loss": 0.229, + "step": 32059 + }, + { + "epoch": 2.597213220998056, + "grad_norm": 0.07157018780708313, + "learning_rate": 7.793330032854765e-05, + "loss": 0.2696, + "step": 32060 + }, + { + "epoch": 2.597294232015554, + 
"grad_norm": 0.07280758768320084, + "learning_rate": 7.792879967595301e-05, + "loss": 0.2474, + "step": 32061 + }, + { + "epoch": 2.5973752430330523, + "grad_norm": 0.06970489770174026, + "learning_rate": 7.79242990233584e-05, + "loss": 0.209, + "step": 32062 + }, + { + "epoch": 2.597456254050551, + "grad_norm": 0.060698650777339935, + "learning_rate": 7.791979837076377e-05, + "loss": 0.2343, + "step": 32063 + }, + { + "epoch": 2.5975372650680493, + "grad_norm": 0.07536067068576813, + "learning_rate": 7.791529771816913e-05, + "loss": 0.2347, + "step": 32064 + }, + { + "epoch": 2.5976182760855475, + "grad_norm": 0.062164146453142166, + "learning_rate": 7.791079706557452e-05, + "loss": 0.2291, + "step": 32065 + }, + { + "epoch": 2.5976992871030458, + "grad_norm": 0.06609980016946793, + "learning_rate": 7.790629641297989e-05, + "loss": 0.2181, + "step": 32066 + }, + { + "epoch": 2.5977802981205445, + "grad_norm": 0.07265621423721313, + "learning_rate": 7.790179576038525e-05, + "loss": 0.2465, + "step": 32067 + }, + { + "epoch": 2.5978613091380427, + "grad_norm": 0.08752243220806122, + "learning_rate": 7.789729510779064e-05, + "loss": 0.2734, + "step": 32068 + }, + { + "epoch": 2.597942320155541, + "grad_norm": 0.06559853255748749, + "learning_rate": 7.789279445519601e-05, + "loss": 0.2437, + "step": 32069 + }, + { + "epoch": 2.5980233311730396, + "grad_norm": 0.06379378587007523, + "learning_rate": 7.788829380260139e-05, + "loss": 0.2586, + "step": 32070 + }, + { + "epoch": 2.598104342190538, + "grad_norm": 0.06516022235155106, + "learning_rate": 7.788379315000676e-05, + "loss": 0.2178, + "step": 32071 + }, + { + "epoch": 2.598185353208036, + "grad_norm": 0.0715324804186821, + "learning_rate": 7.787929249741213e-05, + "loss": 0.2428, + "step": 32072 + }, + { + "epoch": 2.598266364225535, + "grad_norm": 0.06728719919919968, + "learning_rate": 7.78747918448175e-05, + "loss": 0.2606, + "step": 32073 + }, + { + "epoch": 2.598347375243033, + "grad_norm": 0.057596538215875626, + "learning_rate": 7.787029119222288e-05, + "loss": 0.2282, + "step": 32074 + }, + { + "epoch": 2.5984283862605313, + "grad_norm": 0.07895933091640472, + "learning_rate": 7.786579053962825e-05, + "loss": 0.2665, + "step": 32075 + }, + { + "epoch": 2.59850939727803, + "grad_norm": 0.062324851751327515, + "learning_rate": 7.786128988703363e-05, + "loss": 0.2022, + "step": 32076 + }, + { + "epoch": 2.5985904082955282, + "grad_norm": 0.06992025673389435, + "learning_rate": 7.7856789234439e-05, + "loss": 0.2236, + "step": 32077 + }, + { + "epoch": 2.5986714193130265, + "grad_norm": 0.0895005464553833, + "learning_rate": 7.785228858184437e-05, + "loss": 0.233, + "step": 32078 + }, + { + "epoch": 2.598752430330525, + "grad_norm": 0.06543274968862534, + "learning_rate": 7.784778792924975e-05, + "loss": 0.2491, + "step": 32079 + }, + { + "epoch": 2.5988334413480234, + "grad_norm": 0.07061692327260971, + "learning_rate": 7.784328727665512e-05, + "loss": 0.2302, + "step": 32080 + }, + { + "epoch": 2.5989144523655217, + "grad_norm": 0.08071654289960861, + "learning_rate": 7.78387866240605e-05, + "loss": 0.2655, + "step": 32081 + }, + { + "epoch": 2.5989954633830203, + "grad_norm": 0.06737606227397919, + "learning_rate": 7.783428597146587e-05, + "loss": 0.234, + "step": 32082 + }, + { + "epoch": 2.5990764744005186, + "grad_norm": 0.06019715592265129, + "learning_rate": 7.782978531887124e-05, + "loss": 0.245, + "step": 32083 + }, + { + "epoch": 2.599157485418017, + "grad_norm": 0.06333910673856735, + "learning_rate": 7.782528466627662e-05, 
+ "loss": 0.2508, + "step": 32084 + }, + { + "epoch": 2.599238496435515, + "grad_norm": 0.07445622980594635, + "learning_rate": 7.782078401368199e-05, + "loss": 0.2456, + "step": 32085 + }, + { + "epoch": 2.5993195074530138, + "grad_norm": 0.06121855229139328, + "learning_rate": 7.781628336108736e-05, + "loss": 0.2211, + "step": 32086 + }, + { + "epoch": 2.599400518470512, + "grad_norm": 0.09158430993556976, + "learning_rate": 7.781178270849274e-05, + "loss": 0.2805, + "step": 32087 + }, + { + "epoch": 2.5994815294880103, + "grad_norm": 0.0586605928838253, + "learning_rate": 7.780728205589811e-05, + "loss": 0.2152, + "step": 32088 + }, + { + "epoch": 2.5995625405055085, + "grad_norm": 0.08469560742378235, + "learning_rate": 7.780278140330348e-05, + "loss": 0.2512, + "step": 32089 + }, + { + "epoch": 2.599643551523007, + "grad_norm": 0.08891402184963226, + "learning_rate": 7.779828075070886e-05, + "loss": 0.2435, + "step": 32090 + }, + { + "epoch": 2.5997245625405054, + "grad_norm": 0.05682078003883362, + "learning_rate": 7.779378009811423e-05, + "loss": 0.1997, + "step": 32091 + }, + { + "epoch": 2.5998055735580037, + "grad_norm": 0.07719125598669052, + "learning_rate": 7.77892794455196e-05, + "loss": 0.2507, + "step": 32092 + }, + { + "epoch": 2.5998865845755024, + "grad_norm": 0.09698209166526794, + "learning_rate": 7.778477879292498e-05, + "loss": 0.2179, + "step": 32093 + }, + { + "epoch": 2.5999675955930006, + "grad_norm": 0.05910210683941841, + "learning_rate": 7.778027814033035e-05, + "loss": 0.2175, + "step": 32094 + }, + { + "epoch": 2.600048606610499, + "grad_norm": 0.06909825652837753, + "learning_rate": 7.777577748773573e-05, + "loss": 0.2486, + "step": 32095 + }, + { + "epoch": 2.6001296176279975, + "grad_norm": 0.0670301616191864, + "learning_rate": 7.77712768351411e-05, + "loss": 0.2375, + "step": 32096 + }, + { + "epoch": 2.600210628645496, + "grad_norm": 0.0631147101521492, + "learning_rate": 7.776677618254647e-05, + "loss": 0.2515, + "step": 32097 + }, + { + "epoch": 2.600291639662994, + "grad_norm": 0.08409339934587479, + "learning_rate": 7.776227552995185e-05, + "loss": 0.2635, + "step": 32098 + }, + { + "epoch": 2.6003726506804927, + "grad_norm": 0.06275641918182373, + "learning_rate": 7.775777487735722e-05, + "loss": 0.222, + "step": 32099 + }, + { + "epoch": 2.600453661697991, + "grad_norm": 0.06495209783315659, + "learning_rate": 7.77532742247626e-05, + "loss": 0.2427, + "step": 32100 + }, + { + "epoch": 2.600534672715489, + "grad_norm": 0.058924321085214615, + "learning_rate": 7.774877357216797e-05, + "loss": 0.2369, + "step": 32101 + }, + { + "epoch": 2.600615683732988, + "grad_norm": 0.06495524197816849, + "learning_rate": 7.774427291957334e-05, + "loss": 0.2248, + "step": 32102 + }, + { + "epoch": 2.600696694750486, + "grad_norm": 0.06625855714082718, + "learning_rate": 7.773977226697871e-05, + "loss": 0.2416, + "step": 32103 + }, + { + "epoch": 2.6007777057679844, + "grad_norm": 0.06197715550661087, + "learning_rate": 7.773527161438409e-05, + "loss": 0.2356, + "step": 32104 + }, + { + "epoch": 2.600858716785483, + "grad_norm": 0.06675733625888824, + "learning_rate": 7.773077096178946e-05, + "loss": 0.2749, + "step": 32105 + }, + { + "epoch": 2.6009397278029813, + "grad_norm": 0.06515955924987793, + "learning_rate": 7.772627030919483e-05, + "loss": 0.2303, + "step": 32106 + }, + { + "epoch": 2.6010207388204796, + "grad_norm": 0.05678664520382881, + "learning_rate": 7.772176965660021e-05, + "loss": 0.204, + "step": 32107 + }, + { + "epoch": 2.601101749837978, + 
"grad_norm": 0.08410949259996414, + "learning_rate": 7.771726900400558e-05, + "loss": 0.2382, + "step": 32108 + }, + { + "epoch": 2.6011827608554765, + "grad_norm": 0.06365490704774857, + "learning_rate": 7.771276835141096e-05, + "loss": 0.2017, + "step": 32109 + }, + { + "epoch": 2.6012637718729748, + "grad_norm": 0.07431008666753769, + "learning_rate": 7.770826769881633e-05, + "loss": 0.2291, + "step": 32110 + }, + { + "epoch": 2.601344782890473, + "grad_norm": 0.07208096235990524, + "learning_rate": 7.77037670462217e-05, + "loss": 0.234, + "step": 32111 + }, + { + "epoch": 2.6014257939079712, + "grad_norm": 0.09047136455774307, + "learning_rate": 7.769926639362708e-05, + "loss": 0.2459, + "step": 32112 + }, + { + "epoch": 2.60150680492547, + "grad_norm": 0.06856090575456619, + "learning_rate": 7.769476574103245e-05, + "loss": 0.2726, + "step": 32113 + }, + { + "epoch": 2.601587815942968, + "grad_norm": 0.056730642914772034, + "learning_rate": 7.769026508843782e-05, + "loss": 0.2499, + "step": 32114 + }, + { + "epoch": 2.6016688269604664, + "grad_norm": 0.0567195862531662, + "learning_rate": 7.76857644358432e-05, + "loss": 0.2298, + "step": 32115 + }, + { + "epoch": 2.601749837977965, + "grad_norm": 0.0721188485622406, + "learning_rate": 7.768126378324857e-05, + "loss": 0.2161, + "step": 32116 + }, + { + "epoch": 2.6018308489954634, + "grad_norm": 0.08085426688194275, + "learning_rate": 7.767676313065394e-05, + "loss": 0.2409, + "step": 32117 + }, + { + "epoch": 2.6019118600129616, + "grad_norm": 0.07815434783697128, + "learning_rate": 7.767226247805932e-05, + "loss": 0.2103, + "step": 32118 + }, + { + "epoch": 2.6019928710304603, + "grad_norm": 0.0653507336974144, + "learning_rate": 7.766776182546469e-05, + "loss": 0.2375, + "step": 32119 + }, + { + "epoch": 2.6020738820479585, + "grad_norm": 0.05822880193591118, + "learning_rate": 7.766326117287007e-05, + "loss": 0.2571, + "step": 32120 + }, + { + "epoch": 2.6021548930654568, + "grad_norm": 0.06808577477931976, + "learning_rate": 7.765876052027544e-05, + "loss": 0.2179, + "step": 32121 + }, + { + "epoch": 2.6022359040829555, + "grad_norm": 0.0650370866060257, + "learning_rate": 7.765425986768081e-05, + "loss": 0.2428, + "step": 32122 + }, + { + "epoch": 2.6023169151004537, + "grad_norm": 0.07062049210071564, + "learning_rate": 7.76497592150862e-05, + "loss": 0.2361, + "step": 32123 + }, + { + "epoch": 2.602397926117952, + "grad_norm": 0.06092534959316254, + "learning_rate": 7.764525856249156e-05, + "loss": 0.2163, + "step": 32124 + }, + { + "epoch": 2.6024789371354506, + "grad_norm": 0.06739575415849686, + "learning_rate": 7.764075790989695e-05, + "loss": 0.2449, + "step": 32125 + }, + { + "epoch": 2.602559948152949, + "grad_norm": 0.06647944450378418, + "learning_rate": 7.763625725730232e-05, + "loss": 0.231, + "step": 32126 + }, + { + "epoch": 2.602640959170447, + "grad_norm": 0.06996641308069229, + "learning_rate": 7.763175660470768e-05, + "loss": 0.2452, + "step": 32127 + }, + { + "epoch": 2.602721970187946, + "grad_norm": 0.061999525874853134, + "learning_rate": 7.762725595211307e-05, + "loss": 0.2371, + "step": 32128 + }, + { + "epoch": 2.602802981205444, + "grad_norm": 0.06345861405134201, + "learning_rate": 7.762275529951844e-05, + "loss": 0.2354, + "step": 32129 + }, + { + "epoch": 2.6028839922229423, + "grad_norm": 0.07594001293182373, + "learning_rate": 7.76182546469238e-05, + "loss": 0.2262, + "step": 32130 + }, + { + "epoch": 2.6029650032404406, + "grad_norm": 0.05956905707716942, + "learning_rate": 7.761375399432919e-05, 
+ "loss": 0.2466, + "step": 32131 + }, + { + "epoch": 2.6030460142579392, + "grad_norm": 0.07624305039644241, + "learning_rate": 7.760925334173456e-05, + "loss": 0.2476, + "step": 32132 + }, + { + "epoch": 2.6031270252754375, + "grad_norm": 0.06887023895978928, + "learning_rate": 7.760475268913992e-05, + "loss": 0.2213, + "step": 32133 + }, + { + "epoch": 2.6032080362929357, + "grad_norm": 0.0756741613149643, + "learning_rate": 7.760025203654531e-05, + "loss": 0.2794, + "step": 32134 + }, + { + "epoch": 2.603289047310434, + "grad_norm": 0.05459382012486458, + "learning_rate": 7.759575138395068e-05, + "loss": 0.2318, + "step": 32135 + }, + { + "epoch": 2.6033700583279327, + "grad_norm": 0.06112325191497803, + "learning_rate": 7.759125073135604e-05, + "loss": 0.2449, + "step": 32136 + }, + { + "epoch": 2.603451069345431, + "grad_norm": 0.056865058839321136, + "learning_rate": 7.758675007876143e-05, + "loss": 0.2248, + "step": 32137 + }, + { + "epoch": 2.603532080362929, + "grad_norm": 0.07650525867938995, + "learning_rate": 7.75822494261668e-05, + "loss": 0.2255, + "step": 32138 + }, + { + "epoch": 2.603613091380428, + "grad_norm": 0.06502238661050797, + "learning_rate": 7.757774877357216e-05, + "loss": 0.2462, + "step": 32139 + }, + { + "epoch": 2.603694102397926, + "grad_norm": 0.06768109649419785, + "learning_rate": 7.757324812097755e-05, + "loss": 0.2303, + "step": 32140 + }, + { + "epoch": 2.6037751134154243, + "grad_norm": 0.0667257085442543, + "learning_rate": 7.756874746838292e-05, + "loss": 0.2302, + "step": 32141 + }, + { + "epoch": 2.603856124432923, + "grad_norm": 0.06509841233491898, + "learning_rate": 7.756424681578828e-05, + "loss": 0.262, + "step": 32142 + }, + { + "epoch": 2.6039371354504213, + "grad_norm": 0.06468883901834488, + "learning_rate": 7.755974616319367e-05, + "loss": 0.2787, + "step": 32143 + }, + { + "epoch": 2.6040181464679195, + "grad_norm": 0.08520834892988205, + "learning_rate": 7.755524551059905e-05, + "loss": 0.2821, + "step": 32144 + }, + { + "epoch": 2.604099157485418, + "grad_norm": 0.06440196186304092, + "learning_rate": 7.75507448580044e-05, + "loss": 0.225, + "step": 32145 + }, + { + "epoch": 2.6041801685029164, + "grad_norm": 0.06669966131448746, + "learning_rate": 7.754624420540979e-05, + "loss": 0.2595, + "step": 32146 + }, + { + "epoch": 2.6042611795204147, + "grad_norm": 0.08062119036912918, + "learning_rate": 7.754174355281517e-05, + "loss": 0.2767, + "step": 32147 + }, + { + "epoch": 2.6043421905379134, + "grad_norm": 0.069743312895298, + "learning_rate": 7.753724290022053e-05, + "loss": 0.2634, + "step": 32148 + }, + { + "epoch": 2.6044232015554116, + "grad_norm": 0.06465429812669754, + "learning_rate": 7.753274224762591e-05, + "loss": 0.2333, + "step": 32149 + }, + { + "epoch": 2.60450421257291, + "grad_norm": 0.06599178165197372, + "learning_rate": 7.752824159503129e-05, + "loss": 0.2581, + "step": 32150 + }, + { + "epoch": 2.6045852235904086, + "grad_norm": 0.06734525412321091, + "learning_rate": 7.752374094243666e-05, + "loss": 0.2836, + "step": 32151 + }, + { + "epoch": 2.604666234607907, + "grad_norm": 0.08109408617019653, + "learning_rate": 7.751924028984203e-05, + "loss": 0.2455, + "step": 32152 + }, + { + "epoch": 2.604747245625405, + "grad_norm": 0.06604114174842834, + "learning_rate": 7.751473963724741e-05, + "loss": 0.2375, + "step": 32153 + }, + { + "epoch": 2.6048282566429033, + "grad_norm": 0.06498434394598007, + "learning_rate": 7.751023898465278e-05, + "loss": 0.2373, + "step": 32154 + }, + { + "epoch": 2.6049092676604015, + 
"grad_norm": 0.06334884464740753, + "learning_rate": 7.750573833205816e-05, + "loss": 0.2258, + "step": 32155 + }, + { + "epoch": 2.6049902786779002, + "grad_norm": 0.06946169584989548, + "learning_rate": 7.750123767946353e-05, + "loss": 0.192, + "step": 32156 + }, + { + "epoch": 2.6050712896953985, + "grad_norm": 0.06957487761974335, + "learning_rate": 7.74967370268689e-05, + "loss": 0.257, + "step": 32157 + }, + { + "epoch": 2.6051523007128967, + "grad_norm": 0.08043936640024185, + "learning_rate": 7.749223637427428e-05, + "loss": 0.2327, + "step": 32158 + }, + { + "epoch": 2.6052333117303954, + "grad_norm": 0.06651458889245987, + "learning_rate": 7.748773572167965e-05, + "loss": 0.2216, + "step": 32159 + }, + { + "epoch": 2.6053143227478937, + "grad_norm": 0.07017157226800919, + "learning_rate": 7.748323506908502e-05, + "loss": 0.2383, + "step": 32160 + }, + { + "epoch": 2.605395333765392, + "grad_norm": 0.07861942052841187, + "learning_rate": 7.74787344164904e-05, + "loss": 0.2728, + "step": 32161 + }, + { + "epoch": 2.6054763447828906, + "grad_norm": 0.07645098865032196, + "learning_rate": 7.747423376389577e-05, + "loss": 0.2531, + "step": 32162 + }, + { + "epoch": 2.605557355800389, + "grad_norm": 0.06685902923345566, + "learning_rate": 7.746973311130114e-05, + "loss": 0.2501, + "step": 32163 + }, + { + "epoch": 2.605638366817887, + "grad_norm": 0.057141318917274475, + "learning_rate": 7.746523245870652e-05, + "loss": 0.209, + "step": 32164 + }, + { + "epoch": 2.6057193778353858, + "grad_norm": 0.07345332205295563, + "learning_rate": 7.746073180611189e-05, + "loss": 0.2316, + "step": 32165 + }, + { + "epoch": 2.605800388852884, + "grad_norm": 0.07095864415168762, + "learning_rate": 7.745623115351726e-05, + "loss": 0.2351, + "step": 32166 + }, + { + "epoch": 2.6058813998703823, + "grad_norm": 0.0770629495382309, + "learning_rate": 7.745173050092264e-05, + "loss": 0.2171, + "step": 32167 + }, + { + "epoch": 2.605962410887881, + "grad_norm": 0.0654478594660759, + "learning_rate": 7.744722984832801e-05, + "loss": 0.2113, + "step": 32168 + }, + { + "epoch": 2.606043421905379, + "grad_norm": 0.07980560511350632, + "learning_rate": 7.744272919573339e-05, + "loss": 0.2671, + "step": 32169 + }, + { + "epoch": 2.6061244329228774, + "grad_norm": 0.06946124136447906, + "learning_rate": 7.743822854313876e-05, + "loss": 0.2077, + "step": 32170 + }, + { + "epoch": 2.606205443940376, + "grad_norm": 0.07111818343400955, + "learning_rate": 7.743372789054413e-05, + "loss": 0.2573, + "step": 32171 + }, + { + "epoch": 2.6062864549578744, + "grad_norm": 0.06737150251865387, + "learning_rate": 7.74292272379495e-05, + "loss": 0.2075, + "step": 32172 + }, + { + "epoch": 2.6063674659753726, + "grad_norm": 0.08232983946800232, + "learning_rate": 7.742472658535488e-05, + "loss": 0.2885, + "step": 32173 + }, + { + "epoch": 2.6064484769928713, + "grad_norm": 0.0650903508067131, + "learning_rate": 7.742022593276025e-05, + "loss": 0.2218, + "step": 32174 + }, + { + "epoch": 2.6065294880103695, + "grad_norm": 0.05715804547071457, + "learning_rate": 7.741572528016563e-05, + "loss": 0.2562, + "step": 32175 + }, + { + "epoch": 2.606610499027868, + "grad_norm": 0.08915777504444122, + "learning_rate": 7.7411224627571e-05, + "loss": 0.2583, + "step": 32176 + }, + { + "epoch": 2.606691510045366, + "grad_norm": 0.060810547322034836, + "learning_rate": 7.740672397497637e-05, + "loss": 0.2595, + "step": 32177 + }, + { + "epoch": 2.6067725210628643, + "grad_norm": 0.06770209223031998, + "learning_rate": 7.740222332238175e-05, 
+ "loss": 0.245, + "step": 32178 + }, + { + "epoch": 2.606853532080363, + "grad_norm": 0.07107766717672348, + "learning_rate": 7.739772266978712e-05, + "loss": 0.2365, + "step": 32179 + }, + { + "epoch": 2.606934543097861, + "grad_norm": 0.064228855073452, + "learning_rate": 7.73932220171925e-05, + "loss": 0.1962, + "step": 32180 + }, + { + "epoch": 2.6070155541153595, + "grad_norm": 0.0662747398018837, + "learning_rate": 7.738872136459787e-05, + "loss": 0.2428, + "step": 32181 + }, + { + "epoch": 2.607096565132858, + "grad_norm": 0.06631524860858917, + "learning_rate": 7.738422071200324e-05, + "loss": 0.2313, + "step": 32182 + }, + { + "epoch": 2.6071775761503564, + "grad_norm": 0.06557276844978333, + "learning_rate": 7.737972005940862e-05, + "loss": 0.2079, + "step": 32183 + }, + { + "epoch": 2.6072585871678546, + "grad_norm": 0.06969014555215836, + "learning_rate": 7.737521940681399e-05, + "loss": 0.2213, + "step": 32184 + }, + { + "epoch": 2.6073395981853533, + "grad_norm": 0.05485299229621887, + "learning_rate": 7.737071875421936e-05, + "loss": 0.2153, + "step": 32185 + }, + { + "epoch": 2.6074206092028516, + "grad_norm": 0.06255738437175751, + "learning_rate": 7.736621810162474e-05, + "loss": 0.2748, + "step": 32186 + }, + { + "epoch": 2.60750162022035, + "grad_norm": 0.07666832953691483, + "learning_rate": 7.736171744903011e-05, + "loss": 0.2586, + "step": 32187 + }, + { + "epoch": 2.6075826312378485, + "grad_norm": 0.07308944314718246, + "learning_rate": 7.735721679643548e-05, + "loss": 0.2681, + "step": 32188 + }, + { + "epoch": 2.6076636422553467, + "grad_norm": 0.0682573989033699, + "learning_rate": 7.735271614384086e-05, + "loss": 0.2337, + "step": 32189 + }, + { + "epoch": 2.607744653272845, + "grad_norm": 0.08183484524488449, + "learning_rate": 7.734821549124623e-05, + "loss": 0.2215, + "step": 32190 + }, + { + "epoch": 2.6078256642903437, + "grad_norm": 0.08453409373760223, + "learning_rate": 7.73437148386516e-05, + "loss": 0.2535, + "step": 32191 + }, + { + "epoch": 2.607906675307842, + "grad_norm": 0.07045777142047882, + "learning_rate": 7.733921418605699e-05, + "loss": 0.2249, + "step": 32192 + }, + { + "epoch": 2.60798768632534, + "grad_norm": 0.060405831784009933, + "learning_rate": 7.733471353346235e-05, + "loss": 0.2345, + "step": 32193 + }, + { + "epoch": 2.608068697342839, + "grad_norm": 0.08729273080825806, + "learning_rate": 7.733021288086773e-05, + "loss": 0.2365, + "step": 32194 + }, + { + "epoch": 2.608149708360337, + "grad_norm": 0.06557916104793549, + "learning_rate": 7.732571222827311e-05, + "loss": 0.2168, + "step": 32195 + }, + { + "epoch": 2.6082307193778353, + "grad_norm": 0.07198052853345871, + "learning_rate": 7.732121157567847e-05, + "loss": 0.2444, + "step": 32196 + }, + { + "epoch": 2.608311730395334, + "grad_norm": 0.07454212754964828, + "learning_rate": 7.731671092308385e-05, + "loss": 0.2326, + "step": 32197 + }, + { + "epoch": 2.6083927414128323, + "grad_norm": 0.07987958192825317, + "learning_rate": 7.731221027048923e-05, + "loss": 0.2359, + "step": 32198 + }, + { + "epoch": 2.6084737524303305, + "grad_norm": 0.07974809408187866, + "learning_rate": 7.73077096178946e-05, + "loss": 0.257, + "step": 32199 + }, + { + "epoch": 2.6085547634478288, + "grad_norm": 0.0739462673664093, + "learning_rate": 7.730320896529997e-05, + "loss": 0.2693, + "step": 32200 + }, + { + "epoch": 2.608635774465327, + "grad_norm": 0.06772410869598389, + "learning_rate": 7.729870831270535e-05, + "loss": 0.214, + "step": 32201 + }, + { + "epoch": 2.6087167854828257, + 
"grad_norm": 0.06608753651380539, + "learning_rate": 7.729420766011071e-05, + "loss": 0.2142, + "step": 32202 + }, + { + "epoch": 2.608797796500324, + "grad_norm": 0.06899905949831009, + "learning_rate": 7.72897070075161e-05, + "loss": 0.2072, + "step": 32203 + }, + { + "epoch": 2.608878807517822, + "grad_norm": 0.07096538692712784, + "learning_rate": 7.728520635492148e-05, + "loss": 0.261, + "step": 32204 + }, + { + "epoch": 2.608959818535321, + "grad_norm": 0.08852484077215195, + "learning_rate": 7.728070570232684e-05, + "loss": 0.2959, + "step": 32205 + }, + { + "epoch": 2.609040829552819, + "grad_norm": 0.0703040286898613, + "learning_rate": 7.727620504973222e-05, + "loss": 0.3129, + "step": 32206 + }, + { + "epoch": 2.6091218405703174, + "grad_norm": 0.06888163834810257, + "learning_rate": 7.72717043971376e-05, + "loss": 0.2657, + "step": 32207 + }, + { + "epoch": 2.609202851587816, + "grad_norm": 0.06287510693073273, + "learning_rate": 7.726720374454296e-05, + "loss": 0.232, + "step": 32208 + }, + { + "epoch": 2.6092838626053143, + "grad_norm": 0.06748857349157333, + "learning_rate": 7.726270309194834e-05, + "loss": 0.2416, + "step": 32209 + }, + { + "epoch": 2.6093648736228126, + "grad_norm": 0.07733192294836044, + "learning_rate": 7.725820243935372e-05, + "loss": 0.2529, + "step": 32210 + }, + { + "epoch": 2.6094458846403112, + "grad_norm": 0.06905055791139603, + "learning_rate": 7.725370178675908e-05, + "loss": 0.2513, + "step": 32211 + }, + { + "epoch": 2.6095268956578095, + "grad_norm": 0.06713546812534332, + "learning_rate": 7.724920113416446e-05, + "loss": 0.2609, + "step": 32212 + }, + { + "epoch": 2.6096079066753077, + "grad_norm": 0.04904366284608841, + "learning_rate": 7.724470048156984e-05, + "loss": 0.2258, + "step": 32213 + }, + { + "epoch": 2.6096889176928064, + "grad_norm": 0.0761357843875885, + "learning_rate": 7.72401998289752e-05, + "loss": 0.25, + "step": 32214 + }, + { + "epoch": 2.6097699287103047, + "grad_norm": 0.06284836679697037, + "learning_rate": 7.723569917638058e-05, + "loss": 0.2252, + "step": 32215 + }, + { + "epoch": 2.609850939727803, + "grad_norm": 0.06382585316896439, + "learning_rate": 7.723119852378596e-05, + "loss": 0.2554, + "step": 32216 + }, + { + "epoch": 2.6099319507453016, + "grad_norm": 0.061265233904123306, + "learning_rate": 7.722669787119132e-05, + "loss": 0.2382, + "step": 32217 + }, + { + "epoch": 2.6100129617628, + "grad_norm": 0.06161702796816826, + "learning_rate": 7.72221972185967e-05, + "loss": 0.2122, + "step": 32218 + }, + { + "epoch": 2.610093972780298, + "grad_norm": 0.06948524713516235, + "learning_rate": 7.721769656600208e-05, + "loss": 0.2784, + "step": 32219 + }, + { + "epoch": 2.6101749837977968, + "grad_norm": 0.06408175826072693, + "learning_rate": 7.721319591340744e-05, + "loss": 0.2285, + "step": 32220 + }, + { + "epoch": 2.610255994815295, + "grad_norm": 0.08492530882358551, + "learning_rate": 7.720869526081283e-05, + "loss": 0.2356, + "step": 32221 + }, + { + "epoch": 2.6103370058327933, + "grad_norm": 0.06570692360401154, + "learning_rate": 7.72041946082182e-05, + "loss": 0.2748, + "step": 32222 + }, + { + "epoch": 2.6104180168502915, + "grad_norm": 0.0714087188243866, + "learning_rate": 7.719969395562356e-05, + "loss": 0.2488, + "step": 32223 + }, + { + "epoch": 2.6104990278677898, + "grad_norm": 0.07423436641693115, + "learning_rate": 7.719519330302895e-05, + "loss": 0.2216, + "step": 32224 + }, + { + "epoch": 2.6105800388852884, + "grad_norm": 0.05890345573425293, + "learning_rate": 7.719069265043432e-05, + 
"loss": 0.2195, + "step": 32225 + }, + { + "epoch": 2.6106610499027867, + "grad_norm": 0.0739654079079628, + "learning_rate": 7.718619199783968e-05, + "loss": 0.2489, + "step": 32226 + }, + { + "epoch": 2.610742060920285, + "grad_norm": 0.07757818698883057, + "learning_rate": 7.718169134524507e-05, + "loss": 0.259, + "step": 32227 + }, + { + "epoch": 2.6108230719377836, + "grad_norm": 0.0802304819226265, + "learning_rate": 7.717719069265044e-05, + "loss": 0.2567, + "step": 32228 + }, + { + "epoch": 2.610904082955282, + "grad_norm": 0.06230137497186661, + "learning_rate": 7.717269004005582e-05, + "loss": 0.2389, + "step": 32229 + }, + { + "epoch": 2.61098509397278, + "grad_norm": 0.07052315026521683, + "learning_rate": 7.716818938746119e-05, + "loss": 0.2591, + "step": 32230 + }, + { + "epoch": 2.611066104990279, + "grad_norm": 0.07671262323856354, + "learning_rate": 7.716368873486656e-05, + "loss": 0.243, + "step": 32231 + }, + { + "epoch": 2.611147116007777, + "grad_norm": 0.06278721243143082, + "learning_rate": 7.715918808227194e-05, + "loss": 0.2128, + "step": 32232 + }, + { + "epoch": 2.6112281270252753, + "grad_norm": 0.098354272544384, + "learning_rate": 7.715468742967731e-05, + "loss": 0.2761, + "step": 32233 + }, + { + "epoch": 2.611309138042774, + "grad_norm": 0.07634606957435608, + "learning_rate": 7.715018677708268e-05, + "loss": 0.262, + "step": 32234 + }, + { + "epoch": 2.6113901490602722, + "grad_norm": 0.07335880398750305, + "learning_rate": 7.714568612448806e-05, + "loss": 0.233, + "step": 32235 + }, + { + "epoch": 2.6114711600777705, + "grad_norm": 0.05759400501847267, + "learning_rate": 7.714118547189343e-05, + "loss": 0.208, + "step": 32236 + }, + { + "epoch": 2.611552171095269, + "grad_norm": 0.07161126285791397, + "learning_rate": 7.71366848192988e-05, + "loss": 0.2126, + "step": 32237 + }, + { + "epoch": 2.6116331821127674, + "grad_norm": 0.07215110957622528, + "learning_rate": 7.713218416670418e-05, + "loss": 0.2435, + "step": 32238 + }, + { + "epoch": 2.6117141931302656, + "grad_norm": 0.05700969696044922, + "learning_rate": 7.712768351410955e-05, + "loss": 0.2107, + "step": 32239 + }, + { + "epoch": 2.6117952041477643, + "grad_norm": 0.0639571100473404, + "learning_rate": 7.712318286151492e-05, + "loss": 0.2427, + "step": 32240 + }, + { + "epoch": 2.6118762151652626, + "grad_norm": 0.06004498153924942, + "learning_rate": 7.71186822089203e-05, + "loss": 0.246, + "step": 32241 + }, + { + "epoch": 2.611957226182761, + "grad_norm": 0.059504538774490356, + "learning_rate": 7.711418155632567e-05, + "loss": 0.2312, + "step": 32242 + }, + { + "epoch": 2.612038237200259, + "grad_norm": 0.08116213977336884, + "learning_rate": 7.710968090373105e-05, + "loss": 0.305, + "step": 32243 + }, + { + "epoch": 2.6121192482177578, + "grad_norm": 0.06468251347541809, + "learning_rate": 7.710518025113642e-05, + "loss": 0.2377, + "step": 32244 + }, + { + "epoch": 2.612200259235256, + "grad_norm": 0.06595216691493988, + "learning_rate": 7.710067959854179e-05, + "loss": 0.2472, + "step": 32245 + }, + { + "epoch": 2.6122812702527543, + "grad_norm": 0.07535982877016068, + "learning_rate": 7.709617894594717e-05, + "loss": 0.2429, + "step": 32246 + }, + { + "epoch": 2.6123622812702525, + "grad_norm": 0.07220283150672913, + "learning_rate": 7.709167829335254e-05, + "loss": 0.2217, + "step": 32247 + }, + { + "epoch": 2.612443292287751, + "grad_norm": 0.07300985604524612, + "learning_rate": 7.708717764075791e-05, + "loss": 0.2433, + "step": 32248 + }, + { + "epoch": 2.6125243033052494, + 
"grad_norm": 0.06394525617361069, + "learning_rate": 7.708267698816329e-05, + "loss": 0.2997, + "step": 32249 + }, + { + "epoch": 2.6126053143227477, + "grad_norm": 0.060523077845573425, + "learning_rate": 7.707817633556866e-05, + "loss": 0.2195, + "step": 32250 + }, + { + "epoch": 2.6126863253402464, + "grad_norm": 0.06264317780733109, + "learning_rate": 7.707367568297403e-05, + "loss": 0.2492, + "step": 32251 + }, + { + "epoch": 2.6127673363577446, + "grad_norm": 0.06706266105175018, + "learning_rate": 7.706917503037941e-05, + "loss": 0.2164, + "step": 32252 + }, + { + "epoch": 2.612848347375243, + "grad_norm": 0.06442420929670334, + "learning_rate": 7.706467437778478e-05, + "loss": 0.2514, + "step": 32253 + }, + { + "epoch": 2.6129293583927415, + "grad_norm": 0.08338776230812073, + "learning_rate": 7.706017372519016e-05, + "loss": 0.2873, + "step": 32254 + }, + { + "epoch": 2.61301036941024, + "grad_norm": 0.08646774291992188, + "learning_rate": 7.705567307259553e-05, + "loss": 0.2131, + "step": 32255 + }, + { + "epoch": 2.613091380427738, + "grad_norm": 0.060562461614608765, + "learning_rate": 7.70511724200009e-05, + "loss": 0.2104, + "step": 32256 + }, + { + "epoch": 2.6131723914452367, + "grad_norm": 0.06581650674343109, + "learning_rate": 7.704667176740628e-05, + "loss": 0.2556, + "step": 32257 + }, + { + "epoch": 2.613253402462735, + "grad_norm": 0.0644855946302414, + "learning_rate": 7.704217111481165e-05, + "loss": 0.244, + "step": 32258 + }, + { + "epoch": 2.613334413480233, + "grad_norm": 0.06705068796873093, + "learning_rate": 7.703767046221702e-05, + "loss": 0.2729, + "step": 32259 + }, + { + "epoch": 2.613415424497732, + "grad_norm": 0.06724876910448074, + "learning_rate": 7.70331698096224e-05, + "loss": 0.2644, + "step": 32260 + }, + { + "epoch": 2.61349643551523, + "grad_norm": 0.0717342346906662, + "learning_rate": 7.702866915702778e-05, + "loss": 0.2399, + "step": 32261 + }, + { + "epoch": 2.6135774465327284, + "grad_norm": 0.06799786537885666, + "learning_rate": 7.702416850443314e-05, + "loss": 0.2561, + "step": 32262 + }, + { + "epoch": 2.613658457550227, + "grad_norm": 0.06930407881736755, + "learning_rate": 7.701966785183852e-05, + "loss": 0.2153, + "step": 32263 + }, + { + "epoch": 2.6137394685677253, + "grad_norm": 0.07747004181146622, + "learning_rate": 7.70151671992439e-05, + "loss": 0.2524, + "step": 32264 + }, + { + "epoch": 2.6138204795852236, + "grad_norm": 0.06788234412670135, + "learning_rate": 7.701066654664926e-05, + "loss": 0.2446, + "step": 32265 + }, + { + "epoch": 2.613901490602722, + "grad_norm": 0.06565559655427933, + "learning_rate": 7.700616589405464e-05, + "loss": 0.2925, + "step": 32266 + }, + { + "epoch": 2.6139825016202205, + "grad_norm": 0.06663317233324051, + "learning_rate": 7.700166524146003e-05, + "loss": 0.2257, + "step": 32267 + }, + { + "epoch": 2.6140635126377187, + "grad_norm": 0.07271160930395126, + "learning_rate": 7.699716458886539e-05, + "loss": 0.2135, + "step": 32268 + }, + { + "epoch": 2.614144523655217, + "grad_norm": 0.08005572110414505, + "learning_rate": 7.699266393627076e-05, + "loss": 0.2594, + "step": 32269 + }, + { + "epoch": 2.6142255346727152, + "grad_norm": 0.08294783532619476, + "learning_rate": 7.698816328367615e-05, + "loss": 0.2594, + "step": 32270 + }, + { + "epoch": 2.614306545690214, + "grad_norm": 0.07596366107463837, + "learning_rate": 7.69836626310815e-05, + "loss": 0.2699, + "step": 32271 + }, + { + "epoch": 2.614387556707712, + "grad_norm": 0.08953236043453217, + "learning_rate": 7.697916197848688e-05, + 
"loss": 0.2384, + "step": 32272 + }, + { + "epoch": 2.6144685677252104, + "grad_norm": 0.07046157121658325, + "learning_rate": 7.697466132589227e-05, + "loss": 0.2189, + "step": 32273 + }, + { + "epoch": 2.614549578742709, + "grad_norm": 0.05413123220205307, + "learning_rate": 7.697016067329763e-05, + "loss": 0.2373, + "step": 32274 + }, + { + "epoch": 2.6146305897602073, + "grad_norm": 0.07471950352191925, + "learning_rate": 7.6965660020703e-05, + "loss": 0.2546, + "step": 32275 + }, + { + "epoch": 2.6147116007777056, + "grad_norm": 0.06346073001623154, + "learning_rate": 7.696115936810839e-05, + "loss": 0.2304, + "step": 32276 + }, + { + "epoch": 2.6147926117952043, + "grad_norm": 0.06895754486322403, + "learning_rate": 7.695665871551375e-05, + "loss": 0.2521, + "step": 32277 + }, + { + "epoch": 2.6148736228127025, + "grad_norm": 0.0746992975473404, + "learning_rate": 7.695215806291912e-05, + "loss": 0.2486, + "step": 32278 + }, + { + "epoch": 2.6149546338302008, + "grad_norm": 0.063937708735466, + "learning_rate": 7.694765741032451e-05, + "loss": 0.2435, + "step": 32279 + }, + { + "epoch": 2.6150356448476995, + "grad_norm": 0.07269015163183212, + "learning_rate": 7.694315675772987e-05, + "loss": 0.2738, + "step": 32280 + }, + { + "epoch": 2.6151166558651977, + "grad_norm": 0.09423906356096268, + "learning_rate": 7.693865610513524e-05, + "loss": 0.2198, + "step": 32281 + }, + { + "epoch": 2.615197666882696, + "grad_norm": 0.06471596658229828, + "learning_rate": 7.693415545254063e-05, + "loss": 0.2556, + "step": 32282 + }, + { + "epoch": 2.6152786779001946, + "grad_norm": 0.0668293684720993, + "learning_rate": 7.692965479994599e-05, + "loss": 0.2652, + "step": 32283 + }, + { + "epoch": 2.615359688917693, + "grad_norm": 0.06607384234666824, + "learning_rate": 7.692515414735138e-05, + "loss": 0.2113, + "step": 32284 + }, + { + "epoch": 2.615440699935191, + "grad_norm": 0.07677590101957321, + "learning_rate": 7.692065349475675e-05, + "loss": 0.2479, + "step": 32285 + }, + { + "epoch": 2.61552171095269, + "grad_norm": 0.06859259307384491, + "learning_rate": 7.691615284216211e-05, + "loss": 0.2425, + "step": 32286 + }, + { + "epoch": 2.615602721970188, + "grad_norm": 0.0541251078248024, + "learning_rate": 7.69116521895675e-05, + "loss": 0.2511, + "step": 32287 + }, + { + "epoch": 2.6156837329876863, + "grad_norm": 0.0717424750328064, + "learning_rate": 7.690715153697287e-05, + "loss": 0.2662, + "step": 32288 + }, + { + "epoch": 2.6157647440051845, + "grad_norm": 0.07615697383880615, + "learning_rate": 7.690265088437823e-05, + "loss": 0.2544, + "step": 32289 + }, + { + "epoch": 2.6158457550226832, + "grad_norm": 0.061555415391922, + "learning_rate": 7.689815023178362e-05, + "loss": 0.22, + "step": 32290 + }, + { + "epoch": 2.6159267660401815, + "grad_norm": 0.07001828402280807, + "learning_rate": 7.689364957918899e-05, + "loss": 0.2428, + "step": 32291 + }, + { + "epoch": 2.6160077770576797, + "grad_norm": 0.0808320865035057, + "learning_rate": 7.688914892659435e-05, + "loss": 0.2334, + "step": 32292 + }, + { + "epoch": 2.616088788075178, + "grad_norm": 0.06133532524108887, + "learning_rate": 7.688464827399974e-05, + "loss": 0.2588, + "step": 32293 + }, + { + "epoch": 2.6161697990926767, + "grad_norm": 0.07468949258327484, + "learning_rate": 7.688014762140511e-05, + "loss": 0.244, + "step": 32294 + }, + { + "epoch": 2.616250810110175, + "grad_norm": 0.061842817813158035, + "learning_rate": 7.687564696881047e-05, + "loss": 0.2684, + "step": 32295 + }, + { + "epoch": 2.616331821127673, + 
"grad_norm": 0.0641714558005333, + "learning_rate": 7.687114631621586e-05, + "loss": 0.2385, + "step": 32296 + }, + { + "epoch": 2.616412832145172, + "grad_norm": 0.08465207368135452, + "learning_rate": 7.686664566362123e-05, + "loss": 0.2598, + "step": 32297 + }, + { + "epoch": 2.61649384316267, + "grad_norm": 0.09201711416244507, + "learning_rate": 7.68621450110266e-05, + "loss": 0.2785, + "step": 32298 + }, + { + "epoch": 2.6165748541801683, + "grad_norm": 0.06847358494997025, + "learning_rate": 7.685764435843198e-05, + "loss": 0.2386, + "step": 32299 + }, + { + "epoch": 2.616655865197667, + "grad_norm": 0.07034248858690262, + "learning_rate": 7.685314370583735e-05, + "loss": 0.2418, + "step": 32300 + }, + { + "epoch": 2.6167368762151653, + "grad_norm": 0.06319961696863174, + "learning_rate": 7.684864305324271e-05, + "loss": 0.2412, + "step": 32301 + }, + { + "epoch": 2.6168178872326635, + "grad_norm": 0.10452283173799515, + "learning_rate": 7.68441424006481e-05, + "loss": 0.199, + "step": 32302 + }, + { + "epoch": 2.616898898250162, + "grad_norm": 0.06628091633319855, + "learning_rate": 7.683964174805348e-05, + "loss": 0.2441, + "step": 32303 + }, + { + "epoch": 2.6169799092676604, + "grad_norm": 0.08092938363552094, + "learning_rate": 7.683514109545884e-05, + "loss": 0.2497, + "step": 32304 + }, + { + "epoch": 2.6170609202851587, + "grad_norm": 0.054823897778987885, + "learning_rate": 7.683064044286422e-05, + "loss": 0.2224, + "step": 32305 + }, + { + "epoch": 2.6171419313026574, + "grad_norm": 0.08330629765987396, + "learning_rate": 7.68261397902696e-05, + "loss": 0.2496, + "step": 32306 + }, + { + "epoch": 2.6172229423201556, + "grad_norm": 0.06323496997356415, + "learning_rate": 7.682163913767496e-05, + "loss": 0.2206, + "step": 32307 + }, + { + "epoch": 2.617303953337654, + "grad_norm": 0.07369324564933777, + "learning_rate": 7.681713848508034e-05, + "loss": 0.2567, + "step": 32308 + }, + { + "epoch": 2.6173849643551526, + "grad_norm": 0.07172773033380508, + "learning_rate": 7.681263783248572e-05, + "loss": 0.2587, + "step": 32309 + }, + { + "epoch": 2.617465975372651, + "grad_norm": 0.07234320789575577, + "learning_rate": 7.680813717989109e-05, + "loss": 0.2255, + "step": 32310 + }, + { + "epoch": 2.617546986390149, + "grad_norm": 0.07661251723766327, + "learning_rate": 7.680363652729646e-05, + "loss": 0.2329, + "step": 32311 + }, + { + "epoch": 2.6176279974076473, + "grad_norm": 0.0664915069937706, + "learning_rate": 7.679913587470184e-05, + "loss": 0.2622, + "step": 32312 + }, + { + "epoch": 2.617709008425146, + "grad_norm": 0.07211392372846603, + "learning_rate": 7.679463522210721e-05, + "loss": 0.2687, + "step": 32313 + }, + { + "epoch": 2.6177900194426442, + "grad_norm": 0.06264899671077728, + "learning_rate": 7.679013456951259e-05, + "loss": 0.21, + "step": 32314 + }, + { + "epoch": 2.6178710304601425, + "grad_norm": 0.06314536929130554, + "learning_rate": 7.678563391691796e-05, + "loss": 0.2079, + "step": 32315 + }, + { + "epoch": 2.6179520414776407, + "grad_norm": 0.07251273095607758, + "learning_rate": 7.678113326432333e-05, + "loss": 0.2262, + "step": 32316 + }, + { + "epoch": 2.6180330524951394, + "grad_norm": 0.06271009147167206, + "learning_rate": 7.67766326117287e-05, + "loss": 0.2423, + "step": 32317 + }, + { + "epoch": 2.6181140635126376, + "grad_norm": 0.06633373349905014, + "learning_rate": 7.677213195913408e-05, + "loss": 0.2674, + "step": 32318 + }, + { + "epoch": 2.618195074530136, + "grad_norm": 0.06320098787546158, + "learning_rate": 7.676763130653945e-05, 
+ "loss": 0.2256, + "step": 32319 + }, + { + "epoch": 2.6182760855476346, + "grad_norm": 0.06923279911279678, + "learning_rate": 7.676313065394483e-05, + "loss": 0.2426, + "step": 32320 + }, + { + "epoch": 2.618357096565133, + "grad_norm": 0.07858932018280029, + "learning_rate": 7.67586300013502e-05, + "loss": 0.2355, + "step": 32321 + }, + { + "epoch": 2.618438107582631, + "grad_norm": 0.06619489938020706, + "learning_rate": 7.675412934875557e-05, + "loss": 0.1963, + "step": 32322 + }, + { + "epoch": 2.6185191186001298, + "grad_norm": 0.06854096055030823, + "learning_rate": 7.674962869616095e-05, + "loss": 0.2577, + "step": 32323 + }, + { + "epoch": 2.618600129617628, + "grad_norm": 0.06553176045417786, + "learning_rate": 7.674512804356632e-05, + "loss": 0.2607, + "step": 32324 + }, + { + "epoch": 2.6186811406351262, + "grad_norm": 0.06606678664684296, + "learning_rate": 7.67406273909717e-05, + "loss": 0.2687, + "step": 32325 + }, + { + "epoch": 2.618762151652625, + "grad_norm": 0.08767323195934296, + "learning_rate": 7.673612673837707e-05, + "loss": 0.2404, + "step": 32326 + }, + { + "epoch": 2.618843162670123, + "grad_norm": 0.07807376980781555, + "learning_rate": 7.673162608578244e-05, + "loss": 0.246, + "step": 32327 + }, + { + "epoch": 2.6189241736876214, + "grad_norm": 0.11678686738014221, + "learning_rate": 7.672712543318782e-05, + "loss": 0.2686, + "step": 32328 + }, + { + "epoch": 2.61900518470512, + "grad_norm": 0.07521022111177444, + "learning_rate": 7.672262478059319e-05, + "loss": 0.2316, + "step": 32329 + }, + { + "epoch": 2.6190861957226184, + "grad_norm": 0.06827811151742935, + "learning_rate": 7.671812412799856e-05, + "loss": 0.2324, + "step": 32330 + }, + { + "epoch": 2.6191672067401166, + "grad_norm": 0.06705344468355179, + "learning_rate": 7.671362347540394e-05, + "loss": 0.224, + "step": 32331 + }, + { + "epoch": 2.6192482177576153, + "grad_norm": 0.06198614090681076, + "learning_rate": 7.670912282280931e-05, + "loss": 0.2539, + "step": 32332 + }, + { + "epoch": 2.6193292287751135, + "grad_norm": 0.09031055867671967, + "learning_rate": 7.670462217021468e-05, + "loss": 0.2687, + "step": 32333 + }, + { + "epoch": 2.619410239792612, + "grad_norm": 0.06712203472852707, + "learning_rate": 7.670012151762006e-05, + "loss": 0.2179, + "step": 32334 + }, + { + "epoch": 2.61949125081011, + "grad_norm": 0.05460682138800621, + "learning_rate": 7.669562086502543e-05, + "loss": 0.2411, + "step": 32335 + }, + { + "epoch": 2.6195722618276087, + "grad_norm": 0.08100723475217819, + "learning_rate": 7.669112021243082e-05, + "loss": 0.2147, + "step": 32336 + }, + { + "epoch": 2.619653272845107, + "grad_norm": 0.06605152785778046, + "learning_rate": 7.668661955983618e-05, + "loss": 0.264, + "step": 32337 + }, + { + "epoch": 2.619734283862605, + "grad_norm": 0.06912678480148315, + "learning_rate": 7.668211890724155e-05, + "loss": 0.2526, + "step": 32338 + }, + { + "epoch": 2.6198152948801035, + "grad_norm": 0.07025929540395737, + "learning_rate": 7.667761825464694e-05, + "loss": 0.2403, + "step": 32339 + }, + { + "epoch": 2.619896305897602, + "grad_norm": 0.06496301293373108, + "learning_rate": 7.66731176020523e-05, + "loss": 0.2339, + "step": 32340 + }, + { + "epoch": 2.6199773169151004, + "grad_norm": 0.06952445209026337, + "learning_rate": 7.666861694945767e-05, + "loss": 0.2371, + "step": 32341 + }, + { + "epoch": 2.6200583279325986, + "grad_norm": 0.06544504314661026, + "learning_rate": 7.666411629686306e-05, + "loss": 0.2753, + "step": 32342 + }, + { + "epoch": 2.6201393389500973, + 
"grad_norm": 0.07840461283922195, + "learning_rate": 7.665961564426842e-05, + "loss": 0.2742, + "step": 32343 + }, + { + "epoch": 2.6202203499675956, + "grad_norm": 0.07411501556634903, + "learning_rate": 7.665511499167379e-05, + "loss": 0.2433, + "step": 32344 + }, + { + "epoch": 2.620301360985094, + "grad_norm": 0.06554578244686127, + "learning_rate": 7.665061433907918e-05, + "loss": 0.216, + "step": 32345 + }, + { + "epoch": 2.6203823720025925, + "grad_norm": 0.06754684448242188, + "learning_rate": 7.664611368648454e-05, + "loss": 0.2724, + "step": 32346 + }, + { + "epoch": 2.6204633830200907, + "grad_norm": 0.061257973313331604, + "learning_rate": 7.664161303388991e-05, + "loss": 0.2091, + "step": 32347 + }, + { + "epoch": 2.620544394037589, + "grad_norm": 0.07244864851236343, + "learning_rate": 7.66371123812953e-05, + "loss": 0.2419, + "step": 32348 + }, + { + "epoch": 2.6206254050550877, + "grad_norm": 0.07496330887079239, + "learning_rate": 7.663261172870066e-05, + "loss": 0.2521, + "step": 32349 + }, + { + "epoch": 2.620706416072586, + "grad_norm": 0.07110787183046341, + "learning_rate": 7.662811107610603e-05, + "loss": 0.2727, + "step": 32350 + }, + { + "epoch": 2.620787427090084, + "grad_norm": 0.05446374788880348, + "learning_rate": 7.662361042351142e-05, + "loss": 0.2528, + "step": 32351 + }, + { + "epoch": 2.620868438107583, + "grad_norm": 0.06342311948537827, + "learning_rate": 7.661910977091678e-05, + "loss": 0.2287, + "step": 32352 + }, + { + "epoch": 2.620949449125081, + "grad_norm": 0.06803973764181137, + "learning_rate": 7.661460911832216e-05, + "loss": 0.2491, + "step": 32353 + }, + { + "epoch": 2.6210304601425793, + "grad_norm": 0.0856875330209732, + "learning_rate": 7.661010846572754e-05, + "loss": 0.2084, + "step": 32354 + }, + { + "epoch": 2.621111471160078, + "grad_norm": 0.06051705405116081, + "learning_rate": 7.66056078131329e-05, + "loss": 0.2578, + "step": 32355 + }, + { + "epoch": 2.6211924821775763, + "grad_norm": 0.08695938438177109, + "learning_rate": 7.660110716053828e-05, + "loss": 0.3125, + "step": 32356 + }, + { + "epoch": 2.6212734931950745, + "grad_norm": 0.0813361182808876, + "learning_rate": 7.659660650794366e-05, + "loss": 0.2567, + "step": 32357 + }, + { + "epoch": 2.6213545042125728, + "grad_norm": 0.08723953366279602, + "learning_rate": 7.659210585534902e-05, + "loss": 0.2701, + "step": 32358 + }, + { + "epoch": 2.6214355152300715, + "grad_norm": 0.06307353079319, + "learning_rate": 7.65876052027544e-05, + "loss": 0.1899, + "step": 32359 + }, + { + "epoch": 2.6215165262475697, + "grad_norm": 0.07490169256925583, + "learning_rate": 7.658310455015978e-05, + "loss": 0.232, + "step": 32360 + }, + { + "epoch": 2.621597537265068, + "grad_norm": 0.07667241990566254, + "learning_rate": 7.657860389756514e-05, + "loss": 0.25, + "step": 32361 + }, + { + "epoch": 2.621678548282566, + "grad_norm": 0.06483347713947296, + "learning_rate": 7.657410324497053e-05, + "loss": 0.2403, + "step": 32362 + }, + { + "epoch": 2.621759559300065, + "grad_norm": 0.06192995235323906, + "learning_rate": 7.65696025923759e-05, + "loss": 0.2547, + "step": 32363 + }, + { + "epoch": 2.621840570317563, + "grad_norm": 0.06280378252267838, + "learning_rate": 7.656510193978127e-05, + "loss": 0.2088, + "step": 32364 + }, + { + "epoch": 2.6219215813350614, + "grad_norm": 0.07592108845710754, + "learning_rate": 7.656060128718665e-05, + "loss": 0.239, + "step": 32365 + }, + { + "epoch": 2.62200259235256, + "grad_norm": 0.06386705487966537, + "learning_rate": 7.655610063459203e-05, + 
"loss": 0.2509, + "step": 32366 + }, + { + "epoch": 2.6220836033700583, + "grad_norm": 0.06451716274023056, + "learning_rate": 7.655159998199739e-05, + "loss": 0.2352, + "step": 32367 + }, + { + "epoch": 2.6221646143875565, + "grad_norm": 0.07501220703125, + "learning_rate": 7.654709932940277e-05, + "loss": 0.2633, + "step": 32368 + }, + { + "epoch": 2.6222456254050552, + "grad_norm": 0.07946756482124329, + "learning_rate": 7.654259867680815e-05, + "loss": 0.2367, + "step": 32369 + }, + { + "epoch": 2.6223266364225535, + "grad_norm": 0.05955003201961517, + "learning_rate": 7.653809802421351e-05, + "loss": 0.2201, + "step": 32370 + }, + { + "epoch": 2.6224076474400517, + "grad_norm": 0.07076594978570938, + "learning_rate": 7.65335973716189e-05, + "loss": 0.2268, + "step": 32371 + }, + { + "epoch": 2.6224886584575504, + "grad_norm": 0.06496407836675644, + "learning_rate": 7.652909671902427e-05, + "loss": 0.2207, + "step": 32372 + }, + { + "epoch": 2.6225696694750487, + "grad_norm": 0.058163098990917206, + "learning_rate": 7.652459606642963e-05, + "loss": 0.2196, + "step": 32373 + }, + { + "epoch": 2.622650680492547, + "grad_norm": 0.07921572774648666, + "learning_rate": 7.652009541383501e-05, + "loss": 0.2733, + "step": 32374 + }, + { + "epoch": 2.6227316915100456, + "grad_norm": 0.07185711711645126, + "learning_rate": 7.651559476124039e-05, + "loss": 0.2641, + "step": 32375 + }, + { + "epoch": 2.622812702527544, + "grad_norm": 0.07281036674976349, + "learning_rate": 7.651109410864575e-05, + "loss": 0.2401, + "step": 32376 + }, + { + "epoch": 2.622893713545042, + "grad_norm": 0.06288488209247589, + "learning_rate": 7.650659345605114e-05, + "loss": 0.2117, + "step": 32377 + }, + { + "epoch": 2.6229747245625408, + "grad_norm": 0.07066594809293747, + "learning_rate": 7.650209280345651e-05, + "loss": 0.2199, + "step": 32378 + }, + { + "epoch": 2.623055735580039, + "grad_norm": 0.05946981906890869, + "learning_rate": 7.649759215086187e-05, + "loss": 0.2398, + "step": 32379 + }, + { + "epoch": 2.6231367465975373, + "grad_norm": 0.07342832535505295, + "learning_rate": 7.649309149826726e-05, + "loss": 0.2429, + "step": 32380 + }, + { + "epoch": 2.6232177576150355, + "grad_norm": 0.06416358798742294, + "learning_rate": 7.648859084567263e-05, + "loss": 0.2544, + "step": 32381 + }, + { + "epoch": 2.6232987686325338, + "grad_norm": 0.056006185710430145, + "learning_rate": 7.648409019307799e-05, + "loss": 0.1901, + "step": 32382 + }, + { + "epoch": 2.6233797796500324, + "grad_norm": 0.06662266701459885, + "learning_rate": 7.647958954048338e-05, + "loss": 0.2087, + "step": 32383 + }, + { + "epoch": 2.6234607906675307, + "grad_norm": 0.07266471534967422, + "learning_rate": 7.647508888788875e-05, + "loss": 0.2493, + "step": 32384 + }, + { + "epoch": 2.623541801685029, + "grad_norm": 0.05771796777844429, + "learning_rate": 7.647058823529411e-05, + "loss": 0.2701, + "step": 32385 + }, + { + "epoch": 2.6236228127025276, + "grad_norm": 0.07551492005586624, + "learning_rate": 7.64660875826995e-05, + "loss": 0.2664, + "step": 32386 + }, + { + "epoch": 2.623703823720026, + "grad_norm": 0.06250123679637909, + "learning_rate": 7.646158693010487e-05, + "loss": 0.1939, + "step": 32387 + }, + { + "epoch": 2.623784834737524, + "grad_norm": 0.08225307613611221, + "learning_rate": 7.645708627751025e-05, + "loss": 0.2735, + "step": 32388 + }, + { + "epoch": 2.623865845755023, + "grad_norm": 0.06248268112540245, + "learning_rate": 7.645258562491562e-05, + "loss": 0.263, + "step": 32389 + }, + { + "epoch": 2.623946856772521, 
+ "grad_norm": 0.07815909385681152, + "learning_rate": 7.644808497232099e-05, + "loss": 0.2409, + "step": 32390 + }, + { + "epoch": 2.6240278677900193, + "grad_norm": 0.05346912518143654, + "learning_rate": 7.644358431972637e-05, + "loss": 0.2733, + "step": 32391 + }, + { + "epoch": 2.624108878807518, + "grad_norm": 0.07662081718444824, + "learning_rate": 7.643908366713174e-05, + "loss": 0.213, + "step": 32392 + }, + { + "epoch": 2.624189889825016, + "grad_norm": 0.07478302717208862, + "learning_rate": 7.643458301453711e-05, + "loss": 0.2294, + "step": 32393 + }, + { + "epoch": 2.6242709008425145, + "grad_norm": 0.04852926358580589, + "learning_rate": 7.643008236194249e-05, + "loss": 0.2121, + "step": 32394 + }, + { + "epoch": 2.624351911860013, + "grad_norm": 0.06255146116018295, + "learning_rate": 7.642558170934786e-05, + "loss": 0.2581, + "step": 32395 + }, + { + "epoch": 2.6244329228775114, + "grad_norm": 0.06238606944680214, + "learning_rate": 7.642108105675323e-05, + "loss": 0.2354, + "step": 32396 + }, + { + "epoch": 2.6245139338950096, + "grad_norm": 0.06730019301176071, + "learning_rate": 7.641658040415861e-05, + "loss": 0.262, + "step": 32397 + }, + { + "epoch": 2.6245949449125083, + "grad_norm": 0.07908900827169418, + "learning_rate": 7.641207975156398e-05, + "loss": 0.2703, + "step": 32398 + }, + { + "epoch": 2.6246759559300066, + "grad_norm": 0.06469932943582535, + "learning_rate": 7.640757909896935e-05, + "loss": 0.2283, + "step": 32399 + }, + { + "epoch": 2.624756966947505, + "grad_norm": 0.06892475485801697, + "learning_rate": 7.640307844637473e-05, + "loss": 0.2434, + "step": 32400 + }, + { + "epoch": 2.6248379779650035, + "grad_norm": 0.05960692837834358, + "learning_rate": 7.63985777937801e-05, + "loss": 0.2154, + "step": 32401 + }, + { + "epoch": 2.6249189889825018, + "grad_norm": 0.06794025003910065, + "learning_rate": 7.639407714118548e-05, + "loss": 0.2474, + "step": 32402 + }, + { + "epoch": 2.625, + "grad_norm": 0.08723536133766174, + "learning_rate": 7.638957648859085e-05, + "loss": 0.2854, + "step": 32403 + }, + { + "epoch": 2.6250810110174982, + "grad_norm": 0.07772015035152435, + "learning_rate": 7.638507583599622e-05, + "loss": 0.2649, + "step": 32404 + }, + { + "epoch": 2.6251620220349965, + "grad_norm": 0.06779235601425171, + "learning_rate": 7.63805751834016e-05, + "loss": 0.2492, + "step": 32405 + }, + { + "epoch": 2.625243033052495, + "grad_norm": 0.07044626772403717, + "learning_rate": 7.637607453080697e-05, + "loss": 0.2549, + "step": 32406 + }, + { + "epoch": 2.6253240440699934, + "grad_norm": 0.06848505139350891, + "learning_rate": 7.637157387821234e-05, + "loss": 0.2499, + "step": 32407 + }, + { + "epoch": 2.6254050550874917, + "grad_norm": 0.05949360504746437, + "learning_rate": 7.636707322561772e-05, + "loss": 0.2437, + "step": 32408 + }, + { + "epoch": 2.6254860661049904, + "grad_norm": 0.06853028386831284, + "learning_rate": 7.636257257302309e-05, + "loss": 0.252, + "step": 32409 + }, + { + "epoch": 2.6255670771224886, + "grad_norm": 0.06630270928144455, + "learning_rate": 7.635807192042846e-05, + "loss": 0.2911, + "step": 32410 + }, + { + "epoch": 2.625648088139987, + "grad_norm": 0.05694075673818588, + "learning_rate": 7.635357126783384e-05, + "loss": 0.237, + "step": 32411 + }, + { + "epoch": 2.6257290991574855, + "grad_norm": 0.060872483998537064, + "learning_rate": 7.634907061523921e-05, + "loss": 0.2001, + "step": 32412 + }, + { + "epoch": 2.625810110174984, + "grad_norm": 0.07585026323795319, + "learning_rate": 7.634456996264459e-05, + 
"loss": 0.2366, + "step": 32413 + }, + { + "epoch": 2.625891121192482, + "grad_norm": 0.07402799278497696, + "learning_rate": 7.634006931004997e-05, + "loss": 0.224, + "step": 32414 + }, + { + "epoch": 2.6259721322099807, + "grad_norm": 0.06976964324712753, + "learning_rate": 7.633556865745533e-05, + "loss": 0.2131, + "step": 32415 + }, + { + "epoch": 2.626053143227479, + "grad_norm": 0.06143612414598465, + "learning_rate": 7.63310680048607e-05, + "loss": 0.2317, + "step": 32416 + }, + { + "epoch": 2.626134154244977, + "grad_norm": 0.06992180645465851, + "learning_rate": 7.63265673522661e-05, + "loss": 0.2192, + "step": 32417 + }, + { + "epoch": 2.626215165262476, + "grad_norm": 0.08070293813943863, + "learning_rate": 7.632206669967145e-05, + "loss": 0.2618, + "step": 32418 + }, + { + "epoch": 2.626296176279974, + "grad_norm": 0.052165403962135315, + "learning_rate": 7.631756604707683e-05, + "loss": 0.1829, + "step": 32419 + }, + { + "epoch": 2.6263771872974724, + "grad_norm": 0.048813480883836746, + "learning_rate": 7.631306539448221e-05, + "loss": 0.2209, + "step": 32420 + }, + { + "epoch": 2.626458198314971, + "grad_norm": 0.07322482764720917, + "learning_rate": 7.630856474188757e-05, + "loss": 0.2471, + "step": 32421 + }, + { + "epoch": 2.6265392093324693, + "grad_norm": 0.08113683760166168, + "learning_rate": 7.630406408929295e-05, + "loss": 0.241, + "step": 32422 + }, + { + "epoch": 2.6266202203499676, + "grad_norm": 0.06385481357574463, + "learning_rate": 7.629956343669833e-05, + "loss": 0.2138, + "step": 32423 + }, + { + "epoch": 2.6267012313674662, + "grad_norm": 0.07413046061992645, + "learning_rate": 7.62950627841037e-05, + "loss": 0.2732, + "step": 32424 + }, + { + "epoch": 2.6267822423849645, + "grad_norm": 0.06510314345359802, + "learning_rate": 7.629056213150907e-05, + "loss": 0.2449, + "step": 32425 + }, + { + "epoch": 2.6268632534024627, + "grad_norm": 0.06330209225416183, + "learning_rate": 7.628606147891446e-05, + "loss": 0.2809, + "step": 32426 + }, + { + "epoch": 2.626944264419961, + "grad_norm": 0.07114989310503006, + "learning_rate": 7.628156082631982e-05, + "loss": 0.2421, + "step": 32427 + }, + { + "epoch": 2.6270252754374592, + "grad_norm": 0.05140691623091698, + "learning_rate": 7.627706017372519e-05, + "loss": 0.2027, + "step": 32428 + }, + { + "epoch": 2.627106286454958, + "grad_norm": 0.06812848895788193, + "learning_rate": 7.627255952113058e-05, + "loss": 0.2745, + "step": 32429 + }, + { + "epoch": 2.627187297472456, + "grad_norm": 0.06321202218532562, + "learning_rate": 7.626805886853594e-05, + "loss": 0.2336, + "step": 32430 + }, + { + "epoch": 2.6272683084899544, + "grad_norm": 0.07882923632860184, + "learning_rate": 7.626355821594131e-05, + "loss": 0.2177, + "step": 32431 + }, + { + "epoch": 2.627349319507453, + "grad_norm": 0.06591887772083282, + "learning_rate": 7.62590575633467e-05, + "loss": 0.239, + "step": 32432 + }, + { + "epoch": 2.6274303305249513, + "grad_norm": 0.06610725075006485, + "learning_rate": 7.625455691075206e-05, + "loss": 0.221, + "step": 32433 + }, + { + "epoch": 2.6275113415424496, + "grad_norm": 0.0637730062007904, + "learning_rate": 7.625005625815743e-05, + "loss": 0.2773, + "step": 32434 + }, + { + "epoch": 2.6275923525599483, + "grad_norm": 0.06635932624340057, + "learning_rate": 7.624555560556282e-05, + "loss": 0.2655, + "step": 32435 + }, + { + "epoch": 2.6276733635774465, + "grad_norm": 0.07389713823795319, + "learning_rate": 7.624105495296818e-05, + "loss": 0.2285, + "step": 32436 + }, + { + "epoch": 2.6277543745949448, + 
"grad_norm": 0.05655698478221893, + "learning_rate": 7.623655430037355e-05, + "loss": 0.1999, + "step": 32437 + }, + { + "epoch": 2.6278353856124435, + "grad_norm": 0.062165699899196625, + "learning_rate": 7.623205364777894e-05, + "loss": 0.2385, + "step": 32438 + }, + { + "epoch": 2.6279163966299417, + "grad_norm": 0.06502586603164673, + "learning_rate": 7.62275529951843e-05, + "loss": 0.2192, + "step": 32439 + }, + { + "epoch": 2.62799740764744, + "grad_norm": 0.0674702525138855, + "learning_rate": 7.622305234258967e-05, + "loss": 0.2316, + "step": 32440 + }, + { + "epoch": 2.6280784186649386, + "grad_norm": 0.07493073493242264, + "learning_rate": 7.621855168999506e-05, + "loss": 0.2313, + "step": 32441 + }, + { + "epoch": 2.628159429682437, + "grad_norm": 0.0628257542848587, + "learning_rate": 7.621405103740042e-05, + "loss": 0.2539, + "step": 32442 + }, + { + "epoch": 2.628240440699935, + "grad_norm": 0.055204570293426514, + "learning_rate": 7.620955038480581e-05, + "loss": 0.207, + "step": 32443 + }, + { + "epoch": 2.628321451717434, + "grad_norm": 0.06424355506896973, + "learning_rate": 7.620504973221118e-05, + "loss": 0.2354, + "step": 32444 + }, + { + "epoch": 2.628402462734932, + "grad_norm": 0.06352122128009796, + "learning_rate": 7.620054907961654e-05, + "loss": 0.2193, + "step": 32445 + }, + { + "epoch": 2.6284834737524303, + "grad_norm": 0.05706331506371498, + "learning_rate": 7.619604842702193e-05, + "loss": 0.2269, + "step": 32446 + }, + { + "epoch": 2.6285644847699285, + "grad_norm": 0.07694992423057556, + "learning_rate": 7.61915477744273e-05, + "loss": 0.2404, + "step": 32447 + }, + { + "epoch": 2.6286454957874272, + "grad_norm": 0.05537230148911476, + "learning_rate": 7.618704712183266e-05, + "loss": 0.2304, + "step": 32448 + }, + { + "epoch": 2.6287265068049255, + "grad_norm": 0.07225795835256577, + "learning_rate": 7.618254646923805e-05, + "loss": 0.2303, + "step": 32449 + }, + { + "epoch": 2.6288075178224237, + "grad_norm": 0.07407918572425842, + "learning_rate": 7.617804581664342e-05, + "loss": 0.2663, + "step": 32450 + }, + { + "epoch": 2.628888528839922, + "grad_norm": 0.07087674736976624, + "learning_rate": 7.617354516404878e-05, + "loss": 0.2669, + "step": 32451 + }, + { + "epoch": 2.6289695398574207, + "grad_norm": 0.08752910792827606, + "learning_rate": 7.616904451145417e-05, + "loss": 0.2677, + "step": 32452 + }, + { + "epoch": 2.629050550874919, + "grad_norm": 0.06621308624744415, + "learning_rate": 7.616454385885954e-05, + "loss": 0.2215, + "step": 32453 + }, + { + "epoch": 2.629131561892417, + "grad_norm": 0.07405148446559906, + "learning_rate": 7.61600432062649e-05, + "loss": 0.2325, + "step": 32454 + }, + { + "epoch": 2.629212572909916, + "grad_norm": 0.08505482971668243, + "learning_rate": 7.615554255367029e-05, + "loss": 0.3245, + "step": 32455 + }, + { + "epoch": 2.629293583927414, + "grad_norm": 0.0664757639169693, + "learning_rate": 7.615104190107566e-05, + "loss": 0.223, + "step": 32456 + }, + { + "epoch": 2.6293745949449123, + "grad_norm": 0.06615759432315826, + "learning_rate": 7.614654124848102e-05, + "loss": 0.2758, + "step": 32457 + }, + { + "epoch": 2.629455605962411, + "grad_norm": 0.054171085357666016, + "learning_rate": 7.614204059588641e-05, + "loss": 0.1806, + "step": 32458 + }, + { + "epoch": 2.6295366169799093, + "grad_norm": 0.06245662271976471, + "learning_rate": 7.613753994329178e-05, + "loss": 0.2236, + "step": 32459 + }, + { + "epoch": 2.6296176279974075, + "grad_norm": 0.0623309463262558, + "learning_rate": 7.613303929069714e-05, 
+ "loss": 0.2411, + "step": 32460 + }, + { + "epoch": 2.629698639014906, + "grad_norm": 0.06058632954955101, + "learning_rate": 7.612853863810253e-05, + "loss": 0.256, + "step": 32461 + }, + { + "epoch": 2.6297796500324044, + "grad_norm": 0.06449336558580399, + "learning_rate": 7.61240379855079e-05, + "loss": 0.2124, + "step": 32462 + }, + { + "epoch": 2.6298606610499027, + "grad_norm": 0.06935933232307434, + "learning_rate": 7.611953733291327e-05, + "loss": 0.2831, + "step": 32463 + }, + { + "epoch": 2.6299416720674014, + "grad_norm": 0.060872264206409454, + "learning_rate": 7.611503668031865e-05, + "loss": 0.2145, + "step": 32464 + }, + { + "epoch": 2.6300226830848996, + "grad_norm": 0.07569092512130737, + "learning_rate": 7.611053602772403e-05, + "loss": 0.2138, + "step": 32465 + }, + { + "epoch": 2.630103694102398, + "grad_norm": 0.06525085866451263, + "learning_rate": 7.610603537512939e-05, + "loss": 0.2359, + "step": 32466 + }, + { + "epoch": 2.6301847051198965, + "grad_norm": 0.06427115201950073, + "learning_rate": 7.610153472253477e-05, + "loss": 0.2974, + "step": 32467 + }, + { + "epoch": 2.630265716137395, + "grad_norm": 0.05559571832418442, + "learning_rate": 7.609703406994015e-05, + "loss": 0.2354, + "step": 32468 + }, + { + "epoch": 2.630346727154893, + "grad_norm": 0.08632521331310272, + "learning_rate": 7.609253341734552e-05, + "loss": 0.2556, + "step": 32469 + }, + { + "epoch": 2.6304277381723913, + "grad_norm": 0.08800216764211655, + "learning_rate": 7.60880327647509e-05, + "loss": 0.2764, + "step": 32470 + }, + { + "epoch": 2.63050874918989, + "grad_norm": 0.04657886177301407, + "learning_rate": 7.608353211215627e-05, + "loss": 0.2174, + "step": 32471 + }, + { + "epoch": 2.630589760207388, + "grad_norm": 0.06629232317209244, + "learning_rate": 7.607903145956164e-05, + "loss": 0.2327, + "step": 32472 + }, + { + "epoch": 2.6306707712248865, + "grad_norm": 0.07411345094442368, + "learning_rate": 7.607453080696702e-05, + "loss": 0.2397, + "step": 32473 + }, + { + "epoch": 2.6307517822423847, + "grad_norm": 0.05961480364203453, + "learning_rate": 7.607003015437239e-05, + "loss": 0.205, + "step": 32474 + }, + { + "epoch": 2.6308327932598834, + "grad_norm": 0.07046563178300858, + "learning_rate": 7.606552950177776e-05, + "loss": 0.236, + "step": 32475 + }, + { + "epoch": 2.6309138042773816, + "grad_norm": 0.06287655979394913, + "learning_rate": 7.606102884918314e-05, + "loss": 0.2673, + "step": 32476 + }, + { + "epoch": 2.63099481529488, + "grad_norm": 0.05665929988026619, + "learning_rate": 7.605652819658851e-05, + "loss": 0.2287, + "step": 32477 + }, + { + "epoch": 2.6310758263123786, + "grad_norm": 0.07031681388616562, + "learning_rate": 7.605202754399388e-05, + "loss": 0.2574, + "step": 32478 + }, + { + "epoch": 2.631156837329877, + "grad_norm": 0.06546088308095932, + "learning_rate": 7.604752689139926e-05, + "loss": 0.2581, + "step": 32479 + }, + { + "epoch": 2.631237848347375, + "grad_norm": 0.0613400936126709, + "learning_rate": 7.604302623880463e-05, + "loss": 0.2315, + "step": 32480 + }, + { + "epoch": 2.6313188593648738, + "grad_norm": 0.06321492046117783, + "learning_rate": 7.603852558621e-05, + "loss": 0.2587, + "step": 32481 + }, + { + "epoch": 2.631399870382372, + "grad_norm": 0.06262946128845215, + "learning_rate": 7.603402493361538e-05, + "loss": 0.2344, + "step": 32482 + }, + { + "epoch": 2.6314808813998702, + "grad_norm": 0.08262698352336884, + "learning_rate": 7.602952428102075e-05, + "loss": 0.2682, + "step": 32483 + }, + { + "epoch": 2.631561892417369, + 
"grad_norm": 0.0711028203368187, + "learning_rate": 7.602502362842612e-05, + "loss": 0.2854, + "step": 32484 + }, + { + "epoch": 2.631642903434867, + "grad_norm": 0.07169201970100403, + "learning_rate": 7.60205229758315e-05, + "loss": 0.252, + "step": 32485 + }, + { + "epoch": 2.6317239144523654, + "grad_norm": 0.10206906497478485, + "learning_rate": 7.601602232323687e-05, + "loss": 0.3226, + "step": 32486 + }, + { + "epoch": 2.631804925469864, + "grad_norm": 0.05083823204040527, + "learning_rate": 7.601152167064225e-05, + "loss": 0.2138, + "step": 32487 + }, + { + "epoch": 2.6318859364873624, + "grad_norm": 0.07286400347948074, + "learning_rate": 7.600702101804762e-05, + "loss": 0.2506, + "step": 32488 + }, + { + "epoch": 2.6319669475048606, + "grad_norm": 0.06732335686683655, + "learning_rate": 7.600252036545299e-05, + "loss": 0.2105, + "step": 32489 + }, + { + "epoch": 2.6320479585223593, + "grad_norm": 0.06304507702589035, + "learning_rate": 7.599801971285837e-05, + "loss": 0.2874, + "step": 32490 + }, + { + "epoch": 2.6321289695398575, + "grad_norm": 0.06779453158378601, + "learning_rate": 7.599351906026374e-05, + "loss": 0.2219, + "step": 32491 + }, + { + "epoch": 2.6322099805573558, + "grad_norm": 0.07301854342222214, + "learning_rate": 7.598901840766911e-05, + "loss": 0.2083, + "step": 32492 + }, + { + "epoch": 2.632290991574854, + "grad_norm": 0.07681296765804291, + "learning_rate": 7.598451775507449e-05, + "loss": 0.2742, + "step": 32493 + }, + { + "epoch": 2.6323720025923527, + "grad_norm": 0.08773787319660187, + "learning_rate": 7.598001710247986e-05, + "loss": 0.2812, + "step": 32494 + }, + { + "epoch": 2.632453013609851, + "grad_norm": 0.07804632931947708, + "learning_rate": 7.597551644988525e-05, + "loss": 0.2324, + "step": 32495 + }, + { + "epoch": 2.632534024627349, + "grad_norm": 0.0674925297498703, + "learning_rate": 7.597101579729061e-05, + "loss": 0.2301, + "step": 32496 + }, + { + "epoch": 2.6326150356448474, + "grad_norm": 0.05840333551168442, + "learning_rate": 7.596651514469598e-05, + "loss": 0.2329, + "step": 32497 + }, + { + "epoch": 2.632696046662346, + "grad_norm": 0.06401608139276505, + "learning_rate": 7.596201449210137e-05, + "loss": 0.2633, + "step": 32498 + }, + { + "epoch": 2.6327770576798444, + "grad_norm": 0.0558314323425293, + "learning_rate": 7.595751383950673e-05, + "loss": 0.2179, + "step": 32499 + }, + { + "epoch": 2.6328580686973426, + "grad_norm": 0.06202941760420799, + "learning_rate": 7.59530131869121e-05, + "loss": 0.2683, + "step": 32500 + }, + { + "epoch": 2.6329390797148413, + "grad_norm": 0.06091583892703056, + "learning_rate": 7.594851253431749e-05, + "loss": 0.2597, + "step": 32501 + }, + { + "epoch": 2.6330200907323396, + "grad_norm": 0.08405400812625885, + "learning_rate": 7.594401188172285e-05, + "loss": 0.2353, + "step": 32502 + }, + { + "epoch": 2.633101101749838, + "grad_norm": 0.05751390382647514, + "learning_rate": 7.593951122912822e-05, + "loss": 0.2386, + "step": 32503 + }, + { + "epoch": 2.6331821127673365, + "grad_norm": 0.06506470590829849, + "learning_rate": 7.593501057653361e-05, + "loss": 0.2794, + "step": 32504 + }, + { + "epoch": 2.6332631237848347, + "grad_norm": 0.05713038891553879, + "learning_rate": 7.593050992393897e-05, + "loss": 0.2323, + "step": 32505 + }, + { + "epoch": 2.633344134802333, + "grad_norm": 0.058620527386665344, + "learning_rate": 7.592600927134434e-05, + "loss": 0.2063, + "step": 32506 + }, + { + "epoch": 2.6334251458198317, + "grad_norm": 0.06940052658319473, + "learning_rate": 
7.592150861874973e-05, + "loss": 0.2366, + "step": 32507 + }, + { + "epoch": 2.63350615683733, + "grad_norm": 0.07536128908395767, + "learning_rate": 7.591700796615509e-05, + "loss": 0.2548, + "step": 32508 + }, + { + "epoch": 2.633587167854828, + "grad_norm": 0.07242191582918167, + "learning_rate": 7.591250731356046e-05, + "loss": 0.2497, + "step": 32509 + }, + { + "epoch": 2.633668178872327, + "grad_norm": 0.08319424837827682, + "learning_rate": 7.590800666096585e-05, + "loss": 0.2839, + "step": 32510 + }, + { + "epoch": 2.633749189889825, + "grad_norm": 0.06494160741567612, + "learning_rate": 7.590350600837121e-05, + "loss": 0.2149, + "step": 32511 + }, + { + "epoch": 2.6338302009073233, + "grad_norm": 0.07588519901037216, + "learning_rate": 7.589900535577659e-05, + "loss": 0.2237, + "step": 32512 + }, + { + "epoch": 2.633911211924822, + "grad_norm": 0.07043095678091049, + "learning_rate": 7.589450470318197e-05, + "loss": 0.2427, + "step": 32513 + }, + { + "epoch": 2.6339922229423203, + "grad_norm": 0.05470437556505203, + "learning_rate": 7.589000405058733e-05, + "loss": 0.2254, + "step": 32514 + }, + { + "epoch": 2.6340732339598185, + "grad_norm": 0.06383230537176132, + "learning_rate": 7.58855033979927e-05, + "loss": 0.2294, + "step": 32515 + }, + { + "epoch": 2.6341542449773168, + "grad_norm": 0.06890767812728882, + "learning_rate": 7.58810027453981e-05, + "loss": 0.2546, + "step": 32516 + }, + { + "epoch": 2.6342352559948155, + "grad_norm": 0.06413356214761734, + "learning_rate": 7.587650209280345e-05, + "loss": 0.2194, + "step": 32517 + }, + { + "epoch": 2.6343162670123137, + "grad_norm": 0.07503025233745575, + "learning_rate": 7.587200144020883e-05, + "loss": 0.2763, + "step": 32518 + }, + { + "epoch": 2.634397278029812, + "grad_norm": 0.09024403989315033, + "learning_rate": 7.586750078761421e-05, + "loss": 0.2639, + "step": 32519 + }, + { + "epoch": 2.63447828904731, + "grad_norm": 0.08484242111444473, + "learning_rate": 7.586300013501957e-05, + "loss": 0.2422, + "step": 32520 + }, + { + "epoch": 2.634559300064809, + "grad_norm": 0.0671757161617279, + "learning_rate": 7.585849948242496e-05, + "loss": 0.2385, + "step": 32521 + }, + { + "epoch": 2.634640311082307, + "grad_norm": 0.0698087215423584, + "learning_rate": 7.585399882983034e-05, + "loss": 0.2244, + "step": 32522 + }, + { + "epoch": 2.6347213220998054, + "grad_norm": 0.058275751769542694, + "learning_rate": 7.58494981772357e-05, + "loss": 0.2251, + "step": 32523 + }, + { + "epoch": 2.634802333117304, + "grad_norm": 0.08362040668725967, + "learning_rate": 7.584499752464108e-05, + "loss": 0.2318, + "step": 32524 + }, + { + "epoch": 2.6348833441348023, + "grad_norm": 0.06042616814374924, + "learning_rate": 7.584049687204646e-05, + "loss": 0.2167, + "step": 32525 + }, + { + "epoch": 2.6349643551523005, + "grad_norm": 0.06875180453062057, + "learning_rate": 7.583599621945182e-05, + "loss": 0.2818, + "step": 32526 + }, + { + "epoch": 2.6350453661697992, + "grad_norm": 0.0656152069568634, + "learning_rate": 7.58314955668572e-05, + "loss": 0.2206, + "step": 32527 + }, + { + "epoch": 2.6351263771872975, + "grad_norm": 0.0642925277352333, + "learning_rate": 7.582699491426258e-05, + "loss": 0.2473, + "step": 32528 + }, + { + "epoch": 2.6352073882047957, + "grad_norm": 0.07825696468353271, + "learning_rate": 7.582249426166794e-05, + "loss": 0.2188, + "step": 32529 + }, + { + "epoch": 2.6352883992222944, + "grad_norm": 0.06980311870574951, + "learning_rate": 7.581799360907332e-05, + "loss": 0.2673, + "step": 32530 + }, + { + "epoch": 
2.6353694102397927, + "grad_norm": 0.06447041034698486, + "learning_rate": 7.58134929564787e-05, + "loss": 0.1968, + "step": 32531 + }, + { + "epoch": 2.635450421257291, + "grad_norm": 0.08126753568649292, + "learning_rate": 7.580899230388406e-05, + "loss": 0.2308, + "step": 32532 + }, + { + "epoch": 2.6355314322747896, + "grad_norm": 0.07112924009561539, + "learning_rate": 7.580449165128944e-05, + "loss": 0.2664, + "step": 32533 + }, + { + "epoch": 2.635612443292288, + "grad_norm": 0.08120989054441452, + "learning_rate": 7.579999099869482e-05, + "loss": 0.27, + "step": 32534 + }, + { + "epoch": 2.635693454309786, + "grad_norm": 0.06857681274414062, + "learning_rate": 7.579549034610018e-05, + "loss": 0.2216, + "step": 32535 + }, + { + "epoch": 2.6357744653272848, + "grad_norm": 0.06669405102729797, + "learning_rate": 7.579098969350557e-05, + "loss": 0.2048, + "step": 32536 + }, + { + "epoch": 2.635855476344783, + "grad_norm": 0.06707888841629028, + "learning_rate": 7.578648904091094e-05, + "loss": 0.2508, + "step": 32537 + }, + { + "epoch": 2.6359364873622813, + "grad_norm": 0.07921718806028366, + "learning_rate": 7.57819883883163e-05, + "loss": 0.2566, + "step": 32538 + }, + { + "epoch": 2.6360174983797795, + "grad_norm": 0.06119322031736374, + "learning_rate": 7.577748773572169e-05, + "loss": 0.245, + "step": 32539 + }, + { + "epoch": 2.636098509397278, + "grad_norm": 0.05891178548336029, + "learning_rate": 7.577298708312706e-05, + "loss": 0.2474, + "step": 32540 + }, + { + "epoch": 2.6361795204147764, + "grad_norm": 0.08063157647848129, + "learning_rate": 7.576848643053242e-05, + "loss": 0.2611, + "step": 32541 + }, + { + "epoch": 2.6362605314322747, + "grad_norm": 0.08956170082092285, + "learning_rate": 7.576398577793781e-05, + "loss": 0.2495, + "step": 32542 + }, + { + "epoch": 2.636341542449773, + "grad_norm": 0.06798410415649414, + "learning_rate": 7.575948512534318e-05, + "loss": 0.2467, + "step": 32543 + }, + { + "epoch": 2.6364225534672716, + "grad_norm": 0.05866193026304245, + "learning_rate": 7.575498447274854e-05, + "loss": 0.2177, + "step": 32544 + }, + { + "epoch": 2.63650356448477, + "grad_norm": 0.05509014427661896, + "learning_rate": 7.575048382015393e-05, + "loss": 0.2732, + "step": 32545 + }, + { + "epoch": 2.636584575502268, + "grad_norm": 0.07084442675113678, + "learning_rate": 7.57459831675593e-05, + "loss": 0.2132, + "step": 32546 + }, + { + "epoch": 2.636665586519767, + "grad_norm": 0.0648268610239029, + "learning_rate": 7.574148251496468e-05, + "loss": 0.2429, + "step": 32547 + }, + { + "epoch": 2.636746597537265, + "grad_norm": 0.061500899493694305, + "learning_rate": 7.573698186237005e-05, + "loss": 0.2264, + "step": 32548 + }, + { + "epoch": 2.6368276085547633, + "grad_norm": 0.05797162652015686, + "learning_rate": 7.573248120977542e-05, + "loss": 0.2345, + "step": 32549 + }, + { + "epoch": 2.636908619572262, + "grad_norm": 0.07138822972774506, + "learning_rate": 7.57279805571808e-05, + "loss": 0.2401, + "step": 32550 + }, + { + "epoch": 2.63698963058976, + "grad_norm": 0.06884890049695969, + "learning_rate": 7.572347990458617e-05, + "loss": 0.218, + "step": 32551 + }, + { + "epoch": 2.6370706416072585, + "grad_norm": 0.06728807836771011, + "learning_rate": 7.571897925199154e-05, + "loss": 0.2532, + "step": 32552 + }, + { + "epoch": 2.637151652624757, + "grad_norm": 0.06018717586994171, + "learning_rate": 7.571447859939692e-05, + "loss": 0.2437, + "step": 32553 + }, + { + "epoch": 2.6372326636422554, + "grad_norm": 0.05906055495142937, + "learning_rate": 
7.570997794680229e-05, + "loss": 0.2397, + "step": 32554 + }, + { + "epoch": 2.6373136746597536, + "grad_norm": 0.07531093806028366, + "learning_rate": 7.570547729420766e-05, + "loss": 0.2393, + "step": 32555 + }, + { + "epoch": 2.6373946856772523, + "grad_norm": 0.05766147002577782, + "learning_rate": 7.570097664161304e-05, + "loss": 0.2038, + "step": 32556 + }, + { + "epoch": 2.6374756966947506, + "grad_norm": 0.06611020117998123, + "learning_rate": 7.569647598901841e-05, + "loss": 0.2788, + "step": 32557 + }, + { + "epoch": 2.637556707712249, + "grad_norm": 0.06789632886648178, + "learning_rate": 7.569197533642378e-05, + "loss": 0.2282, + "step": 32558 + }, + { + "epoch": 2.6376377187297475, + "grad_norm": 0.06541197001934052, + "learning_rate": 7.568747468382916e-05, + "loss": 0.2074, + "step": 32559 + }, + { + "epoch": 2.6377187297472457, + "grad_norm": 0.05770771950483322, + "learning_rate": 7.568297403123453e-05, + "loss": 0.2686, + "step": 32560 + }, + { + "epoch": 2.637799740764744, + "grad_norm": 0.06224161759018898, + "learning_rate": 7.56784733786399e-05, + "loss": 0.2401, + "step": 32561 + }, + { + "epoch": 2.6378807517822422, + "grad_norm": 0.07410628348588943, + "learning_rate": 7.567397272604528e-05, + "loss": 0.2267, + "step": 32562 + }, + { + "epoch": 2.637961762799741, + "grad_norm": 0.07967900484800339, + "learning_rate": 7.566947207345065e-05, + "loss": 0.2519, + "step": 32563 + }, + { + "epoch": 2.638042773817239, + "grad_norm": 0.05484290421009064, + "learning_rate": 7.566497142085603e-05, + "loss": 0.2387, + "step": 32564 + }, + { + "epoch": 2.6381237848347374, + "grad_norm": 0.07121595740318298, + "learning_rate": 7.56604707682614e-05, + "loss": 0.2445, + "step": 32565 + }, + { + "epoch": 2.6382047958522357, + "grad_norm": 0.07594309002161026, + "learning_rate": 7.565597011566677e-05, + "loss": 0.2466, + "step": 32566 + }, + { + "epoch": 2.6382858068697344, + "grad_norm": 0.07039465010166168, + "learning_rate": 7.565146946307215e-05, + "loss": 0.2453, + "step": 32567 + }, + { + "epoch": 2.6383668178872326, + "grad_norm": 0.07094226777553558, + "learning_rate": 7.564696881047752e-05, + "loss": 0.2492, + "step": 32568 + }, + { + "epoch": 2.638447828904731, + "grad_norm": 0.06495281308889389, + "learning_rate": 7.56424681578829e-05, + "loss": 0.2191, + "step": 32569 + }, + { + "epoch": 2.6385288399222295, + "grad_norm": 0.0773189589381218, + "learning_rate": 7.563796750528827e-05, + "loss": 0.2458, + "step": 32570 + }, + { + "epoch": 2.6386098509397278, + "grad_norm": 0.06711572408676147, + "learning_rate": 7.563346685269364e-05, + "loss": 0.222, + "step": 32571 + }, + { + "epoch": 2.638690861957226, + "grad_norm": 0.07043211907148361, + "learning_rate": 7.562896620009902e-05, + "loss": 0.2116, + "step": 32572 + }, + { + "epoch": 2.6387718729747247, + "grad_norm": 0.07517684251070023, + "learning_rate": 7.56244655475044e-05, + "loss": 0.2516, + "step": 32573 + }, + { + "epoch": 2.638852883992223, + "grad_norm": 0.05948114022612572, + "learning_rate": 7.561996489490976e-05, + "loss": 0.2167, + "step": 32574 + }, + { + "epoch": 2.638933895009721, + "grad_norm": 0.07915988564491272, + "learning_rate": 7.561546424231514e-05, + "loss": 0.2525, + "step": 32575 + }, + { + "epoch": 2.63901490602722, + "grad_norm": 0.07200496643781662, + "learning_rate": 7.561096358972052e-05, + "loss": 0.2496, + "step": 32576 + }, + { + "epoch": 2.639095917044718, + "grad_norm": 0.06896743178367615, + "learning_rate": 7.560646293712588e-05, + "loss": 0.2254, + "step": 32577 + }, + { + 
"epoch": 2.6391769280622164, + "grad_norm": 0.08511435985565186, + "learning_rate": 7.560196228453126e-05, + "loss": 0.2259, + "step": 32578 + }, + { + "epoch": 2.639257939079715, + "grad_norm": 0.06475701183080673, + "learning_rate": 7.559746163193664e-05, + "loss": 0.2316, + "step": 32579 + }, + { + "epoch": 2.6393389500972133, + "grad_norm": 0.06055932492017746, + "learning_rate": 7.5592960979342e-05, + "loss": 0.2858, + "step": 32580 + }, + { + "epoch": 2.6394199611147116, + "grad_norm": 0.060425933450460434, + "learning_rate": 7.558846032674738e-05, + "loss": 0.2444, + "step": 32581 + }, + { + "epoch": 2.6395009721322102, + "grad_norm": 0.058254752308130264, + "learning_rate": 7.558395967415277e-05, + "loss": 0.2394, + "step": 32582 + }, + { + "epoch": 2.6395819831497085, + "grad_norm": 0.06561224162578583, + "learning_rate": 7.557945902155813e-05, + "loss": 0.2432, + "step": 32583 + }, + { + "epoch": 2.6396629941672067, + "grad_norm": 0.0659739077091217, + "learning_rate": 7.55749583689635e-05, + "loss": 0.2335, + "step": 32584 + }, + { + "epoch": 2.639744005184705, + "grad_norm": 0.06240279600024223, + "learning_rate": 7.557045771636889e-05, + "loss": 0.2339, + "step": 32585 + }, + { + "epoch": 2.6398250162022032, + "grad_norm": 0.07514799386262894, + "learning_rate": 7.556595706377425e-05, + "loss": 0.2695, + "step": 32586 + }, + { + "epoch": 2.639906027219702, + "grad_norm": 0.0739828571677208, + "learning_rate": 7.556145641117962e-05, + "loss": 0.2369, + "step": 32587 + }, + { + "epoch": 2.6399870382372, + "grad_norm": 0.0633767768740654, + "learning_rate": 7.5556955758585e-05, + "loss": 0.23, + "step": 32588 + }, + { + "epoch": 2.6400680492546984, + "grad_norm": 0.07802503556013107, + "learning_rate": 7.555245510599037e-05, + "loss": 0.3001, + "step": 32589 + }, + { + "epoch": 2.640149060272197, + "grad_norm": 0.06382672488689423, + "learning_rate": 7.554795445339574e-05, + "loss": 0.2588, + "step": 32590 + }, + { + "epoch": 2.6402300712896953, + "grad_norm": 0.06049477681517601, + "learning_rate": 7.554345380080113e-05, + "loss": 0.2287, + "step": 32591 + }, + { + "epoch": 2.6403110823071936, + "grad_norm": 0.06609281152486801, + "learning_rate": 7.553895314820649e-05, + "loss": 0.2418, + "step": 32592 + }, + { + "epoch": 2.6403920933246923, + "grad_norm": 0.06597542017698288, + "learning_rate": 7.553445249561186e-05, + "loss": 0.2306, + "step": 32593 + }, + { + "epoch": 2.6404731043421905, + "grad_norm": 0.07869094610214233, + "learning_rate": 7.552995184301725e-05, + "loss": 0.2537, + "step": 32594 + }, + { + "epoch": 2.6405541153596888, + "grad_norm": 0.06835228204727173, + "learning_rate": 7.552545119042261e-05, + "loss": 0.24, + "step": 32595 + }, + { + "epoch": 2.6406351263771874, + "grad_norm": 0.07356154918670654, + "learning_rate": 7.552095053782798e-05, + "loss": 0.2272, + "step": 32596 + }, + { + "epoch": 2.6407161373946857, + "grad_norm": 0.09468083083629608, + "learning_rate": 7.551644988523337e-05, + "loss": 0.2401, + "step": 32597 + }, + { + "epoch": 2.640797148412184, + "grad_norm": 0.05996778979897499, + "learning_rate": 7.551194923263873e-05, + "loss": 0.2278, + "step": 32598 + }, + { + "epoch": 2.6408781594296826, + "grad_norm": 0.07565630972385406, + "learning_rate": 7.55074485800441e-05, + "loss": 0.2634, + "step": 32599 + }, + { + "epoch": 2.640959170447181, + "grad_norm": 0.07954513281583786, + "learning_rate": 7.550294792744949e-05, + "loss": 0.2922, + "step": 32600 + }, + { + "epoch": 2.641040181464679, + "grad_norm": 0.07488039135932922, + 
"learning_rate": 7.549844727485485e-05, + "loss": 0.2688, + "step": 32601 + }, + { + "epoch": 2.641121192482178, + "grad_norm": 0.06334620714187622, + "learning_rate": 7.549394662226024e-05, + "loss": 0.2393, + "step": 32602 + }, + { + "epoch": 2.641202203499676, + "grad_norm": 0.0585486926138401, + "learning_rate": 7.548944596966561e-05, + "loss": 0.2279, + "step": 32603 + }, + { + "epoch": 2.6412832145171743, + "grad_norm": 0.08411995321512222, + "learning_rate": 7.548494531707097e-05, + "loss": 0.2395, + "step": 32604 + }, + { + "epoch": 2.641364225534673, + "grad_norm": 0.06781220436096191, + "learning_rate": 7.548044466447636e-05, + "loss": 0.238, + "step": 32605 + }, + { + "epoch": 2.6414452365521712, + "grad_norm": 0.07711216807365417, + "learning_rate": 7.547594401188173e-05, + "loss": 0.2428, + "step": 32606 + }, + { + "epoch": 2.6415262475696695, + "grad_norm": 0.060875892639160156, + "learning_rate": 7.547144335928709e-05, + "loss": 0.2344, + "step": 32607 + }, + { + "epoch": 2.6416072585871677, + "grad_norm": 0.05834164097905159, + "learning_rate": 7.546694270669248e-05, + "loss": 0.2612, + "step": 32608 + }, + { + "epoch": 2.641688269604666, + "grad_norm": 0.054436735808849335, + "learning_rate": 7.546244205409785e-05, + "loss": 0.2359, + "step": 32609 + }, + { + "epoch": 2.6417692806221647, + "grad_norm": 0.05471387878060341, + "learning_rate": 7.545794140150321e-05, + "loss": 0.2391, + "step": 32610 + }, + { + "epoch": 2.641850291639663, + "grad_norm": 0.07137314230203629, + "learning_rate": 7.54534407489086e-05, + "loss": 0.2723, + "step": 32611 + }, + { + "epoch": 2.641931302657161, + "grad_norm": 0.056198038160800934, + "learning_rate": 7.544894009631397e-05, + "loss": 0.2534, + "step": 32612 + }, + { + "epoch": 2.64201231367466, + "grad_norm": 0.06524398922920227, + "learning_rate": 7.544443944371933e-05, + "loss": 0.2166, + "step": 32613 + }, + { + "epoch": 2.642093324692158, + "grad_norm": 0.07286935299634933, + "learning_rate": 7.543993879112472e-05, + "loss": 0.2192, + "step": 32614 + }, + { + "epoch": 2.6421743357096563, + "grad_norm": 0.07484481483697891, + "learning_rate": 7.54354381385301e-05, + "loss": 0.2274, + "step": 32615 + }, + { + "epoch": 2.642255346727155, + "grad_norm": 0.06895321607589722, + "learning_rate": 7.543093748593545e-05, + "loss": 0.2625, + "step": 32616 + }, + { + "epoch": 2.6423363577446533, + "grad_norm": 0.05788419023156166, + "learning_rate": 7.542643683334084e-05, + "loss": 0.2183, + "step": 32617 + }, + { + "epoch": 2.6424173687621515, + "grad_norm": 0.06298904120922089, + "learning_rate": 7.542193618074621e-05, + "loss": 0.243, + "step": 32618 + }, + { + "epoch": 2.64249837977965, + "grad_norm": 0.09361536800861359, + "learning_rate": 7.541743552815157e-05, + "loss": 0.2904, + "step": 32619 + }, + { + "epoch": 2.6425793907971484, + "grad_norm": 0.06291305273771286, + "learning_rate": 7.541293487555696e-05, + "loss": 0.2298, + "step": 32620 + }, + { + "epoch": 2.6426604018146467, + "grad_norm": 0.07938309013843536, + "learning_rate": 7.540843422296234e-05, + "loss": 0.2714, + "step": 32621 + }, + { + "epoch": 2.6427414128321454, + "grad_norm": 0.06279677897691727, + "learning_rate": 7.54039335703677e-05, + "loss": 0.2719, + "step": 32622 + }, + { + "epoch": 2.6428224238496436, + "grad_norm": 0.05711670219898224, + "learning_rate": 7.539943291777308e-05, + "loss": 0.2207, + "step": 32623 + }, + { + "epoch": 2.642903434867142, + "grad_norm": 0.0746513158082962, + "learning_rate": 7.539493226517846e-05, + "loss": 0.2689, + "step": 32624 + 
}, + { + "epoch": 2.6429844458846405, + "grad_norm": 0.06141054257750511, + "learning_rate": 7.539043161258382e-05, + "loss": 0.2542, + "step": 32625 + }, + { + "epoch": 2.643065456902139, + "grad_norm": 0.04914986714720726, + "learning_rate": 7.53859309599892e-05, + "loss": 0.1971, + "step": 32626 + }, + { + "epoch": 2.643146467919637, + "grad_norm": 0.06171542406082153, + "learning_rate": 7.538143030739458e-05, + "loss": 0.2379, + "step": 32627 + }, + { + "epoch": 2.6432274789371357, + "grad_norm": 0.058278243988752365, + "learning_rate": 7.537692965479995e-05, + "loss": 0.2454, + "step": 32628 + }, + { + "epoch": 2.643308489954634, + "grad_norm": 0.06110624596476555, + "learning_rate": 7.537242900220532e-05, + "loss": 0.2411, + "step": 32629 + }, + { + "epoch": 2.643389500972132, + "grad_norm": 0.0745861604809761, + "learning_rate": 7.53679283496107e-05, + "loss": 0.2381, + "step": 32630 + }, + { + "epoch": 2.6434705119896305, + "grad_norm": 0.06847658008337021, + "learning_rate": 7.536342769701607e-05, + "loss": 0.2489, + "step": 32631 + }, + { + "epoch": 2.6435515230071287, + "grad_norm": 0.07420511543750763, + "learning_rate": 7.535892704442145e-05, + "loss": 0.2661, + "step": 32632 + }, + { + "epoch": 2.6436325340246274, + "grad_norm": 0.0696389377117157, + "learning_rate": 7.535442639182682e-05, + "loss": 0.2802, + "step": 32633 + }, + { + "epoch": 2.6437135450421256, + "grad_norm": 0.05882401019334793, + "learning_rate": 7.534992573923219e-05, + "loss": 0.2239, + "step": 32634 + }, + { + "epoch": 2.643794556059624, + "grad_norm": 0.08121724426746368, + "learning_rate": 7.534542508663757e-05, + "loss": 0.2467, + "step": 32635 + }, + { + "epoch": 2.6438755670771226, + "grad_norm": 0.07373958826065063, + "learning_rate": 7.534092443404294e-05, + "loss": 0.2181, + "step": 32636 + }, + { + "epoch": 2.643956578094621, + "grad_norm": 0.06903345882892609, + "learning_rate": 7.533642378144831e-05, + "loss": 0.2287, + "step": 32637 + }, + { + "epoch": 2.644037589112119, + "grad_norm": 0.05462869256734848, + "learning_rate": 7.533192312885369e-05, + "loss": 0.2376, + "step": 32638 + }, + { + "epoch": 2.6441186001296177, + "grad_norm": 0.08030558377504349, + "learning_rate": 7.532742247625906e-05, + "loss": 0.2477, + "step": 32639 + }, + { + "epoch": 2.644199611147116, + "grad_norm": 0.06077692657709122, + "learning_rate": 7.532292182366443e-05, + "loss": 0.2466, + "step": 32640 + }, + { + "epoch": 2.6442806221646142, + "grad_norm": 0.07113752514123917, + "learning_rate": 7.531842117106981e-05, + "loss": 0.1929, + "step": 32641 + }, + { + "epoch": 2.644361633182113, + "grad_norm": 0.07391038537025452, + "learning_rate": 7.531392051847518e-05, + "loss": 0.2271, + "step": 32642 + }, + { + "epoch": 2.644442644199611, + "grad_norm": 0.06465182453393936, + "learning_rate": 7.530941986588055e-05, + "loss": 0.2248, + "step": 32643 + }, + { + "epoch": 2.6445236552171094, + "grad_norm": 0.0905710756778717, + "learning_rate": 7.530491921328593e-05, + "loss": 0.2302, + "step": 32644 + }, + { + "epoch": 2.644604666234608, + "grad_norm": 0.06979474425315857, + "learning_rate": 7.53004185606913e-05, + "loss": 0.257, + "step": 32645 + }, + { + "epoch": 2.6446856772521063, + "grad_norm": 0.09235589951276779, + "learning_rate": 7.529591790809668e-05, + "loss": 0.2435, + "step": 32646 + }, + { + "epoch": 2.6447666882696046, + "grad_norm": 0.0746222734451294, + "learning_rate": 7.529141725550205e-05, + "loss": 0.22, + "step": 32647 + }, + { + "epoch": 2.6448476992871033, + "grad_norm": 0.06662612408399582, + 
"learning_rate": 7.528691660290742e-05, + "loss": 0.282, + "step": 32648 + }, + { + "epoch": 2.6449287103046015, + "grad_norm": 0.07575134187936783, + "learning_rate": 7.52824159503128e-05, + "loss": 0.2775, + "step": 32649 + }, + { + "epoch": 2.6450097213220998, + "grad_norm": 0.06182150915265083, + "learning_rate": 7.527791529771817e-05, + "loss": 0.2392, + "step": 32650 + }, + { + "epoch": 2.6450907323395985, + "grad_norm": 0.05878112465143204, + "learning_rate": 7.527341464512354e-05, + "loss": 0.2584, + "step": 32651 + }, + { + "epoch": 2.6451717433570967, + "grad_norm": 0.06054788455367088, + "learning_rate": 7.526891399252892e-05, + "loss": 0.2283, + "step": 32652 + }, + { + "epoch": 2.645252754374595, + "grad_norm": 0.06147018447518349, + "learning_rate": 7.526441333993429e-05, + "loss": 0.2315, + "step": 32653 + }, + { + "epoch": 2.645333765392093, + "grad_norm": 0.07113911956548691, + "learning_rate": 7.525991268733968e-05, + "loss": 0.2514, + "step": 32654 + }, + { + "epoch": 2.6454147764095914, + "grad_norm": 0.07609254866838455, + "learning_rate": 7.525541203474504e-05, + "loss": 0.2986, + "step": 32655 + }, + { + "epoch": 2.64549578742709, + "grad_norm": 0.0673244446516037, + "learning_rate": 7.525091138215041e-05, + "loss": 0.2681, + "step": 32656 + }, + { + "epoch": 2.6455767984445884, + "grad_norm": 0.07759573310613632, + "learning_rate": 7.52464107295558e-05, + "loss": 0.2694, + "step": 32657 + }, + { + "epoch": 2.6456578094620866, + "grad_norm": 0.05810312554240227, + "learning_rate": 7.524191007696116e-05, + "loss": 0.2054, + "step": 32658 + }, + { + "epoch": 2.6457388204795853, + "grad_norm": 0.06164870783686638, + "learning_rate": 7.523740942436653e-05, + "loss": 0.2628, + "step": 32659 + }, + { + "epoch": 2.6458198314970836, + "grad_norm": 0.06307579576969147, + "learning_rate": 7.523290877177192e-05, + "loss": 0.2479, + "step": 32660 + }, + { + "epoch": 2.645900842514582, + "grad_norm": 0.06636112183332443, + "learning_rate": 7.522840811917728e-05, + "loss": 0.2731, + "step": 32661 + }, + { + "epoch": 2.6459818535320805, + "grad_norm": 0.08152161538600922, + "learning_rate": 7.522390746658265e-05, + "loss": 0.2277, + "step": 32662 + }, + { + "epoch": 2.6460628645495787, + "grad_norm": 0.06108549237251282, + "learning_rate": 7.521940681398804e-05, + "loss": 0.2522, + "step": 32663 + }, + { + "epoch": 2.646143875567077, + "grad_norm": 0.06697442382574081, + "learning_rate": 7.52149061613934e-05, + "loss": 0.2561, + "step": 32664 + }, + { + "epoch": 2.6462248865845757, + "grad_norm": 0.0735754519701004, + "learning_rate": 7.521040550879877e-05, + "loss": 0.197, + "step": 32665 + }, + { + "epoch": 2.646305897602074, + "grad_norm": 0.07068278640508652, + "learning_rate": 7.520590485620416e-05, + "loss": 0.2477, + "step": 32666 + }, + { + "epoch": 2.646386908619572, + "grad_norm": 0.07574401050806046, + "learning_rate": 7.520140420360952e-05, + "loss": 0.2381, + "step": 32667 + }, + { + "epoch": 2.646467919637071, + "grad_norm": 0.05677516758441925, + "learning_rate": 7.51969035510149e-05, + "loss": 0.2326, + "step": 32668 + }, + { + "epoch": 2.646548930654569, + "grad_norm": 0.0748014971613884, + "learning_rate": 7.519240289842028e-05, + "loss": 0.2259, + "step": 32669 + }, + { + "epoch": 2.6466299416720673, + "grad_norm": 0.07873434573411942, + "learning_rate": 7.518790224582564e-05, + "loss": 0.245, + "step": 32670 + }, + { + "epoch": 2.646710952689566, + "grad_norm": 0.06972617655992508, + "learning_rate": 7.518340159323102e-05, + "loss": 0.2302, + "step": 32671 + }, 
+ { + "epoch": 2.6467919637070643, + "grad_norm": 0.07274175435304642, + "learning_rate": 7.51789009406364e-05, + "loss": 0.2413, + "step": 32672 + }, + { + "epoch": 2.6468729747245625, + "grad_norm": 0.07968560606241226, + "learning_rate": 7.517440028804176e-05, + "loss": 0.2227, + "step": 32673 + }, + { + "epoch": 2.6469539857420608, + "grad_norm": 0.07307036966085434, + "learning_rate": 7.516989963544714e-05, + "loss": 0.2334, + "step": 32674 + }, + { + "epoch": 2.6470349967595594, + "grad_norm": 0.07144438475370407, + "learning_rate": 7.516539898285252e-05, + "loss": 0.2635, + "step": 32675 + }, + { + "epoch": 2.6471160077770577, + "grad_norm": 0.0709671601653099, + "learning_rate": 7.516089833025788e-05, + "loss": 0.2459, + "step": 32676 + }, + { + "epoch": 2.647197018794556, + "grad_norm": 0.07104463130235672, + "learning_rate": 7.515639767766326e-05, + "loss": 0.26, + "step": 32677 + }, + { + "epoch": 2.647278029812054, + "grad_norm": 0.0711294636130333, + "learning_rate": 7.515189702506864e-05, + "loss": 0.2542, + "step": 32678 + }, + { + "epoch": 2.647359040829553, + "grad_norm": 0.07094048708677292, + "learning_rate": 7.5147396372474e-05, + "loss": 0.2864, + "step": 32679 + }, + { + "epoch": 2.647440051847051, + "grad_norm": 0.07097669690847397, + "learning_rate": 7.514289571987939e-05, + "loss": 0.2434, + "step": 32680 + }, + { + "epoch": 2.6475210628645494, + "grad_norm": 0.0773238092660904, + "learning_rate": 7.513839506728477e-05, + "loss": 0.2633, + "step": 32681 + }, + { + "epoch": 2.647602073882048, + "grad_norm": 0.061735138297080994, + "learning_rate": 7.513389441469013e-05, + "loss": 0.2205, + "step": 32682 + }, + { + "epoch": 2.6476830848995463, + "grad_norm": 0.08001622557640076, + "learning_rate": 7.512939376209551e-05, + "loss": 0.2697, + "step": 32683 + }, + { + "epoch": 2.6477640959170445, + "grad_norm": 0.05978796258568764, + "learning_rate": 7.512489310950089e-05, + "loss": 0.2402, + "step": 32684 + }, + { + "epoch": 2.6478451069345432, + "grad_norm": 0.06869897246360779, + "learning_rate": 7.512039245690625e-05, + "loss": 0.2341, + "step": 32685 + }, + { + "epoch": 2.6479261179520415, + "grad_norm": 0.07840482890605927, + "learning_rate": 7.511589180431163e-05, + "loss": 0.2416, + "step": 32686 + }, + { + "epoch": 2.6480071289695397, + "grad_norm": 0.08068672567605972, + "learning_rate": 7.511139115171701e-05, + "loss": 0.2327, + "step": 32687 + }, + { + "epoch": 2.6480881399870384, + "grad_norm": 0.0856936126947403, + "learning_rate": 7.510689049912237e-05, + "loss": 0.2434, + "step": 32688 + }, + { + "epoch": 2.6481691510045366, + "grad_norm": 0.06403835862874985, + "learning_rate": 7.510238984652775e-05, + "loss": 0.2453, + "step": 32689 + }, + { + "epoch": 2.648250162022035, + "grad_norm": 0.05858549848198891, + "learning_rate": 7.509788919393313e-05, + "loss": 0.2318, + "step": 32690 + }, + { + "epoch": 2.6483311730395336, + "grad_norm": 0.0913025438785553, + "learning_rate": 7.509338854133849e-05, + "loss": 0.2923, + "step": 32691 + }, + { + "epoch": 2.648412184057032, + "grad_norm": 0.05180613324046135, + "learning_rate": 7.508888788874387e-05, + "loss": 0.211, + "step": 32692 + }, + { + "epoch": 2.64849319507453, + "grad_norm": 0.06905458867549896, + "learning_rate": 7.508438723614925e-05, + "loss": 0.2236, + "step": 32693 + }, + { + "epoch": 2.6485742060920288, + "grad_norm": 0.07221891731023788, + "learning_rate": 7.507988658355461e-05, + "loss": 0.2228, + "step": 32694 + }, + { + "epoch": 2.648655217109527, + "grad_norm": 0.07029981166124344, + 
"learning_rate": 7.507538593096e-05, + "loss": 0.288, + "step": 32695 + }, + { + "epoch": 2.6487362281270252, + "grad_norm": 0.07628034800291061, + "learning_rate": 7.507088527836537e-05, + "loss": 0.2504, + "step": 32696 + }, + { + "epoch": 2.6488172391445235, + "grad_norm": 0.0622580461204052, + "learning_rate": 7.506638462577073e-05, + "loss": 0.2462, + "step": 32697 + }, + { + "epoch": 2.648898250162022, + "grad_norm": 0.06469859927892685, + "learning_rate": 7.506188397317612e-05, + "loss": 0.238, + "step": 32698 + }, + { + "epoch": 2.6489792611795204, + "grad_norm": 0.09067538380622864, + "learning_rate": 7.505738332058149e-05, + "loss": 0.2341, + "step": 32699 + }, + { + "epoch": 2.6490602721970187, + "grad_norm": 0.062475938349962234, + "learning_rate": 7.505288266798685e-05, + "loss": 0.2527, + "step": 32700 + }, + { + "epoch": 2.649141283214517, + "grad_norm": 0.08265208452939987, + "learning_rate": 7.504838201539224e-05, + "loss": 0.2583, + "step": 32701 + }, + { + "epoch": 2.6492222942320156, + "grad_norm": 0.0584041066467762, + "learning_rate": 7.504388136279761e-05, + "loss": 0.2062, + "step": 32702 + }, + { + "epoch": 2.649303305249514, + "grad_norm": 0.06874685734510422, + "learning_rate": 7.503938071020297e-05, + "loss": 0.3045, + "step": 32703 + }, + { + "epoch": 2.649384316267012, + "grad_norm": 0.08429435640573502, + "learning_rate": 7.503488005760836e-05, + "loss": 0.246, + "step": 32704 + }, + { + "epoch": 2.649465327284511, + "grad_norm": 0.0826609656214714, + "learning_rate": 7.503037940501373e-05, + "loss": 0.2725, + "step": 32705 + }, + { + "epoch": 2.649546338302009, + "grad_norm": 0.06336696445941925, + "learning_rate": 7.50258787524191e-05, + "loss": 0.2564, + "step": 32706 + }, + { + "epoch": 2.6496273493195073, + "grad_norm": 0.06022218242287636, + "learning_rate": 7.502137809982448e-05, + "loss": 0.2375, + "step": 32707 + }, + { + "epoch": 2.649708360337006, + "grad_norm": 0.07887089252471924, + "learning_rate": 7.501687744722985e-05, + "loss": 0.2879, + "step": 32708 + }, + { + "epoch": 2.649789371354504, + "grad_norm": 0.06073421984910965, + "learning_rate": 7.501237679463523e-05, + "loss": 0.254, + "step": 32709 + }, + { + "epoch": 2.6498703823720025, + "grad_norm": 0.06496462225914001, + "learning_rate": 7.50078761420406e-05, + "loss": 0.2568, + "step": 32710 + }, + { + "epoch": 2.649951393389501, + "grad_norm": 0.060677289962768555, + "learning_rate": 7.500337548944597e-05, + "loss": 0.2228, + "step": 32711 + }, + { + "epoch": 2.6500324044069994, + "grad_norm": 0.06383208930492401, + "learning_rate": 7.499887483685135e-05, + "loss": 0.2338, + "step": 32712 + }, + { + "epoch": 2.6501134154244976, + "grad_norm": 0.059448208659887314, + "learning_rate": 7.499437418425672e-05, + "loss": 0.2227, + "step": 32713 + }, + { + "epoch": 2.6501944264419963, + "grad_norm": 0.06204259768128395, + "learning_rate": 7.49898735316621e-05, + "loss": 0.2526, + "step": 32714 + }, + { + "epoch": 2.6502754374594946, + "grad_norm": 0.051161028444767, + "learning_rate": 7.498537287906747e-05, + "loss": 0.2497, + "step": 32715 + }, + { + "epoch": 2.650356448476993, + "grad_norm": 0.07101891189813614, + "learning_rate": 7.498087222647284e-05, + "loss": 0.2082, + "step": 32716 + }, + { + "epoch": 2.6504374594944915, + "grad_norm": 0.05326693132519722, + "learning_rate": 7.497637157387822e-05, + "loss": 0.219, + "step": 32717 + }, + { + "epoch": 2.6505184705119897, + "grad_norm": 0.06772123277187347, + "learning_rate": 7.497187092128359e-05, + "loss": 0.263, + "step": 32718 + }, + { 
+ "epoch": 2.650599481529488, + "grad_norm": 0.06042374670505524, + "learning_rate": 7.496737026868896e-05, + "loss": 0.2462, + "step": 32719 + }, + { + "epoch": 2.6506804925469862, + "grad_norm": 0.06658099591732025, + "learning_rate": 7.496286961609434e-05, + "loss": 0.2341, + "step": 32720 + }, + { + "epoch": 2.650761503564485, + "grad_norm": 0.0610867403447628, + "learning_rate": 7.495836896349971e-05, + "loss": 0.2434, + "step": 32721 + }, + { + "epoch": 2.650842514581983, + "grad_norm": 0.06181452050805092, + "learning_rate": 7.495386831090508e-05, + "loss": 0.2349, + "step": 32722 + }, + { + "epoch": 2.6509235255994814, + "grad_norm": 0.06832627952098846, + "learning_rate": 7.494936765831046e-05, + "loss": 0.2463, + "step": 32723 + }, + { + "epoch": 2.6510045366169797, + "grad_norm": 0.06687076389789581, + "learning_rate": 7.494486700571583e-05, + "loss": 0.245, + "step": 32724 + }, + { + "epoch": 2.6510855476344783, + "grad_norm": 0.073924221098423, + "learning_rate": 7.49403663531212e-05, + "loss": 0.2394, + "step": 32725 + }, + { + "epoch": 2.6511665586519766, + "grad_norm": 0.06764573603868484, + "learning_rate": 7.493586570052658e-05, + "loss": 0.268, + "step": 32726 + }, + { + "epoch": 2.651247569669475, + "grad_norm": 0.056262798607349396, + "learning_rate": 7.493136504793195e-05, + "loss": 0.1928, + "step": 32727 + }, + { + "epoch": 2.6513285806869735, + "grad_norm": 0.08186808228492737, + "learning_rate": 7.492686439533732e-05, + "loss": 0.2306, + "step": 32728 + }, + { + "epoch": 2.6514095917044718, + "grad_norm": 0.06322402507066727, + "learning_rate": 7.49223637427427e-05, + "loss": 0.1993, + "step": 32729 + }, + { + "epoch": 2.65149060272197, + "grad_norm": 0.06521429866552353, + "learning_rate": 7.491786309014807e-05, + "loss": 0.2408, + "step": 32730 + }, + { + "epoch": 2.6515716137394687, + "grad_norm": 0.06778412312269211, + "learning_rate": 7.491336243755345e-05, + "loss": 0.2515, + "step": 32731 + }, + { + "epoch": 2.651652624756967, + "grad_norm": 0.05836867913603783, + "learning_rate": 7.490886178495883e-05, + "loss": 0.2386, + "step": 32732 + }, + { + "epoch": 2.651733635774465, + "grad_norm": 0.07563242316246033, + "learning_rate": 7.490436113236419e-05, + "loss": 0.2285, + "step": 32733 + }, + { + "epoch": 2.651814646791964, + "grad_norm": 0.07448039948940277, + "learning_rate": 7.489986047976957e-05, + "loss": 0.2491, + "step": 32734 + }, + { + "epoch": 2.651895657809462, + "grad_norm": 0.07622935622930527, + "learning_rate": 7.489535982717495e-05, + "loss": 0.2075, + "step": 32735 + }, + { + "epoch": 2.6519766688269604, + "grad_norm": 0.07185497879981995, + "learning_rate": 7.489085917458031e-05, + "loss": 0.2174, + "step": 32736 + }, + { + "epoch": 2.652057679844459, + "grad_norm": 0.06740206480026245, + "learning_rate": 7.488635852198569e-05, + "loss": 0.2716, + "step": 32737 + }, + { + "epoch": 2.6521386908619573, + "grad_norm": 0.07669515907764435, + "learning_rate": 7.488185786939107e-05, + "loss": 0.2235, + "step": 32738 + }, + { + "epoch": 2.6522197018794555, + "grad_norm": 0.0658188983798027, + "learning_rate": 7.487735721679643e-05, + "loss": 0.2752, + "step": 32739 + }, + { + "epoch": 2.6523007128969542, + "grad_norm": 0.07216110080480576, + "learning_rate": 7.487285656420181e-05, + "loss": 0.2269, + "step": 32740 + }, + { + "epoch": 2.6523817239144525, + "grad_norm": 0.059288520365953445, + "learning_rate": 7.48683559116072e-05, + "loss": 0.2627, + "step": 32741 + }, + { + "epoch": 2.6524627349319507, + "grad_norm": 0.0678819864988327, + 
"learning_rate": 7.486385525901256e-05, + "loss": 0.2596, + "step": 32742 + }, + { + "epoch": 2.652543745949449, + "grad_norm": 0.062606081366539, + "learning_rate": 7.485935460641793e-05, + "loss": 0.2459, + "step": 32743 + }, + { + "epoch": 2.6526247569669477, + "grad_norm": 0.07655350863933563, + "learning_rate": 7.485485395382332e-05, + "loss": 0.2138, + "step": 32744 + }, + { + "epoch": 2.652705767984446, + "grad_norm": 0.06734804064035416, + "learning_rate": 7.485035330122868e-05, + "loss": 0.2406, + "step": 32745 + }, + { + "epoch": 2.652786779001944, + "grad_norm": 0.07340921461582184, + "learning_rate": 7.484585264863405e-05, + "loss": 0.2493, + "step": 32746 + }, + { + "epoch": 2.6528677900194424, + "grad_norm": 0.06226073205471039, + "learning_rate": 7.484135199603944e-05, + "loss": 0.22, + "step": 32747 + }, + { + "epoch": 2.652948801036941, + "grad_norm": 0.07011928409337997, + "learning_rate": 7.48368513434448e-05, + "loss": 0.2151, + "step": 32748 + }, + { + "epoch": 2.6530298120544393, + "grad_norm": 0.06465835869312286, + "learning_rate": 7.483235069085017e-05, + "loss": 0.2576, + "step": 32749 + }, + { + "epoch": 2.6531108230719376, + "grad_norm": 0.05908047780394554, + "learning_rate": 7.482785003825556e-05, + "loss": 0.2277, + "step": 32750 + }, + { + "epoch": 2.6531918340894363, + "grad_norm": 0.07969486713409424, + "learning_rate": 7.482334938566092e-05, + "loss": 0.2495, + "step": 32751 + }, + { + "epoch": 2.6532728451069345, + "grad_norm": 0.08361916989088058, + "learning_rate": 7.481884873306629e-05, + "loss": 0.2756, + "step": 32752 + }, + { + "epoch": 2.6533538561244328, + "grad_norm": 0.06488799303770065, + "learning_rate": 7.481434808047168e-05, + "loss": 0.2179, + "step": 32753 + }, + { + "epoch": 2.6534348671419314, + "grad_norm": 0.06097380816936493, + "learning_rate": 7.480984742787704e-05, + "loss": 0.2414, + "step": 32754 + }, + { + "epoch": 2.6535158781594297, + "grad_norm": 0.06575482338666916, + "learning_rate": 7.480534677528241e-05, + "loss": 0.2405, + "step": 32755 + }, + { + "epoch": 2.653596889176928, + "grad_norm": 0.07554206252098083, + "learning_rate": 7.48008461226878e-05, + "loss": 0.2275, + "step": 32756 + }, + { + "epoch": 2.6536779001944266, + "grad_norm": 0.06285907328128815, + "learning_rate": 7.479634547009316e-05, + "loss": 0.2471, + "step": 32757 + }, + { + "epoch": 2.653758911211925, + "grad_norm": 0.07046081870794296, + "learning_rate": 7.479184481749853e-05, + "loss": 0.2722, + "step": 32758 + }, + { + "epoch": 2.653839922229423, + "grad_norm": 0.060215841978788376, + "learning_rate": 7.478734416490392e-05, + "loss": 0.255, + "step": 32759 + }, + { + "epoch": 2.653920933246922, + "grad_norm": 0.07125762850046158, + "learning_rate": 7.478284351230928e-05, + "loss": 0.2296, + "step": 32760 + }, + { + "epoch": 2.65400194426442, + "grad_norm": 0.062798410654068, + "learning_rate": 7.477834285971467e-05, + "loss": 0.2356, + "step": 32761 + }, + { + "epoch": 2.6540829552819183, + "grad_norm": 0.06494462490081787, + "learning_rate": 7.477384220712004e-05, + "loss": 0.2589, + "step": 32762 + }, + { + "epoch": 2.654163966299417, + "grad_norm": 0.05592583492398262, + "learning_rate": 7.47693415545254e-05, + "loss": 0.2291, + "step": 32763 + }, + { + "epoch": 2.654244977316915, + "grad_norm": 0.060103777796030045, + "learning_rate": 7.476484090193079e-05, + "loss": 0.2522, + "step": 32764 + }, + { + "epoch": 2.6543259883344135, + "grad_norm": 0.05874945595860481, + "learning_rate": 7.476034024933616e-05, + "loss": 0.2365, + "step": 32765 + }, 
+ { + "epoch": 2.6544069993519117, + "grad_norm": 0.07520363479852676, + "learning_rate": 7.475583959674152e-05, + "loss": 0.2695, + "step": 32766 + }, + { + "epoch": 2.6544880103694104, + "grad_norm": 0.06662902981042862, + "learning_rate": 7.475133894414691e-05, + "loss": 0.285, + "step": 32767 + }, + { + "epoch": 2.6545690213869086, + "grad_norm": 0.06352225691080093, + "learning_rate": 7.474683829155228e-05, + "loss": 0.2239, + "step": 32768 + }, + { + "epoch": 2.654650032404407, + "grad_norm": 0.061932697892189026, + "learning_rate": 7.474233763895764e-05, + "loss": 0.2295, + "step": 32769 + }, + { + "epoch": 2.654731043421905, + "grad_norm": 0.08426105231046677, + "learning_rate": 7.473783698636303e-05, + "loss": 0.2352, + "step": 32770 + }, + { + "epoch": 2.654812054439404, + "grad_norm": 0.06787164509296417, + "learning_rate": 7.47333363337684e-05, + "loss": 0.2387, + "step": 32771 + }, + { + "epoch": 2.654893065456902, + "grad_norm": 0.07294077426195145, + "learning_rate": 7.472883568117378e-05, + "loss": 0.2472, + "step": 32772 + }, + { + "epoch": 2.6549740764744003, + "grad_norm": 0.06752139329910278, + "learning_rate": 7.472433502857915e-05, + "loss": 0.2441, + "step": 32773 + }, + { + "epoch": 2.655055087491899, + "grad_norm": 0.07813578844070435, + "learning_rate": 7.471983437598452e-05, + "loss": 0.2892, + "step": 32774 + }, + { + "epoch": 2.6551360985093972, + "grad_norm": 0.06288386881351471, + "learning_rate": 7.47153337233899e-05, + "loss": 0.2494, + "step": 32775 + }, + { + "epoch": 2.6552171095268955, + "grad_norm": 0.06610371172428131, + "learning_rate": 7.471083307079527e-05, + "loss": 0.2302, + "step": 32776 + }, + { + "epoch": 2.655298120544394, + "grad_norm": 0.0636439397931099, + "learning_rate": 7.470633241820064e-05, + "loss": 0.2547, + "step": 32777 + }, + { + "epoch": 2.6553791315618924, + "grad_norm": 0.05692015960812569, + "learning_rate": 7.470183176560602e-05, + "loss": 0.2121, + "step": 32778 + }, + { + "epoch": 2.6554601425793907, + "grad_norm": 0.06695279479026794, + "learning_rate": 7.469733111301139e-05, + "loss": 0.2443, + "step": 32779 + }, + { + "epoch": 2.6555411535968894, + "grad_norm": 0.05734477564692497, + "learning_rate": 7.469283046041677e-05, + "loss": 0.2494, + "step": 32780 + }, + { + "epoch": 2.6556221646143876, + "grad_norm": 0.07733457535505295, + "learning_rate": 7.468832980782214e-05, + "loss": 0.2526, + "step": 32781 + }, + { + "epoch": 2.655703175631886, + "grad_norm": 0.07286423444747925, + "learning_rate": 7.468382915522751e-05, + "loss": 0.2524, + "step": 32782 + }, + { + "epoch": 2.6557841866493845, + "grad_norm": 0.05612710863351822, + "learning_rate": 7.467932850263289e-05, + "loss": 0.2019, + "step": 32783 + }, + { + "epoch": 2.655865197666883, + "grad_norm": 0.06247418001294136, + "learning_rate": 7.467482785003826e-05, + "loss": 0.2558, + "step": 32784 + }, + { + "epoch": 2.655946208684381, + "grad_norm": 0.06592721492052078, + "learning_rate": 7.467032719744363e-05, + "loss": 0.2253, + "step": 32785 + }, + { + "epoch": 2.6560272197018797, + "grad_norm": 0.0667741596698761, + "learning_rate": 7.466582654484901e-05, + "loss": 0.2555, + "step": 32786 + }, + { + "epoch": 2.656108230719378, + "grad_norm": 0.09115524590015411, + "learning_rate": 7.466132589225438e-05, + "loss": 0.3118, + "step": 32787 + }, + { + "epoch": 2.656189241736876, + "grad_norm": 0.06214887276291847, + "learning_rate": 7.465682523965975e-05, + "loss": 0.2552, + "step": 32788 + }, + { + "epoch": 2.6562702527543745, + "grad_norm": 0.07011143118143082, + 
"learning_rate": 7.465232458706513e-05, + "loss": 0.2423, + "step": 32789 + }, + { + "epoch": 2.656351263771873, + "grad_norm": 0.0744331106543541, + "learning_rate": 7.46478239344705e-05, + "loss": 0.2576, + "step": 32790 + }, + { + "epoch": 2.6564322747893714, + "grad_norm": 0.07393400371074677, + "learning_rate": 7.464332328187588e-05, + "loss": 0.2289, + "step": 32791 + }, + { + "epoch": 2.6565132858068696, + "grad_norm": 0.06812934577465057, + "learning_rate": 7.463882262928125e-05, + "loss": 0.2269, + "step": 32792 + }, + { + "epoch": 2.656594296824368, + "grad_norm": 0.06811689585447311, + "learning_rate": 7.463432197668662e-05, + "loss": 0.2322, + "step": 32793 + }, + { + "epoch": 2.6566753078418666, + "grad_norm": 0.06463243812322617, + "learning_rate": 7.4629821324092e-05, + "loss": 0.244, + "step": 32794 + }, + { + "epoch": 2.656756318859365, + "grad_norm": 0.06714651733636856, + "learning_rate": 7.462532067149737e-05, + "loss": 0.2522, + "step": 32795 + }, + { + "epoch": 2.656837329876863, + "grad_norm": 0.07441917061805725, + "learning_rate": 7.462082001890274e-05, + "loss": 0.2573, + "step": 32796 + }, + { + "epoch": 2.6569183408943617, + "grad_norm": 0.04962689429521561, + "learning_rate": 7.461631936630812e-05, + "loss": 0.2251, + "step": 32797 + }, + { + "epoch": 2.65699935191186, + "grad_norm": 0.0671190395951271, + "learning_rate": 7.461181871371349e-05, + "loss": 0.2388, + "step": 32798 + }, + { + "epoch": 2.6570803629293582, + "grad_norm": 0.05777635797858238, + "learning_rate": 7.460731806111886e-05, + "loss": 0.2143, + "step": 32799 + }, + { + "epoch": 2.657161373946857, + "grad_norm": 0.07067793607711792, + "learning_rate": 7.460281740852424e-05, + "loss": 0.26, + "step": 32800 + }, + { + "epoch": 2.657242384964355, + "grad_norm": 0.080874502658844, + "learning_rate": 7.459831675592961e-05, + "loss": 0.2637, + "step": 32801 + }, + { + "epoch": 2.6573233959818534, + "grad_norm": 0.05750936269760132, + "learning_rate": 7.459381610333498e-05, + "loss": 0.2358, + "step": 32802 + }, + { + "epoch": 2.657404406999352, + "grad_norm": 0.07016448676586151, + "learning_rate": 7.458931545074036e-05, + "loss": 0.2365, + "step": 32803 + }, + { + "epoch": 2.6574854180168503, + "grad_norm": 0.05285506322979927, + "learning_rate": 7.458481479814573e-05, + "loss": 0.2466, + "step": 32804 + }, + { + "epoch": 2.6575664290343486, + "grad_norm": 0.06905540823936462, + "learning_rate": 7.45803141455511e-05, + "loss": 0.2385, + "step": 32805 + }, + { + "epoch": 2.6576474400518473, + "grad_norm": 0.07303239405155182, + "learning_rate": 7.457581349295648e-05, + "loss": 0.254, + "step": 32806 + }, + { + "epoch": 2.6577284510693455, + "grad_norm": 0.07117251306772232, + "learning_rate": 7.457131284036185e-05, + "loss": 0.2309, + "step": 32807 + }, + { + "epoch": 2.6578094620868438, + "grad_norm": 0.07382563501596451, + "learning_rate": 7.456681218776723e-05, + "loss": 0.2534, + "step": 32808 + }, + { + "epoch": 2.6578904731043425, + "grad_norm": 0.0767350047826767, + "learning_rate": 7.45623115351726e-05, + "loss": 0.2698, + "step": 32809 + }, + { + "epoch": 2.6579714841218407, + "grad_norm": 0.06391940265893936, + "learning_rate": 7.455781088257797e-05, + "loss": 0.2923, + "step": 32810 + }, + { + "epoch": 2.658052495139339, + "grad_norm": 0.07749442756175995, + "learning_rate": 7.455331022998335e-05, + "loss": 0.2898, + "step": 32811 + }, + { + "epoch": 2.658133506156837, + "grad_norm": 0.06880564242601395, + "learning_rate": 7.454880957738872e-05, + "loss": 0.2209, + "step": 32812 + }, + { 
+ "epoch": 2.6582145171743354, + "grad_norm": 0.06603529304265976, + "learning_rate": 7.454430892479411e-05, + "loss": 0.2457, + "step": 32813 + }, + { + "epoch": 2.658295528191834, + "grad_norm": 0.07407406717538834, + "learning_rate": 7.453980827219947e-05, + "loss": 0.2332, + "step": 32814 + }, + { + "epoch": 2.6583765392093324, + "grad_norm": 0.059012070298194885, + "learning_rate": 7.453530761960484e-05, + "loss": 0.2146, + "step": 32815 + }, + { + "epoch": 2.6584575502268306, + "grad_norm": 0.06564911454916, + "learning_rate": 7.453080696701023e-05, + "loss": 0.2453, + "step": 32816 + }, + { + "epoch": 2.6585385612443293, + "grad_norm": 0.06099837273359299, + "learning_rate": 7.452630631441559e-05, + "loss": 0.2525, + "step": 32817 + }, + { + "epoch": 2.6586195722618275, + "grad_norm": 0.06015372276306152, + "learning_rate": 7.452180566182096e-05, + "loss": 0.2526, + "step": 32818 + }, + { + "epoch": 2.658700583279326, + "grad_norm": 0.07565578818321228, + "learning_rate": 7.451730500922635e-05, + "loss": 0.2416, + "step": 32819 + }, + { + "epoch": 2.6587815942968245, + "grad_norm": 0.07236052304506302, + "learning_rate": 7.451280435663171e-05, + "loss": 0.2683, + "step": 32820 + }, + { + "epoch": 2.6588626053143227, + "grad_norm": 0.08145192265510559, + "learning_rate": 7.450830370403708e-05, + "loss": 0.2372, + "step": 32821 + }, + { + "epoch": 2.658943616331821, + "grad_norm": 0.0653989315032959, + "learning_rate": 7.450380305144247e-05, + "loss": 0.2019, + "step": 32822 + }, + { + "epoch": 2.6590246273493197, + "grad_norm": 0.08094444870948792, + "learning_rate": 7.449930239884783e-05, + "loss": 0.2539, + "step": 32823 + }, + { + "epoch": 2.659105638366818, + "grad_norm": 0.06678465008735657, + "learning_rate": 7.44948017462532e-05, + "loss": 0.2444, + "step": 32824 + }, + { + "epoch": 2.659186649384316, + "grad_norm": 0.0730195865035057, + "learning_rate": 7.449030109365859e-05, + "loss": 0.2558, + "step": 32825 + }, + { + "epoch": 2.659267660401815, + "grad_norm": 0.06824833154678345, + "learning_rate": 7.448580044106395e-05, + "loss": 0.2344, + "step": 32826 + }, + { + "epoch": 2.659348671419313, + "grad_norm": 0.054951079189777374, + "learning_rate": 7.448129978846932e-05, + "loss": 0.2197, + "step": 32827 + }, + { + "epoch": 2.6594296824368113, + "grad_norm": 0.06483834981918335, + "learning_rate": 7.447679913587471e-05, + "loss": 0.2326, + "step": 32828 + }, + { + "epoch": 2.65951069345431, + "grad_norm": 0.08794785290956497, + "learning_rate": 7.447229848328007e-05, + "loss": 0.2344, + "step": 32829 + }, + { + "epoch": 2.6595917044718083, + "grad_norm": 0.07904116064310074, + "learning_rate": 7.446779783068545e-05, + "loss": 0.293, + "step": 32830 + }, + { + "epoch": 2.6596727154893065, + "grad_norm": 0.07375107705593109, + "learning_rate": 7.446329717809083e-05, + "loss": 0.2413, + "step": 32831 + }, + { + "epoch": 2.659753726506805, + "grad_norm": 0.07543234527111053, + "learning_rate": 7.445879652549619e-05, + "loss": 0.2486, + "step": 32832 + }, + { + "epoch": 2.6598347375243034, + "grad_norm": 0.0675840899348259, + "learning_rate": 7.445429587290157e-05, + "loss": 0.2402, + "step": 32833 + }, + { + "epoch": 2.6599157485418017, + "grad_norm": 0.07507772743701935, + "learning_rate": 7.444979522030695e-05, + "loss": 0.2293, + "step": 32834 + }, + { + "epoch": 2.6599967595593, + "grad_norm": 0.07033761590719223, + "learning_rate": 7.444529456771231e-05, + "loss": 0.2751, + "step": 32835 + }, + { + "epoch": 2.660077770576798, + "grad_norm": 0.07205361127853394, + 
"learning_rate": 7.444079391511769e-05, + "loss": 0.268, + "step": 32836 + }, + { + "epoch": 2.660158781594297, + "grad_norm": 0.06997992098331451, + "learning_rate": 7.443629326252307e-05, + "loss": 0.2482, + "step": 32837 + }, + { + "epoch": 2.660239792611795, + "grad_norm": 0.07074286788702011, + "learning_rate": 7.443179260992843e-05, + "loss": 0.2292, + "step": 32838 + }, + { + "epoch": 2.6603208036292934, + "grad_norm": 0.06832756102085114, + "learning_rate": 7.442729195733382e-05, + "loss": 0.2708, + "step": 32839 + }, + { + "epoch": 2.660401814646792, + "grad_norm": 0.06949548423290253, + "learning_rate": 7.44227913047392e-05, + "loss": 0.2357, + "step": 32840 + }, + { + "epoch": 2.6604828256642903, + "grad_norm": 0.06634163856506348, + "learning_rate": 7.441829065214457e-05, + "loss": 0.2334, + "step": 32841 + }, + { + "epoch": 2.6605638366817885, + "grad_norm": 0.07854374498128891, + "learning_rate": 7.441378999954994e-05, + "loss": 0.2625, + "step": 32842 + }, + { + "epoch": 2.660644847699287, + "grad_norm": 0.073130302131176, + "learning_rate": 7.440928934695532e-05, + "loss": 0.2658, + "step": 32843 + }, + { + "epoch": 2.6607258587167855, + "grad_norm": 0.06455226242542267, + "learning_rate": 7.440478869436069e-05, + "loss": 0.2409, + "step": 32844 + }, + { + "epoch": 2.6608068697342837, + "grad_norm": 0.06555270403623581, + "learning_rate": 7.440028804176606e-05, + "loss": 0.2591, + "step": 32845 + }, + { + "epoch": 2.6608878807517824, + "grad_norm": 0.06920315325260162, + "learning_rate": 7.439578738917144e-05, + "loss": 0.2523, + "step": 32846 + }, + { + "epoch": 2.6609688917692806, + "grad_norm": 0.07340586930513382, + "learning_rate": 7.439128673657681e-05, + "loss": 0.2112, + "step": 32847 + }, + { + "epoch": 2.661049902786779, + "grad_norm": 0.06806132942438126, + "learning_rate": 7.438678608398218e-05, + "loss": 0.2399, + "step": 32848 + }, + { + "epoch": 2.6611309138042776, + "grad_norm": 0.0959523618221283, + "learning_rate": 7.438228543138756e-05, + "loss": 0.21, + "step": 32849 + }, + { + "epoch": 2.661211924821776, + "grad_norm": 0.07259013503789902, + "learning_rate": 7.437778477879293e-05, + "loss": 0.2479, + "step": 32850 + }, + { + "epoch": 2.661292935839274, + "grad_norm": 0.07072082161903381, + "learning_rate": 7.43732841261983e-05, + "loss": 0.2315, + "step": 32851 + }, + { + "epoch": 2.6613739468567728, + "grad_norm": 0.06280186027288437, + "learning_rate": 7.436878347360368e-05, + "loss": 0.2325, + "step": 32852 + }, + { + "epoch": 2.661454957874271, + "grad_norm": 0.08135109394788742, + "learning_rate": 7.436428282100905e-05, + "loss": 0.2748, + "step": 32853 + }, + { + "epoch": 2.6615359688917692, + "grad_norm": 0.06902328133583069, + "learning_rate": 7.435978216841443e-05, + "loss": 0.2285, + "step": 32854 + }, + { + "epoch": 2.661616979909268, + "grad_norm": 0.058770183473825455, + "learning_rate": 7.43552815158198e-05, + "loss": 0.2423, + "step": 32855 + }, + { + "epoch": 2.661697990926766, + "grad_norm": 0.0691264271736145, + "learning_rate": 7.435078086322517e-05, + "loss": 0.2207, + "step": 32856 + }, + { + "epoch": 2.6617790019442644, + "grad_norm": 0.0778164342045784, + "learning_rate": 7.434628021063055e-05, + "loss": 0.2412, + "step": 32857 + }, + { + "epoch": 2.6618600129617627, + "grad_norm": 0.08259502053260803, + "learning_rate": 7.434177955803592e-05, + "loss": 0.2691, + "step": 32858 + }, + { + "epoch": 2.661941023979261, + "grad_norm": 0.06187589466571808, + "learning_rate": 7.43372789054413e-05, + "loss": 0.2248, + "step": 32859 + }, + 
{ + "epoch": 2.6620220349967596, + "grad_norm": 0.0695401206612587, + "learning_rate": 7.433277825284667e-05, + "loss": 0.2364, + "step": 32860 + }, + { + "epoch": 2.662103046014258, + "grad_norm": 0.08041361719369888, + "learning_rate": 7.432827760025204e-05, + "loss": 0.2444, + "step": 32861 + }, + { + "epoch": 2.662184057031756, + "grad_norm": 0.058977339416742325, + "learning_rate": 7.432377694765741e-05, + "loss": 0.2423, + "step": 32862 + }, + { + "epoch": 2.662265068049255, + "grad_norm": 0.05708383023738861, + "learning_rate": 7.431927629506279e-05, + "loss": 0.2171, + "step": 32863 + }, + { + "epoch": 2.662346079066753, + "grad_norm": 0.06705954670906067, + "learning_rate": 7.431477564246816e-05, + "loss": 0.2145, + "step": 32864 + }, + { + "epoch": 2.6624270900842513, + "grad_norm": 0.06451255828142166, + "learning_rate": 7.431027498987354e-05, + "loss": 0.2152, + "step": 32865 + }, + { + "epoch": 2.66250810110175, + "grad_norm": 0.06673882901668549, + "learning_rate": 7.430577433727891e-05, + "loss": 0.254, + "step": 32866 + }, + { + "epoch": 2.662589112119248, + "grad_norm": 0.06643757969141006, + "learning_rate": 7.430127368468428e-05, + "loss": 0.2403, + "step": 32867 + }, + { + "epoch": 2.6626701231367464, + "grad_norm": 0.06720244139432907, + "learning_rate": 7.429677303208966e-05, + "loss": 0.2334, + "step": 32868 + }, + { + "epoch": 2.662751134154245, + "grad_norm": 0.0601067841053009, + "learning_rate": 7.429227237949503e-05, + "loss": 0.2168, + "step": 32869 + }, + { + "epoch": 2.6628321451717434, + "grad_norm": 0.0732504352927208, + "learning_rate": 7.42877717269004e-05, + "loss": 0.2565, + "step": 32870 + }, + { + "epoch": 2.6629131561892416, + "grad_norm": 0.06915004551410675, + "learning_rate": 7.428327107430578e-05, + "loss": 0.2443, + "step": 32871 + }, + { + "epoch": 2.6629941672067403, + "grad_norm": 0.06645822525024414, + "learning_rate": 7.427877042171115e-05, + "loss": 0.2207, + "step": 32872 + }, + { + "epoch": 2.6630751782242386, + "grad_norm": 0.06649801880121231, + "learning_rate": 7.427426976911652e-05, + "loss": 0.2651, + "step": 32873 + }, + { + "epoch": 2.663156189241737, + "grad_norm": 0.06191537529230118, + "learning_rate": 7.42697691165219e-05, + "loss": 0.2389, + "step": 32874 + }, + { + "epoch": 2.6632372002592355, + "grad_norm": 0.07433738559484482, + "learning_rate": 7.426526846392727e-05, + "loss": 0.2857, + "step": 32875 + }, + { + "epoch": 2.6633182112767337, + "grad_norm": 0.06460598856210709, + "learning_rate": 7.426076781133265e-05, + "loss": 0.2842, + "step": 32876 + }, + { + "epoch": 2.663399222294232, + "grad_norm": 0.06493370234966278, + "learning_rate": 7.425626715873802e-05, + "loss": 0.2334, + "step": 32877 + }, + { + "epoch": 2.6634802333117307, + "grad_norm": 0.0691828578710556, + "learning_rate": 7.425176650614339e-05, + "loss": 0.2099, + "step": 32878 + }, + { + "epoch": 2.663561244329229, + "grad_norm": 0.06899280846118927, + "learning_rate": 7.424726585354877e-05, + "loss": 0.2307, + "step": 32879 + }, + { + "epoch": 2.663642255346727, + "grad_norm": 0.06932519376277924, + "learning_rate": 7.424276520095414e-05, + "loss": 0.2354, + "step": 32880 + }, + { + "epoch": 2.6637232663642254, + "grad_norm": 0.06170056387782097, + "learning_rate": 7.423826454835951e-05, + "loss": 0.2123, + "step": 32881 + }, + { + "epoch": 2.6638042773817237, + "grad_norm": 0.06595876812934875, + "learning_rate": 7.423376389576489e-05, + "loss": 0.2404, + "step": 32882 + }, + { + "epoch": 2.6638852883992223, + "grad_norm": 0.07378228008747101, + 
"learning_rate": 7.422926324317026e-05, + "loss": 0.2741, + "step": 32883 + }, + { + "epoch": 2.6639662994167206, + "grad_norm": 0.06303569674491882, + "learning_rate": 7.422476259057563e-05, + "loss": 0.2634, + "step": 32884 + }, + { + "epoch": 2.664047310434219, + "grad_norm": 0.07362163811922073, + "learning_rate": 7.422026193798101e-05, + "loss": 0.2168, + "step": 32885 + }, + { + "epoch": 2.6641283214517175, + "grad_norm": 0.05958087369799614, + "learning_rate": 7.421576128538638e-05, + "loss": 0.2555, + "step": 32886 + }, + { + "epoch": 2.6642093324692158, + "grad_norm": 0.09635303169488907, + "learning_rate": 7.421126063279175e-05, + "loss": 0.221, + "step": 32887 + }, + { + "epoch": 2.664290343486714, + "grad_norm": 0.07042831927537918, + "learning_rate": 7.420675998019713e-05, + "loss": 0.2282, + "step": 32888 + }, + { + "epoch": 2.6643713545042127, + "grad_norm": 0.061355553567409515, + "learning_rate": 7.42022593276025e-05, + "loss": 0.2289, + "step": 32889 + }, + { + "epoch": 2.664452365521711, + "grad_norm": 0.08902187645435333, + "learning_rate": 7.419775867500788e-05, + "loss": 0.2903, + "step": 32890 + }, + { + "epoch": 2.664533376539209, + "grad_norm": 0.07006392627954483, + "learning_rate": 7.419325802241326e-05, + "loss": 0.2058, + "step": 32891 + }, + { + "epoch": 2.664614387556708, + "grad_norm": 0.07656487077474594, + "learning_rate": 7.418875736981862e-05, + "loss": 0.2665, + "step": 32892 + }, + { + "epoch": 2.664695398574206, + "grad_norm": 0.07237304747104645, + "learning_rate": 7.4184256717224e-05, + "loss": 0.2195, + "step": 32893 + }, + { + "epoch": 2.6647764095917044, + "grad_norm": 0.07717087119817734, + "learning_rate": 7.417975606462938e-05, + "loss": 0.2536, + "step": 32894 + }, + { + "epoch": 2.664857420609203, + "grad_norm": 0.07785272598266602, + "learning_rate": 7.417525541203474e-05, + "loss": 0.2629, + "step": 32895 + }, + { + "epoch": 2.6649384316267013, + "grad_norm": 0.0654202550649643, + "learning_rate": 7.417075475944012e-05, + "loss": 0.2482, + "step": 32896 + }, + { + "epoch": 2.6650194426441995, + "grad_norm": 0.08050129562616348, + "learning_rate": 7.41662541068455e-05, + "loss": 0.2913, + "step": 32897 + }, + { + "epoch": 2.6651004536616982, + "grad_norm": 0.06768804043531418, + "learning_rate": 7.416175345425086e-05, + "loss": 0.2443, + "step": 32898 + }, + { + "epoch": 2.6651814646791965, + "grad_norm": 0.0724261924624443, + "learning_rate": 7.415725280165624e-05, + "loss": 0.2556, + "step": 32899 + }, + { + "epoch": 2.6652624756966947, + "grad_norm": 0.054597921669483185, + "learning_rate": 7.415275214906163e-05, + "loss": 0.2127, + "step": 32900 + }, + { + "epoch": 2.665343486714193, + "grad_norm": 0.08325103670358658, + "learning_rate": 7.414825149646699e-05, + "loss": 0.2412, + "step": 32901 + }, + { + "epoch": 2.6654244977316917, + "grad_norm": 0.06817788630723953, + "learning_rate": 7.414375084387236e-05, + "loss": 0.2504, + "step": 32902 + }, + { + "epoch": 2.66550550874919, + "grad_norm": 0.07943432778120041, + "learning_rate": 7.413925019127775e-05, + "loss": 0.2358, + "step": 32903 + }, + { + "epoch": 2.665586519766688, + "grad_norm": 0.0712936744093895, + "learning_rate": 7.41347495386831e-05, + "loss": 0.2562, + "step": 32904 + }, + { + "epoch": 2.6656675307841864, + "grad_norm": 0.06890852749347687, + "learning_rate": 7.413024888608848e-05, + "loss": 0.2333, + "step": 32905 + }, + { + "epoch": 2.665748541801685, + "grad_norm": 0.08567355573177338, + "learning_rate": 7.412574823349387e-05, + "loss": 0.2737, + "step": 32906 + 
}, + { + "epoch": 2.6658295528191833, + "grad_norm": 0.06952392309904099, + "learning_rate": 7.412124758089923e-05, + "loss": 0.2425, + "step": 32907 + }, + { + "epoch": 2.6659105638366816, + "grad_norm": 0.0863032266497612, + "learning_rate": 7.41167469283046e-05, + "loss": 0.2555, + "step": 32908 + }, + { + "epoch": 2.6659915748541803, + "grad_norm": 0.0648011565208435, + "learning_rate": 7.411224627570999e-05, + "loss": 0.2101, + "step": 32909 + }, + { + "epoch": 2.6660725858716785, + "grad_norm": 0.06821193546056747, + "learning_rate": 7.410774562311536e-05, + "loss": 0.2466, + "step": 32910 + }, + { + "epoch": 2.6661535968891767, + "grad_norm": 0.05463450029492378, + "learning_rate": 7.410324497052072e-05, + "loss": 0.225, + "step": 32911 + }, + { + "epoch": 2.6662346079066754, + "grad_norm": 0.06467321515083313, + "learning_rate": 7.409874431792611e-05, + "loss": 0.2331, + "step": 32912 + }, + { + "epoch": 2.6663156189241737, + "grad_norm": 0.06272049993276596, + "learning_rate": 7.409424366533148e-05, + "loss": 0.239, + "step": 32913 + }, + { + "epoch": 2.666396629941672, + "grad_norm": 0.07468179613351822, + "learning_rate": 7.408974301273684e-05, + "loss": 0.2639, + "step": 32914 + }, + { + "epoch": 2.6664776409591706, + "grad_norm": 0.07041250169277191, + "learning_rate": 7.408524236014223e-05, + "loss": 0.2703, + "step": 32915 + }, + { + "epoch": 2.666558651976669, + "grad_norm": 0.07356708496809006, + "learning_rate": 7.40807417075476e-05, + "loss": 0.2616, + "step": 32916 + }, + { + "epoch": 2.666639662994167, + "grad_norm": 0.06438080221414566, + "learning_rate": 7.407624105495298e-05, + "loss": 0.2226, + "step": 32917 + }, + { + "epoch": 2.666720674011666, + "grad_norm": 0.07412604242563248, + "learning_rate": 7.407174040235835e-05, + "loss": 0.2895, + "step": 32918 + }, + { + "epoch": 2.666801685029164, + "grad_norm": 0.07654453068971634, + "learning_rate": 7.406723974976372e-05, + "loss": 0.2243, + "step": 32919 + }, + { + "epoch": 2.6668826960466623, + "grad_norm": 0.059283047914505005, + "learning_rate": 7.40627390971691e-05, + "loss": 0.2298, + "step": 32920 + }, + { + "epoch": 2.666963707064161, + "grad_norm": 0.06338262557983398, + "learning_rate": 7.405823844457447e-05, + "loss": 0.2419, + "step": 32921 + }, + { + "epoch": 2.667044718081659, + "grad_norm": 0.0706416442990303, + "learning_rate": 7.405373779197984e-05, + "loss": 0.2299, + "step": 32922 + }, + { + "epoch": 2.6671257290991575, + "grad_norm": 0.06983651220798492, + "learning_rate": 7.404923713938522e-05, + "loss": 0.246, + "step": 32923 + }, + { + "epoch": 2.6672067401166557, + "grad_norm": 0.06590235978364944, + "learning_rate": 7.404473648679059e-05, + "loss": 0.2675, + "step": 32924 + }, + { + "epoch": 2.6672877511341544, + "grad_norm": 0.06504794955253601, + "learning_rate": 7.404023583419597e-05, + "loss": 0.247, + "step": 32925 + }, + { + "epoch": 2.6673687621516526, + "grad_norm": 0.07013759016990662, + "learning_rate": 7.403573518160134e-05, + "loss": 0.2233, + "step": 32926 + }, + { + "epoch": 2.667449773169151, + "grad_norm": 0.07694179564714432, + "learning_rate": 7.403123452900671e-05, + "loss": 0.2745, + "step": 32927 + }, + { + "epoch": 2.667530784186649, + "grad_norm": 0.0731961578130722, + "learning_rate": 7.402673387641209e-05, + "loss": 0.2511, + "step": 32928 + }, + { + "epoch": 2.667611795204148, + "grad_norm": 0.06496523320674896, + "learning_rate": 7.402223322381746e-05, + "loss": 0.256, + "step": 32929 + }, + { + "epoch": 2.667692806221646, + "grad_norm": 0.05998383089900017, + 
"learning_rate": 7.401773257122283e-05, + "loss": 0.2482, + "step": 32930 + }, + { + "epoch": 2.6677738172391443, + "grad_norm": 0.0756494551897049, + "learning_rate": 7.401323191862821e-05, + "loss": 0.2843, + "step": 32931 + }, + { + "epoch": 2.667854828256643, + "grad_norm": 0.06741814315319061, + "learning_rate": 7.400873126603358e-05, + "loss": 0.265, + "step": 32932 + }, + { + "epoch": 2.6679358392741412, + "grad_norm": 0.07142461091279984, + "learning_rate": 7.400423061343895e-05, + "loss": 0.242, + "step": 32933 + }, + { + "epoch": 2.6680168502916395, + "grad_norm": 0.05582594871520996, + "learning_rate": 7.399972996084433e-05, + "loss": 0.2296, + "step": 32934 + }, + { + "epoch": 2.668097861309138, + "grad_norm": 0.06327968835830688, + "learning_rate": 7.39952293082497e-05, + "loss": 0.272, + "step": 32935 + }, + { + "epoch": 2.6681788723266364, + "grad_norm": 0.07664207369089127, + "learning_rate": 7.399072865565507e-05, + "loss": 0.2518, + "step": 32936 + }, + { + "epoch": 2.6682598833441347, + "grad_norm": 0.06662250310182571, + "learning_rate": 7.398622800306045e-05, + "loss": 0.2276, + "step": 32937 + }, + { + "epoch": 2.6683408943616334, + "grad_norm": 0.06861812621355057, + "learning_rate": 7.398172735046582e-05, + "loss": 0.2379, + "step": 32938 + }, + { + "epoch": 2.6684219053791316, + "grad_norm": 0.06940872967243195, + "learning_rate": 7.39772266978712e-05, + "loss": 0.2181, + "step": 32939 + }, + { + "epoch": 2.66850291639663, + "grad_norm": 0.05719943717122078, + "learning_rate": 7.397272604527657e-05, + "loss": 0.2575, + "step": 32940 + }, + { + "epoch": 2.6685839274141285, + "grad_norm": 0.0687544196844101, + "learning_rate": 7.396822539268194e-05, + "loss": 0.2167, + "step": 32941 + }, + { + "epoch": 2.6686649384316268, + "grad_norm": 0.05461694300174713, + "learning_rate": 7.396372474008732e-05, + "loss": 0.2191, + "step": 32942 + }, + { + "epoch": 2.668745949449125, + "grad_norm": 0.09195633977651596, + "learning_rate": 7.395922408749269e-05, + "loss": 0.2238, + "step": 32943 + }, + { + "epoch": 2.6688269604666237, + "grad_norm": 0.07397960871458054, + "learning_rate": 7.395472343489806e-05, + "loss": 0.2789, + "step": 32944 + }, + { + "epoch": 2.668907971484122, + "grad_norm": 0.06718258559703827, + "learning_rate": 7.395022278230344e-05, + "loss": 0.2049, + "step": 32945 + }, + { + "epoch": 2.66898898250162, + "grad_norm": 0.06812811642885208, + "learning_rate": 7.394572212970881e-05, + "loss": 0.2347, + "step": 32946 + }, + { + "epoch": 2.6690699935191184, + "grad_norm": 0.06567735970020294, + "learning_rate": 7.394122147711418e-05, + "loss": 0.2389, + "step": 32947 + }, + { + "epoch": 2.669151004536617, + "grad_norm": 0.07259346544742584, + "learning_rate": 7.393672082451956e-05, + "loss": 0.2341, + "step": 32948 + }, + { + "epoch": 2.6692320155541154, + "grad_norm": 0.057004157453775406, + "learning_rate": 7.393222017192493e-05, + "loss": 0.2235, + "step": 32949 + }, + { + "epoch": 2.6693130265716136, + "grad_norm": 0.07738929241895676, + "learning_rate": 7.39277195193303e-05, + "loss": 0.2077, + "step": 32950 + }, + { + "epoch": 2.669394037589112, + "grad_norm": 0.08664657175540924, + "learning_rate": 7.392321886673568e-05, + "loss": 0.2846, + "step": 32951 + }, + { + "epoch": 2.6694750486066106, + "grad_norm": 0.08418330550193787, + "learning_rate": 7.391871821414105e-05, + "loss": 0.2987, + "step": 32952 + }, + { + "epoch": 2.669556059624109, + "grad_norm": 0.08851824700832367, + "learning_rate": 7.391421756154643e-05, + "loss": 0.2564, + "step": 32953 + 
}, + { + "epoch": 2.669637070641607, + "grad_norm": 0.07711591571569443, + "learning_rate": 7.39097169089518e-05, + "loss": 0.2528, + "step": 32954 + }, + { + "epoch": 2.6697180816591057, + "grad_norm": 0.061196353286504745, + "learning_rate": 7.390521625635717e-05, + "loss": 0.2271, + "step": 32955 + }, + { + "epoch": 2.669799092676604, + "grad_norm": 0.07016127556562424, + "learning_rate": 7.390071560376255e-05, + "loss": 0.238, + "step": 32956 + }, + { + "epoch": 2.6698801036941022, + "grad_norm": 0.075688436627388, + "learning_rate": 7.389621495116792e-05, + "loss": 0.2279, + "step": 32957 + }, + { + "epoch": 2.669961114711601, + "grad_norm": 0.06808993220329285, + "learning_rate": 7.38917142985733e-05, + "loss": 0.2429, + "step": 32958 + }, + { + "epoch": 2.670042125729099, + "grad_norm": 0.07332292199134827, + "learning_rate": 7.388721364597867e-05, + "loss": 0.2337, + "step": 32959 + }, + { + "epoch": 2.6701231367465974, + "grad_norm": 0.058614619076251984, + "learning_rate": 7.388271299338404e-05, + "loss": 0.2578, + "step": 32960 + }, + { + "epoch": 2.670204147764096, + "grad_norm": 0.05495653301477432, + "learning_rate": 7.387821234078941e-05, + "loss": 0.2715, + "step": 32961 + }, + { + "epoch": 2.6702851587815943, + "grad_norm": 0.08176012337207794, + "learning_rate": 7.387371168819479e-05, + "loss": 0.2445, + "step": 32962 + }, + { + "epoch": 2.6703661697990926, + "grad_norm": 0.060160085558891296, + "learning_rate": 7.386921103560016e-05, + "loss": 0.2281, + "step": 32963 + }, + { + "epoch": 2.6704471808165913, + "grad_norm": 0.09189620614051819, + "learning_rate": 7.386471038300554e-05, + "loss": 0.2297, + "step": 32964 + }, + { + "epoch": 2.6705281918340895, + "grad_norm": 0.060057610273361206, + "learning_rate": 7.386020973041091e-05, + "loss": 0.2108, + "step": 32965 + }, + { + "epoch": 2.6706092028515878, + "grad_norm": 0.0785459652543068, + "learning_rate": 7.385570907781628e-05, + "loss": 0.2736, + "step": 32966 + }, + { + "epoch": 2.6706902138690864, + "grad_norm": 0.07110899686813354, + "learning_rate": 7.385120842522166e-05, + "loss": 0.272, + "step": 32967 + }, + { + "epoch": 2.6707712248865847, + "grad_norm": 0.08549158275127411, + "learning_rate": 7.384670777262703e-05, + "loss": 0.2238, + "step": 32968 + }, + { + "epoch": 2.670852235904083, + "grad_norm": 0.06585050374269485, + "learning_rate": 7.38422071200324e-05, + "loss": 0.2332, + "step": 32969 + }, + { + "epoch": 2.670933246921581, + "grad_norm": 0.07162567228078842, + "learning_rate": 7.383770646743778e-05, + "loss": 0.26, + "step": 32970 + }, + { + "epoch": 2.67101425793908, + "grad_norm": 0.0801546648144722, + "learning_rate": 7.383320581484315e-05, + "loss": 0.2837, + "step": 32971 + }, + { + "epoch": 2.671095268956578, + "grad_norm": 0.05951867997646332, + "learning_rate": 7.382870516224854e-05, + "loss": 0.214, + "step": 32972 + }, + { + "epoch": 2.6711762799740764, + "grad_norm": 0.06935639679431915, + "learning_rate": 7.38242045096539e-05, + "loss": 0.2163, + "step": 32973 + }, + { + "epoch": 2.6712572909915746, + "grad_norm": 0.08030533790588379, + "learning_rate": 7.381970385705927e-05, + "loss": 0.2229, + "step": 32974 + }, + { + "epoch": 2.6713383020090733, + "grad_norm": 0.06654808670282364, + "learning_rate": 7.381520320446466e-05, + "loss": 0.1838, + "step": 32975 + }, + { + "epoch": 2.6714193130265715, + "grad_norm": 0.0794433057308197, + "learning_rate": 7.381070255187002e-05, + "loss": 0.2433, + "step": 32976 + }, + { + "epoch": 2.67150032404407, + "grad_norm": 0.06465379148721695, + 
"learning_rate": 7.380620189927539e-05, + "loss": 0.2271, + "step": 32977 + }, + { + "epoch": 2.6715813350615685, + "grad_norm": 0.06718406081199646, + "learning_rate": 7.380170124668078e-05, + "loss": 0.2625, + "step": 32978 + }, + { + "epoch": 2.6716623460790667, + "grad_norm": 0.08068958669900894, + "learning_rate": 7.379720059408615e-05, + "loss": 0.2868, + "step": 32979 + }, + { + "epoch": 2.671743357096565, + "grad_norm": 0.08404342085123062, + "learning_rate": 7.379269994149151e-05, + "loss": 0.2575, + "step": 32980 + }, + { + "epoch": 2.6718243681140637, + "grad_norm": 0.0789237916469574, + "learning_rate": 7.37881992888969e-05, + "loss": 0.239, + "step": 32981 + }, + { + "epoch": 2.671905379131562, + "grad_norm": 0.07895661890506744, + "learning_rate": 7.378369863630227e-05, + "loss": 0.2527, + "step": 32982 + }, + { + "epoch": 2.67198639014906, + "grad_norm": 0.05732421949505806, + "learning_rate": 7.377919798370763e-05, + "loss": 0.2235, + "step": 32983 + }, + { + "epoch": 2.672067401166559, + "grad_norm": 0.06396181881427765, + "learning_rate": 7.377469733111302e-05, + "loss": 0.2192, + "step": 32984 + }, + { + "epoch": 2.672148412184057, + "grad_norm": 0.0701260045170784, + "learning_rate": 7.37701966785184e-05, + "loss": 0.2878, + "step": 32985 + }, + { + "epoch": 2.6722294232015553, + "grad_norm": 0.06090003252029419, + "learning_rate": 7.376569602592375e-05, + "loss": 0.2271, + "step": 32986 + }, + { + "epoch": 2.672310434219054, + "grad_norm": 0.0759798064827919, + "learning_rate": 7.376119537332914e-05, + "loss": 0.2326, + "step": 32987 + }, + { + "epoch": 2.6723914452365523, + "grad_norm": 0.07578757405281067, + "learning_rate": 7.375669472073452e-05, + "loss": 0.246, + "step": 32988 + }, + { + "epoch": 2.6724724562540505, + "grad_norm": 0.081985242664814, + "learning_rate": 7.375219406813988e-05, + "loss": 0.2728, + "step": 32989 + }, + { + "epoch": 2.672553467271549, + "grad_norm": 0.07229936122894287, + "learning_rate": 7.374769341554526e-05, + "loss": 0.2434, + "step": 32990 + }, + { + "epoch": 2.6726344782890474, + "grad_norm": 0.08067141473293304, + "learning_rate": 7.374319276295064e-05, + "loss": 0.2439, + "step": 32991 + }, + { + "epoch": 2.6727154893065457, + "grad_norm": 0.06598978489637375, + "learning_rate": 7.3738692110356e-05, + "loss": 0.2388, + "step": 32992 + }, + { + "epoch": 2.672796500324044, + "grad_norm": 0.06533140689134598, + "learning_rate": 7.373419145776138e-05, + "loss": 0.2161, + "step": 32993 + }, + { + "epoch": 2.6728775113415426, + "grad_norm": 0.06714296340942383, + "learning_rate": 7.372969080516676e-05, + "loss": 0.2742, + "step": 32994 + }, + { + "epoch": 2.672958522359041, + "grad_norm": 0.05954829603433609, + "learning_rate": 7.372519015257212e-05, + "loss": 0.2243, + "step": 32995 + }, + { + "epoch": 2.673039533376539, + "grad_norm": 0.05642719566822052, + "learning_rate": 7.37206894999775e-05, + "loss": 0.2207, + "step": 32996 + }, + { + "epoch": 2.6731205443940373, + "grad_norm": 0.06855549663305283, + "learning_rate": 7.371618884738288e-05, + "loss": 0.2558, + "step": 32997 + }, + { + "epoch": 2.673201555411536, + "grad_norm": 0.07786260545253754, + "learning_rate": 7.371168819478825e-05, + "loss": 0.2037, + "step": 32998 + }, + { + "epoch": 2.6732825664290343, + "grad_norm": 0.06984948366880417, + "learning_rate": 7.370718754219363e-05, + "loss": 0.2563, + "step": 32999 + }, + { + "epoch": 2.6733635774465325, + "grad_norm": 0.07111197710037231, + "learning_rate": 7.3702686889599e-05, + "loss": 0.2558, + "step": 33000 + }, + { + 
"epoch": 2.673444588464031, + "grad_norm": 0.05868116021156311, + "learning_rate": 7.369818623700437e-05, + "loss": 0.2407, + "step": 33001 + }, + { + "epoch": 2.6735255994815295, + "grad_norm": 0.07903192192316055, + "learning_rate": 7.369368558440975e-05, + "loss": 0.2704, + "step": 33002 + }, + { + "epoch": 2.6736066104990277, + "grad_norm": 0.06334315985441208, + "learning_rate": 7.368918493181512e-05, + "loss": 0.2486, + "step": 33003 + }, + { + "epoch": 2.6736876215165264, + "grad_norm": 0.06944619119167328, + "learning_rate": 7.36846842792205e-05, + "loss": 0.2121, + "step": 33004 + }, + { + "epoch": 2.6737686325340246, + "grad_norm": 0.06809698045253754, + "learning_rate": 7.368018362662587e-05, + "loss": 0.2242, + "step": 33005 + }, + { + "epoch": 2.673849643551523, + "grad_norm": 0.0604662150144577, + "learning_rate": 7.367568297403124e-05, + "loss": 0.2319, + "step": 33006 + }, + { + "epoch": 2.6739306545690216, + "grad_norm": 0.08217473328113556, + "learning_rate": 7.367118232143661e-05, + "loss": 0.2467, + "step": 33007 + }, + { + "epoch": 2.67401166558652, + "grad_norm": 0.05181199684739113, + "learning_rate": 7.366668166884199e-05, + "loss": 0.2315, + "step": 33008 + }, + { + "epoch": 2.674092676604018, + "grad_norm": 0.06757138669490814, + "learning_rate": 7.366218101624736e-05, + "loss": 0.2506, + "step": 33009 + }, + { + "epoch": 2.6741736876215167, + "grad_norm": 0.06875761598348618, + "learning_rate": 7.365768036365273e-05, + "loss": 0.2638, + "step": 33010 + }, + { + "epoch": 2.674254698639015, + "grad_norm": 0.07288714498281479, + "learning_rate": 7.365317971105811e-05, + "loss": 0.2307, + "step": 33011 + }, + { + "epoch": 2.6743357096565132, + "grad_norm": 0.07257254421710968, + "learning_rate": 7.364867905846348e-05, + "loss": 0.251, + "step": 33012 + }, + { + "epoch": 2.674416720674012, + "grad_norm": 0.06154019013047218, + "learning_rate": 7.364417840586886e-05, + "loss": 0.2334, + "step": 33013 + }, + { + "epoch": 2.67449773169151, + "grad_norm": 0.06860852241516113, + "learning_rate": 7.363967775327423e-05, + "loss": 0.2807, + "step": 33014 + }, + { + "epoch": 2.6745787427090084, + "grad_norm": 0.06585631519556046, + "learning_rate": 7.36351771006796e-05, + "loss": 0.2459, + "step": 33015 + }, + { + "epoch": 2.6746597537265067, + "grad_norm": 0.06791950762271881, + "learning_rate": 7.363067644808498e-05, + "loss": 0.2817, + "step": 33016 + }, + { + "epoch": 2.6747407647440054, + "grad_norm": 0.061369288712739944, + "learning_rate": 7.362617579549035e-05, + "loss": 0.2283, + "step": 33017 + }, + { + "epoch": 2.6748217757615036, + "grad_norm": 0.060109131038188934, + "learning_rate": 7.362167514289572e-05, + "loss": 0.2465, + "step": 33018 + }, + { + "epoch": 2.674902786779002, + "grad_norm": 0.0682285726070404, + "learning_rate": 7.36171744903011e-05, + "loss": 0.3074, + "step": 33019 + }, + { + "epoch": 2.6749837977965, + "grad_norm": 0.07335293292999268, + "learning_rate": 7.361267383770647e-05, + "loss": 0.2427, + "step": 33020 + }, + { + "epoch": 2.6750648088139988, + "grad_norm": 0.08329258114099503, + "learning_rate": 7.360817318511184e-05, + "loss": 0.2594, + "step": 33021 + }, + { + "epoch": 2.675145819831497, + "grad_norm": 0.06502322852611542, + "learning_rate": 7.360367253251722e-05, + "loss": 0.2127, + "step": 33022 + }, + { + "epoch": 2.6752268308489953, + "grad_norm": 0.06749629229307175, + "learning_rate": 7.359917187992259e-05, + "loss": 0.248, + "step": 33023 + }, + { + "epoch": 2.675307841866494, + "grad_norm": 0.0741758793592453, + 
"learning_rate": 7.359467122732797e-05, + "loss": 0.2177, + "step": 33024 + }, + { + "epoch": 2.675388852883992, + "grad_norm": 0.07246986776590347, + "learning_rate": 7.359017057473334e-05, + "loss": 0.2696, + "step": 33025 + }, + { + "epoch": 2.6754698639014904, + "grad_norm": 0.07233214378356934, + "learning_rate": 7.358566992213871e-05, + "loss": 0.2256, + "step": 33026 + }, + { + "epoch": 2.675550874918989, + "grad_norm": 0.061211708933115005, + "learning_rate": 7.358116926954409e-05, + "loss": 0.2186, + "step": 33027 + }, + { + "epoch": 2.6756318859364874, + "grad_norm": 0.07487697154283524, + "learning_rate": 7.357666861694946e-05, + "loss": 0.2549, + "step": 33028 + }, + { + "epoch": 2.6757128969539856, + "grad_norm": 0.07913942635059357, + "learning_rate": 7.357216796435483e-05, + "loss": 0.2858, + "step": 33029 + }, + { + "epoch": 2.6757939079714843, + "grad_norm": 0.06393986195325851, + "learning_rate": 7.356766731176021e-05, + "loss": 0.1946, + "step": 33030 + }, + { + "epoch": 2.6758749189889826, + "grad_norm": 0.06356556713581085, + "learning_rate": 7.356316665916558e-05, + "loss": 0.2099, + "step": 33031 + }, + { + "epoch": 2.675955930006481, + "grad_norm": 0.06801209598779678, + "learning_rate": 7.355866600657095e-05, + "loss": 0.2403, + "step": 33032 + }, + { + "epoch": 2.6760369410239795, + "grad_norm": 0.05762135609984398, + "learning_rate": 7.355416535397633e-05, + "loss": 0.2487, + "step": 33033 + }, + { + "epoch": 2.6761179520414777, + "grad_norm": 0.06864021718502045, + "learning_rate": 7.35496647013817e-05, + "loss": 0.2513, + "step": 33034 + }, + { + "epoch": 2.676198963058976, + "grad_norm": 0.053114648908376694, + "learning_rate": 7.354516404878708e-05, + "loss": 0.1946, + "step": 33035 + }, + { + "epoch": 2.6762799740764747, + "grad_norm": 0.07913174480199814, + "learning_rate": 7.354066339619245e-05, + "loss": 0.2559, + "step": 33036 + }, + { + "epoch": 2.676360985093973, + "grad_norm": 0.061761241406202316, + "learning_rate": 7.353616274359782e-05, + "loss": 0.2353, + "step": 33037 + }, + { + "epoch": 2.676441996111471, + "grad_norm": 0.08432194590568542, + "learning_rate": 7.35316620910032e-05, + "loss": 0.2903, + "step": 33038 + }, + { + "epoch": 2.6765230071289694, + "grad_norm": 0.09266561269760132, + "learning_rate": 7.352716143840857e-05, + "loss": 0.2652, + "step": 33039 + }, + { + "epoch": 2.6766040181464676, + "grad_norm": 0.06161290407180786, + "learning_rate": 7.352266078581394e-05, + "loss": 0.2394, + "step": 33040 + }, + { + "epoch": 2.6766850291639663, + "grad_norm": 0.05790049955248833, + "learning_rate": 7.351816013321932e-05, + "loss": 0.2208, + "step": 33041 + }, + { + "epoch": 2.6767660401814646, + "grad_norm": 0.08553800731897354, + "learning_rate": 7.351365948062469e-05, + "loss": 0.2521, + "step": 33042 + }, + { + "epoch": 2.676847051198963, + "grad_norm": 0.07292623817920685, + "learning_rate": 7.350915882803006e-05, + "loss": 0.2595, + "step": 33043 + }, + { + "epoch": 2.6769280622164615, + "grad_norm": 0.07673818618059158, + "learning_rate": 7.350465817543544e-05, + "loss": 0.2461, + "step": 33044 + }, + { + "epoch": 2.6770090732339598, + "grad_norm": 0.06594213098287582, + "learning_rate": 7.350015752284082e-05, + "loss": 0.2667, + "step": 33045 + }, + { + "epoch": 2.677090084251458, + "grad_norm": 0.0723397359251976, + "learning_rate": 7.349565687024618e-05, + "loss": 0.2517, + "step": 33046 + }, + { + "epoch": 2.6771710952689567, + "grad_norm": 0.06494677066802979, + "learning_rate": 7.349115621765156e-05, + "loss": 0.268, + "step": 
33047 + }, + { + "epoch": 2.677252106286455, + "grad_norm": 0.07920119911432266, + "learning_rate": 7.348665556505695e-05, + "loss": 0.2019, + "step": 33048 + }, + { + "epoch": 2.677333117303953, + "grad_norm": 0.06195864453911781, + "learning_rate": 7.34821549124623e-05, + "loss": 0.2074, + "step": 33049 + }, + { + "epoch": 2.677414128321452, + "grad_norm": 0.06277751177549362, + "learning_rate": 7.347765425986769e-05, + "loss": 0.2351, + "step": 33050 + }, + { + "epoch": 2.67749513933895, + "grad_norm": 0.07916513085365295, + "learning_rate": 7.347315360727307e-05, + "loss": 0.2296, + "step": 33051 + }, + { + "epoch": 2.6775761503564484, + "grad_norm": 0.06696799397468567, + "learning_rate": 7.346865295467843e-05, + "loss": 0.2141, + "step": 33052 + }, + { + "epoch": 2.677657161373947, + "grad_norm": 0.07103472948074341, + "learning_rate": 7.346415230208381e-05, + "loss": 0.2666, + "step": 33053 + }, + { + "epoch": 2.6777381723914453, + "grad_norm": 0.071993388235569, + "learning_rate": 7.345965164948919e-05, + "loss": 0.2598, + "step": 33054 + }, + { + "epoch": 2.6778191834089435, + "grad_norm": 0.06992863863706589, + "learning_rate": 7.345515099689455e-05, + "loss": 0.2631, + "step": 33055 + }, + { + "epoch": 2.6779001944264422, + "grad_norm": 0.04920663684606552, + "learning_rate": 7.345065034429993e-05, + "loss": 0.2122, + "step": 33056 + }, + { + "epoch": 2.6779812054439405, + "grad_norm": 0.05919310450553894, + "learning_rate": 7.344614969170531e-05, + "loss": 0.2311, + "step": 33057 + }, + { + "epoch": 2.6780622164614387, + "grad_norm": 0.07455393671989441, + "learning_rate": 7.344164903911067e-05, + "loss": 0.2463, + "step": 33058 + }, + { + "epoch": 2.6781432274789374, + "grad_norm": 0.0503612719476223, + "learning_rate": 7.343714838651606e-05, + "loss": 0.231, + "step": 33059 + }, + { + "epoch": 2.6782242384964356, + "grad_norm": 0.08073712140321732, + "learning_rate": 7.343264773392143e-05, + "loss": 0.2548, + "step": 33060 + }, + { + "epoch": 2.678305249513934, + "grad_norm": 0.06331314146518707, + "learning_rate": 7.342814708132679e-05, + "loss": 0.2254, + "step": 33061 + }, + { + "epoch": 2.678386260531432, + "grad_norm": 0.07347361743450165, + "learning_rate": 7.342364642873218e-05, + "loss": 0.2901, + "step": 33062 + }, + { + "epoch": 2.6784672715489304, + "grad_norm": 0.07064739614725113, + "learning_rate": 7.341914577613755e-05, + "loss": 0.2493, + "step": 33063 + }, + { + "epoch": 2.678548282566429, + "grad_norm": 0.05976058542728424, + "learning_rate": 7.341464512354291e-05, + "loss": 0.2609, + "step": 33064 + }, + { + "epoch": 2.6786292935839273, + "grad_norm": 0.08085988461971283, + "learning_rate": 7.34101444709483e-05, + "loss": 0.2543, + "step": 33065 + }, + { + "epoch": 2.6787103046014256, + "grad_norm": 0.07125470787286758, + "learning_rate": 7.340564381835367e-05, + "loss": 0.2506, + "step": 33066 + }, + { + "epoch": 2.6787913156189243, + "grad_norm": 0.05963381752371788, + "learning_rate": 7.340114316575903e-05, + "loss": 0.2614, + "step": 33067 + }, + { + "epoch": 2.6788723266364225, + "grad_norm": 0.08461900055408478, + "learning_rate": 7.339664251316442e-05, + "loss": 0.2944, + "step": 33068 + }, + { + "epoch": 2.6789533376539207, + "grad_norm": 0.07790318131446838, + "learning_rate": 7.339214186056979e-05, + "loss": 0.2164, + "step": 33069 + }, + { + "epoch": 2.6790343486714194, + "grad_norm": 0.06046464294195175, + "learning_rate": 7.338764120797515e-05, + "loss": 0.2328, + "step": 33070 + }, + { + "epoch": 2.6791153596889177, + "grad_norm": 
0.07902556657791138, + "learning_rate": 7.338314055538054e-05, + "loss": 0.2431, + "step": 33071 + }, + { + "epoch": 2.679196370706416, + "grad_norm": 0.07424572855234146, + "learning_rate": 7.337863990278591e-05, + "loss": 0.2215, + "step": 33072 + }, + { + "epoch": 2.6792773817239146, + "grad_norm": 0.08468609303236008, + "learning_rate": 7.337413925019127e-05, + "loss": 0.2426, + "step": 33073 + }, + { + "epoch": 2.679358392741413, + "grad_norm": 0.0756547823548317, + "learning_rate": 7.336963859759666e-05, + "loss": 0.2741, + "step": 33074 + }, + { + "epoch": 2.679439403758911, + "grad_norm": 0.06156749650835991, + "learning_rate": 7.336513794500203e-05, + "loss": 0.2129, + "step": 33075 + }, + { + "epoch": 2.67952041477641, + "grad_norm": 0.07414403557777405, + "learning_rate": 7.33606372924074e-05, + "loss": 0.2438, + "step": 33076 + }, + { + "epoch": 2.679601425793908, + "grad_norm": 0.08308634161949158, + "learning_rate": 7.335613663981278e-05, + "loss": 0.2182, + "step": 33077 + }, + { + "epoch": 2.6796824368114063, + "grad_norm": 0.07818423956632614, + "learning_rate": 7.335163598721815e-05, + "loss": 0.2259, + "step": 33078 + }, + { + "epoch": 2.679763447828905, + "grad_norm": 0.07427828758955002, + "learning_rate": 7.334713533462353e-05, + "loss": 0.2669, + "step": 33079 + }, + { + "epoch": 2.679844458846403, + "grad_norm": 0.07437131553888321, + "learning_rate": 7.33426346820289e-05, + "loss": 0.2438, + "step": 33080 + }, + { + "epoch": 2.6799254698639015, + "grad_norm": 0.07310924679040909, + "learning_rate": 7.333813402943427e-05, + "loss": 0.2495, + "step": 33081 + }, + { + "epoch": 2.6800064808814, + "grad_norm": 0.08593792468309402, + "learning_rate": 7.333363337683965e-05, + "loss": 0.2339, + "step": 33082 + }, + { + "epoch": 2.6800874918988984, + "grad_norm": 0.06198699772357941, + "learning_rate": 7.332913272424502e-05, + "loss": 0.274, + "step": 33083 + }, + { + "epoch": 2.6801685029163966, + "grad_norm": 0.07218437641859055, + "learning_rate": 7.33246320716504e-05, + "loss": 0.2084, + "step": 33084 + }, + { + "epoch": 2.680249513933895, + "grad_norm": 0.07658874988555908, + "learning_rate": 7.332013141905577e-05, + "loss": 0.2106, + "step": 33085 + }, + { + "epoch": 2.680330524951393, + "grad_norm": 0.07715161889791489, + "learning_rate": 7.331563076646114e-05, + "loss": 0.2361, + "step": 33086 + }, + { + "epoch": 2.680411535968892, + "grad_norm": 0.07156594097614288, + "learning_rate": 7.331113011386652e-05, + "loss": 0.2526, + "step": 33087 + }, + { + "epoch": 2.68049254698639, + "grad_norm": 0.06449709832668304, + "learning_rate": 7.330662946127189e-05, + "loss": 0.2542, + "step": 33088 + }, + { + "epoch": 2.6805735580038883, + "grad_norm": 0.06789814680814743, + "learning_rate": 7.330212880867726e-05, + "loss": 0.2546, + "step": 33089 + }, + { + "epoch": 2.680654569021387, + "grad_norm": 0.060112036764621735, + "learning_rate": 7.329762815608264e-05, + "loss": 0.2154, + "step": 33090 + }, + { + "epoch": 2.6807355800388852, + "grad_norm": 0.06398256868124008, + "learning_rate": 7.329312750348801e-05, + "loss": 0.2722, + "step": 33091 + }, + { + "epoch": 2.6808165910563835, + "grad_norm": 0.05962206423282623, + "learning_rate": 7.328862685089338e-05, + "loss": 0.2408, + "step": 33092 + }, + { + "epoch": 2.680897602073882, + "grad_norm": 0.05923084542155266, + "learning_rate": 7.328412619829876e-05, + "loss": 0.2777, + "step": 33093 + }, + { + "epoch": 2.6809786130913804, + "grad_norm": 0.07328286021947861, + "learning_rate": 7.327962554570413e-05, + "loss": 0.2516, 
+ "step": 33094 + }, + { + "epoch": 2.6810596241088787, + "grad_norm": 0.07436803728342056, + "learning_rate": 7.32751248931095e-05, + "loss": 0.215, + "step": 33095 + }, + { + "epoch": 2.6811406351263773, + "grad_norm": 0.07537177950143814, + "learning_rate": 7.327062424051488e-05, + "loss": 0.2732, + "step": 33096 + }, + { + "epoch": 2.6812216461438756, + "grad_norm": 0.0655427947640419, + "learning_rate": 7.326612358792025e-05, + "loss": 0.2564, + "step": 33097 + }, + { + "epoch": 2.681302657161374, + "grad_norm": 0.06698904931545258, + "learning_rate": 7.326162293532563e-05, + "loss": 0.2322, + "step": 33098 + }, + { + "epoch": 2.6813836681788725, + "grad_norm": 0.06953173130750656, + "learning_rate": 7.3257122282731e-05, + "loss": 0.2397, + "step": 33099 + }, + { + "epoch": 2.6814646791963708, + "grad_norm": 0.061687178909778595, + "learning_rate": 7.325262163013637e-05, + "loss": 0.2374, + "step": 33100 + }, + { + "epoch": 2.681545690213869, + "grad_norm": 0.0845770463347435, + "learning_rate": 7.324812097754175e-05, + "loss": 0.2528, + "step": 33101 + }, + { + "epoch": 2.6816267012313677, + "grad_norm": 0.0615716390311718, + "learning_rate": 7.324362032494712e-05, + "loss": 0.2315, + "step": 33102 + }, + { + "epoch": 2.681707712248866, + "grad_norm": 0.07067247480154037, + "learning_rate": 7.32391196723525e-05, + "loss": 0.2569, + "step": 33103 + }, + { + "epoch": 2.681788723266364, + "grad_norm": 0.07423364371061325, + "learning_rate": 7.323461901975787e-05, + "loss": 0.2387, + "step": 33104 + }, + { + "epoch": 2.681869734283863, + "grad_norm": 0.07920686155557632, + "learning_rate": 7.323011836716324e-05, + "loss": 0.2634, + "step": 33105 + }, + { + "epoch": 2.681950745301361, + "grad_norm": 0.0680554062128067, + "learning_rate": 7.322561771456861e-05, + "loss": 0.2232, + "step": 33106 + }, + { + "epoch": 2.6820317563188594, + "grad_norm": 0.07262624800205231, + "learning_rate": 7.322111706197399e-05, + "loss": 0.2345, + "step": 33107 + }, + { + "epoch": 2.6821127673363576, + "grad_norm": 0.05929746478796005, + "learning_rate": 7.321661640937936e-05, + "loss": 0.2062, + "step": 33108 + }, + { + "epoch": 2.682193778353856, + "grad_norm": 0.06272649019956589, + "learning_rate": 7.321211575678474e-05, + "loss": 0.2401, + "step": 33109 + }, + { + "epoch": 2.6822747893713546, + "grad_norm": 0.07481082528829575, + "learning_rate": 7.320761510419011e-05, + "loss": 0.242, + "step": 33110 + }, + { + "epoch": 2.682355800388853, + "grad_norm": 0.07007595151662827, + "learning_rate": 7.320311445159548e-05, + "loss": 0.2444, + "step": 33111 + }, + { + "epoch": 2.682436811406351, + "grad_norm": 0.08007033914327621, + "learning_rate": 7.319861379900086e-05, + "loss": 0.2856, + "step": 33112 + }, + { + "epoch": 2.6825178224238497, + "grad_norm": 0.06305108219385147, + "learning_rate": 7.319411314640623e-05, + "loss": 0.1845, + "step": 33113 + }, + { + "epoch": 2.682598833441348, + "grad_norm": 0.06217300146818161, + "learning_rate": 7.318961249381162e-05, + "loss": 0.2333, + "step": 33114 + }, + { + "epoch": 2.682679844458846, + "grad_norm": 0.0762304812669754, + "learning_rate": 7.318511184121698e-05, + "loss": 0.2566, + "step": 33115 + }, + { + "epoch": 2.682760855476345, + "grad_norm": 0.064681276679039, + "learning_rate": 7.318061118862235e-05, + "loss": 0.2285, + "step": 33116 + }, + { + "epoch": 2.682841866493843, + "grad_norm": 0.07641452550888062, + "learning_rate": 7.317611053602774e-05, + "loss": 0.2715, + "step": 33117 + }, + { + "epoch": 2.6829228775113414, + "grad_norm": 
0.07061993330717087, + "learning_rate": 7.31716098834331e-05, + "loss": 0.2617, + "step": 33118 + }, + { + "epoch": 2.68300388852884, + "grad_norm": 0.07542899996042252, + "learning_rate": 7.316710923083847e-05, + "loss": 0.2281, + "step": 33119 + }, + { + "epoch": 2.6830848995463383, + "grad_norm": 0.07717788964509964, + "learning_rate": 7.316260857824386e-05, + "loss": 0.2113, + "step": 33120 + }, + { + "epoch": 2.6831659105638366, + "grad_norm": 0.08544004708528519, + "learning_rate": 7.315810792564922e-05, + "loss": 0.3001, + "step": 33121 + }, + { + "epoch": 2.6832469215813353, + "grad_norm": 0.06345837563276291, + "learning_rate": 7.315360727305459e-05, + "loss": 0.2396, + "step": 33122 + }, + { + "epoch": 2.6833279325988335, + "grad_norm": 0.07493044435977936, + "learning_rate": 7.314910662045998e-05, + "loss": 0.2772, + "step": 33123 + }, + { + "epoch": 2.6834089436163318, + "grad_norm": 0.06237129122018814, + "learning_rate": 7.314460596786534e-05, + "loss": 0.2675, + "step": 33124 + }, + { + "epoch": 2.6834899546338304, + "grad_norm": 0.08715277910232544, + "learning_rate": 7.314010531527071e-05, + "loss": 0.2656, + "step": 33125 + }, + { + "epoch": 2.6835709656513287, + "grad_norm": 0.06376224756240845, + "learning_rate": 7.31356046626761e-05, + "loss": 0.264, + "step": 33126 + }, + { + "epoch": 2.683651976668827, + "grad_norm": 0.05935446545481682, + "learning_rate": 7.313110401008146e-05, + "loss": 0.223, + "step": 33127 + }, + { + "epoch": 2.683732987686325, + "grad_norm": 0.07266701012849808, + "learning_rate": 7.312660335748683e-05, + "loss": 0.2985, + "step": 33128 + }, + { + "epoch": 2.683813998703824, + "grad_norm": 0.059860944747924805, + "learning_rate": 7.312210270489222e-05, + "loss": 0.2024, + "step": 33129 + }, + { + "epoch": 2.683895009721322, + "grad_norm": 0.06587036699056625, + "learning_rate": 7.311760205229758e-05, + "loss": 0.245, + "step": 33130 + }, + { + "epoch": 2.6839760207388204, + "grad_norm": 0.06060159206390381, + "learning_rate": 7.311310139970297e-05, + "loss": 0.2355, + "step": 33131 + }, + { + "epoch": 2.6840570317563186, + "grad_norm": 0.06470111012458801, + "learning_rate": 7.310860074710834e-05, + "loss": 0.2146, + "step": 33132 + }, + { + "epoch": 2.6841380427738173, + "grad_norm": 0.060420744121074677, + "learning_rate": 7.31041000945137e-05, + "loss": 0.2589, + "step": 33133 + }, + { + "epoch": 2.6842190537913155, + "grad_norm": 0.07256756722927094, + "learning_rate": 7.309959944191909e-05, + "loss": 0.2549, + "step": 33134 + }, + { + "epoch": 2.684300064808814, + "grad_norm": 0.07449455559253693, + "learning_rate": 7.309509878932446e-05, + "loss": 0.2264, + "step": 33135 + }, + { + "epoch": 2.6843810758263125, + "grad_norm": 0.06787081807851791, + "learning_rate": 7.309059813672982e-05, + "loss": 0.2685, + "step": 33136 + }, + { + "epoch": 2.6844620868438107, + "grad_norm": 0.08377574384212494, + "learning_rate": 7.308609748413521e-05, + "loss": 0.2421, + "step": 33137 + }, + { + "epoch": 2.684543097861309, + "grad_norm": 0.06868404895067215, + "learning_rate": 7.308159683154058e-05, + "loss": 0.2213, + "step": 33138 + }, + { + "epoch": 2.6846241088788076, + "grad_norm": 0.05533949285745621, + "learning_rate": 7.307709617894594e-05, + "loss": 0.2353, + "step": 33139 + }, + { + "epoch": 2.684705119896306, + "grad_norm": 0.05881068855524063, + "learning_rate": 7.307259552635133e-05, + "loss": 0.2078, + "step": 33140 + }, + { + "epoch": 2.684786130913804, + "grad_norm": 0.0701180249452591, + "learning_rate": 7.30680948737567e-05, + "loss": 
0.214, + "step": 33141 + }, + { + "epoch": 2.684867141931303, + "grad_norm": 0.07627440243959427, + "learning_rate": 7.306359422116206e-05, + "loss": 0.2459, + "step": 33142 + }, + { + "epoch": 2.684948152948801, + "grad_norm": 0.06293842941522598, + "learning_rate": 7.305909356856745e-05, + "loss": 0.2094, + "step": 33143 + }, + { + "epoch": 2.6850291639662993, + "grad_norm": 0.07441238313913345, + "learning_rate": 7.305459291597282e-05, + "loss": 0.2658, + "step": 33144 + }, + { + "epoch": 2.685110174983798, + "grad_norm": 0.07221051305532455, + "learning_rate": 7.305009226337818e-05, + "loss": 0.2472, + "step": 33145 + }, + { + "epoch": 2.6851911860012962, + "grad_norm": 0.06838735938072205, + "learning_rate": 7.304559161078357e-05, + "loss": 0.2299, + "step": 33146 + }, + { + "epoch": 2.6852721970187945, + "grad_norm": 0.06920010596513748, + "learning_rate": 7.304109095818895e-05, + "loss": 0.2072, + "step": 33147 + }, + { + "epoch": 2.685353208036293, + "grad_norm": 0.05869818478822708, + "learning_rate": 7.30365903055943e-05, + "loss": 0.1972, + "step": 33148 + }, + { + "epoch": 2.6854342190537914, + "grad_norm": 0.0703733041882515, + "learning_rate": 7.303208965299969e-05, + "loss": 0.2579, + "step": 33149 + }, + { + "epoch": 2.6855152300712897, + "grad_norm": 0.0641406923532486, + "learning_rate": 7.302758900040507e-05, + "loss": 0.2525, + "step": 33150 + }, + { + "epoch": 2.685596241088788, + "grad_norm": 0.07498500496149063, + "learning_rate": 7.302308834781043e-05, + "loss": 0.2679, + "step": 33151 + }, + { + "epoch": 2.6856772521062866, + "grad_norm": 0.07610327005386353, + "learning_rate": 7.301858769521581e-05, + "loss": 0.2749, + "step": 33152 + }, + { + "epoch": 2.685758263123785, + "grad_norm": 0.06884650886058807, + "learning_rate": 7.301408704262119e-05, + "loss": 0.2293, + "step": 33153 + }, + { + "epoch": 2.685839274141283, + "grad_norm": 0.07191795855760574, + "learning_rate": 7.300958639002655e-05, + "loss": 0.2463, + "step": 33154 + }, + { + "epoch": 2.6859202851587813, + "grad_norm": 0.08212050050497055, + "learning_rate": 7.300508573743193e-05, + "loss": 0.2759, + "step": 33155 + }, + { + "epoch": 2.68600129617628, + "grad_norm": 0.06252212077379227, + "learning_rate": 7.300058508483731e-05, + "loss": 0.2329, + "step": 33156 + }, + { + "epoch": 2.6860823071937783, + "grad_norm": 0.07689528912305832, + "learning_rate": 7.299608443224268e-05, + "loss": 0.248, + "step": 33157 + }, + { + "epoch": 2.6861633182112765, + "grad_norm": 0.05639709159731865, + "learning_rate": 7.299158377964806e-05, + "loss": 0.2008, + "step": 33158 + }, + { + "epoch": 2.686244329228775, + "grad_norm": 0.06725417077541351, + "learning_rate": 7.298708312705343e-05, + "loss": 0.2233, + "step": 33159 + }, + { + "epoch": 2.6863253402462735, + "grad_norm": 0.060404933989048004, + "learning_rate": 7.29825824744588e-05, + "loss": 0.2402, + "step": 33160 + }, + { + "epoch": 2.6864063512637717, + "grad_norm": 0.07094196230173111, + "learning_rate": 7.297808182186418e-05, + "loss": 0.2344, + "step": 33161 + }, + { + "epoch": 2.6864873622812704, + "grad_norm": 0.06894193589687347, + "learning_rate": 7.297358116926955e-05, + "loss": 0.2251, + "step": 33162 + }, + { + "epoch": 2.6865683732987686, + "grad_norm": 0.05957801640033722, + "learning_rate": 7.296908051667492e-05, + "loss": 0.2377, + "step": 33163 + }, + { + "epoch": 2.686649384316267, + "grad_norm": 0.061558082699775696, + "learning_rate": 7.29645798640803e-05, + "loss": 0.2337, + "step": 33164 + }, + { + "epoch": 2.6867303953337656, + 
"grad_norm": 0.06628931313753128, + "learning_rate": 7.296007921148567e-05, + "loss": 0.239, + "step": 33165 + }, + { + "epoch": 2.686811406351264, + "grad_norm": 0.07567416876554489, + "learning_rate": 7.295557855889104e-05, + "loss": 0.2472, + "step": 33166 + }, + { + "epoch": 2.686892417368762, + "grad_norm": 0.055566683411598206, + "learning_rate": 7.295107790629642e-05, + "loss": 0.1925, + "step": 33167 + }, + { + "epoch": 2.6869734283862607, + "grad_norm": 0.071644127368927, + "learning_rate": 7.294657725370179e-05, + "loss": 0.229, + "step": 33168 + }, + { + "epoch": 2.687054439403759, + "grad_norm": 0.057638879865407944, + "learning_rate": 7.294207660110717e-05, + "loss": 0.2478, + "step": 33169 + }, + { + "epoch": 2.6871354504212572, + "grad_norm": 0.06617935746908188, + "learning_rate": 7.293757594851254e-05, + "loss": 0.2413, + "step": 33170 + }, + { + "epoch": 2.687216461438756, + "grad_norm": 0.055808231234550476, + "learning_rate": 7.293307529591791e-05, + "loss": 0.228, + "step": 33171 + }, + { + "epoch": 2.687297472456254, + "grad_norm": 0.06902889162302017, + "learning_rate": 7.292857464332329e-05, + "loss": 0.2437, + "step": 33172 + }, + { + "epoch": 2.6873784834737524, + "grad_norm": 0.061212435364723206, + "learning_rate": 7.292407399072866e-05, + "loss": 0.243, + "step": 33173 + }, + { + "epoch": 2.6874594944912507, + "grad_norm": 0.06083257123827934, + "learning_rate": 7.291957333813403e-05, + "loss": 0.2048, + "step": 33174 + }, + { + "epoch": 2.6875405055087493, + "grad_norm": 0.06691806018352509, + "learning_rate": 7.29150726855394e-05, + "loss": 0.2118, + "step": 33175 + }, + { + "epoch": 2.6876215165262476, + "grad_norm": 0.06726226955652237, + "learning_rate": 7.291057203294478e-05, + "loss": 0.2679, + "step": 33176 + }, + { + "epoch": 2.687702527543746, + "grad_norm": 0.06752733141183853, + "learning_rate": 7.290607138035015e-05, + "loss": 0.2157, + "step": 33177 + }, + { + "epoch": 2.687783538561244, + "grad_norm": 0.06699246168136597, + "learning_rate": 7.290157072775553e-05, + "loss": 0.2987, + "step": 33178 + }, + { + "epoch": 2.6878645495787428, + "grad_norm": 0.07415398210287094, + "learning_rate": 7.28970700751609e-05, + "loss": 0.227, + "step": 33179 + }, + { + "epoch": 2.687945560596241, + "grad_norm": 0.07832382619380951, + "learning_rate": 7.289256942256627e-05, + "loss": 0.2113, + "step": 33180 + }, + { + "epoch": 2.6880265716137393, + "grad_norm": 0.08317804336547852, + "learning_rate": 7.288806876997165e-05, + "loss": 0.2368, + "step": 33181 + }, + { + "epoch": 2.688107582631238, + "grad_norm": 0.08504065871238708, + "learning_rate": 7.288356811737702e-05, + "loss": 0.2501, + "step": 33182 + }, + { + "epoch": 2.688188593648736, + "grad_norm": 0.06972184777259827, + "learning_rate": 7.287906746478241e-05, + "loss": 0.2503, + "step": 33183 + }, + { + "epoch": 2.6882696046662344, + "grad_norm": 0.06454414874315262, + "learning_rate": 7.287456681218777e-05, + "loss": 0.2296, + "step": 33184 + }, + { + "epoch": 2.688350615683733, + "grad_norm": 0.07722259312868118, + "learning_rate": 7.287006615959314e-05, + "loss": 0.2707, + "step": 33185 + }, + { + "epoch": 2.6884316267012314, + "grad_norm": 0.07629488408565521, + "learning_rate": 7.286556550699853e-05, + "loss": 0.2498, + "step": 33186 + }, + { + "epoch": 2.6885126377187296, + "grad_norm": 0.06469930708408356, + "learning_rate": 7.286106485440389e-05, + "loss": 0.2165, + "step": 33187 + }, + { + "epoch": 2.6885936487362283, + "grad_norm": 0.07222463935613632, + "learning_rate": 
7.285656420180926e-05, + "loss": 0.2577, + "step": 33188 + }, + { + "epoch": 2.6886746597537265, + "grad_norm": 0.06537472456693649, + "learning_rate": 7.285206354921465e-05, + "loss": 0.2383, + "step": 33189 + }, + { + "epoch": 2.688755670771225, + "grad_norm": 0.06282258778810501, + "learning_rate": 7.284756289662001e-05, + "loss": 0.2593, + "step": 33190 + }, + { + "epoch": 2.6888366817887235, + "grad_norm": 0.07500491291284561, + "learning_rate": 7.284306224402538e-05, + "loss": 0.2235, + "step": 33191 + }, + { + "epoch": 2.6889176928062217, + "grad_norm": 0.0683460459113121, + "learning_rate": 7.283856159143077e-05, + "loss": 0.2503, + "step": 33192 + }, + { + "epoch": 2.68899870382372, + "grad_norm": 0.06435289978981018, + "learning_rate": 7.283406093883613e-05, + "loss": 0.264, + "step": 33193 + }, + { + "epoch": 2.6890797148412187, + "grad_norm": 0.06583774089813232, + "learning_rate": 7.28295602862415e-05, + "loss": 0.2308, + "step": 33194 + }, + { + "epoch": 2.689160725858717, + "grad_norm": 0.079376220703125, + "learning_rate": 7.282505963364689e-05, + "loss": 0.266, + "step": 33195 + }, + { + "epoch": 2.689241736876215, + "grad_norm": 0.06783498823642731, + "learning_rate": 7.282055898105225e-05, + "loss": 0.2771, + "step": 33196 + }, + { + "epoch": 2.6893227478937134, + "grad_norm": 0.085577592253685, + "learning_rate": 7.281605832845763e-05, + "loss": 0.2668, + "step": 33197 + }, + { + "epoch": 2.689403758911212, + "grad_norm": 0.08192739635705948, + "learning_rate": 7.281155767586301e-05, + "loss": 0.2549, + "step": 33198 + }, + { + "epoch": 2.6894847699287103, + "grad_norm": 0.05756738781929016, + "learning_rate": 7.280705702326837e-05, + "loss": 0.2618, + "step": 33199 + }, + { + "epoch": 2.6895657809462086, + "grad_norm": 0.06547581404447556, + "learning_rate": 7.280255637067375e-05, + "loss": 0.2215, + "step": 33200 + }, + { + "epoch": 2.689646791963707, + "grad_norm": 0.08526495844125748, + "learning_rate": 7.279805571807913e-05, + "loss": 0.2376, + "step": 33201 + }, + { + "epoch": 2.6897278029812055, + "grad_norm": 0.07014923542737961, + "learning_rate": 7.27935550654845e-05, + "loss": 0.2528, + "step": 33202 + }, + { + "epoch": 2.6898088139987038, + "grad_norm": 0.06240760535001755, + "learning_rate": 7.278905441288987e-05, + "loss": 0.2287, + "step": 33203 + }, + { + "epoch": 2.689889825016202, + "grad_norm": 0.06542903184890747, + "learning_rate": 7.278455376029525e-05, + "loss": 0.2727, + "step": 33204 + }, + { + "epoch": 2.6899708360337007, + "grad_norm": 0.05773237347602844, + "learning_rate": 7.278005310770061e-05, + "loss": 0.2155, + "step": 33205 + }, + { + "epoch": 2.690051847051199, + "grad_norm": 0.0647493302822113, + "learning_rate": 7.277555245510599e-05, + "loss": 0.2355, + "step": 33206 + }, + { + "epoch": 2.690132858068697, + "grad_norm": 0.08845240622758865, + "learning_rate": 7.277105180251138e-05, + "loss": 0.2547, + "step": 33207 + }, + { + "epoch": 2.690213869086196, + "grad_norm": 0.07543900609016418, + "learning_rate": 7.276655114991674e-05, + "loss": 0.2389, + "step": 33208 + }, + { + "epoch": 2.690294880103694, + "grad_norm": 0.07690194994211197, + "learning_rate": 7.276205049732212e-05, + "loss": 0.234, + "step": 33209 + }, + { + "epoch": 2.6903758911211924, + "grad_norm": 0.08355151861906052, + "learning_rate": 7.27575498447275e-05, + "loss": 0.2186, + "step": 33210 + }, + { + "epoch": 2.690456902138691, + "grad_norm": 0.06553471833467484, + "learning_rate": 7.275304919213286e-05, + "loss": 0.2411, + "step": 33211 + }, + { + "epoch": 
2.6905379131561893, + "grad_norm": 0.05886956304311752, + "learning_rate": 7.274854853953824e-05, + "loss": 0.2046, + "step": 33212 + }, + { + "epoch": 2.6906189241736875, + "grad_norm": 0.06500393897294998, + "learning_rate": 7.274404788694362e-05, + "loss": 0.2312, + "step": 33213 + }, + { + "epoch": 2.690699935191186, + "grad_norm": 0.060121990740299225, + "learning_rate": 7.273954723434898e-05, + "loss": 0.2289, + "step": 33214 + }, + { + "epoch": 2.6907809462086845, + "grad_norm": 0.07940906286239624, + "learning_rate": 7.273504658175436e-05, + "loss": 0.2924, + "step": 33215 + }, + { + "epoch": 2.6908619572261827, + "grad_norm": 0.07092858105897903, + "learning_rate": 7.273054592915974e-05, + "loss": 0.2849, + "step": 33216 + }, + { + "epoch": 2.6909429682436814, + "grad_norm": 0.07701389491558075, + "learning_rate": 7.27260452765651e-05, + "loss": 0.231, + "step": 33217 + }, + { + "epoch": 2.6910239792611796, + "grad_norm": 0.07645139843225479, + "learning_rate": 7.272154462397049e-05, + "loss": 0.2716, + "step": 33218 + }, + { + "epoch": 2.691104990278678, + "grad_norm": 0.07623913884162903, + "learning_rate": 7.271704397137586e-05, + "loss": 0.2404, + "step": 33219 + }, + { + "epoch": 2.691186001296176, + "grad_norm": 0.07207286357879639, + "learning_rate": 7.271254331878122e-05, + "loss": 0.2685, + "step": 33220 + }, + { + "epoch": 2.691267012313675, + "grad_norm": 0.06755689531564713, + "learning_rate": 7.27080426661866e-05, + "loss": 0.2, + "step": 33221 + }, + { + "epoch": 2.691348023331173, + "grad_norm": 0.048205725848674774, + "learning_rate": 7.270354201359198e-05, + "loss": 0.2153, + "step": 33222 + }, + { + "epoch": 2.6914290343486713, + "grad_norm": 0.056371089071035385, + "learning_rate": 7.269904136099734e-05, + "loss": 0.229, + "step": 33223 + }, + { + "epoch": 2.6915100453661696, + "grad_norm": 0.07858487218618393, + "learning_rate": 7.269454070840273e-05, + "loss": 0.2349, + "step": 33224 + }, + { + "epoch": 2.6915910563836682, + "grad_norm": 0.06717490404844284, + "learning_rate": 7.26900400558081e-05, + "loss": 0.2728, + "step": 33225 + }, + { + "epoch": 2.6916720674011665, + "grad_norm": 0.08073551952838898, + "learning_rate": 7.268553940321346e-05, + "loss": 0.2517, + "step": 33226 + }, + { + "epoch": 2.6917530784186647, + "grad_norm": 0.06915895640850067, + "learning_rate": 7.268103875061885e-05, + "loss": 0.2544, + "step": 33227 + }, + { + "epoch": 2.6918340894361634, + "grad_norm": 0.09112341701984406, + "learning_rate": 7.267653809802422e-05, + "loss": 0.2584, + "step": 33228 + }, + { + "epoch": 2.6919151004536617, + "grad_norm": 0.056663673371076584, + "learning_rate": 7.267203744542958e-05, + "loss": 0.2328, + "step": 33229 + }, + { + "epoch": 2.69199611147116, + "grad_norm": 0.06987922638654709, + "learning_rate": 7.266753679283497e-05, + "loss": 0.2238, + "step": 33230 + }, + { + "epoch": 2.6920771224886586, + "grad_norm": 0.06600745767354965, + "learning_rate": 7.266303614024034e-05, + "loss": 0.2534, + "step": 33231 + }, + { + "epoch": 2.692158133506157, + "grad_norm": 0.07932254672050476, + "learning_rate": 7.26585354876457e-05, + "loss": 0.2912, + "step": 33232 + }, + { + "epoch": 2.692239144523655, + "grad_norm": 0.0636344626545906, + "learning_rate": 7.265403483505109e-05, + "loss": 0.2158, + "step": 33233 + }, + { + "epoch": 2.692320155541154, + "grad_norm": 0.06788277626037598, + "learning_rate": 7.264953418245646e-05, + "loss": 0.243, + "step": 33234 + }, + { + "epoch": 2.692401166558652, + "grad_norm": 0.07238825410604477, + "learning_rate": 
7.264503352986184e-05, + "loss": 0.2375, + "step": 33235 + }, + { + "epoch": 2.6924821775761503, + "grad_norm": 0.07912512123584747, + "learning_rate": 7.264053287726721e-05, + "loss": 0.2399, + "step": 33236 + }, + { + "epoch": 2.692563188593649, + "grad_norm": 0.06999436765909195, + "learning_rate": 7.263603222467258e-05, + "loss": 0.2532, + "step": 33237 + }, + { + "epoch": 2.692644199611147, + "grad_norm": 0.0635719895362854, + "learning_rate": 7.263153157207796e-05, + "loss": 0.2021, + "step": 33238 + }, + { + "epoch": 2.6927252106286454, + "grad_norm": 0.0745772048830986, + "learning_rate": 7.262703091948333e-05, + "loss": 0.2395, + "step": 33239 + }, + { + "epoch": 2.692806221646144, + "grad_norm": 0.06520460546016693, + "learning_rate": 7.26225302668887e-05, + "loss": 0.2094, + "step": 33240 + }, + { + "epoch": 2.6928872326636424, + "grad_norm": 0.08982928842306137, + "learning_rate": 7.261802961429408e-05, + "loss": 0.255, + "step": 33241 + }, + { + "epoch": 2.6929682436811406, + "grad_norm": 0.07295730710029602, + "learning_rate": 7.261352896169945e-05, + "loss": 0.2429, + "step": 33242 + }, + { + "epoch": 2.693049254698639, + "grad_norm": 0.08273440599441528, + "learning_rate": 7.260902830910483e-05, + "loss": 0.2343, + "step": 33243 + }, + { + "epoch": 2.693130265716137, + "grad_norm": 0.0663905143737793, + "learning_rate": 7.26045276565102e-05, + "loss": 0.216, + "step": 33244 + }, + { + "epoch": 2.693211276733636, + "grad_norm": 0.0648883581161499, + "learning_rate": 7.260002700391557e-05, + "loss": 0.2583, + "step": 33245 + }, + { + "epoch": 2.693292287751134, + "grad_norm": 0.0766635537147522, + "learning_rate": 7.259552635132095e-05, + "loss": 0.2446, + "step": 33246 + }, + { + "epoch": 2.6933732987686323, + "grad_norm": 0.059019945561885834, + "learning_rate": 7.259102569872632e-05, + "loss": 0.2218, + "step": 33247 + }, + { + "epoch": 2.693454309786131, + "grad_norm": 0.07485313713550568, + "learning_rate": 7.25865250461317e-05, + "loss": 0.2732, + "step": 33248 + }, + { + "epoch": 2.6935353208036292, + "grad_norm": 0.05649654194712639, + "learning_rate": 7.258202439353707e-05, + "loss": 0.2228, + "step": 33249 + }, + { + "epoch": 2.6936163318211275, + "grad_norm": 0.06117332726716995, + "learning_rate": 7.257752374094244e-05, + "loss": 0.2393, + "step": 33250 + }, + { + "epoch": 2.693697342838626, + "grad_norm": 0.07414082437753677, + "learning_rate": 7.257302308834781e-05, + "loss": 0.2713, + "step": 33251 + }, + { + "epoch": 2.6937783538561244, + "grad_norm": 0.06634288281202316, + "learning_rate": 7.256852243575319e-05, + "loss": 0.2481, + "step": 33252 + }, + { + "epoch": 2.6938593648736227, + "grad_norm": 0.08081548660993576, + "learning_rate": 7.256402178315856e-05, + "loss": 0.2457, + "step": 33253 + }, + { + "epoch": 2.6939403758911213, + "grad_norm": 0.07430370151996613, + "learning_rate": 7.255952113056393e-05, + "loss": 0.2422, + "step": 33254 + }, + { + "epoch": 2.6940213869086196, + "grad_norm": 0.05825252830982208, + "learning_rate": 7.255502047796931e-05, + "loss": 0.2506, + "step": 33255 + }, + { + "epoch": 2.694102397926118, + "grad_norm": 0.06999190151691437, + "learning_rate": 7.255051982537468e-05, + "loss": 0.2349, + "step": 33256 + }, + { + "epoch": 2.6941834089436165, + "grad_norm": 0.0780666321516037, + "learning_rate": 7.254601917278006e-05, + "loss": 0.2406, + "step": 33257 + }, + { + "epoch": 2.6942644199611148, + "grad_norm": 0.06424911320209503, + "learning_rate": 7.254151852018543e-05, + "loss": 0.2341, + "step": 33258 + }, + { + "epoch": 
2.694345430978613, + "grad_norm": 0.06545555591583252, + "learning_rate": 7.25370178675908e-05, + "loss": 0.2404, + "step": 33259 + }, + { + "epoch": 2.6944264419961117, + "grad_norm": 0.07272839546203613, + "learning_rate": 7.253251721499618e-05, + "loss": 0.2459, + "step": 33260 + }, + { + "epoch": 2.69450745301361, + "grad_norm": 0.05883920565247536, + "learning_rate": 7.252801656240155e-05, + "loss": 0.2115, + "step": 33261 + }, + { + "epoch": 2.694588464031108, + "grad_norm": 0.08189603686332703, + "learning_rate": 7.252351590980692e-05, + "loss": 0.2414, + "step": 33262 + }, + { + "epoch": 2.694669475048607, + "grad_norm": 0.061765994876623154, + "learning_rate": 7.25190152572123e-05, + "loss": 0.2222, + "step": 33263 + }, + { + "epoch": 2.694750486066105, + "grad_norm": 0.06287230551242828, + "learning_rate": 7.251451460461768e-05, + "loss": 0.2543, + "step": 33264 + }, + { + "epoch": 2.6948314970836034, + "grad_norm": 0.07255005091428757, + "learning_rate": 7.251001395202304e-05, + "loss": 0.2561, + "step": 33265 + }, + { + "epoch": 2.6949125081011016, + "grad_norm": 0.06739959120750427, + "learning_rate": 7.250551329942842e-05, + "loss": 0.2254, + "step": 33266 + }, + { + "epoch": 2.6949935191186, + "grad_norm": 0.08224621415138245, + "learning_rate": 7.25010126468338e-05, + "loss": 0.2517, + "step": 33267 + }, + { + "epoch": 2.6950745301360985, + "grad_norm": 0.06153412163257599, + "learning_rate": 7.249651199423917e-05, + "loss": 0.2531, + "step": 33268 + }, + { + "epoch": 2.695155541153597, + "grad_norm": 0.06801117956638336, + "learning_rate": 7.249201134164454e-05, + "loss": 0.2405, + "step": 33269 + }, + { + "epoch": 2.695236552171095, + "grad_norm": 0.06874486804008484, + "learning_rate": 7.248751068904993e-05, + "loss": 0.2378, + "step": 33270 + }, + { + "epoch": 2.6953175631885937, + "grad_norm": 0.06314772367477417, + "learning_rate": 7.248301003645529e-05, + "loss": 0.242, + "step": 33271 + }, + { + "epoch": 2.695398574206092, + "grad_norm": 0.08381687849760056, + "learning_rate": 7.247850938386066e-05, + "loss": 0.2253, + "step": 33272 + }, + { + "epoch": 2.69547958522359, + "grad_norm": 0.06956927478313446, + "learning_rate": 7.247400873126605e-05, + "loss": 0.2453, + "step": 33273 + }, + { + "epoch": 2.695560596241089, + "grad_norm": 0.09207741171121597, + "learning_rate": 7.246950807867141e-05, + "loss": 0.2703, + "step": 33274 + }, + { + "epoch": 2.695641607258587, + "grad_norm": 0.07127492129802704, + "learning_rate": 7.246500742607678e-05, + "loss": 0.2314, + "step": 33275 + }, + { + "epoch": 2.6957226182760854, + "grad_norm": 0.07321468740701675, + "learning_rate": 7.246050677348217e-05, + "loss": 0.2373, + "step": 33276 + }, + { + "epoch": 2.695803629293584, + "grad_norm": 0.08432543277740479, + "learning_rate": 7.245600612088753e-05, + "loss": 0.2574, + "step": 33277 + }, + { + "epoch": 2.6958846403110823, + "grad_norm": 0.06618187576532364, + "learning_rate": 7.24515054682929e-05, + "loss": 0.2416, + "step": 33278 + }, + { + "epoch": 2.6959656513285806, + "grad_norm": 0.05841851234436035, + "learning_rate": 7.244700481569829e-05, + "loss": 0.2438, + "step": 33279 + }, + { + "epoch": 2.6960466623460793, + "grad_norm": 0.0806334912776947, + "learning_rate": 7.244250416310365e-05, + "loss": 0.2827, + "step": 33280 + }, + { + "epoch": 2.6961276733635775, + "grad_norm": 0.06472641229629517, + "learning_rate": 7.243800351050902e-05, + "loss": 0.2229, + "step": 33281 + }, + { + "epoch": 2.6962086843810757, + "grad_norm": 0.07949283719062805, + "learning_rate": 
7.243350285791441e-05, + "loss": 0.2679, + "step": 33282 + }, + { + "epoch": 2.6962896953985744, + "grad_norm": 0.06775813549757004, + "learning_rate": 7.242900220531977e-05, + "loss": 0.2363, + "step": 33283 + }, + { + "epoch": 2.6963707064160727, + "grad_norm": 0.06058688834309578, + "learning_rate": 7.242450155272514e-05, + "loss": 0.2377, + "step": 33284 + }, + { + "epoch": 2.696451717433571, + "grad_norm": 0.05882725119590759, + "learning_rate": 7.242000090013053e-05, + "loss": 0.2168, + "step": 33285 + }, + { + "epoch": 2.6965327284510696, + "grad_norm": 0.06747718900442123, + "learning_rate": 7.241550024753589e-05, + "loss": 0.2318, + "step": 33286 + }, + { + "epoch": 2.696613739468568, + "grad_norm": 0.07593017816543579, + "learning_rate": 7.241099959494126e-05, + "loss": 0.232, + "step": 33287 + }, + { + "epoch": 2.696694750486066, + "grad_norm": 0.055422279983758926, + "learning_rate": 7.240649894234665e-05, + "loss": 0.2036, + "step": 33288 + }, + { + "epoch": 2.6967757615035644, + "grad_norm": 0.07576458901166916, + "learning_rate": 7.240199828975201e-05, + "loss": 0.2645, + "step": 33289 + }, + { + "epoch": 2.6968567725210626, + "grad_norm": 0.06819913536310196, + "learning_rate": 7.23974976371574e-05, + "loss": 0.2327, + "step": 33290 + }, + { + "epoch": 2.6969377835385613, + "grad_norm": 0.0817083939909935, + "learning_rate": 7.239299698456277e-05, + "loss": 0.2123, + "step": 33291 + }, + { + "epoch": 2.6970187945560595, + "grad_norm": 0.05920260399580002, + "learning_rate": 7.238849633196813e-05, + "loss": 0.202, + "step": 33292 + }, + { + "epoch": 2.6970998055735578, + "grad_norm": 0.08613543212413788, + "learning_rate": 7.238399567937352e-05, + "loss": 0.2552, + "step": 33293 + }, + { + "epoch": 2.6971808165910565, + "grad_norm": 0.07524523138999939, + "learning_rate": 7.237949502677889e-05, + "loss": 0.2334, + "step": 33294 + }, + { + "epoch": 2.6972618276085547, + "grad_norm": 0.07589299231767654, + "learning_rate": 7.237499437418425e-05, + "loss": 0.2353, + "step": 33295 + }, + { + "epoch": 2.697342838626053, + "grad_norm": 0.06346601247787476, + "learning_rate": 7.237049372158964e-05, + "loss": 0.2238, + "step": 33296 + }, + { + "epoch": 2.6974238496435516, + "grad_norm": 0.09287595748901367, + "learning_rate": 7.236599306899501e-05, + "loss": 0.2437, + "step": 33297 + }, + { + "epoch": 2.69750486066105, + "grad_norm": 0.07111447304487228, + "learning_rate": 7.236149241640037e-05, + "loss": 0.2139, + "step": 33298 + }, + { + "epoch": 2.697585871678548, + "grad_norm": 0.0682421624660492, + "learning_rate": 7.235699176380576e-05, + "loss": 0.2269, + "step": 33299 + }, + { + "epoch": 2.697666882696047, + "grad_norm": 0.07002420723438263, + "learning_rate": 7.235249111121113e-05, + "loss": 0.2146, + "step": 33300 + }, + { + "epoch": 2.697747893713545, + "grad_norm": 0.07240556925535202, + "learning_rate": 7.23479904586165e-05, + "loss": 0.2509, + "step": 33301 + }, + { + "epoch": 2.6978289047310433, + "grad_norm": 0.07018124312162399, + "learning_rate": 7.234348980602188e-05, + "loss": 0.2489, + "step": 33302 + }, + { + "epoch": 2.697909915748542, + "grad_norm": 0.08013798296451569, + "learning_rate": 7.233898915342725e-05, + "loss": 0.2603, + "step": 33303 + }, + { + "epoch": 2.6979909267660402, + "grad_norm": 0.07782293856143951, + "learning_rate": 7.233448850083262e-05, + "loss": 0.2498, + "step": 33304 + }, + { + "epoch": 2.6980719377835385, + "grad_norm": 0.07612436264753342, + "learning_rate": 7.2329987848238e-05, + "loss": 0.2671, + "step": 33305 + }, + { + 
"epoch": 2.698152948801037, + "grad_norm": 0.0673048198223114, + "learning_rate": 7.232548719564338e-05, + "loss": 0.2325, + "step": 33306 + }, + { + "epoch": 2.6982339598185354, + "grad_norm": 0.06546095013618469, + "learning_rate": 7.232098654304874e-05, + "loss": 0.209, + "step": 33307 + }, + { + "epoch": 2.6983149708360337, + "grad_norm": 0.06932816654443741, + "learning_rate": 7.231648589045412e-05, + "loss": 0.2107, + "step": 33308 + }, + { + "epoch": 2.6983959818535324, + "grad_norm": 0.04809468239545822, + "learning_rate": 7.23119852378595e-05, + "loss": 0.2242, + "step": 33309 + }, + { + "epoch": 2.6984769928710306, + "grad_norm": 0.07442724704742432, + "learning_rate": 7.230748458526486e-05, + "loss": 0.2577, + "step": 33310 + }, + { + "epoch": 2.698558003888529, + "grad_norm": 0.06932736188173294, + "learning_rate": 7.230298393267024e-05, + "loss": 0.2478, + "step": 33311 + }, + { + "epoch": 2.698639014906027, + "grad_norm": 0.06765572726726532, + "learning_rate": 7.229848328007562e-05, + "loss": 0.1954, + "step": 33312 + }, + { + "epoch": 2.6987200259235253, + "grad_norm": 0.07260085642337799, + "learning_rate": 7.229398262748098e-05, + "loss": 0.2151, + "step": 33313 + }, + { + "epoch": 2.698801036941024, + "grad_norm": 0.07135795801877975, + "learning_rate": 7.228948197488636e-05, + "loss": 0.2522, + "step": 33314 + }, + { + "epoch": 2.6988820479585223, + "grad_norm": 0.06212751194834709, + "learning_rate": 7.228498132229174e-05, + "loss": 0.1974, + "step": 33315 + }, + { + "epoch": 2.6989630589760205, + "grad_norm": 0.07515834271907806, + "learning_rate": 7.228048066969711e-05, + "loss": 0.2718, + "step": 33316 + }, + { + "epoch": 2.699044069993519, + "grad_norm": 0.08626675605773926, + "learning_rate": 7.227598001710249e-05, + "loss": 0.2927, + "step": 33317 + }, + { + "epoch": 2.6991250810110174, + "grad_norm": 0.062219806015491486, + "learning_rate": 7.227147936450786e-05, + "loss": 0.226, + "step": 33318 + }, + { + "epoch": 2.6992060920285157, + "grad_norm": 0.05566618964076042, + "learning_rate": 7.226697871191323e-05, + "loss": 0.2235, + "step": 33319 + }, + { + "epoch": 2.6992871030460144, + "grad_norm": 0.07704995572566986, + "learning_rate": 7.22624780593186e-05, + "loss": 0.2252, + "step": 33320 + }, + { + "epoch": 2.6993681140635126, + "grad_norm": 0.06611926108598709, + "learning_rate": 7.225797740672398e-05, + "loss": 0.2323, + "step": 33321 + }, + { + "epoch": 2.699449125081011, + "grad_norm": 0.08879465609788895, + "learning_rate": 7.225347675412935e-05, + "loss": 0.2155, + "step": 33322 + }, + { + "epoch": 2.6995301360985096, + "grad_norm": 0.06557431817054749, + "learning_rate": 7.224897610153473e-05, + "loss": 0.2402, + "step": 33323 + }, + { + "epoch": 2.699611147116008, + "grad_norm": 0.06380093842744827, + "learning_rate": 7.22444754489401e-05, + "loss": 0.2188, + "step": 33324 + }, + { + "epoch": 2.699692158133506, + "grad_norm": 0.075450100004673, + "learning_rate": 7.223997479634547e-05, + "loss": 0.2284, + "step": 33325 + }, + { + "epoch": 2.6997731691510047, + "grad_norm": 0.0675964429974556, + "learning_rate": 7.223547414375085e-05, + "loss": 0.2116, + "step": 33326 + }, + { + "epoch": 2.699854180168503, + "grad_norm": 0.0674784854054451, + "learning_rate": 7.223097349115622e-05, + "loss": 0.256, + "step": 33327 + }, + { + "epoch": 2.6999351911860012, + "grad_norm": 0.08821087330579758, + "learning_rate": 7.22264728385616e-05, + "loss": 0.2311, + "step": 33328 + }, + { + "epoch": 2.7000162022035, + "grad_norm": 0.0646054744720459, + 
"learning_rate": 7.222197218596697e-05, + "loss": 0.2302, + "step": 33329 + }, + { + "epoch": 2.700097213220998, + "grad_norm": 0.07179594039916992, + "learning_rate": 7.221747153337234e-05, + "loss": 0.277, + "step": 33330 + }, + { + "epoch": 2.7001782242384964, + "grad_norm": 0.07757652550935745, + "learning_rate": 7.221297088077772e-05, + "loss": 0.2516, + "step": 33331 + }, + { + "epoch": 2.7002592352559946, + "grad_norm": 0.06800487637519836, + "learning_rate": 7.220847022818309e-05, + "loss": 0.256, + "step": 33332 + }, + { + "epoch": 2.7003402462734933, + "grad_norm": 0.06418580561876297, + "learning_rate": 7.220396957558846e-05, + "loss": 0.2208, + "step": 33333 + }, + { + "epoch": 2.7004212572909916, + "grad_norm": 0.10417748242616653, + "learning_rate": 7.219946892299384e-05, + "loss": 0.301, + "step": 33334 + }, + { + "epoch": 2.70050226830849, + "grad_norm": 0.06674910336732864, + "learning_rate": 7.219496827039921e-05, + "loss": 0.2428, + "step": 33335 + }, + { + "epoch": 2.700583279325988, + "grad_norm": 0.052473656833171844, + "learning_rate": 7.219046761780458e-05, + "loss": 0.1834, + "step": 33336 + }, + { + "epoch": 2.7006642903434868, + "grad_norm": 0.07685059309005737, + "learning_rate": 7.218596696520996e-05, + "loss": 0.2553, + "step": 33337 + }, + { + "epoch": 2.700745301360985, + "grad_norm": 0.062406525015830994, + "learning_rate": 7.218146631261533e-05, + "loss": 0.2381, + "step": 33338 + }, + { + "epoch": 2.7008263123784833, + "grad_norm": 0.0681590810418129, + "learning_rate": 7.21769656600207e-05, + "loss": 0.2208, + "step": 33339 + }, + { + "epoch": 2.700907323395982, + "grad_norm": 0.06955277919769287, + "learning_rate": 7.217246500742608e-05, + "loss": 0.2628, + "step": 33340 + }, + { + "epoch": 2.70098833441348, + "grad_norm": 0.07531130313873291, + "learning_rate": 7.216796435483145e-05, + "loss": 0.2291, + "step": 33341 + }, + { + "epoch": 2.7010693454309784, + "grad_norm": 0.0607423335313797, + "learning_rate": 7.216346370223684e-05, + "loss": 0.208, + "step": 33342 + }, + { + "epoch": 2.701150356448477, + "grad_norm": 0.0653001219034195, + "learning_rate": 7.21589630496422e-05, + "loss": 0.232, + "step": 33343 + }, + { + "epoch": 2.7012313674659754, + "grad_norm": 0.07373838126659393, + "learning_rate": 7.215446239704757e-05, + "loss": 0.2258, + "step": 33344 + }, + { + "epoch": 2.7013123784834736, + "grad_norm": 0.07639024406671524, + "learning_rate": 7.214996174445296e-05, + "loss": 0.2367, + "step": 33345 + }, + { + "epoch": 2.7013933895009723, + "grad_norm": 0.06796936690807343, + "learning_rate": 7.214546109185832e-05, + "loss": 0.2381, + "step": 33346 + }, + { + "epoch": 2.7014744005184705, + "grad_norm": 0.07707379758358002, + "learning_rate": 7.21409604392637e-05, + "loss": 0.2342, + "step": 33347 + }, + { + "epoch": 2.701555411535969, + "grad_norm": 0.06277401000261307, + "learning_rate": 7.213645978666908e-05, + "loss": 0.2317, + "step": 33348 + }, + { + "epoch": 2.7016364225534675, + "grad_norm": 0.05920235440135002, + "learning_rate": 7.213195913407444e-05, + "loss": 0.2417, + "step": 33349 + }, + { + "epoch": 2.7017174335709657, + "grad_norm": 0.072438545525074, + "learning_rate": 7.212745848147981e-05, + "loss": 0.2415, + "step": 33350 + }, + { + "epoch": 2.701798444588464, + "grad_norm": 0.07473402470350266, + "learning_rate": 7.21229578288852e-05, + "loss": 0.2696, + "step": 33351 + }, + { + "epoch": 2.7018794556059627, + "grad_norm": 0.07099024951457977, + "learning_rate": 7.211845717629056e-05, + "loss": 0.2352, + "step": 33352 + }, + 
{ + "epoch": 2.701960466623461, + "grad_norm": 0.06595303118228912, + "learning_rate": 7.211395652369594e-05, + "loss": 0.2281, + "step": 33353 + }, + { + "epoch": 2.702041477640959, + "grad_norm": 0.067349873483181, + "learning_rate": 7.210945587110132e-05, + "loss": 0.2216, + "step": 33354 + }, + { + "epoch": 2.7021224886584574, + "grad_norm": 0.0640546903014183, + "learning_rate": 7.210495521850668e-05, + "loss": 0.2381, + "step": 33355 + }, + { + "epoch": 2.702203499675956, + "grad_norm": 0.07349754124879837, + "learning_rate": 7.210045456591206e-05, + "loss": 0.3067, + "step": 33356 + }, + { + "epoch": 2.7022845106934543, + "grad_norm": 0.0878182053565979, + "learning_rate": 7.209595391331744e-05, + "loss": 0.2244, + "step": 33357 + }, + { + "epoch": 2.7023655217109526, + "grad_norm": 0.06644090265035629, + "learning_rate": 7.20914532607228e-05, + "loss": 0.2537, + "step": 33358 + }, + { + "epoch": 2.702446532728451, + "grad_norm": 0.067959725856781, + "learning_rate": 7.208695260812818e-05, + "loss": 0.2672, + "step": 33359 + }, + { + "epoch": 2.7025275437459495, + "grad_norm": 0.06765919923782349, + "learning_rate": 7.208245195553356e-05, + "loss": 0.2533, + "step": 33360 + }, + { + "epoch": 2.7026085547634477, + "grad_norm": 0.07062802463769913, + "learning_rate": 7.207795130293892e-05, + "loss": 0.2377, + "step": 33361 + }, + { + "epoch": 2.702689565780946, + "grad_norm": 0.07054631412029266, + "learning_rate": 7.20734506503443e-05, + "loss": 0.2494, + "step": 33362 + }, + { + "epoch": 2.7027705767984447, + "grad_norm": 0.07606198638677597, + "learning_rate": 7.206894999774968e-05, + "loss": 0.246, + "step": 33363 + }, + { + "epoch": 2.702851587815943, + "grad_norm": 0.08337026089429855, + "learning_rate": 7.206444934515504e-05, + "loss": 0.2349, + "step": 33364 + }, + { + "epoch": 2.702932598833441, + "grad_norm": 0.0638199970126152, + "learning_rate": 7.205994869256042e-05, + "loss": 0.2243, + "step": 33365 + }, + { + "epoch": 2.70301360985094, + "grad_norm": 0.08534477651119232, + "learning_rate": 7.20554480399658e-05, + "loss": 0.2551, + "step": 33366 + }, + { + "epoch": 2.703094620868438, + "grad_norm": 0.06455480307340622, + "learning_rate": 7.205094738737117e-05, + "loss": 0.2196, + "step": 33367 + }, + { + "epoch": 2.7031756318859363, + "grad_norm": 0.07369302958250046, + "learning_rate": 7.204644673477655e-05, + "loss": 0.223, + "step": 33368 + }, + { + "epoch": 2.703256642903435, + "grad_norm": 0.06185693293809891, + "learning_rate": 7.204194608218193e-05, + "loss": 0.2207, + "step": 33369 + }, + { + "epoch": 2.7033376539209333, + "grad_norm": 0.0870005264878273, + "learning_rate": 7.203744542958729e-05, + "loss": 0.2543, + "step": 33370 + }, + { + "epoch": 2.7034186649384315, + "grad_norm": 0.0734618604183197, + "learning_rate": 7.203294477699267e-05, + "loss": 0.2882, + "step": 33371 + }, + { + "epoch": 2.70349967595593, + "grad_norm": 0.08058614283800125, + "learning_rate": 7.202844412439805e-05, + "loss": 0.2344, + "step": 33372 + }, + { + "epoch": 2.7035806869734285, + "grad_norm": 0.06244783475995064, + "learning_rate": 7.202394347180341e-05, + "loss": 0.2252, + "step": 33373 + }, + { + "epoch": 2.7036616979909267, + "grad_norm": 0.06978266686201096, + "learning_rate": 7.20194428192088e-05, + "loss": 0.2308, + "step": 33374 + }, + { + "epoch": 2.7037427090084254, + "grad_norm": 0.0777072086930275, + "learning_rate": 7.201494216661417e-05, + "loss": 0.2629, + "step": 33375 + }, + { + "epoch": 2.7038237200259236, + "grad_norm": 0.07009978592395782, + "learning_rate": 
7.201044151401953e-05, + "loss": 0.2857, + "step": 33376 + }, + { + "epoch": 2.703904731043422, + "grad_norm": 0.07170268893241882, + "learning_rate": 7.200594086142492e-05, + "loss": 0.2273, + "step": 33377 + }, + { + "epoch": 2.70398574206092, + "grad_norm": 0.06669243425130844, + "learning_rate": 7.200144020883029e-05, + "loss": 0.2396, + "step": 33378 + }, + { + "epoch": 2.704066753078419, + "grad_norm": 0.0675826445221901, + "learning_rate": 7.199693955623565e-05, + "loss": 0.2302, + "step": 33379 + }, + { + "epoch": 2.704147764095917, + "grad_norm": 0.07260756194591522, + "learning_rate": 7.199243890364104e-05, + "loss": 0.2338, + "step": 33380 + }, + { + "epoch": 2.7042287751134153, + "grad_norm": 0.06512048840522766, + "learning_rate": 7.198793825104641e-05, + "loss": 0.2291, + "step": 33381 + }, + { + "epoch": 2.7043097861309136, + "grad_norm": 0.06933195143938065, + "learning_rate": 7.198343759845177e-05, + "loss": 0.2877, + "step": 33382 + }, + { + "epoch": 2.7043907971484122, + "grad_norm": 0.06995735317468643, + "learning_rate": 7.197893694585716e-05, + "loss": 0.2208, + "step": 33383 + }, + { + "epoch": 2.7044718081659105, + "grad_norm": 0.07151626795530319, + "learning_rate": 7.197443629326253e-05, + "loss": 0.236, + "step": 33384 + }, + { + "epoch": 2.7045528191834087, + "grad_norm": 0.08576501160860062, + "learning_rate": 7.196993564066789e-05, + "loss": 0.2119, + "step": 33385 + }, + { + "epoch": 2.7046338302009074, + "grad_norm": 0.06742434948682785, + "learning_rate": 7.196543498807328e-05, + "loss": 0.2228, + "step": 33386 + }, + { + "epoch": 2.7047148412184057, + "grad_norm": 0.07232806086540222, + "learning_rate": 7.196093433547865e-05, + "loss": 0.2335, + "step": 33387 + }, + { + "epoch": 2.704795852235904, + "grad_norm": 0.06070084869861603, + "learning_rate": 7.195643368288401e-05, + "loss": 0.2159, + "step": 33388 + }, + { + "epoch": 2.7048768632534026, + "grad_norm": 0.0777098536491394, + "learning_rate": 7.19519330302894e-05, + "loss": 0.2654, + "step": 33389 + }, + { + "epoch": 2.704957874270901, + "grad_norm": 0.07623117417097092, + "learning_rate": 7.194743237769477e-05, + "loss": 0.2627, + "step": 33390 + }, + { + "epoch": 2.705038885288399, + "grad_norm": 0.07137143611907959, + "learning_rate": 7.194293172510013e-05, + "loss": 0.1909, + "step": 33391 + }, + { + "epoch": 2.7051198963058978, + "grad_norm": 0.06881123781204224, + "learning_rate": 7.193843107250552e-05, + "loss": 0.2623, + "step": 33392 + }, + { + "epoch": 2.705200907323396, + "grad_norm": 0.07806967943906784, + "learning_rate": 7.193393041991089e-05, + "loss": 0.2565, + "step": 33393 + }, + { + "epoch": 2.7052819183408943, + "grad_norm": 0.05773887410759926, + "learning_rate": 7.192942976731627e-05, + "loss": 0.2081, + "step": 33394 + }, + { + "epoch": 2.705362929358393, + "grad_norm": 0.06873015314340591, + "learning_rate": 7.192492911472164e-05, + "loss": 0.2472, + "step": 33395 + }, + { + "epoch": 2.705443940375891, + "grad_norm": 0.07720614224672318, + "learning_rate": 7.192042846212701e-05, + "loss": 0.2222, + "step": 33396 + }, + { + "epoch": 2.7055249513933894, + "grad_norm": 0.0625235065817833, + "learning_rate": 7.191592780953239e-05, + "loss": 0.2268, + "step": 33397 + }, + { + "epoch": 2.705605962410888, + "grad_norm": 0.06290551275014877, + "learning_rate": 7.191142715693776e-05, + "loss": 0.2327, + "step": 33398 + }, + { + "epoch": 2.7056869734283864, + "grad_norm": 0.05978340655565262, + "learning_rate": 7.190692650434313e-05, + "loss": 0.2379, + "step": 33399 + }, + { + 
"epoch": 2.7057679844458846, + "grad_norm": 0.06658025830984116, + "learning_rate": 7.190242585174851e-05, + "loss": 0.2415, + "step": 33400 + }, + { + "epoch": 2.705848995463383, + "grad_norm": 0.04890163242816925, + "learning_rate": 7.189792519915388e-05, + "loss": 0.2447, + "step": 33401 + }, + { + "epoch": 2.7059300064808816, + "grad_norm": 0.06032396852970123, + "learning_rate": 7.189342454655926e-05, + "loss": 0.2411, + "step": 33402 + }, + { + "epoch": 2.70601101749838, + "grad_norm": 0.07358459383249283, + "learning_rate": 7.188892389396463e-05, + "loss": 0.2707, + "step": 33403 + }, + { + "epoch": 2.706092028515878, + "grad_norm": 0.07115450501441956, + "learning_rate": 7.188442324137e-05, + "loss": 0.2074, + "step": 33404 + }, + { + "epoch": 2.7061730395333763, + "grad_norm": 0.08676160871982574, + "learning_rate": 7.187992258877538e-05, + "loss": 0.267, + "step": 33405 + }, + { + "epoch": 2.706254050550875, + "grad_norm": 0.07879418134689331, + "learning_rate": 7.187542193618075e-05, + "loss": 0.2451, + "step": 33406 + }, + { + "epoch": 2.7063350615683732, + "grad_norm": 0.06741847097873688, + "learning_rate": 7.187092128358612e-05, + "loss": 0.232, + "step": 33407 + }, + { + "epoch": 2.7064160725858715, + "grad_norm": 0.06861771643161774, + "learning_rate": 7.18664206309915e-05, + "loss": 0.2397, + "step": 33408 + }, + { + "epoch": 2.70649708360337, + "grad_norm": 0.06544741988182068, + "learning_rate": 7.186191997839687e-05, + "loss": 0.206, + "step": 33409 + }, + { + "epoch": 2.7065780946208684, + "grad_norm": 0.07405445724725723, + "learning_rate": 7.185741932580224e-05, + "loss": 0.2349, + "step": 33410 + }, + { + "epoch": 2.7066591056383666, + "grad_norm": 0.06363479048013687, + "learning_rate": 7.185291867320762e-05, + "loss": 0.2493, + "step": 33411 + }, + { + "epoch": 2.7067401166558653, + "grad_norm": 0.06907227635383606, + "learning_rate": 7.184841802061299e-05, + "loss": 0.2485, + "step": 33412 + }, + { + "epoch": 2.7068211276733636, + "grad_norm": 0.07298295199871063, + "learning_rate": 7.184391736801836e-05, + "loss": 0.2263, + "step": 33413 + }, + { + "epoch": 2.706902138690862, + "grad_norm": 0.07148992270231247, + "learning_rate": 7.183941671542374e-05, + "loss": 0.237, + "step": 33414 + }, + { + "epoch": 2.7069831497083605, + "grad_norm": 0.057731013745069504, + "learning_rate": 7.183491606282911e-05, + "loss": 0.2101, + "step": 33415 + }, + { + "epoch": 2.7070641607258588, + "grad_norm": 0.07263604551553726, + "learning_rate": 7.183041541023449e-05, + "loss": 0.2743, + "step": 33416 + }, + { + "epoch": 2.707145171743357, + "grad_norm": 0.06796883046627045, + "learning_rate": 7.182591475763986e-05, + "loss": 0.2423, + "step": 33417 + }, + { + "epoch": 2.7072261827608557, + "grad_norm": 0.06539809703826904, + "learning_rate": 7.182141410504523e-05, + "loss": 0.2331, + "step": 33418 + }, + { + "epoch": 2.707307193778354, + "grad_norm": 0.061306267976760864, + "learning_rate": 7.18169134524506e-05, + "loss": 0.2238, + "step": 33419 + }, + { + "epoch": 2.707388204795852, + "grad_norm": 0.07076717913150787, + "learning_rate": 7.181241279985598e-05, + "loss": 0.2162, + "step": 33420 + }, + { + "epoch": 2.707469215813351, + "grad_norm": 0.07293318212032318, + "learning_rate": 7.180791214726135e-05, + "loss": 0.2403, + "step": 33421 + }, + { + "epoch": 2.707550226830849, + "grad_norm": 0.06484469026327133, + "learning_rate": 7.180341149466673e-05, + "loss": 0.2588, + "step": 33422 + }, + { + "epoch": 2.7076312378483474, + "grad_norm": 0.06189373508095741, + 
"learning_rate": 7.179891084207211e-05, + "loss": 0.2457, + "step": 33423 + }, + { + "epoch": 2.7077122488658456, + "grad_norm": 0.06705143302679062, + "learning_rate": 7.179441018947747e-05, + "loss": 0.2319, + "step": 33424 + }, + { + "epoch": 2.7077932598833443, + "grad_norm": 0.06246310472488403, + "learning_rate": 7.178990953688285e-05, + "loss": 0.2564, + "step": 33425 + }, + { + "epoch": 2.7078742709008425, + "grad_norm": 0.05720631778240204, + "learning_rate": 7.178540888428824e-05, + "loss": 0.2121, + "step": 33426 + }, + { + "epoch": 2.707955281918341, + "grad_norm": 0.06298079341650009, + "learning_rate": 7.17809082316936e-05, + "loss": 0.2262, + "step": 33427 + }, + { + "epoch": 2.708036292935839, + "grad_norm": 0.07074960321187973, + "learning_rate": 7.177640757909897e-05, + "loss": 0.263, + "step": 33428 + }, + { + "epoch": 2.7081173039533377, + "grad_norm": 0.052444297820329666, + "learning_rate": 7.177190692650436e-05, + "loss": 0.2398, + "step": 33429 + }, + { + "epoch": 2.708198314970836, + "grad_norm": 0.09744896739721298, + "learning_rate": 7.176740627390972e-05, + "loss": 0.288, + "step": 33430 + }, + { + "epoch": 2.708279325988334, + "grad_norm": 0.06278154999017715, + "learning_rate": 7.176290562131509e-05, + "loss": 0.2493, + "step": 33431 + }, + { + "epoch": 2.708360337005833, + "grad_norm": 0.06635581701993942, + "learning_rate": 7.175840496872048e-05, + "loss": 0.2874, + "step": 33432 + }, + { + "epoch": 2.708441348023331, + "grad_norm": 0.06772460788488388, + "learning_rate": 7.175390431612584e-05, + "loss": 0.2096, + "step": 33433 + }, + { + "epoch": 2.7085223590408294, + "grad_norm": 0.07880239188671112, + "learning_rate": 7.174940366353121e-05, + "loss": 0.2606, + "step": 33434 + }, + { + "epoch": 2.708603370058328, + "grad_norm": 0.06576960533857346, + "learning_rate": 7.17449030109366e-05, + "loss": 0.2168, + "step": 33435 + }, + { + "epoch": 2.7086843810758263, + "grad_norm": 0.06567549705505371, + "learning_rate": 7.174040235834196e-05, + "loss": 0.229, + "step": 33436 + }, + { + "epoch": 2.7087653920933246, + "grad_norm": 0.08709168434143066, + "learning_rate": 7.173590170574733e-05, + "loss": 0.281, + "step": 33437 + }, + { + "epoch": 2.7088464031108233, + "grad_norm": 0.06958389282226562, + "learning_rate": 7.173140105315272e-05, + "loss": 0.2553, + "step": 33438 + }, + { + "epoch": 2.7089274141283215, + "grad_norm": 0.07254193723201752, + "learning_rate": 7.172690040055808e-05, + "loss": 0.2231, + "step": 33439 + }, + { + "epoch": 2.7090084251458197, + "grad_norm": 0.06497646123170853, + "learning_rate": 7.172239974796345e-05, + "loss": 0.2209, + "step": 33440 + }, + { + "epoch": 2.7090894361633184, + "grad_norm": 0.0700288861989975, + "learning_rate": 7.171789909536884e-05, + "loss": 0.2325, + "step": 33441 + }, + { + "epoch": 2.7091704471808167, + "grad_norm": 0.0581977441906929, + "learning_rate": 7.17133984427742e-05, + "loss": 0.2221, + "step": 33442 + }, + { + "epoch": 2.709251458198315, + "grad_norm": 0.07684613764286041, + "learning_rate": 7.170889779017957e-05, + "loss": 0.2357, + "step": 33443 + }, + { + "epoch": 2.7093324692158136, + "grad_norm": 0.07410931587219238, + "learning_rate": 7.170439713758496e-05, + "loss": 0.247, + "step": 33444 + }, + { + "epoch": 2.709413480233312, + "grad_norm": 0.07118923217058182, + "learning_rate": 7.169989648499032e-05, + "loss": 0.2413, + "step": 33445 + }, + { + "epoch": 2.70949449125081, + "grad_norm": 0.0676896721124649, + "learning_rate": 7.16953958323957e-05, + "loss": 0.2431, + "step": 33446 + }, + 
{ + "epoch": 2.7095755022683083, + "grad_norm": 0.06472186744213104, + "learning_rate": 7.169089517980108e-05, + "loss": 0.2399, + "step": 33447 + }, + { + "epoch": 2.709656513285807, + "grad_norm": 0.060834795236587524, + "learning_rate": 7.168639452720644e-05, + "loss": 0.2208, + "step": 33448 + }, + { + "epoch": 2.7097375243033053, + "grad_norm": 0.06608045101165771, + "learning_rate": 7.168189387461183e-05, + "loss": 0.2341, + "step": 33449 + }, + { + "epoch": 2.7098185353208035, + "grad_norm": 0.07285463064908981, + "learning_rate": 7.16773932220172e-05, + "loss": 0.2401, + "step": 33450 + }, + { + "epoch": 2.7098995463383018, + "grad_norm": 0.06280604004859924, + "learning_rate": 7.167289256942256e-05, + "loss": 0.2285, + "step": 33451 + }, + { + "epoch": 2.7099805573558005, + "grad_norm": 0.07557007670402527, + "learning_rate": 7.166839191682795e-05, + "loss": 0.2702, + "step": 33452 + }, + { + "epoch": 2.7100615683732987, + "grad_norm": 0.06775477528572083, + "learning_rate": 7.166389126423332e-05, + "loss": 0.2416, + "step": 33453 + }, + { + "epoch": 2.710142579390797, + "grad_norm": 0.07118985056877136, + "learning_rate": 7.165939061163868e-05, + "loss": 0.2498, + "step": 33454 + }, + { + "epoch": 2.7102235904082956, + "grad_norm": 0.06145792827010155, + "learning_rate": 7.165488995904407e-05, + "loss": 0.2484, + "step": 33455 + }, + { + "epoch": 2.710304601425794, + "grad_norm": 0.08589167147874832, + "learning_rate": 7.165038930644944e-05, + "loss": 0.2659, + "step": 33456 + }, + { + "epoch": 2.710385612443292, + "grad_norm": 0.07768699526786804, + "learning_rate": 7.16458886538548e-05, + "loss": 0.2092, + "step": 33457 + }, + { + "epoch": 2.710466623460791, + "grad_norm": 0.08188536763191223, + "learning_rate": 7.164138800126019e-05, + "loss": 0.2414, + "step": 33458 + }, + { + "epoch": 2.710547634478289, + "grad_norm": 0.05485611408948898, + "learning_rate": 7.163688734866556e-05, + "loss": 0.2101, + "step": 33459 + }, + { + "epoch": 2.7106286454957873, + "grad_norm": 0.0856386050581932, + "learning_rate": 7.163238669607092e-05, + "loss": 0.2819, + "step": 33460 + }, + { + "epoch": 2.710709656513286, + "grad_norm": 0.05618041381239891, + "learning_rate": 7.162788604347631e-05, + "loss": 0.2136, + "step": 33461 + }, + { + "epoch": 2.7107906675307842, + "grad_norm": 0.0732741504907608, + "learning_rate": 7.162338539088169e-05, + "loss": 0.2406, + "step": 33462 + }, + { + "epoch": 2.7108716785482825, + "grad_norm": 0.0713251456618309, + "learning_rate": 7.161888473828705e-05, + "loss": 0.2269, + "step": 33463 + }, + { + "epoch": 2.710952689565781, + "grad_norm": 0.06645556539297104, + "learning_rate": 7.161438408569243e-05, + "loss": 0.2355, + "step": 33464 + }, + { + "epoch": 2.7110337005832794, + "grad_norm": 0.06869667023420334, + "learning_rate": 7.16098834330978e-05, + "loss": 0.2426, + "step": 33465 + }, + { + "epoch": 2.7111147116007777, + "grad_norm": 0.06282439827919006, + "learning_rate": 7.160538278050317e-05, + "loss": 0.2451, + "step": 33466 + }, + { + "epoch": 2.7111957226182763, + "grad_norm": 0.06767906248569489, + "learning_rate": 7.160088212790855e-05, + "loss": 0.204, + "step": 33467 + }, + { + "epoch": 2.7112767336357746, + "grad_norm": 0.07737169414758682, + "learning_rate": 7.159638147531393e-05, + "loss": 0.277, + "step": 33468 + }, + { + "epoch": 2.711357744653273, + "grad_norm": 0.06870124489068985, + "learning_rate": 7.159188082271929e-05, + "loss": 0.2128, + "step": 33469 + }, + { + "epoch": 2.711438755670771, + "grad_norm": 0.06312039494514465, + 
"learning_rate": 7.158738017012467e-05, + "loss": 0.2247, + "step": 33470 + }, + { + "epoch": 2.7115197666882693, + "grad_norm": 0.08233053237199783, + "learning_rate": 7.158287951753005e-05, + "loss": 0.2808, + "step": 33471 + }, + { + "epoch": 2.711600777705768, + "grad_norm": 0.09459495544433594, + "learning_rate": 7.157837886493541e-05, + "loss": 0.2381, + "step": 33472 + }, + { + "epoch": 2.7116817887232663, + "grad_norm": 0.06900796294212341, + "learning_rate": 7.15738782123408e-05, + "loss": 0.2062, + "step": 33473 + }, + { + "epoch": 2.7117627997407645, + "grad_norm": 0.061789944767951965, + "learning_rate": 7.156937755974617e-05, + "loss": 0.2018, + "step": 33474 + }, + { + "epoch": 2.711843810758263, + "grad_norm": 0.07610882818698883, + "learning_rate": 7.156487690715154e-05, + "loss": 0.2645, + "step": 33475 + }, + { + "epoch": 2.7119248217757614, + "grad_norm": 0.06013292446732521, + "learning_rate": 7.156037625455692e-05, + "loss": 0.2092, + "step": 33476 + }, + { + "epoch": 2.7120058327932597, + "grad_norm": 0.08542242646217346, + "learning_rate": 7.155587560196229e-05, + "loss": 0.2674, + "step": 33477 + }, + { + "epoch": 2.7120868438107584, + "grad_norm": 0.06455022841691971, + "learning_rate": 7.155137494936766e-05, + "loss": 0.2131, + "step": 33478 + }, + { + "epoch": 2.7121678548282566, + "grad_norm": 0.07186616957187653, + "learning_rate": 7.154687429677304e-05, + "loss": 0.23, + "step": 33479 + }, + { + "epoch": 2.712248865845755, + "grad_norm": 0.07408653199672699, + "learning_rate": 7.154237364417841e-05, + "loss": 0.2427, + "step": 33480 + }, + { + "epoch": 2.7123298768632536, + "grad_norm": 0.07832899689674377, + "learning_rate": 7.153787299158378e-05, + "loss": 0.3104, + "step": 33481 + }, + { + "epoch": 2.712410887880752, + "grad_norm": 0.05991185083985329, + "learning_rate": 7.153337233898916e-05, + "loss": 0.2335, + "step": 33482 + }, + { + "epoch": 2.71249189889825, + "grad_norm": 0.06915418058633804, + "learning_rate": 7.152887168639453e-05, + "loss": 0.2403, + "step": 33483 + }, + { + "epoch": 2.7125729099157487, + "grad_norm": 0.06852851808071136, + "learning_rate": 7.15243710337999e-05, + "loss": 0.2568, + "step": 33484 + }, + { + "epoch": 2.712653920933247, + "grad_norm": 0.06072534993290901, + "learning_rate": 7.151987038120528e-05, + "loss": 0.2297, + "step": 33485 + }, + { + "epoch": 2.712734931950745, + "grad_norm": 0.07479368150234222, + "learning_rate": 7.151536972861065e-05, + "loss": 0.2208, + "step": 33486 + }, + { + "epoch": 2.712815942968244, + "grad_norm": 0.0576009564101696, + "learning_rate": 7.151086907601603e-05, + "loss": 0.2207, + "step": 33487 + }, + { + "epoch": 2.712896953985742, + "grad_norm": 0.06572145223617554, + "learning_rate": 7.15063684234214e-05, + "loss": 0.2381, + "step": 33488 + }, + { + "epoch": 2.7129779650032404, + "grad_norm": 0.06663456559181213, + "learning_rate": 7.150186777082677e-05, + "loss": 0.2246, + "step": 33489 + }, + { + "epoch": 2.713058976020739, + "grad_norm": 0.06880438327789307, + "learning_rate": 7.149736711823215e-05, + "loss": 0.2555, + "step": 33490 + }, + { + "epoch": 2.7131399870382373, + "grad_norm": 0.08984614163637161, + "learning_rate": 7.149286646563752e-05, + "loss": 0.2848, + "step": 33491 + }, + { + "epoch": 2.7132209980557356, + "grad_norm": 0.06485553085803986, + "learning_rate": 7.148836581304289e-05, + "loss": 0.2543, + "step": 33492 + }, + { + "epoch": 2.713302009073234, + "grad_norm": 0.0725436732172966, + "learning_rate": 7.148386516044827e-05, + "loss": 0.2744, + "step": 33493 + 
}, + { + "epoch": 2.713383020090732, + "grad_norm": 0.05967855826020241, + "learning_rate": 7.147936450785364e-05, + "loss": 0.1979, + "step": 33494 + }, + { + "epoch": 2.7134640311082308, + "grad_norm": 0.06330663710832596, + "learning_rate": 7.147486385525901e-05, + "loss": 0.222, + "step": 33495 + }, + { + "epoch": 2.713545042125729, + "grad_norm": 0.07709506154060364, + "learning_rate": 7.147036320266439e-05, + "loss": 0.2257, + "step": 33496 + }, + { + "epoch": 2.7136260531432272, + "grad_norm": 0.06852762401103973, + "learning_rate": 7.146586255006976e-05, + "loss": 0.2556, + "step": 33497 + }, + { + "epoch": 2.713707064160726, + "grad_norm": 0.0920860543847084, + "learning_rate": 7.146136189747513e-05, + "loss": 0.2751, + "step": 33498 + }, + { + "epoch": 2.713788075178224, + "grad_norm": 0.06808226555585861, + "learning_rate": 7.145686124488051e-05, + "loss": 0.2922, + "step": 33499 + }, + { + "epoch": 2.7138690861957224, + "grad_norm": 0.09032059460878372, + "learning_rate": 7.145236059228588e-05, + "loss": 0.262, + "step": 33500 + }, + { + "epoch": 2.713950097213221, + "grad_norm": 0.07511615008115768, + "learning_rate": 7.144785993969127e-05, + "loss": 0.2589, + "step": 33501 + }, + { + "epoch": 2.7140311082307194, + "grad_norm": 0.08429624885320663, + "learning_rate": 7.144335928709663e-05, + "loss": 0.2865, + "step": 33502 + }, + { + "epoch": 2.7141121192482176, + "grad_norm": 0.06352245062589645, + "learning_rate": 7.1438858634502e-05, + "loss": 0.224, + "step": 33503 + }, + { + "epoch": 2.7141931302657163, + "grad_norm": 0.07323718816041946, + "learning_rate": 7.143435798190739e-05, + "loss": 0.2201, + "step": 33504 + }, + { + "epoch": 2.7142741412832145, + "grad_norm": 0.060021862387657166, + "learning_rate": 7.142985732931275e-05, + "loss": 0.2206, + "step": 33505 + }, + { + "epoch": 2.714355152300713, + "grad_norm": 0.07336699217557907, + "learning_rate": 7.142535667671812e-05, + "loss": 0.2374, + "step": 33506 + }, + { + "epoch": 2.7144361633182115, + "grad_norm": 0.05722619220614433, + "learning_rate": 7.142085602412351e-05, + "loss": 0.232, + "step": 33507 + }, + { + "epoch": 2.7145171743357097, + "grad_norm": 0.07713234424591064, + "learning_rate": 7.141635537152887e-05, + "loss": 0.274, + "step": 33508 + }, + { + "epoch": 2.714598185353208, + "grad_norm": 0.07163354009389877, + "learning_rate": 7.141185471893424e-05, + "loss": 0.2428, + "step": 33509 + }, + { + "epoch": 2.7146791963707066, + "grad_norm": 0.057901978492736816, + "learning_rate": 7.140735406633963e-05, + "loss": 0.2215, + "step": 33510 + }, + { + "epoch": 2.714760207388205, + "grad_norm": 0.07321817427873611, + "learning_rate": 7.140285341374499e-05, + "loss": 0.2323, + "step": 33511 + }, + { + "epoch": 2.714841218405703, + "grad_norm": 0.057540737092494965, + "learning_rate": 7.139835276115037e-05, + "loss": 0.2594, + "step": 33512 + }, + { + "epoch": 2.714922229423202, + "grad_norm": 0.07701243460178375, + "learning_rate": 7.139385210855575e-05, + "loss": 0.2269, + "step": 33513 + }, + { + "epoch": 2.7150032404407, + "grad_norm": 0.06628618389368057, + "learning_rate": 7.138935145596111e-05, + "loss": 0.2503, + "step": 33514 + }, + { + "epoch": 2.7150842514581983, + "grad_norm": 0.0763016939163208, + "learning_rate": 7.138485080336649e-05, + "loss": 0.2567, + "step": 33515 + }, + { + "epoch": 2.7151652624756966, + "grad_norm": 0.06274400651454926, + "learning_rate": 7.138035015077187e-05, + "loss": 0.2472, + "step": 33516 + }, + { + "epoch": 2.715246273493195, + "grad_norm": 0.06951197981834412, + 
"learning_rate": 7.137584949817723e-05, + "loss": 0.2483, + "step": 33517 + }, + { + "epoch": 2.7153272845106935, + "grad_norm": 0.07218840718269348, + "learning_rate": 7.137134884558261e-05, + "loss": 0.2551, + "step": 33518 + }, + { + "epoch": 2.7154082955281917, + "grad_norm": 0.06729360669851303, + "learning_rate": 7.1366848192988e-05, + "loss": 0.231, + "step": 33519 + }, + { + "epoch": 2.71548930654569, + "grad_norm": 0.07275759428739548, + "learning_rate": 7.136234754039335e-05, + "loss": 0.2293, + "step": 33520 + }, + { + "epoch": 2.7155703175631887, + "grad_norm": 0.07135862857103348, + "learning_rate": 7.135784688779873e-05, + "loss": 0.229, + "step": 33521 + }, + { + "epoch": 2.715651328580687, + "grad_norm": 0.07239732146263123, + "learning_rate": 7.135334623520411e-05, + "loss": 0.2509, + "step": 33522 + }, + { + "epoch": 2.715732339598185, + "grad_norm": 0.061063461005687714, + "learning_rate": 7.134884558260947e-05, + "loss": 0.1925, + "step": 33523 + }, + { + "epoch": 2.715813350615684, + "grad_norm": 0.07632570713758469, + "learning_rate": 7.134434493001485e-05, + "loss": 0.2524, + "step": 33524 + }, + { + "epoch": 2.715894361633182, + "grad_norm": 0.0663372054696083, + "learning_rate": 7.133984427742024e-05, + "loss": 0.2305, + "step": 33525 + }, + { + "epoch": 2.7159753726506803, + "grad_norm": 0.07694404572248459, + "learning_rate": 7.13353436248256e-05, + "loss": 0.2213, + "step": 33526 + }, + { + "epoch": 2.716056383668179, + "grad_norm": 0.052713215351104736, + "learning_rate": 7.133084297223098e-05, + "loss": 0.2292, + "step": 33527 + }, + { + "epoch": 2.7161373946856773, + "grad_norm": 0.055823810398578644, + "learning_rate": 7.132634231963636e-05, + "loss": 0.2124, + "step": 33528 + }, + { + "epoch": 2.7162184057031755, + "grad_norm": 0.06696312129497528, + "learning_rate": 7.132184166704172e-05, + "loss": 0.2505, + "step": 33529 + }, + { + "epoch": 2.716299416720674, + "grad_norm": 0.08826608210802078, + "learning_rate": 7.13173410144471e-05, + "loss": 0.2581, + "step": 33530 + }, + { + "epoch": 2.7163804277381725, + "grad_norm": 0.06801729649305344, + "learning_rate": 7.131284036185248e-05, + "loss": 0.2392, + "step": 33531 + }, + { + "epoch": 2.7164614387556707, + "grad_norm": 0.06413350254297256, + "learning_rate": 7.130833970925784e-05, + "loss": 0.2219, + "step": 33532 + }, + { + "epoch": 2.7165424497731694, + "grad_norm": 0.0645436942577362, + "learning_rate": 7.130383905666322e-05, + "loss": 0.231, + "step": 33533 + }, + { + "epoch": 2.7166234607906676, + "grad_norm": 0.0745738223195076, + "learning_rate": 7.12993384040686e-05, + "loss": 0.2507, + "step": 33534 + }, + { + "epoch": 2.716704471808166, + "grad_norm": 0.0662076473236084, + "learning_rate": 7.129483775147396e-05, + "loss": 0.2205, + "step": 33535 + }, + { + "epoch": 2.7167854828256646, + "grad_norm": 0.05969638377428055, + "learning_rate": 7.129033709887935e-05, + "loss": 0.2391, + "step": 33536 + }, + { + "epoch": 2.716866493843163, + "grad_norm": 0.0601656436920166, + "learning_rate": 7.128583644628472e-05, + "loss": 0.2518, + "step": 33537 + }, + { + "epoch": 2.716947504860661, + "grad_norm": 0.07162291556596756, + "learning_rate": 7.128133579369008e-05, + "loss": 0.1978, + "step": 33538 + }, + { + "epoch": 2.7170285158781593, + "grad_norm": 0.07743734866380692, + "learning_rate": 7.127683514109547e-05, + "loss": 0.2858, + "step": 33539 + }, + { + "epoch": 2.7171095268956575, + "grad_norm": 0.06878258287906647, + "learning_rate": 7.127233448850084e-05, + "loss": 0.2163, + "step": 33540 + }, 
+ { + "epoch": 2.7171905379131562, + "grad_norm": 0.07549357414245605, + "learning_rate": 7.12678338359062e-05, + "loss": 0.248, + "step": 33541 + }, + { + "epoch": 2.7172715489306545, + "grad_norm": 0.08683260530233383, + "learning_rate": 7.126333318331159e-05, + "loss": 0.273, + "step": 33542 + }, + { + "epoch": 2.7173525599481527, + "grad_norm": 0.07400300353765488, + "learning_rate": 7.125883253071696e-05, + "loss": 0.2123, + "step": 33543 + }, + { + "epoch": 2.7174335709656514, + "grad_norm": 0.07202634960412979, + "learning_rate": 7.125433187812232e-05, + "loss": 0.2748, + "step": 33544 + }, + { + "epoch": 2.7175145819831497, + "grad_norm": 0.09767428040504456, + "learning_rate": 7.124983122552771e-05, + "loss": 0.2557, + "step": 33545 + }, + { + "epoch": 2.717595593000648, + "grad_norm": 0.06339447945356369, + "learning_rate": 7.124533057293308e-05, + "loss": 0.2201, + "step": 33546 + }, + { + "epoch": 2.7176766040181466, + "grad_norm": 0.062197357416152954, + "learning_rate": 7.124082992033844e-05, + "loss": 0.2771, + "step": 33547 + }, + { + "epoch": 2.717757615035645, + "grad_norm": 0.09015563130378723, + "learning_rate": 7.123632926774383e-05, + "loss": 0.2631, + "step": 33548 + }, + { + "epoch": 2.717838626053143, + "grad_norm": 0.07836005091667175, + "learning_rate": 7.12318286151492e-05, + "loss": 0.2618, + "step": 33549 + }, + { + "epoch": 2.7179196370706418, + "grad_norm": 0.07033463567495346, + "learning_rate": 7.122732796255456e-05, + "loss": 0.237, + "step": 33550 + }, + { + "epoch": 2.71800064808814, + "grad_norm": 0.07338385283946991, + "learning_rate": 7.122282730995995e-05, + "loss": 0.2414, + "step": 33551 + }, + { + "epoch": 2.7180816591056383, + "grad_norm": 0.07711820304393768, + "learning_rate": 7.121832665736532e-05, + "loss": 0.2412, + "step": 33552 + }, + { + "epoch": 2.718162670123137, + "grad_norm": 0.07389004528522491, + "learning_rate": 7.12138260047707e-05, + "loss": 0.2343, + "step": 33553 + }, + { + "epoch": 2.718243681140635, + "grad_norm": 0.05881441757082939, + "learning_rate": 7.120932535217607e-05, + "loss": 0.2261, + "step": 33554 + }, + { + "epoch": 2.7183246921581334, + "grad_norm": 0.05964907258749008, + "learning_rate": 7.120482469958144e-05, + "loss": 0.2293, + "step": 33555 + }, + { + "epoch": 2.718405703175632, + "grad_norm": 0.07038917392492294, + "learning_rate": 7.120032404698682e-05, + "loss": 0.2141, + "step": 33556 + }, + { + "epoch": 2.7184867141931304, + "grad_norm": 0.0781693086028099, + "learning_rate": 7.119582339439219e-05, + "loss": 0.2586, + "step": 33557 + }, + { + "epoch": 2.7185677252106286, + "grad_norm": 0.07387826591730118, + "learning_rate": 7.119132274179756e-05, + "loss": 0.2357, + "step": 33558 + }, + { + "epoch": 2.718648736228127, + "grad_norm": 0.06107885017991066, + "learning_rate": 7.118682208920294e-05, + "loss": 0.2209, + "step": 33559 + }, + { + "epoch": 2.7187297472456255, + "grad_norm": 0.06332498043775558, + "learning_rate": 7.118232143660831e-05, + "loss": 0.2584, + "step": 33560 + }, + { + "epoch": 2.718810758263124, + "grad_norm": 0.07451072335243225, + "learning_rate": 7.117782078401369e-05, + "loss": 0.2507, + "step": 33561 + }, + { + "epoch": 2.718891769280622, + "grad_norm": 0.06843792647123337, + "learning_rate": 7.117332013141906e-05, + "loss": 0.2654, + "step": 33562 + }, + { + "epoch": 2.7189727802981203, + "grad_norm": 0.07237779349088669, + "learning_rate": 7.116881947882443e-05, + "loss": 0.2313, + "step": 33563 + }, + { + "epoch": 2.719053791315619, + "grad_norm": 0.1040261909365654, + 
"learning_rate": 7.11643188262298e-05, + "loss": 0.2367, + "step": 33564 + }, + { + "epoch": 2.719134802333117, + "grad_norm": 0.059434372931718826, + "learning_rate": 7.115981817363518e-05, + "loss": 0.2504, + "step": 33565 + }, + { + "epoch": 2.7192158133506155, + "grad_norm": 0.07661649584770203, + "learning_rate": 7.115531752104055e-05, + "loss": 0.2233, + "step": 33566 + }, + { + "epoch": 2.719296824368114, + "grad_norm": 0.08492406457662582, + "learning_rate": 7.115081686844593e-05, + "loss": 0.2525, + "step": 33567 + }, + { + "epoch": 2.7193778353856124, + "grad_norm": 0.07533139735460281, + "learning_rate": 7.11463162158513e-05, + "loss": 0.268, + "step": 33568 + }, + { + "epoch": 2.7194588464031106, + "grad_norm": 0.07281734049320221, + "learning_rate": 7.114181556325667e-05, + "loss": 0.2023, + "step": 33569 + }, + { + "epoch": 2.7195398574206093, + "grad_norm": 0.08975783735513687, + "learning_rate": 7.113731491066205e-05, + "loss": 0.2275, + "step": 33570 + }, + { + "epoch": 2.7196208684381076, + "grad_norm": 0.0756605714559555, + "learning_rate": 7.113281425806742e-05, + "loss": 0.2513, + "step": 33571 + }, + { + "epoch": 2.719701879455606, + "grad_norm": 0.07475683838129044, + "learning_rate": 7.11283136054728e-05, + "loss": 0.2341, + "step": 33572 + }, + { + "epoch": 2.7197828904731045, + "grad_norm": 0.06809760630130768, + "learning_rate": 7.112381295287817e-05, + "loss": 0.2198, + "step": 33573 + }, + { + "epoch": 2.7198639014906028, + "grad_norm": 0.05999773368239403, + "learning_rate": 7.111931230028354e-05, + "loss": 0.2005, + "step": 33574 + }, + { + "epoch": 2.719944912508101, + "grad_norm": 0.06841011345386505, + "learning_rate": 7.111481164768892e-05, + "loss": 0.2354, + "step": 33575 + }, + { + "epoch": 2.7200259235255997, + "grad_norm": 0.07008085399866104, + "learning_rate": 7.111031099509429e-05, + "loss": 0.2323, + "step": 33576 + }, + { + "epoch": 2.720106934543098, + "grad_norm": 0.06371785700321198, + "learning_rate": 7.110581034249966e-05, + "loss": 0.264, + "step": 33577 + }, + { + "epoch": 2.720187945560596, + "grad_norm": 0.06585246324539185, + "learning_rate": 7.110130968990504e-05, + "loss": 0.2564, + "step": 33578 + }, + { + "epoch": 2.720268956578095, + "grad_norm": 0.06621664017438889, + "learning_rate": 7.109680903731041e-05, + "loss": 0.2347, + "step": 33579 + }, + { + "epoch": 2.720349967595593, + "grad_norm": 0.07635606080293655, + "learning_rate": 7.109230838471578e-05, + "loss": 0.2729, + "step": 33580 + }, + { + "epoch": 2.7204309786130914, + "grad_norm": 0.06877021491527557, + "learning_rate": 7.108780773212116e-05, + "loss": 0.2482, + "step": 33581 + }, + { + "epoch": 2.7205119896305896, + "grad_norm": 0.058608278632164, + "learning_rate": 7.108330707952654e-05, + "loss": 0.2102, + "step": 33582 + }, + { + "epoch": 2.7205930006480883, + "grad_norm": 0.05785350129008293, + "learning_rate": 7.10788064269319e-05, + "loss": 0.219, + "step": 33583 + }, + { + "epoch": 2.7206740116655865, + "grad_norm": 0.0791582241654396, + "learning_rate": 7.107430577433728e-05, + "loss": 0.2492, + "step": 33584 + }, + { + "epoch": 2.720755022683085, + "grad_norm": 0.09484634548425674, + "learning_rate": 7.106980512174267e-05, + "loss": 0.2916, + "step": 33585 + }, + { + "epoch": 2.720836033700583, + "grad_norm": 0.0706673339009285, + "learning_rate": 7.106530446914803e-05, + "loss": 0.2352, + "step": 33586 + }, + { + "epoch": 2.7209170447180817, + "grad_norm": 0.07668599486351013, + "learning_rate": 7.10608038165534e-05, + "loss": 0.2717, + "step": 33587 + }, + 
{ + "epoch": 2.72099805573558, + "grad_norm": 0.06315944343805313, + "learning_rate": 7.105630316395879e-05, + "loss": 0.2646, + "step": 33588 + }, + { + "epoch": 2.721079066753078, + "grad_norm": 0.06296543776988983, + "learning_rate": 7.105180251136415e-05, + "loss": 0.248, + "step": 33589 + }, + { + "epoch": 2.721160077770577, + "grad_norm": 0.05871746689081192, + "learning_rate": 7.104730185876952e-05, + "loss": 0.2101, + "step": 33590 + }, + { + "epoch": 2.721241088788075, + "grad_norm": 0.0563269667327404, + "learning_rate": 7.104280120617491e-05, + "loss": 0.2151, + "step": 33591 + }, + { + "epoch": 2.7213220998055734, + "grad_norm": 0.07399358600378036, + "learning_rate": 7.103830055358027e-05, + "loss": 0.2352, + "step": 33592 + }, + { + "epoch": 2.721403110823072, + "grad_norm": 0.06997597217559814, + "learning_rate": 7.103379990098564e-05, + "loss": 0.2531, + "step": 33593 + }, + { + "epoch": 2.7214841218405703, + "grad_norm": 0.06861952692270279, + "learning_rate": 7.102929924839103e-05, + "loss": 0.2447, + "step": 33594 + }, + { + "epoch": 2.7215651328580686, + "grad_norm": 0.07394934445619583, + "learning_rate": 7.102479859579639e-05, + "loss": 0.2736, + "step": 33595 + }, + { + "epoch": 2.7216461438755672, + "grad_norm": 0.05605033412575722, + "learning_rate": 7.102029794320176e-05, + "loss": 0.1993, + "step": 33596 + }, + { + "epoch": 2.7217271548930655, + "grad_norm": 0.05519258230924606, + "learning_rate": 7.101579729060715e-05, + "loss": 0.2333, + "step": 33597 + }, + { + "epoch": 2.7218081659105637, + "grad_norm": 0.06714540719985962, + "learning_rate": 7.101129663801251e-05, + "loss": 0.2494, + "step": 33598 + }, + { + "epoch": 2.7218891769280624, + "grad_norm": 0.07090084999799728, + "learning_rate": 7.100679598541788e-05, + "loss": 0.2563, + "step": 33599 + }, + { + "epoch": 2.7219701879455607, + "grad_norm": 0.07883425056934357, + "learning_rate": 7.100229533282327e-05, + "loss": 0.2712, + "step": 33600 + }, + { + "epoch": 2.722051198963059, + "grad_norm": 0.07029104977846146, + "learning_rate": 7.099779468022863e-05, + "loss": 0.255, + "step": 33601 + }, + { + "epoch": 2.7221322099805576, + "grad_norm": 0.07110211253166199, + "learning_rate": 7.0993294027634e-05, + "loss": 0.2401, + "step": 33602 + }, + { + "epoch": 2.722213220998056, + "grad_norm": 0.1000295951962471, + "learning_rate": 7.098879337503939e-05, + "loss": 0.2613, + "step": 33603 + }, + { + "epoch": 2.722294232015554, + "grad_norm": 0.06811027973890305, + "learning_rate": 7.098429272244475e-05, + "loss": 0.2265, + "step": 33604 + }, + { + "epoch": 2.7223752430330523, + "grad_norm": 0.07708834111690521, + "learning_rate": 7.097979206985012e-05, + "loss": 0.2799, + "step": 33605 + }, + { + "epoch": 2.722456254050551, + "grad_norm": 0.0472310408949852, + "learning_rate": 7.097529141725551e-05, + "loss": 0.2226, + "step": 33606 + }, + { + "epoch": 2.7225372650680493, + "grad_norm": 0.05726686492562294, + "learning_rate": 7.097079076466087e-05, + "loss": 0.2461, + "step": 33607 + }, + { + "epoch": 2.7226182760855475, + "grad_norm": 0.06914225220680237, + "learning_rate": 7.096629011206626e-05, + "loss": 0.245, + "step": 33608 + }, + { + "epoch": 2.7226992871030458, + "grad_norm": 0.09501109272241592, + "learning_rate": 7.096178945947163e-05, + "loss": 0.3241, + "step": 33609 + }, + { + "epoch": 2.7227802981205445, + "grad_norm": 0.07300115376710892, + "learning_rate": 7.095728880687699e-05, + "loss": 0.2492, + "step": 33610 + }, + { + "epoch": 2.7228613091380427, + "grad_norm": 0.0799044743180275, + 
"learning_rate": 7.095278815428238e-05, + "loss": 0.2455, + "step": 33611 + }, + { + "epoch": 2.722942320155541, + "grad_norm": 0.07171258330345154, + "learning_rate": 7.094828750168775e-05, + "loss": 0.2421, + "step": 33612 + }, + { + "epoch": 2.7230233311730396, + "grad_norm": 0.05257529020309448, + "learning_rate": 7.094378684909311e-05, + "loss": 0.2129, + "step": 33613 + }, + { + "epoch": 2.723104342190538, + "grad_norm": 0.07569834589958191, + "learning_rate": 7.09392861964985e-05, + "loss": 0.2429, + "step": 33614 + }, + { + "epoch": 2.723185353208036, + "grad_norm": 0.06856614351272583, + "learning_rate": 7.093478554390387e-05, + "loss": 0.2554, + "step": 33615 + }, + { + "epoch": 2.723266364225535, + "grad_norm": 0.0748237892985344, + "learning_rate": 7.093028489130923e-05, + "loss": 0.2562, + "step": 33616 + }, + { + "epoch": 2.723347375243033, + "grad_norm": 0.06843352317810059, + "learning_rate": 7.092578423871462e-05, + "loss": 0.2739, + "step": 33617 + }, + { + "epoch": 2.7234283862605313, + "grad_norm": 0.07195959240198135, + "learning_rate": 7.092128358612e-05, + "loss": 0.2439, + "step": 33618 + }, + { + "epoch": 2.72350939727803, + "grad_norm": 0.05495288968086243, + "learning_rate": 7.091678293352535e-05, + "loss": 0.211, + "step": 33619 + }, + { + "epoch": 2.7235904082955282, + "grad_norm": 0.06835697591304779, + "learning_rate": 7.091228228093074e-05, + "loss": 0.283, + "step": 33620 + }, + { + "epoch": 2.7236714193130265, + "grad_norm": 0.07124406844377518, + "learning_rate": 7.090778162833612e-05, + "loss": 0.2401, + "step": 33621 + }, + { + "epoch": 2.723752430330525, + "grad_norm": 0.07246687263250351, + "learning_rate": 7.090328097574148e-05, + "loss": 0.2355, + "step": 33622 + }, + { + "epoch": 2.7238334413480234, + "grad_norm": 0.06816147267818451, + "learning_rate": 7.089878032314686e-05, + "loss": 0.2505, + "step": 33623 + }, + { + "epoch": 2.7239144523655217, + "grad_norm": 0.07142950594425201, + "learning_rate": 7.089427967055224e-05, + "loss": 0.2656, + "step": 33624 + }, + { + "epoch": 2.7239954633830203, + "grad_norm": 0.06787704676389694, + "learning_rate": 7.088977901795761e-05, + "loss": 0.2318, + "step": 33625 + }, + { + "epoch": 2.7240764744005186, + "grad_norm": 0.05560300126671791, + "learning_rate": 7.088527836536298e-05, + "loss": 0.2253, + "step": 33626 + }, + { + "epoch": 2.724157485418017, + "grad_norm": 0.054587751626968384, + "learning_rate": 7.088077771276836e-05, + "loss": 0.2343, + "step": 33627 + }, + { + "epoch": 2.724238496435515, + "grad_norm": 0.06579109281301498, + "learning_rate": 7.087627706017373e-05, + "loss": 0.2483, + "step": 33628 + }, + { + "epoch": 2.7243195074530138, + "grad_norm": 0.06791524589061737, + "learning_rate": 7.08717764075791e-05, + "loss": 0.2794, + "step": 33629 + }, + { + "epoch": 2.724400518470512, + "grad_norm": 0.07304447144269943, + "learning_rate": 7.086727575498448e-05, + "loss": 0.2804, + "step": 33630 + }, + { + "epoch": 2.7244815294880103, + "grad_norm": 0.06753737479448318, + "learning_rate": 7.086277510238985e-05, + "loss": 0.2541, + "step": 33631 + }, + { + "epoch": 2.7245625405055085, + "grad_norm": 0.07851094752550125, + "learning_rate": 7.085827444979522e-05, + "loss": 0.2719, + "step": 33632 + }, + { + "epoch": 2.724643551523007, + "grad_norm": 0.07380569726228714, + "learning_rate": 7.08537737972006e-05, + "loss": 0.2608, + "step": 33633 + }, + { + "epoch": 2.7247245625405054, + "grad_norm": 0.05877283215522766, + "learning_rate": 7.084927314460597e-05, + "loss": 0.2309, + "step": 33634 + }, 
+ { + "epoch": 2.7248055735580037, + "grad_norm": 0.07889322936534882, + "learning_rate": 7.084477249201135e-05, + "loss": 0.2519, + "step": 33635 + }, + { + "epoch": 2.7248865845755024, + "grad_norm": 0.08005855977535248, + "learning_rate": 7.084027183941672e-05, + "loss": 0.258, + "step": 33636 + }, + { + "epoch": 2.7249675955930006, + "grad_norm": 0.07733011245727539, + "learning_rate": 7.083577118682209e-05, + "loss": 0.2289, + "step": 33637 + }, + { + "epoch": 2.725048606610499, + "grad_norm": 0.06844766438007355, + "learning_rate": 7.083127053422747e-05, + "loss": 0.2214, + "step": 33638 + }, + { + "epoch": 2.7251296176279975, + "grad_norm": 0.07209837436676025, + "learning_rate": 7.082676988163284e-05, + "loss": 0.2471, + "step": 33639 + }, + { + "epoch": 2.725210628645496, + "grad_norm": 0.06612085551023483, + "learning_rate": 7.082226922903821e-05, + "loss": 0.258, + "step": 33640 + }, + { + "epoch": 2.725291639662994, + "grad_norm": 0.06333424896001816, + "learning_rate": 7.081776857644359e-05, + "loss": 0.2165, + "step": 33641 + }, + { + "epoch": 2.7253726506804927, + "grad_norm": 0.07169868797063828, + "learning_rate": 7.081326792384896e-05, + "loss": 0.2308, + "step": 33642 + }, + { + "epoch": 2.725453661697991, + "grad_norm": 0.07447036355733871, + "learning_rate": 7.080876727125433e-05, + "loss": 0.2666, + "step": 33643 + }, + { + "epoch": 2.725534672715489, + "grad_norm": 0.076211117208004, + "learning_rate": 7.080426661865971e-05, + "loss": 0.2153, + "step": 33644 + }, + { + "epoch": 2.725615683732988, + "grad_norm": 0.06768426299095154, + "learning_rate": 7.079976596606508e-05, + "loss": 0.2191, + "step": 33645 + }, + { + "epoch": 2.725696694750486, + "grad_norm": 0.0650651678442955, + "learning_rate": 7.079526531347046e-05, + "loss": 0.2467, + "step": 33646 + }, + { + "epoch": 2.7257777057679844, + "grad_norm": 0.07114232331514359, + "learning_rate": 7.079076466087583e-05, + "loss": 0.2419, + "step": 33647 + }, + { + "epoch": 2.725858716785483, + "grad_norm": 0.06247364357113838, + "learning_rate": 7.07862640082812e-05, + "loss": 0.2613, + "step": 33648 + }, + { + "epoch": 2.7259397278029813, + "grad_norm": 0.0777125209569931, + "learning_rate": 7.078176335568658e-05, + "loss": 0.2511, + "step": 33649 + }, + { + "epoch": 2.7260207388204796, + "grad_norm": 0.07471860200166702, + "learning_rate": 7.077726270309195e-05, + "loss": 0.2519, + "step": 33650 + }, + { + "epoch": 2.726101749837978, + "grad_norm": 0.07368387281894684, + "learning_rate": 7.077276205049732e-05, + "loss": 0.2666, + "step": 33651 + }, + { + "epoch": 2.7261827608554765, + "grad_norm": 0.08657611906528473, + "learning_rate": 7.07682613979027e-05, + "loss": 0.2486, + "step": 33652 + }, + { + "epoch": 2.7262637718729748, + "grad_norm": 0.06747881323099136, + "learning_rate": 7.076376074530807e-05, + "loss": 0.2253, + "step": 33653 + }, + { + "epoch": 2.726344782890473, + "grad_norm": 0.06942425668239594, + "learning_rate": 7.075926009271344e-05, + "loss": 0.2256, + "step": 33654 + }, + { + "epoch": 2.7264257939079712, + "grad_norm": 0.08220626413822174, + "learning_rate": 7.075475944011882e-05, + "loss": 0.2429, + "step": 33655 + }, + { + "epoch": 2.72650680492547, + "grad_norm": 0.06210099533200264, + "learning_rate": 7.075025878752419e-05, + "loss": 0.2634, + "step": 33656 + }, + { + "epoch": 2.726587815942968, + "grad_norm": 0.06558974087238312, + "learning_rate": 7.074575813492956e-05, + "loss": 0.2416, + "step": 33657 + }, + { + "epoch": 2.7266688269604664, + "grad_norm": 0.06641119718551636, + 
"learning_rate": 7.074125748233494e-05, + "loss": 0.2247, + "step": 33658 + }, + { + "epoch": 2.726749837977965, + "grad_norm": 0.06373778730630875, + "learning_rate": 7.073675682974031e-05, + "loss": 0.2278, + "step": 33659 + }, + { + "epoch": 2.7268308489954634, + "grad_norm": 0.06465722620487213, + "learning_rate": 7.07322561771457e-05, + "loss": 0.237, + "step": 33660 + }, + { + "epoch": 2.7269118600129616, + "grad_norm": 0.05753665044903755, + "learning_rate": 7.072775552455106e-05, + "loss": 0.2318, + "step": 33661 + }, + { + "epoch": 2.7269928710304603, + "grad_norm": 0.07702966779470444, + "learning_rate": 7.072325487195643e-05, + "loss": 0.2348, + "step": 33662 + }, + { + "epoch": 2.7270738820479585, + "grad_norm": 0.06840366870164871, + "learning_rate": 7.071875421936182e-05, + "loss": 0.2309, + "step": 33663 + }, + { + "epoch": 2.7271548930654568, + "grad_norm": 0.058910470455884933, + "learning_rate": 7.071425356676718e-05, + "loss": 0.2279, + "step": 33664 + }, + { + "epoch": 2.7272359040829555, + "grad_norm": 0.07476451992988586, + "learning_rate": 7.070975291417255e-05, + "loss": 0.2169, + "step": 33665 + }, + { + "epoch": 2.7273169151004537, + "grad_norm": 0.0877615436911583, + "learning_rate": 7.070525226157794e-05, + "loss": 0.322, + "step": 33666 + }, + { + "epoch": 2.727397926117952, + "grad_norm": 0.06256525218486786, + "learning_rate": 7.07007516089833e-05, + "loss": 0.1851, + "step": 33667 + }, + { + "epoch": 2.7274789371354506, + "grad_norm": 0.057740043848752975, + "learning_rate": 7.069625095638867e-05, + "loss": 0.249, + "step": 33668 + }, + { + "epoch": 2.727559948152949, + "grad_norm": 0.062011539936065674, + "learning_rate": 7.069175030379406e-05, + "loss": 0.2031, + "step": 33669 + }, + { + "epoch": 2.727640959170447, + "grad_norm": 0.07256323844194412, + "learning_rate": 7.068724965119942e-05, + "loss": 0.2277, + "step": 33670 + }, + { + "epoch": 2.727721970187946, + "grad_norm": 0.07076326757669449, + "learning_rate": 7.06827489986048e-05, + "loss": 0.2631, + "step": 33671 + }, + { + "epoch": 2.727802981205444, + "grad_norm": 0.058779485523700714, + "learning_rate": 7.067824834601018e-05, + "loss": 0.2013, + "step": 33672 + }, + { + "epoch": 2.7278839922229423, + "grad_norm": 0.0768062174320221, + "learning_rate": 7.067374769341554e-05, + "loss": 0.2639, + "step": 33673 + }, + { + "epoch": 2.7279650032404406, + "grad_norm": 0.07481861114501953, + "learning_rate": 7.066924704082092e-05, + "loss": 0.2232, + "step": 33674 + }, + { + "epoch": 2.7280460142579392, + "grad_norm": 0.07629449665546417, + "learning_rate": 7.06647463882263e-05, + "loss": 0.2398, + "step": 33675 + }, + { + "epoch": 2.7281270252754375, + "grad_norm": 0.07321591675281525, + "learning_rate": 7.066024573563166e-05, + "loss": 0.2728, + "step": 33676 + }, + { + "epoch": 2.7282080362929357, + "grad_norm": 0.06354720145463943, + "learning_rate": 7.065574508303704e-05, + "loss": 0.1912, + "step": 33677 + }, + { + "epoch": 2.728289047310434, + "grad_norm": 0.08110125362873077, + "learning_rate": 7.065124443044242e-05, + "loss": 0.2261, + "step": 33678 + }, + { + "epoch": 2.7283700583279327, + "grad_norm": 0.08282686024904251, + "learning_rate": 7.064674377784778e-05, + "loss": 0.256, + "step": 33679 + }, + { + "epoch": 2.728451069345431, + "grad_norm": 0.07880692183971405, + "learning_rate": 7.064224312525316e-05, + "loss": 0.2178, + "step": 33680 + }, + { + "epoch": 2.728532080362929, + "grad_norm": 0.08492696285247803, + "learning_rate": 7.063774247265854e-05, + "loss": 0.226, + "step": 33681 
+ }, + { + "epoch": 2.728613091380428, + "grad_norm": 0.06499497592449188, + "learning_rate": 7.06332418200639e-05, + "loss": 0.2546, + "step": 33682 + }, + { + "epoch": 2.728694102397926, + "grad_norm": 0.07533867657184601, + "learning_rate": 7.062874116746928e-05, + "loss": 0.28, + "step": 33683 + }, + { + "epoch": 2.7287751134154243, + "grad_norm": 0.07215952128171921, + "learning_rate": 7.062424051487467e-05, + "loss": 0.2216, + "step": 33684 + }, + { + "epoch": 2.728856124432923, + "grad_norm": 0.05357545614242554, + "learning_rate": 7.061973986228003e-05, + "loss": 0.2004, + "step": 33685 + }, + { + "epoch": 2.7289371354504213, + "grad_norm": 0.08010009676218033, + "learning_rate": 7.061523920968541e-05, + "loss": 0.2291, + "step": 33686 + }, + { + "epoch": 2.7290181464679195, + "grad_norm": 0.07454568147659302, + "learning_rate": 7.061073855709079e-05, + "loss": 0.286, + "step": 33687 + }, + { + "epoch": 2.729099157485418, + "grad_norm": 0.08589702099561691, + "learning_rate": 7.060623790449615e-05, + "loss": 0.266, + "step": 33688 + }, + { + "epoch": 2.7291801685029164, + "grad_norm": 0.0718001127243042, + "learning_rate": 7.060173725190153e-05, + "loss": 0.2354, + "step": 33689 + }, + { + "epoch": 2.7292611795204147, + "grad_norm": 0.05786353349685669, + "learning_rate": 7.059723659930691e-05, + "loss": 0.2384, + "step": 33690 + }, + { + "epoch": 2.7293421905379134, + "grad_norm": 0.06753803044557571, + "learning_rate": 7.059273594671227e-05, + "loss": 0.2229, + "step": 33691 + }, + { + "epoch": 2.7294232015554116, + "grad_norm": 0.05679044499993324, + "learning_rate": 7.058823529411765e-05, + "loss": 0.2221, + "step": 33692 + }, + { + "epoch": 2.72950421257291, + "grad_norm": 0.058725755661726, + "learning_rate": 7.058373464152303e-05, + "loss": 0.2534, + "step": 33693 + }, + { + "epoch": 2.7295852235904086, + "grad_norm": 0.06571077555418015, + "learning_rate": 7.05792339889284e-05, + "loss": 0.2338, + "step": 33694 + }, + { + "epoch": 2.729666234607907, + "grad_norm": 0.07829579710960388, + "learning_rate": 7.057473333633378e-05, + "loss": 0.2649, + "step": 33695 + }, + { + "epoch": 2.729747245625405, + "grad_norm": 0.05849388986825943, + "learning_rate": 7.057023268373915e-05, + "loss": 0.2353, + "step": 33696 + }, + { + "epoch": 2.7298282566429033, + "grad_norm": 0.053417835384607315, + "learning_rate": 7.056573203114452e-05, + "loss": 0.23, + "step": 33697 + }, + { + "epoch": 2.7299092676604015, + "grad_norm": 0.06413785368204117, + "learning_rate": 7.05612313785499e-05, + "loss": 0.2688, + "step": 33698 + }, + { + "epoch": 2.7299902786779002, + "grad_norm": 0.06464651226997375, + "learning_rate": 7.055673072595527e-05, + "loss": 0.2158, + "step": 33699 + }, + { + "epoch": 2.7300712896953985, + "grad_norm": 0.058058079332113266, + "learning_rate": 7.055223007336064e-05, + "loss": 0.23, + "step": 33700 + }, + { + "epoch": 2.7301523007128967, + "grad_norm": 0.0811547338962555, + "learning_rate": 7.054772942076602e-05, + "loss": 0.2421, + "step": 33701 + }, + { + "epoch": 2.7302333117303954, + "grad_norm": 0.06070886552333832, + "learning_rate": 7.054322876817139e-05, + "loss": 0.2083, + "step": 33702 + }, + { + "epoch": 2.7303143227478937, + "grad_norm": 0.05657278373837471, + "learning_rate": 7.053872811557676e-05, + "loss": 0.1969, + "step": 33703 + }, + { + "epoch": 2.730395333765392, + "grad_norm": 0.06588597595691681, + "learning_rate": 7.053422746298214e-05, + "loss": 0.2475, + "step": 33704 + }, + { + "epoch": 2.7304763447828906, + "grad_norm": 0.07281491905450821, + 
"learning_rate": 7.052972681038751e-05, + "loss": 0.2373, + "step": 33705 + }, + { + "epoch": 2.730557355800389, + "grad_norm": 0.0641547366976738, + "learning_rate": 7.052522615779288e-05, + "loss": 0.2116, + "step": 33706 + }, + { + "epoch": 2.730638366817887, + "grad_norm": 0.0717482790350914, + "learning_rate": 7.052072550519826e-05, + "loss": 0.2386, + "step": 33707 + }, + { + "epoch": 2.7307193778353858, + "grad_norm": 0.07203912734985352, + "learning_rate": 7.051622485260363e-05, + "loss": 0.2436, + "step": 33708 + }, + { + "epoch": 2.730800388852884, + "grad_norm": 0.06969515234231949, + "learning_rate": 7.0511724200009e-05, + "loss": 0.2409, + "step": 33709 + }, + { + "epoch": 2.7308813998703823, + "grad_norm": 0.08552543818950653, + "learning_rate": 7.050722354741438e-05, + "loss": 0.2173, + "step": 33710 + }, + { + "epoch": 2.730962410887881, + "grad_norm": 0.0826902985572815, + "learning_rate": 7.050272289481975e-05, + "loss": 0.237, + "step": 33711 + }, + { + "epoch": 2.731043421905379, + "grad_norm": 0.05936390534043312, + "learning_rate": 7.049822224222513e-05, + "loss": 0.2083, + "step": 33712 + }, + { + "epoch": 2.7311244329228774, + "grad_norm": 0.08406198024749756, + "learning_rate": 7.04937215896305e-05, + "loss": 0.2386, + "step": 33713 + }, + { + "epoch": 2.731205443940376, + "grad_norm": 0.07294872403144836, + "learning_rate": 7.048922093703587e-05, + "loss": 0.2189, + "step": 33714 + }, + { + "epoch": 2.7312864549578744, + "grad_norm": 0.0857522040605545, + "learning_rate": 7.048472028444125e-05, + "loss": 0.2611, + "step": 33715 + }, + { + "epoch": 2.7313674659753726, + "grad_norm": 0.07238192856311798, + "learning_rate": 7.048021963184662e-05, + "loss": 0.2455, + "step": 33716 + }, + { + "epoch": 2.7314484769928713, + "grad_norm": 0.08115571737289429, + "learning_rate": 7.0475718979252e-05, + "loss": 0.2292, + "step": 33717 + }, + { + "epoch": 2.7315294880103695, + "grad_norm": 0.06046930328011513, + "learning_rate": 7.047121832665737e-05, + "loss": 0.2508, + "step": 33718 + }, + { + "epoch": 2.731610499027868, + "grad_norm": 0.0676305890083313, + "learning_rate": 7.046671767406274e-05, + "loss": 0.2265, + "step": 33719 + }, + { + "epoch": 2.731691510045366, + "grad_norm": 0.07789310067892075, + "learning_rate": 7.046221702146812e-05, + "loss": 0.2027, + "step": 33720 + }, + { + "epoch": 2.7317725210628643, + "grad_norm": 0.057190366089344025, + "learning_rate": 7.045771636887349e-05, + "loss": 0.2165, + "step": 33721 + }, + { + "epoch": 2.731853532080363, + "grad_norm": 0.07512050122022629, + "learning_rate": 7.045321571627886e-05, + "loss": 0.2153, + "step": 33722 + }, + { + "epoch": 2.731934543097861, + "grad_norm": 0.05146675184369087, + "learning_rate": 7.044871506368424e-05, + "loss": 0.2318, + "step": 33723 + }, + { + "epoch": 2.7320155541153595, + "grad_norm": 0.07653915882110596, + "learning_rate": 7.044421441108961e-05, + "loss": 0.2963, + "step": 33724 + }, + { + "epoch": 2.732096565132858, + "grad_norm": 0.06947991251945496, + "learning_rate": 7.043971375849498e-05, + "loss": 0.2098, + "step": 33725 + }, + { + "epoch": 2.7321775761503564, + "grad_norm": 0.05789974704384804, + "learning_rate": 7.043521310590036e-05, + "loss": 0.2251, + "step": 33726 + }, + { + "epoch": 2.7322585871678546, + "grad_norm": 0.06780644506216049, + "learning_rate": 7.043071245330573e-05, + "loss": 0.2158, + "step": 33727 + }, + { + "epoch": 2.7323395981853533, + "grad_norm": 0.06673760712146759, + "learning_rate": 7.04262118007111e-05, + "loss": 0.2658, + "step": 33728 + }, + 
{ + "epoch": 2.7324206092028516, + "grad_norm": 0.06278131902217865, + "learning_rate": 7.042171114811648e-05, + "loss": 0.2576, + "step": 33729 + }, + { + "epoch": 2.73250162022035, + "grad_norm": 0.07872526347637177, + "learning_rate": 7.041721049552185e-05, + "loss": 0.2636, + "step": 33730 + }, + { + "epoch": 2.7325826312378485, + "grad_norm": 0.05997196584939957, + "learning_rate": 7.041270984292722e-05, + "loss": 0.2184, + "step": 33731 + }, + { + "epoch": 2.7326636422553467, + "grad_norm": 0.07817976921796799, + "learning_rate": 7.04082091903326e-05, + "loss": 0.2661, + "step": 33732 + }, + { + "epoch": 2.732744653272845, + "grad_norm": 0.060990262776613235, + "learning_rate": 7.040370853773797e-05, + "loss": 0.229, + "step": 33733 + }, + { + "epoch": 2.7328256642903437, + "grad_norm": 0.07189729809761047, + "learning_rate": 7.039920788514335e-05, + "loss": 0.2156, + "step": 33734 + }, + { + "epoch": 2.732906675307842, + "grad_norm": 0.06340024620294571, + "learning_rate": 7.039470723254872e-05, + "loss": 0.2244, + "step": 33735 + }, + { + "epoch": 2.73298768632534, + "grad_norm": 0.059773221611976624, + "learning_rate": 7.039020657995409e-05, + "loss": 0.2135, + "step": 33736 + }, + { + "epoch": 2.733068697342839, + "grad_norm": 0.05692936107516289, + "learning_rate": 7.038570592735947e-05, + "loss": 0.2086, + "step": 33737 + }, + { + "epoch": 2.733149708360337, + "grad_norm": 0.05831446498632431, + "learning_rate": 7.038120527476485e-05, + "loss": 0.1777, + "step": 33738 + }, + { + "epoch": 2.7332307193778353, + "grad_norm": 0.0769772082567215, + "learning_rate": 7.037670462217021e-05, + "loss": 0.2574, + "step": 33739 + }, + { + "epoch": 2.733311730395334, + "grad_norm": 0.0605337955057621, + "learning_rate": 7.037220396957559e-05, + "loss": 0.2145, + "step": 33740 + }, + { + "epoch": 2.7333927414128323, + "grad_norm": 0.08001101762056351, + "learning_rate": 7.036770331698097e-05, + "loss": 0.2414, + "step": 33741 + }, + { + "epoch": 2.7334737524303305, + "grad_norm": 0.05913480371236801, + "learning_rate": 7.036320266438633e-05, + "loss": 0.2118, + "step": 33742 + }, + { + "epoch": 2.7335547634478288, + "grad_norm": 0.0753609761595726, + "learning_rate": 7.035870201179171e-05, + "loss": 0.2286, + "step": 33743 + }, + { + "epoch": 2.733635774465327, + "grad_norm": 0.07040075212717056, + "learning_rate": 7.03542013591971e-05, + "loss": 0.2193, + "step": 33744 + }, + { + "epoch": 2.7337167854828257, + "grad_norm": 0.06425496935844421, + "learning_rate": 7.034970070660246e-05, + "loss": 0.2583, + "step": 33745 + }, + { + "epoch": 2.733797796500324, + "grad_norm": 0.06807851046323776, + "learning_rate": 7.034520005400783e-05, + "loss": 0.2412, + "step": 33746 + }, + { + "epoch": 2.733878807517822, + "grad_norm": 0.06525375694036484, + "learning_rate": 7.034069940141322e-05, + "loss": 0.2068, + "step": 33747 + }, + { + "epoch": 2.733959818535321, + "grad_norm": 0.06132105737924576, + "learning_rate": 7.033619874881858e-05, + "loss": 0.1967, + "step": 33748 + }, + { + "epoch": 2.734040829552819, + "grad_norm": 0.07580532133579254, + "learning_rate": 7.033169809622395e-05, + "loss": 0.2364, + "step": 33749 + }, + { + "epoch": 2.7341218405703174, + "grad_norm": 0.0784335732460022, + "learning_rate": 7.032719744362934e-05, + "loss": 0.2257, + "step": 33750 + }, + { + "epoch": 2.734202851587816, + "grad_norm": 0.07119733840227127, + "learning_rate": 7.03226967910347e-05, + "loss": 0.2395, + "step": 33751 + }, + { + "epoch": 2.7342838626053143, + "grad_norm": 0.07127971947193146, + 
"learning_rate": 7.031819613844007e-05, + "loss": 0.2173, + "step": 33752 + }, + { + "epoch": 2.7343648736228126, + "grad_norm": 0.07059836387634277, + "learning_rate": 7.031369548584546e-05, + "loss": 0.2458, + "step": 33753 + }, + { + "epoch": 2.7344458846403112, + "grad_norm": 0.07279672473669052, + "learning_rate": 7.030919483325082e-05, + "loss": 0.247, + "step": 33754 + }, + { + "epoch": 2.7345268956578095, + "grad_norm": 0.0836254432797432, + "learning_rate": 7.030469418065619e-05, + "loss": 0.2716, + "step": 33755 + }, + { + "epoch": 2.7346079066753077, + "grad_norm": 0.06887470930814743, + "learning_rate": 7.030019352806158e-05, + "loss": 0.2572, + "step": 33756 + }, + { + "epoch": 2.7346889176928064, + "grad_norm": 0.06887579709291458, + "learning_rate": 7.029569287546694e-05, + "loss": 0.2227, + "step": 33757 + }, + { + "epoch": 2.7347699287103047, + "grad_norm": 0.0751393735408783, + "learning_rate": 7.029119222287231e-05, + "loss": 0.2437, + "step": 33758 + }, + { + "epoch": 2.734850939727803, + "grad_norm": 0.062123481184244156, + "learning_rate": 7.02866915702777e-05, + "loss": 0.2359, + "step": 33759 + }, + { + "epoch": 2.7349319507453016, + "grad_norm": 0.09555557370185852, + "learning_rate": 7.028219091768306e-05, + "loss": 0.2334, + "step": 33760 + }, + { + "epoch": 2.7350129617628, + "grad_norm": 0.0862041488289833, + "learning_rate": 7.027769026508843e-05, + "loss": 0.2499, + "step": 33761 + }, + { + "epoch": 2.735093972780298, + "grad_norm": 0.05869421362876892, + "learning_rate": 7.027318961249382e-05, + "loss": 0.1972, + "step": 33762 + }, + { + "epoch": 2.7351749837977968, + "grad_norm": 0.071591317653656, + "learning_rate": 7.02686889598992e-05, + "loss": 0.2497, + "step": 33763 + }, + { + "epoch": 2.735255994815295, + "grad_norm": 0.07253791391849518, + "learning_rate": 7.026418830730455e-05, + "loss": 0.2708, + "step": 33764 + }, + { + "epoch": 2.7353370058327933, + "grad_norm": 0.07822144776582718, + "learning_rate": 7.025968765470994e-05, + "loss": 0.2319, + "step": 33765 + }, + { + "epoch": 2.7354180168502915, + "grad_norm": 0.08056861162185669, + "learning_rate": 7.025518700211531e-05, + "loss": 0.2354, + "step": 33766 + }, + { + "epoch": 2.7354990278677898, + "grad_norm": 0.06811316311359406, + "learning_rate": 7.025068634952069e-05, + "loss": 0.2526, + "step": 33767 + }, + { + "epoch": 2.7355800388852884, + "grad_norm": 0.07110168039798737, + "learning_rate": 7.024618569692606e-05, + "loss": 0.2254, + "step": 33768 + }, + { + "epoch": 2.7356610499027867, + "grad_norm": 0.06414638459682465, + "learning_rate": 7.024168504433144e-05, + "loss": 0.2435, + "step": 33769 + }, + { + "epoch": 2.735742060920285, + "grad_norm": 0.06620276719331741, + "learning_rate": 7.023718439173681e-05, + "loss": 0.2487, + "step": 33770 + }, + { + "epoch": 2.7358230719377836, + "grad_norm": 0.07465215027332306, + "learning_rate": 7.023268373914218e-05, + "loss": 0.262, + "step": 33771 + }, + { + "epoch": 2.735904082955282, + "grad_norm": 0.07484614104032516, + "learning_rate": 7.022818308654756e-05, + "loss": 0.2374, + "step": 33772 + }, + { + "epoch": 2.73598509397278, + "grad_norm": 0.0882822796702385, + "learning_rate": 7.022368243395293e-05, + "loss": 0.2527, + "step": 33773 + }, + { + "epoch": 2.736066104990279, + "grad_norm": 0.06477689743041992, + "learning_rate": 7.02191817813583e-05, + "loss": 0.2397, + "step": 33774 + }, + { + "epoch": 2.736147116007777, + "grad_norm": 0.055372174829244614, + "learning_rate": 7.021468112876368e-05, + "loss": 0.2228, + "step": 33775 + }, 
+ { + "epoch": 2.7362281270252753, + "grad_norm": 0.07421818375587463, + "learning_rate": 7.021018047616905e-05, + "loss": 0.2492, + "step": 33776 + }, + { + "epoch": 2.736309138042774, + "grad_norm": 0.06517310440540314, + "learning_rate": 7.020567982357442e-05, + "loss": 0.2673, + "step": 33777 + }, + { + "epoch": 2.7363901490602722, + "grad_norm": 0.08472959697246552, + "learning_rate": 7.02011791709798e-05, + "loss": 0.2383, + "step": 33778 + }, + { + "epoch": 2.7364711600777705, + "grad_norm": 0.08939102292060852, + "learning_rate": 7.019667851838517e-05, + "loss": 0.2173, + "step": 33779 + }, + { + "epoch": 2.736552171095269, + "grad_norm": 0.06531999260187149, + "learning_rate": 7.019217786579055e-05, + "loss": 0.2868, + "step": 33780 + }, + { + "epoch": 2.7366331821127674, + "grad_norm": 0.07156679034233093, + "learning_rate": 7.018767721319592e-05, + "loss": 0.2444, + "step": 33781 + }, + { + "epoch": 2.7367141931302656, + "grad_norm": 0.08077112585306168, + "learning_rate": 7.018317656060129e-05, + "loss": 0.2384, + "step": 33782 + }, + { + "epoch": 2.7367952041477643, + "grad_norm": 0.06216087564826012, + "learning_rate": 7.017867590800667e-05, + "loss": 0.2296, + "step": 33783 + }, + { + "epoch": 2.7368762151652626, + "grad_norm": 0.08354579657316208, + "learning_rate": 7.017417525541204e-05, + "loss": 0.2386, + "step": 33784 + }, + { + "epoch": 2.736957226182761, + "grad_norm": 0.07849965244531631, + "learning_rate": 7.016967460281741e-05, + "loss": 0.2343, + "step": 33785 + }, + { + "epoch": 2.737038237200259, + "grad_norm": 0.06662687659263611, + "learning_rate": 7.016517395022279e-05, + "loss": 0.2158, + "step": 33786 + }, + { + "epoch": 2.7371192482177578, + "grad_norm": 0.08657371252775192, + "learning_rate": 7.016067329762816e-05, + "loss": 0.2542, + "step": 33787 + }, + { + "epoch": 2.737200259235256, + "grad_norm": 0.06502904742956161, + "learning_rate": 7.015617264503353e-05, + "loss": 0.2463, + "step": 33788 + }, + { + "epoch": 2.7372812702527543, + "grad_norm": 0.06432264298200607, + "learning_rate": 7.015167199243891e-05, + "loss": 0.2188, + "step": 33789 + }, + { + "epoch": 2.7373622812702525, + "grad_norm": 0.06669308245182037, + "learning_rate": 7.014717133984428e-05, + "loss": 0.2682, + "step": 33790 + }, + { + "epoch": 2.737443292287751, + "grad_norm": 0.07942566275596619, + "learning_rate": 7.014267068724965e-05, + "loss": 0.2607, + "step": 33791 + }, + { + "epoch": 2.7375243033052494, + "grad_norm": 0.07019062340259552, + "learning_rate": 7.013817003465503e-05, + "loss": 0.2515, + "step": 33792 + }, + { + "epoch": 2.7376053143227477, + "grad_norm": 0.0668862909078598, + "learning_rate": 7.01336693820604e-05, + "loss": 0.2336, + "step": 33793 + }, + { + "epoch": 2.7376863253402464, + "grad_norm": 0.06477285176515579, + "learning_rate": 7.012916872946578e-05, + "loss": 0.21, + "step": 33794 + }, + { + "epoch": 2.7377673363577446, + "grad_norm": 0.08168063312768936, + "learning_rate": 7.012466807687115e-05, + "loss": 0.2426, + "step": 33795 + }, + { + "epoch": 2.737848347375243, + "grad_norm": 0.07425316423177719, + "learning_rate": 7.012016742427652e-05, + "loss": 0.2753, + "step": 33796 + }, + { + "epoch": 2.7379293583927415, + "grad_norm": 0.0631299614906311, + "learning_rate": 7.01156667716819e-05, + "loss": 0.249, + "step": 33797 + }, + { + "epoch": 2.73801036941024, + "grad_norm": 0.05468069016933441, + "learning_rate": 7.011116611908727e-05, + "loss": 0.2033, + "step": 33798 + }, + { + "epoch": 2.738091380427738, + "grad_norm": 0.07220423221588135, + 
"learning_rate": 7.010666546649264e-05, + "loss": 0.2031, + "step": 33799 + }, + { + "epoch": 2.7381723914452367, + "grad_norm": 0.06589049100875854, + "learning_rate": 7.010216481389802e-05, + "loss": 0.2005, + "step": 33800 + }, + { + "epoch": 2.738253402462735, + "grad_norm": 0.07369597256183624, + "learning_rate": 7.009766416130339e-05, + "loss": 0.2631, + "step": 33801 + }, + { + "epoch": 2.738334413480233, + "grad_norm": 0.060120873153209686, + "learning_rate": 7.009316350870876e-05, + "loss": 0.2399, + "step": 33802 + }, + { + "epoch": 2.738415424497732, + "grad_norm": 0.06925645470619202, + "learning_rate": 7.008866285611414e-05, + "loss": 0.2172, + "step": 33803 + }, + { + "epoch": 2.73849643551523, + "grad_norm": 0.08340088278055191, + "learning_rate": 7.008416220351951e-05, + "loss": 0.2284, + "step": 33804 + }, + { + "epoch": 2.7385774465327284, + "grad_norm": 0.06543581932783127, + "learning_rate": 7.007966155092489e-05, + "loss": 0.2927, + "step": 33805 + }, + { + "epoch": 2.738658457550227, + "grad_norm": 0.07321444153785706, + "learning_rate": 7.007516089833026e-05, + "loss": 0.2035, + "step": 33806 + }, + { + "epoch": 2.7387394685677253, + "grad_norm": 0.08377903699874878, + "learning_rate": 7.007066024573563e-05, + "loss": 0.2437, + "step": 33807 + }, + { + "epoch": 2.7388204795852236, + "grad_norm": 0.07676710933446884, + "learning_rate": 7.0066159593141e-05, + "loss": 0.2379, + "step": 33808 + }, + { + "epoch": 2.738901490602722, + "grad_norm": 0.07532139122486115, + "learning_rate": 7.006165894054638e-05, + "loss": 0.2104, + "step": 33809 + }, + { + "epoch": 2.7389825016202205, + "grad_norm": 0.07883398979902267, + "learning_rate": 7.005715828795175e-05, + "loss": 0.2516, + "step": 33810 + }, + { + "epoch": 2.7390635126377187, + "grad_norm": 0.08295116573572159, + "learning_rate": 7.005265763535713e-05, + "loss": 0.2142, + "step": 33811 + }, + { + "epoch": 2.739144523655217, + "grad_norm": 0.07011543959379196, + "learning_rate": 7.00481569827625e-05, + "loss": 0.2167, + "step": 33812 + }, + { + "epoch": 2.7392255346727152, + "grad_norm": 0.07436306029558182, + "learning_rate": 7.004365633016787e-05, + "loss": 0.258, + "step": 33813 + }, + { + "epoch": 2.739306545690214, + "grad_norm": 0.07716170698404312, + "learning_rate": 7.003915567757325e-05, + "loss": 0.241, + "step": 33814 + }, + { + "epoch": 2.739387556707712, + "grad_norm": 0.06765492260456085, + "learning_rate": 7.003465502497862e-05, + "loss": 0.2618, + "step": 33815 + }, + { + "epoch": 2.7394685677252104, + "grad_norm": 0.07305973023176193, + "learning_rate": 7.0030154372384e-05, + "loss": 0.2412, + "step": 33816 + }, + { + "epoch": 2.739549578742709, + "grad_norm": 0.06445764005184174, + "learning_rate": 7.002565371978937e-05, + "loss": 0.224, + "step": 33817 + }, + { + "epoch": 2.7396305897602073, + "grad_norm": 0.06938768178224564, + "learning_rate": 7.002115306719474e-05, + "loss": 0.2641, + "step": 33818 + }, + { + "epoch": 2.7397116007777056, + "grad_norm": 0.06424959003925323, + "learning_rate": 7.001665241460013e-05, + "loss": 0.2338, + "step": 33819 + }, + { + "epoch": 2.7397926117952043, + "grad_norm": 0.07303512841463089, + "learning_rate": 7.001215176200549e-05, + "loss": 0.2504, + "step": 33820 + }, + { + "epoch": 2.7398736228127025, + "grad_norm": 0.0583353228867054, + "learning_rate": 7.000765110941086e-05, + "loss": 0.2449, + "step": 33821 + }, + { + "epoch": 2.7399546338302008, + "grad_norm": 0.06295175850391388, + "learning_rate": 7.000315045681625e-05, + "loss": 0.223, + "step": 33822 + }, 
+ { + "epoch": 2.7400356448476995, + "grad_norm": 0.07108136266469955, + "learning_rate": 6.999864980422161e-05, + "loss": 0.2506, + "step": 33823 + }, + { + "epoch": 2.7401166558651977, + "grad_norm": 0.07039394229650497, + "learning_rate": 6.999414915162698e-05, + "loss": 0.2432, + "step": 33824 + }, + { + "epoch": 2.740197666882696, + "grad_norm": 0.07306911051273346, + "learning_rate": 6.998964849903237e-05, + "loss": 0.2378, + "step": 33825 + }, + { + "epoch": 2.7402786779001946, + "grad_norm": 0.07997465133666992, + "learning_rate": 6.998514784643773e-05, + "loss": 0.2805, + "step": 33826 + }, + { + "epoch": 2.740359688917693, + "grad_norm": 0.08780818432569504, + "learning_rate": 6.99806471938431e-05, + "loss": 0.2553, + "step": 33827 + }, + { + "epoch": 2.740440699935191, + "grad_norm": 0.07117179781198502, + "learning_rate": 6.997614654124849e-05, + "loss": 0.2315, + "step": 33828 + }, + { + "epoch": 2.74052171095269, + "grad_norm": 0.08158638328313828, + "learning_rate": 6.997164588865385e-05, + "loss": 0.2547, + "step": 33829 + }, + { + "epoch": 2.740602721970188, + "grad_norm": 0.06616087257862091, + "learning_rate": 6.996714523605923e-05, + "loss": 0.2258, + "step": 33830 + }, + { + "epoch": 2.7406837329876863, + "grad_norm": 0.0600474514067173, + "learning_rate": 6.996264458346461e-05, + "loss": 0.2402, + "step": 33831 + }, + { + "epoch": 2.7407647440051845, + "grad_norm": 0.06791268289089203, + "learning_rate": 6.995814393086999e-05, + "loss": 0.261, + "step": 33832 + }, + { + "epoch": 2.7408457550226832, + "grad_norm": 0.06313014775514603, + "learning_rate": 6.995364327827535e-05, + "loss": 0.2329, + "step": 33833 + }, + { + "epoch": 2.7409267660401815, + "grad_norm": 0.07031618058681488, + "learning_rate": 6.994914262568073e-05, + "loss": 0.2339, + "step": 33834 + }, + { + "epoch": 2.7410077770576797, + "grad_norm": 0.07318814843893051, + "learning_rate": 6.994464197308611e-05, + "loss": 0.2446, + "step": 33835 + }, + { + "epoch": 2.741088788075178, + "grad_norm": 0.0746544748544693, + "learning_rate": 6.994014132049147e-05, + "loss": 0.2573, + "step": 33836 + }, + { + "epoch": 2.7411697990926767, + "grad_norm": 0.08126812428236008, + "learning_rate": 6.993564066789685e-05, + "loss": 0.2768, + "step": 33837 + }, + { + "epoch": 2.741250810110175, + "grad_norm": 0.0760657787322998, + "learning_rate": 6.993114001530223e-05, + "loss": 0.2319, + "step": 33838 + }, + { + "epoch": 2.741331821127673, + "grad_norm": 0.07601358741521835, + "learning_rate": 6.992663936270759e-05, + "loss": 0.262, + "step": 33839 + }, + { + "epoch": 2.741412832145172, + "grad_norm": 0.07119158655405045, + "learning_rate": 6.992213871011297e-05, + "loss": 0.2495, + "step": 33840 + }, + { + "epoch": 2.74149384316267, + "grad_norm": 0.05685252323746681, + "learning_rate": 6.991763805751835e-05, + "loss": 0.2144, + "step": 33841 + }, + { + "epoch": 2.7415748541801683, + "grad_norm": 0.05508258566260338, + "learning_rate": 6.991313740492371e-05, + "loss": 0.2268, + "step": 33842 + }, + { + "epoch": 2.741655865197667, + "grad_norm": 0.07089263945817947, + "learning_rate": 6.99086367523291e-05, + "loss": 0.2272, + "step": 33843 + }, + { + "epoch": 2.7417368762151653, + "grad_norm": 0.07166006416082382, + "learning_rate": 6.990413609973447e-05, + "loss": 0.2718, + "step": 33844 + }, + { + "epoch": 2.7418178872326635, + "grad_norm": 0.0669952780008316, + "learning_rate": 6.989963544713984e-05, + "loss": 0.2445, + "step": 33845 + }, + { + "epoch": 2.741898898250162, + "grad_norm": 0.06594375520944595, + 
"learning_rate": 6.989513479454522e-05, + "loss": 0.2806, + "step": 33846 + }, + { + "epoch": 2.7419799092676604, + "grad_norm": 0.04766225442290306, + "learning_rate": 6.989063414195059e-05, + "loss": 0.2036, + "step": 33847 + }, + { + "epoch": 2.7420609202851587, + "grad_norm": 0.0686596930027008, + "learning_rate": 6.988613348935596e-05, + "loss": 0.2248, + "step": 33848 + }, + { + "epoch": 2.7421419313026574, + "grad_norm": 0.05600970238447189, + "learning_rate": 6.988163283676134e-05, + "loss": 0.206, + "step": 33849 + }, + { + "epoch": 2.7422229423201556, + "grad_norm": 0.08106046915054321, + "learning_rate": 6.987713218416671e-05, + "loss": 0.2254, + "step": 33850 + }, + { + "epoch": 2.742303953337654, + "grad_norm": 0.06680909544229507, + "learning_rate": 6.987263153157208e-05, + "loss": 0.251, + "step": 33851 + }, + { + "epoch": 2.7423849643551526, + "grad_norm": 0.07697292417287827, + "learning_rate": 6.986813087897746e-05, + "loss": 0.2281, + "step": 33852 + }, + { + "epoch": 2.742465975372651, + "grad_norm": 0.07131654024124146, + "learning_rate": 6.986363022638283e-05, + "loss": 0.2315, + "step": 33853 + }, + { + "epoch": 2.742546986390149, + "grad_norm": 0.06702481955289841, + "learning_rate": 6.98591295737882e-05, + "loss": 0.215, + "step": 33854 + }, + { + "epoch": 2.7426279974076473, + "grad_norm": 0.06961885839700699, + "learning_rate": 6.985462892119358e-05, + "loss": 0.2585, + "step": 33855 + }, + { + "epoch": 2.742709008425146, + "grad_norm": 0.06202276796102524, + "learning_rate": 6.985012826859895e-05, + "loss": 0.2271, + "step": 33856 + }, + { + "epoch": 2.7427900194426442, + "grad_norm": 0.07506030052900314, + "learning_rate": 6.984562761600433e-05, + "loss": 0.2559, + "step": 33857 + }, + { + "epoch": 2.7428710304601425, + "grad_norm": 0.07953765243291855, + "learning_rate": 6.98411269634097e-05, + "loss": 0.2489, + "step": 33858 + }, + { + "epoch": 2.7429520414776407, + "grad_norm": 0.07561314105987549, + "learning_rate": 6.983662631081507e-05, + "loss": 0.2335, + "step": 33859 + }, + { + "epoch": 2.7430330524951394, + "grad_norm": 0.07074534147977829, + "learning_rate": 6.983212565822045e-05, + "loss": 0.2195, + "step": 33860 + }, + { + "epoch": 2.7431140635126376, + "grad_norm": 0.06227237731218338, + "learning_rate": 6.982762500562582e-05, + "loss": 0.2746, + "step": 33861 + }, + { + "epoch": 2.743195074530136, + "grad_norm": 0.06314686685800552, + "learning_rate": 6.98231243530312e-05, + "loss": 0.2408, + "step": 33862 + }, + { + "epoch": 2.7432760855476346, + "grad_norm": 0.06816864013671875, + "learning_rate": 6.981862370043657e-05, + "loss": 0.2566, + "step": 33863 + }, + { + "epoch": 2.743357096565133, + "grad_norm": 0.057434193789958954, + "learning_rate": 6.981412304784194e-05, + "loss": 0.1896, + "step": 33864 + }, + { + "epoch": 2.743438107582631, + "grad_norm": 0.08918724209070206, + "learning_rate": 6.980962239524731e-05, + "loss": 0.2798, + "step": 33865 + }, + { + "epoch": 2.7435191186001298, + "grad_norm": 0.0637928768992424, + "learning_rate": 6.980512174265269e-05, + "loss": 0.2373, + "step": 33866 + }, + { + "epoch": 2.743600129617628, + "grad_norm": 0.06168600916862488, + "learning_rate": 6.980062109005806e-05, + "loss": 0.2044, + "step": 33867 + }, + { + "epoch": 2.7436811406351262, + "grad_norm": 0.06205640733242035, + "learning_rate": 6.979612043746344e-05, + "loss": 0.2485, + "step": 33868 + }, + { + "epoch": 2.743762151652625, + "grad_norm": 0.0713237076997757, + "learning_rate": 6.979161978486881e-05, + "loss": 0.2555, + "step": 33869 + 
}, + { + "epoch": 2.743843162670123, + "grad_norm": 0.0663994699716568, + "learning_rate": 6.978711913227418e-05, + "loss": 0.2017, + "step": 33870 + }, + { + "epoch": 2.7439241736876214, + "grad_norm": 0.07154440879821777, + "learning_rate": 6.978261847967956e-05, + "loss": 0.259, + "step": 33871 + }, + { + "epoch": 2.74400518470512, + "grad_norm": 0.08224856108427048, + "learning_rate": 6.977811782708493e-05, + "loss": 0.2549, + "step": 33872 + }, + { + "epoch": 2.7440861957226184, + "grad_norm": 0.07634004205465317, + "learning_rate": 6.97736171744903e-05, + "loss": 0.2671, + "step": 33873 + }, + { + "epoch": 2.7441672067401166, + "grad_norm": 0.06648749858140945, + "learning_rate": 6.976911652189568e-05, + "loss": 0.2517, + "step": 33874 + }, + { + "epoch": 2.7442482177576153, + "grad_norm": 0.07519011199474335, + "learning_rate": 6.976461586930105e-05, + "loss": 0.2343, + "step": 33875 + }, + { + "epoch": 2.7443292287751135, + "grad_norm": 0.0715513825416565, + "learning_rate": 6.976011521670642e-05, + "loss": 0.2435, + "step": 33876 + }, + { + "epoch": 2.744410239792612, + "grad_norm": 0.05868488922715187, + "learning_rate": 6.97556145641118e-05, + "loss": 0.191, + "step": 33877 + }, + { + "epoch": 2.74449125081011, + "grad_norm": 0.06658539921045303, + "learning_rate": 6.975111391151717e-05, + "loss": 0.2674, + "step": 33878 + }, + { + "epoch": 2.7445722618276087, + "grad_norm": 0.07914631813764572, + "learning_rate": 6.974661325892255e-05, + "loss": 0.2455, + "step": 33879 + }, + { + "epoch": 2.744653272845107, + "grad_norm": 0.06325214356184006, + "learning_rate": 6.974211260632792e-05, + "loss": 0.2192, + "step": 33880 + }, + { + "epoch": 2.744734283862605, + "grad_norm": 0.07583370804786682, + "learning_rate": 6.973761195373329e-05, + "loss": 0.246, + "step": 33881 + }, + { + "epoch": 2.7448152948801035, + "grad_norm": 0.08487239480018616, + "learning_rate": 6.973311130113867e-05, + "loss": 0.264, + "step": 33882 + }, + { + "epoch": 2.744896305897602, + "grad_norm": 0.06740166246891022, + "learning_rate": 6.972861064854404e-05, + "loss": 0.2526, + "step": 33883 + }, + { + "epoch": 2.7449773169151004, + "grad_norm": 0.065463587641716, + "learning_rate": 6.972410999594941e-05, + "loss": 0.2367, + "step": 33884 + }, + { + "epoch": 2.7450583279325986, + "grad_norm": 0.07324957102537155, + "learning_rate": 6.971960934335479e-05, + "loss": 0.2468, + "step": 33885 + }, + { + "epoch": 2.7451393389500973, + "grad_norm": 0.06521370261907578, + "learning_rate": 6.971510869076016e-05, + "loss": 0.2448, + "step": 33886 + }, + { + "epoch": 2.7452203499675956, + "grad_norm": 0.0762423500418663, + "learning_rate": 6.971060803816553e-05, + "loss": 0.2584, + "step": 33887 + }, + { + "epoch": 2.745301360985094, + "grad_norm": 0.06677243113517761, + "learning_rate": 6.970610738557091e-05, + "loss": 0.2226, + "step": 33888 + }, + { + "epoch": 2.7453823720025925, + "grad_norm": 0.06267214566469193, + "learning_rate": 6.970160673297628e-05, + "loss": 0.2011, + "step": 33889 + }, + { + "epoch": 2.7454633830200907, + "grad_norm": 0.054746679961681366, + "learning_rate": 6.969710608038165e-05, + "loss": 0.2164, + "step": 33890 + }, + { + "epoch": 2.745544394037589, + "grad_norm": 0.06546787172555923, + "learning_rate": 6.969260542778703e-05, + "loss": 0.2138, + "step": 33891 + }, + { + "epoch": 2.7456254050550877, + "grad_norm": 0.06459563225507736, + "learning_rate": 6.96881047751924e-05, + "loss": 0.2623, + "step": 33892 + }, + { + "epoch": 2.745706416072586, + "grad_norm": 0.06675142049789429, + 
"learning_rate": 6.968360412259778e-05, + "loss": 0.2613, + "step": 33893 + }, + { + "epoch": 2.745787427090084, + "grad_norm": 0.06525643169879913, + "learning_rate": 6.967910347000315e-05, + "loss": 0.2273, + "step": 33894 + }, + { + "epoch": 2.745868438107583, + "grad_norm": 0.0785670205950737, + "learning_rate": 6.967460281740852e-05, + "loss": 0.2384, + "step": 33895 + }, + { + "epoch": 2.745949449125081, + "grad_norm": 0.061771150678396225, + "learning_rate": 6.96701021648139e-05, + "loss": 0.2368, + "step": 33896 + }, + { + "epoch": 2.7460304601425793, + "grad_norm": 0.07329079508781433, + "learning_rate": 6.966560151221928e-05, + "loss": 0.2479, + "step": 33897 + }, + { + "epoch": 2.746111471160078, + "grad_norm": 0.058052219450473785, + "learning_rate": 6.966110085962466e-05, + "loss": 0.2358, + "step": 33898 + }, + { + "epoch": 2.7461924821775763, + "grad_norm": 0.08250406384468079, + "learning_rate": 6.965660020703002e-05, + "loss": 0.2169, + "step": 33899 + }, + { + "epoch": 2.7462734931950745, + "grad_norm": 0.06558365374803543, + "learning_rate": 6.96520995544354e-05, + "loss": 0.2499, + "step": 33900 + }, + { + "epoch": 2.7463545042125728, + "grad_norm": 0.06303752958774567, + "learning_rate": 6.964759890184078e-05, + "loss": 0.1933, + "step": 33901 + }, + { + "epoch": 2.7464355152300715, + "grad_norm": 0.07631899416446686, + "learning_rate": 6.964309824924614e-05, + "loss": 0.231, + "step": 33902 + }, + { + "epoch": 2.7465165262475697, + "grad_norm": 0.07728277891874313, + "learning_rate": 6.963859759665153e-05, + "loss": 0.254, + "step": 33903 + }, + { + "epoch": 2.746597537265068, + "grad_norm": 0.06139729544520378, + "learning_rate": 6.96340969440569e-05, + "loss": 0.2061, + "step": 33904 + }, + { + "epoch": 2.746678548282566, + "grad_norm": 0.06394772976636887, + "learning_rate": 6.962959629146226e-05, + "loss": 0.2302, + "step": 33905 + }, + { + "epoch": 2.746759559300065, + "grad_norm": 0.07677542418241501, + "learning_rate": 6.962509563886765e-05, + "loss": 0.2518, + "step": 33906 + }, + { + "epoch": 2.746840570317563, + "grad_norm": 0.06649527698755264, + "learning_rate": 6.962059498627302e-05, + "loss": 0.2174, + "step": 33907 + }, + { + "epoch": 2.7469215813350614, + "grad_norm": 0.07912680506706238, + "learning_rate": 6.961609433367838e-05, + "loss": 0.2361, + "step": 33908 + }, + { + "epoch": 2.74700259235256, + "grad_norm": 0.05456256493926048, + "learning_rate": 6.961159368108377e-05, + "loss": 0.2232, + "step": 33909 + }, + { + "epoch": 2.7470836033700583, + "grad_norm": 0.06172642111778259, + "learning_rate": 6.960709302848914e-05, + "loss": 0.2531, + "step": 33910 + }, + { + "epoch": 2.7471646143875565, + "grad_norm": 0.07474996894598007, + "learning_rate": 6.96025923758945e-05, + "loss": 0.2614, + "step": 33911 + }, + { + "epoch": 2.7472456254050552, + "grad_norm": 0.06314655393362045, + "learning_rate": 6.959809172329989e-05, + "loss": 0.2316, + "step": 33912 + }, + { + "epoch": 2.7473266364225535, + "grad_norm": 0.0807376354932785, + "learning_rate": 6.959359107070526e-05, + "loss": 0.2238, + "step": 33913 + }, + { + "epoch": 2.7474076474400517, + "grad_norm": 0.07609011977910995, + "learning_rate": 6.958909041811062e-05, + "loss": 0.2425, + "step": 33914 + }, + { + "epoch": 2.7474886584575504, + "grad_norm": 0.08066786080598831, + "learning_rate": 6.958458976551601e-05, + "loss": 0.2354, + "step": 33915 + }, + { + "epoch": 2.7475696694750487, + "grad_norm": 0.06597253680229187, + "learning_rate": 6.958008911292138e-05, + "loss": 0.2118, + "step": 33916 
+ }, + { + "epoch": 2.747650680492547, + "grad_norm": 0.06814610958099365, + "learning_rate": 6.957558846032674e-05, + "loss": 0.2365, + "step": 33917 + }, + { + "epoch": 2.7477316915100456, + "grad_norm": 0.07999754697084427, + "learning_rate": 6.957108780773213e-05, + "loss": 0.277, + "step": 33918 + }, + { + "epoch": 2.747812702527544, + "grad_norm": 0.07574919611215591, + "learning_rate": 6.95665871551375e-05, + "loss": 0.281, + "step": 33919 + }, + { + "epoch": 2.747893713545042, + "grad_norm": 0.062194306403398514, + "learning_rate": 6.956208650254286e-05, + "loss": 0.2249, + "step": 33920 + }, + { + "epoch": 2.7479747245625408, + "grad_norm": 0.06311631202697754, + "learning_rate": 6.955758584994825e-05, + "loss": 0.2159, + "step": 33921 + }, + { + "epoch": 2.748055735580039, + "grad_norm": 0.06273925304412842, + "learning_rate": 6.955308519735362e-05, + "loss": 0.196, + "step": 33922 + }, + { + "epoch": 2.7481367465975373, + "grad_norm": 0.05916561558842659, + "learning_rate": 6.954858454475898e-05, + "loss": 0.2142, + "step": 33923 + }, + { + "epoch": 2.7482177576150355, + "grad_norm": 0.07138768583536148, + "learning_rate": 6.954408389216437e-05, + "loss": 0.2128, + "step": 33924 + }, + { + "epoch": 2.7482987686325338, + "grad_norm": 0.07354070246219635, + "learning_rate": 6.953958323956974e-05, + "loss": 0.2503, + "step": 33925 + }, + { + "epoch": 2.7483797796500324, + "grad_norm": 0.07693708688020706, + "learning_rate": 6.953508258697512e-05, + "loss": 0.2403, + "step": 33926 + }, + { + "epoch": 2.7484607906675307, + "grad_norm": 0.07450443506240845, + "learning_rate": 6.953058193438049e-05, + "loss": 0.2474, + "step": 33927 + }, + { + "epoch": 2.748541801685029, + "grad_norm": 0.07271936535835266, + "learning_rate": 6.952608128178587e-05, + "loss": 0.2424, + "step": 33928 + }, + { + "epoch": 2.7486228127025276, + "grad_norm": 0.05559582635760307, + "learning_rate": 6.952158062919124e-05, + "loss": 0.2425, + "step": 33929 + }, + { + "epoch": 2.748703823720026, + "grad_norm": 0.07008780539035797, + "learning_rate": 6.951707997659661e-05, + "loss": 0.248, + "step": 33930 + }, + { + "epoch": 2.748784834737524, + "grad_norm": 0.06590422242879868, + "learning_rate": 6.951257932400199e-05, + "loss": 0.2359, + "step": 33931 + }, + { + "epoch": 2.748865845755023, + "grad_norm": 0.0587596632540226, + "learning_rate": 6.950807867140736e-05, + "loss": 0.257, + "step": 33932 + }, + { + "epoch": 2.748946856772521, + "grad_norm": 0.06739132106304169, + "learning_rate": 6.950357801881273e-05, + "loss": 0.2381, + "step": 33933 + }, + { + "epoch": 2.7490278677900193, + "grad_norm": 0.06548912823200226, + "learning_rate": 6.949907736621811e-05, + "loss": 0.2451, + "step": 33934 + }, + { + "epoch": 2.749108878807518, + "grad_norm": 0.05901022627949715, + "learning_rate": 6.949457671362348e-05, + "loss": 0.2312, + "step": 33935 + }, + { + "epoch": 2.749189889825016, + "grad_norm": 0.0753978043794632, + "learning_rate": 6.949007606102885e-05, + "loss": 0.2602, + "step": 33936 + }, + { + "epoch": 2.7492709008425145, + "grad_norm": 0.05910757556557655, + "learning_rate": 6.948557540843423e-05, + "loss": 0.2512, + "step": 33937 + }, + { + "epoch": 2.749351911860013, + "grad_norm": 0.06722880154848099, + "learning_rate": 6.94810747558396e-05, + "loss": 0.2267, + "step": 33938 + }, + { + "epoch": 2.7494329228775114, + "grad_norm": 0.07442230731248856, + "learning_rate": 6.947657410324498e-05, + "loss": 0.2271, + "step": 33939 + }, + { + "epoch": 2.7495139338950096, + "grad_norm": 0.06559806317090988, + 
"learning_rate": 6.947207345065035e-05, + "loss": 0.2305, + "step": 33940 + }, + { + "epoch": 2.7495949449125083, + "grad_norm": 0.05963011085987091, + "learning_rate": 6.946757279805572e-05, + "loss": 0.242, + "step": 33941 + }, + { + "epoch": 2.7496759559300066, + "grad_norm": 0.0676308423280716, + "learning_rate": 6.94630721454611e-05, + "loss": 0.2261, + "step": 33942 + }, + { + "epoch": 2.749756966947505, + "grad_norm": 0.07964812219142914, + "learning_rate": 6.945857149286647e-05, + "loss": 0.3088, + "step": 33943 + }, + { + "epoch": 2.7498379779650035, + "grad_norm": 0.05791834741830826, + "learning_rate": 6.945407084027184e-05, + "loss": 0.218, + "step": 33944 + }, + { + "epoch": 2.7499189889825018, + "grad_norm": 0.07928522676229477, + "learning_rate": 6.944957018767722e-05, + "loss": 0.225, + "step": 33945 + }, + { + "epoch": 2.75, + "grad_norm": 0.08295446634292603, + "learning_rate": 6.944506953508259e-05, + "loss": 0.2764, + "step": 33946 + }, + { + "epoch": 2.7500810110174982, + "grad_norm": 0.06554339081048965, + "learning_rate": 6.944056888248796e-05, + "loss": 0.2681, + "step": 33947 + }, + { + "epoch": 2.7501620220349965, + "grad_norm": 0.07376903295516968, + "learning_rate": 6.943606822989334e-05, + "loss": 0.2583, + "step": 33948 + }, + { + "epoch": 2.750243033052495, + "grad_norm": 0.07312076538801193, + "learning_rate": 6.943156757729871e-05, + "loss": 0.2299, + "step": 33949 + }, + { + "epoch": 2.7503240440699934, + "grad_norm": 0.05707407742738724, + "learning_rate": 6.942706692470408e-05, + "loss": 0.2532, + "step": 33950 + }, + { + "epoch": 2.7504050550874917, + "grad_norm": 0.06634999811649323, + "learning_rate": 6.942256627210946e-05, + "loss": 0.2768, + "step": 33951 + }, + { + "epoch": 2.7504860661049904, + "grad_norm": 0.06431234627962112, + "learning_rate": 6.941806561951483e-05, + "loss": 0.2164, + "step": 33952 + }, + { + "epoch": 2.7505670771224886, + "grad_norm": 0.07189373672008514, + "learning_rate": 6.94135649669202e-05, + "loss": 0.2043, + "step": 33953 + }, + { + "epoch": 2.750648088139987, + "grad_norm": 0.06200005114078522, + "learning_rate": 6.940906431432558e-05, + "loss": 0.2356, + "step": 33954 + }, + { + "epoch": 2.7507290991574855, + "grad_norm": 0.08007141947746277, + "learning_rate": 6.940456366173095e-05, + "loss": 0.2381, + "step": 33955 + }, + { + "epoch": 2.750810110174984, + "grad_norm": 0.06408270448446274, + "learning_rate": 6.940006300913633e-05, + "loss": 0.2401, + "step": 33956 + }, + { + "epoch": 2.750891121192482, + "grad_norm": 0.06597329676151276, + "learning_rate": 6.93955623565417e-05, + "loss": 0.2812, + "step": 33957 + }, + { + "epoch": 2.7509721322099807, + "grad_norm": 0.0680282860994339, + "learning_rate": 6.939106170394707e-05, + "loss": 0.2148, + "step": 33958 + }, + { + "epoch": 2.751053143227479, + "grad_norm": 0.07116768509149551, + "learning_rate": 6.938656105135245e-05, + "loss": 0.2254, + "step": 33959 + }, + { + "epoch": 2.751134154244977, + "grad_norm": 0.09429546445608139, + "learning_rate": 6.938206039875782e-05, + "loss": 0.2869, + "step": 33960 + }, + { + "epoch": 2.751215165262476, + "grad_norm": 0.061728041619062424, + "learning_rate": 6.93775597461632e-05, + "loss": 0.2476, + "step": 33961 + }, + { + "epoch": 2.751296176279974, + "grad_norm": 0.0597955696284771, + "learning_rate": 6.937305909356857e-05, + "loss": 0.2159, + "step": 33962 + }, + { + "epoch": 2.7513771872974724, + "grad_norm": 0.0768377035856247, + "learning_rate": 6.936855844097394e-05, + "loss": 0.2677, + "step": 33963 + }, + { + 
"epoch": 2.751458198314971, + "grad_norm": 0.09343895316123962, + "learning_rate": 6.936405778837932e-05, + "loss": 0.2634, + "step": 33964 + }, + { + "epoch": 2.7515392093324693, + "grad_norm": 0.08717542886734009, + "learning_rate": 6.935955713578469e-05, + "loss": 0.2586, + "step": 33965 + }, + { + "epoch": 2.7516202203499676, + "grad_norm": 0.09890085458755493, + "learning_rate": 6.935505648319006e-05, + "loss": 0.2414, + "step": 33966 + }, + { + "epoch": 2.7517012313674662, + "grad_norm": 0.07061535120010376, + "learning_rate": 6.935055583059545e-05, + "loss": 0.2343, + "step": 33967 + }, + { + "epoch": 2.7517822423849645, + "grad_norm": 0.0656886100769043, + "learning_rate": 6.934605517800081e-05, + "loss": 0.2115, + "step": 33968 + }, + { + "epoch": 2.7518632534024627, + "grad_norm": 0.056593045592308044, + "learning_rate": 6.934155452540618e-05, + "loss": 0.2224, + "step": 33969 + }, + { + "epoch": 2.751944264419961, + "grad_norm": 0.08037639409303665, + "learning_rate": 6.933705387281157e-05, + "loss": 0.2474, + "step": 33970 + }, + { + "epoch": 2.7520252754374592, + "grad_norm": 0.06938332319259644, + "learning_rate": 6.933255322021693e-05, + "loss": 0.2223, + "step": 33971 + }, + { + "epoch": 2.752106286454958, + "grad_norm": 0.081057608127594, + "learning_rate": 6.93280525676223e-05, + "loss": 0.2436, + "step": 33972 + }, + { + "epoch": 2.752187297472456, + "grad_norm": 0.07251177728176117, + "learning_rate": 6.932355191502769e-05, + "loss": 0.2473, + "step": 33973 + }, + { + "epoch": 2.7522683084899544, + "grad_norm": 0.07197289913892746, + "learning_rate": 6.931905126243305e-05, + "loss": 0.2609, + "step": 33974 + }, + { + "epoch": 2.752349319507453, + "grad_norm": 0.071237713098526, + "learning_rate": 6.931455060983842e-05, + "loss": 0.2004, + "step": 33975 + }, + { + "epoch": 2.7524303305249513, + "grad_norm": 0.069503553211689, + "learning_rate": 6.931004995724381e-05, + "loss": 0.2672, + "step": 33976 + }, + { + "epoch": 2.7525113415424496, + "grad_norm": 0.07165911048650742, + "learning_rate": 6.930554930464917e-05, + "loss": 0.1889, + "step": 33977 + }, + { + "epoch": 2.7525923525599483, + "grad_norm": 0.06798816472291946, + "learning_rate": 6.930104865205456e-05, + "loss": 0.2506, + "step": 33978 + }, + { + "epoch": 2.7526733635774465, + "grad_norm": 0.07564514875411987, + "learning_rate": 6.929654799945993e-05, + "loss": 0.2637, + "step": 33979 + }, + { + "epoch": 2.7527543745949448, + "grad_norm": 0.060637280344963074, + "learning_rate": 6.929204734686529e-05, + "loss": 0.2081, + "step": 33980 + }, + { + "epoch": 2.7528353856124435, + "grad_norm": 0.06939760595560074, + "learning_rate": 6.928754669427068e-05, + "loss": 0.2313, + "step": 33981 + }, + { + "epoch": 2.7529163966299417, + "grad_norm": 0.07175181061029434, + "learning_rate": 6.928304604167605e-05, + "loss": 0.2277, + "step": 33982 + }, + { + "epoch": 2.75299740764744, + "grad_norm": 0.06570654362440109, + "learning_rate": 6.927854538908141e-05, + "loss": 0.229, + "step": 33983 + }, + { + "epoch": 2.7530784186649386, + "grad_norm": 0.08200690150260925, + "learning_rate": 6.92740447364868e-05, + "loss": 0.2282, + "step": 33984 + }, + { + "epoch": 2.753159429682437, + "grad_norm": 0.07265735417604446, + "learning_rate": 6.926954408389217e-05, + "loss": 0.2854, + "step": 33985 + }, + { + "epoch": 2.753240440699935, + "grad_norm": 0.0830908939242363, + "learning_rate": 6.926504343129753e-05, + "loss": 0.2356, + "step": 33986 + }, + { + "epoch": 2.753321451717434, + "grad_norm": 0.05797835439443588, + 
"learning_rate": 6.926054277870292e-05, + "loss": 0.2184, + "step": 33987 + }, + { + "epoch": 2.753402462734932, + "grad_norm": 0.07792074233293533, + "learning_rate": 6.92560421261083e-05, + "loss": 0.2397, + "step": 33988 + }, + { + "epoch": 2.7534834737524303, + "grad_norm": 0.07205940783023834, + "learning_rate": 6.925154147351366e-05, + "loss": 0.2362, + "step": 33989 + }, + { + "epoch": 2.7535644847699285, + "grad_norm": 0.06467721611261368, + "learning_rate": 6.924704082091904e-05, + "loss": 0.2393, + "step": 33990 + }, + { + "epoch": 2.7536454957874272, + "grad_norm": 0.06070276349782944, + "learning_rate": 6.924254016832442e-05, + "loss": 0.2191, + "step": 33991 + }, + { + "epoch": 2.7537265068049255, + "grad_norm": 0.06308267265558243, + "learning_rate": 6.923803951572978e-05, + "loss": 0.2415, + "step": 33992 + }, + { + "epoch": 2.7538075178224237, + "grad_norm": 0.0762130543589592, + "learning_rate": 6.923353886313516e-05, + "loss": 0.2365, + "step": 33993 + }, + { + "epoch": 2.753888528839922, + "grad_norm": 0.07591088861227036, + "learning_rate": 6.922903821054054e-05, + "loss": 0.2486, + "step": 33994 + }, + { + "epoch": 2.7539695398574207, + "grad_norm": 0.07481890916824341, + "learning_rate": 6.92245375579459e-05, + "loss": 0.2647, + "step": 33995 + }, + { + "epoch": 2.754050550874919, + "grad_norm": 0.07969322055578232, + "learning_rate": 6.922003690535128e-05, + "loss": 0.2999, + "step": 33996 + }, + { + "epoch": 2.754131561892417, + "grad_norm": 0.08638513833284378, + "learning_rate": 6.921553625275666e-05, + "loss": 0.2851, + "step": 33997 + }, + { + "epoch": 2.754212572909916, + "grad_norm": 0.06518282741308212, + "learning_rate": 6.921103560016202e-05, + "loss": 0.2319, + "step": 33998 + }, + { + "epoch": 2.754293583927414, + "grad_norm": 0.06742814183235168, + "learning_rate": 6.92065349475674e-05, + "loss": 0.2463, + "step": 33999 + }, + { + "epoch": 2.7543745949449123, + "grad_norm": 0.07727876305580139, + "learning_rate": 6.920203429497278e-05, + "loss": 0.2808, + "step": 34000 + }, + { + "epoch": 2.754455605962411, + "grad_norm": 0.08557460457086563, + "learning_rate": 6.919753364237814e-05, + "loss": 0.2823, + "step": 34001 + }, + { + "epoch": 2.7545366169799093, + "grad_norm": 0.0750543400645256, + "learning_rate": 6.919303298978353e-05, + "loss": 0.2273, + "step": 34002 + }, + { + "epoch": 2.7546176279974075, + "grad_norm": 0.08017384260892868, + "learning_rate": 6.91885323371889e-05, + "loss": 0.2672, + "step": 34003 + }, + { + "epoch": 2.754698639014906, + "grad_norm": 0.06331358104944229, + "learning_rate": 6.918403168459427e-05, + "loss": 0.2213, + "step": 34004 + }, + { + "epoch": 2.7547796500324044, + "grad_norm": 0.09369877725839615, + "learning_rate": 6.917953103199965e-05, + "loss": 0.2576, + "step": 34005 + }, + { + "epoch": 2.7548606610499027, + "grad_norm": 0.06448009610176086, + "learning_rate": 6.917503037940502e-05, + "loss": 0.245, + "step": 34006 + }, + { + "epoch": 2.7549416720674014, + "grad_norm": 0.07438341528177261, + "learning_rate": 6.91705297268104e-05, + "loss": 0.2535, + "step": 34007 + }, + { + "epoch": 2.7550226830848996, + "grad_norm": 0.06136835366487503, + "learning_rate": 6.916602907421577e-05, + "loss": 0.1772, + "step": 34008 + }, + { + "epoch": 2.755103694102398, + "grad_norm": 0.0914490669965744, + "learning_rate": 6.916152842162114e-05, + "loss": 0.2733, + "step": 34009 + }, + { + "epoch": 2.7551847051198965, + "grad_norm": 0.06491867452859879, + "learning_rate": 6.915702776902651e-05, + "loss": 0.237, + "step": 34010 + 
}, + { + "epoch": 2.755265716137395, + "grad_norm": 0.05932406708598137, + "learning_rate": 6.915252711643189e-05, + "loss": 0.2666, + "step": 34011 + }, + { + "epoch": 2.755346727154893, + "grad_norm": 0.07096493989229202, + "learning_rate": 6.914802646383726e-05, + "loss": 0.2507, + "step": 34012 + }, + { + "epoch": 2.7554277381723913, + "grad_norm": 0.06969096511602402, + "learning_rate": 6.914352581124264e-05, + "loss": 0.2219, + "step": 34013 + }, + { + "epoch": 2.75550874918989, + "grad_norm": 0.0693049356341362, + "learning_rate": 6.913902515864801e-05, + "loss": 0.2498, + "step": 34014 + }, + { + "epoch": 2.755589760207388, + "grad_norm": 0.06646228581666946, + "learning_rate": 6.913452450605338e-05, + "loss": 0.2494, + "step": 34015 + }, + { + "epoch": 2.7556707712248865, + "grad_norm": 0.058434128761291504, + "learning_rate": 6.913002385345876e-05, + "loss": 0.1993, + "step": 34016 + }, + { + "epoch": 2.7557517822423847, + "grad_norm": 0.06461863964796066, + "learning_rate": 6.912552320086413e-05, + "loss": 0.2394, + "step": 34017 + }, + { + "epoch": 2.7558327932598834, + "grad_norm": 0.06257472187280655, + "learning_rate": 6.91210225482695e-05, + "loss": 0.2442, + "step": 34018 + }, + { + "epoch": 2.7559138042773816, + "grad_norm": 0.07050945609807968, + "learning_rate": 6.911652189567488e-05, + "loss": 0.2254, + "step": 34019 + }, + { + "epoch": 2.75599481529488, + "grad_norm": 0.07353843748569489, + "learning_rate": 6.911202124308025e-05, + "loss": 0.2551, + "step": 34020 + }, + { + "epoch": 2.7560758263123786, + "grad_norm": 0.07434510439634323, + "learning_rate": 6.910752059048562e-05, + "loss": 0.2247, + "step": 34021 + }, + { + "epoch": 2.756156837329877, + "grad_norm": 0.06477247178554535, + "learning_rate": 6.9103019937891e-05, + "loss": 0.216, + "step": 34022 + }, + { + "epoch": 2.756237848347375, + "grad_norm": 0.06273271143436432, + "learning_rate": 6.909851928529637e-05, + "loss": 0.2295, + "step": 34023 + }, + { + "epoch": 2.7563188593648738, + "grad_norm": 0.06990016996860504, + "learning_rate": 6.909401863270174e-05, + "loss": 0.2672, + "step": 34024 + }, + { + "epoch": 2.756399870382372, + "grad_norm": 0.070733442902565, + "learning_rate": 6.908951798010712e-05, + "loss": 0.2324, + "step": 34025 + }, + { + "epoch": 2.7564808813998702, + "grad_norm": 0.07302505522966385, + "learning_rate": 6.908501732751249e-05, + "loss": 0.2399, + "step": 34026 + }, + { + "epoch": 2.756561892417369, + "grad_norm": 0.0650155171751976, + "learning_rate": 6.908051667491787e-05, + "loss": 0.2295, + "step": 34027 + }, + { + "epoch": 2.756642903434867, + "grad_norm": 0.07546880841255188, + "learning_rate": 6.907601602232324e-05, + "loss": 0.2041, + "step": 34028 + }, + { + "epoch": 2.7567239144523654, + "grad_norm": 0.06319276988506317, + "learning_rate": 6.907151536972861e-05, + "loss": 0.2252, + "step": 34029 + }, + { + "epoch": 2.756804925469864, + "grad_norm": 0.08173449337482452, + "learning_rate": 6.906701471713399e-05, + "loss": 0.2507, + "step": 34030 + }, + { + "epoch": 2.7568859364873624, + "grad_norm": 0.06582282483577728, + "learning_rate": 6.906251406453936e-05, + "loss": 0.23, + "step": 34031 + }, + { + "epoch": 2.7569669475048606, + "grad_norm": 0.0720318853855133, + "learning_rate": 6.905801341194473e-05, + "loss": 0.2675, + "step": 34032 + }, + { + "epoch": 2.7570479585223593, + "grad_norm": 0.06968838721513748, + "learning_rate": 6.905351275935011e-05, + "loss": 0.2095, + "step": 34033 + }, + { + "epoch": 2.7571289695398575, + "grad_norm": 0.07105859369039536, + 
"learning_rate": 6.904901210675548e-05, + "loss": 0.2526, + "step": 34034 + }, + { + "epoch": 2.7572099805573558, + "grad_norm": 0.07147864252328873, + "learning_rate": 6.904451145416085e-05, + "loss": 0.2264, + "step": 34035 + }, + { + "epoch": 2.757290991574854, + "grad_norm": 0.08469801396131516, + "learning_rate": 6.904001080156624e-05, + "loss": 0.2404, + "step": 34036 + }, + { + "epoch": 2.7573720025923527, + "grad_norm": 0.0625695213675499, + "learning_rate": 6.90355101489716e-05, + "loss": 0.2344, + "step": 34037 + }, + { + "epoch": 2.757453013609851, + "grad_norm": 0.06275301426649094, + "learning_rate": 6.903100949637698e-05, + "loss": 0.2615, + "step": 34038 + }, + { + "epoch": 2.757534024627349, + "grad_norm": 0.07460293918848038, + "learning_rate": 6.902650884378236e-05, + "loss": 0.2533, + "step": 34039 + }, + { + "epoch": 2.7576150356448474, + "grad_norm": 0.0641096755862236, + "learning_rate": 6.902200819118772e-05, + "loss": 0.2409, + "step": 34040 + }, + { + "epoch": 2.757696046662346, + "grad_norm": 0.06952983140945435, + "learning_rate": 6.90175075385931e-05, + "loss": 0.2488, + "step": 34041 + }, + { + "epoch": 2.7577770576798444, + "grad_norm": 0.06201440840959549, + "learning_rate": 6.901300688599848e-05, + "loss": 0.2475, + "step": 34042 + }, + { + "epoch": 2.7578580686973426, + "grad_norm": 0.06937065720558167, + "learning_rate": 6.900850623340384e-05, + "loss": 0.2258, + "step": 34043 + }, + { + "epoch": 2.7579390797148413, + "grad_norm": 0.07255363464355469, + "learning_rate": 6.900400558080922e-05, + "loss": 0.2046, + "step": 34044 + }, + { + "epoch": 2.7580200907323396, + "grad_norm": 0.06431081891059875, + "learning_rate": 6.89995049282146e-05, + "loss": 0.2412, + "step": 34045 + }, + { + "epoch": 2.758101101749838, + "grad_norm": 0.060214076191186905, + "learning_rate": 6.899500427561996e-05, + "loss": 0.2368, + "step": 34046 + }, + { + "epoch": 2.7581821127673365, + "grad_norm": 0.0770927146077156, + "learning_rate": 6.899050362302534e-05, + "loss": 0.261, + "step": 34047 + }, + { + "epoch": 2.7582631237848347, + "grad_norm": 0.055978596210479736, + "learning_rate": 6.898600297043072e-05, + "loss": 0.2484, + "step": 34048 + }, + { + "epoch": 2.758344134802333, + "grad_norm": 0.07794700562953949, + "learning_rate": 6.898150231783609e-05, + "loss": 0.2395, + "step": 34049 + }, + { + "epoch": 2.7584251458198317, + "grad_norm": 0.08637645840644836, + "learning_rate": 6.897700166524146e-05, + "loss": 0.2474, + "step": 34050 + }, + { + "epoch": 2.75850615683733, + "grad_norm": 0.06628405302762985, + "learning_rate": 6.897250101264685e-05, + "loss": 0.2618, + "step": 34051 + }, + { + "epoch": 2.758587167854828, + "grad_norm": 0.07030244916677475, + "learning_rate": 6.89680003600522e-05, + "loss": 0.2091, + "step": 34052 + }, + { + "epoch": 2.758668178872327, + "grad_norm": 0.05243457108736038, + "learning_rate": 6.896349970745758e-05, + "loss": 0.2023, + "step": 34053 + }, + { + "epoch": 2.758749189889825, + "grad_norm": 0.06554213911294937, + "learning_rate": 6.895899905486297e-05, + "loss": 0.2613, + "step": 34054 + }, + { + "epoch": 2.7588302009073233, + "grad_norm": 0.08007621020078659, + "learning_rate": 6.895449840226833e-05, + "loss": 0.2792, + "step": 34055 + }, + { + "epoch": 2.758911211924822, + "grad_norm": 0.07678791880607605, + "learning_rate": 6.894999774967371e-05, + "loss": 0.2163, + "step": 34056 + }, + { + "epoch": 2.7589922229423203, + "grad_norm": 0.06600095331668854, + "learning_rate": 6.894549709707909e-05, + "loss": 0.2529, + "step": 34057 + 
}, + { + "epoch": 2.7590732339598185, + "grad_norm": 0.0781325250864029, + "learning_rate": 6.894099644448445e-05, + "loss": 0.2323, + "step": 34058 + }, + { + "epoch": 2.7591542449773168, + "grad_norm": 0.07266079634428024, + "learning_rate": 6.893649579188983e-05, + "loss": 0.2763, + "step": 34059 + }, + { + "epoch": 2.7592352559948155, + "grad_norm": 0.06358586996793747, + "learning_rate": 6.893199513929521e-05, + "loss": 0.2214, + "step": 34060 + }, + { + "epoch": 2.7593162670123137, + "grad_norm": 0.08538417518138885, + "learning_rate": 6.892749448670057e-05, + "loss": 0.2352, + "step": 34061 + }, + { + "epoch": 2.759397278029812, + "grad_norm": 0.07268763333559036, + "learning_rate": 6.892299383410596e-05, + "loss": 0.2172, + "step": 34062 + }, + { + "epoch": 2.75947828904731, + "grad_norm": 0.07276776432991028, + "learning_rate": 6.891849318151133e-05, + "loss": 0.255, + "step": 34063 + }, + { + "epoch": 2.759559300064809, + "grad_norm": 0.07200052589178085, + "learning_rate": 6.891399252891669e-05, + "loss": 0.2703, + "step": 34064 + }, + { + "epoch": 2.759640311082307, + "grad_norm": 0.09576958417892456, + "learning_rate": 6.890949187632208e-05, + "loss": 0.2816, + "step": 34065 + }, + { + "epoch": 2.7597213220998054, + "grad_norm": 0.08117634057998657, + "learning_rate": 6.890499122372745e-05, + "loss": 0.249, + "step": 34066 + }, + { + "epoch": 2.759802333117304, + "grad_norm": 0.059016503393650055, + "learning_rate": 6.890049057113281e-05, + "loss": 0.2394, + "step": 34067 + }, + { + "epoch": 2.7598833441348023, + "grad_norm": 0.06975926458835602, + "learning_rate": 6.88959899185382e-05, + "loss": 0.2693, + "step": 34068 + }, + { + "epoch": 2.7599643551523005, + "grad_norm": 0.06149639934301376, + "learning_rate": 6.889148926594357e-05, + "loss": 0.2803, + "step": 34069 + }, + { + "epoch": 2.7600453661697992, + "grad_norm": 0.08504508435726166, + "learning_rate": 6.888698861334893e-05, + "loss": 0.277, + "step": 34070 + }, + { + "epoch": 2.7601263771872975, + "grad_norm": 0.0662258192896843, + "learning_rate": 6.888248796075432e-05, + "loss": 0.2374, + "step": 34071 + }, + { + "epoch": 2.7602073882047957, + "grad_norm": 0.0804053395986557, + "learning_rate": 6.887798730815969e-05, + "loss": 0.2527, + "step": 34072 + }, + { + "epoch": 2.7602883992222944, + "grad_norm": 0.07075381278991699, + "learning_rate": 6.887348665556505e-05, + "loss": 0.2536, + "step": 34073 + }, + { + "epoch": 2.7603694102397927, + "grad_norm": 0.06893350183963776, + "learning_rate": 6.886898600297044e-05, + "loss": 0.2206, + "step": 34074 + }, + { + "epoch": 2.760450421257291, + "grad_norm": 0.060836561024188995, + "learning_rate": 6.886448535037581e-05, + "loss": 0.2044, + "step": 34075 + }, + { + "epoch": 2.7605314322747896, + "grad_norm": 0.0595211423933506, + "learning_rate": 6.885998469778117e-05, + "loss": 0.2402, + "step": 34076 + }, + { + "epoch": 2.760612443292288, + "grad_norm": 0.07096649706363678, + "learning_rate": 6.885548404518656e-05, + "loss": 0.2102, + "step": 34077 + }, + { + "epoch": 2.760693454309786, + "grad_norm": 0.0808127298951149, + "learning_rate": 6.885098339259193e-05, + "loss": 0.2048, + "step": 34078 + }, + { + "epoch": 2.7607744653272848, + "grad_norm": 0.06761705130338669, + "learning_rate": 6.884648273999729e-05, + "loss": 0.2122, + "step": 34079 + }, + { + "epoch": 2.760855476344783, + "grad_norm": 0.05668129399418831, + "learning_rate": 6.884198208740268e-05, + "loss": 0.2432, + "step": 34080 + }, + { + "epoch": 2.7609364873622813, + "grad_norm": 0.060756102204322815, 
+ "learning_rate": 6.883748143480805e-05, + "loss": 0.257, + "step": 34081 + }, + { + "epoch": 2.7610174983797795, + "grad_norm": 0.07630965113639832, + "learning_rate": 6.883298078221341e-05, + "loss": 0.2406, + "step": 34082 + }, + { + "epoch": 2.761098509397278, + "grad_norm": 0.07598250359296799, + "learning_rate": 6.88284801296188e-05, + "loss": 0.2886, + "step": 34083 + }, + { + "epoch": 2.7611795204147764, + "grad_norm": 0.07698392868041992, + "learning_rate": 6.882397947702417e-05, + "loss": 0.2702, + "step": 34084 + }, + { + "epoch": 2.7612605314322747, + "grad_norm": 0.06041031703352928, + "learning_rate": 6.881947882442955e-05, + "loss": 0.2089, + "step": 34085 + }, + { + "epoch": 2.761341542449773, + "grad_norm": 0.0772460475564003, + "learning_rate": 6.881497817183492e-05, + "loss": 0.2368, + "step": 34086 + }, + { + "epoch": 2.7614225534672716, + "grad_norm": 0.062571682035923, + "learning_rate": 6.88104775192403e-05, + "loss": 0.2754, + "step": 34087 + }, + { + "epoch": 2.76150356448477, + "grad_norm": 0.07503519207239151, + "learning_rate": 6.880597686664567e-05, + "loss": 0.224, + "step": 34088 + }, + { + "epoch": 2.761584575502268, + "grad_norm": 0.07290045917034149, + "learning_rate": 6.880147621405104e-05, + "loss": 0.2328, + "step": 34089 + }, + { + "epoch": 2.761665586519767, + "grad_norm": 0.08204520493745804, + "learning_rate": 6.879697556145642e-05, + "loss": 0.2032, + "step": 34090 + }, + { + "epoch": 2.761746597537265, + "grad_norm": 0.05929996818304062, + "learning_rate": 6.879247490886179e-05, + "loss": 0.2157, + "step": 34091 + }, + { + "epoch": 2.7618276085547633, + "grad_norm": 0.07957153767347336, + "learning_rate": 6.878797425626716e-05, + "loss": 0.2244, + "step": 34092 + }, + { + "epoch": 2.761908619572262, + "grad_norm": 0.07009933888912201, + "learning_rate": 6.878347360367254e-05, + "loss": 0.2706, + "step": 34093 + }, + { + "epoch": 2.76198963058976, + "grad_norm": 0.058375827968120575, + "learning_rate": 6.877897295107791e-05, + "loss": 0.225, + "step": 34094 + }, + { + "epoch": 2.7620706416072585, + "grad_norm": 0.06847576797008514, + "learning_rate": 6.877447229848328e-05, + "loss": 0.2501, + "step": 34095 + }, + { + "epoch": 2.762151652624757, + "grad_norm": 0.07601086795330048, + "learning_rate": 6.876997164588866e-05, + "loss": 0.2545, + "step": 34096 + }, + { + "epoch": 2.7622326636422554, + "grad_norm": 0.05859259143471718, + "learning_rate": 6.876547099329403e-05, + "loss": 0.245, + "step": 34097 + }, + { + "epoch": 2.7623136746597536, + "grad_norm": 0.09432229399681091, + "learning_rate": 6.87609703406994e-05, + "loss": 0.2341, + "step": 34098 + }, + { + "epoch": 2.7623946856772523, + "grad_norm": 0.0631144791841507, + "learning_rate": 6.875646968810478e-05, + "loss": 0.2361, + "step": 34099 + }, + { + "epoch": 2.7624756966947506, + "grad_norm": 0.07539720833301544, + "learning_rate": 6.875196903551015e-05, + "loss": 0.311, + "step": 34100 + }, + { + "epoch": 2.762556707712249, + "grad_norm": 0.07470940053462982, + "learning_rate": 6.874746838291553e-05, + "loss": 0.2444, + "step": 34101 + }, + { + "epoch": 2.7626377187297475, + "grad_norm": 0.07325862348079681, + "learning_rate": 6.87429677303209e-05, + "loss": 0.2461, + "step": 34102 + }, + { + "epoch": 2.7627187297472457, + "grad_norm": 0.053800273686647415, + "learning_rate": 6.873846707772627e-05, + "loss": 0.2503, + "step": 34103 + }, + { + "epoch": 2.762799740764744, + "grad_norm": 0.057765450328588486, + "learning_rate": 6.873396642513165e-05, + "loss": 0.2727, + "step": 34104 + }, 
+ { + "epoch": 2.7628807517822422, + "grad_norm": 0.07699766010046005, + "learning_rate": 6.872946577253702e-05, + "loss": 0.2603, + "step": 34105 + }, + { + "epoch": 2.762961762799741, + "grad_norm": 0.06159216910600662, + "learning_rate": 6.87249651199424e-05, + "loss": 0.2394, + "step": 34106 + }, + { + "epoch": 2.763042773817239, + "grad_norm": 0.06081032380461693, + "learning_rate": 6.872046446734777e-05, + "loss": 0.2559, + "step": 34107 + }, + { + "epoch": 2.7631237848347374, + "grad_norm": 0.06624030321836472, + "learning_rate": 6.871596381475314e-05, + "loss": 0.2043, + "step": 34108 + }, + { + "epoch": 2.7632047958522357, + "grad_norm": 0.07395646721124649, + "learning_rate": 6.871146316215851e-05, + "loss": 0.2352, + "step": 34109 + }, + { + "epoch": 2.7632858068697344, + "grad_norm": 0.07033342123031616, + "learning_rate": 6.870696250956389e-05, + "loss": 0.2737, + "step": 34110 + }, + { + "epoch": 2.7633668178872326, + "grad_norm": 0.06181248649954796, + "learning_rate": 6.870246185696928e-05, + "loss": 0.2192, + "step": 34111 + }, + { + "epoch": 2.763447828904731, + "grad_norm": 0.08963780850172043, + "learning_rate": 6.869796120437464e-05, + "loss": 0.23, + "step": 34112 + }, + { + "epoch": 2.7635288399222295, + "grad_norm": 0.06250149011611938, + "learning_rate": 6.869346055178001e-05, + "loss": 0.2011, + "step": 34113 + }, + { + "epoch": 2.7636098509397278, + "grad_norm": 0.0642528384923935, + "learning_rate": 6.86889598991854e-05, + "loss": 0.2589, + "step": 34114 + }, + { + "epoch": 2.763690861957226, + "grad_norm": 0.061350125819444656, + "learning_rate": 6.868445924659076e-05, + "loss": 0.2141, + "step": 34115 + }, + { + "epoch": 2.7637718729747247, + "grad_norm": 0.11386937648057938, + "learning_rate": 6.867995859399613e-05, + "loss": 0.2322, + "step": 34116 + }, + { + "epoch": 2.763852883992223, + "grad_norm": 0.06786888092756271, + "learning_rate": 6.867545794140152e-05, + "loss": 0.24, + "step": 34117 + }, + { + "epoch": 2.763933895009721, + "grad_norm": 0.06581398844718933, + "learning_rate": 6.867095728880688e-05, + "loss": 0.266, + "step": 34118 + }, + { + "epoch": 2.76401490602722, + "grad_norm": 0.07778912037611008, + "learning_rate": 6.866645663621225e-05, + "loss": 0.2575, + "step": 34119 + }, + { + "epoch": 2.764095917044718, + "grad_norm": 0.05699251592159271, + "learning_rate": 6.866195598361764e-05, + "loss": 0.2367, + "step": 34120 + }, + { + "epoch": 2.7641769280622164, + "grad_norm": 0.06215566396713257, + "learning_rate": 6.8657455331023e-05, + "loss": 0.2401, + "step": 34121 + }, + { + "epoch": 2.764257939079715, + "grad_norm": 0.07722298800945282, + "learning_rate": 6.865295467842837e-05, + "loss": 0.2614, + "step": 34122 + }, + { + "epoch": 2.7643389500972133, + "grad_norm": 0.08287777006626129, + "learning_rate": 6.864845402583376e-05, + "loss": 0.2182, + "step": 34123 + }, + { + "epoch": 2.7644199611147116, + "grad_norm": 0.06992882490158081, + "learning_rate": 6.864395337323912e-05, + "loss": 0.2434, + "step": 34124 + }, + { + "epoch": 2.7645009721322102, + "grad_norm": 0.06839783489704132, + "learning_rate": 6.863945272064449e-05, + "loss": 0.2385, + "step": 34125 + }, + { + "epoch": 2.7645819831497085, + "grad_norm": 0.08003885298967361, + "learning_rate": 6.863495206804988e-05, + "loss": 0.2472, + "step": 34126 + }, + { + "epoch": 2.7646629941672067, + "grad_norm": 0.07297755777835846, + "learning_rate": 6.863045141545524e-05, + "loss": 0.2534, + "step": 34127 + }, + { + "epoch": 2.764744005184705, + "grad_norm": 0.06011795997619629, + 
"learning_rate": 6.862595076286061e-05, + "loss": 0.2433, + "step": 34128 + }, + { + "epoch": 2.7648250162022032, + "grad_norm": 0.05884721502661705, + "learning_rate": 6.8621450110266e-05, + "loss": 0.2437, + "step": 34129 + }, + { + "epoch": 2.764906027219702, + "grad_norm": 0.060120269656181335, + "learning_rate": 6.861694945767136e-05, + "loss": 0.2557, + "step": 34130 + }, + { + "epoch": 2.7649870382372, + "grad_norm": 0.06731750816106796, + "learning_rate": 6.861244880507673e-05, + "loss": 0.2703, + "step": 34131 + }, + { + "epoch": 2.7650680492546984, + "grad_norm": 0.06370685994625092, + "learning_rate": 6.860794815248212e-05, + "loss": 0.1969, + "step": 34132 + }, + { + "epoch": 2.765149060272197, + "grad_norm": 0.07013097405433655, + "learning_rate": 6.860344749988748e-05, + "loss": 0.2258, + "step": 34133 + }, + { + "epoch": 2.7652300712896953, + "grad_norm": 0.06985171884298325, + "learning_rate": 6.859894684729285e-05, + "loss": 0.2342, + "step": 34134 + }, + { + "epoch": 2.7653110823071936, + "grad_norm": 0.06555838882923126, + "learning_rate": 6.859444619469824e-05, + "loss": 0.2189, + "step": 34135 + }, + { + "epoch": 2.7653920933246923, + "grad_norm": 0.06166043132543564, + "learning_rate": 6.85899455421036e-05, + "loss": 0.1987, + "step": 34136 + }, + { + "epoch": 2.7654731043421905, + "grad_norm": 0.050333742052316666, + "learning_rate": 6.858544488950899e-05, + "loss": 0.2041, + "step": 34137 + }, + { + "epoch": 2.7655541153596888, + "grad_norm": 0.08235972374677658, + "learning_rate": 6.858094423691436e-05, + "loss": 0.2338, + "step": 34138 + }, + { + "epoch": 2.7656351263771874, + "grad_norm": 0.07296254485845566, + "learning_rate": 6.857644358431972e-05, + "loss": 0.2222, + "step": 34139 + }, + { + "epoch": 2.7657161373946857, + "grad_norm": 0.06814103573560715, + "learning_rate": 6.857194293172511e-05, + "loss": 0.2563, + "step": 34140 + }, + { + "epoch": 2.765797148412184, + "grad_norm": 0.06390096992254257, + "learning_rate": 6.856744227913048e-05, + "loss": 0.2665, + "step": 34141 + }, + { + "epoch": 2.7658781594296826, + "grad_norm": 0.07354594767093658, + "learning_rate": 6.856294162653584e-05, + "loss": 0.2291, + "step": 34142 + }, + { + "epoch": 2.765959170447181, + "grad_norm": 0.07856588810682297, + "learning_rate": 6.855844097394123e-05, + "loss": 0.2688, + "step": 34143 + }, + { + "epoch": 2.766040181464679, + "grad_norm": 0.08960554748773575, + "learning_rate": 6.85539403213466e-05, + "loss": 0.3206, + "step": 34144 + }, + { + "epoch": 2.766121192482178, + "grad_norm": 0.06466906517744064, + "learning_rate": 6.854943966875196e-05, + "loss": 0.2163, + "step": 34145 + }, + { + "epoch": 2.766202203499676, + "grad_norm": 0.07751652598381042, + "learning_rate": 6.854493901615735e-05, + "loss": 0.2569, + "step": 34146 + }, + { + "epoch": 2.7662832145171743, + "grad_norm": 0.07169736176729202, + "learning_rate": 6.854043836356273e-05, + "loss": 0.2173, + "step": 34147 + }, + { + "epoch": 2.766364225534673, + "grad_norm": 0.10449399054050446, + "learning_rate": 6.853593771096809e-05, + "loss": 0.2221, + "step": 34148 + }, + { + "epoch": 2.7664452365521712, + "grad_norm": 0.05646883696317673, + "learning_rate": 6.853143705837347e-05, + "loss": 0.2063, + "step": 34149 + }, + { + "epoch": 2.7665262475696695, + "grad_norm": 0.07571178674697876, + "learning_rate": 6.852693640577885e-05, + "loss": 0.2241, + "step": 34150 + }, + { + "epoch": 2.7666072585871677, + "grad_norm": 0.06778353452682495, + "learning_rate": 6.85224357531842e-05, + "loss": 0.2323, + "step": 
34151 + }, + { + "epoch": 2.766688269604666, + "grad_norm": 0.07670864462852478, + "learning_rate": 6.85179351005896e-05, + "loss": 0.2914, + "step": 34152 + }, + { + "epoch": 2.7667692806221647, + "grad_norm": 0.06275644153356552, + "learning_rate": 6.851343444799497e-05, + "loss": 0.2204, + "step": 34153 + }, + { + "epoch": 2.766850291639663, + "grad_norm": 0.057569120079278946, + "learning_rate": 6.850893379540033e-05, + "loss": 0.2422, + "step": 34154 + }, + { + "epoch": 2.766931302657161, + "grad_norm": 0.0675300657749176, + "learning_rate": 6.850443314280571e-05, + "loss": 0.2607, + "step": 34155 + }, + { + "epoch": 2.76701231367466, + "grad_norm": 0.05765746906399727, + "learning_rate": 6.849993249021109e-05, + "loss": 0.2258, + "step": 34156 + }, + { + "epoch": 2.767093324692158, + "grad_norm": 0.08889303356409073, + "learning_rate": 6.849543183761645e-05, + "loss": 0.2619, + "step": 34157 + }, + { + "epoch": 2.7671743357096563, + "grad_norm": 0.0686429962515831, + "learning_rate": 6.849093118502183e-05, + "loss": 0.2244, + "step": 34158 + }, + { + "epoch": 2.767255346727155, + "grad_norm": 0.06495986878871918, + "learning_rate": 6.848643053242721e-05, + "loss": 0.2121, + "step": 34159 + }, + { + "epoch": 2.7673363577446533, + "grad_norm": 0.0850294977426529, + "learning_rate": 6.848192987983257e-05, + "loss": 0.2696, + "step": 34160 + }, + { + "epoch": 2.7674173687621515, + "grad_norm": 0.07529442012310028, + "learning_rate": 6.847742922723796e-05, + "loss": 0.2447, + "step": 34161 + }, + { + "epoch": 2.76749837977965, + "grad_norm": 0.08042323589324951, + "learning_rate": 6.847292857464333e-05, + "loss": 0.2635, + "step": 34162 + }, + { + "epoch": 2.7675793907971484, + "grad_norm": 0.08062606304883957, + "learning_rate": 6.84684279220487e-05, + "loss": 0.2282, + "step": 34163 + }, + { + "epoch": 2.7676604018146467, + "grad_norm": 0.062072739005088806, + "learning_rate": 6.846392726945408e-05, + "loss": 0.2497, + "step": 34164 + }, + { + "epoch": 2.7677414128321454, + "grad_norm": 0.08234535902738571, + "learning_rate": 6.845942661685945e-05, + "loss": 0.2873, + "step": 34165 + }, + { + "epoch": 2.7678224238496436, + "grad_norm": 0.07265506684780121, + "learning_rate": 6.845492596426482e-05, + "loss": 0.2969, + "step": 34166 + }, + { + "epoch": 2.767903434867142, + "grad_norm": 0.06202319264411926, + "learning_rate": 6.84504253116702e-05, + "loss": 0.2533, + "step": 34167 + }, + { + "epoch": 2.7679844458846405, + "grad_norm": 0.0771106705069542, + "learning_rate": 6.844592465907557e-05, + "loss": 0.2164, + "step": 34168 + }, + { + "epoch": 2.768065456902139, + "grad_norm": 0.06931550800800323, + "learning_rate": 6.844142400648094e-05, + "loss": 0.2737, + "step": 34169 + }, + { + "epoch": 2.768146467919637, + "grad_norm": 0.06963351368904114, + "learning_rate": 6.843692335388632e-05, + "loss": 0.2342, + "step": 34170 + }, + { + "epoch": 2.7682274789371357, + "grad_norm": 0.0673379972577095, + "learning_rate": 6.843242270129169e-05, + "loss": 0.2214, + "step": 34171 + }, + { + "epoch": 2.768308489954634, + "grad_norm": 0.05613080412149429, + "learning_rate": 6.842792204869707e-05, + "loss": 0.2457, + "step": 34172 + }, + { + "epoch": 2.768389500972132, + "grad_norm": 0.06227600574493408, + "learning_rate": 6.842342139610244e-05, + "loss": 0.2736, + "step": 34173 + }, + { + "epoch": 2.7684705119896305, + "grad_norm": 0.07635784149169922, + "learning_rate": 6.841892074350781e-05, + "loss": 0.2526, + "step": 34174 + }, + { + "epoch": 2.7685515230071287, + "grad_norm": 
0.0714573860168457, + "learning_rate": 6.841442009091319e-05, + "loss": 0.2522, + "step": 34175 + }, + { + "epoch": 2.7686325340246274, + "grad_norm": 0.05897994339466095, + "learning_rate": 6.840991943831856e-05, + "loss": 0.2216, + "step": 34176 + }, + { + "epoch": 2.7687135450421256, + "grad_norm": 0.06542729586362839, + "learning_rate": 6.840541878572393e-05, + "loss": 0.2409, + "step": 34177 + }, + { + "epoch": 2.768794556059624, + "grad_norm": 0.0854971632361412, + "learning_rate": 6.840091813312931e-05, + "loss": 0.2025, + "step": 34178 + }, + { + "epoch": 2.7688755670771226, + "grad_norm": 0.07088334113359451, + "learning_rate": 6.839641748053468e-05, + "loss": 0.2625, + "step": 34179 + }, + { + "epoch": 2.768956578094621, + "grad_norm": 0.07033325731754303, + "learning_rate": 6.839191682794005e-05, + "loss": 0.2396, + "step": 34180 + }, + { + "epoch": 2.769037589112119, + "grad_norm": 0.0697719007730484, + "learning_rate": 6.838741617534543e-05, + "loss": 0.2381, + "step": 34181 + }, + { + "epoch": 2.7691186001296177, + "grad_norm": 0.06432507932186127, + "learning_rate": 6.83829155227508e-05, + "loss": 0.2111, + "step": 34182 + }, + { + "epoch": 2.769199611147116, + "grad_norm": 0.06042730435729027, + "learning_rate": 6.837841487015617e-05, + "loss": 0.2316, + "step": 34183 + }, + { + "epoch": 2.7692806221646142, + "grad_norm": 0.06913060694932938, + "learning_rate": 6.837391421756155e-05, + "loss": 0.2385, + "step": 34184 + }, + { + "epoch": 2.769361633182113, + "grad_norm": 0.09149839729070663, + "learning_rate": 6.836941356496692e-05, + "loss": 0.2421, + "step": 34185 + }, + { + "epoch": 2.769442644199611, + "grad_norm": 0.06392455101013184, + "learning_rate": 6.83649129123723e-05, + "loss": 0.2093, + "step": 34186 + }, + { + "epoch": 2.7695236552171094, + "grad_norm": 0.07868770509958267, + "learning_rate": 6.836041225977767e-05, + "loss": 0.2547, + "step": 34187 + }, + { + "epoch": 2.769604666234608, + "grad_norm": 0.07805663347244263, + "learning_rate": 6.835591160718304e-05, + "loss": 0.232, + "step": 34188 + }, + { + "epoch": 2.7696856772521063, + "grad_norm": 0.0701029971241951, + "learning_rate": 6.835141095458843e-05, + "loss": 0.2309, + "step": 34189 + }, + { + "epoch": 2.7697666882696046, + "grad_norm": 0.07640746235847473, + "learning_rate": 6.834691030199379e-05, + "loss": 0.2492, + "step": 34190 + }, + { + "epoch": 2.7698476992871033, + "grad_norm": 0.09099043160676956, + "learning_rate": 6.834240964939916e-05, + "loss": 0.2548, + "step": 34191 + }, + { + "epoch": 2.7699287103046015, + "grad_norm": 0.0710119977593422, + "learning_rate": 6.833790899680455e-05, + "loss": 0.2006, + "step": 34192 + }, + { + "epoch": 2.7700097213220998, + "grad_norm": 0.07448773831129074, + "learning_rate": 6.833340834420991e-05, + "loss": 0.2295, + "step": 34193 + }, + { + "epoch": 2.7700907323395985, + "grad_norm": 0.07332435995340347, + "learning_rate": 6.832890769161528e-05, + "loss": 0.2575, + "step": 34194 + }, + { + "epoch": 2.7701717433570967, + "grad_norm": 0.06369539350271225, + "learning_rate": 6.832440703902067e-05, + "loss": 0.229, + "step": 34195 + }, + { + "epoch": 2.770252754374595, + "grad_norm": 0.06551164388656616, + "learning_rate": 6.831990638642603e-05, + "loss": 0.2353, + "step": 34196 + }, + { + "epoch": 2.770333765392093, + "grad_norm": 0.06759309023618698, + "learning_rate": 6.83154057338314e-05, + "loss": 0.2376, + "step": 34197 + }, + { + "epoch": 2.7704147764095914, + "grad_norm": 0.06851544231176376, + "learning_rate": 6.831090508123679e-05, + "loss": 
0.2226, + "step": 34198 + }, + { + "epoch": 2.77049578742709, + "grad_norm": 0.08439340442419052, + "learning_rate": 6.830640442864215e-05, + "loss": 0.2609, + "step": 34199 + }, + { + "epoch": 2.7705767984445884, + "grad_norm": 0.06333932280540466, + "learning_rate": 6.830190377604753e-05, + "loss": 0.1892, + "step": 34200 + }, + { + "epoch": 2.7706578094620866, + "grad_norm": 0.0682709664106369, + "learning_rate": 6.829740312345291e-05, + "loss": 0.2032, + "step": 34201 + }, + { + "epoch": 2.7707388204795853, + "grad_norm": 0.06252402067184448, + "learning_rate": 6.829290247085827e-05, + "loss": 0.2441, + "step": 34202 + }, + { + "epoch": 2.7708198314970836, + "grad_norm": 0.06736239790916443, + "learning_rate": 6.828840181826365e-05, + "loss": 0.2608, + "step": 34203 + }, + { + "epoch": 2.770900842514582, + "grad_norm": 0.06103686988353729, + "learning_rate": 6.828390116566903e-05, + "loss": 0.2006, + "step": 34204 + }, + { + "epoch": 2.7709818535320805, + "grad_norm": 0.06788726150989532, + "learning_rate": 6.82794005130744e-05, + "loss": 0.2633, + "step": 34205 + }, + { + "epoch": 2.7710628645495787, + "grad_norm": 0.0695648342370987, + "learning_rate": 6.827489986047977e-05, + "loss": 0.2048, + "step": 34206 + }, + { + "epoch": 2.771143875567077, + "grad_norm": 0.07660722732543945, + "learning_rate": 6.827039920788516e-05, + "loss": 0.244, + "step": 34207 + }, + { + "epoch": 2.7712248865845757, + "grad_norm": 0.06939336657524109, + "learning_rate": 6.826589855529052e-05, + "loss": 0.2472, + "step": 34208 + }, + { + "epoch": 2.771305897602074, + "grad_norm": 0.06204705685377121, + "learning_rate": 6.826139790269589e-05, + "loss": 0.1828, + "step": 34209 + }, + { + "epoch": 2.771386908619572, + "grad_norm": 0.09053542464971542, + "learning_rate": 6.825689725010128e-05, + "loss": 0.2619, + "step": 34210 + }, + { + "epoch": 2.771467919637071, + "grad_norm": 0.06675177067518234, + "learning_rate": 6.825239659750664e-05, + "loss": 0.2146, + "step": 34211 + }, + { + "epoch": 2.771548930654569, + "grad_norm": 0.07906267791986465, + "learning_rate": 6.824789594491201e-05, + "loss": 0.2495, + "step": 34212 + }, + { + "epoch": 2.7716299416720673, + "grad_norm": 0.0674787312746048, + "learning_rate": 6.82433952923174e-05, + "loss": 0.2408, + "step": 34213 + }, + { + "epoch": 2.771710952689566, + "grad_norm": 0.06417389959096909, + "learning_rate": 6.823889463972276e-05, + "loss": 0.2353, + "step": 34214 + }, + { + "epoch": 2.7717919637070643, + "grad_norm": 0.0717383325099945, + "learning_rate": 6.823439398712814e-05, + "loss": 0.2797, + "step": 34215 + }, + { + "epoch": 2.7718729747245625, + "grad_norm": 0.07927710562944412, + "learning_rate": 6.822989333453352e-05, + "loss": 0.2345, + "step": 34216 + }, + { + "epoch": 2.7719539857420608, + "grad_norm": 0.07535596191883087, + "learning_rate": 6.822539268193888e-05, + "loss": 0.2829, + "step": 34217 + }, + { + "epoch": 2.7720349967595594, + "grad_norm": 0.06705041974782944, + "learning_rate": 6.822089202934426e-05, + "loss": 0.2121, + "step": 34218 + }, + { + "epoch": 2.7721160077770577, + "grad_norm": 0.0631314367055893, + "learning_rate": 6.821639137674964e-05, + "loss": 0.2218, + "step": 34219 + }, + { + "epoch": 2.772197018794556, + "grad_norm": 0.08411700278520584, + "learning_rate": 6.8211890724155e-05, + "loss": 0.2544, + "step": 34220 + }, + { + "epoch": 2.772278029812054, + "grad_norm": 0.05945594608783722, + "learning_rate": 6.820739007156039e-05, + "loss": 0.2006, + "step": 34221 + }, + { + "epoch": 2.772359040829553, + "grad_norm": 
0.07686436176300049, + "learning_rate": 6.820288941896576e-05, + "loss": 0.2498, + "step": 34222 + }, + { + "epoch": 2.772440051847051, + "grad_norm": 0.0920679047703743, + "learning_rate": 6.819838876637112e-05, + "loss": 0.2636, + "step": 34223 + }, + { + "epoch": 2.7725210628645494, + "grad_norm": 0.06771649420261383, + "learning_rate": 6.81938881137765e-05, + "loss": 0.2202, + "step": 34224 + }, + { + "epoch": 2.772602073882048, + "grad_norm": 0.06906941533088684, + "learning_rate": 6.818938746118188e-05, + "loss": 0.2569, + "step": 34225 + }, + { + "epoch": 2.7726830848995463, + "grad_norm": 0.05707994103431702, + "learning_rate": 6.818488680858724e-05, + "loss": 0.219, + "step": 34226 + }, + { + "epoch": 2.7727640959170445, + "grad_norm": 0.065834179520607, + "learning_rate": 6.818038615599263e-05, + "loss": 0.2546, + "step": 34227 + }, + { + "epoch": 2.7728451069345432, + "grad_norm": 0.06828774511814117, + "learning_rate": 6.8175885503398e-05, + "loss": 0.2346, + "step": 34228 + }, + { + "epoch": 2.7729261179520415, + "grad_norm": 0.057811468839645386, + "learning_rate": 6.817138485080336e-05, + "loss": 0.2097, + "step": 34229 + }, + { + "epoch": 2.7730071289695397, + "grad_norm": 0.07118486613035202, + "learning_rate": 6.816688419820875e-05, + "loss": 0.2427, + "step": 34230 + }, + { + "epoch": 2.7730881399870384, + "grad_norm": 0.062281396239995956, + "learning_rate": 6.816238354561412e-05, + "loss": 0.226, + "step": 34231 + }, + { + "epoch": 2.7731691510045366, + "grad_norm": 0.07268352806568146, + "learning_rate": 6.815788289301948e-05, + "loss": 0.2392, + "step": 34232 + }, + { + "epoch": 2.773250162022035, + "grad_norm": 0.07287313044071198, + "learning_rate": 6.815338224042487e-05, + "loss": 0.2569, + "step": 34233 + }, + { + "epoch": 2.7733311730395336, + "grad_norm": 0.07274441421031952, + "learning_rate": 6.814888158783024e-05, + "loss": 0.26, + "step": 34234 + }, + { + "epoch": 2.773412184057032, + "grad_norm": 0.07283134013414383, + "learning_rate": 6.81443809352356e-05, + "loss": 0.2761, + "step": 34235 + }, + { + "epoch": 2.77349319507453, + "grad_norm": 0.0661434605717659, + "learning_rate": 6.813988028264099e-05, + "loss": 0.2348, + "step": 34236 + }, + { + "epoch": 2.7735742060920288, + "grad_norm": 0.06506034731864929, + "learning_rate": 6.813537963004636e-05, + "loss": 0.238, + "step": 34237 + }, + { + "epoch": 2.773655217109527, + "grad_norm": 0.07169941067695618, + "learning_rate": 6.813087897745172e-05, + "loss": 0.2654, + "step": 34238 + }, + { + "epoch": 2.7737362281270252, + "grad_norm": 0.06020570173859596, + "learning_rate": 6.812637832485711e-05, + "loss": 0.2147, + "step": 34239 + }, + { + "epoch": 2.7738172391445235, + "grad_norm": 0.0668194517493248, + "learning_rate": 6.812187767226248e-05, + "loss": 0.2402, + "step": 34240 + }, + { + "epoch": 2.773898250162022, + "grad_norm": 0.07484626770019531, + "learning_rate": 6.811737701966784e-05, + "loss": 0.2321, + "step": 34241 + }, + { + "epoch": 2.7739792611795204, + "grad_norm": 0.06657987087965012, + "learning_rate": 6.811287636707323e-05, + "loss": 0.2443, + "step": 34242 + }, + { + "epoch": 2.7740602721970187, + "grad_norm": 0.07141261547803879, + "learning_rate": 6.81083757144786e-05, + "loss": 0.2208, + "step": 34243 + }, + { + "epoch": 2.774141283214517, + "grad_norm": 0.08069676160812378, + "learning_rate": 6.810387506188398e-05, + "loss": 0.2144, + "step": 34244 + }, + { + "epoch": 2.7742222942320156, + "grad_norm": 0.07490212470293045, + "learning_rate": 6.809937440928935e-05, + "loss": 0.2379, 
+ "step": 34245 + }, + { + "epoch": 2.774303305249514, + "grad_norm": 0.07865491509437561, + "learning_rate": 6.809487375669473e-05, + "loss": 0.2451, + "step": 34246 + }, + { + "epoch": 2.774384316267012, + "grad_norm": 0.07843254506587982, + "learning_rate": 6.80903731041001e-05, + "loss": 0.2623, + "step": 34247 + }, + { + "epoch": 2.774465327284511, + "grad_norm": 0.0783471018075943, + "learning_rate": 6.808587245150547e-05, + "loss": 0.2292, + "step": 34248 + }, + { + "epoch": 2.774546338302009, + "grad_norm": 0.058314185589551926, + "learning_rate": 6.808137179891085e-05, + "loss": 0.2225, + "step": 34249 + }, + { + "epoch": 2.7746273493195073, + "grad_norm": 0.09157566726207733, + "learning_rate": 6.807687114631622e-05, + "loss": 0.2533, + "step": 34250 + }, + { + "epoch": 2.774708360337006, + "grad_norm": 0.08975666761398315, + "learning_rate": 6.80723704937216e-05, + "loss": 0.2492, + "step": 34251 + }, + { + "epoch": 2.774789371354504, + "grad_norm": 0.08823460340499878, + "learning_rate": 6.806786984112697e-05, + "loss": 0.2509, + "step": 34252 + }, + { + "epoch": 2.7748703823720025, + "grad_norm": 0.08141889423131943, + "learning_rate": 6.806336918853234e-05, + "loss": 0.2395, + "step": 34253 + }, + { + "epoch": 2.774951393389501, + "grad_norm": 0.07389726489782333, + "learning_rate": 6.805886853593771e-05, + "loss": 0.2248, + "step": 34254 + }, + { + "epoch": 2.7750324044069994, + "grad_norm": 0.08528506010770798, + "learning_rate": 6.805436788334309e-05, + "loss": 0.279, + "step": 34255 + }, + { + "epoch": 2.7751134154244976, + "grad_norm": 0.08055625855922699, + "learning_rate": 6.804986723074846e-05, + "loss": 0.2374, + "step": 34256 + }, + { + "epoch": 2.7751944264419963, + "grad_norm": 0.08427558839321136, + "learning_rate": 6.804536657815384e-05, + "loss": 0.2612, + "step": 34257 + }, + { + "epoch": 2.7752754374594946, + "grad_norm": 0.07798214256763458, + "learning_rate": 6.804086592555921e-05, + "loss": 0.2819, + "step": 34258 + }, + { + "epoch": 2.775356448476993, + "grad_norm": 0.07762635499238968, + "learning_rate": 6.803636527296458e-05, + "loss": 0.215, + "step": 34259 + }, + { + "epoch": 2.7754374594944915, + "grad_norm": 0.09789147228002548, + "learning_rate": 6.803186462036996e-05, + "loss": 0.2489, + "step": 34260 + }, + { + "epoch": 2.7755184705119897, + "grad_norm": 0.07010649889707565, + "learning_rate": 6.802736396777533e-05, + "loss": 0.2476, + "step": 34261 + }, + { + "epoch": 2.775599481529488, + "grad_norm": 0.06729248911142349, + "learning_rate": 6.80228633151807e-05, + "loss": 0.2405, + "step": 34262 + }, + { + "epoch": 2.7756804925469862, + "grad_norm": 0.08678527176380157, + "learning_rate": 6.801836266258608e-05, + "loss": 0.2487, + "step": 34263 + }, + { + "epoch": 2.775761503564485, + "grad_norm": 0.06508525460958481, + "learning_rate": 6.801386200999145e-05, + "loss": 0.2134, + "step": 34264 + }, + { + "epoch": 2.775842514581983, + "grad_norm": 0.06213225796818733, + "learning_rate": 6.800936135739682e-05, + "loss": 0.2169, + "step": 34265 + }, + { + "epoch": 2.7759235255994814, + "grad_norm": 0.07055771350860596, + "learning_rate": 6.80048607048022e-05, + "loss": 0.289, + "step": 34266 + }, + { + "epoch": 2.7760045366169797, + "grad_norm": 0.07395903021097183, + "learning_rate": 6.800036005220757e-05, + "loss": 0.2553, + "step": 34267 + }, + { + "epoch": 2.7760855476344783, + "grad_norm": 0.05806746706366539, + "learning_rate": 6.799585939961294e-05, + "loss": 0.1816, + "step": 34268 + }, + { + "epoch": 2.7761665586519766, + "grad_norm": 
0.072418212890625, + "learning_rate": 6.799135874701832e-05, + "loss": 0.2678, + "step": 34269 + }, + { + "epoch": 2.776247569669475, + "grad_norm": 0.06981861591339111, + "learning_rate": 6.79868580944237e-05, + "loss": 0.3012, + "step": 34270 + }, + { + "epoch": 2.7763285806869735, + "grad_norm": 0.06536334753036499, + "learning_rate": 6.798235744182907e-05, + "loss": 0.2188, + "step": 34271 + }, + { + "epoch": 2.7764095917044718, + "grad_norm": 0.06149706616997719, + "learning_rate": 6.797785678923444e-05, + "loss": 0.2251, + "step": 34272 + }, + { + "epoch": 2.77649060272197, + "grad_norm": 0.05460198596119881, + "learning_rate": 6.797335613663983e-05, + "loss": 0.2401, + "step": 34273 + }, + { + "epoch": 2.7765716137394687, + "grad_norm": 0.07276234775781631, + "learning_rate": 6.796885548404519e-05, + "loss": 0.2793, + "step": 34274 + }, + { + "epoch": 2.776652624756967, + "grad_norm": 0.06272850185632706, + "learning_rate": 6.796435483145056e-05, + "loss": 0.2495, + "step": 34275 + }, + { + "epoch": 2.776733635774465, + "grad_norm": 0.07314179837703705, + "learning_rate": 6.795985417885595e-05, + "loss": 0.231, + "step": 34276 + }, + { + "epoch": 2.776814646791964, + "grad_norm": 0.06718869507312775, + "learning_rate": 6.795535352626131e-05, + "loss": 0.2868, + "step": 34277 + }, + { + "epoch": 2.776895657809462, + "grad_norm": 0.06727830320596695, + "learning_rate": 6.795085287366668e-05, + "loss": 0.2443, + "step": 34278 + }, + { + "epoch": 2.7769766688269604, + "grad_norm": 0.062349457293748856, + "learning_rate": 6.794635222107207e-05, + "loss": 0.2412, + "step": 34279 + }, + { + "epoch": 2.777057679844459, + "grad_norm": 0.08564005047082901, + "learning_rate": 6.794185156847743e-05, + "loss": 0.258, + "step": 34280 + }, + { + "epoch": 2.7771386908619573, + "grad_norm": 0.05737084522843361, + "learning_rate": 6.79373509158828e-05, + "loss": 0.2522, + "step": 34281 + }, + { + "epoch": 2.7772197018794555, + "grad_norm": 0.07271725684404373, + "learning_rate": 6.793285026328819e-05, + "loss": 0.2331, + "step": 34282 + }, + { + "epoch": 2.7773007128969542, + "grad_norm": 0.05929986387491226, + "learning_rate": 6.792834961069355e-05, + "loss": 0.2134, + "step": 34283 + }, + { + "epoch": 2.7773817239144525, + "grad_norm": 0.07402640581130981, + "learning_rate": 6.792384895809892e-05, + "loss": 0.2497, + "step": 34284 + }, + { + "epoch": 2.7774627349319507, + "grad_norm": 0.0746961385011673, + "learning_rate": 6.791934830550431e-05, + "loss": 0.2521, + "step": 34285 + }, + { + "epoch": 2.777543745949449, + "grad_norm": 0.0649658590555191, + "learning_rate": 6.791484765290967e-05, + "loss": 0.2287, + "step": 34286 + }, + { + "epoch": 2.7776247569669477, + "grad_norm": 0.09045769274234772, + "learning_rate": 6.791034700031504e-05, + "loss": 0.2135, + "step": 34287 + }, + { + "epoch": 2.777705767984446, + "grad_norm": 0.059244897216558456, + "learning_rate": 6.790584634772043e-05, + "loss": 0.2282, + "step": 34288 + }, + { + "epoch": 2.777786779001944, + "grad_norm": 0.07081972062587738, + "learning_rate": 6.790134569512579e-05, + "loss": 0.2521, + "step": 34289 + }, + { + "epoch": 2.7778677900194424, + "grad_norm": 0.07101619988679886, + "learning_rate": 6.789684504253116e-05, + "loss": 0.2492, + "step": 34290 + }, + { + "epoch": 2.777948801036941, + "grad_norm": 0.07111038267612457, + "learning_rate": 6.789234438993655e-05, + "loss": 0.2415, + "step": 34291 + }, + { + "epoch": 2.7780298120544393, + "grad_norm": 0.05958026275038719, + "learning_rate": 6.788784373734191e-05, + "loss": 
0.1995, + "step": 34292 + }, + { + "epoch": 2.7781108230719376, + "grad_norm": 0.06855040788650513, + "learning_rate": 6.788334308474728e-05, + "loss": 0.2341, + "step": 34293 + }, + { + "epoch": 2.7781918340894363, + "grad_norm": 0.06002607196569443, + "learning_rate": 6.787884243215267e-05, + "loss": 0.2287, + "step": 34294 + }, + { + "epoch": 2.7782728451069345, + "grad_norm": 0.0822204127907753, + "learning_rate": 6.787434177955803e-05, + "loss": 0.2672, + "step": 34295 + }, + { + "epoch": 2.7783538561244328, + "grad_norm": 0.0641741082072258, + "learning_rate": 6.786984112696342e-05, + "loss": 0.2066, + "step": 34296 + }, + { + "epoch": 2.7784348671419314, + "grad_norm": 0.07143940776586533, + "learning_rate": 6.786534047436879e-05, + "loss": 0.2429, + "step": 34297 + }, + { + "epoch": 2.7785158781594297, + "grad_norm": 0.08297896385192871, + "learning_rate": 6.786083982177415e-05, + "loss": 0.2247, + "step": 34298 + }, + { + "epoch": 2.778596889176928, + "grad_norm": 0.08254558593034744, + "learning_rate": 6.785633916917954e-05, + "loss": 0.2453, + "step": 34299 + }, + { + "epoch": 2.7786779001944266, + "grad_norm": 0.06594591587781906, + "learning_rate": 6.785183851658491e-05, + "loss": 0.2548, + "step": 34300 + }, + { + "epoch": 2.778758911211925, + "grad_norm": 0.08076639473438263, + "learning_rate": 6.784733786399027e-05, + "loss": 0.2329, + "step": 34301 + }, + { + "epoch": 2.778839922229423, + "grad_norm": 0.07141506671905518, + "learning_rate": 6.784283721139566e-05, + "loss": 0.239, + "step": 34302 + }, + { + "epoch": 2.778920933246922, + "grad_norm": 0.05972827598452568, + "learning_rate": 6.783833655880103e-05, + "loss": 0.2027, + "step": 34303 + }, + { + "epoch": 2.77900194426442, + "grad_norm": 0.058857645839452744, + "learning_rate": 6.78338359062064e-05, + "loss": 0.2091, + "step": 34304 + }, + { + "epoch": 2.7790829552819183, + "grad_norm": 0.06419733166694641, + "learning_rate": 6.782933525361178e-05, + "loss": 0.2232, + "step": 34305 + }, + { + "epoch": 2.779163966299417, + "grad_norm": 0.07907504588365555, + "learning_rate": 6.782483460101716e-05, + "loss": 0.2345, + "step": 34306 + }, + { + "epoch": 2.779244977316915, + "grad_norm": 0.0683266744017601, + "learning_rate": 6.782033394842252e-05, + "loss": 0.2764, + "step": 34307 + }, + { + "epoch": 2.7793259883344135, + "grad_norm": 0.06538711488246918, + "learning_rate": 6.78158332958279e-05, + "loss": 0.2376, + "step": 34308 + }, + { + "epoch": 2.7794069993519117, + "grad_norm": 0.06912099570035934, + "learning_rate": 6.781133264323328e-05, + "loss": 0.242, + "step": 34309 + }, + { + "epoch": 2.7794880103694104, + "grad_norm": 0.07980895042419434, + "learning_rate": 6.780683199063864e-05, + "loss": 0.2766, + "step": 34310 + }, + { + "epoch": 2.7795690213869086, + "grad_norm": 0.07531173527240753, + "learning_rate": 6.780233133804402e-05, + "loss": 0.2434, + "step": 34311 + }, + { + "epoch": 2.779650032404407, + "grad_norm": 0.06003532186150551, + "learning_rate": 6.77978306854494e-05, + "loss": 0.2389, + "step": 34312 + }, + { + "epoch": 2.779731043421905, + "grad_norm": 0.07522325217723846, + "learning_rate": 6.779333003285476e-05, + "loss": 0.2211, + "step": 34313 + }, + { + "epoch": 2.779812054439404, + "grad_norm": 0.06744924187660217, + "learning_rate": 6.778882938026014e-05, + "loss": 0.2506, + "step": 34314 + }, + { + "epoch": 2.779893065456902, + "grad_norm": 0.05675087496638298, + "learning_rate": 6.778432872766552e-05, + "loss": 0.202, + "step": 34315 + }, + { + "epoch": 2.7799740764744003, + "grad_norm": 
0.06242429465055466, + "learning_rate": 6.777982807507088e-05, + "loss": 0.2161, + "step": 34316 + }, + { + "epoch": 2.780055087491899, + "grad_norm": 0.08354310691356659, + "learning_rate": 6.777532742247626e-05, + "loss": 0.2787, + "step": 34317 + }, + { + "epoch": 2.7801360985093972, + "grad_norm": 0.05664534866809845, + "learning_rate": 6.777082676988164e-05, + "loss": 0.1897, + "step": 34318 + }, + { + "epoch": 2.7802171095268955, + "grad_norm": 0.09233099967241287, + "learning_rate": 6.7766326117287e-05, + "loss": 0.2648, + "step": 34319 + }, + { + "epoch": 2.780298120544394, + "grad_norm": 0.07990967482328415, + "learning_rate": 6.776182546469239e-05, + "loss": 0.2448, + "step": 34320 + }, + { + "epoch": 2.7803791315618924, + "grad_norm": 0.06859664618968964, + "learning_rate": 6.775732481209776e-05, + "loss": 0.2354, + "step": 34321 + }, + { + "epoch": 2.7804601425793907, + "grad_norm": 0.05772284418344498, + "learning_rate": 6.775282415950313e-05, + "loss": 0.2309, + "step": 34322 + }, + { + "epoch": 2.7805411535968894, + "grad_norm": 0.06003151461482048, + "learning_rate": 6.77483235069085e-05, + "loss": 0.2292, + "step": 34323 + }, + { + "epoch": 2.7806221646143876, + "grad_norm": 0.07578642666339874, + "learning_rate": 6.774382285431388e-05, + "loss": 0.2366, + "step": 34324 + }, + { + "epoch": 2.780703175631886, + "grad_norm": 0.06596392393112183, + "learning_rate": 6.773932220171925e-05, + "loss": 0.2677, + "step": 34325 + }, + { + "epoch": 2.7807841866493845, + "grad_norm": 0.08551062643527985, + "learning_rate": 6.773482154912463e-05, + "loss": 0.2299, + "step": 34326 + }, + { + "epoch": 2.780865197666883, + "grad_norm": 0.09138695150613785, + "learning_rate": 6.773032089653e-05, + "loss": 0.2513, + "step": 34327 + }, + { + "epoch": 2.780946208684381, + "grad_norm": 0.11304971575737, + "learning_rate": 6.772582024393537e-05, + "loss": 0.2293, + "step": 34328 + }, + { + "epoch": 2.7810272197018797, + "grad_norm": 0.0825752317905426, + "learning_rate": 6.772131959134075e-05, + "loss": 0.2433, + "step": 34329 + }, + { + "epoch": 2.781108230719378, + "grad_norm": 0.07327274978160858, + "learning_rate": 6.771681893874612e-05, + "loss": 0.2101, + "step": 34330 + }, + { + "epoch": 2.781189241736876, + "grad_norm": 0.059017740190029144, + "learning_rate": 6.77123182861515e-05, + "loss": 0.2263, + "step": 34331 + }, + { + "epoch": 2.7812702527543745, + "grad_norm": 0.060888487845659256, + "learning_rate": 6.770781763355687e-05, + "loss": 0.2376, + "step": 34332 + }, + { + "epoch": 2.781351263771873, + "grad_norm": 0.050954628735780716, + "learning_rate": 6.770331698096224e-05, + "loss": 0.2443, + "step": 34333 + }, + { + "epoch": 2.7814322747893714, + "grad_norm": 0.07145940512418747, + "learning_rate": 6.769881632836762e-05, + "loss": 0.2808, + "step": 34334 + }, + { + "epoch": 2.7815132858068696, + "grad_norm": 0.06935775279998779, + "learning_rate": 6.769431567577299e-05, + "loss": 0.2596, + "step": 34335 + }, + { + "epoch": 2.781594296824368, + "grad_norm": 0.07173454761505127, + "learning_rate": 6.768981502317836e-05, + "loss": 0.2566, + "step": 34336 + }, + { + "epoch": 2.7816753078418666, + "grad_norm": 0.08079207688570023, + "learning_rate": 6.768531437058374e-05, + "loss": 0.2637, + "step": 34337 + }, + { + "epoch": 2.781756318859365, + "grad_norm": 0.05620386078953743, + "learning_rate": 6.768081371798911e-05, + "loss": 0.2232, + "step": 34338 + }, + { + "epoch": 2.781837329876863, + "grad_norm": 0.0672222226858139, + "learning_rate": 6.767631306539448e-05, + "loss": 
0.2388, + "step": 34339 + }, + { + "epoch": 2.7819183408943617, + "grad_norm": 0.06389716267585754, + "learning_rate": 6.767181241279986e-05, + "loss": 0.2297, + "step": 34340 + }, + { + "epoch": 2.78199935191186, + "grad_norm": 0.06956552714109421, + "learning_rate": 6.766731176020523e-05, + "loss": 0.2206, + "step": 34341 + }, + { + "epoch": 2.7820803629293582, + "grad_norm": 0.06504596769809723, + "learning_rate": 6.76628111076106e-05, + "loss": 0.2438, + "step": 34342 + }, + { + "epoch": 2.782161373946857, + "grad_norm": 0.09995260089635849, + "learning_rate": 6.765831045501598e-05, + "loss": 0.2781, + "step": 34343 + }, + { + "epoch": 2.782242384964355, + "grad_norm": 0.074515700340271, + "learning_rate": 6.765380980242135e-05, + "loss": 0.2707, + "step": 34344 + }, + { + "epoch": 2.7823233959818534, + "grad_norm": 0.05361089110374451, + "learning_rate": 6.764930914982673e-05, + "loss": 0.1947, + "step": 34345 + }, + { + "epoch": 2.782404406999352, + "grad_norm": 0.07568366080522537, + "learning_rate": 6.76448084972321e-05, + "loss": 0.2898, + "step": 34346 + }, + { + "epoch": 2.7824854180168503, + "grad_norm": 0.07337959110736847, + "learning_rate": 6.764030784463747e-05, + "loss": 0.2613, + "step": 34347 + }, + { + "epoch": 2.7825664290343486, + "grad_norm": 0.06756607443094254, + "learning_rate": 6.763580719204286e-05, + "loss": 0.2078, + "step": 34348 + }, + { + "epoch": 2.7826474400518473, + "grad_norm": 0.07057231664657593, + "learning_rate": 6.763130653944822e-05, + "loss": 0.2187, + "step": 34349 + }, + { + "epoch": 2.7827284510693455, + "grad_norm": 0.0597674734890461, + "learning_rate": 6.76268058868536e-05, + "loss": 0.2315, + "step": 34350 + }, + { + "epoch": 2.7828094620868438, + "grad_norm": 0.06805480271577835, + "learning_rate": 6.762230523425898e-05, + "loss": 0.276, + "step": 34351 + }, + { + "epoch": 2.7828904731043425, + "grad_norm": 0.047967858612537384, + "learning_rate": 6.761780458166434e-05, + "loss": 0.2266, + "step": 34352 + }, + { + "epoch": 2.7829714841218407, + "grad_norm": 0.07217514514923096, + "learning_rate": 6.761330392906971e-05, + "loss": 0.1975, + "step": 34353 + }, + { + "epoch": 2.783052495139339, + "grad_norm": 0.06429262459278107, + "learning_rate": 6.76088032764751e-05, + "loss": 0.2329, + "step": 34354 + }, + { + "epoch": 2.783133506156837, + "grad_norm": 0.05952700972557068, + "learning_rate": 6.760430262388046e-05, + "loss": 0.2487, + "step": 34355 + }, + { + "epoch": 2.7832145171743354, + "grad_norm": 0.0745530053973198, + "learning_rate": 6.759980197128584e-05, + "loss": 0.2477, + "step": 34356 + }, + { + "epoch": 2.783295528191834, + "grad_norm": 0.08647239953279495, + "learning_rate": 6.759530131869122e-05, + "loss": 0.209, + "step": 34357 + }, + { + "epoch": 2.7833765392093324, + "grad_norm": 0.08480441570281982, + "learning_rate": 6.759080066609658e-05, + "loss": 0.2785, + "step": 34358 + }, + { + "epoch": 2.7834575502268306, + "grad_norm": 0.07079491764307022, + "learning_rate": 6.758630001350196e-05, + "loss": 0.1753, + "step": 34359 + }, + { + "epoch": 2.7835385612443293, + "grad_norm": 0.0632624626159668, + "learning_rate": 6.758179936090734e-05, + "loss": 0.2346, + "step": 34360 + }, + { + "epoch": 2.7836195722618275, + "grad_norm": 0.08435448259115219, + "learning_rate": 6.75772987083127e-05, + "loss": 0.2539, + "step": 34361 + }, + { + "epoch": 2.783700583279326, + "grad_norm": 0.07184403389692307, + "learning_rate": 6.757279805571808e-05, + "loss": 0.2656, + "step": 34362 + }, + { + "epoch": 2.7837815942968245, + "grad_norm": 
0.07023405283689499, + "learning_rate": 6.756829740312346e-05, + "loss": 0.2108, + "step": 34363 + }, + { + "epoch": 2.7838626053143227, + "grad_norm": 0.07092241197824478, + "learning_rate": 6.756379675052882e-05, + "loss": 0.2064, + "step": 34364 + }, + { + "epoch": 2.783943616331821, + "grad_norm": 0.07640679180622101, + "learning_rate": 6.75592960979342e-05, + "loss": 0.2837, + "step": 34365 + }, + { + "epoch": 2.7840246273493197, + "grad_norm": 0.06458078324794769, + "learning_rate": 6.755479544533959e-05, + "loss": 0.2426, + "step": 34366 + }, + { + "epoch": 2.784105638366818, + "grad_norm": 0.08889305591583252, + "learning_rate": 6.755029479274495e-05, + "loss": 0.2502, + "step": 34367 + }, + { + "epoch": 2.784186649384316, + "grad_norm": 0.0728810727596283, + "learning_rate": 6.754579414015032e-05, + "loss": 0.2762, + "step": 34368 + }, + { + "epoch": 2.784267660401815, + "grad_norm": 0.06907269358634949, + "learning_rate": 6.75412934875557e-05, + "loss": 0.2933, + "step": 34369 + }, + { + "epoch": 2.784348671419313, + "grad_norm": 0.06508993357419968, + "learning_rate": 6.753679283496107e-05, + "loss": 0.2284, + "step": 34370 + }, + { + "epoch": 2.7844296824368113, + "grad_norm": 0.07364595681428909, + "learning_rate": 6.753229218236644e-05, + "loss": 0.2047, + "step": 34371 + }, + { + "epoch": 2.78451069345431, + "grad_norm": 0.07595406472682953, + "learning_rate": 6.752779152977183e-05, + "loss": 0.2373, + "step": 34372 + }, + { + "epoch": 2.7845917044718083, + "grad_norm": 0.08654715865850449, + "learning_rate": 6.752329087717719e-05, + "loss": 0.2802, + "step": 34373 + }, + { + "epoch": 2.7846727154893065, + "grad_norm": 0.07352244853973389, + "learning_rate": 6.751879022458257e-05, + "loss": 0.2392, + "step": 34374 + }, + { + "epoch": 2.784753726506805, + "grad_norm": 0.06762534379959106, + "learning_rate": 6.751428957198795e-05, + "loss": 0.2184, + "step": 34375 + }, + { + "epoch": 2.7848347375243034, + "grad_norm": 0.07174224406480789, + "learning_rate": 6.750978891939331e-05, + "loss": 0.2153, + "step": 34376 + }, + { + "epoch": 2.7849157485418017, + "grad_norm": 0.06852773576974869, + "learning_rate": 6.75052882667987e-05, + "loss": 0.2234, + "step": 34377 + }, + { + "epoch": 2.7849967595593, + "grad_norm": 0.06424195319414139, + "learning_rate": 6.750078761420407e-05, + "loss": 0.2547, + "step": 34378 + }, + { + "epoch": 2.785077770576798, + "grad_norm": 0.06172497943043709, + "learning_rate": 6.749628696160943e-05, + "loss": 0.2291, + "step": 34379 + }, + { + "epoch": 2.785158781594297, + "grad_norm": 0.07526271790266037, + "learning_rate": 6.749178630901482e-05, + "loss": 0.2528, + "step": 34380 + }, + { + "epoch": 2.785239792611795, + "grad_norm": 0.06385906040668488, + "learning_rate": 6.748728565642019e-05, + "loss": 0.2146, + "step": 34381 + }, + { + "epoch": 2.7853208036292934, + "grad_norm": 0.08248017728328705, + "learning_rate": 6.748278500382555e-05, + "loss": 0.2931, + "step": 34382 + }, + { + "epoch": 2.785401814646792, + "grad_norm": 0.06632523238658905, + "learning_rate": 6.747828435123094e-05, + "loss": 0.2118, + "step": 34383 + }, + { + "epoch": 2.7854828256642903, + "grad_norm": 0.07956916838884354, + "learning_rate": 6.747378369863631e-05, + "loss": 0.2701, + "step": 34384 + }, + { + "epoch": 2.7855638366817885, + "grad_norm": 0.06996366381645203, + "learning_rate": 6.746928304604167e-05, + "loss": 0.2281, + "step": 34385 + }, + { + "epoch": 2.785644847699287, + "grad_norm": 0.07411151379346848, + "learning_rate": 6.746478239344706e-05, + "loss": 0.26, 
+ "step": 34386 + }, + { + "epoch": 2.7857258587167855, + "grad_norm": 0.053881555795669556, + "learning_rate": 6.746028174085243e-05, + "loss": 0.2054, + "step": 34387 + }, + { + "epoch": 2.7858068697342837, + "grad_norm": 0.07679598033428192, + "learning_rate": 6.745578108825779e-05, + "loss": 0.2424, + "step": 34388 + }, + { + "epoch": 2.7858878807517824, + "grad_norm": 0.07939158380031586, + "learning_rate": 6.745128043566318e-05, + "loss": 0.2767, + "step": 34389 + }, + { + "epoch": 2.7859688917692806, + "grad_norm": 0.07225979119539261, + "learning_rate": 6.744677978306855e-05, + "loss": 0.2158, + "step": 34390 + }, + { + "epoch": 2.786049902786779, + "grad_norm": 0.06612420827150345, + "learning_rate": 6.744227913047391e-05, + "loss": 0.2233, + "step": 34391 + }, + { + "epoch": 2.7861309138042776, + "grad_norm": 0.09697147458791733, + "learning_rate": 6.74377784778793e-05, + "loss": 0.255, + "step": 34392 + }, + { + "epoch": 2.786211924821776, + "grad_norm": 0.07526032626628876, + "learning_rate": 6.743327782528467e-05, + "loss": 0.2381, + "step": 34393 + }, + { + "epoch": 2.786292935839274, + "grad_norm": 0.08052770048379898, + "learning_rate": 6.742877717269003e-05, + "loss": 0.2626, + "step": 34394 + }, + { + "epoch": 2.7863739468567728, + "grad_norm": 0.06478998064994812, + "learning_rate": 6.742427652009542e-05, + "loss": 0.2183, + "step": 34395 + }, + { + "epoch": 2.786454957874271, + "grad_norm": 0.06821257621049881, + "learning_rate": 6.741977586750079e-05, + "loss": 0.2422, + "step": 34396 + }, + { + "epoch": 2.7865359688917692, + "grad_norm": 0.07175886631011963, + "learning_rate": 6.741527521490615e-05, + "loss": 0.2066, + "step": 34397 + }, + { + "epoch": 2.786616979909268, + "grad_norm": 0.06310652196407318, + "learning_rate": 6.741077456231154e-05, + "loss": 0.2241, + "step": 34398 + }, + { + "epoch": 2.786697990926766, + "grad_norm": 0.0647004023194313, + "learning_rate": 6.740627390971691e-05, + "loss": 0.2276, + "step": 34399 + }, + { + "epoch": 2.7867790019442644, + "grad_norm": 0.08007898926734924, + "learning_rate": 6.740177325712227e-05, + "loss": 0.2739, + "step": 34400 + }, + { + "epoch": 2.7868600129617627, + "grad_norm": 0.06193099170923233, + "learning_rate": 6.739727260452766e-05, + "loss": 0.2301, + "step": 34401 + }, + { + "epoch": 2.786941023979261, + "grad_norm": 0.06945597380399704, + "learning_rate": 6.739277195193303e-05, + "loss": 0.2555, + "step": 34402 + }, + { + "epoch": 2.7870220349967596, + "grad_norm": 0.07463754713535309, + "learning_rate": 6.738827129933841e-05, + "loss": 0.2691, + "step": 34403 + }, + { + "epoch": 2.787103046014258, + "grad_norm": 0.06011288985610008, + "learning_rate": 6.738377064674378e-05, + "loss": 0.2347, + "step": 34404 + }, + { + "epoch": 2.787184057031756, + "grad_norm": 0.06475888192653656, + "learning_rate": 6.737926999414916e-05, + "loss": 0.2483, + "step": 34405 + }, + { + "epoch": 2.787265068049255, + "grad_norm": 0.05519997701048851, + "learning_rate": 6.737476934155453e-05, + "loss": 0.2429, + "step": 34406 + }, + { + "epoch": 2.787346079066753, + "grad_norm": 0.08073216676712036, + "learning_rate": 6.73702686889599e-05, + "loss": 0.2635, + "step": 34407 + }, + { + "epoch": 2.7874270900842513, + "grad_norm": 0.06941085308790207, + "learning_rate": 6.736576803636528e-05, + "loss": 0.2153, + "step": 34408 + }, + { + "epoch": 2.78750810110175, + "grad_norm": 0.08742102235555649, + "learning_rate": 6.736126738377065e-05, + "loss": 0.2539, + "step": 34409 + }, + { + "epoch": 2.787589112119248, + "grad_norm": 
0.06479343771934509, + "learning_rate": 6.735676673117602e-05, + "loss": 0.2463, + "step": 34410 + }, + { + "epoch": 2.7876701231367464, + "grad_norm": 0.06715091317892075, + "learning_rate": 6.73522660785814e-05, + "loss": 0.2261, + "step": 34411 + }, + { + "epoch": 2.787751134154245, + "grad_norm": 0.06692752987146378, + "learning_rate": 6.734776542598677e-05, + "loss": 0.2037, + "step": 34412 + }, + { + "epoch": 2.7878321451717434, + "grad_norm": 0.07739755511283875, + "learning_rate": 6.734326477339214e-05, + "loss": 0.2517, + "step": 34413 + }, + { + "epoch": 2.7879131561892416, + "grad_norm": 0.08224021643400192, + "learning_rate": 6.733876412079752e-05, + "loss": 0.2464, + "step": 34414 + }, + { + "epoch": 2.7879941672067403, + "grad_norm": 0.06463365256786346, + "learning_rate": 6.733426346820289e-05, + "loss": 0.2182, + "step": 34415 + }, + { + "epoch": 2.7880751782242386, + "grad_norm": 0.08047308027744293, + "learning_rate": 6.732976281560827e-05, + "loss": 0.252, + "step": 34416 + }, + { + "epoch": 2.788156189241737, + "grad_norm": 0.06497808545827866, + "learning_rate": 6.732526216301364e-05, + "loss": 0.2045, + "step": 34417 + }, + { + "epoch": 2.7882372002592355, + "grad_norm": 0.057299938052892685, + "learning_rate": 6.732076151041901e-05, + "loss": 0.2309, + "step": 34418 + }, + { + "epoch": 2.7883182112767337, + "grad_norm": 0.0716933012008667, + "learning_rate": 6.731626085782439e-05, + "loss": 0.2534, + "step": 34419 + }, + { + "epoch": 2.788399222294232, + "grad_norm": 0.0884045958518982, + "learning_rate": 6.731176020522976e-05, + "loss": 0.2376, + "step": 34420 + }, + { + "epoch": 2.7884802333117307, + "grad_norm": 0.07025162875652313, + "learning_rate": 6.730725955263513e-05, + "loss": 0.2471, + "step": 34421 + }, + { + "epoch": 2.788561244329229, + "grad_norm": 0.11640918254852295, + "learning_rate": 6.730275890004051e-05, + "loss": 0.2244, + "step": 34422 + }, + { + "epoch": 2.788642255346727, + "grad_norm": 0.06777489930391312, + "learning_rate": 6.729825824744588e-05, + "loss": 0.2375, + "step": 34423 + }, + { + "epoch": 2.7887232663642254, + "grad_norm": 0.06596884876489639, + "learning_rate": 6.729375759485125e-05, + "loss": 0.2492, + "step": 34424 + }, + { + "epoch": 2.7888042773817237, + "grad_norm": 0.06826752424240112, + "learning_rate": 6.728925694225663e-05, + "loss": 0.2131, + "step": 34425 + }, + { + "epoch": 2.7888852883992223, + "grad_norm": 0.07430961728096008, + "learning_rate": 6.7284756289662e-05, + "loss": 0.2756, + "step": 34426 + }, + { + "epoch": 2.7889662994167206, + "grad_norm": 0.0647616907954216, + "learning_rate": 6.728025563706737e-05, + "loss": 0.2101, + "step": 34427 + }, + { + "epoch": 2.789047310434219, + "grad_norm": 0.06215166300535202, + "learning_rate": 6.727575498447275e-05, + "loss": 0.2188, + "step": 34428 + }, + { + "epoch": 2.7891283214517175, + "grad_norm": 0.06014298275113106, + "learning_rate": 6.727125433187814e-05, + "loss": 0.1843, + "step": 34429 + }, + { + "epoch": 2.7892093324692158, + "grad_norm": 0.07539289444684982, + "learning_rate": 6.72667536792835e-05, + "loss": 0.2651, + "step": 34430 + }, + { + "epoch": 2.789290343486714, + "grad_norm": 0.07783479988574982, + "learning_rate": 6.726225302668887e-05, + "loss": 0.2589, + "step": 34431 + }, + { + "epoch": 2.7893713545042127, + "grad_norm": 0.07596423476934433, + "learning_rate": 6.725775237409426e-05, + "loss": 0.2412, + "step": 34432 + }, + { + "epoch": 2.789452365521711, + "grad_norm": 0.06946752220392227, + "learning_rate": 6.725325172149962e-05, + "loss": 
0.2539, + "step": 34433 + }, + { + "epoch": 2.789533376539209, + "grad_norm": 0.07110365480184555, + "learning_rate": 6.724875106890499e-05, + "loss": 0.2496, + "step": 34434 + }, + { + "epoch": 2.789614387556708, + "grad_norm": 0.06963635236024857, + "learning_rate": 6.724425041631038e-05, + "loss": 0.2613, + "step": 34435 + }, + { + "epoch": 2.789695398574206, + "grad_norm": 0.07469368726015091, + "learning_rate": 6.723974976371574e-05, + "loss": 0.2121, + "step": 34436 + }, + { + "epoch": 2.7897764095917044, + "grad_norm": 0.057353585958480835, + "learning_rate": 6.723524911112111e-05, + "loss": 0.2013, + "step": 34437 + }, + { + "epoch": 2.789857420609203, + "grad_norm": 0.0705106109380722, + "learning_rate": 6.72307484585265e-05, + "loss": 0.2649, + "step": 34438 + }, + { + "epoch": 2.7899384316267013, + "grad_norm": 0.07225330919027328, + "learning_rate": 6.722624780593186e-05, + "loss": 0.2561, + "step": 34439 + }, + { + "epoch": 2.7900194426441995, + "grad_norm": 0.06563139706850052, + "learning_rate": 6.722174715333723e-05, + "loss": 0.2022, + "step": 34440 + }, + { + "epoch": 2.7901004536616982, + "grad_norm": 0.06589397042989731, + "learning_rate": 6.721724650074262e-05, + "loss": 0.2353, + "step": 34441 + }, + { + "epoch": 2.7901814646791965, + "grad_norm": 0.061312295496463776, + "learning_rate": 6.721274584814798e-05, + "loss": 0.1942, + "step": 34442 + }, + { + "epoch": 2.7902624756966947, + "grad_norm": 0.0669531598687172, + "learning_rate": 6.720824519555335e-05, + "loss": 0.2218, + "step": 34443 + }, + { + "epoch": 2.790343486714193, + "grad_norm": 0.06205807998776436, + "learning_rate": 6.720374454295874e-05, + "loss": 0.228, + "step": 34444 + }, + { + "epoch": 2.7904244977316917, + "grad_norm": 0.08307838439941406, + "learning_rate": 6.71992438903641e-05, + "loss": 0.2183, + "step": 34445 + }, + { + "epoch": 2.79050550874919, + "grad_norm": 0.07728546112775803, + "learning_rate": 6.719474323776947e-05, + "loss": 0.2474, + "step": 34446 + }, + { + "epoch": 2.790586519766688, + "grad_norm": 0.07177817076444626, + "learning_rate": 6.719024258517486e-05, + "loss": 0.254, + "step": 34447 + }, + { + "epoch": 2.7906675307841864, + "grad_norm": 0.06399939209222794, + "learning_rate": 6.718574193258022e-05, + "loss": 0.2457, + "step": 34448 + }, + { + "epoch": 2.790748541801685, + "grad_norm": 0.05192526802420616, + "learning_rate": 6.71812412799856e-05, + "loss": 0.2214, + "step": 34449 + }, + { + "epoch": 2.7908295528191833, + "grad_norm": 0.06940247118473053, + "learning_rate": 6.717674062739098e-05, + "loss": 0.2491, + "step": 34450 + }, + { + "epoch": 2.7909105638366816, + "grad_norm": 0.07798627763986588, + "learning_rate": 6.717223997479634e-05, + "loss": 0.236, + "step": 34451 + }, + { + "epoch": 2.7909915748541803, + "grad_norm": 0.06335878372192383, + "learning_rate": 6.716773932220171e-05, + "loss": 0.2106, + "step": 34452 + }, + { + "epoch": 2.7910725858716785, + "grad_norm": 0.06506040692329407, + "learning_rate": 6.71632386696071e-05, + "loss": 0.2444, + "step": 34453 + }, + { + "epoch": 2.7911535968891767, + "grad_norm": 0.06760771572589874, + "learning_rate": 6.715873801701246e-05, + "loss": 0.2147, + "step": 34454 + }, + { + "epoch": 2.7912346079066754, + "grad_norm": 0.06756895035505295, + "learning_rate": 6.715423736441785e-05, + "loss": 0.273, + "step": 34455 + }, + { + "epoch": 2.7913156189241737, + "grad_norm": 0.0767185315489769, + "learning_rate": 6.714973671182322e-05, + "loss": 0.2733, + "step": 34456 + }, + { + "epoch": 2.791396629941672, + 
"grad_norm": 0.07195433229207993, + "learning_rate": 6.714523605922858e-05, + "loss": 0.2248, + "step": 34457 + }, + { + "epoch": 2.7914776409591706, + "grad_norm": 0.05929771810770035, + "learning_rate": 6.714073540663397e-05, + "loss": 0.2299, + "step": 34458 + }, + { + "epoch": 2.791558651976669, + "grad_norm": 0.06814391165971756, + "learning_rate": 6.713623475403934e-05, + "loss": 0.2515, + "step": 34459 + }, + { + "epoch": 2.791639662994167, + "grad_norm": 0.07823874801397324, + "learning_rate": 6.71317341014447e-05, + "loss": 0.2584, + "step": 34460 + }, + { + "epoch": 2.791720674011666, + "grad_norm": 0.0727526992559433, + "learning_rate": 6.712723344885009e-05, + "loss": 0.2264, + "step": 34461 + }, + { + "epoch": 2.791801685029164, + "grad_norm": 0.06739646196365356, + "learning_rate": 6.712273279625546e-05, + "loss": 0.2707, + "step": 34462 + }, + { + "epoch": 2.7918826960466623, + "grad_norm": 0.07312295585870743, + "learning_rate": 6.711823214366082e-05, + "loss": 0.2549, + "step": 34463 + }, + { + "epoch": 2.791963707064161, + "grad_norm": 0.0585709773004055, + "learning_rate": 6.711373149106621e-05, + "loss": 0.2358, + "step": 34464 + }, + { + "epoch": 2.792044718081659, + "grad_norm": 0.06079203262925148, + "learning_rate": 6.710923083847159e-05, + "loss": 0.2251, + "step": 34465 + }, + { + "epoch": 2.7921257290991575, + "grad_norm": 0.07005365937948227, + "learning_rate": 6.710473018587695e-05, + "loss": 0.2099, + "step": 34466 + }, + { + "epoch": 2.7922067401166557, + "grad_norm": 0.08667057752609253, + "learning_rate": 6.710022953328233e-05, + "loss": 0.2528, + "step": 34467 + }, + { + "epoch": 2.7922877511341544, + "grad_norm": 0.07198949158191681, + "learning_rate": 6.70957288806877e-05, + "loss": 0.2678, + "step": 34468 + }, + { + "epoch": 2.7923687621516526, + "grad_norm": 0.06842099130153656, + "learning_rate": 6.709122822809307e-05, + "loss": 0.2277, + "step": 34469 + }, + { + "epoch": 2.792449773169151, + "grad_norm": 0.06861353665590286, + "learning_rate": 6.708672757549845e-05, + "loss": 0.2078, + "step": 34470 + }, + { + "epoch": 2.792530784186649, + "grad_norm": 0.06622407585382462, + "learning_rate": 6.708222692290383e-05, + "loss": 0.219, + "step": 34471 + }, + { + "epoch": 2.792611795204148, + "grad_norm": 0.05334659665822983, + "learning_rate": 6.707772627030919e-05, + "loss": 0.2134, + "step": 34472 + }, + { + "epoch": 2.792692806221646, + "grad_norm": 0.07334712147712708, + "learning_rate": 6.707322561771457e-05, + "loss": 0.235, + "step": 34473 + }, + { + "epoch": 2.7927738172391443, + "grad_norm": 0.06408771872520447, + "learning_rate": 6.706872496511995e-05, + "loss": 0.2147, + "step": 34474 + }, + { + "epoch": 2.792854828256643, + "grad_norm": 0.07221581041812897, + "learning_rate": 6.706422431252531e-05, + "loss": 0.2565, + "step": 34475 + }, + { + "epoch": 2.7929358392741412, + "grad_norm": 0.06033742427825928, + "learning_rate": 6.70597236599307e-05, + "loss": 0.2485, + "step": 34476 + }, + { + "epoch": 2.7930168502916395, + "grad_norm": 0.07943415641784668, + "learning_rate": 6.705522300733607e-05, + "loss": 0.2359, + "step": 34477 + }, + { + "epoch": 2.793097861309138, + "grad_norm": 0.0714966431260109, + "learning_rate": 6.705072235474143e-05, + "loss": 0.2412, + "step": 34478 + }, + { + "epoch": 2.7931788723266364, + "grad_norm": 0.059110529720783234, + "learning_rate": 6.704622170214682e-05, + "loss": 0.2053, + "step": 34479 + }, + { + "epoch": 2.7932598833441347, + "grad_norm": 0.08263637870550156, + "learning_rate": 6.704172104955219e-05, + 
"loss": 0.2521, + "step": 34480 + }, + { + "epoch": 2.7933408943616334, + "grad_norm": 0.07647662609815598, + "learning_rate": 6.703722039695756e-05, + "loss": 0.2456, + "step": 34481 + }, + { + "epoch": 2.7934219053791316, + "grad_norm": 0.06695931404829025, + "learning_rate": 6.703271974436294e-05, + "loss": 0.2433, + "step": 34482 + }, + { + "epoch": 2.79350291639663, + "grad_norm": 0.06521735340356827, + "learning_rate": 6.702821909176831e-05, + "loss": 0.2418, + "step": 34483 + }, + { + "epoch": 2.7935839274141285, + "grad_norm": 0.05986318364739418, + "learning_rate": 6.702371843917368e-05, + "loss": 0.2465, + "step": 34484 + }, + { + "epoch": 2.7936649384316268, + "grad_norm": 0.07254021614789963, + "learning_rate": 6.701921778657906e-05, + "loss": 0.2622, + "step": 34485 + }, + { + "epoch": 2.793745949449125, + "grad_norm": 0.07294338941574097, + "learning_rate": 6.701471713398443e-05, + "loss": 0.2323, + "step": 34486 + }, + { + "epoch": 2.7938269604666237, + "grad_norm": 0.06299685686826706, + "learning_rate": 6.70102164813898e-05, + "loss": 0.2194, + "step": 34487 + }, + { + "epoch": 2.793907971484122, + "grad_norm": 0.06214449554681778, + "learning_rate": 6.700571582879518e-05, + "loss": 0.2218, + "step": 34488 + }, + { + "epoch": 2.79398898250162, + "grad_norm": 0.07161349803209305, + "learning_rate": 6.700121517620055e-05, + "loss": 0.2257, + "step": 34489 + }, + { + "epoch": 2.7940699935191184, + "grad_norm": 0.06868092715740204, + "learning_rate": 6.699671452360593e-05, + "loss": 0.2494, + "step": 34490 + }, + { + "epoch": 2.794151004536617, + "grad_norm": 0.07574149966239929, + "learning_rate": 6.69922138710113e-05, + "loss": 0.2281, + "step": 34491 + }, + { + "epoch": 2.7942320155541154, + "grad_norm": 0.07077787071466446, + "learning_rate": 6.698771321841667e-05, + "loss": 0.2547, + "step": 34492 + }, + { + "epoch": 2.7943130265716136, + "grad_norm": 0.061070095747709274, + "learning_rate": 6.698321256582205e-05, + "loss": 0.2976, + "step": 34493 + }, + { + "epoch": 2.794394037589112, + "grad_norm": 0.06678925454616547, + "learning_rate": 6.697871191322742e-05, + "loss": 0.2325, + "step": 34494 + }, + { + "epoch": 2.7944750486066106, + "grad_norm": 0.06714421510696411, + "learning_rate": 6.69742112606328e-05, + "loss": 0.2559, + "step": 34495 + }, + { + "epoch": 2.794556059624109, + "grad_norm": 0.07086465507745743, + "learning_rate": 6.696971060803817e-05, + "loss": 0.2762, + "step": 34496 + }, + { + "epoch": 2.794637070641607, + "grad_norm": 0.08439730852842331, + "learning_rate": 6.696520995544354e-05, + "loss": 0.2505, + "step": 34497 + }, + { + "epoch": 2.7947180816591057, + "grad_norm": 0.06269598752260208, + "learning_rate": 6.696070930284891e-05, + "loss": 0.2165, + "step": 34498 + }, + { + "epoch": 2.794799092676604, + "grad_norm": 0.05698813870549202, + "learning_rate": 6.695620865025429e-05, + "loss": 0.2581, + "step": 34499 + }, + { + "epoch": 2.7948801036941022, + "grad_norm": 0.08211810886859894, + "learning_rate": 6.695170799765966e-05, + "loss": 0.2274, + "step": 34500 + }, + { + "epoch": 2.794961114711601, + "grad_norm": 0.0799921527504921, + "learning_rate": 6.694720734506504e-05, + "loss": 0.2721, + "step": 34501 + }, + { + "epoch": 2.795042125729099, + "grad_norm": 0.06881997734308243, + "learning_rate": 6.694270669247041e-05, + "loss": 0.2394, + "step": 34502 + }, + { + "epoch": 2.7951231367465974, + "grad_norm": 0.06042630970478058, + "learning_rate": 6.693820603987578e-05, + "loss": 0.2451, + "step": 34503 + }, + { + "epoch": 2.795204147764096, + 
"grad_norm": 0.06238381192088127, + "learning_rate": 6.693370538728116e-05, + "loss": 0.2208, + "step": 34504 + }, + { + "epoch": 2.7952851587815943, + "grad_norm": 0.058415718376636505, + "learning_rate": 6.692920473468653e-05, + "loss": 0.25, + "step": 34505 + }, + { + "epoch": 2.7953661697990926, + "grad_norm": 0.06427952647209167, + "learning_rate": 6.69247040820919e-05, + "loss": 0.2714, + "step": 34506 + }, + { + "epoch": 2.7954471808165913, + "grad_norm": 0.06831841170787811, + "learning_rate": 6.692020342949729e-05, + "loss": 0.2201, + "step": 34507 + }, + { + "epoch": 2.7955281918340895, + "grad_norm": 0.07188865542411804, + "learning_rate": 6.691570277690265e-05, + "loss": 0.2024, + "step": 34508 + }, + { + "epoch": 2.7956092028515878, + "grad_norm": 0.07600495219230652, + "learning_rate": 6.691120212430802e-05, + "loss": 0.2454, + "step": 34509 + }, + { + "epoch": 2.7956902138690864, + "grad_norm": 0.07487045228481293, + "learning_rate": 6.690670147171341e-05, + "loss": 0.2543, + "step": 34510 + }, + { + "epoch": 2.7957712248865847, + "grad_norm": 0.07164686918258667, + "learning_rate": 6.690220081911877e-05, + "loss": 0.2616, + "step": 34511 + }, + { + "epoch": 2.795852235904083, + "grad_norm": 0.07789607346057892, + "learning_rate": 6.689770016652414e-05, + "loss": 0.2309, + "step": 34512 + }, + { + "epoch": 2.795933246921581, + "grad_norm": 0.06713174283504486, + "learning_rate": 6.689319951392953e-05, + "loss": 0.2241, + "step": 34513 + }, + { + "epoch": 2.79601425793908, + "grad_norm": 0.07741440087556839, + "learning_rate": 6.688869886133489e-05, + "loss": 0.2458, + "step": 34514 + }, + { + "epoch": 2.796095268956578, + "grad_norm": 0.0612313486635685, + "learning_rate": 6.688419820874027e-05, + "loss": 0.2272, + "step": 34515 + }, + { + "epoch": 2.7961762799740764, + "grad_norm": 0.06217198818922043, + "learning_rate": 6.687969755614565e-05, + "loss": 0.2304, + "step": 34516 + }, + { + "epoch": 2.7962572909915746, + "grad_norm": 0.0742369219660759, + "learning_rate": 6.687519690355101e-05, + "loss": 0.2653, + "step": 34517 + }, + { + "epoch": 2.7963383020090733, + "grad_norm": 0.06669142842292786, + "learning_rate": 6.687069625095639e-05, + "loss": 0.2621, + "step": 34518 + }, + { + "epoch": 2.7964193130265715, + "grad_norm": 0.09008178114891052, + "learning_rate": 6.686619559836177e-05, + "loss": 0.2696, + "step": 34519 + }, + { + "epoch": 2.79650032404407, + "grad_norm": 0.06483547389507294, + "learning_rate": 6.686169494576713e-05, + "loss": 0.2261, + "step": 34520 + }, + { + "epoch": 2.7965813350615685, + "grad_norm": 0.06703682988882065, + "learning_rate": 6.685719429317251e-05, + "loss": 0.253, + "step": 34521 + }, + { + "epoch": 2.7966623460790667, + "grad_norm": 0.06760460138320923, + "learning_rate": 6.68526936405779e-05, + "loss": 0.2837, + "step": 34522 + }, + { + "epoch": 2.796743357096565, + "grad_norm": 0.08956301212310791, + "learning_rate": 6.684819298798325e-05, + "loss": 0.2619, + "step": 34523 + }, + { + "epoch": 2.7968243681140637, + "grad_norm": 0.07180722057819366, + "learning_rate": 6.684369233538863e-05, + "loss": 0.2567, + "step": 34524 + }, + { + "epoch": 2.796905379131562, + "grad_norm": 0.06838277727365494, + "learning_rate": 6.683919168279402e-05, + "loss": 0.2282, + "step": 34525 + }, + { + "epoch": 2.79698639014906, + "grad_norm": 0.06887766718864441, + "learning_rate": 6.683469103019938e-05, + "loss": 0.2045, + "step": 34526 + }, + { + "epoch": 2.797067401166559, + "grad_norm": 0.07803060859441757, + "learning_rate": 6.683019037760475e-05, 
+ "loss": 0.2324, + "step": 34527 + }, + { + "epoch": 2.797148412184057, + "grad_norm": 0.08068870007991791, + "learning_rate": 6.682568972501014e-05, + "loss": 0.2478, + "step": 34528 + }, + { + "epoch": 2.7972294232015553, + "grad_norm": 0.060097936540842056, + "learning_rate": 6.68211890724155e-05, + "loss": 0.2032, + "step": 34529 + }, + { + "epoch": 2.797310434219054, + "grad_norm": 0.06428631395101547, + "learning_rate": 6.681668841982087e-05, + "loss": 0.2645, + "step": 34530 + }, + { + "epoch": 2.7973914452365523, + "grad_norm": 0.0661778450012207, + "learning_rate": 6.681218776722626e-05, + "loss": 0.274, + "step": 34531 + }, + { + "epoch": 2.7974724562540505, + "grad_norm": 0.07209504395723343, + "learning_rate": 6.680768711463162e-05, + "loss": 0.1972, + "step": 34532 + }, + { + "epoch": 2.797553467271549, + "grad_norm": 0.07826335728168488, + "learning_rate": 6.6803186462037e-05, + "loss": 0.1785, + "step": 34533 + }, + { + "epoch": 2.7976344782890474, + "grad_norm": 0.0779106393456459, + "learning_rate": 6.679868580944238e-05, + "loss": 0.2395, + "step": 34534 + }, + { + "epoch": 2.7977154893065457, + "grad_norm": 0.07281313091516495, + "learning_rate": 6.679418515684774e-05, + "loss": 0.229, + "step": 34535 + }, + { + "epoch": 2.797796500324044, + "grad_norm": 0.06805375218391418, + "learning_rate": 6.678968450425312e-05, + "loss": 0.2508, + "step": 34536 + }, + { + "epoch": 2.7978775113415426, + "grad_norm": 0.07204117625951767, + "learning_rate": 6.67851838516585e-05, + "loss": 0.2444, + "step": 34537 + }, + { + "epoch": 2.797958522359041, + "grad_norm": 0.06302698701620102, + "learning_rate": 6.678068319906386e-05, + "loss": 0.2394, + "step": 34538 + }, + { + "epoch": 2.798039533376539, + "grad_norm": 0.06678730249404907, + "learning_rate": 6.677618254646925e-05, + "loss": 0.2558, + "step": 34539 + }, + { + "epoch": 2.7981205443940373, + "grad_norm": 0.07123063504695892, + "learning_rate": 6.677168189387462e-05, + "loss": 0.2087, + "step": 34540 + }, + { + "epoch": 2.798201555411536, + "grad_norm": 0.060089875012636185, + "learning_rate": 6.676718124127998e-05, + "loss": 0.2292, + "step": 34541 + }, + { + "epoch": 2.7982825664290343, + "grad_norm": 0.06729140877723694, + "learning_rate": 6.676268058868537e-05, + "loss": 0.2311, + "step": 34542 + }, + { + "epoch": 2.7983635774465325, + "grad_norm": 0.07628912478685379, + "learning_rate": 6.675817993609074e-05, + "loss": 0.2566, + "step": 34543 + }, + { + "epoch": 2.798444588464031, + "grad_norm": 0.07927137613296509, + "learning_rate": 6.67536792834961e-05, + "loss": 0.2523, + "step": 34544 + }, + { + "epoch": 2.7985255994815295, + "grad_norm": 0.06815194338560104, + "learning_rate": 6.674917863090149e-05, + "loss": 0.2686, + "step": 34545 + }, + { + "epoch": 2.7986066104990277, + "grad_norm": 0.06721120327711105, + "learning_rate": 6.674467797830686e-05, + "loss": 0.2471, + "step": 34546 + }, + { + "epoch": 2.7986876215165264, + "grad_norm": 0.08339305222034454, + "learning_rate": 6.674017732571223e-05, + "loss": 0.2074, + "step": 34547 + }, + { + "epoch": 2.7987686325340246, + "grad_norm": 0.06246606260538101, + "learning_rate": 6.673567667311761e-05, + "loss": 0.2553, + "step": 34548 + }, + { + "epoch": 2.798849643551523, + "grad_norm": 0.06168720871210098, + "learning_rate": 6.673117602052298e-05, + "loss": 0.2603, + "step": 34549 + }, + { + "epoch": 2.7989306545690216, + "grad_norm": 0.0719793364405632, + "learning_rate": 6.672667536792836e-05, + "loss": 0.2338, + "step": 34550 + }, + { + "epoch": 2.79901166558652, + 
"grad_norm": 0.07064522057771683, + "learning_rate": 6.672217471533373e-05, + "loss": 0.2486, + "step": 34551 + }, + { + "epoch": 2.799092676604018, + "grad_norm": 0.07162480801343918, + "learning_rate": 6.67176740627391e-05, + "loss": 0.2603, + "step": 34552 + }, + { + "epoch": 2.7991736876215167, + "grad_norm": 0.0879589319229126, + "learning_rate": 6.671317341014448e-05, + "loss": 0.2434, + "step": 34553 + }, + { + "epoch": 2.799254698639015, + "grad_norm": 0.05829513818025589, + "learning_rate": 6.670867275754985e-05, + "loss": 0.2313, + "step": 34554 + }, + { + "epoch": 2.7993357096565132, + "grad_norm": 0.07135823369026184, + "learning_rate": 6.670417210495522e-05, + "loss": 0.2476, + "step": 34555 + }, + { + "epoch": 2.799416720674012, + "grad_norm": 0.06813136488199234, + "learning_rate": 6.66996714523606e-05, + "loss": 0.2315, + "step": 34556 + }, + { + "epoch": 2.79949773169151, + "grad_norm": 0.08152391761541367, + "learning_rate": 6.669517079976597e-05, + "loss": 0.2475, + "step": 34557 + }, + { + "epoch": 2.7995787427090084, + "grad_norm": 0.0726170465350151, + "learning_rate": 6.669067014717134e-05, + "loss": 0.2706, + "step": 34558 + }, + { + "epoch": 2.7996597537265067, + "grad_norm": 0.08020608127117157, + "learning_rate": 6.668616949457672e-05, + "loss": 0.2555, + "step": 34559 + }, + { + "epoch": 2.7997407647440054, + "grad_norm": 0.07207894325256348, + "learning_rate": 6.668166884198209e-05, + "loss": 0.2487, + "step": 34560 + }, + { + "epoch": 2.7998217757615036, + "grad_norm": 0.07342177629470825, + "learning_rate": 6.667716818938746e-05, + "loss": 0.274, + "step": 34561 + }, + { + "epoch": 2.799902786779002, + "grad_norm": 0.07004830241203308, + "learning_rate": 6.667266753679284e-05, + "loss": 0.22, + "step": 34562 + }, + { + "epoch": 2.7999837977965, + "grad_norm": 0.06473211944103241, + "learning_rate": 6.666816688419821e-05, + "loss": 0.1806, + "step": 34563 + }, + { + "epoch": 2.8000648088139988, + "grad_norm": 0.072107695043087, + "learning_rate": 6.666366623160359e-05, + "loss": 0.2916, + "step": 34564 + }, + { + "epoch": 2.800145819831497, + "grad_norm": 0.07329043000936508, + "learning_rate": 6.665916557900896e-05, + "loss": 0.2855, + "step": 34565 + }, + { + "epoch": 2.8002268308489953, + "grad_norm": 0.06471650302410126, + "learning_rate": 6.665466492641433e-05, + "loss": 0.2268, + "step": 34566 + }, + { + "epoch": 2.800307841866494, + "grad_norm": 0.07123664766550064, + "learning_rate": 6.66501642738197e-05, + "loss": 0.2165, + "step": 34567 + }, + { + "epoch": 2.800388852883992, + "grad_norm": 0.07238410413265228, + "learning_rate": 6.664566362122508e-05, + "loss": 0.233, + "step": 34568 + }, + { + "epoch": 2.8004698639014904, + "grad_norm": 0.07660076022148132, + "learning_rate": 6.664116296863045e-05, + "loss": 0.2519, + "step": 34569 + }, + { + "epoch": 2.800550874918989, + "grad_norm": 0.07034006714820862, + "learning_rate": 6.663666231603583e-05, + "loss": 0.2442, + "step": 34570 + }, + { + "epoch": 2.8006318859364874, + "grad_norm": 0.0729314535856247, + "learning_rate": 6.66321616634412e-05, + "loss": 0.2307, + "step": 34571 + }, + { + "epoch": 2.8007128969539856, + "grad_norm": 0.06865367293357849, + "learning_rate": 6.662766101084657e-05, + "loss": 0.2385, + "step": 34572 + }, + { + "epoch": 2.8007939079714843, + "grad_norm": 0.0807727724313736, + "learning_rate": 6.662316035825195e-05, + "loss": 0.2429, + "step": 34573 + }, + { + "epoch": 2.8008749189889826, + "grad_norm": 0.06009256839752197, + "learning_rate": 6.661865970565732e-05, + "loss": 
0.2426, + "step": 34574 + }, + { + "epoch": 2.800955930006481, + "grad_norm": 0.07074223458766937, + "learning_rate": 6.66141590530627e-05, + "loss": 0.2391, + "step": 34575 + }, + { + "epoch": 2.8010369410239795, + "grad_norm": 0.06917184591293335, + "learning_rate": 6.660965840046807e-05, + "loss": 0.2406, + "step": 34576 + }, + { + "epoch": 2.8011179520414777, + "grad_norm": 0.07436686754226685, + "learning_rate": 6.660515774787344e-05, + "loss": 0.282, + "step": 34577 + }, + { + "epoch": 2.801198963058976, + "grad_norm": 0.0587419793009758, + "learning_rate": 6.660065709527882e-05, + "loss": 0.2045, + "step": 34578 + }, + { + "epoch": 2.8012799740764747, + "grad_norm": 0.06786326318979263, + "learning_rate": 6.659615644268419e-05, + "loss": 0.2613, + "step": 34579 + }, + { + "epoch": 2.801360985093973, + "grad_norm": 0.06299388408660889, + "learning_rate": 6.659165579008956e-05, + "loss": 0.2184, + "step": 34580 + }, + { + "epoch": 2.801441996111471, + "grad_norm": 0.0794903039932251, + "learning_rate": 6.658715513749494e-05, + "loss": 0.2686, + "step": 34581 + }, + { + "epoch": 2.8015230071289694, + "grad_norm": 0.06590180099010468, + "learning_rate": 6.658265448490031e-05, + "loss": 0.2272, + "step": 34582 + }, + { + "epoch": 2.8016040181464676, + "grad_norm": 0.0644986629486084, + "learning_rate": 6.657815383230568e-05, + "loss": 0.2402, + "step": 34583 + }, + { + "epoch": 2.8016850291639663, + "grad_norm": 0.06860768795013428, + "learning_rate": 6.657365317971106e-05, + "loss": 0.2406, + "step": 34584 + }, + { + "epoch": 2.8017660401814646, + "grad_norm": 0.059577204287052155, + "learning_rate": 6.656915252711643e-05, + "loss": 0.2433, + "step": 34585 + }, + { + "epoch": 2.801847051198963, + "grad_norm": 0.06488867849111557, + "learning_rate": 6.65646518745218e-05, + "loss": 0.2505, + "step": 34586 + }, + { + "epoch": 2.8019280622164615, + "grad_norm": 0.0636620968580246, + "learning_rate": 6.656015122192718e-05, + "loss": 0.2132, + "step": 34587 + }, + { + "epoch": 2.8020090732339598, + "grad_norm": 0.07555467635393143, + "learning_rate": 6.655565056933257e-05, + "loss": 0.2072, + "step": 34588 + }, + { + "epoch": 2.802090084251458, + "grad_norm": 0.06727524101734161, + "learning_rate": 6.655114991673793e-05, + "loss": 0.2106, + "step": 34589 + }, + { + "epoch": 2.8021710952689567, + "grad_norm": 0.09039086848497391, + "learning_rate": 6.65466492641433e-05, + "loss": 0.224, + "step": 34590 + }, + { + "epoch": 2.802252106286455, + "grad_norm": 0.06671174615621567, + "learning_rate": 6.654214861154869e-05, + "loss": 0.2353, + "step": 34591 + }, + { + "epoch": 2.802333117303953, + "grad_norm": 0.068624347448349, + "learning_rate": 6.653764795895405e-05, + "loss": 0.25, + "step": 34592 + }, + { + "epoch": 2.802414128321452, + "grad_norm": 0.06567755341529846, + "learning_rate": 6.653314730635942e-05, + "loss": 0.2615, + "step": 34593 + }, + { + "epoch": 2.80249513933895, + "grad_norm": 0.06661804765462875, + "learning_rate": 6.652864665376481e-05, + "loss": 0.2165, + "step": 34594 + }, + { + "epoch": 2.8025761503564484, + "grad_norm": 0.07081003487110138, + "learning_rate": 6.652414600117017e-05, + "loss": 0.2194, + "step": 34595 + }, + { + "epoch": 2.802657161373947, + "grad_norm": 0.07812508940696716, + "learning_rate": 6.651964534857554e-05, + "loss": 0.2238, + "step": 34596 + }, + { + "epoch": 2.8027381723914453, + "grad_norm": 0.06472626328468323, + "learning_rate": 6.651514469598093e-05, + "loss": 0.2292, + "step": 34597 + }, + { + "epoch": 2.8028191834089435, + "grad_norm": 
0.06261005252599716, + "learning_rate": 6.651064404338629e-05, + "loss": 0.2384, + "step": 34598 + }, + { + "epoch": 2.8029001944264422, + "grad_norm": 0.07326589524745941, + "learning_rate": 6.650614339079166e-05, + "loss": 0.2766, + "step": 34599 + }, + { + "epoch": 2.8029812054439405, + "grad_norm": 0.07288602739572525, + "learning_rate": 6.650164273819705e-05, + "loss": 0.2294, + "step": 34600 + }, + { + "epoch": 2.8030622164614387, + "grad_norm": 0.07847611606121063, + "learning_rate": 6.649714208560241e-05, + "loss": 0.2404, + "step": 34601 + }, + { + "epoch": 2.8031432274789374, + "grad_norm": 0.06981007754802704, + "learning_rate": 6.649264143300778e-05, + "loss": 0.1985, + "step": 34602 + }, + { + "epoch": 2.8032242384964356, + "grad_norm": 0.08531308174133301, + "learning_rate": 6.648814078041317e-05, + "loss": 0.2403, + "step": 34603 + }, + { + "epoch": 2.803305249513934, + "grad_norm": 0.08525741845369339, + "learning_rate": 6.648364012781853e-05, + "loss": 0.2574, + "step": 34604 + }, + { + "epoch": 2.803386260531432, + "grad_norm": 0.08389617502689362, + "learning_rate": 6.64791394752239e-05, + "loss": 0.2306, + "step": 34605 + }, + { + "epoch": 2.8034672715489304, + "grad_norm": 0.072044737637043, + "learning_rate": 6.647463882262929e-05, + "loss": 0.2215, + "step": 34606 + }, + { + "epoch": 2.803548282566429, + "grad_norm": 0.06642574071884155, + "learning_rate": 6.647013817003465e-05, + "loss": 0.2351, + "step": 34607 + }, + { + "epoch": 2.8036292935839273, + "grad_norm": 0.061156339943408966, + "learning_rate": 6.646563751744002e-05, + "loss": 0.1957, + "step": 34608 + }, + { + "epoch": 2.8037103046014256, + "grad_norm": 0.06726231426000595, + "learning_rate": 6.646113686484541e-05, + "loss": 0.2144, + "step": 34609 + }, + { + "epoch": 2.8037913156189243, + "grad_norm": 0.06568557769060135, + "learning_rate": 6.645663621225077e-05, + "loss": 0.2513, + "step": 34610 + }, + { + "epoch": 2.8038723266364225, + "grad_norm": 0.06943460553884506, + "learning_rate": 6.645213555965614e-05, + "loss": 0.2174, + "step": 34611 + }, + { + "epoch": 2.8039533376539207, + "grad_norm": 0.06923588365316391, + "learning_rate": 6.644763490706153e-05, + "loss": 0.2228, + "step": 34612 + }, + { + "epoch": 2.8040343486714194, + "grad_norm": 0.07375077158212662, + "learning_rate": 6.644313425446689e-05, + "loss": 0.229, + "step": 34613 + }, + { + "epoch": 2.8041153596889177, + "grad_norm": 0.0803399607539177, + "learning_rate": 6.643863360187228e-05, + "loss": 0.2498, + "step": 34614 + }, + { + "epoch": 2.804196370706416, + "grad_norm": 0.0667591392993927, + "learning_rate": 6.643413294927765e-05, + "loss": 0.2154, + "step": 34615 + }, + { + "epoch": 2.8042773817239146, + "grad_norm": 0.07770081609487534, + "learning_rate": 6.642963229668303e-05, + "loss": 0.2353, + "step": 34616 + }, + { + "epoch": 2.804358392741413, + "grad_norm": 0.07570865750312805, + "learning_rate": 6.64251316440884e-05, + "loss": 0.2543, + "step": 34617 + }, + { + "epoch": 2.804439403758911, + "grad_norm": 0.08184947073459625, + "learning_rate": 6.642063099149377e-05, + "loss": 0.212, + "step": 34618 + }, + { + "epoch": 2.80452041477641, + "grad_norm": 0.07105211913585663, + "learning_rate": 6.641613033889915e-05, + "loss": 0.2307, + "step": 34619 + }, + { + "epoch": 2.804601425793908, + "grad_norm": 0.08429677039384842, + "learning_rate": 6.641162968630452e-05, + "loss": 0.2478, + "step": 34620 + }, + { + "epoch": 2.8046824368114063, + "grad_norm": 0.06321967393159866, + "learning_rate": 6.64071290337099e-05, + "loss": 
0.2593, + "step": 34621 + }, + { + "epoch": 2.804763447828905, + "grad_norm": 0.07362980395555496, + "learning_rate": 6.640262838111527e-05, + "loss": 0.2551, + "step": 34622 + }, + { + "epoch": 2.804844458846403, + "grad_norm": 0.053744908422231674, + "learning_rate": 6.639812772852064e-05, + "loss": 0.2146, + "step": 34623 + }, + { + "epoch": 2.8049254698639015, + "grad_norm": 0.05200938135385513, + "learning_rate": 6.639362707592602e-05, + "loss": 0.2074, + "step": 34624 + }, + { + "epoch": 2.8050064808814, + "grad_norm": 0.06889305263757706, + "learning_rate": 6.638912642333139e-05, + "loss": 0.2125, + "step": 34625 + }, + { + "epoch": 2.8050874918988984, + "grad_norm": 0.06506326794624329, + "learning_rate": 6.638462577073676e-05, + "loss": 0.2397, + "step": 34626 + }, + { + "epoch": 2.8051685029163966, + "grad_norm": 0.06494927406311035, + "learning_rate": 6.638012511814214e-05, + "loss": 0.2151, + "step": 34627 + }, + { + "epoch": 2.805249513933895, + "grad_norm": 0.061213236302137375, + "learning_rate": 6.637562446554751e-05, + "loss": 0.2753, + "step": 34628 + }, + { + "epoch": 2.805330524951393, + "grad_norm": 0.06733286380767822, + "learning_rate": 6.637112381295288e-05, + "loss": 0.2663, + "step": 34629 + }, + { + "epoch": 2.805411535968892, + "grad_norm": 0.07056358456611633, + "learning_rate": 6.636662316035826e-05, + "loss": 0.2348, + "step": 34630 + }, + { + "epoch": 2.80549254698639, + "grad_norm": 0.06755927205085754, + "learning_rate": 6.636212250776363e-05, + "loss": 0.2353, + "step": 34631 + }, + { + "epoch": 2.8055735580038883, + "grad_norm": 0.06227118894457817, + "learning_rate": 6.6357621855169e-05, + "loss": 0.2439, + "step": 34632 + }, + { + "epoch": 2.805654569021387, + "grad_norm": 0.07647748291492462, + "learning_rate": 6.635312120257438e-05, + "loss": 0.2632, + "step": 34633 + }, + { + "epoch": 2.8057355800388852, + "grad_norm": 0.05834818631410599, + "learning_rate": 6.634862054997975e-05, + "loss": 0.1897, + "step": 34634 + }, + { + "epoch": 2.8058165910563835, + "grad_norm": 0.06183220446109772, + "learning_rate": 6.634411989738512e-05, + "loss": 0.2031, + "step": 34635 + }, + { + "epoch": 2.805897602073882, + "grad_norm": 0.0634341686964035, + "learning_rate": 6.63396192447905e-05, + "loss": 0.2026, + "step": 34636 + }, + { + "epoch": 2.8059786130913804, + "grad_norm": 0.06907891482114792, + "learning_rate": 6.633511859219587e-05, + "loss": 0.2259, + "step": 34637 + }, + { + "epoch": 2.8060596241088787, + "grad_norm": 0.06217336654663086, + "learning_rate": 6.633061793960125e-05, + "loss": 0.246, + "step": 34638 + }, + { + "epoch": 2.8061406351263773, + "grad_norm": 0.0637386217713356, + "learning_rate": 6.632611728700662e-05, + "loss": 0.2425, + "step": 34639 + }, + { + "epoch": 2.8062216461438756, + "grad_norm": 0.05497181415557861, + "learning_rate": 6.632161663441199e-05, + "loss": 0.2363, + "step": 34640 + }, + { + "epoch": 2.806302657161374, + "grad_norm": 0.06874691694974899, + "learning_rate": 6.631711598181737e-05, + "loss": 0.2528, + "step": 34641 + }, + { + "epoch": 2.8063836681788725, + "grad_norm": 0.060625869780778885, + "learning_rate": 6.631261532922274e-05, + "loss": 0.2129, + "step": 34642 + }, + { + "epoch": 2.8064646791963708, + "grad_norm": 0.06054311618208885, + "learning_rate": 6.630811467662811e-05, + "loss": 0.205, + "step": 34643 + }, + { + "epoch": 2.806545690213869, + "grad_norm": 0.0658852681517601, + "learning_rate": 6.630361402403349e-05, + "loss": 0.2381, + "step": 34644 + }, + { + "epoch": 2.8066267012313677, + 
"grad_norm": 0.07085654884576797, + "learning_rate": 6.629911337143886e-05, + "loss": 0.2305, + "step": 34645 + }, + { + "epoch": 2.806707712248866, + "grad_norm": 0.08350074291229248, + "learning_rate": 6.629461271884423e-05, + "loss": 0.2542, + "step": 34646 + }, + { + "epoch": 2.806788723266364, + "grad_norm": 0.08466752618551254, + "learning_rate": 6.629011206624961e-05, + "loss": 0.2522, + "step": 34647 + }, + { + "epoch": 2.806869734283863, + "grad_norm": 0.0648236945271492, + "learning_rate": 6.628561141365498e-05, + "loss": 0.2413, + "step": 34648 + }, + { + "epoch": 2.806950745301361, + "grad_norm": 0.06670759618282318, + "learning_rate": 6.628111076106036e-05, + "loss": 0.2769, + "step": 34649 + }, + { + "epoch": 2.8070317563188594, + "grad_norm": 0.07005154341459274, + "learning_rate": 6.627661010846573e-05, + "loss": 0.2387, + "step": 34650 + }, + { + "epoch": 2.8071127673363576, + "grad_norm": 0.07206209748983383, + "learning_rate": 6.62721094558711e-05, + "loss": 0.2357, + "step": 34651 + }, + { + "epoch": 2.807193778353856, + "grad_norm": 0.07660338282585144, + "learning_rate": 6.626760880327648e-05, + "loss": 0.2409, + "step": 34652 + }, + { + "epoch": 2.8072747893713546, + "grad_norm": 0.06903325766324997, + "learning_rate": 6.626310815068185e-05, + "loss": 0.249, + "step": 34653 + }, + { + "epoch": 2.807355800388853, + "grad_norm": 0.06659288704395294, + "learning_rate": 6.625860749808722e-05, + "loss": 0.2152, + "step": 34654 + }, + { + "epoch": 2.807436811406351, + "grad_norm": 0.05893072485923767, + "learning_rate": 6.62541068454926e-05, + "loss": 0.2092, + "step": 34655 + }, + { + "epoch": 2.8075178224238497, + "grad_norm": 0.07203570753335953, + "learning_rate": 6.624960619289797e-05, + "loss": 0.2303, + "step": 34656 + }, + { + "epoch": 2.807598833441348, + "grad_norm": 0.06032579019665718, + "learning_rate": 6.624510554030334e-05, + "loss": 0.2224, + "step": 34657 + }, + { + "epoch": 2.807679844458846, + "grad_norm": 0.08079580217599869, + "learning_rate": 6.624060488770872e-05, + "loss": 0.26, + "step": 34658 + }, + { + "epoch": 2.807760855476345, + "grad_norm": 0.07629918307065964, + "learning_rate": 6.623610423511409e-05, + "loss": 0.22, + "step": 34659 + }, + { + "epoch": 2.807841866493843, + "grad_norm": 0.0819818377494812, + "learning_rate": 6.623160358251947e-05, + "loss": 0.2332, + "step": 34660 + }, + { + "epoch": 2.8079228775113414, + "grad_norm": 0.07031166553497314, + "learning_rate": 6.622710292992484e-05, + "loss": 0.2707, + "step": 34661 + }, + { + "epoch": 2.80800388852884, + "grad_norm": 0.07032985240221024, + "learning_rate": 6.622260227733021e-05, + "loss": 0.2258, + "step": 34662 + }, + { + "epoch": 2.8080848995463383, + "grad_norm": 0.07309204339981079, + "learning_rate": 6.621810162473559e-05, + "loss": 0.2405, + "step": 34663 + }, + { + "epoch": 2.8081659105638366, + "grad_norm": 0.06003347039222717, + "learning_rate": 6.621360097214096e-05, + "loss": 0.1924, + "step": 34664 + }, + { + "epoch": 2.8082469215813353, + "grad_norm": 0.08050237596035004, + "learning_rate": 6.620910031954633e-05, + "loss": 0.2514, + "step": 34665 + }, + { + "epoch": 2.8083279325988335, + "grad_norm": 0.06988146901130676, + "learning_rate": 6.620459966695172e-05, + "loss": 0.2691, + "step": 34666 + }, + { + "epoch": 2.8084089436163318, + "grad_norm": 0.06781960278749466, + "learning_rate": 6.620009901435708e-05, + "loss": 0.2264, + "step": 34667 + }, + { + "epoch": 2.8084899546338304, + "grad_norm": 0.07960690557956696, + "learning_rate": 6.619559836176245e-05, + 
"loss": 0.2934, + "step": 34668 + }, + { + "epoch": 2.8085709656513287, + "grad_norm": 0.08437389880418777, + "learning_rate": 6.619109770916784e-05, + "loss": 0.3173, + "step": 34669 + }, + { + "epoch": 2.808651976668827, + "grad_norm": 0.07166004180908203, + "learning_rate": 6.61865970565732e-05, + "loss": 0.2455, + "step": 34670 + }, + { + "epoch": 2.808732987686325, + "grad_norm": 0.0753786712884903, + "learning_rate": 6.618209640397857e-05, + "loss": 0.2823, + "step": 34671 + }, + { + "epoch": 2.808813998703824, + "grad_norm": 0.0854075700044632, + "learning_rate": 6.617759575138396e-05, + "loss": 0.2383, + "step": 34672 + }, + { + "epoch": 2.808895009721322, + "grad_norm": 0.06980396062135696, + "learning_rate": 6.617309509878932e-05, + "loss": 0.2351, + "step": 34673 + }, + { + "epoch": 2.8089760207388204, + "grad_norm": 0.06357509642839432, + "learning_rate": 6.61685944461947e-05, + "loss": 0.2088, + "step": 34674 + }, + { + "epoch": 2.8090570317563186, + "grad_norm": 0.05988110974431038, + "learning_rate": 6.616409379360008e-05, + "loss": 0.2062, + "step": 34675 + }, + { + "epoch": 2.8091380427738173, + "grad_norm": 0.07213082164525986, + "learning_rate": 6.615959314100544e-05, + "loss": 0.2386, + "step": 34676 + }, + { + "epoch": 2.8092190537913155, + "grad_norm": 0.08268000185489655, + "learning_rate": 6.615509248841082e-05, + "loss": 0.2764, + "step": 34677 + }, + { + "epoch": 2.809300064808814, + "grad_norm": 0.05751238018274307, + "learning_rate": 6.61505918358162e-05, + "loss": 0.2063, + "step": 34678 + }, + { + "epoch": 2.8093810758263125, + "grad_norm": 0.0581294521689415, + "learning_rate": 6.614609118322156e-05, + "loss": 0.1949, + "step": 34679 + }, + { + "epoch": 2.8094620868438107, + "grad_norm": 0.07043235749006271, + "learning_rate": 6.614159053062694e-05, + "loss": 0.2478, + "step": 34680 + }, + { + "epoch": 2.809543097861309, + "grad_norm": 0.07290154695510864, + "learning_rate": 6.613708987803232e-05, + "loss": 0.2856, + "step": 34681 + }, + { + "epoch": 2.8096241088788076, + "grad_norm": 0.07745785266160965, + "learning_rate": 6.613258922543768e-05, + "loss": 0.2073, + "step": 34682 + }, + { + "epoch": 2.809705119896306, + "grad_norm": 0.07462266832590103, + "learning_rate": 6.612808857284306e-05, + "loss": 0.2184, + "step": 34683 + }, + { + "epoch": 2.809786130913804, + "grad_norm": 0.0726090669631958, + "learning_rate": 6.612358792024845e-05, + "loss": 0.2314, + "step": 34684 + }, + { + "epoch": 2.809867141931303, + "grad_norm": 0.07967076450586319, + "learning_rate": 6.611908726765382e-05, + "loss": 0.2429, + "step": 34685 + }, + { + "epoch": 2.809948152948801, + "grad_norm": 0.06095170974731445, + "learning_rate": 6.611458661505918e-05, + "loss": 0.2106, + "step": 34686 + }, + { + "epoch": 2.8100291639662993, + "grad_norm": 0.06986818462610245, + "learning_rate": 6.611008596246457e-05, + "loss": 0.2225, + "step": 34687 + }, + { + "epoch": 2.810110174983798, + "grad_norm": 0.06797707825899124, + "learning_rate": 6.610558530986994e-05, + "loss": 0.2099, + "step": 34688 + }, + { + "epoch": 2.8101911860012962, + "grad_norm": 0.06992008537054062, + "learning_rate": 6.61010846572753e-05, + "loss": 0.2615, + "step": 34689 + }, + { + "epoch": 2.8102721970187945, + "grad_norm": 0.0866551622748375, + "learning_rate": 6.609658400468069e-05, + "loss": 0.2792, + "step": 34690 + }, + { + "epoch": 2.810353208036293, + "grad_norm": 0.06725787371397018, + "learning_rate": 6.609208335208606e-05, + "loss": 0.2464, + "step": 34691 + }, + { + "epoch": 2.8104342190537914, + 
"grad_norm": 0.06845443695783615, + "learning_rate": 6.608758269949143e-05, + "loss": 0.239, + "step": 34692 + }, + { + "epoch": 2.8105152300712897, + "grad_norm": 0.07654809206724167, + "learning_rate": 6.608308204689681e-05, + "loss": 0.2886, + "step": 34693 + }, + { + "epoch": 2.810596241088788, + "grad_norm": 0.0735306441783905, + "learning_rate": 6.607858139430218e-05, + "loss": 0.2403, + "step": 34694 + }, + { + "epoch": 2.8106772521062866, + "grad_norm": 0.06867165863513947, + "learning_rate": 6.607408074170755e-05, + "loss": 0.2408, + "step": 34695 + }, + { + "epoch": 2.810758263123785, + "grad_norm": 0.06796415150165558, + "learning_rate": 6.606958008911293e-05, + "loss": 0.2632, + "step": 34696 + }, + { + "epoch": 2.810839274141283, + "grad_norm": 0.07219690829515457, + "learning_rate": 6.60650794365183e-05, + "loss": 0.22, + "step": 34697 + }, + { + "epoch": 2.8109202851587813, + "grad_norm": 0.0721060186624527, + "learning_rate": 6.606057878392368e-05, + "loss": 0.2405, + "step": 34698 + }, + { + "epoch": 2.81100129617628, + "grad_norm": 0.06989874690771103, + "learning_rate": 6.605607813132905e-05, + "loss": 0.2185, + "step": 34699 + }, + { + "epoch": 2.8110823071937783, + "grad_norm": 0.07357072085142136, + "learning_rate": 6.605157747873442e-05, + "loss": 0.2446, + "step": 34700 + }, + { + "epoch": 2.8111633182112765, + "grad_norm": 0.06997592002153397, + "learning_rate": 6.60470768261398e-05, + "loss": 0.2355, + "step": 34701 + }, + { + "epoch": 2.811244329228775, + "grad_norm": 0.06015832722187042, + "learning_rate": 6.604257617354517e-05, + "loss": 0.2095, + "step": 34702 + }, + { + "epoch": 2.8113253402462735, + "grad_norm": 0.06459351629018784, + "learning_rate": 6.603807552095054e-05, + "loss": 0.2295, + "step": 34703 + }, + { + "epoch": 2.8114063512637717, + "grad_norm": 0.06236038729548454, + "learning_rate": 6.603357486835592e-05, + "loss": 0.2424, + "step": 34704 + }, + { + "epoch": 2.8114873622812704, + "grad_norm": 0.0749562606215477, + "learning_rate": 6.602907421576129e-05, + "loss": 0.2212, + "step": 34705 + }, + { + "epoch": 2.8115683732987686, + "grad_norm": 0.06482082605361938, + "learning_rate": 6.602457356316666e-05, + "loss": 0.2116, + "step": 34706 + }, + { + "epoch": 2.811649384316267, + "grad_norm": 0.06248362362384796, + "learning_rate": 6.602007291057204e-05, + "loss": 0.196, + "step": 34707 + }, + { + "epoch": 2.8117303953337656, + "grad_norm": 0.0831056535243988, + "learning_rate": 6.601557225797741e-05, + "loss": 0.2368, + "step": 34708 + }, + { + "epoch": 2.811811406351264, + "grad_norm": 0.07669353485107422, + "learning_rate": 6.601107160538279e-05, + "loss": 0.2473, + "step": 34709 + }, + { + "epoch": 2.811892417368762, + "grad_norm": 0.06788359582424164, + "learning_rate": 6.600657095278816e-05, + "loss": 0.2364, + "step": 34710 + }, + { + "epoch": 2.8119734283862607, + "grad_norm": 0.07720784842967987, + "learning_rate": 6.600207030019353e-05, + "loss": 0.255, + "step": 34711 + }, + { + "epoch": 2.812054439403759, + "grad_norm": 0.07028775662183762, + "learning_rate": 6.59975696475989e-05, + "loss": 0.2154, + "step": 34712 + }, + { + "epoch": 2.8121354504212572, + "grad_norm": 0.07846012711524963, + "learning_rate": 6.599306899500428e-05, + "loss": 0.2551, + "step": 34713 + }, + { + "epoch": 2.812216461438756, + "grad_norm": 0.05860767886042595, + "learning_rate": 6.598856834240965e-05, + "loss": 0.2109, + "step": 34714 + }, + { + "epoch": 2.812297472456254, + "grad_norm": 0.06866607069969177, + "learning_rate": 6.598406768981503e-05, + 
"loss": 0.2356, + "step": 34715 + }, + { + "epoch": 2.8123784834737524, + "grad_norm": 0.0720444768667221, + "learning_rate": 6.59795670372204e-05, + "loss": 0.2537, + "step": 34716 + }, + { + "epoch": 2.8124594944912507, + "grad_norm": 0.07664826512336731, + "learning_rate": 6.597506638462577e-05, + "loss": 0.2701, + "step": 34717 + }, + { + "epoch": 2.8125405055087493, + "grad_norm": 0.06196041405200958, + "learning_rate": 6.597056573203115e-05, + "loss": 0.2354, + "step": 34718 + }, + { + "epoch": 2.8126215165262476, + "grad_norm": 0.07691022008657455, + "learning_rate": 6.596606507943652e-05, + "loss": 0.2415, + "step": 34719 + }, + { + "epoch": 2.812702527543746, + "grad_norm": 0.05945818871259689, + "learning_rate": 6.59615644268419e-05, + "loss": 0.2418, + "step": 34720 + }, + { + "epoch": 2.812783538561244, + "grad_norm": 0.06513824313879013, + "learning_rate": 6.595706377424727e-05, + "loss": 0.2605, + "step": 34721 + }, + { + "epoch": 2.8128645495787428, + "grad_norm": 0.08122964203357697, + "learning_rate": 6.595256312165264e-05, + "loss": 0.2643, + "step": 34722 + }, + { + "epoch": 2.812945560596241, + "grad_norm": 0.06063034385442734, + "learning_rate": 6.594806246905802e-05, + "loss": 0.2454, + "step": 34723 + }, + { + "epoch": 2.8130265716137393, + "grad_norm": 0.059998977929353714, + "learning_rate": 6.594356181646339e-05, + "loss": 0.2002, + "step": 34724 + }, + { + "epoch": 2.813107582631238, + "grad_norm": 0.07546351104974747, + "learning_rate": 6.593906116386876e-05, + "loss": 0.2441, + "step": 34725 + }, + { + "epoch": 2.813188593648736, + "grad_norm": 0.07578836381435394, + "learning_rate": 6.593456051127414e-05, + "loss": 0.2553, + "step": 34726 + }, + { + "epoch": 2.8132696046662344, + "grad_norm": 0.07513416558504105, + "learning_rate": 6.593005985867951e-05, + "loss": 0.2314, + "step": 34727 + }, + { + "epoch": 2.813350615683733, + "grad_norm": 0.07094703614711761, + "learning_rate": 6.592555920608488e-05, + "loss": 0.2104, + "step": 34728 + }, + { + "epoch": 2.8134316267012314, + "grad_norm": 0.06131584197282791, + "learning_rate": 6.592105855349026e-05, + "loss": 0.2389, + "step": 34729 + }, + { + "epoch": 2.8135126377187296, + "grad_norm": 0.06059715896844864, + "learning_rate": 6.591655790089563e-05, + "loss": 0.2167, + "step": 34730 + }, + { + "epoch": 2.8135936487362283, + "grad_norm": 0.08900104463100433, + "learning_rate": 6.5912057248301e-05, + "loss": 0.2172, + "step": 34731 + }, + { + "epoch": 2.8136746597537265, + "grad_norm": 0.06046757847070694, + "learning_rate": 6.590755659570638e-05, + "loss": 0.2062, + "step": 34732 + }, + { + "epoch": 2.813755670771225, + "grad_norm": 0.06872811168432236, + "learning_rate": 6.590305594311175e-05, + "loss": 0.2142, + "step": 34733 + }, + { + "epoch": 2.8138366817887235, + "grad_norm": 0.07165510952472687, + "learning_rate": 6.589855529051713e-05, + "loss": 0.2444, + "step": 34734 + }, + { + "epoch": 2.8139176928062217, + "grad_norm": 0.06559931486845016, + "learning_rate": 6.58940546379225e-05, + "loss": 0.2244, + "step": 34735 + }, + { + "epoch": 2.81399870382372, + "grad_norm": 0.061313237994909286, + "learning_rate": 6.588955398532787e-05, + "loss": 0.2403, + "step": 34736 + }, + { + "epoch": 2.8140797148412187, + "grad_norm": 0.07175567001104355, + "learning_rate": 6.588505333273325e-05, + "loss": 0.2025, + "step": 34737 + }, + { + "epoch": 2.814160725858717, + "grad_norm": 0.07134431600570679, + "learning_rate": 6.588055268013862e-05, + "loss": 0.227, + "step": 34738 + }, + { + "epoch": 2.814241736876215, + 
"grad_norm": 0.07444427162408829, + "learning_rate": 6.5876052027544e-05, + "loss": 0.2227, + "step": 34739 + }, + { + "epoch": 2.8143227478937134, + "grad_norm": 0.0755746141076088, + "learning_rate": 6.587155137494937e-05, + "loss": 0.2617, + "step": 34740 + }, + { + "epoch": 2.814403758911212, + "grad_norm": 0.07173725217580795, + "learning_rate": 6.586705072235474e-05, + "loss": 0.2281, + "step": 34741 + }, + { + "epoch": 2.8144847699287103, + "grad_norm": 0.08544918894767761, + "learning_rate": 6.586255006976011e-05, + "loss": 0.2604, + "step": 34742 + }, + { + "epoch": 2.8145657809462086, + "grad_norm": 0.08578377962112427, + "learning_rate": 6.585804941716549e-05, + "loss": 0.2231, + "step": 34743 + }, + { + "epoch": 2.814646791963707, + "grad_norm": 0.04919799044728279, + "learning_rate": 6.585354876457086e-05, + "loss": 0.2155, + "step": 34744 + }, + { + "epoch": 2.8147278029812055, + "grad_norm": 0.07585250586271286, + "learning_rate": 6.584904811197623e-05, + "loss": 0.2356, + "step": 34745 + }, + { + "epoch": 2.8148088139987038, + "grad_norm": 0.08482617139816284, + "learning_rate": 6.584454745938161e-05, + "loss": 0.2677, + "step": 34746 + }, + { + "epoch": 2.814889825016202, + "grad_norm": 0.08311016857624054, + "learning_rate": 6.5840046806787e-05, + "loss": 0.2143, + "step": 34747 + }, + { + "epoch": 2.8149708360337007, + "grad_norm": 0.07984510809183121, + "learning_rate": 6.583554615419236e-05, + "loss": 0.287, + "step": 34748 + }, + { + "epoch": 2.815051847051199, + "grad_norm": 0.058721333742141724, + "learning_rate": 6.583104550159773e-05, + "loss": 0.2335, + "step": 34749 + }, + { + "epoch": 2.815132858068697, + "grad_norm": 0.07456637173891068, + "learning_rate": 6.582654484900312e-05, + "loss": 0.2607, + "step": 34750 + }, + { + "epoch": 2.815213869086196, + "grad_norm": 0.06956981122493744, + "learning_rate": 6.582204419640848e-05, + "loss": 0.2164, + "step": 34751 + }, + { + "epoch": 2.815294880103694, + "grad_norm": 0.05485118553042412, + "learning_rate": 6.581754354381385e-05, + "loss": 0.2154, + "step": 34752 + }, + { + "epoch": 2.8153758911211924, + "grad_norm": 0.0642561987042427, + "learning_rate": 6.581304289121924e-05, + "loss": 0.2323, + "step": 34753 + }, + { + "epoch": 2.815456902138691, + "grad_norm": 0.08512634038925171, + "learning_rate": 6.580854223862461e-05, + "loss": 0.2392, + "step": 34754 + }, + { + "epoch": 2.8155379131561893, + "grad_norm": 0.07992500811815262, + "learning_rate": 6.580404158602997e-05, + "loss": 0.2507, + "step": 34755 + }, + { + "epoch": 2.8156189241736875, + "grad_norm": 0.06383884698152542, + "learning_rate": 6.579954093343536e-05, + "loss": 0.2258, + "step": 34756 + }, + { + "epoch": 2.815699935191186, + "grad_norm": 0.07852292060852051, + "learning_rate": 6.579504028084073e-05, + "loss": 0.2868, + "step": 34757 + }, + { + "epoch": 2.8157809462086845, + "grad_norm": 0.06256461888551712, + "learning_rate": 6.579053962824609e-05, + "loss": 0.2594, + "step": 34758 + }, + { + "epoch": 2.8158619572261827, + "grad_norm": 0.06567879021167755, + "learning_rate": 6.578603897565148e-05, + "loss": 0.241, + "step": 34759 + }, + { + "epoch": 2.8159429682436814, + "grad_norm": 0.08263526856899261, + "learning_rate": 6.578153832305685e-05, + "loss": 0.2563, + "step": 34760 + }, + { + "epoch": 2.8160239792611796, + "grad_norm": 0.05803782120347023, + "learning_rate": 6.577703767046221e-05, + "loss": 0.2702, + "step": 34761 + }, + { + "epoch": 2.816104990278678, + "grad_norm": 0.06257728487253189, + "learning_rate": 6.57725370178676e-05, 
+ "loss": 0.2379, + "step": 34762 + }, + { + "epoch": 2.816186001296176, + "grad_norm": 0.0719677284359932, + "learning_rate": 6.576803636527297e-05, + "loss": 0.2398, + "step": 34763 + }, + { + "epoch": 2.816267012313675, + "grad_norm": 0.06265591830015182, + "learning_rate": 6.576353571267833e-05, + "loss": 0.2097, + "step": 34764 + }, + { + "epoch": 2.816348023331173, + "grad_norm": 0.06853341311216354, + "learning_rate": 6.575903506008372e-05, + "loss": 0.2298, + "step": 34765 + }, + { + "epoch": 2.8164290343486713, + "grad_norm": 0.07417232543230057, + "learning_rate": 6.57545344074891e-05, + "loss": 0.2361, + "step": 34766 + }, + { + "epoch": 2.8165100453661696, + "grad_norm": 0.06885260343551636, + "learning_rate": 6.575003375489445e-05, + "loss": 0.2339, + "step": 34767 + }, + { + "epoch": 2.8165910563836682, + "grad_norm": 0.07457751035690308, + "learning_rate": 6.574553310229984e-05, + "loss": 0.2434, + "step": 34768 + }, + { + "epoch": 2.8166720674011665, + "grad_norm": 0.0838153213262558, + "learning_rate": 6.574103244970521e-05, + "loss": 0.2571, + "step": 34769 + }, + { + "epoch": 2.8167530784186647, + "grad_norm": 0.0747385174036026, + "learning_rate": 6.573653179711057e-05, + "loss": 0.2562, + "step": 34770 + }, + { + "epoch": 2.8168340894361634, + "grad_norm": 0.06462652236223221, + "learning_rate": 6.573203114451596e-05, + "loss": 0.2311, + "step": 34771 + }, + { + "epoch": 2.8169151004536617, + "grad_norm": 0.06499453634023666, + "learning_rate": 6.572753049192134e-05, + "loss": 0.2292, + "step": 34772 + }, + { + "epoch": 2.81699611147116, + "grad_norm": 0.07763504236936569, + "learning_rate": 6.572302983932671e-05, + "loss": 0.275, + "step": 34773 + }, + { + "epoch": 2.8170771224886586, + "grad_norm": 0.07104837894439697, + "learning_rate": 6.571852918673208e-05, + "loss": 0.2451, + "step": 34774 + }, + { + "epoch": 2.817158133506157, + "grad_norm": 0.06204180419445038, + "learning_rate": 6.571402853413746e-05, + "loss": 0.2286, + "step": 34775 + }, + { + "epoch": 2.817239144523655, + "grad_norm": 0.060844436287879944, + "learning_rate": 6.570952788154283e-05, + "loss": 0.2389, + "step": 34776 + }, + { + "epoch": 2.817320155541154, + "grad_norm": 0.07024809718132019, + "learning_rate": 6.57050272289482e-05, + "loss": 0.2467, + "step": 34777 + }, + { + "epoch": 2.817401166558652, + "grad_norm": 0.06422761082649231, + "learning_rate": 6.570052657635358e-05, + "loss": 0.231, + "step": 34778 + }, + { + "epoch": 2.8174821775761503, + "grad_norm": 0.066131092607975, + "learning_rate": 6.569602592375895e-05, + "loss": 0.2645, + "step": 34779 + }, + { + "epoch": 2.817563188593649, + "grad_norm": 0.08315324038267136, + "learning_rate": 6.569152527116432e-05, + "loss": 0.2272, + "step": 34780 + }, + { + "epoch": 2.817644199611147, + "grad_norm": 0.07713814824819565, + "learning_rate": 6.56870246185697e-05, + "loss": 0.2293, + "step": 34781 + }, + { + "epoch": 2.8177252106286454, + "grad_norm": 0.08167970180511475, + "learning_rate": 6.568252396597507e-05, + "loss": 0.2669, + "step": 34782 + }, + { + "epoch": 2.817806221646144, + "grad_norm": 0.060405585914850235, + "learning_rate": 6.567802331338045e-05, + "loss": 0.2059, + "step": 34783 + }, + { + "epoch": 2.8178872326636424, + "grad_norm": 0.05984266847372055, + "learning_rate": 6.567352266078582e-05, + "loss": 0.2416, + "step": 34784 + }, + { + "epoch": 2.8179682436811406, + "grad_norm": 0.0785268023610115, + "learning_rate": 6.566902200819119e-05, + "loss": 0.2622, + "step": 34785 + }, + { + "epoch": 2.818049254698639, + 
"grad_norm": 0.06491284817457199, + "learning_rate": 6.566452135559657e-05, + "loss": 0.2012, + "step": 34786 + }, + { + "epoch": 2.818130265716137, + "grad_norm": 0.07182303071022034, + "learning_rate": 6.566002070300194e-05, + "loss": 0.1987, + "step": 34787 + }, + { + "epoch": 2.818211276733636, + "grad_norm": 0.06826022267341614, + "learning_rate": 6.565552005040731e-05, + "loss": 0.2458, + "step": 34788 + }, + { + "epoch": 2.818292287751134, + "grad_norm": 0.07355841994285583, + "learning_rate": 6.565101939781269e-05, + "loss": 0.2204, + "step": 34789 + }, + { + "epoch": 2.8183732987686323, + "grad_norm": 0.07428918778896332, + "learning_rate": 6.564651874521806e-05, + "loss": 0.2811, + "step": 34790 + }, + { + "epoch": 2.818454309786131, + "grad_norm": 0.06705375015735626, + "learning_rate": 6.564201809262343e-05, + "loss": 0.2766, + "step": 34791 + }, + { + "epoch": 2.8185353208036292, + "grad_norm": 0.07119186222553253, + "learning_rate": 6.563751744002881e-05, + "loss": 0.2485, + "step": 34792 + }, + { + "epoch": 2.8186163318211275, + "grad_norm": 0.06527364999055862, + "learning_rate": 6.563301678743418e-05, + "loss": 0.1968, + "step": 34793 + }, + { + "epoch": 2.818697342838626, + "grad_norm": 0.06532592326402664, + "learning_rate": 6.562851613483956e-05, + "loss": 0.2672, + "step": 34794 + }, + { + "epoch": 2.8187783538561244, + "grad_norm": 0.06652968376874924, + "learning_rate": 6.562401548224493e-05, + "loss": 0.221, + "step": 34795 + }, + { + "epoch": 2.8188593648736227, + "grad_norm": 0.07571815699338913, + "learning_rate": 6.56195148296503e-05, + "loss": 0.2233, + "step": 34796 + }, + { + "epoch": 2.8189403758911213, + "grad_norm": 0.07855816185474396, + "learning_rate": 6.561501417705568e-05, + "loss": 0.253, + "step": 34797 + }, + { + "epoch": 2.8190213869086196, + "grad_norm": 0.07121338695287704, + "learning_rate": 6.561051352446105e-05, + "loss": 0.2578, + "step": 34798 + }, + { + "epoch": 2.819102397926118, + "grad_norm": 0.06593113392591476, + "learning_rate": 6.560601287186642e-05, + "loss": 0.212, + "step": 34799 + }, + { + "epoch": 2.8191834089436165, + "grad_norm": 0.07159001380205154, + "learning_rate": 6.56015122192718e-05, + "loss": 0.2365, + "step": 34800 + }, + { + "epoch": 2.8192644199611148, + "grad_norm": 0.06383642554283142, + "learning_rate": 6.559701156667717e-05, + "loss": 0.1856, + "step": 34801 + }, + { + "epoch": 2.819345430978613, + "grad_norm": 0.06708572059869766, + "learning_rate": 6.559251091408254e-05, + "loss": 0.2248, + "step": 34802 + }, + { + "epoch": 2.8194264419961117, + "grad_norm": 0.06649935245513916, + "learning_rate": 6.558801026148792e-05, + "loss": 0.2095, + "step": 34803 + }, + { + "epoch": 2.81950745301361, + "grad_norm": 0.06844470649957657, + "learning_rate": 6.558350960889329e-05, + "loss": 0.2459, + "step": 34804 + }, + { + "epoch": 2.819588464031108, + "grad_norm": 0.06876395642757416, + "learning_rate": 6.557900895629866e-05, + "loss": 0.2228, + "step": 34805 + }, + { + "epoch": 2.819669475048607, + "grad_norm": 0.07152493298053741, + "learning_rate": 6.557450830370404e-05, + "loss": 0.2422, + "step": 34806 + }, + { + "epoch": 2.819750486066105, + "grad_norm": 0.075772725045681, + "learning_rate": 6.557000765110941e-05, + "loss": 0.2311, + "step": 34807 + }, + { + "epoch": 2.8198314970836034, + "grad_norm": 0.05591581016778946, + "learning_rate": 6.556550699851479e-05, + "loss": 0.2257, + "step": 34808 + }, + { + "epoch": 2.8199125081011016, + "grad_norm": 0.0721067413687706, + "learning_rate": 6.556100634592016e-05, + 
"loss": 0.2546, + "step": 34809 + }, + { + "epoch": 2.8199935191186, + "grad_norm": 0.04921577125787735, + "learning_rate": 6.555650569332553e-05, + "loss": 0.1953, + "step": 34810 + }, + { + "epoch": 2.8200745301360985, + "grad_norm": 0.07357095181941986, + "learning_rate": 6.55520050407309e-05, + "loss": 0.2555, + "step": 34811 + }, + { + "epoch": 2.820155541153597, + "grad_norm": 0.08259088546037674, + "learning_rate": 6.554750438813628e-05, + "loss": 0.2466, + "step": 34812 + }, + { + "epoch": 2.820236552171095, + "grad_norm": 0.07193511724472046, + "learning_rate": 6.554300373554165e-05, + "loss": 0.2403, + "step": 34813 + }, + { + "epoch": 2.8203175631885937, + "grad_norm": 0.0642080307006836, + "learning_rate": 6.553850308294703e-05, + "loss": 0.2419, + "step": 34814 + }, + { + "epoch": 2.820398574206092, + "grad_norm": 0.10085230320692062, + "learning_rate": 6.55340024303524e-05, + "loss": 0.2357, + "step": 34815 + }, + { + "epoch": 2.82047958522359, + "grad_norm": 0.07006397098302841, + "learning_rate": 6.552950177775777e-05, + "loss": 0.2743, + "step": 34816 + }, + { + "epoch": 2.820560596241089, + "grad_norm": 0.057857707142829895, + "learning_rate": 6.552500112516315e-05, + "loss": 0.2454, + "step": 34817 + }, + { + "epoch": 2.820641607258587, + "grad_norm": 0.06146040931344032, + "learning_rate": 6.552050047256852e-05, + "loss": 0.1922, + "step": 34818 + }, + { + "epoch": 2.8207226182760854, + "grad_norm": 0.09001678228378296, + "learning_rate": 6.55159998199739e-05, + "loss": 0.2536, + "step": 34819 + }, + { + "epoch": 2.820803629293584, + "grad_norm": 0.07994525879621506, + "learning_rate": 6.551149916737928e-05, + "loss": 0.2718, + "step": 34820 + }, + { + "epoch": 2.8208846403110823, + "grad_norm": 0.059684574604034424, + "learning_rate": 6.550699851478464e-05, + "loss": 0.2079, + "step": 34821 + }, + { + "epoch": 2.8209656513285806, + "grad_norm": 0.06576598435640335, + "learning_rate": 6.550249786219002e-05, + "loss": 0.2614, + "step": 34822 + }, + { + "epoch": 2.8210466623460793, + "grad_norm": 0.055970799177885056, + "learning_rate": 6.54979972095954e-05, + "loss": 0.186, + "step": 34823 + }, + { + "epoch": 2.8211276733635775, + "grad_norm": 0.06768764555454254, + "learning_rate": 6.549349655700076e-05, + "loss": 0.2597, + "step": 34824 + }, + { + "epoch": 2.8212086843810757, + "grad_norm": 0.07927141338586807, + "learning_rate": 6.548899590440615e-05, + "loss": 0.2573, + "step": 34825 + }, + { + "epoch": 2.8212896953985744, + "grad_norm": 0.07906976342201233, + "learning_rate": 6.548449525181152e-05, + "loss": 0.2302, + "step": 34826 + }, + { + "epoch": 2.8213707064160727, + "grad_norm": 0.09384344518184662, + "learning_rate": 6.547999459921688e-05, + "loss": 0.2324, + "step": 34827 + }, + { + "epoch": 2.821451717433571, + "grad_norm": 0.06932132691144943, + "learning_rate": 6.547549394662227e-05, + "loss": 0.23, + "step": 34828 + }, + { + "epoch": 2.8215327284510696, + "grad_norm": 0.06174250319600105, + "learning_rate": 6.547099329402764e-05, + "loss": 0.227, + "step": 34829 + }, + { + "epoch": 2.821613739468568, + "grad_norm": 0.06949333846569061, + "learning_rate": 6.5466492641433e-05, + "loss": 0.2362, + "step": 34830 + }, + { + "epoch": 2.821694750486066, + "grad_norm": 0.0753270760178566, + "learning_rate": 6.546199198883839e-05, + "loss": 0.2455, + "step": 34831 + }, + { + "epoch": 2.8217757615035644, + "grad_norm": 0.07068932056427002, + "learning_rate": 6.545749133624377e-05, + "loss": 0.2138, + "step": 34832 + }, + { + "epoch": 2.8218567725210626, + 
"grad_norm": 0.059605516493320465, + "learning_rate": 6.545299068364913e-05, + "loss": 0.2259, + "step": 34833 + }, + { + "epoch": 2.8219377835385613, + "grad_norm": 0.07835594564676285, + "learning_rate": 6.544849003105451e-05, + "loss": 0.2409, + "step": 34834 + }, + { + "epoch": 2.8220187945560595, + "grad_norm": 0.058763157576322556, + "learning_rate": 6.544398937845989e-05, + "loss": 0.1978, + "step": 34835 + }, + { + "epoch": 2.8220998055735578, + "grad_norm": 0.06683146208524704, + "learning_rate": 6.543948872586525e-05, + "loss": 0.2535, + "step": 34836 + }, + { + "epoch": 2.8221808165910565, + "grad_norm": 0.07187090069055557, + "learning_rate": 6.543498807327063e-05, + "loss": 0.2472, + "step": 34837 + }, + { + "epoch": 2.8222618276085547, + "grad_norm": 0.06906905770301819, + "learning_rate": 6.543048742067601e-05, + "loss": 0.2251, + "step": 34838 + }, + { + "epoch": 2.822342838626053, + "grad_norm": 0.07675648480653763, + "learning_rate": 6.542598676808137e-05, + "loss": 0.2314, + "step": 34839 + }, + { + "epoch": 2.8224238496435516, + "grad_norm": 0.06668443977832794, + "learning_rate": 6.542148611548675e-05, + "loss": 0.2838, + "step": 34840 + }, + { + "epoch": 2.82250486066105, + "grad_norm": 0.06730206310749054, + "learning_rate": 6.541698546289213e-05, + "loss": 0.2056, + "step": 34841 + }, + { + "epoch": 2.822585871678548, + "grad_norm": 0.061029449105262756, + "learning_rate": 6.541248481029749e-05, + "loss": 0.2379, + "step": 34842 + }, + { + "epoch": 2.822666882696047, + "grad_norm": 0.07811133563518524, + "learning_rate": 6.540798415770288e-05, + "loss": 0.2162, + "step": 34843 + }, + { + "epoch": 2.822747893713545, + "grad_norm": 0.06834142655134201, + "learning_rate": 6.540348350510825e-05, + "loss": 0.2365, + "step": 34844 + }, + { + "epoch": 2.8228289047310433, + "grad_norm": 0.07567695528268814, + "learning_rate": 6.539898285251361e-05, + "loss": 0.2561, + "step": 34845 + }, + { + "epoch": 2.822909915748542, + "grad_norm": 0.07392167299985886, + "learning_rate": 6.5394482199919e-05, + "loss": 0.2387, + "step": 34846 + }, + { + "epoch": 2.8229909267660402, + "grad_norm": 0.07837437838315964, + "learning_rate": 6.538998154732437e-05, + "loss": 0.2296, + "step": 34847 + }, + { + "epoch": 2.8230719377835385, + "grad_norm": 0.06737375259399414, + "learning_rate": 6.538548089472973e-05, + "loss": 0.2398, + "step": 34848 + }, + { + "epoch": 2.823152948801037, + "grad_norm": 0.05513811856508255, + "learning_rate": 6.538098024213512e-05, + "loss": 0.2324, + "step": 34849 + }, + { + "epoch": 2.8232339598185354, + "grad_norm": 0.07730121910572052, + "learning_rate": 6.537647958954049e-05, + "loss": 0.2797, + "step": 34850 + }, + { + "epoch": 2.8233149708360337, + "grad_norm": 0.06672193855047226, + "learning_rate": 6.537197893694586e-05, + "loss": 0.2119, + "step": 34851 + }, + { + "epoch": 2.8233959818535324, + "grad_norm": 0.054995715618133545, + "learning_rate": 6.536747828435124e-05, + "loss": 0.2336, + "step": 34852 + }, + { + "epoch": 2.8234769928710306, + "grad_norm": 0.07880965620279312, + "learning_rate": 6.536297763175661e-05, + "loss": 0.2678, + "step": 34853 + }, + { + "epoch": 2.823558003888529, + "grad_norm": 0.0686798021197319, + "learning_rate": 6.535847697916198e-05, + "loss": 0.2092, + "step": 34854 + }, + { + "epoch": 2.823639014906027, + "grad_norm": 0.06438298523426056, + "learning_rate": 6.535397632656736e-05, + "loss": 0.223, + "step": 34855 + }, + { + "epoch": 2.8237200259235253, + "grad_norm": 0.07243643701076508, + "learning_rate": 
6.534947567397273e-05, + "loss": 0.2362, + "step": 34856 + }, + { + "epoch": 2.823801036941024, + "grad_norm": 0.06763161718845367, + "learning_rate": 6.53449750213781e-05, + "loss": 0.2336, + "step": 34857 + }, + { + "epoch": 2.8238820479585223, + "grad_norm": 0.07330334186553955, + "learning_rate": 6.534047436878348e-05, + "loss": 0.2797, + "step": 34858 + }, + { + "epoch": 2.8239630589760205, + "grad_norm": 0.06176656112074852, + "learning_rate": 6.533597371618885e-05, + "loss": 0.233, + "step": 34859 + }, + { + "epoch": 2.824044069993519, + "grad_norm": 0.05908965319395065, + "learning_rate": 6.533147306359423e-05, + "loss": 0.2061, + "step": 34860 + }, + { + "epoch": 2.8241250810110174, + "grad_norm": 0.08021603524684906, + "learning_rate": 6.53269724109996e-05, + "loss": 0.2717, + "step": 34861 + }, + { + "epoch": 2.8242060920285157, + "grad_norm": 0.0696469247341156, + "learning_rate": 6.532247175840497e-05, + "loss": 0.2386, + "step": 34862 + }, + { + "epoch": 2.8242871030460144, + "grad_norm": 0.06569056212902069, + "learning_rate": 6.531797110581035e-05, + "loss": 0.2837, + "step": 34863 + }, + { + "epoch": 2.8243681140635126, + "grad_norm": 0.06969066709280014, + "learning_rate": 6.531347045321572e-05, + "loss": 0.2276, + "step": 34864 + }, + { + "epoch": 2.824449125081011, + "grad_norm": 0.0799136832356453, + "learning_rate": 6.53089698006211e-05, + "loss": 0.2332, + "step": 34865 + }, + { + "epoch": 2.8245301360985096, + "grad_norm": 0.06439350545406342, + "learning_rate": 6.530446914802647e-05, + "loss": 0.2407, + "step": 34866 + }, + { + "epoch": 2.824611147116008, + "grad_norm": 0.07675740867853165, + "learning_rate": 6.529996849543184e-05, + "loss": 0.1951, + "step": 34867 + }, + { + "epoch": 2.824692158133506, + "grad_norm": 0.061667781323194504, + "learning_rate": 6.529546784283722e-05, + "loss": 0.254, + "step": 34868 + }, + { + "epoch": 2.8247731691510047, + "grad_norm": 0.06682218611240387, + "learning_rate": 6.529096719024259e-05, + "loss": 0.2163, + "step": 34869 + }, + { + "epoch": 2.824854180168503, + "grad_norm": 0.05522351711988449, + "learning_rate": 6.528646653764796e-05, + "loss": 0.219, + "step": 34870 + }, + { + "epoch": 2.8249351911860012, + "grad_norm": 0.09306511282920837, + "learning_rate": 6.528196588505334e-05, + "loss": 0.2794, + "step": 34871 + }, + { + "epoch": 2.8250162022035, + "grad_norm": 0.06492568552494049, + "learning_rate": 6.527746523245871e-05, + "loss": 0.2203, + "step": 34872 + }, + { + "epoch": 2.825097213220998, + "grad_norm": 0.07955403625965118, + "learning_rate": 6.527296457986408e-05, + "loss": 0.2162, + "step": 34873 + }, + { + "epoch": 2.8251782242384964, + "grad_norm": 0.0795619934797287, + "learning_rate": 6.526846392726946e-05, + "loss": 0.2759, + "step": 34874 + }, + { + "epoch": 2.8252592352559946, + "grad_norm": 0.06763854622840881, + "learning_rate": 6.526396327467483e-05, + "loss": 0.2137, + "step": 34875 + }, + { + "epoch": 2.8253402462734933, + "grad_norm": 0.06950404495000839, + "learning_rate": 6.52594626220802e-05, + "loss": 0.2366, + "step": 34876 + }, + { + "epoch": 2.8254212572909916, + "grad_norm": 0.06068060174584389, + "learning_rate": 6.525496196948558e-05, + "loss": 0.2257, + "step": 34877 + }, + { + "epoch": 2.82550226830849, + "grad_norm": 0.07817313075065613, + "learning_rate": 6.525046131689095e-05, + "loss": 0.2418, + "step": 34878 + }, + { + "epoch": 2.825583279325988, + "grad_norm": 0.07102708518505096, + "learning_rate": 6.524596066429632e-05, + "loss": 0.2426, + "step": 34879 + }, + { + "epoch": 
2.8256642903434868, + "grad_norm": 0.07444391399621964, + "learning_rate": 6.52414600117017e-05, + "loss": 0.2446, + "step": 34880 + }, + { + "epoch": 2.825745301360985, + "grad_norm": 0.06354103982448578, + "learning_rate": 6.523695935910707e-05, + "loss": 0.2356, + "step": 34881 + }, + { + "epoch": 2.8258263123784833, + "grad_norm": 0.06879103928804398, + "learning_rate": 6.523245870651245e-05, + "loss": 0.2282, + "step": 34882 + }, + { + "epoch": 2.825907323395982, + "grad_norm": 0.06873899698257446, + "learning_rate": 6.522795805391782e-05, + "loss": 0.2419, + "step": 34883 + }, + { + "epoch": 2.82598833441348, + "grad_norm": 0.0864935889840126, + "learning_rate": 6.522345740132319e-05, + "loss": 0.2754, + "step": 34884 + }, + { + "epoch": 2.8260693454309784, + "grad_norm": 0.05859457328915596, + "learning_rate": 6.521895674872857e-05, + "loss": 0.2396, + "step": 34885 + }, + { + "epoch": 2.826150356448477, + "grad_norm": 0.08526481688022614, + "learning_rate": 6.521445609613394e-05, + "loss": 0.2512, + "step": 34886 + }, + { + "epoch": 2.8262313674659754, + "grad_norm": 0.07040493935346603, + "learning_rate": 6.520995544353931e-05, + "loss": 0.2025, + "step": 34887 + }, + { + "epoch": 2.8263123784834736, + "grad_norm": 0.07555270940065384, + "learning_rate": 6.520545479094469e-05, + "loss": 0.2448, + "step": 34888 + }, + { + "epoch": 2.8263933895009723, + "grad_norm": 0.06401999294757843, + "learning_rate": 6.520095413835007e-05, + "loss": 0.218, + "step": 34889 + }, + { + "epoch": 2.8264744005184705, + "grad_norm": 0.07106225192546844, + "learning_rate": 6.519645348575543e-05, + "loss": 0.2403, + "step": 34890 + }, + { + "epoch": 2.826555411535969, + "grad_norm": 0.06419890373945236, + "learning_rate": 6.519195283316081e-05, + "loss": 0.2198, + "step": 34891 + }, + { + "epoch": 2.8266364225534675, + "grad_norm": 0.08662693202495575, + "learning_rate": 6.51874521805662e-05, + "loss": 0.2662, + "step": 34892 + }, + { + "epoch": 2.8267174335709657, + "grad_norm": 0.06219939887523651, + "learning_rate": 6.518295152797156e-05, + "loss": 0.2195, + "step": 34893 + }, + { + "epoch": 2.826798444588464, + "grad_norm": 0.06532366573810577, + "learning_rate": 6.517845087537693e-05, + "loss": 0.2113, + "step": 34894 + }, + { + "epoch": 2.8268794556059627, + "grad_norm": 0.06734823435544968, + "learning_rate": 6.517395022278232e-05, + "loss": 0.2382, + "step": 34895 + }, + { + "epoch": 2.826960466623461, + "grad_norm": 0.0640251487493515, + "learning_rate": 6.516944957018768e-05, + "loss": 0.2352, + "step": 34896 + }, + { + "epoch": 2.827041477640959, + "grad_norm": 0.08535750955343246, + "learning_rate": 6.516494891759305e-05, + "loss": 0.2394, + "step": 34897 + }, + { + "epoch": 2.8271224886584574, + "grad_norm": 0.06400448828935623, + "learning_rate": 6.516044826499844e-05, + "loss": 0.2099, + "step": 34898 + }, + { + "epoch": 2.827203499675956, + "grad_norm": 0.06465429067611694, + "learning_rate": 6.51559476124038e-05, + "loss": 0.2472, + "step": 34899 + }, + { + "epoch": 2.8272845106934543, + "grad_norm": 0.06246986612677574, + "learning_rate": 6.515144695980917e-05, + "loss": 0.2248, + "step": 34900 + }, + { + "epoch": 2.8273655217109526, + "grad_norm": 0.060044556856155396, + "learning_rate": 6.514694630721456e-05, + "loss": 0.2189, + "step": 34901 + }, + { + "epoch": 2.827446532728451, + "grad_norm": 0.07407305389642715, + "learning_rate": 6.514244565461992e-05, + "loss": 0.2497, + "step": 34902 + }, + { + "epoch": 2.8275275437459495, + "grad_norm": 0.08100856095552444, + "learning_rate": 
6.513794500202529e-05, + "loss": 0.202, + "step": 34903 + }, + { + "epoch": 2.8276085547634477, + "grad_norm": 0.05908948928117752, + "learning_rate": 6.513344434943068e-05, + "loss": 0.2329, + "step": 34904 + }, + { + "epoch": 2.827689565780946, + "grad_norm": 0.06425736099481583, + "learning_rate": 6.512894369683604e-05, + "loss": 0.2385, + "step": 34905 + }, + { + "epoch": 2.8277705767984447, + "grad_norm": 0.07773395627737045, + "learning_rate": 6.512444304424143e-05, + "loss": 0.2401, + "step": 34906 + }, + { + "epoch": 2.827851587815943, + "grad_norm": 0.09697293490171432, + "learning_rate": 6.51199423916468e-05, + "loss": 0.2682, + "step": 34907 + }, + { + "epoch": 2.827932598833441, + "grad_norm": 0.07075246423482895, + "learning_rate": 6.511544173905216e-05, + "loss": 0.2332, + "step": 34908 + }, + { + "epoch": 2.82801360985094, + "grad_norm": 0.07517353445291519, + "learning_rate": 6.511094108645755e-05, + "loss": 0.232, + "step": 34909 + }, + { + "epoch": 2.828094620868438, + "grad_norm": 0.06551103293895721, + "learning_rate": 6.510644043386292e-05, + "loss": 0.208, + "step": 34910 + }, + { + "epoch": 2.8281756318859363, + "grad_norm": 0.060613300651311874, + "learning_rate": 6.510193978126828e-05, + "loss": 0.2181, + "step": 34911 + }, + { + "epoch": 2.828256642903435, + "grad_norm": 0.06362450867891312, + "learning_rate": 6.509743912867367e-05, + "loss": 0.2349, + "step": 34912 + }, + { + "epoch": 2.8283376539209333, + "grad_norm": 0.11658961325883865, + "learning_rate": 6.509293847607904e-05, + "loss": 0.2772, + "step": 34913 + }, + { + "epoch": 2.8284186649384315, + "grad_norm": 0.06812942773103714, + "learning_rate": 6.50884378234844e-05, + "loss": 0.2187, + "step": 34914 + }, + { + "epoch": 2.82849967595593, + "grad_norm": 0.07889614254236221, + "learning_rate": 6.508393717088979e-05, + "loss": 0.2722, + "step": 34915 + }, + { + "epoch": 2.8285806869734285, + "grad_norm": 0.06316548585891724, + "learning_rate": 6.507943651829516e-05, + "loss": 0.2252, + "step": 34916 + }, + { + "epoch": 2.8286616979909267, + "grad_norm": 0.08983833342790604, + "learning_rate": 6.507493586570052e-05, + "loss": 0.2591, + "step": 34917 + }, + { + "epoch": 2.8287427090084254, + "grad_norm": 0.07219039648771286, + "learning_rate": 6.507043521310591e-05, + "loss": 0.2134, + "step": 34918 + }, + { + "epoch": 2.8288237200259236, + "grad_norm": 0.07436413317918777, + "learning_rate": 6.506593456051128e-05, + "loss": 0.2704, + "step": 34919 + }, + { + "epoch": 2.828904731043422, + "grad_norm": 0.0707738921046257, + "learning_rate": 6.506143390791664e-05, + "loss": 0.23, + "step": 34920 + }, + { + "epoch": 2.82898574206092, + "grad_norm": 0.06954982876777649, + "learning_rate": 6.505693325532203e-05, + "loss": 0.2565, + "step": 34921 + }, + { + "epoch": 2.829066753078419, + "grad_norm": 0.057384226471185684, + "learning_rate": 6.50524326027274e-05, + "loss": 0.2442, + "step": 34922 + }, + { + "epoch": 2.829147764095917, + "grad_norm": 0.060179632157087326, + "learning_rate": 6.504793195013276e-05, + "loss": 0.2302, + "step": 34923 + }, + { + "epoch": 2.8292287751134153, + "grad_norm": 0.06120677664875984, + "learning_rate": 6.504343129753815e-05, + "loss": 0.2157, + "step": 34924 + }, + { + "epoch": 2.8293097861309136, + "grad_norm": 0.06761647015810013, + "learning_rate": 6.503893064494352e-05, + "loss": 0.2727, + "step": 34925 + }, + { + "epoch": 2.8293907971484122, + "grad_norm": 0.06313874572515488, + "learning_rate": 6.503442999234888e-05, + "loss": 0.236, + "step": 34926 + }, + { + "epoch": 
2.8294718081659105, + "grad_norm": 0.05890517681837082, + "learning_rate": 6.502992933975427e-05, + "loss": 0.2384, + "step": 34927 + }, + { + "epoch": 2.8295528191834087, + "grad_norm": 0.07191134244203568, + "learning_rate": 6.502542868715964e-05, + "loss": 0.2269, + "step": 34928 + }, + { + "epoch": 2.8296338302009074, + "grad_norm": 0.07116875052452087, + "learning_rate": 6.5020928034565e-05, + "loss": 0.2258, + "step": 34929 + }, + { + "epoch": 2.8297148412184057, + "grad_norm": 0.07394856214523315, + "learning_rate": 6.501642738197039e-05, + "loss": 0.2275, + "step": 34930 + }, + { + "epoch": 2.829795852235904, + "grad_norm": 0.06653162837028503, + "learning_rate": 6.501192672937577e-05, + "loss": 0.2285, + "step": 34931 + }, + { + "epoch": 2.8298768632534026, + "grad_norm": 0.0657869353890419, + "learning_rate": 6.500742607678114e-05, + "loss": 0.2579, + "step": 34932 + }, + { + "epoch": 2.829957874270901, + "grad_norm": 0.0657775029540062, + "learning_rate": 6.500292542418651e-05, + "loss": 0.2337, + "step": 34933 + }, + { + "epoch": 2.830038885288399, + "grad_norm": 0.07672388851642609, + "learning_rate": 6.499842477159189e-05, + "loss": 0.2695, + "step": 34934 + }, + { + "epoch": 2.8301198963058978, + "grad_norm": 0.09917972981929779, + "learning_rate": 6.499392411899726e-05, + "loss": 0.2231, + "step": 34935 + }, + { + "epoch": 2.830200907323396, + "grad_norm": 0.07146567106246948, + "learning_rate": 6.498942346640263e-05, + "loss": 0.2256, + "step": 34936 + }, + { + "epoch": 2.8302819183408943, + "grad_norm": 0.06359067559242249, + "learning_rate": 6.498492281380801e-05, + "loss": 0.2181, + "step": 34937 + }, + { + "epoch": 2.830362929358393, + "grad_norm": 0.0743074044585228, + "learning_rate": 6.498042216121338e-05, + "loss": 0.243, + "step": 34938 + }, + { + "epoch": 2.830443940375891, + "grad_norm": 0.07573742419481277, + "learning_rate": 6.497592150861875e-05, + "loss": 0.2234, + "step": 34939 + }, + { + "epoch": 2.8305249513933894, + "grad_norm": 0.0722004771232605, + "learning_rate": 6.497142085602413e-05, + "loss": 0.2239, + "step": 34940 + }, + { + "epoch": 2.830605962410888, + "grad_norm": 0.05707455798983574, + "learning_rate": 6.49669202034295e-05, + "loss": 0.2017, + "step": 34941 + }, + { + "epoch": 2.8306869734283864, + "grad_norm": 0.0702546238899231, + "learning_rate": 6.496241955083488e-05, + "loss": 0.259, + "step": 34942 + }, + { + "epoch": 2.8307679844458846, + "grad_norm": 0.06682027876377106, + "learning_rate": 6.495791889824025e-05, + "loss": 0.257, + "step": 34943 + }, + { + "epoch": 2.830848995463383, + "grad_norm": 0.07950440049171448, + "learning_rate": 6.495341824564562e-05, + "loss": 0.2959, + "step": 34944 + }, + { + "epoch": 2.8309300064808816, + "grad_norm": 0.08968168497085571, + "learning_rate": 6.4948917593051e-05, + "loss": 0.2162, + "step": 34945 + }, + { + "epoch": 2.83101101749838, + "grad_norm": 0.06955923140048981, + "learning_rate": 6.494441694045637e-05, + "loss": 0.2408, + "step": 34946 + }, + { + "epoch": 2.831092028515878, + "grad_norm": 0.04745893552899361, + "learning_rate": 6.493991628786174e-05, + "loss": 0.2034, + "step": 34947 + }, + { + "epoch": 2.8311730395333763, + "grad_norm": 0.06792275607585907, + "learning_rate": 6.493541563526712e-05, + "loss": 0.2606, + "step": 34948 + }, + { + "epoch": 2.831254050550875, + "grad_norm": 0.0904744416475296, + "learning_rate": 6.493091498267249e-05, + "loss": 0.2697, + "step": 34949 + }, + { + "epoch": 2.8313350615683732, + "grad_norm": 0.07122079282999039, + "learning_rate": 
6.492641433007786e-05, + "loss": 0.2314, + "step": 34950 + }, + { + "epoch": 2.8314160725858715, + "grad_norm": 0.07180433720350266, + "learning_rate": 6.492191367748324e-05, + "loss": 0.2862, + "step": 34951 + }, + { + "epoch": 2.83149708360337, + "grad_norm": 0.05907098948955536, + "learning_rate": 6.491741302488861e-05, + "loss": 0.2454, + "step": 34952 + }, + { + "epoch": 2.8315780946208684, + "grad_norm": 0.052595555782318115, + "learning_rate": 6.491291237229399e-05, + "loss": 0.2333, + "step": 34953 + }, + { + "epoch": 2.8316591056383666, + "grad_norm": 0.06805722415447235, + "learning_rate": 6.490841171969936e-05, + "loss": 0.2222, + "step": 34954 + }, + { + "epoch": 2.8317401166558653, + "grad_norm": 0.054611627012491226, + "learning_rate": 6.490391106710473e-05, + "loss": 0.2171, + "step": 34955 + }, + { + "epoch": 2.8318211276733636, + "grad_norm": 0.06894121319055557, + "learning_rate": 6.48994104145101e-05, + "loss": 0.2061, + "step": 34956 + }, + { + "epoch": 2.831902138690862, + "grad_norm": 0.07178279757499695, + "learning_rate": 6.489490976191548e-05, + "loss": 0.2446, + "step": 34957 + }, + { + "epoch": 2.8319831497083605, + "grad_norm": 0.06755075603723526, + "learning_rate": 6.489040910932087e-05, + "loss": 0.2233, + "step": 34958 + }, + { + "epoch": 2.8320641607258588, + "grad_norm": 0.06892146170139313, + "learning_rate": 6.488590845672623e-05, + "loss": 0.2749, + "step": 34959 + }, + { + "epoch": 2.832145171743357, + "grad_norm": 0.06232399493455887, + "learning_rate": 6.48814078041316e-05, + "loss": 0.2401, + "step": 34960 + }, + { + "epoch": 2.8322261827608557, + "grad_norm": 0.07726490497589111, + "learning_rate": 6.487690715153699e-05, + "loss": 0.2388, + "step": 34961 + }, + { + "epoch": 2.832307193778354, + "grad_norm": 0.07525191456079483, + "learning_rate": 6.487240649894235e-05, + "loss": 0.2742, + "step": 34962 + }, + { + "epoch": 2.832388204795852, + "grad_norm": 0.0881979689002037, + "learning_rate": 6.486790584634772e-05, + "loss": 0.2627, + "step": 34963 + }, + { + "epoch": 2.832469215813351, + "grad_norm": 0.0649413987994194, + "learning_rate": 6.486340519375311e-05, + "loss": 0.2475, + "step": 34964 + }, + { + "epoch": 2.832550226830849, + "grad_norm": 0.06952086836099625, + "learning_rate": 6.485890454115847e-05, + "loss": 0.2643, + "step": 34965 + }, + { + "epoch": 2.8326312378483474, + "grad_norm": 0.0617440864443779, + "learning_rate": 6.485440388856384e-05, + "loss": 0.229, + "step": 34966 + }, + { + "epoch": 2.8327122488658456, + "grad_norm": 0.059850435703992844, + "learning_rate": 6.484990323596923e-05, + "loss": 0.265, + "step": 34967 + }, + { + "epoch": 2.8327932598833443, + "grad_norm": 0.07749108970165253, + "learning_rate": 6.484540258337459e-05, + "loss": 0.2749, + "step": 34968 + }, + { + "epoch": 2.8328742709008425, + "grad_norm": 0.06757545471191406, + "learning_rate": 6.484090193077996e-05, + "loss": 0.2506, + "step": 34969 + }, + { + "epoch": 2.832955281918341, + "grad_norm": 0.06373197585344315, + "learning_rate": 6.483640127818535e-05, + "loss": 0.2415, + "step": 34970 + }, + { + "epoch": 2.833036292935839, + "grad_norm": 0.06949126720428467, + "learning_rate": 6.483190062559071e-05, + "loss": 0.2503, + "step": 34971 + }, + { + "epoch": 2.8331173039533377, + "grad_norm": 0.05850743502378464, + "learning_rate": 6.482739997299608e-05, + "loss": 0.1972, + "step": 34972 + }, + { + "epoch": 2.833198314970836, + "grad_norm": 0.07521665096282959, + "learning_rate": 6.482289932040147e-05, + "loss": 0.2519, + "step": 34973 + }, + { + 
"epoch": 2.833279325988334, + "grad_norm": 0.058105140924453735, + "learning_rate": 6.481839866780683e-05, + "loss": 0.2276, + "step": 34974 + }, + { + "epoch": 2.833360337005833, + "grad_norm": 0.05883853882551193, + "learning_rate": 6.48138980152122e-05, + "loss": 0.2589, + "step": 34975 + }, + { + "epoch": 2.833441348023331, + "grad_norm": 0.06599044799804688, + "learning_rate": 6.480939736261759e-05, + "loss": 0.2381, + "step": 34976 + }, + { + "epoch": 2.8335223590408294, + "grad_norm": 0.0584799088537693, + "learning_rate": 6.480489671002295e-05, + "loss": 0.2495, + "step": 34977 + }, + { + "epoch": 2.833603370058328, + "grad_norm": 0.07386167347431183, + "learning_rate": 6.480039605742833e-05, + "loss": 0.2524, + "step": 34978 + }, + { + "epoch": 2.8336843810758263, + "grad_norm": 0.07024627923965454, + "learning_rate": 6.479589540483371e-05, + "loss": 0.2099, + "step": 34979 + }, + { + "epoch": 2.8337653920933246, + "grad_norm": 0.057228393852710724, + "learning_rate": 6.479139475223907e-05, + "loss": 0.2309, + "step": 34980 + }, + { + "epoch": 2.8338464031108233, + "grad_norm": 0.07119888067245483, + "learning_rate": 6.478689409964445e-05, + "loss": 0.2353, + "step": 34981 + }, + { + "epoch": 2.8339274141283215, + "grad_norm": 0.06771930307149887, + "learning_rate": 6.478239344704983e-05, + "loss": 0.2281, + "step": 34982 + }, + { + "epoch": 2.8340084251458197, + "grad_norm": 0.06906388700008392, + "learning_rate": 6.477789279445519e-05, + "loss": 0.2931, + "step": 34983 + }, + { + "epoch": 2.8340894361633184, + "grad_norm": 0.07088629901409149, + "learning_rate": 6.477339214186058e-05, + "loss": 0.2441, + "step": 34984 + }, + { + "epoch": 2.8341704471808167, + "grad_norm": 0.06921649724245071, + "learning_rate": 6.476889148926595e-05, + "loss": 0.2769, + "step": 34985 + }, + { + "epoch": 2.834251458198315, + "grad_norm": 0.07710966467857361, + "learning_rate": 6.476439083667131e-05, + "loss": 0.2329, + "step": 34986 + }, + { + "epoch": 2.8343324692158136, + "grad_norm": 0.056059908121824265, + "learning_rate": 6.47598901840767e-05, + "loss": 0.2521, + "step": 34987 + }, + { + "epoch": 2.834413480233312, + "grad_norm": 0.061635423451662064, + "learning_rate": 6.475538953148207e-05, + "loss": 0.1952, + "step": 34988 + }, + { + "epoch": 2.83449449125081, + "grad_norm": 0.08030088990926743, + "learning_rate": 6.475088887888743e-05, + "loss": 0.2684, + "step": 34989 + }, + { + "epoch": 2.8345755022683083, + "grad_norm": 0.07802361994981766, + "learning_rate": 6.474638822629282e-05, + "loss": 0.265, + "step": 34990 + }, + { + "epoch": 2.834656513285807, + "grad_norm": 0.06831242889165878, + "learning_rate": 6.47418875736982e-05, + "loss": 0.2168, + "step": 34991 + }, + { + "epoch": 2.8347375243033053, + "grad_norm": 0.0766051784157753, + "learning_rate": 6.473738692110356e-05, + "loss": 0.2396, + "step": 34992 + }, + { + "epoch": 2.8348185353208035, + "grad_norm": 0.09226632863283157, + "learning_rate": 6.473288626850894e-05, + "loss": 0.2222, + "step": 34993 + }, + { + "epoch": 2.8348995463383018, + "grad_norm": 0.06202827766537666, + "learning_rate": 6.472838561591432e-05, + "loss": 0.2082, + "step": 34994 + }, + { + "epoch": 2.8349805573558005, + "grad_norm": 0.06007321923971176, + "learning_rate": 6.472388496331968e-05, + "loss": 0.2341, + "step": 34995 + }, + { + "epoch": 2.8350615683732987, + "grad_norm": 0.07148656994104385, + "learning_rate": 6.471938431072506e-05, + "loss": 0.2519, + "step": 34996 + }, + { + "epoch": 2.835142579390797, + "grad_norm": 0.06458251923322678, + 
"learning_rate": 6.471488365813044e-05, + "loss": 0.2426, + "step": 34997 + }, + { + "epoch": 2.8352235904082956, + "grad_norm": 0.07826030254364014, + "learning_rate": 6.47103830055358e-05, + "loss": 0.2701, + "step": 34998 + }, + { + "epoch": 2.835304601425794, + "grad_norm": 0.08532463014125824, + "learning_rate": 6.470588235294118e-05, + "loss": 0.2626, + "step": 34999 + }, + { + "epoch": 2.835385612443292, + "grad_norm": 0.08283942192792892, + "learning_rate": 6.470138170034656e-05, + "loss": 0.2379, + "step": 35000 + }, + { + "epoch": 2.835466623460791, + "grad_norm": 0.07847673445940018, + "learning_rate": 6.469688104775192e-05, + "loss": 0.2509, + "step": 35001 + }, + { + "epoch": 2.835547634478289, + "grad_norm": 0.0680897906422615, + "learning_rate": 6.46923803951573e-05, + "loss": 0.1928, + "step": 35002 + }, + { + "epoch": 2.8356286454957873, + "grad_norm": 0.05337081104516983, + "learning_rate": 6.468787974256268e-05, + "loss": 0.1969, + "step": 35003 + }, + { + "epoch": 2.835709656513286, + "grad_norm": 0.07576156407594681, + "learning_rate": 6.468337908996804e-05, + "loss": 0.2327, + "step": 35004 + }, + { + "epoch": 2.8357906675307842, + "grad_norm": 0.08666659891605377, + "learning_rate": 6.467887843737343e-05, + "loss": 0.3167, + "step": 35005 + }, + { + "epoch": 2.8358716785482825, + "grad_norm": 0.09243784844875336, + "learning_rate": 6.46743777847788e-05, + "loss": 0.2857, + "step": 35006 + }, + { + "epoch": 2.835952689565781, + "grad_norm": 0.0682855099439621, + "learning_rate": 6.466987713218416e-05, + "loss": 0.1955, + "step": 35007 + }, + { + "epoch": 2.8360337005832794, + "grad_norm": 0.062375444918870926, + "learning_rate": 6.466537647958955e-05, + "loss": 0.2608, + "step": 35008 + }, + { + "epoch": 2.8361147116007777, + "grad_norm": 0.059896890074014664, + "learning_rate": 6.466087582699492e-05, + "loss": 0.2254, + "step": 35009 + }, + { + "epoch": 2.8361957226182763, + "grad_norm": 0.07151475548744202, + "learning_rate": 6.46563751744003e-05, + "loss": 0.2278, + "step": 35010 + }, + { + "epoch": 2.8362767336357746, + "grad_norm": 0.05724192038178444, + "learning_rate": 6.465187452180567e-05, + "loss": 0.2194, + "step": 35011 + }, + { + "epoch": 2.836357744653273, + "grad_norm": 0.05848658084869385, + "learning_rate": 6.464737386921104e-05, + "loss": 0.2469, + "step": 35012 + }, + { + "epoch": 2.836438755670771, + "grad_norm": 0.07299201935529709, + "learning_rate": 6.464287321661641e-05, + "loss": 0.2336, + "step": 35013 + }, + { + "epoch": 2.8365197666882693, + "grad_norm": 0.055811021476984024, + "learning_rate": 6.463837256402179e-05, + "loss": 0.2315, + "step": 35014 + }, + { + "epoch": 2.836600777705768, + "grad_norm": 0.07474242150783539, + "learning_rate": 6.463387191142716e-05, + "loss": 0.2569, + "step": 35015 + }, + { + "epoch": 2.8366817887232663, + "grad_norm": 0.07334800809621811, + "learning_rate": 6.462937125883254e-05, + "loss": 0.2535, + "step": 35016 + }, + { + "epoch": 2.8367627997407645, + "grad_norm": 0.06788776814937592, + "learning_rate": 6.462487060623791e-05, + "loss": 0.2351, + "step": 35017 + }, + { + "epoch": 2.836843810758263, + "grad_norm": 0.06585695594549179, + "learning_rate": 6.462036995364328e-05, + "loss": 0.2525, + "step": 35018 + }, + { + "epoch": 2.8369248217757614, + "grad_norm": 0.07982311397790909, + "learning_rate": 6.461586930104866e-05, + "loss": 0.259, + "step": 35019 + }, + { + "epoch": 2.8370058327932597, + "grad_norm": 0.07220277935266495, + "learning_rate": 6.461136864845403e-05, + "loss": 0.2495, + "step": 
35020 + }, + { + "epoch": 2.8370868438107584, + "grad_norm": 0.07429298013448715, + "learning_rate": 6.46068679958594e-05, + "loss": 0.2376, + "step": 35021 + }, + { + "epoch": 2.8371678548282566, + "grad_norm": 0.08143361657857895, + "learning_rate": 6.460236734326478e-05, + "loss": 0.2465, + "step": 35022 + }, + { + "epoch": 2.837248865845755, + "grad_norm": 0.06289290636777878, + "learning_rate": 6.459786669067015e-05, + "loss": 0.2462, + "step": 35023 + }, + { + "epoch": 2.8373298768632536, + "grad_norm": 0.06706014275550842, + "learning_rate": 6.459336603807552e-05, + "loss": 0.2297, + "step": 35024 + }, + { + "epoch": 2.837410887880752, + "grad_norm": 0.05648915097117424, + "learning_rate": 6.45888653854809e-05, + "loss": 0.2318, + "step": 35025 + }, + { + "epoch": 2.83749189889825, + "grad_norm": 0.07064875960350037, + "learning_rate": 6.458436473288627e-05, + "loss": 0.2537, + "step": 35026 + }, + { + "epoch": 2.8375729099157487, + "grad_norm": 0.06095169112086296, + "learning_rate": 6.457986408029165e-05, + "loss": 0.2096, + "step": 35027 + }, + { + "epoch": 2.837653920933247, + "grad_norm": 0.05833174288272858, + "learning_rate": 6.457536342769702e-05, + "loss": 0.2017, + "step": 35028 + }, + { + "epoch": 2.837734931950745, + "grad_norm": 0.0616929791867733, + "learning_rate": 6.457086277510239e-05, + "loss": 0.2404, + "step": 35029 + }, + { + "epoch": 2.837815942968244, + "grad_norm": 0.07648038119077682, + "learning_rate": 6.456636212250777e-05, + "loss": 0.292, + "step": 35030 + }, + { + "epoch": 2.837896953985742, + "grad_norm": 0.07276062667369843, + "learning_rate": 6.456186146991314e-05, + "loss": 0.2465, + "step": 35031 + }, + { + "epoch": 2.8379779650032404, + "grad_norm": 0.06426334381103516, + "learning_rate": 6.455736081731851e-05, + "loss": 0.2026, + "step": 35032 + }, + { + "epoch": 2.838058976020739, + "grad_norm": 0.09658359736204147, + "learning_rate": 6.455286016472389e-05, + "loss": 0.2774, + "step": 35033 + }, + { + "epoch": 2.8381399870382373, + "grad_norm": 0.06532305479049683, + "learning_rate": 6.454835951212926e-05, + "loss": 0.2223, + "step": 35034 + }, + { + "epoch": 2.8382209980557356, + "grad_norm": 0.05708573758602142, + "learning_rate": 6.454385885953463e-05, + "loss": 0.1995, + "step": 35035 + }, + { + "epoch": 2.838302009073234, + "grad_norm": 0.07263416051864624, + "learning_rate": 6.453935820694002e-05, + "loss": 0.2672, + "step": 35036 + }, + { + "epoch": 2.838383020090732, + "grad_norm": 0.0830162912607193, + "learning_rate": 6.453485755434538e-05, + "loss": 0.2136, + "step": 35037 + }, + { + "epoch": 2.8384640311082308, + "grad_norm": 0.06936755776405334, + "learning_rate": 6.453035690175075e-05, + "loss": 0.2827, + "step": 35038 + }, + { + "epoch": 2.838545042125729, + "grad_norm": 0.0790124237537384, + "learning_rate": 6.452585624915614e-05, + "loss": 0.2345, + "step": 35039 + }, + { + "epoch": 2.8386260531432272, + "grad_norm": 0.06774169951677322, + "learning_rate": 6.45213555965615e-05, + "loss": 0.2457, + "step": 35040 + }, + { + "epoch": 2.838707064160726, + "grad_norm": 0.0654446929693222, + "learning_rate": 6.451685494396688e-05, + "loss": 0.2259, + "step": 35041 + }, + { + "epoch": 2.838788075178224, + "grad_norm": 0.06522495299577713, + "learning_rate": 6.451235429137226e-05, + "loss": 0.2096, + "step": 35042 + }, + { + "epoch": 2.8388690861957224, + "grad_norm": 0.06641356647014618, + "learning_rate": 6.450785363877762e-05, + "loss": 0.2841, + "step": 35043 + }, + { + "epoch": 2.838950097213221, + "grad_norm": 0.06917969137430191, 
+ "learning_rate": 6.4503352986183e-05, + "loss": 0.1908, + "step": 35044 + }, + { + "epoch": 2.8390311082307194, + "grad_norm": 0.07256962358951569, + "learning_rate": 6.449885233358838e-05, + "loss": 0.2137, + "step": 35045 + }, + { + "epoch": 2.8391121192482176, + "grad_norm": 0.06664843112230301, + "learning_rate": 6.449435168099374e-05, + "loss": 0.2453, + "step": 35046 + }, + { + "epoch": 2.8391931302657163, + "grad_norm": 0.05385294556617737, + "learning_rate": 6.448985102839912e-05, + "loss": 0.2228, + "step": 35047 + }, + { + "epoch": 2.8392741412832145, + "grad_norm": 0.07032804191112518, + "learning_rate": 6.44853503758045e-05, + "loss": 0.24, + "step": 35048 + }, + { + "epoch": 2.839355152300713, + "grad_norm": 0.0735827162861824, + "learning_rate": 6.448084972320986e-05, + "loss": 0.2415, + "step": 35049 + }, + { + "epoch": 2.8394361633182115, + "grad_norm": 0.06067224219441414, + "learning_rate": 6.447634907061524e-05, + "loss": 0.2344, + "step": 35050 + }, + { + "epoch": 2.8395171743357097, + "grad_norm": 0.06811947375535965, + "learning_rate": 6.447184841802063e-05, + "loss": 0.203, + "step": 35051 + }, + { + "epoch": 2.839598185353208, + "grad_norm": 0.07413507252931595, + "learning_rate": 6.446734776542599e-05, + "loss": 0.2707, + "step": 35052 + }, + { + "epoch": 2.8396791963707066, + "grad_norm": 0.06308168172836304, + "learning_rate": 6.446284711283136e-05, + "loss": 0.2208, + "step": 35053 + }, + { + "epoch": 2.839760207388205, + "grad_norm": 0.07065436244010925, + "learning_rate": 6.445834646023675e-05, + "loss": 0.2497, + "step": 35054 + }, + { + "epoch": 2.839841218405703, + "grad_norm": 0.08040545135736465, + "learning_rate": 6.44538458076421e-05, + "loss": 0.2678, + "step": 35055 + }, + { + "epoch": 2.839922229423202, + "grad_norm": 0.08743759989738464, + "learning_rate": 6.444934515504748e-05, + "loss": 0.2581, + "step": 35056 + }, + { + "epoch": 2.8400032404407, + "grad_norm": 0.06685937941074371, + "learning_rate": 6.444484450245287e-05, + "loss": 0.2281, + "step": 35057 + }, + { + "epoch": 2.8400842514581983, + "grad_norm": 0.06777413934469223, + "learning_rate": 6.444034384985823e-05, + "loss": 0.2623, + "step": 35058 + }, + { + "epoch": 2.8401652624756966, + "grad_norm": 0.06635896861553192, + "learning_rate": 6.44358431972636e-05, + "loss": 0.2158, + "step": 35059 + }, + { + "epoch": 2.840246273493195, + "grad_norm": 0.06646884232759476, + "learning_rate": 6.443134254466899e-05, + "loss": 0.2056, + "step": 35060 + }, + { + "epoch": 2.8403272845106935, + "grad_norm": 0.06737709790468216, + "learning_rate": 6.442684189207435e-05, + "loss": 0.2506, + "step": 35061 + }, + { + "epoch": 2.8404082955281917, + "grad_norm": 0.07704462856054306, + "learning_rate": 6.442234123947972e-05, + "loss": 0.2152, + "step": 35062 + }, + { + "epoch": 2.84048930654569, + "grad_norm": 0.06483523547649384, + "learning_rate": 6.441784058688511e-05, + "loss": 0.2347, + "step": 35063 + }, + { + "epoch": 2.8405703175631887, + "grad_norm": 0.07217948883771896, + "learning_rate": 6.441333993429047e-05, + "loss": 0.24, + "step": 35064 + }, + { + "epoch": 2.840651328580687, + "grad_norm": 0.07810252904891968, + "learning_rate": 6.440883928169586e-05, + "loss": 0.2455, + "step": 35065 + }, + { + "epoch": 2.840732339598185, + "grad_norm": 0.08098962903022766, + "learning_rate": 6.440433862910123e-05, + "loss": 0.2083, + "step": 35066 + }, + { + "epoch": 2.840813350615684, + "grad_norm": 0.07664956152439117, + "learning_rate": 6.439983797650659e-05, + "loss": 0.2215, + "step": 35067 + }, + 
{ + "epoch": 2.840894361633182, + "grad_norm": 0.06895925849676132, + "learning_rate": 6.439533732391198e-05, + "loss": 0.24, + "step": 35068 + }, + { + "epoch": 2.8409753726506803, + "grad_norm": 0.07341915369033813, + "learning_rate": 6.439083667131735e-05, + "loss": 0.2441, + "step": 35069 + }, + { + "epoch": 2.841056383668179, + "grad_norm": 0.0792069062590599, + "learning_rate": 6.438633601872271e-05, + "loss": 0.2352, + "step": 35070 + }, + { + "epoch": 2.8411373946856773, + "grad_norm": 0.0779370442032814, + "learning_rate": 6.43818353661281e-05, + "loss": 0.249, + "step": 35071 + }, + { + "epoch": 2.8412184057031755, + "grad_norm": 0.08689171075820923, + "learning_rate": 6.437733471353347e-05, + "loss": 0.2668, + "step": 35072 + }, + { + "epoch": 2.841299416720674, + "grad_norm": 0.08527795970439911, + "learning_rate": 6.437283406093883e-05, + "loss": 0.2353, + "step": 35073 + }, + { + "epoch": 2.8413804277381725, + "grad_norm": 0.05817866697907448, + "learning_rate": 6.436833340834422e-05, + "loss": 0.2205, + "step": 35074 + }, + { + "epoch": 2.8414614387556707, + "grad_norm": 0.07152362167835236, + "learning_rate": 6.436383275574959e-05, + "loss": 0.2293, + "step": 35075 + }, + { + "epoch": 2.8415424497731694, + "grad_norm": 0.055119458585977554, + "learning_rate": 6.435933210315495e-05, + "loss": 0.1939, + "step": 35076 + }, + { + "epoch": 2.8416234607906676, + "grad_norm": 0.0744624063372612, + "learning_rate": 6.435483145056034e-05, + "loss": 0.2108, + "step": 35077 + }, + { + "epoch": 2.841704471808166, + "grad_norm": 0.07901319861412048, + "learning_rate": 6.435033079796571e-05, + "loss": 0.2493, + "step": 35078 + }, + { + "epoch": 2.8417854828256646, + "grad_norm": 0.07424967736005783, + "learning_rate": 6.434583014537107e-05, + "loss": 0.2564, + "step": 35079 + }, + { + "epoch": 2.841866493843163, + "grad_norm": 0.07194703817367554, + "learning_rate": 6.434132949277646e-05, + "loss": 0.2647, + "step": 35080 + }, + { + "epoch": 2.841947504860661, + "grad_norm": 0.07558548450469971, + "learning_rate": 6.433682884018183e-05, + "loss": 0.2529, + "step": 35081 + }, + { + "epoch": 2.8420285158781593, + "grad_norm": 0.08328049629926682, + "learning_rate": 6.43323281875872e-05, + "loss": 0.2749, + "step": 35082 + }, + { + "epoch": 2.8421095268956575, + "grad_norm": 0.07167727500200272, + "learning_rate": 6.432782753499258e-05, + "loss": 0.2678, + "step": 35083 + }, + { + "epoch": 2.8421905379131562, + "grad_norm": 0.08185160160064697, + "learning_rate": 6.432332688239795e-05, + "loss": 0.2322, + "step": 35084 + }, + { + "epoch": 2.8422715489306545, + "grad_norm": 0.08160267770290375, + "learning_rate": 6.431882622980331e-05, + "loss": 0.2491, + "step": 35085 + }, + { + "epoch": 2.8423525599481527, + "grad_norm": 0.0694357231259346, + "learning_rate": 6.43143255772087e-05, + "loss": 0.2365, + "step": 35086 + }, + { + "epoch": 2.8424335709656514, + "grad_norm": 0.06928124278783798, + "learning_rate": 6.430982492461407e-05, + "loss": 0.2033, + "step": 35087 + }, + { + "epoch": 2.8425145819831497, + "grad_norm": 0.07067787647247314, + "learning_rate": 6.430532427201944e-05, + "loss": 0.2214, + "step": 35088 + }, + { + "epoch": 2.842595593000648, + "grad_norm": 0.06498835980892181, + "learning_rate": 6.430082361942482e-05, + "loss": 0.2462, + "step": 35089 + }, + { + "epoch": 2.8426766040181466, + "grad_norm": 0.06612785905599594, + "learning_rate": 6.42963229668302e-05, + "loss": 0.2241, + "step": 35090 + }, + { + "epoch": 2.842757615035645, + "grad_norm": 0.06681181490421295, + 
"learning_rate": 6.429182231423557e-05, + "loss": 0.2506, + "step": 35091 + }, + { + "epoch": 2.842838626053143, + "grad_norm": 0.0670313686132431, + "learning_rate": 6.428732166164094e-05, + "loss": 0.2273, + "step": 35092 + }, + { + "epoch": 2.8429196370706418, + "grad_norm": 0.06744195520877838, + "learning_rate": 6.428282100904632e-05, + "loss": 0.2009, + "step": 35093 + }, + { + "epoch": 2.84300064808814, + "grad_norm": 0.06611412763595581, + "learning_rate": 6.427832035645169e-05, + "loss": 0.2131, + "step": 35094 + }, + { + "epoch": 2.8430816591056383, + "grad_norm": 0.05895596370100975, + "learning_rate": 6.427381970385706e-05, + "loss": 0.2321, + "step": 35095 + }, + { + "epoch": 2.843162670123137, + "grad_norm": 0.06782980263233185, + "learning_rate": 6.426931905126244e-05, + "loss": 0.228, + "step": 35096 + }, + { + "epoch": 2.843243681140635, + "grad_norm": 0.06184051185846329, + "learning_rate": 6.426481839866781e-05, + "loss": 0.2317, + "step": 35097 + }, + { + "epoch": 2.8433246921581334, + "grad_norm": 0.07850348949432373, + "learning_rate": 6.426031774607318e-05, + "loss": 0.2829, + "step": 35098 + }, + { + "epoch": 2.843405703175632, + "grad_norm": 0.07288786768913269, + "learning_rate": 6.425581709347856e-05, + "loss": 0.2351, + "step": 35099 + }, + { + "epoch": 2.8434867141931304, + "grad_norm": 0.06398995220661163, + "learning_rate": 6.425131644088393e-05, + "loss": 0.2206, + "step": 35100 + }, + { + "epoch": 2.8435677252106286, + "grad_norm": 0.06157770752906799, + "learning_rate": 6.42468157882893e-05, + "loss": 0.2379, + "step": 35101 + }, + { + "epoch": 2.843648736228127, + "grad_norm": 0.06304378807544708, + "learning_rate": 6.424231513569468e-05, + "loss": 0.2784, + "step": 35102 + }, + { + "epoch": 2.8437297472456255, + "grad_norm": 0.06795405596494675, + "learning_rate": 6.423781448310005e-05, + "loss": 0.2381, + "step": 35103 + }, + { + "epoch": 2.843810758263124, + "grad_norm": 0.06040545925498009, + "learning_rate": 6.423331383050543e-05, + "loss": 0.2162, + "step": 35104 + }, + { + "epoch": 2.843891769280622, + "grad_norm": 0.059341639280319214, + "learning_rate": 6.42288131779108e-05, + "loss": 0.2117, + "step": 35105 + }, + { + "epoch": 2.8439727802981203, + "grad_norm": 0.07643841952085495, + "learning_rate": 6.422431252531617e-05, + "loss": 0.235, + "step": 35106 + }, + { + "epoch": 2.844053791315619, + "grad_norm": 0.07376381009817123, + "learning_rate": 6.421981187272155e-05, + "loss": 0.2177, + "step": 35107 + }, + { + "epoch": 2.844134802333117, + "grad_norm": 0.05964221805334091, + "learning_rate": 6.421531122012692e-05, + "loss": 0.2086, + "step": 35108 + }, + { + "epoch": 2.8442158133506155, + "grad_norm": 0.06813271343708038, + "learning_rate": 6.42108105675323e-05, + "loss": 0.2221, + "step": 35109 + }, + { + "epoch": 2.844296824368114, + "grad_norm": 0.07453761994838715, + "learning_rate": 6.420630991493767e-05, + "loss": 0.2207, + "step": 35110 + }, + { + "epoch": 2.8443778353856124, + "grad_norm": 0.06619906425476074, + "learning_rate": 6.420180926234304e-05, + "loss": 0.2148, + "step": 35111 + }, + { + "epoch": 2.8444588464031106, + "grad_norm": 0.06775642186403275, + "learning_rate": 6.419730860974842e-05, + "loss": 0.2325, + "step": 35112 + }, + { + "epoch": 2.8445398574206093, + "grad_norm": 0.07589752972126007, + "learning_rate": 6.419280795715379e-05, + "loss": 0.2606, + "step": 35113 + }, + { + "epoch": 2.8446208684381076, + "grad_norm": 0.0746060386300087, + "learning_rate": 6.418830730455916e-05, + "loss": 0.2374, + "step": 35114 + 
}, + { + "epoch": 2.844701879455606, + "grad_norm": 0.08039572834968567, + "learning_rate": 6.418380665196454e-05, + "loss": 0.27, + "step": 35115 + }, + { + "epoch": 2.8447828904731045, + "grad_norm": 0.0715525895357132, + "learning_rate": 6.417930599936991e-05, + "loss": 0.2392, + "step": 35116 + }, + { + "epoch": 2.8448639014906028, + "grad_norm": 0.06464890390634537, + "learning_rate": 6.41748053467753e-05, + "loss": 0.2321, + "step": 35117 + }, + { + "epoch": 2.844944912508101, + "grad_norm": 0.07697032392024994, + "learning_rate": 6.417030469418066e-05, + "loss": 0.2271, + "step": 35118 + }, + { + "epoch": 2.8450259235255997, + "grad_norm": 0.07292238622903824, + "learning_rate": 6.416580404158603e-05, + "loss": 0.2573, + "step": 35119 + }, + { + "epoch": 2.845106934543098, + "grad_norm": 0.07538817822933197, + "learning_rate": 6.416130338899142e-05, + "loss": 0.2529, + "step": 35120 + }, + { + "epoch": 2.845187945560596, + "grad_norm": 0.08894102275371552, + "learning_rate": 6.415680273639678e-05, + "loss": 0.2783, + "step": 35121 + }, + { + "epoch": 2.845268956578095, + "grad_norm": 0.06148630753159523, + "learning_rate": 6.415230208380215e-05, + "loss": 0.2373, + "step": 35122 + }, + { + "epoch": 2.845349967595593, + "grad_norm": 0.06874866038560867, + "learning_rate": 6.414780143120754e-05, + "loss": 0.2085, + "step": 35123 + }, + { + "epoch": 2.8454309786130914, + "grad_norm": 0.06734011322259903, + "learning_rate": 6.41433007786129e-05, + "loss": 0.2627, + "step": 35124 + }, + { + "epoch": 2.8455119896305896, + "grad_norm": 0.07099562883377075, + "learning_rate": 6.413880012601827e-05, + "loss": 0.2366, + "step": 35125 + }, + { + "epoch": 2.8455930006480883, + "grad_norm": 0.06782719492912292, + "learning_rate": 6.413429947342366e-05, + "loss": 0.2591, + "step": 35126 + }, + { + "epoch": 2.8456740116655865, + "grad_norm": 0.08088771253824234, + "learning_rate": 6.412979882082902e-05, + "loss": 0.2627, + "step": 35127 + }, + { + "epoch": 2.845755022683085, + "grad_norm": 0.06747904419898987, + "learning_rate": 6.412529816823439e-05, + "loss": 0.2215, + "step": 35128 + }, + { + "epoch": 2.845836033700583, + "grad_norm": 0.09774980694055557, + "learning_rate": 6.412079751563978e-05, + "loss": 0.2136, + "step": 35129 + }, + { + "epoch": 2.8459170447180817, + "grad_norm": 0.06703547388315201, + "learning_rate": 6.411629686304514e-05, + "loss": 0.2568, + "step": 35130 + }, + { + "epoch": 2.84599805573558, + "grad_norm": 0.07806454598903656, + "learning_rate": 6.411179621045051e-05, + "loss": 0.2178, + "step": 35131 + }, + { + "epoch": 2.846079066753078, + "grad_norm": 0.0774211660027504, + "learning_rate": 6.41072955578559e-05, + "loss": 0.2296, + "step": 35132 + }, + { + "epoch": 2.846160077770577, + "grad_norm": 0.06930553913116455, + "learning_rate": 6.410279490526126e-05, + "loss": 0.2351, + "step": 35133 + }, + { + "epoch": 2.846241088788075, + "grad_norm": 0.0710282027721405, + "learning_rate": 6.409829425266663e-05, + "loss": 0.2244, + "step": 35134 + }, + { + "epoch": 2.8463220998055734, + "grad_norm": 0.05973135307431221, + "learning_rate": 6.409379360007202e-05, + "loss": 0.1993, + "step": 35135 + }, + { + "epoch": 2.846403110823072, + "grad_norm": 0.07437211275100708, + "learning_rate": 6.408929294747738e-05, + "loss": 0.253, + "step": 35136 + }, + { + "epoch": 2.8464841218405703, + "grad_norm": 0.06692270934581757, + "learning_rate": 6.408479229488276e-05, + "loss": 0.2403, + "step": 35137 + }, + { + "epoch": 2.8465651328580686, + "grad_norm": 0.06204549968242645, + 
"learning_rate": 6.408029164228814e-05, + "loss": 0.2558, + "step": 35138 + }, + { + "epoch": 2.8466461438755672, + "grad_norm": 0.05657990649342537, + "learning_rate": 6.40757909896935e-05, + "loss": 0.2377, + "step": 35139 + }, + { + "epoch": 2.8467271548930655, + "grad_norm": 0.08282967656850815, + "learning_rate": 6.407129033709888e-05, + "loss": 0.2422, + "step": 35140 + }, + { + "epoch": 2.8468081659105637, + "grad_norm": 0.06874528527259827, + "learning_rate": 6.406678968450426e-05, + "loss": 0.2538, + "step": 35141 + }, + { + "epoch": 2.8468891769280624, + "grad_norm": 0.08185140788555145, + "learning_rate": 6.406228903190962e-05, + "loss": 0.2704, + "step": 35142 + }, + { + "epoch": 2.8469701879455607, + "grad_norm": 0.07284029573202133, + "learning_rate": 6.405778837931501e-05, + "loss": 0.2421, + "step": 35143 + }, + { + "epoch": 2.847051198963059, + "grad_norm": 0.06483624130487442, + "learning_rate": 6.405328772672038e-05, + "loss": 0.1819, + "step": 35144 + }, + { + "epoch": 2.8471322099805576, + "grad_norm": 0.06488542258739471, + "learning_rate": 6.404878707412574e-05, + "loss": 0.2052, + "step": 35145 + }, + { + "epoch": 2.847213220998056, + "grad_norm": 0.06830263137817383, + "learning_rate": 6.404428642153113e-05, + "loss": 0.2134, + "step": 35146 + }, + { + "epoch": 2.847294232015554, + "grad_norm": 0.08114606142044067, + "learning_rate": 6.40397857689365e-05, + "loss": 0.2294, + "step": 35147 + }, + { + "epoch": 2.8473752430330523, + "grad_norm": 0.05794978141784668, + "learning_rate": 6.403528511634186e-05, + "loss": 0.2045, + "step": 35148 + }, + { + "epoch": 2.847456254050551, + "grad_norm": 0.06319063156843185, + "learning_rate": 6.403078446374725e-05, + "loss": 0.231, + "step": 35149 + }, + { + "epoch": 2.8475372650680493, + "grad_norm": 0.0754387155175209, + "learning_rate": 6.402628381115263e-05, + "loss": 0.2598, + "step": 35150 + }, + { + "epoch": 2.8476182760855475, + "grad_norm": 0.06007128581404686, + "learning_rate": 6.402178315855799e-05, + "loss": 0.2397, + "step": 35151 + }, + { + "epoch": 2.8476992871030458, + "grad_norm": 0.07417289167642593, + "learning_rate": 6.401728250596337e-05, + "loss": 0.2182, + "step": 35152 + }, + { + "epoch": 2.8477802981205445, + "grad_norm": 0.06433563679456711, + "learning_rate": 6.401278185336875e-05, + "loss": 0.2193, + "step": 35153 + }, + { + "epoch": 2.8478613091380427, + "grad_norm": 0.06929808855056763, + "learning_rate": 6.40082812007741e-05, + "loss": 0.2558, + "step": 35154 + }, + { + "epoch": 2.847942320155541, + "grad_norm": 0.07436833530664444, + "learning_rate": 6.40037805481795e-05, + "loss": 0.2258, + "step": 35155 + }, + { + "epoch": 2.8480233311730396, + "grad_norm": 0.05718601122498512, + "learning_rate": 6.399927989558487e-05, + "loss": 0.2283, + "step": 35156 + }, + { + "epoch": 2.848104342190538, + "grad_norm": 0.079722099006176, + "learning_rate": 6.399477924299023e-05, + "loss": 0.2145, + "step": 35157 + }, + { + "epoch": 2.848185353208036, + "grad_norm": 0.06823069602251053, + "learning_rate": 6.399027859039561e-05, + "loss": 0.1983, + "step": 35158 + }, + { + "epoch": 2.848266364225535, + "grad_norm": 0.07056091725826263, + "learning_rate": 6.398577793780099e-05, + "loss": 0.2287, + "step": 35159 + }, + { + "epoch": 2.848347375243033, + "grad_norm": 0.054329004138708115, + "learning_rate": 6.398127728520635e-05, + "loss": 0.2391, + "step": 35160 + }, + { + "epoch": 2.8484283862605313, + "grad_norm": 0.07334672659635544, + "learning_rate": 6.397677663261174e-05, + "loss": 0.2789, + "step": 35161 
+ }, + { + "epoch": 2.84850939727803, + "grad_norm": 0.08274146169424057, + "learning_rate": 6.397227598001711e-05, + "loss": 0.2236, + "step": 35162 + }, + { + "epoch": 2.8485904082955282, + "grad_norm": 0.07405499368906021, + "learning_rate": 6.396777532742247e-05, + "loss": 0.2411, + "step": 35163 + }, + { + "epoch": 2.8486714193130265, + "grad_norm": 0.07061372697353363, + "learning_rate": 6.396327467482786e-05, + "loss": 0.241, + "step": 35164 + }, + { + "epoch": 2.848752430330525, + "grad_norm": 0.08040562272071838, + "learning_rate": 6.395877402223323e-05, + "loss": 0.2407, + "step": 35165 + }, + { + "epoch": 2.8488334413480234, + "grad_norm": 0.07685498148202896, + "learning_rate": 6.395427336963859e-05, + "loss": 0.2458, + "step": 35166 + }, + { + "epoch": 2.8489144523655217, + "grad_norm": 0.08878312259912491, + "learning_rate": 6.394977271704398e-05, + "loss": 0.2675, + "step": 35167 + }, + { + "epoch": 2.8489954633830203, + "grad_norm": 0.06860477477312088, + "learning_rate": 6.394527206444935e-05, + "loss": 0.2683, + "step": 35168 + }, + { + "epoch": 2.8490764744005186, + "grad_norm": 0.06936057657003403, + "learning_rate": 6.394077141185472e-05, + "loss": 0.2105, + "step": 35169 + }, + { + "epoch": 2.849157485418017, + "grad_norm": 0.06778555363416672, + "learning_rate": 6.39362707592601e-05, + "loss": 0.2152, + "step": 35170 + }, + { + "epoch": 2.849238496435515, + "grad_norm": 0.07298450171947479, + "learning_rate": 6.393177010666547e-05, + "loss": 0.2174, + "step": 35171 + }, + { + "epoch": 2.8493195074530138, + "grad_norm": 0.07470021396875381, + "learning_rate": 6.392726945407084e-05, + "loss": 0.2394, + "step": 35172 + }, + { + "epoch": 2.849400518470512, + "grad_norm": 0.07039808481931686, + "learning_rate": 6.392276880147622e-05, + "loss": 0.2132, + "step": 35173 + }, + { + "epoch": 2.8494815294880103, + "grad_norm": 0.061103638261556625, + "learning_rate": 6.391826814888159e-05, + "loss": 0.2225, + "step": 35174 + }, + { + "epoch": 2.8495625405055085, + "grad_norm": 0.0838061049580574, + "learning_rate": 6.391376749628697e-05, + "loss": 0.2628, + "step": 35175 + }, + { + "epoch": 2.849643551523007, + "grad_norm": 0.0686267763376236, + "learning_rate": 6.390926684369234e-05, + "loss": 0.2291, + "step": 35176 + }, + { + "epoch": 2.8497245625405054, + "grad_norm": 0.058250222355127335, + "learning_rate": 6.390476619109771e-05, + "loss": 0.1935, + "step": 35177 + }, + { + "epoch": 2.8498055735580037, + "grad_norm": 0.0706809014081955, + "learning_rate": 6.390026553850309e-05, + "loss": 0.2664, + "step": 35178 + }, + { + "epoch": 2.8498865845755024, + "grad_norm": 0.06753183901309967, + "learning_rate": 6.389576488590846e-05, + "loss": 0.2202, + "step": 35179 + }, + { + "epoch": 2.8499675955930006, + "grad_norm": 0.07120547443628311, + "learning_rate": 6.389126423331383e-05, + "loss": 0.2449, + "step": 35180 + }, + { + "epoch": 2.850048606610499, + "grad_norm": 0.06694607436656952, + "learning_rate": 6.388676358071921e-05, + "loss": 0.265, + "step": 35181 + }, + { + "epoch": 2.8501296176279975, + "grad_norm": 0.0656026229262352, + "learning_rate": 6.388226292812458e-05, + "loss": 0.272, + "step": 35182 + }, + { + "epoch": 2.850210628645496, + "grad_norm": 0.06331050395965576, + "learning_rate": 6.387776227552995e-05, + "loss": 0.2309, + "step": 35183 + }, + { + "epoch": 2.850291639662994, + "grad_norm": 0.06337806582450867, + "learning_rate": 6.387326162293533e-05, + "loss": 0.2291, + "step": 35184 + }, + { + "epoch": 2.8503726506804927, + "grad_norm": 
0.06179683655500412, + "learning_rate": 6.38687609703407e-05, + "loss": 0.2544, + "step": 35185 + }, + { + "epoch": 2.850453661697991, + "grad_norm": 0.08526507019996643, + "learning_rate": 6.386426031774608e-05, + "loss": 0.2118, + "step": 35186 + }, + { + "epoch": 2.850534672715489, + "grad_norm": 0.0754719078540802, + "learning_rate": 6.385975966515145e-05, + "loss": 0.2297, + "step": 35187 + }, + { + "epoch": 2.850615683732988, + "grad_norm": 0.06728820502758026, + "learning_rate": 6.385525901255682e-05, + "loss": 0.2813, + "step": 35188 + }, + { + "epoch": 2.850696694750486, + "grad_norm": 0.06928353011608124, + "learning_rate": 6.38507583599622e-05, + "loss": 0.2412, + "step": 35189 + }, + { + "epoch": 2.8507777057679844, + "grad_norm": 0.07666745781898499, + "learning_rate": 6.384625770736757e-05, + "loss": 0.246, + "step": 35190 + }, + { + "epoch": 2.850858716785483, + "grad_norm": 0.058885589241981506, + "learning_rate": 6.384175705477294e-05, + "loss": 0.249, + "step": 35191 + }, + { + "epoch": 2.8509397278029813, + "grad_norm": 0.075236476957798, + "learning_rate": 6.383725640217832e-05, + "loss": 0.2784, + "step": 35192 + }, + { + "epoch": 2.8510207388204796, + "grad_norm": 0.06604667007923126, + "learning_rate": 6.383275574958369e-05, + "loss": 0.2177, + "step": 35193 + }, + { + "epoch": 2.851101749837978, + "grad_norm": 0.06537952274084091, + "learning_rate": 6.382825509698906e-05, + "loss": 0.222, + "step": 35194 + }, + { + "epoch": 2.8511827608554765, + "grad_norm": 0.07576059550046921, + "learning_rate": 6.382375444439445e-05, + "loss": 0.221, + "step": 35195 + }, + { + "epoch": 2.8512637718729748, + "grad_norm": 0.06259869784116745, + "learning_rate": 6.381925379179981e-05, + "loss": 0.228, + "step": 35196 + }, + { + "epoch": 2.851344782890473, + "grad_norm": 0.07308464497327805, + "learning_rate": 6.381475313920518e-05, + "loss": 0.2048, + "step": 35197 + }, + { + "epoch": 2.8514257939079712, + "grad_norm": 0.06591349095106125, + "learning_rate": 6.381025248661057e-05, + "loss": 0.2246, + "step": 35198 + }, + { + "epoch": 2.85150680492547, + "grad_norm": 0.0643618181347847, + "learning_rate": 6.380575183401593e-05, + "loss": 0.2277, + "step": 35199 + }, + { + "epoch": 2.851587815942968, + "grad_norm": 0.06636466085910797, + "learning_rate": 6.38012511814213e-05, + "loss": 0.2164, + "step": 35200 + }, + { + "epoch": 2.8516688269604664, + "grad_norm": 0.07473160326480865, + "learning_rate": 6.379675052882669e-05, + "loss": 0.2129, + "step": 35201 + }, + { + "epoch": 2.851749837977965, + "grad_norm": 0.06256754696369171, + "learning_rate": 6.379224987623205e-05, + "loss": 0.2112, + "step": 35202 + }, + { + "epoch": 2.8518308489954634, + "grad_norm": 0.06157509610056877, + "learning_rate": 6.378774922363743e-05, + "loss": 0.1868, + "step": 35203 + }, + { + "epoch": 2.8519118600129616, + "grad_norm": 0.0895833671092987, + "learning_rate": 6.378324857104281e-05, + "loss": 0.2341, + "step": 35204 + }, + { + "epoch": 2.8519928710304603, + "grad_norm": 0.061636537313461304, + "learning_rate": 6.377874791844817e-05, + "loss": 0.2464, + "step": 35205 + }, + { + "epoch": 2.8520738820479585, + "grad_norm": 0.07151246815919876, + "learning_rate": 6.377424726585355e-05, + "loss": 0.2118, + "step": 35206 + }, + { + "epoch": 2.8521548930654568, + "grad_norm": 0.08077570796012878, + "learning_rate": 6.376974661325893e-05, + "loss": 0.2486, + "step": 35207 + }, + { + "epoch": 2.8522359040829555, + "grad_norm": 0.07708319276571274, + "learning_rate": 6.37652459606643e-05, + "loss": 0.2665, 
+ "step": 35208 + }, + { + "epoch": 2.8523169151004537, + "grad_norm": 0.06794562935829163, + "learning_rate": 6.376074530806967e-05, + "loss": 0.228, + "step": 35209 + }, + { + "epoch": 2.852397926117952, + "grad_norm": 0.06809671968221664, + "learning_rate": 6.375624465547506e-05, + "loss": 0.2367, + "step": 35210 + }, + { + "epoch": 2.8524789371354506, + "grad_norm": 0.07280357927083969, + "learning_rate": 6.375174400288042e-05, + "loss": 0.2497, + "step": 35211 + }, + { + "epoch": 2.852559948152949, + "grad_norm": 0.07265602797269821, + "learning_rate": 6.374724335028579e-05, + "loss": 0.2702, + "step": 35212 + }, + { + "epoch": 2.852640959170447, + "grad_norm": 0.07723171263933182, + "learning_rate": 6.374274269769118e-05, + "loss": 0.2374, + "step": 35213 + }, + { + "epoch": 2.852721970187946, + "grad_norm": 0.07471165806055069, + "learning_rate": 6.373824204509654e-05, + "loss": 0.2167, + "step": 35214 + }, + { + "epoch": 2.852802981205444, + "grad_norm": 0.06255011260509491, + "learning_rate": 6.373374139250191e-05, + "loss": 0.2229, + "step": 35215 + }, + { + "epoch": 2.8528839922229423, + "grad_norm": 0.07400676608085632, + "learning_rate": 6.37292407399073e-05, + "loss": 0.2626, + "step": 35216 + }, + { + "epoch": 2.8529650032404406, + "grad_norm": 0.07762713730335236, + "learning_rate": 6.372474008731266e-05, + "loss": 0.2496, + "step": 35217 + }, + { + "epoch": 2.8530460142579392, + "grad_norm": 0.07321801781654358, + "learning_rate": 6.372023943471803e-05, + "loss": 0.249, + "step": 35218 + }, + { + "epoch": 2.8531270252754375, + "grad_norm": 0.0633714348077774, + "learning_rate": 6.371573878212342e-05, + "loss": 0.2319, + "step": 35219 + }, + { + "epoch": 2.8532080362929357, + "grad_norm": 0.06907516717910767, + "learning_rate": 6.371123812952878e-05, + "loss": 0.2051, + "step": 35220 + }, + { + "epoch": 2.853289047310434, + "grad_norm": 0.0761343389749527, + "learning_rate": 6.370673747693416e-05, + "loss": 0.2348, + "step": 35221 + }, + { + "epoch": 2.8533700583279327, + "grad_norm": 0.05884816497564316, + "learning_rate": 6.370223682433954e-05, + "loss": 0.2294, + "step": 35222 + }, + { + "epoch": 2.853451069345431, + "grad_norm": 0.08518347144126892, + "learning_rate": 6.36977361717449e-05, + "loss": 0.268, + "step": 35223 + }, + { + "epoch": 2.853532080362929, + "grad_norm": 0.06178712844848633, + "learning_rate": 6.369323551915029e-05, + "loss": 0.2653, + "step": 35224 + }, + { + "epoch": 2.853613091380428, + "grad_norm": 0.0769021064043045, + "learning_rate": 6.368873486655566e-05, + "loss": 0.2757, + "step": 35225 + }, + { + "epoch": 2.853694102397926, + "grad_norm": 0.08336912095546722, + "learning_rate": 6.368423421396102e-05, + "loss": 0.2508, + "step": 35226 + }, + { + "epoch": 2.8537751134154243, + "grad_norm": 0.08090720325708389, + "learning_rate": 6.36797335613664e-05, + "loss": 0.2616, + "step": 35227 + }, + { + "epoch": 2.853856124432923, + "grad_norm": 0.07356841117143631, + "learning_rate": 6.367523290877178e-05, + "loss": 0.2864, + "step": 35228 + }, + { + "epoch": 2.8539371354504213, + "grad_norm": 0.08213097602128983, + "learning_rate": 6.367073225617714e-05, + "loss": 0.2625, + "step": 35229 + }, + { + "epoch": 2.8540181464679195, + "grad_norm": 0.06912447512149811, + "learning_rate": 6.366623160358253e-05, + "loss": 0.2115, + "step": 35230 + }, + { + "epoch": 2.854099157485418, + "grad_norm": 0.07546108961105347, + "learning_rate": 6.36617309509879e-05, + "loss": 0.2738, + "step": 35231 + }, + { + "epoch": 2.8541801685029164, + "grad_norm": 
0.06090640276670456, + "learning_rate": 6.365723029839326e-05, + "loss": 0.2212, + "step": 35232 + }, + { + "epoch": 2.8542611795204147, + "grad_norm": 0.06261391192674637, + "learning_rate": 6.365272964579865e-05, + "loss": 0.2001, + "step": 35233 + }, + { + "epoch": 2.8543421905379134, + "grad_norm": 0.0737534910440445, + "learning_rate": 6.364822899320402e-05, + "loss": 0.2346, + "step": 35234 + }, + { + "epoch": 2.8544232015554116, + "grad_norm": 0.07249848544597626, + "learning_rate": 6.364372834060938e-05, + "loss": 0.2115, + "step": 35235 + }, + { + "epoch": 2.85450421257291, + "grad_norm": 0.06756263226270676, + "learning_rate": 6.363922768801477e-05, + "loss": 0.2579, + "step": 35236 + }, + { + "epoch": 2.8545852235904086, + "grad_norm": 0.0735563412308693, + "learning_rate": 6.363472703542014e-05, + "loss": 0.2398, + "step": 35237 + }, + { + "epoch": 2.854666234607907, + "grad_norm": 0.06471346318721771, + "learning_rate": 6.36302263828255e-05, + "loss": 0.2311, + "step": 35238 + }, + { + "epoch": 2.854747245625405, + "grad_norm": 0.06796788424253464, + "learning_rate": 6.362572573023089e-05, + "loss": 0.23, + "step": 35239 + }, + { + "epoch": 2.8548282566429033, + "grad_norm": 0.06186298653483391, + "learning_rate": 6.362122507763626e-05, + "loss": 0.2499, + "step": 35240 + }, + { + "epoch": 2.8549092676604015, + "grad_norm": 0.08557670563459396, + "learning_rate": 6.361672442504162e-05, + "loss": 0.2748, + "step": 35241 + }, + { + "epoch": 2.8549902786779002, + "grad_norm": 0.09569763392210007, + "learning_rate": 6.361222377244701e-05, + "loss": 0.2927, + "step": 35242 + }, + { + "epoch": 2.8550712896953985, + "grad_norm": 0.06551126390695572, + "learning_rate": 6.360772311985238e-05, + "loss": 0.2238, + "step": 35243 + }, + { + "epoch": 2.8551523007128967, + "grad_norm": 0.07863879203796387, + "learning_rate": 6.360322246725774e-05, + "loss": 0.2576, + "step": 35244 + }, + { + "epoch": 2.8552333117303954, + "grad_norm": 0.08634800463914871, + "learning_rate": 6.359872181466313e-05, + "loss": 0.2323, + "step": 35245 + }, + { + "epoch": 2.8553143227478937, + "grad_norm": 0.06437304615974426, + "learning_rate": 6.35942211620685e-05, + "loss": 0.2183, + "step": 35246 + }, + { + "epoch": 2.855395333765392, + "grad_norm": 0.06253159791231155, + "learning_rate": 6.358972050947387e-05, + "loss": 0.2394, + "step": 35247 + }, + { + "epoch": 2.8554763447828906, + "grad_norm": 0.07356922328472137, + "learning_rate": 6.358521985687925e-05, + "loss": 0.2593, + "step": 35248 + }, + { + "epoch": 2.855557355800389, + "grad_norm": 0.08020610362291336, + "learning_rate": 6.358071920428463e-05, + "loss": 0.2317, + "step": 35249 + }, + { + "epoch": 2.855638366817887, + "grad_norm": 0.07049740850925446, + "learning_rate": 6.357621855169e-05, + "loss": 0.2635, + "step": 35250 + }, + { + "epoch": 2.8557193778353858, + "grad_norm": 0.0626765564084053, + "learning_rate": 6.357171789909537e-05, + "loss": 0.2446, + "step": 35251 + }, + { + "epoch": 2.855800388852884, + "grad_norm": 0.06806015223264694, + "learning_rate": 6.356721724650075e-05, + "loss": 0.2239, + "step": 35252 + }, + { + "epoch": 2.8558813998703823, + "grad_norm": 0.0689484253525734, + "learning_rate": 6.356271659390612e-05, + "loss": 0.2429, + "step": 35253 + }, + { + "epoch": 2.855962410887881, + "grad_norm": 0.05486216023564339, + "learning_rate": 6.35582159413115e-05, + "loss": 0.2164, + "step": 35254 + }, + { + "epoch": 2.856043421905379, + "grad_norm": 0.07790280878543854, + "learning_rate": 6.355371528871687e-05, + "loss": 0.2526, 
+ "step": 35255 + }, + { + "epoch": 2.8561244329228774, + "grad_norm": 0.07450840622186661, + "learning_rate": 6.354921463612224e-05, + "loss": 0.27, + "step": 35256 + }, + { + "epoch": 2.856205443940376, + "grad_norm": 0.06325256079435349, + "learning_rate": 6.354471398352761e-05, + "loss": 0.2214, + "step": 35257 + }, + { + "epoch": 2.8562864549578744, + "grad_norm": 0.05930173397064209, + "learning_rate": 6.354021333093299e-05, + "loss": 0.1991, + "step": 35258 + }, + { + "epoch": 2.8563674659753726, + "grad_norm": 0.08211062848567963, + "learning_rate": 6.353571267833836e-05, + "loss": 0.2146, + "step": 35259 + }, + { + "epoch": 2.8564484769928713, + "grad_norm": 0.06989636272192001, + "learning_rate": 6.353121202574374e-05, + "loss": 0.2205, + "step": 35260 + }, + { + "epoch": 2.8565294880103695, + "grad_norm": 0.07901213318109512, + "learning_rate": 6.352671137314911e-05, + "loss": 0.2526, + "step": 35261 + }, + { + "epoch": 2.856610499027868, + "grad_norm": 0.07214916497468948, + "learning_rate": 6.352221072055448e-05, + "loss": 0.2327, + "step": 35262 + }, + { + "epoch": 2.856691510045366, + "grad_norm": 0.06947073340415955, + "learning_rate": 6.351771006795986e-05, + "loss": 0.2287, + "step": 35263 + }, + { + "epoch": 2.8567725210628643, + "grad_norm": 0.06751016527414322, + "learning_rate": 6.351320941536523e-05, + "loss": 0.2027, + "step": 35264 + }, + { + "epoch": 2.856853532080363, + "grad_norm": 0.07422146946191788, + "learning_rate": 6.35087087627706e-05, + "loss": 0.279, + "step": 35265 + }, + { + "epoch": 2.856934543097861, + "grad_norm": 0.057897377759218216, + "learning_rate": 6.350420811017598e-05, + "loss": 0.2344, + "step": 35266 + }, + { + "epoch": 2.8570155541153595, + "grad_norm": 0.0692281723022461, + "learning_rate": 6.349970745758135e-05, + "loss": 0.1927, + "step": 35267 + }, + { + "epoch": 2.857096565132858, + "grad_norm": 0.06540977954864502, + "learning_rate": 6.349520680498672e-05, + "loss": 0.2654, + "step": 35268 + }, + { + "epoch": 2.8571775761503564, + "grad_norm": 0.06885252892971039, + "learning_rate": 6.34907061523921e-05, + "loss": 0.2198, + "step": 35269 + }, + { + "epoch": 2.8572585871678546, + "grad_norm": 0.0721738189458847, + "learning_rate": 6.348620549979747e-05, + "loss": 0.2105, + "step": 35270 + }, + { + "epoch": 2.8573395981853533, + "grad_norm": 0.07012255489826202, + "learning_rate": 6.348170484720285e-05, + "loss": 0.2645, + "step": 35271 + }, + { + "epoch": 2.8574206092028516, + "grad_norm": 0.07275258749723434, + "learning_rate": 6.347720419460822e-05, + "loss": 0.2459, + "step": 35272 + }, + { + "epoch": 2.85750162022035, + "grad_norm": 0.06511726975440979, + "learning_rate": 6.347270354201359e-05, + "loss": 0.2477, + "step": 35273 + }, + { + "epoch": 2.8575826312378485, + "grad_norm": 0.07181781530380249, + "learning_rate": 6.346820288941897e-05, + "loss": 0.257, + "step": 35274 + }, + { + "epoch": 2.8576636422553467, + "grad_norm": 0.082255057990551, + "learning_rate": 6.346370223682434e-05, + "loss": 0.234, + "step": 35275 + }, + { + "epoch": 2.857744653272845, + "grad_norm": 0.07187642902135849, + "learning_rate": 6.345920158422973e-05, + "loss": 0.218, + "step": 35276 + }, + { + "epoch": 2.8578256642903437, + "grad_norm": 0.08506189286708832, + "learning_rate": 6.345470093163509e-05, + "loss": 0.2611, + "step": 35277 + }, + { + "epoch": 2.857906675307842, + "grad_norm": 0.062427401542663574, + "learning_rate": 6.345020027904046e-05, + "loss": 0.2199, + "step": 35278 + }, + { + "epoch": 2.85798768632534, + "grad_norm": 
0.07684777677059174, + "learning_rate": 6.344569962644585e-05, + "loss": 0.2513, + "step": 35279 + }, + { + "epoch": 2.858068697342839, + "grad_norm": 0.06772081553936005, + "learning_rate": 6.344119897385121e-05, + "loss": 0.2081, + "step": 35280 + }, + { + "epoch": 2.858149708360337, + "grad_norm": 0.07348523288965225, + "learning_rate": 6.343669832125658e-05, + "loss": 0.2299, + "step": 35281 + }, + { + "epoch": 2.8582307193778353, + "grad_norm": 0.06706465035676956, + "learning_rate": 6.343219766866197e-05, + "loss": 0.2021, + "step": 35282 + }, + { + "epoch": 2.858311730395334, + "grad_norm": 0.07620218396186829, + "learning_rate": 6.342769701606733e-05, + "loss": 0.2334, + "step": 35283 + }, + { + "epoch": 2.8583927414128323, + "grad_norm": 0.06165030971169472, + "learning_rate": 6.34231963634727e-05, + "loss": 0.2203, + "step": 35284 + }, + { + "epoch": 2.8584737524303305, + "grad_norm": 0.07112786173820496, + "learning_rate": 6.341869571087809e-05, + "loss": 0.2484, + "step": 35285 + }, + { + "epoch": 2.8585547634478288, + "grad_norm": 0.08486390858888626, + "learning_rate": 6.341419505828345e-05, + "loss": 0.2339, + "step": 35286 + }, + { + "epoch": 2.858635774465327, + "grad_norm": 0.06826337426900864, + "learning_rate": 6.340969440568882e-05, + "loss": 0.2212, + "step": 35287 + }, + { + "epoch": 2.8587167854828257, + "grad_norm": 0.06725852191448212, + "learning_rate": 6.340519375309421e-05, + "loss": 0.2556, + "step": 35288 + }, + { + "epoch": 2.858797796500324, + "grad_norm": 0.06825444102287292, + "learning_rate": 6.340069310049957e-05, + "loss": 0.2074, + "step": 35289 + }, + { + "epoch": 2.858878807517822, + "grad_norm": 0.08633331209421158, + "learning_rate": 6.339619244790494e-05, + "loss": 0.251, + "step": 35290 + }, + { + "epoch": 2.858959818535321, + "grad_norm": 0.05289662256836891, + "learning_rate": 6.339169179531033e-05, + "loss": 0.2458, + "step": 35291 + }, + { + "epoch": 2.859040829552819, + "grad_norm": 0.06766065210103989, + "learning_rate": 6.338719114271569e-05, + "loss": 0.2007, + "step": 35292 + }, + { + "epoch": 2.8591218405703174, + "grad_norm": 0.07813969999551773, + "learning_rate": 6.338269049012106e-05, + "loss": 0.2458, + "step": 35293 + }, + { + "epoch": 2.859202851587816, + "grad_norm": 0.07089391350746155, + "learning_rate": 6.337818983752645e-05, + "loss": 0.2372, + "step": 35294 + }, + { + "epoch": 2.8592838626053143, + "grad_norm": 0.0717868059873581, + "learning_rate": 6.337368918493181e-05, + "loss": 0.2528, + "step": 35295 + }, + { + "epoch": 2.8593648736228126, + "grad_norm": 0.07486657053232193, + "learning_rate": 6.336918853233719e-05, + "loss": 0.2474, + "step": 35296 + }, + { + "epoch": 2.8594458846403112, + "grad_norm": 0.06870671361684799, + "learning_rate": 6.336468787974257e-05, + "loss": 0.2606, + "step": 35297 + }, + { + "epoch": 2.8595268956578095, + "grad_norm": 0.057232122868299484, + "learning_rate": 6.336018722714793e-05, + "loss": 0.233, + "step": 35298 + }, + { + "epoch": 2.8596079066753077, + "grad_norm": 0.07663465291261673, + "learning_rate": 6.33556865745533e-05, + "loss": 0.2177, + "step": 35299 + }, + { + "epoch": 2.8596889176928064, + "grad_norm": 0.050945814698934555, + "learning_rate": 6.335118592195869e-05, + "loss": 0.2268, + "step": 35300 + }, + { + "epoch": 2.8597699287103047, + "grad_norm": 0.06260829418897629, + "learning_rate": 6.334668526936405e-05, + "loss": 0.2582, + "step": 35301 + }, + { + "epoch": 2.859850939727803, + "grad_norm": 0.07052972912788391, + "learning_rate": 6.334218461676944e-05, + 
"loss": 0.2534, + "step": 35302 + }, + { + "epoch": 2.8599319507453016, + "grad_norm": 0.07194507867097855, + "learning_rate": 6.333768396417481e-05, + "loss": 0.2344, + "step": 35303 + }, + { + "epoch": 2.8600129617628, + "grad_norm": 0.07999186217784882, + "learning_rate": 6.333318331158017e-05, + "loss": 0.2746, + "step": 35304 + }, + { + "epoch": 2.860093972780298, + "grad_norm": 0.05699498951435089, + "learning_rate": 6.332868265898556e-05, + "loss": 0.2543, + "step": 35305 + }, + { + "epoch": 2.8601749837977968, + "grad_norm": 0.06230924651026726, + "learning_rate": 6.332418200639093e-05, + "loss": 0.22, + "step": 35306 + }, + { + "epoch": 2.860255994815295, + "grad_norm": 0.0675150528550148, + "learning_rate": 6.33196813537963e-05, + "loss": 0.2309, + "step": 35307 + }, + { + "epoch": 2.8603370058327933, + "grad_norm": 0.06433086842298508, + "learning_rate": 6.331518070120168e-05, + "loss": 0.2268, + "step": 35308 + }, + { + "epoch": 2.8604180168502915, + "grad_norm": 0.06448919326066971, + "learning_rate": 6.331068004860706e-05, + "loss": 0.2167, + "step": 35309 + }, + { + "epoch": 2.8604990278677898, + "grad_norm": 0.058945026248693466, + "learning_rate": 6.330617939601242e-05, + "loss": 0.2175, + "step": 35310 + }, + { + "epoch": 2.8605800388852884, + "grad_norm": 0.062296051532030106, + "learning_rate": 6.33016787434178e-05, + "loss": 0.2121, + "step": 35311 + }, + { + "epoch": 2.8606610499027867, + "grad_norm": 0.06867983937263489, + "learning_rate": 6.329717809082318e-05, + "loss": 0.2024, + "step": 35312 + }, + { + "epoch": 2.860742060920285, + "grad_norm": 0.07533132284879684, + "learning_rate": 6.329267743822854e-05, + "loss": 0.2656, + "step": 35313 + }, + { + "epoch": 2.8608230719377836, + "grad_norm": 0.07362021505832672, + "learning_rate": 6.328817678563392e-05, + "loss": 0.235, + "step": 35314 + }, + { + "epoch": 2.860904082955282, + "grad_norm": 0.08826854079961777, + "learning_rate": 6.32836761330393e-05, + "loss": 0.2769, + "step": 35315 + }, + { + "epoch": 2.86098509397278, + "grad_norm": 0.07047184556722641, + "learning_rate": 6.327917548044466e-05, + "loss": 0.2569, + "step": 35316 + }, + { + "epoch": 2.861066104990279, + "grad_norm": 0.06254454702138901, + "learning_rate": 6.327467482785004e-05, + "loss": 0.2417, + "step": 35317 + }, + { + "epoch": 2.861147116007777, + "grad_norm": 0.08397172391414642, + "learning_rate": 6.327017417525542e-05, + "loss": 0.2571, + "step": 35318 + }, + { + "epoch": 2.8612281270252753, + "grad_norm": 0.06830441951751709, + "learning_rate": 6.326567352266078e-05, + "loss": 0.2049, + "step": 35319 + }, + { + "epoch": 2.861309138042774, + "grad_norm": 0.06679099053144455, + "learning_rate": 6.326117287006617e-05, + "loss": 0.2213, + "step": 35320 + }, + { + "epoch": 2.8613901490602722, + "grad_norm": 0.07950352877378464, + "learning_rate": 6.325667221747154e-05, + "loss": 0.2821, + "step": 35321 + }, + { + "epoch": 2.8614711600777705, + "grad_norm": 0.06094673275947571, + "learning_rate": 6.32521715648769e-05, + "loss": 0.2486, + "step": 35322 + }, + { + "epoch": 2.861552171095269, + "grad_norm": 0.07335047423839569, + "learning_rate": 6.324767091228229e-05, + "loss": 0.2606, + "step": 35323 + }, + { + "epoch": 2.8616331821127674, + "grad_norm": 0.06174264848232269, + "learning_rate": 6.324317025968766e-05, + "loss": 0.2105, + "step": 35324 + }, + { + "epoch": 2.8617141931302656, + "grad_norm": 0.07181406766176224, + "learning_rate": 6.323866960709302e-05, + "loss": 0.2196, + "step": 35325 + }, + { + "epoch": 2.8617952041477643, + 
"grad_norm": 0.060329336673021317, + "learning_rate": 6.323416895449841e-05, + "loss": 0.2211, + "step": 35326 + }, + { + "epoch": 2.8618762151652626, + "grad_norm": 0.06322979927062988, + "learning_rate": 6.322966830190378e-05, + "loss": 0.2337, + "step": 35327 + }, + { + "epoch": 2.861957226182761, + "grad_norm": 0.0658821389079094, + "learning_rate": 6.322516764930915e-05, + "loss": 0.2367, + "step": 35328 + }, + { + "epoch": 2.862038237200259, + "grad_norm": 0.07008875906467438, + "learning_rate": 6.322066699671453e-05, + "loss": 0.2247, + "step": 35329 + }, + { + "epoch": 2.8621192482177578, + "grad_norm": 0.07550831139087677, + "learning_rate": 6.32161663441199e-05, + "loss": 0.2494, + "step": 35330 + }, + { + "epoch": 2.862200259235256, + "grad_norm": 0.06202748790383339, + "learning_rate": 6.321166569152527e-05, + "loss": 0.262, + "step": 35331 + }, + { + "epoch": 2.8622812702527543, + "grad_norm": 0.06371939927339554, + "learning_rate": 6.320716503893065e-05, + "loss": 0.2553, + "step": 35332 + }, + { + "epoch": 2.8623622812702525, + "grad_norm": 0.06563441455364227, + "learning_rate": 6.320266438633602e-05, + "loss": 0.2378, + "step": 35333 + }, + { + "epoch": 2.862443292287751, + "grad_norm": 0.061470042914152145, + "learning_rate": 6.31981637337414e-05, + "loss": 0.2063, + "step": 35334 + }, + { + "epoch": 2.8625243033052494, + "grad_norm": 0.07362918555736542, + "learning_rate": 6.319366308114677e-05, + "loss": 0.2389, + "step": 35335 + }, + { + "epoch": 2.8626053143227477, + "grad_norm": 0.06169411167502403, + "learning_rate": 6.318916242855214e-05, + "loss": 0.2074, + "step": 35336 + }, + { + "epoch": 2.8626863253402464, + "grad_norm": 0.07283011823892593, + "learning_rate": 6.318466177595752e-05, + "loss": 0.2238, + "step": 35337 + }, + { + "epoch": 2.8627673363577446, + "grad_norm": 0.06744706630706787, + "learning_rate": 6.318016112336289e-05, + "loss": 0.272, + "step": 35338 + }, + { + "epoch": 2.862848347375243, + "grad_norm": 0.0716882050037384, + "learning_rate": 6.317566047076826e-05, + "loss": 0.2215, + "step": 35339 + }, + { + "epoch": 2.8629293583927415, + "grad_norm": 0.05441503971815109, + "learning_rate": 6.317115981817364e-05, + "loss": 0.1988, + "step": 35340 + }, + { + "epoch": 2.86301036941024, + "grad_norm": 0.0689413994550705, + "learning_rate": 6.316665916557901e-05, + "loss": 0.2252, + "step": 35341 + }, + { + "epoch": 2.863091380427738, + "grad_norm": 0.061735041439533234, + "learning_rate": 6.316215851298438e-05, + "loss": 0.232, + "step": 35342 + }, + { + "epoch": 2.8631723914452367, + "grad_norm": 0.09404154121875763, + "learning_rate": 6.315765786038976e-05, + "loss": 0.2105, + "step": 35343 + }, + { + "epoch": 2.863253402462735, + "grad_norm": 0.0623144656419754, + "learning_rate": 6.315315720779513e-05, + "loss": 0.2107, + "step": 35344 + }, + { + "epoch": 2.863334413480233, + "grad_norm": 0.062484726309776306, + "learning_rate": 6.31486565552005e-05, + "loss": 0.2606, + "step": 35345 + }, + { + "epoch": 2.863415424497732, + "grad_norm": 0.07301829010248184, + "learning_rate": 6.314415590260588e-05, + "loss": 0.2335, + "step": 35346 + }, + { + "epoch": 2.86349643551523, + "grad_norm": 0.06890372931957245, + "learning_rate": 6.313965525001125e-05, + "loss": 0.2306, + "step": 35347 + }, + { + "epoch": 2.8635774465327284, + "grad_norm": 0.07791730016469955, + "learning_rate": 6.313515459741663e-05, + "loss": 0.2477, + "step": 35348 + }, + { + "epoch": 2.863658457550227, + "grad_norm": 0.07661699503660202, + "learning_rate": 6.3130653944822e-05, + 
"loss": 0.2427, + "step": 35349 + }, + { + "epoch": 2.8637394685677253, + "grad_norm": 0.07754947990179062, + "learning_rate": 6.312615329222737e-05, + "loss": 0.2412, + "step": 35350 + }, + { + "epoch": 2.8638204795852236, + "grad_norm": 0.07087945193052292, + "learning_rate": 6.312165263963275e-05, + "loss": 0.269, + "step": 35351 + }, + { + "epoch": 2.863901490602722, + "grad_norm": 0.0750063881278038, + "learning_rate": 6.311715198703812e-05, + "loss": 0.2525, + "step": 35352 + }, + { + "epoch": 2.8639825016202205, + "grad_norm": 0.06159652769565582, + "learning_rate": 6.31126513344435e-05, + "loss": 0.2505, + "step": 35353 + }, + { + "epoch": 2.8640635126377187, + "grad_norm": 0.06571204215288162, + "learning_rate": 6.310815068184888e-05, + "loss": 0.2329, + "step": 35354 + }, + { + "epoch": 2.864144523655217, + "grad_norm": 0.06939777731895447, + "learning_rate": 6.310365002925424e-05, + "loss": 0.2651, + "step": 35355 + }, + { + "epoch": 2.8642255346727152, + "grad_norm": 0.06422492116689682, + "learning_rate": 6.309914937665961e-05, + "loss": 0.2443, + "step": 35356 + }, + { + "epoch": 2.864306545690214, + "grad_norm": 0.07563510537147522, + "learning_rate": 6.3094648724065e-05, + "loss": 0.2143, + "step": 35357 + }, + { + "epoch": 2.864387556707712, + "grad_norm": 0.07055597752332687, + "learning_rate": 6.309014807147036e-05, + "loss": 0.2762, + "step": 35358 + }, + { + "epoch": 2.8644685677252104, + "grad_norm": 0.07962583005428314, + "learning_rate": 6.308564741887574e-05, + "loss": 0.2304, + "step": 35359 + }, + { + "epoch": 2.864549578742709, + "grad_norm": 0.07300310581922531, + "learning_rate": 6.308114676628112e-05, + "loss": 0.2459, + "step": 35360 + }, + { + "epoch": 2.8646305897602073, + "grad_norm": 0.06624911725521088, + "learning_rate": 6.307664611368648e-05, + "loss": 0.2411, + "step": 35361 + }, + { + "epoch": 2.8647116007777056, + "grad_norm": 0.07704348862171173, + "learning_rate": 6.307214546109186e-05, + "loss": 0.2419, + "step": 35362 + }, + { + "epoch": 2.8647926117952043, + "grad_norm": 0.07485693693161011, + "learning_rate": 6.306764480849724e-05, + "loss": 0.2371, + "step": 35363 + }, + { + "epoch": 2.8648736228127025, + "grad_norm": 0.06935972720384598, + "learning_rate": 6.30631441559026e-05, + "loss": 0.2247, + "step": 35364 + }, + { + "epoch": 2.8649546338302008, + "grad_norm": 0.07739797234535217, + "learning_rate": 6.305864350330798e-05, + "loss": 0.2283, + "step": 35365 + }, + { + "epoch": 2.8650356448476995, + "grad_norm": 0.06980100274085999, + "learning_rate": 6.305414285071336e-05, + "loss": 0.2481, + "step": 35366 + }, + { + "epoch": 2.8651166558651977, + "grad_norm": 0.0687079057097435, + "learning_rate": 6.304964219811872e-05, + "loss": 0.2261, + "step": 35367 + }, + { + "epoch": 2.865197666882696, + "grad_norm": 0.060128916054964066, + "learning_rate": 6.30451415455241e-05, + "loss": 0.2552, + "step": 35368 + }, + { + "epoch": 2.8652786779001946, + "grad_norm": 0.08529718220233917, + "learning_rate": 6.304064089292949e-05, + "loss": 0.2835, + "step": 35369 + }, + { + "epoch": 2.865359688917693, + "grad_norm": 0.0598611943423748, + "learning_rate": 6.303614024033485e-05, + "loss": 0.202, + "step": 35370 + }, + { + "epoch": 2.865440699935191, + "grad_norm": 0.07495393604040146, + "learning_rate": 6.303163958774022e-05, + "loss": 0.2317, + "step": 35371 + }, + { + "epoch": 2.86552171095269, + "grad_norm": 0.057052772492170334, + "learning_rate": 6.30271389351456e-05, + "loss": 0.2322, + "step": 35372 + }, + { + "epoch": 2.865602721970188, + 
"grad_norm": 0.06384529173374176, + "learning_rate": 6.302263828255097e-05, + "loss": 0.2231, + "step": 35373 + }, + { + "epoch": 2.8656837329876863, + "grad_norm": 0.06650028377771378, + "learning_rate": 6.301813762995634e-05, + "loss": 0.2609, + "step": 35374 + }, + { + "epoch": 2.8657647440051845, + "grad_norm": 0.06612016260623932, + "learning_rate": 6.301363697736173e-05, + "loss": 0.231, + "step": 35375 + }, + { + "epoch": 2.8658457550226832, + "grad_norm": 0.059364426881074905, + "learning_rate": 6.300913632476709e-05, + "loss": 0.2322, + "step": 35376 + }, + { + "epoch": 2.8659267660401815, + "grad_norm": 0.06600163877010345, + "learning_rate": 6.300463567217246e-05, + "loss": 0.2155, + "step": 35377 + }, + { + "epoch": 2.8660077770576797, + "grad_norm": 0.0754174292087555, + "learning_rate": 6.300013501957785e-05, + "loss": 0.246, + "step": 35378 + }, + { + "epoch": 2.866088788075178, + "grad_norm": 0.08635005354881287, + "learning_rate": 6.299563436698321e-05, + "loss": 0.247, + "step": 35379 + }, + { + "epoch": 2.8661697990926767, + "grad_norm": 0.06976178288459778, + "learning_rate": 6.29911337143886e-05, + "loss": 0.2224, + "step": 35380 + }, + { + "epoch": 2.866250810110175, + "grad_norm": 0.06297709047794342, + "learning_rate": 6.298663306179397e-05, + "loss": 0.2141, + "step": 35381 + }, + { + "epoch": 2.866331821127673, + "grad_norm": 0.06248379126191139, + "learning_rate": 6.298213240919933e-05, + "loss": 0.2168, + "step": 35382 + }, + { + "epoch": 2.866412832145172, + "grad_norm": 0.060140691697597504, + "learning_rate": 6.297763175660472e-05, + "loss": 0.2225, + "step": 35383 + }, + { + "epoch": 2.86649384316267, + "grad_norm": 0.07628795504570007, + "learning_rate": 6.297313110401009e-05, + "loss": 0.2349, + "step": 35384 + }, + { + "epoch": 2.8665748541801683, + "grad_norm": 0.06263097375631332, + "learning_rate": 6.296863045141545e-05, + "loss": 0.2161, + "step": 35385 + }, + { + "epoch": 2.866655865197667, + "grad_norm": 0.09367737919092178, + "learning_rate": 6.296412979882084e-05, + "loss": 0.2256, + "step": 35386 + }, + { + "epoch": 2.8667368762151653, + "grad_norm": 0.07277649641036987, + "learning_rate": 6.295962914622621e-05, + "loss": 0.2514, + "step": 35387 + }, + { + "epoch": 2.8668178872326635, + "grad_norm": 0.07186709344387054, + "learning_rate": 6.295512849363157e-05, + "loss": 0.2418, + "step": 35388 + }, + { + "epoch": 2.866898898250162, + "grad_norm": 0.06744267791509628, + "learning_rate": 6.295062784103696e-05, + "loss": 0.2127, + "step": 35389 + }, + { + "epoch": 2.8669799092676604, + "grad_norm": 0.06757162511348724, + "learning_rate": 6.294612718844233e-05, + "loss": 0.2476, + "step": 35390 + }, + { + "epoch": 2.8670609202851587, + "grad_norm": 0.061270441859960556, + "learning_rate": 6.294162653584769e-05, + "loss": 0.2302, + "step": 35391 + }, + { + "epoch": 2.8671419313026574, + "grad_norm": 0.07216763496398926, + "learning_rate": 6.293712588325308e-05, + "loss": 0.274, + "step": 35392 + }, + { + "epoch": 2.8672229423201556, + "grad_norm": 0.0688694566488266, + "learning_rate": 6.293262523065845e-05, + "loss": 0.2617, + "step": 35393 + }, + { + "epoch": 2.867303953337654, + "grad_norm": 0.07353059202432632, + "learning_rate": 6.292812457806381e-05, + "loss": 0.2692, + "step": 35394 + }, + { + "epoch": 2.8673849643551526, + "grad_norm": 0.07066787034273148, + "learning_rate": 6.29236239254692e-05, + "loss": 0.2335, + "step": 35395 + }, + { + "epoch": 2.867465975372651, + "grad_norm": 0.07494823634624481, + "learning_rate": 
6.291912327287457e-05, + "loss": 0.2416, + "step": 35396 + }, + { + "epoch": 2.867546986390149, + "grad_norm": 0.07171319425106049, + "learning_rate": 6.291462262027993e-05, + "loss": 0.2809, + "step": 35397 + }, + { + "epoch": 2.8676279974076473, + "grad_norm": 0.07325095683336258, + "learning_rate": 6.291012196768532e-05, + "loss": 0.2258, + "step": 35398 + }, + { + "epoch": 2.867709008425146, + "grad_norm": 0.0509103499352932, + "learning_rate": 6.29056213150907e-05, + "loss": 0.2138, + "step": 35399 + }, + { + "epoch": 2.8677900194426442, + "grad_norm": 0.055378302931785583, + "learning_rate": 6.290112066249607e-05, + "loss": 0.2314, + "step": 35400 + }, + { + "epoch": 2.8678710304601425, + "grad_norm": 0.07109376788139343, + "learning_rate": 6.289662000990144e-05, + "loss": 0.223, + "step": 35401 + }, + { + "epoch": 2.8679520414776407, + "grad_norm": 0.07241575419902802, + "learning_rate": 6.289211935730681e-05, + "loss": 0.2772, + "step": 35402 + }, + { + "epoch": 2.8680330524951394, + "grad_norm": 0.08092466741800308, + "learning_rate": 6.288761870471219e-05, + "loss": 0.2879, + "step": 35403 + }, + { + "epoch": 2.8681140635126376, + "grad_norm": 0.06131209805607796, + "learning_rate": 6.288311805211756e-05, + "loss": 0.2668, + "step": 35404 + }, + { + "epoch": 2.868195074530136, + "grad_norm": 0.06692001223564148, + "learning_rate": 6.287861739952294e-05, + "loss": 0.2437, + "step": 35405 + }, + { + "epoch": 2.8682760855476346, + "grad_norm": 0.09271931648254395, + "learning_rate": 6.287411674692831e-05, + "loss": 0.2373, + "step": 35406 + }, + { + "epoch": 2.868357096565133, + "grad_norm": 0.06557455658912659, + "learning_rate": 6.286961609433368e-05, + "loss": 0.2273, + "step": 35407 + }, + { + "epoch": 2.868438107582631, + "grad_norm": 0.08470204472541809, + "learning_rate": 6.286511544173906e-05, + "loss": 0.2361, + "step": 35408 + }, + { + "epoch": 2.8685191186001298, + "grad_norm": 0.060380466282367706, + "learning_rate": 6.286061478914443e-05, + "loss": 0.2297, + "step": 35409 + }, + { + "epoch": 2.868600129617628, + "grad_norm": 0.07112187147140503, + "learning_rate": 6.28561141365498e-05, + "loss": 0.234, + "step": 35410 + }, + { + "epoch": 2.8686811406351262, + "grad_norm": 0.07456043362617493, + "learning_rate": 6.285161348395518e-05, + "loss": 0.2998, + "step": 35411 + }, + { + "epoch": 2.868762151652625, + "grad_norm": 0.05900955572724342, + "learning_rate": 6.284711283136055e-05, + "loss": 0.2041, + "step": 35412 + }, + { + "epoch": 2.868843162670123, + "grad_norm": 0.07007622718811035, + "learning_rate": 6.284261217876592e-05, + "loss": 0.2142, + "step": 35413 + }, + { + "epoch": 2.8689241736876214, + "grad_norm": 0.08670007437467575, + "learning_rate": 6.28381115261713e-05, + "loss": 0.2603, + "step": 35414 + }, + { + "epoch": 2.86900518470512, + "grad_norm": 0.07252345979213715, + "learning_rate": 6.283361087357667e-05, + "loss": 0.2462, + "step": 35415 + }, + { + "epoch": 2.8690861957226184, + "grad_norm": 0.0890662744641304, + "learning_rate": 6.282911022098204e-05, + "loss": 0.2453, + "step": 35416 + }, + { + "epoch": 2.8691672067401166, + "grad_norm": 0.07275164127349854, + "learning_rate": 6.282460956838742e-05, + "loss": 0.2335, + "step": 35417 + }, + { + "epoch": 2.8692482177576153, + "grad_norm": 0.07086493074893951, + "learning_rate": 6.282010891579279e-05, + "loss": 0.2012, + "step": 35418 + }, + { + "epoch": 2.8693292287751135, + "grad_norm": 0.07489845901727676, + "learning_rate": 6.281560826319817e-05, + "loss": 0.2585, + "step": 35419 + }, + { + 
"epoch": 2.869410239792612, + "grad_norm": 0.06938111037015915, + "learning_rate": 6.281110761060354e-05, + "loss": 0.2936, + "step": 35420 + }, + { + "epoch": 2.86949125081011, + "grad_norm": 0.10458862781524658, + "learning_rate": 6.280660695800891e-05, + "loss": 0.215, + "step": 35421 + }, + { + "epoch": 2.8695722618276087, + "grad_norm": 0.07261291146278381, + "learning_rate": 6.280210630541429e-05, + "loss": 0.2707, + "step": 35422 + }, + { + "epoch": 2.869653272845107, + "grad_norm": 0.06971706449985504, + "learning_rate": 6.279760565281966e-05, + "loss": 0.2418, + "step": 35423 + }, + { + "epoch": 2.869734283862605, + "grad_norm": 0.06998847424983978, + "learning_rate": 6.279310500022503e-05, + "loss": 0.2288, + "step": 35424 + }, + { + "epoch": 2.8698152948801035, + "grad_norm": 0.07669820636510849, + "learning_rate": 6.278860434763041e-05, + "loss": 0.2475, + "step": 35425 + }, + { + "epoch": 2.869896305897602, + "grad_norm": 0.07179666310548782, + "learning_rate": 6.278410369503578e-05, + "loss": 0.2127, + "step": 35426 + }, + { + "epoch": 2.8699773169151004, + "grad_norm": 0.07503090798854828, + "learning_rate": 6.277960304244115e-05, + "loss": 0.239, + "step": 35427 + }, + { + "epoch": 2.8700583279325986, + "grad_norm": 0.0681663379073143, + "learning_rate": 6.277510238984653e-05, + "loss": 0.2254, + "step": 35428 + }, + { + "epoch": 2.8701393389500973, + "grad_norm": 0.074290432035923, + "learning_rate": 6.27706017372519e-05, + "loss": 0.2204, + "step": 35429 + }, + { + "epoch": 2.8702203499675956, + "grad_norm": 0.08182138949632645, + "learning_rate": 6.276610108465728e-05, + "loss": 0.2269, + "step": 35430 + }, + { + "epoch": 2.870301360985094, + "grad_norm": 0.07460813224315643, + "learning_rate": 6.276160043206265e-05, + "loss": 0.2537, + "step": 35431 + }, + { + "epoch": 2.8703823720025925, + "grad_norm": 0.0704612210392952, + "learning_rate": 6.275709977946802e-05, + "loss": 0.2059, + "step": 35432 + }, + { + "epoch": 2.8704633830200907, + "grad_norm": 0.07259119302034378, + "learning_rate": 6.27525991268734e-05, + "loss": 0.2568, + "step": 35433 + }, + { + "epoch": 2.870544394037589, + "grad_norm": 0.06900456547737122, + "learning_rate": 6.274809847427877e-05, + "loss": 0.2531, + "step": 35434 + }, + { + "epoch": 2.8706254050550877, + "grad_norm": 0.07990577816963196, + "learning_rate": 6.274359782168416e-05, + "loss": 0.2337, + "step": 35435 + }, + { + "epoch": 2.870706416072586, + "grad_norm": 0.05933460593223572, + "learning_rate": 6.273909716908952e-05, + "loss": 0.2447, + "step": 35436 + }, + { + "epoch": 2.870787427090084, + "grad_norm": 0.0612567774951458, + "learning_rate": 6.273459651649489e-05, + "loss": 0.2196, + "step": 35437 + }, + { + "epoch": 2.870868438107583, + "grad_norm": 0.0733252465724945, + "learning_rate": 6.273009586390028e-05, + "loss": 0.2116, + "step": 35438 + }, + { + "epoch": 2.870949449125081, + "grad_norm": 0.07396426796913147, + "learning_rate": 6.272559521130564e-05, + "loss": 0.2392, + "step": 35439 + }, + { + "epoch": 2.8710304601425793, + "grad_norm": 0.07068262249231339, + "learning_rate": 6.272109455871101e-05, + "loss": 0.2437, + "step": 35440 + }, + { + "epoch": 2.871111471160078, + "grad_norm": 0.06269538402557373, + "learning_rate": 6.27165939061164e-05, + "loss": 0.2486, + "step": 35441 + }, + { + "epoch": 2.8711924821775763, + "grad_norm": 0.07349444180727005, + "learning_rate": 6.271209325352176e-05, + "loss": 0.2264, + "step": 35442 + }, + { + "epoch": 2.8712734931950745, + "grad_norm": 0.06318012624979019, + 
"learning_rate": 6.270759260092713e-05, + "loss": 0.2091, + "step": 35443 + }, + { + "epoch": 2.8713545042125728, + "grad_norm": 0.07905392348766327, + "learning_rate": 6.270309194833252e-05, + "loss": 0.2388, + "step": 35444 + }, + { + "epoch": 2.8714355152300715, + "grad_norm": 0.08803623914718628, + "learning_rate": 6.269859129573788e-05, + "loss": 0.239, + "step": 35445 + }, + { + "epoch": 2.8715165262475697, + "grad_norm": 0.07360757142305374, + "learning_rate": 6.269409064314325e-05, + "loss": 0.2107, + "step": 35446 + }, + { + "epoch": 2.871597537265068, + "grad_norm": 0.08824556320905685, + "learning_rate": 6.268958999054864e-05, + "loss": 0.2401, + "step": 35447 + }, + { + "epoch": 2.871678548282566, + "grad_norm": 0.06086265295743942, + "learning_rate": 6.2685089337954e-05, + "loss": 0.2577, + "step": 35448 + }, + { + "epoch": 2.871759559300065, + "grad_norm": 0.06580790132284164, + "learning_rate": 6.268058868535937e-05, + "loss": 0.2476, + "step": 35449 + }, + { + "epoch": 2.871840570317563, + "grad_norm": 0.061794914305210114, + "learning_rate": 6.267608803276476e-05, + "loss": 0.2647, + "step": 35450 + }, + { + "epoch": 2.8719215813350614, + "grad_norm": 0.06876268237829208, + "learning_rate": 6.267158738017012e-05, + "loss": 0.2087, + "step": 35451 + }, + { + "epoch": 2.87200259235256, + "grad_norm": 0.06276143342256546, + "learning_rate": 6.26670867275755e-05, + "loss": 0.2074, + "step": 35452 + }, + { + "epoch": 2.8720836033700583, + "grad_norm": 0.07023021578788757, + "learning_rate": 6.266258607498088e-05, + "loss": 0.2612, + "step": 35453 + }, + { + "epoch": 2.8721646143875565, + "grad_norm": 0.06984831392765045, + "learning_rate": 6.265808542238624e-05, + "loss": 0.2163, + "step": 35454 + }, + { + "epoch": 2.8722456254050552, + "grad_norm": 0.08129183948040009, + "learning_rate": 6.265358476979162e-05, + "loss": 0.2289, + "step": 35455 + }, + { + "epoch": 2.8723266364225535, + "grad_norm": 0.07722354680299759, + "learning_rate": 6.2649084117197e-05, + "loss": 0.2451, + "step": 35456 + }, + { + "epoch": 2.8724076474400517, + "grad_norm": 0.07596605271100998, + "learning_rate": 6.264458346460236e-05, + "loss": 0.2431, + "step": 35457 + }, + { + "epoch": 2.8724886584575504, + "grad_norm": 0.07646816223859787, + "learning_rate": 6.264008281200774e-05, + "loss": 0.2241, + "step": 35458 + }, + { + "epoch": 2.8725696694750487, + "grad_norm": 0.05903325974941254, + "learning_rate": 6.263558215941312e-05, + "loss": 0.2295, + "step": 35459 + }, + { + "epoch": 2.872650680492547, + "grad_norm": 0.07608656585216522, + "learning_rate": 6.263108150681848e-05, + "loss": 0.2624, + "step": 35460 + }, + { + "epoch": 2.8727316915100456, + "grad_norm": 0.062335386872291565, + "learning_rate": 6.262658085422387e-05, + "loss": 0.2214, + "step": 35461 + }, + { + "epoch": 2.872812702527544, + "grad_norm": 0.07655826956033707, + "learning_rate": 6.262208020162924e-05, + "loss": 0.2578, + "step": 35462 + }, + { + "epoch": 2.872893713545042, + "grad_norm": 0.07051601260900497, + "learning_rate": 6.26175795490346e-05, + "loss": 0.2404, + "step": 35463 + }, + { + "epoch": 2.8729747245625408, + "grad_norm": 0.08787756413221359, + "learning_rate": 6.261307889643999e-05, + "loss": 0.2726, + "step": 35464 + }, + { + "epoch": 2.873055735580039, + "grad_norm": 0.07177945226430893, + "learning_rate": 6.260857824384536e-05, + "loss": 0.2675, + "step": 35465 + }, + { + "epoch": 2.8731367465975373, + "grad_norm": 0.09521305561065674, + "learning_rate": 6.260407759125072e-05, + "loss": 0.3276, + "step": 35466 
+ }, + { + "epoch": 2.8732177576150355, + "grad_norm": 0.07370465993881226, + "learning_rate": 6.259957693865611e-05, + "loss": 0.2483, + "step": 35467 + }, + { + "epoch": 2.8732987686325338, + "grad_norm": 0.06736384332180023, + "learning_rate": 6.259507628606149e-05, + "loss": 0.2223, + "step": 35468 + }, + { + "epoch": 2.8733797796500324, + "grad_norm": 0.05994664132595062, + "learning_rate": 6.259057563346686e-05, + "loss": 0.2281, + "step": 35469 + }, + { + "epoch": 2.8734607906675307, + "grad_norm": 0.07355383038520813, + "learning_rate": 6.258607498087223e-05, + "loss": 0.2431, + "step": 35470 + }, + { + "epoch": 2.873541801685029, + "grad_norm": 0.06896742433309555, + "learning_rate": 6.25815743282776e-05, + "loss": 0.2634, + "step": 35471 + }, + { + "epoch": 2.8736228127025276, + "grad_norm": 0.07273846864700317, + "learning_rate": 6.257707367568298e-05, + "loss": 0.2292, + "step": 35472 + }, + { + "epoch": 2.873703823720026, + "grad_norm": 0.07593858242034912, + "learning_rate": 6.257257302308835e-05, + "loss": 0.229, + "step": 35473 + }, + { + "epoch": 2.873784834737524, + "grad_norm": 0.07159194350242615, + "learning_rate": 6.256807237049373e-05, + "loss": 0.2512, + "step": 35474 + }, + { + "epoch": 2.873865845755023, + "grad_norm": 0.05694905295968056, + "learning_rate": 6.25635717178991e-05, + "loss": 0.2053, + "step": 35475 + }, + { + "epoch": 2.873946856772521, + "grad_norm": 0.06768777221441269, + "learning_rate": 6.255907106530447e-05, + "loss": 0.2114, + "step": 35476 + }, + { + "epoch": 2.8740278677900193, + "grad_norm": 0.0636713057756424, + "learning_rate": 6.255457041270985e-05, + "loss": 0.2611, + "step": 35477 + }, + { + "epoch": 2.874108878807518, + "grad_norm": 0.07856924831867218, + "learning_rate": 6.255006976011522e-05, + "loss": 0.2495, + "step": 35478 + }, + { + "epoch": 2.874189889825016, + "grad_norm": 0.05206296592950821, + "learning_rate": 6.25455691075206e-05, + "loss": 0.2194, + "step": 35479 + }, + { + "epoch": 2.8742709008425145, + "grad_norm": 0.06615325063467026, + "learning_rate": 6.254106845492597e-05, + "loss": 0.2006, + "step": 35480 + }, + { + "epoch": 2.874351911860013, + "grad_norm": 0.07415047287940979, + "learning_rate": 6.253656780233134e-05, + "loss": 0.2854, + "step": 35481 + }, + { + "epoch": 2.8744329228775114, + "grad_norm": 0.07157108187675476, + "learning_rate": 6.253206714973672e-05, + "loss": 0.2438, + "step": 35482 + }, + { + "epoch": 2.8745139338950096, + "grad_norm": 0.06774777173995972, + "learning_rate": 6.252756649714209e-05, + "loss": 0.2013, + "step": 35483 + }, + { + "epoch": 2.8745949449125083, + "grad_norm": 0.0959698036313057, + "learning_rate": 6.252306584454746e-05, + "loss": 0.218, + "step": 35484 + }, + { + "epoch": 2.8746759559300066, + "grad_norm": 0.08200061321258545, + "learning_rate": 6.251856519195284e-05, + "loss": 0.2541, + "step": 35485 + }, + { + "epoch": 2.874756966947505, + "grad_norm": 0.07556328922510147, + "learning_rate": 6.251406453935821e-05, + "loss": 0.2114, + "step": 35486 + }, + { + "epoch": 2.8748379779650035, + "grad_norm": 0.06020106002688408, + "learning_rate": 6.250956388676358e-05, + "loss": 0.2459, + "step": 35487 + }, + { + "epoch": 2.8749189889825018, + "grad_norm": 0.09242961555719376, + "learning_rate": 6.250506323416896e-05, + "loss": 0.2142, + "step": 35488 + }, + { + "epoch": 2.875, + "grad_norm": 0.07264678180217743, + "learning_rate": 6.250056258157433e-05, + "loss": 0.2267, + "step": 35489 + }, + { + "epoch": 2.8750810110174982, + "grad_norm": 0.07662799954414368, + 
"learning_rate": 6.24960619289797e-05, + "loss": 0.2436, + "step": 35490 + }, + { + "epoch": 2.8751620220349965, + "grad_norm": 0.06048709526658058, + "learning_rate": 6.249156127638508e-05, + "loss": 0.2083, + "step": 35491 + }, + { + "epoch": 2.875243033052495, + "grad_norm": 0.10970834642648697, + "learning_rate": 6.248706062379045e-05, + "loss": 0.2632, + "step": 35492 + }, + { + "epoch": 2.8753240440699934, + "grad_norm": 0.07959237694740295, + "learning_rate": 6.248255997119583e-05, + "loss": 0.2716, + "step": 35493 + }, + { + "epoch": 2.8754050550874917, + "grad_norm": 0.0831311047077179, + "learning_rate": 6.24780593186012e-05, + "loss": 0.2483, + "step": 35494 + }, + { + "epoch": 2.8754860661049904, + "grad_norm": 0.08556374907493591, + "learning_rate": 6.247355866600657e-05, + "loss": 0.2334, + "step": 35495 + }, + { + "epoch": 2.8755670771224886, + "grad_norm": 0.0768716111779213, + "learning_rate": 6.246905801341195e-05, + "loss": 0.1901, + "step": 35496 + }, + { + "epoch": 2.875648088139987, + "grad_norm": 0.06417785584926605, + "learning_rate": 6.246455736081732e-05, + "loss": 0.2079, + "step": 35497 + }, + { + "epoch": 2.8757290991574855, + "grad_norm": 0.06606259942054749, + "learning_rate": 6.24600567082227e-05, + "loss": 0.1936, + "step": 35498 + }, + { + "epoch": 2.875810110174984, + "grad_norm": 0.06523019820451736, + "learning_rate": 6.245555605562807e-05, + "loss": 0.2154, + "step": 35499 + }, + { + "epoch": 2.875891121192482, + "grad_norm": 0.07109911739826202, + "learning_rate": 6.245105540303344e-05, + "loss": 0.2438, + "step": 35500 + }, + { + "epoch": 2.8759721322099807, + "grad_norm": 0.05882502347230911, + "learning_rate": 6.244655475043881e-05, + "loss": 0.2372, + "step": 35501 + }, + { + "epoch": 2.876053143227479, + "grad_norm": 0.06620030850172043, + "learning_rate": 6.244205409784419e-05, + "loss": 0.2526, + "step": 35502 + }, + { + "epoch": 2.876134154244977, + "grad_norm": 0.061462339013814926, + "learning_rate": 6.243755344524956e-05, + "loss": 0.2338, + "step": 35503 + }, + { + "epoch": 2.876215165262476, + "grad_norm": 0.07159540057182312, + "learning_rate": 6.243305279265494e-05, + "loss": 0.2107, + "step": 35504 + }, + { + "epoch": 2.876296176279974, + "grad_norm": 0.07247888296842575, + "learning_rate": 6.242855214006031e-05, + "loss": 0.2427, + "step": 35505 + }, + { + "epoch": 2.8763771872974724, + "grad_norm": 0.067585788667202, + "learning_rate": 6.242405148746568e-05, + "loss": 0.2259, + "step": 35506 + }, + { + "epoch": 2.876458198314971, + "grad_norm": 0.07686949521303177, + "learning_rate": 6.241955083487106e-05, + "loss": 0.3078, + "step": 35507 + }, + { + "epoch": 2.8765392093324693, + "grad_norm": 0.07205145061016083, + "learning_rate": 6.241505018227643e-05, + "loss": 0.2163, + "step": 35508 + }, + { + "epoch": 2.8766202203499676, + "grad_norm": 0.08684998005628586, + "learning_rate": 6.24105495296818e-05, + "loss": 0.2437, + "step": 35509 + }, + { + "epoch": 2.8767012313674662, + "grad_norm": 0.07802329957485199, + "learning_rate": 6.240604887708718e-05, + "loss": 0.2524, + "step": 35510 + }, + { + "epoch": 2.8767822423849645, + "grad_norm": 0.06080799549818039, + "learning_rate": 6.240154822449255e-05, + "loss": 0.2426, + "step": 35511 + }, + { + "epoch": 2.8768632534024627, + "grad_norm": 0.07781938463449478, + "learning_rate": 6.239704757189792e-05, + "loss": 0.2979, + "step": 35512 + }, + { + "epoch": 2.876944264419961, + "grad_norm": 0.06262904405593872, + "learning_rate": 6.239254691930331e-05, + "loss": 0.2756, + "step": 35513 + 
}, + { + "epoch": 2.8770252754374592, + "grad_norm": 0.07558223605155945, + "learning_rate": 6.238804626670867e-05, + "loss": 0.2413, + "step": 35514 + }, + { + "epoch": 2.877106286454958, + "grad_norm": 0.07031270861625671, + "learning_rate": 6.238354561411404e-05, + "loss": 0.2594, + "step": 35515 + }, + { + "epoch": 2.877187297472456, + "grad_norm": 0.075470469892025, + "learning_rate": 6.237904496151943e-05, + "loss": 0.2414, + "step": 35516 + }, + { + "epoch": 2.8772683084899544, + "grad_norm": 0.0652003362774849, + "learning_rate": 6.237454430892479e-05, + "loss": 0.2424, + "step": 35517 + }, + { + "epoch": 2.877349319507453, + "grad_norm": 0.08088836073875427, + "learning_rate": 6.237004365633017e-05, + "loss": 0.2411, + "step": 35518 + }, + { + "epoch": 2.8774303305249513, + "grad_norm": 0.06947515904903412, + "learning_rate": 6.236554300373555e-05, + "loss": 0.2463, + "step": 35519 + }, + { + "epoch": 2.8775113415424496, + "grad_norm": 0.07593680173158646, + "learning_rate": 6.236104235114091e-05, + "loss": 0.2444, + "step": 35520 + }, + { + "epoch": 2.8775923525599483, + "grad_norm": 0.08024193346500397, + "learning_rate": 6.235654169854629e-05, + "loss": 0.214, + "step": 35521 + }, + { + "epoch": 2.8776733635774465, + "grad_norm": 0.0644368976354599, + "learning_rate": 6.235204104595167e-05, + "loss": 0.2066, + "step": 35522 + }, + { + "epoch": 2.8777543745949448, + "grad_norm": 0.06782899051904678, + "learning_rate": 6.234754039335703e-05, + "loss": 0.2366, + "step": 35523 + }, + { + "epoch": 2.8778353856124435, + "grad_norm": 0.08508621901273727, + "learning_rate": 6.234303974076241e-05, + "loss": 0.2548, + "step": 35524 + }, + { + "epoch": 2.8779163966299417, + "grad_norm": 0.0708160474896431, + "learning_rate": 6.23385390881678e-05, + "loss": 0.2563, + "step": 35525 + }, + { + "epoch": 2.87799740764744, + "grad_norm": 0.08127661794424057, + "learning_rate": 6.233403843557315e-05, + "loss": 0.2414, + "step": 35526 + }, + { + "epoch": 2.8780784186649386, + "grad_norm": 0.06155683100223541, + "learning_rate": 6.232953778297853e-05, + "loss": 0.2284, + "step": 35527 + }, + { + "epoch": 2.878159429682437, + "grad_norm": 0.06013333424925804, + "learning_rate": 6.232503713038392e-05, + "loss": 0.2303, + "step": 35528 + }, + { + "epoch": 2.878240440699935, + "grad_norm": 0.07108181715011597, + "learning_rate": 6.232053647778928e-05, + "loss": 0.2475, + "step": 35529 + }, + { + "epoch": 2.878321451717434, + "grad_norm": 0.06939464062452316, + "learning_rate": 6.231603582519465e-05, + "loss": 0.2586, + "step": 35530 + }, + { + "epoch": 2.878402462734932, + "grad_norm": 0.09219469130039215, + "learning_rate": 6.231153517260004e-05, + "loss": 0.2114, + "step": 35531 + }, + { + "epoch": 2.8784834737524303, + "grad_norm": 0.0657503604888916, + "learning_rate": 6.23070345200054e-05, + "loss": 0.238, + "step": 35532 + }, + { + "epoch": 2.8785644847699285, + "grad_norm": 0.08449986577033997, + "learning_rate": 6.230253386741077e-05, + "loss": 0.2588, + "step": 35533 + }, + { + "epoch": 2.8786454957874272, + "grad_norm": 0.06352499127388, + "learning_rate": 6.229803321481616e-05, + "loss": 0.1936, + "step": 35534 + }, + { + "epoch": 2.8787265068049255, + "grad_norm": 0.05862050503492355, + "learning_rate": 6.229353256222152e-05, + "loss": 0.2048, + "step": 35535 + }, + { + "epoch": 2.8788075178224237, + "grad_norm": 0.08377757668495178, + "learning_rate": 6.228903190962689e-05, + "loss": 0.2343, + "step": 35536 + }, + { + "epoch": 2.878888528839922, + "grad_norm": 0.07096932083368301, + 
"learning_rate": 6.228453125703228e-05, + "loss": 0.2631, + "step": 35537 + }, + { + "epoch": 2.8789695398574207, + "grad_norm": 0.05568651854991913, + "learning_rate": 6.228003060443765e-05, + "loss": 0.2669, + "step": 35538 + }, + { + "epoch": 2.879050550874919, + "grad_norm": 0.07287915050983429, + "learning_rate": 6.227552995184303e-05, + "loss": 0.2246, + "step": 35539 + }, + { + "epoch": 2.879131561892417, + "grad_norm": 0.06835003197193146, + "learning_rate": 6.22710292992484e-05, + "loss": 0.2161, + "step": 35540 + }, + { + "epoch": 2.879212572909916, + "grad_norm": 0.06841269880533218, + "learning_rate": 6.226652864665377e-05, + "loss": 0.221, + "step": 35541 + }, + { + "epoch": 2.879293583927414, + "grad_norm": 0.07463429123163223, + "learning_rate": 6.226202799405915e-05, + "loss": 0.2398, + "step": 35542 + }, + { + "epoch": 2.8793745949449123, + "grad_norm": 0.07194212824106216, + "learning_rate": 6.225752734146452e-05, + "loss": 0.2364, + "step": 35543 + }, + { + "epoch": 2.879455605962411, + "grad_norm": 0.08712617307901382, + "learning_rate": 6.225302668886989e-05, + "loss": 0.21, + "step": 35544 + }, + { + "epoch": 2.8795366169799093, + "grad_norm": 0.06648823618888855, + "learning_rate": 6.224852603627527e-05, + "loss": 0.1939, + "step": 35545 + }, + { + "epoch": 2.8796176279974075, + "grad_norm": 0.08884168416261673, + "learning_rate": 6.224402538368064e-05, + "loss": 0.2639, + "step": 35546 + }, + { + "epoch": 2.879698639014906, + "grad_norm": 0.06660214066505432, + "learning_rate": 6.223952473108601e-05, + "loss": 0.2142, + "step": 35547 + }, + { + "epoch": 2.8797796500324044, + "grad_norm": 0.0755704939365387, + "learning_rate": 6.223502407849139e-05, + "loss": 0.2261, + "step": 35548 + }, + { + "epoch": 2.8798606610499027, + "grad_norm": 0.07990830391645432, + "learning_rate": 6.223052342589676e-05, + "loss": 0.2378, + "step": 35549 + }, + { + "epoch": 2.8799416720674014, + "grad_norm": 0.05775754526257515, + "learning_rate": 6.222602277330213e-05, + "loss": 0.2239, + "step": 35550 + }, + { + "epoch": 2.8800226830848996, + "grad_norm": 0.06801251322031021, + "learning_rate": 6.222152212070751e-05, + "loss": 0.2262, + "step": 35551 + }, + { + "epoch": 2.880103694102398, + "grad_norm": 0.0567367859184742, + "learning_rate": 6.221702146811288e-05, + "loss": 0.2115, + "step": 35552 + }, + { + "epoch": 2.8801847051198965, + "grad_norm": 0.06986761093139648, + "learning_rate": 6.221252081551826e-05, + "loss": 0.2124, + "step": 35553 + }, + { + "epoch": 2.880265716137395, + "grad_norm": 0.07065030932426453, + "learning_rate": 6.220802016292363e-05, + "loss": 0.2382, + "step": 35554 + }, + { + "epoch": 2.880346727154893, + "grad_norm": 0.06928656995296478, + "learning_rate": 6.2203519510329e-05, + "loss": 0.2904, + "step": 35555 + }, + { + "epoch": 2.8804277381723913, + "grad_norm": 0.0750303864479065, + "learning_rate": 6.219901885773438e-05, + "loss": 0.2133, + "step": 35556 + }, + { + "epoch": 2.88050874918989, + "grad_norm": 0.0664948970079422, + "learning_rate": 6.219451820513975e-05, + "loss": 0.2041, + "step": 35557 + }, + { + "epoch": 2.880589760207388, + "grad_norm": 0.07583000510931015, + "learning_rate": 6.219001755254512e-05, + "loss": 0.2363, + "step": 35558 + }, + { + "epoch": 2.8806707712248865, + "grad_norm": 0.07268665730953217, + "learning_rate": 6.21855168999505e-05, + "loss": 0.2409, + "step": 35559 + }, + { + "epoch": 2.8807517822423847, + "grad_norm": 0.06231937184929848, + "learning_rate": 6.218101624735587e-05, + "loss": 0.1869, + "step": 35560 + }, + 
{ + "epoch": 2.8808327932598834, + "grad_norm": 0.06016063317656517, + "learning_rate": 6.217651559476124e-05, + "loss": 0.2062, + "step": 35561 + }, + { + "epoch": 2.8809138042773816, + "grad_norm": 0.0735088363289833, + "learning_rate": 6.217201494216662e-05, + "loss": 0.2389, + "step": 35562 + }, + { + "epoch": 2.88099481529488, + "grad_norm": 0.07358022779226303, + "learning_rate": 6.216751428957199e-05, + "loss": 0.2366, + "step": 35563 + }, + { + "epoch": 2.8810758263123786, + "grad_norm": 0.07338882982730865, + "learning_rate": 6.216301363697737e-05, + "loss": 0.2587, + "step": 35564 + }, + { + "epoch": 2.881156837329877, + "grad_norm": 0.06316777318716049, + "learning_rate": 6.215851298438274e-05, + "loss": 0.2384, + "step": 35565 + }, + { + "epoch": 2.881237848347375, + "grad_norm": 0.06496182829141617, + "learning_rate": 6.215401233178811e-05, + "loss": 0.2138, + "step": 35566 + }, + { + "epoch": 2.8813188593648738, + "grad_norm": 0.07022936642169952, + "learning_rate": 6.214951167919349e-05, + "loss": 0.3083, + "step": 35567 + }, + { + "epoch": 2.881399870382372, + "grad_norm": 0.062291331589221954, + "learning_rate": 6.214501102659886e-05, + "loss": 0.2285, + "step": 35568 + }, + { + "epoch": 2.8814808813998702, + "grad_norm": 0.07278969138860703, + "learning_rate": 6.214051037400423e-05, + "loss": 0.2596, + "step": 35569 + }, + { + "epoch": 2.881561892417369, + "grad_norm": 0.07649640738964081, + "learning_rate": 6.21360097214096e-05, + "loss": 0.228, + "step": 35570 + }, + { + "epoch": 2.881642903434867, + "grad_norm": 0.061720188707113266, + "learning_rate": 6.213150906881498e-05, + "loss": 0.2165, + "step": 35571 + }, + { + "epoch": 2.8817239144523654, + "grad_norm": 0.0904294103384018, + "learning_rate": 6.212700841622035e-05, + "loss": 0.298, + "step": 35572 + }, + { + "epoch": 2.881804925469864, + "grad_norm": 0.07655181735754013, + "learning_rate": 6.212250776362573e-05, + "loss": 0.2519, + "step": 35573 + }, + { + "epoch": 2.8818859364873624, + "grad_norm": 0.06292436271905899, + "learning_rate": 6.21180071110311e-05, + "loss": 0.2502, + "step": 35574 + }, + { + "epoch": 2.8819669475048606, + "grad_norm": 0.06908023357391357, + "learning_rate": 6.211350645843647e-05, + "loss": 0.234, + "step": 35575 + }, + { + "epoch": 2.8820479585223593, + "grad_norm": 0.06845370680093765, + "learning_rate": 6.210900580584185e-05, + "loss": 0.2498, + "step": 35576 + }, + { + "epoch": 2.8821289695398575, + "grad_norm": 0.059849537909030914, + "learning_rate": 6.210450515324722e-05, + "loss": 0.2268, + "step": 35577 + }, + { + "epoch": 2.8822099805573558, + "grad_norm": 0.061932940036058426, + "learning_rate": 6.21000045006526e-05, + "loss": 0.2153, + "step": 35578 + }, + { + "epoch": 2.882290991574854, + "grad_norm": 0.06094523146748543, + "learning_rate": 6.209550384805797e-05, + "loss": 0.2275, + "step": 35579 + }, + { + "epoch": 2.8823720025923527, + "grad_norm": 0.07336302846670151, + "learning_rate": 6.209100319546334e-05, + "loss": 0.2353, + "step": 35580 + }, + { + "epoch": 2.882453013609851, + "grad_norm": 0.07363414019346237, + "learning_rate": 6.208650254286872e-05, + "loss": 0.2224, + "step": 35581 + }, + { + "epoch": 2.882534024627349, + "grad_norm": 0.07803016155958176, + "learning_rate": 6.208200189027409e-05, + "loss": 0.2543, + "step": 35582 + }, + { + "epoch": 2.8826150356448474, + "grad_norm": 0.06834989786148071, + "learning_rate": 6.207750123767946e-05, + "loss": 0.2238, + "step": 35583 + }, + { + "epoch": 2.882696046662346, + "grad_norm": 0.0689224898815155, + 
"learning_rate": 6.207300058508484e-05, + "loss": 0.2557, + "step": 35584 + }, + { + "epoch": 2.8827770576798444, + "grad_norm": 0.07135829329490662, + "learning_rate": 6.206849993249021e-05, + "loss": 0.2292, + "step": 35585 + }, + { + "epoch": 2.8828580686973426, + "grad_norm": 0.05913795530796051, + "learning_rate": 6.206399927989558e-05, + "loss": 0.2147, + "step": 35586 + }, + { + "epoch": 2.8829390797148413, + "grad_norm": 0.0672590509057045, + "learning_rate": 6.205949862730096e-05, + "loss": 0.2223, + "step": 35587 + }, + { + "epoch": 2.8830200907323396, + "grad_norm": 0.10194092988967896, + "learning_rate": 6.205499797470633e-05, + "loss": 0.2594, + "step": 35588 + }, + { + "epoch": 2.883101101749838, + "grad_norm": 0.08454529941082001, + "learning_rate": 6.20504973221117e-05, + "loss": 0.1887, + "step": 35589 + }, + { + "epoch": 2.8831821127673365, + "grad_norm": 0.0754120796918869, + "learning_rate": 6.204599666951708e-05, + "loss": 0.2313, + "step": 35590 + }, + { + "epoch": 2.8832631237848347, + "grad_norm": 0.0692853033542633, + "learning_rate": 6.204149601692245e-05, + "loss": 0.234, + "step": 35591 + }, + { + "epoch": 2.883344134802333, + "grad_norm": 0.06850889325141907, + "learning_rate": 6.203699536432783e-05, + "loss": 0.2424, + "step": 35592 + }, + { + "epoch": 2.8834251458198317, + "grad_norm": 0.06814275681972504, + "learning_rate": 6.20324947117332e-05, + "loss": 0.2283, + "step": 35593 + }, + { + "epoch": 2.88350615683733, + "grad_norm": 0.07559818029403687, + "learning_rate": 6.202799405913859e-05, + "loss": 0.2431, + "step": 35594 + }, + { + "epoch": 2.883587167854828, + "grad_norm": 0.06723404675722122, + "learning_rate": 6.202349340654395e-05, + "loss": 0.222, + "step": 35595 + }, + { + "epoch": 2.883668178872327, + "grad_norm": 0.08386305719614029, + "learning_rate": 6.201899275394932e-05, + "loss": 0.2784, + "step": 35596 + }, + { + "epoch": 2.883749189889825, + "grad_norm": 0.0725565105676651, + "learning_rate": 6.201449210135471e-05, + "loss": 0.2076, + "step": 35597 + }, + { + "epoch": 2.8838302009073233, + "grad_norm": 0.05963285639882088, + "learning_rate": 6.200999144876007e-05, + "loss": 0.2076, + "step": 35598 + }, + { + "epoch": 2.883911211924822, + "grad_norm": 0.08718138188123703, + "learning_rate": 6.200549079616544e-05, + "loss": 0.245, + "step": 35599 + }, + { + "epoch": 2.8839922229423203, + "grad_norm": 0.058612942695617676, + "learning_rate": 6.200099014357083e-05, + "loss": 0.2009, + "step": 35600 + }, + { + "epoch": 2.8840732339598185, + "grad_norm": 0.08298259228467941, + "learning_rate": 6.199648949097619e-05, + "loss": 0.2368, + "step": 35601 + }, + { + "epoch": 2.8841542449773168, + "grad_norm": 0.07767398655414581, + "learning_rate": 6.199198883838156e-05, + "loss": 0.2454, + "step": 35602 + }, + { + "epoch": 2.8842352559948155, + "grad_norm": 0.07122716307640076, + "learning_rate": 6.198748818578695e-05, + "loss": 0.2794, + "step": 35603 + }, + { + "epoch": 2.8843162670123137, + "grad_norm": 0.06702442467212677, + "learning_rate": 6.198298753319231e-05, + "loss": 0.2153, + "step": 35604 + }, + { + "epoch": 2.884397278029812, + "grad_norm": 0.07701624929904938, + "learning_rate": 6.197848688059768e-05, + "loss": 0.2566, + "step": 35605 + }, + { + "epoch": 2.88447828904731, + "grad_norm": 0.0829615518450737, + "learning_rate": 6.197398622800307e-05, + "loss": 0.2451, + "step": 35606 + }, + { + "epoch": 2.884559300064809, + "grad_norm": 0.0616791732609272, + "learning_rate": 6.196948557540844e-05, + "loss": 0.2841, + "step": 35607 + }, + 
{ + "epoch": 2.884640311082307, + "grad_norm": 0.06792052835226059, + "learning_rate": 6.19649849228138e-05, + "loss": 0.2464, + "step": 35608 + }, + { + "epoch": 2.8847213220998054, + "grad_norm": 0.07558625936508179, + "learning_rate": 6.196048427021919e-05, + "loss": 0.2394, + "step": 35609 + }, + { + "epoch": 2.884802333117304, + "grad_norm": 0.06835988163948059, + "learning_rate": 6.195598361762456e-05, + "loss": 0.2254, + "step": 35610 + }, + { + "epoch": 2.8848833441348023, + "grad_norm": 0.07234005630016327, + "learning_rate": 6.195148296502992e-05, + "loss": 0.2331, + "step": 35611 + }, + { + "epoch": 2.8849643551523005, + "grad_norm": 0.10257917642593384, + "learning_rate": 6.194698231243531e-05, + "loss": 0.2124, + "step": 35612 + }, + { + "epoch": 2.8850453661697992, + "grad_norm": 0.05626985803246498, + "learning_rate": 6.194248165984069e-05, + "loss": 0.2305, + "step": 35613 + }, + { + "epoch": 2.8851263771872975, + "grad_norm": 0.08011636137962341, + "learning_rate": 6.193798100724605e-05, + "loss": 0.3001, + "step": 35614 + }, + { + "epoch": 2.8852073882047957, + "grad_norm": 0.07734625786542892, + "learning_rate": 6.193348035465143e-05, + "loss": 0.2325, + "step": 35615 + }, + { + "epoch": 2.8852883992222944, + "grad_norm": 0.07830455154180527, + "learning_rate": 6.19289797020568e-05, + "loss": 0.2291, + "step": 35616 + }, + { + "epoch": 2.8853694102397927, + "grad_norm": 0.0644298568367958, + "learning_rate": 6.192447904946217e-05, + "loss": 0.2332, + "step": 35617 + }, + { + "epoch": 2.885450421257291, + "grad_norm": 0.06490510702133179, + "learning_rate": 6.191997839686755e-05, + "loss": 0.2059, + "step": 35618 + }, + { + "epoch": 2.8855314322747896, + "grad_norm": 0.0631365105509758, + "learning_rate": 6.191547774427293e-05, + "loss": 0.2456, + "step": 35619 + }, + { + "epoch": 2.885612443292288, + "grad_norm": 0.07085856050252914, + "learning_rate": 6.19109770916783e-05, + "loss": 0.2167, + "step": 35620 + }, + { + "epoch": 2.885693454309786, + "grad_norm": 0.07526741921901703, + "learning_rate": 6.190647643908367e-05, + "loss": 0.2384, + "step": 35621 + }, + { + "epoch": 2.8857744653272848, + "grad_norm": 0.0649634301662445, + "learning_rate": 6.190197578648905e-05, + "loss": 0.2464, + "step": 35622 + }, + { + "epoch": 2.885855476344783, + "grad_norm": 0.06935419142246246, + "learning_rate": 6.189747513389442e-05, + "loss": 0.2064, + "step": 35623 + }, + { + "epoch": 2.8859364873622813, + "grad_norm": 0.07325664162635803, + "learning_rate": 6.18929744812998e-05, + "loss": 0.2449, + "step": 35624 + }, + { + "epoch": 2.8860174983797795, + "grad_norm": 0.06494774669408798, + "learning_rate": 6.188847382870517e-05, + "loss": 0.2535, + "step": 35625 + }, + { + "epoch": 2.886098509397278, + "grad_norm": 0.08057263493537903, + "learning_rate": 6.188397317611054e-05, + "loss": 0.2219, + "step": 35626 + }, + { + "epoch": 2.8861795204147764, + "grad_norm": 0.07165004312992096, + "learning_rate": 6.187947252351592e-05, + "loss": 0.2221, + "step": 35627 + }, + { + "epoch": 2.8862605314322747, + "grad_norm": 0.07129422575235367, + "learning_rate": 6.187497187092129e-05, + "loss": 0.2189, + "step": 35628 + }, + { + "epoch": 2.886341542449773, + "grad_norm": 0.08567812293767929, + "learning_rate": 6.187047121832666e-05, + "loss": 0.2296, + "step": 35629 + }, + { + "epoch": 2.8864225534672716, + "grad_norm": 0.0857577994465828, + "learning_rate": 6.186597056573204e-05, + "loss": 0.2672, + "step": 35630 + }, + { + "epoch": 2.88650356448477, + "grad_norm": 0.09535958617925644, + 
"learning_rate": 6.186146991313741e-05, + "loss": 0.2712, + "step": 35631 + }, + { + "epoch": 2.886584575502268, + "grad_norm": 0.07217075675725937, + "learning_rate": 6.185696926054278e-05, + "loss": 0.2839, + "step": 35632 + }, + { + "epoch": 2.886665586519767, + "grad_norm": 0.07447575777769089, + "learning_rate": 6.185246860794816e-05, + "loss": 0.259, + "step": 35633 + }, + { + "epoch": 2.886746597537265, + "grad_norm": 0.06412901729345322, + "learning_rate": 6.184796795535353e-05, + "loss": 0.2066, + "step": 35634 + }, + { + "epoch": 2.8868276085547633, + "grad_norm": 0.06986956298351288, + "learning_rate": 6.18434673027589e-05, + "loss": 0.2538, + "step": 35635 + }, + { + "epoch": 2.886908619572262, + "grad_norm": 0.08456194400787354, + "learning_rate": 6.183896665016428e-05, + "loss": 0.2918, + "step": 35636 + }, + { + "epoch": 2.88698963058976, + "grad_norm": 0.07183456420898438, + "learning_rate": 6.183446599756965e-05, + "loss": 0.2643, + "step": 35637 + }, + { + "epoch": 2.8870706416072585, + "grad_norm": 0.07418840378522873, + "learning_rate": 6.182996534497503e-05, + "loss": 0.2165, + "step": 35638 + }, + { + "epoch": 2.887151652624757, + "grad_norm": 0.06666819006204605, + "learning_rate": 6.18254646923804e-05, + "loss": 0.2132, + "step": 35639 + }, + { + "epoch": 2.8872326636422554, + "grad_norm": 0.06616838276386261, + "learning_rate": 6.182096403978577e-05, + "loss": 0.2479, + "step": 35640 + }, + { + "epoch": 2.8873136746597536, + "grad_norm": 0.07370367646217346, + "learning_rate": 6.181646338719115e-05, + "loss": 0.2395, + "step": 35641 + }, + { + "epoch": 2.8873946856772523, + "grad_norm": 0.07266636192798615, + "learning_rate": 6.181196273459652e-05, + "loss": 0.212, + "step": 35642 + }, + { + "epoch": 2.8874756966947506, + "grad_norm": 0.06336379051208496, + "learning_rate": 6.18074620820019e-05, + "loss": 0.2285, + "step": 35643 + }, + { + "epoch": 2.887556707712249, + "grad_norm": 0.07010842114686966, + "learning_rate": 6.180296142940727e-05, + "loss": 0.2644, + "step": 35644 + }, + { + "epoch": 2.8876377187297475, + "grad_norm": 0.07312007248401642, + "learning_rate": 6.179846077681264e-05, + "loss": 0.2542, + "step": 35645 + }, + { + "epoch": 2.8877187297472457, + "grad_norm": 0.05939409136772156, + "learning_rate": 6.179396012421801e-05, + "loss": 0.2296, + "step": 35646 + }, + { + "epoch": 2.887799740764744, + "grad_norm": 0.07634563744068146, + "learning_rate": 6.178945947162339e-05, + "loss": 0.2464, + "step": 35647 + }, + { + "epoch": 2.8878807517822422, + "grad_norm": 0.08463514596223831, + "learning_rate": 6.178495881902876e-05, + "loss": 0.2719, + "step": 35648 + }, + { + "epoch": 2.887961762799741, + "grad_norm": 0.06642767786979675, + "learning_rate": 6.178045816643413e-05, + "loss": 0.2499, + "step": 35649 + }, + { + "epoch": 2.888042773817239, + "grad_norm": 0.07932719588279724, + "learning_rate": 6.177595751383951e-05, + "loss": 0.2366, + "step": 35650 + }, + { + "epoch": 2.8881237848347374, + "grad_norm": 0.06755657494068146, + "learning_rate": 6.177145686124488e-05, + "loss": 0.2319, + "step": 35651 + }, + { + "epoch": 2.8882047958522357, + "grad_norm": 0.07422051578760147, + "learning_rate": 6.176695620865026e-05, + "loss": 0.2295, + "step": 35652 + }, + { + "epoch": 2.8882858068697344, + "grad_norm": 0.060580454766750336, + "learning_rate": 6.176245555605563e-05, + "loss": 0.2221, + "step": 35653 + }, + { + "epoch": 2.8883668178872326, + "grad_norm": 0.06504825502634048, + "learning_rate": 6.1757954903461e-05, + "loss": 0.2375, + "step": 35654 + 
}, + { + "epoch": 2.888447828904731, + "grad_norm": 0.060391828417778015, + "learning_rate": 6.175345425086638e-05, + "loss": 0.1846, + "step": 35655 + }, + { + "epoch": 2.8885288399222295, + "grad_norm": 0.07530611753463745, + "learning_rate": 6.174895359827175e-05, + "loss": 0.236, + "step": 35656 + }, + { + "epoch": 2.8886098509397278, + "grad_norm": 0.060225117951631546, + "learning_rate": 6.174445294567712e-05, + "loss": 0.2358, + "step": 35657 + }, + { + "epoch": 2.888690861957226, + "grad_norm": 0.05687911808490753, + "learning_rate": 6.17399522930825e-05, + "loss": 0.2323, + "step": 35658 + }, + { + "epoch": 2.8887718729747247, + "grad_norm": 0.06379065662622452, + "learning_rate": 6.173545164048787e-05, + "loss": 0.2152, + "step": 35659 + }, + { + "epoch": 2.888852883992223, + "grad_norm": 0.06311369687318802, + "learning_rate": 6.173095098789324e-05, + "loss": 0.2269, + "step": 35660 + }, + { + "epoch": 2.888933895009721, + "grad_norm": 0.08408977836370468, + "learning_rate": 6.172645033529862e-05, + "loss": 0.237, + "step": 35661 + }, + { + "epoch": 2.88901490602722, + "grad_norm": 0.07219025492668152, + "learning_rate": 6.172194968270399e-05, + "loss": 0.2533, + "step": 35662 + }, + { + "epoch": 2.889095917044718, + "grad_norm": 0.08540571480989456, + "learning_rate": 6.171744903010937e-05, + "loss": 0.2353, + "step": 35663 + }, + { + "epoch": 2.8891769280622164, + "grad_norm": 0.07214327901601791, + "learning_rate": 6.171294837751474e-05, + "loss": 0.263, + "step": 35664 + }, + { + "epoch": 2.889257939079715, + "grad_norm": 0.07148697227239609, + "learning_rate": 6.170844772492011e-05, + "loss": 0.2214, + "step": 35665 + }, + { + "epoch": 2.8893389500972133, + "grad_norm": 0.06975147128105164, + "learning_rate": 6.170394707232549e-05, + "loss": 0.2098, + "step": 35666 + }, + { + "epoch": 2.8894199611147116, + "grad_norm": 0.07287190109491348, + "learning_rate": 6.169944641973086e-05, + "loss": 0.2221, + "step": 35667 + }, + { + "epoch": 2.8895009721322102, + "grad_norm": 0.06442992389202118, + "learning_rate": 6.169494576713623e-05, + "loss": 0.2223, + "step": 35668 + }, + { + "epoch": 2.8895819831497085, + "grad_norm": 0.0819707065820694, + "learning_rate": 6.169044511454161e-05, + "loss": 0.1962, + "step": 35669 + }, + { + "epoch": 2.8896629941672067, + "grad_norm": 0.08242817223072052, + "learning_rate": 6.168594446194698e-05, + "loss": 0.2302, + "step": 35670 + }, + { + "epoch": 2.889744005184705, + "grad_norm": 0.08381433039903641, + "learning_rate": 6.168144380935235e-05, + "loss": 0.2477, + "step": 35671 + }, + { + "epoch": 2.8898250162022032, + "grad_norm": 0.06956245005130768, + "learning_rate": 6.167694315675774e-05, + "loss": 0.2422, + "step": 35672 + }, + { + "epoch": 2.889906027219702, + "grad_norm": 0.06494015455245972, + "learning_rate": 6.167244250416311e-05, + "loss": 0.2564, + "step": 35673 + }, + { + "epoch": 2.8899870382372, + "grad_norm": 0.06729976087808609, + "learning_rate": 6.166794185156848e-05, + "loss": 0.2336, + "step": 35674 + }, + { + "epoch": 2.8900680492546984, + "grad_norm": 0.06940561532974243, + "learning_rate": 6.166344119897386e-05, + "loss": 0.2058, + "step": 35675 + }, + { + "epoch": 2.890149060272197, + "grad_norm": 0.06687884032726288, + "learning_rate": 6.165894054637924e-05, + "loss": 0.2139, + "step": 35676 + }, + { + "epoch": 2.8902300712896953, + "grad_norm": 0.07727450132369995, + "learning_rate": 6.16544398937846e-05, + "loss": 0.2432, + "step": 35677 + }, + { + "epoch": 2.8903110823071936, + "grad_norm": 0.06999623775482178, + 
"learning_rate": 6.164993924118998e-05, + "loss": 0.2718, + "step": 35678 + }, + { + "epoch": 2.8903920933246923, + "grad_norm": 0.07031282782554626, + "learning_rate": 6.164543858859536e-05, + "loss": 0.2262, + "step": 35679 + }, + { + "epoch": 2.8904731043421905, + "grad_norm": 0.07029929012060165, + "learning_rate": 6.164093793600072e-05, + "loss": 0.2375, + "step": 35680 + }, + { + "epoch": 2.8905541153596888, + "grad_norm": 0.08660591393709183, + "learning_rate": 6.16364372834061e-05, + "loss": 0.2649, + "step": 35681 + }, + { + "epoch": 2.8906351263771874, + "grad_norm": 0.08125916123390198, + "learning_rate": 6.163193663081148e-05, + "loss": 0.2546, + "step": 35682 + }, + { + "epoch": 2.8907161373946857, + "grad_norm": 0.07145148515701294, + "learning_rate": 6.162743597821684e-05, + "loss": 0.2218, + "step": 35683 + }, + { + "epoch": 2.890797148412184, + "grad_norm": 0.0741056427359581, + "learning_rate": 6.162293532562222e-05, + "loss": 0.2769, + "step": 35684 + }, + { + "epoch": 2.8908781594296826, + "grad_norm": 0.08147289603948593, + "learning_rate": 6.16184346730276e-05, + "loss": 0.2197, + "step": 35685 + }, + { + "epoch": 2.890959170447181, + "grad_norm": 0.06483854353427887, + "learning_rate": 6.161393402043296e-05, + "loss": 0.2311, + "step": 35686 + }, + { + "epoch": 2.891040181464679, + "grad_norm": 0.06676837056875229, + "learning_rate": 6.160943336783835e-05, + "loss": 0.2453, + "step": 35687 + }, + { + "epoch": 2.891121192482178, + "grad_norm": 0.07340171933174133, + "learning_rate": 6.160493271524372e-05, + "loss": 0.2072, + "step": 35688 + }, + { + "epoch": 2.891202203499676, + "grad_norm": 0.06559917330741882, + "learning_rate": 6.160043206264908e-05, + "loss": 0.2415, + "step": 35689 + }, + { + "epoch": 2.8912832145171743, + "grad_norm": 0.058529727160930634, + "learning_rate": 6.159593141005447e-05, + "loss": 0.2363, + "step": 35690 + }, + { + "epoch": 2.891364225534673, + "grad_norm": 0.06556835025548935, + "learning_rate": 6.159143075745984e-05, + "loss": 0.235, + "step": 35691 + }, + { + "epoch": 2.8914452365521712, + "grad_norm": 0.061084944754838943, + "learning_rate": 6.15869301048652e-05, + "loss": 0.2404, + "step": 35692 + }, + { + "epoch": 2.8915262475696695, + "grad_norm": 0.06845731288194656, + "learning_rate": 6.158242945227059e-05, + "loss": 0.2403, + "step": 35693 + }, + { + "epoch": 2.8916072585871677, + "grad_norm": 0.057648856192827225, + "learning_rate": 6.157792879967596e-05, + "loss": 0.213, + "step": 35694 + }, + { + "epoch": 2.891688269604666, + "grad_norm": 0.07292310148477554, + "learning_rate": 6.157342814708132e-05, + "loss": 0.224, + "step": 35695 + }, + { + "epoch": 2.8917692806221647, + "grad_norm": 0.06565499305725098, + "learning_rate": 6.156892749448671e-05, + "loss": 0.2563, + "step": 35696 + }, + { + "epoch": 2.891850291639663, + "grad_norm": 0.0779080018401146, + "learning_rate": 6.156442684189208e-05, + "loss": 0.232, + "step": 35697 + }, + { + "epoch": 2.891931302657161, + "grad_norm": 0.07010319828987122, + "learning_rate": 6.155992618929746e-05, + "loss": 0.2262, + "step": 35698 + }, + { + "epoch": 2.89201231367466, + "grad_norm": 0.06465441733598709, + "learning_rate": 6.155542553670283e-05, + "loss": 0.2268, + "step": 35699 + }, + { + "epoch": 2.892093324692158, + "grad_norm": 0.08106248080730438, + "learning_rate": 6.15509248841082e-05, + "loss": 0.241, + "step": 35700 + }, + { + "epoch": 2.8921743357096563, + "grad_norm": 0.06921873986721039, + "learning_rate": 6.154642423151358e-05, + "loss": 0.2296, + "step": 35701 + }, 
+ { + "epoch": 2.892255346727155, + "grad_norm": 0.07705320417881012, + "learning_rate": 6.154192357891895e-05, + "loss": 0.253, + "step": 35702 + }, + { + "epoch": 2.8923363577446533, + "grad_norm": 0.07825233042240143, + "learning_rate": 6.153742292632432e-05, + "loss": 0.2568, + "step": 35703 + }, + { + "epoch": 2.8924173687621515, + "grad_norm": 0.07463907450437546, + "learning_rate": 6.15329222737297e-05, + "loss": 0.2506, + "step": 35704 + }, + { + "epoch": 2.89249837977965, + "grad_norm": 0.07520240545272827, + "learning_rate": 6.152842162113507e-05, + "loss": 0.2678, + "step": 35705 + }, + { + "epoch": 2.8925793907971484, + "grad_norm": 0.07687666267156601, + "learning_rate": 6.152392096854044e-05, + "loss": 0.264, + "step": 35706 + }, + { + "epoch": 2.8926604018146467, + "grad_norm": 0.07741294801235199, + "learning_rate": 6.151942031594582e-05, + "loss": 0.2225, + "step": 35707 + }, + { + "epoch": 2.8927414128321454, + "grad_norm": 0.07104411721229553, + "learning_rate": 6.151491966335119e-05, + "loss": 0.2285, + "step": 35708 + }, + { + "epoch": 2.8928224238496436, + "grad_norm": 0.07104265689849854, + "learning_rate": 6.151041901075656e-05, + "loss": 0.2319, + "step": 35709 + }, + { + "epoch": 2.892903434867142, + "grad_norm": 0.0759245902299881, + "learning_rate": 6.150591835816194e-05, + "loss": 0.2577, + "step": 35710 + }, + { + "epoch": 2.8929844458846405, + "grad_norm": 0.06897891312837601, + "learning_rate": 6.150141770556731e-05, + "loss": 0.2196, + "step": 35711 + }, + { + "epoch": 2.893065456902139, + "grad_norm": 0.07628782093524933, + "learning_rate": 6.149691705297269e-05, + "loss": 0.2444, + "step": 35712 + }, + { + "epoch": 2.893146467919637, + "grad_norm": 0.06364277005195618, + "learning_rate": 6.149241640037806e-05, + "loss": 0.2097, + "step": 35713 + }, + { + "epoch": 2.8932274789371357, + "grad_norm": 0.07740595191717148, + "learning_rate": 6.148791574778343e-05, + "loss": 0.2185, + "step": 35714 + }, + { + "epoch": 2.893308489954634, + "grad_norm": 0.07443327456712723, + "learning_rate": 6.14834150951888e-05, + "loss": 0.2131, + "step": 35715 + }, + { + "epoch": 2.893389500972132, + "grad_norm": 0.0595758818089962, + "learning_rate": 6.147891444259418e-05, + "loss": 0.2057, + "step": 35716 + }, + { + "epoch": 2.8934705119896305, + "grad_norm": 0.07437099516391754, + "learning_rate": 6.147441378999955e-05, + "loss": 0.2699, + "step": 35717 + }, + { + "epoch": 2.8935515230071287, + "grad_norm": 0.08497471362352371, + "learning_rate": 6.146991313740493e-05, + "loss": 0.2037, + "step": 35718 + }, + { + "epoch": 2.8936325340246274, + "grad_norm": 0.07126790285110474, + "learning_rate": 6.14654124848103e-05, + "loss": 0.2333, + "step": 35719 + }, + { + "epoch": 2.8937135450421256, + "grad_norm": 0.07110083848237991, + "learning_rate": 6.146091183221567e-05, + "loss": 0.2206, + "step": 35720 + }, + { + "epoch": 2.893794556059624, + "grad_norm": 0.07824023067951202, + "learning_rate": 6.145641117962105e-05, + "loss": 0.239, + "step": 35721 + }, + { + "epoch": 2.8938755670771226, + "grad_norm": 0.0700746402144432, + "learning_rate": 6.145191052702642e-05, + "loss": 0.2125, + "step": 35722 + }, + { + "epoch": 2.893956578094621, + "grad_norm": 0.06137807294726372, + "learning_rate": 6.14474098744318e-05, + "loss": 0.2615, + "step": 35723 + }, + { + "epoch": 2.894037589112119, + "grad_norm": 0.0737156793475151, + "learning_rate": 6.144290922183717e-05, + "loss": 0.2481, + "step": 35724 + }, + { + "epoch": 2.8941186001296177, + "grad_norm": 0.06886066496372223, + 
"learning_rate": 6.143840856924254e-05, + "loss": 0.2657, + "step": 35725 + }, + { + "epoch": 2.894199611147116, + "grad_norm": 0.08199062943458557, + "learning_rate": 6.143390791664792e-05, + "loss": 0.2282, + "step": 35726 + }, + { + "epoch": 2.8942806221646142, + "grad_norm": 0.053236592561006546, + "learning_rate": 6.142940726405329e-05, + "loss": 0.2214, + "step": 35727 + }, + { + "epoch": 2.894361633182113, + "grad_norm": 0.08542463928461075, + "learning_rate": 6.142490661145866e-05, + "loss": 0.2405, + "step": 35728 + }, + { + "epoch": 2.894442644199611, + "grad_norm": 0.07335537672042847, + "learning_rate": 6.142040595886404e-05, + "loss": 0.2456, + "step": 35729 + }, + { + "epoch": 2.8945236552171094, + "grad_norm": 0.06895040720701218, + "learning_rate": 6.141590530626941e-05, + "loss": 0.2405, + "step": 35730 + }, + { + "epoch": 2.894604666234608, + "grad_norm": 0.0709012895822525, + "learning_rate": 6.141140465367478e-05, + "loss": 0.223, + "step": 35731 + }, + { + "epoch": 2.8946856772521063, + "grad_norm": 0.058473143726587296, + "learning_rate": 6.140690400108016e-05, + "loss": 0.2292, + "step": 35732 + }, + { + "epoch": 2.8947666882696046, + "grad_norm": 0.07192769646644592, + "learning_rate": 6.140240334848553e-05, + "loss": 0.2443, + "step": 35733 + }, + { + "epoch": 2.8948476992871033, + "grad_norm": 0.0589325875043869, + "learning_rate": 6.13979026958909e-05, + "loss": 0.2225, + "step": 35734 + }, + { + "epoch": 2.8949287103046015, + "grad_norm": 0.06955495476722717, + "learning_rate": 6.139340204329628e-05, + "loss": 0.2651, + "step": 35735 + }, + { + "epoch": 2.8950097213220998, + "grad_norm": 0.07267536222934723, + "learning_rate": 6.138890139070165e-05, + "loss": 0.2517, + "step": 35736 + }, + { + "epoch": 2.8950907323395985, + "grad_norm": 0.06132880225777626, + "learning_rate": 6.138440073810703e-05, + "loss": 0.2315, + "step": 35737 + }, + { + "epoch": 2.8951717433570967, + "grad_norm": 0.08604191243648529, + "learning_rate": 6.13799000855124e-05, + "loss": 0.2307, + "step": 35738 + }, + { + "epoch": 2.895252754374595, + "grad_norm": 0.0703839436173439, + "learning_rate": 6.137539943291777e-05, + "loss": 0.2391, + "step": 35739 + }, + { + "epoch": 2.895333765392093, + "grad_norm": 0.0668756365776062, + "learning_rate": 6.137089878032315e-05, + "loss": 0.2407, + "step": 35740 + }, + { + "epoch": 2.8954147764095914, + "grad_norm": 0.08132848143577576, + "learning_rate": 6.136639812772852e-05, + "loss": 0.226, + "step": 35741 + }, + { + "epoch": 2.89549578742709, + "grad_norm": 0.06000250205397606, + "learning_rate": 6.136189747513391e-05, + "loss": 0.227, + "step": 35742 + }, + { + "epoch": 2.8955767984445884, + "grad_norm": 0.07280347496271133, + "learning_rate": 6.135739682253927e-05, + "loss": 0.243, + "step": 35743 + }, + { + "epoch": 2.8956578094620866, + "grad_norm": 0.05585459619760513, + "learning_rate": 6.135289616994464e-05, + "loss": 0.1944, + "step": 35744 + }, + { + "epoch": 2.8957388204795853, + "grad_norm": 0.07624555379152298, + "learning_rate": 6.134839551735003e-05, + "loss": 0.2585, + "step": 35745 + }, + { + "epoch": 2.8958198314970836, + "grad_norm": 0.08827781677246094, + "learning_rate": 6.134389486475539e-05, + "loss": 0.2441, + "step": 35746 + }, + { + "epoch": 2.895900842514582, + "grad_norm": 0.08926903456449509, + "learning_rate": 6.133939421216076e-05, + "loss": 0.25, + "step": 35747 + }, + { + "epoch": 2.8959818535320805, + "grad_norm": 0.06952491402626038, + "learning_rate": 6.133489355956615e-05, + "loss": 0.2142, + "step": 35748 + 
}, + { + "epoch": 2.8960628645495787, + "grad_norm": 0.07767337560653687, + "learning_rate": 6.133039290697151e-05, + "loss": 0.2297, + "step": 35749 + }, + { + "epoch": 2.896143875567077, + "grad_norm": 0.0816320851445198, + "learning_rate": 6.132589225437688e-05, + "loss": 0.2429, + "step": 35750 + }, + { + "epoch": 2.8962248865845757, + "grad_norm": 0.06641452759504318, + "learning_rate": 6.132139160178227e-05, + "loss": 0.235, + "step": 35751 + }, + { + "epoch": 2.896305897602074, + "grad_norm": 0.0660315752029419, + "learning_rate": 6.131689094918763e-05, + "loss": 0.2504, + "step": 35752 + }, + { + "epoch": 2.896386908619572, + "grad_norm": 0.09795043617486954, + "learning_rate": 6.131239029659302e-05, + "loss": 0.2725, + "step": 35753 + }, + { + "epoch": 2.896467919637071, + "grad_norm": 0.07718279212713242, + "learning_rate": 6.130788964399839e-05, + "loss": 0.2823, + "step": 35754 + }, + { + "epoch": 2.896548930654569, + "grad_norm": 0.07075318694114685, + "learning_rate": 6.130338899140375e-05, + "loss": 0.2465, + "step": 35755 + }, + { + "epoch": 2.8966299416720673, + "grad_norm": 0.06218565255403519, + "learning_rate": 6.129888833880914e-05, + "loss": 0.2148, + "step": 35756 + }, + { + "epoch": 2.896710952689566, + "grad_norm": 0.07949451357126236, + "learning_rate": 6.129438768621451e-05, + "loss": 0.2384, + "step": 35757 + }, + { + "epoch": 2.8967919637070643, + "grad_norm": 0.053814273327589035, + "learning_rate": 6.128988703361987e-05, + "loss": 0.2206, + "step": 35758 + }, + { + "epoch": 2.8968729747245625, + "grad_norm": 0.07167844474315643, + "learning_rate": 6.128538638102526e-05, + "loss": 0.2381, + "step": 35759 + }, + { + "epoch": 2.8969539857420608, + "grad_norm": 0.07708174735307693, + "learning_rate": 6.128088572843063e-05, + "loss": 0.232, + "step": 35760 + }, + { + "epoch": 2.8970349967595594, + "grad_norm": 0.06454035639762878, + "learning_rate": 6.127638507583599e-05, + "loss": 0.2296, + "step": 35761 + }, + { + "epoch": 2.8971160077770577, + "grad_norm": 0.08612510561943054, + "learning_rate": 6.127188442324138e-05, + "loss": 0.2265, + "step": 35762 + }, + { + "epoch": 2.897197018794556, + "grad_norm": 0.06060539931058884, + "learning_rate": 6.126738377064675e-05, + "loss": 0.2063, + "step": 35763 + }, + { + "epoch": 2.897278029812054, + "grad_norm": 0.07030733674764633, + "learning_rate": 6.126288311805211e-05, + "loss": 0.2411, + "step": 35764 + }, + { + "epoch": 2.897359040829553, + "grad_norm": 0.06264536082744598, + "learning_rate": 6.12583824654575e-05, + "loss": 0.2182, + "step": 35765 + }, + { + "epoch": 2.897440051847051, + "grad_norm": 0.08044012635946274, + "learning_rate": 6.125388181286287e-05, + "loss": 0.2497, + "step": 35766 + }, + { + "epoch": 2.8975210628645494, + "grad_norm": 0.06459780782461166, + "learning_rate": 6.124938116026823e-05, + "loss": 0.2542, + "step": 35767 + }, + { + "epoch": 2.897602073882048, + "grad_norm": 0.0798090472817421, + "learning_rate": 6.124488050767362e-05, + "loss": 0.2075, + "step": 35768 + }, + { + "epoch": 2.8976830848995463, + "grad_norm": 0.0677122250199318, + "learning_rate": 6.1240379855079e-05, + "loss": 0.1975, + "step": 35769 + }, + { + "epoch": 2.8977640959170445, + "grad_norm": 0.06713425368070602, + "learning_rate": 6.123587920248435e-05, + "loss": 0.2271, + "step": 35770 + }, + { + "epoch": 2.8978451069345432, + "grad_norm": 0.09057461470365524, + "learning_rate": 6.123137854988974e-05, + "loss": 0.2581, + "step": 35771 + }, + { + "epoch": 2.8979261179520415, + "grad_norm": 0.06432762742042542, + 
"learning_rate": 6.122687789729512e-05, + "loss": 0.2248, + "step": 35772 + }, + { + "epoch": 2.8980071289695397, + "grad_norm": 0.07236693799495697, + "learning_rate": 6.122237724470048e-05, + "loss": 0.2248, + "step": 35773 + }, + { + "epoch": 2.8980881399870384, + "grad_norm": 0.06687065213918686, + "learning_rate": 6.121787659210586e-05, + "loss": 0.1849, + "step": 35774 + }, + { + "epoch": 2.8981691510045366, + "grad_norm": 0.05940384045243263, + "learning_rate": 6.121337593951124e-05, + "loss": 0.2392, + "step": 35775 + }, + { + "epoch": 2.898250162022035, + "grad_norm": 0.07831598073244095, + "learning_rate": 6.12088752869166e-05, + "loss": 0.2467, + "step": 35776 + }, + { + "epoch": 2.8983311730395336, + "grad_norm": 0.08034084737300873, + "learning_rate": 6.120437463432198e-05, + "loss": 0.2475, + "step": 35777 + }, + { + "epoch": 2.898412184057032, + "grad_norm": 0.06301796436309814, + "learning_rate": 6.119987398172736e-05, + "loss": 0.2021, + "step": 35778 + }, + { + "epoch": 2.89849319507453, + "grad_norm": 0.0662277564406395, + "learning_rate": 6.119537332913273e-05, + "loss": 0.2191, + "step": 35779 + }, + { + "epoch": 2.8985742060920288, + "grad_norm": 0.06957314908504486, + "learning_rate": 6.11908726765381e-05, + "loss": 0.2556, + "step": 35780 + }, + { + "epoch": 2.898655217109527, + "grad_norm": 0.05701505020260811, + "learning_rate": 6.118637202394348e-05, + "loss": 0.244, + "step": 35781 + }, + { + "epoch": 2.8987362281270252, + "grad_norm": 0.06337479501962662, + "learning_rate": 6.118187137134885e-05, + "loss": 0.222, + "step": 35782 + }, + { + "epoch": 2.8988172391445235, + "grad_norm": 0.07498802244663239, + "learning_rate": 6.117737071875422e-05, + "loss": 0.2608, + "step": 35783 + }, + { + "epoch": 2.898898250162022, + "grad_norm": 0.06363961100578308, + "learning_rate": 6.11728700661596e-05, + "loss": 0.2693, + "step": 35784 + }, + { + "epoch": 2.8989792611795204, + "grad_norm": 0.058687712997198105, + "learning_rate": 6.116836941356497e-05, + "loss": 0.2206, + "step": 35785 + }, + { + "epoch": 2.8990602721970187, + "grad_norm": 0.07359886914491653, + "learning_rate": 6.116386876097035e-05, + "loss": 0.2365, + "step": 35786 + }, + { + "epoch": 2.899141283214517, + "grad_norm": 0.0818164125084877, + "learning_rate": 6.115936810837572e-05, + "loss": 0.2434, + "step": 35787 + }, + { + "epoch": 2.8992222942320156, + "grad_norm": 0.06891828030347824, + "learning_rate": 6.115486745578109e-05, + "loss": 0.2451, + "step": 35788 + }, + { + "epoch": 2.899303305249514, + "grad_norm": 0.0845608338713646, + "learning_rate": 6.115036680318647e-05, + "loss": 0.2512, + "step": 35789 + }, + { + "epoch": 2.899384316267012, + "grad_norm": 0.07846824079751968, + "learning_rate": 6.114586615059184e-05, + "loss": 0.206, + "step": 35790 + }, + { + "epoch": 2.899465327284511, + "grad_norm": 0.07462865859270096, + "learning_rate": 6.114136549799721e-05, + "loss": 0.2471, + "step": 35791 + }, + { + "epoch": 2.899546338302009, + "grad_norm": 0.06899303197860718, + "learning_rate": 6.113686484540259e-05, + "loss": 0.223, + "step": 35792 + }, + { + "epoch": 2.8996273493195073, + "grad_norm": 0.07402803003787994, + "learning_rate": 6.113236419280796e-05, + "loss": 0.2511, + "step": 35793 + }, + { + "epoch": 2.899708360337006, + "grad_norm": 0.06916865706443787, + "learning_rate": 6.112786354021333e-05, + "loss": 0.226, + "step": 35794 + }, + { + "epoch": 2.899789371354504, + "grad_norm": 0.07550256699323654, + "learning_rate": 6.112336288761871e-05, + "loss": 0.2197, + "step": 35795 + }, + 
{ + "epoch": 2.8998703823720025, + "grad_norm": 0.06550214439630508, + "learning_rate": 6.111886223502408e-05, + "loss": 0.2392, + "step": 35796 + }, + { + "epoch": 2.899951393389501, + "grad_norm": 0.09479155391454697, + "learning_rate": 6.111436158242946e-05, + "loss": 0.2623, + "step": 35797 + }, + { + "epoch": 2.9000324044069994, + "grad_norm": 0.06591324508190155, + "learning_rate": 6.110986092983483e-05, + "loss": 0.2439, + "step": 35798 + }, + { + "epoch": 2.9001134154244976, + "grad_norm": 0.07462452352046967, + "learning_rate": 6.11053602772402e-05, + "loss": 0.2639, + "step": 35799 + }, + { + "epoch": 2.9001944264419963, + "grad_norm": 0.0622115395963192, + "learning_rate": 6.110085962464558e-05, + "loss": 0.2001, + "step": 35800 + }, + { + "epoch": 2.9002754374594946, + "grad_norm": 0.07298856973648071, + "learning_rate": 6.109635897205095e-05, + "loss": 0.241, + "step": 35801 + }, + { + "epoch": 2.900356448476993, + "grad_norm": 0.07199979573488235, + "learning_rate": 6.109185831945632e-05, + "loss": 0.2375, + "step": 35802 + }, + { + "epoch": 2.9004374594944915, + "grad_norm": 0.08813222497701645, + "learning_rate": 6.10873576668617e-05, + "loss": 0.2208, + "step": 35803 + }, + { + "epoch": 2.9005184705119897, + "grad_norm": 0.06779766082763672, + "learning_rate": 6.108285701426707e-05, + "loss": 0.2225, + "step": 35804 + }, + { + "epoch": 2.900599481529488, + "grad_norm": 0.07290019094944, + "learning_rate": 6.107835636167244e-05, + "loss": 0.2221, + "step": 35805 + }, + { + "epoch": 2.9006804925469862, + "grad_norm": 0.07287006825208664, + "learning_rate": 6.107385570907782e-05, + "loss": 0.2547, + "step": 35806 + }, + { + "epoch": 2.900761503564485, + "grad_norm": 0.07200240343809128, + "learning_rate": 6.106935505648319e-05, + "loss": 0.2173, + "step": 35807 + }, + { + "epoch": 2.900842514581983, + "grad_norm": 0.06275129318237305, + "learning_rate": 6.106485440388856e-05, + "loss": 0.2357, + "step": 35808 + }, + { + "epoch": 2.9009235255994814, + "grad_norm": 0.07564835995435715, + "learning_rate": 6.106035375129394e-05, + "loss": 0.2277, + "step": 35809 + }, + { + "epoch": 2.9010045366169797, + "grad_norm": 0.05272950977087021, + "learning_rate": 6.105585309869931e-05, + "loss": 0.191, + "step": 35810 + }, + { + "epoch": 2.9010855476344783, + "grad_norm": 0.0691477358341217, + "learning_rate": 6.10513524461047e-05, + "loss": 0.2281, + "step": 35811 + }, + { + "epoch": 2.9011665586519766, + "grad_norm": 0.07196378707885742, + "learning_rate": 6.104685179351006e-05, + "loss": 0.2087, + "step": 35812 + }, + { + "epoch": 2.901247569669475, + "grad_norm": 0.059849925339221954, + "learning_rate": 6.104235114091543e-05, + "loss": 0.2459, + "step": 35813 + }, + { + "epoch": 2.9013285806869735, + "grad_norm": 0.0893348902463913, + "learning_rate": 6.103785048832082e-05, + "loss": 0.2147, + "step": 35814 + }, + { + "epoch": 2.9014095917044718, + "grad_norm": 0.06526979058980942, + "learning_rate": 6.103334983572618e-05, + "loss": 0.2392, + "step": 35815 + }, + { + "epoch": 2.90149060272197, + "grad_norm": 0.06764549016952515, + "learning_rate": 6.1028849183131554e-05, + "loss": 0.2748, + "step": 35816 + }, + { + "epoch": 2.9015716137394687, + "grad_norm": 0.07543917000293732, + "learning_rate": 6.1024348530536934e-05, + "loss": 0.2473, + "step": 35817 + }, + { + "epoch": 2.901652624756967, + "grad_norm": 0.0789756327867508, + "learning_rate": 6.101984787794231e-05, + "loss": 0.2554, + "step": 35818 + }, + { + "epoch": 2.901733635774465, + "grad_norm": 0.05799851939082146, + 
"learning_rate": 6.1015347225347675e-05, + "loss": 0.215, + "step": 35819 + }, + { + "epoch": 2.901814646791964, + "grad_norm": 0.06580415368080139, + "learning_rate": 6.1010846572753055e-05, + "loss": 0.2228, + "step": 35820 + }, + { + "epoch": 2.901895657809462, + "grad_norm": 0.05901229754090309, + "learning_rate": 6.100634592015843e-05, + "loss": 0.1921, + "step": 35821 + }, + { + "epoch": 2.9019766688269604, + "grad_norm": 0.07883156836032867, + "learning_rate": 6.1001845267563795e-05, + "loss": 0.2291, + "step": 35822 + }, + { + "epoch": 2.902057679844459, + "grad_norm": 0.07001320272684097, + "learning_rate": 6.0997344614969176e-05, + "loss": 0.2415, + "step": 35823 + }, + { + "epoch": 2.9021386908619573, + "grad_norm": 0.0769026055932045, + "learning_rate": 6.099284396237455e-05, + "loss": 0.2237, + "step": 35824 + }, + { + "epoch": 2.9022197018794555, + "grad_norm": 0.07997845858335495, + "learning_rate": 6.0988343309779916e-05, + "loss": 0.2736, + "step": 35825 + }, + { + "epoch": 2.9023007128969542, + "grad_norm": 0.06704940646886826, + "learning_rate": 6.0983842657185297e-05, + "loss": 0.2322, + "step": 35826 + }, + { + "epoch": 2.9023817239144525, + "grad_norm": 0.07610899209976196, + "learning_rate": 6.097934200459067e-05, + "loss": 0.214, + "step": 35827 + }, + { + "epoch": 2.9024627349319507, + "grad_norm": 0.07217517495155334, + "learning_rate": 6.097484135199604e-05, + "loss": 0.2088, + "step": 35828 + }, + { + "epoch": 2.902543745949449, + "grad_norm": 0.07086925953626633, + "learning_rate": 6.097034069940142e-05, + "loss": 0.2388, + "step": 35829 + }, + { + "epoch": 2.9026247569669477, + "grad_norm": 0.07095544040203094, + "learning_rate": 6.096584004680679e-05, + "loss": 0.2729, + "step": 35830 + }, + { + "epoch": 2.902705767984446, + "grad_norm": 0.0699014961719513, + "learning_rate": 6.096133939421217e-05, + "loss": 0.2417, + "step": 35831 + }, + { + "epoch": 2.902786779001944, + "grad_norm": 0.0674590915441513, + "learning_rate": 6.095683874161754e-05, + "loss": 0.2022, + "step": 35832 + }, + { + "epoch": 2.9028677900194424, + "grad_norm": 0.06928414106369019, + "learning_rate": 6.095233808902291e-05, + "loss": 0.2075, + "step": 35833 + }, + { + "epoch": 2.902948801036941, + "grad_norm": 0.06169206649065018, + "learning_rate": 6.094783743642829e-05, + "loss": 0.1976, + "step": 35834 + }, + { + "epoch": 2.9030298120544393, + "grad_norm": 0.07483284175395966, + "learning_rate": 6.094333678383366e-05, + "loss": 0.2431, + "step": 35835 + }, + { + "epoch": 2.9031108230719376, + "grad_norm": 0.07868918031454086, + "learning_rate": 6.093883613123903e-05, + "loss": 0.2426, + "step": 35836 + }, + { + "epoch": 2.9031918340894363, + "grad_norm": 0.06046166270971298, + "learning_rate": 6.093433547864441e-05, + "loss": 0.2048, + "step": 35837 + }, + { + "epoch": 2.9032728451069345, + "grad_norm": 0.05576327070593834, + "learning_rate": 6.092983482604978e-05, + "loss": 0.2047, + "step": 35838 + }, + { + "epoch": 2.9033538561244328, + "grad_norm": 0.07098156213760376, + "learning_rate": 6.0925334173455153e-05, + "loss": 0.212, + "step": 35839 + }, + { + "epoch": 2.9034348671419314, + "grad_norm": 0.06611877679824829, + "learning_rate": 6.0920833520860534e-05, + "loss": 0.201, + "step": 35840 + }, + { + "epoch": 2.9035158781594297, + "grad_norm": 0.05478915944695473, + "learning_rate": 6.09163328682659e-05, + "loss": 0.2121, + "step": 35841 + }, + { + "epoch": 2.903596889176928, + "grad_norm": 0.07683388888835907, + "learning_rate": 6.0911832215671274e-05, + "loss": 0.2307, + 
"step": 35842 + }, + { + "epoch": 2.9036779001944266, + "grad_norm": 0.06985338777303696, + "learning_rate": 6.0907331563076655e-05, + "loss": 0.2677, + "step": 35843 + }, + { + "epoch": 2.903758911211925, + "grad_norm": 0.05494997650384903, + "learning_rate": 6.090283091048202e-05, + "loss": 0.207, + "step": 35844 + }, + { + "epoch": 2.903839922229423, + "grad_norm": 0.06654516607522964, + "learning_rate": 6.0898330257887395e-05, + "loss": 0.2374, + "step": 35845 + }, + { + "epoch": 2.903920933246922, + "grad_norm": 0.08357511460781097, + "learning_rate": 6.0893829605292775e-05, + "loss": 0.3211, + "step": 35846 + }, + { + "epoch": 2.90400194426442, + "grad_norm": 0.06784695386886597, + "learning_rate": 6.088932895269814e-05, + "loss": 0.2089, + "step": 35847 + }, + { + "epoch": 2.9040829552819183, + "grad_norm": 0.056639935821294785, + "learning_rate": 6.0884828300103516e-05, + "loss": 0.2092, + "step": 35848 + }, + { + "epoch": 2.904163966299417, + "grad_norm": 0.06386925280094147, + "learning_rate": 6.0880327647508896e-05, + "loss": 0.2534, + "step": 35849 + }, + { + "epoch": 2.904244977316915, + "grad_norm": 0.06474310904741287, + "learning_rate": 6.087582699491426e-05, + "loss": 0.2513, + "step": 35850 + }, + { + "epoch": 2.9043259883344135, + "grad_norm": 0.06959279626607895, + "learning_rate": 6.087132634231964e-05, + "loss": 0.2557, + "step": 35851 + }, + { + "epoch": 2.9044069993519117, + "grad_norm": 0.06982912123203278, + "learning_rate": 6.086682568972502e-05, + "loss": 0.2188, + "step": 35852 + }, + { + "epoch": 2.9044880103694104, + "grad_norm": 0.06339522451162338, + "learning_rate": 6.0862325037130384e-05, + "loss": 0.2278, + "step": 35853 + }, + { + "epoch": 2.9045690213869086, + "grad_norm": 0.06980250775814056, + "learning_rate": 6.085782438453576e-05, + "loss": 0.2094, + "step": 35854 + }, + { + "epoch": 2.904650032404407, + "grad_norm": 0.06515302509069443, + "learning_rate": 6.085332373194114e-05, + "loss": 0.237, + "step": 35855 + }, + { + "epoch": 2.904731043421905, + "grad_norm": 0.05380278453230858, + "learning_rate": 6.0848823079346505e-05, + "loss": 0.2298, + "step": 35856 + }, + { + "epoch": 2.904812054439404, + "grad_norm": 0.06366229057312012, + "learning_rate": 6.0844322426751885e-05, + "loss": 0.2188, + "step": 35857 + }, + { + "epoch": 2.904893065456902, + "grad_norm": 0.08803581446409225, + "learning_rate": 6.083982177415726e-05, + "loss": 0.2557, + "step": 35858 + }, + { + "epoch": 2.9049740764744003, + "grad_norm": 0.08504535257816315, + "learning_rate": 6.0835321121562626e-05, + "loss": 0.2789, + "step": 35859 + }, + { + "epoch": 2.905055087491899, + "grad_norm": 0.07685036212205887, + "learning_rate": 6.0830820468968006e-05, + "loss": 0.2296, + "step": 35860 + }, + { + "epoch": 2.9051360985093972, + "grad_norm": 0.08343004435300827, + "learning_rate": 6.082631981637338e-05, + "loss": 0.261, + "step": 35861 + }, + { + "epoch": 2.9052171095268955, + "grad_norm": 0.07279090583324432, + "learning_rate": 6.0821819163778746e-05, + "loss": 0.2527, + "step": 35862 + }, + { + "epoch": 2.905298120544394, + "grad_norm": 0.07041387259960175, + "learning_rate": 6.081731851118413e-05, + "loss": 0.2108, + "step": 35863 + }, + { + "epoch": 2.9053791315618924, + "grad_norm": 0.07155166566371918, + "learning_rate": 6.08128178585895e-05, + "loss": 0.2382, + "step": 35864 + }, + { + "epoch": 2.9054601425793907, + "grad_norm": 0.07028033584356308, + "learning_rate": 6.080831720599487e-05, + "loss": 0.2522, + "step": 35865 + }, + { + "epoch": 2.9055411535968894, + 
"grad_norm": 0.05422496423125267, + "learning_rate": 6.080381655340025e-05, + "loss": 0.2167, + "step": 35866 + }, + { + "epoch": 2.9056221646143876, + "grad_norm": 0.06637411564588547, + "learning_rate": 6.079931590080562e-05, + "loss": 0.2423, + "step": 35867 + }, + { + "epoch": 2.905703175631886, + "grad_norm": 0.08302521705627441, + "learning_rate": 6.079481524821099e-05, + "loss": 0.2241, + "step": 35868 + }, + { + "epoch": 2.9057841866493845, + "grad_norm": 0.06816847622394562, + "learning_rate": 6.079031459561637e-05, + "loss": 0.262, + "step": 35869 + }, + { + "epoch": 2.905865197666883, + "grad_norm": 0.07962358742952347, + "learning_rate": 6.078581394302174e-05, + "loss": 0.2411, + "step": 35870 + }, + { + "epoch": 2.905946208684381, + "grad_norm": 0.06026584282517433, + "learning_rate": 6.078131329042711e-05, + "loss": 0.2123, + "step": 35871 + }, + { + "epoch": 2.9060272197018797, + "grad_norm": 0.07744492590427399, + "learning_rate": 6.077681263783249e-05, + "loss": 0.2571, + "step": 35872 + }, + { + "epoch": 2.906108230719378, + "grad_norm": 0.07641256600618362, + "learning_rate": 6.077231198523786e-05, + "loss": 0.2433, + "step": 35873 + }, + { + "epoch": 2.906189241736876, + "grad_norm": 0.06493622064590454, + "learning_rate": 6.076781133264323e-05, + "loss": 0.2226, + "step": 35874 + }, + { + "epoch": 2.9062702527543745, + "grad_norm": 0.06400008499622345, + "learning_rate": 6.076331068004861e-05, + "loss": 0.2073, + "step": 35875 + }, + { + "epoch": 2.906351263771873, + "grad_norm": 0.08324109762907028, + "learning_rate": 6.0758810027453984e-05, + "loss": 0.2243, + "step": 35876 + }, + { + "epoch": 2.9064322747893714, + "grad_norm": 0.0696927011013031, + "learning_rate": 6.075430937485935e-05, + "loss": 0.2262, + "step": 35877 + }, + { + "epoch": 2.9065132858068696, + "grad_norm": 0.07888327538967133, + "learning_rate": 6.074980872226473e-05, + "loss": 0.2412, + "step": 35878 + }, + { + "epoch": 2.906594296824368, + "grad_norm": 0.07660671323537827, + "learning_rate": 6.0745308069670104e-05, + "loss": 0.2162, + "step": 35879 + }, + { + "epoch": 2.9066753078418666, + "grad_norm": 0.07841958850622177, + "learning_rate": 6.074080741707547e-05, + "loss": 0.2149, + "step": 35880 + }, + { + "epoch": 2.906756318859365, + "grad_norm": 0.0810098648071289, + "learning_rate": 6.073630676448085e-05, + "loss": 0.2379, + "step": 35881 + }, + { + "epoch": 2.906837329876863, + "grad_norm": 0.06500328332185745, + "learning_rate": 6.0731806111886225e-05, + "loss": 0.2691, + "step": 35882 + }, + { + "epoch": 2.9069183408943617, + "grad_norm": 0.0727931559085846, + "learning_rate": 6.072730545929159e-05, + "loss": 0.2249, + "step": 35883 + }, + { + "epoch": 2.90699935191186, + "grad_norm": 0.05468270182609558, + "learning_rate": 6.072280480669697e-05, + "loss": 0.225, + "step": 35884 + }, + { + "epoch": 2.9070803629293582, + "grad_norm": 0.06025093048810959, + "learning_rate": 6.0718304154102346e-05, + "loss": 0.1984, + "step": 35885 + }, + { + "epoch": 2.907161373946857, + "grad_norm": 0.06294640898704529, + "learning_rate": 6.0713803501507726e-05, + "loss": 0.2257, + "step": 35886 + }, + { + "epoch": 2.907242384964355, + "grad_norm": 0.06526791304349899, + "learning_rate": 6.07093028489131e-05, + "loss": 0.2354, + "step": 35887 + }, + { + "epoch": 2.9073233959818534, + "grad_norm": 0.07205391675233841, + "learning_rate": 6.070480219631847e-05, + "loss": 0.2435, + "step": 35888 + }, + { + "epoch": 2.907404406999352, + "grad_norm": 0.061126165091991425, + "learning_rate": 
6.070030154372385e-05, + "loss": 0.2328, + "step": 35889 + }, + { + "epoch": 2.9074854180168503, + "grad_norm": 0.08679535239934921, + "learning_rate": 6.069580089112922e-05, + "loss": 0.2527, + "step": 35890 + }, + { + "epoch": 2.9075664290343486, + "grad_norm": 0.0681762620806694, + "learning_rate": 6.069130023853459e-05, + "loss": 0.2374, + "step": 35891 + }, + { + "epoch": 2.9076474400518473, + "grad_norm": 0.08707486838102341, + "learning_rate": 6.068679958593997e-05, + "loss": 0.2545, + "step": 35892 + }, + { + "epoch": 2.9077284510693455, + "grad_norm": 0.07081759721040726, + "learning_rate": 6.068229893334534e-05, + "loss": 0.233, + "step": 35893 + }, + { + "epoch": 2.9078094620868438, + "grad_norm": 0.06580916047096252, + "learning_rate": 6.067779828075071e-05, + "loss": 0.2258, + "step": 35894 + }, + { + "epoch": 2.9078904731043425, + "grad_norm": 0.08303841948509216, + "learning_rate": 6.067329762815609e-05, + "loss": 0.223, + "step": 35895 + }, + { + "epoch": 2.9079714841218407, + "grad_norm": 0.0737767368555069, + "learning_rate": 6.066879697556146e-05, + "loss": 0.2329, + "step": 35896 + }, + { + "epoch": 2.908052495139339, + "grad_norm": 0.07478015124797821, + "learning_rate": 6.066429632296683e-05, + "loss": 0.2463, + "step": 35897 + }, + { + "epoch": 2.908133506156837, + "grad_norm": 0.0626714825630188, + "learning_rate": 6.065979567037221e-05, + "loss": 0.2489, + "step": 35898 + }, + { + "epoch": 2.9082145171743354, + "grad_norm": 0.07597566395998001, + "learning_rate": 6.065529501777758e-05, + "loss": 0.2727, + "step": 35899 + }, + { + "epoch": 2.908295528191834, + "grad_norm": 0.07273010909557343, + "learning_rate": 6.065079436518295e-05, + "loss": 0.2128, + "step": 35900 + }, + { + "epoch": 2.9083765392093324, + "grad_norm": 0.07037299126386642, + "learning_rate": 6.064629371258833e-05, + "loss": 0.2208, + "step": 35901 + }, + { + "epoch": 2.9084575502268306, + "grad_norm": 0.07470954954624176, + "learning_rate": 6.0641793059993704e-05, + "loss": 0.2775, + "step": 35902 + }, + { + "epoch": 2.9085385612443293, + "grad_norm": 0.06422090530395508, + "learning_rate": 6.063729240739907e-05, + "loss": 0.2161, + "step": 35903 + }, + { + "epoch": 2.9086195722618275, + "grad_norm": 0.0677059069275856, + "learning_rate": 6.063279175480445e-05, + "loss": 0.2187, + "step": 35904 + }, + { + "epoch": 2.908700583279326, + "grad_norm": 0.05977741628885269, + "learning_rate": 6.0628291102209825e-05, + "loss": 0.2246, + "step": 35905 + }, + { + "epoch": 2.9087815942968245, + "grad_norm": 0.06989169120788574, + "learning_rate": 6.062379044961519e-05, + "loss": 0.2392, + "step": 35906 + }, + { + "epoch": 2.9088626053143227, + "grad_norm": 0.07747205346822739, + "learning_rate": 6.061928979702057e-05, + "loss": 0.2463, + "step": 35907 + }, + { + "epoch": 2.908943616331821, + "grad_norm": 0.059107888489961624, + "learning_rate": 6.0614789144425946e-05, + "loss": 0.2219, + "step": 35908 + }, + { + "epoch": 2.9090246273493197, + "grad_norm": 0.06700557470321655, + "learning_rate": 6.061028849183131e-05, + "loss": 0.2547, + "step": 35909 + }, + { + "epoch": 2.909105638366818, + "grad_norm": 0.057208240032196045, + "learning_rate": 6.060578783923669e-05, + "loss": 0.2164, + "step": 35910 + }, + { + "epoch": 2.909186649384316, + "grad_norm": 0.08306428790092468, + "learning_rate": 6.0601287186642066e-05, + "loss": 0.235, + "step": 35911 + }, + { + "epoch": 2.909267660401815, + "grad_norm": 0.06049541011452675, + "learning_rate": 6.059678653404745e-05, + "loss": 0.233, + "step": 35912 + }, + { + 
"epoch": 2.909348671419313, + "grad_norm": 0.07041322439908981, + "learning_rate": 6.0592285881452814e-05, + "loss": 0.2491, + "step": 35913 + }, + { + "epoch": 2.9094296824368113, + "grad_norm": 0.053164467215538025, + "learning_rate": 6.058778522885819e-05, + "loss": 0.2484, + "step": 35914 + }, + { + "epoch": 2.90951069345431, + "grad_norm": 0.06176530569791794, + "learning_rate": 6.058328457626357e-05, + "loss": 0.2616, + "step": 35915 + }, + { + "epoch": 2.9095917044718083, + "grad_norm": 0.07826686650514603, + "learning_rate": 6.0578783923668934e-05, + "loss": 0.2445, + "step": 35916 + }, + { + "epoch": 2.9096727154893065, + "grad_norm": 0.05870719254016876, + "learning_rate": 6.057428327107431e-05, + "loss": 0.2202, + "step": 35917 + }, + { + "epoch": 2.909753726506805, + "grad_norm": 0.08019772171974182, + "learning_rate": 6.056978261847969e-05, + "loss": 0.2925, + "step": 35918 + }, + { + "epoch": 2.9098347375243034, + "grad_norm": 0.08110763132572174, + "learning_rate": 6.0565281965885055e-05, + "loss": 0.243, + "step": 35919 + }, + { + "epoch": 2.9099157485418017, + "grad_norm": 0.06867033243179321, + "learning_rate": 6.056078131329043e-05, + "loss": 0.2711, + "step": 35920 + }, + { + "epoch": 2.9099967595593, + "grad_norm": 0.06370605528354645, + "learning_rate": 6.055628066069581e-05, + "loss": 0.2008, + "step": 35921 + }, + { + "epoch": 2.910077770576798, + "grad_norm": 0.06781043112277985, + "learning_rate": 6.0551780008101176e-05, + "loss": 0.2233, + "step": 35922 + }, + { + "epoch": 2.910158781594297, + "grad_norm": 0.061090607196092606, + "learning_rate": 6.054727935550655e-05, + "loss": 0.2301, + "step": 35923 + }, + { + "epoch": 2.910239792611795, + "grad_norm": 0.06507328897714615, + "learning_rate": 6.054277870291193e-05, + "loss": 0.2153, + "step": 35924 + }, + { + "epoch": 2.9103208036292934, + "grad_norm": 0.06221972405910492, + "learning_rate": 6.05382780503173e-05, + "loss": 0.2341, + "step": 35925 + }, + { + "epoch": 2.910401814646792, + "grad_norm": 0.08535179495811462, + "learning_rate": 6.053377739772267e-05, + "loss": 0.2062, + "step": 35926 + }, + { + "epoch": 2.9104828256642903, + "grad_norm": 0.06976145505905151, + "learning_rate": 6.052927674512805e-05, + "loss": 0.2191, + "step": 35927 + }, + { + "epoch": 2.9105638366817885, + "grad_norm": 0.0640319213271141, + "learning_rate": 6.052477609253342e-05, + "loss": 0.2136, + "step": 35928 + }, + { + "epoch": 2.910644847699287, + "grad_norm": 0.0687737911939621, + "learning_rate": 6.052027543993879e-05, + "loss": 0.2224, + "step": 35929 + }, + { + "epoch": 2.9107258587167855, + "grad_norm": 0.06703128665685654, + "learning_rate": 6.051577478734417e-05, + "loss": 0.2207, + "step": 35930 + }, + { + "epoch": 2.9108068697342837, + "grad_norm": 0.08989090472459793, + "learning_rate": 6.051127413474954e-05, + "loss": 0.229, + "step": 35931 + }, + { + "epoch": 2.9108878807517824, + "grad_norm": 0.08146070688962936, + "learning_rate": 6.050677348215491e-05, + "loss": 0.2524, + "step": 35932 + }, + { + "epoch": 2.9109688917692806, + "grad_norm": 0.06896800547838211, + "learning_rate": 6.050227282956029e-05, + "loss": 0.2529, + "step": 35933 + }, + { + "epoch": 2.911049902786779, + "grad_norm": 0.06814651191234589, + "learning_rate": 6.049777217696566e-05, + "loss": 0.2504, + "step": 35934 + }, + { + "epoch": 2.9111309138042776, + "grad_norm": 0.06067023426294327, + "learning_rate": 6.049327152437103e-05, + "loss": 0.2049, + "step": 35935 + }, + { + "epoch": 2.911211924821776, + "grad_norm": 0.08196239918470383, + 
"learning_rate": 6.048877087177641e-05, + "loss": 0.2512, + "step": 35936 + }, + { + "epoch": 2.911292935839274, + "grad_norm": 0.059716738760471344, + "learning_rate": 6.048427021918178e-05, + "loss": 0.2547, + "step": 35937 + }, + { + "epoch": 2.9113739468567728, + "grad_norm": 0.12704335153102875, + "learning_rate": 6.047976956658716e-05, + "loss": 0.2401, + "step": 35938 + }, + { + "epoch": 2.911454957874271, + "grad_norm": 0.08267374336719513, + "learning_rate": 6.0475268913992534e-05, + "loss": 0.2669, + "step": 35939 + }, + { + "epoch": 2.9115359688917692, + "grad_norm": 0.05156685784459114, + "learning_rate": 6.04707682613979e-05, + "loss": 0.2432, + "step": 35940 + }, + { + "epoch": 2.911616979909268, + "grad_norm": 0.06564828753471375, + "learning_rate": 6.046626760880328e-05, + "loss": 0.2191, + "step": 35941 + }, + { + "epoch": 2.911697990926766, + "grad_norm": 0.0737994983792305, + "learning_rate": 6.0461766956208655e-05, + "loss": 0.254, + "step": 35942 + }, + { + "epoch": 2.9117790019442644, + "grad_norm": 0.06724006682634354, + "learning_rate": 6.045726630361402e-05, + "loss": 0.2057, + "step": 35943 + }, + { + "epoch": 2.9118600129617627, + "grad_norm": 0.07741007208824158, + "learning_rate": 6.04527656510194e-05, + "loss": 0.2592, + "step": 35944 + }, + { + "epoch": 2.911941023979261, + "grad_norm": 0.06976274400949478, + "learning_rate": 6.0448264998424776e-05, + "loss": 0.2221, + "step": 35945 + }, + { + "epoch": 2.9120220349967596, + "grad_norm": 0.06324218213558197, + "learning_rate": 6.044376434583014e-05, + "loss": 0.2021, + "step": 35946 + }, + { + "epoch": 2.912103046014258, + "grad_norm": 0.06732454150915146, + "learning_rate": 6.043926369323552e-05, + "loss": 0.2843, + "step": 35947 + }, + { + "epoch": 2.912184057031756, + "grad_norm": 0.057616446167230606, + "learning_rate": 6.0434763040640897e-05, + "loss": 0.232, + "step": 35948 + }, + { + "epoch": 2.912265068049255, + "grad_norm": 0.06809848546981812, + "learning_rate": 6.043026238804626e-05, + "loss": 0.2738, + "step": 35949 + }, + { + "epoch": 2.912346079066753, + "grad_norm": 0.08779431879520416, + "learning_rate": 6.0425761735451644e-05, + "loss": 0.2318, + "step": 35950 + }, + { + "epoch": 2.9124270900842513, + "grad_norm": 0.07004832476377487, + "learning_rate": 6.042126108285702e-05, + "loss": 0.2597, + "step": 35951 + }, + { + "epoch": 2.91250810110175, + "grad_norm": 0.07469019293785095, + "learning_rate": 6.0416760430262384e-05, + "loss": 0.2601, + "step": 35952 + }, + { + "epoch": 2.912589112119248, + "grad_norm": 0.08924978971481323, + "learning_rate": 6.0412259777667765e-05, + "loss": 0.2753, + "step": 35953 + }, + { + "epoch": 2.9126701231367464, + "grad_norm": 0.06867161393165588, + "learning_rate": 6.040775912507314e-05, + "loss": 0.2344, + "step": 35954 + }, + { + "epoch": 2.912751134154245, + "grad_norm": 0.053387414664030075, + "learning_rate": 6.0403258472478505e-05, + "loss": 0.2333, + "step": 35955 + }, + { + "epoch": 2.9128321451717434, + "grad_norm": 0.06053132563829422, + "learning_rate": 6.039875781988389e-05, + "loss": 0.2225, + "step": 35956 + }, + { + "epoch": 2.9129131561892416, + "grad_norm": 0.07425640523433685, + "learning_rate": 6.039425716728926e-05, + "loss": 0.2235, + "step": 35957 + }, + { + "epoch": 2.9129941672067403, + "grad_norm": 0.07319247722625732, + "learning_rate": 6.0389756514694626e-05, + "loss": 0.2746, + "step": 35958 + }, + { + "epoch": 2.9130751782242386, + "grad_norm": 0.07840321213006973, + "learning_rate": 6.038525586210001e-05, + "loss": 0.2477, + 
"step": 35959 + }, + { + "epoch": 2.913156189241737, + "grad_norm": 0.078122079372406, + "learning_rate": 6.038075520950538e-05, + "loss": 0.2466, + "step": 35960 + }, + { + "epoch": 2.9132372002592355, + "grad_norm": 0.07562543451786041, + "learning_rate": 6.0376254556910747e-05, + "loss": 0.2156, + "step": 35961 + }, + { + "epoch": 2.9133182112767337, + "grad_norm": 0.08191193640232086, + "learning_rate": 6.0371753904316134e-05, + "loss": 0.2012, + "step": 35962 + }, + { + "epoch": 2.913399222294232, + "grad_norm": 0.08304063230752945, + "learning_rate": 6.03672532517215e-05, + "loss": 0.2545, + "step": 35963 + }, + { + "epoch": 2.9134802333117307, + "grad_norm": 0.06543884426355362, + "learning_rate": 6.036275259912688e-05, + "loss": 0.2176, + "step": 35964 + }, + { + "epoch": 2.913561244329229, + "grad_norm": 0.06955443322658539, + "learning_rate": 6.0358251946532255e-05, + "loss": 0.3001, + "step": 35965 + }, + { + "epoch": 2.913642255346727, + "grad_norm": 0.06895852088928223, + "learning_rate": 6.035375129393762e-05, + "loss": 0.2049, + "step": 35966 + }, + { + "epoch": 2.9137232663642254, + "grad_norm": 0.07708650082349777, + "learning_rate": 6.0349250641343e-05, + "loss": 0.1875, + "step": 35967 + }, + { + "epoch": 2.9138042773817237, + "grad_norm": 0.06768258661031723, + "learning_rate": 6.0344749988748375e-05, + "loss": 0.2432, + "step": 35968 + }, + { + "epoch": 2.9138852883992223, + "grad_norm": 0.06098243594169617, + "learning_rate": 6.034024933615374e-05, + "loss": 0.2507, + "step": 35969 + }, + { + "epoch": 2.9139662994167206, + "grad_norm": 0.06618139147758484, + "learning_rate": 6.033574868355912e-05, + "loss": 0.243, + "step": 35970 + }, + { + "epoch": 2.914047310434219, + "grad_norm": 0.05809492990374565, + "learning_rate": 6.0331248030964496e-05, + "loss": 0.18, + "step": 35971 + }, + { + "epoch": 2.9141283214517175, + "grad_norm": 0.06135577708482742, + "learning_rate": 6.032674737836986e-05, + "loss": 0.2337, + "step": 35972 + }, + { + "epoch": 2.9142093324692158, + "grad_norm": 0.0696924701333046, + "learning_rate": 6.0322246725775243e-05, + "loss": 0.2357, + "step": 35973 + }, + { + "epoch": 2.914290343486714, + "grad_norm": 0.08586207032203674, + "learning_rate": 6.031774607318062e-05, + "loss": 0.2312, + "step": 35974 + }, + { + "epoch": 2.9143713545042127, + "grad_norm": 0.07971008121967316, + "learning_rate": 6.0313245420585984e-05, + "loss": 0.2802, + "step": 35975 + }, + { + "epoch": 2.914452365521711, + "grad_norm": 0.07447706162929535, + "learning_rate": 6.0308744767991364e-05, + "loss": 0.1998, + "step": 35976 + }, + { + "epoch": 2.914533376539209, + "grad_norm": 0.07790254801511765, + "learning_rate": 6.030424411539674e-05, + "loss": 0.2352, + "step": 35977 + }, + { + "epoch": 2.914614387556708, + "grad_norm": 0.08902592211961746, + "learning_rate": 6.0299743462802105e-05, + "loss": 0.2782, + "step": 35978 + }, + { + "epoch": 2.914695398574206, + "grad_norm": 0.0902736634016037, + "learning_rate": 6.0295242810207485e-05, + "loss": 0.2615, + "step": 35979 + }, + { + "epoch": 2.9147764095917044, + "grad_norm": 0.08209915459156036, + "learning_rate": 6.029074215761286e-05, + "loss": 0.2546, + "step": 35980 + }, + { + "epoch": 2.914857420609203, + "grad_norm": 0.06725655496120453, + "learning_rate": 6.0286241505018225e-05, + "loss": 0.2279, + "step": 35981 + }, + { + "epoch": 2.9149384316267013, + "grad_norm": 0.07377345114946365, + "learning_rate": 6.0281740852423606e-05, + "loss": 0.2458, + "step": 35982 + }, + { + "epoch": 2.9150194426441995, + 
"grad_norm": 0.06759821623563766, + "learning_rate": 6.027724019982898e-05, + "loss": 0.2527, + "step": 35983 + }, + { + "epoch": 2.9151004536616982, + "grad_norm": 0.0730883926153183, + "learning_rate": 6.0272739547234346e-05, + "loss": 0.2729, + "step": 35984 + }, + { + "epoch": 2.9151814646791965, + "grad_norm": 0.06299924850463867, + "learning_rate": 6.026823889463973e-05, + "loss": 0.2348, + "step": 35985 + }, + { + "epoch": 2.9152624756966947, + "grad_norm": 0.06301521509885788, + "learning_rate": 6.02637382420451e-05, + "loss": 0.2553, + "step": 35986 + }, + { + "epoch": 2.915343486714193, + "grad_norm": 0.07721760869026184, + "learning_rate": 6.025923758945047e-05, + "loss": 0.2446, + "step": 35987 + }, + { + "epoch": 2.9154244977316917, + "grad_norm": 0.07018601894378662, + "learning_rate": 6.025473693685585e-05, + "loss": 0.2279, + "step": 35988 + }, + { + "epoch": 2.91550550874919, + "grad_norm": 0.0949833020567894, + "learning_rate": 6.025023628426122e-05, + "loss": 0.2389, + "step": 35989 + }, + { + "epoch": 2.915586519766688, + "grad_norm": 0.06459692865610123, + "learning_rate": 6.02457356316666e-05, + "loss": 0.224, + "step": 35990 + }, + { + "epoch": 2.9156675307841864, + "grad_norm": 0.08588383346796036, + "learning_rate": 6.024123497907197e-05, + "loss": 0.2213, + "step": 35991 + }, + { + "epoch": 2.915748541801685, + "grad_norm": 0.056231267750263214, + "learning_rate": 6.023673432647734e-05, + "loss": 0.2273, + "step": 35992 + }, + { + "epoch": 2.9158295528191833, + "grad_norm": 0.06222568452358246, + "learning_rate": 6.023223367388272e-05, + "loss": 0.222, + "step": 35993 + }, + { + "epoch": 2.9159105638366816, + "grad_norm": 0.06202542781829834, + "learning_rate": 6.022773302128809e-05, + "loss": 0.245, + "step": 35994 + }, + { + "epoch": 2.9159915748541803, + "grad_norm": 0.06934040039777756, + "learning_rate": 6.022323236869346e-05, + "loss": 0.2778, + "step": 35995 + }, + { + "epoch": 2.9160725858716785, + "grad_norm": 0.05983179062604904, + "learning_rate": 6.021873171609884e-05, + "loss": 0.2096, + "step": 35996 + }, + { + "epoch": 2.9161535968891767, + "grad_norm": 0.0680842399597168, + "learning_rate": 6.021423106350421e-05, + "loss": 0.1782, + "step": 35997 + }, + { + "epoch": 2.9162346079066754, + "grad_norm": 0.07440262287855148, + "learning_rate": 6.0209730410909583e-05, + "loss": 0.2313, + "step": 35998 + }, + { + "epoch": 2.9163156189241737, + "grad_norm": 0.07230222970247269, + "learning_rate": 6.0205229758314964e-05, + "loss": 0.2244, + "step": 35999 + }, + { + "epoch": 2.916396629941672, + "grad_norm": 0.06707389652729034, + "learning_rate": 6.020072910572033e-05, + "loss": 0.2261, + "step": 36000 + }, + { + "epoch": 2.9164776409591706, + "grad_norm": 0.08801737427711487, + "learning_rate": 6.0196228453125704e-05, + "loss": 0.2211, + "step": 36001 + }, + { + "epoch": 2.916558651976669, + "grad_norm": 0.060239892452955246, + "learning_rate": 6.0191727800531085e-05, + "loss": 0.2263, + "step": 36002 + }, + { + "epoch": 2.916639662994167, + "grad_norm": 0.06880299001932144, + "learning_rate": 6.018722714793645e-05, + "loss": 0.2139, + "step": 36003 + }, + { + "epoch": 2.916720674011666, + "grad_norm": 0.0746193379163742, + "learning_rate": 6.0182726495341825e-05, + "loss": 0.2507, + "step": 36004 + }, + { + "epoch": 2.916801685029164, + "grad_norm": 0.0893036276102066, + "learning_rate": 6.0178225842747205e-05, + "loss": 0.2142, + "step": 36005 + }, + { + "epoch": 2.9168826960466623, + "grad_norm": 0.07693555951118469, + "learning_rate": 
6.017372519015257e-05, + "loss": 0.2635, + "step": 36006 + }, + { + "epoch": 2.916963707064161, + "grad_norm": 0.0776832103729248, + "learning_rate": 6.0169224537557946e-05, + "loss": 0.2693, + "step": 36007 + }, + { + "epoch": 2.917044718081659, + "grad_norm": 0.06802288442850113, + "learning_rate": 6.0164723884963326e-05, + "loss": 0.2196, + "step": 36008 + }, + { + "epoch": 2.9171257290991575, + "grad_norm": 0.06248356029391289, + "learning_rate": 6.016022323236869e-05, + "loss": 0.2019, + "step": 36009 + }, + { + "epoch": 2.9172067401166557, + "grad_norm": 0.08267667889595032, + "learning_rate": 6.015572257977407e-05, + "loss": 0.2381, + "step": 36010 + }, + { + "epoch": 2.9172877511341544, + "grad_norm": 0.09951439499855042, + "learning_rate": 6.015122192717945e-05, + "loss": 0.2291, + "step": 36011 + }, + { + "epoch": 2.9173687621516526, + "grad_norm": 0.0678766742348671, + "learning_rate": 6.0146721274584814e-05, + "loss": 0.2568, + "step": 36012 + }, + { + "epoch": 2.917449773169151, + "grad_norm": 0.0689510852098465, + "learning_rate": 6.014222062199019e-05, + "loss": 0.2227, + "step": 36013 + }, + { + "epoch": 2.917530784186649, + "grad_norm": 0.07802077382802963, + "learning_rate": 6.013771996939557e-05, + "loss": 0.2553, + "step": 36014 + }, + { + "epoch": 2.917611795204148, + "grad_norm": 0.08323326706886292, + "learning_rate": 6.0133219316800935e-05, + "loss": 0.2574, + "step": 36015 + }, + { + "epoch": 2.917692806221646, + "grad_norm": 0.0724555179476738, + "learning_rate": 6.0128718664206315e-05, + "loss": 0.2756, + "step": 36016 + }, + { + "epoch": 2.9177738172391443, + "grad_norm": 0.06053786724805832, + "learning_rate": 6.012421801161169e-05, + "loss": 0.2384, + "step": 36017 + }, + { + "epoch": 2.917854828256643, + "grad_norm": 0.06939715892076492, + "learning_rate": 6.0119717359017056e-05, + "loss": 0.2602, + "step": 36018 + }, + { + "epoch": 2.9179358392741412, + "grad_norm": 0.058913156390190125, + "learning_rate": 6.0115216706422436e-05, + "loss": 0.2173, + "step": 36019 + }, + { + "epoch": 2.9180168502916395, + "grad_norm": 0.0667242705821991, + "learning_rate": 6.011071605382781e-05, + "loss": 0.2754, + "step": 36020 + }, + { + "epoch": 2.918097861309138, + "grad_norm": 0.07719171792268753, + "learning_rate": 6.0106215401233176e-05, + "loss": 0.2325, + "step": 36021 + }, + { + "epoch": 2.9181788723266364, + "grad_norm": 0.06560521572828293, + "learning_rate": 6.010171474863856e-05, + "loss": 0.237, + "step": 36022 + }, + { + "epoch": 2.9182598833441347, + "grad_norm": 0.07015185058116913, + "learning_rate": 6.009721409604393e-05, + "loss": 0.2556, + "step": 36023 + }, + { + "epoch": 2.9183408943616334, + "grad_norm": 0.08773837238550186, + "learning_rate": 6.00927134434493e-05, + "loss": 0.2307, + "step": 36024 + }, + { + "epoch": 2.9184219053791316, + "grad_norm": 0.07875818014144897, + "learning_rate": 6.0088212790854684e-05, + "loss": 0.2694, + "step": 36025 + }, + { + "epoch": 2.91850291639663, + "grad_norm": 0.09875552356243134, + "learning_rate": 6.008371213826005e-05, + "loss": 0.228, + "step": 36026 + }, + { + "epoch": 2.9185839274141285, + "grad_norm": 0.06942545622587204, + "learning_rate": 6.007921148566542e-05, + "loss": 0.2305, + "step": 36027 + }, + { + "epoch": 2.9186649384316268, + "grad_norm": 0.06027941405773163, + "learning_rate": 6.0074710833070805e-05, + "loss": 0.2302, + "step": 36028 + }, + { + "epoch": 2.918745949449125, + "grad_norm": 0.06728939712047577, + "learning_rate": 6.007021018047617e-05, + "loss": 0.267, + "step": 36029 + }, + { + 
"epoch": 2.9188269604666237, + "grad_norm": 0.07952848076820374, + "learning_rate": 6.006570952788154e-05, + "loss": 0.2559, + "step": 36030 + }, + { + "epoch": 2.918907971484122, + "grad_norm": 0.08097676187753677, + "learning_rate": 6.0061208875286926e-05, + "loss": 0.2443, + "step": 36031 + }, + { + "epoch": 2.91898898250162, + "grad_norm": 0.08644181489944458, + "learning_rate": 6.005670822269229e-05, + "loss": 0.2557, + "step": 36032 + }, + { + "epoch": 2.9190699935191184, + "grad_norm": 0.08713238686323166, + "learning_rate": 6.005220757009766e-05, + "loss": 0.2512, + "step": 36033 + }, + { + "epoch": 2.919151004536617, + "grad_norm": 0.06697341799736023, + "learning_rate": 6.004770691750305e-05, + "loss": 0.2599, + "step": 36034 + }, + { + "epoch": 2.9192320155541154, + "grad_norm": 0.07292734831571579, + "learning_rate": 6.0043206264908414e-05, + "loss": 0.23, + "step": 36035 + }, + { + "epoch": 2.9193130265716136, + "grad_norm": 0.08001824468374252, + "learning_rate": 6.003870561231378e-05, + "loss": 0.2166, + "step": 36036 + }, + { + "epoch": 2.919394037589112, + "grad_norm": 0.0886823758482933, + "learning_rate": 6.003420495971917e-05, + "loss": 0.2745, + "step": 36037 + }, + { + "epoch": 2.9194750486066106, + "grad_norm": 0.06975951045751572, + "learning_rate": 6.0029704307124534e-05, + "loss": 0.235, + "step": 36038 + }, + { + "epoch": 2.919556059624109, + "grad_norm": 0.0714506283402443, + "learning_rate": 6.00252036545299e-05, + "loss": 0.238, + "step": 36039 + }, + { + "epoch": 2.919637070641607, + "grad_norm": 0.07188653200864792, + "learning_rate": 6.002070300193529e-05, + "loss": 0.2556, + "step": 36040 + }, + { + "epoch": 2.9197180816591057, + "grad_norm": 0.06636472791433334, + "learning_rate": 6.0016202349340655e-05, + "loss": 0.2175, + "step": 36041 + }, + { + "epoch": 2.919799092676604, + "grad_norm": 0.08170676976442337, + "learning_rate": 6.0011701696746036e-05, + "loss": 0.2331, + "step": 36042 + }, + { + "epoch": 2.9198801036941022, + "grad_norm": 0.07941601425409317, + "learning_rate": 6.000720104415141e-05, + "loss": 0.2594, + "step": 36043 + }, + { + "epoch": 2.919961114711601, + "grad_norm": 0.07099863886833191, + "learning_rate": 6.0002700391556776e-05, + "loss": 0.2031, + "step": 36044 + }, + { + "epoch": 2.920042125729099, + "grad_norm": 0.07502160221338272, + "learning_rate": 5.9998199738962156e-05, + "loss": 0.271, + "step": 36045 + }, + { + "epoch": 2.9201231367465974, + "grad_norm": 0.06421137601137161, + "learning_rate": 5.999369908636753e-05, + "loss": 0.2468, + "step": 36046 + }, + { + "epoch": 2.920204147764096, + "grad_norm": 0.07903636991977692, + "learning_rate": 5.99891984337729e-05, + "loss": 0.2231, + "step": 36047 + }, + { + "epoch": 2.9202851587815943, + "grad_norm": 0.06335616111755371, + "learning_rate": 5.998469778117828e-05, + "loss": 0.2637, + "step": 36048 + }, + { + "epoch": 2.9203661697990926, + "grad_norm": 0.0635843276977539, + "learning_rate": 5.998019712858365e-05, + "loss": 0.2393, + "step": 36049 + }, + { + "epoch": 2.9204471808165913, + "grad_norm": 0.07643640786409378, + "learning_rate": 5.997569647598902e-05, + "loss": 0.2274, + "step": 36050 + }, + { + "epoch": 2.9205281918340895, + "grad_norm": 0.06594489514827728, + "learning_rate": 5.99711958233944e-05, + "loss": 0.2577, + "step": 36051 + }, + { + "epoch": 2.9206092028515878, + "grad_norm": 0.06565924733877182, + "learning_rate": 5.996669517079977e-05, + "loss": 0.2338, + "step": 36052 + }, + { + "epoch": 2.9206902138690864, + "grad_norm": 0.062482208013534546, + 
"learning_rate": 5.996219451820514e-05, + "loss": 0.2011, + "step": 36053 + }, + { + "epoch": 2.9207712248865847, + "grad_norm": 0.08185567706823349, + "learning_rate": 5.995769386561052e-05, + "loss": 0.2261, + "step": 36054 + }, + { + "epoch": 2.920852235904083, + "grad_norm": 0.081700898706913, + "learning_rate": 5.995319321301589e-05, + "loss": 0.2386, + "step": 36055 + }, + { + "epoch": 2.920933246921581, + "grad_norm": 0.06645942479372025, + "learning_rate": 5.994869256042126e-05, + "loss": 0.2569, + "step": 36056 + }, + { + "epoch": 2.92101425793908, + "grad_norm": 0.06409208476543427, + "learning_rate": 5.994419190782664e-05, + "loss": 0.2149, + "step": 36057 + }, + { + "epoch": 2.921095268956578, + "grad_norm": 0.07192101329565048, + "learning_rate": 5.993969125523201e-05, + "loss": 0.2396, + "step": 36058 + }, + { + "epoch": 2.9211762799740764, + "grad_norm": 0.06122337281703949, + "learning_rate": 5.993519060263738e-05, + "loss": 0.2373, + "step": 36059 + }, + { + "epoch": 2.9212572909915746, + "grad_norm": 0.061685703694820404, + "learning_rate": 5.993068995004276e-05, + "loss": 0.2008, + "step": 36060 + }, + { + "epoch": 2.9213383020090733, + "grad_norm": 0.07286947965621948, + "learning_rate": 5.9926189297448134e-05, + "loss": 0.236, + "step": 36061 + }, + { + "epoch": 2.9214193130265715, + "grad_norm": 0.06022263318300247, + "learning_rate": 5.99216886448535e-05, + "loss": 0.2183, + "step": 36062 + }, + { + "epoch": 2.92150032404407, + "grad_norm": 0.06972159445285797, + "learning_rate": 5.991718799225888e-05, + "loss": 0.2201, + "step": 36063 + }, + { + "epoch": 2.9215813350615685, + "grad_norm": 0.06769529730081558, + "learning_rate": 5.9912687339664255e-05, + "loss": 0.2061, + "step": 36064 + }, + { + "epoch": 2.9216623460790667, + "grad_norm": 0.08727024495601654, + "learning_rate": 5.990818668706962e-05, + "loss": 0.2219, + "step": 36065 + }, + { + "epoch": 2.921743357096565, + "grad_norm": 0.07862614095211029, + "learning_rate": 5.9903686034475e-05, + "loss": 0.2637, + "step": 36066 + }, + { + "epoch": 2.9218243681140637, + "grad_norm": 0.07064937800168991, + "learning_rate": 5.9899185381880376e-05, + "loss": 0.2427, + "step": 36067 + }, + { + "epoch": 2.921905379131562, + "grad_norm": 0.06485024839639664, + "learning_rate": 5.989468472928574e-05, + "loss": 0.2264, + "step": 36068 + }, + { + "epoch": 2.92198639014906, + "grad_norm": 0.0720147117972374, + "learning_rate": 5.989018407669112e-05, + "loss": 0.2398, + "step": 36069 + }, + { + "epoch": 2.922067401166559, + "grad_norm": 0.05576111748814583, + "learning_rate": 5.9885683424096496e-05, + "loss": 0.2087, + "step": 36070 + }, + { + "epoch": 2.922148412184057, + "grad_norm": 0.07834657281637192, + "learning_rate": 5.988118277150188e-05, + "loss": 0.2386, + "step": 36071 + }, + { + "epoch": 2.9222294232015553, + "grad_norm": 0.07598018646240234, + "learning_rate": 5.9876682118907244e-05, + "loss": 0.2439, + "step": 36072 + }, + { + "epoch": 2.922310434219054, + "grad_norm": 0.06340734660625458, + "learning_rate": 5.987218146631262e-05, + "loss": 0.2215, + "step": 36073 + }, + { + "epoch": 2.9223914452365523, + "grad_norm": 0.0714435800909996, + "learning_rate": 5.9867680813718e-05, + "loss": 0.3095, + "step": 36074 + }, + { + "epoch": 2.9224724562540505, + "grad_norm": 0.06071990728378296, + "learning_rate": 5.9863180161123364e-05, + "loss": 0.2146, + "step": 36075 + }, + { + "epoch": 2.922553467271549, + "grad_norm": 0.07448688894510269, + "learning_rate": 5.985867950852874e-05, + "loss": 0.2154, + "step": 36076 + 
}, + { + "epoch": 2.9226344782890474, + "grad_norm": 0.06451181322336197, + "learning_rate": 5.985417885593412e-05, + "loss": 0.2266, + "step": 36077 + }, + { + "epoch": 2.9227154893065457, + "grad_norm": 0.15605607628822327, + "learning_rate": 5.9849678203339485e-05, + "loss": 0.2543, + "step": 36078 + }, + { + "epoch": 2.922796500324044, + "grad_norm": 0.06267999857664108, + "learning_rate": 5.984517755074486e-05, + "loss": 0.2258, + "step": 36079 + }, + { + "epoch": 2.9228775113415426, + "grad_norm": 0.07223370671272278, + "learning_rate": 5.984067689815024e-05, + "loss": 0.2142, + "step": 36080 + }, + { + "epoch": 2.922958522359041, + "grad_norm": 0.06177590414881706, + "learning_rate": 5.9836176245555606e-05, + "loss": 0.2014, + "step": 36081 + }, + { + "epoch": 2.923039533376539, + "grad_norm": 0.07765360176563263, + "learning_rate": 5.983167559296098e-05, + "loss": 0.2466, + "step": 36082 + }, + { + "epoch": 2.9231205443940373, + "grad_norm": 0.07498011738061905, + "learning_rate": 5.982717494036636e-05, + "loss": 0.2452, + "step": 36083 + }, + { + "epoch": 2.923201555411536, + "grad_norm": 0.06292008608579636, + "learning_rate": 5.982267428777173e-05, + "loss": 0.2839, + "step": 36084 + }, + { + "epoch": 2.9232825664290343, + "grad_norm": 0.0622960664331913, + "learning_rate": 5.98181736351771e-05, + "loss": 0.2315, + "step": 36085 + }, + { + "epoch": 2.9233635774465325, + "grad_norm": 0.07538387179374695, + "learning_rate": 5.981367298258248e-05, + "loss": 0.2152, + "step": 36086 + }, + { + "epoch": 2.923444588464031, + "grad_norm": 0.060837045311927795, + "learning_rate": 5.980917232998785e-05, + "loss": 0.2283, + "step": 36087 + }, + { + "epoch": 2.9235255994815295, + "grad_norm": 0.059952542185783386, + "learning_rate": 5.980467167739322e-05, + "loss": 0.2267, + "step": 36088 + }, + { + "epoch": 2.9236066104990277, + "grad_norm": 0.07587370276451111, + "learning_rate": 5.98001710247986e-05, + "loss": 0.261, + "step": 36089 + }, + { + "epoch": 2.9236876215165264, + "grad_norm": 0.08088859170675278, + "learning_rate": 5.979567037220397e-05, + "loss": 0.2302, + "step": 36090 + }, + { + "epoch": 2.9237686325340246, + "grad_norm": 0.07576531916856766, + "learning_rate": 5.979116971960934e-05, + "loss": 0.2257, + "step": 36091 + }, + { + "epoch": 2.923849643551523, + "grad_norm": 0.08089031279087067, + "learning_rate": 5.978666906701472e-05, + "loss": 0.2481, + "step": 36092 + }, + { + "epoch": 2.9239306545690216, + "grad_norm": 0.07415632158517838, + "learning_rate": 5.978216841442009e-05, + "loss": 0.2173, + "step": 36093 + }, + { + "epoch": 2.92401166558652, + "grad_norm": 0.06114750728011131, + "learning_rate": 5.977766776182546e-05, + "loss": 0.2361, + "step": 36094 + }, + { + "epoch": 2.924092676604018, + "grad_norm": 0.066036157310009, + "learning_rate": 5.977316710923084e-05, + "loss": 0.2411, + "step": 36095 + }, + { + "epoch": 2.9241736876215167, + "grad_norm": 0.07516303658485413, + "learning_rate": 5.976866645663621e-05, + "loss": 0.2481, + "step": 36096 + }, + { + "epoch": 2.924254698639015, + "grad_norm": 0.08236120641231537, + "learning_rate": 5.97641658040416e-05, + "loss": 0.2399, + "step": 36097 + }, + { + "epoch": 2.9243357096565132, + "grad_norm": 0.08051339536905289, + "learning_rate": 5.9759665151446964e-05, + "loss": 0.2312, + "step": 36098 + }, + { + "epoch": 2.924416720674012, + "grad_norm": 0.06887134909629822, + "learning_rate": 5.975516449885233e-05, + "loss": 0.2171, + "step": 36099 + }, + { + "epoch": 2.92449773169151, + "grad_norm": 0.06876733899116516, 
+ "learning_rate": 5.975066384625772e-05, + "loss": 0.2453, + "step": 36100 + }, + { + "epoch": 2.9245787427090084, + "grad_norm": 0.08076386898756027, + "learning_rate": 5.9746163193663085e-05, + "loss": 0.2468, + "step": 36101 + }, + { + "epoch": 2.9246597537265067, + "grad_norm": 0.057946834713220596, + "learning_rate": 5.974166254106845e-05, + "loss": 0.2112, + "step": 36102 + }, + { + "epoch": 2.9247407647440054, + "grad_norm": 0.05689328908920288, + "learning_rate": 5.973716188847384e-05, + "loss": 0.189, + "step": 36103 + }, + { + "epoch": 2.9248217757615036, + "grad_norm": 0.07286691665649414, + "learning_rate": 5.9732661235879206e-05, + "loss": 0.2746, + "step": 36104 + }, + { + "epoch": 2.924902786779002, + "grad_norm": 0.07182064652442932, + "learning_rate": 5.972816058328457e-05, + "loss": 0.2343, + "step": 36105 + }, + { + "epoch": 2.9249837977965, + "grad_norm": 0.0803033784031868, + "learning_rate": 5.972365993068996e-05, + "loss": 0.2761, + "step": 36106 + }, + { + "epoch": 2.9250648088139988, + "grad_norm": 0.059216950088739395, + "learning_rate": 5.9719159278095327e-05, + "loss": 0.2362, + "step": 36107 + }, + { + "epoch": 2.925145819831497, + "grad_norm": 0.07625512778759003, + "learning_rate": 5.97146586255007e-05, + "loss": 0.2577, + "step": 36108 + }, + { + "epoch": 2.9252268308489953, + "grad_norm": 0.0680420845746994, + "learning_rate": 5.971015797290608e-05, + "loss": 0.2478, + "step": 36109 + }, + { + "epoch": 2.925307841866494, + "grad_norm": 0.07407931983470917, + "learning_rate": 5.970565732031145e-05, + "loss": 0.2175, + "step": 36110 + }, + { + "epoch": 2.925388852883992, + "grad_norm": 0.07050876319408417, + "learning_rate": 5.970115666771682e-05, + "loss": 0.2551, + "step": 36111 + }, + { + "epoch": 2.9254698639014904, + "grad_norm": 0.057711564004421234, + "learning_rate": 5.96966560151222e-05, + "loss": 0.2034, + "step": 36112 + }, + { + "epoch": 2.925550874918989, + "grad_norm": 0.06754898279905319, + "learning_rate": 5.969215536252757e-05, + "loss": 0.2493, + "step": 36113 + }, + { + "epoch": 2.9256318859364874, + "grad_norm": 0.07142467051744461, + "learning_rate": 5.968765470993294e-05, + "loss": 0.2153, + "step": 36114 + }, + { + "epoch": 2.9257128969539856, + "grad_norm": 0.08044569939374924, + "learning_rate": 5.968315405733832e-05, + "loss": 0.2436, + "step": 36115 + }, + { + "epoch": 2.9257939079714843, + "grad_norm": 0.06678476184606552, + "learning_rate": 5.967865340474369e-05, + "loss": 0.2147, + "step": 36116 + }, + { + "epoch": 2.9258749189889826, + "grad_norm": 0.06317151337862015, + "learning_rate": 5.967415275214906e-05, + "loss": 0.2122, + "step": 36117 + }, + { + "epoch": 2.925955930006481, + "grad_norm": 0.06971244513988495, + "learning_rate": 5.966965209955444e-05, + "loss": 0.2348, + "step": 36118 + }, + { + "epoch": 2.9260369410239795, + "grad_norm": 0.06993476301431656, + "learning_rate": 5.966515144695981e-05, + "loss": 0.2464, + "step": 36119 + }, + { + "epoch": 2.9261179520414777, + "grad_norm": 0.06371525675058365, + "learning_rate": 5.9660650794365183e-05, + "loss": 0.25, + "step": 36120 + }, + { + "epoch": 2.926198963058976, + "grad_norm": 0.07864125818014145, + "learning_rate": 5.9656150141770564e-05, + "loss": 0.2573, + "step": 36121 + }, + { + "epoch": 2.9262799740764747, + "grad_norm": 0.07708080857992172, + "learning_rate": 5.965164948917593e-05, + "loss": 0.2499, + "step": 36122 + }, + { + "epoch": 2.926360985093973, + "grad_norm": 0.0688590481877327, + "learning_rate": 5.964714883658131e-05, + "loss": 0.2152, + "step": 
36123 + }, + { + "epoch": 2.926441996111471, + "grad_norm": 0.10641010850667953, + "learning_rate": 5.9642648183986685e-05, + "loss": 0.2338, + "step": 36124 + }, + { + "epoch": 2.9265230071289694, + "grad_norm": 0.06765805929899216, + "learning_rate": 5.963814753139205e-05, + "loss": 0.2249, + "step": 36125 + }, + { + "epoch": 2.9266040181464676, + "grad_norm": 0.0745808556675911, + "learning_rate": 5.963364687879743e-05, + "loss": 0.243, + "step": 36126 + }, + { + "epoch": 2.9266850291639663, + "grad_norm": 0.08130037784576416, + "learning_rate": 5.9629146226202805e-05, + "loss": 0.2407, + "step": 36127 + }, + { + "epoch": 2.9267660401814646, + "grad_norm": 0.07136457413434982, + "learning_rate": 5.962464557360817e-05, + "loss": 0.2321, + "step": 36128 + }, + { + "epoch": 2.926847051198963, + "grad_norm": 0.06018296629190445, + "learning_rate": 5.962014492101355e-05, + "loss": 0.2418, + "step": 36129 + }, + { + "epoch": 2.9269280622164615, + "grad_norm": 0.0679769515991211, + "learning_rate": 5.9615644268418926e-05, + "loss": 0.2187, + "step": 36130 + }, + { + "epoch": 2.9270090732339598, + "grad_norm": 0.07336881011724472, + "learning_rate": 5.961114361582429e-05, + "loss": 0.2377, + "step": 36131 + }, + { + "epoch": 2.927090084251458, + "grad_norm": 0.06220309063792229, + "learning_rate": 5.9606642963229673e-05, + "loss": 0.2127, + "step": 36132 + }, + { + "epoch": 2.9271710952689567, + "grad_norm": 0.06562183052301407, + "learning_rate": 5.960214231063505e-05, + "loss": 0.2139, + "step": 36133 + }, + { + "epoch": 2.927252106286455, + "grad_norm": 0.07450972497463226, + "learning_rate": 5.9597641658040414e-05, + "loss": 0.2087, + "step": 36134 + }, + { + "epoch": 2.927333117303953, + "grad_norm": 0.08635947853326797, + "learning_rate": 5.9593141005445794e-05, + "loss": 0.2949, + "step": 36135 + }, + { + "epoch": 2.927414128321452, + "grad_norm": 0.07206778973340988, + "learning_rate": 5.958864035285117e-05, + "loss": 0.2091, + "step": 36136 + }, + { + "epoch": 2.92749513933895, + "grad_norm": 0.08502889424562454, + "learning_rate": 5.9584139700256535e-05, + "loss": 0.2183, + "step": 36137 + }, + { + "epoch": 2.9275761503564484, + "grad_norm": 0.08003343641757965, + "learning_rate": 5.9579639047661915e-05, + "loss": 0.2619, + "step": 36138 + }, + { + "epoch": 2.927657161373947, + "grad_norm": 0.06411267817020416, + "learning_rate": 5.957513839506729e-05, + "loss": 0.1962, + "step": 36139 + }, + { + "epoch": 2.9277381723914453, + "grad_norm": 0.07732285559177399, + "learning_rate": 5.9570637742472655e-05, + "loss": 0.2055, + "step": 36140 + }, + { + "epoch": 2.9278191834089435, + "grad_norm": 0.07913392037153244, + "learning_rate": 5.9566137089878036e-05, + "loss": 0.2521, + "step": 36141 + }, + { + "epoch": 2.9279001944264422, + "grad_norm": 0.06923690438270569, + "learning_rate": 5.956163643728341e-05, + "loss": 0.235, + "step": 36142 + }, + { + "epoch": 2.9279812054439405, + "grad_norm": 0.06813286244869232, + "learning_rate": 5.9557135784688776e-05, + "loss": 0.2352, + "step": 36143 + }, + { + "epoch": 2.9280622164614387, + "grad_norm": 0.06538520008325577, + "learning_rate": 5.955263513209416e-05, + "loss": 0.2298, + "step": 36144 + }, + { + "epoch": 2.9281432274789374, + "grad_norm": 0.08470512181520462, + "learning_rate": 5.954813447949953e-05, + "loss": 0.2459, + "step": 36145 + }, + { + "epoch": 2.9282242384964356, + "grad_norm": 0.06865940988063812, + "learning_rate": 5.95436338269049e-05, + "loss": 0.2014, + "step": 36146 + }, + { + "epoch": 2.928305249513934, + "grad_norm": 
0.07658974826335907, + "learning_rate": 5.953913317431028e-05, + "loss": 0.2166, + "step": 36147 + }, + { + "epoch": 2.928386260531432, + "grad_norm": 0.08708245307207108, + "learning_rate": 5.953463252171565e-05, + "loss": 0.2725, + "step": 36148 + }, + { + "epoch": 2.9284672715489304, + "grad_norm": 0.08692276477813721, + "learning_rate": 5.953013186912103e-05, + "loss": 0.2537, + "step": 36149 + }, + { + "epoch": 2.928548282566429, + "grad_norm": 0.06990312784910202, + "learning_rate": 5.95256312165264e-05, + "loss": 0.2333, + "step": 36150 + }, + { + "epoch": 2.9286292935839273, + "grad_norm": 0.06813271343708038, + "learning_rate": 5.952113056393177e-05, + "loss": 0.2336, + "step": 36151 + }, + { + "epoch": 2.9287103046014256, + "grad_norm": 0.07490336149930954, + "learning_rate": 5.951662991133715e-05, + "loss": 0.2331, + "step": 36152 + }, + { + "epoch": 2.9287913156189243, + "grad_norm": 0.07326825708150864, + "learning_rate": 5.951212925874252e-05, + "loss": 0.243, + "step": 36153 + }, + { + "epoch": 2.9288723266364225, + "grad_norm": 0.07395300269126892, + "learning_rate": 5.950762860614789e-05, + "loss": 0.251, + "step": 36154 + }, + { + "epoch": 2.9289533376539207, + "grad_norm": 0.06281843781471252, + "learning_rate": 5.950312795355327e-05, + "loss": 0.2416, + "step": 36155 + }, + { + "epoch": 2.9290343486714194, + "grad_norm": 0.06327148526906967, + "learning_rate": 5.949862730095864e-05, + "loss": 0.2023, + "step": 36156 + }, + { + "epoch": 2.9291153596889177, + "grad_norm": 0.0622863844037056, + "learning_rate": 5.9494126648364014e-05, + "loss": 0.2162, + "step": 36157 + }, + { + "epoch": 2.929196370706416, + "grad_norm": 0.08750246465206146, + "learning_rate": 5.9489625995769394e-05, + "loss": 0.2569, + "step": 36158 + }, + { + "epoch": 2.9292773817239146, + "grad_norm": 0.08218368887901306, + "learning_rate": 5.948512534317476e-05, + "loss": 0.238, + "step": 36159 + }, + { + "epoch": 2.929358392741413, + "grad_norm": 0.06105087697505951, + "learning_rate": 5.9480624690580134e-05, + "loss": 0.24, + "step": 36160 + }, + { + "epoch": 2.929439403758911, + "grad_norm": 0.05267670005559921, + "learning_rate": 5.9476124037985515e-05, + "loss": 0.2191, + "step": 36161 + }, + { + "epoch": 2.92952041477641, + "grad_norm": 0.06280451267957687, + "learning_rate": 5.947162338539088e-05, + "loss": 0.2278, + "step": 36162 + }, + { + "epoch": 2.929601425793908, + "grad_norm": 0.09271591156721115, + "learning_rate": 5.9467122732796255e-05, + "loss": 0.2223, + "step": 36163 + }, + { + "epoch": 2.9296824368114063, + "grad_norm": 0.06803149729967117, + "learning_rate": 5.9462622080201636e-05, + "loss": 0.2334, + "step": 36164 + }, + { + "epoch": 2.929763447828905, + "grad_norm": 0.08212228119373322, + "learning_rate": 5.9458121427607e-05, + "loss": 0.2246, + "step": 36165 + }, + { + "epoch": 2.929844458846403, + "grad_norm": 0.07780248671770096, + "learning_rate": 5.9453620775012376e-05, + "loss": 0.2379, + "step": 36166 + }, + { + "epoch": 2.9299254698639015, + "grad_norm": 0.06778130680322647, + "learning_rate": 5.9449120122417756e-05, + "loss": 0.2397, + "step": 36167 + }, + { + "epoch": 2.9300064808814, + "grad_norm": 0.07062183320522308, + "learning_rate": 5.944461946982312e-05, + "loss": 0.2277, + "step": 36168 + }, + { + "epoch": 2.9300874918988984, + "grad_norm": 0.0797196626663208, + "learning_rate": 5.94401188172285e-05, + "loss": 0.253, + "step": 36169 + }, + { + "epoch": 2.9301685029163966, + "grad_norm": 0.06827553361654282, + "learning_rate": 5.943561816463388e-05, + "loss": 
0.2286, + "step": 36170 + }, + { + "epoch": 2.930249513933895, + "grad_norm": 0.09623746573925018, + "learning_rate": 5.9431117512039244e-05, + "loss": 0.2639, + "step": 36171 + }, + { + "epoch": 2.930330524951393, + "grad_norm": 0.08124282211065292, + "learning_rate": 5.942661685944462e-05, + "loss": 0.2256, + "step": 36172 + }, + { + "epoch": 2.930411535968892, + "grad_norm": 0.05782557278871536, + "learning_rate": 5.942211620685e-05, + "loss": 0.2109, + "step": 36173 + }, + { + "epoch": 2.93049254698639, + "grad_norm": 0.08583999425172806, + "learning_rate": 5.9417615554255365e-05, + "loss": 0.3011, + "step": 36174 + }, + { + "epoch": 2.9305735580038883, + "grad_norm": 0.0772082656621933, + "learning_rate": 5.941311490166075e-05, + "loss": 0.2378, + "step": 36175 + }, + { + "epoch": 2.930654569021387, + "grad_norm": 0.06742317229509354, + "learning_rate": 5.940861424906612e-05, + "loss": 0.24, + "step": 36176 + }, + { + "epoch": 2.9307355800388852, + "grad_norm": 0.06944488734006882, + "learning_rate": 5.940411359647149e-05, + "loss": 0.2228, + "step": 36177 + }, + { + "epoch": 2.9308165910563835, + "grad_norm": 0.07197489589452744, + "learning_rate": 5.939961294387687e-05, + "loss": 0.27, + "step": 36178 + }, + { + "epoch": 2.930897602073882, + "grad_norm": 0.08774056285619736, + "learning_rate": 5.939511229128224e-05, + "loss": 0.2765, + "step": 36179 + }, + { + "epoch": 2.9309786130913804, + "grad_norm": 0.06142619997262955, + "learning_rate": 5.939061163868761e-05, + "loss": 0.2269, + "step": 36180 + }, + { + "epoch": 2.9310596241088787, + "grad_norm": 0.07134956121444702, + "learning_rate": 5.9386110986092994e-05, + "loss": 0.2299, + "step": 36181 + }, + { + "epoch": 2.9311406351263773, + "grad_norm": 0.06611652672290802, + "learning_rate": 5.938161033349836e-05, + "loss": 0.2032, + "step": 36182 + }, + { + "epoch": 2.9312216461438756, + "grad_norm": 0.07123982906341553, + "learning_rate": 5.9377109680903734e-05, + "loss": 0.2169, + "step": 36183 + }, + { + "epoch": 2.931302657161374, + "grad_norm": 0.07195700705051422, + "learning_rate": 5.9372609028309114e-05, + "loss": 0.2308, + "step": 36184 + }, + { + "epoch": 2.9313836681788725, + "grad_norm": 0.06632047146558762, + "learning_rate": 5.936810837571448e-05, + "loss": 0.2245, + "step": 36185 + }, + { + "epoch": 2.9314646791963708, + "grad_norm": 0.08340226113796234, + "learning_rate": 5.9363607723119855e-05, + "loss": 0.2389, + "step": 36186 + }, + { + "epoch": 2.931545690213869, + "grad_norm": 0.08525307476520538, + "learning_rate": 5.9359107070525235e-05, + "loss": 0.2569, + "step": 36187 + }, + { + "epoch": 2.9316267012313677, + "grad_norm": 0.06627818942070007, + "learning_rate": 5.93546064179306e-05, + "loss": 0.2362, + "step": 36188 + }, + { + "epoch": 2.931707712248866, + "grad_norm": 0.07321153581142426, + "learning_rate": 5.9350105765335976e-05, + "loss": 0.2238, + "step": 36189 + }, + { + "epoch": 2.931788723266364, + "grad_norm": 0.06970200687646866, + "learning_rate": 5.9345605112741356e-05, + "loss": 0.247, + "step": 36190 + }, + { + "epoch": 2.931869734283863, + "grad_norm": 0.06322509050369263, + "learning_rate": 5.934110446014672e-05, + "loss": 0.2081, + "step": 36191 + }, + { + "epoch": 2.931950745301361, + "grad_norm": 0.07383931428194046, + "learning_rate": 5.9336603807552096e-05, + "loss": 0.2285, + "step": 36192 + }, + { + "epoch": 2.9320317563188594, + "grad_norm": 0.07150974124670029, + "learning_rate": 5.933210315495748e-05, + "loss": 0.2052, + "step": 36193 + }, + { + "epoch": 2.9321127673363576, + 
"grad_norm": 0.08250176161527634, + "learning_rate": 5.9327602502362844e-05, + "loss": 0.2557, + "step": 36194 + }, + { + "epoch": 2.932193778353856, + "grad_norm": 0.08581391721963882, + "learning_rate": 5.932310184976822e-05, + "loss": 0.2755, + "step": 36195 + }, + { + "epoch": 2.9322747893713546, + "grad_norm": 0.07952970266342163, + "learning_rate": 5.93186011971736e-05, + "loss": 0.2323, + "step": 36196 + }, + { + "epoch": 2.932355800388853, + "grad_norm": 0.08379239588975906, + "learning_rate": 5.9314100544578964e-05, + "loss": 0.2358, + "step": 36197 + }, + { + "epoch": 2.932436811406351, + "grad_norm": 0.07248571515083313, + "learning_rate": 5.930959989198434e-05, + "loss": 0.2258, + "step": 36198 + }, + { + "epoch": 2.9325178224238497, + "grad_norm": 0.07288960367441177, + "learning_rate": 5.930509923938972e-05, + "loss": 0.2334, + "step": 36199 + }, + { + "epoch": 2.932598833441348, + "grad_norm": 0.0691913366317749, + "learning_rate": 5.9300598586795085e-05, + "loss": 0.209, + "step": 36200 + }, + { + "epoch": 2.932679844458846, + "grad_norm": 0.06181837618350983, + "learning_rate": 5.9296097934200466e-05, + "loss": 0.2213, + "step": 36201 + }, + { + "epoch": 2.932760855476345, + "grad_norm": 0.06725524365901947, + "learning_rate": 5.929159728160584e-05, + "loss": 0.2012, + "step": 36202 + }, + { + "epoch": 2.932841866493843, + "grad_norm": 0.06281258165836334, + "learning_rate": 5.9287096629011206e-05, + "loss": 0.2418, + "step": 36203 + }, + { + "epoch": 2.9329228775113414, + "grad_norm": 0.06464042514562607, + "learning_rate": 5.9282595976416586e-05, + "loss": 0.235, + "step": 36204 + }, + { + "epoch": 2.93300388852884, + "grad_norm": 0.06401500850915909, + "learning_rate": 5.927809532382196e-05, + "loss": 0.2067, + "step": 36205 + }, + { + "epoch": 2.9330848995463383, + "grad_norm": 0.0744711384177208, + "learning_rate": 5.927359467122733e-05, + "loss": 0.2644, + "step": 36206 + }, + { + "epoch": 2.9331659105638366, + "grad_norm": 0.06625427305698395, + "learning_rate": 5.926909401863271e-05, + "loss": 0.2379, + "step": 36207 + }, + { + "epoch": 2.9332469215813353, + "grad_norm": 0.05669691041111946, + "learning_rate": 5.926459336603808e-05, + "loss": 0.2245, + "step": 36208 + }, + { + "epoch": 2.9333279325988335, + "grad_norm": 0.07033190876245499, + "learning_rate": 5.926009271344345e-05, + "loss": 0.2269, + "step": 36209 + }, + { + "epoch": 2.9334089436163318, + "grad_norm": 0.06328772753477097, + "learning_rate": 5.925559206084883e-05, + "loss": 0.1984, + "step": 36210 + }, + { + "epoch": 2.9334899546338304, + "grad_norm": 0.10779903829097748, + "learning_rate": 5.92510914082542e-05, + "loss": 0.2562, + "step": 36211 + }, + { + "epoch": 2.9335709656513287, + "grad_norm": 0.0638335794210434, + "learning_rate": 5.924659075565957e-05, + "loss": 0.2151, + "step": 36212 + }, + { + "epoch": 2.933651976668827, + "grad_norm": 0.07483941316604614, + "learning_rate": 5.924209010306495e-05, + "loss": 0.2682, + "step": 36213 + }, + { + "epoch": 2.933732987686325, + "grad_norm": 0.04787442088127136, + "learning_rate": 5.923758945047032e-05, + "loss": 0.1916, + "step": 36214 + }, + { + "epoch": 2.933813998703824, + "grad_norm": 0.0677068680524826, + "learning_rate": 5.923308879787569e-05, + "loss": 0.2389, + "step": 36215 + }, + { + "epoch": 2.933895009721322, + "grad_norm": 0.05607495829463005, + "learning_rate": 5.922858814528107e-05, + "loss": 0.205, + "step": 36216 + }, + { + "epoch": 2.9339760207388204, + "grad_norm": 0.0977732464671135, + "learning_rate": 5.922408749268644e-05, 
+ "loss": 0.2886, + "step": 36217 + }, + { + "epoch": 2.9340570317563186, + "grad_norm": 0.07866278290748596, + "learning_rate": 5.921958684009181e-05, + "loss": 0.2498, + "step": 36218 + }, + { + "epoch": 2.9341380427738173, + "grad_norm": 0.06456935405731201, + "learning_rate": 5.921508618749719e-05, + "loss": 0.2421, + "step": 36219 + }, + { + "epoch": 2.9342190537913155, + "grad_norm": 0.06755385547876358, + "learning_rate": 5.9210585534902564e-05, + "loss": 0.2349, + "step": 36220 + }, + { + "epoch": 2.934300064808814, + "grad_norm": 0.06859587877988815, + "learning_rate": 5.920608488230793e-05, + "loss": 0.2239, + "step": 36221 + }, + { + "epoch": 2.9343810758263125, + "grad_norm": 0.06106014922261238, + "learning_rate": 5.920158422971331e-05, + "loss": 0.2118, + "step": 36222 + }, + { + "epoch": 2.9344620868438107, + "grad_norm": 0.0807553380727768, + "learning_rate": 5.9197083577118685e-05, + "loss": 0.2256, + "step": 36223 + }, + { + "epoch": 2.934543097861309, + "grad_norm": 0.06360894441604614, + "learning_rate": 5.919258292452405e-05, + "loss": 0.2257, + "step": 36224 + }, + { + "epoch": 2.9346241088788076, + "grad_norm": 0.07121789455413818, + "learning_rate": 5.918808227192943e-05, + "loss": 0.2161, + "step": 36225 + }, + { + "epoch": 2.934705119896306, + "grad_norm": 0.06633513420820236, + "learning_rate": 5.9183581619334806e-05, + "loss": 0.2218, + "step": 36226 + }, + { + "epoch": 2.934786130913804, + "grad_norm": 0.08529671281576157, + "learning_rate": 5.917908096674017e-05, + "loss": 0.2529, + "step": 36227 + }, + { + "epoch": 2.934867141931303, + "grad_norm": 0.06434918195009232, + "learning_rate": 5.917458031414555e-05, + "loss": 0.2959, + "step": 36228 + }, + { + "epoch": 2.934948152948801, + "grad_norm": 0.0785248875617981, + "learning_rate": 5.9170079661550926e-05, + "loss": 0.2368, + "step": 36229 + }, + { + "epoch": 2.9350291639662993, + "grad_norm": 0.07209806889295578, + "learning_rate": 5.916557900895631e-05, + "loss": 0.2218, + "step": 36230 + }, + { + "epoch": 2.935110174983798, + "grad_norm": 0.06406816840171814, + "learning_rate": 5.9161078356361674e-05, + "loss": 0.2592, + "step": 36231 + }, + { + "epoch": 2.9351911860012962, + "grad_norm": 0.06611085683107376, + "learning_rate": 5.915657770376705e-05, + "loss": 0.2361, + "step": 36232 + }, + { + "epoch": 2.9352721970187945, + "grad_norm": 0.08267108350992203, + "learning_rate": 5.915207705117243e-05, + "loss": 0.2227, + "step": 36233 + }, + { + "epoch": 2.935353208036293, + "grad_norm": 0.04846043139696121, + "learning_rate": 5.9147576398577795e-05, + "loss": 0.2004, + "step": 36234 + }, + { + "epoch": 2.9354342190537914, + "grad_norm": 0.07758070528507233, + "learning_rate": 5.914307574598317e-05, + "loss": 0.2329, + "step": 36235 + }, + { + "epoch": 2.9355152300712897, + "grad_norm": 0.07985047250986099, + "learning_rate": 5.913857509338855e-05, + "loss": 0.2617, + "step": 36236 + }, + { + "epoch": 2.935596241088788, + "grad_norm": 0.0672224760055542, + "learning_rate": 5.9134074440793915e-05, + "loss": 0.2242, + "step": 36237 + }, + { + "epoch": 2.9356772521062866, + "grad_norm": 0.05481787770986557, + "learning_rate": 5.912957378819929e-05, + "loss": 0.2285, + "step": 36238 + }, + { + "epoch": 2.935758263123785, + "grad_norm": 0.06856312602758408, + "learning_rate": 5.912507313560467e-05, + "loss": 0.2128, + "step": 36239 + }, + { + "epoch": 2.935839274141283, + "grad_norm": 0.08229131996631622, + "learning_rate": 5.9120572483010036e-05, + "loss": 0.232, + "step": 36240 + }, + { + "epoch": 
2.9359202851587813, + "grad_norm": 0.08203065395355225, + "learning_rate": 5.911607183041541e-05, + "loss": 0.2496, + "step": 36241 + }, + { + "epoch": 2.93600129617628, + "grad_norm": 0.06302852183580399, + "learning_rate": 5.911157117782079e-05, + "loss": 0.22, + "step": 36242 + }, + { + "epoch": 2.9360823071937783, + "grad_norm": 0.07781028002500534, + "learning_rate": 5.910707052522616e-05, + "loss": 0.2149, + "step": 36243 + }, + { + "epoch": 2.9361633182112765, + "grad_norm": 0.057055290788412094, + "learning_rate": 5.910256987263153e-05, + "loss": 0.2192, + "step": 36244 + }, + { + "epoch": 2.936244329228775, + "grad_norm": 0.07755919545888901, + "learning_rate": 5.909806922003691e-05, + "loss": 0.2027, + "step": 36245 + }, + { + "epoch": 2.9363253402462735, + "grad_norm": 0.07984499633312225, + "learning_rate": 5.9093568567442285e-05, + "loss": 0.3008, + "step": 36246 + }, + { + "epoch": 2.9364063512637717, + "grad_norm": 0.07670563459396362, + "learning_rate": 5.908906791484765e-05, + "loss": 0.2304, + "step": 36247 + }, + { + "epoch": 2.9364873622812704, + "grad_norm": 0.07776518911123276, + "learning_rate": 5.908456726225303e-05, + "loss": 0.2522, + "step": 36248 + }, + { + "epoch": 2.9365683732987686, + "grad_norm": 0.07556840032339096, + "learning_rate": 5.9080066609658405e-05, + "loss": 0.2786, + "step": 36249 + }, + { + "epoch": 2.936649384316267, + "grad_norm": 0.07746266573667526, + "learning_rate": 5.907556595706377e-05, + "loss": 0.2553, + "step": 36250 + }, + { + "epoch": 2.9367303953337656, + "grad_norm": 0.07655060291290283, + "learning_rate": 5.907106530446915e-05, + "loss": 0.2623, + "step": 36251 + }, + { + "epoch": 2.936811406351264, + "grad_norm": 0.0660661831498146, + "learning_rate": 5.9066564651874526e-05, + "loss": 0.2599, + "step": 36252 + }, + { + "epoch": 2.936892417368762, + "grad_norm": 0.0715685710310936, + "learning_rate": 5.906206399927989e-05, + "loss": 0.2416, + "step": 36253 + }, + { + "epoch": 2.9369734283862607, + "grad_norm": 0.0755523145198822, + "learning_rate": 5.905756334668527e-05, + "loss": 0.1945, + "step": 36254 + }, + { + "epoch": 2.937054439403759, + "grad_norm": 0.06897228211164474, + "learning_rate": 5.905306269409065e-05, + "loss": 0.2484, + "step": 36255 + }, + { + "epoch": 2.9371354504212572, + "grad_norm": 0.058608174324035645, + "learning_rate": 5.904856204149603e-05, + "loss": 0.2372, + "step": 36256 + }, + { + "epoch": 2.937216461438756, + "grad_norm": 0.0753135159611702, + "learning_rate": 5.9044061388901394e-05, + "loss": 0.24, + "step": 36257 + }, + { + "epoch": 2.937297472456254, + "grad_norm": 0.060669947415590286, + "learning_rate": 5.903956073630677e-05, + "loss": 0.2222, + "step": 36258 + }, + { + "epoch": 2.9373784834737524, + "grad_norm": 0.06780382245779037, + "learning_rate": 5.903506008371215e-05, + "loss": 0.2175, + "step": 36259 + }, + { + "epoch": 2.9374594944912507, + "grad_norm": 0.07297215610742569, + "learning_rate": 5.9030559431117515e-05, + "loss": 0.2391, + "step": 36260 + }, + { + "epoch": 2.9375405055087493, + "grad_norm": 0.05533376708626747, + "learning_rate": 5.902605877852289e-05, + "loss": 0.2383, + "step": 36261 + }, + { + "epoch": 2.9376215165262476, + "grad_norm": 0.07857169955968857, + "learning_rate": 5.902155812592827e-05, + "loss": 0.2224, + "step": 36262 + }, + { + "epoch": 2.937702527543746, + "grad_norm": 0.06497713923454285, + "learning_rate": 5.9017057473333636e-05, + "loss": 0.2276, + "step": 36263 + }, + { + "epoch": 2.937783538561244, + "grad_norm": 0.06889013200998306, + 
"learning_rate": 5.901255682073901e-05, + "loss": 0.2339, + "step": 36264 + }, + { + "epoch": 2.9378645495787428, + "grad_norm": 0.08349175751209259, + "learning_rate": 5.900805616814439e-05, + "loss": 0.2421, + "step": 36265 + }, + { + "epoch": 2.937945560596241, + "grad_norm": 0.06726394593715668, + "learning_rate": 5.9003555515549757e-05, + "loss": 0.1948, + "step": 36266 + }, + { + "epoch": 2.9380265716137393, + "grad_norm": 0.06818697601556778, + "learning_rate": 5.899905486295513e-05, + "loss": 0.2306, + "step": 36267 + }, + { + "epoch": 2.938107582631238, + "grad_norm": 0.07113194465637207, + "learning_rate": 5.899455421036051e-05, + "loss": 0.2849, + "step": 36268 + }, + { + "epoch": 2.938188593648736, + "grad_norm": 0.06667479872703552, + "learning_rate": 5.899005355776588e-05, + "loss": 0.2268, + "step": 36269 + }, + { + "epoch": 2.9382696046662344, + "grad_norm": 0.06525031477212906, + "learning_rate": 5.898555290517125e-05, + "loss": 0.2418, + "step": 36270 + }, + { + "epoch": 2.938350615683733, + "grad_norm": 0.07072465866804123, + "learning_rate": 5.898105225257663e-05, + "loss": 0.2467, + "step": 36271 + }, + { + "epoch": 2.9384316267012314, + "grad_norm": 0.07480061054229736, + "learning_rate": 5.8976551599982e-05, + "loss": 0.2547, + "step": 36272 + }, + { + "epoch": 2.9385126377187296, + "grad_norm": 0.06577193737030029, + "learning_rate": 5.897205094738737e-05, + "loss": 0.224, + "step": 36273 + }, + { + "epoch": 2.9385936487362283, + "grad_norm": 0.06845003366470337, + "learning_rate": 5.896755029479275e-05, + "loss": 0.252, + "step": 36274 + }, + { + "epoch": 2.9386746597537265, + "grad_norm": 0.07179780304431915, + "learning_rate": 5.896304964219812e-05, + "loss": 0.2625, + "step": 36275 + }, + { + "epoch": 2.938755670771225, + "grad_norm": 0.08876433223485947, + "learning_rate": 5.895854898960349e-05, + "loss": 0.2339, + "step": 36276 + }, + { + "epoch": 2.9388366817887235, + "grad_norm": 0.08522546291351318, + "learning_rate": 5.895404833700887e-05, + "loss": 0.227, + "step": 36277 + }, + { + "epoch": 2.9389176928062217, + "grad_norm": 0.07845503836870193, + "learning_rate": 5.894954768441424e-05, + "loss": 0.2634, + "step": 36278 + }, + { + "epoch": 2.93899870382372, + "grad_norm": 0.06297402828931808, + "learning_rate": 5.8945047031819613e-05, + "loss": 0.2386, + "step": 36279 + }, + { + "epoch": 2.9390797148412187, + "grad_norm": 0.08465047180652618, + "learning_rate": 5.8940546379224994e-05, + "loss": 0.2565, + "step": 36280 + }, + { + "epoch": 2.939160725858717, + "grad_norm": 0.07127416133880615, + "learning_rate": 5.893604572663036e-05, + "loss": 0.2476, + "step": 36281 + }, + { + "epoch": 2.939241736876215, + "grad_norm": 0.06154097244143486, + "learning_rate": 5.893154507403574e-05, + "loss": 0.2539, + "step": 36282 + }, + { + "epoch": 2.9393227478937134, + "grad_norm": 0.06503734737634659, + "learning_rate": 5.8927044421441115e-05, + "loss": 0.2353, + "step": 36283 + }, + { + "epoch": 2.939403758911212, + "grad_norm": 0.06861399859189987, + "learning_rate": 5.892254376884648e-05, + "loss": 0.2582, + "step": 36284 + }, + { + "epoch": 2.9394847699287103, + "grad_norm": 0.06149303540587425, + "learning_rate": 5.891804311625186e-05, + "loss": 0.2141, + "step": 36285 + }, + { + "epoch": 2.9395657809462086, + "grad_norm": 0.07276730984449387, + "learning_rate": 5.8913542463657235e-05, + "loss": 0.2389, + "step": 36286 + }, + { + "epoch": 2.939646791963707, + "grad_norm": 0.0715443417429924, + "learning_rate": 5.89090418110626e-05, + "loss": 0.2372, + "step": 
36287 + }, + { + "epoch": 2.9397278029812055, + "grad_norm": 0.062204938381910324, + "learning_rate": 5.890454115846798e-05, + "loss": 0.2568, + "step": 36288 + }, + { + "epoch": 2.9398088139987038, + "grad_norm": 0.0683540403842926, + "learning_rate": 5.8900040505873356e-05, + "loss": 0.2389, + "step": 36289 + }, + { + "epoch": 2.939889825016202, + "grad_norm": 0.07297884672880173, + "learning_rate": 5.889553985327872e-05, + "loss": 0.2405, + "step": 36290 + }, + { + "epoch": 2.9399708360337007, + "grad_norm": 0.06527545303106308, + "learning_rate": 5.8891039200684103e-05, + "loss": 0.2614, + "step": 36291 + }, + { + "epoch": 2.940051847051199, + "grad_norm": 0.0808287039399147, + "learning_rate": 5.888653854808948e-05, + "loss": 0.2379, + "step": 36292 + }, + { + "epoch": 2.940132858068697, + "grad_norm": 0.07050243765115738, + "learning_rate": 5.8882037895494844e-05, + "loss": 0.2261, + "step": 36293 + }, + { + "epoch": 2.940213869086196, + "grad_norm": 0.06260987371206284, + "learning_rate": 5.8877537242900224e-05, + "loss": 0.2136, + "step": 36294 + }, + { + "epoch": 2.940294880103694, + "grad_norm": 0.07322978973388672, + "learning_rate": 5.88730365903056e-05, + "loss": 0.2196, + "step": 36295 + }, + { + "epoch": 2.9403758911211924, + "grad_norm": 0.07632744312286377, + "learning_rate": 5.8868535937710965e-05, + "loss": 0.2172, + "step": 36296 + }, + { + "epoch": 2.940456902138691, + "grad_norm": 0.06406212598085403, + "learning_rate": 5.8864035285116345e-05, + "loss": 0.2239, + "step": 36297 + }, + { + "epoch": 2.9405379131561893, + "grad_norm": 0.07500002533197403, + "learning_rate": 5.885953463252172e-05, + "loss": 0.2279, + "step": 36298 + }, + { + "epoch": 2.9406189241736875, + "grad_norm": 0.06563522666692734, + "learning_rate": 5.8855033979927086e-05, + "loss": 0.2317, + "step": 36299 + }, + { + "epoch": 2.940699935191186, + "grad_norm": 0.0740443617105484, + "learning_rate": 5.8850533327332466e-05, + "loss": 0.2419, + "step": 36300 + }, + { + "epoch": 2.9407809462086845, + "grad_norm": 0.06501168757677078, + "learning_rate": 5.884603267473784e-05, + "loss": 0.253, + "step": 36301 + }, + { + "epoch": 2.9408619572261827, + "grad_norm": 0.07930354028940201, + "learning_rate": 5.8841532022143206e-05, + "loss": 0.23, + "step": 36302 + }, + { + "epoch": 2.9409429682436814, + "grad_norm": 0.06326466798782349, + "learning_rate": 5.883703136954859e-05, + "loss": 0.22, + "step": 36303 + }, + { + "epoch": 2.9410239792611796, + "grad_norm": 0.08392850309610367, + "learning_rate": 5.883253071695396e-05, + "loss": 0.2652, + "step": 36304 + }, + { + "epoch": 2.941104990278678, + "grad_norm": 0.08548044413328171, + "learning_rate": 5.882803006435933e-05, + "loss": 0.2386, + "step": 36305 + }, + { + "epoch": 2.941186001296176, + "grad_norm": 0.07695137709379196, + "learning_rate": 5.882352941176471e-05, + "loss": 0.2681, + "step": 36306 + }, + { + "epoch": 2.941267012313675, + "grad_norm": 0.07780933380126953, + "learning_rate": 5.881902875917008e-05, + "loss": 0.25, + "step": 36307 + }, + { + "epoch": 2.941348023331173, + "grad_norm": 0.06766297668218613, + "learning_rate": 5.881452810657546e-05, + "loss": 0.2378, + "step": 36308 + }, + { + "epoch": 2.9414290343486713, + "grad_norm": 0.06672467291355133, + "learning_rate": 5.881002745398083e-05, + "loss": 0.2312, + "step": 36309 + }, + { + "epoch": 2.9415100453661696, + "grad_norm": 0.06629875302314758, + "learning_rate": 5.88055268013862e-05, + "loss": 0.1752, + "step": 36310 + }, + { + "epoch": 2.9415910563836682, + "grad_norm": 
0.0737876445055008, + "learning_rate": 5.880102614879158e-05, + "loss": 0.2579, + "step": 36311 + }, + { + "epoch": 2.9416720674011665, + "grad_norm": 0.07448048889636993, + "learning_rate": 5.879652549619695e-05, + "loss": 0.2324, + "step": 36312 + }, + { + "epoch": 2.9417530784186647, + "grad_norm": 0.07642365992069244, + "learning_rate": 5.879202484360232e-05, + "loss": 0.2153, + "step": 36313 + }, + { + "epoch": 2.9418340894361634, + "grad_norm": 0.07787315547466278, + "learning_rate": 5.87875241910077e-05, + "loss": 0.2748, + "step": 36314 + }, + { + "epoch": 2.9419151004536617, + "grad_norm": 0.06532657891511917, + "learning_rate": 5.878302353841308e-05, + "loss": 0.2083, + "step": 36315 + }, + { + "epoch": 2.94199611147116, + "grad_norm": 0.05467003956437111, + "learning_rate": 5.8778522885818444e-05, + "loss": 0.2153, + "step": 36316 + }, + { + "epoch": 2.9420771224886586, + "grad_norm": 0.0709647536277771, + "learning_rate": 5.8774022233223824e-05, + "loss": 0.2478, + "step": 36317 + }, + { + "epoch": 2.942158133506157, + "grad_norm": 0.06659302860498428, + "learning_rate": 5.87695215806292e-05, + "loss": 0.2199, + "step": 36318 + }, + { + "epoch": 2.942239144523655, + "grad_norm": 0.06867357343435287, + "learning_rate": 5.8765020928034564e-05, + "loss": 0.2352, + "step": 36319 + }, + { + "epoch": 2.942320155541154, + "grad_norm": 0.07150635123252869, + "learning_rate": 5.8760520275439945e-05, + "loss": 0.2181, + "step": 36320 + }, + { + "epoch": 2.942401166558652, + "grad_norm": 0.06097935512661934, + "learning_rate": 5.875601962284532e-05, + "loss": 0.2181, + "step": 36321 + }, + { + "epoch": 2.9424821775761503, + "grad_norm": 0.059090375900268555, + "learning_rate": 5.8751518970250685e-05, + "loss": 0.234, + "step": 36322 + }, + { + "epoch": 2.942563188593649, + "grad_norm": 0.06826435029506683, + "learning_rate": 5.8747018317656066e-05, + "loss": 0.2217, + "step": 36323 + }, + { + "epoch": 2.942644199611147, + "grad_norm": 0.06106416881084442, + "learning_rate": 5.874251766506144e-05, + "loss": 0.2194, + "step": 36324 + }, + { + "epoch": 2.9427252106286454, + "grad_norm": 0.08476357161998749, + "learning_rate": 5.8738017012466806e-05, + "loss": 0.258, + "step": 36325 + }, + { + "epoch": 2.942806221646144, + "grad_norm": 0.08583477139472961, + "learning_rate": 5.8733516359872186e-05, + "loss": 0.2373, + "step": 36326 + }, + { + "epoch": 2.9428872326636424, + "grad_norm": 0.06490720808506012, + "learning_rate": 5.872901570727756e-05, + "loss": 0.2244, + "step": 36327 + }, + { + "epoch": 2.9429682436811406, + "grad_norm": 0.06273435056209564, + "learning_rate": 5.872451505468293e-05, + "loss": 0.2383, + "step": 36328 + }, + { + "epoch": 2.943049254698639, + "grad_norm": 0.0690455287694931, + "learning_rate": 5.872001440208831e-05, + "loss": 0.2175, + "step": 36329 + }, + { + "epoch": 2.943130265716137, + "grad_norm": 0.06984954327344894, + "learning_rate": 5.871551374949368e-05, + "loss": 0.2369, + "step": 36330 + }, + { + "epoch": 2.943211276733636, + "grad_norm": 0.06107647344470024, + "learning_rate": 5.871101309689905e-05, + "loss": 0.2586, + "step": 36331 + }, + { + "epoch": 2.943292287751134, + "grad_norm": 0.05110946297645569, + "learning_rate": 5.870651244430443e-05, + "loss": 0.247, + "step": 36332 + }, + { + "epoch": 2.9433732987686323, + "grad_norm": 0.05262623727321625, + "learning_rate": 5.87020117917098e-05, + "loss": 0.2363, + "step": 36333 + }, + { + "epoch": 2.943454309786131, + "grad_norm": 0.06491180509328842, + "learning_rate": 5.869751113911518e-05, + "loss": 
0.2098, + "step": 36334 + }, + { + "epoch": 2.9435353208036292, + "grad_norm": 0.06550070643424988, + "learning_rate": 5.869301048652055e-05, + "loss": 0.2218, + "step": 36335 + }, + { + "epoch": 2.9436163318211275, + "grad_norm": 0.06160715967416763, + "learning_rate": 5.868850983392592e-05, + "loss": 0.2426, + "step": 36336 + }, + { + "epoch": 2.943697342838626, + "grad_norm": 0.07329082489013672, + "learning_rate": 5.86840091813313e-05, + "loss": 0.2247, + "step": 36337 + }, + { + "epoch": 2.9437783538561244, + "grad_norm": 0.06759696453809738, + "learning_rate": 5.867950852873667e-05, + "loss": 0.2356, + "step": 36338 + }, + { + "epoch": 2.9438593648736227, + "grad_norm": 0.07199111580848694, + "learning_rate": 5.867500787614204e-05, + "loss": 0.2055, + "step": 36339 + }, + { + "epoch": 2.9439403758911213, + "grad_norm": 0.06256087124347687, + "learning_rate": 5.8670507223547424e-05, + "loss": 0.231, + "step": 36340 + }, + { + "epoch": 2.9440213869086196, + "grad_norm": 0.08340414613485336, + "learning_rate": 5.866600657095279e-05, + "loss": 0.2429, + "step": 36341 + }, + { + "epoch": 2.944102397926118, + "grad_norm": 0.06608752906322479, + "learning_rate": 5.8661505918358164e-05, + "loss": 0.253, + "step": 36342 + }, + { + "epoch": 2.9441834089436165, + "grad_norm": 0.06594279408454895, + "learning_rate": 5.8657005265763544e-05, + "loss": 0.2269, + "step": 36343 + }, + { + "epoch": 2.9442644199611148, + "grad_norm": 0.07080650329589844, + "learning_rate": 5.865250461316891e-05, + "loss": 0.2652, + "step": 36344 + }, + { + "epoch": 2.944345430978613, + "grad_norm": 0.06498748809099197, + "learning_rate": 5.8648003960574285e-05, + "loss": 0.2223, + "step": 36345 + }, + { + "epoch": 2.9444264419961117, + "grad_norm": 0.07089196145534515, + "learning_rate": 5.8643503307979665e-05, + "loss": 0.225, + "step": 36346 + }, + { + "epoch": 2.94450745301361, + "grad_norm": 0.06853543967008591, + "learning_rate": 5.863900265538503e-05, + "loss": 0.2135, + "step": 36347 + }, + { + "epoch": 2.944588464031108, + "grad_norm": 0.0707242339849472, + "learning_rate": 5.8634502002790406e-05, + "loss": 0.2413, + "step": 36348 + }, + { + "epoch": 2.944669475048607, + "grad_norm": 0.07439923286437988, + "learning_rate": 5.8630001350195786e-05, + "loss": 0.2203, + "step": 36349 + }, + { + "epoch": 2.944750486066105, + "grad_norm": 0.08233077079057693, + "learning_rate": 5.862550069760115e-05, + "loss": 0.2218, + "step": 36350 + }, + { + "epoch": 2.9448314970836034, + "grad_norm": 0.07423988729715347, + "learning_rate": 5.8621000045006526e-05, + "loss": 0.2328, + "step": 36351 + }, + { + "epoch": 2.9449125081011016, + "grad_norm": 0.06345546245574951, + "learning_rate": 5.861649939241191e-05, + "loss": 0.2776, + "step": 36352 + }, + { + "epoch": 2.9449935191186, + "grad_norm": 0.07971476763486862, + "learning_rate": 5.8611998739817274e-05, + "loss": 0.2116, + "step": 36353 + }, + { + "epoch": 2.9450745301360985, + "grad_norm": 0.06921039521694183, + "learning_rate": 5.860749808722265e-05, + "loss": 0.2595, + "step": 36354 + }, + { + "epoch": 2.945155541153597, + "grad_norm": 0.06553160399198532, + "learning_rate": 5.860299743462803e-05, + "loss": 0.2056, + "step": 36355 + }, + { + "epoch": 2.945236552171095, + "grad_norm": 0.069551981985569, + "learning_rate": 5.8598496782033394e-05, + "loss": 0.2239, + "step": 36356 + }, + { + "epoch": 2.9453175631885937, + "grad_norm": 0.06705120205879211, + "learning_rate": 5.859399612943877e-05, + "loss": 0.1983, + "step": 36357 + }, + { + "epoch": 2.945398574206092, + 
"grad_norm": 0.06197086349129677, + "learning_rate": 5.858949547684415e-05, + "loss": 0.2264, + "step": 36358 + }, + { + "epoch": 2.94547958522359, + "grad_norm": 0.06881693005561829, + "learning_rate": 5.8584994824249515e-05, + "loss": 0.2393, + "step": 36359 + }, + { + "epoch": 2.945560596241089, + "grad_norm": 0.07800547033548355, + "learning_rate": 5.8580494171654896e-05, + "loss": 0.2728, + "step": 36360 + }, + { + "epoch": 2.945641607258587, + "grad_norm": 0.06328790634870529, + "learning_rate": 5.857599351906027e-05, + "loss": 0.2343, + "step": 36361 + }, + { + "epoch": 2.9457226182760854, + "grad_norm": 0.07082858681678772, + "learning_rate": 5.8571492866465636e-05, + "loss": 0.2285, + "step": 36362 + }, + { + "epoch": 2.945803629293584, + "grad_norm": 0.07132884114980698, + "learning_rate": 5.8566992213871016e-05, + "loss": 0.2679, + "step": 36363 + }, + { + "epoch": 2.9458846403110823, + "grad_norm": 0.06614235043525696, + "learning_rate": 5.856249156127639e-05, + "loss": 0.2511, + "step": 36364 + }, + { + "epoch": 2.9459656513285806, + "grad_norm": 0.07640809565782547, + "learning_rate": 5.855799090868176e-05, + "loss": 0.239, + "step": 36365 + }, + { + "epoch": 2.9460466623460793, + "grad_norm": 0.07471118867397308, + "learning_rate": 5.855349025608714e-05, + "loss": 0.2087, + "step": 36366 + }, + { + "epoch": 2.9461276733635775, + "grad_norm": 0.08368020504713058, + "learning_rate": 5.854898960349251e-05, + "loss": 0.2227, + "step": 36367 + }, + { + "epoch": 2.9462086843810757, + "grad_norm": 0.07055380195379257, + "learning_rate": 5.854448895089788e-05, + "loss": 0.2538, + "step": 36368 + }, + { + "epoch": 2.9462896953985744, + "grad_norm": 0.07868610322475433, + "learning_rate": 5.853998829830326e-05, + "loss": 0.2145, + "step": 36369 + }, + { + "epoch": 2.9463707064160727, + "grad_norm": 0.06251738965511322, + "learning_rate": 5.853548764570863e-05, + "loss": 0.211, + "step": 36370 + }, + { + "epoch": 2.946451717433571, + "grad_norm": 0.07561231404542923, + "learning_rate": 5.8530986993114e-05, + "loss": 0.2344, + "step": 36371 + }, + { + "epoch": 2.9465327284510696, + "grad_norm": 0.0834537148475647, + "learning_rate": 5.852648634051938e-05, + "loss": 0.2006, + "step": 36372 + }, + { + "epoch": 2.946613739468568, + "grad_norm": 0.0762653574347496, + "learning_rate": 5.852198568792475e-05, + "loss": 0.243, + "step": 36373 + }, + { + "epoch": 2.946694750486066, + "grad_norm": 0.07931900769472122, + "learning_rate": 5.851748503533012e-05, + "loss": 0.2371, + "step": 36374 + }, + { + "epoch": 2.9467757615035644, + "grad_norm": 0.07118021696805954, + "learning_rate": 5.85129843827355e-05, + "loss": 0.2396, + "step": 36375 + }, + { + "epoch": 2.9468567725210626, + "grad_norm": 0.09576117992401123, + "learning_rate": 5.850848373014087e-05, + "loss": 0.2699, + "step": 36376 + }, + { + "epoch": 2.9469377835385613, + "grad_norm": 0.07844670116901398, + "learning_rate": 5.850398307754624e-05, + "loss": 0.2611, + "step": 36377 + }, + { + "epoch": 2.9470187945560595, + "grad_norm": 0.07611510902643204, + "learning_rate": 5.849948242495162e-05, + "loss": 0.233, + "step": 36378 + }, + { + "epoch": 2.9470998055735578, + "grad_norm": 0.06875669956207275, + "learning_rate": 5.8494981772356994e-05, + "loss": 0.2342, + "step": 36379 + }, + { + "epoch": 2.9471808165910565, + "grad_norm": 0.05916833132505417, + "learning_rate": 5.849048111976236e-05, + "loss": 0.2067, + "step": 36380 + }, + { + "epoch": 2.9472618276085547, + "grad_norm": 0.06750084459781647, + "learning_rate": 
5.848598046716775e-05, + "loss": 0.2626, + "step": 36381 + }, + { + "epoch": 2.947342838626053, + "grad_norm": 0.07500852644443512, + "learning_rate": 5.8481479814573115e-05, + "loss": 0.2259, + "step": 36382 + }, + { + "epoch": 2.9474238496435516, + "grad_norm": 0.081149160861969, + "learning_rate": 5.847697916197848e-05, + "loss": 0.2295, + "step": 36383 + }, + { + "epoch": 2.94750486066105, + "grad_norm": 0.07233553379774094, + "learning_rate": 5.847247850938387e-05, + "loss": 0.3038, + "step": 36384 + }, + { + "epoch": 2.947585871678548, + "grad_norm": 0.05843168869614601, + "learning_rate": 5.8467977856789236e-05, + "loss": 0.2013, + "step": 36385 + }, + { + "epoch": 2.947666882696047, + "grad_norm": 0.06838095933198929, + "learning_rate": 5.84634772041946e-05, + "loss": 0.1948, + "step": 36386 + }, + { + "epoch": 2.947747893713545, + "grad_norm": 0.07753279060125351, + "learning_rate": 5.845897655159999e-05, + "loss": 0.2314, + "step": 36387 + }, + { + "epoch": 2.9478289047310433, + "grad_norm": 0.060668349266052246, + "learning_rate": 5.8454475899005357e-05, + "loss": 0.2207, + "step": 36388 + }, + { + "epoch": 2.947909915748542, + "grad_norm": 0.06636876612901688, + "learning_rate": 5.844997524641074e-05, + "loss": 0.2157, + "step": 36389 + }, + { + "epoch": 2.9479909267660402, + "grad_norm": 0.06835020333528519, + "learning_rate": 5.844547459381611e-05, + "loss": 0.2651, + "step": 36390 + }, + { + "epoch": 2.9480719377835385, + "grad_norm": 0.06267762184143066, + "learning_rate": 5.844097394122148e-05, + "loss": 0.2177, + "step": 36391 + }, + { + "epoch": 2.948152948801037, + "grad_norm": 0.06823470443487167, + "learning_rate": 5.843647328862686e-05, + "loss": 0.209, + "step": 36392 + }, + { + "epoch": 2.9482339598185354, + "grad_norm": 0.07210874557495117, + "learning_rate": 5.843197263603223e-05, + "loss": 0.2383, + "step": 36393 + }, + { + "epoch": 2.9483149708360337, + "grad_norm": 0.06944157183170319, + "learning_rate": 5.84274719834376e-05, + "loss": 0.2474, + "step": 36394 + }, + { + "epoch": 2.9483959818535324, + "grad_norm": 0.06869715452194214, + "learning_rate": 5.842297133084298e-05, + "loss": 0.1967, + "step": 36395 + }, + { + "epoch": 2.9484769928710306, + "grad_norm": 0.07306776195764542, + "learning_rate": 5.841847067824835e-05, + "loss": 0.2055, + "step": 36396 + }, + { + "epoch": 2.948558003888529, + "grad_norm": 0.06623499095439911, + "learning_rate": 5.841397002565372e-05, + "loss": 0.2519, + "step": 36397 + }, + { + "epoch": 2.948639014906027, + "grad_norm": 0.07583684474229813, + "learning_rate": 5.84094693730591e-05, + "loss": 0.2676, + "step": 36398 + }, + { + "epoch": 2.9487200259235253, + "grad_norm": 0.08304212242364883, + "learning_rate": 5.840496872046447e-05, + "loss": 0.2836, + "step": 36399 + }, + { + "epoch": 2.948801036941024, + "grad_norm": 0.06242493912577629, + "learning_rate": 5.840046806786984e-05, + "loss": 0.2512, + "step": 36400 + }, + { + "epoch": 2.9488820479585223, + "grad_norm": 0.06770298629999161, + "learning_rate": 5.839596741527522e-05, + "loss": 0.2472, + "step": 36401 + }, + { + "epoch": 2.9489630589760205, + "grad_norm": 0.07269997894763947, + "learning_rate": 5.8391466762680594e-05, + "loss": 0.2654, + "step": 36402 + }, + { + "epoch": 2.949044069993519, + "grad_norm": 0.060589950531721115, + "learning_rate": 5.838696611008596e-05, + "loss": 0.2046, + "step": 36403 + }, + { + "epoch": 2.9491250810110174, + "grad_norm": 0.06518398225307465, + "learning_rate": 5.838246545749134e-05, + "loss": 0.2226, + "step": 36404 + }, + { + 
"epoch": 2.9492060920285157, + "grad_norm": 0.07953999936580658, + "learning_rate": 5.8377964804896715e-05, + "loss": 0.2554, + "step": 36405 + }, + { + "epoch": 2.9492871030460144, + "grad_norm": 0.05752985551953316, + "learning_rate": 5.837346415230208e-05, + "loss": 0.2228, + "step": 36406 + }, + { + "epoch": 2.9493681140635126, + "grad_norm": 0.07126346975564957, + "learning_rate": 5.836896349970746e-05, + "loss": 0.2552, + "step": 36407 + }, + { + "epoch": 2.949449125081011, + "grad_norm": 0.059985458850860596, + "learning_rate": 5.8364462847112835e-05, + "loss": 0.238, + "step": 36408 + }, + { + "epoch": 2.9495301360985096, + "grad_norm": 0.07471557706594467, + "learning_rate": 5.83599621945182e-05, + "loss": 0.2084, + "step": 36409 + }, + { + "epoch": 2.949611147116008, + "grad_norm": 0.0673462375998497, + "learning_rate": 5.835546154192358e-05, + "loss": 0.2717, + "step": 36410 + }, + { + "epoch": 2.949692158133506, + "grad_norm": 0.06365612894296646, + "learning_rate": 5.8350960889328956e-05, + "loss": 0.2385, + "step": 36411 + }, + { + "epoch": 2.9497731691510047, + "grad_norm": 0.06991245597600937, + "learning_rate": 5.834646023673432e-05, + "loss": 0.2248, + "step": 36412 + }, + { + "epoch": 2.949854180168503, + "grad_norm": 0.07591899484395981, + "learning_rate": 5.8341959584139703e-05, + "loss": 0.22, + "step": 36413 + }, + { + "epoch": 2.9499351911860012, + "grad_norm": 0.07599229365587234, + "learning_rate": 5.833745893154508e-05, + "loss": 0.2484, + "step": 36414 + }, + { + "epoch": 2.9500162022035, + "grad_norm": 0.06545108556747437, + "learning_rate": 5.833295827895046e-05, + "loss": 0.2073, + "step": 36415 + }, + { + "epoch": 2.950097213220998, + "grad_norm": 0.06527014076709747, + "learning_rate": 5.8328457626355824e-05, + "loss": 0.2219, + "step": 36416 + }, + { + "epoch": 2.9501782242384964, + "grad_norm": 0.07544932514429092, + "learning_rate": 5.83239569737612e-05, + "loss": 0.2462, + "step": 36417 + }, + { + "epoch": 2.9502592352559946, + "grad_norm": 0.08888234198093414, + "learning_rate": 5.831945632116658e-05, + "loss": 0.2363, + "step": 36418 + }, + { + "epoch": 2.9503402462734933, + "grad_norm": 0.07459647208452225, + "learning_rate": 5.8314955668571945e-05, + "loss": 0.2426, + "step": 36419 + }, + { + "epoch": 2.9504212572909916, + "grad_norm": 0.0885954275727272, + "learning_rate": 5.831045501597732e-05, + "loss": 0.2533, + "step": 36420 + }, + { + "epoch": 2.95050226830849, + "grad_norm": 0.0728658065199852, + "learning_rate": 5.83059543633827e-05, + "loss": 0.2295, + "step": 36421 + }, + { + "epoch": 2.950583279325988, + "grad_norm": 0.07531264424324036, + "learning_rate": 5.8301453710788066e-05, + "loss": 0.2412, + "step": 36422 + }, + { + "epoch": 2.9506642903434868, + "grad_norm": 0.06192412227392197, + "learning_rate": 5.829695305819344e-05, + "loss": 0.2121, + "step": 36423 + }, + { + "epoch": 2.950745301360985, + "grad_norm": 0.06983577460050583, + "learning_rate": 5.829245240559882e-05, + "loss": 0.2304, + "step": 36424 + }, + { + "epoch": 2.9508263123784833, + "grad_norm": 0.08154419809579849, + "learning_rate": 5.828795175300419e-05, + "loss": 0.2647, + "step": 36425 + }, + { + "epoch": 2.950907323395982, + "grad_norm": 0.07178998738527298, + "learning_rate": 5.828345110040956e-05, + "loss": 0.2357, + "step": 36426 + }, + { + "epoch": 2.95098833441348, + "grad_norm": 0.07730478048324585, + "learning_rate": 5.827895044781494e-05, + "loss": 0.2271, + "step": 36427 + }, + { + "epoch": 2.9510693454309784, + "grad_norm": 0.07810342311859131, + 
"learning_rate": 5.827444979522031e-05, + "loss": 0.2295, + "step": 36428 + }, + { + "epoch": 2.951150356448477, + "grad_norm": 0.06364695727825165, + "learning_rate": 5.826994914262568e-05, + "loss": 0.2314, + "step": 36429 + }, + { + "epoch": 2.9512313674659754, + "grad_norm": 0.06301039457321167, + "learning_rate": 5.826544849003106e-05, + "loss": 0.2351, + "step": 36430 + }, + { + "epoch": 2.9513123784834736, + "grad_norm": 0.08015149086713791, + "learning_rate": 5.826094783743643e-05, + "loss": 0.2205, + "step": 36431 + }, + { + "epoch": 2.9513933895009723, + "grad_norm": 0.05650978162884712, + "learning_rate": 5.82564471848418e-05, + "loss": 0.2316, + "step": 36432 + }, + { + "epoch": 2.9514744005184705, + "grad_norm": 0.06430716067552567, + "learning_rate": 5.825194653224718e-05, + "loss": 0.2136, + "step": 36433 + }, + { + "epoch": 2.951555411535969, + "grad_norm": 0.07166837155818939, + "learning_rate": 5.824744587965255e-05, + "loss": 0.2194, + "step": 36434 + }, + { + "epoch": 2.9516364225534675, + "grad_norm": 0.07965375483036041, + "learning_rate": 5.824294522705792e-05, + "loss": 0.2609, + "step": 36435 + }, + { + "epoch": 2.9517174335709657, + "grad_norm": 0.06448642909526825, + "learning_rate": 5.82384445744633e-05, + "loss": 0.2271, + "step": 36436 + }, + { + "epoch": 2.951798444588464, + "grad_norm": 0.073979951441288, + "learning_rate": 5.823394392186867e-05, + "loss": 0.2448, + "step": 36437 + }, + { + "epoch": 2.9518794556059627, + "grad_norm": 0.07171009480953217, + "learning_rate": 5.8229443269274043e-05, + "loss": 0.2052, + "step": 36438 + }, + { + "epoch": 2.951960466623461, + "grad_norm": 0.07442062348127365, + "learning_rate": 5.8224942616679424e-05, + "loss": 0.2574, + "step": 36439 + }, + { + "epoch": 2.952041477640959, + "grad_norm": 0.07632353901863098, + "learning_rate": 5.822044196408479e-05, + "loss": 0.2441, + "step": 36440 + }, + { + "epoch": 2.9521224886584574, + "grad_norm": 0.09367818385362625, + "learning_rate": 5.821594131149017e-05, + "loss": 0.2539, + "step": 36441 + }, + { + "epoch": 2.952203499675956, + "grad_norm": 0.06162737309932709, + "learning_rate": 5.8211440658895545e-05, + "loss": 0.2441, + "step": 36442 + }, + { + "epoch": 2.9522845106934543, + "grad_norm": 0.06815103441476822, + "learning_rate": 5.820694000630091e-05, + "loss": 0.2262, + "step": 36443 + }, + { + "epoch": 2.9523655217109526, + "grad_norm": 0.0818028524518013, + "learning_rate": 5.820243935370629e-05, + "loss": 0.213, + "step": 36444 + }, + { + "epoch": 2.952446532728451, + "grad_norm": 0.057324279099702835, + "learning_rate": 5.8197938701111665e-05, + "loss": 0.2319, + "step": 36445 + }, + { + "epoch": 2.9525275437459495, + "grad_norm": 0.07575049996376038, + "learning_rate": 5.819343804851703e-05, + "loss": 0.2409, + "step": 36446 + }, + { + "epoch": 2.9526085547634477, + "grad_norm": 0.08090692013502121, + "learning_rate": 5.818893739592241e-05, + "loss": 0.2608, + "step": 36447 + }, + { + "epoch": 2.952689565780946, + "grad_norm": 0.08545611798763275, + "learning_rate": 5.8184436743327786e-05, + "loss": 0.2354, + "step": 36448 + }, + { + "epoch": 2.9527705767984447, + "grad_norm": 0.06185333430767059, + "learning_rate": 5.817993609073315e-05, + "loss": 0.2466, + "step": 36449 + }, + { + "epoch": 2.952851587815943, + "grad_norm": 0.06604185700416565, + "learning_rate": 5.817543543813854e-05, + "loss": 0.2307, + "step": 36450 + }, + { + "epoch": 2.952932598833441, + "grad_norm": 0.06959135085344315, + "learning_rate": 5.817093478554391e-05, + "loss": 0.2378, + "step": 
36451 + }, + { + "epoch": 2.95301360985094, + "grad_norm": 0.07814179360866547, + "learning_rate": 5.8166434132949274e-05, + "loss": 0.254, + "step": 36452 + }, + { + "epoch": 2.953094620868438, + "grad_norm": 0.06907559186220169, + "learning_rate": 5.816193348035466e-05, + "loss": 0.2342, + "step": 36453 + }, + { + "epoch": 2.9531756318859363, + "grad_norm": 0.08826621621847153, + "learning_rate": 5.815743282776003e-05, + "loss": 0.2375, + "step": 36454 + }, + { + "epoch": 2.953256642903435, + "grad_norm": 0.08610070496797562, + "learning_rate": 5.8152932175165395e-05, + "loss": 0.2515, + "step": 36455 + }, + { + "epoch": 2.9533376539209333, + "grad_norm": 0.0768219530582428, + "learning_rate": 5.814843152257078e-05, + "loss": 0.2643, + "step": 36456 + }, + { + "epoch": 2.9534186649384315, + "grad_norm": 0.08998133987188339, + "learning_rate": 5.814393086997615e-05, + "loss": 0.2352, + "step": 36457 + }, + { + "epoch": 2.95349967595593, + "grad_norm": 0.06378687173128128, + "learning_rate": 5.8139430217381516e-05, + "loss": 0.2024, + "step": 36458 + }, + { + "epoch": 2.9535806869734285, + "grad_norm": 0.07165522873401642, + "learning_rate": 5.81349295647869e-05, + "loss": 0.2318, + "step": 36459 + }, + { + "epoch": 2.9536616979909267, + "grad_norm": 0.0691077783703804, + "learning_rate": 5.813042891219227e-05, + "loss": 0.2299, + "step": 36460 + }, + { + "epoch": 2.9537427090084254, + "grad_norm": 0.08201265335083008, + "learning_rate": 5.8125928259597636e-05, + "loss": 0.2369, + "step": 36461 + }, + { + "epoch": 2.9538237200259236, + "grad_norm": 0.06257831305265427, + "learning_rate": 5.8121427607003024e-05, + "loss": 0.2074, + "step": 36462 + }, + { + "epoch": 2.953904731043422, + "grad_norm": 0.0590004026889801, + "learning_rate": 5.811692695440839e-05, + "loss": 0.2563, + "step": 36463 + }, + { + "epoch": 2.95398574206092, + "grad_norm": 0.05743236094713211, + "learning_rate": 5.811242630181376e-05, + "loss": 0.2244, + "step": 36464 + }, + { + "epoch": 2.954066753078419, + "grad_norm": 0.07363023608922958, + "learning_rate": 5.8107925649219144e-05, + "loss": 0.2144, + "step": 36465 + }, + { + "epoch": 2.954147764095917, + "grad_norm": 0.07051236927509308, + "learning_rate": 5.810342499662451e-05, + "loss": 0.2412, + "step": 36466 + }, + { + "epoch": 2.9542287751134153, + "grad_norm": 0.06581655144691467, + "learning_rate": 5.809892434402989e-05, + "loss": 0.2234, + "step": 36467 + }, + { + "epoch": 2.9543097861309136, + "grad_norm": 0.07746722549200058, + "learning_rate": 5.8094423691435265e-05, + "loss": 0.2497, + "step": 36468 + }, + { + "epoch": 2.9543907971484122, + "grad_norm": 0.07712780684232712, + "learning_rate": 5.808992303884063e-05, + "loss": 0.2754, + "step": 36469 + }, + { + "epoch": 2.9544718081659105, + "grad_norm": 0.06322678178548813, + "learning_rate": 5.808542238624601e-05, + "loss": 0.2088, + "step": 36470 + }, + { + "epoch": 2.9545528191834087, + "grad_norm": 0.06469064950942993, + "learning_rate": 5.8080921733651386e-05, + "loss": 0.2484, + "step": 36471 + }, + { + "epoch": 2.9546338302009074, + "grad_norm": 0.055840183049440384, + "learning_rate": 5.807642108105675e-05, + "loss": 0.2361, + "step": 36472 + }, + { + "epoch": 2.9547148412184057, + "grad_norm": 0.0745265781879425, + "learning_rate": 5.807192042846213e-05, + "loss": 0.245, + "step": 36473 + }, + { + "epoch": 2.954795852235904, + "grad_norm": 0.07949806749820709, + "learning_rate": 5.806741977586751e-05, + "loss": 0.2293, + "step": 36474 + }, + { + "epoch": 2.9548768632534026, + "grad_norm": 
0.07750680297613144, + "learning_rate": 5.8062919123272874e-05, + "loss": 0.2629, + "step": 36475 + }, + { + "epoch": 2.954957874270901, + "grad_norm": 0.06992745399475098, + "learning_rate": 5.8058418470678254e-05, + "loss": 0.2457, + "step": 36476 + }, + { + "epoch": 2.955038885288399, + "grad_norm": 0.06394144892692566, + "learning_rate": 5.805391781808363e-05, + "loss": 0.256, + "step": 36477 + }, + { + "epoch": 2.9551198963058978, + "grad_norm": 0.07632666826248169, + "learning_rate": 5.8049417165488994e-05, + "loss": 0.2397, + "step": 36478 + }, + { + "epoch": 2.955200907323396, + "grad_norm": 0.06426636129617691, + "learning_rate": 5.8044916512894375e-05, + "loss": 0.2383, + "step": 36479 + }, + { + "epoch": 2.9552819183408943, + "grad_norm": 0.06205812096595764, + "learning_rate": 5.804041586029975e-05, + "loss": 0.2265, + "step": 36480 + }, + { + "epoch": 2.955362929358393, + "grad_norm": 0.10336317867040634, + "learning_rate": 5.8035915207705115e-05, + "loss": 0.2269, + "step": 36481 + }, + { + "epoch": 2.955443940375891, + "grad_norm": 0.08274838328361511, + "learning_rate": 5.8031414555110496e-05, + "loss": 0.2612, + "step": 36482 + }, + { + "epoch": 2.9555249513933894, + "grad_norm": 0.0678529441356659, + "learning_rate": 5.802691390251587e-05, + "loss": 0.2079, + "step": 36483 + }, + { + "epoch": 2.955605962410888, + "grad_norm": 0.0703086107969284, + "learning_rate": 5.8022413249921236e-05, + "loss": 0.2161, + "step": 36484 + }, + { + "epoch": 2.9556869734283864, + "grad_norm": 0.07301075011491776, + "learning_rate": 5.8017912597326616e-05, + "loss": 0.2301, + "step": 36485 + }, + { + "epoch": 2.9557679844458846, + "grad_norm": 0.07188529521226883, + "learning_rate": 5.801341194473199e-05, + "loss": 0.268, + "step": 36486 + }, + { + "epoch": 2.955848995463383, + "grad_norm": 0.05946533754467964, + "learning_rate": 5.800891129213736e-05, + "loss": 0.2307, + "step": 36487 + }, + { + "epoch": 2.9559300064808816, + "grad_norm": 0.07464953511953354, + "learning_rate": 5.800441063954274e-05, + "loss": 0.2562, + "step": 36488 + }, + { + "epoch": 2.95601101749838, + "grad_norm": 0.08129985630512238, + "learning_rate": 5.799990998694811e-05, + "loss": 0.2446, + "step": 36489 + }, + { + "epoch": 2.956092028515878, + "grad_norm": 0.07472100853919983, + "learning_rate": 5.799540933435348e-05, + "loss": 0.2391, + "step": 36490 + }, + { + "epoch": 2.9561730395333763, + "grad_norm": 0.07534375786781311, + "learning_rate": 5.799090868175886e-05, + "loss": 0.2406, + "step": 36491 + }, + { + "epoch": 2.956254050550875, + "grad_norm": 0.07338795065879822, + "learning_rate": 5.798640802916423e-05, + "loss": 0.2649, + "step": 36492 + }, + { + "epoch": 2.9563350615683732, + "grad_norm": 0.06019020080566406, + "learning_rate": 5.798190737656961e-05, + "loss": 0.2068, + "step": 36493 + }, + { + "epoch": 2.9564160725858715, + "grad_norm": 0.07050585746765137, + "learning_rate": 5.797740672397498e-05, + "loss": 0.205, + "step": 36494 + }, + { + "epoch": 2.95649708360337, + "grad_norm": 0.07372116297483444, + "learning_rate": 5.797290607138035e-05, + "loss": 0.2451, + "step": 36495 + }, + { + "epoch": 2.9565780946208684, + "grad_norm": 0.056005675345659256, + "learning_rate": 5.796840541878573e-05, + "loss": 0.2651, + "step": 36496 + }, + { + "epoch": 2.9566591056383666, + "grad_norm": 0.06699275970458984, + "learning_rate": 5.79639047661911e-05, + "loss": 0.2495, + "step": 36497 + }, + { + "epoch": 2.9567401166558653, + "grad_norm": 0.0599977932870388, + "learning_rate": 5.795940411359647e-05, + 
"loss": 0.2201, + "step": 36498 + }, + { + "epoch": 2.9568211276733636, + "grad_norm": 0.06906404346227646, + "learning_rate": 5.7954903461001854e-05, + "loss": 0.2357, + "step": 36499 + }, + { + "epoch": 2.956902138690862, + "grad_norm": 0.08245977014303207, + "learning_rate": 5.795040280840722e-05, + "loss": 0.2137, + "step": 36500 + }, + { + "epoch": 2.9569831497083605, + "grad_norm": 0.06937648355960846, + "learning_rate": 5.7945902155812594e-05, + "loss": 0.2331, + "step": 36501 + }, + { + "epoch": 2.9570641607258588, + "grad_norm": 0.08504277467727661, + "learning_rate": 5.7941401503217974e-05, + "loss": 0.2126, + "step": 36502 + }, + { + "epoch": 2.957145171743357, + "grad_norm": 0.06685921549797058, + "learning_rate": 5.793690085062334e-05, + "loss": 0.2254, + "step": 36503 + }, + { + "epoch": 2.9572261827608557, + "grad_norm": 0.07819310575723648, + "learning_rate": 5.7932400198028715e-05, + "loss": 0.2516, + "step": 36504 + }, + { + "epoch": 2.957307193778354, + "grad_norm": 0.0625201091170311, + "learning_rate": 5.7927899545434095e-05, + "loss": 0.2099, + "step": 36505 + }, + { + "epoch": 2.957388204795852, + "grad_norm": 0.06668776273727417, + "learning_rate": 5.792339889283946e-05, + "loss": 0.1961, + "step": 36506 + }, + { + "epoch": 2.957469215813351, + "grad_norm": 0.05308452248573303, + "learning_rate": 5.7918898240244836e-05, + "loss": 0.2351, + "step": 36507 + }, + { + "epoch": 2.957550226830849, + "grad_norm": 0.07450252026319504, + "learning_rate": 5.7914397587650216e-05, + "loss": 0.2425, + "step": 36508 + }, + { + "epoch": 2.9576312378483474, + "grad_norm": 0.05780648812651634, + "learning_rate": 5.790989693505558e-05, + "loss": 0.2224, + "step": 36509 + }, + { + "epoch": 2.9577122488658456, + "grad_norm": 0.07254470139741898, + "learning_rate": 5.7905396282460956e-05, + "loss": 0.2557, + "step": 36510 + }, + { + "epoch": 2.9577932598833443, + "grad_norm": 0.06770768016576767, + "learning_rate": 5.790089562986634e-05, + "loss": 0.244, + "step": 36511 + }, + { + "epoch": 2.9578742709008425, + "grad_norm": 0.08380091935396194, + "learning_rate": 5.7896394977271704e-05, + "loss": 0.2591, + "step": 36512 + }, + { + "epoch": 2.957955281918341, + "grad_norm": 0.06993330270051956, + "learning_rate": 5.789189432467708e-05, + "loss": 0.2276, + "step": 36513 + }, + { + "epoch": 2.958036292935839, + "grad_norm": 0.07040643692016602, + "learning_rate": 5.788739367208246e-05, + "loss": 0.2245, + "step": 36514 + }, + { + "epoch": 2.9581173039533377, + "grad_norm": 0.07032278925180435, + "learning_rate": 5.7882893019487824e-05, + "loss": 0.2378, + "step": 36515 + }, + { + "epoch": 2.958198314970836, + "grad_norm": 0.047874245792627335, + "learning_rate": 5.78783923668932e-05, + "loss": 0.1835, + "step": 36516 + }, + { + "epoch": 2.958279325988334, + "grad_norm": 0.06393905729055405, + "learning_rate": 5.787389171429858e-05, + "loss": 0.2281, + "step": 36517 + }, + { + "epoch": 2.958360337005833, + "grad_norm": 0.06521731615066528, + "learning_rate": 5.7869391061703945e-05, + "loss": 0.2326, + "step": 36518 + }, + { + "epoch": 2.958441348023331, + "grad_norm": 0.09332821518182755, + "learning_rate": 5.786489040910933e-05, + "loss": 0.2457, + "step": 36519 + }, + { + "epoch": 2.9585223590408294, + "grad_norm": 0.052814967930316925, + "learning_rate": 5.78603897565147e-05, + "loss": 0.2083, + "step": 36520 + }, + { + "epoch": 2.958603370058328, + "grad_norm": 0.06364589184522629, + "learning_rate": 5.7855889103920066e-05, + "loss": 0.2288, + "step": 36521 + }, + { + "epoch": 
2.9586843810758263, + "grad_norm": 0.07168759405612946, + "learning_rate": 5.785138845132545e-05, + "loss": 0.2214, + "step": 36522 + }, + { + "epoch": 2.9587653920933246, + "grad_norm": 0.0853877067565918, + "learning_rate": 5.784688779873082e-05, + "loss": 0.2841, + "step": 36523 + }, + { + "epoch": 2.9588464031108233, + "grad_norm": 0.07488827407360077, + "learning_rate": 5.784238714613619e-05, + "loss": 0.2702, + "step": 36524 + }, + { + "epoch": 2.9589274141283215, + "grad_norm": 0.060363445430994034, + "learning_rate": 5.7837886493541574e-05, + "loss": 0.2635, + "step": 36525 + }, + { + "epoch": 2.9590084251458197, + "grad_norm": 0.07399044930934906, + "learning_rate": 5.783338584094694e-05, + "loss": 0.226, + "step": 36526 + }, + { + "epoch": 2.9590894361633184, + "grad_norm": 0.06035542115569115, + "learning_rate": 5.782888518835231e-05, + "loss": 0.1998, + "step": 36527 + }, + { + "epoch": 2.9591704471808167, + "grad_norm": 0.059440962970256805, + "learning_rate": 5.7824384535757695e-05, + "loss": 0.2374, + "step": 36528 + }, + { + "epoch": 2.959251458198315, + "grad_norm": 0.06876916438341141, + "learning_rate": 5.781988388316306e-05, + "loss": 0.2204, + "step": 36529 + }, + { + "epoch": 2.9593324692158136, + "grad_norm": 0.07742578536272049, + "learning_rate": 5.781538323056843e-05, + "loss": 0.1969, + "step": 36530 + }, + { + "epoch": 2.959413480233312, + "grad_norm": 0.07292307168245316, + "learning_rate": 5.7810882577973816e-05, + "loss": 0.2276, + "step": 36531 + }, + { + "epoch": 2.95949449125081, + "grad_norm": 0.07512343674898148, + "learning_rate": 5.780638192537918e-05, + "loss": 0.2269, + "step": 36532 + }, + { + "epoch": 2.9595755022683083, + "grad_norm": 0.05893891677260399, + "learning_rate": 5.780188127278455e-05, + "loss": 0.2385, + "step": 36533 + }, + { + "epoch": 2.959656513285807, + "grad_norm": 0.06704125553369522, + "learning_rate": 5.7797380620189936e-05, + "loss": 0.2305, + "step": 36534 + }, + { + "epoch": 2.9597375243033053, + "grad_norm": 0.07865846157073975, + "learning_rate": 5.77928799675953e-05, + "loss": 0.2482, + "step": 36535 + }, + { + "epoch": 2.9598185353208035, + "grad_norm": 0.06334725022315979, + "learning_rate": 5.778837931500068e-05, + "loss": 0.2497, + "step": 36536 + }, + { + "epoch": 2.9598995463383018, + "grad_norm": 0.06638215482234955, + "learning_rate": 5.778387866240606e-05, + "loss": 0.2113, + "step": 36537 + }, + { + "epoch": 2.9599805573558005, + "grad_norm": 0.06827043741941452, + "learning_rate": 5.7779378009811424e-05, + "loss": 0.2311, + "step": 36538 + }, + { + "epoch": 2.9600615683732987, + "grad_norm": 0.05676588788628578, + "learning_rate": 5.77748773572168e-05, + "loss": 0.21, + "step": 36539 + }, + { + "epoch": 2.960142579390797, + "grad_norm": 0.08814438432455063, + "learning_rate": 5.777037670462218e-05, + "loss": 0.3044, + "step": 36540 + }, + { + "epoch": 2.9602235904082956, + "grad_norm": 0.07136793434619904, + "learning_rate": 5.7765876052027545e-05, + "loss": 0.2657, + "step": 36541 + }, + { + "epoch": 2.960304601425794, + "grad_norm": 0.06732363253831863, + "learning_rate": 5.776137539943292e-05, + "loss": 0.246, + "step": 36542 + }, + { + "epoch": 2.960385612443292, + "grad_norm": 0.07745783030986786, + "learning_rate": 5.77568747468383e-05, + "loss": 0.2371, + "step": 36543 + }, + { + "epoch": 2.960466623460791, + "grad_norm": 0.06036265939474106, + "learning_rate": 5.7752374094243666e-05, + "loss": 0.2035, + "step": 36544 + }, + { + "epoch": 2.960547634478289, + "grad_norm": 0.06795885413885117, + 
"learning_rate": 5.774787344164904e-05, + "loss": 0.2229, + "step": 36545 + }, + { + "epoch": 2.9606286454957873, + "grad_norm": 0.10156449675559998, + "learning_rate": 5.774337278905442e-05, + "loss": 0.2794, + "step": 36546 + }, + { + "epoch": 2.960709656513286, + "grad_norm": 0.07111938297748566, + "learning_rate": 5.7738872136459787e-05, + "loss": 0.2011, + "step": 36547 + }, + { + "epoch": 2.9607906675307842, + "grad_norm": 0.05817258358001709, + "learning_rate": 5.773437148386517e-05, + "loss": 0.2034, + "step": 36548 + }, + { + "epoch": 2.9608716785482825, + "grad_norm": 0.06076313182711601, + "learning_rate": 5.772987083127054e-05, + "loss": 0.2228, + "step": 36549 + }, + { + "epoch": 2.960952689565781, + "grad_norm": 0.08142624795436859, + "learning_rate": 5.772537017867591e-05, + "loss": 0.2786, + "step": 36550 + }, + { + "epoch": 2.9610337005832794, + "grad_norm": 0.07742969691753387, + "learning_rate": 5.772086952608129e-05, + "loss": 0.2945, + "step": 36551 + }, + { + "epoch": 2.9611147116007777, + "grad_norm": 0.06102697178721428, + "learning_rate": 5.771636887348666e-05, + "loss": 0.2289, + "step": 36552 + }, + { + "epoch": 2.9611957226182763, + "grad_norm": 0.06912451982498169, + "learning_rate": 5.771186822089203e-05, + "loss": 0.21, + "step": 36553 + }, + { + "epoch": 2.9612767336357746, + "grad_norm": 0.05915422737598419, + "learning_rate": 5.770736756829741e-05, + "loss": 0.1814, + "step": 36554 + }, + { + "epoch": 2.961357744653273, + "grad_norm": 0.06098875403404236, + "learning_rate": 5.770286691570278e-05, + "loss": 0.2633, + "step": 36555 + }, + { + "epoch": 2.961438755670771, + "grad_norm": 0.07346302270889282, + "learning_rate": 5.769836626310815e-05, + "loss": 0.2828, + "step": 36556 + }, + { + "epoch": 2.9615197666882693, + "grad_norm": 0.08193167299032211, + "learning_rate": 5.769386561051353e-05, + "loss": 0.2435, + "step": 36557 + }, + { + "epoch": 2.961600777705768, + "grad_norm": 0.07248496264219284, + "learning_rate": 5.76893649579189e-05, + "loss": 0.2135, + "step": 36558 + }, + { + "epoch": 2.9616817887232663, + "grad_norm": 0.07301896810531616, + "learning_rate": 5.768486430532427e-05, + "loss": 0.2176, + "step": 36559 + }, + { + "epoch": 2.9617627997407645, + "grad_norm": 0.07056121528148651, + "learning_rate": 5.768036365272965e-05, + "loss": 0.2808, + "step": 36560 + }, + { + "epoch": 2.961843810758263, + "grad_norm": 0.057158637791872025, + "learning_rate": 5.7675863000135024e-05, + "loss": 0.2535, + "step": 36561 + }, + { + "epoch": 2.9619248217757614, + "grad_norm": 0.07093117386102676, + "learning_rate": 5.767136234754039e-05, + "loss": 0.223, + "step": 36562 + }, + { + "epoch": 2.9620058327932597, + "grad_norm": 0.07848324626684189, + "learning_rate": 5.766686169494577e-05, + "loss": 0.2333, + "step": 36563 + }, + { + "epoch": 2.9620868438107584, + "grad_norm": 0.07335309684276581, + "learning_rate": 5.7662361042351145e-05, + "loss": 0.2507, + "step": 36564 + }, + { + "epoch": 2.9621678548282566, + "grad_norm": 0.06225890666246414, + "learning_rate": 5.765786038975651e-05, + "loss": 0.2424, + "step": 36565 + }, + { + "epoch": 2.962248865845755, + "grad_norm": 0.061031196266412735, + "learning_rate": 5.765335973716189e-05, + "loss": 0.2232, + "step": 36566 + }, + { + "epoch": 2.9623298768632536, + "grad_norm": 0.06554216146469116, + "learning_rate": 5.7648859084567265e-05, + "loss": 0.2382, + "step": 36567 + }, + { + "epoch": 2.962410887880752, + "grad_norm": 0.08055777102708817, + "learning_rate": 5.764435843197263e-05, + "loss": 0.2449, + 
"step": 36568 + }, + { + "epoch": 2.96249189889825, + "grad_norm": 0.07099359482526779, + "learning_rate": 5.763985777937801e-05, + "loss": 0.2464, + "step": 36569 + }, + { + "epoch": 2.9625729099157487, + "grad_norm": 0.07479843497276306, + "learning_rate": 5.7635357126783386e-05, + "loss": 0.2208, + "step": 36570 + }, + { + "epoch": 2.962653920933247, + "grad_norm": 0.06658231467008591, + "learning_rate": 5.763085647418875e-05, + "loss": 0.2021, + "step": 36571 + }, + { + "epoch": 2.962734931950745, + "grad_norm": 0.0665665715932846, + "learning_rate": 5.7626355821594133e-05, + "loss": 0.2087, + "step": 36572 + }, + { + "epoch": 2.962815942968244, + "grad_norm": 0.06857075542211533, + "learning_rate": 5.762185516899951e-05, + "loss": 0.2755, + "step": 36573 + }, + { + "epoch": 2.962896953985742, + "grad_norm": 0.07066106796264648, + "learning_rate": 5.761735451640489e-05, + "loss": 0.2654, + "step": 36574 + }, + { + "epoch": 2.9629779650032404, + "grad_norm": 0.06204492971301079, + "learning_rate": 5.7612853863810254e-05, + "loss": 0.2247, + "step": 36575 + }, + { + "epoch": 2.963058976020739, + "grad_norm": 0.06969764828681946, + "learning_rate": 5.760835321121563e-05, + "loss": 0.2691, + "step": 36576 + }, + { + "epoch": 2.9631399870382373, + "grad_norm": 0.05596327409148216, + "learning_rate": 5.760385255862101e-05, + "loss": 0.2097, + "step": 36577 + }, + { + "epoch": 2.9632209980557356, + "grad_norm": 0.06442490965127945, + "learning_rate": 5.7599351906026375e-05, + "loss": 0.2236, + "step": 36578 + }, + { + "epoch": 2.963302009073234, + "grad_norm": 0.07338786125183105, + "learning_rate": 5.759485125343175e-05, + "loss": 0.2406, + "step": 36579 + }, + { + "epoch": 2.963383020090732, + "grad_norm": 0.07324231415987015, + "learning_rate": 5.759035060083713e-05, + "loss": 0.2234, + "step": 36580 + }, + { + "epoch": 2.9634640311082308, + "grad_norm": 0.061953090131282806, + "learning_rate": 5.7585849948242496e-05, + "loss": 0.201, + "step": 36581 + }, + { + "epoch": 2.963545042125729, + "grad_norm": 0.06300554424524307, + "learning_rate": 5.758134929564787e-05, + "loss": 0.2068, + "step": 36582 + }, + { + "epoch": 2.9636260531432272, + "grad_norm": 0.0651894062757492, + "learning_rate": 5.757684864305325e-05, + "loss": 0.2519, + "step": 36583 + }, + { + "epoch": 2.963707064160726, + "grad_norm": 0.0788087248802185, + "learning_rate": 5.757234799045862e-05, + "loss": 0.2195, + "step": 36584 + }, + { + "epoch": 2.963788075178224, + "grad_norm": 0.07214096933603287, + "learning_rate": 5.756784733786399e-05, + "loss": 0.2267, + "step": 36585 + }, + { + "epoch": 2.9638690861957224, + "grad_norm": 0.05137092247605324, + "learning_rate": 5.756334668526937e-05, + "loss": 0.2019, + "step": 36586 + }, + { + "epoch": 2.963950097213221, + "grad_norm": 0.08676977455615997, + "learning_rate": 5.755884603267474e-05, + "loss": 0.248, + "step": 36587 + }, + { + "epoch": 2.9640311082307194, + "grad_norm": 0.07802512496709824, + "learning_rate": 5.755434538008011e-05, + "loss": 0.2724, + "step": 36588 + }, + { + "epoch": 2.9641121192482176, + "grad_norm": 0.0753769502043724, + "learning_rate": 5.754984472748549e-05, + "loss": 0.2109, + "step": 36589 + }, + { + "epoch": 2.9641931302657163, + "grad_norm": 0.07928132265806198, + "learning_rate": 5.754534407489086e-05, + "loss": 0.2555, + "step": 36590 + }, + { + "epoch": 2.9642741412832145, + "grad_norm": 0.08000775426626205, + "learning_rate": 5.754084342229623e-05, + "loss": 0.2267, + "step": 36591 + }, + { + "epoch": 2.964355152300713, + "grad_norm": 
0.0798909068107605, + "learning_rate": 5.753634276970161e-05, + "loss": 0.2471, + "step": 36592 + }, + { + "epoch": 2.9644361633182115, + "grad_norm": 0.08604051917791367, + "learning_rate": 5.753184211710698e-05, + "loss": 0.2207, + "step": 36593 + }, + { + "epoch": 2.9645171743357097, + "grad_norm": 0.06759706139564514, + "learning_rate": 5.752734146451235e-05, + "loss": 0.2117, + "step": 36594 + }, + { + "epoch": 2.964598185353208, + "grad_norm": 0.06823612749576569, + "learning_rate": 5.752284081191773e-05, + "loss": 0.2091, + "step": 36595 + }, + { + "epoch": 2.9646791963707066, + "grad_norm": 0.05693601071834564, + "learning_rate": 5.75183401593231e-05, + "loss": 0.223, + "step": 36596 + }, + { + "epoch": 2.964760207388205, + "grad_norm": 0.07944602519273758, + "learning_rate": 5.7513839506728474e-05, + "loss": 0.2785, + "step": 36597 + }, + { + "epoch": 2.964841218405703, + "grad_norm": 0.08167745918035507, + "learning_rate": 5.7509338854133854e-05, + "loss": 0.2245, + "step": 36598 + }, + { + "epoch": 2.964922229423202, + "grad_norm": 0.08227448165416718, + "learning_rate": 5.750483820153922e-05, + "loss": 0.2542, + "step": 36599 + }, + { + "epoch": 2.9650032404407, + "grad_norm": 0.05877317115664482, + "learning_rate": 5.750033754894461e-05, + "loss": 0.2076, + "step": 36600 + }, + { + "epoch": 2.9650842514581983, + "grad_norm": 0.0726679190993309, + "learning_rate": 5.7495836896349975e-05, + "loss": 0.2218, + "step": 36601 + }, + { + "epoch": 2.9651652624756966, + "grad_norm": 0.0621146522462368, + "learning_rate": 5.749133624375534e-05, + "loss": 0.2346, + "step": 36602 + }, + { + "epoch": 2.965246273493195, + "grad_norm": 0.08478382229804993, + "learning_rate": 5.748683559116073e-05, + "loss": 0.2481, + "step": 36603 + }, + { + "epoch": 2.9653272845106935, + "grad_norm": 0.06537126004695892, + "learning_rate": 5.7482334938566096e-05, + "loss": 0.2317, + "step": 36604 + }, + { + "epoch": 2.9654082955281917, + "grad_norm": 0.06782863289117813, + "learning_rate": 5.747783428597147e-05, + "loss": 0.1949, + "step": 36605 + }, + { + "epoch": 2.96548930654569, + "grad_norm": 0.0709555372595787, + "learning_rate": 5.747333363337685e-05, + "loss": 0.2361, + "step": 36606 + }, + { + "epoch": 2.9655703175631887, + "grad_norm": 0.06789838522672653, + "learning_rate": 5.7468832980782216e-05, + "loss": 0.2231, + "step": 36607 + }, + { + "epoch": 2.965651328580687, + "grad_norm": 0.07288671284914017, + "learning_rate": 5.746433232818759e-05, + "loss": 0.2363, + "step": 36608 + }, + { + "epoch": 2.965732339598185, + "grad_norm": 0.060141559690237045, + "learning_rate": 5.745983167559297e-05, + "loss": 0.215, + "step": 36609 + }, + { + "epoch": 2.965813350615684, + "grad_norm": 0.05655485391616821, + "learning_rate": 5.745533102299834e-05, + "loss": 0.2373, + "step": 36610 + }, + { + "epoch": 2.965894361633182, + "grad_norm": 0.07940292358398438, + "learning_rate": 5.745083037040371e-05, + "loss": 0.2613, + "step": 36611 + }, + { + "epoch": 2.9659753726506803, + "grad_norm": 0.06751307100057602, + "learning_rate": 5.744632971780909e-05, + "loss": 0.2073, + "step": 36612 + }, + { + "epoch": 2.966056383668179, + "grad_norm": 0.06888186186552048, + "learning_rate": 5.744182906521446e-05, + "loss": 0.249, + "step": 36613 + }, + { + "epoch": 2.9661373946856773, + "grad_norm": 0.06600555032491684, + "learning_rate": 5.743732841261983e-05, + "loss": 0.2299, + "step": 36614 + }, + { + "epoch": 2.9662184057031755, + "grad_norm": 0.07247529923915863, + "learning_rate": 5.743282776002521e-05, + "loss": 
0.2191, + "step": 36615 + }, + { + "epoch": 2.966299416720674, + "grad_norm": 0.06783980131149292, + "learning_rate": 5.742832710743058e-05, + "loss": 0.2469, + "step": 36616 + }, + { + "epoch": 2.9663804277381725, + "grad_norm": 0.05341443419456482, + "learning_rate": 5.742382645483595e-05, + "loss": 0.2058, + "step": 36617 + }, + { + "epoch": 2.9664614387556707, + "grad_norm": 0.06809686124324799, + "learning_rate": 5.741932580224133e-05, + "loss": 0.2443, + "step": 36618 + }, + { + "epoch": 2.9665424497731694, + "grad_norm": 0.08757691085338593, + "learning_rate": 5.74148251496467e-05, + "loss": 0.2596, + "step": 36619 + }, + { + "epoch": 2.9666234607906676, + "grad_norm": 0.07357107102870941, + "learning_rate": 5.741032449705207e-05, + "loss": 0.2223, + "step": 36620 + }, + { + "epoch": 2.966704471808166, + "grad_norm": 0.07046817988157272, + "learning_rate": 5.7405823844457454e-05, + "loss": 0.238, + "step": 36621 + }, + { + "epoch": 2.9667854828256646, + "grad_norm": 0.07757671922445297, + "learning_rate": 5.740132319186282e-05, + "loss": 0.2575, + "step": 36622 + }, + { + "epoch": 2.966866493843163, + "grad_norm": 0.09322373569011688, + "learning_rate": 5.7396822539268194e-05, + "loss": 0.2306, + "step": 36623 + }, + { + "epoch": 2.966947504860661, + "grad_norm": 0.07821787148714066, + "learning_rate": 5.7392321886673574e-05, + "loss": 0.2679, + "step": 36624 + }, + { + "epoch": 2.9670285158781593, + "grad_norm": 0.06471216678619385, + "learning_rate": 5.738782123407894e-05, + "loss": 0.2387, + "step": 36625 + }, + { + "epoch": 2.9671095268956575, + "grad_norm": 0.0634206086397171, + "learning_rate": 5.738332058148432e-05, + "loss": 0.2094, + "step": 36626 + }, + { + "epoch": 2.9671905379131562, + "grad_norm": 0.054617397487163544, + "learning_rate": 5.7378819928889695e-05, + "loss": 0.2139, + "step": 36627 + }, + { + "epoch": 2.9672715489306545, + "grad_norm": 0.0798972025513649, + "learning_rate": 5.737431927629506e-05, + "loss": 0.2662, + "step": 36628 + }, + { + "epoch": 2.9673525599481527, + "grad_norm": 0.0739915519952774, + "learning_rate": 5.736981862370044e-05, + "loss": 0.2616, + "step": 36629 + }, + { + "epoch": 2.9674335709656514, + "grad_norm": 0.07516808062791824, + "learning_rate": 5.7365317971105816e-05, + "loss": 0.2412, + "step": 36630 + }, + { + "epoch": 2.9675145819831497, + "grad_norm": 0.07437235862016678, + "learning_rate": 5.736081731851118e-05, + "loss": 0.254, + "step": 36631 + }, + { + "epoch": 2.967595593000648, + "grad_norm": 0.0826960951089859, + "learning_rate": 5.735631666591656e-05, + "loss": 0.2511, + "step": 36632 + }, + { + "epoch": 2.9676766040181466, + "grad_norm": 0.08840584754943848, + "learning_rate": 5.735181601332194e-05, + "loss": 0.2449, + "step": 36633 + }, + { + "epoch": 2.967757615035645, + "grad_norm": 0.07088325917720795, + "learning_rate": 5.7347315360727304e-05, + "loss": 0.2315, + "step": 36634 + }, + { + "epoch": 2.967838626053143, + "grad_norm": 0.07013159245252609, + "learning_rate": 5.7342814708132684e-05, + "loss": 0.2712, + "step": 36635 + }, + { + "epoch": 2.9679196370706418, + "grad_norm": 0.06038451939821243, + "learning_rate": 5.733831405553806e-05, + "loss": 0.1965, + "step": 36636 + }, + { + "epoch": 2.96800064808814, + "grad_norm": 0.0711541622877121, + "learning_rate": 5.7333813402943424e-05, + "loss": 0.2286, + "step": 36637 + }, + { + "epoch": 2.9680816591056383, + "grad_norm": 0.0641595646739006, + "learning_rate": 5.7329312750348805e-05, + "loss": 0.1963, + "step": 36638 + }, + { + "epoch": 2.968162670123137, + 
"grad_norm": 0.05820443481206894, + "learning_rate": 5.732481209775418e-05, + "loss": 0.2042, + "step": 36639 + }, + { + "epoch": 2.968243681140635, + "grad_norm": 0.0708758533000946, + "learning_rate": 5.7320311445159545e-05, + "loss": 0.2332, + "step": 36640 + }, + { + "epoch": 2.9683246921581334, + "grad_norm": 0.06513377279043198, + "learning_rate": 5.7315810792564926e-05, + "loss": 0.1954, + "step": 36641 + }, + { + "epoch": 2.968405703175632, + "grad_norm": 0.0671873390674591, + "learning_rate": 5.73113101399703e-05, + "loss": 0.2437, + "step": 36642 + }, + { + "epoch": 2.9684867141931304, + "grad_norm": 0.07724633067846298, + "learning_rate": 5.7306809487375666e-05, + "loss": 0.2359, + "step": 36643 + }, + { + "epoch": 2.9685677252106286, + "grad_norm": 0.07575727999210358, + "learning_rate": 5.7302308834781046e-05, + "loss": 0.2281, + "step": 36644 + }, + { + "epoch": 2.968648736228127, + "grad_norm": 0.06524080783128738, + "learning_rate": 5.729780818218642e-05, + "loss": 0.2341, + "step": 36645 + }, + { + "epoch": 2.9687297472456255, + "grad_norm": 0.06586866080760956, + "learning_rate": 5.729330752959179e-05, + "loss": 0.2305, + "step": 36646 + }, + { + "epoch": 2.968810758263124, + "grad_norm": 0.057113513350486755, + "learning_rate": 5.728880687699717e-05, + "loss": 0.205, + "step": 36647 + }, + { + "epoch": 2.968891769280622, + "grad_norm": 0.07440678775310516, + "learning_rate": 5.728430622440254e-05, + "loss": 0.2353, + "step": 36648 + }, + { + "epoch": 2.9689727802981203, + "grad_norm": 0.07313182204961777, + "learning_rate": 5.727980557180791e-05, + "loss": 0.2262, + "step": 36649 + }, + { + "epoch": 2.969053791315619, + "grad_norm": 0.06670592725276947, + "learning_rate": 5.727530491921329e-05, + "loss": 0.2014, + "step": 36650 + }, + { + "epoch": 2.969134802333117, + "grad_norm": 0.06971270591020584, + "learning_rate": 5.727080426661866e-05, + "loss": 0.2418, + "step": 36651 + }, + { + "epoch": 2.9692158133506155, + "grad_norm": 0.07573027163743973, + "learning_rate": 5.726630361402404e-05, + "loss": 0.2379, + "step": 36652 + }, + { + "epoch": 2.969296824368114, + "grad_norm": 0.07461774349212646, + "learning_rate": 5.726180296142941e-05, + "loss": 0.2143, + "step": 36653 + }, + { + "epoch": 2.9693778353856124, + "grad_norm": 0.0802450180053711, + "learning_rate": 5.725730230883478e-05, + "loss": 0.2163, + "step": 36654 + }, + { + "epoch": 2.9694588464031106, + "grad_norm": 0.0737207680940628, + "learning_rate": 5.725280165624016e-05, + "loss": 0.303, + "step": 36655 + }, + { + "epoch": 2.9695398574206093, + "grad_norm": 0.07518462836742401, + "learning_rate": 5.724830100364553e-05, + "loss": 0.2254, + "step": 36656 + }, + { + "epoch": 2.9696208684381076, + "grad_norm": 0.06999009102582932, + "learning_rate": 5.72438003510509e-05, + "loss": 0.2647, + "step": 36657 + }, + { + "epoch": 2.969701879455606, + "grad_norm": 0.0705016627907753, + "learning_rate": 5.7239299698456284e-05, + "loss": 0.2446, + "step": 36658 + }, + { + "epoch": 2.9697828904731045, + "grad_norm": 0.07972300052642822, + "learning_rate": 5.723479904586165e-05, + "loss": 0.2184, + "step": 36659 + }, + { + "epoch": 2.9698639014906028, + "grad_norm": 0.07280859351158142, + "learning_rate": 5.7230298393267024e-05, + "loss": 0.25, + "step": 36660 + }, + { + "epoch": 2.969944912508101, + "grad_norm": 0.07672141492366791, + "learning_rate": 5.7225797740672404e-05, + "loss": 0.2497, + "step": 36661 + }, + { + "epoch": 2.9700259235255997, + "grad_norm": 0.06514837592840195, + "learning_rate": 
5.722129708807777e-05, + "loss": 0.1916, + "step": 36662 + }, + { + "epoch": 2.970106934543098, + "grad_norm": 0.08025524020195007, + "learning_rate": 5.7216796435483145e-05, + "loss": 0.229, + "step": 36663 + }, + { + "epoch": 2.970187945560596, + "grad_norm": 0.06459321826696396, + "learning_rate": 5.7212295782888525e-05, + "loss": 0.2116, + "step": 36664 + }, + { + "epoch": 2.970268956578095, + "grad_norm": 0.06459393352270126, + "learning_rate": 5.720779513029389e-05, + "loss": 0.2322, + "step": 36665 + }, + { + "epoch": 2.970349967595593, + "grad_norm": 0.0697956532239914, + "learning_rate": 5.7203294477699266e-05, + "loss": 0.2363, + "step": 36666 + }, + { + "epoch": 2.9704309786130914, + "grad_norm": 0.07393402606248856, + "learning_rate": 5.7198793825104646e-05, + "loss": 0.2411, + "step": 36667 + }, + { + "epoch": 2.9705119896305896, + "grad_norm": 0.05831243842840195, + "learning_rate": 5.719429317251001e-05, + "loss": 0.219, + "step": 36668 + }, + { + "epoch": 2.9705930006480883, + "grad_norm": 0.06987475603818893, + "learning_rate": 5.7189792519915386e-05, + "loss": 0.256, + "step": 36669 + }, + { + "epoch": 2.9706740116655865, + "grad_norm": 0.06129377707839012, + "learning_rate": 5.718529186732077e-05, + "loss": 0.2088, + "step": 36670 + }, + { + "epoch": 2.970755022683085, + "grad_norm": 0.06800404191017151, + "learning_rate": 5.718079121472614e-05, + "loss": 0.2267, + "step": 36671 + }, + { + "epoch": 2.970836033700583, + "grad_norm": 0.05925722420215607, + "learning_rate": 5.717629056213151e-05, + "loss": 0.1863, + "step": 36672 + }, + { + "epoch": 2.9709170447180817, + "grad_norm": 0.06941638886928558, + "learning_rate": 5.717178990953689e-05, + "loss": 0.2214, + "step": 36673 + }, + { + "epoch": 2.97099805573558, + "grad_norm": 0.07604862004518509, + "learning_rate": 5.716728925694226e-05, + "loss": 0.2401, + "step": 36674 + }, + { + "epoch": 2.971079066753078, + "grad_norm": 0.066129170358181, + "learning_rate": 5.716278860434763e-05, + "loss": 0.2286, + "step": 36675 + }, + { + "epoch": 2.971160077770577, + "grad_norm": 0.07583235949277878, + "learning_rate": 5.715828795175301e-05, + "loss": 0.2284, + "step": 36676 + }, + { + "epoch": 2.971241088788075, + "grad_norm": 0.06899145990610123, + "learning_rate": 5.715378729915838e-05, + "loss": 0.2332, + "step": 36677 + }, + { + "epoch": 2.9713220998055734, + "grad_norm": 0.058244358748197556, + "learning_rate": 5.714928664656376e-05, + "loss": 0.2607, + "step": 36678 + }, + { + "epoch": 2.971403110823072, + "grad_norm": 0.0775933712720871, + "learning_rate": 5.714478599396913e-05, + "loss": 0.227, + "step": 36679 + }, + { + "epoch": 2.9714841218405703, + "grad_norm": 0.08306246995925903, + "learning_rate": 5.71402853413745e-05, + "loss": 0.2446, + "step": 36680 + }, + { + "epoch": 2.9715651328580686, + "grad_norm": 0.0686577782034874, + "learning_rate": 5.713578468877988e-05, + "loss": 0.2276, + "step": 36681 + }, + { + "epoch": 2.9716461438755672, + "grad_norm": 0.05848119035363197, + "learning_rate": 5.713128403618525e-05, + "loss": 0.2267, + "step": 36682 + }, + { + "epoch": 2.9717271548930655, + "grad_norm": 0.06222635135054588, + "learning_rate": 5.7126783383590624e-05, + "loss": 0.2205, + "step": 36683 + }, + { + "epoch": 2.9718081659105637, + "grad_norm": 0.08012259751558304, + "learning_rate": 5.7122282730996004e-05, + "loss": 0.2364, + "step": 36684 + }, + { + "epoch": 2.9718891769280624, + "grad_norm": 0.07313792407512665, + "learning_rate": 5.711778207840137e-05, + "loss": 0.2682, + "step": 36685 + }, + { + 
"epoch": 2.9719701879455607, + "grad_norm": 0.08533124625682831, + "learning_rate": 5.7113281425806745e-05, + "loss": 0.2391, + "step": 36686 + }, + { + "epoch": 2.972051198963059, + "grad_norm": 0.07785817980766296, + "learning_rate": 5.7108780773212125e-05, + "loss": 0.265, + "step": 36687 + }, + { + "epoch": 2.9721322099805576, + "grad_norm": 0.05892932415008545, + "learning_rate": 5.710428012061749e-05, + "loss": 0.2122, + "step": 36688 + }, + { + "epoch": 2.972213220998056, + "grad_norm": 0.06475520133972168, + "learning_rate": 5.7099779468022865e-05, + "loss": 0.2353, + "step": 36689 + }, + { + "epoch": 2.972294232015554, + "grad_norm": 0.06920190155506134, + "learning_rate": 5.7095278815428246e-05, + "loss": 0.238, + "step": 36690 + }, + { + "epoch": 2.9723752430330523, + "grad_norm": 0.05841432139277458, + "learning_rate": 5.709077816283361e-05, + "loss": 0.2313, + "step": 36691 + }, + { + "epoch": 2.972456254050551, + "grad_norm": 0.06791935116052628, + "learning_rate": 5.7086277510238986e-05, + "loss": 0.2221, + "step": 36692 + }, + { + "epoch": 2.9725372650680493, + "grad_norm": 0.0724581778049469, + "learning_rate": 5.7081776857644367e-05, + "loss": 0.237, + "step": 36693 + }, + { + "epoch": 2.9726182760855475, + "grad_norm": 0.08021465688943863, + "learning_rate": 5.707727620504973e-05, + "loss": 0.2457, + "step": 36694 + }, + { + "epoch": 2.9726992871030458, + "grad_norm": 0.06262336671352386, + "learning_rate": 5.707277555245511e-05, + "loss": 0.2763, + "step": 36695 + }, + { + "epoch": 2.9727802981205445, + "grad_norm": 0.0640406459569931, + "learning_rate": 5.706827489986049e-05, + "loss": 0.2138, + "step": 36696 + }, + { + "epoch": 2.9728613091380427, + "grad_norm": 0.06408407539129257, + "learning_rate": 5.7063774247265854e-05, + "loss": 0.2061, + "step": 36697 + }, + { + "epoch": 2.972942320155541, + "grad_norm": 0.06960425525903702, + "learning_rate": 5.705927359467123e-05, + "loss": 0.2182, + "step": 36698 + }, + { + "epoch": 2.9730233311730396, + "grad_norm": 0.05899034067988396, + "learning_rate": 5.705477294207661e-05, + "loss": 0.2301, + "step": 36699 + }, + { + "epoch": 2.973104342190538, + "grad_norm": 0.07200396806001663, + "learning_rate": 5.7050272289481975e-05, + "loss": 0.2169, + "step": 36700 + }, + { + "epoch": 2.973185353208036, + "grad_norm": 0.06714487820863724, + "learning_rate": 5.704577163688735e-05, + "loss": 0.2566, + "step": 36701 + }, + { + "epoch": 2.973266364225535, + "grad_norm": 0.07260193675756454, + "learning_rate": 5.704127098429273e-05, + "loss": 0.235, + "step": 36702 + }, + { + "epoch": 2.973347375243033, + "grad_norm": 0.0751718133687973, + "learning_rate": 5.7036770331698096e-05, + "loss": 0.2409, + "step": 36703 + }, + { + "epoch": 2.9734283862605313, + "grad_norm": 0.06625243276357651, + "learning_rate": 5.703226967910347e-05, + "loss": 0.2252, + "step": 36704 + }, + { + "epoch": 2.97350939727803, + "grad_norm": 0.07275108993053436, + "learning_rate": 5.702776902650885e-05, + "loss": 0.2133, + "step": 36705 + }, + { + "epoch": 2.9735904082955282, + "grad_norm": 0.08456332236528397, + "learning_rate": 5.7023268373914217e-05, + "loss": 0.2169, + "step": 36706 + }, + { + "epoch": 2.9736714193130265, + "grad_norm": 0.052250731736421585, + "learning_rate": 5.70187677213196e-05, + "loss": 0.2133, + "step": 36707 + }, + { + "epoch": 2.973752430330525, + "grad_norm": 0.0629974752664566, + "learning_rate": 5.701426706872497e-05, + "loss": 0.2495, + "step": 36708 + }, + { + "epoch": 2.9738334413480234, + "grad_norm": 0.07221046835184097, + 
"learning_rate": 5.700976641613034e-05, + "loss": 0.235, + "step": 36709 + }, + { + "epoch": 2.9739144523655217, + "grad_norm": 0.08091980218887329, + "learning_rate": 5.700526576353572e-05, + "loss": 0.2234, + "step": 36710 + }, + { + "epoch": 2.9739954633830203, + "grad_norm": 0.07329240441322327, + "learning_rate": 5.700076511094109e-05, + "loss": 0.2328, + "step": 36711 + }, + { + "epoch": 2.9740764744005186, + "grad_norm": 0.06695925444364548, + "learning_rate": 5.699626445834646e-05, + "loss": 0.2266, + "step": 36712 + }, + { + "epoch": 2.974157485418017, + "grad_norm": 0.07289384305477142, + "learning_rate": 5.699176380575184e-05, + "loss": 0.2424, + "step": 36713 + }, + { + "epoch": 2.974238496435515, + "grad_norm": 0.0600985512137413, + "learning_rate": 5.698726315315721e-05, + "loss": 0.2145, + "step": 36714 + }, + { + "epoch": 2.9743195074530138, + "grad_norm": 0.06750945746898651, + "learning_rate": 5.698276250056258e-05, + "loss": 0.2451, + "step": 36715 + }, + { + "epoch": 2.974400518470512, + "grad_norm": 0.09045244753360748, + "learning_rate": 5.697826184796796e-05, + "loss": 0.2417, + "step": 36716 + }, + { + "epoch": 2.9744815294880103, + "grad_norm": 0.07236456125974655, + "learning_rate": 5.697376119537333e-05, + "loss": 0.2006, + "step": 36717 + }, + { + "epoch": 2.9745625405055085, + "grad_norm": 0.06211559846997261, + "learning_rate": 5.69692605427787e-05, + "loss": 0.223, + "step": 36718 + }, + { + "epoch": 2.974643551523007, + "grad_norm": 0.07024736702442169, + "learning_rate": 5.696475989018408e-05, + "loss": 0.2447, + "step": 36719 + }, + { + "epoch": 2.9747245625405054, + "grad_norm": 0.0769864022731781, + "learning_rate": 5.6960259237589454e-05, + "loss": 0.2559, + "step": 36720 + }, + { + "epoch": 2.9748055735580037, + "grad_norm": 0.06864888221025467, + "learning_rate": 5.695575858499482e-05, + "loss": 0.2039, + "step": 36721 + }, + { + "epoch": 2.9748865845755024, + "grad_norm": 0.06834088265895844, + "learning_rate": 5.69512579324002e-05, + "loss": 0.2201, + "step": 36722 + }, + { + "epoch": 2.9749675955930006, + "grad_norm": 0.0637509822845459, + "learning_rate": 5.6946757279805575e-05, + "loss": 0.2219, + "step": 36723 + }, + { + "epoch": 2.975048606610499, + "grad_norm": 0.075530506670475, + "learning_rate": 5.694225662721094e-05, + "loss": 0.223, + "step": 36724 + }, + { + "epoch": 2.9751296176279975, + "grad_norm": 0.09860153496265411, + "learning_rate": 5.693775597461632e-05, + "loss": 0.2255, + "step": 36725 + }, + { + "epoch": 2.975210628645496, + "grad_norm": 0.07422681152820587, + "learning_rate": 5.6933255322021695e-05, + "loss": 0.2502, + "step": 36726 + }, + { + "epoch": 2.975291639662994, + "grad_norm": 0.06554101407527924, + "learning_rate": 5.692875466942706e-05, + "loss": 0.2168, + "step": 36727 + }, + { + "epoch": 2.9753726506804927, + "grad_norm": 0.05854536220431328, + "learning_rate": 5.692425401683244e-05, + "loss": 0.2171, + "step": 36728 + }, + { + "epoch": 2.975453661697991, + "grad_norm": 0.07138527929782867, + "learning_rate": 5.6919753364237816e-05, + "loss": 0.2234, + "step": 36729 + }, + { + "epoch": 2.975534672715489, + "grad_norm": 0.06646374613046646, + "learning_rate": 5.691525271164318e-05, + "loss": 0.2106, + "step": 36730 + }, + { + "epoch": 2.975615683732988, + "grad_norm": 0.061085499823093414, + "learning_rate": 5.6910752059048563e-05, + "loss": 0.2047, + "step": 36731 + }, + { + "epoch": 2.975696694750486, + "grad_norm": 0.07485700398683548, + "learning_rate": 5.690625140645394e-05, + "loss": 0.2248, + "step": 36732 
+ }, + { + "epoch": 2.9757777057679844, + "grad_norm": 0.05422642081975937, + "learning_rate": 5.690175075385932e-05, + "loss": 0.2411, + "step": 36733 + }, + { + "epoch": 2.975858716785483, + "grad_norm": 0.06741956621408463, + "learning_rate": 5.6897250101264684e-05, + "loss": 0.2408, + "step": 36734 + }, + { + "epoch": 2.9759397278029813, + "grad_norm": 0.05959073826670647, + "learning_rate": 5.689274944867006e-05, + "loss": 0.232, + "step": 36735 + }, + { + "epoch": 2.9760207388204796, + "grad_norm": 0.08108634501695633, + "learning_rate": 5.688824879607544e-05, + "loss": 0.2572, + "step": 36736 + }, + { + "epoch": 2.976101749837978, + "grad_norm": 0.07710261642932892, + "learning_rate": 5.6883748143480805e-05, + "loss": 0.2616, + "step": 36737 + }, + { + "epoch": 2.9761827608554765, + "grad_norm": 0.05549994856119156, + "learning_rate": 5.687924749088618e-05, + "loss": 0.1959, + "step": 36738 + }, + { + "epoch": 2.9762637718729748, + "grad_norm": 0.07519969344139099, + "learning_rate": 5.687474683829156e-05, + "loss": 0.2243, + "step": 36739 + }, + { + "epoch": 2.976344782890473, + "grad_norm": 0.06337859481573105, + "learning_rate": 5.687024618569693e-05, + "loss": 0.2568, + "step": 36740 + }, + { + "epoch": 2.9764257939079712, + "grad_norm": 0.07228632271289825, + "learning_rate": 5.68657455331023e-05, + "loss": 0.2505, + "step": 36741 + }, + { + "epoch": 2.97650680492547, + "grad_norm": 0.07821758836507797, + "learning_rate": 5.686124488050768e-05, + "loss": 0.2524, + "step": 36742 + }, + { + "epoch": 2.976587815942968, + "grad_norm": 0.0661669373512268, + "learning_rate": 5.6856744227913053e-05, + "loss": 0.2145, + "step": 36743 + }, + { + "epoch": 2.9766688269604664, + "grad_norm": 0.04832182824611664, + "learning_rate": 5.685224357531842e-05, + "loss": 0.215, + "step": 36744 + }, + { + "epoch": 2.976749837977965, + "grad_norm": 0.07201585918664932, + "learning_rate": 5.68477429227238e-05, + "loss": 0.245, + "step": 36745 + }, + { + "epoch": 2.9768308489954634, + "grad_norm": 0.06247623264789581, + "learning_rate": 5.6843242270129174e-05, + "loss": 0.2146, + "step": 36746 + }, + { + "epoch": 2.9769118600129616, + "grad_norm": 0.06249498575925827, + "learning_rate": 5.683874161753454e-05, + "loss": 0.2188, + "step": 36747 + }, + { + "epoch": 2.9769928710304603, + "grad_norm": 0.0698925331234932, + "learning_rate": 5.683424096493992e-05, + "loss": 0.2434, + "step": 36748 + }, + { + "epoch": 2.9770738820479585, + "grad_norm": 0.06198454648256302, + "learning_rate": 5.6829740312345295e-05, + "loss": 0.2281, + "step": 36749 + }, + { + "epoch": 2.9771548930654568, + "grad_norm": 0.07053163647651672, + "learning_rate": 5.682523965975066e-05, + "loss": 0.248, + "step": 36750 + }, + { + "epoch": 2.9772359040829555, + "grad_norm": 0.07296600192785263, + "learning_rate": 5.682073900715604e-05, + "loss": 0.2231, + "step": 36751 + }, + { + "epoch": 2.9773169151004537, + "grad_norm": 0.0734759047627449, + "learning_rate": 5.6816238354561416e-05, + "loss": 0.222, + "step": 36752 + }, + { + "epoch": 2.977397926117952, + "grad_norm": 0.06778737157583237, + "learning_rate": 5.681173770196678e-05, + "loss": 0.2413, + "step": 36753 + }, + { + "epoch": 2.9774789371354506, + "grad_norm": 0.06905605643987656, + "learning_rate": 5.680723704937216e-05, + "loss": 0.2188, + "step": 36754 + }, + { + "epoch": 2.977559948152949, + "grad_norm": 0.07669002562761307, + "learning_rate": 5.680273639677754e-05, + "loss": 0.2266, + "step": 36755 + }, + { + "epoch": 2.977640959170447, + "grad_norm": 
0.0734478309750557, + "learning_rate": 5.6798235744182904e-05, + "loss": 0.2303, + "step": 36756 + }, + { + "epoch": 2.977721970187946, + "grad_norm": 0.0745403915643692, + "learning_rate": 5.6793735091588284e-05, + "loss": 0.2795, + "step": 36757 + }, + { + "epoch": 2.977802981205444, + "grad_norm": 0.0757230594754219, + "learning_rate": 5.678923443899366e-05, + "loss": 0.2217, + "step": 36758 + }, + { + "epoch": 2.9778839922229423, + "grad_norm": 0.09342848509550095, + "learning_rate": 5.678473378639904e-05, + "loss": 0.2421, + "step": 36759 + }, + { + "epoch": 2.9779650032404406, + "grad_norm": 0.07937044650316238, + "learning_rate": 5.6780233133804405e-05, + "loss": 0.224, + "step": 36760 + }, + { + "epoch": 2.9780460142579392, + "grad_norm": 0.06836716085672379, + "learning_rate": 5.677573248120978e-05, + "loss": 0.2351, + "step": 36761 + }, + { + "epoch": 2.9781270252754375, + "grad_norm": 0.05467992275953293, + "learning_rate": 5.677123182861516e-05, + "loss": 0.2282, + "step": 36762 + }, + { + "epoch": 2.9782080362929357, + "grad_norm": 0.08199840039014816, + "learning_rate": 5.6766731176020526e-05, + "loss": 0.2023, + "step": 36763 + }, + { + "epoch": 2.978289047310434, + "grad_norm": 0.06809769570827484, + "learning_rate": 5.67622305234259e-05, + "loss": 0.2468, + "step": 36764 + }, + { + "epoch": 2.9783700583279327, + "grad_norm": 0.06043122336268425, + "learning_rate": 5.675772987083128e-05, + "loss": 0.2184, + "step": 36765 + }, + { + "epoch": 2.978451069345431, + "grad_norm": 0.06579332053661346, + "learning_rate": 5.6753229218236646e-05, + "loss": 0.2366, + "step": 36766 + }, + { + "epoch": 2.978532080362929, + "grad_norm": 0.06955967098474503, + "learning_rate": 5.674872856564202e-05, + "loss": 0.2295, + "step": 36767 + }, + { + "epoch": 2.978613091380428, + "grad_norm": 0.0776662528514862, + "learning_rate": 5.67442279130474e-05, + "loss": 0.1878, + "step": 36768 + }, + { + "epoch": 2.978694102397926, + "grad_norm": 0.06722002476453781, + "learning_rate": 5.673972726045277e-05, + "loss": 0.209, + "step": 36769 + }, + { + "epoch": 2.9787751134154243, + "grad_norm": 0.07205243408679962, + "learning_rate": 5.673522660785814e-05, + "loss": 0.1897, + "step": 36770 + }, + { + "epoch": 2.978856124432923, + "grad_norm": 0.06477699428796768, + "learning_rate": 5.673072595526352e-05, + "loss": 0.2377, + "step": 36771 + }, + { + "epoch": 2.9789371354504213, + "grad_norm": 0.06230941042304039, + "learning_rate": 5.672622530266889e-05, + "loss": 0.2186, + "step": 36772 + }, + { + "epoch": 2.9790181464679195, + "grad_norm": 0.07898541539907455, + "learning_rate": 5.672172465007426e-05, + "loss": 0.2367, + "step": 36773 + }, + { + "epoch": 2.979099157485418, + "grad_norm": 0.11465831845998764, + "learning_rate": 5.671722399747964e-05, + "loss": 0.2297, + "step": 36774 + }, + { + "epoch": 2.9791801685029164, + "grad_norm": 0.07188776135444641, + "learning_rate": 5.671272334488501e-05, + "loss": 0.2447, + "step": 36775 + }, + { + "epoch": 2.9792611795204147, + "grad_norm": 0.07775386422872543, + "learning_rate": 5.670822269229038e-05, + "loss": 0.2155, + "step": 36776 + }, + { + "epoch": 2.9793421905379134, + "grad_norm": 0.08862534910440445, + "learning_rate": 5.670372203969576e-05, + "loss": 0.2713, + "step": 36777 + }, + { + "epoch": 2.9794232015554116, + "grad_norm": 0.06923260539770126, + "learning_rate": 5.669922138710113e-05, + "loss": 0.2396, + "step": 36778 + }, + { + "epoch": 2.97950421257291, + "grad_norm": 0.07492440193891525, + "learning_rate": 5.66947207345065e-05, + "loss": 
0.2408, + "step": 36779 + }, + { + "epoch": 2.9795852235904086, + "grad_norm": 0.0657166913151741, + "learning_rate": 5.6690220081911884e-05, + "loss": 0.2263, + "step": 36780 + }, + { + "epoch": 2.979666234607907, + "grad_norm": 0.06350436806678772, + "learning_rate": 5.668571942931725e-05, + "loss": 0.2342, + "step": 36781 + }, + { + "epoch": 2.979747245625405, + "grad_norm": 0.07210227102041245, + "learning_rate": 5.6681218776722624e-05, + "loss": 0.2376, + "step": 36782 + }, + { + "epoch": 2.9798282566429033, + "grad_norm": 0.08657340705394745, + "learning_rate": 5.6676718124128004e-05, + "loss": 0.2338, + "step": 36783 + }, + { + "epoch": 2.9799092676604015, + "grad_norm": 0.06363702565431595, + "learning_rate": 5.667221747153337e-05, + "loss": 0.2116, + "step": 36784 + }, + { + "epoch": 2.9799902786779002, + "grad_norm": 0.07759935408830643, + "learning_rate": 5.666771681893875e-05, + "loss": 0.2377, + "step": 36785 + }, + { + "epoch": 2.9800712896953985, + "grad_norm": 0.07095187157392502, + "learning_rate": 5.6663216166344125e-05, + "loss": 0.2431, + "step": 36786 + }, + { + "epoch": 2.9801523007128967, + "grad_norm": 0.07811301946640015, + "learning_rate": 5.665871551374949e-05, + "loss": 0.2804, + "step": 36787 + }, + { + "epoch": 2.9802333117303954, + "grad_norm": 0.0775509923696518, + "learning_rate": 5.665421486115487e-05, + "loss": 0.2445, + "step": 36788 + }, + { + "epoch": 2.9803143227478937, + "grad_norm": 0.05990509316325188, + "learning_rate": 5.6649714208560246e-05, + "loss": 0.2156, + "step": 36789 + }, + { + "epoch": 2.980395333765392, + "grad_norm": 0.0601307637989521, + "learning_rate": 5.664521355596561e-05, + "loss": 0.1895, + "step": 36790 + }, + { + "epoch": 2.9804763447828906, + "grad_norm": 0.06881251186132431, + "learning_rate": 5.664071290337099e-05, + "loss": 0.2413, + "step": 36791 + }, + { + "epoch": 2.980557355800389, + "grad_norm": 0.06957107782363892, + "learning_rate": 5.663621225077637e-05, + "loss": 0.2475, + "step": 36792 + }, + { + "epoch": 2.980638366817887, + "grad_norm": 0.07004716247320175, + "learning_rate": 5.6631711598181734e-05, + "loss": 0.2469, + "step": 36793 + }, + { + "epoch": 2.9807193778353858, + "grad_norm": 0.06502577662467957, + "learning_rate": 5.6627210945587114e-05, + "loss": 0.2572, + "step": 36794 + }, + { + "epoch": 2.980800388852884, + "grad_norm": 0.07077474892139435, + "learning_rate": 5.662271029299249e-05, + "loss": 0.2167, + "step": 36795 + }, + { + "epoch": 2.9808813998703823, + "grad_norm": 0.07815693318843842, + "learning_rate": 5.6618209640397854e-05, + "loss": 0.2498, + "step": 36796 + }, + { + "epoch": 2.980962410887881, + "grad_norm": 0.06587857753038406, + "learning_rate": 5.6613708987803235e-05, + "loss": 0.2151, + "step": 36797 + }, + { + "epoch": 2.981043421905379, + "grad_norm": 0.07732000946998596, + "learning_rate": 5.660920833520861e-05, + "loss": 0.2745, + "step": 36798 + }, + { + "epoch": 2.9811244329228774, + "grad_norm": 0.06029903516173363, + "learning_rate": 5.6604707682613975e-05, + "loss": 0.2348, + "step": 36799 + }, + { + "epoch": 2.981205443940376, + "grad_norm": 0.0725325271487236, + "learning_rate": 5.6600207030019356e-05, + "loss": 0.2739, + "step": 36800 + }, + { + "epoch": 2.9812864549578744, + "grad_norm": 0.07531344890594482, + "learning_rate": 5.659570637742473e-05, + "loss": 0.2513, + "step": 36801 + }, + { + "epoch": 2.9813674659753726, + "grad_norm": 0.08409595489501953, + "learning_rate": 5.6591205724830096e-05, + "loss": 0.2855, + "step": 36802 + }, + { + "epoch": 
2.9814484769928713, + "grad_norm": 0.0734337642788887, + "learning_rate": 5.6586705072235476e-05, + "loss": 0.2331, + "step": 36803 + }, + { + "epoch": 2.9815294880103695, + "grad_norm": 0.05683013051748276, + "learning_rate": 5.658220441964085e-05, + "loss": 0.2541, + "step": 36804 + }, + { + "epoch": 2.981610499027868, + "grad_norm": 0.08660989999771118, + "learning_rate": 5.657770376704622e-05, + "loss": 0.2709, + "step": 36805 + }, + { + "epoch": 2.981691510045366, + "grad_norm": 0.0753931775689125, + "learning_rate": 5.65732031144516e-05, + "loss": 0.239, + "step": 36806 + }, + { + "epoch": 2.9817725210628643, + "grad_norm": 0.06659752875566483, + "learning_rate": 5.656870246185697e-05, + "loss": 0.2585, + "step": 36807 + }, + { + "epoch": 2.981853532080363, + "grad_norm": 0.07593297213315964, + "learning_rate": 5.656420180926234e-05, + "loss": 0.2474, + "step": 36808 + }, + { + "epoch": 2.981934543097861, + "grad_norm": 0.06416141241788864, + "learning_rate": 5.6559701156667725e-05, + "loss": 0.2309, + "step": 36809 + }, + { + "epoch": 2.9820155541153595, + "grad_norm": 0.06690444052219391, + "learning_rate": 5.655520050407309e-05, + "loss": 0.254, + "step": 36810 + }, + { + "epoch": 2.982096565132858, + "grad_norm": 0.06408987939357758, + "learning_rate": 5.655069985147847e-05, + "loss": 0.2347, + "step": 36811 + }, + { + "epoch": 2.9821775761503564, + "grad_norm": 0.06503544747829437, + "learning_rate": 5.6546199198883846e-05, + "loss": 0.2096, + "step": 36812 + }, + { + "epoch": 2.9822585871678546, + "grad_norm": 0.06841304898262024, + "learning_rate": 5.654169854628921e-05, + "loss": 0.2502, + "step": 36813 + }, + { + "epoch": 2.9823395981853533, + "grad_norm": 0.0618809312582016, + "learning_rate": 5.653719789369459e-05, + "loss": 0.2488, + "step": 36814 + }, + { + "epoch": 2.9824206092028516, + "grad_norm": 0.09338065981864929, + "learning_rate": 5.6532697241099966e-05, + "loss": 0.2322, + "step": 36815 + }, + { + "epoch": 2.98250162022035, + "grad_norm": 0.07193892449140549, + "learning_rate": 5.652819658850533e-05, + "loss": 0.2355, + "step": 36816 + }, + { + "epoch": 2.9825826312378485, + "grad_norm": 0.07920563220977783, + "learning_rate": 5.6523695935910714e-05, + "loss": 0.2611, + "step": 36817 + }, + { + "epoch": 2.9826636422553467, + "grad_norm": 0.06333600729703903, + "learning_rate": 5.651919528331609e-05, + "loss": 0.1964, + "step": 36818 + }, + { + "epoch": 2.982744653272845, + "grad_norm": 0.06896653026342392, + "learning_rate": 5.6514694630721454e-05, + "loss": 0.2281, + "step": 36819 + }, + { + "epoch": 2.9828256642903437, + "grad_norm": 0.07341278344392776, + "learning_rate": 5.6510193978126834e-05, + "loss": 0.2536, + "step": 36820 + }, + { + "epoch": 2.982906675307842, + "grad_norm": 0.05979539453983307, + "learning_rate": 5.650569332553221e-05, + "loss": 0.2331, + "step": 36821 + }, + { + "epoch": 2.98298768632534, + "grad_norm": 0.07593628764152527, + "learning_rate": 5.6501192672937575e-05, + "loss": 0.2168, + "step": 36822 + }, + { + "epoch": 2.983068697342839, + "grad_norm": 0.06415986269712448, + "learning_rate": 5.6496692020342955e-05, + "loss": 0.2266, + "step": 36823 + }, + { + "epoch": 2.983149708360337, + "grad_norm": 0.0640503317117691, + "learning_rate": 5.649219136774833e-05, + "loss": 0.2336, + "step": 36824 + }, + { + "epoch": 2.9832307193778353, + "grad_norm": 0.06948990374803543, + "learning_rate": 5.6487690715153696e-05, + "loss": 0.2501, + "step": 36825 + }, + { + "epoch": 2.983311730395334, + "grad_norm": 0.05950549244880676, + 
"learning_rate": 5.6483190062559076e-05, + "loss": 0.2023, + "step": 36826 + }, + { + "epoch": 2.9833927414128323, + "grad_norm": 0.06723881512880325, + "learning_rate": 5.647868940996445e-05, + "loss": 0.2492, + "step": 36827 + }, + { + "epoch": 2.9834737524303305, + "grad_norm": 0.07525148242712021, + "learning_rate": 5.6474188757369817e-05, + "loss": 0.2129, + "step": 36828 + }, + { + "epoch": 2.9835547634478288, + "grad_norm": 0.06187017261981964, + "learning_rate": 5.64696881047752e-05, + "loss": 0.2312, + "step": 36829 + }, + { + "epoch": 2.983635774465327, + "grad_norm": 0.06554147601127625, + "learning_rate": 5.646518745218057e-05, + "loss": 0.2335, + "step": 36830 + }, + { + "epoch": 2.9837167854828257, + "grad_norm": 0.08165033906698227, + "learning_rate": 5.646068679958594e-05, + "loss": 0.2255, + "step": 36831 + }, + { + "epoch": 2.983797796500324, + "grad_norm": 0.07512885332107544, + "learning_rate": 5.645618614699132e-05, + "loss": 0.2286, + "step": 36832 + }, + { + "epoch": 2.983878807517822, + "grad_norm": 0.07381787896156311, + "learning_rate": 5.645168549439669e-05, + "loss": 0.2278, + "step": 36833 + }, + { + "epoch": 2.983959818535321, + "grad_norm": 0.05407465249300003, + "learning_rate": 5.644718484180206e-05, + "loss": 0.2133, + "step": 36834 + }, + { + "epoch": 2.984040829552819, + "grad_norm": 0.07097189873456955, + "learning_rate": 5.644268418920744e-05, + "loss": 0.2419, + "step": 36835 + }, + { + "epoch": 2.9841218405703174, + "grad_norm": 0.06315812468528748, + "learning_rate": 5.643818353661281e-05, + "loss": 0.2211, + "step": 36836 + }, + { + "epoch": 2.984202851587816, + "grad_norm": 0.08845272660255432, + "learning_rate": 5.643368288401819e-05, + "loss": 0.2207, + "step": 36837 + }, + { + "epoch": 2.9842838626053143, + "grad_norm": 0.06286734342575073, + "learning_rate": 5.642918223142356e-05, + "loss": 0.2274, + "step": 36838 + }, + { + "epoch": 2.9843648736228126, + "grad_norm": 0.06748202443122864, + "learning_rate": 5.642468157882893e-05, + "loss": 0.2246, + "step": 36839 + }, + { + "epoch": 2.9844458846403112, + "grad_norm": 0.07265258580446243, + "learning_rate": 5.642018092623431e-05, + "loss": 0.2506, + "step": 36840 + }, + { + "epoch": 2.9845268956578095, + "grad_norm": 0.07095389068126678, + "learning_rate": 5.641568027363968e-05, + "loss": 0.2023, + "step": 36841 + }, + { + "epoch": 2.9846079066753077, + "grad_norm": 0.06580625474452972, + "learning_rate": 5.6411179621045054e-05, + "loss": 0.2145, + "step": 36842 + }, + { + "epoch": 2.9846889176928064, + "grad_norm": 0.07952791452407837, + "learning_rate": 5.6406678968450434e-05, + "loss": 0.2439, + "step": 36843 + }, + { + "epoch": 2.9847699287103047, + "grad_norm": 0.06329455226659775, + "learning_rate": 5.64021783158558e-05, + "loss": 0.2118, + "step": 36844 + }, + { + "epoch": 2.984850939727803, + "grad_norm": 0.06945721805095673, + "learning_rate": 5.6397677663261175e-05, + "loss": 0.2085, + "step": 36845 + }, + { + "epoch": 2.9849319507453016, + "grad_norm": 0.06174931675195694, + "learning_rate": 5.6393177010666555e-05, + "loss": 0.2488, + "step": 36846 + }, + { + "epoch": 2.9850129617628, + "grad_norm": 0.0708073079586029, + "learning_rate": 5.638867635807192e-05, + "loss": 0.2367, + "step": 36847 + }, + { + "epoch": 2.985093972780298, + "grad_norm": 0.06488966196775436, + "learning_rate": 5.6384175705477295e-05, + "loss": 0.2631, + "step": 36848 + }, + { + "epoch": 2.9851749837977968, + "grad_norm": 0.06657645851373672, + "learning_rate": 5.6379675052882676e-05, + "loss": 0.2297, + 
"step": 36849 + }, + { + "epoch": 2.985255994815295, + "grad_norm": 0.07657184451818466, + "learning_rate": 5.637517440028804e-05, + "loss": 0.2499, + "step": 36850 + }, + { + "epoch": 2.9853370058327933, + "grad_norm": 0.07391027361154556, + "learning_rate": 5.6370673747693416e-05, + "loss": 0.2909, + "step": 36851 + }, + { + "epoch": 2.9854180168502915, + "grad_norm": 0.08322843164205551, + "learning_rate": 5.6366173095098797e-05, + "loss": 0.2532, + "step": 36852 + }, + { + "epoch": 2.9854990278677898, + "grad_norm": 0.0625251829624176, + "learning_rate": 5.636167244250416e-05, + "loss": 0.2415, + "step": 36853 + }, + { + "epoch": 2.9855800388852884, + "grad_norm": 0.07309753447771072, + "learning_rate": 5.635717178990954e-05, + "loss": 0.2235, + "step": 36854 + }, + { + "epoch": 2.9856610499027867, + "grad_norm": 0.10035616904497147, + "learning_rate": 5.635267113731492e-05, + "loss": 0.2233, + "step": 36855 + }, + { + "epoch": 2.985742060920285, + "grad_norm": 0.06504696607589722, + "learning_rate": 5.6348170484720284e-05, + "loss": 0.2555, + "step": 36856 + }, + { + "epoch": 2.9858230719377836, + "grad_norm": 0.05904727056622505, + "learning_rate": 5.634366983212566e-05, + "loss": 0.209, + "step": 36857 + }, + { + "epoch": 2.985904082955282, + "grad_norm": 0.057460296899080276, + "learning_rate": 5.633916917953104e-05, + "loss": 0.2188, + "step": 36858 + }, + { + "epoch": 2.98598509397278, + "grad_norm": 0.08421430736780167, + "learning_rate": 5.6334668526936405e-05, + "loss": 0.2448, + "step": 36859 + }, + { + "epoch": 2.986066104990279, + "grad_norm": 0.07628507912158966, + "learning_rate": 5.633016787434178e-05, + "loss": 0.2689, + "step": 36860 + }, + { + "epoch": 2.986147116007777, + "grad_norm": 0.06346532702445984, + "learning_rate": 5.632566722174716e-05, + "loss": 0.2263, + "step": 36861 + }, + { + "epoch": 2.9862281270252753, + "grad_norm": 0.06182417646050453, + "learning_rate": 5.6321166569152526e-05, + "loss": 0.2284, + "step": 36862 + }, + { + "epoch": 2.986309138042774, + "grad_norm": 0.07870961725711823, + "learning_rate": 5.6316665916557906e-05, + "loss": 0.2217, + "step": 36863 + }, + { + "epoch": 2.9863901490602722, + "grad_norm": 0.06632569432258606, + "learning_rate": 5.631216526396328e-05, + "loss": 0.234, + "step": 36864 + }, + { + "epoch": 2.9864711600777705, + "grad_norm": 0.06690378487110138, + "learning_rate": 5.630766461136865e-05, + "loss": 0.2192, + "step": 36865 + }, + { + "epoch": 2.986552171095269, + "grad_norm": 0.0669408068060875, + "learning_rate": 5.630316395877403e-05, + "loss": 0.2296, + "step": 36866 + }, + { + "epoch": 2.9866331821127674, + "grad_norm": 0.07498961687088013, + "learning_rate": 5.62986633061794e-05, + "loss": 0.2248, + "step": 36867 + }, + { + "epoch": 2.9867141931302656, + "grad_norm": 0.07754261046648026, + "learning_rate": 5.629416265358477e-05, + "loss": 0.2422, + "step": 36868 + }, + { + "epoch": 2.9867952041477643, + "grad_norm": 0.06618178635835648, + "learning_rate": 5.628966200099015e-05, + "loss": 0.2741, + "step": 36869 + }, + { + "epoch": 2.9868762151652626, + "grad_norm": 0.06932277977466583, + "learning_rate": 5.628516134839552e-05, + "loss": 0.2565, + "step": 36870 + }, + { + "epoch": 2.986957226182761, + "grad_norm": 0.08020354807376862, + "learning_rate": 5.628066069580089e-05, + "loss": 0.2391, + "step": 36871 + }, + { + "epoch": 2.987038237200259, + "grad_norm": 0.07193691283464432, + "learning_rate": 5.627616004320627e-05, + "loss": 0.2755, + "step": 36872 + }, + { + "epoch": 2.9871192482177578, + 
"grad_norm": 0.0832822322845459, + "learning_rate": 5.627165939061164e-05, + "loss": 0.2179, + "step": 36873 + }, + { + "epoch": 2.987200259235256, + "grad_norm": 0.06554457545280457, + "learning_rate": 5.626715873801701e-05, + "loss": 0.2217, + "step": 36874 + }, + { + "epoch": 2.9872812702527543, + "grad_norm": 0.05843869596719742, + "learning_rate": 5.626265808542239e-05, + "loss": 0.2312, + "step": 36875 + }, + { + "epoch": 2.9873622812702525, + "grad_norm": 0.06552619487047195, + "learning_rate": 5.625815743282776e-05, + "loss": 0.2341, + "step": 36876 + }, + { + "epoch": 2.987443292287751, + "grad_norm": 0.07473114132881165, + "learning_rate": 5.625365678023313e-05, + "loss": 0.2564, + "step": 36877 + }, + { + "epoch": 2.9875243033052494, + "grad_norm": 0.08000738173723221, + "learning_rate": 5.624915612763852e-05, + "loss": 0.2256, + "step": 36878 + }, + { + "epoch": 2.9876053143227477, + "grad_norm": 0.06652722507715225, + "learning_rate": 5.6244655475043884e-05, + "loss": 0.22, + "step": 36879 + }, + { + "epoch": 2.9876863253402464, + "grad_norm": 0.06857480853796005, + "learning_rate": 5.624015482244925e-05, + "loss": 0.1864, + "step": 36880 + }, + { + "epoch": 2.9877673363577446, + "grad_norm": 0.06475800275802612, + "learning_rate": 5.623565416985464e-05, + "loss": 0.2225, + "step": 36881 + }, + { + "epoch": 2.987848347375243, + "grad_norm": 0.06667593121528625, + "learning_rate": 5.6231153517260005e-05, + "loss": 0.2186, + "step": 36882 + }, + { + "epoch": 2.9879293583927415, + "grad_norm": 0.06999807804822922, + "learning_rate": 5.622665286466537e-05, + "loss": 0.2023, + "step": 36883 + }, + { + "epoch": 2.98801036941024, + "grad_norm": 0.0566612184047699, + "learning_rate": 5.622215221207076e-05, + "loss": 0.2193, + "step": 36884 + }, + { + "epoch": 2.988091380427738, + "grad_norm": 0.06729897856712341, + "learning_rate": 5.6217651559476125e-05, + "loss": 0.2308, + "step": 36885 + }, + { + "epoch": 2.9881723914452367, + "grad_norm": 0.06882520765066147, + "learning_rate": 5.621315090688149e-05, + "loss": 0.2252, + "step": 36886 + }, + { + "epoch": 2.988253402462735, + "grad_norm": 0.07283573597669601, + "learning_rate": 5.620865025428688e-05, + "loss": 0.2624, + "step": 36887 + }, + { + "epoch": 2.988334413480233, + "grad_norm": 0.06333566457033157, + "learning_rate": 5.6204149601692246e-05, + "loss": 0.2408, + "step": 36888 + }, + { + "epoch": 2.988415424497732, + "grad_norm": 0.08885639905929565, + "learning_rate": 5.619964894909761e-05, + "loss": 0.1944, + "step": 36889 + }, + { + "epoch": 2.98849643551523, + "grad_norm": 0.08246281743049622, + "learning_rate": 5.6195148296503e-05, + "loss": 0.212, + "step": 36890 + }, + { + "epoch": 2.9885774465327284, + "grad_norm": 0.09889711439609528, + "learning_rate": 5.619064764390837e-05, + "loss": 0.257, + "step": 36891 + }, + { + "epoch": 2.988658457550227, + "grad_norm": 0.07491931319236755, + "learning_rate": 5.618614699131375e-05, + "loss": 0.2262, + "step": 36892 + }, + { + "epoch": 2.9887394685677253, + "grad_norm": 0.06972592324018478, + "learning_rate": 5.618164633871912e-05, + "loss": 0.2335, + "step": 36893 + }, + { + "epoch": 2.9888204795852236, + "grad_norm": 0.06400167942047119, + "learning_rate": 5.617714568612449e-05, + "loss": 0.1905, + "step": 36894 + }, + { + "epoch": 2.988901490602722, + "grad_norm": 0.06958544999361038, + "learning_rate": 5.617264503352987e-05, + "loss": 0.2395, + "step": 36895 + }, + { + "epoch": 2.9889825016202205, + "grad_norm": 0.09128827601671219, + "learning_rate": 5.616814438093524e-05, 
+ "loss": 0.2448, + "step": 36896 + }, + { + "epoch": 2.9890635126377187, + "grad_norm": 0.07656344026327133, + "learning_rate": 5.616364372834061e-05, + "loss": 0.2378, + "step": 36897 + }, + { + "epoch": 2.989144523655217, + "grad_norm": 0.07432923465967178, + "learning_rate": 5.615914307574599e-05, + "loss": 0.2323, + "step": 36898 + }, + { + "epoch": 2.9892255346727152, + "grad_norm": 0.07078418880701065, + "learning_rate": 5.615464242315136e-05, + "loss": 0.2494, + "step": 36899 + }, + { + "epoch": 2.989306545690214, + "grad_norm": 0.0705699622631073, + "learning_rate": 5.615014177055673e-05, + "loss": 0.2592, + "step": 36900 + }, + { + "epoch": 2.989387556707712, + "grad_norm": 0.07478456199169159, + "learning_rate": 5.614564111796211e-05, + "loss": 0.2549, + "step": 36901 + }, + { + "epoch": 2.9894685677252104, + "grad_norm": 0.0605703704059124, + "learning_rate": 5.6141140465367484e-05, + "loss": 0.2604, + "step": 36902 + }, + { + "epoch": 2.989549578742709, + "grad_norm": 0.06664056330919266, + "learning_rate": 5.613663981277285e-05, + "loss": 0.2315, + "step": 36903 + }, + { + "epoch": 2.9896305897602073, + "grad_norm": 0.06250179558992386, + "learning_rate": 5.613213916017823e-05, + "loss": 0.2196, + "step": 36904 + }, + { + "epoch": 2.9897116007777056, + "grad_norm": 0.07414854317903519, + "learning_rate": 5.6127638507583604e-05, + "loss": 0.2265, + "step": 36905 + }, + { + "epoch": 2.9897926117952043, + "grad_norm": 0.05994947999715805, + "learning_rate": 5.612313785498897e-05, + "loss": 0.2559, + "step": 36906 + }, + { + "epoch": 2.9898736228127025, + "grad_norm": 0.06740280985832214, + "learning_rate": 5.611863720239435e-05, + "loss": 0.2213, + "step": 36907 + }, + { + "epoch": 2.9899546338302008, + "grad_norm": 0.057032033801078796, + "learning_rate": 5.6114136549799725e-05, + "loss": 0.2229, + "step": 36908 + }, + { + "epoch": 2.9900356448476995, + "grad_norm": 0.06304501742124557, + "learning_rate": 5.610963589720509e-05, + "loss": 0.2175, + "step": 36909 + }, + { + "epoch": 2.9901166558651977, + "grad_norm": 0.0731239914894104, + "learning_rate": 5.610513524461047e-05, + "loss": 0.2228, + "step": 36910 + }, + { + "epoch": 2.990197666882696, + "grad_norm": 0.07033661007881165, + "learning_rate": 5.6100634592015846e-05, + "loss": 0.2895, + "step": 36911 + }, + { + "epoch": 2.9902786779001946, + "grad_norm": 0.06642790138721466, + "learning_rate": 5.609613393942121e-05, + "loss": 0.256, + "step": 36912 + }, + { + "epoch": 2.990359688917693, + "grad_norm": 0.06933210045099258, + "learning_rate": 5.609163328682659e-05, + "loss": 0.2643, + "step": 36913 + }, + { + "epoch": 2.990440699935191, + "grad_norm": 0.08031534403562546, + "learning_rate": 5.608713263423197e-05, + "loss": 0.2476, + "step": 36914 + }, + { + "epoch": 2.99052171095269, + "grad_norm": 0.06195875257253647, + "learning_rate": 5.6082631981637334e-05, + "loss": 0.2046, + "step": 36915 + }, + { + "epoch": 2.990602721970188, + "grad_norm": 0.07216474413871765, + "learning_rate": 5.6078131329042714e-05, + "loss": 0.2256, + "step": 36916 + }, + { + "epoch": 2.9906837329876863, + "grad_norm": 0.06766729801893234, + "learning_rate": 5.607363067644809e-05, + "loss": 0.2458, + "step": 36917 + }, + { + "epoch": 2.9907647440051845, + "grad_norm": 0.07051542401313782, + "learning_rate": 5.606913002385347e-05, + "loss": 0.2508, + "step": 36918 + }, + { + "epoch": 2.9908457550226832, + "grad_norm": 0.08813019841909409, + "learning_rate": 5.6064629371258835e-05, + "loss": 0.2302, + "step": 36919 + }, + { + "epoch": 
2.9909267660401815, + "grad_norm": 0.08156602829694748, + "learning_rate": 5.606012871866421e-05, + "loss": 0.229, + "step": 36920 + }, + { + "epoch": 2.9910077770576797, + "grad_norm": 0.0654522255063057, + "learning_rate": 5.605562806606959e-05, + "loss": 0.2237, + "step": 36921 + }, + { + "epoch": 2.991088788075178, + "grad_norm": 0.07376670837402344, + "learning_rate": 5.6051127413474956e-05, + "loss": 0.2159, + "step": 36922 + }, + { + "epoch": 2.9911697990926767, + "grad_norm": 0.07919562608003616, + "learning_rate": 5.604662676088033e-05, + "loss": 0.2493, + "step": 36923 + }, + { + "epoch": 2.991250810110175, + "grad_norm": 0.0770791545510292, + "learning_rate": 5.604212610828571e-05, + "loss": 0.2226, + "step": 36924 + }, + { + "epoch": 2.991331821127673, + "grad_norm": 0.06926511228084564, + "learning_rate": 5.6037625455691076e-05, + "loss": 0.2281, + "step": 36925 + }, + { + "epoch": 2.991412832145172, + "grad_norm": 0.08092325925827026, + "learning_rate": 5.603312480309645e-05, + "loss": 0.2247, + "step": 36926 + }, + { + "epoch": 2.99149384316267, + "grad_norm": 0.0863441750407219, + "learning_rate": 5.602862415050183e-05, + "loss": 0.2467, + "step": 36927 + }, + { + "epoch": 2.9915748541801683, + "grad_norm": 0.06858447194099426, + "learning_rate": 5.60241234979072e-05, + "loss": 0.2399, + "step": 36928 + }, + { + "epoch": 2.991655865197667, + "grad_norm": 0.07151912152767181, + "learning_rate": 5.601962284531257e-05, + "loss": 0.2374, + "step": 36929 + }, + { + "epoch": 2.9917368762151653, + "grad_norm": 0.07646768540143967, + "learning_rate": 5.601512219271795e-05, + "loss": 0.2306, + "step": 36930 + }, + { + "epoch": 2.9918178872326635, + "grad_norm": 0.057797741144895554, + "learning_rate": 5.601062154012332e-05, + "loss": 0.2212, + "step": 36931 + }, + { + "epoch": 2.991898898250162, + "grad_norm": 0.0864371731877327, + "learning_rate": 5.600612088752869e-05, + "loss": 0.2567, + "step": 36932 + }, + { + "epoch": 2.9919799092676604, + "grad_norm": 0.07477493584156036, + "learning_rate": 5.600162023493407e-05, + "loss": 0.2252, + "step": 36933 + }, + { + "epoch": 2.9920609202851587, + "grad_norm": 0.06860797107219696, + "learning_rate": 5.599711958233944e-05, + "loss": 0.2285, + "step": 36934 + }, + { + "epoch": 2.9921419313026574, + "grad_norm": 0.06708861887454987, + "learning_rate": 5.599261892974481e-05, + "loss": 0.2718, + "step": 36935 + }, + { + "epoch": 2.9922229423201556, + "grad_norm": 0.07017388194799423, + "learning_rate": 5.598811827715019e-05, + "loss": 0.2182, + "step": 36936 + }, + { + "epoch": 2.992303953337654, + "grad_norm": 0.07404615730047226, + "learning_rate": 5.598361762455556e-05, + "loss": 0.2446, + "step": 36937 + }, + { + "epoch": 2.9923849643551526, + "grad_norm": 0.0662136971950531, + "learning_rate": 5.597911697196093e-05, + "loss": 0.232, + "step": 36938 + }, + { + "epoch": 2.992465975372651, + "grad_norm": 0.06485337018966675, + "learning_rate": 5.5974616319366314e-05, + "loss": 0.2074, + "step": 36939 + }, + { + "epoch": 2.992546986390149, + "grad_norm": 0.0583493709564209, + "learning_rate": 5.597011566677168e-05, + "loss": 0.226, + "step": 36940 + }, + { + "epoch": 2.9926279974076473, + "grad_norm": 0.0701061561703682, + "learning_rate": 5.5965615014177054e-05, + "loss": 0.218, + "step": 36941 + }, + { + "epoch": 2.992709008425146, + "grad_norm": 0.06900807470083237, + "learning_rate": 5.5961114361582434e-05, + "loss": 0.232, + "step": 36942 + }, + { + "epoch": 2.9927900194426442, + "grad_norm": 0.05951214209198952, + "learning_rate": 
5.59566137089878e-05, + "loss": 0.2214, + "step": 36943 + }, + { + "epoch": 2.9928710304601425, + "grad_norm": 0.07958030700683594, + "learning_rate": 5.595211305639319e-05, + "loss": 0.2196, + "step": 36944 + }, + { + "epoch": 2.9929520414776407, + "grad_norm": 0.05673525854945183, + "learning_rate": 5.5947612403798555e-05, + "loss": 0.2276, + "step": 36945 + }, + { + "epoch": 2.9930330524951394, + "grad_norm": 0.06486007571220398, + "learning_rate": 5.594311175120392e-05, + "loss": 0.2064, + "step": 36946 + }, + { + "epoch": 2.9931140635126376, + "grad_norm": 0.07947579026222229, + "learning_rate": 5.593861109860931e-05, + "loss": 0.2512, + "step": 36947 + }, + { + "epoch": 2.993195074530136, + "grad_norm": 0.059358954429626465, + "learning_rate": 5.5934110446014676e-05, + "loss": 0.2127, + "step": 36948 + }, + { + "epoch": 2.9932760855476346, + "grad_norm": 0.09291760623455048, + "learning_rate": 5.592960979342004e-05, + "loss": 0.2713, + "step": 36949 + }, + { + "epoch": 2.993357096565133, + "grad_norm": 0.0725402757525444, + "learning_rate": 5.592510914082543e-05, + "loss": 0.2276, + "step": 36950 + }, + { + "epoch": 2.993438107582631, + "grad_norm": 0.0793076902627945, + "learning_rate": 5.59206084882308e-05, + "loss": 0.2456, + "step": 36951 + }, + { + "epoch": 2.9935191186001298, + "grad_norm": 0.06947968155145645, + "learning_rate": 5.5916107835636164e-05, + "loss": 0.2153, + "step": 36952 + }, + { + "epoch": 2.993600129617628, + "grad_norm": 0.07872812449932098, + "learning_rate": 5.591160718304155e-05, + "loss": 0.2489, + "step": 36953 + }, + { + "epoch": 2.9936811406351262, + "grad_norm": 0.06870760023593903, + "learning_rate": 5.590710653044692e-05, + "loss": 0.2383, + "step": 36954 + }, + { + "epoch": 2.993762151652625, + "grad_norm": 0.08993922919034958, + "learning_rate": 5.5902605877852284e-05, + "loss": 0.2144, + "step": 36955 + }, + { + "epoch": 2.993843162670123, + "grad_norm": 0.08114828914403915, + "learning_rate": 5.589810522525767e-05, + "loss": 0.247, + "step": 36956 + }, + { + "epoch": 2.9939241736876214, + "grad_norm": 0.05917239189147949, + "learning_rate": 5.589360457266304e-05, + "loss": 0.2236, + "step": 36957 + }, + { + "epoch": 2.99400518470512, + "grad_norm": 0.06396820396184921, + "learning_rate": 5.5889103920068405e-05, + "loss": 0.2262, + "step": 36958 + }, + { + "epoch": 2.9940861957226184, + "grad_norm": 0.10564498603343964, + "learning_rate": 5.588460326747379e-05, + "loss": 0.297, + "step": 36959 + }, + { + "epoch": 2.9941672067401166, + "grad_norm": 0.07465486973524094, + "learning_rate": 5.588010261487916e-05, + "loss": 0.246, + "step": 36960 + }, + { + "epoch": 2.9942482177576153, + "grad_norm": 0.06373563408851624, + "learning_rate": 5.5875601962284526e-05, + "loss": 0.2094, + "step": 36961 + }, + { + "epoch": 2.9943292287751135, + "grad_norm": 0.06839507073163986, + "learning_rate": 5.587110130968991e-05, + "loss": 0.2364, + "step": 36962 + }, + { + "epoch": 2.994410239792612, + "grad_norm": 0.06401334702968597, + "learning_rate": 5.586660065709528e-05, + "loss": 0.2258, + "step": 36963 + }, + { + "epoch": 2.99449125081011, + "grad_norm": 0.06055555120110512, + "learning_rate": 5.5862100004500654e-05, + "loss": 0.244, + "step": 36964 + }, + { + "epoch": 2.9945722618276087, + "grad_norm": 0.0696861669421196, + "learning_rate": 5.5857599351906034e-05, + "loss": 0.2492, + "step": 36965 + }, + { + "epoch": 2.994653272845107, + "grad_norm": 0.08219528198242188, + "learning_rate": 5.58530986993114e-05, + "loss": 0.2569, + "step": 36966 + }, + { + 
"epoch": 2.994734283862605, + "grad_norm": 0.0690721794962883, + "learning_rate": 5.5848598046716774e-05, + "loss": 0.2122, + "step": 36967 + }, + { + "epoch": 2.9948152948801035, + "grad_norm": 0.05418196693062782, + "learning_rate": 5.5844097394122155e-05, + "loss": 0.2198, + "step": 36968 + }, + { + "epoch": 2.994896305897602, + "grad_norm": 0.06869655102491379, + "learning_rate": 5.583959674152752e-05, + "loss": 0.2916, + "step": 36969 + }, + { + "epoch": 2.9949773169151004, + "grad_norm": 0.07241586595773697, + "learning_rate": 5.58350960889329e-05, + "loss": 0.2491, + "step": 36970 + }, + { + "epoch": 2.9950583279325986, + "grad_norm": 0.06834710389375687, + "learning_rate": 5.5830595436338276e-05, + "loss": 0.218, + "step": 36971 + }, + { + "epoch": 2.9951393389500973, + "grad_norm": 0.06092188134789467, + "learning_rate": 5.582609478374364e-05, + "loss": 0.2526, + "step": 36972 + }, + { + "epoch": 2.9952203499675956, + "grad_norm": 0.07484481483697891, + "learning_rate": 5.582159413114902e-05, + "loss": 0.2255, + "step": 36973 + }, + { + "epoch": 2.995301360985094, + "grad_norm": 0.06314831227064133, + "learning_rate": 5.5817093478554396e-05, + "loss": 0.2404, + "step": 36974 + }, + { + "epoch": 2.9953823720025925, + "grad_norm": 0.06152360513806343, + "learning_rate": 5.581259282595976e-05, + "loss": 0.2398, + "step": 36975 + }, + { + "epoch": 2.9954633830200907, + "grad_norm": 0.05533519759774208, + "learning_rate": 5.5808092173365144e-05, + "loss": 0.2503, + "step": 36976 + }, + { + "epoch": 2.995544394037589, + "grad_norm": 0.06785975396633148, + "learning_rate": 5.580359152077052e-05, + "loss": 0.2262, + "step": 36977 + }, + { + "epoch": 2.9956254050550877, + "grad_norm": 0.08108928799629211, + "learning_rate": 5.5799090868175884e-05, + "loss": 0.2441, + "step": 36978 + }, + { + "epoch": 2.995706416072586, + "grad_norm": 0.06509430706501007, + "learning_rate": 5.5794590215581265e-05, + "loss": 0.2306, + "step": 36979 + }, + { + "epoch": 2.995787427090084, + "grad_norm": 0.07513480633497238, + "learning_rate": 5.579008956298664e-05, + "loss": 0.1986, + "step": 36980 + }, + { + "epoch": 2.995868438107583, + "grad_norm": 0.05923466384410858, + "learning_rate": 5.5785588910392005e-05, + "loss": 0.2388, + "step": 36981 + }, + { + "epoch": 2.995949449125081, + "grad_norm": 0.06244102865457535, + "learning_rate": 5.5781088257797385e-05, + "loss": 0.2288, + "step": 36982 + }, + { + "epoch": 2.9960304601425793, + "grad_norm": 0.08009325712919235, + "learning_rate": 5.577658760520276e-05, + "loss": 0.3072, + "step": 36983 + }, + { + "epoch": 2.996111471160078, + "grad_norm": 0.09328343719244003, + "learning_rate": 5.5772086952608126e-05, + "loss": 0.2637, + "step": 36984 + }, + { + "epoch": 2.9961924821775763, + "grad_norm": 0.05990605056285858, + "learning_rate": 5.5767586300013506e-05, + "loss": 0.2468, + "step": 36985 + }, + { + "epoch": 2.9962734931950745, + "grad_norm": 0.07123095542192459, + "learning_rate": 5.576308564741888e-05, + "loss": 0.2032, + "step": 36986 + }, + { + "epoch": 2.9963545042125728, + "grad_norm": 0.06369782239198685, + "learning_rate": 5.5758584994824247e-05, + "loss": 0.1956, + "step": 36987 + }, + { + "epoch": 2.9964355152300715, + "grad_norm": 0.0753772184252739, + "learning_rate": 5.575408434222963e-05, + "loss": 0.2192, + "step": 36988 + }, + { + "epoch": 2.9965165262475697, + "grad_norm": 0.05591370910406113, + "learning_rate": 5.5749583689635e-05, + "loss": 0.2266, + "step": 36989 + }, + { + "epoch": 2.996597537265068, + "grad_norm": 
0.07075093686580658, + "learning_rate": 5.574508303704037e-05, + "loss": 0.254, + "step": 36990 + }, + { + "epoch": 2.996678548282566, + "grad_norm": 0.06642140448093414, + "learning_rate": 5.574058238444575e-05, + "loss": 0.2854, + "step": 36991 + }, + { + "epoch": 2.996759559300065, + "grad_norm": 0.07355593144893646, + "learning_rate": 5.573608173185112e-05, + "loss": 0.242, + "step": 36992 + }, + { + "epoch": 2.996840570317563, + "grad_norm": 0.07146865874528885, + "learning_rate": 5.573158107925649e-05, + "loss": 0.2138, + "step": 36993 + }, + { + "epoch": 2.9969215813350614, + "grad_norm": 0.06968890130519867, + "learning_rate": 5.572708042666187e-05, + "loss": 0.2465, + "step": 36994 + }, + { + "epoch": 2.99700259235256, + "grad_norm": 0.09615656733512878, + "learning_rate": 5.572257977406724e-05, + "loss": 0.2272, + "step": 36995 + }, + { + "epoch": 2.9970836033700583, + "grad_norm": 0.06006643921136856, + "learning_rate": 5.571807912147262e-05, + "loss": 0.182, + "step": 36996 + }, + { + "epoch": 2.9971646143875565, + "grad_norm": 0.07367891818284988, + "learning_rate": 5.571357846887799e-05, + "loss": 0.2238, + "step": 36997 + }, + { + "epoch": 2.9972456254050552, + "grad_norm": 0.08367826789617538, + "learning_rate": 5.570907781628336e-05, + "loss": 0.2268, + "step": 36998 + }, + { + "epoch": 2.9973266364225535, + "grad_norm": 0.07553227990865707, + "learning_rate": 5.570457716368874e-05, + "loss": 0.2119, + "step": 36999 + }, + { + "epoch": 2.9974076474400517, + "grad_norm": 0.08225884288549423, + "learning_rate": 5.570007651109411e-05, + "loss": 0.2727, + "step": 37000 + }, + { + "epoch": 2.9974886584575504, + "grad_norm": 0.05904098227620125, + "learning_rate": 5.5695575858499484e-05, + "loss": 0.2552, + "step": 37001 + }, + { + "epoch": 2.9975696694750487, + "grad_norm": 0.055857256054878235, + "learning_rate": 5.5691075205904864e-05, + "loss": 0.2265, + "step": 37002 + }, + { + "epoch": 2.997650680492547, + "grad_norm": 0.060024164617061615, + "learning_rate": 5.568657455331023e-05, + "loss": 0.2122, + "step": 37003 + }, + { + "epoch": 2.9977316915100456, + "grad_norm": 0.05858771875500679, + "learning_rate": 5.5682073900715605e-05, + "loss": 0.2138, + "step": 37004 + }, + { + "epoch": 2.997812702527544, + "grad_norm": 0.0658746287226677, + "learning_rate": 5.5677573248120985e-05, + "loss": 0.2397, + "step": 37005 + }, + { + "epoch": 2.997893713545042, + "grad_norm": 0.07731475681066513, + "learning_rate": 5.567307259552635e-05, + "loss": 0.2376, + "step": 37006 + }, + { + "epoch": 2.9979747245625408, + "grad_norm": 0.06727912276983261, + "learning_rate": 5.5668571942931725e-05, + "loss": 0.2381, + "step": 37007 + }, + { + "epoch": 2.998055735580039, + "grad_norm": 0.06332457065582275, + "learning_rate": 5.5664071290337106e-05, + "loss": 0.2109, + "step": 37008 + }, + { + "epoch": 2.9981367465975373, + "grad_norm": 0.06446274369955063, + "learning_rate": 5.565957063774247e-05, + "loss": 0.2262, + "step": 37009 + }, + { + "epoch": 2.9982177576150355, + "grad_norm": 0.06915741413831711, + "learning_rate": 5.5655069985147846e-05, + "loss": 0.234, + "step": 37010 + }, + { + "epoch": 2.9982987686325338, + "grad_norm": 0.0659353956580162, + "learning_rate": 5.5650569332553227e-05, + "loss": 0.1997, + "step": 37011 + }, + { + "epoch": 2.9983797796500324, + "grad_norm": 0.06665370613336563, + "learning_rate": 5.5646068679958593e-05, + "loss": 0.2327, + "step": 37012 + }, + { + "epoch": 2.9984607906675307, + "grad_norm": 0.07406622171401978, + "learning_rate": 5.564156802736397e-05, 
+ "loss": 0.2297, + "step": 37013 + }, + { + "epoch": 2.998541801685029, + "grad_norm": 0.07928238064050674, + "learning_rate": 5.563706737476935e-05, + "loss": 0.2363, + "step": 37014 + }, + { + "epoch": 2.9986228127025276, + "grad_norm": 0.07332681864500046, + "learning_rate": 5.5632566722174714e-05, + "loss": 0.2725, + "step": 37015 + }, + { + "epoch": 2.998703823720026, + "grad_norm": 0.0755414143204689, + "learning_rate": 5.562806606958009e-05, + "loss": 0.233, + "step": 37016 + }, + { + "epoch": 2.998784834737524, + "grad_norm": 0.07047983258962631, + "learning_rate": 5.562356541698547e-05, + "loss": 0.2551, + "step": 37017 + }, + { + "epoch": 2.998865845755023, + "grad_norm": 0.07175135612487793, + "learning_rate": 5.5619064764390835e-05, + "loss": 0.2268, + "step": 37018 + }, + { + "epoch": 2.998946856772521, + "grad_norm": 0.08001399040222168, + "learning_rate": 5.561456411179621e-05, + "loss": 0.241, + "step": 37019 + }, + { + "epoch": 2.9990278677900193, + "grad_norm": 0.06803309172391891, + "learning_rate": 5.561006345920159e-05, + "loss": 0.2063, + "step": 37020 + }, + { + "epoch": 2.999108878807518, + "grad_norm": 0.06360645592212677, + "learning_rate": 5.5605562806606956e-05, + "loss": 0.2126, + "step": 37021 + }, + { + "epoch": 2.999189889825016, + "grad_norm": 0.06314657628536224, + "learning_rate": 5.560106215401234e-05, + "loss": 0.2278, + "step": 37022 + }, + { + "epoch": 2.9992709008425145, + "grad_norm": 0.07989577949047089, + "learning_rate": 5.559656150141771e-05, + "loss": 0.2294, + "step": 37023 + }, + { + "epoch": 2.999351911860013, + "grad_norm": 0.0917118638753891, + "learning_rate": 5.559206084882308e-05, + "loss": 0.2288, + "step": 37024 + }, + { + "epoch": 2.9994329228775114, + "grad_norm": 0.06495703011751175, + "learning_rate": 5.5587560196228464e-05, + "loss": 0.2604, + "step": 37025 + }, + { + "epoch": 2.9995139338950096, + "grad_norm": 0.06716945022344589, + "learning_rate": 5.558305954363383e-05, + "loss": 0.2511, + "step": 37026 + }, + { + "epoch": 2.9995949449125083, + "grad_norm": 0.075443796813488, + "learning_rate": 5.55785588910392e-05, + "loss": 0.2178, + "step": 37027 + }, + { + "epoch": 2.9996759559300066, + "grad_norm": 0.06924057006835938, + "learning_rate": 5.5574058238444585e-05, + "loss": 0.2539, + "step": 37028 + }, + { + "epoch": 2.999756966947505, + "grad_norm": 0.06341889500617981, + "learning_rate": 5.556955758584995e-05, + "loss": 0.2463, + "step": 37029 + }, + { + "epoch": 2.9998379779650035, + "grad_norm": 0.06503824144601822, + "learning_rate": 5.5565056933255325e-05, + "loss": 0.1933, + "step": 37030 + }, + { + "epoch": 2.9999189889825018, + "grad_norm": 0.07555829733610153, + "learning_rate": 5.5560556280660705e-05, + "loss": 0.2049, + "step": 37031 + }, + { + "epoch": 3.0, + "grad_norm": 0.0738195925951004, + "learning_rate": 5.555605562806607e-05, + "loss": 0.214, + "step": 37032 + }, + { + "epoch": 3.0000810110174982, + "grad_norm": 0.0725194588303566, + "learning_rate": 5.5551554975471446e-05, + "loss": 0.2275, + "step": 37033 + }, + { + "epoch": 3.000162022034997, + "grad_norm": 0.07739761471748352, + "learning_rate": 5.5547054322876826e-05, + "loss": 0.2549, + "step": 37034 + }, + { + "epoch": 3.000243033052495, + "grad_norm": 0.08765597641468048, + "learning_rate": 5.554255367028219e-05, + "loss": 0.2413, + "step": 37035 + }, + { + "epoch": 3.0003240440699934, + "grad_norm": 0.07010837644338608, + "learning_rate": 5.553805301768757e-05, + "loss": 0.2515, + "step": 37036 + }, + { + "epoch": 3.000405055087492, + 
"grad_norm": 0.07616742700338364, + "learning_rate": 5.553355236509295e-05, + "loss": 0.2184, + "step": 37037 + }, + { + "epoch": 3.0004860661049904, + "grad_norm": 0.07259862869977951, + "learning_rate": 5.5529051712498314e-05, + "loss": 0.225, + "step": 37038 + }, + { + "epoch": 3.0005670771224886, + "grad_norm": 0.06548836827278137, + "learning_rate": 5.552455105990369e-05, + "loss": 0.2056, + "step": 37039 + }, + { + "epoch": 3.000648088139987, + "grad_norm": 0.06693669408559799, + "learning_rate": 5.552005040730907e-05, + "loss": 0.2328, + "step": 37040 + }, + { + "epoch": 3.0007290991574855, + "grad_norm": 0.09123533219099045, + "learning_rate": 5.5515549754714435e-05, + "loss": 0.2435, + "step": 37041 + }, + { + "epoch": 3.000810110174984, + "grad_norm": 0.06889007240533829, + "learning_rate": 5.551104910211981e-05, + "loss": 0.2274, + "step": 37042 + }, + { + "epoch": 3.000891121192482, + "grad_norm": 0.07103199511766434, + "learning_rate": 5.550654844952519e-05, + "loss": 0.2235, + "step": 37043 + }, + { + "epoch": 3.0009721322099807, + "grad_norm": 0.08572027832269669, + "learning_rate": 5.5502047796930556e-05, + "loss": 0.29, + "step": 37044 + }, + { + "epoch": 3.001053143227479, + "grad_norm": 0.0924052745103836, + "learning_rate": 5.549754714433593e-05, + "loss": 0.223, + "step": 37045 + }, + { + "epoch": 3.001134154244977, + "grad_norm": 0.07948629558086395, + "learning_rate": 5.549304649174131e-05, + "loss": 0.2384, + "step": 37046 + }, + { + "epoch": 3.001215165262476, + "grad_norm": 0.07083747535943985, + "learning_rate": 5.5488545839146676e-05, + "loss": 0.2236, + "step": 37047 + }, + { + "epoch": 3.001296176279974, + "grad_norm": 0.06366025656461716, + "learning_rate": 5.548404518655205e-05, + "loss": 0.2383, + "step": 37048 + }, + { + "epoch": 3.0013771872974724, + "grad_norm": 0.06225239485502243, + "learning_rate": 5.547954453395743e-05, + "loss": 0.1976, + "step": 37049 + }, + { + "epoch": 3.0014581983149706, + "grad_norm": 0.06707856804132462, + "learning_rate": 5.54750438813628e-05, + "loss": 0.2156, + "step": 37050 + }, + { + "epoch": 3.0015392093324693, + "grad_norm": 0.07274459302425385, + "learning_rate": 5.547054322876818e-05, + "loss": 0.2846, + "step": 37051 + }, + { + "epoch": 3.0016202203499676, + "grad_norm": 0.060465868562459946, + "learning_rate": 5.546604257617355e-05, + "loss": 0.2079, + "step": 37052 + }, + { + "epoch": 3.001701231367466, + "grad_norm": 0.061956025660037994, + "learning_rate": 5.546154192357892e-05, + "loss": 0.238, + "step": 37053 + }, + { + "epoch": 3.0017822423849645, + "grad_norm": 0.07218556851148605, + "learning_rate": 5.54570412709843e-05, + "loss": 0.217, + "step": 37054 + }, + { + "epoch": 3.0018632534024627, + "grad_norm": 0.06965664774179459, + "learning_rate": 5.545254061838967e-05, + "loss": 0.2006, + "step": 37055 + }, + { + "epoch": 3.001944264419961, + "grad_norm": 0.07059884816408157, + "learning_rate": 5.544803996579504e-05, + "loss": 0.2552, + "step": 37056 + }, + { + "epoch": 3.0020252754374597, + "grad_norm": 0.060033269226551056, + "learning_rate": 5.544353931320042e-05, + "loss": 0.2328, + "step": 37057 + }, + { + "epoch": 3.002106286454958, + "grad_norm": 0.07267241179943085, + "learning_rate": 5.543903866060579e-05, + "loss": 0.221, + "step": 37058 + }, + { + "epoch": 3.002187297472456, + "grad_norm": 0.059818148612976074, + "learning_rate": 5.543453800801116e-05, + "loss": 0.221, + "step": 37059 + }, + { + "epoch": 3.002268308489955, + "grad_norm": 0.10267902165651321, + "learning_rate": 
5.543003735541654e-05, + "loss": 0.2289, + "step": 37060 + }, + { + "epoch": 3.002349319507453, + "grad_norm": 0.0698448047041893, + "learning_rate": 5.5425536702821914e-05, + "loss": 0.2109, + "step": 37061 + }, + { + "epoch": 3.0024303305249513, + "grad_norm": 0.06195312365889549, + "learning_rate": 5.542103605022728e-05, + "loss": 0.2531, + "step": 37062 + }, + { + "epoch": 3.0025113415424496, + "grad_norm": 0.06242690607905388, + "learning_rate": 5.541653539763266e-05, + "loss": 0.2113, + "step": 37063 + }, + { + "epoch": 3.0025923525599483, + "grad_norm": 0.06666281819343567, + "learning_rate": 5.5412034745038034e-05, + "loss": 0.2209, + "step": 37064 + }, + { + "epoch": 3.0026733635774465, + "grad_norm": 0.06624188274145126, + "learning_rate": 5.54075340924434e-05, + "loss": 0.2024, + "step": 37065 + }, + { + "epoch": 3.0027543745949448, + "grad_norm": 0.07061390578746796, + "learning_rate": 5.540303343984878e-05, + "loss": 0.2311, + "step": 37066 + }, + { + "epoch": 3.0028353856124435, + "grad_norm": 0.07270953059196472, + "learning_rate": 5.5398532787254155e-05, + "loss": 0.2471, + "step": 37067 + }, + { + "epoch": 3.0029163966299417, + "grad_norm": 0.06502418220043182, + "learning_rate": 5.539403213465952e-05, + "loss": 0.2313, + "step": 37068 + }, + { + "epoch": 3.00299740764744, + "grad_norm": 0.07650100439786911, + "learning_rate": 5.53895314820649e-05, + "loss": 0.2661, + "step": 37069 + }, + { + "epoch": 3.0030784186649386, + "grad_norm": 0.07324282824993134, + "learning_rate": 5.5385030829470276e-05, + "loss": 0.2472, + "step": 37070 + }, + { + "epoch": 3.003159429682437, + "grad_norm": 0.05792533606290817, + "learning_rate": 5.538053017687564e-05, + "loss": 0.209, + "step": 37071 + }, + { + "epoch": 3.003240440699935, + "grad_norm": 0.07956384122371674, + "learning_rate": 5.537602952428102e-05, + "loss": 0.2226, + "step": 37072 + }, + { + "epoch": 3.0033214517174334, + "grad_norm": 0.061989475041627884, + "learning_rate": 5.53715288716864e-05, + "loss": 0.2615, + "step": 37073 + }, + { + "epoch": 3.003402462734932, + "grad_norm": 0.07994687557220459, + "learning_rate": 5.5367028219091764e-05, + "loss": 0.2498, + "step": 37074 + }, + { + "epoch": 3.0034834737524303, + "grad_norm": 0.06498431414365768, + "learning_rate": 5.5362527566497144e-05, + "loss": 0.2343, + "step": 37075 + }, + { + "epoch": 3.0035644847699285, + "grad_norm": 0.07870197296142578, + "learning_rate": 5.535802691390252e-05, + "loss": 0.2269, + "step": 37076 + }, + { + "epoch": 3.0036454957874272, + "grad_norm": 0.07183849811553955, + "learning_rate": 5.53535262613079e-05, + "loss": 0.2438, + "step": 37077 + }, + { + "epoch": 3.0037265068049255, + "grad_norm": 0.07815229892730713, + "learning_rate": 5.5349025608713265e-05, + "loss": 0.2122, + "step": 37078 + }, + { + "epoch": 3.0038075178224237, + "grad_norm": 0.0587080642580986, + "learning_rate": 5.534452495611864e-05, + "loss": 0.211, + "step": 37079 + }, + { + "epoch": 3.0038885288399224, + "grad_norm": 0.08774900436401367, + "learning_rate": 5.534002430352402e-05, + "loss": 0.2188, + "step": 37080 + }, + { + "epoch": 3.0039695398574207, + "grad_norm": 0.08080202341079712, + "learning_rate": 5.5335523650929386e-05, + "loss": 0.2553, + "step": 37081 + }, + { + "epoch": 3.004050550874919, + "grad_norm": 0.07951570302248001, + "learning_rate": 5.533102299833476e-05, + "loss": 0.2161, + "step": 37082 + }, + { + "epoch": 3.0041315618924176, + "grad_norm": 0.07218746840953827, + "learning_rate": 5.532652234574014e-05, + "loss": 0.2065, + "step": 37083 + }, + 
{ + "epoch": 3.004212572909916, + "grad_norm": 0.07605035603046417, + "learning_rate": 5.5322021693145506e-05, + "loss": 0.2587, + "step": 37084 + }, + { + "epoch": 3.004293583927414, + "grad_norm": 0.07501280307769775, + "learning_rate": 5.531752104055088e-05, + "loss": 0.2375, + "step": 37085 + }, + { + "epoch": 3.0043745949449123, + "grad_norm": 0.07414011657238007, + "learning_rate": 5.531302038795626e-05, + "loss": 0.2246, + "step": 37086 + }, + { + "epoch": 3.004455605962411, + "grad_norm": 0.06920185685157776, + "learning_rate": 5.530851973536163e-05, + "loss": 0.2588, + "step": 37087 + }, + { + "epoch": 3.0045366169799093, + "grad_norm": 0.06351779401302338, + "learning_rate": 5.5304019082767e-05, + "loss": 0.2167, + "step": 37088 + }, + { + "epoch": 3.0046176279974075, + "grad_norm": 0.058531858026981354, + "learning_rate": 5.529951843017238e-05, + "loss": 0.2324, + "step": 37089 + }, + { + "epoch": 3.004698639014906, + "grad_norm": 0.06440838426351547, + "learning_rate": 5.529501777757775e-05, + "loss": 0.2204, + "step": 37090 + }, + { + "epoch": 3.0047796500324044, + "grad_norm": 0.06930806487798691, + "learning_rate": 5.529051712498312e-05, + "loss": 0.1756, + "step": 37091 + }, + { + "epoch": 3.0048606610499027, + "grad_norm": 0.08574075251817703, + "learning_rate": 5.52860164723885e-05, + "loss": 0.2421, + "step": 37092 + }, + { + "epoch": 3.0049416720674014, + "grad_norm": 0.08673498034477234, + "learning_rate": 5.528151581979387e-05, + "loss": 0.2415, + "step": 37093 + }, + { + "epoch": 3.0050226830848996, + "grad_norm": 0.06406184285879135, + "learning_rate": 5.527701516719924e-05, + "loss": 0.1944, + "step": 37094 + }, + { + "epoch": 3.005103694102398, + "grad_norm": 0.0748908668756485, + "learning_rate": 5.527251451460462e-05, + "loss": 0.224, + "step": 37095 + }, + { + "epoch": 3.005184705119896, + "grad_norm": 0.0906112864613533, + "learning_rate": 5.526801386200999e-05, + "loss": 0.2435, + "step": 37096 + }, + { + "epoch": 3.005265716137395, + "grad_norm": 0.08011359721422195, + "learning_rate": 5.526351320941536e-05, + "loss": 0.2725, + "step": 37097 + }, + { + "epoch": 3.005346727154893, + "grad_norm": 0.06861620396375656, + "learning_rate": 5.5259012556820744e-05, + "loss": 0.2229, + "step": 37098 + }, + { + "epoch": 3.0054277381723913, + "grad_norm": 0.08145179599523544, + "learning_rate": 5.525451190422612e-05, + "loss": 0.2125, + "step": 37099 + }, + { + "epoch": 3.00550874918989, + "grad_norm": 0.06678757071495056, + "learning_rate": 5.5250011251631484e-05, + "loss": 0.2285, + "step": 37100 + }, + { + "epoch": 3.005589760207388, + "grad_norm": 0.08154644817113876, + "learning_rate": 5.5245510599036864e-05, + "loss": 0.2317, + "step": 37101 + }, + { + "epoch": 3.0056707712248865, + "grad_norm": 0.06820086389780045, + "learning_rate": 5.524100994644224e-05, + "loss": 0.2131, + "step": 37102 + }, + { + "epoch": 3.005751782242385, + "grad_norm": 0.07761559635400772, + "learning_rate": 5.523650929384762e-05, + "loss": 0.239, + "step": 37103 + }, + { + "epoch": 3.0058327932598834, + "grad_norm": 0.0691225454211235, + "learning_rate": 5.5232008641252985e-05, + "loss": 0.2477, + "step": 37104 + }, + { + "epoch": 3.0059138042773816, + "grad_norm": 0.09152821451425552, + "learning_rate": 5.522750798865836e-05, + "loss": 0.2406, + "step": 37105 + }, + { + "epoch": 3.00599481529488, + "grad_norm": 0.08434759080410004, + "learning_rate": 5.522300733606374e-05, + "loss": 0.2814, + "step": 37106 + }, + { + "epoch": 3.0060758263123786, + "grad_norm": 0.07852553576231003, + 
"learning_rate": 5.5218506683469106e-05, + "loss": 0.2628, + "step": 37107 + }, + { + "epoch": 3.006156837329877, + "grad_norm": 0.06626973301172256, + "learning_rate": 5.521400603087448e-05, + "loss": 0.2117, + "step": 37108 + }, + { + "epoch": 3.006237848347375, + "grad_norm": 0.06209743767976761, + "learning_rate": 5.520950537827986e-05, + "loss": 0.2654, + "step": 37109 + }, + { + "epoch": 3.0063188593648738, + "grad_norm": 0.06478123366832733, + "learning_rate": 5.520500472568523e-05, + "loss": 0.2731, + "step": 37110 + }, + { + "epoch": 3.006399870382372, + "grad_norm": 0.07236045598983765, + "learning_rate": 5.52005040730906e-05, + "loss": 0.2063, + "step": 37111 + }, + { + "epoch": 3.0064808813998702, + "grad_norm": 0.06700768321752548, + "learning_rate": 5.519600342049598e-05, + "loss": 0.2363, + "step": 37112 + }, + { + "epoch": 3.006561892417369, + "grad_norm": 0.06586731225252151, + "learning_rate": 5.519150276790135e-05, + "loss": 0.2576, + "step": 37113 + }, + { + "epoch": 3.006642903434867, + "grad_norm": 0.062321074306964874, + "learning_rate": 5.518700211530672e-05, + "loss": 0.2245, + "step": 37114 + }, + { + "epoch": 3.0067239144523654, + "grad_norm": 0.07160164415836334, + "learning_rate": 5.51825014627121e-05, + "loss": 0.2452, + "step": 37115 + }, + { + "epoch": 3.006804925469864, + "grad_norm": 0.07295297086238861, + "learning_rate": 5.517800081011747e-05, + "loss": 0.2185, + "step": 37116 + }, + { + "epoch": 3.0068859364873624, + "grad_norm": 0.06453075259923935, + "learning_rate": 5.517350015752284e-05, + "loss": 0.2221, + "step": 37117 + }, + { + "epoch": 3.0069669475048606, + "grad_norm": 0.08894529938697815, + "learning_rate": 5.516899950492822e-05, + "loss": 0.2735, + "step": 37118 + }, + { + "epoch": 3.007047958522359, + "grad_norm": 0.06164504587650299, + "learning_rate": 5.516449885233359e-05, + "loss": 0.2135, + "step": 37119 + }, + { + "epoch": 3.0071289695398575, + "grad_norm": 0.08532165735960007, + "learning_rate": 5.515999819973896e-05, + "loss": 0.2352, + "step": 37120 + }, + { + "epoch": 3.0072099805573558, + "grad_norm": 0.077989362180233, + "learning_rate": 5.515549754714434e-05, + "loss": 0.2249, + "step": 37121 + }, + { + "epoch": 3.007290991574854, + "grad_norm": 0.06919614225625992, + "learning_rate": 5.515099689454971e-05, + "loss": 0.2618, + "step": 37122 + }, + { + "epoch": 3.0073720025923527, + "grad_norm": 0.07316429167985916, + "learning_rate": 5.5146496241955084e-05, + "loss": 0.2389, + "step": 37123 + }, + { + "epoch": 3.007453013609851, + "grad_norm": 0.0694352462887764, + "learning_rate": 5.5141995589360464e-05, + "loss": 0.2361, + "step": 37124 + }, + { + "epoch": 3.007534024627349, + "grad_norm": 0.08120887726545334, + "learning_rate": 5.513749493676583e-05, + "loss": 0.2597, + "step": 37125 + }, + { + "epoch": 3.007615035644848, + "grad_norm": 0.08180362731218338, + "learning_rate": 5.5132994284171205e-05, + "loss": 0.2737, + "step": 37126 + }, + { + "epoch": 3.007696046662346, + "grad_norm": 0.07655584067106247, + "learning_rate": 5.5128493631576585e-05, + "loss": 0.247, + "step": 37127 + }, + { + "epoch": 3.0077770576798444, + "grad_norm": 0.057747483253479004, + "learning_rate": 5.512399297898195e-05, + "loss": 0.2309, + "step": 37128 + }, + { + "epoch": 3.0078580686973426, + "grad_norm": 0.05902184545993805, + "learning_rate": 5.511949232638733e-05, + "loss": 0.1943, + "step": 37129 + }, + { + "epoch": 3.0079390797148413, + "grad_norm": 0.06686657667160034, + "learning_rate": 5.5114991673792706e-05, + "loss": 0.2331, + "step": 
37130 + }, + { + "epoch": 3.0080200907323396, + "grad_norm": 0.06984364986419678, + "learning_rate": 5.511049102119807e-05, + "loss": 0.2597, + "step": 37131 + }, + { + "epoch": 3.008101101749838, + "grad_norm": 0.05688484013080597, + "learning_rate": 5.510599036860345e-05, + "loss": 0.2591, + "step": 37132 + }, + { + "epoch": 3.0081821127673365, + "grad_norm": 0.07238908112049103, + "learning_rate": 5.5101489716008827e-05, + "loss": 0.2553, + "step": 37133 + }, + { + "epoch": 3.0082631237848347, + "grad_norm": 0.06852646172046661, + "learning_rate": 5.509698906341419e-05, + "loss": 0.2341, + "step": 37134 + }, + { + "epoch": 3.008344134802333, + "grad_norm": 0.07024137675762177, + "learning_rate": 5.5092488410819574e-05, + "loss": 0.224, + "step": 37135 + }, + { + "epoch": 3.0084251458198317, + "grad_norm": 0.07092615962028503, + "learning_rate": 5.508798775822495e-05, + "loss": 0.2485, + "step": 37136 + }, + { + "epoch": 3.00850615683733, + "grad_norm": 0.05837797746062279, + "learning_rate": 5.5083487105630314e-05, + "loss": 0.184, + "step": 37137 + }, + { + "epoch": 3.008587167854828, + "grad_norm": 0.06182774528861046, + "learning_rate": 5.5078986453035695e-05, + "loss": 0.2218, + "step": 37138 + }, + { + "epoch": 3.008668178872327, + "grad_norm": 0.0963364765048027, + "learning_rate": 5.507448580044107e-05, + "loss": 0.2624, + "step": 37139 + }, + { + "epoch": 3.008749189889825, + "grad_norm": 0.05469222739338875, + "learning_rate": 5.5069985147846435e-05, + "loss": 0.2098, + "step": 37140 + }, + { + "epoch": 3.0088302009073233, + "grad_norm": 0.08195900917053223, + "learning_rate": 5.5065484495251815e-05, + "loss": 0.2398, + "step": 37141 + }, + { + "epoch": 3.0089112119248216, + "grad_norm": 0.06950972229242325, + "learning_rate": 5.506098384265719e-05, + "loss": 0.2796, + "step": 37142 + }, + { + "epoch": 3.0089922229423203, + "grad_norm": 0.07501380145549774, + "learning_rate": 5.5056483190062556e-05, + "loss": 0.2564, + "step": 37143 + }, + { + "epoch": 3.0090732339598185, + "grad_norm": 0.06771993637084961, + "learning_rate": 5.5051982537467936e-05, + "loss": 0.2088, + "step": 37144 + }, + { + "epoch": 3.0091542449773168, + "grad_norm": 0.06669183075428009, + "learning_rate": 5.504748188487331e-05, + "loss": 0.1927, + "step": 37145 + }, + { + "epoch": 3.0092352559948155, + "grad_norm": 0.08785217255353928, + "learning_rate": 5.5042981232278677e-05, + "loss": 0.2287, + "step": 37146 + }, + { + "epoch": 3.0093162670123137, + "grad_norm": 0.0853303074836731, + "learning_rate": 5.503848057968406e-05, + "loss": 0.1907, + "step": 37147 + }, + { + "epoch": 3.009397278029812, + "grad_norm": 0.0789603441953659, + "learning_rate": 5.503397992708943e-05, + "loss": 0.1991, + "step": 37148 + }, + { + "epoch": 3.0094782890473106, + "grad_norm": 0.06373845040798187, + "learning_rate": 5.50294792744948e-05, + "loss": 0.2359, + "step": 37149 + }, + { + "epoch": 3.009559300064809, + "grad_norm": 0.0680304542183876, + "learning_rate": 5.502497862190018e-05, + "loss": 0.1963, + "step": 37150 + }, + { + "epoch": 3.009640311082307, + "grad_norm": 0.05651449039578438, + "learning_rate": 5.502047796930555e-05, + "loss": 0.1977, + "step": 37151 + }, + { + "epoch": 3.0097213220998054, + "grad_norm": 0.10431863367557526, + "learning_rate": 5.501597731671092e-05, + "loss": 0.2325, + "step": 37152 + }, + { + "epoch": 3.009802333117304, + "grad_norm": 0.07406028360128403, + "learning_rate": 5.50114766641163e-05, + "loss": 0.2395, + "step": 37153 + }, + { + "epoch": 3.0098833441348023, + "grad_norm": 
0.06386356800794601, + "learning_rate": 5.500697601152167e-05, + "loss": 0.2155, + "step": 37154 + }, + { + "epoch": 3.0099643551523005, + "grad_norm": 0.10542084276676178, + "learning_rate": 5.500247535892705e-05, + "loss": 0.2085, + "step": 37155 + }, + { + "epoch": 3.0100453661697992, + "grad_norm": 0.0667385384440422, + "learning_rate": 5.499797470633242e-05, + "loss": 0.2158, + "step": 37156 + }, + { + "epoch": 3.0101263771872975, + "grad_norm": 0.07736918330192566, + "learning_rate": 5.499347405373779e-05, + "loss": 0.2182, + "step": 37157 + }, + { + "epoch": 3.0102073882047957, + "grad_norm": 0.06546894460916519, + "learning_rate": 5.498897340114317e-05, + "loss": 0.2375, + "step": 37158 + }, + { + "epoch": 3.0102883992222944, + "grad_norm": 0.08651191741228104, + "learning_rate": 5.498447274854854e-05, + "loss": 0.2414, + "step": 37159 + }, + { + "epoch": 3.0103694102397927, + "grad_norm": 0.08241085708141327, + "learning_rate": 5.4979972095953914e-05, + "loss": 0.2608, + "step": 37160 + }, + { + "epoch": 3.010450421257291, + "grad_norm": 0.07510359585285187, + "learning_rate": 5.4975471443359294e-05, + "loss": 0.222, + "step": 37161 + }, + { + "epoch": 3.0105314322747896, + "grad_norm": 0.07132948935031891, + "learning_rate": 5.497097079076466e-05, + "loss": 0.2024, + "step": 37162 + }, + { + "epoch": 3.010612443292288, + "grad_norm": 0.061131563037633896, + "learning_rate": 5.4966470138170035e-05, + "loss": 0.2456, + "step": 37163 + }, + { + "epoch": 3.010693454309786, + "grad_norm": 0.057700663805007935, + "learning_rate": 5.4961969485575415e-05, + "loss": 0.2281, + "step": 37164 + }, + { + "epoch": 3.0107744653272843, + "grad_norm": 0.06116776168346405, + "learning_rate": 5.495746883298078e-05, + "loss": 0.2004, + "step": 37165 + }, + { + "epoch": 3.010855476344783, + "grad_norm": 0.07130347192287445, + "learning_rate": 5.4952968180386155e-05, + "loss": 0.2461, + "step": 37166 + }, + { + "epoch": 3.0109364873622813, + "grad_norm": 0.05239642783999443, + "learning_rate": 5.4948467527791536e-05, + "loss": 0.1924, + "step": 37167 + }, + { + "epoch": 3.0110174983797795, + "grad_norm": 0.07538292557001114, + "learning_rate": 5.494396687519691e-05, + "loss": 0.2068, + "step": 37168 + }, + { + "epoch": 3.011098509397278, + "grad_norm": 0.07418717443943024, + "learning_rate": 5.4939466222602276e-05, + "loss": 0.2516, + "step": 37169 + }, + { + "epoch": 3.0111795204147764, + "grad_norm": 0.07645675539970398, + "learning_rate": 5.493496557000766e-05, + "loss": 0.2525, + "step": 37170 + }, + { + "epoch": 3.0112605314322747, + "grad_norm": 0.08645131438970566, + "learning_rate": 5.493046491741303e-05, + "loss": 0.2413, + "step": 37171 + }, + { + "epoch": 3.0113415424497734, + "grad_norm": 0.06277317553758621, + "learning_rate": 5.49259642648184e-05, + "loss": 0.208, + "step": 37172 + }, + { + "epoch": 3.0114225534672716, + "grad_norm": 0.07197176665067673, + "learning_rate": 5.492146361222378e-05, + "loss": 0.2284, + "step": 37173 + }, + { + "epoch": 3.01150356448477, + "grad_norm": 0.07234460115432739, + "learning_rate": 5.491696295962915e-05, + "loss": 0.2438, + "step": 37174 + }, + { + "epoch": 3.011584575502268, + "grad_norm": 0.07466896623373032, + "learning_rate": 5.491246230703452e-05, + "loss": 0.2424, + "step": 37175 + }, + { + "epoch": 3.011665586519767, + "grad_norm": 0.07664573192596436, + "learning_rate": 5.49079616544399e-05, + "loss": 0.1929, + "step": 37176 + }, + { + "epoch": 3.011746597537265, + "grad_norm": 0.09230896830558777, + "learning_rate": 5.490346100184527e-05, + 
"loss": 0.2293, + "step": 37177 + }, + { + "epoch": 3.0118276085547633, + "grad_norm": 0.06680130958557129, + "learning_rate": 5.489896034925064e-05, + "loss": 0.2365, + "step": 37178 + }, + { + "epoch": 3.011908619572262, + "grad_norm": 0.06987342238426208, + "learning_rate": 5.489445969665602e-05, + "loss": 0.2413, + "step": 37179 + }, + { + "epoch": 3.01198963058976, + "grad_norm": 0.06937224417924881, + "learning_rate": 5.488995904406139e-05, + "loss": 0.2374, + "step": 37180 + }, + { + "epoch": 3.0120706416072585, + "grad_norm": 0.059260692447423935, + "learning_rate": 5.488545839146677e-05, + "loss": 0.2195, + "step": 37181 + }, + { + "epoch": 3.012151652624757, + "grad_norm": 0.11258818954229355, + "learning_rate": 5.488095773887214e-05, + "loss": 0.241, + "step": 37182 + }, + { + "epoch": 3.0122326636422554, + "grad_norm": 0.0633421391248703, + "learning_rate": 5.4876457086277513e-05, + "loss": 0.2432, + "step": 37183 + }, + { + "epoch": 3.0123136746597536, + "grad_norm": 0.07214123755693436, + "learning_rate": 5.4871956433682894e-05, + "loss": 0.2305, + "step": 37184 + }, + { + "epoch": 3.0123946856772523, + "grad_norm": 0.07456515729427338, + "learning_rate": 5.486745578108826e-05, + "loss": 0.238, + "step": 37185 + }, + { + "epoch": 3.0124756966947506, + "grad_norm": 0.05460673198103905, + "learning_rate": 5.4862955128493634e-05, + "loss": 0.1989, + "step": 37186 + }, + { + "epoch": 3.012556707712249, + "grad_norm": 0.062048815190792084, + "learning_rate": 5.4858454475899015e-05, + "loss": 0.2333, + "step": 37187 + }, + { + "epoch": 3.012637718729747, + "grad_norm": 0.07398334890604019, + "learning_rate": 5.485395382330438e-05, + "loss": 0.2463, + "step": 37188 + }, + { + "epoch": 3.0127187297472457, + "grad_norm": 0.07296764850616455, + "learning_rate": 5.4849453170709755e-05, + "loss": 0.2321, + "step": 37189 + }, + { + "epoch": 3.012799740764744, + "grad_norm": 0.07078998535871506, + "learning_rate": 5.4844952518115135e-05, + "loss": 0.244, + "step": 37190 + }, + { + "epoch": 3.0128807517822422, + "grad_norm": 0.07269686460494995, + "learning_rate": 5.48404518655205e-05, + "loss": 0.1921, + "step": 37191 + }, + { + "epoch": 3.012961762799741, + "grad_norm": 0.07033134996891022, + "learning_rate": 5.4835951212925876e-05, + "loss": 0.2418, + "step": 37192 + }, + { + "epoch": 3.013042773817239, + "grad_norm": 0.08645029366016388, + "learning_rate": 5.4831450560331256e-05, + "loss": 0.2314, + "step": 37193 + }, + { + "epoch": 3.0131237848347374, + "grad_norm": 0.06531119346618652, + "learning_rate": 5.482694990773662e-05, + "loss": 0.2065, + "step": 37194 + }, + { + "epoch": 3.013204795852236, + "grad_norm": 0.07155883312225342, + "learning_rate": 5.4822449255142e-05, + "loss": 0.235, + "step": 37195 + }, + { + "epoch": 3.0132858068697344, + "grad_norm": 0.06503793597221375, + "learning_rate": 5.481794860254738e-05, + "loss": 0.2261, + "step": 37196 + }, + { + "epoch": 3.0133668178872326, + "grad_norm": 0.07013113796710968, + "learning_rate": 5.4813447949952744e-05, + "loss": 0.2167, + "step": 37197 + }, + { + "epoch": 3.013447828904731, + "grad_norm": 0.08574307709932327, + "learning_rate": 5.480894729735812e-05, + "loss": 0.2266, + "step": 37198 + }, + { + "epoch": 3.0135288399222295, + "grad_norm": 0.07470546662807465, + "learning_rate": 5.48044466447635e-05, + "loss": 0.2973, + "step": 37199 + }, + { + "epoch": 3.0136098509397278, + "grad_norm": 0.08057143539190292, + "learning_rate": 5.4799945992168865e-05, + "loss": 0.2368, + "step": 37200 + }, + { + "epoch": 
3.013690861957226, + "grad_norm": 0.07528182119131088, + "learning_rate": 5.479544533957424e-05, + "loss": 0.2354, + "step": 37201 + }, + { + "epoch": 3.0137718729747247, + "grad_norm": 0.07321956008672714, + "learning_rate": 5.479094468697962e-05, + "loss": 0.2205, + "step": 37202 + }, + { + "epoch": 3.013852883992223, + "grad_norm": 0.07638127356767654, + "learning_rate": 5.4786444034384986e-05, + "loss": 0.2284, + "step": 37203 + }, + { + "epoch": 3.013933895009721, + "grad_norm": 0.06527112424373627, + "learning_rate": 5.478194338179036e-05, + "loss": 0.2554, + "step": 37204 + }, + { + "epoch": 3.01401490602722, + "grad_norm": 0.07681337743997574, + "learning_rate": 5.477744272919574e-05, + "loss": 0.2288, + "step": 37205 + }, + { + "epoch": 3.014095917044718, + "grad_norm": 0.06282360106706619, + "learning_rate": 5.4772942076601106e-05, + "loss": 0.217, + "step": 37206 + }, + { + "epoch": 3.0141769280622164, + "grad_norm": 0.06141400337219238, + "learning_rate": 5.476844142400648e-05, + "loss": 0.2678, + "step": 37207 + }, + { + "epoch": 3.0142579390797146, + "grad_norm": 0.06095695495605469, + "learning_rate": 5.476394077141186e-05, + "loss": 0.2428, + "step": 37208 + }, + { + "epoch": 3.0143389500972133, + "grad_norm": 0.062181975692510605, + "learning_rate": 5.475944011881723e-05, + "loss": 0.2381, + "step": 37209 + }, + { + "epoch": 3.0144199611147116, + "grad_norm": 0.06874234229326248, + "learning_rate": 5.475493946622261e-05, + "loss": 0.2473, + "step": 37210 + }, + { + "epoch": 3.01450097213221, + "grad_norm": 0.0715310275554657, + "learning_rate": 5.475043881362798e-05, + "loss": 0.229, + "step": 37211 + }, + { + "epoch": 3.0145819831497085, + "grad_norm": 0.06931551545858383, + "learning_rate": 5.474593816103335e-05, + "loss": 0.2037, + "step": 37212 + }, + { + "epoch": 3.0146629941672067, + "grad_norm": 0.06569510698318481, + "learning_rate": 5.474143750843873e-05, + "loss": 0.2061, + "step": 37213 + }, + { + "epoch": 3.014744005184705, + "grad_norm": 0.060928985476493835, + "learning_rate": 5.47369368558441e-05, + "loss": 0.2004, + "step": 37214 + }, + { + "epoch": 3.0148250162022037, + "grad_norm": 0.06559912115335464, + "learning_rate": 5.473243620324947e-05, + "loss": 0.2478, + "step": 37215 + }, + { + "epoch": 3.014906027219702, + "grad_norm": 0.09447550028562546, + "learning_rate": 5.472793555065485e-05, + "loss": 0.2177, + "step": 37216 + }, + { + "epoch": 3.0149870382372, + "grad_norm": 0.07177255302667618, + "learning_rate": 5.472343489806022e-05, + "loss": 0.2568, + "step": 37217 + }, + { + "epoch": 3.015068049254699, + "grad_norm": 0.07061301916837692, + "learning_rate": 5.471893424546559e-05, + "loss": 0.25, + "step": 37218 + }, + { + "epoch": 3.015149060272197, + "grad_norm": 0.055768292397260666, + "learning_rate": 5.471443359287097e-05, + "loss": 0.2318, + "step": 37219 + }, + { + "epoch": 3.0152300712896953, + "grad_norm": 0.06401805579662323, + "learning_rate": 5.4709932940276344e-05, + "loss": 0.1784, + "step": 37220 + }, + { + "epoch": 3.0153110823071936, + "grad_norm": 0.07539704442024231, + "learning_rate": 5.470543228768171e-05, + "loss": 0.2364, + "step": 37221 + }, + { + "epoch": 3.0153920933246923, + "grad_norm": 0.06201757490634918, + "learning_rate": 5.470093163508709e-05, + "loss": 0.2009, + "step": 37222 + }, + { + "epoch": 3.0154731043421905, + "grad_norm": 0.06623942404985428, + "learning_rate": 5.4696430982492464e-05, + "loss": 0.2436, + "step": 37223 + }, + { + "epoch": 3.0155541153596888, + "grad_norm": 0.07048413157463074, + 
"learning_rate": 5.469193032989783e-05, + "loss": 0.2292, + "step": 37224 + }, + { + "epoch": 3.0156351263771874, + "grad_norm": 0.06607518345117569, + "learning_rate": 5.468742967730321e-05, + "loss": 0.2366, + "step": 37225 + }, + { + "epoch": 3.0157161373946857, + "grad_norm": 0.07342692464590073, + "learning_rate": 5.4682929024708585e-05, + "loss": 0.1976, + "step": 37226 + }, + { + "epoch": 3.015797148412184, + "grad_norm": 0.0792352706193924, + "learning_rate": 5.467842837211395e-05, + "loss": 0.2558, + "step": 37227 + }, + { + "epoch": 3.0158781594296826, + "grad_norm": 0.06499131768941879, + "learning_rate": 5.467392771951933e-05, + "loss": 0.1834, + "step": 37228 + }, + { + "epoch": 3.015959170447181, + "grad_norm": 0.06605541706085205, + "learning_rate": 5.4669427066924706e-05, + "loss": 0.2081, + "step": 37229 + }, + { + "epoch": 3.016040181464679, + "grad_norm": 0.076319120824337, + "learning_rate": 5.466492641433007e-05, + "loss": 0.2378, + "step": 37230 + }, + { + "epoch": 3.0161211924821774, + "grad_norm": 0.0853307768702507, + "learning_rate": 5.466042576173545e-05, + "loss": 0.2144, + "step": 37231 + }, + { + "epoch": 3.016202203499676, + "grad_norm": 0.05636778101325035, + "learning_rate": 5.465592510914083e-05, + "loss": 0.2152, + "step": 37232 + }, + { + "epoch": 3.0162832145171743, + "grad_norm": 0.07191046327352524, + "learning_rate": 5.4651424456546194e-05, + "loss": 0.2203, + "step": 37233 + }, + { + "epoch": 3.0163642255346725, + "grad_norm": 0.06822015345096588, + "learning_rate": 5.464692380395158e-05, + "loss": 0.2697, + "step": 37234 + }, + { + "epoch": 3.0164452365521712, + "grad_norm": 0.06572640687227249, + "learning_rate": 5.464242315135695e-05, + "loss": 0.2328, + "step": 37235 + }, + { + "epoch": 3.0165262475696695, + "grad_norm": 0.06790418177843094, + "learning_rate": 5.463792249876233e-05, + "loss": 0.2499, + "step": 37236 + }, + { + "epoch": 3.0166072585871677, + "grad_norm": 0.07083853334188461, + "learning_rate": 5.46334218461677e-05, + "loss": 0.1971, + "step": 37237 + }, + { + "epoch": 3.0166882696046664, + "grad_norm": 0.0681314542889595, + "learning_rate": 5.462892119357307e-05, + "loss": 0.2352, + "step": 37238 + }, + { + "epoch": 3.0167692806221647, + "grad_norm": 0.059616200625896454, + "learning_rate": 5.462442054097845e-05, + "loss": 0.2286, + "step": 37239 + }, + { + "epoch": 3.016850291639663, + "grad_norm": 0.06560337543487549, + "learning_rate": 5.461991988838382e-05, + "loss": 0.2481, + "step": 37240 + }, + { + "epoch": 3.0169313026571616, + "grad_norm": 0.06413570791482925, + "learning_rate": 5.461541923578919e-05, + "loss": 0.2271, + "step": 37241 + }, + { + "epoch": 3.01701231367466, + "grad_norm": 0.07081443071365356, + "learning_rate": 5.461091858319457e-05, + "loss": 0.2156, + "step": 37242 + }, + { + "epoch": 3.017093324692158, + "grad_norm": 0.07580958306789398, + "learning_rate": 5.460641793059994e-05, + "loss": 0.2512, + "step": 37243 + }, + { + "epoch": 3.0171743357096563, + "grad_norm": 0.07665203511714935, + "learning_rate": 5.460191727800531e-05, + "loss": 0.2064, + "step": 37244 + }, + { + "epoch": 3.017255346727155, + "grad_norm": 0.0836213082075119, + "learning_rate": 5.459741662541069e-05, + "loss": 0.2442, + "step": 37245 + }, + { + "epoch": 3.0173363577446533, + "grad_norm": 0.07863006740808487, + "learning_rate": 5.4592915972816064e-05, + "loss": 0.2178, + "step": 37246 + }, + { + "epoch": 3.0174173687621515, + "grad_norm": 0.06348031014204025, + "learning_rate": 5.458841532022143e-05, + "loss": 0.1979, + "step": 
37247 + }, + { + "epoch": 3.01749837977965, + "grad_norm": 0.0701642706990242, + "learning_rate": 5.458391466762681e-05, + "loss": 0.2428, + "step": 37248 + }, + { + "epoch": 3.0175793907971484, + "grad_norm": 0.0907692015171051, + "learning_rate": 5.4579414015032185e-05, + "loss": 0.2313, + "step": 37249 + }, + { + "epoch": 3.0176604018146467, + "grad_norm": 0.07722572237253189, + "learning_rate": 5.457491336243755e-05, + "loss": 0.2181, + "step": 37250 + }, + { + "epoch": 3.0177414128321454, + "grad_norm": 0.07011134922504425, + "learning_rate": 5.457041270984293e-05, + "loss": 0.2379, + "step": 37251 + }, + { + "epoch": 3.0178224238496436, + "grad_norm": 0.07664912939071655, + "learning_rate": 5.4565912057248306e-05, + "loss": 0.2393, + "step": 37252 + }, + { + "epoch": 3.017903434867142, + "grad_norm": 0.07362043857574463, + "learning_rate": 5.456141140465367e-05, + "loss": 0.2173, + "step": 37253 + }, + { + "epoch": 3.01798444588464, + "grad_norm": 0.07834141701459885, + "learning_rate": 5.455691075205905e-05, + "loss": 0.2493, + "step": 37254 + }, + { + "epoch": 3.018065456902139, + "grad_norm": 0.06812326610088348, + "learning_rate": 5.4552410099464426e-05, + "loss": 0.2226, + "step": 37255 + }, + { + "epoch": 3.018146467919637, + "grad_norm": 0.06823229044675827, + "learning_rate": 5.454790944686979e-05, + "loss": 0.206, + "step": 37256 + }, + { + "epoch": 3.0182274789371353, + "grad_norm": 0.09068725258111954, + "learning_rate": 5.4543408794275174e-05, + "loss": 0.2219, + "step": 37257 + }, + { + "epoch": 3.018308489954634, + "grad_norm": 0.0642944872379303, + "learning_rate": 5.453890814168055e-05, + "loss": 0.2413, + "step": 37258 + }, + { + "epoch": 3.018389500972132, + "grad_norm": 0.07336447387933731, + "learning_rate": 5.4534407489085914e-05, + "loss": 0.2409, + "step": 37259 + }, + { + "epoch": 3.0184705119896305, + "grad_norm": 0.07347241044044495, + "learning_rate": 5.4529906836491294e-05, + "loss": 0.2229, + "step": 37260 + }, + { + "epoch": 3.018551523007129, + "grad_norm": 0.07632424682378769, + "learning_rate": 5.452540618389667e-05, + "loss": 0.2265, + "step": 37261 + }, + { + "epoch": 3.0186325340246274, + "grad_norm": 0.0791018158197403, + "learning_rate": 5.452090553130205e-05, + "loss": 0.211, + "step": 37262 + }, + { + "epoch": 3.0187135450421256, + "grad_norm": 0.0772065594792366, + "learning_rate": 5.4516404878707415e-05, + "loss": 0.2076, + "step": 37263 + }, + { + "epoch": 3.0187945560596243, + "grad_norm": 0.06899842619895935, + "learning_rate": 5.451190422611279e-05, + "loss": 0.2531, + "step": 37264 + }, + { + "epoch": 3.0188755670771226, + "grad_norm": 0.08461082726716995, + "learning_rate": 5.450740357351817e-05, + "loss": 0.3056, + "step": 37265 + }, + { + "epoch": 3.018956578094621, + "grad_norm": 0.07619468867778778, + "learning_rate": 5.4502902920923536e-05, + "loss": 0.2356, + "step": 37266 + }, + { + "epoch": 3.019037589112119, + "grad_norm": 0.08498235791921616, + "learning_rate": 5.449840226832891e-05, + "loss": 0.2215, + "step": 37267 + }, + { + "epoch": 3.0191186001296177, + "grad_norm": 0.06508364528417587, + "learning_rate": 5.449390161573429e-05, + "loss": 0.2219, + "step": 37268 + }, + { + "epoch": 3.019199611147116, + "grad_norm": 0.060360874980688095, + "learning_rate": 5.448940096313966e-05, + "loss": 0.2633, + "step": 37269 + }, + { + "epoch": 3.0192806221646142, + "grad_norm": 0.07176513224840164, + "learning_rate": 5.448490031054503e-05, + "loss": 0.218, + "step": 37270 + }, + { + "epoch": 3.019361633182113, + "grad_norm": 
0.07150030881166458, + "learning_rate": 5.448039965795041e-05, + "loss": 0.2151, + "step": 37271 + }, + { + "epoch": 3.019442644199611, + "grad_norm": 0.07552962005138397, + "learning_rate": 5.447589900535578e-05, + "loss": 0.2398, + "step": 37272 + }, + { + "epoch": 3.0195236552171094, + "grad_norm": 0.09413708001375198, + "learning_rate": 5.447139835276115e-05, + "loss": 0.2635, + "step": 37273 + }, + { + "epoch": 3.019604666234608, + "grad_norm": 0.07900536805391312, + "learning_rate": 5.446689770016653e-05, + "loss": 0.2267, + "step": 37274 + }, + { + "epoch": 3.0196856772521063, + "grad_norm": 0.07706227153539658, + "learning_rate": 5.44623970475719e-05, + "loss": 0.2637, + "step": 37275 + }, + { + "epoch": 3.0197666882696046, + "grad_norm": 0.06675276160240173, + "learning_rate": 5.445789639497727e-05, + "loss": 0.2128, + "step": 37276 + }, + { + "epoch": 3.019847699287103, + "grad_norm": 0.0579092912375927, + "learning_rate": 5.445339574238265e-05, + "loss": 0.2021, + "step": 37277 + }, + { + "epoch": 3.0199287103046015, + "grad_norm": 0.0754135251045227, + "learning_rate": 5.444889508978802e-05, + "loss": 0.2551, + "step": 37278 + }, + { + "epoch": 3.0200097213220998, + "grad_norm": 0.07271932810544968, + "learning_rate": 5.444439443719339e-05, + "loss": 0.2093, + "step": 37279 + }, + { + "epoch": 3.020090732339598, + "grad_norm": 0.07659343630075455, + "learning_rate": 5.443989378459877e-05, + "loss": 0.2459, + "step": 37280 + }, + { + "epoch": 3.0201717433570967, + "grad_norm": 0.06897158175706863, + "learning_rate": 5.443539313200414e-05, + "loss": 0.2466, + "step": 37281 + }, + { + "epoch": 3.020252754374595, + "grad_norm": 0.06454170495271683, + "learning_rate": 5.4430892479409514e-05, + "loss": 0.2076, + "step": 37282 + }, + { + "epoch": 3.020333765392093, + "grad_norm": 0.07755886763334274, + "learning_rate": 5.4426391826814894e-05, + "loss": 0.2343, + "step": 37283 + }, + { + "epoch": 3.020414776409592, + "grad_norm": 0.07092835009098053, + "learning_rate": 5.442189117422026e-05, + "loss": 0.2077, + "step": 37284 + }, + { + "epoch": 3.02049578742709, + "grad_norm": 0.0711079090833664, + "learning_rate": 5.4417390521625635e-05, + "loss": 0.2171, + "step": 37285 + }, + { + "epoch": 3.0205767984445884, + "grad_norm": 0.06403342634439468, + "learning_rate": 5.4412889869031015e-05, + "loss": 0.2404, + "step": 37286 + }, + { + "epoch": 3.020657809462087, + "grad_norm": 0.07245420664548874, + "learning_rate": 5.440838921643638e-05, + "loss": 0.222, + "step": 37287 + }, + { + "epoch": 3.0207388204795853, + "grad_norm": 0.09491761773824692, + "learning_rate": 5.440388856384176e-05, + "loss": 0.2192, + "step": 37288 + }, + { + "epoch": 3.0208198314970836, + "grad_norm": 0.06958624720573425, + "learning_rate": 5.4399387911247136e-05, + "loss": 0.2372, + "step": 37289 + }, + { + "epoch": 3.020900842514582, + "grad_norm": 0.07254919409751892, + "learning_rate": 5.43948872586525e-05, + "loss": 0.2008, + "step": 37290 + }, + { + "epoch": 3.0209818535320805, + "grad_norm": 0.06609213352203369, + "learning_rate": 5.439038660605788e-05, + "loss": 0.2431, + "step": 37291 + }, + { + "epoch": 3.0210628645495787, + "grad_norm": 0.07412755489349365, + "learning_rate": 5.4385885953463257e-05, + "loss": 0.2478, + "step": 37292 + }, + { + "epoch": 3.021143875567077, + "grad_norm": 0.04594634473323822, + "learning_rate": 5.438138530086862e-05, + "loss": 0.2057, + "step": 37293 + }, + { + "epoch": 3.0212248865845757, + "grad_norm": 0.09090680629014969, + "learning_rate": 5.4376884648274004e-05, + 
"loss": 0.252, + "step": 37294 + }, + { + "epoch": 3.021305897602074, + "grad_norm": 0.08909545093774796, + "learning_rate": 5.437238399567938e-05, + "loss": 0.2878, + "step": 37295 + }, + { + "epoch": 3.021386908619572, + "grad_norm": 0.07122080773115158, + "learning_rate": 5.4367883343084744e-05, + "loss": 0.2181, + "step": 37296 + }, + { + "epoch": 3.021467919637071, + "grad_norm": 0.07232771068811417, + "learning_rate": 5.4363382690490125e-05, + "loss": 0.2455, + "step": 37297 + }, + { + "epoch": 3.021548930654569, + "grad_norm": 0.07290542870759964, + "learning_rate": 5.43588820378955e-05, + "loss": 0.2413, + "step": 37298 + }, + { + "epoch": 3.0216299416720673, + "grad_norm": 0.08508367091417313, + "learning_rate": 5.4354381385300865e-05, + "loss": 0.1985, + "step": 37299 + }, + { + "epoch": 3.0217109526895656, + "grad_norm": 0.06248488649725914, + "learning_rate": 5.4349880732706245e-05, + "loss": 0.2598, + "step": 37300 + }, + { + "epoch": 3.0217919637070643, + "grad_norm": 0.09384345263242722, + "learning_rate": 5.434538008011162e-05, + "loss": 0.2532, + "step": 37301 + }, + { + "epoch": 3.0218729747245625, + "grad_norm": 0.07828628271818161, + "learning_rate": 5.4340879427516986e-05, + "loss": 0.253, + "step": 37302 + }, + { + "epoch": 3.0219539857420608, + "grad_norm": 0.06680522859096527, + "learning_rate": 5.433637877492237e-05, + "loss": 0.2128, + "step": 37303 + }, + { + "epoch": 3.0220349967595594, + "grad_norm": 0.07760295271873474, + "learning_rate": 5.433187812232774e-05, + "loss": 0.2144, + "step": 37304 + }, + { + "epoch": 3.0221160077770577, + "grad_norm": 0.06053243577480316, + "learning_rate": 5.432737746973311e-05, + "loss": 0.2238, + "step": 37305 + }, + { + "epoch": 3.022197018794556, + "grad_norm": 0.07531262189149857, + "learning_rate": 5.4322876817138494e-05, + "loss": 0.2447, + "step": 37306 + }, + { + "epoch": 3.0222780298120546, + "grad_norm": 0.06485378742218018, + "learning_rate": 5.431837616454386e-05, + "loss": 0.2276, + "step": 37307 + }, + { + "epoch": 3.022359040829553, + "grad_norm": 0.060371000319719315, + "learning_rate": 5.431387551194923e-05, + "loss": 0.1987, + "step": 37308 + }, + { + "epoch": 3.022440051847051, + "grad_norm": 0.07953300327062607, + "learning_rate": 5.4309374859354615e-05, + "loss": 0.2299, + "step": 37309 + }, + { + "epoch": 3.0225210628645494, + "grad_norm": 0.06593051552772522, + "learning_rate": 5.430487420675998e-05, + "loss": 0.2461, + "step": 37310 + }, + { + "epoch": 3.022602073882048, + "grad_norm": 0.065412238240242, + "learning_rate": 5.430037355416535e-05, + "loss": 0.2018, + "step": 37311 + }, + { + "epoch": 3.0226830848995463, + "grad_norm": 0.07266979664564133, + "learning_rate": 5.4295872901570735e-05, + "loss": 0.2308, + "step": 37312 + }, + { + "epoch": 3.0227640959170445, + "grad_norm": 0.08019926398992538, + "learning_rate": 5.42913722489761e-05, + "loss": 0.244, + "step": 37313 + }, + { + "epoch": 3.0228451069345432, + "grad_norm": 0.06879211217164993, + "learning_rate": 5.428687159638148e-05, + "loss": 0.2152, + "step": 37314 + }, + { + "epoch": 3.0229261179520415, + "grad_norm": 0.07490628957748413, + "learning_rate": 5.4282370943786856e-05, + "loss": 0.2829, + "step": 37315 + }, + { + "epoch": 3.0230071289695397, + "grad_norm": 0.06716594845056534, + "learning_rate": 5.427787029119222e-05, + "loss": 0.2269, + "step": 37316 + }, + { + "epoch": 3.0230881399870384, + "grad_norm": 0.0727708488702774, + "learning_rate": 5.4273369638597603e-05, + "loss": 0.2197, + "step": 37317 + }, + { + "epoch": 
3.0231691510045366, + "grad_norm": 0.07033897936344147, + "learning_rate": 5.426886898600298e-05, + "loss": 0.227, + "step": 37318 + }, + { + "epoch": 3.023250162022035, + "grad_norm": 0.07997550070285797, + "learning_rate": 5.4264368333408344e-05, + "loss": 0.1858, + "step": 37319 + }, + { + "epoch": 3.0233311730395336, + "grad_norm": 0.05678095296025276, + "learning_rate": 5.4259867680813724e-05, + "loss": 0.226, + "step": 37320 + }, + { + "epoch": 3.023412184057032, + "grad_norm": 0.05888417363166809, + "learning_rate": 5.42553670282191e-05, + "loss": 0.2114, + "step": 37321 + }, + { + "epoch": 3.02349319507453, + "grad_norm": 0.08064017444849014, + "learning_rate": 5.4250866375624465e-05, + "loss": 0.264, + "step": 37322 + }, + { + "epoch": 3.0235742060920283, + "grad_norm": 0.0671364963054657, + "learning_rate": 5.4246365723029845e-05, + "loss": 0.2271, + "step": 37323 + }, + { + "epoch": 3.023655217109527, + "grad_norm": 0.07308002561330795, + "learning_rate": 5.424186507043522e-05, + "loss": 0.2027, + "step": 37324 + }, + { + "epoch": 3.0237362281270252, + "grad_norm": 0.07530664652585983, + "learning_rate": 5.4237364417840585e-05, + "loss": 0.2335, + "step": 37325 + }, + { + "epoch": 3.0238172391445235, + "grad_norm": 0.07628055661916733, + "learning_rate": 5.4232863765245966e-05, + "loss": 0.2619, + "step": 37326 + }, + { + "epoch": 3.023898250162022, + "grad_norm": 0.07323234528303146, + "learning_rate": 5.422836311265134e-05, + "loss": 0.2737, + "step": 37327 + }, + { + "epoch": 3.0239792611795204, + "grad_norm": 0.0741615816950798, + "learning_rate": 5.4223862460056706e-05, + "loss": 0.2271, + "step": 37328 + }, + { + "epoch": 3.0240602721970187, + "grad_norm": 0.07981351017951965, + "learning_rate": 5.421936180746209e-05, + "loss": 0.242, + "step": 37329 + }, + { + "epoch": 3.0241412832145174, + "grad_norm": 0.07190749794244766, + "learning_rate": 5.421486115486746e-05, + "loss": 0.2174, + "step": 37330 + }, + { + "epoch": 3.0242222942320156, + "grad_norm": 0.07480721175670624, + "learning_rate": 5.421036050227283e-05, + "loss": 0.2183, + "step": 37331 + }, + { + "epoch": 3.024303305249514, + "grad_norm": 0.0733325406908989, + "learning_rate": 5.420585984967821e-05, + "loss": 0.2294, + "step": 37332 + }, + { + "epoch": 3.024384316267012, + "grad_norm": 0.07993824034929276, + "learning_rate": 5.420135919708358e-05, + "loss": 0.2483, + "step": 37333 + }, + { + "epoch": 3.024465327284511, + "grad_norm": 0.07397164404392242, + "learning_rate": 5.419685854448895e-05, + "loss": 0.2497, + "step": 37334 + }, + { + "epoch": 3.024546338302009, + "grad_norm": 0.07546770572662354, + "learning_rate": 5.419235789189433e-05, + "loss": 0.2276, + "step": 37335 + }, + { + "epoch": 3.0246273493195073, + "grad_norm": 0.07344533503055573, + "learning_rate": 5.41878572392997e-05, + "loss": 0.2389, + "step": 37336 + }, + { + "epoch": 3.024708360337006, + "grad_norm": 0.0710587278008461, + "learning_rate": 5.418335658670507e-05, + "loss": 0.2324, + "step": 37337 + }, + { + "epoch": 3.024789371354504, + "grad_norm": 0.0577889122068882, + "learning_rate": 5.417885593411045e-05, + "loss": 0.2531, + "step": 37338 + }, + { + "epoch": 3.0248703823720025, + "grad_norm": 0.07441332191228867, + "learning_rate": 5.417435528151582e-05, + "loss": 0.2532, + "step": 37339 + }, + { + "epoch": 3.024951393389501, + "grad_norm": 0.07562925666570663, + "learning_rate": 5.41698546289212e-05, + "loss": 0.2319, + "step": 37340 + }, + { + "epoch": 3.0250324044069994, + "grad_norm": 0.061570510268211365, + "learning_rate": 
5.416535397632657e-05, + "loss": 0.2177, + "step": 37341 + }, + { + "epoch": 3.0251134154244976, + "grad_norm": 0.06682217121124268, + "learning_rate": 5.4160853323731944e-05, + "loss": 0.2536, + "step": 37342 + }, + { + "epoch": 3.0251944264419963, + "grad_norm": 0.07247215509414673, + "learning_rate": 5.4156352671137324e-05, + "loss": 0.2109, + "step": 37343 + }, + { + "epoch": 3.0252754374594946, + "grad_norm": 0.06940843164920807, + "learning_rate": 5.415185201854269e-05, + "loss": 0.215, + "step": 37344 + }, + { + "epoch": 3.025356448476993, + "grad_norm": 0.09310358017683029, + "learning_rate": 5.4147351365948064e-05, + "loss": 0.1984, + "step": 37345 + }, + { + "epoch": 3.025437459494491, + "grad_norm": 0.06538576632738113, + "learning_rate": 5.4142850713353445e-05, + "loss": 0.2462, + "step": 37346 + }, + { + "epoch": 3.0255184705119897, + "grad_norm": 0.06469295918941498, + "learning_rate": 5.413835006075881e-05, + "loss": 0.2298, + "step": 37347 + }, + { + "epoch": 3.025599481529488, + "grad_norm": 0.08019588887691498, + "learning_rate": 5.4133849408164185e-05, + "loss": 0.2458, + "step": 37348 + }, + { + "epoch": 3.0256804925469862, + "grad_norm": 0.06767239421606064, + "learning_rate": 5.4129348755569565e-05, + "loss": 0.2186, + "step": 37349 + }, + { + "epoch": 3.025761503564485, + "grad_norm": 0.06455112993717194, + "learning_rate": 5.412484810297493e-05, + "loss": 0.2254, + "step": 37350 + }, + { + "epoch": 3.025842514581983, + "grad_norm": 0.06013812869787216, + "learning_rate": 5.4120347450380306e-05, + "loss": 0.2261, + "step": 37351 + }, + { + "epoch": 3.0259235255994814, + "grad_norm": 0.06590741127729416, + "learning_rate": 5.4115846797785686e-05, + "loss": 0.2639, + "step": 37352 + }, + { + "epoch": 3.02600453661698, + "grad_norm": 0.06299849599599838, + "learning_rate": 5.411134614519105e-05, + "loss": 0.2252, + "step": 37353 + }, + { + "epoch": 3.0260855476344783, + "grad_norm": 0.0614766888320446, + "learning_rate": 5.410684549259643e-05, + "loss": 0.2291, + "step": 37354 + }, + { + "epoch": 3.0261665586519766, + "grad_norm": 0.0771852657198906, + "learning_rate": 5.410234484000181e-05, + "loss": 0.219, + "step": 37355 + }, + { + "epoch": 3.026247569669475, + "grad_norm": 0.060940295457839966, + "learning_rate": 5.4097844187407174e-05, + "loss": 0.2142, + "step": 37356 + }, + { + "epoch": 3.0263285806869735, + "grad_norm": 0.0864911824464798, + "learning_rate": 5.409334353481255e-05, + "loss": 0.2632, + "step": 37357 + }, + { + "epoch": 3.0264095917044718, + "grad_norm": 0.0863729789853096, + "learning_rate": 5.408884288221793e-05, + "loss": 0.2591, + "step": 37358 + }, + { + "epoch": 3.02649060272197, + "grad_norm": 0.06721893697977066, + "learning_rate": 5.4084342229623295e-05, + "loss": 0.2522, + "step": 37359 + }, + { + "epoch": 3.0265716137394687, + "grad_norm": 0.0649418756365776, + "learning_rate": 5.407984157702867e-05, + "loss": 0.2273, + "step": 37360 + }, + { + "epoch": 3.026652624756967, + "grad_norm": 0.06797848641872406, + "learning_rate": 5.407534092443405e-05, + "loss": 0.2519, + "step": 37361 + }, + { + "epoch": 3.026733635774465, + "grad_norm": 0.08032295852899551, + "learning_rate": 5.4070840271839416e-05, + "loss": 0.2394, + "step": 37362 + }, + { + "epoch": 3.026814646791964, + "grad_norm": 0.05988214164972305, + "learning_rate": 5.406633961924479e-05, + "loss": 0.2148, + "step": 37363 + }, + { + "epoch": 3.026895657809462, + "grad_norm": 0.0621197335422039, + "learning_rate": 5.406183896665017e-05, + "loss": 0.2123, + "step": 37364 + }, + { + 
"epoch": 3.0269766688269604, + "grad_norm": 0.06993670016527176, + "learning_rate": 5.4057338314055536e-05, + "loss": 0.2174, + "step": 37365 + }, + { + "epoch": 3.027057679844459, + "grad_norm": 0.05763579532504082, + "learning_rate": 5.405283766146091e-05, + "loss": 0.187, + "step": 37366 + }, + { + "epoch": 3.0271386908619573, + "grad_norm": 0.06508929282426834, + "learning_rate": 5.404833700886629e-05, + "loss": 0.2239, + "step": 37367 + }, + { + "epoch": 3.0272197018794555, + "grad_norm": 0.06755175441503525, + "learning_rate": 5.404383635627166e-05, + "loss": 0.2611, + "step": 37368 + }, + { + "epoch": 3.027300712896954, + "grad_norm": 0.07040070742368698, + "learning_rate": 5.403933570367704e-05, + "loss": 0.2613, + "step": 37369 + }, + { + "epoch": 3.0273817239144525, + "grad_norm": 0.06353382021188736, + "learning_rate": 5.403483505108241e-05, + "loss": 0.2224, + "step": 37370 + }, + { + "epoch": 3.0274627349319507, + "grad_norm": 0.06055076792836189, + "learning_rate": 5.403033439848778e-05, + "loss": 0.2054, + "step": 37371 + }, + { + "epoch": 3.027543745949449, + "grad_norm": 0.06850864738225937, + "learning_rate": 5.4025833745893165e-05, + "loss": 0.2224, + "step": 37372 + }, + { + "epoch": 3.0276247569669477, + "grad_norm": 0.058965496718883514, + "learning_rate": 5.402133309329853e-05, + "loss": 0.199, + "step": 37373 + }, + { + "epoch": 3.027705767984446, + "grad_norm": 0.06645647436380386, + "learning_rate": 5.40168324407039e-05, + "loss": 0.2263, + "step": 37374 + }, + { + "epoch": 3.027786779001944, + "grad_norm": 0.07181558758020401, + "learning_rate": 5.4012331788109286e-05, + "loss": 0.2058, + "step": 37375 + }, + { + "epoch": 3.027867790019443, + "grad_norm": 0.06589259952306747, + "learning_rate": 5.400783113551465e-05, + "loss": 0.2199, + "step": 37376 + }, + { + "epoch": 3.027948801036941, + "grad_norm": 0.07510503381490707, + "learning_rate": 5.400333048292002e-05, + "loss": 0.2155, + "step": 37377 + }, + { + "epoch": 3.0280298120544393, + "grad_norm": 0.0718095675110817, + "learning_rate": 5.399882983032541e-05, + "loss": 0.2156, + "step": 37378 + }, + { + "epoch": 3.0281108230719376, + "grad_norm": 0.07855464518070221, + "learning_rate": 5.3994329177730774e-05, + "loss": 0.2282, + "step": 37379 + }, + { + "epoch": 3.0281918340894363, + "grad_norm": 0.0817565992474556, + "learning_rate": 5.398982852513614e-05, + "loss": 0.2462, + "step": 37380 + }, + { + "epoch": 3.0282728451069345, + "grad_norm": 0.06936581432819366, + "learning_rate": 5.398532787254153e-05, + "loss": 0.2508, + "step": 37381 + }, + { + "epoch": 3.0283538561244328, + "grad_norm": 0.07433851808309555, + "learning_rate": 5.3980827219946894e-05, + "loss": 0.25, + "step": 37382 + }, + { + "epoch": 3.0284348671419314, + "grad_norm": 0.06942424178123474, + "learning_rate": 5.397632656735226e-05, + "loss": 0.2481, + "step": 37383 + }, + { + "epoch": 3.0285158781594297, + "grad_norm": 0.07389133423566818, + "learning_rate": 5.397182591475765e-05, + "loss": 0.2411, + "step": 37384 + }, + { + "epoch": 3.028596889176928, + "grad_norm": 0.06642919778823853, + "learning_rate": 5.3967325262163015e-05, + "loss": 0.2291, + "step": 37385 + }, + { + "epoch": 3.0286779001944266, + "grad_norm": 0.09277532249689102, + "learning_rate": 5.396282460956838e-05, + "loss": 0.269, + "step": 37386 + }, + { + "epoch": 3.028758911211925, + "grad_norm": 0.07429872453212738, + "learning_rate": 5.395832395697377e-05, + "loss": 0.2613, + "step": 37387 + }, + { + "epoch": 3.028839922229423, + "grad_norm": 0.07359006255865097, + 
"learning_rate": 5.3953823304379136e-05, + "loss": 0.2138, + "step": 37388 + }, + { + "epoch": 3.028920933246922, + "grad_norm": 0.07269123941659927, + "learning_rate": 5.394932265178451e-05, + "loss": 0.251, + "step": 37389 + }, + { + "epoch": 3.02900194426442, + "grad_norm": 0.06176629289984703, + "learning_rate": 5.394482199918989e-05, + "loss": 0.2489, + "step": 37390 + }, + { + "epoch": 3.0290829552819183, + "grad_norm": 0.07285406440496445, + "learning_rate": 5.394032134659526e-05, + "loss": 0.2211, + "step": 37391 + }, + { + "epoch": 3.0291639662994165, + "grad_norm": 0.052691586315631866, + "learning_rate": 5.393582069400063e-05, + "loss": 0.2022, + "step": 37392 + }, + { + "epoch": 3.029244977316915, + "grad_norm": 0.058721382170915604, + "learning_rate": 5.393132004140601e-05, + "loss": 0.2314, + "step": 37393 + }, + { + "epoch": 3.0293259883344135, + "grad_norm": 0.06069071590900421, + "learning_rate": 5.392681938881138e-05, + "loss": 0.1991, + "step": 37394 + }, + { + "epoch": 3.0294069993519117, + "grad_norm": 0.058969639241695404, + "learning_rate": 5.392231873621676e-05, + "loss": 0.2395, + "step": 37395 + }, + { + "epoch": 3.0294880103694104, + "grad_norm": 0.07969574630260468, + "learning_rate": 5.391781808362213e-05, + "loss": 0.2136, + "step": 37396 + }, + { + "epoch": 3.0295690213869086, + "grad_norm": 0.07085856050252914, + "learning_rate": 5.39133174310275e-05, + "loss": 0.2667, + "step": 37397 + }, + { + "epoch": 3.029650032404407, + "grad_norm": 0.07769621908664703, + "learning_rate": 5.390881677843288e-05, + "loss": 0.2324, + "step": 37398 + }, + { + "epoch": 3.0297310434219056, + "grad_norm": 0.07305508852005005, + "learning_rate": 5.390431612583825e-05, + "loss": 0.2091, + "step": 37399 + }, + { + "epoch": 3.029812054439404, + "grad_norm": 0.07000664621591568, + "learning_rate": 5.389981547324362e-05, + "loss": 0.204, + "step": 37400 + }, + { + "epoch": 3.029893065456902, + "grad_norm": 0.06918566673994064, + "learning_rate": 5.3895314820649e-05, + "loss": 0.2249, + "step": 37401 + }, + { + "epoch": 3.0299740764744003, + "grad_norm": 0.06122610345482826, + "learning_rate": 5.389081416805437e-05, + "loss": 0.2038, + "step": 37402 + }, + { + "epoch": 3.030055087491899, + "grad_norm": 0.056917302310466766, + "learning_rate": 5.388631351545974e-05, + "loss": 0.2111, + "step": 37403 + }, + { + "epoch": 3.0301360985093972, + "grad_norm": 0.08675568550825119, + "learning_rate": 5.388181286286512e-05, + "loss": 0.2416, + "step": 37404 + }, + { + "epoch": 3.0302171095268955, + "grad_norm": 0.08074220269918442, + "learning_rate": 5.3877312210270494e-05, + "loss": 0.2445, + "step": 37405 + }, + { + "epoch": 3.030298120544394, + "grad_norm": 0.06495244055986404, + "learning_rate": 5.387281155767586e-05, + "loss": 0.2734, + "step": 37406 + }, + { + "epoch": 3.0303791315618924, + "grad_norm": 0.07835765928030014, + "learning_rate": 5.386831090508124e-05, + "loss": 0.239, + "step": 37407 + }, + { + "epoch": 3.0304601425793907, + "grad_norm": 0.07907546311616898, + "learning_rate": 5.3863810252486615e-05, + "loss": 0.2104, + "step": 37408 + }, + { + "epoch": 3.0305411535968894, + "grad_norm": 0.06553105264902115, + "learning_rate": 5.385930959989198e-05, + "loss": 0.2213, + "step": 37409 + }, + { + "epoch": 3.0306221646143876, + "grad_norm": 0.08010930567979813, + "learning_rate": 5.385480894729736e-05, + "loss": 0.2438, + "step": 37410 + }, + { + "epoch": 3.030703175631886, + "grad_norm": 0.06734942644834518, + "learning_rate": 5.3850308294702736e-05, + "loss": 0.2651, + 
"step": 37411 + }, + { + "epoch": 3.0307841866493845, + "grad_norm": 0.07775172591209412, + "learning_rate": 5.38458076421081e-05, + "loss": 0.2688, + "step": 37412 + }, + { + "epoch": 3.030865197666883, + "grad_norm": 0.08631449192762375, + "learning_rate": 5.384130698951348e-05, + "loss": 0.2444, + "step": 37413 + }, + { + "epoch": 3.030946208684381, + "grad_norm": 0.0761285126209259, + "learning_rate": 5.3836806336918856e-05, + "loss": 0.2356, + "step": 37414 + }, + { + "epoch": 3.0310272197018793, + "grad_norm": 0.06520287692546844, + "learning_rate": 5.383230568432422e-05, + "loss": 0.2142, + "step": 37415 + }, + { + "epoch": 3.031108230719378, + "grad_norm": 0.08522919565439224, + "learning_rate": 5.3827805031729604e-05, + "loss": 0.2504, + "step": 37416 + }, + { + "epoch": 3.031189241736876, + "grad_norm": 0.07640458643436432, + "learning_rate": 5.382330437913498e-05, + "loss": 0.2308, + "step": 37417 + }, + { + "epoch": 3.0312702527543745, + "grad_norm": 0.0752745121717453, + "learning_rate": 5.3818803726540344e-05, + "loss": 0.2541, + "step": 37418 + }, + { + "epoch": 3.031351263771873, + "grad_norm": 0.06180490925908089, + "learning_rate": 5.3814303073945725e-05, + "loss": 0.2418, + "step": 37419 + }, + { + "epoch": 3.0314322747893714, + "grad_norm": 0.07486403733491898, + "learning_rate": 5.38098024213511e-05, + "loss": 0.1804, + "step": 37420 + }, + { + "epoch": 3.0315132858068696, + "grad_norm": 0.06396470218896866, + "learning_rate": 5.380530176875648e-05, + "loss": 0.2823, + "step": 37421 + }, + { + "epoch": 3.0315942968243683, + "grad_norm": 0.08682464808225632, + "learning_rate": 5.3800801116161845e-05, + "loss": 0.3189, + "step": 37422 + }, + { + "epoch": 3.0316753078418666, + "grad_norm": 0.06351017951965332, + "learning_rate": 5.379630046356722e-05, + "loss": 0.2396, + "step": 37423 + }, + { + "epoch": 3.031756318859365, + "grad_norm": 0.07645206898450851, + "learning_rate": 5.37917998109726e-05, + "loss": 0.1973, + "step": 37424 + }, + { + "epoch": 3.031837329876863, + "grad_norm": 0.07359036803245544, + "learning_rate": 5.3787299158377966e-05, + "loss": 0.2536, + "step": 37425 + }, + { + "epoch": 3.0319183408943617, + "grad_norm": 0.0697692409157753, + "learning_rate": 5.378279850578334e-05, + "loss": 0.2437, + "step": 37426 + }, + { + "epoch": 3.03199935191186, + "grad_norm": 0.056899294257164, + "learning_rate": 5.377829785318872e-05, + "loss": 0.203, + "step": 37427 + }, + { + "epoch": 3.0320803629293582, + "grad_norm": 0.06358642131090164, + "learning_rate": 5.377379720059409e-05, + "loss": 0.2269, + "step": 37428 + }, + { + "epoch": 3.032161373946857, + "grad_norm": 0.08034027367830276, + "learning_rate": 5.376929654799946e-05, + "loss": 0.2828, + "step": 37429 + }, + { + "epoch": 3.032242384964355, + "grad_norm": 0.08328770101070404, + "learning_rate": 5.376479589540484e-05, + "loss": 0.2514, + "step": 37430 + }, + { + "epoch": 3.0323233959818534, + "grad_norm": 0.07368003576993942, + "learning_rate": 5.376029524281021e-05, + "loss": 0.2088, + "step": 37431 + }, + { + "epoch": 3.032404406999352, + "grad_norm": 0.054242219775915146, + "learning_rate": 5.375579459021558e-05, + "loss": 0.2459, + "step": 37432 + }, + { + "epoch": 3.0324854180168503, + "grad_norm": 0.061121974140405655, + "learning_rate": 5.375129393762096e-05, + "loss": 0.2175, + "step": 37433 + }, + { + "epoch": 3.0325664290343486, + "grad_norm": 0.07788266241550446, + "learning_rate": 5.374679328502633e-05, + "loss": 0.245, + "step": 37434 + }, + { + "epoch": 3.0326474400518473, + "grad_norm": 
0.07702527195215225, + "learning_rate": 5.37422926324317e-05, + "loss": 0.2215, + "step": 37435 + }, + { + "epoch": 3.0327284510693455, + "grad_norm": 0.06540277600288391, + "learning_rate": 5.373779197983708e-05, + "loss": 0.2075, + "step": 37436 + }, + { + "epoch": 3.0328094620868438, + "grad_norm": 0.09210977703332901, + "learning_rate": 5.373329132724245e-05, + "loss": 0.2786, + "step": 37437 + }, + { + "epoch": 3.032890473104342, + "grad_norm": 0.06716837733983994, + "learning_rate": 5.372879067464782e-05, + "loss": 0.2514, + "step": 37438 + }, + { + "epoch": 3.0329714841218407, + "grad_norm": 0.0809812918305397, + "learning_rate": 5.37242900220532e-05, + "loss": 0.2673, + "step": 37439 + }, + { + "epoch": 3.033052495139339, + "grad_norm": 0.08728468418121338, + "learning_rate": 5.371978936945857e-05, + "loss": 0.2203, + "step": 37440 + }, + { + "epoch": 3.033133506156837, + "grad_norm": 0.08095324039459229, + "learning_rate": 5.3715288716863944e-05, + "loss": 0.2107, + "step": 37441 + }, + { + "epoch": 3.033214517174336, + "grad_norm": 0.0658116266131401, + "learning_rate": 5.3710788064269324e-05, + "loss": 0.2465, + "step": 37442 + }, + { + "epoch": 3.033295528191834, + "grad_norm": 0.07490494102239609, + "learning_rate": 5.370628741167469e-05, + "loss": 0.215, + "step": 37443 + }, + { + "epoch": 3.0333765392093324, + "grad_norm": 0.06612545251846313, + "learning_rate": 5.3701786759080065e-05, + "loss": 0.2258, + "step": 37444 + }, + { + "epoch": 3.033457550226831, + "grad_norm": 0.06559719145298004, + "learning_rate": 5.3697286106485445e-05, + "loss": 0.2329, + "step": 37445 + }, + { + "epoch": 3.0335385612443293, + "grad_norm": 0.08513180166482925, + "learning_rate": 5.369278545389081e-05, + "loss": 0.2581, + "step": 37446 + }, + { + "epoch": 3.0336195722618275, + "grad_norm": 0.0750497505068779, + "learning_rate": 5.36882848012962e-05, + "loss": 0.1976, + "step": 37447 + }, + { + "epoch": 3.033700583279326, + "grad_norm": 0.06607310473918915, + "learning_rate": 5.3683784148701566e-05, + "loss": 0.2383, + "step": 37448 + }, + { + "epoch": 3.0337815942968245, + "grad_norm": 0.06415095180273056, + "learning_rate": 5.367928349610693e-05, + "loss": 0.1751, + "step": 37449 + }, + { + "epoch": 3.0338626053143227, + "grad_norm": 0.07550282776355743, + "learning_rate": 5.367478284351232e-05, + "loss": 0.2309, + "step": 37450 + }, + { + "epoch": 3.033943616331821, + "grad_norm": 0.06409140676259995, + "learning_rate": 5.3670282190917687e-05, + "loss": 0.2014, + "step": 37451 + }, + { + "epoch": 3.0340246273493197, + "grad_norm": 0.06622499972581863, + "learning_rate": 5.3665781538323053e-05, + "loss": 0.2573, + "step": 37452 + }, + { + "epoch": 3.034105638366818, + "grad_norm": 0.06955836713314056, + "learning_rate": 5.366128088572844e-05, + "loss": 0.2423, + "step": 37453 + }, + { + "epoch": 3.034186649384316, + "grad_norm": 0.057708337903022766, + "learning_rate": 5.365678023313381e-05, + "loss": 0.1841, + "step": 37454 + }, + { + "epoch": 3.034267660401815, + "grad_norm": 0.06644859164953232, + "learning_rate": 5.3652279580539174e-05, + "loss": 0.2402, + "step": 37455 + }, + { + "epoch": 3.034348671419313, + "grad_norm": 0.06704627722501755, + "learning_rate": 5.364777892794456e-05, + "loss": 0.2369, + "step": 37456 + }, + { + "epoch": 3.0344296824368113, + "grad_norm": 0.07295235246419907, + "learning_rate": 5.364327827534993e-05, + "loss": 0.2788, + "step": 37457 + }, + { + "epoch": 3.0345106934543096, + "grad_norm": 0.08905566483736038, + "learning_rate": 5.36387776227553e-05, + 
"loss": 0.2241, + "step": 37458 + }, + { + "epoch": 3.0345917044718083, + "grad_norm": 0.07346481084823608, + "learning_rate": 5.363427697016068e-05, + "loss": 0.2377, + "step": 37459 + }, + { + "epoch": 3.0346727154893065, + "grad_norm": 0.07479727268218994, + "learning_rate": 5.362977631756605e-05, + "loss": 0.2355, + "step": 37460 + }, + { + "epoch": 3.0347537265068047, + "grad_norm": 0.06769086420536041, + "learning_rate": 5.362527566497142e-05, + "loss": 0.2046, + "step": 37461 + }, + { + "epoch": 3.0348347375243034, + "grad_norm": 0.060751911252737045, + "learning_rate": 5.36207750123768e-05, + "loss": 0.2178, + "step": 37462 + }, + { + "epoch": 3.0349157485418017, + "grad_norm": 0.07391112297773361, + "learning_rate": 5.361627435978217e-05, + "loss": 0.2547, + "step": 37463 + }, + { + "epoch": 3.0349967595593, + "grad_norm": 0.0735718160867691, + "learning_rate": 5.3611773707187543e-05, + "loss": 0.2025, + "step": 37464 + }, + { + "epoch": 3.0350777705767986, + "grad_norm": 0.07991587370634079, + "learning_rate": 5.3607273054592924e-05, + "loss": 0.2145, + "step": 37465 + }, + { + "epoch": 3.035158781594297, + "grad_norm": 0.07211611419916153, + "learning_rate": 5.360277240199829e-05, + "loss": 0.2086, + "step": 37466 + }, + { + "epoch": 3.035239792611795, + "grad_norm": 0.0736667737364769, + "learning_rate": 5.3598271749403664e-05, + "loss": 0.2063, + "step": 37467 + }, + { + "epoch": 3.035320803629294, + "grad_norm": 0.07628808170557022, + "learning_rate": 5.3593771096809045e-05, + "loss": 0.226, + "step": 37468 + }, + { + "epoch": 3.035401814646792, + "grad_norm": 0.06351827830076218, + "learning_rate": 5.358927044421441e-05, + "loss": 0.2271, + "step": 37469 + }, + { + "epoch": 3.0354828256642903, + "grad_norm": 0.06043693795800209, + "learning_rate": 5.3584769791619785e-05, + "loss": 0.2059, + "step": 37470 + }, + { + "epoch": 3.0355638366817885, + "grad_norm": 0.06572989374399185, + "learning_rate": 5.3580269139025165e-05, + "loss": 0.2441, + "step": 37471 + }, + { + "epoch": 3.035644847699287, + "grad_norm": 0.07039684802293777, + "learning_rate": 5.357576848643053e-05, + "loss": 0.2481, + "step": 37472 + }, + { + "epoch": 3.0357258587167855, + "grad_norm": 0.06588947027921677, + "learning_rate": 5.357126783383591e-05, + "loss": 0.2329, + "step": 37473 + }, + { + "epoch": 3.0358068697342837, + "grad_norm": 0.057079948484897614, + "learning_rate": 5.3566767181241286e-05, + "loss": 0.181, + "step": 37474 + }, + { + "epoch": 3.0358878807517824, + "grad_norm": 0.05848081409931183, + "learning_rate": 5.356226652864665e-05, + "loss": 0.2016, + "step": 37475 + }, + { + "epoch": 3.0359688917692806, + "grad_norm": 0.06334573030471802, + "learning_rate": 5.3557765876052033e-05, + "loss": 0.2294, + "step": 37476 + }, + { + "epoch": 3.036049902786779, + "grad_norm": 0.07205773144960403, + "learning_rate": 5.355326522345741e-05, + "loss": 0.2109, + "step": 37477 + }, + { + "epoch": 3.0361309138042776, + "grad_norm": 0.07480573654174805, + "learning_rate": 5.3548764570862774e-05, + "loss": 0.2309, + "step": 37478 + }, + { + "epoch": 3.036211924821776, + "grad_norm": 0.050419967621564865, + "learning_rate": 5.3544263918268154e-05, + "loss": 0.1819, + "step": 37479 + }, + { + "epoch": 3.036292935839274, + "grad_norm": 0.05797084793448448, + "learning_rate": 5.353976326567353e-05, + "loss": 0.199, + "step": 37480 + }, + { + "epoch": 3.0363739468567723, + "grad_norm": 0.0698668509721756, + "learning_rate": 5.3535262613078895e-05, + "loss": 0.2142, + "step": 37481 + }, + { + "epoch": 
3.036454957874271, + "grad_norm": 0.07098384946584702, + "learning_rate": 5.3530761960484275e-05, + "loss": 0.2571, + "step": 37482 + }, + { + "epoch": 3.0365359688917692, + "grad_norm": 0.06307261437177658, + "learning_rate": 5.352626130788965e-05, + "loss": 0.2419, + "step": 37483 + }, + { + "epoch": 3.0366169799092675, + "grad_norm": 0.07396720349788666, + "learning_rate": 5.3521760655295015e-05, + "loss": 0.2252, + "step": 37484 + }, + { + "epoch": 3.036697990926766, + "grad_norm": 0.07606613636016846, + "learning_rate": 5.3517260002700396e-05, + "loss": 0.212, + "step": 37485 + }, + { + "epoch": 3.0367790019442644, + "grad_norm": 0.07393737137317657, + "learning_rate": 5.351275935010577e-05, + "loss": 0.2213, + "step": 37486 + }, + { + "epoch": 3.0368600129617627, + "grad_norm": 0.07436393201351166, + "learning_rate": 5.3508258697511136e-05, + "loss": 0.257, + "step": 37487 + }, + { + "epoch": 3.0369410239792614, + "grad_norm": 0.0729961097240448, + "learning_rate": 5.350375804491652e-05, + "loss": 0.2238, + "step": 37488 + }, + { + "epoch": 3.0370220349967596, + "grad_norm": 0.08112277835607529, + "learning_rate": 5.349925739232189e-05, + "loss": 0.2837, + "step": 37489 + }, + { + "epoch": 3.037103046014258, + "grad_norm": 0.05937008932232857, + "learning_rate": 5.349475673972726e-05, + "loss": 0.2072, + "step": 37490 + }, + { + "epoch": 3.0371840570317565, + "grad_norm": 0.07106920331716537, + "learning_rate": 5.349025608713264e-05, + "loss": 0.2089, + "step": 37491 + }, + { + "epoch": 3.037265068049255, + "grad_norm": 0.05964120104908943, + "learning_rate": 5.348575543453801e-05, + "loss": 0.2145, + "step": 37492 + }, + { + "epoch": 3.037346079066753, + "grad_norm": 0.0652584359049797, + "learning_rate": 5.348125478194338e-05, + "loss": 0.2002, + "step": 37493 + }, + { + "epoch": 3.0374270900842513, + "grad_norm": 0.08113518357276917, + "learning_rate": 5.347675412934876e-05, + "loss": 0.2559, + "step": 37494 + }, + { + "epoch": 3.03750810110175, + "grad_norm": 0.06260479241609573, + "learning_rate": 5.347225347675413e-05, + "loss": 0.2166, + "step": 37495 + }, + { + "epoch": 3.037589112119248, + "grad_norm": 0.0738939419388771, + "learning_rate": 5.34677528241595e-05, + "loss": 0.2316, + "step": 37496 + }, + { + "epoch": 3.0376701231367464, + "grad_norm": 0.08688575029373169, + "learning_rate": 5.346325217156488e-05, + "loss": 0.1926, + "step": 37497 + }, + { + "epoch": 3.037751134154245, + "grad_norm": 0.09045497328042984, + "learning_rate": 5.345875151897025e-05, + "loss": 0.2259, + "step": 37498 + }, + { + "epoch": 3.0378321451717434, + "grad_norm": 0.07186606526374817, + "learning_rate": 5.345425086637563e-05, + "loss": 0.2487, + "step": 37499 + }, + { + "epoch": 3.0379131561892416, + "grad_norm": 0.09282194077968597, + "learning_rate": 5.3449750213781e-05, + "loss": 0.2408, + "step": 37500 + }, + { + "epoch": 3.0379941672067403, + "grad_norm": 0.06380818784236908, + "learning_rate": 5.3445249561186374e-05, + "loss": 0.1914, + "step": 37501 + }, + { + "epoch": 3.0380751782242386, + "grad_norm": 0.0604427196085453, + "learning_rate": 5.3440748908591754e-05, + "loss": 0.1788, + "step": 37502 + }, + { + "epoch": 3.038156189241737, + "grad_norm": 0.07687865197658539, + "learning_rate": 5.343624825599712e-05, + "loss": 0.1917, + "step": 37503 + }, + { + "epoch": 3.038237200259235, + "grad_norm": 0.07873617112636566, + "learning_rate": 5.3431747603402494e-05, + "loss": 0.2192, + "step": 37504 + }, + { + "epoch": 3.0383182112767337, + "grad_norm": 0.07436539977788925, + 
"learning_rate": 5.3427246950807875e-05, + "loss": 0.2413, + "step": 37505 + }, + { + "epoch": 3.038399222294232, + "grad_norm": 0.07715284079313278, + "learning_rate": 5.342274629821324e-05, + "loss": 0.2259, + "step": 37506 + }, + { + "epoch": 3.0384802333117302, + "grad_norm": 0.06521465629339218, + "learning_rate": 5.3418245645618615e-05, + "loss": 0.2354, + "step": 37507 + }, + { + "epoch": 3.038561244329229, + "grad_norm": 0.0683523640036583, + "learning_rate": 5.3413744993023996e-05, + "loss": 0.1877, + "step": 37508 + }, + { + "epoch": 3.038642255346727, + "grad_norm": 0.09436856210231781, + "learning_rate": 5.340924434042936e-05, + "loss": 0.2488, + "step": 37509 + }, + { + "epoch": 3.0387232663642254, + "grad_norm": 0.07124968618154526, + "learning_rate": 5.3404743687834736e-05, + "loss": 0.209, + "step": 37510 + }, + { + "epoch": 3.038804277381724, + "grad_norm": 0.07376158237457275, + "learning_rate": 5.3400243035240116e-05, + "loss": 0.2305, + "step": 37511 + }, + { + "epoch": 3.0388852883992223, + "grad_norm": 0.07440029084682465, + "learning_rate": 5.339574238264548e-05, + "loss": 0.2344, + "step": 37512 + }, + { + "epoch": 3.0389662994167206, + "grad_norm": 0.07803120464086533, + "learning_rate": 5.339124173005086e-05, + "loss": 0.2568, + "step": 37513 + }, + { + "epoch": 3.039047310434219, + "grad_norm": 0.08405184745788574, + "learning_rate": 5.338674107745624e-05, + "loss": 0.2262, + "step": 37514 + }, + { + "epoch": 3.0391283214517175, + "grad_norm": 0.08579245954751968, + "learning_rate": 5.3382240424861604e-05, + "loss": 0.2317, + "step": 37515 + }, + { + "epoch": 3.0392093324692158, + "grad_norm": 0.0725676491856575, + "learning_rate": 5.337773977226698e-05, + "loss": 0.2415, + "step": 37516 + }, + { + "epoch": 3.039290343486714, + "grad_norm": 0.0620294027030468, + "learning_rate": 5.337323911967236e-05, + "loss": 0.233, + "step": 37517 + }, + { + "epoch": 3.0393713545042127, + "grad_norm": 0.06782814115285873, + "learning_rate": 5.3368738467077725e-05, + "loss": 0.2392, + "step": 37518 + }, + { + "epoch": 3.039452365521711, + "grad_norm": 0.06763561815023422, + "learning_rate": 5.33642378144831e-05, + "loss": 0.2074, + "step": 37519 + }, + { + "epoch": 3.039533376539209, + "grad_norm": 0.06067238375544548, + "learning_rate": 5.335973716188848e-05, + "loss": 0.2242, + "step": 37520 + }, + { + "epoch": 3.039614387556708, + "grad_norm": 0.06123970076441765, + "learning_rate": 5.3355236509293846e-05, + "loss": 0.2208, + "step": 37521 + }, + { + "epoch": 3.039695398574206, + "grad_norm": 0.07573569566011429, + "learning_rate": 5.335073585669922e-05, + "loss": 0.2613, + "step": 37522 + }, + { + "epoch": 3.0397764095917044, + "grad_norm": 0.09033697843551636, + "learning_rate": 5.33462352041046e-05, + "loss": 0.1972, + "step": 37523 + }, + { + "epoch": 3.039857420609203, + "grad_norm": 0.07207215577363968, + "learning_rate": 5.3341734551509966e-05, + "loss": 0.2466, + "step": 37524 + }, + { + "epoch": 3.0399384316267013, + "grad_norm": 0.08466987311840057, + "learning_rate": 5.3337233898915354e-05, + "loss": 0.2555, + "step": 37525 + }, + { + "epoch": 3.0400194426441995, + "grad_norm": 0.06799205392599106, + "learning_rate": 5.333273324632072e-05, + "loss": 0.2397, + "step": 37526 + }, + { + "epoch": 3.040100453661698, + "grad_norm": 0.08047273755073547, + "learning_rate": 5.3328232593726094e-05, + "loss": 0.2083, + "step": 37527 + }, + { + "epoch": 3.0401814646791965, + "grad_norm": 0.07457821071147919, + "learning_rate": 5.3323731941131474e-05, + "loss": 0.2328, + 
"step": 37528 + }, + { + "epoch": 3.0402624756966947, + "grad_norm": 0.06416770815849304, + "learning_rate": 5.331923128853684e-05, + "loss": 0.2524, + "step": 37529 + }, + { + "epoch": 3.040343486714193, + "grad_norm": 0.08857259899377823, + "learning_rate": 5.3314730635942215e-05, + "loss": 0.1908, + "step": 37530 + }, + { + "epoch": 3.0404244977316917, + "grad_norm": 0.06999360769987106, + "learning_rate": 5.3310229983347595e-05, + "loss": 0.2341, + "step": 37531 + }, + { + "epoch": 3.04050550874919, + "grad_norm": 0.08685637265443802, + "learning_rate": 5.330572933075296e-05, + "loss": 0.2426, + "step": 37532 + }, + { + "epoch": 3.040586519766688, + "grad_norm": 0.061902157962322235, + "learning_rate": 5.3301228678158336e-05, + "loss": 0.1998, + "step": 37533 + }, + { + "epoch": 3.040667530784187, + "grad_norm": 0.0677398070693016, + "learning_rate": 5.3296728025563716e-05, + "loss": 0.2147, + "step": 37534 + }, + { + "epoch": 3.040748541801685, + "grad_norm": 0.07476639747619629, + "learning_rate": 5.329222737296908e-05, + "loss": 0.2109, + "step": 37535 + }, + { + "epoch": 3.0408295528191833, + "grad_norm": 0.06877736747264862, + "learning_rate": 5.3287726720374456e-05, + "loss": 0.1989, + "step": 37536 + }, + { + "epoch": 3.0409105638366816, + "grad_norm": 0.0667649358510971, + "learning_rate": 5.328322606777984e-05, + "loss": 0.1968, + "step": 37537 + }, + { + "epoch": 3.0409915748541803, + "grad_norm": 0.06484334915876389, + "learning_rate": 5.3278725415185204e-05, + "loss": 0.1927, + "step": 37538 + }, + { + "epoch": 3.0410725858716785, + "grad_norm": 0.0637441873550415, + "learning_rate": 5.327422476259058e-05, + "loss": 0.2431, + "step": 37539 + }, + { + "epoch": 3.0411535968891767, + "grad_norm": 0.06637068092823029, + "learning_rate": 5.326972410999596e-05, + "loss": 0.2477, + "step": 37540 + }, + { + "epoch": 3.0412346079066754, + "grad_norm": 0.07331857830286026, + "learning_rate": 5.3265223457401324e-05, + "loss": 0.2287, + "step": 37541 + }, + { + "epoch": 3.0413156189241737, + "grad_norm": 0.06472773104906082, + "learning_rate": 5.32607228048067e-05, + "loss": 0.2434, + "step": 37542 + }, + { + "epoch": 3.041396629941672, + "grad_norm": 0.061671916395425797, + "learning_rate": 5.325622215221208e-05, + "loss": 0.2499, + "step": 37543 + }, + { + "epoch": 3.0414776409591706, + "grad_norm": 0.07631280273199081, + "learning_rate": 5.3251721499617445e-05, + "loss": 0.2362, + "step": 37544 + }, + { + "epoch": 3.041558651976669, + "grad_norm": 0.07066506147384644, + "learning_rate": 5.324722084702282e-05, + "loss": 0.1933, + "step": 37545 + }, + { + "epoch": 3.041639662994167, + "grad_norm": 0.07721670717000961, + "learning_rate": 5.32427201944282e-05, + "loss": 0.2023, + "step": 37546 + }, + { + "epoch": 3.041720674011666, + "grad_norm": 0.0790068730711937, + "learning_rate": 5.3238219541833566e-05, + "loss": 0.2633, + "step": 37547 + }, + { + "epoch": 3.041801685029164, + "grad_norm": 0.06365988403558731, + "learning_rate": 5.323371888923894e-05, + "loss": 0.2293, + "step": 37548 + }, + { + "epoch": 3.0418826960466623, + "grad_norm": 0.07861355692148209, + "learning_rate": 5.322921823664432e-05, + "loss": 0.2473, + "step": 37549 + }, + { + "epoch": 3.0419637070641605, + "grad_norm": 0.07868161052465439, + "learning_rate": 5.322471758404969e-05, + "loss": 0.2582, + "step": 37550 + }, + { + "epoch": 3.042044718081659, + "grad_norm": 0.05878037214279175, + "learning_rate": 5.322021693145506e-05, + "loss": 0.2121, + "step": 37551 + }, + { + "epoch": 3.0421257290991575, + 
"grad_norm": 0.06938491761684418, + "learning_rate": 5.321571627886044e-05, + "loss": 0.2811, + "step": 37552 + }, + { + "epoch": 3.0422067401166557, + "grad_norm": 0.06721276044845581, + "learning_rate": 5.321121562626581e-05, + "loss": 0.2572, + "step": 37553 + }, + { + "epoch": 3.0422877511341544, + "grad_norm": 0.08425439894199371, + "learning_rate": 5.320671497367119e-05, + "loss": 0.2336, + "step": 37554 + }, + { + "epoch": 3.0423687621516526, + "grad_norm": 0.07287422567605972, + "learning_rate": 5.320221432107656e-05, + "loss": 0.2144, + "step": 37555 + }, + { + "epoch": 3.042449773169151, + "grad_norm": 0.07077626138925552, + "learning_rate": 5.319771366848193e-05, + "loss": 0.2239, + "step": 37556 + }, + { + "epoch": 3.0425307841866496, + "grad_norm": 0.07828255742788315, + "learning_rate": 5.319321301588731e-05, + "loss": 0.2138, + "step": 37557 + }, + { + "epoch": 3.042611795204148, + "grad_norm": 0.062211714684963226, + "learning_rate": 5.318871236329268e-05, + "loss": 0.2208, + "step": 37558 + }, + { + "epoch": 3.042692806221646, + "grad_norm": 0.057104844599962234, + "learning_rate": 5.318421171069805e-05, + "loss": 0.2069, + "step": 37559 + }, + { + "epoch": 3.0427738172391443, + "grad_norm": 0.07099134474992752, + "learning_rate": 5.317971105810343e-05, + "loss": 0.2105, + "step": 37560 + }, + { + "epoch": 3.042854828256643, + "grad_norm": 0.06640342622995377, + "learning_rate": 5.31752104055088e-05, + "loss": 0.2627, + "step": 37561 + }, + { + "epoch": 3.0429358392741412, + "grad_norm": 0.07271246612071991, + "learning_rate": 5.317070975291417e-05, + "loss": 0.2299, + "step": 37562 + }, + { + "epoch": 3.0430168502916395, + "grad_norm": 0.07936045527458191, + "learning_rate": 5.316620910031955e-05, + "loss": 0.2154, + "step": 37563 + }, + { + "epoch": 3.043097861309138, + "grad_norm": 0.07012918591499329, + "learning_rate": 5.3161708447724924e-05, + "loss": 0.2266, + "step": 37564 + }, + { + "epoch": 3.0431788723266364, + "grad_norm": 0.07440099865198135, + "learning_rate": 5.315720779513029e-05, + "loss": 0.2014, + "step": 37565 + }, + { + "epoch": 3.0432598833441347, + "grad_norm": 0.07617049664258957, + "learning_rate": 5.315270714253567e-05, + "loss": 0.2135, + "step": 37566 + }, + { + "epoch": 3.0433408943616334, + "grad_norm": 0.0808364674448967, + "learning_rate": 5.3148206489941045e-05, + "loss": 0.252, + "step": 37567 + }, + { + "epoch": 3.0434219053791316, + "grad_norm": 0.07821191102266312, + "learning_rate": 5.314370583734641e-05, + "loss": 0.2414, + "step": 37568 + }, + { + "epoch": 3.04350291639663, + "grad_norm": 0.07060352712869644, + "learning_rate": 5.313920518475179e-05, + "loss": 0.2288, + "step": 37569 + }, + { + "epoch": 3.0435839274141285, + "grad_norm": 0.09167982637882233, + "learning_rate": 5.3134704532157166e-05, + "loss": 0.2358, + "step": 37570 + }, + { + "epoch": 3.0436649384316268, + "grad_norm": 0.06757145375013351, + "learning_rate": 5.313020387956253e-05, + "loss": 0.223, + "step": 37571 + }, + { + "epoch": 3.043745949449125, + "grad_norm": 0.06062881648540497, + "learning_rate": 5.312570322696791e-05, + "loss": 0.2076, + "step": 37572 + }, + { + "epoch": 3.0438269604666233, + "grad_norm": 0.06633229553699493, + "learning_rate": 5.3121202574373287e-05, + "loss": 0.2267, + "step": 37573 + }, + { + "epoch": 3.043907971484122, + "grad_norm": 0.0841381773352623, + "learning_rate": 5.311670192177865e-05, + "loss": 0.2821, + "step": 37574 + }, + { + "epoch": 3.04398898250162, + "grad_norm": 0.07296059280633926, + "learning_rate": 
5.3112201269184034e-05, + "loss": 0.2326, + "step": 37575 + }, + { + "epoch": 3.0440699935191184, + "grad_norm": 0.05896758660674095, + "learning_rate": 5.310770061658941e-05, + "loss": 0.2521, + "step": 37576 + }, + { + "epoch": 3.044151004536617, + "grad_norm": 0.07654000073671341, + "learning_rate": 5.3103199963994774e-05, + "loss": 0.2348, + "step": 37577 + }, + { + "epoch": 3.0442320155541154, + "grad_norm": 0.06925518065690994, + "learning_rate": 5.3098699311400155e-05, + "loss": 0.2029, + "step": 37578 + }, + { + "epoch": 3.0443130265716136, + "grad_norm": 0.06512514501810074, + "learning_rate": 5.309419865880553e-05, + "loss": 0.244, + "step": 37579 + }, + { + "epoch": 3.0443940375891123, + "grad_norm": 0.06901316344738007, + "learning_rate": 5.308969800621091e-05, + "loss": 0.2376, + "step": 37580 + }, + { + "epoch": 3.0444750486066106, + "grad_norm": 0.08202358335256577, + "learning_rate": 5.3085197353616275e-05, + "loss": 0.2596, + "step": 37581 + }, + { + "epoch": 3.044556059624109, + "grad_norm": 0.07049586623907089, + "learning_rate": 5.308069670102165e-05, + "loss": 0.2407, + "step": 37582 + }, + { + "epoch": 3.044637070641607, + "grad_norm": 0.06545942276716232, + "learning_rate": 5.307619604842703e-05, + "loss": 0.2347, + "step": 37583 + }, + { + "epoch": 3.0447180816591057, + "grad_norm": 0.06013672426342964, + "learning_rate": 5.3071695395832396e-05, + "loss": 0.209, + "step": 37584 + }, + { + "epoch": 3.044799092676604, + "grad_norm": 0.06841271370649338, + "learning_rate": 5.306719474323777e-05, + "loss": 0.1978, + "step": 37585 + }, + { + "epoch": 3.0448801036941022, + "grad_norm": 0.07165801525115967, + "learning_rate": 5.306269409064315e-05, + "loss": 0.241, + "step": 37586 + }, + { + "epoch": 3.044961114711601, + "grad_norm": 0.0669749528169632, + "learning_rate": 5.305819343804852e-05, + "loss": 0.2372, + "step": 37587 + }, + { + "epoch": 3.045042125729099, + "grad_norm": 0.07862003892660141, + "learning_rate": 5.305369278545389e-05, + "loss": 0.2322, + "step": 37588 + }, + { + "epoch": 3.0451231367465974, + "grad_norm": 0.06369896978139877, + "learning_rate": 5.304919213285927e-05, + "loss": 0.2009, + "step": 37589 + }, + { + "epoch": 3.045204147764096, + "grad_norm": 0.05961852893233299, + "learning_rate": 5.304469148026464e-05, + "loss": 0.2006, + "step": 37590 + }, + { + "epoch": 3.0452851587815943, + "grad_norm": 0.0706590786576271, + "learning_rate": 5.304019082767001e-05, + "loss": 0.2272, + "step": 37591 + }, + { + "epoch": 3.0453661697990926, + "grad_norm": 0.06891553848981857, + "learning_rate": 5.303569017507539e-05, + "loss": 0.2257, + "step": 37592 + }, + { + "epoch": 3.0454471808165913, + "grad_norm": 0.07027577608823776, + "learning_rate": 5.3031189522480765e-05, + "loss": 0.2522, + "step": 37593 + }, + { + "epoch": 3.0455281918340895, + "grad_norm": 0.06691335886716843, + "learning_rate": 5.302668886988613e-05, + "loss": 0.2302, + "step": 37594 + }, + { + "epoch": 3.0456092028515878, + "grad_norm": 0.07276862114667892, + "learning_rate": 5.302218821729151e-05, + "loss": 0.2399, + "step": 37595 + }, + { + "epoch": 3.045690213869086, + "grad_norm": 0.06181017681956291, + "learning_rate": 5.3017687564696886e-05, + "loss": 0.2431, + "step": 37596 + }, + { + "epoch": 3.0457712248865847, + "grad_norm": 0.0632273405790329, + "learning_rate": 5.301318691210225e-05, + "loss": 0.1874, + "step": 37597 + }, + { + "epoch": 3.045852235904083, + "grad_norm": 0.09335504472255707, + "learning_rate": 5.300868625950763e-05, + "loss": 0.2396, + "step": 37598 + }, + { 
+ "epoch": 3.045933246921581, + "grad_norm": 0.06783024221658707, + "learning_rate": 5.300418560691301e-05, + "loss": 0.232, + "step": 37599 + }, + { + "epoch": 3.04601425793908, + "grad_norm": 0.060017917305231094, + "learning_rate": 5.2999684954318374e-05, + "loss": 0.1993, + "step": 37600 + }, + { + "epoch": 3.046095268956578, + "grad_norm": 0.08943045884370804, + "learning_rate": 5.2995184301723754e-05, + "loss": 0.2238, + "step": 37601 + }, + { + "epoch": 3.0461762799740764, + "grad_norm": 0.07300959527492523, + "learning_rate": 5.299068364912913e-05, + "loss": 0.2396, + "step": 37602 + }, + { + "epoch": 3.046257290991575, + "grad_norm": 0.0776297077536583, + "learning_rate": 5.2986182996534495e-05, + "loss": 0.2241, + "step": 37603 + }, + { + "epoch": 3.0463383020090733, + "grad_norm": 0.057680848985910416, + "learning_rate": 5.2981682343939875e-05, + "loss": 0.2218, + "step": 37604 + }, + { + "epoch": 3.0464193130265715, + "grad_norm": 0.06870376318693161, + "learning_rate": 5.297718169134525e-05, + "loss": 0.2368, + "step": 37605 + }, + { + "epoch": 3.04650032404407, + "grad_norm": 0.07908564060926437, + "learning_rate": 5.297268103875063e-05, + "loss": 0.2257, + "step": 37606 + }, + { + "epoch": 3.0465813350615685, + "grad_norm": 0.0836765468120575, + "learning_rate": 5.2968180386155996e-05, + "loss": 0.1985, + "step": 37607 + }, + { + "epoch": 3.0466623460790667, + "grad_norm": 0.07887356728315353, + "learning_rate": 5.296367973356137e-05, + "loss": 0.2279, + "step": 37608 + }, + { + "epoch": 3.046743357096565, + "grad_norm": 0.07012686133384705, + "learning_rate": 5.295917908096675e-05, + "loss": 0.2389, + "step": 37609 + }, + { + "epoch": 3.0468243681140637, + "grad_norm": 0.07311613857746124, + "learning_rate": 5.295467842837212e-05, + "loss": 0.2198, + "step": 37610 + }, + { + "epoch": 3.046905379131562, + "grad_norm": 0.08131827414035797, + "learning_rate": 5.295017777577749e-05, + "loss": 0.2551, + "step": 37611 + }, + { + "epoch": 3.04698639014906, + "grad_norm": 0.056082744151353836, + "learning_rate": 5.294567712318287e-05, + "loss": 0.1839, + "step": 37612 + }, + { + "epoch": 3.047067401166559, + "grad_norm": 0.06888628751039505, + "learning_rate": 5.294117647058824e-05, + "loss": 0.1866, + "step": 37613 + }, + { + "epoch": 3.047148412184057, + "grad_norm": 0.07252807915210724, + "learning_rate": 5.293667581799361e-05, + "loss": 0.1902, + "step": 37614 + }, + { + "epoch": 3.0472294232015553, + "grad_norm": 0.07436560094356537, + "learning_rate": 5.293217516539899e-05, + "loss": 0.2341, + "step": 37615 + }, + { + "epoch": 3.047310434219054, + "grad_norm": 0.05969003215432167, + "learning_rate": 5.292767451280436e-05, + "loss": 0.2154, + "step": 37616 + }, + { + "epoch": 3.0473914452365523, + "grad_norm": 0.05967477336525917, + "learning_rate": 5.292317386020973e-05, + "loss": 0.2088, + "step": 37617 + }, + { + "epoch": 3.0474724562540505, + "grad_norm": 0.06089045852422714, + "learning_rate": 5.291867320761511e-05, + "loss": 0.2153, + "step": 37618 + }, + { + "epoch": 3.0475534672715487, + "grad_norm": 0.0729355439543724, + "learning_rate": 5.291417255502048e-05, + "loss": 0.1967, + "step": 37619 + }, + { + "epoch": 3.0476344782890474, + "grad_norm": 0.06276184320449829, + "learning_rate": 5.290967190242585e-05, + "loss": 0.2377, + "step": 37620 + }, + { + "epoch": 3.0477154893065457, + "grad_norm": 0.06968085467815399, + "learning_rate": 5.290517124983123e-05, + "loss": 0.2665, + "step": 37621 + }, + { + "epoch": 3.047796500324044, + "grad_norm": 0.06097818538546562, + 
"learning_rate": 5.29006705972366e-05, + "loss": 0.2161, + "step": 37622 + }, + { + "epoch": 3.0478775113415426, + "grad_norm": 0.07302144169807434, + "learning_rate": 5.2896169944641973e-05, + "loss": 0.2855, + "step": 37623 + }, + { + "epoch": 3.047958522359041, + "grad_norm": 0.07345126569271088, + "learning_rate": 5.2891669292047354e-05, + "loss": 0.2534, + "step": 37624 + }, + { + "epoch": 3.048039533376539, + "grad_norm": 0.052877724170684814, + "learning_rate": 5.288716863945272e-05, + "loss": 0.2006, + "step": 37625 + }, + { + "epoch": 3.048120544394038, + "grad_norm": 0.0753074586391449, + "learning_rate": 5.2882667986858094e-05, + "loss": 0.2258, + "step": 37626 + }, + { + "epoch": 3.048201555411536, + "grad_norm": 0.07556282728910446, + "learning_rate": 5.2878167334263475e-05, + "loss": 0.2416, + "step": 37627 + }, + { + "epoch": 3.0482825664290343, + "grad_norm": 0.06304620951414108, + "learning_rate": 5.287366668166884e-05, + "loss": 0.2146, + "step": 37628 + }, + { + "epoch": 3.0483635774465325, + "grad_norm": 0.08251237124204636, + "learning_rate": 5.2869166029074215e-05, + "loss": 0.2211, + "step": 37629 + }, + { + "epoch": 3.048444588464031, + "grad_norm": 0.08227784186601639, + "learning_rate": 5.2864665376479595e-05, + "loss": 0.2459, + "step": 37630 + }, + { + "epoch": 3.0485255994815295, + "grad_norm": 0.07222383469343185, + "learning_rate": 5.286016472388496e-05, + "loss": 0.2076, + "step": 37631 + }, + { + "epoch": 3.0486066104990277, + "grad_norm": 0.08086186647415161, + "learning_rate": 5.285566407129034e-05, + "loss": 0.3006, + "step": 37632 + }, + { + "epoch": 3.0486876215165264, + "grad_norm": 0.07935461401939392, + "learning_rate": 5.2851163418695716e-05, + "loss": 0.2355, + "step": 37633 + }, + { + "epoch": 3.0487686325340246, + "grad_norm": 0.07919391989707947, + "learning_rate": 5.284666276610108e-05, + "loss": 0.197, + "step": 37634 + }, + { + "epoch": 3.048849643551523, + "grad_norm": 0.08551984280347824, + "learning_rate": 5.2842162113506463e-05, + "loss": 0.2705, + "step": 37635 + }, + { + "epoch": 3.0489306545690216, + "grad_norm": 0.06743234395980835, + "learning_rate": 5.283766146091184e-05, + "loss": 0.2522, + "step": 37636 + }, + { + "epoch": 3.04901166558652, + "grad_norm": 0.06722062081098557, + "learning_rate": 5.2833160808317204e-05, + "loss": 0.2982, + "step": 37637 + }, + { + "epoch": 3.049092676604018, + "grad_norm": 0.0709279254078865, + "learning_rate": 5.2828660155722584e-05, + "loss": 0.2492, + "step": 37638 + }, + { + "epoch": 3.0491736876215167, + "grad_norm": 0.06910178810358047, + "learning_rate": 5.282415950312796e-05, + "loss": 0.2369, + "step": 37639 + }, + { + "epoch": 3.049254698639015, + "grad_norm": 0.06689421832561493, + "learning_rate": 5.2819658850533325e-05, + "loss": 0.2122, + "step": 37640 + }, + { + "epoch": 3.0493357096565132, + "grad_norm": 0.07600487023591995, + "learning_rate": 5.2815158197938705e-05, + "loss": 0.2316, + "step": 37641 + }, + { + "epoch": 3.0494167206740115, + "grad_norm": 0.07255223393440247, + "learning_rate": 5.281065754534408e-05, + "loss": 0.2154, + "step": 37642 + }, + { + "epoch": 3.04949773169151, + "grad_norm": 0.06639236211776733, + "learning_rate": 5.2806156892749446e-05, + "loss": 0.1957, + "step": 37643 + }, + { + "epoch": 3.0495787427090084, + "grad_norm": 0.07258108258247375, + "learning_rate": 5.2801656240154826e-05, + "loss": 0.2397, + "step": 37644 + }, + { + "epoch": 3.0496597537265067, + "grad_norm": 0.0692557692527771, + "learning_rate": 5.27971555875602e-05, + "loss": 0.2283, + 
"step": 37645 + }, + { + "epoch": 3.0497407647440054, + "grad_norm": 0.0779244527220726, + "learning_rate": 5.2792654934965566e-05, + "loss": 0.2222, + "step": 37646 + }, + { + "epoch": 3.0498217757615036, + "grad_norm": 0.07220424711704254, + "learning_rate": 5.278815428237095e-05, + "loss": 0.2353, + "step": 37647 + }, + { + "epoch": 3.049902786779002, + "grad_norm": 0.06793083995580673, + "learning_rate": 5.278365362977632e-05, + "loss": 0.2437, + "step": 37648 + }, + { + "epoch": 3.0499837977965005, + "grad_norm": 0.08438190072774887, + "learning_rate": 5.277915297718169e-05, + "loss": 0.2031, + "step": 37649 + }, + { + "epoch": 3.0500648088139988, + "grad_norm": 0.07985883951187134, + "learning_rate": 5.277465232458707e-05, + "loss": 0.2314, + "step": 37650 + }, + { + "epoch": 3.050145819831497, + "grad_norm": 0.06336715817451477, + "learning_rate": 5.277015167199244e-05, + "loss": 0.2329, + "step": 37651 + }, + { + "epoch": 3.0502268308489953, + "grad_norm": 0.08432844281196594, + "learning_rate": 5.276565101939781e-05, + "loss": 0.239, + "step": 37652 + }, + { + "epoch": 3.050307841866494, + "grad_norm": 0.07463747262954712, + "learning_rate": 5.276115036680319e-05, + "loss": 0.2422, + "step": 37653 + }, + { + "epoch": 3.050388852883992, + "grad_norm": 0.06991366297006607, + "learning_rate": 5.275664971420856e-05, + "loss": 0.1971, + "step": 37654 + }, + { + "epoch": 3.0504698639014904, + "grad_norm": 0.07708337903022766, + "learning_rate": 5.275214906161393e-05, + "loss": 0.2578, + "step": 37655 + }, + { + "epoch": 3.050550874918989, + "grad_norm": 0.0805683434009552, + "learning_rate": 5.274764840901931e-05, + "loss": 0.2059, + "step": 37656 + }, + { + "epoch": 3.0506318859364874, + "grad_norm": 0.07840461283922195, + "learning_rate": 5.274314775642468e-05, + "loss": 0.2351, + "step": 37657 + }, + { + "epoch": 3.0507128969539856, + "grad_norm": 0.08233583718538284, + "learning_rate": 5.273864710383006e-05, + "loss": 0.2095, + "step": 37658 + }, + { + "epoch": 3.0507939079714843, + "grad_norm": 0.06016426905989647, + "learning_rate": 5.273414645123543e-05, + "loss": 0.2084, + "step": 37659 + }, + { + "epoch": 3.0508749189889826, + "grad_norm": 0.07174959033727646, + "learning_rate": 5.2729645798640804e-05, + "loss": 0.2509, + "step": 37660 + }, + { + "epoch": 3.050955930006481, + "grad_norm": 0.07257946580648422, + "learning_rate": 5.2725145146046184e-05, + "loss": 0.2225, + "step": 37661 + }, + { + "epoch": 3.051036941023979, + "grad_norm": 0.06079892814159393, + "learning_rate": 5.272064449345156e-05, + "loss": 0.212, + "step": 37662 + }, + { + "epoch": 3.0511179520414777, + "grad_norm": 0.07712015509605408, + "learning_rate": 5.2716143840856924e-05, + "loss": 0.2023, + "step": 37663 + }, + { + "epoch": 3.051198963058976, + "grad_norm": 0.06289387494325638, + "learning_rate": 5.2711643188262305e-05, + "loss": 0.1957, + "step": 37664 + }, + { + "epoch": 3.051279974076474, + "grad_norm": 0.07708357274532318, + "learning_rate": 5.270714253566768e-05, + "loss": 0.2109, + "step": 37665 + }, + { + "epoch": 3.051360985093973, + "grad_norm": 0.07875526696443558, + "learning_rate": 5.2702641883073045e-05, + "loss": 0.2204, + "step": 37666 + }, + { + "epoch": 3.051441996111471, + "grad_norm": 0.06272130459547043, + "learning_rate": 5.2698141230478426e-05, + "loss": 0.2099, + "step": 37667 + }, + { + "epoch": 3.0515230071289694, + "grad_norm": 0.07556160539388657, + "learning_rate": 5.26936405778838e-05, + "loss": 0.1985, + "step": 37668 + }, + { + "epoch": 3.051604018146468, + "grad_norm": 
0.0975421816110611, + "learning_rate": 5.2689139925289166e-05, + "loss": 0.2284, + "step": 37669 + }, + { + "epoch": 3.0516850291639663, + "grad_norm": 0.0759001150727272, + "learning_rate": 5.2684639272694546e-05, + "loss": 0.2443, + "step": 37670 + }, + { + "epoch": 3.0517660401814646, + "grad_norm": 0.06809696555137634, + "learning_rate": 5.268013862009992e-05, + "loss": 0.2292, + "step": 37671 + }, + { + "epoch": 3.0518470511989633, + "grad_norm": 0.08246435970067978, + "learning_rate": 5.267563796750529e-05, + "loss": 0.2386, + "step": 37672 + }, + { + "epoch": 3.0519280622164615, + "grad_norm": 0.07349836081266403, + "learning_rate": 5.267113731491067e-05, + "loss": 0.2198, + "step": 37673 + }, + { + "epoch": 3.0520090732339598, + "grad_norm": 0.0644262284040451, + "learning_rate": 5.266663666231604e-05, + "loss": 0.2131, + "step": 37674 + }, + { + "epoch": 3.052090084251458, + "grad_norm": 0.08209548890590668, + "learning_rate": 5.266213600972141e-05, + "loss": 0.2471, + "step": 37675 + }, + { + "epoch": 3.0521710952689567, + "grad_norm": 0.07829367369413376, + "learning_rate": 5.265763535712679e-05, + "loss": 0.29, + "step": 37676 + }, + { + "epoch": 3.052252106286455, + "grad_norm": 0.07859180122613907, + "learning_rate": 5.265313470453216e-05, + "loss": 0.2662, + "step": 37677 + }, + { + "epoch": 3.052333117303953, + "grad_norm": 0.08331391215324402, + "learning_rate": 5.264863405193753e-05, + "loss": 0.2649, + "step": 37678 + }, + { + "epoch": 3.052414128321452, + "grad_norm": 0.06476303935050964, + "learning_rate": 5.264413339934291e-05, + "loss": 0.2167, + "step": 37679 + }, + { + "epoch": 3.05249513933895, + "grad_norm": 0.08074692636728287, + "learning_rate": 5.263963274674828e-05, + "loss": 0.2791, + "step": 37680 + }, + { + "epoch": 3.0525761503564484, + "grad_norm": 0.06423210352659225, + "learning_rate": 5.263513209415365e-05, + "loss": 0.2381, + "step": 37681 + }, + { + "epoch": 3.052657161373947, + "grad_norm": 0.07402287423610687, + "learning_rate": 5.263063144155903e-05, + "loss": 0.2468, + "step": 37682 + }, + { + "epoch": 3.0527381723914453, + "grad_norm": 0.06496645510196686, + "learning_rate": 5.26261307889644e-05, + "loss": 0.2047, + "step": 37683 + }, + { + "epoch": 3.0528191834089435, + "grad_norm": 0.06942404806613922, + "learning_rate": 5.2621630136369784e-05, + "loss": 0.2579, + "step": 37684 + }, + { + "epoch": 3.052900194426442, + "grad_norm": 0.07186678797006607, + "learning_rate": 5.261712948377515e-05, + "loss": 0.2149, + "step": 37685 + }, + { + "epoch": 3.0529812054439405, + "grad_norm": 0.06583134829998016, + "learning_rate": 5.2612628831180524e-05, + "loss": 0.2711, + "step": 37686 + }, + { + "epoch": 3.0530622164614387, + "grad_norm": 0.0710592195391655, + "learning_rate": 5.2608128178585904e-05, + "loss": 0.2618, + "step": 37687 + }, + { + "epoch": 3.053143227478937, + "grad_norm": 0.0674964115023613, + "learning_rate": 5.260362752599127e-05, + "loss": 0.2102, + "step": 37688 + }, + { + "epoch": 3.0532242384964356, + "grad_norm": 0.06882596760988235, + "learning_rate": 5.2599126873396645e-05, + "loss": 0.2408, + "step": 37689 + }, + { + "epoch": 3.053305249513934, + "grad_norm": 0.06895139813423157, + "learning_rate": 5.2594626220802025e-05, + "loss": 0.2427, + "step": 37690 + }, + { + "epoch": 3.053386260531432, + "grad_norm": 0.07730741798877716, + "learning_rate": 5.259012556820739e-05, + "loss": 0.2335, + "step": 37691 + }, + { + "epoch": 3.053467271548931, + "grad_norm": 0.07082430273294449, + "learning_rate": 5.2585624915612766e-05, + 
"loss": 0.2354, + "step": 37692 + }, + { + "epoch": 3.053548282566429, + "grad_norm": 0.08576175570487976, + "learning_rate": 5.2581124263018146e-05, + "loss": 0.2319, + "step": 37693 + }, + { + "epoch": 3.0536292935839273, + "grad_norm": 0.0777260810136795, + "learning_rate": 5.257662361042351e-05, + "loss": 0.2478, + "step": 37694 + }, + { + "epoch": 3.053710304601426, + "grad_norm": 0.08391319215297699, + "learning_rate": 5.2572122957828886e-05, + "loss": 0.2181, + "step": 37695 + }, + { + "epoch": 3.0537913156189243, + "grad_norm": 0.08710380643606186, + "learning_rate": 5.256762230523427e-05, + "loss": 0.2079, + "step": 37696 + }, + { + "epoch": 3.0538723266364225, + "grad_norm": 0.08606188744306564, + "learning_rate": 5.2563121652639634e-05, + "loss": 0.2813, + "step": 37697 + }, + { + "epoch": 3.0539533376539207, + "grad_norm": 0.06323255598545074, + "learning_rate": 5.255862100004501e-05, + "loss": 0.2385, + "step": 37698 + }, + { + "epoch": 3.0540343486714194, + "grad_norm": 0.0727868601679802, + "learning_rate": 5.255412034745039e-05, + "loss": 0.2218, + "step": 37699 + }, + { + "epoch": 3.0541153596889177, + "grad_norm": 0.1189265325665474, + "learning_rate": 5.2549619694855754e-05, + "loss": 0.2413, + "step": 37700 + }, + { + "epoch": 3.054196370706416, + "grad_norm": 0.06921849399805069, + "learning_rate": 5.254511904226113e-05, + "loss": 0.2167, + "step": 37701 + }, + { + "epoch": 3.0542773817239146, + "grad_norm": 0.07509531080722809, + "learning_rate": 5.254061838966651e-05, + "loss": 0.2447, + "step": 37702 + }, + { + "epoch": 3.054358392741413, + "grad_norm": 0.08192278444766998, + "learning_rate": 5.2536117737071875e-05, + "loss": 0.244, + "step": 37703 + }, + { + "epoch": 3.054439403758911, + "grad_norm": 0.07004018872976303, + "learning_rate": 5.253161708447725e-05, + "loss": 0.2455, + "step": 37704 + }, + { + "epoch": 3.05452041477641, + "grad_norm": 0.07350824028253555, + "learning_rate": 5.252711643188263e-05, + "loss": 0.206, + "step": 37705 + }, + { + "epoch": 3.054601425793908, + "grad_norm": 0.07443010807037354, + "learning_rate": 5.2522615779287996e-05, + "loss": 0.2564, + "step": 37706 + }, + { + "epoch": 3.0546824368114063, + "grad_norm": 0.0812426209449768, + "learning_rate": 5.251811512669337e-05, + "loss": 0.2325, + "step": 37707 + }, + { + "epoch": 3.0547634478289045, + "grad_norm": 0.06436657160520554, + "learning_rate": 5.251361447409875e-05, + "loss": 0.2336, + "step": 37708 + }, + { + "epoch": 3.054844458846403, + "grad_norm": 0.08351697772741318, + "learning_rate": 5.250911382150412e-05, + "loss": 0.2314, + "step": 37709 + }, + { + "epoch": 3.0549254698639015, + "grad_norm": 0.06691282987594604, + "learning_rate": 5.250461316890949e-05, + "loss": 0.2598, + "step": 37710 + }, + { + "epoch": 3.0550064808813997, + "grad_norm": 0.07169311493635178, + "learning_rate": 5.250011251631487e-05, + "loss": 0.2592, + "step": 37711 + }, + { + "epoch": 3.0550874918988984, + "grad_norm": 0.06714262813329697, + "learning_rate": 5.249561186372024e-05, + "loss": 0.2298, + "step": 37712 + }, + { + "epoch": 3.0551685029163966, + "grad_norm": 0.07606201618909836, + "learning_rate": 5.249111121112562e-05, + "loss": 0.2161, + "step": 37713 + }, + { + "epoch": 3.055249513933895, + "grad_norm": 0.07866179943084717, + "learning_rate": 5.248661055853099e-05, + "loss": 0.2689, + "step": 37714 + }, + { + "epoch": 3.0553305249513936, + "grad_norm": 0.07774538546800613, + "learning_rate": 5.248210990593636e-05, + "loss": 0.2358, + "step": 37715 + }, + { + "epoch": 
3.055411535968892, + "grad_norm": 0.07213099300861359, + "learning_rate": 5.247760925334174e-05, + "loss": 0.2341, + "step": 37716 + }, + { + "epoch": 3.05549254698639, + "grad_norm": 0.08474995940923691, + "learning_rate": 5.247310860074711e-05, + "loss": 0.2034, + "step": 37717 + }, + { + "epoch": 3.0555735580038887, + "grad_norm": 0.07596404105424881, + "learning_rate": 5.246860794815248e-05, + "loss": 0.1956, + "step": 37718 + }, + { + "epoch": 3.055654569021387, + "grad_norm": 0.06409193575382233, + "learning_rate": 5.246410729555786e-05, + "loss": 0.2006, + "step": 37719 + }, + { + "epoch": 3.0557355800388852, + "grad_norm": 0.06833213567733765, + "learning_rate": 5.245960664296323e-05, + "loss": 0.2181, + "step": 37720 + }, + { + "epoch": 3.0558165910563835, + "grad_norm": 0.06813128292560577, + "learning_rate": 5.24551059903686e-05, + "loss": 0.2598, + "step": 37721 + }, + { + "epoch": 3.055897602073882, + "grad_norm": 0.06665215641260147, + "learning_rate": 5.245060533777398e-05, + "loss": 0.221, + "step": 37722 + }, + { + "epoch": 3.0559786130913804, + "grad_norm": 0.08951111137866974, + "learning_rate": 5.2446104685179354e-05, + "loss": 0.2169, + "step": 37723 + }, + { + "epoch": 3.0560596241088787, + "grad_norm": 0.0698397308588028, + "learning_rate": 5.244160403258472e-05, + "loss": 0.2467, + "step": 37724 + }, + { + "epoch": 3.0561406351263773, + "grad_norm": 0.07880235463380814, + "learning_rate": 5.24371033799901e-05, + "loss": 0.2124, + "step": 37725 + }, + { + "epoch": 3.0562216461438756, + "grad_norm": 0.0704299658536911, + "learning_rate": 5.2432602727395475e-05, + "loss": 0.2013, + "step": 37726 + }, + { + "epoch": 3.056302657161374, + "grad_norm": 0.06827425956726074, + "learning_rate": 5.242810207480084e-05, + "loss": 0.2079, + "step": 37727 + }, + { + "epoch": 3.0563836681788725, + "grad_norm": 0.05798809602856636, + "learning_rate": 5.242360142220622e-05, + "loss": 0.1942, + "step": 37728 + }, + { + "epoch": 3.0564646791963708, + "grad_norm": 0.07702101767063141, + "learning_rate": 5.2419100769611596e-05, + "loss": 0.24, + "step": 37729 + }, + { + "epoch": 3.056545690213869, + "grad_norm": 0.06647029519081116, + "learning_rate": 5.241460011701696e-05, + "loss": 0.2744, + "step": 37730 + }, + { + "epoch": 3.0566267012313673, + "grad_norm": 0.06671483814716339, + "learning_rate": 5.241009946442235e-05, + "loss": 0.1994, + "step": 37731 + }, + { + "epoch": 3.056707712248866, + "grad_norm": 0.08495612442493439, + "learning_rate": 5.2405598811827717e-05, + "loss": 0.2208, + "step": 37732 + }, + { + "epoch": 3.056788723266364, + "grad_norm": 0.06984516978263855, + "learning_rate": 5.240109815923308e-05, + "loss": 0.2391, + "step": 37733 + }, + { + "epoch": 3.0568697342838624, + "grad_norm": 0.07440478354692459, + "learning_rate": 5.239659750663847e-05, + "loss": 0.2529, + "step": 37734 + }, + { + "epoch": 3.056950745301361, + "grad_norm": 0.06642437726259232, + "learning_rate": 5.239209685404384e-05, + "loss": 0.2556, + "step": 37735 + }, + { + "epoch": 3.0570317563188594, + "grad_norm": 0.07040982693433762, + "learning_rate": 5.2387596201449204e-05, + "loss": 0.2275, + "step": 37736 + }, + { + "epoch": 3.0571127673363576, + "grad_norm": 0.08027375489473343, + "learning_rate": 5.238309554885459e-05, + "loss": 0.2447, + "step": 37737 + }, + { + "epoch": 3.0571937783538563, + "grad_norm": 0.06435451656579971, + "learning_rate": 5.237859489625996e-05, + "loss": 0.2394, + "step": 37738 + }, + { + "epoch": 3.0572747893713546, + "grad_norm": 0.06844855844974518, + 
"learning_rate": 5.237409424366534e-05, + "loss": 0.2332, + "step": 37739 + }, + { + "epoch": 3.057355800388853, + "grad_norm": 0.07512935996055603, + "learning_rate": 5.236959359107071e-05, + "loss": 0.2268, + "step": 37740 + }, + { + "epoch": 3.057436811406351, + "grad_norm": 0.06621728837490082, + "learning_rate": 5.236509293847608e-05, + "loss": 0.2143, + "step": 37741 + }, + { + "epoch": 3.0575178224238497, + "grad_norm": 0.06908552348613739, + "learning_rate": 5.236059228588146e-05, + "loss": 0.2153, + "step": 37742 + }, + { + "epoch": 3.057598833441348, + "grad_norm": 0.06851686537265778, + "learning_rate": 5.235609163328683e-05, + "loss": 0.2243, + "step": 37743 + }, + { + "epoch": 3.057679844458846, + "grad_norm": 0.0707482248544693, + "learning_rate": 5.23515909806922e-05, + "loss": 0.225, + "step": 37744 + }, + { + "epoch": 3.057760855476345, + "grad_norm": 0.06813740730285645, + "learning_rate": 5.234709032809758e-05, + "loss": 0.2526, + "step": 37745 + }, + { + "epoch": 3.057841866493843, + "grad_norm": 0.07044027745723724, + "learning_rate": 5.2342589675502954e-05, + "loss": 0.245, + "step": 37746 + }, + { + "epoch": 3.0579228775113414, + "grad_norm": 0.07610490918159485, + "learning_rate": 5.233808902290832e-05, + "loss": 0.2738, + "step": 37747 + }, + { + "epoch": 3.05800388852884, + "grad_norm": 0.07336747646331787, + "learning_rate": 5.23335883703137e-05, + "loss": 0.2617, + "step": 37748 + }, + { + "epoch": 3.0580848995463383, + "grad_norm": 0.06932365894317627, + "learning_rate": 5.2329087717719075e-05, + "loss": 0.213, + "step": 37749 + }, + { + "epoch": 3.0581659105638366, + "grad_norm": 0.06575492769479752, + "learning_rate": 5.232458706512444e-05, + "loss": 0.2413, + "step": 37750 + }, + { + "epoch": 3.0582469215813353, + "grad_norm": 0.07268187403678894, + "learning_rate": 5.232008641252982e-05, + "loss": 0.2193, + "step": 37751 + }, + { + "epoch": 3.0583279325988335, + "grad_norm": 0.08104255795478821, + "learning_rate": 5.2315585759935195e-05, + "loss": 0.2199, + "step": 37752 + }, + { + "epoch": 3.0584089436163318, + "grad_norm": 0.06563001126050949, + "learning_rate": 5.231108510734056e-05, + "loss": 0.2283, + "step": 37753 + }, + { + "epoch": 3.05848995463383, + "grad_norm": 0.076548732817173, + "learning_rate": 5.230658445474594e-05, + "loss": 0.2182, + "step": 37754 + }, + { + "epoch": 3.0585709656513287, + "grad_norm": 0.0830010399222374, + "learning_rate": 5.2302083802151316e-05, + "loss": 0.2069, + "step": 37755 + }, + { + "epoch": 3.058651976668827, + "grad_norm": 0.07072506099939346, + "learning_rate": 5.229758314955668e-05, + "loss": 0.2466, + "step": 37756 + }, + { + "epoch": 3.058732987686325, + "grad_norm": 0.06200630962848663, + "learning_rate": 5.2293082496962063e-05, + "loss": 0.1948, + "step": 37757 + }, + { + "epoch": 3.058813998703824, + "grad_norm": 0.06687376648187637, + "learning_rate": 5.228858184436744e-05, + "loss": 0.2422, + "step": 37758 + }, + { + "epoch": 3.058895009721322, + "grad_norm": 0.05735126882791519, + "learning_rate": 5.2284081191772804e-05, + "loss": 0.1972, + "step": 37759 + }, + { + "epoch": 3.0589760207388204, + "grad_norm": 0.08486035466194153, + "learning_rate": 5.2279580539178184e-05, + "loss": 0.2567, + "step": 37760 + }, + { + "epoch": 3.059057031756319, + "grad_norm": 0.07064859569072723, + "learning_rate": 5.227507988658356e-05, + "loss": 0.2122, + "step": 37761 + }, + { + "epoch": 3.0591380427738173, + "grad_norm": 0.0696304515004158, + "learning_rate": 5.2270579233988925e-05, + "loss": 0.2335, + "step": 37762 + 
}, + { + "epoch": 3.0592190537913155, + "grad_norm": 0.06331910192966461, + "learning_rate": 5.2266078581394305e-05, + "loss": 0.2119, + "step": 37763 + }, + { + "epoch": 3.059300064808814, + "grad_norm": 0.06532147526741028, + "learning_rate": 5.226157792879968e-05, + "loss": 0.1845, + "step": 37764 + }, + { + "epoch": 3.0593810758263125, + "grad_norm": 0.0645311176776886, + "learning_rate": 5.225707727620506e-05, + "loss": 0.2216, + "step": 37765 + }, + { + "epoch": 3.0594620868438107, + "grad_norm": 0.08908110111951828, + "learning_rate": 5.2252576623610426e-05, + "loss": 0.231, + "step": 37766 + }, + { + "epoch": 3.059543097861309, + "grad_norm": 0.07068745046854019, + "learning_rate": 5.22480759710158e-05, + "loss": 0.1985, + "step": 37767 + }, + { + "epoch": 3.0596241088788076, + "grad_norm": 0.07705529779195786, + "learning_rate": 5.224357531842118e-05, + "loss": 0.2223, + "step": 37768 + }, + { + "epoch": 3.059705119896306, + "grad_norm": 0.11516771465539932, + "learning_rate": 5.223907466582655e-05, + "loss": 0.2384, + "step": 37769 + }, + { + "epoch": 3.059786130913804, + "grad_norm": 0.08792196959257126, + "learning_rate": 5.223457401323192e-05, + "loss": 0.2319, + "step": 37770 + }, + { + "epoch": 3.059867141931303, + "grad_norm": 0.07994337379932404, + "learning_rate": 5.22300733606373e-05, + "loss": 0.2648, + "step": 37771 + }, + { + "epoch": 3.059948152948801, + "grad_norm": 0.07762891799211502, + "learning_rate": 5.222557270804267e-05, + "loss": 0.2191, + "step": 37772 + }, + { + "epoch": 3.0600291639662993, + "grad_norm": 0.07052240520715714, + "learning_rate": 5.222107205544804e-05, + "loss": 0.2416, + "step": 37773 + }, + { + "epoch": 3.060110174983798, + "grad_norm": 0.06587530672550201, + "learning_rate": 5.221657140285342e-05, + "loss": 0.2163, + "step": 37774 + }, + { + "epoch": 3.0601911860012962, + "grad_norm": 0.06059986352920532, + "learning_rate": 5.221207075025879e-05, + "loss": 0.1863, + "step": 37775 + }, + { + "epoch": 3.0602721970187945, + "grad_norm": 0.07946839928627014, + "learning_rate": 5.220757009766416e-05, + "loss": 0.223, + "step": 37776 + }, + { + "epoch": 3.0603532080362927, + "grad_norm": 0.06629893183708191, + "learning_rate": 5.220306944506954e-05, + "loss": 0.2106, + "step": 37777 + }, + { + "epoch": 3.0604342190537914, + "grad_norm": 0.07502514123916626, + "learning_rate": 5.219856879247491e-05, + "loss": 0.2046, + "step": 37778 + }, + { + "epoch": 3.0605152300712897, + "grad_norm": 0.06490511447191238, + "learning_rate": 5.219406813988028e-05, + "loss": 0.2268, + "step": 37779 + }, + { + "epoch": 3.060596241088788, + "grad_norm": 0.06450946629047394, + "learning_rate": 5.218956748728566e-05, + "loss": 0.2163, + "step": 37780 + }, + { + "epoch": 3.0606772521062866, + "grad_norm": 0.0701812207698822, + "learning_rate": 5.218506683469103e-05, + "loss": 0.2466, + "step": 37781 + }, + { + "epoch": 3.060758263123785, + "grad_norm": 0.07255059480667114, + "learning_rate": 5.2180566182096403e-05, + "loss": 0.2308, + "step": 37782 + }, + { + "epoch": 3.060839274141283, + "grad_norm": 0.05262809619307518, + "learning_rate": 5.2176065529501784e-05, + "loss": 0.198, + "step": 37783 + }, + { + "epoch": 3.060920285158782, + "grad_norm": 0.06464151293039322, + "learning_rate": 5.217156487690715e-05, + "loss": 0.2412, + "step": 37784 + }, + { + "epoch": 3.06100129617628, + "grad_norm": 0.06939256191253662, + "learning_rate": 5.2167064224312524e-05, + "loss": 0.2188, + "step": 37785 + }, + { + "epoch": 3.0610823071937783, + "grad_norm": 0.06600762903690338, 
+ "learning_rate": 5.2162563571717905e-05, + "loss": 0.2393, + "step": 37786 + }, + { + "epoch": 3.0611633182112765, + "grad_norm": 0.08542144298553467, + "learning_rate": 5.215806291912327e-05, + "loss": 0.2108, + "step": 37787 + }, + { + "epoch": 3.061244329228775, + "grad_norm": 0.0636884793639183, + "learning_rate": 5.2153562266528645e-05, + "loss": 0.2294, + "step": 37788 + }, + { + "epoch": 3.0613253402462735, + "grad_norm": 0.07961232215166092, + "learning_rate": 5.2149061613934025e-05, + "loss": 0.247, + "step": 37789 + }, + { + "epoch": 3.0614063512637717, + "grad_norm": 0.06676743924617767, + "learning_rate": 5.214456096133939e-05, + "loss": 0.2302, + "step": 37790 + }, + { + "epoch": 3.0614873622812704, + "grad_norm": 0.06724625080823898, + "learning_rate": 5.214006030874477e-05, + "loss": 0.2113, + "step": 37791 + }, + { + "epoch": 3.0615683732987686, + "grad_norm": 0.062377527356147766, + "learning_rate": 5.2135559656150146e-05, + "loss": 0.2459, + "step": 37792 + }, + { + "epoch": 3.061649384316267, + "grad_norm": 0.08417245745658875, + "learning_rate": 5.213105900355551e-05, + "loss": 0.2, + "step": 37793 + }, + { + "epoch": 3.0617303953337656, + "grad_norm": 0.0867442935705185, + "learning_rate": 5.2126558350960894e-05, + "loss": 0.2331, + "step": 37794 + }, + { + "epoch": 3.061811406351264, + "grad_norm": 0.07381570339202881, + "learning_rate": 5.212205769836627e-05, + "loss": 0.2123, + "step": 37795 + }, + { + "epoch": 3.061892417368762, + "grad_norm": 0.08096037805080414, + "learning_rate": 5.2117557045771634e-05, + "loss": 0.2016, + "step": 37796 + }, + { + "epoch": 3.0619734283862607, + "grad_norm": 0.07170046865940094, + "learning_rate": 5.2113056393177014e-05, + "loss": 0.218, + "step": 37797 + }, + { + "epoch": 3.062054439403759, + "grad_norm": 0.06569628417491913, + "learning_rate": 5.210855574058239e-05, + "loss": 0.1975, + "step": 37798 + }, + { + "epoch": 3.0621354504212572, + "grad_norm": 0.05782657116651535, + "learning_rate": 5.2104055087987755e-05, + "loss": 0.1901, + "step": 37799 + }, + { + "epoch": 3.0622164614387555, + "grad_norm": 0.07560217380523682, + "learning_rate": 5.209955443539314e-05, + "loss": 0.2263, + "step": 37800 + }, + { + "epoch": 3.062297472456254, + "grad_norm": 0.07204508781433105, + "learning_rate": 5.209505378279851e-05, + "loss": 0.2422, + "step": 37801 + }, + { + "epoch": 3.0623784834737524, + "grad_norm": 0.0661541149020195, + "learning_rate": 5.2090553130203876e-05, + "loss": 0.2322, + "step": 37802 + }, + { + "epoch": 3.0624594944912507, + "grad_norm": 0.07273122668266296, + "learning_rate": 5.208605247760926e-05, + "loss": 0.2193, + "step": 37803 + }, + { + "epoch": 3.0625405055087493, + "grad_norm": 0.0696941390633583, + "learning_rate": 5.208155182501463e-05, + "loss": 0.2121, + "step": 37804 + }, + { + "epoch": 3.0626215165262476, + "grad_norm": 0.08383599668741226, + "learning_rate": 5.2077051172419996e-05, + "loss": 0.2512, + "step": 37805 + }, + { + "epoch": 3.062702527543746, + "grad_norm": 0.06656774878501892, + "learning_rate": 5.2072550519825384e-05, + "loss": 0.209, + "step": 37806 + }, + { + "epoch": 3.0627835385612445, + "grad_norm": 0.08085005730390549, + "learning_rate": 5.206804986723075e-05, + "loss": 0.2692, + "step": 37807 + }, + { + "epoch": 3.0628645495787428, + "grad_norm": 0.07488963752985, + "learning_rate": 5.206354921463612e-05, + "loss": 0.2244, + "step": 37808 + }, + { + "epoch": 3.062945560596241, + "grad_norm": 0.06890054047107697, + "learning_rate": 5.2059048562041504e-05, + "loss": 0.1915, + 
"step": 37809 + }, + { + "epoch": 3.0630265716137393, + "grad_norm": 0.06550314277410507, + "learning_rate": 5.205454790944687e-05, + "loss": 0.2211, + "step": 37810 + }, + { + "epoch": 3.063107582631238, + "grad_norm": 0.054814115166664124, + "learning_rate": 5.205004725685224e-05, + "loss": 0.1907, + "step": 37811 + }, + { + "epoch": 3.063188593648736, + "grad_norm": 0.061660099774599075, + "learning_rate": 5.2045546604257625e-05, + "loss": 0.2298, + "step": 37812 + }, + { + "epoch": 3.0632696046662344, + "grad_norm": 0.07961427420377731, + "learning_rate": 5.204104595166299e-05, + "loss": 0.2413, + "step": 37813 + }, + { + "epoch": 3.063350615683733, + "grad_norm": 0.08189357072114944, + "learning_rate": 5.203654529906836e-05, + "loss": 0.2183, + "step": 37814 + }, + { + "epoch": 3.0634316267012314, + "grad_norm": 0.08027397841215134, + "learning_rate": 5.2032044646473746e-05, + "loss": 0.2233, + "step": 37815 + }, + { + "epoch": 3.0635126377187296, + "grad_norm": 0.06822025775909424, + "learning_rate": 5.202754399387911e-05, + "loss": 0.2241, + "step": 37816 + }, + { + "epoch": 3.0635936487362283, + "grad_norm": 0.06859682500362396, + "learning_rate": 5.202304334128449e-05, + "loss": 0.2298, + "step": 37817 + }, + { + "epoch": 3.0636746597537265, + "grad_norm": 0.061326105147600174, + "learning_rate": 5.201854268868987e-05, + "loss": 0.28, + "step": 37818 + }, + { + "epoch": 3.063755670771225, + "grad_norm": 0.08826600760221481, + "learning_rate": 5.2014042036095234e-05, + "loss": 0.2569, + "step": 37819 + }, + { + "epoch": 3.0638366817887235, + "grad_norm": 0.07701018452644348, + "learning_rate": 5.2009541383500614e-05, + "loss": 0.2506, + "step": 37820 + }, + { + "epoch": 3.0639176928062217, + "grad_norm": 0.07785700261592865, + "learning_rate": 5.200504073090599e-05, + "loss": 0.2383, + "step": 37821 + }, + { + "epoch": 3.06399870382372, + "grad_norm": 0.08271851390600204, + "learning_rate": 5.2000540078311354e-05, + "loss": 0.2141, + "step": 37822 + }, + { + "epoch": 3.064079714841218, + "grad_norm": 0.0652521550655365, + "learning_rate": 5.1996039425716735e-05, + "loss": 0.2539, + "step": 37823 + }, + { + "epoch": 3.064160725858717, + "grad_norm": 0.07621287554502487, + "learning_rate": 5.199153877312211e-05, + "loss": 0.1846, + "step": 37824 + }, + { + "epoch": 3.064241736876215, + "grad_norm": 0.07543626427650452, + "learning_rate": 5.1987038120527475e-05, + "loss": 0.2251, + "step": 37825 + }, + { + "epoch": 3.0643227478937134, + "grad_norm": 0.07078852504491806, + "learning_rate": 5.1982537467932856e-05, + "loss": 0.2359, + "step": 37826 + }, + { + "epoch": 3.064403758911212, + "grad_norm": 0.08174537867307663, + "learning_rate": 5.197803681533823e-05, + "loss": 0.2348, + "step": 37827 + }, + { + "epoch": 3.0644847699287103, + "grad_norm": 0.06269580870866776, + "learning_rate": 5.1973536162743596e-05, + "loss": 0.2082, + "step": 37828 + }, + { + "epoch": 3.0645657809462086, + "grad_norm": 0.07843272387981415, + "learning_rate": 5.1969035510148976e-05, + "loss": 0.2294, + "step": 37829 + }, + { + "epoch": 3.0646467919637073, + "grad_norm": 0.08541741967201233, + "learning_rate": 5.196453485755435e-05, + "loss": 0.2572, + "step": 37830 + }, + { + "epoch": 3.0647278029812055, + "grad_norm": 0.09261652082204819, + "learning_rate": 5.196003420495972e-05, + "loss": 0.2414, + "step": 37831 + }, + { + "epoch": 3.0648088139987038, + "grad_norm": 0.06951161473989487, + "learning_rate": 5.19555335523651e-05, + "loss": 0.2247, + "step": 37832 + }, + { + "epoch": 3.064889825016202, + 
"grad_norm": 0.0775570273399353, + "learning_rate": 5.195103289977047e-05, + "loss": 0.213, + "step": 37833 + }, + { + "epoch": 3.0649708360337007, + "grad_norm": 0.05936042591929436, + "learning_rate": 5.194653224717584e-05, + "loss": 0.2352, + "step": 37834 + }, + { + "epoch": 3.065051847051199, + "grad_norm": 0.07738978415727615, + "learning_rate": 5.194203159458122e-05, + "loss": 0.2336, + "step": 37835 + }, + { + "epoch": 3.065132858068697, + "grad_norm": 0.07329297810792923, + "learning_rate": 5.193753094198659e-05, + "loss": 0.2307, + "step": 37836 + }, + { + "epoch": 3.065213869086196, + "grad_norm": 0.06835595518350601, + "learning_rate": 5.193303028939196e-05, + "loss": 0.2438, + "step": 37837 + }, + { + "epoch": 3.065294880103694, + "grad_norm": 0.07460511475801468, + "learning_rate": 5.192852963679734e-05, + "loss": 0.218, + "step": 37838 + }, + { + "epoch": 3.0653758911211924, + "grad_norm": 0.06165440008044243, + "learning_rate": 5.192402898420271e-05, + "loss": 0.2262, + "step": 37839 + }, + { + "epoch": 3.065456902138691, + "grad_norm": 0.07027935981750488, + "learning_rate": 5.191952833160808e-05, + "loss": 0.239, + "step": 37840 + }, + { + "epoch": 3.0655379131561893, + "grad_norm": 0.0670817494392395, + "learning_rate": 5.191502767901346e-05, + "loss": 0.2223, + "step": 37841 + }, + { + "epoch": 3.0656189241736875, + "grad_norm": 0.06680265814065933, + "learning_rate": 5.191052702641883e-05, + "loss": 0.2366, + "step": 37842 + }, + { + "epoch": 3.065699935191186, + "grad_norm": 0.06441053748130798, + "learning_rate": 5.1906026373824214e-05, + "loss": 0.2357, + "step": 37843 + }, + { + "epoch": 3.0657809462086845, + "grad_norm": 0.06061835587024689, + "learning_rate": 5.190152572122958e-05, + "loss": 0.2327, + "step": 37844 + }, + { + "epoch": 3.0658619572261827, + "grad_norm": 0.06712067872285843, + "learning_rate": 5.1897025068634954e-05, + "loss": 0.2321, + "step": 37845 + }, + { + "epoch": 3.065942968243681, + "grad_norm": 0.07448230683803558, + "learning_rate": 5.1892524416040334e-05, + "loss": 0.2567, + "step": 37846 + }, + { + "epoch": 3.0660239792611796, + "grad_norm": 0.058865804225206375, + "learning_rate": 5.18880237634457e-05, + "loss": 0.214, + "step": 37847 + }, + { + "epoch": 3.066104990278678, + "grad_norm": 0.07380334287881851, + "learning_rate": 5.1883523110851075e-05, + "loss": 0.2223, + "step": 37848 + }, + { + "epoch": 3.066186001296176, + "grad_norm": 0.07105739414691925, + "learning_rate": 5.1879022458256455e-05, + "loss": 0.2171, + "step": 37849 + }, + { + "epoch": 3.066267012313675, + "grad_norm": 0.0776592344045639, + "learning_rate": 5.187452180566182e-05, + "loss": 0.216, + "step": 37850 + }, + { + "epoch": 3.066348023331173, + "grad_norm": 0.06981410086154938, + "learning_rate": 5.1870021153067196e-05, + "loss": 0.2408, + "step": 37851 + }, + { + "epoch": 3.0664290343486713, + "grad_norm": 0.0790480226278305, + "learning_rate": 5.1865520500472576e-05, + "loss": 0.2486, + "step": 37852 + }, + { + "epoch": 3.06651004536617, + "grad_norm": 0.06860355287790298, + "learning_rate": 5.186101984787794e-05, + "loss": 0.2365, + "step": 37853 + }, + { + "epoch": 3.0665910563836682, + "grad_norm": 0.0746893510222435, + "learning_rate": 5.1856519195283316e-05, + "loss": 0.2285, + "step": 37854 + }, + { + "epoch": 3.0666720674011665, + "grad_norm": 0.06371176242828369, + "learning_rate": 5.18520185426887e-05, + "loss": 0.2371, + "step": 37855 + }, + { + "epoch": 3.0667530784186647, + "grad_norm": 0.0675220713019371, + "learning_rate": 
5.1847517890094064e-05, + "loss": 0.2049, + "step": 37856 + }, + { + "epoch": 3.0668340894361634, + "grad_norm": 0.0685059204697609, + "learning_rate": 5.184301723749944e-05, + "loss": 0.2191, + "step": 37857 + }, + { + "epoch": 3.0669151004536617, + "grad_norm": 0.0811622142791748, + "learning_rate": 5.183851658490482e-05, + "loss": 0.2291, + "step": 37858 + }, + { + "epoch": 3.06699611147116, + "grad_norm": 0.05773041024804115, + "learning_rate": 5.1834015932310185e-05, + "loss": 0.2369, + "step": 37859 + }, + { + "epoch": 3.0670771224886586, + "grad_norm": 0.06531134992837906, + "learning_rate": 5.182951527971556e-05, + "loss": 0.2021, + "step": 37860 + }, + { + "epoch": 3.067158133506157, + "grad_norm": 0.08389285206794739, + "learning_rate": 5.182501462712094e-05, + "loss": 0.2364, + "step": 37861 + }, + { + "epoch": 3.067239144523655, + "grad_norm": 0.09492291510105133, + "learning_rate": 5.1820513974526305e-05, + "loss": 0.2315, + "step": 37862 + }, + { + "epoch": 3.067320155541154, + "grad_norm": 0.08566474169492722, + "learning_rate": 5.181601332193168e-05, + "loss": 0.2212, + "step": 37863 + }, + { + "epoch": 3.067401166558652, + "grad_norm": 0.0661972314119339, + "learning_rate": 5.181151266933706e-05, + "loss": 0.182, + "step": 37864 + }, + { + "epoch": 3.0674821775761503, + "grad_norm": 0.08925847709178925, + "learning_rate": 5.1807012016742426e-05, + "loss": 0.221, + "step": 37865 + }, + { + "epoch": 3.067563188593649, + "grad_norm": 0.06424117088317871, + "learning_rate": 5.18025113641478e-05, + "loss": 0.2518, + "step": 37866 + }, + { + "epoch": 3.067644199611147, + "grad_norm": 0.0826725959777832, + "learning_rate": 5.179801071155318e-05, + "loss": 0.2195, + "step": 37867 + }, + { + "epoch": 3.0677252106286454, + "grad_norm": 0.06489408016204834, + "learning_rate": 5.179351005895855e-05, + "loss": 0.2034, + "step": 37868 + }, + { + "epoch": 3.0678062216461437, + "grad_norm": 0.06176425889134407, + "learning_rate": 5.178900940636392e-05, + "loss": 0.2564, + "step": 37869 + }, + { + "epoch": 3.0678872326636424, + "grad_norm": 0.07508070766925812, + "learning_rate": 5.17845087537693e-05, + "loss": 0.2382, + "step": 37870 + }, + { + "epoch": 3.0679682436811406, + "grad_norm": 0.06734276562929153, + "learning_rate": 5.178000810117467e-05, + "loss": 0.2312, + "step": 37871 + }, + { + "epoch": 3.068049254698639, + "grad_norm": 0.06585001945495605, + "learning_rate": 5.1775507448580055e-05, + "loss": 0.2313, + "step": 37872 + }, + { + "epoch": 3.0681302657161376, + "grad_norm": 0.0641934871673584, + "learning_rate": 5.177100679598542e-05, + "loss": 0.2136, + "step": 37873 + }, + { + "epoch": 3.068211276733636, + "grad_norm": 0.07912000268697739, + "learning_rate": 5.176650614339079e-05, + "loss": 0.2236, + "step": 37874 + }, + { + "epoch": 3.068292287751134, + "grad_norm": 0.07646586745977402, + "learning_rate": 5.1762005490796176e-05, + "loss": 0.2342, + "step": 37875 + }, + { + "epoch": 3.0683732987686327, + "grad_norm": 0.06955553591251373, + "learning_rate": 5.175750483820154e-05, + "loss": 0.2141, + "step": 37876 + }, + { + "epoch": 3.068454309786131, + "grad_norm": 0.06082908436655998, + "learning_rate": 5.175300418560691e-05, + "loss": 0.2045, + "step": 37877 + }, + { + "epoch": 3.0685353208036292, + "grad_norm": 0.07860400527715683, + "learning_rate": 5.1748503533012297e-05, + "loss": 0.221, + "step": 37878 + }, + { + "epoch": 3.0686163318211275, + "grad_norm": 0.06857936084270477, + "learning_rate": 5.174400288041766e-05, + "loss": 0.2142, + "step": 37879 + }, + { + 
"epoch": 3.068697342838626, + "grad_norm": 0.06467515230178833, + "learning_rate": 5.173950222782303e-05, + "loss": 0.1947, + "step": 37880 + }, + { + "epoch": 3.0687783538561244, + "grad_norm": 0.06888511031866074, + "learning_rate": 5.173500157522842e-05, + "loss": 0.2594, + "step": 37881 + }, + { + "epoch": 3.0688593648736227, + "grad_norm": 0.06320245563983917, + "learning_rate": 5.1730500922633784e-05, + "loss": 0.207, + "step": 37882 + }, + { + "epoch": 3.0689403758911213, + "grad_norm": 0.07228653877973557, + "learning_rate": 5.172600027003916e-05, + "loss": 0.2266, + "step": 37883 + }, + { + "epoch": 3.0690213869086196, + "grad_norm": 0.0634072795510292, + "learning_rate": 5.172149961744454e-05, + "loss": 0.2282, + "step": 37884 + }, + { + "epoch": 3.069102397926118, + "grad_norm": 0.09231677651405334, + "learning_rate": 5.1716998964849905e-05, + "loss": 0.2451, + "step": 37885 + }, + { + "epoch": 3.0691834089436165, + "grad_norm": 0.062211230397224426, + "learning_rate": 5.171249831225528e-05, + "loss": 0.2275, + "step": 37886 + }, + { + "epoch": 3.0692644199611148, + "grad_norm": 0.06925942003726959, + "learning_rate": 5.170799765966066e-05, + "loss": 0.2077, + "step": 37887 + }, + { + "epoch": 3.069345430978613, + "grad_norm": 0.1082034781575203, + "learning_rate": 5.1703497007066026e-05, + "loss": 0.2093, + "step": 37888 + }, + { + "epoch": 3.0694264419961117, + "grad_norm": 0.059054188430309296, + "learning_rate": 5.16989963544714e-05, + "loss": 0.2651, + "step": 37889 + }, + { + "epoch": 3.06950745301361, + "grad_norm": 0.07227233797311783, + "learning_rate": 5.169449570187678e-05, + "loss": 0.2525, + "step": 37890 + }, + { + "epoch": 3.069588464031108, + "grad_norm": 0.07376521080732346, + "learning_rate": 5.1689995049282147e-05, + "loss": 0.2542, + "step": 37891 + }, + { + "epoch": 3.0696694750486064, + "grad_norm": 0.0684414729475975, + "learning_rate": 5.168549439668752e-05, + "loss": 0.229, + "step": 37892 + }, + { + "epoch": 3.069750486066105, + "grad_norm": 0.06626005470752716, + "learning_rate": 5.16809937440929e-05, + "loss": 0.2198, + "step": 37893 + }, + { + "epoch": 3.0698314970836034, + "grad_norm": 0.0789312869310379, + "learning_rate": 5.167649309149827e-05, + "loss": 0.2563, + "step": 37894 + }, + { + "epoch": 3.0699125081011016, + "grad_norm": 0.06644661724567413, + "learning_rate": 5.167199243890364e-05, + "loss": 0.2453, + "step": 37895 + }, + { + "epoch": 3.0699935191186003, + "grad_norm": 0.07774606347084045, + "learning_rate": 5.166749178630902e-05, + "loss": 0.2669, + "step": 37896 + }, + { + "epoch": 3.0700745301360985, + "grad_norm": 0.09429129958152771, + "learning_rate": 5.166299113371439e-05, + "loss": 0.2276, + "step": 37897 + }, + { + "epoch": 3.070155541153597, + "grad_norm": 0.07213341444730759, + "learning_rate": 5.165849048111977e-05, + "loss": 0.2385, + "step": 37898 + }, + { + "epoch": 3.0702365521710955, + "grad_norm": 0.06322767585515976, + "learning_rate": 5.165398982852514e-05, + "loss": 0.2112, + "step": 37899 + }, + { + "epoch": 3.0703175631885937, + "grad_norm": 0.0699484571814537, + "learning_rate": 5.164948917593051e-05, + "loss": 0.2288, + "step": 37900 + }, + { + "epoch": 3.070398574206092, + "grad_norm": 0.06551605463027954, + "learning_rate": 5.164498852333589e-05, + "loss": 0.2175, + "step": 37901 + }, + { + "epoch": 3.07047958522359, + "grad_norm": 0.0687117800116539, + "learning_rate": 5.164048787074126e-05, + "loss": 0.2439, + "step": 37902 + }, + { + "epoch": 3.070560596241089, + "grad_norm": 0.07314037531614304, + 
"learning_rate": 5.163598721814663e-05, + "loss": 0.1995, + "step": 37903 + }, + { + "epoch": 3.070641607258587, + "grad_norm": 0.08327914029359818, + "learning_rate": 5.163148656555201e-05, + "loss": 0.2242, + "step": 37904 + }, + { + "epoch": 3.0707226182760854, + "grad_norm": 0.08092246204614639, + "learning_rate": 5.1626985912957384e-05, + "loss": 0.2712, + "step": 37905 + }, + { + "epoch": 3.070803629293584, + "grad_norm": 0.062449127435684204, + "learning_rate": 5.162248526036275e-05, + "loss": 0.2368, + "step": 37906 + }, + { + "epoch": 3.0708846403110823, + "grad_norm": 0.05750298872590065, + "learning_rate": 5.161798460776813e-05, + "loss": 0.2053, + "step": 37907 + }, + { + "epoch": 3.0709656513285806, + "grad_norm": 0.05637628957629204, + "learning_rate": 5.1613483955173505e-05, + "loss": 0.2378, + "step": 37908 + }, + { + "epoch": 3.0710466623460793, + "grad_norm": 0.07444218546152115, + "learning_rate": 5.160898330257887e-05, + "loss": 0.2127, + "step": 37909 + }, + { + "epoch": 3.0711276733635775, + "grad_norm": 0.0685492604970932, + "learning_rate": 5.160448264998425e-05, + "loss": 0.2548, + "step": 37910 + }, + { + "epoch": 3.0712086843810757, + "grad_norm": 0.0651240348815918, + "learning_rate": 5.1599981997389625e-05, + "loss": 0.2301, + "step": 37911 + }, + { + "epoch": 3.071289695398574, + "grad_norm": 0.06461571156978607, + "learning_rate": 5.159548134479499e-05, + "loss": 0.2373, + "step": 37912 + }, + { + "epoch": 3.0713707064160727, + "grad_norm": 0.0866527408361435, + "learning_rate": 5.159098069220037e-05, + "loss": 0.2355, + "step": 37913 + }, + { + "epoch": 3.071451717433571, + "grad_norm": 0.06332384794950485, + "learning_rate": 5.1586480039605746e-05, + "loss": 0.2346, + "step": 37914 + }, + { + "epoch": 3.071532728451069, + "grad_norm": 0.07270190864801407, + "learning_rate": 5.158197938701111e-05, + "loss": 0.2162, + "step": 37915 + }, + { + "epoch": 3.071613739468568, + "grad_norm": 0.07732129096984863, + "learning_rate": 5.1577478734416493e-05, + "loss": 0.2001, + "step": 37916 + }, + { + "epoch": 3.071694750486066, + "grad_norm": 0.0813266858458519, + "learning_rate": 5.157297808182187e-05, + "loss": 0.259, + "step": 37917 + }, + { + "epoch": 3.0717757615035644, + "grad_norm": 0.07190661877393723, + "learning_rate": 5.1568477429227234e-05, + "loss": 0.2087, + "step": 37918 + }, + { + "epoch": 3.071856772521063, + "grad_norm": 0.07515159994363785, + "learning_rate": 5.1563976776632614e-05, + "loss": 0.2316, + "step": 37919 + }, + { + "epoch": 3.0719377835385613, + "grad_norm": 0.07636462152004242, + "learning_rate": 5.155947612403799e-05, + "loss": 0.2296, + "step": 37920 + }, + { + "epoch": 3.0720187945560595, + "grad_norm": 0.07358011603355408, + "learning_rate": 5.1554975471443355e-05, + "loss": 0.206, + "step": 37921 + }, + { + "epoch": 3.072099805573558, + "grad_norm": 0.05487060546875, + "learning_rate": 5.1550474818848735e-05, + "loss": 0.2, + "step": 37922 + }, + { + "epoch": 3.0721808165910565, + "grad_norm": 0.08249057084321976, + "learning_rate": 5.154597416625411e-05, + "loss": 0.2099, + "step": 37923 + }, + { + "epoch": 3.0722618276085547, + "grad_norm": 0.06941979378461838, + "learning_rate": 5.154147351365949e-05, + "loss": 0.2349, + "step": 37924 + }, + { + "epoch": 3.072342838626053, + "grad_norm": 0.07720015197992325, + "learning_rate": 5.1536972861064856e-05, + "loss": 0.2585, + "step": 37925 + }, + { + "epoch": 3.0724238496435516, + "grad_norm": 0.07195110619068146, + "learning_rate": 5.153247220847023e-05, + "loss": 0.2437, + "step": 
37926 + }, + { + "epoch": 3.07250486066105, + "grad_norm": 0.08540907502174377, + "learning_rate": 5.152797155587561e-05, + "loss": 0.2424, + "step": 37927 + }, + { + "epoch": 3.072585871678548, + "grad_norm": 0.10462567955255508, + "learning_rate": 5.152347090328098e-05, + "loss": 0.2277, + "step": 37928 + }, + { + "epoch": 3.072666882696047, + "grad_norm": 0.0824456512928009, + "learning_rate": 5.151897025068635e-05, + "loss": 0.2582, + "step": 37929 + }, + { + "epoch": 3.072747893713545, + "grad_norm": 0.06701191514730453, + "learning_rate": 5.151446959809173e-05, + "loss": 0.2477, + "step": 37930 + }, + { + "epoch": 3.0728289047310433, + "grad_norm": 0.06684470176696777, + "learning_rate": 5.15099689454971e-05, + "loss": 0.2442, + "step": 37931 + }, + { + "epoch": 3.072909915748542, + "grad_norm": 0.09025054425001144, + "learning_rate": 5.150546829290247e-05, + "loss": 0.2667, + "step": 37932 + }, + { + "epoch": 3.0729909267660402, + "grad_norm": 0.06221548467874527, + "learning_rate": 5.150096764030785e-05, + "loss": 0.2143, + "step": 37933 + }, + { + "epoch": 3.0730719377835385, + "grad_norm": 0.060628268867731094, + "learning_rate": 5.149646698771322e-05, + "loss": 0.2303, + "step": 37934 + }, + { + "epoch": 3.0731529488010367, + "grad_norm": 0.08122788369655609, + "learning_rate": 5.149196633511859e-05, + "loss": 0.2504, + "step": 37935 + }, + { + "epoch": 3.0732339598185354, + "grad_norm": 0.07332400232553482, + "learning_rate": 5.148746568252397e-05, + "loss": 0.2223, + "step": 37936 + }, + { + "epoch": 3.0733149708360337, + "grad_norm": 0.0817917063832283, + "learning_rate": 5.148296502992934e-05, + "loss": 0.2812, + "step": 37937 + }, + { + "epoch": 3.073395981853532, + "grad_norm": 0.06565006822347641, + "learning_rate": 5.147846437733471e-05, + "loss": 0.351, + "step": 37938 + }, + { + "epoch": 3.0734769928710306, + "grad_norm": 0.06252454221248627, + "learning_rate": 5.147396372474009e-05, + "loss": 0.2064, + "step": 37939 + }, + { + "epoch": 3.073558003888529, + "grad_norm": 0.0847528949379921, + "learning_rate": 5.146946307214546e-05, + "loss": 0.2214, + "step": 37940 + }, + { + "epoch": 3.073639014906027, + "grad_norm": 0.06985647976398468, + "learning_rate": 5.1464962419550834e-05, + "loss": 0.2443, + "step": 37941 + }, + { + "epoch": 3.073720025923526, + "grad_norm": 0.07312333583831787, + "learning_rate": 5.1460461766956214e-05, + "loss": 0.2522, + "step": 37942 + }, + { + "epoch": 3.073801036941024, + "grad_norm": 0.07783205062150955, + "learning_rate": 5.145596111436158e-05, + "loss": 0.2536, + "step": 37943 + }, + { + "epoch": 3.0738820479585223, + "grad_norm": 0.06937301903963089, + "learning_rate": 5.1451460461766954e-05, + "loss": 0.2139, + "step": 37944 + }, + { + "epoch": 3.0739630589760205, + "grad_norm": 0.07388978451490402, + "learning_rate": 5.1446959809172335e-05, + "loss": 0.2015, + "step": 37945 + }, + { + "epoch": 3.074044069993519, + "grad_norm": 0.0640605017542839, + "learning_rate": 5.14424591565777e-05, + "loss": 0.25, + "step": 37946 + }, + { + "epoch": 3.0741250810110174, + "grad_norm": 0.06384878605604172, + "learning_rate": 5.1437958503983075e-05, + "loss": 0.2137, + "step": 37947 + }, + { + "epoch": 3.0742060920285157, + "grad_norm": 0.06250331550836563, + "learning_rate": 5.1433457851388456e-05, + "loss": 0.2125, + "step": 37948 + }, + { + "epoch": 3.0742871030460144, + "grad_norm": 0.06558407098054886, + "learning_rate": 5.142895719879382e-05, + "loss": 0.2383, + "step": 37949 + }, + { + "epoch": 3.0743681140635126, + "grad_norm": 
0.05187705159187317, + "learning_rate": 5.142445654619921e-05, + "loss": 0.2134, + "step": 37950 + }, + { + "epoch": 3.074449125081011, + "grad_norm": 0.0846744105219841, + "learning_rate": 5.1419955893604576e-05, + "loss": 0.2796, + "step": 37951 + }, + { + "epoch": 3.0745301360985096, + "grad_norm": 0.0652763769030571, + "learning_rate": 5.141545524100995e-05, + "loss": 0.2173, + "step": 37952 + }, + { + "epoch": 3.074611147116008, + "grad_norm": 0.07559669762849808, + "learning_rate": 5.141095458841533e-05, + "loss": 0.2458, + "step": 37953 + }, + { + "epoch": 3.074692158133506, + "grad_norm": 0.07435882091522217, + "learning_rate": 5.14064539358207e-05, + "loss": 0.2564, + "step": 37954 + }, + { + "epoch": 3.0747731691510047, + "grad_norm": 0.08500465750694275, + "learning_rate": 5.140195328322607e-05, + "loss": 0.262, + "step": 37955 + }, + { + "epoch": 3.074854180168503, + "grad_norm": 0.05521190166473389, + "learning_rate": 5.139745263063145e-05, + "loss": 0.2248, + "step": 37956 + }, + { + "epoch": 3.0749351911860012, + "grad_norm": 0.07541017979383469, + "learning_rate": 5.139295197803682e-05, + "loss": 0.244, + "step": 37957 + }, + { + "epoch": 3.0750162022034995, + "grad_norm": 0.06455586850643158, + "learning_rate": 5.138845132544219e-05, + "loss": 0.2069, + "step": 37958 + }, + { + "epoch": 3.075097213220998, + "grad_norm": 0.08610700070858002, + "learning_rate": 5.138395067284757e-05, + "loss": 0.2151, + "step": 37959 + }, + { + "epoch": 3.0751782242384964, + "grad_norm": 0.07560251653194427, + "learning_rate": 5.137945002025294e-05, + "loss": 0.1906, + "step": 37960 + }, + { + "epoch": 3.0752592352559946, + "grad_norm": 0.07362257689237595, + "learning_rate": 5.137494936765831e-05, + "loss": 0.2349, + "step": 37961 + }, + { + "epoch": 3.0753402462734933, + "grad_norm": 0.06207628548145294, + "learning_rate": 5.137044871506369e-05, + "loss": 0.2123, + "step": 37962 + }, + { + "epoch": 3.0754212572909916, + "grad_norm": 0.07409238815307617, + "learning_rate": 5.136594806246906e-05, + "loss": 0.2077, + "step": 37963 + }, + { + "epoch": 3.07550226830849, + "grad_norm": 0.06870327144861221, + "learning_rate": 5.136144740987443e-05, + "loss": 0.2268, + "step": 37964 + }, + { + "epoch": 3.0755832793259885, + "grad_norm": 0.07700678706169128, + "learning_rate": 5.1356946757279814e-05, + "loss": 0.2208, + "step": 37965 + }, + { + "epoch": 3.0756642903434868, + "grad_norm": 0.06135139986872673, + "learning_rate": 5.135244610468518e-05, + "loss": 0.2369, + "step": 37966 + }, + { + "epoch": 3.075745301360985, + "grad_norm": 0.08341161906719208, + "learning_rate": 5.1347945452090554e-05, + "loss": 0.235, + "step": 37967 + }, + { + "epoch": 3.0758263123784833, + "grad_norm": 0.09900689870119095, + "learning_rate": 5.1343444799495934e-05, + "loss": 0.2142, + "step": 37968 + }, + { + "epoch": 3.075907323395982, + "grad_norm": 0.0842118039727211, + "learning_rate": 5.13389441469013e-05, + "loss": 0.2431, + "step": 37969 + }, + { + "epoch": 3.07598833441348, + "grad_norm": 0.06679286807775497, + "learning_rate": 5.1334443494306675e-05, + "loss": 0.2316, + "step": 37970 + }, + { + "epoch": 3.0760693454309784, + "grad_norm": 0.0700433999300003, + "learning_rate": 5.1329942841712055e-05, + "loss": 0.259, + "step": 37971 + }, + { + "epoch": 3.076150356448477, + "grad_norm": 0.07671942561864853, + "learning_rate": 5.132544218911742e-05, + "loss": 0.2664, + "step": 37972 + }, + { + "epoch": 3.0762313674659754, + "grad_norm": 0.07188671827316284, + "learning_rate": 5.1320941536522796e-05, + "loss": 
0.2175, + "step": 37973 + }, + { + "epoch": 3.0763123784834736, + "grad_norm": 0.06508833169937134, + "learning_rate": 5.1316440883928176e-05, + "loss": 0.214, + "step": 37974 + }, + { + "epoch": 3.0763933895009723, + "grad_norm": 0.07478687167167664, + "learning_rate": 5.131194023133354e-05, + "loss": 0.2472, + "step": 37975 + }, + { + "epoch": 3.0764744005184705, + "grad_norm": 0.06961677968502045, + "learning_rate": 5.130743957873892e-05, + "loss": 0.2443, + "step": 37976 + }, + { + "epoch": 3.076555411535969, + "grad_norm": 0.06969679892063141, + "learning_rate": 5.13029389261443e-05, + "loss": 0.2226, + "step": 37977 + }, + { + "epoch": 3.0766364225534675, + "grad_norm": 0.060342058539390564, + "learning_rate": 5.1298438273549664e-05, + "loss": 0.2395, + "step": 37978 + }, + { + "epoch": 3.0767174335709657, + "grad_norm": 0.06313896924257278, + "learning_rate": 5.1293937620955044e-05, + "loss": 0.2749, + "step": 37979 + }, + { + "epoch": 3.076798444588464, + "grad_norm": 0.05931505560874939, + "learning_rate": 5.128943696836042e-05, + "loss": 0.221, + "step": 37980 + }, + { + "epoch": 3.076879455605962, + "grad_norm": 0.06451479345560074, + "learning_rate": 5.1284936315765784e-05, + "loss": 0.2288, + "step": 37981 + }, + { + "epoch": 3.076960466623461, + "grad_norm": 0.07562223076820374, + "learning_rate": 5.1280435663171165e-05, + "loss": 0.2471, + "step": 37982 + }, + { + "epoch": 3.077041477640959, + "grad_norm": 0.05078315734863281, + "learning_rate": 5.127593501057654e-05, + "loss": 0.2181, + "step": 37983 + }, + { + "epoch": 3.0771224886584574, + "grad_norm": 0.07727597653865814, + "learning_rate": 5.1271434357981905e-05, + "loss": 0.211, + "step": 37984 + }, + { + "epoch": 3.077203499675956, + "grad_norm": 0.06155778467655182, + "learning_rate": 5.1266933705387286e-05, + "loss": 0.2155, + "step": 37985 + }, + { + "epoch": 3.0772845106934543, + "grad_norm": 0.06652826070785522, + "learning_rate": 5.126243305279266e-05, + "loss": 0.2406, + "step": 37986 + }, + { + "epoch": 3.0773655217109526, + "grad_norm": 0.06903241574764252, + "learning_rate": 5.1257932400198026e-05, + "loss": 0.2405, + "step": 37987 + }, + { + "epoch": 3.0774465327284513, + "grad_norm": 0.07772725820541382, + "learning_rate": 5.1253431747603406e-05, + "loss": 0.2317, + "step": 37988 + }, + { + "epoch": 3.0775275437459495, + "grad_norm": 0.07417483627796173, + "learning_rate": 5.124893109500878e-05, + "loss": 0.2337, + "step": 37989 + }, + { + "epoch": 3.0776085547634477, + "grad_norm": 0.07937649637460709, + "learning_rate": 5.124443044241415e-05, + "loss": 0.2705, + "step": 37990 + }, + { + "epoch": 3.077689565780946, + "grad_norm": 0.07012887299060822, + "learning_rate": 5.123992978981953e-05, + "loss": 0.1856, + "step": 37991 + }, + { + "epoch": 3.0777705767984447, + "grad_norm": 0.06151702627539635, + "learning_rate": 5.12354291372249e-05, + "loss": 0.1996, + "step": 37992 + }, + { + "epoch": 3.077851587815943, + "grad_norm": 0.06742820888757706, + "learning_rate": 5.123092848463027e-05, + "loss": 0.2367, + "step": 37993 + }, + { + "epoch": 3.077932598833441, + "grad_norm": 0.08394744992256165, + "learning_rate": 5.122642783203565e-05, + "loss": 0.2513, + "step": 37994 + }, + { + "epoch": 3.07801360985094, + "grad_norm": 0.08035869151353836, + "learning_rate": 5.122192717944102e-05, + "loss": 0.1969, + "step": 37995 + }, + { + "epoch": 3.078094620868438, + "grad_norm": 0.06402164697647095, + "learning_rate": 5.121742652684639e-05, + "loss": 0.2315, + "step": 37996 + }, + { + "epoch": 3.0781756318859363, + 
"grad_norm": 0.06812452524900436, + "learning_rate": 5.121292587425177e-05, + "loss": 0.2563, + "step": 37997 + }, + { + "epoch": 3.078256642903435, + "grad_norm": 0.06456022709608078, + "learning_rate": 5.120842522165714e-05, + "loss": 0.2365, + "step": 37998 + }, + { + "epoch": 3.0783376539209333, + "grad_norm": 0.06034749373793602, + "learning_rate": 5.120392456906251e-05, + "loss": 0.2411, + "step": 37999 + }, + { + "epoch": 3.0784186649384315, + "grad_norm": 0.06097142770886421, + "learning_rate": 5.119942391646789e-05, + "loss": 0.1908, + "step": 38000 + }, + { + "epoch": 3.07849967595593, + "grad_norm": 0.07852084934711456, + "learning_rate": 5.119492326387326e-05, + "loss": 0.2292, + "step": 38001 + }, + { + "epoch": 3.0785806869734285, + "grad_norm": 0.08489027619361877, + "learning_rate": 5.1190422611278644e-05, + "loss": 0.2771, + "step": 38002 + }, + { + "epoch": 3.0786616979909267, + "grad_norm": 0.0690150335431099, + "learning_rate": 5.118592195868401e-05, + "loss": 0.2177, + "step": 38003 + }, + { + "epoch": 3.078742709008425, + "grad_norm": 0.08611797541379929, + "learning_rate": 5.1181421306089384e-05, + "loss": 0.2413, + "step": 38004 + }, + { + "epoch": 3.0788237200259236, + "grad_norm": 0.055115729570388794, + "learning_rate": 5.1176920653494764e-05, + "loss": 0.1832, + "step": 38005 + }, + { + "epoch": 3.078904731043422, + "grad_norm": 0.06743993610143661, + "learning_rate": 5.117242000090013e-05, + "loss": 0.2196, + "step": 38006 + }, + { + "epoch": 3.07898574206092, + "grad_norm": 0.06356610357761383, + "learning_rate": 5.1167919348305505e-05, + "loss": 0.2042, + "step": 38007 + }, + { + "epoch": 3.079066753078419, + "grad_norm": 0.06697381287813187, + "learning_rate": 5.1163418695710885e-05, + "loss": 0.2388, + "step": 38008 + }, + { + "epoch": 3.079147764095917, + "grad_norm": 0.08403827995061874, + "learning_rate": 5.115891804311625e-05, + "loss": 0.2314, + "step": 38009 + }, + { + "epoch": 3.0792287751134153, + "grad_norm": 0.06729169189929962, + "learning_rate": 5.1154417390521626e-05, + "loss": 0.2261, + "step": 38010 + }, + { + "epoch": 3.079309786130914, + "grad_norm": 0.07959628105163574, + "learning_rate": 5.1149916737927006e-05, + "loss": 0.2357, + "step": 38011 + }, + { + "epoch": 3.0793907971484122, + "grad_norm": 0.07899930328130722, + "learning_rate": 5.114541608533237e-05, + "loss": 0.2665, + "step": 38012 + }, + { + "epoch": 3.0794718081659105, + "grad_norm": 0.06835725158452988, + "learning_rate": 5.1140915432737747e-05, + "loss": 0.2404, + "step": 38013 + }, + { + "epoch": 3.0795528191834087, + "grad_norm": 0.06105947867035866, + "learning_rate": 5.113641478014313e-05, + "loss": 0.2399, + "step": 38014 + }, + { + "epoch": 3.0796338302009074, + "grad_norm": 0.06209400296211243, + "learning_rate": 5.1131914127548494e-05, + "loss": 0.1897, + "step": 38015 + }, + { + "epoch": 3.0797148412184057, + "grad_norm": 0.07727130502462387, + "learning_rate": 5.112741347495387e-05, + "loss": 0.2556, + "step": 38016 + }, + { + "epoch": 3.079795852235904, + "grad_norm": 0.07233865559101105, + "learning_rate": 5.112291282235925e-05, + "loss": 0.2576, + "step": 38017 + }, + { + "epoch": 3.0798768632534026, + "grad_norm": 0.07869274914264679, + "learning_rate": 5.1118412169764615e-05, + "loss": 0.2686, + "step": 38018 + }, + { + "epoch": 3.079957874270901, + "grad_norm": 0.06438925117254257, + "learning_rate": 5.111391151716999e-05, + "loss": 0.2303, + "step": 38019 + }, + { + "epoch": 3.080038885288399, + "grad_norm": 0.06574901938438416, + "learning_rate": 
5.110941086457537e-05, + "loss": 0.21, + "step": 38020 + }, + { + "epoch": 3.0801198963058978, + "grad_norm": 0.09768787771463394, + "learning_rate": 5.110491021198074e-05, + "loss": 0.2309, + "step": 38021 + }, + { + "epoch": 3.080200907323396, + "grad_norm": 0.07332216203212738, + "learning_rate": 5.110040955938611e-05, + "loss": 0.249, + "step": 38022 + }, + { + "epoch": 3.0802819183408943, + "grad_norm": 0.0715082660317421, + "learning_rate": 5.109590890679149e-05, + "loss": 0.2539, + "step": 38023 + }, + { + "epoch": 3.080362929358393, + "grad_norm": 0.06920846551656723, + "learning_rate": 5.109140825419686e-05, + "loss": 0.2114, + "step": 38024 + }, + { + "epoch": 3.080443940375891, + "grad_norm": 0.07967160642147064, + "learning_rate": 5.108690760160223e-05, + "loss": 0.2177, + "step": 38025 + }, + { + "epoch": 3.0805249513933894, + "grad_norm": 0.08817549794912338, + "learning_rate": 5.108240694900761e-05, + "loss": 0.2499, + "step": 38026 + }, + { + "epoch": 3.0806059624108877, + "grad_norm": 0.05930979549884796, + "learning_rate": 5.1077906296412984e-05, + "loss": 0.2294, + "step": 38027 + }, + { + "epoch": 3.0806869734283864, + "grad_norm": 0.062281325459480286, + "learning_rate": 5.107340564381835e-05, + "loss": 0.1834, + "step": 38028 + }, + { + "epoch": 3.0807679844458846, + "grad_norm": 0.06266391277313232, + "learning_rate": 5.106890499122373e-05, + "loss": 0.2284, + "step": 38029 + }, + { + "epoch": 3.080848995463383, + "grad_norm": 0.0831487774848938, + "learning_rate": 5.1064404338629105e-05, + "loss": 0.2499, + "step": 38030 + }, + { + "epoch": 3.0809300064808816, + "grad_norm": 0.07766416668891907, + "learning_rate": 5.1059903686034485e-05, + "loss": 0.2153, + "step": 38031 + }, + { + "epoch": 3.08101101749838, + "grad_norm": 0.07681175321340561, + "learning_rate": 5.105540303343985e-05, + "loss": 0.2177, + "step": 38032 + }, + { + "epoch": 3.081092028515878, + "grad_norm": 0.06907657533884048, + "learning_rate": 5.1050902380845225e-05, + "loss": 0.2689, + "step": 38033 + }, + { + "epoch": 3.0811730395333767, + "grad_norm": 0.06728033721446991, + "learning_rate": 5.1046401728250606e-05, + "loss": 0.2024, + "step": 38034 + }, + { + "epoch": 3.081254050550875, + "grad_norm": 0.07561451196670532, + "learning_rate": 5.104190107565597e-05, + "loss": 0.2108, + "step": 38035 + }, + { + "epoch": 3.0813350615683732, + "grad_norm": 0.06470509618520737, + "learning_rate": 5.1037400423061346e-05, + "loss": 0.2404, + "step": 38036 + }, + { + "epoch": 3.0814160725858715, + "grad_norm": 0.05925104022026062, + "learning_rate": 5.1032899770466727e-05, + "loss": 0.2147, + "step": 38037 + }, + { + "epoch": 3.08149708360337, + "grad_norm": 0.07685708999633789, + "learning_rate": 5.102839911787209e-05, + "loss": 0.2463, + "step": 38038 + }, + { + "epoch": 3.0815780946208684, + "grad_norm": 0.07041988521814346, + "learning_rate": 5.102389846527747e-05, + "loss": 0.2095, + "step": 38039 + }, + { + "epoch": 3.0816591056383666, + "grad_norm": 0.0685499832034111, + "learning_rate": 5.101939781268285e-05, + "loss": 0.2095, + "step": 38040 + }, + { + "epoch": 3.0817401166558653, + "grad_norm": 0.06696108728647232, + "learning_rate": 5.1014897160088214e-05, + "loss": 0.2174, + "step": 38041 + }, + { + "epoch": 3.0818211276733636, + "grad_norm": 0.10919345170259476, + "learning_rate": 5.101039650749359e-05, + "loss": 0.2313, + "step": 38042 + }, + { + "epoch": 3.081902138690862, + "grad_norm": 0.07602211087942123, + "learning_rate": 5.100589585489897e-05, + "loss": 0.2269, + "step": 38043 + }, + { 
+ "epoch": 3.0819831497083605, + "grad_norm": 0.0673341378569603, + "learning_rate": 5.1001395202304335e-05, + "loss": 0.2401, + "step": 38044 + }, + { + "epoch": 3.0820641607258588, + "grad_norm": 0.06349032372236252, + "learning_rate": 5.099689454970971e-05, + "loss": 0.2084, + "step": 38045 + }, + { + "epoch": 3.082145171743357, + "grad_norm": 0.05518511310219765, + "learning_rate": 5.099239389711509e-05, + "loss": 0.1958, + "step": 38046 + }, + { + "epoch": 3.0822261827608557, + "grad_norm": 0.06599873304367065, + "learning_rate": 5.0987893244520456e-05, + "loss": 0.2335, + "step": 38047 + }, + { + "epoch": 3.082307193778354, + "grad_norm": 0.07924923300743103, + "learning_rate": 5.098339259192583e-05, + "loss": 0.2228, + "step": 38048 + }, + { + "epoch": 3.082388204795852, + "grad_norm": 0.06403350830078125, + "learning_rate": 5.097889193933121e-05, + "loss": 0.1988, + "step": 38049 + }, + { + "epoch": 3.0824692158133504, + "grad_norm": 0.0660524070262909, + "learning_rate": 5.097439128673658e-05, + "loss": 0.2082, + "step": 38050 + }, + { + "epoch": 3.082550226830849, + "grad_norm": 0.07383035868406296, + "learning_rate": 5.096989063414195e-05, + "loss": 0.2288, + "step": 38051 + }, + { + "epoch": 3.0826312378483474, + "grad_norm": 0.05731251463294029, + "learning_rate": 5.096538998154733e-05, + "loss": 0.2167, + "step": 38052 + }, + { + "epoch": 3.0827122488658456, + "grad_norm": 0.08323297649621964, + "learning_rate": 5.09608893289527e-05, + "loss": 0.207, + "step": 38053 + }, + { + "epoch": 3.0827932598833443, + "grad_norm": 0.07124746590852737, + "learning_rate": 5.095638867635807e-05, + "loss": 0.2148, + "step": 38054 + }, + { + "epoch": 3.0828742709008425, + "grad_norm": 0.09191430360078812, + "learning_rate": 5.095188802376345e-05, + "loss": 0.2445, + "step": 38055 + }, + { + "epoch": 3.082955281918341, + "grad_norm": 0.06406380236148834, + "learning_rate": 5.094738737116882e-05, + "loss": 0.2147, + "step": 38056 + }, + { + "epoch": 3.0830362929358395, + "grad_norm": 0.06194882094860077, + "learning_rate": 5.09428867185742e-05, + "loss": 0.2221, + "step": 38057 + }, + { + "epoch": 3.0831173039533377, + "grad_norm": 0.07221710681915283, + "learning_rate": 5.093838606597957e-05, + "loss": 0.2295, + "step": 38058 + }, + { + "epoch": 3.083198314970836, + "grad_norm": 0.07320144772529602, + "learning_rate": 5.093388541338494e-05, + "loss": 0.2356, + "step": 38059 + }, + { + "epoch": 3.083279325988334, + "grad_norm": 0.060835100710392, + "learning_rate": 5.092938476079032e-05, + "loss": 0.2301, + "step": 38060 + }, + { + "epoch": 3.083360337005833, + "grad_norm": 0.07213728874921799, + "learning_rate": 5.092488410819569e-05, + "loss": 0.2328, + "step": 38061 + }, + { + "epoch": 3.083441348023331, + "grad_norm": 0.06496760994195938, + "learning_rate": 5.092038345560106e-05, + "loss": 0.1952, + "step": 38062 + }, + { + "epoch": 3.0835223590408294, + "grad_norm": 0.06325732171535492, + "learning_rate": 5.091588280300644e-05, + "loss": 0.2247, + "step": 38063 + }, + { + "epoch": 3.083603370058328, + "grad_norm": 0.06883818656206131, + "learning_rate": 5.0911382150411814e-05, + "loss": 0.2103, + "step": 38064 + }, + { + "epoch": 3.0836843810758263, + "grad_norm": 0.08192932605743408, + "learning_rate": 5.090688149781718e-05, + "loss": 0.2638, + "step": 38065 + }, + { + "epoch": 3.0837653920933246, + "grad_norm": 0.08824772387742996, + "learning_rate": 5.090238084522256e-05, + "loss": 0.2119, + "step": 38066 + }, + { + "epoch": 3.0838464031108233, + "grad_norm": 0.08697578310966492, + 
"learning_rate": 5.0897880192627935e-05, + "loss": 0.2806, + "step": 38067 + }, + { + "epoch": 3.0839274141283215, + "grad_norm": 0.06647021323442459, + "learning_rate": 5.08933795400333e-05, + "loss": 0.1708, + "step": 38068 + }, + { + "epoch": 3.0840084251458197, + "grad_norm": 0.06968235969543457, + "learning_rate": 5.088887888743868e-05, + "loss": 0.2041, + "step": 38069 + }, + { + "epoch": 3.0840894361633184, + "grad_norm": 0.06884042918682098, + "learning_rate": 5.0884378234844055e-05, + "loss": 0.2403, + "step": 38070 + }, + { + "epoch": 3.0841704471808167, + "grad_norm": 0.06541278958320618, + "learning_rate": 5.087987758224942e-05, + "loss": 0.2834, + "step": 38071 + }, + { + "epoch": 3.084251458198315, + "grad_norm": 0.07046263664960861, + "learning_rate": 5.08753769296548e-05, + "loss": 0.2242, + "step": 38072 + }, + { + "epoch": 3.084332469215813, + "grad_norm": 0.0605572834610939, + "learning_rate": 5.0870876277060176e-05, + "loss": 0.2102, + "step": 38073 + }, + { + "epoch": 3.084413480233312, + "grad_norm": 0.07622010260820389, + "learning_rate": 5.086637562446554e-05, + "loss": 0.2486, + "step": 38074 + }, + { + "epoch": 3.08449449125081, + "grad_norm": 0.06665437668561935, + "learning_rate": 5.0861874971870923e-05, + "loss": 0.214, + "step": 38075 + }, + { + "epoch": 3.0845755022683083, + "grad_norm": 0.07465348392724991, + "learning_rate": 5.08573743192763e-05, + "loss": 0.2371, + "step": 38076 + }, + { + "epoch": 3.084656513285807, + "grad_norm": 0.08477599918842316, + "learning_rate": 5.0852873666681664e-05, + "loss": 0.2299, + "step": 38077 + }, + { + "epoch": 3.0847375243033053, + "grad_norm": 0.08032052218914032, + "learning_rate": 5.0848373014087044e-05, + "loss": 0.2496, + "step": 38078 + }, + { + "epoch": 3.0848185353208035, + "grad_norm": 0.0856558084487915, + "learning_rate": 5.084387236149242e-05, + "loss": 0.2353, + "step": 38079 + }, + { + "epoch": 3.084899546338302, + "grad_norm": 0.0713203176856041, + "learning_rate": 5.0839371708897785e-05, + "loss": 0.2028, + "step": 38080 + }, + { + "epoch": 3.0849805573558005, + "grad_norm": 0.06258882582187653, + "learning_rate": 5.0834871056303165e-05, + "loss": 0.2636, + "step": 38081 + }, + { + "epoch": 3.0850615683732987, + "grad_norm": 0.07384096086025238, + "learning_rate": 5.083037040370854e-05, + "loss": 0.2494, + "step": 38082 + }, + { + "epoch": 3.085142579390797, + "grad_norm": 0.07079246640205383, + "learning_rate": 5.082586975111392e-05, + "loss": 0.2386, + "step": 38083 + }, + { + "epoch": 3.0852235904082956, + "grad_norm": 0.06467340141534805, + "learning_rate": 5.0821369098519286e-05, + "loss": 0.2367, + "step": 38084 + }, + { + "epoch": 3.085304601425794, + "grad_norm": 0.07608397305011749, + "learning_rate": 5.081686844592466e-05, + "loss": 0.2207, + "step": 38085 + }, + { + "epoch": 3.085385612443292, + "grad_norm": 0.06386285275220871, + "learning_rate": 5.081236779333004e-05, + "loss": 0.238, + "step": 38086 + }, + { + "epoch": 3.085466623460791, + "grad_norm": 0.08161813020706177, + "learning_rate": 5.080786714073541e-05, + "loss": 0.2505, + "step": 38087 + }, + { + "epoch": 3.085547634478289, + "grad_norm": 0.06811436265707016, + "learning_rate": 5.080336648814078e-05, + "loss": 0.2641, + "step": 38088 + }, + { + "epoch": 3.0856286454957873, + "grad_norm": 0.09591707587242126, + "learning_rate": 5.079886583554616e-05, + "loss": 0.2343, + "step": 38089 + }, + { + "epoch": 3.085709656513286, + "grad_norm": 0.0632985383272171, + "learning_rate": 5.0794365182951534e-05, + "loss": 0.2361, + "step": 
38090 + }, + { + "epoch": 3.0857906675307842, + "grad_norm": 0.07549940794706345, + "learning_rate": 5.07898645303569e-05, + "loss": 0.2339, + "step": 38091 + }, + { + "epoch": 3.0858716785482825, + "grad_norm": 0.06874658912420273, + "learning_rate": 5.078536387776228e-05, + "loss": 0.2236, + "step": 38092 + }, + { + "epoch": 3.085952689565781, + "grad_norm": 0.059727951884269714, + "learning_rate": 5.0780863225167655e-05, + "loss": 0.2061, + "step": 38093 + }, + { + "epoch": 3.0860337005832794, + "grad_norm": 0.0709579661488533, + "learning_rate": 5.077636257257302e-05, + "loss": 0.2269, + "step": 38094 + }, + { + "epoch": 3.0861147116007777, + "grad_norm": 0.07000521570444107, + "learning_rate": 5.07718619199784e-05, + "loss": 0.2163, + "step": 38095 + }, + { + "epoch": 3.086195722618276, + "grad_norm": 0.07339578866958618, + "learning_rate": 5.0767361267383776e-05, + "loss": 0.2372, + "step": 38096 + }, + { + "epoch": 3.0862767336357746, + "grad_norm": 0.06663995236158371, + "learning_rate": 5.076286061478914e-05, + "loss": 0.2488, + "step": 38097 + }, + { + "epoch": 3.086357744653273, + "grad_norm": 0.06998470425605774, + "learning_rate": 5.075835996219452e-05, + "loss": 0.2315, + "step": 38098 + }, + { + "epoch": 3.086438755670771, + "grad_norm": 0.0681297555565834, + "learning_rate": 5.07538593095999e-05, + "loss": 0.2689, + "step": 38099 + }, + { + "epoch": 3.0865197666882698, + "grad_norm": 0.05492580682039261, + "learning_rate": 5.0749358657005264e-05, + "loss": 0.1912, + "step": 38100 + }, + { + "epoch": 3.086600777705768, + "grad_norm": 0.06821811944246292, + "learning_rate": 5.0744858004410644e-05, + "loss": 0.2167, + "step": 38101 + }, + { + "epoch": 3.0866817887232663, + "grad_norm": 0.06851505488157272, + "learning_rate": 5.074035735181602e-05, + "loss": 0.2513, + "step": 38102 + }, + { + "epoch": 3.086762799740765, + "grad_norm": 0.094993956387043, + "learning_rate": 5.0735856699221384e-05, + "loss": 0.2536, + "step": 38103 + }, + { + "epoch": 3.086843810758263, + "grad_norm": 0.06849288195371628, + "learning_rate": 5.0731356046626765e-05, + "loss": 0.2221, + "step": 38104 + }, + { + "epoch": 3.0869248217757614, + "grad_norm": 0.06943635642528534, + "learning_rate": 5.072685539403214e-05, + "loss": 0.2225, + "step": 38105 + }, + { + "epoch": 3.0870058327932597, + "grad_norm": 0.07597918063402176, + "learning_rate": 5.0722354741437505e-05, + "loss": 0.2551, + "step": 38106 + }, + { + "epoch": 3.0870868438107584, + "grad_norm": 0.07840082049369812, + "learning_rate": 5.0717854088842886e-05, + "loss": 0.2292, + "step": 38107 + }, + { + "epoch": 3.0871678548282566, + "grad_norm": 0.07592572271823883, + "learning_rate": 5.071335343624826e-05, + "loss": 0.2066, + "step": 38108 + }, + { + "epoch": 3.087248865845755, + "grad_norm": 0.05478053539991379, + "learning_rate": 5.070885278365364e-05, + "loss": 0.2228, + "step": 38109 + }, + { + "epoch": 3.0873298768632536, + "grad_norm": 0.07612153142690659, + "learning_rate": 5.0704352131059006e-05, + "loss": 0.1985, + "step": 38110 + }, + { + "epoch": 3.087410887880752, + "grad_norm": 0.0656314343214035, + "learning_rate": 5.069985147846438e-05, + "loss": 0.238, + "step": 38111 + }, + { + "epoch": 3.08749189889825, + "grad_norm": 0.06686277687549591, + "learning_rate": 5.069535082586976e-05, + "loss": 0.2263, + "step": 38112 + }, + { + "epoch": 3.0875729099157487, + "grad_norm": 0.08731774240732193, + "learning_rate": 5.069085017327513e-05, + "loss": 0.2618, + "step": 38113 + }, + { + "epoch": 3.087653920933247, + "grad_norm": 
0.07624731957912445, + "learning_rate": 5.06863495206805e-05, + "loss": 0.2091, + "step": 38114 + }, + { + "epoch": 3.087734931950745, + "grad_norm": 0.06672275066375732, + "learning_rate": 5.068184886808588e-05, + "loss": 0.2189, + "step": 38115 + }, + { + "epoch": 3.087815942968244, + "grad_norm": 0.07012347877025604, + "learning_rate": 5.067734821549125e-05, + "loss": 0.2316, + "step": 38116 + }, + { + "epoch": 3.087896953985742, + "grad_norm": 0.06306098401546478, + "learning_rate": 5.067284756289662e-05, + "loss": 0.2217, + "step": 38117 + }, + { + "epoch": 3.0879779650032404, + "grad_norm": 0.060554634779691696, + "learning_rate": 5.0668346910302e-05, + "loss": 0.2176, + "step": 38118 + }, + { + "epoch": 3.0880589760207386, + "grad_norm": 0.0674269050359726, + "learning_rate": 5.066384625770737e-05, + "loss": 0.2263, + "step": 38119 + }, + { + "epoch": 3.0881399870382373, + "grad_norm": 0.07302812486886978, + "learning_rate": 5.065934560511274e-05, + "loss": 0.2039, + "step": 38120 + }, + { + "epoch": 3.0882209980557356, + "grad_norm": 0.0731666088104248, + "learning_rate": 5.065484495251812e-05, + "loss": 0.2625, + "step": 38121 + }, + { + "epoch": 3.088302009073234, + "grad_norm": 0.07279719412326813, + "learning_rate": 5.065034429992349e-05, + "loss": 0.2763, + "step": 38122 + }, + { + "epoch": 3.0883830200907325, + "grad_norm": 0.07053932547569275, + "learning_rate": 5.064584364732886e-05, + "loss": 0.2394, + "step": 38123 + }, + { + "epoch": 3.0884640311082308, + "grad_norm": 0.06680049002170563, + "learning_rate": 5.0641342994734244e-05, + "loss": 0.2238, + "step": 38124 + }, + { + "epoch": 3.088545042125729, + "grad_norm": 0.08036140352487564, + "learning_rate": 5.063684234213961e-05, + "loss": 0.27, + "step": 38125 + }, + { + "epoch": 3.0886260531432277, + "grad_norm": 0.05588580295443535, + "learning_rate": 5.0632341689544984e-05, + "loss": 0.2297, + "step": 38126 + }, + { + "epoch": 3.088707064160726, + "grad_norm": 0.07165321707725525, + "learning_rate": 5.0627841036950364e-05, + "loss": 0.2177, + "step": 38127 + }, + { + "epoch": 3.088788075178224, + "grad_norm": 0.09657379984855652, + "learning_rate": 5.062334038435573e-05, + "loss": 0.2366, + "step": 38128 + }, + { + "epoch": 3.0888690861957224, + "grad_norm": 0.08756855130195618, + "learning_rate": 5.0618839731761105e-05, + "loss": 0.2314, + "step": 38129 + }, + { + "epoch": 3.088950097213221, + "grad_norm": 0.059539131820201874, + "learning_rate": 5.0614339079166485e-05, + "loss": 0.1886, + "step": 38130 + }, + { + "epoch": 3.0890311082307194, + "grad_norm": 0.06523775309324265, + "learning_rate": 5.060983842657185e-05, + "loss": 0.2018, + "step": 38131 + }, + { + "epoch": 3.0891121192482176, + "grad_norm": 0.08024541288614273, + "learning_rate": 5.0605337773977226e-05, + "loss": 0.2334, + "step": 38132 + }, + { + "epoch": 3.0891931302657163, + "grad_norm": 0.06841821223497391, + "learning_rate": 5.0600837121382606e-05, + "loss": 0.2721, + "step": 38133 + }, + { + "epoch": 3.0892741412832145, + "grad_norm": 0.07262247055768967, + "learning_rate": 5.059633646878797e-05, + "loss": 0.2294, + "step": 38134 + }, + { + "epoch": 3.089355152300713, + "grad_norm": 0.05673075094819069, + "learning_rate": 5.059183581619335e-05, + "loss": 0.2388, + "step": 38135 + }, + { + "epoch": 3.0894361633182115, + "grad_norm": 0.06358672678470612, + "learning_rate": 5.058733516359873e-05, + "loss": 0.1944, + "step": 38136 + }, + { + "epoch": 3.0895171743357097, + "grad_norm": 0.08018980920314789, + "learning_rate": 5.0582834511004094e-05, + 
"loss": 0.2318, + "step": 38137 + }, + { + "epoch": 3.089598185353208, + "grad_norm": 0.0760384276509285, + "learning_rate": 5.0578333858409474e-05, + "loss": 0.218, + "step": 38138 + }, + { + "epoch": 3.089679196370706, + "grad_norm": 0.06213190406560898, + "learning_rate": 5.057383320581485e-05, + "loss": 0.2148, + "step": 38139 + }, + { + "epoch": 3.089760207388205, + "grad_norm": 0.07853163033723831, + "learning_rate": 5.0569332553220214e-05, + "loss": 0.248, + "step": 38140 + }, + { + "epoch": 3.089841218405703, + "grad_norm": 0.06849128752946854, + "learning_rate": 5.0564831900625595e-05, + "loss": 0.2004, + "step": 38141 + }, + { + "epoch": 3.0899222294232014, + "grad_norm": 0.0643618106842041, + "learning_rate": 5.056033124803097e-05, + "loss": 0.2423, + "step": 38142 + }, + { + "epoch": 3.0900032404407, + "grad_norm": 0.0633440613746643, + "learning_rate": 5.0555830595436335e-05, + "loss": 0.2309, + "step": 38143 + }, + { + "epoch": 3.0900842514581983, + "grad_norm": 0.07177025824785233, + "learning_rate": 5.0551329942841716e-05, + "loss": 0.2087, + "step": 38144 + }, + { + "epoch": 3.0901652624756966, + "grad_norm": 0.05981674790382385, + "learning_rate": 5.054682929024709e-05, + "loss": 0.2144, + "step": 38145 + }, + { + "epoch": 3.0902462734931953, + "grad_norm": 0.07178358733654022, + "learning_rate": 5.0542328637652456e-05, + "loss": 0.2663, + "step": 38146 + }, + { + "epoch": 3.0903272845106935, + "grad_norm": 0.07453365623950958, + "learning_rate": 5.0537827985057836e-05, + "loss": 0.2465, + "step": 38147 + }, + { + "epoch": 3.0904082955281917, + "grad_norm": 0.0767117440700531, + "learning_rate": 5.053332733246321e-05, + "loss": 0.2382, + "step": 38148 + }, + { + "epoch": 3.0904893065456904, + "grad_norm": 0.07359892129898071, + "learning_rate": 5.052882667986858e-05, + "loss": 0.2409, + "step": 38149 + }, + { + "epoch": 3.0905703175631887, + "grad_norm": 0.06934542953968048, + "learning_rate": 5.052432602727396e-05, + "loss": 0.1998, + "step": 38150 + }, + { + "epoch": 3.090651328580687, + "grad_norm": 0.05748188495635986, + "learning_rate": 5.051982537467933e-05, + "loss": 0.1886, + "step": 38151 + }, + { + "epoch": 3.090732339598185, + "grad_norm": 0.07303234189748764, + "learning_rate": 5.05153247220847e-05, + "loss": 0.2035, + "step": 38152 + }, + { + "epoch": 3.090813350615684, + "grad_norm": 0.06159835681319237, + "learning_rate": 5.051082406949008e-05, + "loss": 0.222, + "step": 38153 + }, + { + "epoch": 3.090894361633182, + "grad_norm": 0.07107991725206375, + "learning_rate": 5.050632341689545e-05, + "loss": 0.2005, + "step": 38154 + }, + { + "epoch": 3.0909753726506803, + "grad_norm": 0.07847020030021667, + "learning_rate": 5.050182276430082e-05, + "loss": 0.2172, + "step": 38155 + }, + { + "epoch": 3.091056383668179, + "grad_norm": 0.07008294761180878, + "learning_rate": 5.0497322111706206e-05, + "loss": 0.2543, + "step": 38156 + }, + { + "epoch": 3.0911373946856773, + "grad_norm": 0.0713619813323021, + "learning_rate": 5.049282145911157e-05, + "loss": 0.2371, + "step": 38157 + }, + { + "epoch": 3.0912184057031755, + "grad_norm": 0.07178063690662384, + "learning_rate": 5.048832080651694e-05, + "loss": 0.2314, + "step": 38158 + }, + { + "epoch": 3.091299416720674, + "grad_norm": 0.06920547783374786, + "learning_rate": 5.0483820153922326e-05, + "loss": 0.2273, + "step": 38159 + }, + { + "epoch": 3.0913804277381725, + "grad_norm": 0.07792910188436508, + "learning_rate": 5.047931950132769e-05, + "loss": 0.2061, + "step": 38160 + }, + { + "epoch": 3.0914614387556707, 
+ "grad_norm": 0.07506173849105835, + "learning_rate": 5.0474818848733074e-05, + "loss": 0.289, + "step": 38161 + }, + { + "epoch": 3.091542449773169, + "grad_norm": 0.07946907728910446, + "learning_rate": 5.047031819613845e-05, + "loss": 0.2352, + "step": 38162 + }, + { + "epoch": 3.0916234607906676, + "grad_norm": 0.07613854110240936, + "learning_rate": 5.0465817543543814e-05, + "loss": 0.2601, + "step": 38163 + }, + { + "epoch": 3.091704471808166, + "grad_norm": 0.09603036195039749, + "learning_rate": 5.0461316890949195e-05, + "loss": 0.2455, + "step": 38164 + }, + { + "epoch": 3.091785482825664, + "grad_norm": 0.06707438081502914, + "learning_rate": 5.045681623835457e-05, + "loss": 0.2148, + "step": 38165 + }, + { + "epoch": 3.091866493843163, + "grad_norm": 0.07092217355966568, + "learning_rate": 5.0452315585759935e-05, + "loss": 0.2007, + "step": 38166 + }, + { + "epoch": 3.091947504860661, + "grad_norm": 0.07237200438976288, + "learning_rate": 5.0447814933165315e-05, + "loss": 0.2737, + "step": 38167 + }, + { + "epoch": 3.0920285158781593, + "grad_norm": 0.06157402694225311, + "learning_rate": 5.044331428057069e-05, + "loss": 0.2245, + "step": 38168 + }, + { + "epoch": 3.092109526895658, + "grad_norm": 0.059591781347990036, + "learning_rate": 5.0438813627976056e-05, + "loss": 0.2296, + "step": 38169 + }, + { + "epoch": 3.0921905379131562, + "grad_norm": 0.08246058970689774, + "learning_rate": 5.0434312975381436e-05, + "loss": 0.2673, + "step": 38170 + }, + { + "epoch": 3.0922715489306545, + "grad_norm": 0.06747202575206757, + "learning_rate": 5.042981232278681e-05, + "loss": 0.216, + "step": 38171 + }, + { + "epoch": 3.0923525599481527, + "grad_norm": 0.0788835808634758, + "learning_rate": 5.0425311670192177e-05, + "loss": 0.2228, + "step": 38172 + }, + { + "epoch": 3.0924335709656514, + "grad_norm": 0.07319427281618118, + "learning_rate": 5.042081101759756e-05, + "loss": 0.1856, + "step": 38173 + }, + { + "epoch": 3.0925145819831497, + "grad_norm": 0.07308870553970337, + "learning_rate": 5.041631036500293e-05, + "loss": 0.2299, + "step": 38174 + }, + { + "epoch": 3.092595593000648, + "grad_norm": 0.06862013787031174, + "learning_rate": 5.04118097124083e-05, + "loss": 0.2207, + "step": 38175 + }, + { + "epoch": 3.0926766040181466, + "grad_norm": 0.05893588811159134, + "learning_rate": 5.040730905981368e-05, + "loss": 0.2232, + "step": 38176 + }, + { + "epoch": 3.092757615035645, + "grad_norm": 0.06568938493728638, + "learning_rate": 5.040280840721905e-05, + "loss": 0.2481, + "step": 38177 + }, + { + "epoch": 3.092838626053143, + "grad_norm": 0.0582958459854126, + "learning_rate": 5.039830775462442e-05, + "loss": 0.2398, + "step": 38178 + }, + { + "epoch": 3.0929196370706418, + "grad_norm": 0.0737474337220192, + "learning_rate": 5.03938071020298e-05, + "loss": 0.2147, + "step": 38179 + }, + { + "epoch": 3.09300064808814, + "grad_norm": 0.06943865865468979, + "learning_rate": 5.038930644943517e-05, + "loss": 0.2435, + "step": 38180 + }, + { + "epoch": 3.0930816591056383, + "grad_norm": 0.07017700374126434, + "learning_rate": 5.038480579684054e-05, + "loss": 0.2478, + "step": 38181 + }, + { + "epoch": 3.093162670123137, + "grad_norm": 0.059834618121385574, + "learning_rate": 5.038030514424592e-05, + "loss": 0.2023, + "step": 38182 + }, + { + "epoch": 3.093243681140635, + "grad_norm": 0.07454963773488998, + "learning_rate": 5.037580449165129e-05, + "loss": 0.2126, + "step": 38183 + }, + { + "epoch": 3.0933246921581334, + "grad_norm": 0.08466517925262451, + "learning_rate": 
5.037130383905666e-05, + "loss": 0.279, + "step": 38184 + }, + { + "epoch": 3.0934057031756317, + "grad_norm": 0.07189661264419556, + "learning_rate": 5.036680318646204e-05, + "loss": 0.2707, + "step": 38185 + }, + { + "epoch": 3.0934867141931304, + "grad_norm": 0.08357493579387665, + "learning_rate": 5.0362302533867414e-05, + "loss": 0.2114, + "step": 38186 + }, + { + "epoch": 3.0935677252106286, + "grad_norm": 0.08453121036291122, + "learning_rate": 5.035780188127278e-05, + "loss": 0.2577, + "step": 38187 + }, + { + "epoch": 3.093648736228127, + "grad_norm": 0.06295271962881088, + "learning_rate": 5.035330122867816e-05, + "loss": 0.1923, + "step": 38188 + }, + { + "epoch": 3.0937297472456255, + "grad_norm": 0.06339378654956818, + "learning_rate": 5.0348800576083535e-05, + "loss": 0.2428, + "step": 38189 + }, + { + "epoch": 3.093810758263124, + "grad_norm": 0.08615691214799881, + "learning_rate": 5.0344299923488915e-05, + "loss": 0.2451, + "step": 38190 + }, + { + "epoch": 3.093891769280622, + "grad_norm": 0.059974655508995056, + "learning_rate": 5.033979927089428e-05, + "loss": 0.2735, + "step": 38191 + }, + { + "epoch": 3.0939727802981207, + "grad_norm": 0.08040430396795273, + "learning_rate": 5.0335298618299655e-05, + "loss": 0.2539, + "step": 38192 + }, + { + "epoch": 3.094053791315619, + "grad_norm": 0.0721743181347847, + "learning_rate": 5.0330797965705036e-05, + "loss": 0.2348, + "step": 38193 + }, + { + "epoch": 3.094134802333117, + "grad_norm": 0.07730038464069366, + "learning_rate": 5.03262973131104e-05, + "loss": 0.2421, + "step": 38194 + }, + { + "epoch": 3.0942158133506155, + "grad_norm": 0.07701695710420609, + "learning_rate": 5.0321796660515776e-05, + "loss": 0.2411, + "step": 38195 + }, + { + "epoch": 3.094296824368114, + "grad_norm": 0.06585506349802017, + "learning_rate": 5.0317296007921157e-05, + "loss": 0.2279, + "step": 38196 + }, + { + "epoch": 3.0943778353856124, + "grad_norm": 0.07853267341852188, + "learning_rate": 5.0312795355326523e-05, + "loss": 0.222, + "step": 38197 + }, + { + "epoch": 3.0944588464031106, + "grad_norm": 0.08355189859867096, + "learning_rate": 5.03082947027319e-05, + "loss": 0.2045, + "step": 38198 + }, + { + "epoch": 3.0945398574206093, + "grad_norm": 0.07326925545930862, + "learning_rate": 5.030379405013728e-05, + "loss": 0.2256, + "step": 38199 + }, + { + "epoch": 3.0946208684381076, + "grad_norm": 0.06939083337783813, + "learning_rate": 5.0299293397542644e-05, + "loss": 0.2365, + "step": 38200 + }, + { + "epoch": 3.094701879455606, + "grad_norm": 0.0588260293006897, + "learning_rate": 5.029479274494802e-05, + "loss": 0.2442, + "step": 38201 + }, + { + "epoch": 3.0947828904731045, + "grad_norm": 0.05715958774089813, + "learning_rate": 5.02902920923534e-05, + "loss": 0.2428, + "step": 38202 + }, + { + "epoch": 3.0948639014906028, + "grad_norm": 0.08294867724180222, + "learning_rate": 5.0285791439758765e-05, + "loss": 0.238, + "step": 38203 + }, + { + "epoch": 3.094944912508101, + "grad_norm": 0.07795784622430801, + "learning_rate": 5.028129078716414e-05, + "loss": 0.2578, + "step": 38204 + }, + { + "epoch": 3.0950259235255997, + "grad_norm": 0.06538005918264389, + "learning_rate": 5.027679013456952e-05, + "loss": 0.2279, + "step": 38205 + }, + { + "epoch": 3.095106934543098, + "grad_norm": 0.08856139332056046, + "learning_rate": 5.0272289481974886e-05, + "loss": 0.2476, + "step": 38206 + }, + { + "epoch": 3.095187945560596, + "grad_norm": 0.05094992741942406, + "learning_rate": 5.026778882938026e-05, + "loss": 0.1996, + "step": 38207 + }, + 
{ + "epoch": 3.0952689565780944, + "grad_norm": 0.09304434806108475, + "learning_rate": 5.026328817678564e-05, + "loss": 0.2361, + "step": 38208 + }, + { + "epoch": 3.095349967595593, + "grad_norm": 0.07475238293409348, + "learning_rate": 5.025878752419101e-05, + "loss": 0.2201, + "step": 38209 + }, + { + "epoch": 3.0954309786130914, + "grad_norm": 0.07106000185012817, + "learning_rate": 5.025428687159638e-05, + "loss": 0.2355, + "step": 38210 + }, + { + "epoch": 3.0955119896305896, + "grad_norm": 0.06232582405209541, + "learning_rate": 5.024978621900176e-05, + "loss": 0.1958, + "step": 38211 + }, + { + "epoch": 3.0955930006480883, + "grad_norm": 0.07143504917621613, + "learning_rate": 5.024528556640713e-05, + "loss": 0.236, + "step": 38212 + }, + { + "epoch": 3.0956740116655865, + "grad_norm": 0.07695898413658142, + "learning_rate": 5.02407849138125e-05, + "loss": 0.1984, + "step": 38213 + }, + { + "epoch": 3.095755022683085, + "grad_norm": 0.09057194739580154, + "learning_rate": 5.023628426121788e-05, + "loss": 0.2275, + "step": 38214 + }, + { + "epoch": 3.0958360337005835, + "grad_norm": 0.06527171283960342, + "learning_rate": 5.023178360862325e-05, + "loss": 0.2526, + "step": 38215 + }, + { + "epoch": 3.0959170447180817, + "grad_norm": 0.0839429423213005, + "learning_rate": 5.022728295602863e-05, + "loss": 0.2383, + "step": 38216 + }, + { + "epoch": 3.09599805573558, + "grad_norm": 0.07217514514923096, + "learning_rate": 5.0222782303434e-05, + "loss": 0.2383, + "step": 38217 + }, + { + "epoch": 3.096079066753078, + "grad_norm": 0.07640687376260757, + "learning_rate": 5.021828165083937e-05, + "loss": 0.2398, + "step": 38218 + }, + { + "epoch": 3.096160077770577, + "grad_norm": 0.06141895428299904, + "learning_rate": 5.021378099824475e-05, + "loss": 0.2374, + "step": 38219 + }, + { + "epoch": 3.096241088788075, + "grad_norm": 0.08339501917362213, + "learning_rate": 5.020928034565012e-05, + "loss": 0.2329, + "step": 38220 + }, + { + "epoch": 3.0963220998055734, + "grad_norm": 0.09006574004888535, + "learning_rate": 5.020477969305549e-05, + "loss": 0.212, + "step": 38221 + }, + { + "epoch": 3.096403110823072, + "grad_norm": 0.06869301199913025, + "learning_rate": 5.020027904046087e-05, + "loss": 0.2354, + "step": 38222 + }, + { + "epoch": 3.0964841218405703, + "grad_norm": 0.07723093777894974, + "learning_rate": 5.0195778387866244e-05, + "loss": 0.2233, + "step": 38223 + }, + { + "epoch": 3.0965651328580686, + "grad_norm": 0.07363084703683853, + "learning_rate": 5.019127773527161e-05, + "loss": 0.2316, + "step": 38224 + }, + { + "epoch": 3.0966461438755672, + "grad_norm": 0.08026370406150818, + "learning_rate": 5.0186777082677e-05, + "loss": 0.2359, + "step": 38225 + }, + { + "epoch": 3.0967271548930655, + "grad_norm": 0.06522775441408157, + "learning_rate": 5.0182276430082365e-05, + "loss": 0.2196, + "step": 38226 + }, + { + "epoch": 3.0968081659105637, + "grad_norm": 0.07051295787096024, + "learning_rate": 5.017777577748773e-05, + "loss": 0.2017, + "step": 38227 + }, + { + "epoch": 3.0968891769280624, + "grad_norm": 0.09231901168823242, + "learning_rate": 5.017327512489312e-05, + "loss": 0.2242, + "step": 38228 + }, + { + "epoch": 3.0969701879455607, + "grad_norm": 0.07775097340345383, + "learning_rate": 5.0168774472298485e-05, + "loss": 0.2495, + "step": 38229 + }, + { + "epoch": 3.097051198963059, + "grad_norm": 0.07223207503557205, + "learning_rate": 5.016427381970385e-05, + "loss": 0.2057, + "step": 38230 + }, + { + "epoch": 3.097132209980557, + "grad_norm": 0.06762401759624481, + 
"learning_rate": 5.015977316710924e-05, + "loss": 0.2143, + "step": 38231 + }, + { + "epoch": 3.097213220998056, + "grad_norm": 0.07598146796226501, + "learning_rate": 5.0155272514514606e-05, + "loss": 0.2392, + "step": 38232 + }, + { + "epoch": 3.097294232015554, + "grad_norm": 0.05937891826033592, + "learning_rate": 5.015077186191997e-05, + "loss": 0.1926, + "step": 38233 + }, + { + "epoch": 3.0973752430330523, + "grad_norm": 0.059172067791223526, + "learning_rate": 5.014627120932536e-05, + "loss": 0.234, + "step": 38234 + }, + { + "epoch": 3.097456254050551, + "grad_norm": 0.06860775500535965, + "learning_rate": 5.014177055673073e-05, + "loss": 0.2155, + "step": 38235 + }, + { + "epoch": 3.0975372650680493, + "grad_norm": 0.0848398208618164, + "learning_rate": 5.0137269904136094e-05, + "loss": 0.2572, + "step": 38236 + }, + { + "epoch": 3.0976182760855475, + "grad_norm": 0.05897241085767746, + "learning_rate": 5.013276925154148e-05, + "loss": 0.2143, + "step": 38237 + }, + { + "epoch": 3.097699287103046, + "grad_norm": 0.06091038137674332, + "learning_rate": 5.012826859894685e-05, + "loss": 0.2176, + "step": 38238 + }, + { + "epoch": 3.0977802981205445, + "grad_norm": 0.07070525735616684, + "learning_rate": 5.0123767946352215e-05, + "loss": 0.2277, + "step": 38239 + }, + { + "epoch": 3.0978613091380427, + "grad_norm": 0.08308535069227219, + "learning_rate": 5.01192672937576e-05, + "loss": 0.2162, + "step": 38240 + }, + { + "epoch": 3.097942320155541, + "grad_norm": 0.07822440564632416, + "learning_rate": 5.011476664116297e-05, + "loss": 0.2496, + "step": 38241 + }, + { + "epoch": 3.0980233311730396, + "grad_norm": 0.07828620076179504, + "learning_rate": 5.011026598856835e-05, + "loss": 0.232, + "step": 38242 + }, + { + "epoch": 3.098104342190538, + "grad_norm": 0.07901854068040848, + "learning_rate": 5.010576533597372e-05, + "loss": 0.2474, + "step": 38243 + }, + { + "epoch": 3.098185353208036, + "grad_norm": 0.0792955681681633, + "learning_rate": 5.010126468337909e-05, + "loss": 0.2495, + "step": 38244 + }, + { + "epoch": 3.098266364225535, + "grad_norm": 0.07473356276750565, + "learning_rate": 5.009676403078447e-05, + "loss": 0.2083, + "step": 38245 + }, + { + "epoch": 3.098347375243033, + "grad_norm": 0.07413233071565628, + "learning_rate": 5.0092263378189844e-05, + "loss": 0.2267, + "step": 38246 + }, + { + "epoch": 3.0984283862605313, + "grad_norm": 0.056385502219200134, + "learning_rate": 5.008776272559521e-05, + "loss": 0.2191, + "step": 38247 + }, + { + "epoch": 3.09850939727803, + "grad_norm": 0.06543677300214767, + "learning_rate": 5.008326207300059e-05, + "loss": 0.2259, + "step": 38248 + }, + { + "epoch": 3.0985904082955282, + "grad_norm": 0.07110854238271713, + "learning_rate": 5.0078761420405964e-05, + "loss": 0.271, + "step": 38249 + }, + { + "epoch": 3.0986714193130265, + "grad_norm": 0.06640034914016724, + "learning_rate": 5.007426076781133e-05, + "loss": 0.2557, + "step": 38250 + }, + { + "epoch": 3.098752430330525, + "grad_norm": 0.0660850927233696, + "learning_rate": 5.006976011521671e-05, + "loss": 0.2097, + "step": 38251 + }, + { + "epoch": 3.0988334413480234, + "grad_norm": 0.07259193062782288, + "learning_rate": 5.0065259462622085e-05, + "loss": 0.2261, + "step": 38252 + }, + { + "epoch": 3.0989144523655217, + "grad_norm": 0.07566555589437485, + "learning_rate": 5.006075881002745e-05, + "loss": 0.2641, + "step": 38253 + }, + { + "epoch": 3.09899546338302, + "grad_norm": 0.06218456104397774, + "learning_rate": 5.005625815743283e-05, + "loss": 0.1978, + "step": 
38254 + }, + { + "epoch": 3.0990764744005186, + "grad_norm": 0.07735332101583481, + "learning_rate": 5.0051757504838206e-05, + "loss": 0.223, + "step": 38255 + }, + { + "epoch": 3.099157485418017, + "grad_norm": 0.07643113285303116, + "learning_rate": 5.004725685224357e-05, + "loss": 0.2045, + "step": 38256 + }, + { + "epoch": 3.099238496435515, + "grad_norm": 0.07494723796844482, + "learning_rate": 5.004275619964895e-05, + "loss": 0.2389, + "step": 38257 + }, + { + "epoch": 3.0993195074530138, + "grad_norm": 0.08894181251525879, + "learning_rate": 5.003825554705433e-05, + "loss": 0.1937, + "step": 38258 + }, + { + "epoch": 3.099400518470512, + "grad_norm": 0.07444025576114655, + "learning_rate": 5.0033754894459694e-05, + "loss": 0.2252, + "step": 38259 + }, + { + "epoch": 3.0994815294880103, + "grad_norm": 0.07227712869644165, + "learning_rate": 5.0029254241865074e-05, + "loss": 0.2289, + "step": 38260 + }, + { + "epoch": 3.099562540505509, + "grad_norm": 0.06759762763977051, + "learning_rate": 5.002475358927045e-05, + "loss": 0.2206, + "step": 38261 + }, + { + "epoch": 3.099643551523007, + "grad_norm": 0.07775463908910751, + "learning_rate": 5.0020252936675814e-05, + "loss": 0.2283, + "step": 38262 + }, + { + "epoch": 3.0997245625405054, + "grad_norm": 0.06791025400161743, + "learning_rate": 5.0015752284081195e-05, + "loss": 0.2887, + "step": 38263 + }, + { + "epoch": 3.0998055735580037, + "grad_norm": 0.06941854953765869, + "learning_rate": 5.001125163148657e-05, + "loss": 0.2424, + "step": 38264 + }, + { + "epoch": 3.0998865845755024, + "grad_norm": 0.08999748528003693, + "learning_rate": 5.0006750978891935e-05, + "loss": 0.2171, + "step": 38265 + }, + { + "epoch": 3.0999675955930006, + "grad_norm": 0.05982363224029541, + "learning_rate": 5.0002250326297316e-05, + "loss": 0.2452, + "step": 38266 + }, + { + "epoch": 3.100048606610499, + "grad_norm": 0.07498472929000854, + "learning_rate": 4.999774967370269e-05, + "loss": 0.1982, + "step": 38267 + }, + { + "epoch": 3.1001296176279975, + "grad_norm": 0.06741871684789658, + "learning_rate": 4.999324902110806e-05, + "loss": 0.2182, + "step": 38268 + }, + { + "epoch": 3.100210628645496, + "grad_norm": 0.07078975439071655, + "learning_rate": 4.9988748368513436e-05, + "loss": 0.2497, + "step": 38269 + }, + { + "epoch": 3.100291639662994, + "grad_norm": 0.06628455966711044, + "learning_rate": 4.998424771591881e-05, + "loss": 0.2502, + "step": 38270 + }, + { + "epoch": 3.1003726506804927, + "grad_norm": 0.059061840176582336, + "learning_rate": 4.9979747063324184e-05, + "loss": 0.2168, + "step": 38271 + }, + { + "epoch": 3.100453661697991, + "grad_norm": 0.07902442663908005, + "learning_rate": 4.997524641072956e-05, + "loss": 0.2416, + "step": 38272 + }, + { + "epoch": 3.100534672715489, + "grad_norm": 0.0617385134100914, + "learning_rate": 4.997074575813493e-05, + "loss": 0.2211, + "step": 38273 + }, + { + "epoch": 3.100615683732988, + "grad_norm": 0.07803118228912354, + "learning_rate": 4.9966245105540304e-05, + "loss": 0.2529, + "step": 38274 + }, + { + "epoch": 3.100696694750486, + "grad_norm": 0.07169701159000397, + "learning_rate": 4.996174445294568e-05, + "loss": 0.2464, + "step": 38275 + }, + { + "epoch": 3.1007777057679844, + "grad_norm": 0.06901038438081741, + "learning_rate": 4.995724380035106e-05, + "loss": 0.2396, + "step": 38276 + }, + { + "epoch": 3.1008587167854826, + "grad_norm": 0.08568874001502991, + "learning_rate": 4.9952743147756425e-05, + "loss": 0.2351, + "step": 38277 + }, + { + "epoch": 3.1009397278029813, + "grad_norm": 
0.07365360110998154, + "learning_rate": 4.99482424951618e-05, + "loss": 0.2488, + "step": 38278 + }, + { + "epoch": 3.1010207388204796, + "grad_norm": 0.06841385364532471, + "learning_rate": 4.994374184256718e-05, + "loss": 0.2076, + "step": 38279 + }, + { + "epoch": 3.101101749837978, + "grad_norm": 0.051799263805150986, + "learning_rate": 4.9939241189972546e-05, + "loss": 0.2096, + "step": 38280 + }, + { + "epoch": 3.1011827608554765, + "grad_norm": 0.08439652621746063, + "learning_rate": 4.993474053737792e-05, + "loss": 0.189, + "step": 38281 + }, + { + "epoch": 3.1012637718729748, + "grad_norm": 0.061678286641836166, + "learning_rate": 4.99302398847833e-05, + "loss": 0.2429, + "step": 38282 + }, + { + "epoch": 3.101344782890473, + "grad_norm": 0.0763859674334526, + "learning_rate": 4.992573923218867e-05, + "loss": 0.2272, + "step": 38283 + }, + { + "epoch": 3.1014257939079717, + "grad_norm": 0.08517804741859436, + "learning_rate": 4.992123857959404e-05, + "loss": 0.289, + "step": 38284 + }, + { + "epoch": 3.10150680492547, + "grad_norm": 0.08954822272062302, + "learning_rate": 4.991673792699942e-05, + "loss": 0.2642, + "step": 38285 + }, + { + "epoch": 3.101587815942968, + "grad_norm": 0.08051921427249908, + "learning_rate": 4.991223727440479e-05, + "loss": 0.2428, + "step": 38286 + }, + { + "epoch": 3.1016688269604664, + "grad_norm": 0.06787646561861038, + "learning_rate": 4.990773662181016e-05, + "loss": 0.242, + "step": 38287 + }, + { + "epoch": 3.101749837977965, + "grad_norm": 0.07331487536430359, + "learning_rate": 4.990323596921554e-05, + "loss": 0.2409, + "step": 38288 + }, + { + "epoch": 3.1018308489954634, + "grad_norm": 0.07462482154369354, + "learning_rate": 4.9898735316620915e-05, + "loss": 0.2479, + "step": 38289 + }, + { + "epoch": 3.1019118600129616, + "grad_norm": 0.05779562518000603, + "learning_rate": 4.989423466402628e-05, + "loss": 0.2191, + "step": 38290 + }, + { + "epoch": 3.1019928710304603, + "grad_norm": 0.07643572986125946, + "learning_rate": 4.988973401143166e-05, + "loss": 0.2262, + "step": 38291 + }, + { + "epoch": 3.1020738820479585, + "grad_norm": 0.0754118412733078, + "learning_rate": 4.9885233358837036e-05, + "loss": 0.2095, + "step": 38292 + }, + { + "epoch": 3.1021548930654568, + "grad_norm": 0.06725703179836273, + "learning_rate": 4.98807327062424e-05, + "loss": 0.2338, + "step": 38293 + }, + { + "epoch": 3.1022359040829555, + "grad_norm": 0.06517348438501358, + "learning_rate": 4.987623205364778e-05, + "loss": 0.2653, + "step": 38294 + }, + { + "epoch": 3.1023169151004537, + "grad_norm": 0.07611946016550064, + "learning_rate": 4.987173140105316e-05, + "loss": 0.2233, + "step": 38295 + }, + { + "epoch": 3.102397926117952, + "grad_norm": 0.06755281239748001, + "learning_rate": 4.9867230748458524e-05, + "loss": 0.2169, + "step": 38296 + }, + { + "epoch": 3.1024789371354506, + "grad_norm": 0.07308715581893921, + "learning_rate": 4.9862730095863904e-05, + "loss": 0.2227, + "step": 38297 + }, + { + "epoch": 3.102559948152949, + "grad_norm": 0.0735616609454155, + "learning_rate": 4.985822944326928e-05, + "loss": 0.2366, + "step": 38298 + }, + { + "epoch": 3.102640959170447, + "grad_norm": 0.07177618891000748, + "learning_rate": 4.9853728790674645e-05, + "loss": 0.2568, + "step": 38299 + }, + { + "epoch": 3.1027219701879454, + "grad_norm": 0.06585688143968582, + "learning_rate": 4.9849228138080025e-05, + "loss": 0.2167, + "step": 38300 + }, + { + "epoch": 3.102802981205444, + "grad_norm": 0.09176765382289886, + "learning_rate": 4.98447274854854e-05, + 
"loss": 0.2808, + "step": 38301 + }, + { + "epoch": 3.1028839922229423, + "grad_norm": 0.07390912622213364, + "learning_rate": 4.984022683289077e-05, + "loss": 0.2624, + "step": 38302 + }, + { + "epoch": 3.1029650032404406, + "grad_norm": 0.06442528963088989, + "learning_rate": 4.9835726180296146e-05, + "loss": 0.1996, + "step": 38303 + }, + { + "epoch": 3.1030460142579392, + "grad_norm": 0.06658261269330978, + "learning_rate": 4.983122552770152e-05, + "loss": 0.1991, + "step": 38304 + }, + { + "epoch": 3.1031270252754375, + "grad_norm": 0.07667221873998642, + "learning_rate": 4.982672487510689e-05, + "loss": 0.2985, + "step": 38305 + }, + { + "epoch": 3.1032080362929357, + "grad_norm": 0.07117309421300888, + "learning_rate": 4.9822224222512267e-05, + "loss": 0.1969, + "step": 38306 + }, + { + "epoch": 3.1032890473104344, + "grad_norm": 0.06802020967006683, + "learning_rate": 4.981772356991764e-05, + "loss": 0.2193, + "step": 38307 + }, + { + "epoch": 3.1033700583279327, + "grad_norm": 0.08191298693418503, + "learning_rate": 4.9813222917323014e-05, + "loss": 0.2377, + "step": 38308 + }, + { + "epoch": 3.103451069345431, + "grad_norm": 0.09030667692422867, + "learning_rate": 4.980872226472839e-05, + "loss": 0.2476, + "step": 38309 + }, + { + "epoch": 3.103532080362929, + "grad_norm": 0.06818903982639313, + "learning_rate": 4.980422161213376e-05, + "loss": 0.2317, + "step": 38310 + }, + { + "epoch": 3.103613091380428, + "grad_norm": 0.0722852423787117, + "learning_rate": 4.9799720959539135e-05, + "loss": 0.284, + "step": 38311 + }, + { + "epoch": 3.103694102397926, + "grad_norm": 0.07246318459510803, + "learning_rate": 4.979522030694451e-05, + "loss": 0.2357, + "step": 38312 + }, + { + "epoch": 3.1037751134154243, + "grad_norm": 0.07880403101444244, + "learning_rate": 4.979071965434988e-05, + "loss": 0.2539, + "step": 38313 + }, + { + "epoch": 3.103856124432923, + "grad_norm": 0.06277298182249069, + "learning_rate": 4.9786219001755255e-05, + "loss": 0.2124, + "step": 38314 + }, + { + "epoch": 3.1039371354504213, + "grad_norm": 0.08322549611330032, + "learning_rate": 4.9781718349160636e-05, + "loss": 0.2272, + "step": 38315 + }, + { + "epoch": 3.1040181464679195, + "grad_norm": 0.0580422505736351, + "learning_rate": 4.9777217696566e-05, + "loss": 0.2291, + "step": 38316 + }, + { + "epoch": 3.104099157485418, + "grad_norm": 0.07431962341070175, + "learning_rate": 4.9772717043971376e-05, + "loss": 0.2578, + "step": 38317 + }, + { + "epoch": 3.1041801685029164, + "grad_norm": 0.06599278748035431, + "learning_rate": 4.9768216391376757e-05, + "loss": 0.2145, + "step": 38318 + }, + { + "epoch": 3.1042611795204147, + "grad_norm": 0.06994808465242386, + "learning_rate": 4.976371573878212e-05, + "loss": 0.2237, + "step": 38319 + }, + { + "epoch": 3.1043421905379134, + "grad_norm": 0.07410278916358948, + "learning_rate": 4.97592150861875e-05, + "loss": 0.1965, + "step": 38320 + }, + { + "epoch": 3.1044232015554116, + "grad_norm": 0.06452371925115585, + "learning_rate": 4.975471443359288e-05, + "loss": 0.2146, + "step": 38321 + }, + { + "epoch": 3.10450421257291, + "grad_norm": 0.06319420039653778, + "learning_rate": 4.9750213780998244e-05, + "loss": 0.2332, + "step": 38322 + }, + { + "epoch": 3.104585223590408, + "grad_norm": 0.06600657105445862, + "learning_rate": 4.974571312840362e-05, + "loss": 0.2192, + "step": 38323 + }, + { + "epoch": 3.104666234607907, + "grad_norm": 0.07321722060441971, + "learning_rate": 4.9741212475809e-05, + "loss": 0.2367, + "step": 38324 + }, + { + "epoch": 
3.104747245625405, + "grad_norm": 0.06495875865221024, + "learning_rate": 4.9736711823214365e-05, + "loss": 0.2072, + "step": 38325 + }, + { + "epoch": 3.1048282566429033, + "grad_norm": 0.06502759456634521, + "learning_rate": 4.973221117061974e-05, + "loss": 0.2244, + "step": 38326 + }, + { + "epoch": 3.104909267660402, + "grad_norm": 0.06920306384563446, + "learning_rate": 4.972771051802512e-05, + "loss": 0.2158, + "step": 38327 + }, + { + "epoch": 3.1049902786779002, + "grad_norm": 0.07908426225185394, + "learning_rate": 4.972320986543049e-05, + "loss": 0.2175, + "step": 38328 + }, + { + "epoch": 3.1050712896953985, + "grad_norm": 0.07021722197532654, + "learning_rate": 4.971870921283586e-05, + "loss": 0.2196, + "step": 38329 + }, + { + "epoch": 3.105152300712897, + "grad_norm": 0.07090037316083908, + "learning_rate": 4.971420856024124e-05, + "loss": 0.202, + "step": 38330 + }, + { + "epoch": 3.1052333117303954, + "grad_norm": 0.0728762075304985, + "learning_rate": 4.970970790764661e-05, + "loss": 0.2301, + "step": 38331 + }, + { + "epoch": 3.1053143227478937, + "grad_norm": 0.07722747325897217, + "learning_rate": 4.970520725505198e-05, + "loss": 0.2302, + "step": 38332 + }, + { + "epoch": 3.105395333765392, + "grad_norm": 0.0742846429347992, + "learning_rate": 4.970070660245736e-05, + "loss": 0.2579, + "step": 38333 + }, + { + "epoch": 3.1054763447828906, + "grad_norm": 0.07764902710914612, + "learning_rate": 4.9696205949862734e-05, + "loss": 0.236, + "step": 38334 + }, + { + "epoch": 3.105557355800389, + "grad_norm": 0.07743822038173676, + "learning_rate": 4.96917052972681e-05, + "loss": 0.2418, + "step": 38335 + }, + { + "epoch": 3.105638366817887, + "grad_norm": 0.08601834625005722, + "learning_rate": 4.968720464467348e-05, + "loss": 0.2575, + "step": 38336 + }, + { + "epoch": 3.1057193778353858, + "grad_norm": 0.07223144918680191, + "learning_rate": 4.9682703992078855e-05, + "loss": 0.2316, + "step": 38337 + }, + { + "epoch": 3.105800388852884, + "grad_norm": 0.06830278784036636, + "learning_rate": 4.967820333948422e-05, + "loss": 0.2304, + "step": 38338 + }, + { + "epoch": 3.1058813998703823, + "grad_norm": 0.05036047473549843, + "learning_rate": 4.96737026868896e-05, + "loss": 0.2097, + "step": 38339 + }, + { + "epoch": 3.105962410887881, + "grad_norm": 0.08505997061729431, + "learning_rate": 4.9669202034294976e-05, + "loss": 0.2637, + "step": 38340 + }, + { + "epoch": 3.106043421905379, + "grad_norm": 0.07370181381702423, + "learning_rate": 4.966470138170035e-05, + "loss": 0.2318, + "step": 38341 + }, + { + "epoch": 3.1061244329228774, + "grad_norm": 0.08025460690259933, + "learning_rate": 4.966020072910572e-05, + "loss": 0.2421, + "step": 38342 + }, + { + "epoch": 3.1062054439403757, + "grad_norm": 0.08092350512742996, + "learning_rate": 4.9655700076511097e-05, + "loss": 0.2134, + "step": 38343 + }, + { + "epoch": 3.1062864549578744, + "grad_norm": 0.07050243020057678, + "learning_rate": 4.965119942391647e-05, + "loss": 0.2454, + "step": 38344 + }, + { + "epoch": 3.1063674659753726, + "grad_norm": 0.06997469067573547, + "learning_rate": 4.9646698771321844e-05, + "loss": 0.2398, + "step": 38345 + }, + { + "epoch": 3.106448476992871, + "grad_norm": 0.08480319380760193, + "learning_rate": 4.964219811872722e-05, + "loss": 0.2149, + "step": 38346 + }, + { + "epoch": 3.1065294880103695, + "grad_norm": 0.06980687379837036, + "learning_rate": 4.963769746613259e-05, + "loss": 0.2136, + "step": 38347 + }, + { + "epoch": 3.106610499027868, + "grad_norm": 0.06757093966007233, + 
"learning_rate": 4.9633196813537965e-05, + "loss": 0.2652, + "step": 38348 + }, + { + "epoch": 3.106691510045366, + "grad_norm": 0.060810089111328125, + "learning_rate": 4.962869616094334e-05, + "loss": 0.2343, + "step": 38349 + }, + { + "epoch": 3.1067725210628647, + "grad_norm": 0.08900392055511475, + "learning_rate": 4.962419550834871e-05, + "loss": 0.262, + "step": 38350 + }, + { + "epoch": 3.106853532080363, + "grad_norm": 0.06885910034179688, + "learning_rate": 4.9619694855754085e-05, + "loss": 0.2192, + "step": 38351 + }, + { + "epoch": 3.106934543097861, + "grad_norm": 0.06629056483507156, + "learning_rate": 4.961519420315946e-05, + "loss": 0.235, + "step": 38352 + }, + { + "epoch": 3.10701555411536, + "grad_norm": 0.09018058329820633, + "learning_rate": 4.961069355056483e-05, + "loss": 0.2683, + "step": 38353 + }, + { + "epoch": 3.107096565132858, + "grad_norm": 0.06973500549793243, + "learning_rate": 4.960619289797021e-05, + "loss": 0.2546, + "step": 38354 + }, + { + "epoch": 3.1071775761503564, + "grad_norm": 0.0802709087729454, + "learning_rate": 4.960169224537558e-05, + "loss": 0.2259, + "step": 38355 + }, + { + "epoch": 3.1072585871678546, + "grad_norm": 0.0760037899017334, + "learning_rate": 4.9597191592780953e-05, + "loss": 0.2167, + "step": 38356 + }, + { + "epoch": 3.1073395981853533, + "grad_norm": 0.06522723287343979, + "learning_rate": 4.9592690940186334e-05, + "loss": 0.2353, + "step": 38357 + }, + { + "epoch": 3.1074206092028516, + "grad_norm": 0.09381596744060516, + "learning_rate": 4.95881902875917e-05, + "loss": 0.2924, + "step": 38358 + }, + { + "epoch": 3.10750162022035, + "grad_norm": 0.07883308082818985, + "learning_rate": 4.9583689634997074e-05, + "loss": 0.2197, + "step": 38359 + }, + { + "epoch": 3.1075826312378485, + "grad_norm": 0.07242937386035919, + "learning_rate": 4.9579188982402455e-05, + "loss": 0.2389, + "step": 38360 + }, + { + "epoch": 3.1076636422553467, + "grad_norm": 0.07693379372358322, + "learning_rate": 4.957468832980782e-05, + "loss": 0.2318, + "step": 38361 + }, + { + "epoch": 3.107744653272845, + "grad_norm": 0.0737452581524849, + "learning_rate": 4.9570187677213195e-05, + "loss": 0.1915, + "step": 38362 + }, + { + "epoch": 3.1078256642903437, + "grad_norm": 0.0680558905005455, + "learning_rate": 4.9565687024618575e-05, + "loss": 0.2563, + "step": 38363 + }, + { + "epoch": 3.107906675307842, + "grad_norm": 0.060349684208631516, + "learning_rate": 4.956118637202394e-05, + "loss": 0.2066, + "step": 38364 + }, + { + "epoch": 3.10798768632534, + "grad_norm": 0.057918623089790344, + "learning_rate": 4.9556685719429316e-05, + "loss": 0.232, + "step": 38365 + }, + { + "epoch": 3.1080686973428384, + "grad_norm": 0.0609627366065979, + "learning_rate": 4.9552185066834696e-05, + "loss": 0.2323, + "step": 38366 + }, + { + "epoch": 3.108149708360337, + "grad_norm": 0.0612613782286644, + "learning_rate": 4.954768441424007e-05, + "loss": 0.2169, + "step": 38367 + }, + { + "epoch": 3.1082307193778353, + "grad_norm": 0.06923031061887741, + "learning_rate": 4.954318376164544e-05, + "loss": 0.2326, + "step": 38368 + }, + { + "epoch": 3.1083117303953336, + "grad_norm": 0.06448613852262497, + "learning_rate": 4.953868310905082e-05, + "loss": 0.2081, + "step": 38369 + }, + { + "epoch": 3.1083927414128323, + "grad_norm": 0.06773959845304489, + "learning_rate": 4.953418245645619e-05, + "loss": 0.233, + "step": 38370 + }, + { + "epoch": 3.1084737524303305, + "grad_norm": 0.06707341223955154, + "learning_rate": 4.952968180386156e-05, + "loss": 0.1969, + "step": 
38371 + }, + { + "epoch": 3.1085547634478288, + "grad_norm": 0.06954282522201538, + "learning_rate": 4.952518115126694e-05, + "loss": 0.198, + "step": 38372 + }, + { + "epoch": 3.1086357744653275, + "grad_norm": 0.06780767440795898, + "learning_rate": 4.952068049867231e-05, + "loss": 0.2418, + "step": 38373 + }, + { + "epoch": 3.1087167854828257, + "grad_norm": 0.05618634074926376, + "learning_rate": 4.951617984607768e-05, + "loss": 0.2001, + "step": 38374 + }, + { + "epoch": 3.108797796500324, + "grad_norm": 0.05853656679391861, + "learning_rate": 4.951167919348306e-05, + "loss": 0.2318, + "step": 38375 + }, + { + "epoch": 3.108878807517822, + "grad_norm": 0.07284107059240341, + "learning_rate": 4.950717854088843e-05, + "loss": 0.2351, + "step": 38376 + }, + { + "epoch": 3.108959818535321, + "grad_norm": 0.07166719436645508, + "learning_rate": 4.95026778882938e-05, + "loss": 0.2497, + "step": 38377 + }, + { + "epoch": 3.109040829552819, + "grad_norm": 0.09151309728622437, + "learning_rate": 4.949817723569918e-05, + "loss": 0.1985, + "step": 38378 + }, + { + "epoch": 3.1091218405703174, + "grad_norm": 0.08460740745067596, + "learning_rate": 4.949367658310455e-05, + "loss": 0.2366, + "step": 38379 + }, + { + "epoch": 3.109202851587816, + "grad_norm": 0.08176060765981674, + "learning_rate": 4.948917593050993e-05, + "loss": 0.2114, + "step": 38380 + }, + { + "epoch": 3.1092838626053143, + "grad_norm": 0.06409350782632828, + "learning_rate": 4.94846752779153e-05, + "loss": 0.218, + "step": 38381 + }, + { + "epoch": 3.1093648736228126, + "grad_norm": 0.0686076357960701, + "learning_rate": 4.9480174625320674e-05, + "loss": 0.1993, + "step": 38382 + }, + { + "epoch": 3.1094458846403112, + "grad_norm": 0.07834609597921371, + "learning_rate": 4.947567397272605e-05, + "loss": 0.2239, + "step": 38383 + }, + { + "epoch": 3.1095268956578095, + "grad_norm": 0.07187711447477341, + "learning_rate": 4.947117332013142e-05, + "loss": 0.2386, + "step": 38384 + }, + { + "epoch": 3.1096079066753077, + "grad_norm": 0.05771234259009361, + "learning_rate": 4.9466672667536795e-05, + "loss": 0.2127, + "step": 38385 + }, + { + "epoch": 3.1096889176928064, + "grad_norm": 0.089955173432827, + "learning_rate": 4.946217201494217e-05, + "loss": 0.2216, + "step": 38386 + }, + { + "epoch": 3.1097699287103047, + "grad_norm": 0.08648094534873962, + "learning_rate": 4.945767136234754e-05, + "loss": 0.2784, + "step": 38387 + }, + { + "epoch": 3.109850939727803, + "grad_norm": 0.06970040500164032, + "learning_rate": 4.9453170709752916e-05, + "loss": 0.2271, + "step": 38388 + }, + { + "epoch": 3.109931950745301, + "grad_norm": 0.06475716829299927, + "learning_rate": 4.944867005715829e-05, + "loss": 0.2032, + "step": 38389 + }, + { + "epoch": 3.1100129617628, + "grad_norm": 0.07349738478660583, + "learning_rate": 4.944416940456366e-05, + "loss": 0.2017, + "step": 38390 + }, + { + "epoch": 3.110093972780298, + "grad_norm": 0.07952949404716492, + "learning_rate": 4.9439668751969036e-05, + "loss": 0.2741, + "step": 38391 + }, + { + "epoch": 3.1101749837977963, + "grad_norm": 0.09136699140071869, + "learning_rate": 4.943516809937441e-05, + "loss": 0.2298, + "step": 38392 + }, + { + "epoch": 3.110255994815295, + "grad_norm": 0.057764239609241486, + "learning_rate": 4.943066744677979e-05, + "loss": 0.2019, + "step": 38393 + }, + { + "epoch": 3.1103370058327933, + "grad_norm": 0.06546784937381744, + "learning_rate": 4.942616679418516e-05, + "loss": 0.2622, + "step": 38394 + }, + { + "epoch": 3.1104180168502915, + "grad_norm": 
0.0661168321967125, + "learning_rate": 4.942166614159053e-05, + "loss": 0.2261, + "step": 38395 + }, + { + "epoch": 3.11049902786779, + "grad_norm": 0.06803473085165024, + "learning_rate": 4.941716548899591e-05, + "loss": 0.1992, + "step": 38396 + }, + { + "epoch": 3.1105800388852884, + "grad_norm": 0.09625261276960373, + "learning_rate": 4.941266483640128e-05, + "loss": 0.261, + "step": 38397 + }, + { + "epoch": 3.1106610499027867, + "grad_norm": 0.05727479234337807, + "learning_rate": 4.940816418380665e-05, + "loss": 0.2217, + "step": 38398 + }, + { + "epoch": 3.110742060920285, + "grad_norm": 0.07928014546632767, + "learning_rate": 4.940366353121203e-05, + "loss": 0.2105, + "step": 38399 + }, + { + "epoch": 3.1108230719377836, + "grad_norm": 0.07788312435150146, + "learning_rate": 4.93991628786174e-05, + "loss": 0.2197, + "step": 38400 + }, + { + "epoch": 3.110904082955282, + "grad_norm": 0.07708944380283356, + "learning_rate": 4.939466222602277e-05, + "loss": 0.2597, + "step": 38401 + }, + { + "epoch": 3.11098509397278, + "grad_norm": 0.06281648576259613, + "learning_rate": 4.939016157342815e-05, + "loss": 0.2094, + "step": 38402 + }, + { + "epoch": 3.111066104990279, + "grad_norm": 0.07252870500087738, + "learning_rate": 4.938566092083352e-05, + "loss": 0.25, + "step": 38403 + }, + { + "epoch": 3.111147116007777, + "grad_norm": 0.0702609047293663, + "learning_rate": 4.938116026823889e-05, + "loss": 0.2328, + "step": 38404 + }, + { + "epoch": 3.1112281270252753, + "grad_norm": 0.06708363443613052, + "learning_rate": 4.9376659615644274e-05, + "loss": 0.1991, + "step": 38405 + }, + { + "epoch": 3.111309138042774, + "grad_norm": 0.07520218938589096, + "learning_rate": 4.937215896304965e-05, + "loss": 0.2102, + "step": 38406 + }, + { + "epoch": 3.1113901490602722, + "grad_norm": 0.06943734735250473, + "learning_rate": 4.9367658310455014e-05, + "loss": 0.2571, + "step": 38407 + }, + { + "epoch": 3.1114711600777705, + "grad_norm": 0.06772951781749725, + "learning_rate": 4.9363157657860394e-05, + "loss": 0.2482, + "step": 38408 + }, + { + "epoch": 3.111552171095269, + "grad_norm": 0.06087531894445419, + "learning_rate": 4.935865700526577e-05, + "loss": 0.249, + "step": 38409 + }, + { + "epoch": 3.1116331821127674, + "grad_norm": 0.06537861377000809, + "learning_rate": 4.9354156352671135e-05, + "loss": 0.2136, + "step": 38410 + }, + { + "epoch": 3.1117141931302656, + "grad_norm": 0.08597944676876068, + "learning_rate": 4.9349655700076515e-05, + "loss": 0.2539, + "step": 38411 + }, + { + "epoch": 3.111795204147764, + "grad_norm": 0.07390245795249939, + "learning_rate": 4.934515504748189e-05, + "loss": 0.2308, + "step": 38412 + }, + { + "epoch": 3.1118762151652626, + "grad_norm": 0.0940677598118782, + "learning_rate": 4.934065439488726e-05, + "loss": 0.2728, + "step": 38413 + }, + { + "epoch": 3.111957226182761, + "grad_norm": 0.06742730736732483, + "learning_rate": 4.9336153742292636e-05, + "loss": 0.2078, + "step": 38414 + }, + { + "epoch": 3.112038237200259, + "grad_norm": 0.0731840655207634, + "learning_rate": 4.933165308969801e-05, + "loss": 0.2605, + "step": 38415 + }, + { + "epoch": 3.1121192482177578, + "grad_norm": 0.06594830751419067, + "learning_rate": 4.932715243710338e-05, + "loss": 0.2546, + "step": 38416 + }, + { + "epoch": 3.112200259235256, + "grad_norm": 0.06969321519136429, + "learning_rate": 4.932265178450876e-05, + "loss": 0.2551, + "step": 38417 + }, + { + "epoch": 3.1122812702527543, + "grad_norm": 0.06713997572660446, + "learning_rate": 4.931815113191413e-05, + "loss": 
0.2397, + "step": 38418 + }, + { + "epoch": 3.112362281270253, + "grad_norm": 0.06411050260066986, + "learning_rate": 4.9313650479319504e-05, + "loss": 0.2283, + "step": 38419 + }, + { + "epoch": 3.112443292287751, + "grad_norm": 0.06092459335923195, + "learning_rate": 4.930914982672488e-05, + "loss": 0.2581, + "step": 38420 + }, + { + "epoch": 3.1125243033052494, + "grad_norm": 0.05901643633842468, + "learning_rate": 4.930464917413025e-05, + "loss": 0.2088, + "step": 38421 + }, + { + "epoch": 3.1126053143227477, + "grad_norm": 0.06839630007743835, + "learning_rate": 4.9300148521535625e-05, + "loss": 0.2485, + "step": 38422 + }, + { + "epoch": 3.1126863253402464, + "grad_norm": 0.08143807202577591, + "learning_rate": 4.9295647868941e-05, + "loss": 0.2186, + "step": 38423 + }, + { + "epoch": 3.1127673363577446, + "grad_norm": 0.08210988342761993, + "learning_rate": 4.929114721634637e-05, + "loss": 0.2284, + "step": 38424 + }, + { + "epoch": 3.112848347375243, + "grad_norm": 0.05726107954978943, + "learning_rate": 4.9286646563751746e-05, + "loss": 0.2158, + "step": 38425 + }, + { + "epoch": 3.1129293583927415, + "grad_norm": 0.06085515022277832, + "learning_rate": 4.928214591115712e-05, + "loss": 0.243, + "step": 38426 + }, + { + "epoch": 3.11301036941024, + "grad_norm": 0.09158769249916077, + "learning_rate": 4.927764525856249e-05, + "loss": 0.2211, + "step": 38427 + }, + { + "epoch": 3.113091380427738, + "grad_norm": 0.08236166834831238, + "learning_rate": 4.9273144605967866e-05, + "loss": 0.2031, + "step": 38428 + }, + { + "epoch": 3.1131723914452367, + "grad_norm": 0.06737654656171799, + "learning_rate": 4.926864395337324e-05, + "loss": 0.238, + "step": 38429 + }, + { + "epoch": 3.113253402462735, + "grad_norm": 0.06789126247167587, + "learning_rate": 4.9264143300778614e-05, + "loss": 0.2491, + "step": 38430 + }, + { + "epoch": 3.113334413480233, + "grad_norm": 0.07835162431001663, + "learning_rate": 4.925964264818399e-05, + "loss": 0.2356, + "step": 38431 + }, + { + "epoch": 3.113415424497732, + "grad_norm": 0.060290753841400146, + "learning_rate": 4.925514199558936e-05, + "loss": 0.2396, + "step": 38432 + }, + { + "epoch": 3.11349643551523, + "grad_norm": 0.07192988693714142, + "learning_rate": 4.9250641342994734e-05, + "loss": 0.2081, + "step": 38433 + }, + { + "epoch": 3.1135774465327284, + "grad_norm": 0.07745447009801865, + "learning_rate": 4.924614069040011e-05, + "loss": 0.2241, + "step": 38434 + }, + { + "epoch": 3.1136584575502266, + "grad_norm": 0.09003927558660507, + "learning_rate": 4.924164003780549e-05, + "loss": 0.2479, + "step": 38435 + }, + { + "epoch": 3.1137394685677253, + "grad_norm": 0.06698548048734665, + "learning_rate": 4.9237139385210855e-05, + "loss": 0.232, + "step": 38436 + }, + { + "epoch": 3.1138204795852236, + "grad_norm": 0.06464772671461105, + "learning_rate": 4.923263873261623e-05, + "loss": 0.1865, + "step": 38437 + }, + { + "epoch": 3.113901490602722, + "grad_norm": 0.07534575462341309, + "learning_rate": 4.922813808002161e-05, + "loss": 0.2207, + "step": 38438 + }, + { + "epoch": 3.1139825016202205, + "grad_norm": 0.05245482176542282, + "learning_rate": 4.9223637427426976e-05, + "loss": 0.2074, + "step": 38439 + }, + { + "epoch": 3.1140635126377187, + "grad_norm": 0.060525473207235336, + "learning_rate": 4.921913677483235e-05, + "loss": 0.2202, + "step": 38440 + }, + { + "epoch": 3.114144523655217, + "grad_norm": 0.0742216482758522, + "learning_rate": 4.921463612223773e-05, + "loss": 0.215, + "step": 38441 + }, + { + "epoch": 3.1142255346727157, + 
"grad_norm": 0.07293812930583954, + "learning_rate": 4.92101354696431e-05, + "loss": 0.2565, + "step": 38442 + }, + { + "epoch": 3.114306545690214, + "grad_norm": 0.07802662253379822, + "learning_rate": 4.920563481704847e-05, + "loss": 0.2224, + "step": 38443 + }, + { + "epoch": 3.114387556707712, + "grad_norm": 0.06432733684778214, + "learning_rate": 4.920113416445385e-05, + "loss": 0.2281, + "step": 38444 + }, + { + "epoch": 3.1144685677252104, + "grad_norm": 0.07466696947813034, + "learning_rate": 4.919663351185922e-05, + "loss": 0.2477, + "step": 38445 + }, + { + "epoch": 3.114549578742709, + "grad_norm": 0.07490424066781998, + "learning_rate": 4.91921328592646e-05, + "loss": 0.2081, + "step": 38446 + }, + { + "epoch": 3.1146305897602073, + "grad_norm": 0.06576748192310333, + "learning_rate": 4.918763220666997e-05, + "loss": 0.2186, + "step": 38447 + }, + { + "epoch": 3.1147116007777056, + "grad_norm": 0.05498381704092026, + "learning_rate": 4.9183131554075345e-05, + "loss": 0.2257, + "step": 38448 + }, + { + "epoch": 3.1147926117952043, + "grad_norm": 0.07218701392412186, + "learning_rate": 4.917863090148072e-05, + "loss": 0.2327, + "step": 38449 + }, + { + "epoch": 3.1148736228127025, + "grad_norm": 0.06278561800718307, + "learning_rate": 4.917413024888609e-05, + "loss": 0.2252, + "step": 38450 + }, + { + "epoch": 3.1149546338302008, + "grad_norm": 0.0699433833360672, + "learning_rate": 4.9169629596291466e-05, + "loss": 0.2236, + "step": 38451 + }, + { + "epoch": 3.1150356448476995, + "grad_norm": 0.07142642885446548, + "learning_rate": 4.916512894369684e-05, + "loss": 0.2115, + "step": 38452 + }, + { + "epoch": 3.1151166558651977, + "grad_norm": 0.07643146812915802, + "learning_rate": 4.916062829110221e-05, + "loss": 0.2068, + "step": 38453 + }, + { + "epoch": 3.115197666882696, + "grad_norm": 0.06402745842933655, + "learning_rate": 4.915612763850759e-05, + "loss": 0.2126, + "step": 38454 + }, + { + "epoch": 3.1152786779001946, + "grad_norm": 0.09096885472536087, + "learning_rate": 4.915162698591296e-05, + "loss": 0.2691, + "step": 38455 + }, + { + "epoch": 3.115359688917693, + "grad_norm": 0.07322259992361069, + "learning_rate": 4.9147126333318334e-05, + "loss": 0.2385, + "step": 38456 + }, + { + "epoch": 3.115440699935191, + "grad_norm": 0.07319648563861847, + "learning_rate": 4.914262568072371e-05, + "loss": 0.2517, + "step": 38457 + }, + { + "epoch": 3.1155217109526894, + "grad_norm": 0.07214269042015076, + "learning_rate": 4.913812502812908e-05, + "loss": 0.2342, + "step": 38458 + }, + { + "epoch": 3.115602721970188, + "grad_norm": 0.06724945455789566, + "learning_rate": 4.9133624375534455e-05, + "loss": 0.2317, + "step": 38459 + }, + { + "epoch": 3.1156837329876863, + "grad_norm": 0.10171977430582047, + "learning_rate": 4.912912372293983e-05, + "loss": 0.2638, + "step": 38460 + }, + { + "epoch": 3.1157647440051845, + "grad_norm": 0.08508376777172089, + "learning_rate": 4.91246230703452e-05, + "loss": 0.2443, + "step": 38461 + }, + { + "epoch": 3.1158457550226832, + "grad_norm": 0.061026692390441895, + "learning_rate": 4.9120122417750576e-05, + "loss": 0.2006, + "step": 38462 + }, + { + "epoch": 3.1159267660401815, + "grad_norm": 0.08301940560340881, + "learning_rate": 4.911562176515595e-05, + "loss": 0.2538, + "step": 38463 + }, + { + "epoch": 3.1160077770576797, + "grad_norm": 0.07607529312372208, + "learning_rate": 4.911112111256132e-05, + "loss": 0.2288, + "step": 38464 + }, + { + "epoch": 3.1160887880751784, + "grad_norm": 0.07203084230422974, + "learning_rate": 
4.9106620459966697e-05, + "loss": 0.2263, + "step": 38465 + }, + { + "epoch": 3.1161697990926767, + "grad_norm": 0.0560927577316761, + "learning_rate": 4.910211980737207e-05, + "loss": 0.2014, + "step": 38466 + }, + { + "epoch": 3.116250810110175, + "grad_norm": 0.0738847628235817, + "learning_rate": 4.9097619154777444e-05, + "loss": 0.2086, + "step": 38467 + }, + { + "epoch": 3.116331821127673, + "grad_norm": 0.06414270401000977, + "learning_rate": 4.909311850218282e-05, + "loss": 0.2061, + "step": 38468 + }, + { + "epoch": 3.116412832145172, + "grad_norm": 0.06747682392597198, + "learning_rate": 4.908861784958819e-05, + "loss": 0.245, + "step": 38469 + }, + { + "epoch": 3.11649384316267, + "grad_norm": 0.05635647103190422, + "learning_rate": 4.9084117196993565e-05, + "loss": 0.2061, + "step": 38470 + }, + { + "epoch": 3.1165748541801683, + "grad_norm": 0.07150799036026001, + "learning_rate": 4.907961654439894e-05, + "loss": 0.2055, + "step": 38471 + }, + { + "epoch": 3.116655865197667, + "grad_norm": 0.05772681534290314, + "learning_rate": 4.907511589180431e-05, + "loss": 0.2158, + "step": 38472 + }, + { + "epoch": 3.1167368762151653, + "grad_norm": 0.05746937170624733, + "learning_rate": 4.9070615239209685e-05, + "loss": 0.2113, + "step": 38473 + }, + { + "epoch": 3.1168178872326635, + "grad_norm": 0.06963689625263214, + "learning_rate": 4.9066114586615066e-05, + "loss": 0.2076, + "step": 38474 + }, + { + "epoch": 3.116898898250162, + "grad_norm": 0.06944216787815094, + "learning_rate": 4.906161393402043e-05, + "loss": 0.2228, + "step": 38475 + }, + { + "epoch": 3.1169799092676604, + "grad_norm": 0.0683133527636528, + "learning_rate": 4.9057113281425806e-05, + "loss": 0.2156, + "step": 38476 + }, + { + "epoch": 3.1170609202851587, + "grad_norm": 0.0697324275970459, + "learning_rate": 4.9052612628831187e-05, + "loss": 0.2047, + "step": 38477 + }, + { + "epoch": 3.1171419313026574, + "grad_norm": 0.08170359581708908, + "learning_rate": 4.904811197623655e-05, + "loss": 0.2434, + "step": 38478 + }, + { + "epoch": 3.1172229423201556, + "grad_norm": 0.07048113644123077, + "learning_rate": 4.904361132364193e-05, + "loss": 0.2319, + "step": 38479 + }, + { + "epoch": 3.117303953337654, + "grad_norm": 0.0791177749633789, + "learning_rate": 4.903911067104731e-05, + "loss": 0.2627, + "step": 38480 + }, + { + "epoch": 3.117384964355152, + "grad_norm": 0.06898507475852966, + "learning_rate": 4.9034610018452674e-05, + "loss": 0.2163, + "step": 38481 + }, + { + "epoch": 3.117465975372651, + "grad_norm": 0.07705669850111008, + "learning_rate": 4.9030109365858055e-05, + "loss": 0.2502, + "step": 38482 + }, + { + "epoch": 3.117546986390149, + "grad_norm": 0.07984774559736252, + "learning_rate": 4.902560871326343e-05, + "loss": 0.2375, + "step": 38483 + }, + { + "epoch": 3.1176279974076473, + "grad_norm": 0.07025951147079468, + "learning_rate": 4.9021108060668795e-05, + "loss": 0.2279, + "step": 38484 + }, + { + "epoch": 3.117709008425146, + "grad_norm": 0.061531197279691696, + "learning_rate": 4.9016607408074175e-05, + "loss": 0.1944, + "step": 38485 + }, + { + "epoch": 3.1177900194426442, + "grad_norm": 0.061712365597486496, + "learning_rate": 4.901210675547955e-05, + "loss": 0.2538, + "step": 38486 + }, + { + "epoch": 3.1178710304601425, + "grad_norm": 0.0870019868016243, + "learning_rate": 4.900760610288492e-05, + "loss": 0.2018, + "step": 38487 + }, + { + "epoch": 3.117952041477641, + "grad_norm": 0.0668628066778183, + "learning_rate": 4.9003105450290296e-05, + "loss": 0.2187, + "step": 38488 + }, + { 
+ "epoch": 3.1180330524951394, + "grad_norm": 0.06232215464115143, + "learning_rate": 4.899860479769567e-05, + "loss": 0.3, + "step": 38489 + }, + { + "epoch": 3.1181140635126376, + "grad_norm": 0.059925880283117294, + "learning_rate": 4.8994104145101043e-05, + "loss": 0.2246, + "step": 38490 + }, + { + "epoch": 3.118195074530136, + "grad_norm": 0.07962461560964584, + "learning_rate": 4.898960349250642e-05, + "loss": 0.2847, + "step": 38491 + }, + { + "epoch": 3.1182760855476346, + "grad_norm": 0.0759328305721283, + "learning_rate": 4.898510283991179e-05, + "loss": 0.2065, + "step": 38492 + }, + { + "epoch": 3.118357096565133, + "grad_norm": 0.06727610528469086, + "learning_rate": 4.8980602187317164e-05, + "loss": 0.1977, + "step": 38493 + }, + { + "epoch": 3.118438107582631, + "grad_norm": 0.06177781522274017, + "learning_rate": 4.897610153472254e-05, + "loss": 0.2149, + "step": 38494 + }, + { + "epoch": 3.1185191186001298, + "grad_norm": 0.0664069876074791, + "learning_rate": 4.897160088212791e-05, + "loss": 0.2184, + "step": 38495 + }, + { + "epoch": 3.118600129617628, + "grad_norm": 0.07583729922771454, + "learning_rate": 4.8967100229533285e-05, + "loss": 0.2504, + "step": 38496 + }, + { + "epoch": 3.1186811406351262, + "grad_norm": 0.062377750873565674, + "learning_rate": 4.896259957693866e-05, + "loss": 0.1987, + "step": 38497 + }, + { + "epoch": 3.118762151652625, + "grad_norm": 0.07547123730182648, + "learning_rate": 4.895809892434403e-05, + "loss": 0.2361, + "step": 38498 + }, + { + "epoch": 3.118843162670123, + "grad_norm": 0.08136604726314545, + "learning_rate": 4.8953598271749406e-05, + "loss": 0.2347, + "step": 38499 + }, + { + "epoch": 3.1189241736876214, + "grad_norm": 0.0768231749534607, + "learning_rate": 4.894909761915478e-05, + "loss": 0.2109, + "step": 38500 + }, + { + "epoch": 3.11900518470512, + "grad_norm": 0.07560477405786514, + "learning_rate": 4.894459696656015e-05, + "loss": 0.2574, + "step": 38501 + }, + { + "epoch": 3.1190861957226184, + "grad_norm": 0.07469621300697327, + "learning_rate": 4.894009631396553e-05, + "loss": 0.2551, + "step": 38502 + }, + { + "epoch": 3.1191672067401166, + "grad_norm": 0.06510797888040543, + "learning_rate": 4.89355956613709e-05, + "loss": 0.2017, + "step": 38503 + }, + { + "epoch": 3.119248217757615, + "grad_norm": 0.07766696065664291, + "learning_rate": 4.8931095008776274e-05, + "loss": 0.2066, + "step": 38504 + }, + { + "epoch": 3.1193292287751135, + "grad_norm": 0.07333401590585709, + "learning_rate": 4.892659435618165e-05, + "loss": 0.2208, + "step": 38505 + }, + { + "epoch": 3.119410239792612, + "grad_norm": 0.061164095997810364, + "learning_rate": 4.892209370358702e-05, + "loss": 0.2093, + "step": 38506 + }, + { + "epoch": 3.11949125081011, + "grad_norm": 0.08092789351940155, + "learning_rate": 4.8917593050992395e-05, + "loss": 0.2249, + "step": 38507 + }, + { + "epoch": 3.1195722618276087, + "grad_norm": 0.05781802162528038, + "learning_rate": 4.891309239839777e-05, + "loss": 0.2036, + "step": 38508 + }, + { + "epoch": 3.119653272845107, + "grad_norm": 0.0585942342877388, + "learning_rate": 4.890859174580314e-05, + "loss": 0.1811, + "step": 38509 + }, + { + "epoch": 3.119734283862605, + "grad_norm": 0.07196545600891113, + "learning_rate": 4.8904091093208515e-05, + "loss": 0.2354, + "step": 38510 + }, + { + "epoch": 3.119815294880104, + "grad_norm": 0.08211824297904968, + "learning_rate": 4.889959044061389e-05, + "loss": 0.2284, + "step": 38511 + }, + { + "epoch": 3.119896305897602, + "grad_norm": 0.07880119234323502, + 
"learning_rate": 4.889508978801926e-05, + "loss": 0.2553, + "step": 38512 + }, + { + "epoch": 3.1199773169151004, + "grad_norm": 0.10465053468942642, + "learning_rate": 4.889058913542464e-05, + "loss": 0.2787, + "step": 38513 + }, + { + "epoch": 3.1200583279325986, + "grad_norm": 0.06549686938524246, + "learning_rate": 4.888608848283001e-05, + "loss": 0.2457, + "step": 38514 + }, + { + "epoch": 3.1201393389500973, + "grad_norm": 0.07236936688423157, + "learning_rate": 4.888158783023539e-05, + "loss": 0.2581, + "step": 38515 + }, + { + "epoch": 3.1202203499675956, + "grad_norm": 0.059712644666433334, + "learning_rate": 4.8877087177640764e-05, + "loss": 0.2089, + "step": 38516 + }, + { + "epoch": 3.120301360985094, + "grad_norm": 0.06971476227045059, + "learning_rate": 4.887258652504613e-05, + "loss": 0.2512, + "step": 38517 + }, + { + "epoch": 3.1203823720025925, + "grad_norm": 0.06128533557057381, + "learning_rate": 4.886808587245151e-05, + "loss": 0.2296, + "step": 38518 + }, + { + "epoch": 3.1204633830200907, + "grad_norm": 0.07567324489355087, + "learning_rate": 4.8863585219856885e-05, + "loss": 0.2501, + "step": 38519 + }, + { + "epoch": 3.120544394037589, + "grad_norm": 0.06608163565397263, + "learning_rate": 4.885908456726225e-05, + "loss": 0.2608, + "step": 38520 + }, + { + "epoch": 3.1206254050550877, + "grad_norm": 0.06862304359674454, + "learning_rate": 4.885458391466763e-05, + "loss": 0.2373, + "step": 38521 + }, + { + "epoch": 3.120706416072586, + "grad_norm": 0.06940841674804688, + "learning_rate": 4.8850083262073005e-05, + "loss": 0.2527, + "step": 38522 + }, + { + "epoch": 3.120787427090084, + "grad_norm": 0.07779053598642349, + "learning_rate": 4.884558260947837e-05, + "loss": 0.2374, + "step": 38523 + }, + { + "epoch": 3.120868438107583, + "grad_norm": 0.07671964168548584, + "learning_rate": 4.884108195688375e-05, + "loss": 0.2201, + "step": 38524 + }, + { + "epoch": 3.120949449125081, + "grad_norm": 0.057076919823884964, + "learning_rate": 4.8836581304289126e-05, + "loss": 0.2252, + "step": 38525 + }, + { + "epoch": 3.1210304601425793, + "grad_norm": 0.08262202888727188, + "learning_rate": 4.88320806516945e-05, + "loss": 0.2759, + "step": 38526 + }, + { + "epoch": 3.1211114711600776, + "grad_norm": 0.07039379328489304, + "learning_rate": 4.8827579999099873e-05, + "loss": 0.2373, + "step": 38527 + }, + { + "epoch": 3.1211924821775763, + "grad_norm": 0.0531785786151886, + "learning_rate": 4.882307934650525e-05, + "loss": 0.1895, + "step": 38528 + }, + { + "epoch": 3.1212734931950745, + "grad_norm": 0.08400041610002518, + "learning_rate": 4.881857869391062e-05, + "loss": 0.2174, + "step": 38529 + }, + { + "epoch": 3.1213545042125728, + "grad_norm": 0.1000119000673294, + "learning_rate": 4.8814078041315994e-05, + "loss": 0.2544, + "step": 38530 + }, + { + "epoch": 3.1214355152300715, + "grad_norm": 0.08154396712779999, + "learning_rate": 4.880957738872137e-05, + "loss": 0.226, + "step": 38531 + }, + { + "epoch": 3.1215165262475697, + "grad_norm": 0.06984757632017136, + "learning_rate": 4.880507673612674e-05, + "loss": 0.2445, + "step": 38532 + }, + { + "epoch": 3.121597537265068, + "grad_norm": 0.06032893434166908, + "learning_rate": 4.8800576083532115e-05, + "loss": 0.2255, + "step": 38533 + }, + { + "epoch": 3.1216785482825666, + "grad_norm": 0.06319891661405563, + "learning_rate": 4.879607543093749e-05, + "loss": 0.2381, + "step": 38534 + }, + { + "epoch": 3.121759559300065, + "grad_norm": 0.06383655965328217, + "learning_rate": 4.879157477834286e-05, + "loss": 0.2084, + 
"step": 38535 + }, + { + "epoch": 3.121840570317563, + "grad_norm": 0.06865326315164566, + "learning_rate": 4.8787074125748236e-05, + "loss": 0.2131, + "step": 38536 + }, + { + "epoch": 3.1219215813350614, + "grad_norm": 0.08230190724134445, + "learning_rate": 4.878257347315361e-05, + "loss": 0.2205, + "step": 38537 + }, + { + "epoch": 3.12200259235256, + "grad_norm": 0.075211301445961, + "learning_rate": 4.877807282055898e-05, + "loss": 0.2094, + "step": 38538 + }, + { + "epoch": 3.1220836033700583, + "grad_norm": 0.06502600014209747, + "learning_rate": 4.877357216796436e-05, + "loss": 0.2047, + "step": 38539 + }, + { + "epoch": 3.1221646143875565, + "grad_norm": 0.06465815007686615, + "learning_rate": 4.876907151536973e-05, + "loss": 0.233, + "step": 38540 + }, + { + "epoch": 3.1222456254050552, + "grad_norm": 0.061203133314847946, + "learning_rate": 4.8764570862775104e-05, + "loss": 0.2426, + "step": 38541 + }, + { + "epoch": 3.1223266364225535, + "grad_norm": 0.061528146266937256, + "learning_rate": 4.876007021018048e-05, + "loss": 0.2538, + "step": 38542 + }, + { + "epoch": 3.1224076474400517, + "grad_norm": 0.06926919519901276, + "learning_rate": 4.875556955758585e-05, + "loss": 0.2114, + "step": 38543 + }, + { + "epoch": 3.1224886584575504, + "grad_norm": 0.07080454379320145, + "learning_rate": 4.8751068904991225e-05, + "loss": 0.2522, + "step": 38544 + }, + { + "epoch": 3.1225696694750487, + "grad_norm": 0.07412424683570862, + "learning_rate": 4.87465682523966e-05, + "loss": 0.2099, + "step": 38545 + }, + { + "epoch": 3.122650680492547, + "grad_norm": 0.06699513643980026, + "learning_rate": 4.874206759980197e-05, + "loss": 0.228, + "step": 38546 + }, + { + "epoch": 3.1227316915100456, + "grad_norm": 0.08302906900644302, + "learning_rate": 4.8737566947207346e-05, + "loss": 0.2132, + "step": 38547 + }, + { + "epoch": 3.122812702527544, + "grad_norm": 0.0748620480298996, + "learning_rate": 4.873306629461272e-05, + "loss": 0.296, + "step": 38548 + }, + { + "epoch": 3.122893713545042, + "grad_norm": 0.06947177648544312, + "learning_rate": 4.872856564201809e-05, + "loss": 0.2439, + "step": 38549 + }, + { + "epoch": 3.1229747245625403, + "grad_norm": 0.07392599433660507, + "learning_rate": 4.8724064989423466e-05, + "loss": 0.2444, + "step": 38550 + }, + { + "epoch": 3.123055735580039, + "grad_norm": 0.08208917826414108, + "learning_rate": 4.871956433682885e-05, + "loss": 0.2224, + "step": 38551 + }, + { + "epoch": 3.1231367465975373, + "grad_norm": 0.07460086792707443, + "learning_rate": 4.871506368423422e-05, + "loss": 0.2205, + "step": 38552 + }, + { + "epoch": 3.1232177576150355, + "grad_norm": 0.08083727955818176, + "learning_rate": 4.871056303163959e-05, + "loss": 0.2517, + "step": 38553 + }, + { + "epoch": 3.123298768632534, + "grad_norm": 0.07795874029397964, + "learning_rate": 4.870606237904497e-05, + "loss": 0.2164, + "step": 38554 + }, + { + "epoch": 3.1233797796500324, + "grad_norm": 0.07984881848096848, + "learning_rate": 4.870156172645034e-05, + "loss": 0.2731, + "step": 38555 + }, + { + "epoch": 3.1234607906675307, + "grad_norm": 0.07941204309463501, + "learning_rate": 4.869706107385571e-05, + "loss": 0.2455, + "step": 38556 + }, + { + "epoch": 3.1235418016850294, + "grad_norm": 0.07575903832912445, + "learning_rate": 4.869256042126109e-05, + "loss": 0.2325, + "step": 38557 + }, + { + "epoch": 3.1236228127025276, + "grad_norm": 0.07499007135629654, + "learning_rate": 4.868805976866646e-05, + "loss": 0.2502, + "step": 38558 + }, + { + "epoch": 3.123703823720026, + 
"grad_norm": 0.07867011427879333, + "learning_rate": 4.868355911607183e-05, + "loss": 0.2587, + "step": 38559 + }, + { + "epoch": 3.123784834737524, + "grad_norm": 0.0736221894621849, + "learning_rate": 4.867905846347721e-05, + "loss": 0.2328, + "step": 38560 + }, + { + "epoch": 3.123865845755023, + "grad_norm": 0.08023568987846375, + "learning_rate": 4.867455781088258e-05, + "loss": 0.2471, + "step": 38561 + }, + { + "epoch": 3.123946856772521, + "grad_norm": 0.08737244457006454, + "learning_rate": 4.867005715828795e-05, + "loss": 0.2694, + "step": 38562 + }, + { + "epoch": 3.1240278677900193, + "grad_norm": 0.06269481778144836, + "learning_rate": 4.866555650569333e-05, + "loss": 0.2211, + "step": 38563 + }, + { + "epoch": 3.124108878807518, + "grad_norm": 0.07235285639762878, + "learning_rate": 4.8661055853098704e-05, + "loss": 0.2377, + "step": 38564 + }, + { + "epoch": 3.124189889825016, + "grad_norm": 0.06457497179508209, + "learning_rate": 4.865655520050408e-05, + "loss": 0.1976, + "step": 38565 + }, + { + "epoch": 3.1242709008425145, + "grad_norm": 0.06922008842229843, + "learning_rate": 4.865205454790945e-05, + "loss": 0.2261, + "step": 38566 + }, + { + "epoch": 3.124351911860013, + "grad_norm": 0.05870746448636055, + "learning_rate": 4.8647553895314824e-05, + "loss": 0.2185, + "step": 38567 + }, + { + "epoch": 3.1244329228775114, + "grad_norm": 0.07148100435733795, + "learning_rate": 4.86430532427202e-05, + "loss": 0.2328, + "step": 38568 + }, + { + "epoch": 3.1245139338950096, + "grad_norm": 0.07290808856487274, + "learning_rate": 4.863855259012557e-05, + "loss": 0.2128, + "step": 38569 + }, + { + "epoch": 3.124594944912508, + "grad_norm": 0.08129823952913284, + "learning_rate": 4.8634051937530945e-05, + "loss": 0.2205, + "step": 38570 + }, + { + "epoch": 3.1246759559300066, + "grad_norm": 0.08137766271829605, + "learning_rate": 4.862955128493632e-05, + "loss": 0.2472, + "step": 38571 + }, + { + "epoch": 3.124756966947505, + "grad_norm": 0.08388211578130722, + "learning_rate": 4.862505063234169e-05, + "loss": 0.2983, + "step": 38572 + }, + { + "epoch": 3.124837977965003, + "grad_norm": 0.07012079656124115, + "learning_rate": 4.8620549979747066e-05, + "loss": 0.2072, + "step": 38573 + }, + { + "epoch": 3.1249189889825018, + "grad_norm": 0.08775711804628372, + "learning_rate": 4.861604932715244e-05, + "loss": 0.2144, + "step": 38574 + }, + { + "epoch": 3.125, + "grad_norm": 0.07253850251436234, + "learning_rate": 4.861154867455781e-05, + "loss": 0.2097, + "step": 38575 + }, + { + "epoch": 3.1250810110174982, + "grad_norm": 0.06796213984489441, + "learning_rate": 4.860704802196319e-05, + "loss": 0.2175, + "step": 38576 + }, + { + "epoch": 3.125162022034997, + "grad_norm": 0.07270730286836624, + "learning_rate": 4.860254736936856e-05, + "loss": 0.2226, + "step": 38577 + }, + { + "epoch": 3.125243033052495, + "grad_norm": 0.056501347571611404, + "learning_rate": 4.8598046716773934e-05, + "loss": 0.2236, + "step": 38578 + }, + { + "epoch": 3.1253240440699934, + "grad_norm": 0.06658945977687836, + "learning_rate": 4.859354606417931e-05, + "loss": 0.2512, + "step": 38579 + }, + { + "epoch": 3.1254050550874917, + "grad_norm": 0.07839953899383545, + "learning_rate": 4.858904541158468e-05, + "loss": 0.202, + "step": 38580 + }, + { + "epoch": 3.1254860661049904, + "grad_norm": 0.07932569831609726, + "learning_rate": 4.8584544758990055e-05, + "loss": 0.2634, + "step": 38581 + }, + { + "epoch": 3.1255670771224886, + "grad_norm": 0.06872804462909698, + "learning_rate": 4.858004410639543e-05, + 
"loss": 0.3022, + "step": 38582 + }, + { + "epoch": 3.125648088139987, + "grad_norm": 0.06276540458202362, + "learning_rate": 4.85755434538008e-05, + "loss": 0.2254, + "step": 38583 + }, + { + "epoch": 3.1257290991574855, + "grad_norm": 0.0721246674656868, + "learning_rate": 4.857104280120618e-05, + "loss": 0.1924, + "step": 38584 + }, + { + "epoch": 3.125810110174984, + "grad_norm": 0.07592836022377014, + "learning_rate": 4.856654214861155e-05, + "loss": 0.2381, + "step": 38585 + }, + { + "epoch": 3.125891121192482, + "grad_norm": 0.06872162222862244, + "learning_rate": 4.856204149601692e-05, + "loss": 0.2617, + "step": 38586 + }, + { + "epoch": 3.1259721322099807, + "grad_norm": 0.07416519522666931, + "learning_rate": 4.85575408434223e-05, + "loss": 0.2221, + "step": 38587 + }, + { + "epoch": 3.126053143227479, + "grad_norm": 0.07375083118677139, + "learning_rate": 4.855304019082767e-05, + "loss": 0.2604, + "step": 38588 + }, + { + "epoch": 3.126134154244977, + "grad_norm": 0.07078424096107483, + "learning_rate": 4.8548539538233044e-05, + "loss": 0.2533, + "step": 38589 + }, + { + "epoch": 3.126215165262476, + "grad_norm": 0.05443970113992691, + "learning_rate": 4.8544038885638424e-05, + "loss": 0.2002, + "step": 38590 + }, + { + "epoch": 3.126296176279974, + "grad_norm": 0.09065693616867065, + "learning_rate": 4.85395382330438e-05, + "loss": 0.2347, + "step": 38591 + }, + { + "epoch": 3.1263771872974724, + "grad_norm": 0.07778283953666687, + "learning_rate": 4.8535037580449164e-05, + "loss": 0.2068, + "step": 38592 + }, + { + "epoch": 3.126458198314971, + "grad_norm": 0.07738600671291351, + "learning_rate": 4.8530536927854545e-05, + "loss": 0.2145, + "step": 38593 + }, + { + "epoch": 3.1265392093324693, + "grad_norm": 0.0660366341471672, + "learning_rate": 4.852603627525992e-05, + "loss": 0.1892, + "step": 38594 + }, + { + "epoch": 3.1266202203499676, + "grad_norm": 0.07506325840950012, + "learning_rate": 4.8521535622665285e-05, + "loss": 0.2487, + "step": 38595 + }, + { + "epoch": 3.126701231367466, + "grad_norm": 0.0669327825307846, + "learning_rate": 4.8517034970070666e-05, + "loss": 0.2237, + "step": 38596 + }, + { + "epoch": 3.1267822423849645, + "grad_norm": 0.06828659027814865, + "learning_rate": 4.851253431747604e-05, + "loss": 0.2254, + "step": 38597 + }, + { + "epoch": 3.1268632534024627, + "grad_norm": 0.06882147490978241, + "learning_rate": 4.8508033664881406e-05, + "loss": 0.2626, + "step": 38598 + }, + { + "epoch": 3.126944264419961, + "grad_norm": 0.0750584602355957, + "learning_rate": 4.8503533012286786e-05, + "loss": 0.207, + "step": 38599 + }, + { + "epoch": 3.1270252754374597, + "grad_norm": 0.0676826611161232, + "learning_rate": 4.849903235969216e-05, + "loss": 0.2209, + "step": 38600 + }, + { + "epoch": 3.127106286454958, + "grad_norm": 0.0683126300573349, + "learning_rate": 4.849453170709753e-05, + "loss": 0.2315, + "step": 38601 + }, + { + "epoch": 3.127187297472456, + "grad_norm": 0.06574027985334396, + "learning_rate": 4.849003105450291e-05, + "loss": 0.207, + "step": 38602 + }, + { + "epoch": 3.1272683084899544, + "grad_norm": 0.06940241903066635, + "learning_rate": 4.848553040190828e-05, + "loss": 0.2542, + "step": 38603 + }, + { + "epoch": 3.127349319507453, + "grad_norm": 0.07252786308526993, + "learning_rate": 4.848102974931365e-05, + "loss": 0.2263, + "step": 38604 + }, + { + "epoch": 3.1274303305249513, + "grad_norm": 0.06789885461330414, + "learning_rate": 4.847652909671903e-05, + "loss": 0.2128, + "step": 38605 + }, + { + "epoch": 3.1275113415424496, + 
"grad_norm": 0.0669901892542839, + "learning_rate": 4.84720284441244e-05, + "loss": 0.2043, + "step": 38606 + }, + { + "epoch": 3.1275923525599483, + "grad_norm": 0.07184362411499023, + "learning_rate": 4.8467527791529775e-05, + "loss": 0.2282, + "step": 38607 + }, + { + "epoch": 3.1276733635774465, + "grad_norm": 0.07352180778980255, + "learning_rate": 4.846302713893515e-05, + "loss": 0.2281, + "step": 38608 + }, + { + "epoch": 3.1277543745949448, + "grad_norm": 0.06735699623823166, + "learning_rate": 4.845852648634052e-05, + "loss": 0.2241, + "step": 38609 + }, + { + "epoch": 3.1278353856124435, + "grad_norm": 0.0694347694516182, + "learning_rate": 4.8454025833745896e-05, + "loss": 0.2592, + "step": 38610 + }, + { + "epoch": 3.1279163966299417, + "grad_norm": 0.061657343059778214, + "learning_rate": 4.844952518115127e-05, + "loss": 0.2214, + "step": 38611 + }, + { + "epoch": 3.12799740764744, + "grad_norm": 0.06862164288759232, + "learning_rate": 4.844502452855664e-05, + "loss": 0.2231, + "step": 38612 + }, + { + "epoch": 3.1280784186649386, + "grad_norm": 0.07311592251062393, + "learning_rate": 4.844052387596202e-05, + "loss": 0.2551, + "step": 38613 + }, + { + "epoch": 3.128159429682437, + "grad_norm": 0.06936139613389969, + "learning_rate": 4.843602322336739e-05, + "loss": 0.2193, + "step": 38614 + }, + { + "epoch": 3.128240440699935, + "grad_norm": 0.07125812023878098, + "learning_rate": 4.8431522570772764e-05, + "loss": 0.2687, + "step": 38615 + }, + { + "epoch": 3.1283214517174334, + "grad_norm": 0.07158618420362473, + "learning_rate": 4.842702191817814e-05, + "loss": 0.2352, + "step": 38616 + }, + { + "epoch": 3.128402462734932, + "grad_norm": 0.06227901577949524, + "learning_rate": 4.842252126558351e-05, + "loss": 0.2616, + "step": 38617 + }, + { + "epoch": 3.1284834737524303, + "grad_norm": 0.05311376973986626, + "learning_rate": 4.8418020612988885e-05, + "loss": 0.224, + "step": 38618 + }, + { + "epoch": 3.1285644847699285, + "grad_norm": 0.08298182487487793, + "learning_rate": 4.841351996039426e-05, + "loss": 0.2638, + "step": 38619 + }, + { + "epoch": 3.1286454957874272, + "grad_norm": 0.06601186096668243, + "learning_rate": 4.840901930779964e-05, + "loss": 0.2168, + "step": 38620 + }, + { + "epoch": 3.1287265068049255, + "grad_norm": 0.06901419907808304, + "learning_rate": 4.8404518655205006e-05, + "loss": 0.2298, + "step": 38621 + }, + { + "epoch": 3.1288075178224237, + "grad_norm": 0.09086368978023529, + "learning_rate": 4.840001800261038e-05, + "loss": 0.2565, + "step": 38622 + }, + { + "epoch": 3.1288885288399224, + "grad_norm": 0.07500910013914108, + "learning_rate": 4.839551735001576e-05, + "loss": 0.2411, + "step": 38623 + }, + { + "epoch": 3.1289695398574207, + "grad_norm": 0.06747693568468094, + "learning_rate": 4.8391016697421127e-05, + "loss": 0.2275, + "step": 38624 + }, + { + "epoch": 3.129050550874919, + "grad_norm": 0.08328603953123093, + "learning_rate": 4.83865160448265e-05, + "loss": 0.2068, + "step": 38625 + }, + { + "epoch": 3.129131561892417, + "grad_norm": 0.06553267687559128, + "learning_rate": 4.838201539223188e-05, + "loss": 0.1888, + "step": 38626 + }, + { + "epoch": 3.129212572909916, + "grad_norm": 0.06869760900735855, + "learning_rate": 4.837751473963725e-05, + "loss": 0.2409, + "step": 38627 + }, + { + "epoch": 3.129293583927414, + "grad_norm": 0.06937336176633835, + "learning_rate": 4.837301408704262e-05, + "loss": 0.2065, + "step": 38628 + }, + { + "epoch": 3.1293745949449123, + "grad_norm": 0.09511666744947433, + "learning_rate": 
4.8368513434448e-05, + "loss": 0.2427, + "step": 38629 + }, + { + "epoch": 3.129455605962411, + "grad_norm": 0.06920617818832397, + "learning_rate": 4.836401278185337e-05, + "loss": 0.2335, + "step": 38630 + }, + { + "epoch": 3.1295366169799093, + "grad_norm": 0.07520420104265213, + "learning_rate": 4.835951212925874e-05, + "loss": 0.2635, + "step": 38631 + }, + { + "epoch": 3.1296176279974075, + "grad_norm": 0.06603451073169708, + "learning_rate": 4.835501147666412e-05, + "loss": 0.2203, + "step": 38632 + }, + { + "epoch": 3.129698639014906, + "grad_norm": 0.07562818378210068, + "learning_rate": 4.8350510824069496e-05, + "loss": 0.2433, + "step": 38633 + }, + { + "epoch": 3.1297796500324044, + "grad_norm": 0.0713423416018486, + "learning_rate": 4.834601017147486e-05, + "loss": 0.1939, + "step": 38634 + }, + { + "epoch": 3.1298606610499027, + "grad_norm": 0.07066536694765091, + "learning_rate": 4.834150951888024e-05, + "loss": 0.2573, + "step": 38635 + }, + { + "epoch": 3.1299416720674014, + "grad_norm": 0.07903990894556046, + "learning_rate": 4.8337008866285617e-05, + "loss": 0.2318, + "step": 38636 + }, + { + "epoch": 3.1300226830848996, + "grad_norm": 0.07177235186100006, + "learning_rate": 4.8332508213690983e-05, + "loss": 0.2103, + "step": 38637 + }, + { + "epoch": 3.130103694102398, + "grad_norm": 0.07718789577484131, + "learning_rate": 4.8328007561096364e-05, + "loss": 0.2302, + "step": 38638 + }, + { + "epoch": 3.130184705119896, + "grad_norm": 0.08384337276220322, + "learning_rate": 4.832350690850174e-05, + "loss": 0.2397, + "step": 38639 + }, + { + "epoch": 3.130265716137395, + "grad_norm": 0.07833084464073181, + "learning_rate": 4.8319006255907104e-05, + "loss": 0.2246, + "step": 38640 + }, + { + "epoch": 3.130346727154893, + "grad_norm": 0.06530147045850754, + "learning_rate": 4.8314505603312485e-05, + "loss": 0.2187, + "step": 38641 + }, + { + "epoch": 3.1304277381723913, + "grad_norm": 0.08604307472705841, + "learning_rate": 4.831000495071786e-05, + "loss": 0.2247, + "step": 38642 + }, + { + "epoch": 3.13050874918989, + "grad_norm": 0.07353553175926208, + "learning_rate": 4.8305504298123225e-05, + "loss": 0.2109, + "step": 38643 + }, + { + "epoch": 3.130589760207388, + "grad_norm": 0.06441865861415863, + "learning_rate": 4.8301003645528605e-05, + "loss": 0.2424, + "step": 38644 + }, + { + "epoch": 3.1306707712248865, + "grad_norm": 0.06363363564014435, + "learning_rate": 4.829650299293398e-05, + "loss": 0.2517, + "step": 38645 + }, + { + "epoch": 3.130751782242385, + "grad_norm": 0.06344431638717651, + "learning_rate": 4.829200234033935e-05, + "loss": 0.245, + "step": 38646 + }, + { + "epoch": 3.1308327932598834, + "grad_norm": 0.06573443114757538, + "learning_rate": 4.8287501687744726e-05, + "loss": 0.2111, + "step": 38647 + }, + { + "epoch": 3.1309138042773816, + "grad_norm": 0.06902682781219482, + "learning_rate": 4.82830010351501e-05, + "loss": 0.2164, + "step": 38648 + }, + { + "epoch": 3.13099481529488, + "grad_norm": 0.06718235462903976, + "learning_rate": 4.8278500382555473e-05, + "loss": 0.2159, + "step": 38649 + }, + { + "epoch": 3.1310758263123786, + "grad_norm": 0.07870693504810333, + "learning_rate": 4.827399972996085e-05, + "loss": 0.225, + "step": 38650 + }, + { + "epoch": 3.131156837329877, + "grad_norm": 0.05864543467760086, + "learning_rate": 4.826949907736622e-05, + "loss": 0.2387, + "step": 38651 + }, + { + "epoch": 3.131237848347375, + "grad_norm": 0.07667594403028488, + "learning_rate": 4.8264998424771594e-05, + "loss": 0.2183, + "step": 38652 + }, + { + 
"epoch": 3.1313188593648738, + "grad_norm": 0.07851625978946686, + "learning_rate": 4.826049777217697e-05, + "loss": 0.2487, + "step": 38653 + }, + { + "epoch": 3.131399870382372, + "grad_norm": 0.06867916136980057, + "learning_rate": 4.825599711958234e-05, + "loss": 0.2108, + "step": 38654 + }, + { + "epoch": 3.1314808813998702, + "grad_norm": 0.08466805517673492, + "learning_rate": 4.8251496466987715e-05, + "loss": 0.2552, + "step": 38655 + }, + { + "epoch": 3.131561892417369, + "grad_norm": 0.07399707287549973, + "learning_rate": 4.824699581439309e-05, + "loss": 0.2248, + "step": 38656 + }, + { + "epoch": 3.131642903434867, + "grad_norm": 0.059375181794166565, + "learning_rate": 4.824249516179846e-05, + "loss": 0.2477, + "step": 38657 + }, + { + "epoch": 3.1317239144523654, + "grad_norm": 0.0669362023472786, + "learning_rate": 4.8237994509203836e-05, + "loss": 0.2444, + "step": 38658 + }, + { + "epoch": 3.131804925469864, + "grad_norm": 0.06974013149738312, + "learning_rate": 4.8233493856609216e-05, + "loss": 0.2411, + "step": 38659 + }, + { + "epoch": 3.1318859364873624, + "grad_norm": 0.07815289497375488, + "learning_rate": 4.822899320401458e-05, + "loss": 0.237, + "step": 38660 + }, + { + "epoch": 3.1319669475048606, + "grad_norm": 0.06880567967891693, + "learning_rate": 4.822449255141996e-05, + "loss": 0.214, + "step": 38661 + }, + { + "epoch": 3.132047958522359, + "grad_norm": 0.0771849974989891, + "learning_rate": 4.821999189882534e-05, + "loss": 0.2473, + "step": 38662 + }, + { + "epoch": 3.1321289695398575, + "grad_norm": 0.057830676436424255, + "learning_rate": 4.8215491246230704e-05, + "loss": 0.229, + "step": 38663 + }, + { + "epoch": 3.1322099805573558, + "grad_norm": 0.09336844086647034, + "learning_rate": 4.821099059363608e-05, + "loss": 0.2644, + "step": 38664 + }, + { + "epoch": 3.132290991574854, + "grad_norm": 0.06886488199234009, + "learning_rate": 4.820648994104146e-05, + "loss": 0.2195, + "step": 38665 + }, + { + "epoch": 3.1323720025923527, + "grad_norm": 0.06183936074376106, + "learning_rate": 4.8201989288446825e-05, + "loss": 0.2531, + "step": 38666 + }, + { + "epoch": 3.132453013609851, + "grad_norm": 0.07469535619020462, + "learning_rate": 4.81974886358522e-05, + "loss": 0.2538, + "step": 38667 + }, + { + "epoch": 3.132534024627349, + "grad_norm": 0.0630243569612503, + "learning_rate": 4.819298798325758e-05, + "loss": 0.207, + "step": 38668 + }, + { + "epoch": 3.132615035644848, + "grad_norm": 0.07657089084386826, + "learning_rate": 4.8188487330662945e-05, + "loss": 0.191, + "step": 38669 + }, + { + "epoch": 3.132696046662346, + "grad_norm": 0.07390504330396652, + "learning_rate": 4.818398667806832e-05, + "loss": 0.2497, + "step": 38670 + }, + { + "epoch": 3.1327770576798444, + "grad_norm": 0.060257602483034134, + "learning_rate": 4.81794860254737e-05, + "loss": 0.2322, + "step": 38671 + }, + { + "epoch": 3.1328580686973426, + "grad_norm": 0.07124596834182739, + "learning_rate": 4.817498537287907e-05, + "loss": 0.2334, + "step": 38672 + }, + { + "epoch": 3.1329390797148413, + "grad_norm": 0.07787282764911652, + "learning_rate": 4.817048472028444e-05, + "loss": 0.2473, + "step": 38673 + }, + { + "epoch": 3.1330200907323396, + "grad_norm": 0.10237501561641693, + "learning_rate": 4.816598406768982e-05, + "loss": 0.2736, + "step": 38674 + }, + { + "epoch": 3.133101101749838, + "grad_norm": 0.08048980683088303, + "learning_rate": 4.8161483415095194e-05, + "loss": 0.2468, + "step": 38675 + }, + { + "epoch": 3.1331821127673365, + "grad_norm": 0.07248836755752563, + 
"learning_rate": 4.815698276250056e-05, + "loss": 0.264, + "step": 38676 + }, + { + "epoch": 3.1332631237848347, + "grad_norm": 0.0636928603053093, + "learning_rate": 4.815248210990594e-05, + "loss": 0.1894, + "step": 38677 + }, + { + "epoch": 3.133344134802333, + "grad_norm": 0.07219689339399338, + "learning_rate": 4.8147981457311315e-05, + "loss": 0.217, + "step": 38678 + }, + { + "epoch": 3.1334251458198317, + "grad_norm": 0.07658065855503082, + "learning_rate": 4.814348080471668e-05, + "loss": 0.2181, + "step": 38679 + }, + { + "epoch": 3.13350615683733, + "grad_norm": 0.09006506949663162, + "learning_rate": 4.813898015212206e-05, + "loss": 0.2513, + "step": 38680 + }, + { + "epoch": 3.133587167854828, + "grad_norm": 0.0771331861615181, + "learning_rate": 4.8134479499527436e-05, + "loss": 0.2224, + "step": 38681 + }, + { + "epoch": 3.133668178872327, + "grad_norm": 0.09508946537971497, + "learning_rate": 4.81299788469328e-05, + "loss": 0.2508, + "step": 38682 + }, + { + "epoch": 3.133749189889825, + "grad_norm": 0.07007356733083725, + "learning_rate": 4.812547819433818e-05, + "loss": 0.2231, + "step": 38683 + }, + { + "epoch": 3.1338302009073233, + "grad_norm": 0.07071962207555771, + "learning_rate": 4.8120977541743556e-05, + "loss": 0.2737, + "step": 38684 + }, + { + "epoch": 3.1339112119248216, + "grad_norm": 0.07418075948953629, + "learning_rate": 4.811647688914893e-05, + "loss": 0.2416, + "step": 38685 + }, + { + "epoch": 3.1339922229423203, + "grad_norm": 0.07185198366641998, + "learning_rate": 4.8111976236554304e-05, + "loss": 0.2078, + "step": 38686 + }, + { + "epoch": 3.1340732339598185, + "grad_norm": 0.08141083270311356, + "learning_rate": 4.810747558395968e-05, + "loss": 0.2263, + "step": 38687 + }, + { + "epoch": 3.1341542449773168, + "grad_norm": 0.08361703902482986, + "learning_rate": 4.810297493136505e-05, + "loss": 0.2272, + "step": 38688 + }, + { + "epoch": 3.1342352559948155, + "grad_norm": 0.0822317898273468, + "learning_rate": 4.8098474278770424e-05, + "loss": 0.2228, + "step": 38689 + }, + { + "epoch": 3.1343162670123137, + "grad_norm": 0.07091178745031357, + "learning_rate": 4.80939736261758e-05, + "loss": 0.2742, + "step": 38690 + }, + { + "epoch": 3.134397278029812, + "grad_norm": 0.0746161937713623, + "learning_rate": 4.808947297358117e-05, + "loss": 0.2414, + "step": 38691 + }, + { + "epoch": 3.1344782890473106, + "grad_norm": 0.07142340391874313, + "learning_rate": 4.8084972320986545e-05, + "loss": 0.2145, + "step": 38692 + }, + { + "epoch": 3.134559300064809, + "grad_norm": 0.08708221465349197, + "learning_rate": 4.808047166839192e-05, + "loss": 0.2405, + "step": 38693 + }, + { + "epoch": 3.134640311082307, + "grad_norm": 0.07113545387983322, + "learning_rate": 4.807597101579729e-05, + "loss": 0.2133, + "step": 38694 + }, + { + "epoch": 3.1347213220998054, + "grad_norm": 0.07318969070911407, + "learning_rate": 4.8071470363202666e-05, + "loss": 0.233, + "step": 38695 + }, + { + "epoch": 3.134802333117304, + "grad_norm": 0.0663350448012352, + "learning_rate": 4.806696971060804e-05, + "loss": 0.2325, + "step": 38696 + }, + { + "epoch": 3.1348833441348023, + "grad_norm": 0.06916707009077072, + "learning_rate": 4.806246905801341e-05, + "loss": 0.2597, + "step": 38697 + }, + { + "epoch": 3.1349643551523005, + "grad_norm": 0.07102689146995544, + "learning_rate": 4.8057968405418794e-05, + "loss": 0.2161, + "step": 38698 + }, + { + "epoch": 3.1350453661697992, + "grad_norm": 0.08429417759180069, + "learning_rate": 4.805346775282416e-05, + "loss": 0.2835, + "step": 
38699 + }, + { + "epoch": 3.1351263771872975, + "grad_norm": 0.0673832818865776, + "learning_rate": 4.8048967100229534e-05, + "loss": 0.2221, + "step": 38700 + }, + { + "epoch": 3.1352073882047957, + "grad_norm": 0.0722777470946312, + "learning_rate": 4.8044466447634914e-05, + "loss": 0.2103, + "step": 38701 + }, + { + "epoch": 3.1352883992222944, + "grad_norm": 0.0763530358672142, + "learning_rate": 4.803996579504028e-05, + "loss": 0.24, + "step": 38702 + }, + { + "epoch": 3.1353694102397927, + "grad_norm": 0.0652783066034317, + "learning_rate": 4.8035465142445655e-05, + "loss": 0.2175, + "step": 38703 + }, + { + "epoch": 3.135450421257291, + "grad_norm": 0.0717589259147644, + "learning_rate": 4.8030964489851035e-05, + "loss": 0.2039, + "step": 38704 + }, + { + "epoch": 3.1355314322747896, + "grad_norm": 0.058315105736255646, + "learning_rate": 4.80264638372564e-05, + "loss": 0.2118, + "step": 38705 + }, + { + "epoch": 3.135612443292288, + "grad_norm": 0.07413259893655777, + "learning_rate": 4.8021963184661776e-05, + "loss": 0.2673, + "step": 38706 + }, + { + "epoch": 3.135693454309786, + "grad_norm": 0.06567484140396118, + "learning_rate": 4.8017462532067156e-05, + "loss": 0.2111, + "step": 38707 + }, + { + "epoch": 3.1357744653272843, + "grad_norm": 0.06503421813249588, + "learning_rate": 4.801296187947252e-05, + "loss": 0.2197, + "step": 38708 + }, + { + "epoch": 3.135855476344783, + "grad_norm": 0.07268182933330536, + "learning_rate": 4.8008461226877896e-05, + "loss": 0.2642, + "step": 38709 + }, + { + "epoch": 3.1359364873622813, + "grad_norm": 0.065959133207798, + "learning_rate": 4.800396057428328e-05, + "loss": 0.2311, + "step": 38710 + }, + { + "epoch": 3.1360174983797795, + "grad_norm": 0.07882321625947952, + "learning_rate": 4.799945992168865e-05, + "loss": 0.2721, + "step": 38711 + }, + { + "epoch": 3.136098509397278, + "grad_norm": 0.08288038522005081, + "learning_rate": 4.799495926909402e-05, + "loss": 0.2467, + "step": 38712 + }, + { + "epoch": 3.1361795204147764, + "grad_norm": 0.06765219569206238, + "learning_rate": 4.79904586164994e-05, + "loss": 0.2218, + "step": 38713 + }, + { + "epoch": 3.1362605314322747, + "grad_norm": 0.0784728080034256, + "learning_rate": 4.798595796390477e-05, + "loss": 0.2245, + "step": 38714 + }, + { + "epoch": 3.1363415424497734, + "grad_norm": 0.07332229614257812, + "learning_rate": 4.798145731131014e-05, + "loss": 0.2335, + "step": 38715 + }, + { + "epoch": 3.1364225534672716, + "grad_norm": 0.09900877624750137, + "learning_rate": 4.797695665871552e-05, + "loss": 0.2265, + "step": 38716 + }, + { + "epoch": 3.13650356448477, + "grad_norm": 0.09115981310606003, + "learning_rate": 4.797245600612089e-05, + "loss": 0.2246, + "step": 38717 + }, + { + "epoch": 3.136584575502268, + "grad_norm": 0.07833981513977051, + "learning_rate": 4.796795535352626e-05, + "loss": 0.2263, + "step": 38718 + }, + { + "epoch": 3.136665586519767, + "grad_norm": 0.07188592851161957, + "learning_rate": 4.796345470093164e-05, + "loss": 0.2478, + "step": 38719 + }, + { + "epoch": 3.136746597537265, + "grad_norm": 0.07392287254333496, + "learning_rate": 4.795895404833701e-05, + "loss": 0.2466, + "step": 38720 + }, + { + "epoch": 3.1368276085547633, + "grad_norm": 0.07458144426345825, + "learning_rate": 4.795445339574238e-05, + "loss": 0.2373, + "step": 38721 + }, + { + "epoch": 3.136908619572262, + "grad_norm": 0.06387396156787872, + "learning_rate": 4.794995274314776e-05, + "loss": 0.2401, + "step": 38722 + }, + { + "epoch": 3.13698963058976, + "grad_norm": 
0.0793103352189064, + "learning_rate": 4.7945452090553134e-05, + "loss": 0.2443, + "step": 38723 + }, + { + "epoch": 3.1370706416072585, + "grad_norm": 0.08202017098665237, + "learning_rate": 4.794095143795851e-05, + "loss": 0.2545, + "step": 38724 + }, + { + "epoch": 3.137151652624757, + "grad_norm": 0.06801655143499374, + "learning_rate": 4.793645078536388e-05, + "loss": 0.2316, + "step": 38725 + }, + { + "epoch": 3.1372326636422554, + "grad_norm": 0.07169846445322037, + "learning_rate": 4.7931950132769254e-05, + "loss": 0.2517, + "step": 38726 + }, + { + "epoch": 3.1373136746597536, + "grad_norm": 0.05438097193837166, + "learning_rate": 4.792744948017463e-05, + "loss": 0.2146, + "step": 38727 + }, + { + "epoch": 3.1373946856772523, + "grad_norm": 0.06895055621862411, + "learning_rate": 4.792294882758e-05, + "loss": 0.2305, + "step": 38728 + }, + { + "epoch": 3.1374756966947506, + "grad_norm": 0.08061118423938751, + "learning_rate": 4.7918448174985375e-05, + "loss": 0.2277, + "step": 38729 + }, + { + "epoch": 3.137556707712249, + "grad_norm": 0.0754641741514206, + "learning_rate": 4.791394752239075e-05, + "loss": 0.222, + "step": 38730 + }, + { + "epoch": 3.137637718729747, + "grad_norm": 0.09252101182937622, + "learning_rate": 4.790944686979612e-05, + "loss": 0.264, + "step": 38731 + }, + { + "epoch": 3.1377187297472457, + "grad_norm": 0.06960175186395645, + "learning_rate": 4.7904946217201496e-05, + "loss": 0.1871, + "step": 38732 + }, + { + "epoch": 3.137799740764744, + "grad_norm": 0.06823209673166275, + "learning_rate": 4.790044556460687e-05, + "loss": 0.2369, + "step": 38733 + }, + { + "epoch": 3.1378807517822422, + "grad_norm": 0.07666932791471481, + "learning_rate": 4.789594491201224e-05, + "loss": 0.2539, + "step": 38734 + }, + { + "epoch": 3.137961762799741, + "grad_norm": 0.06866523623466492, + "learning_rate": 4.789144425941762e-05, + "loss": 0.2293, + "step": 38735 + }, + { + "epoch": 3.138042773817239, + "grad_norm": 0.07156126946210861, + "learning_rate": 4.788694360682299e-05, + "loss": 0.2662, + "step": 38736 + }, + { + "epoch": 3.1381237848347374, + "grad_norm": 0.0736309066414833, + "learning_rate": 4.788244295422837e-05, + "loss": 0.2096, + "step": 38737 + }, + { + "epoch": 3.138204795852236, + "grad_norm": 0.05947743356227875, + "learning_rate": 4.787794230163374e-05, + "loss": 0.1979, + "step": 38738 + }, + { + "epoch": 3.1382858068697344, + "grad_norm": 0.05899606645107269, + "learning_rate": 4.787344164903911e-05, + "loss": 0.2109, + "step": 38739 + }, + { + "epoch": 3.1383668178872326, + "grad_norm": 0.07126261293888092, + "learning_rate": 4.786894099644449e-05, + "loss": 0.2453, + "step": 38740 + }, + { + "epoch": 3.138447828904731, + "grad_norm": 0.07778992503881454, + "learning_rate": 4.786444034384986e-05, + "loss": 0.2726, + "step": 38741 + }, + { + "epoch": 3.1385288399222295, + "grad_norm": 0.06148141250014305, + "learning_rate": 4.785993969125523e-05, + "loss": 0.2053, + "step": 38742 + }, + { + "epoch": 3.1386098509397278, + "grad_norm": 0.06504407525062561, + "learning_rate": 4.785543903866061e-05, + "loss": 0.2623, + "step": 38743 + }, + { + "epoch": 3.138690861957226, + "grad_norm": 0.058437515050172806, + "learning_rate": 4.785093838606598e-05, + "loss": 0.2091, + "step": 38744 + }, + { + "epoch": 3.1387718729747247, + "grad_norm": 0.07105541974306107, + "learning_rate": 4.784643773347135e-05, + "loss": 0.2251, + "step": 38745 + }, + { + "epoch": 3.138852883992223, + "grad_norm": 0.0854935422539711, + "learning_rate": 4.784193708087673e-05, + "loss": 
0.2837, + "step": 38746 + }, + { + "epoch": 3.138933895009721, + "grad_norm": 0.06620346009731293, + "learning_rate": 4.78374364282821e-05, + "loss": 0.1855, + "step": 38747 + }, + { + "epoch": 3.13901490602722, + "grad_norm": 0.0695490911602974, + "learning_rate": 4.7832935775687474e-05, + "loss": 0.2548, + "step": 38748 + }, + { + "epoch": 3.139095917044718, + "grad_norm": 0.05896589532494545, + "learning_rate": 4.7828435123092854e-05, + "loss": 0.2148, + "step": 38749 + }, + { + "epoch": 3.1391769280622164, + "grad_norm": 0.06517542153596878, + "learning_rate": 4.782393447049823e-05, + "loss": 0.2543, + "step": 38750 + }, + { + "epoch": 3.139257939079715, + "grad_norm": 0.07684143632650375, + "learning_rate": 4.7819433817903595e-05, + "loss": 0.2843, + "step": 38751 + }, + { + "epoch": 3.1393389500972133, + "grad_norm": 0.08605005592107773, + "learning_rate": 4.7814933165308975e-05, + "loss": 0.244, + "step": 38752 + }, + { + "epoch": 3.1394199611147116, + "grad_norm": 0.07349022477865219, + "learning_rate": 4.781043251271435e-05, + "loss": 0.2604, + "step": 38753 + }, + { + "epoch": 3.13950097213221, + "grad_norm": 0.07187152653932571, + "learning_rate": 4.7805931860119715e-05, + "loss": 0.2178, + "step": 38754 + }, + { + "epoch": 3.1395819831497085, + "grad_norm": 0.06581337004899979, + "learning_rate": 4.7801431207525096e-05, + "loss": 0.2162, + "step": 38755 + }, + { + "epoch": 3.1396629941672067, + "grad_norm": 0.08627573400735855, + "learning_rate": 4.779693055493047e-05, + "loss": 0.231, + "step": 38756 + }, + { + "epoch": 3.139744005184705, + "grad_norm": 0.06073044613003731, + "learning_rate": 4.7792429902335836e-05, + "loss": 0.2178, + "step": 38757 + }, + { + "epoch": 3.1398250162022037, + "grad_norm": 0.06303252279758453, + "learning_rate": 4.7787929249741217e-05, + "loss": 0.2319, + "step": 38758 + }, + { + "epoch": 3.139906027219702, + "grad_norm": 0.08314767479896545, + "learning_rate": 4.778342859714659e-05, + "loss": 0.2227, + "step": 38759 + }, + { + "epoch": 3.1399870382372, + "grad_norm": 0.06818264722824097, + "learning_rate": 4.777892794455196e-05, + "loss": 0.2166, + "step": 38760 + }, + { + "epoch": 3.140068049254699, + "grad_norm": 0.0643196552991867, + "learning_rate": 4.777442729195734e-05, + "loss": 0.237, + "step": 38761 + }, + { + "epoch": 3.140149060272197, + "grad_norm": 0.07213971763849258, + "learning_rate": 4.776992663936271e-05, + "loss": 0.228, + "step": 38762 + }, + { + "epoch": 3.1402300712896953, + "grad_norm": 0.06124432384967804, + "learning_rate": 4.776542598676808e-05, + "loss": 0.1956, + "step": 38763 + }, + { + "epoch": 3.1403110823071936, + "grad_norm": 0.08000092953443527, + "learning_rate": 4.776092533417346e-05, + "loss": 0.2412, + "step": 38764 + }, + { + "epoch": 3.1403920933246923, + "grad_norm": 0.07622207701206207, + "learning_rate": 4.775642468157883e-05, + "loss": 0.21, + "step": 38765 + }, + { + "epoch": 3.1404731043421905, + "grad_norm": 0.06574143469333649, + "learning_rate": 4.7751924028984205e-05, + "loss": 0.1905, + "step": 38766 + }, + { + "epoch": 3.1405541153596888, + "grad_norm": 0.066102996468544, + "learning_rate": 4.774742337638958e-05, + "loss": 0.2933, + "step": 38767 + }, + { + "epoch": 3.1406351263771874, + "grad_norm": 0.06801317632198334, + "learning_rate": 4.774292272379495e-05, + "loss": 0.2423, + "step": 38768 + }, + { + "epoch": 3.1407161373946857, + "grad_norm": 0.07819636166095734, + "learning_rate": 4.7738422071200326e-05, + "loss": 0.2543, + "step": 38769 + }, + { + "epoch": 3.140797148412184, + 
"grad_norm": 0.07022461295127869, + "learning_rate": 4.77339214186057e-05, + "loss": 0.2368, + "step": 38770 + }, + { + "epoch": 3.1408781594296826, + "grad_norm": 0.06046614423394203, + "learning_rate": 4.772942076601107e-05, + "loss": 0.2212, + "step": 38771 + }, + { + "epoch": 3.140959170447181, + "grad_norm": 0.07519835978746414, + "learning_rate": 4.772492011341645e-05, + "loss": 0.233, + "step": 38772 + }, + { + "epoch": 3.141040181464679, + "grad_norm": 0.06469036638736725, + "learning_rate": 4.772041946082182e-05, + "loss": 0.1934, + "step": 38773 + }, + { + "epoch": 3.141121192482178, + "grad_norm": 0.07506491988897324, + "learning_rate": 4.7715918808227194e-05, + "loss": 0.2136, + "step": 38774 + }, + { + "epoch": 3.141202203499676, + "grad_norm": 0.07341165840625763, + "learning_rate": 4.771141815563257e-05, + "loss": 0.2221, + "step": 38775 + }, + { + "epoch": 3.1412832145171743, + "grad_norm": 0.05977972596883774, + "learning_rate": 4.770691750303794e-05, + "loss": 0.1928, + "step": 38776 + }, + { + "epoch": 3.1413642255346725, + "grad_norm": 0.08076482266187668, + "learning_rate": 4.7702416850443315e-05, + "loss": 0.2165, + "step": 38777 + }, + { + "epoch": 3.1414452365521712, + "grad_norm": 0.06527772545814514, + "learning_rate": 4.769791619784869e-05, + "loss": 0.225, + "step": 38778 + }, + { + "epoch": 3.1415262475696695, + "grad_norm": 0.09196940809488297, + "learning_rate": 4.769341554525407e-05, + "loss": 0.2728, + "step": 38779 + }, + { + "epoch": 3.1416072585871677, + "grad_norm": 0.06105652451515198, + "learning_rate": 4.7688914892659436e-05, + "loss": 0.1956, + "step": 38780 + }, + { + "epoch": 3.1416882696046664, + "grad_norm": 0.07764767855405807, + "learning_rate": 4.768441424006481e-05, + "loss": 0.2564, + "step": 38781 + }, + { + "epoch": 3.1417692806221647, + "grad_norm": 0.08602216839790344, + "learning_rate": 4.767991358747019e-05, + "loss": 0.2395, + "step": 38782 + }, + { + "epoch": 3.141850291639663, + "grad_norm": 0.07574327290058136, + "learning_rate": 4.7675412934875557e-05, + "loss": 0.2313, + "step": 38783 + }, + { + "epoch": 3.141931302657161, + "grad_norm": 0.0708719864487648, + "learning_rate": 4.767091228228093e-05, + "loss": 0.3054, + "step": 38784 + }, + { + "epoch": 3.14201231367466, + "grad_norm": 0.07440514117479324, + "learning_rate": 4.766641162968631e-05, + "loss": 0.2894, + "step": 38785 + }, + { + "epoch": 3.142093324692158, + "grad_norm": 0.04955153539776802, + "learning_rate": 4.766191097709168e-05, + "loss": 0.2001, + "step": 38786 + }, + { + "epoch": 3.1421743357096563, + "grad_norm": 0.07119850814342499, + "learning_rate": 4.765741032449705e-05, + "loss": 0.1937, + "step": 38787 + }, + { + "epoch": 3.142255346727155, + "grad_norm": 0.07241246104240417, + "learning_rate": 4.765290967190243e-05, + "loss": 0.218, + "step": 38788 + }, + { + "epoch": 3.1423363577446533, + "grad_norm": 0.06336984783411026, + "learning_rate": 4.76484090193078e-05, + "loss": 0.2247, + "step": 38789 + }, + { + "epoch": 3.1424173687621515, + "grad_norm": 0.0958334356546402, + "learning_rate": 4.764390836671317e-05, + "loss": 0.2178, + "step": 38790 + }, + { + "epoch": 3.14249837977965, + "grad_norm": 0.08584460616111755, + "learning_rate": 4.763940771411855e-05, + "loss": 0.2244, + "step": 38791 + }, + { + "epoch": 3.1425793907971484, + "grad_norm": 0.07357819378376007, + "learning_rate": 4.7634907061523926e-05, + "loss": 0.2327, + "step": 38792 + }, + { + "epoch": 3.1426604018146467, + "grad_norm": 0.07552488893270493, + "learning_rate": 
4.763040640892929e-05, + "loss": 0.2037, + "step": 38793 + }, + { + "epoch": 3.1427414128321454, + "grad_norm": 0.07349817454814911, + "learning_rate": 4.762590575633467e-05, + "loss": 0.2213, + "step": 38794 + }, + { + "epoch": 3.1428224238496436, + "grad_norm": 0.08728177100419998, + "learning_rate": 4.762140510374005e-05, + "loss": 0.2529, + "step": 38795 + }, + { + "epoch": 3.142903434867142, + "grad_norm": 0.07271800190210342, + "learning_rate": 4.7616904451145413e-05, + "loss": 0.2495, + "step": 38796 + }, + { + "epoch": 3.1429844458846405, + "grad_norm": 0.07111764699220657, + "learning_rate": 4.7612403798550794e-05, + "loss": 0.2178, + "step": 38797 + }, + { + "epoch": 3.143065456902139, + "grad_norm": 0.0855024978518486, + "learning_rate": 4.760790314595617e-05, + "loss": 0.2584, + "step": 38798 + }, + { + "epoch": 3.143146467919637, + "grad_norm": 0.07055241614580154, + "learning_rate": 4.7603402493361534e-05, + "loss": 0.2825, + "step": 38799 + }, + { + "epoch": 3.1432274789371353, + "grad_norm": 0.07589733600616455, + "learning_rate": 4.7598901840766915e-05, + "loss": 0.2257, + "step": 38800 + }, + { + "epoch": 3.143308489954634, + "grad_norm": 0.06895039230585098, + "learning_rate": 4.759440118817229e-05, + "loss": 0.2532, + "step": 38801 + }, + { + "epoch": 3.143389500972132, + "grad_norm": 0.08680260181427002, + "learning_rate": 4.7589900535577655e-05, + "loss": 0.2195, + "step": 38802 + }, + { + "epoch": 3.1434705119896305, + "grad_norm": 0.07699909061193466, + "learning_rate": 4.7585399882983035e-05, + "loss": 0.2348, + "step": 38803 + }, + { + "epoch": 3.143551523007129, + "grad_norm": 0.07933976501226425, + "learning_rate": 4.758089923038841e-05, + "loss": 0.2237, + "step": 38804 + }, + { + "epoch": 3.1436325340246274, + "grad_norm": 0.0559968575835228, + "learning_rate": 4.757639857779378e-05, + "loss": 0.2398, + "step": 38805 + }, + { + "epoch": 3.1437135450421256, + "grad_norm": 0.06536906957626343, + "learning_rate": 4.7571897925199156e-05, + "loss": 0.224, + "step": 38806 + }, + { + "epoch": 3.143794556059624, + "grad_norm": 0.0685034990310669, + "learning_rate": 4.756739727260453e-05, + "loss": 0.2441, + "step": 38807 + }, + { + "epoch": 3.1438755670771226, + "grad_norm": 0.07415509968996048, + "learning_rate": 4.7562896620009903e-05, + "loss": 0.2, + "step": 38808 + }, + { + "epoch": 3.143956578094621, + "grad_norm": 0.06640369445085526, + "learning_rate": 4.755839596741528e-05, + "loss": 0.2082, + "step": 38809 + }, + { + "epoch": 3.144037589112119, + "grad_norm": 0.07083263993263245, + "learning_rate": 4.755389531482065e-05, + "loss": 0.2412, + "step": 38810 + }, + { + "epoch": 3.1441186001296177, + "grad_norm": 0.08034813404083252, + "learning_rate": 4.7549394662226024e-05, + "loss": 0.261, + "step": 38811 + }, + { + "epoch": 3.144199611147116, + "grad_norm": 0.06492961943149567, + "learning_rate": 4.75448940096314e-05, + "loss": 0.2251, + "step": 38812 + }, + { + "epoch": 3.1442806221646142, + "grad_norm": 0.07724326848983765, + "learning_rate": 4.754039335703677e-05, + "loss": 0.2567, + "step": 38813 + }, + { + "epoch": 3.144361633182113, + "grad_norm": 0.07996118068695068, + "learning_rate": 4.7535892704442145e-05, + "loss": 0.3033, + "step": 38814 + }, + { + "epoch": 3.144442644199611, + "grad_norm": 0.07107841223478317, + "learning_rate": 4.753139205184752e-05, + "loss": 0.23, + "step": 38815 + }, + { + "epoch": 3.1445236552171094, + "grad_norm": 0.06783238053321838, + "learning_rate": 4.752689139925289e-05, + "loss": 0.2227, + "step": 38816 + }, + { + 
"epoch": 3.144604666234608, + "grad_norm": 0.07720325887203217, + "learning_rate": 4.7522390746658266e-05, + "loss": 0.2294, + "step": 38817 + }, + { + "epoch": 3.1446856772521063, + "grad_norm": 0.08420754224061966, + "learning_rate": 4.7517890094063646e-05, + "loss": 0.2506, + "step": 38818 + }, + { + "epoch": 3.1447666882696046, + "grad_norm": 0.06361111998558044, + "learning_rate": 4.751338944146901e-05, + "loss": 0.2415, + "step": 38819 + }, + { + "epoch": 3.144847699287103, + "grad_norm": 0.07500552386045456, + "learning_rate": 4.750888878887439e-05, + "loss": 0.2436, + "step": 38820 + }, + { + "epoch": 3.1449287103046015, + "grad_norm": 0.08437523990869522, + "learning_rate": 4.750438813627977e-05, + "loss": 0.267, + "step": 38821 + }, + { + "epoch": 3.1450097213220998, + "grad_norm": 0.06883621960878372, + "learning_rate": 4.7499887483685134e-05, + "loss": 0.2292, + "step": 38822 + }, + { + "epoch": 3.145090732339598, + "grad_norm": 0.07928171753883362, + "learning_rate": 4.749538683109051e-05, + "loss": 0.2373, + "step": 38823 + }, + { + "epoch": 3.1451717433570967, + "grad_norm": 0.08163254708051682, + "learning_rate": 4.749088617849589e-05, + "loss": 0.2215, + "step": 38824 + }, + { + "epoch": 3.145252754374595, + "grad_norm": 0.08181016147136688, + "learning_rate": 4.7486385525901255e-05, + "loss": 0.2322, + "step": 38825 + }, + { + "epoch": 3.145333765392093, + "grad_norm": 0.08071200549602509, + "learning_rate": 4.748188487330663e-05, + "loss": 0.2189, + "step": 38826 + }, + { + "epoch": 3.145414776409592, + "grad_norm": 0.06839993596076965, + "learning_rate": 4.747738422071201e-05, + "loss": 0.2245, + "step": 38827 + }, + { + "epoch": 3.14549578742709, + "grad_norm": 0.07583677768707275, + "learning_rate": 4.7472883568117376e-05, + "loss": 0.2302, + "step": 38828 + }, + { + "epoch": 3.1455767984445884, + "grad_norm": 0.06504601240158081, + "learning_rate": 4.746838291552275e-05, + "loss": 0.2034, + "step": 38829 + }, + { + "epoch": 3.1456578094620866, + "grad_norm": 0.08468101173639297, + "learning_rate": 4.746388226292813e-05, + "loss": 0.2392, + "step": 38830 + }, + { + "epoch": 3.1457388204795853, + "grad_norm": 0.07652562111616135, + "learning_rate": 4.74593816103335e-05, + "loss": 0.2191, + "step": 38831 + }, + { + "epoch": 3.1458198314970836, + "grad_norm": 0.06976919621229172, + "learning_rate": 4.745488095773887e-05, + "loss": 0.25, + "step": 38832 + }, + { + "epoch": 3.145900842514582, + "grad_norm": 0.08537302166223526, + "learning_rate": 4.745038030514425e-05, + "loss": 0.21, + "step": 38833 + }, + { + "epoch": 3.1459818535320805, + "grad_norm": 0.07848033308982849, + "learning_rate": 4.7445879652549624e-05, + "loss": 0.2535, + "step": 38834 + }, + { + "epoch": 3.1460628645495787, + "grad_norm": 0.07927160710096359, + "learning_rate": 4.744137899995499e-05, + "loss": 0.2196, + "step": 38835 + }, + { + "epoch": 3.146143875567077, + "grad_norm": 0.09206971526145935, + "learning_rate": 4.743687834736037e-05, + "loss": 0.2628, + "step": 38836 + }, + { + "epoch": 3.1462248865845757, + "grad_norm": 0.06408418715000153, + "learning_rate": 4.7432377694765745e-05, + "loss": 0.2262, + "step": 38837 + }, + { + "epoch": 3.146305897602074, + "grad_norm": 0.07233559340238571, + "learning_rate": 4.742787704217111e-05, + "loss": 0.2381, + "step": 38838 + }, + { + "epoch": 3.146386908619572, + "grad_norm": 0.0643056184053421, + "learning_rate": 4.742337638957649e-05, + "loss": 0.1962, + "step": 38839 + }, + { + "epoch": 3.146467919637071, + "grad_norm": 0.08704189956188202, + 
"learning_rate": 4.7418875736981866e-05, + "loss": 0.2575, + "step": 38840 + }, + { + "epoch": 3.146548930654569, + "grad_norm": 0.06243178993463516, + "learning_rate": 4.741437508438724e-05, + "loss": 0.2182, + "step": 38841 + }, + { + "epoch": 3.1466299416720673, + "grad_norm": 0.06110955774784088, + "learning_rate": 4.740987443179261e-05, + "loss": 0.2205, + "step": 38842 + }, + { + "epoch": 3.1467109526895656, + "grad_norm": 0.07244785875082016, + "learning_rate": 4.7405373779197986e-05, + "loss": 0.2538, + "step": 38843 + }, + { + "epoch": 3.1467919637070643, + "grad_norm": 0.07918354123830795, + "learning_rate": 4.740087312660336e-05, + "loss": 0.2611, + "step": 38844 + }, + { + "epoch": 3.1468729747245625, + "grad_norm": 0.06351567059755325, + "learning_rate": 4.7396372474008734e-05, + "loss": 0.2168, + "step": 38845 + }, + { + "epoch": 3.1469539857420608, + "grad_norm": 0.07483189553022385, + "learning_rate": 4.739187182141411e-05, + "loss": 0.2444, + "step": 38846 + }, + { + "epoch": 3.1470349967595594, + "grad_norm": 0.06781207770109177, + "learning_rate": 4.738737116881948e-05, + "loss": 0.2113, + "step": 38847 + }, + { + "epoch": 3.1471160077770577, + "grad_norm": 0.08817379921674728, + "learning_rate": 4.7382870516224854e-05, + "loss": 0.2752, + "step": 38848 + }, + { + "epoch": 3.147197018794556, + "grad_norm": 0.0680769756436348, + "learning_rate": 4.737836986363023e-05, + "loss": 0.2228, + "step": 38849 + }, + { + "epoch": 3.1472780298120546, + "grad_norm": 0.06445880234241486, + "learning_rate": 4.73738692110356e-05, + "loss": 0.2545, + "step": 38850 + }, + { + "epoch": 3.147359040829553, + "grad_norm": 0.07427272945642471, + "learning_rate": 4.7369368558440975e-05, + "loss": 0.186, + "step": 38851 + }, + { + "epoch": 3.147440051847051, + "grad_norm": 0.0860607773065567, + "learning_rate": 4.736486790584635e-05, + "loss": 0.2141, + "step": 38852 + }, + { + "epoch": 3.1475210628645494, + "grad_norm": 0.057298026978969574, + "learning_rate": 4.736036725325172e-05, + "loss": 0.2251, + "step": 38853 + }, + { + "epoch": 3.147602073882048, + "grad_norm": 0.0735674574971199, + "learning_rate": 4.7355866600657096e-05, + "loss": 0.2598, + "step": 38854 + }, + { + "epoch": 3.1476830848995463, + "grad_norm": 0.07864909619092941, + "learning_rate": 4.735136594806247e-05, + "loss": 0.2058, + "step": 38855 + }, + { + "epoch": 3.1477640959170445, + "grad_norm": 0.0691840648651123, + "learning_rate": 4.734686529546784e-05, + "loss": 0.2297, + "step": 38856 + }, + { + "epoch": 3.1478451069345432, + "grad_norm": 0.0769876092672348, + "learning_rate": 4.7342364642873224e-05, + "loss": 0.2166, + "step": 38857 + }, + { + "epoch": 3.1479261179520415, + "grad_norm": 0.08170641958713531, + "learning_rate": 4.733786399027859e-05, + "loss": 0.2353, + "step": 38858 + }, + { + "epoch": 3.1480071289695397, + "grad_norm": 0.06473393738269806, + "learning_rate": 4.7333363337683964e-05, + "loss": 0.1885, + "step": 38859 + }, + { + "epoch": 3.1480881399870384, + "grad_norm": 0.07050579786300659, + "learning_rate": 4.7328862685089344e-05, + "loss": 0.2376, + "step": 38860 + }, + { + "epoch": 3.1481691510045366, + "grad_norm": 0.0721215307712555, + "learning_rate": 4.732436203249471e-05, + "loss": 0.2477, + "step": 38861 + }, + { + "epoch": 3.148250162022035, + "grad_norm": 0.08369814604520798, + "learning_rate": 4.7319861379900085e-05, + "loss": 0.2383, + "step": 38862 + }, + { + "epoch": 3.1483311730395336, + "grad_norm": 0.0835237056016922, + "learning_rate": 4.7315360727305465e-05, + "loss": 0.244, + 
"step": 38863 + }, + { + "epoch": 3.148412184057032, + "grad_norm": 0.062347088009119034, + "learning_rate": 4.731086007471083e-05, + "loss": 0.2307, + "step": 38864 + }, + { + "epoch": 3.14849319507453, + "grad_norm": 0.07704142481088638, + "learning_rate": 4.7306359422116206e-05, + "loss": 0.2462, + "step": 38865 + }, + { + "epoch": 3.1485742060920283, + "grad_norm": 0.08721894025802612, + "learning_rate": 4.7301858769521586e-05, + "loss": 0.2395, + "step": 38866 + }, + { + "epoch": 3.148655217109527, + "grad_norm": 0.06161055341362953, + "learning_rate": 4.729735811692695e-05, + "loss": 0.2691, + "step": 38867 + }, + { + "epoch": 3.1487362281270252, + "grad_norm": 0.07100638747215271, + "learning_rate": 4.7292857464332326e-05, + "loss": 0.2484, + "step": 38868 + }, + { + "epoch": 3.1488172391445235, + "grad_norm": 0.08029984682798386, + "learning_rate": 4.728835681173771e-05, + "loss": 0.2605, + "step": 38869 + }, + { + "epoch": 3.148898250162022, + "grad_norm": 0.06359708309173584, + "learning_rate": 4.728385615914308e-05, + "loss": 0.2368, + "step": 38870 + }, + { + "epoch": 3.1489792611795204, + "grad_norm": 0.06722856312990189, + "learning_rate": 4.727935550654845e-05, + "loss": 0.2509, + "step": 38871 + }, + { + "epoch": 3.1490602721970187, + "grad_norm": 0.08963049203157425, + "learning_rate": 4.727485485395383e-05, + "loss": 0.2237, + "step": 38872 + }, + { + "epoch": 3.1491412832145174, + "grad_norm": 0.08066631853580475, + "learning_rate": 4.72703542013592e-05, + "loss": 0.2128, + "step": 38873 + }, + { + "epoch": 3.1492222942320156, + "grad_norm": 0.07186246663331985, + "learning_rate": 4.7265853548764575e-05, + "loss": 0.2421, + "step": 38874 + }, + { + "epoch": 3.149303305249514, + "grad_norm": 0.0749310553073883, + "learning_rate": 4.726135289616995e-05, + "loss": 0.2125, + "step": 38875 + }, + { + "epoch": 3.149384316267012, + "grad_norm": 0.0684628114104271, + "learning_rate": 4.725685224357532e-05, + "loss": 0.2271, + "step": 38876 + }, + { + "epoch": 3.149465327284511, + "grad_norm": 0.08189097046852112, + "learning_rate": 4.7252351590980696e-05, + "loss": 0.2347, + "step": 38877 + }, + { + "epoch": 3.149546338302009, + "grad_norm": 0.06720206886529922, + "learning_rate": 4.724785093838607e-05, + "loss": 0.2182, + "step": 38878 + }, + { + "epoch": 3.1496273493195073, + "grad_norm": 0.07193852961063385, + "learning_rate": 4.724335028579144e-05, + "loss": 0.2394, + "step": 38879 + }, + { + "epoch": 3.149708360337006, + "grad_norm": 0.06983591616153717, + "learning_rate": 4.7238849633196816e-05, + "loss": 0.2323, + "step": 38880 + }, + { + "epoch": 3.149789371354504, + "grad_norm": 0.06906203180551529, + "learning_rate": 4.723434898060219e-05, + "loss": 0.2282, + "step": 38881 + }, + { + "epoch": 3.1498703823720025, + "grad_norm": 0.0820295587182045, + "learning_rate": 4.7229848328007564e-05, + "loss": 0.2498, + "step": 38882 + }, + { + "epoch": 3.149951393389501, + "grad_norm": 0.07069437205791473, + "learning_rate": 4.722534767541294e-05, + "loss": 0.2332, + "step": 38883 + }, + { + "epoch": 3.1500324044069994, + "grad_norm": 0.06761626899242401, + "learning_rate": 4.722084702281831e-05, + "loss": 0.2758, + "step": 38884 + }, + { + "epoch": 3.1501134154244976, + "grad_norm": 0.07170595228672028, + "learning_rate": 4.7216346370223684e-05, + "loss": 0.2273, + "step": 38885 + }, + { + "epoch": 3.1501944264419963, + "grad_norm": 0.0710739716887474, + "learning_rate": 4.721184571762906e-05, + "loss": 0.2268, + "step": 38886 + }, + { + "epoch": 3.1502754374594946, + 
"grad_norm": 0.08929024636745453, + "learning_rate": 4.720734506503443e-05, + "loss": 0.245, + "step": 38887 + }, + { + "epoch": 3.150356448476993, + "grad_norm": 0.0899534523487091, + "learning_rate": 4.7202844412439805e-05, + "loss": 0.2408, + "step": 38888 + }, + { + "epoch": 3.150437459494491, + "grad_norm": 0.07042014598846436, + "learning_rate": 4.719834375984518e-05, + "loss": 0.205, + "step": 38889 + }, + { + "epoch": 3.1505184705119897, + "grad_norm": 0.06146010756492615, + "learning_rate": 4.719384310725055e-05, + "loss": 0.2299, + "step": 38890 + }, + { + "epoch": 3.150599481529488, + "grad_norm": 0.08242175728082657, + "learning_rate": 4.7189342454655926e-05, + "loss": 0.2746, + "step": 38891 + }, + { + "epoch": 3.1506804925469862, + "grad_norm": 0.06836254894733429, + "learning_rate": 4.71848418020613e-05, + "loss": 0.2107, + "step": 38892 + }, + { + "epoch": 3.150761503564485, + "grad_norm": 0.07505775988101959, + "learning_rate": 4.718034114946667e-05, + "loss": 0.2615, + "step": 38893 + }, + { + "epoch": 3.150842514581983, + "grad_norm": 0.07807854562997818, + "learning_rate": 4.717584049687205e-05, + "loss": 0.2545, + "step": 38894 + }, + { + "epoch": 3.1509235255994814, + "grad_norm": 0.05729011446237564, + "learning_rate": 4.717133984427742e-05, + "loss": 0.2179, + "step": 38895 + }, + { + "epoch": 3.15100453661698, + "grad_norm": 0.06013137102127075, + "learning_rate": 4.71668391916828e-05, + "loss": 0.2246, + "step": 38896 + }, + { + "epoch": 3.1510855476344783, + "grad_norm": 0.07943231612443924, + "learning_rate": 4.716233853908817e-05, + "loss": 0.2535, + "step": 38897 + }, + { + "epoch": 3.1511665586519766, + "grad_norm": 0.058542679995298386, + "learning_rate": 4.715783788649354e-05, + "loss": 0.2031, + "step": 38898 + }, + { + "epoch": 3.151247569669475, + "grad_norm": 0.06623531132936478, + "learning_rate": 4.715333723389892e-05, + "loss": 0.2113, + "step": 38899 + }, + { + "epoch": 3.1513285806869735, + "grad_norm": 0.0729438066482544, + "learning_rate": 4.714883658130429e-05, + "loss": 0.2176, + "step": 38900 + }, + { + "epoch": 3.1514095917044718, + "grad_norm": 0.07456063479185104, + "learning_rate": 4.714433592870966e-05, + "loss": 0.2304, + "step": 38901 + }, + { + "epoch": 3.15149060272197, + "grad_norm": 0.0609896220266819, + "learning_rate": 4.713983527611504e-05, + "loss": 0.2089, + "step": 38902 + }, + { + "epoch": 3.1515716137394687, + "grad_norm": 0.05986235663294792, + "learning_rate": 4.713533462352041e-05, + "loss": 0.2242, + "step": 38903 + }, + { + "epoch": 3.151652624756967, + "grad_norm": 0.057405129075050354, + "learning_rate": 4.713083397092578e-05, + "loss": 0.2367, + "step": 38904 + }, + { + "epoch": 3.151733635774465, + "grad_norm": 0.06006966903805733, + "learning_rate": 4.712633331833116e-05, + "loss": 0.2318, + "step": 38905 + }, + { + "epoch": 3.151814646791964, + "grad_norm": 0.0800834521651268, + "learning_rate": 4.712183266573653e-05, + "loss": 0.2803, + "step": 38906 + }, + { + "epoch": 3.151895657809462, + "grad_norm": 0.07425723969936371, + "learning_rate": 4.711733201314191e-05, + "loss": 0.27, + "step": 38907 + }, + { + "epoch": 3.1519766688269604, + "grad_norm": 0.07813823223114014, + "learning_rate": 4.7112831360547284e-05, + "loss": 0.2549, + "step": 38908 + }, + { + "epoch": 3.152057679844459, + "grad_norm": 0.08356890082359314, + "learning_rate": 4.710833070795266e-05, + "loss": 0.234, + "step": 38909 + }, + { + "epoch": 3.1521386908619573, + "grad_norm": 0.08138100802898407, + "learning_rate": 4.710383005535803e-05, + 
"loss": 0.2303, + "step": 38910 + }, + { + "epoch": 3.1522197018794555, + "grad_norm": 0.06799155473709106, + "learning_rate": 4.7099329402763405e-05, + "loss": 0.2413, + "step": 38911 + }, + { + "epoch": 3.152300712896954, + "grad_norm": 0.07273134589195251, + "learning_rate": 4.709482875016878e-05, + "loss": 0.1978, + "step": 38912 + }, + { + "epoch": 3.1523817239144525, + "grad_norm": 0.07886601239442825, + "learning_rate": 4.709032809757415e-05, + "loss": 0.2509, + "step": 38913 + }, + { + "epoch": 3.1524627349319507, + "grad_norm": 0.06748409569263458, + "learning_rate": 4.7085827444979526e-05, + "loss": 0.2317, + "step": 38914 + }, + { + "epoch": 3.152543745949449, + "grad_norm": 0.07170841842889786, + "learning_rate": 4.70813267923849e-05, + "loss": 0.2116, + "step": 38915 + }, + { + "epoch": 3.1526247569669477, + "grad_norm": 0.09538058191537857, + "learning_rate": 4.707682613979027e-05, + "loss": 0.2623, + "step": 38916 + }, + { + "epoch": 3.152705767984446, + "grad_norm": 0.06135268881917, + "learning_rate": 4.7072325487195647e-05, + "loss": 0.2094, + "step": 38917 + }, + { + "epoch": 3.152786779001944, + "grad_norm": 0.07493219524621964, + "learning_rate": 4.706782483460102e-05, + "loss": 0.2838, + "step": 38918 + }, + { + "epoch": 3.152867790019443, + "grad_norm": 0.06381876766681671, + "learning_rate": 4.7063324182006394e-05, + "loss": 0.2348, + "step": 38919 + }, + { + "epoch": 3.152948801036941, + "grad_norm": 0.08309265226125717, + "learning_rate": 4.705882352941177e-05, + "loss": 0.2542, + "step": 38920 + }, + { + "epoch": 3.1530298120544393, + "grad_norm": 0.06406621634960175, + "learning_rate": 4.705432287681714e-05, + "loss": 0.2181, + "step": 38921 + }, + { + "epoch": 3.1531108230719376, + "grad_norm": 0.06495659798383713, + "learning_rate": 4.7049822224222515e-05, + "loss": 0.2456, + "step": 38922 + }, + { + "epoch": 3.1531918340894363, + "grad_norm": 0.05980648100376129, + "learning_rate": 4.704532157162789e-05, + "loss": 0.2007, + "step": 38923 + }, + { + "epoch": 3.1532728451069345, + "grad_norm": 0.07143256813287735, + "learning_rate": 4.704082091903326e-05, + "loss": 0.1986, + "step": 38924 + }, + { + "epoch": 3.1533538561244328, + "grad_norm": 0.06251020729541779, + "learning_rate": 4.7036320266438635e-05, + "loss": 0.2348, + "step": 38925 + }, + { + "epoch": 3.1534348671419314, + "grad_norm": 0.0595199279487133, + "learning_rate": 4.703181961384401e-05, + "loss": 0.2211, + "step": 38926 + }, + { + "epoch": 3.1535158781594297, + "grad_norm": 0.059528898447752, + "learning_rate": 4.702731896124938e-05, + "loss": 0.2235, + "step": 38927 + }, + { + "epoch": 3.153596889176928, + "grad_norm": 0.07514117658138275, + "learning_rate": 4.7022818308654756e-05, + "loss": 0.2576, + "step": 38928 + }, + { + "epoch": 3.1536779001944266, + "grad_norm": 0.06593476235866547, + "learning_rate": 4.701831765606013e-05, + "loss": 0.2625, + "step": 38929 + }, + { + "epoch": 3.153758911211925, + "grad_norm": 0.0637379065155983, + "learning_rate": 4.7013817003465503e-05, + "loss": 0.2317, + "step": 38930 + }, + { + "epoch": 3.153839922229423, + "grad_norm": 0.07333981990814209, + "learning_rate": 4.700931635087088e-05, + "loss": 0.2393, + "step": 38931 + }, + { + "epoch": 3.153920933246922, + "grad_norm": 0.06379441171884537, + "learning_rate": 4.700481569827625e-05, + "loss": 0.2496, + "step": 38932 + }, + { + "epoch": 3.15400194426442, + "grad_norm": 0.05707908794283867, + "learning_rate": 4.7000315045681624e-05, + "loss": 0.2441, + "step": 38933 + }, + { + "epoch": 
3.1540829552819183, + "grad_norm": 0.07235048711299896, + "learning_rate": 4.6995814393087e-05, + "loss": 0.2207, + "step": 38934 + }, + { + "epoch": 3.1541639662994165, + "grad_norm": 0.07200153917074203, + "learning_rate": 4.699131374049237e-05, + "loss": 0.2249, + "step": 38935 + }, + { + "epoch": 3.154244977316915, + "grad_norm": 0.06244270130991936, + "learning_rate": 4.6986813087897745e-05, + "loss": 0.1957, + "step": 38936 + }, + { + "epoch": 3.1543259883344135, + "grad_norm": 0.06604592502117157, + "learning_rate": 4.698231243530312e-05, + "loss": 0.1946, + "step": 38937 + }, + { + "epoch": 3.1544069993519117, + "grad_norm": 0.08157742023468018, + "learning_rate": 4.69778117827085e-05, + "loss": 0.2259, + "step": 38938 + }, + { + "epoch": 3.1544880103694104, + "grad_norm": 0.07753434032201767, + "learning_rate": 4.6973311130113866e-05, + "loss": 0.2168, + "step": 38939 + }, + { + "epoch": 3.1545690213869086, + "grad_norm": 0.06853404641151428, + "learning_rate": 4.696881047751924e-05, + "loss": 0.2584, + "step": 38940 + }, + { + "epoch": 3.154650032404407, + "grad_norm": 0.0636419877409935, + "learning_rate": 4.696430982492462e-05, + "loss": 0.2202, + "step": 38941 + }, + { + "epoch": 3.1547310434219056, + "grad_norm": 0.0700618326663971, + "learning_rate": 4.695980917232999e-05, + "loss": 0.2059, + "step": 38942 + }, + { + "epoch": 3.154812054439404, + "grad_norm": 0.057667315006256104, + "learning_rate": 4.695530851973537e-05, + "loss": 0.2114, + "step": 38943 + }, + { + "epoch": 3.154893065456902, + "grad_norm": 0.060890886932611465, + "learning_rate": 4.695080786714074e-05, + "loss": 0.2234, + "step": 38944 + }, + { + "epoch": 3.1549740764744003, + "grad_norm": 0.06949368119239807, + "learning_rate": 4.694630721454611e-05, + "loss": 0.2236, + "step": 38945 + }, + { + "epoch": 3.155055087491899, + "grad_norm": 0.06916090846061707, + "learning_rate": 4.694180656195149e-05, + "loss": 0.2473, + "step": 38946 + }, + { + "epoch": 3.1551360985093972, + "grad_norm": 0.0846572294831276, + "learning_rate": 4.693730590935686e-05, + "loss": 0.2042, + "step": 38947 + }, + { + "epoch": 3.1552171095268955, + "grad_norm": 0.08265434950590134, + "learning_rate": 4.693280525676223e-05, + "loss": 0.2485, + "step": 38948 + }, + { + "epoch": 3.155298120544394, + "grad_norm": 0.09176456183195114, + "learning_rate": 4.692830460416761e-05, + "loss": 0.2483, + "step": 38949 + }, + { + "epoch": 3.1553791315618924, + "grad_norm": 0.07183399051427841, + "learning_rate": 4.692380395157298e-05, + "loss": 0.2667, + "step": 38950 + }, + { + "epoch": 3.1554601425793907, + "grad_norm": 0.07050126045942307, + "learning_rate": 4.6919303298978356e-05, + "loss": 0.2226, + "step": 38951 + }, + { + "epoch": 3.1555411535968894, + "grad_norm": 0.062021028250455856, + "learning_rate": 4.691480264638373e-05, + "loss": 0.2326, + "step": 38952 + }, + { + "epoch": 3.1556221646143876, + "grad_norm": 0.09520271420478821, + "learning_rate": 4.69103019937891e-05, + "loss": 0.2574, + "step": 38953 + }, + { + "epoch": 3.155703175631886, + "grad_norm": 0.08078912645578384, + "learning_rate": 4.690580134119448e-05, + "loss": 0.2703, + "step": 38954 + }, + { + "epoch": 3.1557841866493845, + "grad_norm": 0.06901061534881592, + "learning_rate": 4.690130068859985e-05, + "loss": 0.2129, + "step": 38955 + }, + { + "epoch": 3.155865197666883, + "grad_norm": 0.06271662563085556, + "learning_rate": 4.6896800036005224e-05, + "loss": 0.2145, + "step": 38956 + }, + { + "epoch": 3.155946208684381, + "grad_norm": 0.058658353984355927, + 
"learning_rate": 4.68922993834106e-05, + "loss": 0.1991, + "step": 38957 + }, + { + "epoch": 3.1560272197018793, + "grad_norm": 0.06902632862329483, + "learning_rate": 4.688779873081597e-05, + "loss": 0.2, + "step": 38958 + }, + { + "epoch": 3.156108230719378, + "grad_norm": 0.08432221412658691, + "learning_rate": 4.6883298078221345e-05, + "loss": 0.2528, + "step": 38959 + }, + { + "epoch": 3.156189241736876, + "grad_norm": 0.07740698009729385, + "learning_rate": 4.687879742562672e-05, + "loss": 0.2442, + "step": 38960 + }, + { + "epoch": 3.1562702527543745, + "grad_norm": 0.06914656609296799, + "learning_rate": 4.687429677303209e-05, + "loss": 0.1891, + "step": 38961 + }, + { + "epoch": 3.156351263771873, + "grad_norm": 0.07384679466485977, + "learning_rate": 4.6869796120437465e-05, + "loss": 0.2505, + "step": 38962 + }, + { + "epoch": 3.1564322747893714, + "grad_norm": 0.0760929062962532, + "learning_rate": 4.686529546784284e-05, + "loss": 0.2673, + "step": 38963 + }, + { + "epoch": 3.1565132858068696, + "grad_norm": 0.0864713042974472, + "learning_rate": 4.686079481524821e-05, + "loss": 0.2573, + "step": 38964 + }, + { + "epoch": 3.1565942968243683, + "grad_norm": 0.06917452812194824, + "learning_rate": 4.6856294162653586e-05, + "loss": 0.2287, + "step": 38965 + }, + { + "epoch": 3.1566753078418666, + "grad_norm": 0.06080272048711777, + "learning_rate": 4.685179351005896e-05, + "loss": 0.2278, + "step": 38966 + }, + { + "epoch": 3.156756318859365, + "grad_norm": 0.07739540934562683, + "learning_rate": 4.6847292857464333e-05, + "loss": 0.2087, + "step": 38967 + }, + { + "epoch": 3.156837329876863, + "grad_norm": 0.07544900476932526, + "learning_rate": 4.684279220486971e-05, + "loss": 0.2143, + "step": 38968 + }, + { + "epoch": 3.1569183408943617, + "grad_norm": 0.08161510527133942, + "learning_rate": 4.683829155227508e-05, + "loss": 0.2461, + "step": 38969 + }, + { + "epoch": 3.15699935191186, + "grad_norm": 0.06656242161989212, + "learning_rate": 4.6833790899680454e-05, + "loss": 0.2085, + "step": 38970 + }, + { + "epoch": 3.1570803629293582, + "grad_norm": 0.07763206958770752, + "learning_rate": 4.682929024708583e-05, + "loss": 0.2502, + "step": 38971 + }, + { + "epoch": 3.157161373946857, + "grad_norm": 0.07088419049978256, + "learning_rate": 4.68247895944912e-05, + "loss": 0.2343, + "step": 38972 + }, + { + "epoch": 3.157242384964355, + "grad_norm": 0.06977204233407974, + "learning_rate": 4.6820288941896575e-05, + "loss": 0.2188, + "step": 38973 + }, + { + "epoch": 3.1573233959818534, + "grad_norm": 0.07479950040578842, + "learning_rate": 4.681578828930195e-05, + "loss": 0.1926, + "step": 38974 + }, + { + "epoch": 3.157404406999352, + "grad_norm": 0.06984470039606094, + "learning_rate": 4.681128763670732e-05, + "loss": 0.2006, + "step": 38975 + }, + { + "epoch": 3.1574854180168503, + "grad_norm": 0.07612467557191849, + "learning_rate": 4.68067869841127e-05, + "loss": 0.2489, + "step": 38976 + }, + { + "epoch": 3.1575664290343486, + "grad_norm": 0.07083283364772797, + "learning_rate": 4.6802286331518076e-05, + "loss": 0.2683, + "step": 38977 + }, + { + "epoch": 3.1576474400518473, + "grad_norm": 0.07045019418001175, + "learning_rate": 4.679778567892344e-05, + "loss": 0.2393, + "step": 38978 + }, + { + "epoch": 3.1577284510693455, + "grad_norm": 0.0706915408372879, + "learning_rate": 4.6793285026328824e-05, + "loss": 0.2159, + "step": 38979 + }, + { + "epoch": 3.1578094620868438, + "grad_norm": 0.07124581187963486, + "learning_rate": 4.67887843737342e-05, + "loss": 0.251, + "step": 
38980 + }, + { + "epoch": 3.157890473104342, + "grad_norm": 0.062179356813430786, + "learning_rate": 4.6784283721139564e-05, + "loss": 0.2553, + "step": 38981 + }, + { + "epoch": 3.1579714841218407, + "grad_norm": 0.08533891290426254, + "learning_rate": 4.6779783068544944e-05, + "loss": 0.2151, + "step": 38982 + }, + { + "epoch": 3.158052495139339, + "grad_norm": 0.07565402239561081, + "learning_rate": 4.677528241595032e-05, + "loss": 0.2199, + "step": 38983 + }, + { + "epoch": 3.158133506156837, + "grad_norm": 0.06953442096710205, + "learning_rate": 4.6770781763355685e-05, + "loss": 0.252, + "step": 38984 + }, + { + "epoch": 3.158214517174336, + "grad_norm": 0.06412345916032791, + "learning_rate": 4.6766281110761065e-05, + "loss": 0.2094, + "step": 38985 + }, + { + "epoch": 3.158295528191834, + "grad_norm": 0.06534215062856674, + "learning_rate": 4.676178045816644e-05, + "loss": 0.2428, + "step": 38986 + }, + { + "epoch": 3.1583765392093324, + "grad_norm": 0.06917164474725723, + "learning_rate": 4.6757279805571806e-05, + "loss": 0.2255, + "step": 38987 + }, + { + "epoch": 3.158457550226831, + "grad_norm": 0.06312058866024017, + "learning_rate": 4.6752779152977186e-05, + "loss": 0.1838, + "step": 38988 + }, + { + "epoch": 3.1585385612443293, + "grad_norm": 0.08812984824180603, + "learning_rate": 4.674827850038256e-05, + "loss": 0.2362, + "step": 38989 + }, + { + "epoch": 3.1586195722618275, + "grad_norm": 0.08063977211713791, + "learning_rate": 4.674377784778793e-05, + "loss": 0.2505, + "step": 38990 + }, + { + "epoch": 3.158700583279326, + "grad_norm": 0.06520045548677444, + "learning_rate": 4.673927719519331e-05, + "loss": 0.2504, + "step": 38991 + }, + { + "epoch": 3.1587815942968245, + "grad_norm": 0.07683936506509781, + "learning_rate": 4.673477654259868e-05, + "loss": 0.239, + "step": 38992 + }, + { + "epoch": 3.1588626053143227, + "grad_norm": 0.06478267163038254, + "learning_rate": 4.6730275890004054e-05, + "loss": 0.2414, + "step": 38993 + }, + { + "epoch": 3.158943616331821, + "grad_norm": 0.07072675973176956, + "learning_rate": 4.672577523740943e-05, + "loss": 0.2405, + "step": 38994 + }, + { + "epoch": 3.1590246273493197, + "grad_norm": 0.06452537328004837, + "learning_rate": 4.67212745848148e-05, + "loss": 0.2209, + "step": 38995 + }, + { + "epoch": 3.159105638366818, + "grad_norm": 0.07300246506929398, + "learning_rate": 4.6716773932220175e-05, + "loss": 0.2558, + "step": 38996 + }, + { + "epoch": 3.159186649384316, + "grad_norm": 0.08771176636219025, + "learning_rate": 4.671227327962555e-05, + "loss": 0.2834, + "step": 38997 + }, + { + "epoch": 3.159267660401815, + "grad_norm": 0.06920316070318222, + "learning_rate": 4.670777262703092e-05, + "loss": 0.2021, + "step": 38998 + }, + { + "epoch": 3.159348671419313, + "grad_norm": 0.07819127291440964, + "learning_rate": 4.6703271974436296e-05, + "loss": 0.2364, + "step": 38999 + }, + { + "epoch": 3.1594296824368113, + "grad_norm": 0.06422830373048782, + "learning_rate": 4.669877132184167e-05, + "loss": 0.2174, + "step": 39000 + }, + { + "epoch": 3.15951069345431, + "grad_norm": 0.0757855623960495, + "learning_rate": 4.669427066924704e-05, + "loss": 0.214, + "step": 39001 + }, + { + "epoch": 3.1595917044718083, + "grad_norm": 0.058662716299295425, + "learning_rate": 4.6689770016652416e-05, + "loss": 0.174, + "step": 39002 + }, + { + "epoch": 3.1596727154893065, + "grad_norm": 0.07447095215320587, + "learning_rate": 4.668526936405779e-05, + "loss": 0.2509, + "step": 39003 + }, + { + "epoch": 3.1597537265068047, + "grad_norm": 
0.06807708740234375, + "learning_rate": 4.6680768711463164e-05, + "loss": 0.2254, + "step": 39004 + }, + { + "epoch": 3.1598347375243034, + "grad_norm": 0.0702623799443245, + "learning_rate": 4.667626805886854e-05, + "loss": 0.2911, + "step": 39005 + }, + { + "epoch": 3.1599157485418017, + "grad_norm": 0.07235629111528397, + "learning_rate": 4.667176740627391e-05, + "loss": 0.2326, + "step": 39006 + }, + { + "epoch": 3.1599967595593, + "grad_norm": 0.0885845422744751, + "learning_rate": 4.6667266753679284e-05, + "loss": 0.251, + "step": 39007 + }, + { + "epoch": 3.1600777705767986, + "grad_norm": 0.07090447098016739, + "learning_rate": 4.666276610108466e-05, + "loss": 0.229, + "step": 39008 + }, + { + "epoch": 3.160158781594297, + "grad_norm": 0.07172193378210068, + "learning_rate": 4.665826544849004e-05, + "loss": 0.2568, + "step": 39009 + }, + { + "epoch": 3.160239792611795, + "grad_norm": 0.07077902555465698, + "learning_rate": 4.6653764795895405e-05, + "loss": 0.2108, + "step": 39010 + }, + { + "epoch": 3.1603208036292934, + "grad_norm": 0.07674044370651245, + "learning_rate": 4.664926414330078e-05, + "loss": 0.2169, + "step": 39011 + }, + { + "epoch": 3.160401814646792, + "grad_norm": 0.06866537779569626, + "learning_rate": 4.664476349070616e-05, + "loss": 0.2165, + "step": 39012 + }, + { + "epoch": 3.1604828256642903, + "grad_norm": 0.083291195333004, + "learning_rate": 4.6640262838111526e-05, + "loss": 0.2606, + "step": 39013 + }, + { + "epoch": 3.1605638366817885, + "grad_norm": 0.07825589925050735, + "learning_rate": 4.66357621855169e-05, + "loss": 0.2251, + "step": 39014 + }, + { + "epoch": 3.160644847699287, + "grad_norm": 0.0755823627114296, + "learning_rate": 4.663126153292228e-05, + "loss": 0.2074, + "step": 39015 + }, + { + "epoch": 3.1607258587167855, + "grad_norm": 0.0745578184723854, + "learning_rate": 4.6626760880327654e-05, + "loss": 0.2551, + "step": 39016 + }, + { + "epoch": 3.1608068697342837, + "grad_norm": 0.0651843324303627, + "learning_rate": 4.662226022773302e-05, + "loss": 0.2119, + "step": 39017 + }, + { + "epoch": 3.1608878807517824, + "grad_norm": 0.08645545691251755, + "learning_rate": 4.66177595751384e-05, + "loss": 0.2352, + "step": 39018 + }, + { + "epoch": 3.1609688917692806, + "grad_norm": 0.05990879610180855, + "learning_rate": 4.6613258922543774e-05, + "loss": 0.2371, + "step": 39019 + }, + { + "epoch": 3.161049902786779, + "grad_norm": 0.0780632346868515, + "learning_rate": 4.660875826994914e-05, + "loss": 0.2135, + "step": 39020 + }, + { + "epoch": 3.1611309138042776, + "grad_norm": 0.07226870208978653, + "learning_rate": 4.660425761735452e-05, + "loss": 0.2408, + "step": 39021 + }, + { + "epoch": 3.161211924821776, + "grad_norm": 0.06137220934033394, + "learning_rate": 4.6599756964759895e-05, + "loss": 0.24, + "step": 39022 + }, + { + "epoch": 3.161292935839274, + "grad_norm": 0.0700208768248558, + "learning_rate": 4.659525631216526e-05, + "loss": 0.2379, + "step": 39023 + }, + { + "epoch": 3.1613739468567728, + "grad_norm": 0.06638213992118835, + "learning_rate": 4.659075565957064e-05, + "loss": 0.2173, + "step": 39024 + }, + { + "epoch": 3.161454957874271, + "grad_norm": 0.09570866078138351, + "learning_rate": 4.6586255006976016e-05, + "loss": 0.2294, + "step": 39025 + }, + { + "epoch": 3.1615359688917692, + "grad_norm": 0.06815316528081894, + "learning_rate": 4.658175435438138e-05, + "loss": 0.2335, + "step": 39026 + }, + { + "epoch": 3.1616169799092675, + "grad_norm": 0.07303832471370697, + "learning_rate": 4.657725370178676e-05, + "loss": 
0.2293, + "step": 39027 + }, + { + "epoch": 3.161697990926766, + "grad_norm": 0.07717616856098175, + "learning_rate": 4.657275304919214e-05, + "loss": 0.2278, + "step": 39028 + }, + { + "epoch": 3.1617790019442644, + "grad_norm": 0.06783132255077362, + "learning_rate": 4.656825239659751e-05, + "loss": 0.2411, + "step": 39029 + }, + { + "epoch": 3.1618600129617627, + "grad_norm": 0.07071271538734436, + "learning_rate": 4.6563751744002884e-05, + "loss": 0.2536, + "step": 39030 + }, + { + "epoch": 3.1619410239792614, + "grad_norm": 0.07904054969549179, + "learning_rate": 4.655925109140826e-05, + "loss": 0.2772, + "step": 39031 + }, + { + "epoch": 3.1620220349967596, + "grad_norm": 0.0646386668086052, + "learning_rate": 4.655475043881363e-05, + "loss": 0.215, + "step": 39032 + }, + { + "epoch": 3.162103046014258, + "grad_norm": 0.07207754254341125, + "learning_rate": 4.6550249786219005e-05, + "loss": 0.2527, + "step": 39033 + }, + { + "epoch": 3.162184057031756, + "grad_norm": 0.07161663472652435, + "learning_rate": 4.654574913362438e-05, + "loss": 0.2577, + "step": 39034 + }, + { + "epoch": 3.162265068049255, + "grad_norm": 0.060680631548166275, + "learning_rate": 4.654124848102975e-05, + "loss": 0.1999, + "step": 39035 + }, + { + "epoch": 3.162346079066753, + "grad_norm": 0.07173289358615875, + "learning_rate": 4.6536747828435126e-05, + "loss": 0.2046, + "step": 39036 + }, + { + "epoch": 3.1624270900842513, + "grad_norm": 0.07027477025985718, + "learning_rate": 4.65322471758405e-05, + "loss": 0.2298, + "step": 39037 + }, + { + "epoch": 3.16250810110175, + "grad_norm": 0.06637211889028549, + "learning_rate": 4.652774652324587e-05, + "loss": 0.2421, + "step": 39038 + }, + { + "epoch": 3.162589112119248, + "grad_norm": 0.06281259655952454, + "learning_rate": 4.6523245870651246e-05, + "loss": 0.1843, + "step": 39039 + }, + { + "epoch": 3.1626701231367464, + "grad_norm": 0.08118221908807755, + "learning_rate": 4.651874521805662e-05, + "loss": 0.1962, + "step": 39040 + }, + { + "epoch": 3.162751134154245, + "grad_norm": 0.07613281160593033, + "learning_rate": 4.6514244565461994e-05, + "loss": 0.2431, + "step": 39041 + }, + { + "epoch": 3.1628321451717434, + "grad_norm": 0.07212066650390625, + "learning_rate": 4.650974391286737e-05, + "loss": 0.242, + "step": 39042 + }, + { + "epoch": 3.1629131561892416, + "grad_norm": 0.06668045371770859, + "learning_rate": 4.650524326027274e-05, + "loss": 0.1964, + "step": 39043 + }, + { + "epoch": 3.1629941672067403, + "grad_norm": 0.06738068908452988, + "learning_rate": 4.6500742607678114e-05, + "loss": 0.2564, + "step": 39044 + }, + { + "epoch": 3.1630751782242386, + "grad_norm": 0.07580120116472244, + "learning_rate": 4.6496241955083495e-05, + "loss": 0.2398, + "step": 39045 + }, + { + "epoch": 3.163156189241737, + "grad_norm": 0.07344920188188553, + "learning_rate": 4.649174130248886e-05, + "loss": 0.2268, + "step": 39046 + }, + { + "epoch": 3.163237200259235, + "grad_norm": 0.06640015542507172, + "learning_rate": 4.6487240649894235e-05, + "loss": 0.242, + "step": 39047 + }, + { + "epoch": 3.1633182112767337, + "grad_norm": 0.07229582220315933, + "learning_rate": 4.6482739997299616e-05, + "loss": 0.2087, + "step": 39048 + }, + { + "epoch": 3.163399222294232, + "grad_norm": 0.07315175235271454, + "learning_rate": 4.647823934470498e-05, + "loss": 0.209, + "step": 39049 + }, + { + "epoch": 3.1634802333117302, + "grad_norm": 0.07187490165233612, + "learning_rate": 4.6473738692110356e-05, + "loss": 0.2331, + "step": 39050 + }, + { + "epoch": 3.163561244329229, + 
"grad_norm": 0.07890485227108002, + "learning_rate": 4.6469238039515736e-05, + "loss": 0.2214, + "step": 39051 + }, + { + "epoch": 3.163642255346727, + "grad_norm": 0.07000371068716049, + "learning_rate": 4.64647373869211e-05, + "loss": 0.2347, + "step": 39052 + }, + { + "epoch": 3.1637232663642254, + "grad_norm": 0.07616917043924332, + "learning_rate": 4.646023673432648e-05, + "loss": 0.2386, + "step": 39053 + }, + { + "epoch": 3.163804277381724, + "grad_norm": 0.05547773092985153, + "learning_rate": 4.645573608173186e-05, + "loss": 0.2161, + "step": 39054 + }, + { + "epoch": 3.1638852883992223, + "grad_norm": 0.06775707751512527, + "learning_rate": 4.645123542913723e-05, + "loss": 0.2033, + "step": 39055 + }, + { + "epoch": 3.1639662994167206, + "grad_norm": 0.06869322806596756, + "learning_rate": 4.64467347765426e-05, + "loss": 0.2232, + "step": 39056 + }, + { + "epoch": 3.164047310434219, + "grad_norm": 0.08482500165700912, + "learning_rate": 4.644223412394798e-05, + "loss": 0.2454, + "step": 39057 + }, + { + "epoch": 3.1641283214517175, + "grad_norm": 0.07148563861846924, + "learning_rate": 4.643773347135335e-05, + "loss": 0.2288, + "step": 39058 + }, + { + "epoch": 3.1642093324692158, + "grad_norm": 0.06907016783952713, + "learning_rate": 4.643323281875872e-05, + "loss": 0.2437, + "step": 39059 + }, + { + "epoch": 3.164290343486714, + "grad_norm": 0.07077489793300629, + "learning_rate": 4.64287321661641e-05, + "loss": 0.2361, + "step": 39060 + }, + { + "epoch": 3.1643713545042127, + "grad_norm": 0.07123490422964096, + "learning_rate": 4.642423151356947e-05, + "loss": 0.2336, + "step": 39061 + }, + { + "epoch": 3.164452365521711, + "grad_norm": 0.06324376910924911, + "learning_rate": 4.641973086097484e-05, + "loss": 0.2325, + "step": 39062 + }, + { + "epoch": 3.164533376539209, + "grad_norm": 0.054295867681503296, + "learning_rate": 4.641523020838022e-05, + "loss": 0.2188, + "step": 39063 + }, + { + "epoch": 3.164614387556708, + "grad_norm": 0.07837305217981339, + "learning_rate": 4.641072955578559e-05, + "loss": 0.2237, + "step": 39064 + }, + { + "epoch": 3.164695398574206, + "grad_norm": 0.07684259116649628, + "learning_rate": 4.640622890319096e-05, + "loss": 0.2184, + "step": 39065 + }, + { + "epoch": 3.1647764095917044, + "grad_norm": 0.07637816667556763, + "learning_rate": 4.640172825059634e-05, + "loss": 0.2392, + "step": 39066 + }, + { + "epoch": 3.164857420609203, + "grad_norm": 0.06445123255252838, + "learning_rate": 4.6397227598001714e-05, + "loss": 0.2355, + "step": 39067 + }, + { + "epoch": 3.1649384316267013, + "grad_norm": 0.07527997344732285, + "learning_rate": 4.639272694540709e-05, + "loss": 0.202, + "step": 39068 + }, + { + "epoch": 3.1650194426441995, + "grad_norm": 0.07933299988508224, + "learning_rate": 4.638822629281246e-05, + "loss": 0.2518, + "step": 39069 + }, + { + "epoch": 3.165100453661698, + "grad_norm": 0.06105326488614082, + "learning_rate": 4.6383725640217835e-05, + "loss": 0.2122, + "step": 39070 + }, + { + "epoch": 3.1651814646791965, + "grad_norm": 0.06747523695230484, + "learning_rate": 4.637922498762321e-05, + "loss": 0.2347, + "step": 39071 + }, + { + "epoch": 3.1652624756966947, + "grad_norm": 0.06693252921104431, + "learning_rate": 4.637472433502858e-05, + "loss": 0.1908, + "step": 39072 + }, + { + "epoch": 3.165343486714193, + "grad_norm": 0.07058507949113846, + "learning_rate": 4.6370223682433956e-05, + "loss": 0.2492, + "step": 39073 + }, + { + "epoch": 3.1654244977316917, + "grad_norm": 0.07795606553554535, + "learning_rate": 
4.636572302983933e-05, + "loss": 0.2556, + "step": 39074 + }, + { + "epoch": 3.16550550874919, + "grad_norm": 0.08045481890439987, + "learning_rate": 4.63612223772447e-05, + "loss": 0.2276, + "step": 39075 + }, + { + "epoch": 3.165586519766688, + "grad_norm": 0.07829701155424118, + "learning_rate": 4.6356721724650077e-05, + "loss": 0.2044, + "step": 39076 + }, + { + "epoch": 3.165667530784187, + "grad_norm": 0.08137937635183334, + "learning_rate": 4.635222107205545e-05, + "loss": 0.2658, + "step": 39077 + }, + { + "epoch": 3.165748541801685, + "grad_norm": 0.06312301009893417, + "learning_rate": 4.6347720419460824e-05, + "loss": 0.2316, + "step": 39078 + }, + { + "epoch": 3.1658295528191833, + "grad_norm": 0.07175661623477936, + "learning_rate": 4.63432197668662e-05, + "loss": 0.2326, + "step": 39079 + }, + { + "epoch": 3.1659105638366816, + "grad_norm": 0.07733714580535889, + "learning_rate": 4.633871911427157e-05, + "loss": 0.2684, + "step": 39080 + }, + { + "epoch": 3.1659915748541803, + "grad_norm": 0.07549124956130981, + "learning_rate": 4.633421846167695e-05, + "loss": 0.2343, + "step": 39081 + }, + { + "epoch": 3.1660725858716785, + "grad_norm": 0.0854051411151886, + "learning_rate": 4.632971780908232e-05, + "loss": 0.2459, + "step": 39082 + }, + { + "epoch": 3.1661535968891767, + "grad_norm": 0.0745304599404335, + "learning_rate": 4.632521715648769e-05, + "loss": 0.2441, + "step": 39083 + }, + { + "epoch": 3.1662346079066754, + "grad_norm": 0.06685039401054382, + "learning_rate": 4.632071650389307e-05, + "loss": 0.1976, + "step": 39084 + }, + { + "epoch": 3.1663156189241737, + "grad_norm": 0.0917067676782608, + "learning_rate": 4.631621585129844e-05, + "loss": 0.206, + "step": 39085 + }, + { + "epoch": 3.166396629941672, + "grad_norm": 0.06585230678319931, + "learning_rate": 4.631171519870381e-05, + "loss": 0.203, + "step": 39086 + }, + { + "epoch": 3.1664776409591706, + "grad_norm": 0.06591516733169556, + "learning_rate": 4.630721454610919e-05, + "loss": 0.2335, + "step": 39087 + }, + { + "epoch": 3.166558651976669, + "grad_norm": 0.06399863958358765, + "learning_rate": 4.630271389351456e-05, + "loss": 0.2319, + "step": 39088 + }, + { + "epoch": 3.166639662994167, + "grad_norm": 0.06659473478794098, + "learning_rate": 4.6298213240919933e-05, + "loss": 0.1824, + "step": 39089 + }, + { + "epoch": 3.166720674011666, + "grad_norm": 0.07460293173789978, + "learning_rate": 4.6293712588325314e-05, + "loss": 0.2189, + "step": 39090 + }, + { + "epoch": 3.166801685029164, + "grad_norm": 0.06288280338048935, + "learning_rate": 4.628921193573068e-05, + "loss": 0.2003, + "step": 39091 + }, + { + "epoch": 3.1668826960466623, + "grad_norm": 0.08445224165916443, + "learning_rate": 4.6284711283136054e-05, + "loss": 0.2306, + "step": 39092 + }, + { + "epoch": 3.1669637070641605, + "grad_norm": 0.09093958884477615, + "learning_rate": 4.6280210630541435e-05, + "loss": 0.2349, + "step": 39093 + }, + { + "epoch": 3.167044718081659, + "grad_norm": 0.07478706538677216, + "learning_rate": 4.62757099779468e-05, + "loss": 0.2249, + "step": 39094 + }, + { + "epoch": 3.1671257290991575, + "grad_norm": 0.0760730654001236, + "learning_rate": 4.6271209325352175e-05, + "loss": 0.1897, + "step": 39095 + }, + { + "epoch": 3.1672067401166557, + "grad_norm": 0.06487081199884415, + "learning_rate": 4.6266708672757555e-05, + "loss": 0.2018, + "step": 39096 + }, + { + "epoch": 3.1672877511341544, + "grad_norm": 0.06682021170854568, + "learning_rate": 4.626220802016293e-05, + "loss": 0.2032, + "step": 39097 + }, + { + 
"epoch": 3.1673687621516526, + "grad_norm": 0.06956232339143753, + "learning_rate": 4.6257707367568296e-05, + "loss": 0.1847, + "step": 39098 + }, + { + "epoch": 3.167449773169151, + "grad_norm": 0.06547503918409348, + "learning_rate": 4.6253206714973676e-05, + "loss": 0.213, + "step": 39099 + }, + { + "epoch": 3.1675307841866496, + "grad_norm": 0.07335997372865677, + "learning_rate": 4.624870606237905e-05, + "loss": 0.2263, + "step": 39100 + }, + { + "epoch": 3.167611795204148, + "grad_norm": 0.09277735650539398, + "learning_rate": 4.624420540978442e-05, + "loss": 0.2525, + "step": 39101 + }, + { + "epoch": 3.167692806221646, + "grad_norm": 0.06516660749912262, + "learning_rate": 4.62397047571898e-05, + "loss": 0.214, + "step": 39102 + }, + { + "epoch": 3.1677738172391443, + "grad_norm": 0.0839308649301529, + "learning_rate": 4.623520410459517e-05, + "loss": 0.2066, + "step": 39103 + }, + { + "epoch": 3.167854828256643, + "grad_norm": 0.07529613375663757, + "learning_rate": 4.623070345200054e-05, + "loss": 0.2172, + "step": 39104 + }, + { + "epoch": 3.1679358392741412, + "grad_norm": 0.10337688773870468, + "learning_rate": 4.622620279940592e-05, + "loss": 0.2361, + "step": 39105 + }, + { + "epoch": 3.1680168502916395, + "grad_norm": 0.062267303466796875, + "learning_rate": 4.622170214681129e-05, + "loss": 0.2216, + "step": 39106 + }, + { + "epoch": 3.168097861309138, + "grad_norm": 0.06088486313819885, + "learning_rate": 4.621720149421666e-05, + "loss": 0.2134, + "step": 39107 + }, + { + "epoch": 3.1681788723266364, + "grad_norm": 0.06333941221237183, + "learning_rate": 4.621270084162204e-05, + "loss": 0.2489, + "step": 39108 + }, + { + "epoch": 3.1682598833441347, + "grad_norm": 0.07425013929605484, + "learning_rate": 4.620820018902741e-05, + "loss": 0.2517, + "step": 39109 + }, + { + "epoch": 3.1683408943616334, + "grad_norm": 0.0673804059624672, + "learning_rate": 4.6203699536432786e-05, + "loss": 0.211, + "step": 39110 + }, + { + "epoch": 3.1684219053791316, + "grad_norm": 0.07636480033397675, + "learning_rate": 4.619919888383816e-05, + "loss": 0.233, + "step": 39111 + }, + { + "epoch": 3.16850291639663, + "grad_norm": 0.05976966768503189, + "learning_rate": 4.619469823124353e-05, + "loss": 0.2123, + "step": 39112 + }, + { + "epoch": 3.1685839274141285, + "grad_norm": 0.06142381206154823, + "learning_rate": 4.619019757864891e-05, + "loss": 0.2222, + "step": 39113 + }, + { + "epoch": 3.1686649384316268, + "grad_norm": 0.06508833914995193, + "learning_rate": 4.618569692605428e-05, + "loss": 0.2034, + "step": 39114 + }, + { + "epoch": 3.168745949449125, + "grad_norm": 0.08630005270242691, + "learning_rate": 4.6181196273459654e-05, + "loss": 0.2569, + "step": 39115 + }, + { + "epoch": 3.1688269604666233, + "grad_norm": 0.06716077029705048, + "learning_rate": 4.617669562086503e-05, + "loss": 0.231, + "step": 39116 + }, + { + "epoch": 3.168907971484122, + "grad_norm": 0.07154113054275513, + "learning_rate": 4.61721949682704e-05, + "loss": 0.2275, + "step": 39117 + }, + { + "epoch": 3.16898898250162, + "grad_norm": 0.08633829653263092, + "learning_rate": 4.6167694315675775e-05, + "loss": 0.2098, + "step": 39118 + }, + { + "epoch": 3.1690699935191184, + "grad_norm": 0.08164115250110626, + "learning_rate": 4.616319366308115e-05, + "loss": 0.2514, + "step": 39119 + }, + { + "epoch": 3.169151004536617, + "grad_norm": 0.05900833383202553, + "learning_rate": 4.615869301048652e-05, + "loss": 0.2175, + "step": 39120 + }, + { + "epoch": 3.1692320155541154, + "grad_norm": 0.058158889412879944, + 
"learning_rate": 4.6154192357891896e-05, + "loss": 0.2179, + "step": 39121 + }, + { + "epoch": 3.1693130265716136, + "grad_norm": 0.06698452681303024, + "learning_rate": 4.614969170529727e-05, + "loss": 0.1879, + "step": 39122 + }, + { + "epoch": 3.1693940375891123, + "grad_norm": 0.06863744556903839, + "learning_rate": 4.614519105270265e-05, + "loss": 0.2186, + "step": 39123 + }, + { + "epoch": 3.1694750486066106, + "grad_norm": 0.07735446095466614, + "learning_rate": 4.6140690400108016e-05, + "loss": 0.2531, + "step": 39124 + }, + { + "epoch": 3.169556059624109, + "grad_norm": 0.07899104058742523, + "learning_rate": 4.613618974751339e-05, + "loss": 0.2555, + "step": 39125 + }, + { + "epoch": 3.169637070641607, + "grad_norm": 0.06854166835546494, + "learning_rate": 4.613168909491877e-05, + "loss": 0.2155, + "step": 39126 + }, + { + "epoch": 3.1697180816591057, + "grad_norm": 0.06978372484445572, + "learning_rate": 4.612718844232414e-05, + "loss": 0.2527, + "step": 39127 + }, + { + "epoch": 3.169799092676604, + "grad_norm": 0.10613539814949036, + "learning_rate": 4.612268778972951e-05, + "loss": 0.2682, + "step": 39128 + }, + { + "epoch": 3.1698801036941022, + "grad_norm": 0.07455401122570038, + "learning_rate": 4.611818713713489e-05, + "loss": 0.2487, + "step": 39129 + }, + { + "epoch": 3.169961114711601, + "grad_norm": 0.07414111495018005, + "learning_rate": 4.611368648454026e-05, + "loss": 0.2277, + "step": 39130 + }, + { + "epoch": 3.170042125729099, + "grad_norm": 0.07571996003389359, + "learning_rate": 4.610918583194563e-05, + "loss": 0.2065, + "step": 39131 + }, + { + "epoch": 3.1701231367465974, + "grad_norm": 0.07938110083341599, + "learning_rate": 4.610468517935101e-05, + "loss": 0.2473, + "step": 39132 + }, + { + "epoch": 3.170204147764096, + "grad_norm": 0.0692886933684349, + "learning_rate": 4.610018452675638e-05, + "loss": 0.2334, + "step": 39133 + }, + { + "epoch": 3.1702851587815943, + "grad_norm": 0.08501394838094711, + "learning_rate": 4.609568387416175e-05, + "loss": 0.2151, + "step": 39134 + }, + { + "epoch": 3.1703661697990926, + "grad_norm": 0.0652298629283905, + "learning_rate": 4.609118322156713e-05, + "loss": 0.2247, + "step": 39135 + }, + { + "epoch": 3.1704471808165913, + "grad_norm": 0.07804706692695618, + "learning_rate": 4.6086682568972506e-05, + "loss": 0.2151, + "step": 39136 + }, + { + "epoch": 3.1705281918340895, + "grad_norm": 0.06097443401813507, + "learning_rate": 4.608218191637787e-05, + "loss": 0.1911, + "step": 39137 + }, + { + "epoch": 3.1706092028515878, + "grad_norm": 0.061475615948438644, + "learning_rate": 4.6077681263783254e-05, + "loss": 0.2282, + "step": 39138 + }, + { + "epoch": 3.170690213869086, + "grad_norm": 0.0577525831758976, + "learning_rate": 4.607318061118863e-05, + "loss": 0.2028, + "step": 39139 + }, + { + "epoch": 3.1707712248865847, + "grad_norm": 0.0783226266503334, + "learning_rate": 4.6068679958593994e-05, + "loss": 0.2246, + "step": 39140 + }, + { + "epoch": 3.170852235904083, + "grad_norm": 0.06661267578601837, + "learning_rate": 4.6064179305999374e-05, + "loss": 0.2263, + "step": 39141 + }, + { + "epoch": 3.170933246921581, + "grad_norm": 0.06043732538819313, + "learning_rate": 4.605967865340475e-05, + "loss": 0.2081, + "step": 39142 + }, + { + "epoch": 3.17101425793908, + "grad_norm": 0.06632187962532043, + "learning_rate": 4.6055178000810115e-05, + "loss": 0.2239, + "step": 39143 + }, + { + "epoch": 3.171095268956578, + "grad_norm": 0.0705820843577385, + "learning_rate": 4.6050677348215495e-05, + "loss": 0.2187, + 
"step": 39144 + }, + { + "epoch": 3.1711762799740764, + "grad_norm": 0.08212973922491074, + "learning_rate": 4.604617669562087e-05, + "loss": 0.2683, + "step": 39145 + }, + { + "epoch": 3.171257290991575, + "grad_norm": 0.08695808798074722, + "learning_rate": 4.6041676043026236e-05, + "loss": 0.2334, + "step": 39146 + }, + { + "epoch": 3.1713383020090733, + "grad_norm": 0.07749685645103455, + "learning_rate": 4.6037175390431616e-05, + "loss": 0.2731, + "step": 39147 + }, + { + "epoch": 3.1714193130265715, + "grad_norm": 0.07648932933807373, + "learning_rate": 4.603267473783699e-05, + "loss": 0.2401, + "step": 39148 + }, + { + "epoch": 3.17150032404407, + "grad_norm": 0.07884996384382248, + "learning_rate": 4.602817408524236e-05, + "loss": 0.2483, + "step": 39149 + }, + { + "epoch": 3.1715813350615685, + "grad_norm": 0.05509842559695244, + "learning_rate": 4.602367343264774e-05, + "loss": 0.2136, + "step": 39150 + }, + { + "epoch": 3.1716623460790667, + "grad_norm": 0.06663339585065842, + "learning_rate": 4.601917278005311e-05, + "loss": 0.2218, + "step": 39151 + }, + { + "epoch": 3.171743357096565, + "grad_norm": 0.06804881989955902, + "learning_rate": 4.6014672127458484e-05, + "loss": 0.2708, + "step": 39152 + }, + { + "epoch": 3.1718243681140637, + "grad_norm": 0.0790795087814331, + "learning_rate": 4.601017147486386e-05, + "loss": 0.2421, + "step": 39153 + }, + { + "epoch": 3.171905379131562, + "grad_norm": 0.0650632381439209, + "learning_rate": 4.600567082226923e-05, + "loss": 0.2207, + "step": 39154 + }, + { + "epoch": 3.17198639014906, + "grad_norm": 0.06869402527809143, + "learning_rate": 4.6001170169674605e-05, + "loss": 0.2087, + "step": 39155 + }, + { + "epoch": 3.172067401166559, + "grad_norm": 0.0867585763335228, + "learning_rate": 4.599666951707998e-05, + "loss": 0.2352, + "step": 39156 + }, + { + "epoch": 3.172148412184057, + "grad_norm": 0.06907272338867188, + "learning_rate": 4.599216886448535e-05, + "loss": 0.2191, + "step": 39157 + }, + { + "epoch": 3.1722294232015553, + "grad_norm": 0.08108095824718475, + "learning_rate": 4.5987668211890726e-05, + "loss": 0.3043, + "step": 39158 + }, + { + "epoch": 3.172310434219054, + "grad_norm": 0.0706065222620964, + "learning_rate": 4.59831675592961e-05, + "loss": 0.2135, + "step": 39159 + }, + { + "epoch": 3.1723914452365523, + "grad_norm": 0.07251002639532089, + "learning_rate": 4.597866690670147e-05, + "loss": 0.2425, + "step": 39160 + }, + { + "epoch": 3.1724724562540505, + "grad_norm": 0.069508396089077, + "learning_rate": 4.5974166254106846e-05, + "loss": 0.2001, + "step": 39161 + }, + { + "epoch": 3.1725534672715487, + "grad_norm": 0.0626758560538292, + "learning_rate": 4.596966560151223e-05, + "loss": 0.2097, + "step": 39162 + }, + { + "epoch": 3.1726344782890474, + "grad_norm": 0.07252325862646103, + "learning_rate": 4.5965164948917594e-05, + "loss": 0.2382, + "step": 39163 + }, + { + "epoch": 3.1727154893065457, + "grad_norm": 0.07185426354408264, + "learning_rate": 4.596066429632297e-05, + "loss": 0.2372, + "step": 39164 + }, + { + "epoch": 3.172796500324044, + "grad_norm": 0.06854097545146942, + "learning_rate": 4.595616364372835e-05, + "loss": 0.2276, + "step": 39165 + }, + { + "epoch": 3.1728775113415426, + "grad_norm": 0.08109313994646072, + "learning_rate": 4.5951662991133714e-05, + "loss": 0.2345, + "step": 39166 + }, + { + "epoch": 3.172958522359041, + "grad_norm": 0.07841244339942932, + "learning_rate": 4.594716233853909e-05, + "loss": 0.2221, + "step": 39167 + }, + { + "epoch": 3.173039533376539, + "grad_norm": 
0.06506379693746567, + "learning_rate": 4.594266168594447e-05, + "loss": 0.2404, + "step": 39168 + }, + { + "epoch": 3.173120544394038, + "grad_norm": 0.07025019824504852, + "learning_rate": 4.5938161033349835e-05, + "loss": 0.2172, + "step": 39169 + }, + { + "epoch": 3.173201555411536, + "grad_norm": 0.06990345567464828, + "learning_rate": 4.593366038075521e-05, + "loss": 0.2176, + "step": 39170 + }, + { + "epoch": 3.1732825664290343, + "grad_norm": 0.06490842252969742, + "learning_rate": 4.592915972816059e-05, + "loss": 0.2048, + "step": 39171 + }, + { + "epoch": 3.1733635774465325, + "grad_norm": 0.08713230490684509, + "learning_rate": 4.5924659075565956e-05, + "loss": 0.2451, + "step": 39172 + }, + { + "epoch": 3.173444588464031, + "grad_norm": 0.07410930097103119, + "learning_rate": 4.592015842297133e-05, + "loss": 0.2271, + "step": 39173 + }, + { + "epoch": 3.1735255994815295, + "grad_norm": 0.06233803927898407, + "learning_rate": 4.591565777037671e-05, + "loss": 0.2077, + "step": 39174 + }, + { + "epoch": 3.1736066104990277, + "grad_norm": 0.0665794089436531, + "learning_rate": 4.5911157117782084e-05, + "loss": 0.2672, + "step": 39175 + }, + { + "epoch": 3.1736876215165264, + "grad_norm": 0.06904125958681107, + "learning_rate": 4.590665646518745e-05, + "loss": 0.2193, + "step": 39176 + }, + { + "epoch": 3.1737686325340246, + "grad_norm": 0.0749911218881607, + "learning_rate": 4.590215581259283e-05, + "loss": 0.2238, + "step": 39177 + }, + { + "epoch": 3.173849643551523, + "grad_norm": 0.07443109154701233, + "learning_rate": 4.5897655159998204e-05, + "loss": 0.2185, + "step": 39178 + }, + { + "epoch": 3.1739306545690216, + "grad_norm": 0.06918110698461533, + "learning_rate": 4.589315450740357e-05, + "loss": 0.2438, + "step": 39179 + }, + { + "epoch": 3.17401166558652, + "grad_norm": 0.059091437608003616, + "learning_rate": 4.588865385480895e-05, + "loss": 0.2122, + "step": 39180 + }, + { + "epoch": 3.174092676604018, + "grad_norm": 0.08509764820337296, + "learning_rate": 4.5884153202214325e-05, + "loss": 0.2517, + "step": 39181 + }, + { + "epoch": 3.1741736876215167, + "grad_norm": 0.07486993074417114, + "learning_rate": 4.587965254961969e-05, + "loss": 0.2424, + "step": 39182 + }, + { + "epoch": 3.174254698639015, + "grad_norm": 0.1036665216088295, + "learning_rate": 4.587515189702507e-05, + "loss": 0.2352, + "step": 39183 + }, + { + "epoch": 3.1743357096565132, + "grad_norm": 0.07043270021677017, + "learning_rate": 4.5870651244430446e-05, + "loss": 0.2577, + "step": 39184 + }, + { + "epoch": 3.1744167206740115, + "grad_norm": 0.07139958441257477, + "learning_rate": 4.586615059183581e-05, + "loss": 0.2191, + "step": 39185 + }, + { + "epoch": 3.17449773169151, + "grad_norm": 0.0614413358271122, + "learning_rate": 4.586164993924119e-05, + "loss": 0.202, + "step": 39186 + }, + { + "epoch": 3.1745787427090084, + "grad_norm": 0.0673774853348732, + "learning_rate": 4.585714928664657e-05, + "loss": 0.2389, + "step": 39187 + }, + { + "epoch": 3.1746597537265067, + "grad_norm": 0.07635167986154556, + "learning_rate": 4.585264863405194e-05, + "loss": 0.2149, + "step": 39188 + }, + { + "epoch": 3.1747407647440054, + "grad_norm": 0.07869356125593185, + "learning_rate": 4.5848147981457314e-05, + "loss": 0.2248, + "step": 39189 + }, + { + "epoch": 3.1748217757615036, + "grad_norm": 0.06915463507175446, + "learning_rate": 4.584364732886269e-05, + "loss": 0.2303, + "step": 39190 + }, + { + "epoch": 3.174902786779002, + "grad_norm": 0.0811949148774147, + "learning_rate": 4.583914667626806e-05, + 
"loss": 0.2208, + "step": 39191 + }, + { + "epoch": 3.1749837977965005, + "grad_norm": 0.06380307674407959, + "learning_rate": 4.5834646023673435e-05, + "loss": 0.1997, + "step": 39192 + }, + { + "epoch": 3.1750648088139988, + "grad_norm": 0.08466849476099014, + "learning_rate": 4.583014537107881e-05, + "loss": 0.2482, + "step": 39193 + }, + { + "epoch": 3.175145819831497, + "grad_norm": 0.11230959743261337, + "learning_rate": 4.582564471848418e-05, + "loss": 0.2753, + "step": 39194 + }, + { + "epoch": 3.1752268308489953, + "grad_norm": 0.062294308096170425, + "learning_rate": 4.5821144065889556e-05, + "loss": 0.19, + "step": 39195 + }, + { + "epoch": 3.175307841866494, + "grad_norm": 0.0825783833861351, + "learning_rate": 4.581664341329493e-05, + "loss": 0.268, + "step": 39196 + }, + { + "epoch": 3.175388852883992, + "grad_norm": 0.07035788148641586, + "learning_rate": 4.58121427607003e-05, + "loss": 0.27, + "step": 39197 + }, + { + "epoch": 3.1754698639014904, + "grad_norm": 0.0746840238571167, + "learning_rate": 4.5807642108105677e-05, + "loss": 0.2396, + "step": 39198 + }, + { + "epoch": 3.175550874918989, + "grad_norm": 0.06415130943059921, + "learning_rate": 4.580314145551105e-05, + "loss": 0.2372, + "step": 39199 + }, + { + "epoch": 3.1756318859364874, + "grad_norm": 0.07957671582698822, + "learning_rate": 4.5798640802916424e-05, + "loss": 0.2186, + "step": 39200 + }, + { + "epoch": 3.1757128969539856, + "grad_norm": 0.06585413962602615, + "learning_rate": 4.5794140150321804e-05, + "loss": 0.2329, + "step": 39201 + }, + { + "epoch": 3.1757939079714843, + "grad_norm": 0.06584955751895905, + "learning_rate": 4.578963949772717e-05, + "loss": 0.1931, + "step": 39202 + }, + { + "epoch": 3.1758749189889826, + "grad_norm": 0.06202779710292816, + "learning_rate": 4.5785138845132545e-05, + "loss": 0.2059, + "step": 39203 + }, + { + "epoch": 3.175955930006481, + "grad_norm": 0.08203383535146713, + "learning_rate": 4.5780638192537925e-05, + "loss": 0.2704, + "step": 39204 + }, + { + "epoch": 3.1760369410239795, + "grad_norm": 0.09432760626077652, + "learning_rate": 4.577613753994329e-05, + "loss": 0.2181, + "step": 39205 + }, + { + "epoch": 3.1761179520414777, + "grad_norm": 0.06736686825752258, + "learning_rate": 4.5771636887348665e-05, + "loss": 0.2272, + "step": 39206 + }, + { + "epoch": 3.176198963058976, + "grad_norm": 0.06527657806873322, + "learning_rate": 4.5767136234754046e-05, + "loss": 0.1973, + "step": 39207 + }, + { + "epoch": 3.176279974076474, + "grad_norm": 0.07748841494321823, + "learning_rate": 4.576263558215941e-05, + "loss": 0.2703, + "step": 39208 + }, + { + "epoch": 3.176360985093973, + "grad_norm": 0.06551697105169296, + "learning_rate": 4.5758134929564786e-05, + "loss": 0.2256, + "step": 39209 + }, + { + "epoch": 3.176441996111471, + "grad_norm": 0.07224316895008087, + "learning_rate": 4.5753634276970167e-05, + "loss": 0.2528, + "step": 39210 + }, + { + "epoch": 3.1765230071289694, + "grad_norm": 0.07008182257413864, + "learning_rate": 4.574913362437553e-05, + "loss": 0.2159, + "step": 39211 + }, + { + "epoch": 3.176604018146468, + "grad_norm": 0.09387296438217163, + "learning_rate": 4.574463297178091e-05, + "loss": 0.2536, + "step": 39212 + }, + { + "epoch": 3.1766850291639663, + "grad_norm": 0.06003788858652115, + "learning_rate": 4.574013231918629e-05, + "loss": 0.2046, + "step": 39213 + }, + { + "epoch": 3.1767660401814646, + "grad_norm": 0.06780782341957092, + "learning_rate": 4.573563166659166e-05, + "loss": 0.2366, + "step": 39214 + }, + { + "epoch": 
3.176847051198963, + "grad_norm": 0.0731881707906723, + "learning_rate": 4.573113101399703e-05, + "loss": 0.2069, + "step": 39215 + }, + { + "epoch": 3.1769280622164615, + "grad_norm": 0.07370991259813309, + "learning_rate": 4.572663036140241e-05, + "loss": 0.1897, + "step": 39216 + }, + { + "epoch": 3.1770090732339598, + "grad_norm": 0.08127991855144501, + "learning_rate": 4.572212970880778e-05, + "loss": 0.2508, + "step": 39217 + }, + { + "epoch": 3.177090084251458, + "grad_norm": 0.07813630998134613, + "learning_rate": 4.571762905621315e-05, + "loss": 0.2182, + "step": 39218 + }, + { + "epoch": 3.1771710952689567, + "grad_norm": 0.07162605226039886, + "learning_rate": 4.571312840361853e-05, + "loss": 0.2367, + "step": 39219 + }, + { + "epoch": 3.177252106286455, + "grad_norm": 0.06499221175909042, + "learning_rate": 4.57086277510239e-05, + "loss": 0.2255, + "step": 39220 + }, + { + "epoch": 3.177333117303953, + "grad_norm": 0.0795610323548317, + "learning_rate": 4.570412709842927e-05, + "loss": 0.2342, + "step": 39221 + }, + { + "epoch": 3.177414128321452, + "grad_norm": 0.06543579697608948, + "learning_rate": 4.569962644583465e-05, + "loss": 0.2007, + "step": 39222 + }, + { + "epoch": 3.17749513933895, + "grad_norm": 0.06847775727510452, + "learning_rate": 4.569512579324002e-05, + "loss": 0.2321, + "step": 39223 + }, + { + "epoch": 3.1775761503564484, + "grad_norm": 0.08107810467481613, + "learning_rate": 4.569062514064539e-05, + "loss": 0.2443, + "step": 39224 + }, + { + "epoch": 3.177657161373947, + "grad_norm": 0.06877782195806503, + "learning_rate": 4.568612448805077e-05, + "loss": 0.2604, + "step": 39225 + }, + { + "epoch": 3.1777381723914453, + "grad_norm": 0.06588443368673325, + "learning_rate": 4.5681623835456144e-05, + "loss": 0.2494, + "step": 39226 + }, + { + "epoch": 3.1778191834089435, + "grad_norm": 0.07563662528991699, + "learning_rate": 4.567712318286152e-05, + "loss": 0.2448, + "step": 39227 + }, + { + "epoch": 3.1779001944264422, + "grad_norm": 0.0697861835360527, + "learning_rate": 4.567262253026689e-05, + "loss": 0.2562, + "step": 39228 + }, + { + "epoch": 3.1779812054439405, + "grad_norm": 0.07472296804189682, + "learning_rate": 4.5668121877672265e-05, + "loss": 0.2343, + "step": 39229 + }, + { + "epoch": 3.1780622164614387, + "grad_norm": 0.058154940605163574, + "learning_rate": 4.566362122507764e-05, + "loss": 0.1825, + "step": 39230 + }, + { + "epoch": 3.178143227478937, + "grad_norm": 0.06901900470256805, + "learning_rate": 4.565912057248301e-05, + "loss": 0.2349, + "step": 39231 + }, + { + "epoch": 3.1782242384964356, + "grad_norm": 0.06484868377447128, + "learning_rate": 4.5654619919888386e-05, + "loss": 0.2034, + "step": 39232 + }, + { + "epoch": 3.178305249513934, + "grad_norm": 0.06554258614778519, + "learning_rate": 4.565011926729376e-05, + "loss": 0.224, + "step": 39233 + }, + { + "epoch": 3.178386260531432, + "grad_norm": 0.06775832921266556, + "learning_rate": 4.564561861469913e-05, + "loss": 0.2217, + "step": 39234 + }, + { + "epoch": 3.178467271548931, + "grad_norm": 0.07073518633842468, + "learning_rate": 4.564111796210451e-05, + "loss": 0.2143, + "step": 39235 + }, + { + "epoch": 3.178548282566429, + "grad_norm": 0.07139239460229874, + "learning_rate": 4.563661730950988e-05, + "loss": 0.2638, + "step": 39236 + }, + { + "epoch": 3.1786292935839273, + "grad_norm": 0.07724419981241226, + "learning_rate": 4.5632116656915254e-05, + "loss": 0.2209, + "step": 39237 + }, + { + "epoch": 3.1787103046014256, + "grad_norm": 0.06226162612438202, + 
"learning_rate": 4.562761600432063e-05, + "loss": 0.2253, + "step": 39238 + }, + { + "epoch": 3.1787913156189243, + "grad_norm": 0.06873737275600433, + "learning_rate": 4.5623115351726e-05, + "loss": 0.2261, + "step": 39239 + }, + { + "epoch": 3.1788723266364225, + "grad_norm": 0.07333312928676605, + "learning_rate": 4.561861469913138e-05, + "loss": 0.2453, + "step": 39240 + }, + { + "epoch": 3.1789533376539207, + "grad_norm": 0.07935798913240433, + "learning_rate": 4.561411404653675e-05, + "loss": 0.2215, + "step": 39241 + }, + { + "epoch": 3.1790343486714194, + "grad_norm": 0.07684028148651123, + "learning_rate": 4.560961339394212e-05, + "loss": 0.22, + "step": 39242 + }, + { + "epoch": 3.1791153596889177, + "grad_norm": 0.08199668675661087, + "learning_rate": 4.56051127413475e-05, + "loss": 0.2536, + "step": 39243 + }, + { + "epoch": 3.179196370706416, + "grad_norm": 0.09142853319644928, + "learning_rate": 4.560061208875287e-05, + "loss": 0.2448, + "step": 39244 + }, + { + "epoch": 3.1792773817239146, + "grad_norm": 0.06457065045833588, + "learning_rate": 4.559611143615824e-05, + "loss": 0.2233, + "step": 39245 + }, + { + "epoch": 3.179358392741413, + "grad_norm": 0.09834888577461243, + "learning_rate": 4.559161078356362e-05, + "loss": 0.2493, + "step": 39246 + }, + { + "epoch": 3.179439403758911, + "grad_norm": 0.07715655118227005, + "learning_rate": 4.558711013096899e-05, + "loss": 0.2241, + "step": 39247 + }, + { + "epoch": 3.17952041477641, + "grad_norm": 0.08098747581243515, + "learning_rate": 4.5582609478374363e-05, + "loss": 0.2432, + "step": 39248 + }, + { + "epoch": 3.179601425793908, + "grad_norm": 0.0630323514342308, + "learning_rate": 4.5578108825779744e-05, + "loss": 0.2158, + "step": 39249 + }, + { + "epoch": 3.1796824368114063, + "grad_norm": 0.06567676365375519, + "learning_rate": 4.557360817318511e-05, + "loss": 0.2456, + "step": 39250 + }, + { + "epoch": 3.179763447828905, + "grad_norm": 0.06880970299243927, + "learning_rate": 4.5569107520590484e-05, + "loss": 0.2697, + "step": 39251 + }, + { + "epoch": 3.179844458846403, + "grad_norm": 0.07647331804037094, + "learning_rate": 4.5564606867995865e-05, + "loss": 0.2156, + "step": 39252 + }, + { + "epoch": 3.1799254698639015, + "grad_norm": 0.061718251556158066, + "learning_rate": 4.556010621540124e-05, + "loss": 0.214, + "step": 39253 + }, + { + "epoch": 3.1800064808813997, + "grad_norm": 0.06768068671226501, + "learning_rate": 4.5555605562806605e-05, + "loss": 0.1859, + "step": 39254 + }, + { + "epoch": 3.1800874918988984, + "grad_norm": 0.0734715387225151, + "learning_rate": 4.5551104910211985e-05, + "loss": 0.1913, + "step": 39255 + }, + { + "epoch": 3.1801685029163966, + "grad_norm": 0.0689500942826271, + "learning_rate": 4.554660425761736e-05, + "loss": 0.2301, + "step": 39256 + }, + { + "epoch": 3.180249513933895, + "grad_norm": 0.05353236570954323, + "learning_rate": 4.5542103605022726e-05, + "loss": 0.2004, + "step": 39257 + }, + { + "epoch": 3.1803305249513936, + "grad_norm": 0.05947539582848549, + "learning_rate": 4.5537602952428106e-05, + "loss": 0.2278, + "step": 39258 + }, + { + "epoch": 3.180411535968892, + "grad_norm": 0.06305386871099472, + "learning_rate": 4.553310229983348e-05, + "loss": 0.2257, + "step": 39259 + }, + { + "epoch": 3.18049254698639, + "grad_norm": 0.07101128250360489, + "learning_rate": 4.552860164723885e-05, + "loss": 0.2451, + "step": 39260 + }, + { + "epoch": 3.1805735580038883, + "grad_norm": 0.06881023198366165, + "learning_rate": 4.552410099464423e-05, + "loss": 0.2267, + "step": 
39261 + }, + { + "epoch": 3.180654569021387, + "grad_norm": 0.06827352941036224, + "learning_rate": 4.55196003420496e-05, + "loss": 0.2168, + "step": 39262 + }, + { + "epoch": 3.1807355800388852, + "grad_norm": 0.07661303132772446, + "learning_rate": 4.551509968945497e-05, + "loss": 0.221, + "step": 39263 + }, + { + "epoch": 3.1808165910563835, + "grad_norm": 0.05772950127720833, + "learning_rate": 4.551059903686035e-05, + "loss": 0.2088, + "step": 39264 + }, + { + "epoch": 3.180897602073882, + "grad_norm": 0.06366662681102753, + "learning_rate": 4.550609838426572e-05, + "loss": 0.2454, + "step": 39265 + }, + { + "epoch": 3.1809786130913804, + "grad_norm": 0.07420886307954788, + "learning_rate": 4.5501597731671095e-05, + "loss": 0.2599, + "step": 39266 + }, + { + "epoch": 3.1810596241088787, + "grad_norm": 0.0739051029086113, + "learning_rate": 4.549709707907647e-05, + "loss": 0.2616, + "step": 39267 + }, + { + "epoch": 3.1811406351263773, + "grad_norm": 0.08363629132509232, + "learning_rate": 4.549259642648184e-05, + "loss": 0.2161, + "step": 39268 + }, + { + "epoch": 3.1812216461438756, + "grad_norm": 0.07953966408967972, + "learning_rate": 4.5488095773887216e-05, + "loss": 0.2474, + "step": 39269 + }, + { + "epoch": 3.181302657161374, + "grad_norm": 0.07963499426841736, + "learning_rate": 4.548359512129259e-05, + "loss": 0.2252, + "step": 39270 + }, + { + "epoch": 3.1813836681788725, + "grad_norm": 0.06578200310468674, + "learning_rate": 4.547909446869796e-05, + "loss": 0.2329, + "step": 39271 + }, + { + "epoch": 3.1814646791963708, + "grad_norm": 0.061785999685525894, + "learning_rate": 4.547459381610334e-05, + "loss": 0.2612, + "step": 39272 + }, + { + "epoch": 3.181545690213869, + "grad_norm": 0.06063782051205635, + "learning_rate": 4.547009316350871e-05, + "loss": 0.2405, + "step": 39273 + }, + { + "epoch": 3.1816267012313673, + "grad_norm": 0.07053548097610474, + "learning_rate": 4.5465592510914084e-05, + "loss": 0.1988, + "step": 39274 + }, + { + "epoch": 3.181707712248866, + "grad_norm": 0.07843134552240372, + "learning_rate": 4.546109185831946e-05, + "loss": 0.2185, + "step": 39275 + }, + { + "epoch": 3.181788723266364, + "grad_norm": 0.05935240536928177, + "learning_rate": 4.545659120572483e-05, + "loss": 0.2044, + "step": 39276 + }, + { + "epoch": 3.1818697342838624, + "grad_norm": 0.06905554234981537, + "learning_rate": 4.5452090553130205e-05, + "loss": 0.1804, + "step": 39277 + }, + { + "epoch": 3.181950745301361, + "grad_norm": 0.06303178519010544, + "learning_rate": 4.544758990053558e-05, + "loss": 0.2416, + "step": 39278 + }, + { + "epoch": 3.1820317563188594, + "grad_norm": 0.05552779883146286, + "learning_rate": 4.544308924794095e-05, + "loss": 0.1982, + "step": 39279 + }, + { + "epoch": 3.1821127673363576, + "grad_norm": 0.09019738435745239, + "learning_rate": 4.5438588595346326e-05, + "loss": 0.2264, + "step": 39280 + }, + { + "epoch": 3.1821937783538563, + "grad_norm": 0.06747749447822571, + "learning_rate": 4.54340879427517e-05, + "loss": 0.1937, + "step": 39281 + }, + { + "epoch": 3.1822747893713546, + "grad_norm": 0.07805449515581131, + "learning_rate": 4.542958729015708e-05, + "loss": 0.1897, + "step": 39282 + }, + { + "epoch": 3.182355800388853, + "grad_norm": 0.060038719326257706, + "learning_rate": 4.5425086637562446e-05, + "loss": 0.2494, + "step": 39283 + }, + { + "epoch": 3.182436811406351, + "grad_norm": 0.06551884859800339, + "learning_rate": 4.542058598496782e-05, + "loss": 0.2262, + "step": 39284 + }, + { + "epoch": 3.1825178224238497, + "grad_norm": 
0.06679324060678482, + "learning_rate": 4.54160853323732e-05, + "loss": 0.236, + "step": 39285 + }, + { + "epoch": 3.182598833441348, + "grad_norm": 0.06539665907621384, + "learning_rate": 4.541158467977857e-05, + "loss": 0.23, + "step": 39286 + }, + { + "epoch": 3.182679844458846, + "grad_norm": 0.06999313086271286, + "learning_rate": 4.540708402718394e-05, + "loss": 0.2264, + "step": 39287 + }, + { + "epoch": 3.182760855476345, + "grad_norm": 0.0747220367193222, + "learning_rate": 4.540258337458932e-05, + "loss": 0.2024, + "step": 39288 + }, + { + "epoch": 3.182841866493843, + "grad_norm": 0.06839533895254135, + "learning_rate": 4.539808272199469e-05, + "loss": 0.2165, + "step": 39289 + }, + { + "epoch": 3.1829228775113414, + "grad_norm": 0.054501939564943314, + "learning_rate": 4.539358206940006e-05, + "loss": 0.2009, + "step": 39290 + }, + { + "epoch": 3.18300388852884, + "grad_norm": 0.06777974963188171, + "learning_rate": 4.538908141680544e-05, + "loss": 0.2538, + "step": 39291 + }, + { + "epoch": 3.1830848995463383, + "grad_norm": 0.06993214040994644, + "learning_rate": 4.538458076421081e-05, + "loss": 0.229, + "step": 39292 + }, + { + "epoch": 3.1831659105638366, + "grad_norm": 0.07302381843328476, + "learning_rate": 4.538008011161618e-05, + "loss": 0.2465, + "step": 39293 + }, + { + "epoch": 3.1832469215813353, + "grad_norm": 0.06529241055250168, + "learning_rate": 4.537557945902156e-05, + "loss": 0.2243, + "step": 39294 + }, + { + "epoch": 3.1833279325988335, + "grad_norm": 0.07419822365045547, + "learning_rate": 4.5371078806426936e-05, + "loss": 0.2513, + "step": 39295 + }, + { + "epoch": 3.1834089436163318, + "grad_norm": 0.06903527677059174, + "learning_rate": 4.53665781538323e-05, + "loss": 0.2284, + "step": 39296 + }, + { + "epoch": 3.18348995463383, + "grad_norm": 0.0647360309958458, + "learning_rate": 4.5362077501237684e-05, + "loss": 0.2102, + "step": 39297 + }, + { + "epoch": 3.1835709656513287, + "grad_norm": 0.07613998651504517, + "learning_rate": 4.535757684864306e-05, + "loss": 0.2236, + "step": 39298 + }, + { + "epoch": 3.183651976668827, + "grad_norm": 0.0889119952917099, + "learning_rate": 4.5353076196048424e-05, + "loss": 0.2244, + "step": 39299 + }, + { + "epoch": 3.183732987686325, + "grad_norm": 0.07261547446250916, + "learning_rate": 4.5348575543453804e-05, + "loss": 0.222, + "step": 39300 + }, + { + "epoch": 3.183813998703824, + "grad_norm": 0.06894300132989883, + "learning_rate": 4.534407489085918e-05, + "loss": 0.1955, + "step": 39301 + }, + { + "epoch": 3.183895009721322, + "grad_norm": 0.08977346867322922, + "learning_rate": 4.533957423826455e-05, + "loss": 0.2679, + "step": 39302 + }, + { + "epoch": 3.1839760207388204, + "grad_norm": 0.08320219069719315, + "learning_rate": 4.5335073585669925e-05, + "loss": 0.2693, + "step": 39303 + }, + { + "epoch": 3.184057031756319, + "grad_norm": 0.08963727951049805, + "learning_rate": 4.53305729330753e-05, + "loss": 0.2384, + "step": 39304 + }, + { + "epoch": 3.1841380427738173, + "grad_norm": 0.06358581781387329, + "learning_rate": 4.532607228048067e-05, + "loss": 0.2049, + "step": 39305 + }, + { + "epoch": 3.1842190537913155, + "grad_norm": 0.0702756866812706, + "learning_rate": 4.5321571627886046e-05, + "loss": 0.2305, + "step": 39306 + }, + { + "epoch": 3.184300064808814, + "grad_norm": 0.0786462277173996, + "learning_rate": 4.531707097529142e-05, + "loss": 0.2217, + "step": 39307 + }, + { + "epoch": 3.1843810758263125, + "grad_norm": 0.06890799850225449, + "learning_rate": 4.531257032269679e-05, + "loss": 
0.2644, + "step": 39308 + }, + { + "epoch": 3.1844620868438107, + "grad_norm": 0.07675221562385559, + "learning_rate": 4.530806967010217e-05, + "loss": 0.2375, + "step": 39309 + }, + { + "epoch": 3.184543097861309, + "grad_norm": 0.07103940099477768, + "learning_rate": 4.530356901750754e-05, + "loss": 0.2751, + "step": 39310 + }, + { + "epoch": 3.1846241088788076, + "grad_norm": 0.06175985932350159, + "learning_rate": 4.5299068364912914e-05, + "loss": 0.2273, + "step": 39311 + }, + { + "epoch": 3.184705119896306, + "grad_norm": 0.06587029248476028, + "learning_rate": 4.529456771231829e-05, + "loss": 0.2443, + "step": 39312 + }, + { + "epoch": 3.184786130913804, + "grad_norm": 0.07483391463756561, + "learning_rate": 4.529006705972366e-05, + "loss": 0.2391, + "step": 39313 + }, + { + "epoch": 3.184867141931303, + "grad_norm": 0.06790148466825485, + "learning_rate": 4.5285566407129035e-05, + "loss": 0.2365, + "step": 39314 + }, + { + "epoch": 3.184948152948801, + "grad_norm": 0.05836471542716026, + "learning_rate": 4.528106575453441e-05, + "loss": 0.212, + "step": 39315 + }, + { + "epoch": 3.1850291639662993, + "grad_norm": 0.08337269723415375, + "learning_rate": 4.527656510193978e-05, + "loss": 0.2291, + "step": 39316 + }, + { + "epoch": 3.185110174983798, + "grad_norm": 0.07346945255994797, + "learning_rate": 4.5272064449345156e-05, + "loss": 0.2382, + "step": 39317 + }, + { + "epoch": 3.1851911860012962, + "grad_norm": 0.07544538378715515, + "learning_rate": 4.526756379675053e-05, + "loss": 0.2542, + "step": 39318 + }, + { + "epoch": 3.1852721970187945, + "grad_norm": 0.07884622365236282, + "learning_rate": 4.52630631441559e-05, + "loss": 0.1987, + "step": 39319 + }, + { + "epoch": 3.1853532080362927, + "grad_norm": 0.06149621307849884, + "learning_rate": 4.5258562491561276e-05, + "loss": 0.2165, + "step": 39320 + }, + { + "epoch": 3.1854342190537914, + "grad_norm": 0.0629139170050621, + "learning_rate": 4.525406183896666e-05, + "loss": 0.2683, + "step": 39321 + }, + { + "epoch": 3.1855152300712897, + "grad_norm": 0.05898153409361839, + "learning_rate": 4.5249561186372024e-05, + "loss": 0.2255, + "step": 39322 + }, + { + "epoch": 3.185596241088788, + "grad_norm": 0.07770536839962006, + "learning_rate": 4.52450605337774e-05, + "loss": 0.2499, + "step": 39323 + }, + { + "epoch": 3.1856772521062866, + "grad_norm": 0.06799954921007156, + "learning_rate": 4.524055988118278e-05, + "loss": 0.255, + "step": 39324 + }, + { + "epoch": 3.185758263123785, + "grad_norm": 0.08119504153728485, + "learning_rate": 4.5236059228588144e-05, + "loss": 0.2339, + "step": 39325 + }, + { + "epoch": 3.185839274141283, + "grad_norm": 0.07942589372396469, + "learning_rate": 4.523155857599352e-05, + "loss": 0.2523, + "step": 39326 + }, + { + "epoch": 3.185920285158782, + "grad_norm": 0.06663884222507477, + "learning_rate": 4.52270579233989e-05, + "loss": 0.1957, + "step": 39327 + }, + { + "epoch": 3.18600129617628, + "grad_norm": 0.06974213570356369, + "learning_rate": 4.5222557270804265e-05, + "loss": 0.2075, + "step": 39328 + }, + { + "epoch": 3.1860823071937783, + "grad_norm": 0.08034425973892212, + "learning_rate": 4.521805661820964e-05, + "loss": 0.2451, + "step": 39329 + }, + { + "epoch": 3.1861633182112765, + "grad_norm": 0.07607818394899368, + "learning_rate": 4.521355596561502e-05, + "loss": 0.2553, + "step": 39330 + }, + { + "epoch": 3.186244329228775, + "grad_norm": 0.09670832008123398, + "learning_rate": 4.5209055313020386e-05, + "loss": 0.2715, + "step": 39331 + }, + { + "epoch": 3.1863253402462735, + 
"grad_norm": 0.06915496289730072, + "learning_rate": 4.520455466042576e-05, + "loss": 0.2111, + "step": 39332 + }, + { + "epoch": 3.1864063512637717, + "grad_norm": 0.06327734887599945, + "learning_rate": 4.520005400783114e-05, + "loss": 0.2232, + "step": 39333 + }, + { + "epoch": 3.1864873622812704, + "grad_norm": 0.06974446773529053, + "learning_rate": 4.5195553355236514e-05, + "loss": 0.2408, + "step": 39334 + }, + { + "epoch": 3.1865683732987686, + "grad_norm": 0.06775286048650742, + "learning_rate": 4.519105270264189e-05, + "loss": 0.1984, + "step": 39335 + }, + { + "epoch": 3.186649384316267, + "grad_norm": 0.08392871916294098, + "learning_rate": 4.518655205004726e-05, + "loss": 0.2351, + "step": 39336 + }, + { + "epoch": 3.1867303953337656, + "grad_norm": 0.0782778263092041, + "learning_rate": 4.5182051397452634e-05, + "loss": 0.2479, + "step": 39337 + }, + { + "epoch": 3.186811406351264, + "grad_norm": 0.09481296688318253, + "learning_rate": 4.517755074485801e-05, + "loss": 0.2382, + "step": 39338 + }, + { + "epoch": 3.186892417368762, + "grad_norm": 0.08505354076623917, + "learning_rate": 4.517305009226338e-05, + "loss": 0.2402, + "step": 39339 + }, + { + "epoch": 3.1869734283862607, + "grad_norm": 0.08199076354503632, + "learning_rate": 4.5168549439668755e-05, + "loss": 0.2511, + "step": 39340 + }, + { + "epoch": 3.187054439403759, + "grad_norm": 0.072618268430233, + "learning_rate": 4.516404878707413e-05, + "loss": 0.2194, + "step": 39341 + }, + { + "epoch": 3.1871354504212572, + "grad_norm": 0.06208586320281029, + "learning_rate": 4.51595481344795e-05, + "loss": 0.2061, + "step": 39342 + }, + { + "epoch": 3.1872164614387555, + "grad_norm": 0.06540694087743759, + "learning_rate": 4.5155047481884876e-05, + "loss": 0.2373, + "step": 39343 + }, + { + "epoch": 3.187297472456254, + "grad_norm": 0.05991688743233681, + "learning_rate": 4.515054682929025e-05, + "loss": 0.2162, + "step": 39344 + }, + { + "epoch": 3.1873784834737524, + "grad_norm": 0.07174012809991837, + "learning_rate": 4.514604617669562e-05, + "loss": 0.2339, + "step": 39345 + }, + { + "epoch": 3.1874594944912507, + "grad_norm": 0.08227608352899551, + "learning_rate": 4.5141545524101e-05, + "loss": 0.2404, + "step": 39346 + }, + { + "epoch": 3.1875405055087493, + "grad_norm": 0.09132973104715347, + "learning_rate": 4.513704487150637e-05, + "loss": 0.2598, + "step": 39347 + }, + { + "epoch": 3.1876215165262476, + "grad_norm": 0.07459663599729538, + "learning_rate": 4.5132544218911744e-05, + "loss": 0.1995, + "step": 39348 + }, + { + "epoch": 3.187702527543746, + "grad_norm": 0.05755617842078209, + "learning_rate": 4.512804356631712e-05, + "loss": 0.2197, + "step": 39349 + }, + { + "epoch": 3.1877835385612445, + "grad_norm": 0.0721568837761879, + "learning_rate": 4.512354291372249e-05, + "loss": 0.2361, + "step": 39350 + }, + { + "epoch": 3.1878645495787428, + "grad_norm": 0.09603045135736465, + "learning_rate": 4.5119042261127865e-05, + "loss": 0.2237, + "step": 39351 + }, + { + "epoch": 3.187945560596241, + "grad_norm": 0.07836206257343292, + "learning_rate": 4.511454160853324e-05, + "loss": 0.2433, + "step": 39352 + }, + { + "epoch": 3.1880265716137393, + "grad_norm": 0.0568229965865612, + "learning_rate": 4.511004095593861e-05, + "loss": 0.207, + "step": 39353 + }, + { + "epoch": 3.188107582631238, + "grad_norm": 0.07176083326339722, + "learning_rate": 4.5105540303343986e-05, + "loss": 0.2325, + "step": 39354 + }, + { + "epoch": 3.188188593648736, + "grad_norm": 0.07750972360372543, + "learning_rate": 
4.510103965074936e-05, + "loss": 0.249, + "step": 39355 + }, + { + "epoch": 3.1882696046662344, + "grad_norm": 0.07936552911996841, + "learning_rate": 4.509653899815473e-05, + "loss": 0.2384, + "step": 39356 + }, + { + "epoch": 3.188350615683733, + "grad_norm": 0.07242981344461441, + "learning_rate": 4.5092038345560107e-05, + "loss": 0.2362, + "step": 39357 + }, + { + "epoch": 3.1884316267012314, + "grad_norm": 0.07366965711116791, + "learning_rate": 4.508753769296548e-05, + "loss": 0.1928, + "step": 39358 + }, + { + "epoch": 3.1885126377187296, + "grad_norm": 0.060326382517814636, + "learning_rate": 4.5083037040370854e-05, + "loss": 0.238, + "step": 39359 + }, + { + "epoch": 3.1885936487362283, + "grad_norm": 0.06053246930241585, + "learning_rate": 4.5078536387776234e-05, + "loss": 0.2133, + "step": 39360 + }, + { + "epoch": 3.1886746597537265, + "grad_norm": 0.08124006539583206, + "learning_rate": 4.50740357351816e-05, + "loss": 0.2512, + "step": 39361 + }, + { + "epoch": 3.188755670771225, + "grad_norm": 0.07329577207565308, + "learning_rate": 4.5069535082586975e-05, + "loss": 0.2079, + "step": 39362 + }, + { + "epoch": 3.1888366817887235, + "grad_norm": 0.08375972509384155, + "learning_rate": 4.5065034429992355e-05, + "loss": 0.2172, + "step": 39363 + }, + { + "epoch": 3.1889176928062217, + "grad_norm": 0.0656561627984047, + "learning_rate": 4.506053377739772e-05, + "loss": 0.2171, + "step": 39364 + }, + { + "epoch": 3.18899870382372, + "grad_norm": 0.0688033252954483, + "learning_rate": 4.5056033124803095e-05, + "loss": 0.2515, + "step": 39365 + }, + { + "epoch": 3.189079714841218, + "grad_norm": 0.07227391749620438, + "learning_rate": 4.5051532472208476e-05, + "loss": 0.2278, + "step": 39366 + }, + { + "epoch": 3.189160725858717, + "grad_norm": 0.07597273588180542, + "learning_rate": 4.504703181961384e-05, + "loss": 0.2374, + "step": 39367 + }, + { + "epoch": 3.189241736876215, + "grad_norm": 0.07532206922769547, + "learning_rate": 4.504253116701922e-05, + "loss": 0.2444, + "step": 39368 + }, + { + "epoch": 3.1893227478937134, + "grad_norm": 0.06747014075517654, + "learning_rate": 4.5038030514424597e-05, + "loss": 0.24, + "step": 39369 + }, + { + "epoch": 3.189403758911212, + "grad_norm": 0.0654434859752655, + "learning_rate": 4.5033529861829963e-05, + "loss": 0.1974, + "step": 39370 + }, + { + "epoch": 3.1894847699287103, + "grad_norm": 0.06667497009038925, + "learning_rate": 4.5029029209235344e-05, + "loss": 0.2363, + "step": 39371 + }, + { + "epoch": 3.1895657809462086, + "grad_norm": 0.07937151938676834, + "learning_rate": 4.502452855664072e-05, + "loss": 0.2154, + "step": 39372 + }, + { + "epoch": 3.1896467919637073, + "grad_norm": 0.07885178923606873, + "learning_rate": 4.502002790404609e-05, + "loss": 0.2385, + "step": 39373 + }, + { + "epoch": 3.1897278029812055, + "grad_norm": 0.06830567121505737, + "learning_rate": 4.5015527251451465e-05, + "loss": 0.1888, + "step": 39374 + }, + { + "epoch": 3.1898088139987038, + "grad_norm": 0.07489234209060669, + "learning_rate": 4.501102659885684e-05, + "loss": 0.2403, + "step": 39375 + }, + { + "epoch": 3.189889825016202, + "grad_norm": 0.0693124309182167, + "learning_rate": 4.500652594626221e-05, + "loss": 0.2068, + "step": 39376 + }, + { + "epoch": 3.1899708360337007, + "grad_norm": 0.0662122517824173, + "learning_rate": 4.5002025293667585e-05, + "loss": 0.214, + "step": 39377 + }, + { + "epoch": 3.190051847051199, + "grad_norm": 0.07063662260770798, + "learning_rate": 4.499752464107296e-05, + "loss": 0.2014, + "step": 39378 + }, + { 
+ "epoch": 3.190132858068697, + "grad_norm": 0.06921156495809555, + "learning_rate": 4.499302398847833e-05, + "loss": 0.2403, + "step": 39379 + }, + { + "epoch": 3.190213869086196, + "grad_norm": 0.07520690560340881, + "learning_rate": 4.4988523335883706e-05, + "loss": 0.2592, + "step": 39380 + }, + { + "epoch": 3.190294880103694, + "grad_norm": 0.042391337454319, + "learning_rate": 4.498402268328908e-05, + "loss": 0.2146, + "step": 39381 + }, + { + "epoch": 3.1903758911211924, + "grad_norm": 0.08247774839401245, + "learning_rate": 4.4979522030694453e-05, + "loss": 0.2418, + "step": 39382 + }, + { + "epoch": 3.190456902138691, + "grad_norm": 0.06482577323913574, + "learning_rate": 4.497502137809983e-05, + "loss": 0.2355, + "step": 39383 + }, + { + "epoch": 3.1905379131561893, + "grad_norm": 0.07847940176725388, + "learning_rate": 4.49705207255052e-05, + "loss": 0.2225, + "step": 39384 + }, + { + "epoch": 3.1906189241736875, + "grad_norm": 0.0957542285323143, + "learning_rate": 4.4966020072910574e-05, + "loss": 0.2325, + "step": 39385 + }, + { + "epoch": 3.190699935191186, + "grad_norm": 0.07233288884162903, + "learning_rate": 4.496151942031595e-05, + "loss": 0.2477, + "step": 39386 + }, + { + "epoch": 3.1907809462086845, + "grad_norm": 0.07749512791633606, + "learning_rate": 4.495701876772132e-05, + "loss": 0.2174, + "step": 39387 + }, + { + "epoch": 3.1908619572261827, + "grad_norm": 0.06869810819625854, + "learning_rate": 4.4952518115126695e-05, + "loss": 0.2335, + "step": 39388 + }, + { + "epoch": 3.190942968243681, + "grad_norm": 0.06706426292657852, + "learning_rate": 4.494801746253207e-05, + "loss": 0.1688, + "step": 39389 + }, + { + "epoch": 3.1910239792611796, + "grad_norm": 0.07330158352851868, + "learning_rate": 4.494351680993744e-05, + "loss": 0.2111, + "step": 39390 + }, + { + "epoch": 3.191104990278678, + "grad_norm": 0.06660532206296921, + "learning_rate": 4.4939016157342816e-05, + "loss": 0.2378, + "step": 39391 + }, + { + "epoch": 3.191186001296176, + "grad_norm": 0.07427835464477539, + "learning_rate": 4.493451550474819e-05, + "loss": 0.2075, + "step": 39392 + }, + { + "epoch": 3.191267012313675, + "grad_norm": 0.08421943336725235, + "learning_rate": 4.493001485215356e-05, + "loss": 0.254, + "step": 39393 + }, + { + "epoch": 3.191348023331173, + "grad_norm": 0.0874992161989212, + "learning_rate": 4.492551419955894e-05, + "loss": 0.2039, + "step": 39394 + }, + { + "epoch": 3.1914290343486713, + "grad_norm": 0.06732051074504852, + "learning_rate": 4.492101354696431e-05, + "loss": 0.2368, + "step": 39395 + }, + { + "epoch": 3.19151004536617, + "grad_norm": 0.06246805191040039, + "learning_rate": 4.4916512894369684e-05, + "loss": 0.2191, + "step": 39396 + }, + { + "epoch": 3.1915910563836682, + "grad_norm": 0.06294985860586166, + "learning_rate": 4.491201224177506e-05, + "loss": 0.2046, + "step": 39397 + }, + { + "epoch": 3.1916720674011665, + "grad_norm": 0.07085716724395752, + "learning_rate": 4.490751158918043e-05, + "loss": 0.2596, + "step": 39398 + }, + { + "epoch": 3.1917530784186647, + "grad_norm": 0.05900321155786514, + "learning_rate": 4.490301093658581e-05, + "loss": 0.2447, + "step": 39399 + }, + { + "epoch": 3.1918340894361634, + "grad_norm": 0.06928402185440063, + "learning_rate": 4.489851028399118e-05, + "loss": 0.1887, + "step": 39400 + }, + { + "epoch": 3.1919151004536617, + "grad_norm": 0.07420630753040314, + "learning_rate": 4.489400963139655e-05, + "loss": 0.2255, + "step": 39401 + }, + { + "epoch": 3.19199611147116, + "grad_norm": 0.07554472237825394, + 
"learning_rate": 4.488950897880193e-05, + "loss": 0.2109, + "step": 39402 + }, + { + "epoch": 3.1920771224886586, + "grad_norm": 0.07431972026824951, + "learning_rate": 4.48850083262073e-05, + "loss": 0.2052, + "step": 39403 + }, + { + "epoch": 3.192158133506157, + "grad_norm": 0.07085475325584412, + "learning_rate": 4.488050767361268e-05, + "loss": 0.2253, + "step": 39404 + }, + { + "epoch": 3.192239144523655, + "grad_norm": 0.08677297085523605, + "learning_rate": 4.487600702101805e-05, + "loss": 0.2393, + "step": 39405 + }, + { + "epoch": 3.192320155541154, + "grad_norm": 0.08472663164138794, + "learning_rate": 4.487150636842342e-05, + "loss": 0.2295, + "step": 39406 + }, + { + "epoch": 3.192401166558652, + "grad_norm": 0.07280577719211578, + "learning_rate": 4.48670057158288e-05, + "loss": 0.2509, + "step": 39407 + }, + { + "epoch": 3.1924821775761503, + "grad_norm": 0.07977905124425888, + "learning_rate": 4.4862505063234174e-05, + "loss": 0.2702, + "step": 39408 + }, + { + "epoch": 3.192563188593649, + "grad_norm": 0.0874275490641594, + "learning_rate": 4.485800441063954e-05, + "loss": 0.2464, + "step": 39409 + }, + { + "epoch": 3.192644199611147, + "grad_norm": 0.06713542342185974, + "learning_rate": 4.485350375804492e-05, + "loss": 0.2273, + "step": 39410 + }, + { + "epoch": 3.1927252106286454, + "grad_norm": 0.06258574873209, + "learning_rate": 4.4849003105450295e-05, + "loss": 0.2559, + "step": 39411 + }, + { + "epoch": 3.1928062216461437, + "grad_norm": 0.06480179727077484, + "learning_rate": 4.484450245285567e-05, + "loss": 0.2516, + "step": 39412 + }, + { + "epoch": 3.1928872326636424, + "grad_norm": 0.07016629725694656, + "learning_rate": 4.484000180026104e-05, + "loss": 0.2385, + "step": 39413 + }, + { + "epoch": 3.1929682436811406, + "grad_norm": 0.06314028799533844, + "learning_rate": 4.4835501147666415e-05, + "loss": 0.2396, + "step": 39414 + }, + { + "epoch": 3.193049254698639, + "grad_norm": 0.07210088521242142, + "learning_rate": 4.483100049507179e-05, + "loss": 0.2155, + "step": 39415 + }, + { + "epoch": 3.1931302657161376, + "grad_norm": 0.0667954757809639, + "learning_rate": 4.482649984247716e-05, + "loss": 0.2494, + "step": 39416 + }, + { + "epoch": 3.193211276733636, + "grad_norm": 0.07332837581634521, + "learning_rate": 4.4821999189882536e-05, + "loss": 0.192, + "step": 39417 + }, + { + "epoch": 3.193292287751134, + "grad_norm": 0.07549264281988144, + "learning_rate": 4.481749853728791e-05, + "loss": 0.2291, + "step": 39418 + }, + { + "epoch": 3.1933732987686327, + "grad_norm": 0.06823220103979111, + "learning_rate": 4.4812997884693284e-05, + "loss": 0.2631, + "step": 39419 + }, + { + "epoch": 3.193454309786131, + "grad_norm": 0.07784800231456757, + "learning_rate": 4.480849723209866e-05, + "loss": 0.2375, + "step": 39420 + }, + { + "epoch": 3.1935353208036292, + "grad_norm": 0.0801076591014862, + "learning_rate": 4.480399657950403e-05, + "loss": 0.2755, + "step": 39421 + }, + { + "epoch": 3.1936163318211275, + "grad_norm": 0.0587022602558136, + "learning_rate": 4.4799495926909404e-05, + "loss": 0.203, + "step": 39422 + }, + { + "epoch": 3.193697342838626, + "grad_norm": 0.07514984905719757, + "learning_rate": 4.479499527431478e-05, + "loss": 0.2331, + "step": 39423 + }, + { + "epoch": 3.1937783538561244, + "grad_norm": 0.07274370640516281, + "learning_rate": 4.479049462172015e-05, + "loss": 0.199, + "step": 39424 + }, + { + "epoch": 3.1938593648736227, + "grad_norm": 0.07186318188905716, + "learning_rate": 4.4785993969125525e-05, + "loss": 0.2202, + "step": 39425 
+ }, + { + "epoch": 3.1939403758911213, + "grad_norm": 0.07100582122802734, + "learning_rate": 4.47814933165309e-05, + "loss": 0.2082, + "step": 39426 + }, + { + "epoch": 3.1940213869086196, + "grad_norm": 0.07508399337530136, + "learning_rate": 4.477699266393627e-05, + "loss": 0.2369, + "step": 39427 + }, + { + "epoch": 3.194102397926118, + "grad_norm": 0.06001093611121178, + "learning_rate": 4.4772492011341646e-05, + "loss": 0.1918, + "step": 39428 + }, + { + "epoch": 3.1941834089436165, + "grad_norm": 0.07501449435949326, + "learning_rate": 4.476799135874702e-05, + "loss": 0.2387, + "step": 39429 + }, + { + "epoch": 3.1942644199611148, + "grad_norm": 0.07020004838705063, + "learning_rate": 4.476349070615239e-05, + "loss": 0.236, + "step": 39430 + }, + { + "epoch": 3.194345430978613, + "grad_norm": 0.06646616011857986, + "learning_rate": 4.475899005355777e-05, + "loss": 0.2265, + "step": 39431 + }, + { + "epoch": 3.1944264419961117, + "grad_norm": 0.0961184874176979, + "learning_rate": 4.475448940096314e-05, + "loss": 0.2664, + "step": 39432 + }, + { + "epoch": 3.19450745301361, + "grad_norm": 0.06955624371767044, + "learning_rate": 4.4749988748368514e-05, + "loss": 0.2384, + "step": 39433 + }, + { + "epoch": 3.194588464031108, + "grad_norm": 0.07705685496330261, + "learning_rate": 4.474548809577389e-05, + "loss": 0.2422, + "step": 39434 + }, + { + "epoch": 3.1946694750486064, + "grad_norm": 0.06348368525505066, + "learning_rate": 4.474098744317926e-05, + "loss": 0.2252, + "step": 39435 + }, + { + "epoch": 3.194750486066105, + "grad_norm": 0.08564847707748413, + "learning_rate": 4.4736486790584635e-05, + "loss": 0.2518, + "step": 39436 + }, + { + "epoch": 3.1948314970836034, + "grad_norm": 0.07809983193874359, + "learning_rate": 4.4731986137990015e-05, + "loss": 0.2477, + "step": 39437 + }, + { + "epoch": 3.1949125081011016, + "grad_norm": 0.07049041986465454, + "learning_rate": 4.472748548539538e-05, + "loss": 0.233, + "step": 39438 + }, + { + "epoch": 3.1949935191186003, + "grad_norm": 0.07803355157375336, + "learning_rate": 4.4722984832800756e-05, + "loss": 0.2325, + "step": 39439 + }, + { + "epoch": 3.1950745301360985, + "grad_norm": 0.07229940593242645, + "learning_rate": 4.4718484180206136e-05, + "loss": 0.2165, + "step": 39440 + }, + { + "epoch": 3.195155541153597, + "grad_norm": 0.06585335731506348, + "learning_rate": 4.471398352761151e-05, + "loss": 0.2107, + "step": 39441 + }, + { + "epoch": 3.195236552171095, + "grad_norm": 0.05904988572001457, + "learning_rate": 4.4709482875016876e-05, + "loss": 0.2193, + "step": 39442 + }, + { + "epoch": 3.1953175631885937, + "grad_norm": 0.06626173108816147, + "learning_rate": 4.470498222242226e-05, + "loss": 0.1995, + "step": 39443 + }, + { + "epoch": 3.195398574206092, + "grad_norm": 0.08097077161073685, + "learning_rate": 4.470048156982763e-05, + "loss": 0.2152, + "step": 39444 + }, + { + "epoch": 3.19547958522359, + "grad_norm": 0.06193895637989044, + "learning_rate": 4.4695980917233e-05, + "loss": 0.1984, + "step": 39445 + }, + { + "epoch": 3.195560596241089, + "grad_norm": 0.06149062514305115, + "learning_rate": 4.469148026463838e-05, + "loss": 0.179, + "step": 39446 + }, + { + "epoch": 3.195641607258587, + "grad_norm": 0.07270010560750961, + "learning_rate": 4.468697961204375e-05, + "loss": 0.2465, + "step": 39447 + }, + { + "epoch": 3.1957226182760854, + "grad_norm": 0.06477028131484985, + "learning_rate": 4.468247895944912e-05, + "loss": 0.2283, + "step": 39448 + }, + { + "epoch": 3.195803629293584, + "grad_norm": 
0.06298864632844925, + "learning_rate": 4.46779783068545e-05, + "loss": 0.2256, + "step": 39449 + }, + { + "epoch": 3.1958846403110823, + "grad_norm": 0.081520214676857, + "learning_rate": 4.467347765425987e-05, + "loss": 0.2327, + "step": 39450 + }, + { + "epoch": 3.1959656513285806, + "grad_norm": 0.08572669327259064, + "learning_rate": 4.466897700166524e-05, + "loss": 0.2633, + "step": 39451 + }, + { + "epoch": 3.1960466623460793, + "grad_norm": 0.07522355765104294, + "learning_rate": 4.466447634907062e-05, + "loss": 0.2173, + "step": 39452 + }, + { + "epoch": 3.1961276733635775, + "grad_norm": 0.07185790687799454, + "learning_rate": 4.465997569647599e-05, + "loss": 0.2607, + "step": 39453 + }, + { + "epoch": 3.1962086843810757, + "grad_norm": 0.07226600497961044, + "learning_rate": 4.4655475043881366e-05, + "loss": 0.2411, + "step": 39454 + }, + { + "epoch": 3.1962896953985744, + "grad_norm": 0.06172429025173187, + "learning_rate": 4.465097439128674e-05, + "loss": 0.1842, + "step": 39455 + }, + { + "epoch": 3.1963707064160727, + "grad_norm": 0.0709967091679573, + "learning_rate": 4.4646473738692114e-05, + "loss": 0.2145, + "step": 39456 + }, + { + "epoch": 3.196451717433571, + "grad_norm": 0.07477506250143051, + "learning_rate": 4.464197308609749e-05, + "loss": 0.2486, + "step": 39457 + }, + { + "epoch": 3.196532728451069, + "grad_norm": 0.07171639055013657, + "learning_rate": 4.463747243350286e-05, + "loss": 0.2086, + "step": 39458 + }, + { + "epoch": 3.196613739468568, + "grad_norm": 0.060899294912815094, + "learning_rate": 4.4632971780908234e-05, + "loss": 0.2491, + "step": 39459 + }, + { + "epoch": 3.196694750486066, + "grad_norm": 0.0627385824918747, + "learning_rate": 4.462847112831361e-05, + "loss": 0.2359, + "step": 39460 + }, + { + "epoch": 3.1967757615035644, + "grad_norm": 0.07605332881212234, + "learning_rate": 4.462397047571898e-05, + "loss": 0.2241, + "step": 39461 + }, + { + "epoch": 3.196856772521063, + "grad_norm": 0.06061991676688194, + "learning_rate": 4.4619469823124355e-05, + "loss": 0.2445, + "step": 39462 + }, + { + "epoch": 3.1969377835385613, + "grad_norm": 0.07427114248275757, + "learning_rate": 4.461496917052973e-05, + "loss": 0.2162, + "step": 39463 + }, + { + "epoch": 3.1970187945560595, + "grad_norm": 0.09817933291196823, + "learning_rate": 4.46104685179351e-05, + "loss": 0.2217, + "step": 39464 + }, + { + "epoch": 3.1970998055735578, + "grad_norm": 0.07270355522632599, + "learning_rate": 4.4605967865340476e-05, + "loss": 0.1933, + "step": 39465 + }, + { + "epoch": 3.1971808165910565, + "grad_norm": 0.06888288259506226, + "learning_rate": 4.460146721274585e-05, + "loss": 0.2389, + "step": 39466 + }, + { + "epoch": 3.1972618276085547, + "grad_norm": 0.05891997739672661, + "learning_rate": 4.459696656015122e-05, + "loss": 0.1841, + "step": 39467 + }, + { + "epoch": 3.197342838626053, + "grad_norm": 0.07122666388750076, + "learning_rate": 4.45924659075566e-05, + "loss": 0.2278, + "step": 39468 + }, + { + "epoch": 3.1974238496435516, + "grad_norm": 0.0724184587597847, + "learning_rate": 4.458796525496197e-05, + "loss": 0.2311, + "step": 39469 + }, + { + "epoch": 3.19750486066105, + "grad_norm": 0.08630391210317612, + "learning_rate": 4.458346460236735e-05, + "loss": 0.2416, + "step": 39470 + }, + { + "epoch": 3.197585871678548, + "grad_norm": 0.07383977621793747, + "learning_rate": 4.457896394977272e-05, + "loss": 0.2148, + "step": 39471 + }, + { + "epoch": 3.197666882696047, + "grad_norm": 0.07253684848546982, + "learning_rate": 4.457446329717809e-05, + 
"loss": 0.1938, + "step": 39472 + }, + { + "epoch": 3.197747893713545, + "grad_norm": 0.08100492507219315, + "learning_rate": 4.456996264458347e-05, + "loss": 0.2485, + "step": 39473 + }, + { + "epoch": 3.1978289047310433, + "grad_norm": 0.0740998312830925, + "learning_rate": 4.456546199198884e-05, + "loss": 0.2162, + "step": 39474 + }, + { + "epoch": 3.197909915748542, + "grad_norm": 0.08284035325050354, + "learning_rate": 4.456096133939421e-05, + "loss": 0.2451, + "step": 39475 + }, + { + "epoch": 3.1979909267660402, + "grad_norm": 0.09450192749500275, + "learning_rate": 4.455646068679959e-05, + "loss": 0.293, + "step": 39476 + }, + { + "epoch": 3.1980719377835385, + "grad_norm": 0.06692380458116531, + "learning_rate": 4.455196003420496e-05, + "loss": 0.227, + "step": 39477 + }, + { + "epoch": 3.198152948801037, + "grad_norm": 0.0706232488155365, + "learning_rate": 4.454745938161033e-05, + "loss": 0.251, + "step": 39478 + }, + { + "epoch": 3.1982339598185354, + "grad_norm": 0.06894442439079285, + "learning_rate": 4.454295872901571e-05, + "loss": 0.2228, + "step": 39479 + }, + { + "epoch": 3.1983149708360337, + "grad_norm": 0.061690591275691986, + "learning_rate": 4.453845807642109e-05, + "loss": 0.183, + "step": 39480 + }, + { + "epoch": 3.198395981853532, + "grad_norm": 0.07705006748437881, + "learning_rate": 4.4533957423826454e-05, + "loss": 0.2654, + "step": 39481 + }, + { + "epoch": 3.1984769928710306, + "grad_norm": 0.06700455397367477, + "learning_rate": 4.4529456771231834e-05, + "loss": 0.2207, + "step": 39482 + }, + { + "epoch": 3.198558003888529, + "grad_norm": 0.07812342047691345, + "learning_rate": 4.452495611863721e-05, + "loss": 0.2465, + "step": 39483 + }, + { + "epoch": 3.198639014906027, + "grad_norm": 0.07196211814880371, + "learning_rate": 4.4520455466042574e-05, + "loss": 0.2448, + "step": 39484 + }, + { + "epoch": 3.198720025923526, + "grad_norm": 0.09784740954637527, + "learning_rate": 4.4515954813447955e-05, + "loss": 0.237, + "step": 39485 + }, + { + "epoch": 3.198801036941024, + "grad_norm": 0.061616454273462296, + "learning_rate": 4.451145416085333e-05, + "loss": 0.222, + "step": 39486 + }, + { + "epoch": 3.1988820479585223, + "grad_norm": 0.0834859237074852, + "learning_rate": 4.4506953508258695e-05, + "loss": 0.227, + "step": 39487 + }, + { + "epoch": 3.1989630589760205, + "grad_norm": 0.07035014033317566, + "learning_rate": 4.4502452855664076e-05, + "loss": 0.2646, + "step": 39488 + }, + { + "epoch": 3.199044069993519, + "grad_norm": 0.05911833420395851, + "learning_rate": 4.449795220306945e-05, + "loss": 0.198, + "step": 39489 + }, + { + "epoch": 3.1991250810110174, + "grad_norm": 0.0757574588060379, + "learning_rate": 4.4493451550474816e-05, + "loss": 0.2499, + "step": 39490 + }, + { + "epoch": 3.1992060920285157, + "grad_norm": 0.07520738989114761, + "learning_rate": 4.4488950897880196e-05, + "loss": 0.2356, + "step": 39491 + }, + { + "epoch": 3.1992871030460144, + "grad_norm": 0.07697313278913498, + "learning_rate": 4.448445024528557e-05, + "loss": 0.2107, + "step": 39492 + }, + { + "epoch": 3.1993681140635126, + "grad_norm": 0.06699033081531525, + "learning_rate": 4.4479949592690944e-05, + "loss": 0.2068, + "step": 39493 + }, + { + "epoch": 3.199449125081011, + "grad_norm": 0.06737679988145828, + "learning_rate": 4.447544894009632e-05, + "loss": 0.2204, + "step": 39494 + }, + { + "epoch": 3.1995301360985096, + "grad_norm": 0.07735428959131241, + "learning_rate": 4.447094828750169e-05, + "loss": 0.2074, + "step": 39495 + }, + { + "epoch": 
3.199611147116008, + "grad_norm": 0.07633911818265915, + "learning_rate": 4.4466447634907065e-05, + "loss": 0.2447, + "step": 39496 + }, + { + "epoch": 3.199692158133506, + "grad_norm": 0.07352489233016968, + "learning_rate": 4.446194698231244e-05, + "loss": 0.1873, + "step": 39497 + }, + { + "epoch": 3.1997731691510047, + "grad_norm": 0.05530984699726105, + "learning_rate": 4.445744632971781e-05, + "loss": 0.2343, + "step": 39498 + }, + { + "epoch": 3.199854180168503, + "grad_norm": 0.06358368694782257, + "learning_rate": 4.4452945677123185e-05, + "loss": 0.182, + "step": 39499 + }, + { + "epoch": 3.1999351911860012, + "grad_norm": 0.060369789600372314, + "learning_rate": 4.444844502452856e-05, + "loss": 0.2203, + "step": 39500 + }, + { + "epoch": 3.2000162022034995, + "grad_norm": 0.07927286624908447, + "learning_rate": 4.444394437193393e-05, + "loss": 0.2598, + "step": 39501 + }, + { + "epoch": 3.200097213220998, + "grad_norm": 0.06475785374641418, + "learning_rate": 4.4439443719339306e-05, + "loss": 0.2214, + "step": 39502 + }, + { + "epoch": 3.2001782242384964, + "grad_norm": 0.08461401611566544, + "learning_rate": 4.443494306674468e-05, + "loss": 0.2505, + "step": 39503 + }, + { + "epoch": 3.2002592352559946, + "grad_norm": 0.07525065541267395, + "learning_rate": 4.443044241415005e-05, + "loss": 0.2428, + "step": 39504 + }, + { + "epoch": 3.2003402462734933, + "grad_norm": 0.05696845054626465, + "learning_rate": 4.442594176155543e-05, + "loss": 0.2137, + "step": 39505 + }, + { + "epoch": 3.2004212572909916, + "grad_norm": 0.06406644731760025, + "learning_rate": 4.442144110896081e-05, + "loss": 0.232, + "step": 39506 + }, + { + "epoch": 3.20050226830849, + "grad_norm": 0.07079577445983887, + "learning_rate": 4.4416940456366174e-05, + "loss": 0.2187, + "step": 39507 + }, + { + "epoch": 3.2005832793259885, + "grad_norm": 0.07825680077075958, + "learning_rate": 4.441243980377155e-05, + "loss": 0.2354, + "step": 39508 + }, + { + "epoch": 3.2006642903434868, + "grad_norm": 0.0631539449095726, + "learning_rate": 4.440793915117693e-05, + "loss": 0.2138, + "step": 39509 + }, + { + "epoch": 3.200745301360985, + "grad_norm": 0.09467824548482895, + "learning_rate": 4.4403438498582295e-05, + "loss": 0.2421, + "step": 39510 + }, + { + "epoch": 3.2008263123784833, + "grad_norm": 0.07566626369953156, + "learning_rate": 4.439893784598767e-05, + "loss": 0.2237, + "step": 39511 + }, + { + "epoch": 3.200907323395982, + "grad_norm": 0.07604029029607773, + "learning_rate": 4.439443719339305e-05, + "loss": 0.2669, + "step": 39512 + }, + { + "epoch": 3.20098833441348, + "grad_norm": 0.07466589659452438, + "learning_rate": 4.4389936540798416e-05, + "loss": 0.2227, + "step": 39513 + }, + { + "epoch": 3.2010693454309784, + "grad_norm": 0.07292468100786209, + "learning_rate": 4.438543588820379e-05, + "loss": 0.2382, + "step": 39514 + }, + { + "epoch": 3.201150356448477, + "grad_norm": 0.06553184986114502, + "learning_rate": 4.438093523560917e-05, + "loss": 0.1965, + "step": 39515 + }, + { + "epoch": 3.2012313674659754, + "grad_norm": 0.06865046918392181, + "learning_rate": 4.4376434583014537e-05, + "loss": 0.216, + "step": 39516 + }, + { + "epoch": 3.2013123784834736, + "grad_norm": 0.06584342569112778, + "learning_rate": 4.437193393041991e-05, + "loss": 0.2279, + "step": 39517 + }, + { + "epoch": 3.2013933895009723, + "grad_norm": 0.0877319946885109, + "learning_rate": 4.436743327782529e-05, + "loss": 0.1958, + "step": 39518 + }, + { + "epoch": 3.2014744005184705, + "grad_norm": 0.0780157521367073, + 
"learning_rate": 4.4362932625230664e-05, + "loss": 0.237, + "step": 39519 + }, + { + "epoch": 3.201555411535969, + "grad_norm": 0.057421378791332245, + "learning_rate": 4.435843197263603e-05, + "loss": 0.2052, + "step": 39520 + }, + { + "epoch": 3.2016364225534675, + "grad_norm": 0.085902638733387, + "learning_rate": 4.435393132004141e-05, + "loss": 0.2236, + "step": 39521 + }, + { + "epoch": 3.2017174335709657, + "grad_norm": 0.07996001839637756, + "learning_rate": 4.4349430667446785e-05, + "loss": 0.2445, + "step": 39522 + }, + { + "epoch": 3.201798444588464, + "grad_norm": 0.06373366713523865, + "learning_rate": 4.434493001485215e-05, + "loss": 0.2834, + "step": 39523 + }, + { + "epoch": 3.201879455605962, + "grad_norm": 0.09339869022369385, + "learning_rate": 4.434042936225753e-05, + "loss": 0.2771, + "step": 39524 + }, + { + "epoch": 3.201960466623461, + "grad_norm": 0.07182280719280243, + "learning_rate": 4.4335928709662906e-05, + "loss": 0.2251, + "step": 39525 + }, + { + "epoch": 3.202041477640959, + "grad_norm": 0.07255586981773376, + "learning_rate": 4.433142805706827e-05, + "loss": 0.22, + "step": 39526 + }, + { + "epoch": 3.2021224886584574, + "grad_norm": 0.09170207381248474, + "learning_rate": 4.432692740447365e-05, + "loss": 0.3099, + "step": 39527 + }, + { + "epoch": 3.202203499675956, + "grad_norm": 0.07020127028226852, + "learning_rate": 4.4322426751879027e-05, + "loss": 0.2411, + "step": 39528 + }, + { + "epoch": 3.2022845106934543, + "grad_norm": 0.10357783734798431, + "learning_rate": 4.4317926099284393e-05, + "loss": 0.2265, + "step": 39529 + }, + { + "epoch": 3.2023655217109526, + "grad_norm": 0.05146079882979393, + "learning_rate": 4.4313425446689774e-05, + "loss": 0.2233, + "step": 39530 + }, + { + "epoch": 3.2024465327284513, + "grad_norm": 0.058229777961969376, + "learning_rate": 4.430892479409515e-05, + "loss": 0.2408, + "step": 39531 + }, + { + "epoch": 3.2025275437459495, + "grad_norm": 0.06619949638843536, + "learning_rate": 4.430442414150052e-05, + "loss": 0.2193, + "step": 39532 + }, + { + "epoch": 3.2026085547634477, + "grad_norm": 0.05871860682964325, + "learning_rate": 4.4299923488905895e-05, + "loss": 0.2181, + "step": 39533 + }, + { + "epoch": 3.202689565780946, + "grad_norm": 0.08122187852859497, + "learning_rate": 4.429542283631127e-05, + "loss": 0.2359, + "step": 39534 + }, + { + "epoch": 3.2027705767984447, + "grad_norm": 0.06815316528081894, + "learning_rate": 4.429092218371664e-05, + "loss": 0.2656, + "step": 39535 + }, + { + "epoch": 3.202851587815943, + "grad_norm": 0.07689882069826126, + "learning_rate": 4.4286421531122015e-05, + "loss": 0.2508, + "step": 39536 + }, + { + "epoch": 3.202932598833441, + "grad_norm": 0.05379384383559227, + "learning_rate": 4.428192087852739e-05, + "loss": 0.1968, + "step": 39537 + }, + { + "epoch": 3.20301360985094, + "grad_norm": 0.06135953962802887, + "learning_rate": 4.427742022593276e-05, + "loss": 0.2714, + "step": 39538 + }, + { + "epoch": 3.203094620868438, + "grad_norm": 0.07056459784507751, + "learning_rate": 4.4272919573338136e-05, + "loss": 0.2241, + "step": 39539 + }, + { + "epoch": 3.2031756318859363, + "grad_norm": 0.06879568845033646, + "learning_rate": 4.426841892074351e-05, + "loss": 0.2303, + "step": 39540 + }, + { + "epoch": 3.203256642903435, + "grad_norm": 0.05646612122654915, + "learning_rate": 4.4263918268148883e-05, + "loss": 0.2042, + "step": 39541 + }, + { + "epoch": 3.2033376539209333, + "grad_norm": 0.06926844269037247, + "learning_rate": 4.425941761555426e-05, + "loss": 0.2233, + 
"step": 39542 + }, + { + "epoch": 3.2034186649384315, + "grad_norm": 0.07639966160058975, + "learning_rate": 4.425491696295963e-05, + "loss": 0.2522, + "step": 39543 + }, + { + "epoch": 3.20349967595593, + "grad_norm": 0.05440759286284447, + "learning_rate": 4.4250416310365004e-05, + "loss": 0.2026, + "step": 39544 + }, + { + "epoch": 3.2035806869734285, + "grad_norm": 0.06447576731443405, + "learning_rate": 4.4245915657770385e-05, + "loss": 0.242, + "step": 39545 + }, + { + "epoch": 3.2036616979909267, + "grad_norm": 0.0778748095035553, + "learning_rate": 4.424141500517575e-05, + "loss": 0.2519, + "step": 39546 + }, + { + "epoch": 3.203742709008425, + "grad_norm": 0.07720351964235306, + "learning_rate": 4.4236914352581125e-05, + "loss": 0.2254, + "step": 39547 + }, + { + "epoch": 3.2038237200259236, + "grad_norm": 0.07186136394739151, + "learning_rate": 4.4232413699986505e-05, + "loss": 0.2052, + "step": 39548 + }, + { + "epoch": 3.203904731043422, + "grad_norm": 0.06151605397462845, + "learning_rate": 4.422791304739187e-05, + "loss": 0.1893, + "step": 39549 + }, + { + "epoch": 3.20398574206092, + "grad_norm": 0.07115894556045532, + "learning_rate": 4.4223412394797246e-05, + "loss": 0.2142, + "step": 39550 + }, + { + "epoch": 3.204066753078419, + "grad_norm": 0.062248844653367996, + "learning_rate": 4.4218911742202626e-05, + "loss": 0.2066, + "step": 39551 + }, + { + "epoch": 3.204147764095917, + "grad_norm": 0.06971555203199387, + "learning_rate": 4.421441108960799e-05, + "loss": 0.218, + "step": 39552 + }, + { + "epoch": 3.2042287751134153, + "grad_norm": 0.07507779449224472, + "learning_rate": 4.420991043701337e-05, + "loss": 0.2896, + "step": 39553 + }, + { + "epoch": 3.204309786130914, + "grad_norm": 0.08263830095529556, + "learning_rate": 4.420540978441875e-05, + "loss": 0.2203, + "step": 39554 + }, + { + "epoch": 3.2043907971484122, + "grad_norm": 0.06579352170228958, + "learning_rate": 4.4200909131824114e-05, + "loss": 0.2482, + "step": 39555 + }, + { + "epoch": 3.2044718081659105, + "grad_norm": 0.07024890184402466, + "learning_rate": 4.419640847922949e-05, + "loss": 0.2254, + "step": 39556 + }, + { + "epoch": 3.2045528191834087, + "grad_norm": 0.058557070791721344, + "learning_rate": 4.419190782663487e-05, + "loss": 0.1969, + "step": 39557 + }, + { + "epoch": 3.2046338302009074, + "grad_norm": 0.07652173936367035, + "learning_rate": 4.418740717404024e-05, + "loss": 0.2163, + "step": 39558 + }, + { + "epoch": 3.2047148412184057, + "grad_norm": 0.07383786141872406, + "learning_rate": 4.418290652144561e-05, + "loss": 0.2454, + "step": 39559 + }, + { + "epoch": 3.204795852235904, + "grad_norm": 0.07195722311735153, + "learning_rate": 4.417840586885099e-05, + "loss": 0.2394, + "step": 39560 + }, + { + "epoch": 3.2048768632534026, + "grad_norm": 0.0840165838599205, + "learning_rate": 4.417390521625636e-05, + "loss": 0.2379, + "step": 39561 + }, + { + "epoch": 3.204957874270901, + "grad_norm": 0.07644258439540863, + "learning_rate": 4.416940456366173e-05, + "loss": 0.2611, + "step": 39562 + }, + { + "epoch": 3.205038885288399, + "grad_norm": 0.06418444216251373, + "learning_rate": 4.416490391106711e-05, + "loss": 0.2305, + "step": 39563 + }, + { + "epoch": 3.2051198963058978, + "grad_norm": 0.07186094671487808, + "learning_rate": 4.416040325847248e-05, + "loss": 0.2325, + "step": 39564 + }, + { + "epoch": 3.205200907323396, + "grad_norm": 0.05062396451830864, + "learning_rate": 4.415590260587785e-05, + "loss": 0.222, + "step": 39565 + }, + { + "epoch": 3.2052819183408943, + 
"grad_norm": 0.07012312859296799, + "learning_rate": 4.415140195328323e-05, + "loss": 0.2384, + "step": 39566 + }, + { + "epoch": 3.205362929358393, + "grad_norm": 0.0643838495016098, + "learning_rate": 4.4146901300688604e-05, + "loss": 0.2304, + "step": 39567 + }, + { + "epoch": 3.205443940375891, + "grad_norm": 0.07166772335767746, + "learning_rate": 4.414240064809397e-05, + "loss": 0.2272, + "step": 39568 + }, + { + "epoch": 3.2055249513933894, + "grad_norm": 0.07338915765285492, + "learning_rate": 4.413789999549935e-05, + "loss": 0.1954, + "step": 39569 + }, + { + "epoch": 3.2056059624108877, + "grad_norm": 0.07821765542030334, + "learning_rate": 4.4133399342904725e-05, + "loss": 0.2588, + "step": 39570 + }, + { + "epoch": 3.2056869734283864, + "grad_norm": 0.06987085193395615, + "learning_rate": 4.41288986903101e-05, + "loss": 0.232, + "step": 39571 + }, + { + "epoch": 3.2057679844458846, + "grad_norm": 0.07386733591556549, + "learning_rate": 4.412439803771547e-05, + "loss": 0.2104, + "step": 39572 + }, + { + "epoch": 3.205848995463383, + "grad_norm": 0.06136629357933998, + "learning_rate": 4.4119897385120846e-05, + "loss": 0.2384, + "step": 39573 + }, + { + "epoch": 3.2059300064808816, + "grad_norm": 0.06790085136890411, + "learning_rate": 4.411539673252622e-05, + "loss": 0.1765, + "step": 39574 + }, + { + "epoch": 3.20601101749838, + "grad_norm": 0.0645691379904747, + "learning_rate": 4.411089607993159e-05, + "loss": 0.1908, + "step": 39575 + }, + { + "epoch": 3.206092028515878, + "grad_norm": 0.06009569764137268, + "learning_rate": 4.4106395427336966e-05, + "loss": 0.2207, + "step": 39576 + }, + { + "epoch": 3.2061730395333767, + "grad_norm": 0.06850836426019669, + "learning_rate": 4.410189477474234e-05, + "loss": 0.2248, + "step": 39577 + }, + { + "epoch": 3.206254050550875, + "grad_norm": 0.07242856919765472, + "learning_rate": 4.4097394122147714e-05, + "loss": 0.2094, + "step": 39578 + }, + { + "epoch": 3.2063350615683732, + "grad_norm": 0.06407655775547028, + "learning_rate": 4.409289346955309e-05, + "loss": 0.2143, + "step": 39579 + }, + { + "epoch": 3.2064160725858715, + "grad_norm": 0.06614059209823608, + "learning_rate": 4.408839281695846e-05, + "loss": 0.2229, + "step": 39580 + }, + { + "epoch": 3.20649708360337, + "grad_norm": 0.06821591407060623, + "learning_rate": 4.4083892164363834e-05, + "loss": 0.2574, + "step": 39581 + }, + { + "epoch": 3.2065780946208684, + "grad_norm": 0.056826550513505936, + "learning_rate": 4.407939151176921e-05, + "loss": 0.1807, + "step": 39582 + }, + { + "epoch": 3.2066591056383666, + "grad_norm": 0.053199753165245056, + "learning_rate": 4.407489085917458e-05, + "loss": 0.1914, + "step": 39583 + }, + { + "epoch": 3.2067401166558653, + "grad_norm": 0.0675586462020874, + "learning_rate": 4.4070390206579955e-05, + "loss": 0.217, + "step": 39584 + }, + { + "epoch": 3.2068211276733636, + "grad_norm": 0.08078780025243759, + "learning_rate": 4.406588955398533e-05, + "loss": 0.2329, + "step": 39585 + }, + { + "epoch": 3.206902138690862, + "grad_norm": 0.06861063092947006, + "learning_rate": 4.40613889013907e-05, + "loss": 0.2259, + "step": 39586 + }, + { + "epoch": 3.2069831497083605, + "grad_norm": 0.07793055474758148, + "learning_rate": 4.405688824879608e-05, + "loss": 0.2086, + "step": 39587 + }, + { + "epoch": 3.2070641607258588, + "grad_norm": 0.06235434487462044, + "learning_rate": 4.405238759620145e-05, + "loss": 0.2656, + "step": 39588 + }, + { + "epoch": 3.207145171743357, + "grad_norm": 0.05257926881313324, + "learning_rate": 
4.404788694360682e-05, + "loss": 0.1803, + "step": 39589 + }, + { + "epoch": 3.2072261827608557, + "grad_norm": 0.0629105344414711, + "learning_rate": 4.4043386291012204e-05, + "loss": 0.1994, + "step": 39590 + }, + { + "epoch": 3.207307193778354, + "grad_norm": 0.060108739882707596, + "learning_rate": 4.403888563841757e-05, + "loss": 0.2169, + "step": 39591 + }, + { + "epoch": 3.207388204795852, + "grad_norm": 0.058636438101530075, + "learning_rate": 4.4034384985822944e-05, + "loss": 0.2089, + "step": 39592 + }, + { + "epoch": 3.2074692158133504, + "grad_norm": 0.06358274072408676, + "learning_rate": 4.4029884333228324e-05, + "loss": 0.2394, + "step": 39593 + }, + { + "epoch": 3.207550226830849, + "grad_norm": 0.05463390052318573, + "learning_rate": 4.402538368063369e-05, + "loss": 0.1892, + "step": 39594 + }, + { + "epoch": 3.2076312378483474, + "grad_norm": 0.0995706096291542, + "learning_rate": 4.4020883028039065e-05, + "loss": 0.2542, + "step": 39595 + }, + { + "epoch": 3.2077122488658456, + "grad_norm": 0.07657454162836075, + "learning_rate": 4.4016382375444445e-05, + "loss": 0.2413, + "step": 39596 + }, + { + "epoch": 3.2077932598833443, + "grad_norm": 0.0618801973760128, + "learning_rate": 4.401188172284981e-05, + "loss": 0.2223, + "step": 39597 + }, + { + "epoch": 3.2078742709008425, + "grad_norm": 0.05365942418575287, + "learning_rate": 4.4007381070255186e-05, + "loss": 0.2279, + "step": 39598 + }, + { + "epoch": 3.207955281918341, + "grad_norm": 0.06541714072227478, + "learning_rate": 4.4002880417660566e-05, + "loss": 0.1824, + "step": 39599 + }, + { + "epoch": 3.2080362929358395, + "grad_norm": 0.06659148633480072, + "learning_rate": 4.399837976506594e-05, + "loss": 0.2059, + "step": 39600 + }, + { + "epoch": 3.2081173039533377, + "grad_norm": 0.07549691945314407, + "learning_rate": 4.3993879112471306e-05, + "loss": 0.2436, + "step": 39601 + }, + { + "epoch": 3.208198314970836, + "grad_norm": 0.06712198257446289, + "learning_rate": 4.398937845987669e-05, + "loss": 0.2181, + "step": 39602 + }, + { + "epoch": 3.208279325988334, + "grad_norm": 0.07315943390130997, + "learning_rate": 4.398487780728206e-05, + "loss": 0.2334, + "step": 39603 + }, + { + "epoch": 3.208360337005833, + "grad_norm": 0.06703237444162369, + "learning_rate": 4.398037715468743e-05, + "loss": 0.2409, + "step": 39604 + }, + { + "epoch": 3.208441348023331, + "grad_norm": 0.07603147625923157, + "learning_rate": 4.397587650209281e-05, + "loss": 0.2644, + "step": 39605 + }, + { + "epoch": 3.2085223590408294, + "grad_norm": 0.07170654088258743, + "learning_rate": 4.397137584949818e-05, + "loss": 0.247, + "step": 39606 + }, + { + "epoch": 3.208603370058328, + "grad_norm": 0.08088883757591248, + "learning_rate": 4.396687519690355e-05, + "loss": 0.2449, + "step": 39607 + }, + { + "epoch": 3.2086843810758263, + "grad_norm": 0.06888744235038757, + "learning_rate": 4.396237454430893e-05, + "loss": 0.2266, + "step": 39608 + }, + { + "epoch": 3.2087653920933246, + "grad_norm": 0.06969289481639862, + "learning_rate": 4.39578738917143e-05, + "loss": 0.2248, + "step": 39609 + }, + { + "epoch": 3.2088464031108233, + "grad_norm": 0.07196550071239471, + "learning_rate": 4.395337323911967e-05, + "loss": 0.213, + "step": 39610 + }, + { + "epoch": 3.2089274141283215, + "grad_norm": 0.04962148889899254, + "learning_rate": 4.394887258652505e-05, + "loss": 0.2131, + "step": 39611 + }, + { + "epoch": 3.2090084251458197, + "grad_norm": 0.06372936069965363, + "learning_rate": 4.394437193393042e-05, + "loss": 0.2421, + "step": 39612 + }, + 
{ + "epoch": 3.2090894361633184, + "grad_norm": 0.05823584273457527, + "learning_rate": 4.3939871281335796e-05, + "loss": 0.2137, + "step": 39613 + }, + { + "epoch": 3.2091704471808167, + "grad_norm": 0.06661396473646164, + "learning_rate": 4.393537062874117e-05, + "loss": 0.2285, + "step": 39614 + }, + { + "epoch": 3.209251458198315, + "grad_norm": 0.07596556842327118, + "learning_rate": 4.3930869976146544e-05, + "loss": 0.2256, + "step": 39615 + }, + { + "epoch": 3.209332469215813, + "grad_norm": 0.08950075507164001, + "learning_rate": 4.392636932355192e-05, + "loss": 0.2299, + "step": 39616 + }, + { + "epoch": 3.209413480233312, + "grad_norm": 0.060839664191007614, + "learning_rate": 4.392186867095729e-05, + "loss": 0.1991, + "step": 39617 + }, + { + "epoch": 3.20949449125081, + "grad_norm": 0.08028296381235123, + "learning_rate": 4.3917368018362664e-05, + "loss": 0.2459, + "step": 39618 + }, + { + "epoch": 3.2095755022683083, + "grad_norm": 0.08345425873994827, + "learning_rate": 4.391286736576804e-05, + "loss": 0.2446, + "step": 39619 + }, + { + "epoch": 3.209656513285807, + "grad_norm": 0.09471633285284042, + "learning_rate": 4.390836671317341e-05, + "loss": 0.2773, + "step": 39620 + }, + { + "epoch": 3.2097375243033053, + "grad_norm": 0.08462955057621002, + "learning_rate": 4.3903866060578785e-05, + "loss": 0.2488, + "step": 39621 + }, + { + "epoch": 3.2098185353208035, + "grad_norm": 0.0768325999379158, + "learning_rate": 4.389936540798416e-05, + "loss": 0.2148, + "step": 39622 + }, + { + "epoch": 3.209899546338302, + "grad_norm": 0.08202141523361206, + "learning_rate": 4.389486475538953e-05, + "loss": 0.2505, + "step": 39623 + }, + { + "epoch": 3.2099805573558005, + "grad_norm": 0.06823612749576569, + "learning_rate": 4.3890364102794906e-05, + "loss": 0.2475, + "step": 39624 + }, + { + "epoch": 3.2100615683732987, + "grad_norm": 0.08542759716510773, + "learning_rate": 4.388586345020028e-05, + "loss": 0.2673, + "step": 39625 + }, + { + "epoch": 3.210142579390797, + "grad_norm": 0.06803088635206223, + "learning_rate": 4.388136279760566e-05, + "loss": 0.2603, + "step": 39626 + }, + { + "epoch": 3.2102235904082956, + "grad_norm": 0.06601130217313766, + "learning_rate": 4.387686214501103e-05, + "loss": 0.2041, + "step": 39627 + }, + { + "epoch": 3.210304601425794, + "grad_norm": 0.07751946896314621, + "learning_rate": 4.38723614924164e-05, + "loss": 0.2676, + "step": 39628 + }, + { + "epoch": 3.210385612443292, + "grad_norm": 0.06715001910924911, + "learning_rate": 4.386786083982178e-05, + "loss": 0.2719, + "step": 39629 + }, + { + "epoch": 3.210466623460791, + "grad_norm": 0.07272826135158539, + "learning_rate": 4.386336018722715e-05, + "loss": 0.1906, + "step": 39630 + }, + { + "epoch": 3.210547634478289, + "grad_norm": 0.06991291791200638, + "learning_rate": 4.385885953463252e-05, + "loss": 0.2383, + "step": 39631 + }, + { + "epoch": 3.2106286454957873, + "grad_norm": 0.07351662963628769, + "learning_rate": 4.38543588820379e-05, + "loss": 0.2545, + "step": 39632 + }, + { + "epoch": 3.210709656513286, + "grad_norm": 0.06192673742771149, + "learning_rate": 4.384985822944327e-05, + "loss": 0.2378, + "step": 39633 + }, + { + "epoch": 3.2107906675307842, + "grad_norm": 0.06056393310427666, + "learning_rate": 4.384535757684864e-05, + "loss": 0.2338, + "step": 39634 + }, + { + "epoch": 3.2108716785482825, + "grad_norm": 0.07918339222669601, + "learning_rate": 4.384085692425402e-05, + "loss": 0.2393, + "step": 39635 + }, + { + "epoch": 3.210952689565781, + "grad_norm": 0.0640377625823021, + 
"learning_rate": 4.383635627165939e-05, + "loss": 0.2395, + "step": 39636 + }, + { + "epoch": 3.2110337005832794, + "grad_norm": 0.0642380639910698, + "learning_rate": 4.383185561906476e-05, + "loss": 0.2441, + "step": 39637 + }, + { + "epoch": 3.2111147116007777, + "grad_norm": 0.08133316785097122, + "learning_rate": 4.382735496647014e-05, + "loss": 0.2384, + "step": 39638 + }, + { + "epoch": 3.211195722618276, + "grad_norm": 0.06930825859308243, + "learning_rate": 4.382285431387552e-05, + "loss": 0.2195, + "step": 39639 + }, + { + "epoch": 3.2112767336357746, + "grad_norm": 0.07919151335954666, + "learning_rate": 4.3818353661280884e-05, + "loss": 0.2524, + "step": 39640 + }, + { + "epoch": 3.211357744653273, + "grad_norm": 0.06062348932027817, + "learning_rate": 4.3813853008686264e-05, + "loss": 0.2116, + "step": 39641 + }, + { + "epoch": 3.211438755670771, + "grad_norm": 0.06613833457231522, + "learning_rate": 4.380935235609164e-05, + "loss": 0.2114, + "step": 39642 + }, + { + "epoch": 3.2115197666882698, + "grad_norm": 0.0750194787979126, + "learning_rate": 4.3804851703497005e-05, + "loss": 0.2361, + "step": 39643 + }, + { + "epoch": 3.211600777705768, + "grad_norm": 0.06285949796438217, + "learning_rate": 4.3800351050902385e-05, + "loss": 0.2189, + "step": 39644 + }, + { + "epoch": 3.2116817887232663, + "grad_norm": 0.08646561950445175, + "learning_rate": 4.379585039830776e-05, + "loss": 0.2534, + "step": 39645 + }, + { + "epoch": 3.211762799740765, + "grad_norm": 0.07352367043495178, + "learning_rate": 4.3791349745713125e-05, + "loss": 0.2232, + "step": 39646 + }, + { + "epoch": 3.211843810758263, + "grad_norm": 0.0724693015217781, + "learning_rate": 4.3786849093118506e-05, + "loss": 0.2481, + "step": 39647 + }, + { + "epoch": 3.2119248217757614, + "grad_norm": 0.07707555592060089, + "learning_rate": 4.378234844052388e-05, + "loss": 0.2501, + "step": 39648 + }, + { + "epoch": 3.2120058327932597, + "grad_norm": 0.06952345371246338, + "learning_rate": 4.3777847787929246e-05, + "loss": 0.2316, + "step": 39649 + }, + { + "epoch": 3.2120868438107584, + "grad_norm": 0.07168091088533401, + "learning_rate": 4.3773347135334627e-05, + "loss": 0.2457, + "step": 39650 + }, + { + "epoch": 3.2121678548282566, + "grad_norm": 0.06950316578149796, + "learning_rate": 4.376884648274e-05, + "loss": 0.227, + "step": 39651 + }, + { + "epoch": 3.212248865845755, + "grad_norm": 0.07516512274742126, + "learning_rate": 4.3764345830145374e-05, + "loss": 0.233, + "step": 39652 + }, + { + "epoch": 3.2123298768632536, + "grad_norm": 0.07295958697795868, + "learning_rate": 4.375984517755075e-05, + "loss": 0.182, + "step": 39653 + }, + { + "epoch": 3.212410887880752, + "grad_norm": 0.06907928735017776, + "learning_rate": 4.375534452495612e-05, + "loss": 0.202, + "step": 39654 + }, + { + "epoch": 3.21249189889825, + "grad_norm": 0.07010002434253693, + "learning_rate": 4.3750843872361495e-05, + "loss": 0.2223, + "step": 39655 + }, + { + "epoch": 3.2125729099157487, + "grad_norm": 0.08001629263162613, + "learning_rate": 4.374634321976687e-05, + "loss": 0.249, + "step": 39656 + }, + { + "epoch": 3.212653920933247, + "grad_norm": 0.07292798161506653, + "learning_rate": 4.374184256717224e-05, + "loss": 0.2687, + "step": 39657 + }, + { + "epoch": 3.212734931950745, + "grad_norm": 0.06464848667383194, + "learning_rate": 4.3737341914577615e-05, + "loss": 0.221, + "step": 39658 + }, + { + "epoch": 3.212815942968244, + "grad_norm": 0.06706438958644867, + "learning_rate": 4.373284126198299e-05, + "loss": 0.2298, + "step": 
39659 + }, + { + "epoch": 3.212896953985742, + "grad_norm": 0.07052440941333771, + "learning_rate": 4.372834060938836e-05, + "loss": 0.2114, + "step": 39660 + }, + { + "epoch": 3.2129779650032404, + "grad_norm": 0.07753120362758636, + "learning_rate": 4.3723839956793736e-05, + "loss": 0.2355, + "step": 39661 + }, + { + "epoch": 3.2130589760207386, + "grad_norm": 0.09529384970664978, + "learning_rate": 4.371933930419911e-05, + "loss": 0.2509, + "step": 39662 + }, + { + "epoch": 3.2131399870382373, + "grad_norm": 0.07285930961370468, + "learning_rate": 4.371483865160448e-05, + "loss": 0.1771, + "step": 39663 + }, + { + "epoch": 3.2132209980557356, + "grad_norm": 0.07032152265310287, + "learning_rate": 4.371033799900986e-05, + "loss": 0.1986, + "step": 39664 + }, + { + "epoch": 3.213302009073234, + "grad_norm": 0.05863802880048752, + "learning_rate": 4.370583734641524e-05, + "loss": 0.2618, + "step": 39665 + }, + { + "epoch": 3.2133830200907325, + "grad_norm": 0.06316442787647247, + "learning_rate": 4.3701336693820604e-05, + "loss": 0.2136, + "step": 39666 + }, + { + "epoch": 3.2134640311082308, + "grad_norm": 0.08077801018953323, + "learning_rate": 4.369683604122598e-05, + "loss": 0.1927, + "step": 39667 + }, + { + "epoch": 3.213545042125729, + "grad_norm": 0.07931190729141235, + "learning_rate": 4.369233538863136e-05, + "loss": 0.2187, + "step": 39668 + }, + { + "epoch": 3.2136260531432272, + "grad_norm": 0.06426815688610077, + "learning_rate": 4.3687834736036725e-05, + "loss": 0.2464, + "step": 39669 + }, + { + "epoch": 3.213707064160726, + "grad_norm": 0.06752730160951614, + "learning_rate": 4.36833340834421e-05, + "loss": 0.2327, + "step": 39670 + }, + { + "epoch": 3.213788075178224, + "grad_norm": 0.06728702038526535, + "learning_rate": 4.367883343084748e-05, + "loss": 0.2123, + "step": 39671 + }, + { + "epoch": 3.2138690861957224, + "grad_norm": 0.07387802749872208, + "learning_rate": 4.3674332778252846e-05, + "loss": 0.2287, + "step": 39672 + }, + { + "epoch": 3.213950097213221, + "grad_norm": 0.08815514296293259, + "learning_rate": 4.366983212565822e-05, + "loss": 0.2163, + "step": 39673 + }, + { + "epoch": 3.2140311082307194, + "grad_norm": 0.07101006805896759, + "learning_rate": 4.36653314730636e-05, + "loss": 0.2251, + "step": 39674 + }, + { + "epoch": 3.2141121192482176, + "grad_norm": 0.07125383615493774, + "learning_rate": 4.366083082046897e-05, + "loss": 0.2396, + "step": 39675 + }, + { + "epoch": 3.2141931302657163, + "grad_norm": 0.07291600108146667, + "learning_rate": 4.365633016787434e-05, + "loss": 0.2282, + "step": 39676 + }, + { + "epoch": 3.2142741412832145, + "grad_norm": 0.061833396553993225, + "learning_rate": 4.365182951527972e-05, + "loss": 0.2206, + "step": 39677 + }, + { + "epoch": 3.214355152300713, + "grad_norm": 0.057680822908878326, + "learning_rate": 4.3647328862685094e-05, + "loss": 0.1991, + "step": 39678 + }, + { + "epoch": 3.2144361633182115, + "grad_norm": 0.07487655431032181, + "learning_rate": 4.364282821009046e-05, + "loss": 0.2321, + "step": 39679 + }, + { + "epoch": 3.2145171743357097, + "grad_norm": 0.07213618606328964, + "learning_rate": 4.363832755749584e-05, + "loss": 0.2774, + "step": 39680 + }, + { + "epoch": 3.214598185353208, + "grad_norm": 0.06856631487607956, + "learning_rate": 4.3633826904901215e-05, + "loss": 0.2266, + "step": 39681 + }, + { + "epoch": 3.2146791963707066, + "grad_norm": 0.06372292339801788, + "learning_rate": 4.362932625230658e-05, + "loss": 0.1977, + "step": 39682 + }, + { + "epoch": 3.214760207388205, + "grad_norm": 
0.06928879767656326, + "learning_rate": 4.362482559971196e-05, + "loss": 0.2487, + "step": 39683 + }, + { + "epoch": 3.214841218405703, + "grad_norm": 0.07648466527462006, + "learning_rate": 4.3620324947117336e-05, + "loss": 0.2154, + "step": 39684 + }, + { + "epoch": 3.2149222294232014, + "grad_norm": 0.07209611684083939, + "learning_rate": 4.36158242945227e-05, + "loss": 0.2258, + "step": 39685 + }, + { + "epoch": 3.2150032404407, + "grad_norm": 0.06907076388597488, + "learning_rate": 4.361132364192808e-05, + "loss": 0.2124, + "step": 39686 + }, + { + "epoch": 3.2150842514581983, + "grad_norm": 0.06519749760627747, + "learning_rate": 4.360682298933346e-05, + "loss": 0.2407, + "step": 39687 + }, + { + "epoch": 3.2151652624756966, + "grad_norm": 0.06849552690982819, + "learning_rate": 4.3602322336738823e-05, + "loss": 0.2258, + "step": 39688 + }, + { + "epoch": 3.2152462734931953, + "grad_norm": 0.08750832825899124, + "learning_rate": 4.3597821684144204e-05, + "loss": 0.2739, + "step": 39689 + }, + { + "epoch": 3.2153272845106935, + "grad_norm": 0.06199619174003601, + "learning_rate": 4.359332103154958e-05, + "loss": 0.2294, + "step": 39690 + }, + { + "epoch": 3.2154082955281917, + "grad_norm": 0.05395563319325447, + "learning_rate": 4.358882037895495e-05, + "loss": 0.2295, + "step": 39691 + }, + { + "epoch": 3.21548930654569, + "grad_norm": 0.08060085028409958, + "learning_rate": 4.3584319726360325e-05, + "loss": 0.2359, + "step": 39692 + }, + { + "epoch": 3.2155703175631887, + "grad_norm": 0.07682564109563828, + "learning_rate": 4.35798190737657e-05, + "loss": 0.2066, + "step": 39693 + }, + { + "epoch": 3.215651328580687, + "grad_norm": 0.08141874521970749, + "learning_rate": 4.357531842117107e-05, + "loss": 0.2027, + "step": 39694 + }, + { + "epoch": 3.215732339598185, + "grad_norm": 0.07336656004190445, + "learning_rate": 4.3570817768576445e-05, + "loss": 0.2265, + "step": 39695 + }, + { + "epoch": 3.215813350615684, + "grad_norm": 0.09699318557977676, + "learning_rate": 4.356631711598182e-05, + "loss": 0.2345, + "step": 39696 + }, + { + "epoch": 3.215894361633182, + "grad_norm": 0.06387688219547272, + "learning_rate": 4.356181646338719e-05, + "loss": 0.1987, + "step": 39697 + }, + { + "epoch": 3.2159753726506803, + "grad_norm": 0.06244372949004173, + "learning_rate": 4.3557315810792566e-05, + "loss": 0.198, + "step": 39698 + }, + { + "epoch": 3.216056383668179, + "grad_norm": 0.07071885466575623, + "learning_rate": 4.355281515819794e-05, + "loss": 0.2346, + "step": 39699 + }, + { + "epoch": 3.2161373946856773, + "grad_norm": 0.0655641108751297, + "learning_rate": 4.3548314505603313e-05, + "loss": 0.2039, + "step": 39700 + }, + { + "epoch": 3.2162184057031755, + "grad_norm": 0.06710990518331528, + "learning_rate": 4.354381385300869e-05, + "loss": 0.2153, + "step": 39701 + }, + { + "epoch": 3.216299416720674, + "grad_norm": 0.07572393864393234, + "learning_rate": 4.353931320041406e-05, + "loss": 0.2441, + "step": 39702 + }, + { + "epoch": 3.2163804277381725, + "grad_norm": 0.07014419138431549, + "learning_rate": 4.3534812547819434e-05, + "loss": 0.2369, + "step": 39703 + }, + { + "epoch": 3.2164614387556707, + "grad_norm": 0.06862316280603409, + "learning_rate": 4.3530311895224815e-05, + "loss": 0.2343, + "step": 39704 + }, + { + "epoch": 3.216542449773169, + "grad_norm": 0.08650745451450348, + "learning_rate": 4.352581124263018e-05, + "loss": 0.2821, + "step": 39705 + }, + { + "epoch": 3.2166234607906676, + "grad_norm": 0.07822742313146591, + "learning_rate": 4.3521310590035555e-05, + 
"loss": 0.2062, + "step": 39706 + }, + { + "epoch": 3.216704471808166, + "grad_norm": 0.06896897405385971, + "learning_rate": 4.3516809937440935e-05, + "loss": 0.2286, + "step": 39707 + }, + { + "epoch": 3.216785482825664, + "grad_norm": 0.05687590688467026, + "learning_rate": 4.35123092848463e-05, + "loss": 0.2387, + "step": 39708 + }, + { + "epoch": 3.216866493843163, + "grad_norm": 0.06400833278894424, + "learning_rate": 4.3507808632251676e-05, + "loss": 0.2247, + "step": 39709 + }, + { + "epoch": 3.216947504860661, + "grad_norm": 0.0717509388923645, + "learning_rate": 4.3503307979657056e-05, + "loss": 0.247, + "step": 39710 + }, + { + "epoch": 3.2170285158781593, + "grad_norm": 0.08033451437950134, + "learning_rate": 4.349880732706242e-05, + "loss": 0.2517, + "step": 39711 + }, + { + "epoch": 3.217109526895658, + "grad_norm": 0.06716078519821167, + "learning_rate": 4.34943066744678e-05, + "loss": 0.2185, + "step": 39712 + }, + { + "epoch": 3.2171905379131562, + "grad_norm": 0.08149128407239914, + "learning_rate": 4.348980602187318e-05, + "loss": 0.2789, + "step": 39713 + }, + { + "epoch": 3.2172715489306545, + "grad_norm": 0.09534599632024765, + "learning_rate": 4.3485305369278544e-05, + "loss": 0.2281, + "step": 39714 + }, + { + "epoch": 3.2173525599481527, + "grad_norm": 0.08841525763273239, + "learning_rate": 4.348080471668392e-05, + "loss": 0.2501, + "step": 39715 + }, + { + "epoch": 3.2174335709656514, + "grad_norm": 0.059520173817873, + "learning_rate": 4.34763040640893e-05, + "loss": 0.1922, + "step": 39716 + }, + { + "epoch": 3.2175145819831497, + "grad_norm": 0.06427840888500214, + "learning_rate": 4.347180341149467e-05, + "loss": 0.2007, + "step": 39717 + }, + { + "epoch": 3.217595593000648, + "grad_norm": 0.05985168367624283, + "learning_rate": 4.346730275890004e-05, + "loss": 0.188, + "step": 39718 + }, + { + "epoch": 3.2176766040181466, + "grad_norm": 0.06718802452087402, + "learning_rate": 4.346280210630542e-05, + "loss": 0.2515, + "step": 39719 + }, + { + "epoch": 3.217757615035645, + "grad_norm": 0.09119051694869995, + "learning_rate": 4.345830145371079e-05, + "loss": 0.2676, + "step": 39720 + }, + { + "epoch": 3.217838626053143, + "grad_norm": 0.08295290172100067, + "learning_rate": 4.345380080111616e-05, + "loss": 0.3019, + "step": 39721 + }, + { + "epoch": 3.2179196370706418, + "grad_norm": 0.065678171813488, + "learning_rate": 4.344930014852154e-05, + "loss": 0.2297, + "step": 39722 + }, + { + "epoch": 3.21800064808814, + "grad_norm": 0.06975282728672028, + "learning_rate": 4.344479949592691e-05, + "loss": 0.2344, + "step": 39723 + }, + { + "epoch": 3.2180816591056383, + "grad_norm": 0.08624208718538284, + "learning_rate": 4.344029884333228e-05, + "loss": 0.2679, + "step": 39724 + }, + { + "epoch": 3.218162670123137, + "grad_norm": 0.05174512788653374, + "learning_rate": 4.343579819073766e-05, + "loss": 0.1905, + "step": 39725 + }, + { + "epoch": 3.218243681140635, + "grad_norm": 0.06753432750701904, + "learning_rate": 4.3431297538143034e-05, + "loss": 0.245, + "step": 39726 + }, + { + "epoch": 3.2183246921581334, + "grad_norm": 0.09749765694141388, + "learning_rate": 4.342679688554841e-05, + "loss": 0.2635, + "step": 39727 + }, + { + "epoch": 3.2184057031756317, + "grad_norm": 0.06999699026346207, + "learning_rate": 4.342229623295378e-05, + "loss": 0.2477, + "step": 39728 + }, + { + "epoch": 3.2184867141931304, + "grad_norm": 0.07547413557767868, + "learning_rate": 4.3417795580359155e-05, + "loss": 0.2268, + "step": 39729 + }, + { + "epoch": 3.2185677252106286, + 
"grad_norm": 0.06009672209620476, + "learning_rate": 4.341329492776453e-05, + "loss": 0.1921, + "step": 39730 + }, + { + "epoch": 3.218648736228127, + "grad_norm": 0.07534836232662201, + "learning_rate": 4.34087942751699e-05, + "loss": 0.2119, + "step": 39731 + }, + { + "epoch": 3.2187297472456255, + "grad_norm": 0.06061408668756485, + "learning_rate": 4.3404293622575276e-05, + "loss": 0.2167, + "step": 39732 + }, + { + "epoch": 3.218810758263124, + "grad_norm": 0.07624932378530502, + "learning_rate": 4.339979296998065e-05, + "loss": 0.2076, + "step": 39733 + }, + { + "epoch": 3.218891769280622, + "grad_norm": 0.05887662619352341, + "learning_rate": 4.339529231738602e-05, + "loss": 0.2435, + "step": 39734 + }, + { + "epoch": 3.2189727802981207, + "grad_norm": 0.07025136053562164, + "learning_rate": 4.3390791664791396e-05, + "loss": 0.2048, + "step": 39735 + }, + { + "epoch": 3.219053791315619, + "grad_norm": 0.06738400459289551, + "learning_rate": 4.338629101219677e-05, + "loss": 0.2293, + "step": 39736 + }, + { + "epoch": 3.219134802333117, + "grad_norm": 0.07455737888813019, + "learning_rate": 4.3381790359602144e-05, + "loss": 0.2085, + "step": 39737 + }, + { + "epoch": 3.2192158133506155, + "grad_norm": 0.07953792065382004, + "learning_rate": 4.337728970700752e-05, + "loss": 0.1988, + "step": 39738 + }, + { + "epoch": 3.219296824368114, + "grad_norm": 0.06335048377513885, + "learning_rate": 4.337278905441289e-05, + "loss": 0.1996, + "step": 39739 + }, + { + "epoch": 3.2193778353856124, + "grad_norm": 0.0741506814956665, + "learning_rate": 4.3368288401818264e-05, + "loss": 0.2117, + "step": 39740 + }, + { + "epoch": 3.2194588464031106, + "grad_norm": 0.062281083315610886, + "learning_rate": 4.336378774922364e-05, + "loss": 0.2084, + "step": 39741 + }, + { + "epoch": 3.2195398574206093, + "grad_norm": 0.05702844262123108, + "learning_rate": 4.335928709662901e-05, + "loss": 0.2453, + "step": 39742 + }, + { + "epoch": 3.2196208684381076, + "grad_norm": 0.08654730021953583, + "learning_rate": 4.335478644403439e-05, + "loss": 0.2111, + "step": 39743 + }, + { + "epoch": 3.219701879455606, + "grad_norm": 0.07214518636465073, + "learning_rate": 4.335028579143976e-05, + "loss": 0.2483, + "step": 39744 + }, + { + "epoch": 3.2197828904731045, + "grad_norm": 0.056577298790216446, + "learning_rate": 4.334578513884513e-05, + "loss": 0.2138, + "step": 39745 + }, + { + "epoch": 3.2198639014906028, + "grad_norm": 0.0786077156662941, + "learning_rate": 4.334128448625051e-05, + "loss": 0.2266, + "step": 39746 + }, + { + "epoch": 3.219944912508101, + "grad_norm": 0.07313614338636398, + "learning_rate": 4.333678383365588e-05, + "loss": 0.2651, + "step": 39747 + }, + { + "epoch": 3.2200259235255997, + "grad_norm": 0.07528704404830933, + "learning_rate": 4.333228318106125e-05, + "loss": 0.249, + "step": 39748 + }, + { + "epoch": 3.220106934543098, + "grad_norm": 0.0762871652841568, + "learning_rate": 4.3327782528466634e-05, + "loss": 0.2261, + "step": 39749 + }, + { + "epoch": 3.220187945560596, + "grad_norm": 0.07066619396209717, + "learning_rate": 4.3323281875872e-05, + "loss": 0.2282, + "step": 39750 + }, + { + "epoch": 3.2202689565780944, + "grad_norm": 0.0701572597026825, + "learning_rate": 4.3318781223277374e-05, + "loss": 0.2649, + "step": 39751 + }, + { + "epoch": 3.220349967595593, + "grad_norm": 0.07104917615652084, + "learning_rate": 4.3314280570682754e-05, + "loss": 0.2487, + "step": 39752 + }, + { + "epoch": 3.2204309786130914, + "grad_norm": 0.07371274381875992, + "learning_rate": 
4.330977991808812e-05, + "loss": 0.2152, + "step": 39753 + }, + { + "epoch": 3.2205119896305896, + "grad_norm": 0.057648058980703354, + "learning_rate": 4.3305279265493495e-05, + "loss": 0.2476, + "step": 39754 + }, + { + "epoch": 3.2205930006480883, + "grad_norm": 0.06675635278224945, + "learning_rate": 4.3300778612898875e-05, + "loss": 0.2278, + "step": 39755 + }, + { + "epoch": 3.2206740116655865, + "grad_norm": 0.061084549874067307, + "learning_rate": 4.329627796030424e-05, + "loss": 0.2185, + "step": 39756 + }, + { + "epoch": 3.220755022683085, + "grad_norm": 0.06798110902309418, + "learning_rate": 4.3291777307709616e-05, + "loss": 0.2031, + "step": 39757 + }, + { + "epoch": 3.2208360337005835, + "grad_norm": 0.06163044273853302, + "learning_rate": 4.3287276655114996e-05, + "loss": 0.2724, + "step": 39758 + }, + { + "epoch": 3.2209170447180817, + "grad_norm": 0.07092875987291336, + "learning_rate": 4.328277600252037e-05, + "loss": 0.2326, + "step": 39759 + }, + { + "epoch": 3.22099805573558, + "grad_norm": 0.07430491596460342, + "learning_rate": 4.327827534992574e-05, + "loss": 0.2233, + "step": 39760 + }, + { + "epoch": 3.221079066753078, + "grad_norm": 0.06169954314827919, + "learning_rate": 4.327377469733112e-05, + "loss": 0.2433, + "step": 39761 + }, + { + "epoch": 3.221160077770577, + "grad_norm": 0.056168779730796814, + "learning_rate": 4.326927404473649e-05, + "loss": 0.2293, + "step": 39762 + }, + { + "epoch": 3.221241088788075, + "grad_norm": 0.07749295979738235, + "learning_rate": 4.3264773392141864e-05, + "loss": 0.2221, + "step": 39763 + }, + { + "epoch": 3.2213220998055734, + "grad_norm": 0.07110145688056946, + "learning_rate": 4.326027273954724e-05, + "loss": 0.2397, + "step": 39764 + }, + { + "epoch": 3.221403110823072, + "grad_norm": 0.0723486915230751, + "learning_rate": 4.325577208695261e-05, + "loss": 0.2353, + "step": 39765 + }, + { + "epoch": 3.2214841218405703, + "grad_norm": 0.0709274485707283, + "learning_rate": 4.3251271434357985e-05, + "loss": 0.2512, + "step": 39766 + }, + { + "epoch": 3.2215651328580686, + "grad_norm": 0.06597712635993958, + "learning_rate": 4.324677078176336e-05, + "loss": 0.2468, + "step": 39767 + }, + { + "epoch": 3.2216461438755672, + "grad_norm": 0.07410014420747757, + "learning_rate": 4.324227012916873e-05, + "loss": 0.2577, + "step": 39768 + }, + { + "epoch": 3.2217271548930655, + "grad_norm": 0.06988342851400375, + "learning_rate": 4.3237769476574106e-05, + "loss": 0.2031, + "step": 39769 + }, + { + "epoch": 3.2218081659105637, + "grad_norm": 0.09154324978590012, + "learning_rate": 4.323326882397948e-05, + "loss": 0.2114, + "step": 39770 + }, + { + "epoch": 3.2218891769280624, + "grad_norm": 0.070953868329525, + "learning_rate": 4.322876817138485e-05, + "loss": 0.2349, + "step": 39771 + }, + { + "epoch": 3.2219701879455607, + "grad_norm": 0.07277769595384598, + "learning_rate": 4.3224267518790226e-05, + "loss": 0.2288, + "step": 39772 + }, + { + "epoch": 3.222051198963059, + "grad_norm": 0.0824602022767067, + "learning_rate": 4.32197668661956e-05, + "loss": 0.2354, + "step": 39773 + }, + { + "epoch": 3.222132209980557, + "grad_norm": 0.07566282898187637, + "learning_rate": 4.3215266213600974e-05, + "loss": 0.2334, + "step": 39774 + }, + { + "epoch": 3.222213220998056, + "grad_norm": 0.07005234807729721, + "learning_rate": 4.321076556100635e-05, + "loss": 0.253, + "step": 39775 + }, + { + "epoch": 3.222294232015554, + "grad_norm": 0.058707863092422485, + "learning_rate": 4.320626490841172e-05, + "loss": 0.2302, + "step": 39776 + }, + 
{ + "epoch": 3.2223752430330523, + "grad_norm": 0.0799265131354332, + "learning_rate": 4.3201764255817094e-05, + "loss": 0.2804, + "step": 39777 + }, + { + "epoch": 3.222456254050551, + "grad_norm": 0.08761447668075562, + "learning_rate": 4.319726360322247e-05, + "loss": 0.2502, + "step": 39778 + }, + { + "epoch": 3.2225372650680493, + "grad_norm": 0.07801079750061035, + "learning_rate": 4.319276295062784e-05, + "loss": 0.2351, + "step": 39779 + }, + { + "epoch": 3.2226182760855475, + "grad_norm": 0.06580349057912827, + "learning_rate": 4.3188262298033215e-05, + "loss": 0.1748, + "step": 39780 + }, + { + "epoch": 3.222699287103046, + "grad_norm": 0.07311670482158661, + "learning_rate": 4.318376164543859e-05, + "loss": 0.2315, + "step": 39781 + }, + { + "epoch": 3.2227802981205445, + "grad_norm": 0.08054853975772858, + "learning_rate": 4.317926099284396e-05, + "loss": 0.2481, + "step": 39782 + }, + { + "epoch": 3.2228613091380427, + "grad_norm": 0.08651946485042572, + "learning_rate": 4.3174760340249336e-05, + "loss": 0.2907, + "step": 39783 + }, + { + "epoch": 3.222942320155541, + "grad_norm": 0.058403633534908295, + "learning_rate": 4.317025968765471e-05, + "loss": 0.223, + "step": 39784 + }, + { + "epoch": 3.2230233311730396, + "grad_norm": 0.06779738515615463, + "learning_rate": 4.316575903506009e-05, + "loss": 0.2328, + "step": 39785 + }, + { + "epoch": 3.223104342190538, + "grad_norm": 0.06688597798347473, + "learning_rate": 4.316125838246546e-05, + "loss": 0.2056, + "step": 39786 + }, + { + "epoch": 3.223185353208036, + "grad_norm": 0.08868188410997391, + "learning_rate": 4.315675772987083e-05, + "loss": 0.229, + "step": 39787 + }, + { + "epoch": 3.223266364225535, + "grad_norm": 0.06498401612043381, + "learning_rate": 4.315225707727621e-05, + "loss": 0.1918, + "step": 39788 + }, + { + "epoch": 3.223347375243033, + "grad_norm": 0.07395123690366745, + "learning_rate": 4.314775642468158e-05, + "loss": 0.1914, + "step": 39789 + }, + { + "epoch": 3.2234283862605313, + "grad_norm": 0.0681285709142685, + "learning_rate": 4.314325577208695e-05, + "loss": 0.221, + "step": 39790 + }, + { + "epoch": 3.22350939727803, + "grad_norm": 0.09783481061458588, + "learning_rate": 4.313875511949233e-05, + "loss": 0.2124, + "step": 39791 + }, + { + "epoch": 3.2235904082955282, + "grad_norm": 0.07654182612895966, + "learning_rate": 4.31342544668977e-05, + "loss": 0.2031, + "step": 39792 + }, + { + "epoch": 3.2236714193130265, + "grad_norm": 0.06707725673913956, + "learning_rate": 4.312975381430307e-05, + "loss": 0.2458, + "step": 39793 + }, + { + "epoch": 3.223752430330525, + "grad_norm": 0.06937646120786667, + "learning_rate": 4.312525316170845e-05, + "loss": 0.2395, + "step": 39794 + }, + { + "epoch": 3.2238334413480234, + "grad_norm": 0.07003714144229889, + "learning_rate": 4.312075250911382e-05, + "loss": 0.2154, + "step": 39795 + }, + { + "epoch": 3.2239144523655217, + "grad_norm": 0.06754667311906815, + "learning_rate": 4.31162518565192e-05, + "loss": 0.2256, + "step": 39796 + }, + { + "epoch": 3.22399546338302, + "grad_norm": 0.0717126652598381, + "learning_rate": 4.311175120392457e-05, + "loss": 0.2727, + "step": 39797 + }, + { + "epoch": 3.2240764744005186, + "grad_norm": 0.06259047985076904, + "learning_rate": 4.310725055132995e-05, + "loss": 0.2122, + "step": 39798 + }, + { + "epoch": 3.224157485418017, + "grad_norm": 0.06361819058656693, + "learning_rate": 4.310274989873532e-05, + "loss": 0.2298, + "step": 39799 + }, + { + "epoch": 3.224238496435515, + "grad_norm": 0.07437151670455933, + 
"learning_rate": 4.3098249246140694e-05, + "loss": 0.2561, + "step": 39800 + }, + { + "epoch": 3.2243195074530138, + "grad_norm": 0.07215610891580582, + "learning_rate": 4.309374859354607e-05, + "loss": 0.2253, + "step": 39801 + }, + { + "epoch": 3.224400518470512, + "grad_norm": 0.08039887994527817, + "learning_rate": 4.308924794095144e-05, + "loss": 0.232, + "step": 39802 + }, + { + "epoch": 3.2244815294880103, + "grad_norm": 0.07376774400472641, + "learning_rate": 4.3084747288356815e-05, + "loss": 0.2657, + "step": 39803 + }, + { + "epoch": 3.224562540505509, + "grad_norm": 0.0832466259598732, + "learning_rate": 4.308024663576219e-05, + "loss": 0.2616, + "step": 39804 + }, + { + "epoch": 3.224643551523007, + "grad_norm": 0.08477449417114258, + "learning_rate": 4.307574598316756e-05, + "loss": 0.2024, + "step": 39805 + }, + { + "epoch": 3.2247245625405054, + "grad_norm": 0.059144388884305954, + "learning_rate": 4.3071245330572936e-05, + "loss": 0.2241, + "step": 39806 + }, + { + "epoch": 3.2248055735580037, + "grad_norm": 0.06593013554811478, + "learning_rate": 4.306674467797831e-05, + "loss": 0.1954, + "step": 39807 + }, + { + "epoch": 3.2248865845755024, + "grad_norm": 0.0681859627366066, + "learning_rate": 4.306224402538368e-05, + "loss": 0.2068, + "step": 39808 + }, + { + "epoch": 3.2249675955930006, + "grad_norm": 0.06685998290777206, + "learning_rate": 4.3057743372789057e-05, + "loss": 0.214, + "step": 39809 + }, + { + "epoch": 3.225048606610499, + "grad_norm": 0.0786844789981842, + "learning_rate": 4.305324272019443e-05, + "loss": 0.2236, + "step": 39810 + }, + { + "epoch": 3.2251296176279975, + "grad_norm": 0.06508845835924149, + "learning_rate": 4.3048742067599804e-05, + "loss": 0.2363, + "step": 39811 + }, + { + "epoch": 3.225210628645496, + "grad_norm": 0.06696710735559464, + "learning_rate": 4.304424141500518e-05, + "loss": 0.2477, + "step": 39812 + }, + { + "epoch": 3.225291639662994, + "grad_norm": 0.08366618305444717, + "learning_rate": 4.303974076241055e-05, + "loss": 0.2719, + "step": 39813 + }, + { + "epoch": 3.2253726506804927, + "grad_norm": 0.06795560568571091, + "learning_rate": 4.3035240109815925e-05, + "loss": 0.2218, + "step": 39814 + }, + { + "epoch": 3.225453661697991, + "grad_norm": 0.07089146971702576, + "learning_rate": 4.30307394572213e-05, + "loss": 0.2224, + "step": 39815 + }, + { + "epoch": 3.225534672715489, + "grad_norm": 0.05195857584476471, + "learning_rate": 4.302623880462667e-05, + "loss": 0.223, + "step": 39816 + }, + { + "epoch": 3.225615683732988, + "grad_norm": 0.064137764275074, + "learning_rate": 4.3021738152032045e-05, + "loss": 0.2421, + "step": 39817 + }, + { + "epoch": 3.225696694750486, + "grad_norm": 0.07576876133680344, + "learning_rate": 4.301723749943742e-05, + "loss": 0.2405, + "step": 39818 + }, + { + "epoch": 3.2257777057679844, + "grad_norm": 0.07548273354768753, + "learning_rate": 4.301273684684279e-05, + "loss": 0.2019, + "step": 39819 + }, + { + "epoch": 3.2258587167854826, + "grad_norm": 0.08538465946912766, + "learning_rate": 4.3008236194248166e-05, + "loss": 0.2192, + "step": 39820 + }, + { + "epoch": 3.2259397278029813, + "grad_norm": 0.06191878020763397, + "learning_rate": 4.300373554165354e-05, + "loss": 0.214, + "step": 39821 + }, + { + "epoch": 3.2260207388204796, + "grad_norm": 0.07498586922883987, + "learning_rate": 4.2999234889058913e-05, + "loss": 0.2399, + "step": 39822 + }, + { + "epoch": 3.226101749837978, + "grad_norm": 0.07455036044120789, + "learning_rate": 4.299473423646429e-05, + "loss": 0.259, + "step": 
39823 + }, + { + "epoch": 3.2261827608554765, + "grad_norm": 0.06393108516931534, + "learning_rate": 4.299023358386967e-05, + "loss": 0.2197, + "step": 39824 + }, + { + "epoch": 3.2262637718729748, + "grad_norm": 0.08502611517906189, + "learning_rate": 4.2985732931275034e-05, + "loss": 0.2281, + "step": 39825 + }, + { + "epoch": 3.226344782890473, + "grad_norm": 0.062071532011032104, + "learning_rate": 4.298123227868041e-05, + "loss": 0.1857, + "step": 39826 + }, + { + "epoch": 3.2264257939079717, + "grad_norm": 0.06794518232345581, + "learning_rate": 4.297673162608579e-05, + "loss": 0.2, + "step": 39827 + }, + { + "epoch": 3.22650680492547, + "grad_norm": 0.07407260686159134, + "learning_rate": 4.2972230973491155e-05, + "loss": 0.23, + "step": 39828 + }, + { + "epoch": 3.226587815942968, + "grad_norm": 0.071202851831913, + "learning_rate": 4.2967730320896535e-05, + "loss": 0.2226, + "step": 39829 + }, + { + "epoch": 3.2266688269604664, + "grad_norm": 0.06576815247535706, + "learning_rate": 4.296322966830191e-05, + "loss": 0.2042, + "step": 39830 + }, + { + "epoch": 3.226749837977965, + "grad_norm": 0.07664418965578079, + "learning_rate": 4.2958729015707276e-05, + "loss": 0.2397, + "step": 39831 + }, + { + "epoch": 3.2268308489954634, + "grad_norm": 0.06686937063932419, + "learning_rate": 4.2954228363112656e-05, + "loss": 0.2074, + "step": 39832 + }, + { + "epoch": 3.2269118600129616, + "grad_norm": 0.07900944352149963, + "learning_rate": 4.294972771051803e-05, + "loss": 0.2446, + "step": 39833 + }, + { + "epoch": 3.2269928710304603, + "grad_norm": 0.07053736597299576, + "learning_rate": 4.29452270579234e-05, + "loss": 0.2417, + "step": 39834 + }, + { + "epoch": 3.2270738820479585, + "grad_norm": 0.06694454699754715, + "learning_rate": 4.294072640532878e-05, + "loss": 0.2184, + "step": 39835 + }, + { + "epoch": 3.2271548930654568, + "grad_norm": 0.066739022731781, + "learning_rate": 4.293622575273415e-05, + "loss": 0.2024, + "step": 39836 + }, + { + "epoch": 3.2272359040829555, + "grad_norm": 0.07889380306005478, + "learning_rate": 4.2931725100139524e-05, + "loss": 0.2762, + "step": 39837 + }, + { + "epoch": 3.2273169151004537, + "grad_norm": 0.07989609241485596, + "learning_rate": 4.29272244475449e-05, + "loss": 0.2569, + "step": 39838 + }, + { + "epoch": 3.227397926117952, + "grad_norm": 0.08929932117462158, + "learning_rate": 4.292272379495027e-05, + "loss": 0.2682, + "step": 39839 + }, + { + "epoch": 3.2274789371354506, + "grad_norm": 0.08622092008590698, + "learning_rate": 4.2918223142355645e-05, + "loss": 0.235, + "step": 39840 + }, + { + "epoch": 3.227559948152949, + "grad_norm": 0.05606333166360855, + "learning_rate": 4.291372248976102e-05, + "loss": 0.2131, + "step": 39841 + }, + { + "epoch": 3.227640959170447, + "grad_norm": 0.06832990795373917, + "learning_rate": 4.290922183716639e-05, + "loss": 0.2177, + "step": 39842 + }, + { + "epoch": 3.2277219701879454, + "grad_norm": 0.06548843532800674, + "learning_rate": 4.2904721184571766e-05, + "loss": 0.214, + "step": 39843 + }, + { + "epoch": 3.227802981205444, + "grad_norm": 0.06597808003425598, + "learning_rate": 4.290022053197714e-05, + "loss": 0.1829, + "step": 39844 + }, + { + "epoch": 3.2278839922229423, + "grad_norm": 0.06908122450113297, + "learning_rate": 4.289571987938251e-05, + "loss": 0.2373, + "step": 39845 + }, + { + "epoch": 3.2279650032404406, + "grad_norm": 0.06956381350755692, + "learning_rate": 4.289121922678789e-05, + "loss": 0.2024, + "step": 39846 + }, + { + "epoch": 3.2280460142579392, + "grad_norm": 
0.08439984917640686, + "learning_rate": 4.288671857419326e-05, + "loss": 0.2142, + "step": 39847 + }, + { + "epoch": 3.2281270252754375, + "grad_norm": 0.07745002955198288, + "learning_rate": 4.2882217921598634e-05, + "loss": 0.2228, + "step": 39848 + }, + { + "epoch": 3.2282080362929357, + "grad_norm": 0.07908467203378677, + "learning_rate": 4.287771726900401e-05, + "loss": 0.2207, + "step": 39849 + }, + { + "epoch": 3.2282890473104344, + "grad_norm": 0.05995180085301399, + "learning_rate": 4.287321661640938e-05, + "loss": 0.2602, + "step": 39850 + }, + { + "epoch": 3.2283700583279327, + "grad_norm": 0.07221881300210953, + "learning_rate": 4.2868715963814755e-05, + "loss": 0.1869, + "step": 39851 + }, + { + "epoch": 3.228451069345431, + "grad_norm": 0.07083651423454285, + "learning_rate": 4.286421531122013e-05, + "loss": 0.2034, + "step": 39852 + }, + { + "epoch": 3.228532080362929, + "grad_norm": 0.08436165750026703, + "learning_rate": 4.28597146586255e-05, + "loss": 0.2834, + "step": 39853 + }, + { + "epoch": 3.228613091380428, + "grad_norm": 0.06885623931884766, + "learning_rate": 4.2855214006030875e-05, + "loss": 0.2169, + "step": 39854 + }, + { + "epoch": 3.228694102397926, + "grad_norm": 0.0686570331454277, + "learning_rate": 4.285071335343625e-05, + "loss": 0.2313, + "step": 39855 + }, + { + "epoch": 3.2287751134154243, + "grad_norm": 0.06884542107582092, + "learning_rate": 4.284621270084162e-05, + "loss": 0.2196, + "step": 39856 + }, + { + "epoch": 3.228856124432923, + "grad_norm": 0.06082329526543617, + "learning_rate": 4.2841712048246996e-05, + "loss": 0.2205, + "step": 39857 + }, + { + "epoch": 3.2289371354504213, + "grad_norm": 0.06664751470088959, + "learning_rate": 4.283721139565237e-05, + "loss": 0.2055, + "step": 39858 + }, + { + "epoch": 3.2290181464679195, + "grad_norm": 0.0696212649345398, + "learning_rate": 4.2832710743057744e-05, + "loss": 0.2204, + "step": 39859 + }, + { + "epoch": 3.229099157485418, + "grad_norm": 0.07365533709526062, + "learning_rate": 4.282821009046312e-05, + "loss": 0.2293, + "step": 39860 + }, + { + "epoch": 3.2291801685029164, + "grad_norm": 0.054057564586400986, + "learning_rate": 4.282370943786849e-05, + "loss": 0.2307, + "step": 39861 + }, + { + "epoch": 3.2292611795204147, + "grad_norm": 0.06658761948347092, + "learning_rate": 4.2819208785273864e-05, + "loss": 0.2066, + "step": 39862 + }, + { + "epoch": 3.2293421905379134, + "grad_norm": 0.0653328150510788, + "learning_rate": 4.2814708132679245e-05, + "loss": 0.2275, + "step": 39863 + }, + { + "epoch": 3.2294232015554116, + "grad_norm": 0.06299271434545517, + "learning_rate": 4.281020748008461e-05, + "loss": 0.206, + "step": 39864 + }, + { + "epoch": 3.22950421257291, + "grad_norm": 0.06639181077480316, + "learning_rate": 4.280570682748999e-05, + "loss": 0.2114, + "step": 39865 + }, + { + "epoch": 3.229585223590408, + "grad_norm": 0.06050573289394379, + "learning_rate": 4.2801206174895366e-05, + "loss": 0.2398, + "step": 39866 + }, + { + "epoch": 3.229666234607907, + "grad_norm": 0.07270995527505875, + "learning_rate": 4.279670552230073e-05, + "loss": 0.202, + "step": 39867 + }, + { + "epoch": 3.229747245625405, + "grad_norm": 0.07150819897651672, + "learning_rate": 4.279220486970611e-05, + "loss": 0.2307, + "step": 39868 + }, + { + "epoch": 3.2298282566429033, + "grad_norm": 0.08548440039157867, + "learning_rate": 4.2787704217111486e-05, + "loss": 0.2647, + "step": 39869 + }, + { + "epoch": 3.229909267660402, + "grad_norm": 0.07096060365438461, + "learning_rate": 4.278320356451685e-05, + 
"loss": 0.2003, + "step": 39870 + }, + { + "epoch": 3.2299902786779002, + "grad_norm": 0.07516411691904068, + "learning_rate": 4.2778702911922234e-05, + "loss": 0.278, + "step": 39871 + }, + { + "epoch": 3.2300712896953985, + "grad_norm": 0.07198493927717209, + "learning_rate": 4.277420225932761e-05, + "loss": 0.2163, + "step": 39872 + }, + { + "epoch": 3.230152300712897, + "grad_norm": 0.08852767944335938, + "learning_rate": 4.2769701606732974e-05, + "loss": 0.2305, + "step": 39873 + }, + { + "epoch": 3.2302333117303954, + "grad_norm": 0.08636888116598129, + "learning_rate": 4.2765200954138354e-05, + "loss": 0.2195, + "step": 39874 + }, + { + "epoch": 3.2303143227478937, + "grad_norm": 0.0890205055475235, + "learning_rate": 4.276070030154373e-05, + "loss": 0.2642, + "step": 39875 + }, + { + "epoch": 3.230395333765392, + "grad_norm": 0.06537780910730362, + "learning_rate": 4.27561996489491e-05, + "loss": 0.2491, + "step": 39876 + }, + { + "epoch": 3.2304763447828906, + "grad_norm": 0.0705234557390213, + "learning_rate": 4.2751698996354475e-05, + "loss": 0.2102, + "step": 39877 + }, + { + "epoch": 3.230557355800389, + "grad_norm": 0.06498276442289352, + "learning_rate": 4.274719834375985e-05, + "loss": 0.2042, + "step": 39878 + }, + { + "epoch": 3.230638366817887, + "grad_norm": 0.092403344810009, + "learning_rate": 4.274269769116522e-05, + "loss": 0.2329, + "step": 39879 + }, + { + "epoch": 3.2307193778353858, + "grad_norm": 0.065389484167099, + "learning_rate": 4.2738197038570596e-05, + "loss": 0.2172, + "step": 39880 + }, + { + "epoch": 3.230800388852884, + "grad_norm": 0.05993317812681198, + "learning_rate": 4.273369638597597e-05, + "loss": 0.2092, + "step": 39881 + }, + { + "epoch": 3.2308813998703823, + "grad_norm": 0.08355020731687546, + "learning_rate": 4.272919573338134e-05, + "loss": 0.2253, + "step": 39882 + }, + { + "epoch": 3.230962410887881, + "grad_norm": 0.0723675861954689, + "learning_rate": 4.272469508078672e-05, + "loss": 0.2647, + "step": 39883 + }, + { + "epoch": 3.231043421905379, + "grad_norm": 0.08424925804138184, + "learning_rate": 4.272019442819209e-05, + "loss": 0.2144, + "step": 39884 + }, + { + "epoch": 3.2311244329228774, + "grad_norm": 0.07128293812274933, + "learning_rate": 4.2715693775597464e-05, + "loss": 0.2784, + "step": 39885 + }, + { + "epoch": 3.231205443940376, + "grad_norm": 0.06768849492073059, + "learning_rate": 4.271119312300284e-05, + "loss": 0.2182, + "step": 39886 + }, + { + "epoch": 3.2312864549578744, + "grad_norm": 0.0783187672495842, + "learning_rate": 4.270669247040821e-05, + "loss": 0.2526, + "step": 39887 + }, + { + "epoch": 3.2313674659753726, + "grad_norm": 0.07558058947324753, + "learning_rate": 4.2702191817813585e-05, + "loss": 0.2154, + "step": 39888 + }, + { + "epoch": 3.231448476992871, + "grad_norm": 0.06205056607723236, + "learning_rate": 4.269769116521896e-05, + "loss": 0.2068, + "step": 39889 + }, + { + "epoch": 3.2315294880103695, + "grad_norm": 0.07086601853370667, + "learning_rate": 4.269319051262433e-05, + "loss": 0.246, + "step": 39890 + }, + { + "epoch": 3.231610499027868, + "grad_norm": 0.08032597601413727, + "learning_rate": 4.2688689860029706e-05, + "loss": 0.2278, + "step": 39891 + }, + { + "epoch": 3.231691510045366, + "grad_norm": 0.09461595863103867, + "learning_rate": 4.268418920743508e-05, + "loss": 0.2683, + "step": 39892 + }, + { + "epoch": 3.2317725210628647, + "grad_norm": 0.0689258947968483, + "learning_rate": 4.267968855484045e-05, + "loss": 0.2802, + "step": 39893 + }, + { + "epoch": 3.231853532080363, + 
"grad_norm": 0.0880710557103157, + "learning_rate": 4.2675187902245826e-05, + "loss": 0.2023, + "step": 39894 + }, + { + "epoch": 3.231934543097861, + "grad_norm": 0.07965899258852005, + "learning_rate": 4.26706872496512e-05, + "loss": 0.2542, + "step": 39895 + }, + { + "epoch": 3.2320155541153595, + "grad_norm": 0.07699202001094818, + "learning_rate": 4.2666186597056574e-05, + "loss": 0.2568, + "step": 39896 + }, + { + "epoch": 3.232096565132858, + "grad_norm": 0.0677374079823494, + "learning_rate": 4.266168594446195e-05, + "loss": 0.2427, + "step": 39897 + }, + { + "epoch": 3.2321775761503564, + "grad_norm": 0.07392044365406036, + "learning_rate": 4.265718529186733e-05, + "loss": 0.2171, + "step": 39898 + }, + { + "epoch": 3.2322585871678546, + "grad_norm": 0.07073640823364258, + "learning_rate": 4.2652684639272694e-05, + "loss": 0.2284, + "step": 39899 + }, + { + "epoch": 3.2323395981853533, + "grad_norm": 0.0748598575592041, + "learning_rate": 4.264818398667807e-05, + "loss": 0.1943, + "step": 39900 + }, + { + "epoch": 3.2324206092028516, + "grad_norm": 0.0716557651758194, + "learning_rate": 4.264368333408345e-05, + "loss": 0.2442, + "step": 39901 + }, + { + "epoch": 3.23250162022035, + "grad_norm": 0.07370223104953766, + "learning_rate": 4.263918268148882e-05, + "loss": 0.2576, + "step": 39902 + }, + { + "epoch": 3.2325826312378485, + "grad_norm": 0.06414464116096497, + "learning_rate": 4.263468202889419e-05, + "loss": 0.1914, + "step": 39903 + }, + { + "epoch": 3.2326636422553467, + "grad_norm": 0.0774352103471756, + "learning_rate": 4.263018137629957e-05, + "loss": 0.2338, + "step": 39904 + }, + { + "epoch": 3.232744653272845, + "grad_norm": 0.07154515385627747, + "learning_rate": 4.262568072370494e-05, + "loss": 0.2327, + "step": 39905 + }, + { + "epoch": 3.2328256642903437, + "grad_norm": 0.08657229691743851, + "learning_rate": 4.262118007111031e-05, + "loss": 0.2143, + "step": 39906 + }, + { + "epoch": 3.232906675307842, + "grad_norm": 0.07232128083705902, + "learning_rate": 4.261667941851569e-05, + "loss": 0.217, + "step": 39907 + }, + { + "epoch": 3.23298768632534, + "grad_norm": 0.07182679325342178, + "learning_rate": 4.2612178765921064e-05, + "loss": 0.249, + "step": 39908 + }, + { + "epoch": 3.233068697342839, + "grad_norm": 0.08501800894737244, + "learning_rate": 4.260767811332643e-05, + "loss": 0.2611, + "step": 39909 + }, + { + "epoch": 3.233149708360337, + "grad_norm": 0.0579972043633461, + "learning_rate": 4.260317746073181e-05, + "loss": 0.2059, + "step": 39910 + }, + { + "epoch": 3.2332307193778353, + "grad_norm": 0.06739821285009384, + "learning_rate": 4.2598676808137184e-05, + "loss": 0.2287, + "step": 39911 + }, + { + "epoch": 3.2333117303953336, + "grad_norm": 0.06128734350204468, + "learning_rate": 4.259417615554255e-05, + "loss": 0.2246, + "step": 39912 + }, + { + "epoch": 3.2333927414128323, + "grad_norm": 0.07265954464673996, + "learning_rate": 4.258967550294793e-05, + "loss": 0.2316, + "step": 39913 + }, + { + "epoch": 3.2334737524303305, + "grad_norm": 0.06592036038637161, + "learning_rate": 4.2585174850353305e-05, + "loss": 0.2136, + "step": 39914 + }, + { + "epoch": 3.2335547634478288, + "grad_norm": 0.06112872436642647, + "learning_rate": 4.258067419775867e-05, + "loss": 0.2005, + "step": 39915 + }, + { + "epoch": 3.2336357744653275, + "grad_norm": 0.07858998328447342, + "learning_rate": 4.257617354516405e-05, + "loss": 0.259, + "step": 39916 + }, + { + "epoch": 3.2337167854828257, + "grad_norm": 0.057796552777290344, + "learning_rate": 
4.2571672892569426e-05, + "loss": 0.205, + "step": 39917 + }, + { + "epoch": 3.233797796500324, + "grad_norm": 0.08636032789945602, + "learning_rate": 4.25671722399748e-05, + "loss": 0.2348, + "step": 39918 + }, + { + "epoch": 3.233878807517822, + "grad_norm": 0.06539962440729141, + "learning_rate": 4.256267158738017e-05, + "loss": 0.2357, + "step": 39919 + }, + { + "epoch": 3.233959818535321, + "grad_norm": 0.07662045955657959, + "learning_rate": 4.255817093478555e-05, + "loss": 0.2161, + "step": 39920 + }, + { + "epoch": 3.234040829552819, + "grad_norm": 0.06934379786252975, + "learning_rate": 4.255367028219092e-05, + "loss": 0.2061, + "step": 39921 + }, + { + "epoch": 3.2341218405703174, + "grad_norm": 0.07253503054380417, + "learning_rate": 4.2549169629596294e-05, + "loss": 0.2175, + "step": 39922 + }, + { + "epoch": 3.234202851587816, + "grad_norm": 0.09941152483224869, + "learning_rate": 4.254466897700167e-05, + "loss": 0.2165, + "step": 39923 + }, + { + "epoch": 3.2342838626053143, + "grad_norm": 0.05564277991652489, + "learning_rate": 4.254016832440704e-05, + "loss": 0.203, + "step": 39924 + }, + { + "epoch": 3.2343648736228126, + "grad_norm": 0.07295148819684982, + "learning_rate": 4.2535667671812415e-05, + "loss": 0.2667, + "step": 39925 + }, + { + "epoch": 3.2344458846403112, + "grad_norm": 0.06901323795318604, + "learning_rate": 4.253116701921779e-05, + "loss": 0.2464, + "step": 39926 + }, + { + "epoch": 3.2345268956578095, + "grad_norm": 0.0735318511724472, + "learning_rate": 4.252666636662316e-05, + "loss": 0.2284, + "step": 39927 + }, + { + "epoch": 3.2346079066753077, + "grad_norm": 0.07409574091434479, + "learning_rate": 4.2522165714028536e-05, + "loss": 0.265, + "step": 39928 + }, + { + "epoch": 3.2346889176928064, + "grad_norm": 0.07013576477766037, + "learning_rate": 4.251766506143391e-05, + "loss": 0.2363, + "step": 39929 + }, + { + "epoch": 3.2347699287103047, + "grad_norm": 0.062335990369319916, + "learning_rate": 4.251316440883928e-05, + "loss": 0.1896, + "step": 39930 + }, + { + "epoch": 3.234850939727803, + "grad_norm": 0.0800856202840805, + "learning_rate": 4.250866375624466e-05, + "loss": 0.2091, + "step": 39931 + }, + { + "epoch": 3.234931950745301, + "grad_norm": 0.07388359308242798, + "learning_rate": 4.250416310365003e-05, + "loss": 0.2464, + "step": 39932 + }, + { + "epoch": 3.2350129617628, + "grad_norm": 0.07145456224679947, + "learning_rate": 4.2499662451055404e-05, + "loss": 0.2246, + "step": 39933 + }, + { + "epoch": 3.235093972780298, + "grad_norm": 0.07021401077508926, + "learning_rate": 4.2495161798460784e-05, + "loss": 0.2272, + "step": 39934 + }, + { + "epoch": 3.2351749837977963, + "grad_norm": 0.08557034283876419, + "learning_rate": 4.249066114586615e-05, + "loss": 0.2402, + "step": 39935 + }, + { + "epoch": 3.235255994815295, + "grad_norm": 0.06644951552152634, + "learning_rate": 4.2486160493271525e-05, + "loss": 0.2338, + "step": 39936 + }, + { + "epoch": 3.2353370058327933, + "grad_norm": 0.07197255641222, + "learning_rate": 4.2481659840676905e-05, + "loss": 0.2147, + "step": 39937 + }, + { + "epoch": 3.2354180168502915, + "grad_norm": 0.06469669938087463, + "learning_rate": 4.247715918808227e-05, + "loss": 0.22, + "step": 39938 + }, + { + "epoch": 3.23549902786779, + "grad_norm": 0.0772833302617073, + "learning_rate": 4.2472658535487645e-05, + "loss": 0.2134, + "step": 39939 + }, + { + "epoch": 3.2355800388852884, + "grad_norm": 0.0739804282784462, + "learning_rate": 4.2468157882893026e-05, + "loss": 0.2352, + "step": 39940 + }, + { + 
"epoch": 3.2356610499027867, + "grad_norm": 0.08174054324626923, + "learning_rate": 4.246365723029839e-05, + "loss": 0.2341, + "step": 39941 + }, + { + "epoch": 3.235742060920285, + "grad_norm": 0.0759250670671463, + "learning_rate": 4.2459156577703766e-05, + "loss": 0.2449, + "step": 39942 + }, + { + "epoch": 3.2358230719377836, + "grad_norm": 0.06863147765398026, + "learning_rate": 4.2454655925109147e-05, + "loss": 0.2184, + "step": 39943 + }, + { + "epoch": 3.235904082955282, + "grad_norm": 0.06283069401979446, + "learning_rate": 4.245015527251452e-05, + "loss": 0.2093, + "step": 39944 + }, + { + "epoch": 3.23598509397278, + "grad_norm": 0.07585699111223221, + "learning_rate": 4.244565461991989e-05, + "loss": 0.2897, + "step": 39945 + }, + { + "epoch": 3.236066104990279, + "grad_norm": 0.06817737966775894, + "learning_rate": 4.244115396732527e-05, + "loss": 0.1989, + "step": 39946 + }, + { + "epoch": 3.236147116007777, + "grad_norm": 0.07741937786340714, + "learning_rate": 4.243665331473064e-05, + "loss": 0.2499, + "step": 39947 + }, + { + "epoch": 3.2362281270252753, + "grad_norm": 0.06947393715381622, + "learning_rate": 4.243215266213601e-05, + "loss": 0.1853, + "step": 39948 + }, + { + "epoch": 3.236309138042774, + "grad_norm": 0.08097727596759796, + "learning_rate": 4.242765200954139e-05, + "loss": 0.247, + "step": 39949 + }, + { + "epoch": 3.2363901490602722, + "grad_norm": 0.07119971513748169, + "learning_rate": 4.242315135694676e-05, + "loss": 0.2289, + "step": 39950 + }, + { + "epoch": 3.2364711600777705, + "grad_norm": 0.07697370648384094, + "learning_rate": 4.241865070435213e-05, + "loss": 0.258, + "step": 39951 + }, + { + "epoch": 3.236552171095269, + "grad_norm": 0.06008782610297203, + "learning_rate": 4.241415005175751e-05, + "loss": 0.1882, + "step": 39952 + }, + { + "epoch": 3.2366331821127674, + "grad_norm": 0.08692114800214767, + "learning_rate": 4.240964939916288e-05, + "loss": 0.2321, + "step": 39953 + }, + { + "epoch": 3.2367141931302656, + "grad_norm": 0.08523430675268173, + "learning_rate": 4.240514874656825e-05, + "loss": 0.2368, + "step": 39954 + }, + { + "epoch": 3.236795204147764, + "grad_norm": 0.08042368292808533, + "learning_rate": 4.240064809397363e-05, + "loss": 0.23, + "step": 39955 + }, + { + "epoch": 3.2368762151652626, + "grad_norm": 0.07272639870643616, + "learning_rate": 4.2396147441379e-05, + "loss": 0.2252, + "step": 39956 + }, + { + "epoch": 3.236957226182761, + "grad_norm": 0.06889273226261139, + "learning_rate": 4.239164678878438e-05, + "loss": 0.2267, + "step": 39957 + }, + { + "epoch": 3.237038237200259, + "grad_norm": 0.06718260794878006, + "learning_rate": 4.238714613618975e-05, + "loss": 0.2329, + "step": 39958 + }, + { + "epoch": 3.2371192482177578, + "grad_norm": 0.08459927141666412, + "learning_rate": 4.2382645483595124e-05, + "loss": 0.2507, + "step": 39959 + }, + { + "epoch": 3.237200259235256, + "grad_norm": 0.07423102855682373, + "learning_rate": 4.23781448310005e-05, + "loss": 0.2415, + "step": 39960 + }, + { + "epoch": 3.2372812702527543, + "grad_norm": 0.06294968724250793, + "learning_rate": 4.237364417840587e-05, + "loss": 0.2341, + "step": 39961 + }, + { + "epoch": 3.237362281270253, + "grad_norm": 0.07447552680969238, + "learning_rate": 4.2369143525811245e-05, + "loss": 0.1979, + "step": 39962 + }, + { + "epoch": 3.237443292287751, + "grad_norm": 0.0652199238538742, + "learning_rate": 4.236464287321662e-05, + "loss": 0.2334, + "step": 39963 + }, + { + "epoch": 3.2375243033052494, + "grad_norm": 0.07575726509094238, + 
"learning_rate": 4.236014222062199e-05, + "loss": 0.2712, + "step": 39964 + }, + { + "epoch": 3.2376053143227477, + "grad_norm": 0.060297705233097076, + "learning_rate": 4.2355641568027366e-05, + "loss": 0.1856, + "step": 39965 + }, + { + "epoch": 3.2376863253402464, + "grad_norm": 0.07966022193431854, + "learning_rate": 4.235114091543274e-05, + "loss": 0.2953, + "step": 39966 + }, + { + "epoch": 3.2377673363577446, + "grad_norm": 0.07863543182611465, + "learning_rate": 4.234664026283811e-05, + "loss": 0.1878, + "step": 39967 + }, + { + "epoch": 3.237848347375243, + "grad_norm": 0.06594900041818619, + "learning_rate": 4.2342139610243487e-05, + "loss": 0.1895, + "step": 39968 + }, + { + "epoch": 3.2379293583927415, + "grad_norm": 0.08944254368543625, + "learning_rate": 4.233763895764886e-05, + "loss": 0.2714, + "step": 39969 + }, + { + "epoch": 3.23801036941024, + "grad_norm": 0.07178863883018494, + "learning_rate": 4.233313830505424e-05, + "loss": 0.2505, + "step": 39970 + }, + { + "epoch": 3.238091380427738, + "grad_norm": 0.0778353214263916, + "learning_rate": 4.232863765245961e-05, + "loss": 0.2397, + "step": 39971 + }, + { + "epoch": 3.2381723914452367, + "grad_norm": 0.06244092062115669, + "learning_rate": 4.232413699986498e-05, + "loss": 0.2348, + "step": 39972 + }, + { + "epoch": 3.238253402462735, + "grad_norm": 0.07463119179010391, + "learning_rate": 4.231963634727036e-05, + "loss": 0.2222, + "step": 39973 + }, + { + "epoch": 3.238334413480233, + "grad_norm": 0.06593679636716843, + "learning_rate": 4.231513569467573e-05, + "loss": 0.2592, + "step": 39974 + }, + { + "epoch": 3.238415424497732, + "grad_norm": 0.07185850292444229, + "learning_rate": 4.23106350420811e-05, + "loss": 0.2656, + "step": 39975 + }, + { + "epoch": 3.23849643551523, + "grad_norm": 0.060626208782196045, + "learning_rate": 4.230613438948648e-05, + "loss": 0.1949, + "step": 39976 + }, + { + "epoch": 3.2385774465327284, + "grad_norm": 0.07208986580371857, + "learning_rate": 4.230163373689185e-05, + "loss": 0.1869, + "step": 39977 + }, + { + "epoch": 3.2386584575502266, + "grad_norm": 0.07708663493394852, + "learning_rate": 4.229713308429722e-05, + "loss": 0.2435, + "step": 39978 + }, + { + "epoch": 3.2387394685677253, + "grad_norm": 0.08233334124088287, + "learning_rate": 4.22926324317026e-05, + "loss": 0.2294, + "step": 39979 + }, + { + "epoch": 3.2388204795852236, + "grad_norm": 0.07808908075094223, + "learning_rate": 4.228813177910797e-05, + "loss": 0.2123, + "step": 39980 + }, + { + "epoch": 3.238901490602722, + "grad_norm": 0.07508261501789093, + "learning_rate": 4.2283631126513343e-05, + "loss": 0.2504, + "step": 39981 + }, + { + "epoch": 3.2389825016202205, + "grad_norm": 0.06409870833158493, + "learning_rate": 4.2279130473918724e-05, + "loss": 0.2001, + "step": 39982 + }, + { + "epoch": 3.2390635126377187, + "grad_norm": 0.0930962786078453, + "learning_rate": 4.22746298213241e-05, + "loss": 0.2515, + "step": 39983 + }, + { + "epoch": 3.239144523655217, + "grad_norm": 0.06400201469659805, + "learning_rate": 4.2270129168729464e-05, + "loss": 0.2159, + "step": 39984 + }, + { + "epoch": 3.2392255346727157, + "grad_norm": 0.06780712306499481, + "learning_rate": 4.2265628516134845e-05, + "loss": 0.2247, + "step": 39985 + }, + { + "epoch": 3.239306545690214, + "grad_norm": 0.07583682984113693, + "learning_rate": 4.226112786354022e-05, + "loss": 0.2277, + "step": 39986 + }, + { + "epoch": 3.239387556707712, + "grad_norm": 0.06544255465269089, + "learning_rate": 4.2256627210945585e-05, + "loss": 0.236, + "step": 
39987 + }, + { + "epoch": 3.2394685677252104, + "grad_norm": 0.05646807700395584, + "learning_rate": 4.2252126558350965e-05, + "loss": 0.2204, + "step": 39988 + }, + { + "epoch": 3.239549578742709, + "grad_norm": 0.08052621781826019, + "learning_rate": 4.224762590575634e-05, + "loss": 0.2119, + "step": 39989 + }, + { + "epoch": 3.2396305897602073, + "grad_norm": 0.07334674149751663, + "learning_rate": 4.2243125253161706e-05, + "loss": 0.2209, + "step": 39990 + }, + { + "epoch": 3.2397116007777056, + "grad_norm": 0.0667957216501236, + "learning_rate": 4.2238624600567086e-05, + "loss": 0.2564, + "step": 39991 + }, + { + "epoch": 3.2397926117952043, + "grad_norm": 0.08592815697193146, + "learning_rate": 4.223412394797246e-05, + "loss": 0.2257, + "step": 39992 + }, + { + "epoch": 3.2398736228127025, + "grad_norm": 0.06996352225542068, + "learning_rate": 4.222962329537783e-05, + "loss": 0.2608, + "step": 39993 + }, + { + "epoch": 3.2399546338302008, + "grad_norm": 0.07029546052217484, + "learning_rate": 4.222512264278321e-05, + "loss": 0.2647, + "step": 39994 + }, + { + "epoch": 3.2400356448476995, + "grad_norm": 0.07762210071086884, + "learning_rate": 4.222062199018858e-05, + "loss": 0.2515, + "step": 39995 + }, + { + "epoch": 3.2401166558651977, + "grad_norm": 0.06713078916072845, + "learning_rate": 4.2216121337593954e-05, + "loss": 0.2236, + "step": 39996 + }, + { + "epoch": 3.240197666882696, + "grad_norm": 0.058481615036726, + "learning_rate": 4.221162068499933e-05, + "loss": 0.2207, + "step": 39997 + }, + { + "epoch": 3.2402786779001946, + "grad_norm": 0.06714651733636856, + "learning_rate": 4.22071200324047e-05, + "loss": 0.2161, + "step": 39998 + }, + { + "epoch": 3.240359688917693, + "grad_norm": 0.07449248433113098, + "learning_rate": 4.2202619379810075e-05, + "loss": 0.2118, + "step": 39999 + }, + { + "epoch": 3.240440699935191, + "grad_norm": 0.07122816145420074, + "learning_rate": 4.219811872721545e-05, + "loss": 0.2579, + "step": 40000 + }, + { + "epoch": 3.2405217109526894, + "grad_norm": 0.08362056314945221, + "learning_rate": 4.219361807462082e-05, + "loss": 0.2342, + "step": 40001 + }, + { + "epoch": 3.240602721970188, + "grad_norm": 0.06606101989746094, + "learning_rate": 4.2189117422026196e-05, + "loss": 0.2261, + "step": 40002 + }, + { + "epoch": 3.2406837329876863, + "grad_norm": 0.06786011904478073, + "learning_rate": 4.218461676943157e-05, + "loss": 0.2026, + "step": 40003 + }, + { + "epoch": 3.2407647440051845, + "grad_norm": 0.05847107991576195, + "learning_rate": 4.218011611683694e-05, + "loss": 0.2293, + "step": 40004 + }, + { + "epoch": 3.2408457550226832, + "grad_norm": 0.05374923720955849, + "learning_rate": 4.217561546424232e-05, + "loss": 0.1857, + "step": 40005 + }, + { + "epoch": 3.2409267660401815, + "grad_norm": 0.0643201693892479, + "learning_rate": 4.217111481164769e-05, + "loss": 0.2363, + "step": 40006 + }, + { + "epoch": 3.2410077770576797, + "grad_norm": 0.08065766841173172, + "learning_rate": 4.2166614159053064e-05, + "loss": 0.3056, + "step": 40007 + }, + { + "epoch": 3.2410887880751784, + "grad_norm": 0.07379017770290375, + "learning_rate": 4.216211350645844e-05, + "loss": 0.2356, + "step": 40008 + }, + { + "epoch": 3.2411697990926767, + "grad_norm": 0.06779438257217407, + "learning_rate": 4.215761285386382e-05, + "loss": 0.2324, + "step": 40009 + }, + { + "epoch": 3.241250810110175, + "grad_norm": 0.082339346408844, + "learning_rate": 4.2153112201269185e-05, + "loss": 0.2111, + "step": 40010 + }, + { + "epoch": 3.241331821127673, + "grad_norm": 
0.06292964518070221, + "learning_rate": 4.214861154867456e-05, + "loss": 0.2247, + "step": 40011 + }, + { + "epoch": 3.241412832145172, + "grad_norm": 0.07478726655244827, + "learning_rate": 4.214411089607994e-05, + "loss": 0.1826, + "step": 40012 + }, + { + "epoch": 3.24149384316267, + "grad_norm": 0.07911112159490585, + "learning_rate": 4.2139610243485306e-05, + "loss": 0.2407, + "step": 40013 + }, + { + "epoch": 3.2415748541801683, + "grad_norm": 0.0716332197189331, + "learning_rate": 4.213510959089068e-05, + "loss": 0.2314, + "step": 40014 + }, + { + "epoch": 3.241655865197667, + "grad_norm": 0.07222667336463928, + "learning_rate": 4.213060893829606e-05, + "loss": 0.2013, + "step": 40015 + }, + { + "epoch": 3.2417368762151653, + "grad_norm": 0.07332997024059296, + "learning_rate": 4.2126108285701426e-05, + "loss": 0.2751, + "step": 40016 + }, + { + "epoch": 3.2418178872326635, + "grad_norm": 0.05805470049381256, + "learning_rate": 4.21216076331068e-05, + "loss": 0.1887, + "step": 40017 + }, + { + "epoch": 3.241898898250162, + "grad_norm": 0.07864882051944733, + "learning_rate": 4.211710698051218e-05, + "loss": 0.2469, + "step": 40018 + }, + { + "epoch": 3.2419799092676604, + "grad_norm": 0.08058485388755798, + "learning_rate": 4.211260632791755e-05, + "loss": 0.2614, + "step": 40019 + }, + { + "epoch": 3.2420609202851587, + "grad_norm": 0.09582924097776413, + "learning_rate": 4.210810567532292e-05, + "loss": 0.2512, + "step": 40020 + }, + { + "epoch": 3.2421419313026574, + "grad_norm": 0.07741650193929672, + "learning_rate": 4.21036050227283e-05, + "loss": 0.2381, + "step": 40021 + }, + { + "epoch": 3.2422229423201556, + "grad_norm": 0.07290291041135788, + "learning_rate": 4.2099104370133675e-05, + "loss": 0.2362, + "step": 40022 + }, + { + "epoch": 3.242303953337654, + "grad_norm": 0.06707461178302765, + "learning_rate": 4.209460371753904e-05, + "loss": 0.2087, + "step": 40023 + }, + { + "epoch": 3.242384964355152, + "grad_norm": 0.06291350722312927, + "learning_rate": 4.209010306494442e-05, + "loss": 0.2387, + "step": 40024 + }, + { + "epoch": 3.242465975372651, + "grad_norm": 0.07624591886997223, + "learning_rate": 4.2085602412349796e-05, + "loss": 0.2241, + "step": 40025 + }, + { + "epoch": 3.242546986390149, + "grad_norm": 0.07474566996097565, + "learning_rate": 4.208110175975516e-05, + "loss": 0.2405, + "step": 40026 + }, + { + "epoch": 3.2426279974076473, + "grad_norm": 0.08334729075431824, + "learning_rate": 4.207660110716054e-05, + "loss": 0.2278, + "step": 40027 + }, + { + "epoch": 3.242709008425146, + "grad_norm": 0.07016227394342422, + "learning_rate": 4.2072100454565916e-05, + "loss": 0.2651, + "step": 40028 + }, + { + "epoch": 3.2427900194426442, + "grad_norm": 0.06835842877626419, + "learning_rate": 4.206759980197128e-05, + "loss": 0.2312, + "step": 40029 + }, + { + "epoch": 3.2428710304601425, + "grad_norm": 0.07836335152387619, + "learning_rate": 4.2063099149376664e-05, + "loss": 0.2378, + "step": 40030 + }, + { + "epoch": 3.242952041477641, + "grad_norm": 0.08585887402296066, + "learning_rate": 4.205859849678204e-05, + "loss": 0.2669, + "step": 40031 + }, + { + "epoch": 3.2430330524951394, + "grad_norm": 0.06686677783727646, + "learning_rate": 4.2054097844187404e-05, + "loss": 0.2415, + "step": 40032 + }, + { + "epoch": 3.2431140635126376, + "grad_norm": 0.07130207866430283, + "learning_rate": 4.2049597191592784e-05, + "loss": 0.2666, + "step": 40033 + }, + { + "epoch": 3.243195074530136, + "grad_norm": 0.07913440465927124, + "learning_rate": 4.204509653899816e-05, + 
"loss": 0.2444, + "step": 40034 + }, + { + "epoch": 3.2432760855476346, + "grad_norm": 0.06934109330177307, + "learning_rate": 4.204059588640353e-05, + "loss": 0.2451, + "step": 40035 + }, + { + "epoch": 3.243357096565133, + "grad_norm": 0.07670455425977707, + "learning_rate": 4.2036095233808905e-05, + "loss": 0.2396, + "step": 40036 + }, + { + "epoch": 3.243438107582631, + "grad_norm": 0.0646243691444397, + "learning_rate": 4.203159458121428e-05, + "loss": 0.1887, + "step": 40037 + }, + { + "epoch": 3.2435191186001298, + "grad_norm": 0.0583641417324543, + "learning_rate": 4.202709392861965e-05, + "loss": 0.2231, + "step": 40038 + }, + { + "epoch": 3.243600129617628, + "grad_norm": 0.0692855641245842, + "learning_rate": 4.2022593276025026e-05, + "loss": 0.2222, + "step": 40039 + }, + { + "epoch": 3.2436811406351262, + "grad_norm": 0.08121485263109207, + "learning_rate": 4.20180926234304e-05, + "loss": 0.284, + "step": 40040 + }, + { + "epoch": 3.243762151652625, + "grad_norm": 0.07695061713457108, + "learning_rate": 4.201359197083577e-05, + "loss": 0.2228, + "step": 40041 + }, + { + "epoch": 3.243843162670123, + "grad_norm": 0.09310201555490494, + "learning_rate": 4.200909131824115e-05, + "loss": 0.2258, + "step": 40042 + }, + { + "epoch": 3.2439241736876214, + "grad_norm": 0.07019844651222229, + "learning_rate": 4.200459066564652e-05, + "loss": 0.2314, + "step": 40043 + }, + { + "epoch": 3.24400518470512, + "grad_norm": 0.08403699845075607, + "learning_rate": 4.2000090013051894e-05, + "loss": 0.2421, + "step": 40044 + }, + { + "epoch": 3.2440861957226184, + "grad_norm": 0.06007116660475731, + "learning_rate": 4.199558936045727e-05, + "loss": 0.2404, + "step": 40045 + }, + { + "epoch": 3.2441672067401166, + "grad_norm": 0.07675842195749283, + "learning_rate": 4.199108870786264e-05, + "loss": 0.2554, + "step": 40046 + }, + { + "epoch": 3.244248217757615, + "grad_norm": 0.07528151571750641, + "learning_rate": 4.1986588055268015e-05, + "loss": 0.2055, + "step": 40047 + }, + { + "epoch": 3.2443292287751135, + "grad_norm": 0.07164599746465683, + "learning_rate": 4.1982087402673395e-05, + "loss": 0.2471, + "step": 40048 + }, + { + "epoch": 3.244410239792612, + "grad_norm": 0.08063235133886337, + "learning_rate": 4.197758675007876e-05, + "loss": 0.2196, + "step": 40049 + }, + { + "epoch": 3.24449125081011, + "grad_norm": 0.06814529746770859, + "learning_rate": 4.1973086097484136e-05, + "loss": 0.2558, + "step": 40050 + }, + { + "epoch": 3.2445722618276087, + "grad_norm": 0.0630267783999443, + "learning_rate": 4.1968585444889516e-05, + "loss": 0.1958, + "step": 40051 + }, + { + "epoch": 3.244653272845107, + "grad_norm": 0.07586364448070526, + "learning_rate": 4.196408479229488e-05, + "loss": 0.2516, + "step": 40052 + }, + { + "epoch": 3.244734283862605, + "grad_norm": 0.07543324679136276, + "learning_rate": 4.1959584139700256e-05, + "loss": 0.2222, + "step": 40053 + }, + { + "epoch": 3.244815294880104, + "grad_norm": 0.07649768888950348, + "learning_rate": 4.195508348710564e-05, + "loss": 0.2484, + "step": 40054 + }, + { + "epoch": 3.244896305897602, + "grad_norm": 0.06851378083229065, + "learning_rate": 4.1950582834511004e-05, + "loss": 0.2299, + "step": 40055 + }, + { + "epoch": 3.2449773169151004, + "grad_norm": 0.07393316179513931, + "learning_rate": 4.194608218191638e-05, + "loss": 0.2314, + "step": 40056 + }, + { + "epoch": 3.2450583279325986, + "grad_norm": 0.06716819852590561, + "learning_rate": 4.194158152932176e-05, + "loss": 0.2181, + "step": 40057 + }, + { + "epoch": 
3.2451393389500973, + "grad_norm": 0.07790635526180267, + "learning_rate": 4.1937080876727124e-05, + "loss": 0.2305, + "step": 40058 + }, + { + "epoch": 3.2452203499675956, + "grad_norm": 0.07181359082460403, + "learning_rate": 4.19325802241325e-05, + "loss": 0.2227, + "step": 40059 + }, + { + "epoch": 3.245301360985094, + "grad_norm": 0.08683653920888901, + "learning_rate": 4.192807957153788e-05, + "loss": 0.1908, + "step": 40060 + }, + { + "epoch": 3.2453823720025925, + "grad_norm": 0.0615009181201458, + "learning_rate": 4.192357891894325e-05, + "loss": 0.2266, + "step": 40061 + }, + { + "epoch": 3.2454633830200907, + "grad_norm": 0.08514515310525894, + "learning_rate": 4.191907826634862e-05, + "loss": 0.2502, + "step": 40062 + }, + { + "epoch": 3.245544394037589, + "grad_norm": 0.0665687695145607, + "learning_rate": 4.1914577613754e-05, + "loss": 0.2217, + "step": 40063 + }, + { + "epoch": 3.2456254050550877, + "grad_norm": 0.09101278334856033, + "learning_rate": 4.191007696115937e-05, + "loss": 0.2613, + "step": 40064 + }, + { + "epoch": 3.245706416072586, + "grad_norm": 0.07187115401029587, + "learning_rate": 4.190557630856474e-05, + "loss": 0.2, + "step": 40065 + }, + { + "epoch": 3.245787427090084, + "grad_norm": 0.07283537834882736, + "learning_rate": 4.190107565597012e-05, + "loss": 0.2134, + "step": 40066 + }, + { + "epoch": 3.245868438107583, + "grad_norm": 0.06305906176567078, + "learning_rate": 4.1896575003375494e-05, + "loss": 0.2003, + "step": 40067 + }, + { + "epoch": 3.245949449125081, + "grad_norm": 0.06886248290538788, + "learning_rate": 4.189207435078086e-05, + "loss": 0.2297, + "step": 40068 + }, + { + "epoch": 3.2460304601425793, + "grad_norm": 0.06959490478038788, + "learning_rate": 4.188757369818624e-05, + "loss": 0.2007, + "step": 40069 + }, + { + "epoch": 3.2461114711600776, + "grad_norm": 0.07028921693563461, + "learning_rate": 4.1883073045591614e-05, + "loss": 0.2287, + "step": 40070 + }, + { + "epoch": 3.2461924821775763, + "grad_norm": 0.09249284863471985, + "learning_rate": 4.187857239299698e-05, + "loss": 0.2119, + "step": 40071 + }, + { + "epoch": 3.2462734931950745, + "grad_norm": 0.07930509001016617, + "learning_rate": 4.187407174040236e-05, + "loss": 0.2291, + "step": 40072 + }, + { + "epoch": 3.2463545042125728, + "grad_norm": 0.07560622692108154, + "learning_rate": 4.1869571087807735e-05, + "loss": 0.2331, + "step": 40073 + }, + { + "epoch": 3.2464355152300715, + "grad_norm": 0.06883285194635391, + "learning_rate": 4.186507043521311e-05, + "loss": 0.2736, + "step": 40074 + }, + { + "epoch": 3.2465165262475697, + "grad_norm": 0.05891996994614601, + "learning_rate": 4.186056978261848e-05, + "loss": 0.2471, + "step": 40075 + }, + { + "epoch": 3.246597537265068, + "grad_norm": 0.06350000947713852, + "learning_rate": 4.1856069130023856e-05, + "loss": 0.2182, + "step": 40076 + }, + { + "epoch": 3.2466785482825666, + "grad_norm": 0.08027451485395432, + "learning_rate": 4.185156847742923e-05, + "loss": 0.2259, + "step": 40077 + }, + { + "epoch": 3.246759559300065, + "grad_norm": 0.0639888122677803, + "learning_rate": 4.18470678248346e-05, + "loss": 0.2068, + "step": 40078 + }, + { + "epoch": 3.246840570317563, + "grad_norm": 0.07857982814311981, + "learning_rate": 4.184256717223998e-05, + "loss": 0.2484, + "step": 40079 + }, + { + "epoch": 3.2469215813350614, + "grad_norm": 0.06648974120616913, + "learning_rate": 4.183806651964535e-05, + "loss": 0.2119, + "step": 40080 + }, + { + "epoch": 3.24700259235256, + "grad_norm": 0.07797715067863464, + "learning_rate": 
4.1833565867050724e-05, + "loss": 0.2033, + "step": 40081 + }, + { + "epoch": 3.2470836033700583, + "grad_norm": 0.0580095537006855, + "learning_rate": 4.18290652144561e-05, + "loss": 0.1904, + "step": 40082 + }, + { + "epoch": 3.2471646143875565, + "grad_norm": 0.06907695531845093, + "learning_rate": 4.182456456186147e-05, + "loss": 0.2335, + "step": 40083 + }, + { + "epoch": 3.2472456254050552, + "grad_norm": 0.06992605328559875, + "learning_rate": 4.1820063909266845e-05, + "loss": 0.2249, + "step": 40084 + }, + { + "epoch": 3.2473266364225535, + "grad_norm": 0.07529851049184799, + "learning_rate": 4.181556325667222e-05, + "loss": 0.2418, + "step": 40085 + }, + { + "epoch": 3.2474076474400517, + "grad_norm": 0.05927077680826187, + "learning_rate": 4.181106260407759e-05, + "loss": 0.2169, + "step": 40086 + }, + { + "epoch": 3.2474886584575504, + "grad_norm": 0.06652016192674637, + "learning_rate": 4.1806561951482966e-05, + "loss": 0.1933, + "step": 40087 + }, + { + "epoch": 3.2475696694750487, + "grad_norm": 0.08537282794713974, + "learning_rate": 4.180206129888834e-05, + "loss": 0.229, + "step": 40088 + }, + { + "epoch": 3.247650680492547, + "grad_norm": 0.0686645582318306, + "learning_rate": 4.179756064629371e-05, + "loss": 0.2335, + "step": 40089 + }, + { + "epoch": 3.2477316915100456, + "grad_norm": 0.06994078308343887, + "learning_rate": 4.179305999369909e-05, + "loss": 0.2239, + "step": 40090 + }, + { + "epoch": 3.247812702527544, + "grad_norm": 0.08000598102807999, + "learning_rate": 4.178855934110446e-05, + "loss": 0.273, + "step": 40091 + }, + { + "epoch": 3.247893713545042, + "grad_norm": 0.06356451660394669, + "learning_rate": 4.1784058688509834e-05, + "loss": 0.2106, + "step": 40092 + }, + { + "epoch": 3.2479747245625403, + "grad_norm": 0.07621801644563675, + "learning_rate": 4.1779558035915214e-05, + "loss": 0.2109, + "step": 40093 + }, + { + "epoch": 3.248055735580039, + "grad_norm": 0.0670614168047905, + "learning_rate": 4.177505738332058e-05, + "loss": 0.2243, + "step": 40094 + }, + { + "epoch": 3.2481367465975373, + "grad_norm": 0.06827739626169205, + "learning_rate": 4.1770556730725955e-05, + "loss": 0.2448, + "step": 40095 + }, + { + "epoch": 3.2482177576150355, + "grad_norm": 0.0642535388469696, + "learning_rate": 4.1766056078131335e-05, + "loss": 0.2393, + "step": 40096 + }, + { + "epoch": 3.248298768632534, + "grad_norm": 0.0656471997499466, + "learning_rate": 4.17615554255367e-05, + "loss": 0.2621, + "step": 40097 + }, + { + "epoch": 3.2483797796500324, + "grad_norm": 0.06643085181713104, + "learning_rate": 4.1757054772942075e-05, + "loss": 0.2005, + "step": 40098 + }, + { + "epoch": 3.2484607906675307, + "grad_norm": 0.06502383947372437, + "learning_rate": 4.1752554120347456e-05, + "loss": 0.2495, + "step": 40099 + }, + { + "epoch": 3.248541801685029, + "grad_norm": 0.06458619236946106, + "learning_rate": 4.174805346775282e-05, + "loss": 0.2114, + "step": 40100 + }, + { + "epoch": 3.2486228127025276, + "grad_norm": 0.06655988842248917, + "learning_rate": 4.1743552815158196e-05, + "loss": 0.229, + "step": 40101 + }, + { + "epoch": 3.248703823720026, + "grad_norm": 0.06997954100370407, + "learning_rate": 4.1739052162563577e-05, + "loss": 0.239, + "step": 40102 + }, + { + "epoch": 3.248784834737524, + "grad_norm": 0.06454320251941681, + "learning_rate": 4.173455150996895e-05, + "loss": 0.2321, + "step": 40103 + }, + { + "epoch": 3.248865845755023, + "grad_norm": 0.07845567911863327, + "learning_rate": 4.173005085737432e-05, + "loss": 0.2179, + "step": 40104 + }, + { + 
"epoch": 3.248946856772521, + "grad_norm": 0.07784346491098404, + "learning_rate": 4.17255502047797e-05, + "loss": 0.269, + "step": 40105 + }, + { + "epoch": 3.2490278677900193, + "grad_norm": 0.06842927634716034, + "learning_rate": 4.172104955218507e-05, + "loss": 0.2269, + "step": 40106 + }, + { + "epoch": 3.249108878807518, + "grad_norm": 0.0676320418715477, + "learning_rate": 4.171654889959044e-05, + "loss": 0.2394, + "step": 40107 + }, + { + "epoch": 3.249189889825016, + "grad_norm": 0.08034982532262802, + "learning_rate": 4.171204824699582e-05, + "loss": 0.2489, + "step": 40108 + }, + { + "epoch": 3.2492709008425145, + "grad_norm": 0.07106079906225204, + "learning_rate": 4.170754759440119e-05, + "loss": 0.2396, + "step": 40109 + }, + { + "epoch": 3.249351911860013, + "grad_norm": 0.07400369644165039, + "learning_rate": 4.170304694180656e-05, + "loss": 0.2471, + "step": 40110 + }, + { + "epoch": 3.2494329228775114, + "grad_norm": 0.07235600054264069, + "learning_rate": 4.169854628921194e-05, + "loss": 0.2298, + "step": 40111 + }, + { + "epoch": 3.2495139338950096, + "grad_norm": 0.06874226033687592, + "learning_rate": 4.169404563661731e-05, + "loss": 0.2381, + "step": 40112 + }, + { + "epoch": 3.2495949449125083, + "grad_norm": 0.07894796133041382, + "learning_rate": 4.168954498402268e-05, + "loss": 0.2341, + "step": 40113 + }, + { + "epoch": 3.2496759559300066, + "grad_norm": 0.07245932519435883, + "learning_rate": 4.168504433142806e-05, + "loss": 0.2202, + "step": 40114 + }, + { + "epoch": 3.249756966947505, + "grad_norm": 0.06470391154289246, + "learning_rate": 4.168054367883343e-05, + "loss": 0.2085, + "step": 40115 + }, + { + "epoch": 3.249837977965003, + "grad_norm": 0.06465082615613937, + "learning_rate": 4.167604302623881e-05, + "loss": 0.2549, + "step": 40116 + }, + { + "epoch": 3.2499189889825018, + "grad_norm": 0.06674449145793915, + "learning_rate": 4.167154237364418e-05, + "loss": 0.2069, + "step": 40117 + }, + { + "epoch": 3.25, + "grad_norm": 0.07038500905036926, + "learning_rate": 4.1667041721049554e-05, + "loss": 0.2391, + "step": 40118 + }, + { + "epoch": 3.2500810110174982, + "grad_norm": 0.07672902941703796, + "learning_rate": 4.166254106845493e-05, + "loss": 0.225, + "step": 40119 + }, + { + "epoch": 3.250162022034997, + "grad_norm": 0.07818491011857986, + "learning_rate": 4.16580404158603e-05, + "loss": 0.2197, + "step": 40120 + }, + { + "epoch": 3.250243033052495, + "grad_norm": 0.07456718385219574, + "learning_rate": 4.1653539763265675e-05, + "loss": 0.2212, + "step": 40121 + }, + { + "epoch": 3.2503240440699934, + "grad_norm": 0.07713481038808823, + "learning_rate": 4.164903911067105e-05, + "loss": 0.2228, + "step": 40122 + }, + { + "epoch": 3.2504050550874917, + "grad_norm": 0.06257275491952896, + "learning_rate": 4.164453845807642e-05, + "loss": 0.2329, + "step": 40123 + }, + { + "epoch": 3.2504860661049904, + "grad_norm": 0.08117084950208664, + "learning_rate": 4.1640037805481796e-05, + "loss": 0.2085, + "step": 40124 + }, + { + "epoch": 3.2505670771224886, + "grad_norm": 0.06761786341667175, + "learning_rate": 4.163553715288717e-05, + "loss": 0.2545, + "step": 40125 + }, + { + "epoch": 3.250648088139987, + "grad_norm": 0.08201505988836288, + "learning_rate": 4.163103650029254e-05, + "loss": 0.2813, + "step": 40126 + }, + { + "epoch": 3.2507290991574855, + "grad_norm": 0.07313979417085648, + "learning_rate": 4.162653584769792e-05, + "loss": 0.2312, + "step": 40127 + }, + { + "epoch": 3.250810110174984, + "grad_norm": 0.0680297464132309, + "learning_rate": 
4.162203519510329e-05, + "loss": 0.238, + "step": 40128 + }, + { + "epoch": 3.250891121192482, + "grad_norm": 0.07706769555807114, + "learning_rate": 4.161753454250867e-05, + "loss": 0.2279, + "step": 40129 + }, + { + "epoch": 3.2509721322099807, + "grad_norm": 0.06838444620370865, + "learning_rate": 4.161303388991404e-05, + "loss": 0.2087, + "step": 40130 + }, + { + "epoch": 3.251053143227479, + "grad_norm": 0.08436397463083267, + "learning_rate": 4.160853323731941e-05, + "loss": 0.2532, + "step": 40131 + }, + { + "epoch": 3.251134154244977, + "grad_norm": 0.05488111823797226, + "learning_rate": 4.160403258472479e-05, + "loss": 0.2381, + "step": 40132 + }, + { + "epoch": 3.251215165262476, + "grad_norm": 0.07039676606655121, + "learning_rate": 4.159953193213016e-05, + "loss": 0.2248, + "step": 40133 + }, + { + "epoch": 3.251296176279974, + "grad_norm": 0.0652816966176033, + "learning_rate": 4.159503127953553e-05, + "loss": 0.2165, + "step": 40134 + }, + { + "epoch": 3.2513771872974724, + "grad_norm": 0.07050791382789612, + "learning_rate": 4.159053062694091e-05, + "loss": 0.235, + "step": 40135 + }, + { + "epoch": 3.251458198314971, + "grad_norm": 0.07297641783952713, + "learning_rate": 4.158602997434628e-05, + "loss": 0.2507, + "step": 40136 + }, + { + "epoch": 3.2515392093324693, + "grad_norm": 0.0634954422712326, + "learning_rate": 4.158152932175165e-05, + "loss": 0.2073, + "step": 40137 + }, + { + "epoch": 3.2516202203499676, + "grad_norm": 0.07681002467870712, + "learning_rate": 4.157702866915703e-05, + "loss": 0.2564, + "step": 40138 + }, + { + "epoch": 3.251701231367466, + "grad_norm": 0.08574255555868149, + "learning_rate": 4.15725280165624e-05, + "loss": 0.271, + "step": 40139 + }, + { + "epoch": 3.2517822423849645, + "grad_norm": 0.06906864047050476, + "learning_rate": 4.1568027363967773e-05, + "loss": 0.1695, + "step": 40140 + }, + { + "epoch": 3.2518632534024627, + "grad_norm": 0.07001274824142456, + "learning_rate": 4.1563526711373154e-05, + "loss": 0.2208, + "step": 40141 + }, + { + "epoch": 3.251944264419961, + "grad_norm": 0.07010383158922195, + "learning_rate": 4.155902605877853e-05, + "loss": 0.2613, + "step": 40142 + }, + { + "epoch": 3.2520252754374597, + "grad_norm": 0.07222133874893188, + "learning_rate": 4.1554525406183894e-05, + "loss": 0.2113, + "step": 40143 + }, + { + "epoch": 3.252106286454958, + "grad_norm": 0.08615390211343765, + "learning_rate": 4.1550024753589275e-05, + "loss": 0.2201, + "step": 40144 + }, + { + "epoch": 3.252187297472456, + "grad_norm": 0.06933669000864029, + "learning_rate": 4.154552410099465e-05, + "loss": 0.2144, + "step": 40145 + }, + { + "epoch": 3.2522683084899544, + "grad_norm": 0.07412154227495193, + "learning_rate": 4.1541023448400015e-05, + "loss": 0.2293, + "step": 40146 + }, + { + "epoch": 3.252349319507453, + "grad_norm": 0.05812745541334152, + "learning_rate": 4.1536522795805395e-05, + "loss": 0.1891, + "step": 40147 + }, + { + "epoch": 3.2524303305249513, + "grad_norm": 0.08077571541070938, + "learning_rate": 4.153202214321077e-05, + "loss": 0.2291, + "step": 40148 + }, + { + "epoch": 3.2525113415424496, + "grad_norm": 0.07784144580364227, + "learning_rate": 4.1527521490616136e-05, + "loss": 0.2176, + "step": 40149 + }, + { + "epoch": 3.2525923525599483, + "grad_norm": 0.06963331252336502, + "learning_rate": 4.1523020838021516e-05, + "loss": 0.2368, + "step": 40150 + }, + { + "epoch": 3.2526733635774465, + "grad_norm": 0.07795493304729462, + "learning_rate": 4.151852018542689e-05, + "loss": 0.2419, + "step": 40151 + }, + { + 
"epoch": 3.2527543745949448, + "grad_norm": 0.05834238603711128, + "learning_rate": 4.151401953283226e-05, + "loss": 0.1873, + "step": 40152 + }, + { + "epoch": 3.2528353856124435, + "grad_norm": 0.06364904344081879, + "learning_rate": 4.150951888023764e-05, + "loss": 0.2099, + "step": 40153 + }, + { + "epoch": 3.2529163966299417, + "grad_norm": 0.08391781896352768, + "learning_rate": 4.150501822764301e-05, + "loss": 0.2647, + "step": 40154 + }, + { + "epoch": 3.25299740764744, + "grad_norm": 0.08501686900854111, + "learning_rate": 4.1500517575048384e-05, + "loss": 0.2522, + "step": 40155 + }, + { + "epoch": 3.2530784186649386, + "grad_norm": 0.06751678884029388, + "learning_rate": 4.149601692245376e-05, + "loss": 0.2084, + "step": 40156 + }, + { + "epoch": 3.253159429682437, + "grad_norm": 0.07635574042797089, + "learning_rate": 4.149151626985913e-05, + "loss": 0.2444, + "step": 40157 + }, + { + "epoch": 3.253240440699935, + "grad_norm": 0.06685523688793182, + "learning_rate": 4.1487015617264505e-05, + "loss": 0.1723, + "step": 40158 + }, + { + "epoch": 3.253321451717434, + "grad_norm": 0.06515222042798996, + "learning_rate": 4.148251496466988e-05, + "loss": 0.2056, + "step": 40159 + }, + { + "epoch": 3.253402462734932, + "grad_norm": 0.06930624693632126, + "learning_rate": 4.147801431207525e-05, + "loss": 0.2188, + "step": 40160 + }, + { + "epoch": 3.2534834737524303, + "grad_norm": 0.0722808688879013, + "learning_rate": 4.1473513659480626e-05, + "loss": 0.2177, + "step": 40161 + }, + { + "epoch": 3.2535644847699285, + "grad_norm": 0.07483646273612976, + "learning_rate": 4.1469013006886e-05, + "loss": 0.258, + "step": 40162 + }, + { + "epoch": 3.2536454957874272, + "grad_norm": 0.07711175084114075, + "learning_rate": 4.146451235429137e-05, + "loss": 0.1879, + "step": 40163 + }, + { + "epoch": 3.2537265068049255, + "grad_norm": 0.08365480601787567, + "learning_rate": 4.146001170169675e-05, + "loss": 0.2328, + "step": 40164 + }, + { + "epoch": 3.2538075178224237, + "grad_norm": 0.07842250913381577, + "learning_rate": 4.145551104910212e-05, + "loss": 0.2462, + "step": 40165 + }, + { + "epoch": 3.2538885288399224, + "grad_norm": 0.07055499404668808, + "learning_rate": 4.1451010396507494e-05, + "loss": 0.2611, + "step": 40166 + }, + { + "epoch": 3.2539695398574207, + "grad_norm": 0.08839535713195801, + "learning_rate": 4.144650974391287e-05, + "loss": 0.2337, + "step": 40167 + }, + { + "epoch": 3.254050550874919, + "grad_norm": 0.06849615275859833, + "learning_rate": 4.144200909131825e-05, + "loss": 0.2392, + "step": 40168 + }, + { + "epoch": 3.254131561892417, + "grad_norm": 0.0740901306271553, + "learning_rate": 4.1437508438723615e-05, + "loss": 0.2294, + "step": 40169 + }, + { + "epoch": 3.254212572909916, + "grad_norm": 0.07550173997879028, + "learning_rate": 4.143300778612899e-05, + "loss": 0.2075, + "step": 40170 + }, + { + "epoch": 3.254293583927414, + "grad_norm": 0.06401327252388, + "learning_rate": 4.142850713353437e-05, + "loss": 0.2083, + "step": 40171 + }, + { + "epoch": 3.2543745949449123, + "grad_norm": 0.07523950189352036, + "learning_rate": 4.1424006480939736e-05, + "loss": 0.2428, + "step": 40172 + }, + { + "epoch": 3.254455605962411, + "grad_norm": 0.06384017318487167, + "learning_rate": 4.141950582834511e-05, + "loss": 0.2369, + "step": 40173 + }, + { + "epoch": 3.2545366169799093, + "grad_norm": 0.06851238757371902, + "learning_rate": 4.141500517575049e-05, + "loss": 0.2115, + "step": 40174 + }, + { + "epoch": 3.2546176279974075, + "grad_norm": 0.07990797609090805, + 
"learning_rate": 4.1410504523155856e-05, + "loss": 0.22, + "step": 40175 + }, + { + "epoch": 3.254698639014906, + "grad_norm": 0.05939148738980293, + "learning_rate": 4.140600387056123e-05, + "loss": 0.212, + "step": 40176 + }, + { + "epoch": 3.2547796500324044, + "grad_norm": 0.0688575804233551, + "learning_rate": 4.140150321796661e-05, + "loss": 0.2121, + "step": 40177 + }, + { + "epoch": 3.2548606610499027, + "grad_norm": 0.07078464329242706, + "learning_rate": 4.139700256537198e-05, + "loss": 0.1916, + "step": 40178 + }, + { + "epoch": 3.2549416720674014, + "grad_norm": 0.07321570813655853, + "learning_rate": 4.139250191277735e-05, + "loss": 0.2323, + "step": 40179 + }, + { + "epoch": 3.2550226830848996, + "grad_norm": 0.07249194383621216, + "learning_rate": 4.138800126018273e-05, + "loss": 0.2252, + "step": 40180 + }, + { + "epoch": 3.255103694102398, + "grad_norm": 0.08079152554273605, + "learning_rate": 4.1383500607588105e-05, + "loss": 0.226, + "step": 40181 + }, + { + "epoch": 3.2551847051198965, + "grad_norm": 0.0795033872127533, + "learning_rate": 4.137899995499347e-05, + "loss": 0.2598, + "step": 40182 + }, + { + "epoch": 3.255265716137395, + "grad_norm": 0.06471917778253555, + "learning_rate": 4.137449930239885e-05, + "loss": 0.2138, + "step": 40183 + }, + { + "epoch": 3.255346727154893, + "grad_norm": 0.07006263732910156, + "learning_rate": 4.1369998649804226e-05, + "loss": 0.2195, + "step": 40184 + }, + { + "epoch": 3.2554277381723913, + "grad_norm": 0.06144615635275841, + "learning_rate": 4.136549799720959e-05, + "loss": 0.2555, + "step": 40185 + }, + { + "epoch": 3.25550874918989, + "grad_norm": 0.0662737786769867, + "learning_rate": 4.136099734461497e-05, + "loss": 0.1939, + "step": 40186 + }, + { + "epoch": 3.255589760207388, + "grad_norm": 0.06354191899299622, + "learning_rate": 4.1356496692020346e-05, + "loss": 0.2214, + "step": 40187 + }, + { + "epoch": 3.2556707712248865, + "grad_norm": 0.08236868679523468, + "learning_rate": 4.135199603942572e-05, + "loss": 0.2174, + "step": 40188 + }, + { + "epoch": 3.255751782242385, + "grad_norm": 0.09432529658079147, + "learning_rate": 4.1347495386831094e-05, + "loss": 0.2061, + "step": 40189 + }, + { + "epoch": 3.2558327932598834, + "grad_norm": 0.06357807666063309, + "learning_rate": 4.134299473423647e-05, + "loss": 0.1987, + "step": 40190 + }, + { + "epoch": 3.2559138042773816, + "grad_norm": 0.06725596636533737, + "learning_rate": 4.133849408164184e-05, + "loss": 0.2093, + "step": 40191 + }, + { + "epoch": 3.25599481529488, + "grad_norm": 0.06739084422588348, + "learning_rate": 4.1333993429047214e-05, + "loss": 0.2309, + "step": 40192 + }, + { + "epoch": 3.2560758263123786, + "grad_norm": 0.07015515863895416, + "learning_rate": 4.132949277645259e-05, + "loss": 0.2096, + "step": 40193 + }, + { + "epoch": 3.256156837329877, + "grad_norm": 0.06270213425159454, + "learning_rate": 4.132499212385796e-05, + "loss": 0.2008, + "step": 40194 + }, + { + "epoch": 3.256237848347375, + "grad_norm": 0.08969100564718246, + "learning_rate": 4.1320491471263335e-05, + "loss": 0.2107, + "step": 40195 + }, + { + "epoch": 3.2563188593648738, + "grad_norm": 0.07705602049827576, + "learning_rate": 4.131599081866871e-05, + "loss": 0.1887, + "step": 40196 + }, + { + "epoch": 3.256399870382372, + "grad_norm": 0.08064062893390656, + "learning_rate": 4.131149016607408e-05, + "loss": 0.2065, + "step": 40197 + }, + { + "epoch": 3.2564808813998702, + "grad_norm": 0.06924363970756531, + "learning_rate": 4.1306989513479456e-05, + "loss": 0.2621, + "step": 
40198 + }, + { + "epoch": 3.256561892417369, + "grad_norm": 0.06432975083589554, + "learning_rate": 4.130248886088483e-05, + "loss": 0.229, + "step": 40199 + }, + { + "epoch": 3.256642903434867, + "grad_norm": 0.0756380558013916, + "learning_rate": 4.12979882082902e-05, + "loss": 0.2193, + "step": 40200 + }, + { + "epoch": 3.2567239144523654, + "grad_norm": 0.06310634315013885, + "learning_rate": 4.129348755569558e-05, + "loss": 0.2315, + "step": 40201 + }, + { + "epoch": 3.256804925469864, + "grad_norm": 0.0665639266371727, + "learning_rate": 4.128898690310095e-05, + "loss": 0.2299, + "step": 40202 + }, + { + "epoch": 3.2568859364873624, + "grad_norm": 0.06902968138456345, + "learning_rate": 4.1284486250506324e-05, + "loss": 0.2383, + "step": 40203 + }, + { + "epoch": 3.2569669475048606, + "grad_norm": 0.0826072245836258, + "learning_rate": 4.12799855979117e-05, + "loss": 0.2294, + "step": 40204 + }, + { + "epoch": 3.2570479585223593, + "grad_norm": 0.06567224115133286, + "learning_rate": 4.127548494531707e-05, + "loss": 0.2133, + "step": 40205 + }, + { + "epoch": 3.2571289695398575, + "grad_norm": 0.0768572986125946, + "learning_rate": 4.1270984292722445e-05, + "loss": 0.2166, + "step": 40206 + }, + { + "epoch": 3.2572099805573558, + "grad_norm": 0.06855472177267075, + "learning_rate": 4.1266483640127825e-05, + "loss": 0.2298, + "step": 40207 + }, + { + "epoch": 3.257290991574854, + "grad_norm": 0.06466935575008392, + "learning_rate": 4.126198298753319e-05, + "loss": 0.2315, + "step": 40208 + }, + { + "epoch": 3.2573720025923527, + "grad_norm": 0.06445963680744171, + "learning_rate": 4.1257482334938566e-05, + "loss": 0.2114, + "step": 40209 + }, + { + "epoch": 3.257453013609851, + "grad_norm": 0.06506424397230148, + "learning_rate": 4.1252981682343946e-05, + "loss": 0.2329, + "step": 40210 + }, + { + "epoch": 3.257534024627349, + "grad_norm": 0.06592871993780136, + "learning_rate": 4.124848102974931e-05, + "loss": 0.2447, + "step": 40211 + }, + { + "epoch": 3.257615035644848, + "grad_norm": 0.05974597483873367, + "learning_rate": 4.1243980377154686e-05, + "loss": 0.2061, + "step": 40212 + }, + { + "epoch": 3.257696046662346, + "grad_norm": 0.07356038689613342, + "learning_rate": 4.123947972456007e-05, + "loss": 0.2689, + "step": 40213 + }, + { + "epoch": 3.2577770576798444, + "grad_norm": 0.06664173305034637, + "learning_rate": 4.1234979071965434e-05, + "loss": 0.2183, + "step": 40214 + }, + { + "epoch": 3.2578580686973426, + "grad_norm": 0.0713634043931961, + "learning_rate": 4.123047841937081e-05, + "loss": 0.2332, + "step": 40215 + }, + { + "epoch": 3.2579390797148413, + "grad_norm": 0.07688642293214798, + "learning_rate": 4.122597776677619e-05, + "loss": 0.2101, + "step": 40216 + }, + { + "epoch": 3.2580200907323396, + "grad_norm": 0.06881708651781082, + "learning_rate": 4.1221477114181554e-05, + "loss": 0.2483, + "step": 40217 + }, + { + "epoch": 3.258101101749838, + "grad_norm": 0.07010731101036072, + "learning_rate": 4.121697646158693e-05, + "loss": 0.2214, + "step": 40218 + }, + { + "epoch": 3.2581821127673365, + "grad_norm": 0.06224513053894043, + "learning_rate": 4.121247580899231e-05, + "loss": 0.1854, + "step": 40219 + }, + { + "epoch": 3.2582631237848347, + "grad_norm": 0.08643602579832077, + "learning_rate": 4.120797515639768e-05, + "loss": 0.2819, + "step": 40220 + }, + { + "epoch": 3.258344134802333, + "grad_norm": 0.08633658289909363, + "learning_rate": 4.1203474503803056e-05, + "loss": 0.2573, + "step": 40221 + }, + { + "epoch": 3.2584251458198317, + "grad_norm": 
0.06527513265609741, + "learning_rate": 4.119897385120843e-05, + "loss": 0.2206, + "step": 40222 + }, + { + "epoch": 3.25850615683733, + "grad_norm": 0.08194277435541153, + "learning_rate": 4.11944731986138e-05, + "loss": 0.2267, + "step": 40223 + }, + { + "epoch": 3.258587167854828, + "grad_norm": 0.07805872708559036, + "learning_rate": 4.1189972546019176e-05, + "loss": 0.2558, + "step": 40224 + }, + { + "epoch": 3.258668178872327, + "grad_norm": 0.07843924313783646, + "learning_rate": 4.118547189342455e-05, + "loss": 0.209, + "step": 40225 + }, + { + "epoch": 3.258749189889825, + "grad_norm": 0.0763542577624321, + "learning_rate": 4.1180971240829924e-05, + "loss": 0.218, + "step": 40226 + }, + { + "epoch": 3.2588302009073233, + "grad_norm": 0.07482190430164337, + "learning_rate": 4.11764705882353e-05, + "loss": 0.24, + "step": 40227 + }, + { + "epoch": 3.2589112119248216, + "grad_norm": 0.08386291563510895, + "learning_rate": 4.117196993564067e-05, + "loss": 0.2434, + "step": 40228 + }, + { + "epoch": 3.2589922229423203, + "grad_norm": 0.08602840453386307, + "learning_rate": 4.1167469283046044e-05, + "loss": 0.2439, + "step": 40229 + }, + { + "epoch": 3.2590732339598185, + "grad_norm": 0.08088722825050354, + "learning_rate": 4.116296863045142e-05, + "loss": 0.2181, + "step": 40230 + }, + { + "epoch": 3.2591542449773168, + "grad_norm": 0.057865746319293976, + "learning_rate": 4.115846797785679e-05, + "loss": 0.1942, + "step": 40231 + }, + { + "epoch": 3.2592352559948155, + "grad_norm": 0.06597369909286499, + "learning_rate": 4.1153967325262165e-05, + "loss": 0.225, + "step": 40232 + }, + { + "epoch": 3.2593162670123137, + "grad_norm": 0.0643196552991867, + "learning_rate": 4.114946667266754e-05, + "loss": 0.2137, + "step": 40233 + }, + { + "epoch": 3.259397278029812, + "grad_norm": 0.06918679922819138, + "learning_rate": 4.114496602007291e-05, + "loss": 0.3121, + "step": 40234 + }, + { + "epoch": 3.2594782890473106, + "grad_norm": 0.07351045310497284, + "learning_rate": 4.1140465367478286e-05, + "loss": 0.2281, + "step": 40235 + }, + { + "epoch": 3.259559300064809, + "grad_norm": 0.07151784747838974, + "learning_rate": 4.113596471488366e-05, + "loss": 0.2064, + "step": 40236 + }, + { + "epoch": 3.259640311082307, + "grad_norm": 0.06628546118736267, + "learning_rate": 4.113146406228903e-05, + "loss": 0.2263, + "step": 40237 + }, + { + "epoch": 3.2597213220998054, + "grad_norm": 0.06502491235733032, + "learning_rate": 4.112696340969441e-05, + "loss": 0.2323, + "step": 40238 + }, + { + "epoch": 3.259802333117304, + "grad_norm": 0.06255894899368286, + "learning_rate": 4.112246275709978e-05, + "loss": 0.1968, + "step": 40239 + }, + { + "epoch": 3.2598833441348023, + "grad_norm": 0.09315559267997742, + "learning_rate": 4.1117962104505154e-05, + "loss": 0.2387, + "step": 40240 + }, + { + "epoch": 3.2599643551523005, + "grad_norm": 0.07603956013917923, + "learning_rate": 4.111346145191053e-05, + "loss": 0.2689, + "step": 40241 + }, + { + "epoch": 3.2600453661697992, + "grad_norm": 0.07923232764005661, + "learning_rate": 4.11089607993159e-05, + "loss": 0.2161, + "step": 40242 + }, + { + "epoch": 3.2601263771872975, + "grad_norm": 0.07558530569076538, + "learning_rate": 4.1104460146721275e-05, + "loss": 0.2411, + "step": 40243 + }, + { + "epoch": 3.2602073882047957, + "grad_norm": 0.0685918852686882, + "learning_rate": 4.109995949412665e-05, + "loss": 0.2282, + "step": 40244 + }, + { + "epoch": 3.2602883992222944, + "grad_norm": 0.06940195709466934, + "learning_rate": 4.109545884153202e-05, + 
"loss": 0.2205, + "step": 40245 + }, + { + "epoch": 3.2603694102397927, + "grad_norm": 0.050486329942941666, + "learning_rate": 4.1090958188937396e-05, + "loss": 0.1769, + "step": 40246 + }, + { + "epoch": 3.260450421257291, + "grad_norm": 0.06900981813669205, + "learning_rate": 4.108645753634277e-05, + "loss": 0.1998, + "step": 40247 + }, + { + "epoch": 3.2605314322747896, + "grad_norm": 0.07367018610239029, + "learning_rate": 4.108195688374814e-05, + "loss": 0.2257, + "step": 40248 + }, + { + "epoch": 3.260612443292288, + "grad_norm": 0.07741806656122208, + "learning_rate": 4.107745623115352e-05, + "loss": 0.2097, + "step": 40249 + }, + { + "epoch": 3.260693454309786, + "grad_norm": 0.0637543573975563, + "learning_rate": 4.107295557855889e-05, + "loss": 0.2302, + "step": 40250 + }, + { + "epoch": 3.2607744653272843, + "grad_norm": 0.08200757205486298, + "learning_rate": 4.1068454925964264e-05, + "loss": 0.2664, + "step": 40251 + }, + { + "epoch": 3.260855476344783, + "grad_norm": 0.07200410962104797, + "learning_rate": 4.1063954273369644e-05, + "loss": 0.1957, + "step": 40252 + }, + { + "epoch": 3.2609364873622813, + "grad_norm": 0.08035401999950409, + "learning_rate": 4.105945362077501e-05, + "loss": 0.2495, + "step": 40253 + }, + { + "epoch": 3.2610174983797795, + "grad_norm": 0.07450471818447113, + "learning_rate": 4.1054952968180385e-05, + "loss": 0.245, + "step": 40254 + }, + { + "epoch": 3.261098509397278, + "grad_norm": 0.07003799080848694, + "learning_rate": 4.1050452315585765e-05, + "loss": 0.24, + "step": 40255 + }, + { + "epoch": 3.2611795204147764, + "grad_norm": 0.06997160613536835, + "learning_rate": 4.104595166299113e-05, + "loss": 0.249, + "step": 40256 + }, + { + "epoch": 3.2612605314322747, + "grad_norm": 0.062063608318567276, + "learning_rate": 4.104145101039651e-05, + "loss": 0.2189, + "step": 40257 + }, + { + "epoch": 3.261341542449773, + "grad_norm": 0.08056069165468216, + "learning_rate": 4.1036950357801886e-05, + "loss": 0.25, + "step": 40258 + }, + { + "epoch": 3.2614225534672716, + "grad_norm": 0.06770657747983932, + "learning_rate": 4.103244970520725e-05, + "loss": 0.2254, + "step": 40259 + }, + { + "epoch": 3.26150356448477, + "grad_norm": 0.06321107596158981, + "learning_rate": 4.102794905261263e-05, + "loss": 0.2001, + "step": 40260 + }, + { + "epoch": 3.261584575502268, + "grad_norm": 0.0895034447312355, + "learning_rate": 4.1023448400018007e-05, + "loss": 0.2436, + "step": 40261 + }, + { + "epoch": 3.261665586519767, + "grad_norm": 0.06674846261739731, + "learning_rate": 4.101894774742338e-05, + "loss": 0.2137, + "step": 40262 + }, + { + "epoch": 3.261746597537265, + "grad_norm": 0.06829430162906647, + "learning_rate": 4.1014447094828754e-05, + "loss": 0.2496, + "step": 40263 + }, + { + "epoch": 3.2618276085547633, + "grad_norm": 0.07296734303236008, + "learning_rate": 4.100994644223413e-05, + "loss": 0.207, + "step": 40264 + }, + { + "epoch": 3.261908619572262, + "grad_norm": 0.09419719874858856, + "learning_rate": 4.10054457896395e-05, + "loss": 0.2538, + "step": 40265 + }, + { + "epoch": 3.26198963058976, + "grad_norm": 0.06881719082593918, + "learning_rate": 4.1000945137044875e-05, + "loss": 0.2064, + "step": 40266 + }, + { + "epoch": 3.2620706416072585, + "grad_norm": 0.0725458487868309, + "learning_rate": 4.099644448445025e-05, + "loss": 0.2233, + "step": 40267 + }, + { + "epoch": 3.262151652624757, + "grad_norm": 0.06781957298517227, + "learning_rate": 4.099194383185562e-05, + "loss": 0.2181, + "step": 40268 + }, + { + "epoch": 3.2622326636422554, + 
"grad_norm": 0.070824533700943, + "learning_rate": 4.0987443179260995e-05, + "loss": 0.2251, + "step": 40269 + }, + { + "epoch": 3.2623136746597536, + "grad_norm": 0.06839597970247269, + "learning_rate": 4.098294252666637e-05, + "loss": 0.2105, + "step": 40270 + }, + { + "epoch": 3.2623946856772523, + "grad_norm": 0.05092064291238785, + "learning_rate": 4.097844187407174e-05, + "loss": 0.1938, + "step": 40271 + }, + { + "epoch": 3.2624756966947506, + "grad_norm": 0.08955680578947067, + "learning_rate": 4.0973941221477116e-05, + "loss": 0.2246, + "step": 40272 + }, + { + "epoch": 3.262556707712249, + "grad_norm": 0.0841311514377594, + "learning_rate": 4.096944056888249e-05, + "loss": 0.2397, + "step": 40273 + }, + { + "epoch": 3.262637718729747, + "grad_norm": 0.06255053728818893, + "learning_rate": 4.0964939916287863e-05, + "loss": 0.2135, + "step": 40274 + }, + { + "epoch": 3.2627187297472457, + "grad_norm": 0.06263235956430435, + "learning_rate": 4.096043926369324e-05, + "loss": 0.2218, + "step": 40275 + }, + { + "epoch": 3.262799740764744, + "grad_norm": 0.0728955864906311, + "learning_rate": 4.095593861109861e-05, + "loss": 0.2313, + "step": 40276 + }, + { + "epoch": 3.2628807517822422, + "grad_norm": 0.06179650127887726, + "learning_rate": 4.0951437958503984e-05, + "loss": 0.218, + "step": 40277 + }, + { + "epoch": 3.262961762799741, + "grad_norm": 0.06603959947824478, + "learning_rate": 4.094693730590936e-05, + "loss": 0.2428, + "step": 40278 + }, + { + "epoch": 3.263042773817239, + "grad_norm": 0.0733291283249855, + "learning_rate": 4.094243665331473e-05, + "loss": 0.2307, + "step": 40279 + }, + { + "epoch": 3.2631237848347374, + "grad_norm": 0.0856151282787323, + "learning_rate": 4.0937936000720105e-05, + "loss": 0.254, + "step": 40280 + }, + { + "epoch": 3.2632047958522357, + "grad_norm": 0.06327536702156067, + "learning_rate": 4.093343534812548e-05, + "loss": 0.1955, + "step": 40281 + }, + { + "epoch": 3.2632858068697344, + "grad_norm": 0.07989270985126495, + "learning_rate": 4.092893469553085e-05, + "loss": 0.2325, + "step": 40282 + }, + { + "epoch": 3.2633668178872326, + "grad_norm": 0.0660238191485405, + "learning_rate": 4.0924434042936226e-05, + "loss": 0.2338, + "step": 40283 + }, + { + "epoch": 3.263447828904731, + "grad_norm": 0.06523341685533524, + "learning_rate": 4.09199333903416e-05, + "loss": 0.2042, + "step": 40284 + }, + { + "epoch": 3.2635288399222295, + "grad_norm": 0.07220982015132904, + "learning_rate": 4.091543273774697e-05, + "loss": 0.2523, + "step": 40285 + }, + { + "epoch": 3.2636098509397278, + "grad_norm": 0.0707927718758583, + "learning_rate": 4.091093208515235e-05, + "loss": 0.2179, + "step": 40286 + }, + { + "epoch": 3.263690861957226, + "grad_norm": 0.06387905031442642, + "learning_rate": 4.090643143255772e-05, + "loss": 0.2315, + "step": 40287 + }, + { + "epoch": 3.2637718729747247, + "grad_norm": 0.07509984076023102, + "learning_rate": 4.09019307799631e-05, + "loss": 0.2146, + "step": 40288 + }, + { + "epoch": 3.263852883992223, + "grad_norm": 0.07076673209667206, + "learning_rate": 4.089743012736847e-05, + "loss": 0.2459, + "step": 40289 + }, + { + "epoch": 3.263933895009721, + "grad_norm": 0.0829244926571846, + "learning_rate": 4.089292947477385e-05, + "loss": 0.2001, + "step": 40290 + }, + { + "epoch": 3.26401490602722, + "grad_norm": 0.08614283800125122, + "learning_rate": 4.088842882217922e-05, + "loss": 0.2058, + "step": 40291 + }, + { + "epoch": 3.264095917044718, + "grad_norm": 0.06972739100456238, + "learning_rate": 4.088392816958459e-05, + 
"loss": 0.2176, + "step": 40292 + }, + { + "epoch": 3.2641769280622164, + "grad_norm": 0.07807248830795288, + "learning_rate": 4.087942751698997e-05, + "loss": 0.2018, + "step": 40293 + }, + { + "epoch": 3.264257939079715, + "grad_norm": 0.07096074521541595, + "learning_rate": 4.087492686439534e-05, + "loss": 0.22, + "step": 40294 + }, + { + "epoch": 3.2643389500972133, + "grad_norm": 0.0878463014960289, + "learning_rate": 4.087042621180071e-05, + "loss": 0.2645, + "step": 40295 + }, + { + "epoch": 3.2644199611147116, + "grad_norm": 0.07176758348941803, + "learning_rate": 4.086592555920609e-05, + "loss": 0.2477, + "step": 40296 + }, + { + "epoch": 3.26450097213221, + "grad_norm": 0.07876536250114441, + "learning_rate": 4.086142490661146e-05, + "loss": 0.2319, + "step": 40297 + }, + { + "epoch": 3.2645819831497085, + "grad_norm": 0.07930036634206772, + "learning_rate": 4.085692425401683e-05, + "loss": 0.3047, + "step": 40298 + }, + { + "epoch": 3.2646629941672067, + "grad_norm": 0.07112422585487366, + "learning_rate": 4.085242360142221e-05, + "loss": 0.2439, + "step": 40299 + }, + { + "epoch": 3.264744005184705, + "grad_norm": 0.07274965941905975, + "learning_rate": 4.0847922948827584e-05, + "loss": 0.2461, + "step": 40300 + }, + { + "epoch": 3.2648250162022037, + "grad_norm": 0.06299559026956558, + "learning_rate": 4.084342229623296e-05, + "loss": 0.2217, + "step": 40301 + }, + { + "epoch": 3.264906027219702, + "grad_norm": 0.058486804366111755, + "learning_rate": 4.083892164363833e-05, + "loss": 0.2019, + "step": 40302 + }, + { + "epoch": 3.2649870382372, + "grad_norm": 0.07279517501592636, + "learning_rate": 4.0834420991043705e-05, + "loss": 0.2286, + "step": 40303 + }, + { + "epoch": 3.2650680492546984, + "grad_norm": 0.07226823270320892, + "learning_rate": 4.082992033844908e-05, + "loss": 0.2191, + "step": 40304 + }, + { + "epoch": 3.265149060272197, + "grad_norm": 0.07637904584407806, + "learning_rate": 4.082541968585445e-05, + "loss": 0.2184, + "step": 40305 + }, + { + "epoch": 3.2652300712896953, + "grad_norm": 0.0646221861243248, + "learning_rate": 4.0820919033259826e-05, + "loss": 0.22, + "step": 40306 + }, + { + "epoch": 3.2653110823071936, + "grad_norm": 0.06766191124916077, + "learning_rate": 4.08164183806652e-05, + "loss": 0.2598, + "step": 40307 + }, + { + "epoch": 3.2653920933246923, + "grad_norm": 0.07320290803909302, + "learning_rate": 4.081191772807057e-05, + "loss": 0.2415, + "step": 40308 + }, + { + "epoch": 3.2654731043421905, + "grad_norm": 0.0638764277100563, + "learning_rate": 4.0807417075475946e-05, + "loss": 0.2177, + "step": 40309 + }, + { + "epoch": 3.2655541153596888, + "grad_norm": 0.07175182551145554, + "learning_rate": 4.080291642288132e-05, + "loss": 0.2193, + "step": 40310 + }, + { + "epoch": 3.2656351263771874, + "grad_norm": 0.057568471878767014, + "learning_rate": 4.0798415770286694e-05, + "loss": 0.2124, + "step": 40311 + }, + { + "epoch": 3.2657161373946857, + "grad_norm": 0.06688519567251205, + "learning_rate": 4.079391511769207e-05, + "loss": 0.2058, + "step": 40312 + }, + { + "epoch": 3.265797148412184, + "grad_norm": 0.06936531513929367, + "learning_rate": 4.078941446509744e-05, + "loss": 0.2071, + "step": 40313 + }, + { + "epoch": 3.2658781594296826, + "grad_norm": 0.07027362287044525, + "learning_rate": 4.0784913812502814e-05, + "loss": 0.2251, + "step": 40314 + }, + { + "epoch": 3.265959170447181, + "grad_norm": 0.07744862139225006, + "learning_rate": 4.078041315990819e-05, + "loss": 0.2324, + "step": 40315 + }, + { + "epoch": 
3.266040181464679, + "grad_norm": 0.06890869140625, + "learning_rate": 4.077591250731356e-05, + "loss": 0.1964, + "step": 40316 + }, + { + "epoch": 3.266121192482178, + "grad_norm": 0.06374898552894592, + "learning_rate": 4.0771411854718935e-05, + "loss": 0.2448, + "step": 40317 + }, + { + "epoch": 3.266202203499676, + "grad_norm": 0.07263403385877609, + "learning_rate": 4.076691120212431e-05, + "loss": 0.2453, + "step": 40318 + }, + { + "epoch": 3.2662832145171743, + "grad_norm": 0.08141748607158661, + "learning_rate": 4.076241054952968e-05, + "loss": 0.2504, + "step": 40319 + }, + { + "epoch": 3.2663642255346725, + "grad_norm": 0.08458217233419418, + "learning_rate": 4.0757909896935056e-05, + "loss": 0.2755, + "step": 40320 + }, + { + "epoch": 3.2664452365521712, + "grad_norm": 0.05991069972515106, + "learning_rate": 4.075340924434043e-05, + "loss": 0.2408, + "step": 40321 + }, + { + "epoch": 3.2665262475696695, + "grad_norm": 0.060942504554986954, + "learning_rate": 4.07489085917458e-05, + "loss": 0.2189, + "step": 40322 + }, + { + "epoch": 3.2666072585871677, + "grad_norm": 0.07964294403791428, + "learning_rate": 4.0744407939151184e-05, + "loss": 0.2039, + "step": 40323 + }, + { + "epoch": 3.2666882696046664, + "grad_norm": 0.08119510114192963, + "learning_rate": 4.073990728655655e-05, + "loss": 0.2906, + "step": 40324 + }, + { + "epoch": 3.2667692806221647, + "grad_norm": 0.0648564025759697, + "learning_rate": 4.0735406633961924e-05, + "loss": 0.2205, + "step": 40325 + }, + { + "epoch": 3.266850291639663, + "grad_norm": 0.08307959139347076, + "learning_rate": 4.0730905981367304e-05, + "loss": 0.2415, + "step": 40326 + }, + { + "epoch": 3.266931302657161, + "grad_norm": 0.0625900849699974, + "learning_rate": 4.072640532877268e-05, + "loss": 0.2018, + "step": 40327 + }, + { + "epoch": 3.26701231367466, + "grad_norm": 0.06181721389293671, + "learning_rate": 4.0721904676178045e-05, + "loss": 0.2271, + "step": 40328 + }, + { + "epoch": 3.267093324692158, + "grad_norm": 0.06439773738384247, + "learning_rate": 4.0717404023583425e-05, + "loss": 0.2007, + "step": 40329 + }, + { + "epoch": 3.2671743357096563, + "grad_norm": 0.06405269354581833, + "learning_rate": 4.07129033709888e-05, + "loss": 0.2313, + "step": 40330 + }, + { + "epoch": 3.267255346727155, + "grad_norm": 0.06496585160493851, + "learning_rate": 4.0708402718394166e-05, + "loss": 0.2219, + "step": 40331 + }, + { + "epoch": 3.2673363577446533, + "grad_norm": 0.06631439924240112, + "learning_rate": 4.0703902065799546e-05, + "loss": 0.2238, + "step": 40332 + }, + { + "epoch": 3.2674173687621515, + "grad_norm": 0.06565840542316437, + "learning_rate": 4.069940141320492e-05, + "loss": 0.2012, + "step": 40333 + }, + { + "epoch": 3.26749837977965, + "grad_norm": 0.07592849433422089, + "learning_rate": 4.0694900760610286e-05, + "loss": 0.2167, + "step": 40334 + }, + { + "epoch": 3.2675793907971484, + "grad_norm": 0.08509879559278488, + "learning_rate": 4.069040010801567e-05, + "loss": 0.2258, + "step": 40335 + }, + { + "epoch": 3.2676604018146467, + "grad_norm": 0.07429090887308121, + "learning_rate": 4.068589945542104e-05, + "loss": 0.2345, + "step": 40336 + }, + { + "epoch": 3.2677414128321454, + "grad_norm": 0.05805768445134163, + "learning_rate": 4.068139880282641e-05, + "loss": 0.245, + "step": 40337 + }, + { + "epoch": 3.2678224238496436, + "grad_norm": 0.06344527751207352, + "learning_rate": 4.067689815023179e-05, + "loss": 0.2481, + "step": 40338 + }, + { + "epoch": 3.267903434867142, + "grad_norm": 0.06499172002077103, + 
"learning_rate": 4.067239749763716e-05, + "loss": 0.2008, + "step": 40339 + }, + { + "epoch": 3.2679844458846405, + "grad_norm": 0.06943495571613312, + "learning_rate": 4.0667896845042535e-05, + "loss": 0.2198, + "step": 40340 + }, + { + "epoch": 3.268065456902139, + "grad_norm": 0.08585415780544281, + "learning_rate": 4.066339619244791e-05, + "loss": 0.2425, + "step": 40341 + }, + { + "epoch": 3.268146467919637, + "grad_norm": 0.07219972461462021, + "learning_rate": 4.065889553985328e-05, + "loss": 0.2182, + "step": 40342 + }, + { + "epoch": 3.2682274789371353, + "grad_norm": 0.07773865759372711, + "learning_rate": 4.0654394887258656e-05, + "loss": 0.2571, + "step": 40343 + }, + { + "epoch": 3.268308489954634, + "grad_norm": 0.0748126357793808, + "learning_rate": 4.064989423466403e-05, + "loss": 0.2264, + "step": 40344 + }, + { + "epoch": 3.268389500972132, + "grad_norm": 0.06606455147266388, + "learning_rate": 4.06453935820694e-05, + "loss": 0.2304, + "step": 40345 + }, + { + "epoch": 3.2684705119896305, + "grad_norm": 0.05831682309508324, + "learning_rate": 4.0640892929474776e-05, + "loss": 0.2294, + "step": 40346 + }, + { + "epoch": 3.268551523007129, + "grad_norm": 0.07438673824071884, + "learning_rate": 4.063639227688015e-05, + "loss": 0.2101, + "step": 40347 + }, + { + "epoch": 3.2686325340246274, + "grad_norm": 0.08037227392196655, + "learning_rate": 4.0631891624285524e-05, + "loss": 0.2391, + "step": 40348 + }, + { + "epoch": 3.2687135450421256, + "grad_norm": 0.07156945019960403, + "learning_rate": 4.06273909716909e-05, + "loss": 0.2302, + "step": 40349 + }, + { + "epoch": 3.268794556059624, + "grad_norm": 0.07022353261709213, + "learning_rate": 4.062289031909627e-05, + "loss": 0.2187, + "step": 40350 + }, + { + "epoch": 3.2688755670771226, + "grad_norm": 0.071316197514534, + "learning_rate": 4.0618389666501644e-05, + "loss": 0.2265, + "step": 40351 + }, + { + "epoch": 3.268956578094621, + "grad_norm": 0.07702403515577316, + "learning_rate": 4.061388901390702e-05, + "loss": 0.2527, + "step": 40352 + }, + { + "epoch": 3.269037589112119, + "grad_norm": 0.07936025410890579, + "learning_rate": 4.060938836131239e-05, + "loss": 0.2151, + "step": 40353 + }, + { + "epoch": 3.2691186001296177, + "grad_norm": 0.06787516921758652, + "learning_rate": 4.0604887708717765e-05, + "loss": 0.2043, + "step": 40354 + }, + { + "epoch": 3.269199611147116, + "grad_norm": 0.08268849551677704, + "learning_rate": 4.060038705612314e-05, + "loss": 0.2357, + "step": 40355 + }, + { + "epoch": 3.2692806221646142, + "grad_norm": 0.06543977558612823, + "learning_rate": 4.059588640352851e-05, + "loss": 0.2564, + "step": 40356 + }, + { + "epoch": 3.269361633182113, + "grad_norm": 0.059460386633872986, + "learning_rate": 4.0591385750933886e-05, + "loss": 0.2374, + "step": 40357 + }, + { + "epoch": 3.269442644199611, + "grad_norm": 0.08025439828634262, + "learning_rate": 4.058688509833926e-05, + "loss": 0.2432, + "step": 40358 + }, + { + "epoch": 3.2695236552171094, + "grad_norm": 0.06559722125530243, + "learning_rate": 4.058238444574464e-05, + "loss": 0.2397, + "step": 40359 + }, + { + "epoch": 3.269604666234608, + "grad_norm": 0.05569949746131897, + "learning_rate": 4.057788379315001e-05, + "loss": 0.1952, + "step": 40360 + }, + { + "epoch": 3.2696856772521063, + "grad_norm": 0.061552103608846664, + "learning_rate": 4.057338314055538e-05, + "loss": 0.2192, + "step": 40361 + }, + { + "epoch": 3.2697666882696046, + "grad_norm": 0.06350647658109665, + "learning_rate": 4.056888248796076e-05, + "loss": 0.2239, + 
"step": 40362 + }, + { + "epoch": 3.2698476992871033, + "grad_norm": 0.07742627710103989, + "learning_rate": 4.056438183536613e-05, + "loss": 0.2484, + "step": 40363 + }, + { + "epoch": 3.2699287103046015, + "grad_norm": 0.055037204176187515, + "learning_rate": 4.05598811827715e-05, + "loss": 0.1932, + "step": 40364 + }, + { + "epoch": 3.2700097213220998, + "grad_norm": 0.06263311207294464, + "learning_rate": 4.055538053017688e-05, + "loss": 0.2252, + "step": 40365 + }, + { + "epoch": 3.270090732339598, + "grad_norm": 0.08150181174278259, + "learning_rate": 4.0550879877582255e-05, + "loss": 0.215, + "step": 40366 + }, + { + "epoch": 3.2701717433570967, + "grad_norm": 0.058354031294584274, + "learning_rate": 4.054637922498762e-05, + "loss": 0.2248, + "step": 40367 + }, + { + "epoch": 3.270252754374595, + "grad_norm": 0.04838745668530464, + "learning_rate": 4.0541878572393e-05, + "loss": 0.2019, + "step": 40368 + }, + { + "epoch": 3.270333765392093, + "grad_norm": 0.07810920476913452, + "learning_rate": 4.0537377919798376e-05, + "loss": 0.2471, + "step": 40369 + }, + { + "epoch": 3.270414776409592, + "grad_norm": 0.07518798112869263, + "learning_rate": 4.053287726720374e-05, + "loss": 0.2306, + "step": 40370 + }, + { + "epoch": 3.27049578742709, + "grad_norm": 0.07304823398590088, + "learning_rate": 4.052837661460912e-05, + "loss": 0.2235, + "step": 40371 + }, + { + "epoch": 3.2705767984445884, + "grad_norm": 0.06685718148946762, + "learning_rate": 4.05238759620145e-05, + "loss": 0.2342, + "step": 40372 + }, + { + "epoch": 3.2706578094620866, + "grad_norm": 0.07246522605419159, + "learning_rate": 4.0519375309419864e-05, + "loss": 0.2164, + "step": 40373 + }, + { + "epoch": 3.2707388204795853, + "grad_norm": 0.06804099678993225, + "learning_rate": 4.0514874656825244e-05, + "loss": 0.2291, + "step": 40374 + }, + { + "epoch": 3.2708198314970836, + "grad_norm": 0.06406091898679733, + "learning_rate": 4.051037400423062e-05, + "loss": 0.2341, + "step": 40375 + }, + { + "epoch": 3.270900842514582, + "grad_norm": 0.08121266961097717, + "learning_rate": 4.0505873351635985e-05, + "loss": 0.245, + "step": 40376 + }, + { + "epoch": 3.2709818535320805, + "grad_norm": 0.0758911669254303, + "learning_rate": 4.0501372699041365e-05, + "loss": 0.2786, + "step": 40377 + }, + { + "epoch": 3.2710628645495787, + "grad_norm": 0.06743774563074112, + "learning_rate": 4.049687204644674e-05, + "loss": 0.2404, + "step": 40378 + }, + { + "epoch": 3.271143875567077, + "grad_norm": 0.08256151527166367, + "learning_rate": 4.049237139385211e-05, + "loss": 0.1982, + "step": 40379 + }, + { + "epoch": 3.2712248865845757, + "grad_norm": 0.06504231691360474, + "learning_rate": 4.0487870741257486e-05, + "loss": 0.2153, + "step": 40380 + }, + { + "epoch": 3.271305897602074, + "grad_norm": 0.07067107409238815, + "learning_rate": 4.048337008866286e-05, + "loss": 0.2151, + "step": 40381 + }, + { + "epoch": 3.271386908619572, + "grad_norm": 0.07374674826860428, + "learning_rate": 4.047886943606823e-05, + "loss": 0.2442, + "step": 40382 + }, + { + "epoch": 3.271467919637071, + "grad_norm": 0.0919092670083046, + "learning_rate": 4.0474368783473607e-05, + "loss": 0.2523, + "step": 40383 + }, + { + "epoch": 3.271548930654569, + "grad_norm": 0.08143769204616547, + "learning_rate": 4.046986813087898e-05, + "loss": 0.237, + "step": 40384 + }, + { + "epoch": 3.2716299416720673, + "grad_norm": 0.05799148604273796, + "learning_rate": 4.0465367478284354e-05, + "loss": 0.1916, + "step": 40385 + }, + { + "epoch": 3.271710952689566, + "grad_norm": 
0.07556015253067017, + "learning_rate": 4.046086682568973e-05, + "loss": 0.2196, + "step": 40386 + }, + { + "epoch": 3.2717919637070643, + "grad_norm": 0.05838407948613167, + "learning_rate": 4.04563661730951e-05, + "loss": 0.2266, + "step": 40387 + }, + { + "epoch": 3.2718729747245625, + "grad_norm": 0.0696408823132515, + "learning_rate": 4.0451865520500475e-05, + "loss": 0.2339, + "step": 40388 + }, + { + "epoch": 3.2719539857420608, + "grad_norm": 0.07987789064645767, + "learning_rate": 4.044736486790585e-05, + "loss": 0.2522, + "step": 40389 + }, + { + "epoch": 3.2720349967595594, + "grad_norm": 0.08299795538187027, + "learning_rate": 4.044286421531122e-05, + "loss": 0.2658, + "step": 40390 + }, + { + "epoch": 3.2721160077770577, + "grad_norm": 0.0716930627822876, + "learning_rate": 4.0438363562716595e-05, + "loss": 0.2457, + "step": 40391 + }, + { + "epoch": 3.272197018794556, + "grad_norm": 0.07929502427577972, + "learning_rate": 4.0433862910121976e-05, + "loss": 0.2447, + "step": 40392 + }, + { + "epoch": 3.2722780298120546, + "grad_norm": 0.09498481452465057, + "learning_rate": 4.042936225752734e-05, + "loss": 0.2099, + "step": 40393 + }, + { + "epoch": 3.272359040829553, + "grad_norm": 0.07571402192115784, + "learning_rate": 4.0424861604932716e-05, + "loss": 0.2166, + "step": 40394 + }, + { + "epoch": 3.272440051847051, + "grad_norm": 0.060386765748262405, + "learning_rate": 4.0420360952338097e-05, + "loss": 0.225, + "step": 40395 + }, + { + "epoch": 3.2725210628645494, + "grad_norm": 0.06799789518117905, + "learning_rate": 4.041586029974346e-05, + "loss": 0.2147, + "step": 40396 + }, + { + "epoch": 3.272602073882048, + "grad_norm": 0.0849473848938942, + "learning_rate": 4.041135964714884e-05, + "loss": 0.2257, + "step": 40397 + }, + { + "epoch": 3.2726830848995463, + "grad_norm": 0.07534343749284744, + "learning_rate": 4.040685899455422e-05, + "loss": 0.2048, + "step": 40398 + }, + { + "epoch": 3.2727640959170445, + "grad_norm": 0.05903489515185356, + "learning_rate": 4.0402358341959584e-05, + "loss": 0.1867, + "step": 40399 + }, + { + "epoch": 3.2728451069345432, + "grad_norm": 0.08578018099069595, + "learning_rate": 4.039785768936496e-05, + "loss": 0.2531, + "step": 40400 + }, + { + "epoch": 3.2729261179520415, + "grad_norm": 0.06364873796701431, + "learning_rate": 4.039335703677034e-05, + "loss": 0.2331, + "step": 40401 + }, + { + "epoch": 3.2730071289695397, + "grad_norm": 0.08088508993387222, + "learning_rate": 4.0388856384175705e-05, + "loss": 0.2451, + "step": 40402 + }, + { + "epoch": 3.2730881399870384, + "grad_norm": 0.07197795808315277, + "learning_rate": 4.038435573158108e-05, + "loss": 0.2527, + "step": 40403 + }, + { + "epoch": 3.2731691510045366, + "grad_norm": 0.06356197595596313, + "learning_rate": 4.037985507898646e-05, + "loss": 0.2293, + "step": 40404 + }, + { + "epoch": 3.273250162022035, + "grad_norm": 0.0621635764837265, + "learning_rate": 4.037535442639183e-05, + "loss": 0.2317, + "step": 40405 + }, + { + "epoch": 3.2733311730395336, + "grad_norm": 0.07144168764352798, + "learning_rate": 4.03708537737972e-05, + "loss": 0.1993, + "step": 40406 + }, + { + "epoch": 3.273412184057032, + "grad_norm": 0.06132879480719566, + "learning_rate": 4.036635312120258e-05, + "loss": 0.2408, + "step": 40407 + }, + { + "epoch": 3.27349319507453, + "grad_norm": 0.08265918493270874, + "learning_rate": 4.036185246860795e-05, + "loss": 0.2241, + "step": 40408 + }, + { + "epoch": 3.2735742060920288, + "grad_norm": 0.06276227533817291, + "learning_rate": 4.035735181601332e-05, + 
"loss": 0.2331, + "step": 40409 + }, + { + "epoch": 3.273655217109527, + "grad_norm": 0.06253093481063843, + "learning_rate": 4.03528511634187e-05, + "loss": 0.2156, + "step": 40410 + }, + { + "epoch": 3.2737362281270252, + "grad_norm": 0.0725327879190445, + "learning_rate": 4.0348350510824074e-05, + "loss": 0.2435, + "step": 40411 + }, + { + "epoch": 3.2738172391445235, + "grad_norm": 0.0664612203836441, + "learning_rate": 4.034384985822944e-05, + "loss": 0.2163, + "step": 40412 + }, + { + "epoch": 3.273898250162022, + "grad_norm": 0.05309682711958885, + "learning_rate": 4.033934920563482e-05, + "loss": 0.2277, + "step": 40413 + }, + { + "epoch": 3.2739792611795204, + "grad_norm": 0.0698176920413971, + "learning_rate": 4.0334848553040195e-05, + "loss": 0.2574, + "step": 40414 + }, + { + "epoch": 3.2740602721970187, + "grad_norm": 0.07634953409433365, + "learning_rate": 4.033034790044556e-05, + "loss": 0.2099, + "step": 40415 + }, + { + "epoch": 3.2741412832145174, + "grad_norm": 0.054520655423402786, + "learning_rate": 4.032584724785094e-05, + "loss": 0.2122, + "step": 40416 + }, + { + "epoch": 3.2742222942320156, + "grad_norm": 0.06059606000781059, + "learning_rate": 4.0321346595256316e-05, + "loss": 0.221, + "step": 40417 + }, + { + "epoch": 3.274303305249514, + "grad_norm": 0.06204358860850334, + "learning_rate": 4.031684594266168e-05, + "loss": 0.2128, + "step": 40418 + }, + { + "epoch": 3.274384316267012, + "grad_norm": 0.06221948191523552, + "learning_rate": 4.031234529006706e-05, + "loss": 0.1796, + "step": 40419 + }, + { + "epoch": 3.274465327284511, + "grad_norm": 0.0861034169793129, + "learning_rate": 4.0307844637472437e-05, + "loss": 0.2372, + "step": 40420 + }, + { + "epoch": 3.274546338302009, + "grad_norm": 0.08388126641511917, + "learning_rate": 4.030334398487781e-05, + "loss": 0.232, + "step": 40421 + }, + { + "epoch": 3.2746273493195073, + "grad_norm": 0.06882581114768982, + "learning_rate": 4.0298843332283184e-05, + "loss": 0.2234, + "step": 40422 + }, + { + "epoch": 3.274708360337006, + "grad_norm": 0.07220038771629333, + "learning_rate": 4.029434267968856e-05, + "loss": 0.2176, + "step": 40423 + }, + { + "epoch": 3.274789371354504, + "grad_norm": 0.07071521878242493, + "learning_rate": 4.028984202709393e-05, + "loss": 0.2071, + "step": 40424 + }, + { + "epoch": 3.2748703823720025, + "grad_norm": 0.07368500530719757, + "learning_rate": 4.0285341374499305e-05, + "loss": 0.2279, + "step": 40425 + }, + { + "epoch": 3.274951393389501, + "grad_norm": 0.058155667036771774, + "learning_rate": 4.028084072190468e-05, + "loss": 0.1896, + "step": 40426 + }, + { + "epoch": 3.2750324044069994, + "grad_norm": 0.07234590500593185, + "learning_rate": 4.027634006931005e-05, + "loss": 0.2005, + "step": 40427 + }, + { + "epoch": 3.2751134154244976, + "grad_norm": 0.08725034445524216, + "learning_rate": 4.0271839416715425e-05, + "loss": 0.2611, + "step": 40428 + }, + { + "epoch": 3.2751944264419963, + "grad_norm": 0.0634271427989006, + "learning_rate": 4.02673387641208e-05, + "loss": 0.2076, + "step": 40429 + }, + { + "epoch": 3.2752754374594946, + "grad_norm": 0.08502479642629623, + "learning_rate": 4.026283811152617e-05, + "loss": 0.2315, + "step": 40430 + }, + { + "epoch": 3.275356448476993, + "grad_norm": 0.0666709840297699, + "learning_rate": 4.0258337458931546e-05, + "loss": 0.2061, + "step": 40431 + }, + { + "epoch": 3.2754374594944915, + "grad_norm": 0.06762423366308212, + "learning_rate": 4.025383680633692e-05, + "loss": 0.212, + "step": 40432 + }, + { + "epoch": 
3.2755184705119897, + "grad_norm": 0.07069918513298035, + "learning_rate": 4.0249336153742293e-05, + "loss": 0.2186, + "step": 40433 + }, + { + "epoch": 3.275599481529488, + "grad_norm": 0.08242760598659515, + "learning_rate": 4.0244835501147674e-05, + "loss": 0.2641, + "step": 40434 + }, + { + "epoch": 3.2756804925469862, + "grad_norm": 0.07050781697034836, + "learning_rate": 4.024033484855304e-05, + "loss": 0.2126, + "step": 40435 + }, + { + "epoch": 3.275761503564485, + "grad_norm": 0.06657261401414871, + "learning_rate": 4.0235834195958414e-05, + "loss": 0.2132, + "step": 40436 + }, + { + "epoch": 3.275842514581983, + "grad_norm": 0.07559150457382202, + "learning_rate": 4.0231333543363795e-05, + "loss": 0.2111, + "step": 40437 + }, + { + "epoch": 3.2759235255994814, + "grad_norm": 0.08614648133516312, + "learning_rate": 4.022683289076916e-05, + "loss": 0.2746, + "step": 40438 + }, + { + "epoch": 3.27600453661698, + "grad_norm": 0.06892319023609161, + "learning_rate": 4.0222332238174535e-05, + "loss": 0.2025, + "step": 40439 + }, + { + "epoch": 3.2760855476344783, + "grad_norm": 0.06139829382300377, + "learning_rate": 4.0217831585579915e-05, + "loss": 0.2142, + "step": 40440 + }, + { + "epoch": 3.2761665586519766, + "grad_norm": 0.06792233884334564, + "learning_rate": 4.021333093298528e-05, + "loss": 0.2377, + "step": 40441 + }, + { + "epoch": 3.276247569669475, + "grad_norm": 0.08298711478710175, + "learning_rate": 4.0208830280390656e-05, + "loss": 0.2313, + "step": 40442 + }, + { + "epoch": 3.2763285806869735, + "grad_norm": 0.0739048421382904, + "learning_rate": 4.0204329627796036e-05, + "loss": 0.2191, + "step": 40443 + }, + { + "epoch": 3.2764095917044718, + "grad_norm": 0.07907218486070633, + "learning_rate": 4.01998289752014e-05, + "loss": 0.223, + "step": 40444 + }, + { + "epoch": 3.27649060272197, + "grad_norm": 0.08246040344238281, + "learning_rate": 4.019532832260678e-05, + "loss": 0.2294, + "step": 40445 + }, + { + "epoch": 3.2765716137394687, + "grad_norm": 0.0847393274307251, + "learning_rate": 4.019082767001216e-05, + "loss": 0.2493, + "step": 40446 + }, + { + "epoch": 3.276652624756967, + "grad_norm": 0.08675988763570786, + "learning_rate": 4.018632701741753e-05, + "loss": 0.2401, + "step": 40447 + }, + { + "epoch": 3.276733635774465, + "grad_norm": 0.06091444939374924, + "learning_rate": 4.01818263648229e-05, + "loss": 0.2607, + "step": 40448 + }, + { + "epoch": 3.276814646791964, + "grad_norm": 0.06214442104101181, + "learning_rate": 4.017732571222828e-05, + "loss": 0.2137, + "step": 40449 + }, + { + "epoch": 3.276895657809462, + "grad_norm": 0.06971146911382675, + "learning_rate": 4.017282505963365e-05, + "loss": 0.2195, + "step": 40450 + }, + { + "epoch": 3.2769766688269604, + "grad_norm": 0.059449177235364914, + "learning_rate": 4.016832440703902e-05, + "loss": 0.2032, + "step": 40451 + }, + { + "epoch": 3.277057679844459, + "grad_norm": 0.08385682851076126, + "learning_rate": 4.01638237544444e-05, + "loss": 0.2421, + "step": 40452 + }, + { + "epoch": 3.2771386908619573, + "grad_norm": 0.08064158260822296, + "learning_rate": 4.015932310184977e-05, + "loss": 0.2353, + "step": 40453 + }, + { + "epoch": 3.2772197018794555, + "grad_norm": 0.09409360587596893, + "learning_rate": 4.015482244925514e-05, + "loss": 0.2285, + "step": 40454 + }, + { + "epoch": 3.277300712896954, + "grad_norm": 0.06640844792127609, + "learning_rate": 4.015032179666052e-05, + "loss": 0.2104, + "step": 40455 + }, + { + "epoch": 3.2773817239144525, + "grad_norm": 0.07138046622276306, + 
"learning_rate": 4.014582114406589e-05, + "loss": 0.2271, + "step": 40456 + }, + { + "epoch": 3.2774627349319507, + "grad_norm": 0.07712315022945404, + "learning_rate": 4.014132049147126e-05, + "loss": 0.2353, + "step": 40457 + }, + { + "epoch": 3.277543745949449, + "grad_norm": 0.06329129636287689, + "learning_rate": 4.013681983887664e-05, + "loss": 0.2241, + "step": 40458 + }, + { + "epoch": 3.2776247569669477, + "grad_norm": 0.06462210416793823, + "learning_rate": 4.0132319186282014e-05, + "loss": 0.224, + "step": 40459 + }, + { + "epoch": 3.277705767984446, + "grad_norm": 0.0607602559030056, + "learning_rate": 4.012781853368739e-05, + "loss": 0.2053, + "step": 40460 + }, + { + "epoch": 3.277786779001944, + "grad_norm": 0.06479512155056, + "learning_rate": 4.012331788109276e-05, + "loss": 0.1935, + "step": 40461 + }, + { + "epoch": 3.2778677900194424, + "grad_norm": 0.06079896539449692, + "learning_rate": 4.0118817228498135e-05, + "loss": 0.2833, + "step": 40462 + }, + { + "epoch": 3.277948801036941, + "grad_norm": 0.06312386691570282, + "learning_rate": 4.011431657590351e-05, + "loss": 0.2488, + "step": 40463 + }, + { + "epoch": 3.2780298120544393, + "grad_norm": 0.0699615329504013, + "learning_rate": 4.010981592330888e-05, + "loss": 0.2318, + "step": 40464 + }, + { + "epoch": 3.2781108230719376, + "grad_norm": 0.07368670403957367, + "learning_rate": 4.0105315270714256e-05, + "loss": 0.2467, + "step": 40465 + }, + { + "epoch": 3.2781918340894363, + "grad_norm": 0.08029686659574509, + "learning_rate": 4.010081461811963e-05, + "loss": 0.2667, + "step": 40466 + }, + { + "epoch": 3.2782728451069345, + "grad_norm": 0.06958233565092087, + "learning_rate": 4.0096313965525e-05, + "loss": 0.2282, + "step": 40467 + }, + { + "epoch": 3.2783538561244328, + "grad_norm": 0.059386830776929855, + "learning_rate": 4.0091813312930376e-05, + "loss": 0.1847, + "step": 40468 + }, + { + "epoch": 3.2784348671419314, + "grad_norm": 0.0706775113940239, + "learning_rate": 4.008731266033575e-05, + "loss": 0.2304, + "step": 40469 + }, + { + "epoch": 3.2785158781594297, + "grad_norm": 0.07206568121910095, + "learning_rate": 4.0082812007741124e-05, + "loss": 0.1964, + "step": 40470 + }, + { + "epoch": 3.278596889176928, + "grad_norm": 0.06114106625318527, + "learning_rate": 4.00783113551465e-05, + "loss": 0.2166, + "step": 40471 + }, + { + "epoch": 3.2786779001944266, + "grad_norm": 0.06136857345700264, + "learning_rate": 4.007381070255187e-05, + "loss": 0.2232, + "step": 40472 + }, + { + "epoch": 3.278758911211925, + "grad_norm": 0.07858065515756607, + "learning_rate": 4.006931004995725e-05, + "loss": 0.2318, + "step": 40473 + }, + { + "epoch": 3.278839922229423, + "grad_norm": 0.07737397402524948, + "learning_rate": 4.006480939736262e-05, + "loss": 0.2279, + "step": 40474 + }, + { + "epoch": 3.278920933246922, + "grad_norm": 0.06615752726793289, + "learning_rate": 4.006030874476799e-05, + "loss": 0.2415, + "step": 40475 + }, + { + "epoch": 3.27900194426442, + "grad_norm": 0.06866739690303802, + "learning_rate": 4.005580809217337e-05, + "loss": 0.2021, + "step": 40476 + }, + { + "epoch": 3.2790829552819183, + "grad_norm": 0.06152362748980522, + "learning_rate": 4.005130743957874e-05, + "loss": 0.2113, + "step": 40477 + }, + { + "epoch": 3.2791639662994165, + "grad_norm": 0.06418390572071075, + "learning_rate": 4.004680678698411e-05, + "loss": 0.2114, + "step": 40478 + }, + { + "epoch": 3.279244977316915, + "grad_norm": 0.06302060931921005, + "learning_rate": 4.004230613438949e-05, + "loss": 0.2226, + "step": 40479 
+ }, + { + "epoch": 3.2793259883344135, + "grad_norm": 0.0747298002243042, + "learning_rate": 4.003780548179486e-05, + "loss": 0.2506, + "step": 40480 + }, + { + "epoch": 3.2794069993519117, + "grad_norm": 0.10246086120605469, + "learning_rate": 4.003330482920023e-05, + "loss": 0.2145, + "step": 40481 + }, + { + "epoch": 3.2794880103694104, + "grad_norm": 0.07003115862607956, + "learning_rate": 4.0028804176605614e-05, + "loss": 0.2032, + "step": 40482 + }, + { + "epoch": 3.2795690213869086, + "grad_norm": 0.06867416203022003, + "learning_rate": 4.002430352401098e-05, + "loss": 0.2116, + "step": 40483 + }, + { + "epoch": 3.279650032404407, + "grad_norm": 0.07221326977014542, + "learning_rate": 4.0019802871416354e-05, + "loss": 0.2057, + "step": 40484 + }, + { + "epoch": 3.279731043421905, + "grad_norm": 0.0709087997674942, + "learning_rate": 4.0015302218821734e-05, + "loss": 0.2183, + "step": 40485 + }, + { + "epoch": 3.279812054439404, + "grad_norm": 0.06863710284233093, + "learning_rate": 4.001080156622711e-05, + "loss": 0.2072, + "step": 40486 + }, + { + "epoch": 3.279893065456902, + "grad_norm": 0.07001881301403046, + "learning_rate": 4.0006300913632475e-05, + "loss": 0.2213, + "step": 40487 + }, + { + "epoch": 3.2799740764744003, + "grad_norm": 0.06810453534126282, + "learning_rate": 4.0001800261037855e-05, + "loss": 0.2266, + "step": 40488 + }, + { + "epoch": 3.280055087491899, + "grad_norm": 0.0766795426607132, + "learning_rate": 3.999729960844323e-05, + "loss": 0.2426, + "step": 40489 + }, + { + "epoch": 3.2801360985093972, + "grad_norm": 0.08484476059675217, + "learning_rate": 3.9992798955848596e-05, + "loss": 0.2461, + "step": 40490 + }, + { + "epoch": 3.2802171095268955, + "grad_norm": 0.07386729121208191, + "learning_rate": 3.9988298303253976e-05, + "loss": 0.2298, + "step": 40491 + }, + { + "epoch": 3.280298120544394, + "grad_norm": 0.0644812285900116, + "learning_rate": 3.998379765065935e-05, + "loss": 0.2276, + "step": 40492 + }, + { + "epoch": 3.2803791315618924, + "grad_norm": 0.060633908957242966, + "learning_rate": 3.9979296998064716e-05, + "loss": 0.2299, + "step": 40493 + }, + { + "epoch": 3.2804601425793907, + "grad_norm": 0.07635070383548737, + "learning_rate": 3.99747963454701e-05, + "loss": 0.2038, + "step": 40494 + }, + { + "epoch": 3.2805411535968894, + "grad_norm": 0.06670946627855301, + "learning_rate": 3.997029569287547e-05, + "loss": 0.219, + "step": 40495 + }, + { + "epoch": 3.2806221646143876, + "grad_norm": 0.08006652444601059, + "learning_rate": 3.996579504028084e-05, + "loss": 0.2482, + "step": 40496 + }, + { + "epoch": 3.280703175631886, + "grad_norm": 0.07331784814596176, + "learning_rate": 3.996129438768622e-05, + "loss": 0.2031, + "step": 40497 + }, + { + "epoch": 3.2807841866493845, + "grad_norm": 0.08051098138093948, + "learning_rate": 3.995679373509159e-05, + "loss": 0.211, + "step": 40498 + }, + { + "epoch": 3.280865197666883, + "grad_norm": 0.06945125758647919, + "learning_rate": 3.9952293082496965e-05, + "loss": 0.2323, + "step": 40499 + }, + { + "epoch": 3.280946208684381, + "grad_norm": 0.0819791629910469, + "learning_rate": 3.994779242990234e-05, + "loss": 0.2157, + "step": 40500 + }, + { + "epoch": 3.2810272197018793, + "grad_norm": 0.07022712379693985, + "learning_rate": 3.994329177730771e-05, + "loss": 0.2277, + "step": 40501 + }, + { + "epoch": 3.281108230719378, + "grad_norm": 0.07423558086156845, + "learning_rate": 3.9938791124713086e-05, + "loss": 0.2255, + "step": 40502 + }, + { + "epoch": 3.281189241736876, + "grad_norm": 
0.0815899595618248, + "learning_rate": 3.993429047211846e-05, + "loss": 0.2152, + "step": 40503 + }, + { + "epoch": 3.2812702527543745, + "grad_norm": 0.06895309686660767, + "learning_rate": 3.992978981952383e-05, + "loss": 0.2385, + "step": 40504 + }, + { + "epoch": 3.281351263771873, + "grad_norm": 0.06182735040783882, + "learning_rate": 3.9925289166929206e-05, + "loss": 0.1874, + "step": 40505 + }, + { + "epoch": 3.2814322747893714, + "grad_norm": 0.09717794507741928, + "learning_rate": 3.992078851433458e-05, + "loss": 0.2255, + "step": 40506 + }, + { + "epoch": 3.2815132858068696, + "grad_norm": 0.06994237005710602, + "learning_rate": 3.9916287861739954e-05, + "loss": 0.2349, + "step": 40507 + }, + { + "epoch": 3.281594296824368, + "grad_norm": 0.07174836099147797, + "learning_rate": 3.991178720914533e-05, + "loss": 0.2341, + "step": 40508 + }, + { + "epoch": 3.2816753078418666, + "grad_norm": 0.06857135891914368, + "learning_rate": 3.99072865565507e-05, + "loss": 0.2258, + "step": 40509 + }, + { + "epoch": 3.281756318859365, + "grad_norm": 0.06680644303560257, + "learning_rate": 3.9902785903956074e-05, + "loss": 0.2283, + "step": 40510 + }, + { + "epoch": 3.281837329876863, + "grad_norm": 0.058539021760225296, + "learning_rate": 3.989828525136145e-05, + "loss": 0.214, + "step": 40511 + }, + { + "epoch": 3.2819183408943617, + "grad_norm": 0.07493918389081955, + "learning_rate": 3.989378459876683e-05, + "loss": 0.1984, + "step": 40512 + }, + { + "epoch": 3.28199935191186, + "grad_norm": 0.0886545479297638, + "learning_rate": 3.9889283946172195e-05, + "loss": 0.2976, + "step": 40513 + }, + { + "epoch": 3.2820803629293582, + "grad_norm": 0.06237650662660599, + "learning_rate": 3.988478329357757e-05, + "loss": 0.2202, + "step": 40514 + }, + { + "epoch": 3.282161373946857, + "grad_norm": 0.07362060993909836, + "learning_rate": 3.988028264098295e-05, + "loss": 0.2323, + "step": 40515 + }, + { + "epoch": 3.282242384964355, + "grad_norm": 0.06613124161958694, + "learning_rate": 3.9875781988388316e-05, + "loss": 0.2699, + "step": 40516 + }, + { + "epoch": 3.2823233959818534, + "grad_norm": 0.06270667165517807, + "learning_rate": 3.987128133579369e-05, + "loss": 0.2224, + "step": 40517 + }, + { + "epoch": 3.282404406999352, + "grad_norm": 0.061522651463747025, + "learning_rate": 3.986678068319907e-05, + "loss": 0.2401, + "step": 40518 + }, + { + "epoch": 3.2824854180168503, + "grad_norm": 0.07176893949508667, + "learning_rate": 3.986228003060444e-05, + "loss": 0.2344, + "step": 40519 + }, + { + "epoch": 3.2825664290343486, + "grad_norm": 0.07144494354724884, + "learning_rate": 3.985777937800981e-05, + "loss": 0.2687, + "step": 40520 + }, + { + "epoch": 3.2826474400518473, + "grad_norm": 0.08052974939346313, + "learning_rate": 3.985327872541519e-05, + "loss": 0.2198, + "step": 40521 + }, + { + "epoch": 3.2827284510693455, + "grad_norm": 0.07403968274593353, + "learning_rate": 3.984877807282056e-05, + "loss": 0.2515, + "step": 40522 + }, + { + "epoch": 3.2828094620868438, + "grad_norm": 0.06775449961423874, + "learning_rate": 3.984427742022593e-05, + "loss": 0.2263, + "step": 40523 + }, + { + "epoch": 3.282890473104342, + "grad_norm": 0.06973139196634293, + "learning_rate": 3.983977676763131e-05, + "loss": 0.2733, + "step": 40524 + }, + { + "epoch": 3.2829714841218407, + "grad_norm": 0.07829160243272781, + "learning_rate": 3.9835276115036685e-05, + "loss": 0.2271, + "step": 40525 + }, + { + "epoch": 3.283052495139339, + "grad_norm": 0.06388890743255615, + "learning_rate": 3.983077546244205e-05, + 
"loss": 0.202, + "step": 40526 + }, + { + "epoch": 3.283133506156837, + "grad_norm": 0.06818761676549911, + "learning_rate": 3.982627480984743e-05, + "loss": 0.2385, + "step": 40527 + }, + { + "epoch": 3.283214517174336, + "grad_norm": 0.06943784654140472, + "learning_rate": 3.9821774157252806e-05, + "loss": 0.213, + "step": 40528 + }, + { + "epoch": 3.283295528191834, + "grad_norm": 0.08590265363454819, + "learning_rate": 3.981727350465817e-05, + "loss": 0.2111, + "step": 40529 + }, + { + "epoch": 3.2833765392093324, + "grad_norm": 0.06639125943183899, + "learning_rate": 3.981277285206355e-05, + "loss": 0.278, + "step": 40530 + }, + { + "epoch": 3.2834575502268306, + "grad_norm": 0.0770188719034195, + "learning_rate": 3.980827219946893e-05, + "loss": 0.2138, + "step": 40531 + }, + { + "epoch": 3.2835385612443293, + "grad_norm": 0.06980788707733154, + "learning_rate": 3.9803771546874294e-05, + "loss": 0.2119, + "step": 40532 + }, + { + "epoch": 3.2836195722618275, + "grad_norm": 0.07194875180721283, + "learning_rate": 3.9799270894279674e-05, + "loss": 0.1955, + "step": 40533 + }, + { + "epoch": 3.283700583279326, + "grad_norm": 0.06898615509271622, + "learning_rate": 3.979477024168505e-05, + "loss": 0.2319, + "step": 40534 + }, + { + "epoch": 3.2837815942968245, + "grad_norm": 0.08618927747011185, + "learning_rate": 3.9790269589090415e-05, + "loss": 0.2448, + "step": 40535 + }, + { + "epoch": 3.2838626053143227, + "grad_norm": 0.08610135316848755, + "learning_rate": 3.9785768936495795e-05, + "loss": 0.2309, + "step": 40536 + }, + { + "epoch": 3.283943616331821, + "grad_norm": 0.06609229743480682, + "learning_rate": 3.978126828390117e-05, + "loss": 0.2198, + "step": 40537 + }, + { + "epoch": 3.2840246273493197, + "grad_norm": 0.08390206843614578, + "learning_rate": 3.977676763130654e-05, + "loss": 0.2391, + "step": 40538 + }, + { + "epoch": 3.284105638366818, + "grad_norm": 0.06741616129875183, + "learning_rate": 3.9772266978711916e-05, + "loss": 0.1953, + "step": 40539 + }, + { + "epoch": 3.284186649384316, + "grad_norm": 0.059727005660533905, + "learning_rate": 3.976776632611729e-05, + "loss": 0.2071, + "step": 40540 + }, + { + "epoch": 3.284267660401815, + "grad_norm": 0.06814241409301758, + "learning_rate": 3.976326567352266e-05, + "loss": 0.2035, + "step": 40541 + }, + { + "epoch": 3.284348671419313, + "grad_norm": 0.04796573519706726, + "learning_rate": 3.9758765020928037e-05, + "loss": 0.1851, + "step": 40542 + }, + { + "epoch": 3.2844296824368113, + "grad_norm": 0.08344428241252899, + "learning_rate": 3.975426436833341e-05, + "loss": 0.2263, + "step": 40543 + }, + { + "epoch": 3.28451069345431, + "grad_norm": 0.0701231062412262, + "learning_rate": 3.9749763715738784e-05, + "loss": 0.2233, + "step": 40544 + }, + { + "epoch": 3.2845917044718083, + "grad_norm": 0.06695102900266647, + "learning_rate": 3.974526306314416e-05, + "loss": 0.2326, + "step": 40545 + }, + { + "epoch": 3.2846727154893065, + "grad_norm": 0.09770773351192474, + "learning_rate": 3.974076241054953e-05, + "loss": 0.2145, + "step": 40546 + }, + { + "epoch": 3.2847537265068047, + "grad_norm": 0.08435174822807312, + "learning_rate": 3.9736261757954905e-05, + "loss": 0.2209, + "step": 40547 + }, + { + "epoch": 3.2848347375243034, + "grad_norm": 0.0652317926287651, + "learning_rate": 3.973176110536028e-05, + "loss": 0.2079, + "step": 40548 + }, + { + "epoch": 3.2849157485418017, + "grad_norm": 0.0690438449382782, + "learning_rate": 3.972726045276565e-05, + "loss": 0.2337, + "step": 40549 + }, + { + "epoch": 
3.2849967595593, + "grad_norm": 0.07208141684532166, + "learning_rate": 3.9722759800171025e-05, + "loss": 0.2534, + "step": 40550 + }, + { + "epoch": 3.2850777705767986, + "grad_norm": 0.0637679249048233, + "learning_rate": 3.9718259147576406e-05, + "loss": 0.2205, + "step": 40551 + }, + { + "epoch": 3.285158781594297, + "grad_norm": 0.08957676589488983, + "learning_rate": 3.971375849498177e-05, + "loss": 0.225, + "step": 40552 + }, + { + "epoch": 3.285239792611795, + "grad_norm": 0.07437330484390259, + "learning_rate": 3.9709257842387146e-05, + "loss": 0.2674, + "step": 40553 + }, + { + "epoch": 3.2853208036292934, + "grad_norm": 0.07008083164691925, + "learning_rate": 3.9704757189792527e-05, + "loss": 0.222, + "step": 40554 + }, + { + "epoch": 3.285401814646792, + "grad_norm": 0.08292362093925476, + "learning_rate": 3.970025653719789e-05, + "loss": 0.242, + "step": 40555 + }, + { + "epoch": 3.2854828256642903, + "grad_norm": 0.08934476226568222, + "learning_rate": 3.969575588460327e-05, + "loss": 0.2498, + "step": 40556 + }, + { + "epoch": 3.2855638366817885, + "grad_norm": 0.0667760893702507, + "learning_rate": 3.969125523200865e-05, + "loss": 0.2217, + "step": 40557 + }, + { + "epoch": 3.285644847699287, + "grad_norm": 0.06794371455907822, + "learning_rate": 3.9686754579414014e-05, + "loss": 0.2221, + "step": 40558 + }, + { + "epoch": 3.2857258587167855, + "grad_norm": 0.06920351088047028, + "learning_rate": 3.968225392681939e-05, + "loss": 0.2646, + "step": 40559 + }, + { + "epoch": 3.2858068697342837, + "grad_norm": 0.06409312784671783, + "learning_rate": 3.967775327422477e-05, + "loss": 0.2163, + "step": 40560 + }, + { + "epoch": 3.2858878807517824, + "grad_norm": 0.10056743025779724, + "learning_rate": 3.9673252621630135e-05, + "loss": 0.2238, + "step": 40561 + }, + { + "epoch": 3.2859688917692806, + "grad_norm": 0.06588996201753616, + "learning_rate": 3.966875196903551e-05, + "loss": 0.2304, + "step": 40562 + }, + { + "epoch": 3.286049902786779, + "grad_norm": 0.06542570888996124, + "learning_rate": 3.966425131644089e-05, + "loss": 0.2056, + "step": 40563 + }, + { + "epoch": 3.2861309138042776, + "grad_norm": 0.08661080151796341, + "learning_rate": 3.965975066384626e-05, + "loss": 0.2425, + "step": 40564 + }, + { + "epoch": 3.286211924821776, + "grad_norm": 0.0732247456908226, + "learning_rate": 3.965525001125163e-05, + "loss": 0.2573, + "step": 40565 + }, + { + "epoch": 3.286292935839274, + "grad_norm": 0.1030229851603508, + "learning_rate": 3.965074935865701e-05, + "loss": 0.2844, + "step": 40566 + }, + { + "epoch": 3.2863739468567728, + "grad_norm": 0.07147184759378433, + "learning_rate": 3.9646248706062383e-05, + "loss": 0.2216, + "step": 40567 + }, + { + "epoch": 3.286454957874271, + "grad_norm": 0.060462355613708496, + "learning_rate": 3.964174805346775e-05, + "loss": 0.208, + "step": 40568 + }, + { + "epoch": 3.2865359688917692, + "grad_norm": 0.06986522674560547, + "learning_rate": 3.963724740087313e-05, + "loss": 0.2047, + "step": 40569 + }, + { + "epoch": 3.2866169799092675, + "grad_norm": 0.070008285343647, + "learning_rate": 3.9632746748278504e-05, + "loss": 0.2261, + "step": 40570 + }, + { + "epoch": 3.286697990926766, + "grad_norm": 0.0804152712225914, + "learning_rate": 3.962824609568387e-05, + "loss": 0.2305, + "step": 40571 + }, + { + "epoch": 3.2867790019442644, + "grad_norm": 0.06697341054677963, + "learning_rate": 3.962374544308925e-05, + "loss": 0.2189, + "step": 40572 + }, + { + "epoch": 3.2868600129617627, + "grad_norm": 0.06043664366006851, + 
"learning_rate": 3.9619244790494625e-05, + "loss": 0.2129, + "step": 40573 + }, + { + "epoch": 3.2869410239792614, + "grad_norm": 0.07191526889801025, + "learning_rate": 3.961474413789999e-05, + "loss": 0.2602, + "step": 40574 + }, + { + "epoch": 3.2870220349967596, + "grad_norm": 0.07072187960147858, + "learning_rate": 3.961024348530537e-05, + "loss": 0.2137, + "step": 40575 + }, + { + "epoch": 3.287103046014258, + "grad_norm": 0.0730489045381546, + "learning_rate": 3.9605742832710746e-05, + "loss": 0.223, + "step": 40576 + }, + { + "epoch": 3.287184057031756, + "grad_norm": 0.09229131042957306, + "learning_rate": 3.960124218011611e-05, + "loss": 0.2081, + "step": 40577 + }, + { + "epoch": 3.287265068049255, + "grad_norm": 0.0711924135684967, + "learning_rate": 3.959674152752149e-05, + "loss": 0.1944, + "step": 40578 + }, + { + "epoch": 3.287346079066753, + "grad_norm": 0.10902519524097443, + "learning_rate": 3.959224087492687e-05, + "loss": 0.2063, + "step": 40579 + }, + { + "epoch": 3.2874270900842513, + "grad_norm": 0.0829775333404541, + "learning_rate": 3.958774022233224e-05, + "loss": 0.2497, + "step": 40580 + }, + { + "epoch": 3.28750810110175, + "grad_norm": 0.06518006324768066, + "learning_rate": 3.9583239569737614e-05, + "loss": 0.2016, + "step": 40581 + }, + { + "epoch": 3.287589112119248, + "grad_norm": 0.0657275915145874, + "learning_rate": 3.957873891714299e-05, + "loss": 0.2508, + "step": 40582 + }, + { + "epoch": 3.2876701231367464, + "grad_norm": 0.0721287801861763, + "learning_rate": 3.957423826454836e-05, + "loss": 0.2437, + "step": 40583 + }, + { + "epoch": 3.287751134154245, + "grad_norm": 0.057864025235176086, + "learning_rate": 3.9569737611953735e-05, + "loss": 0.2423, + "step": 40584 + }, + { + "epoch": 3.2878321451717434, + "grad_norm": 0.08614750951528549, + "learning_rate": 3.956523695935911e-05, + "loss": 0.2137, + "step": 40585 + }, + { + "epoch": 3.2879131561892416, + "grad_norm": 0.0680968388915062, + "learning_rate": 3.956073630676448e-05, + "loss": 0.2327, + "step": 40586 + }, + { + "epoch": 3.2879941672067403, + "grad_norm": 0.05906018987298012, + "learning_rate": 3.9556235654169855e-05, + "loss": 0.2161, + "step": 40587 + }, + { + "epoch": 3.2880751782242386, + "grad_norm": 0.061834465712308884, + "learning_rate": 3.955173500157523e-05, + "loss": 0.2365, + "step": 40588 + }, + { + "epoch": 3.288156189241737, + "grad_norm": 0.07073098421096802, + "learning_rate": 3.95472343489806e-05, + "loss": 0.2204, + "step": 40589 + }, + { + "epoch": 3.2882372002592355, + "grad_norm": 0.0943407267332077, + "learning_rate": 3.9542733696385976e-05, + "loss": 0.2297, + "step": 40590 + }, + { + "epoch": 3.2883182112767337, + "grad_norm": 0.0708475187420845, + "learning_rate": 3.953823304379135e-05, + "loss": 0.1893, + "step": 40591 + }, + { + "epoch": 3.288399222294232, + "grad_norm": 0.09488806873559952, + "learning_rate": 3.9533732391196723e-05, + "loss": 0.2665, + "step": 40592 + }, + { + "epoch": 3.2884802333117302, + "grad_norm": 0.06400935351848602, + "learning_rate": 3.9529231738602104e-05, + "loss": 0.228, + "step": 40593 + }, + { + "epoch": 3.288561244329229, + "grad_norm": 0.0658169537782669, + "learning_rate": 3.952473108600747e-05, + "loss": 0.1969, + "step": 40594 + }, + { + "epoch": 3.288642255346727, + "grad_norm": 0.07230688631534576, + "learning_rate": 3.9520230433412844e-05, + "loss": 0.2115, + "step": 40595 + }, + { + "epoch": 3.2887232663642254, + "grad_norm": 0.0746084600687027, + "learning_rate": 3.9515729780818225e-05, + "loss": 0.2529, + "step": 
40596 + }, + { + "epoch": 3.288804277381724, + "grad_norm": 0.0718778520822525, + "learning_rate": 3.951122912822359e-05, + "loss": 0.2176, + "step": 40597 + }, + { + "epoch": 3.2888852883992223, + "grad_norm": 0.059904858469963074, + "learning_rate": 3.9506728475628965e-05, + "loss": 0.2094, + "step": 40598 + }, + { + "epoch": 3.2889662994167206, + "grad_norm": 0.0675879716873169, + "learning_rate": 3.9502227823034345e-05, + "loss": 0.2341, + "step": 40599 + }, + { + "epoch": 3.289047310434219, + "grad_norm": 0.08008304983377457, + "learning_rate": 3.949772717043971e-05, + "loss": 0.2378, + "step": 40600 + }, + { + "epoch": 3.2891283214517175, + "grad_norm": 0.084351547062397, + "learning_rate": 3.9493226517845086e-05, + "loss": 0.1937, + "step": 40601 + }, + { + "epoch": 3.2892093324692158, + "grad_norm": 0.06926941871643066, + "learning_rate": 3.9488725865250466e-05, + "loss": 0.2311, + "step": 40602 + }, + { + "epoch": 3.289290343486714, + "grad_norm": 0.08134906738996506, + "learning_rate": 3.948422521265583e-05, + "loss": 0.2573, + "step": 40603 + }, + { + "epoch": 3.2893713545042127, + "grad_norm": 0.0564156100153923, + "learning_rate": 3.947972456006121e-05, + "loss": 0.1998, + "step": 40604 + }, + { + "epoch": 3.289452365521711, + "grad_norm": 0.06534619629383087, + "learning_rate": 3.947522390746659e-05, + "loss": 0.2142, + "step": 40605 + }, + { + "epoch": 3.289533376539209, + "grad_norm": 0.08344676345586777, + "learning_rate": 3.947072325487196e-05, + "loss": 0.2201, + "step": 40606 + }, + { + "epoch": 3.289614387556708, + "grad_norm": 0.08071395009756088, + "learning_rate": 3.946622260227733e-05, + "loss": 0.2553, + "step": 40607 + }, + { + "epoch": 3.289695398574206, + "grad_norm": 0.06508363038301468, + "learning_rate": 3.946172194968271e-05, + "loss": 0.2413, + "step": 40608 + }, + { + "epoch": 3.2897764095917044, + "grad_norm": 0.07355929911136627, + "learning_rate": 3.945722129708808e-05, + "loss": 0.2702, + "step": 40609 + }, + { + "epoch": 3.289857420609203, + "grad_norm": 0.0816044956445694, + "learning_rate": 3.945272064449345e-05, + "loss": 0.2117, + "step": 40610 + }, + { + "epoch": 3.2899384316267013, + "grad_norm": 0.07990920543670654, + "learning_rate": 3.944821999189883e-05, + "loss": 0.2459, + "step": 40611 + }, + { + "epoch": 3.2900194426441995, + "grad_norm": 0.0657232403755188, + "learning_rate": 3.94437193393042e-05, + "loss": 0.1885, + "step": 40612 + }, + { + "epoch": 3.2901004536616982, + "grad_norm": 0.06775541603565216, + "learning_rate": 3.943921868670957e-05, + "loss": 0.201, + "step": 40613 + }, + { + "epoch": 3.2901814646791965, + "grad_norm": 0.0732579305768013, + "learning_rate": 3.943471803411495e-05, + "loss": 0.2058, + "step": 40614 + }, + { + "epoch": 3.2902624756966947, + "grad_norm": 0.07913044095039368, + "learning_rate": 3.943021738152032e-05, + "loss": 0.198, + "step": 40615 + }, + { + "epoch": 3.290343486714193, + "grad_norm": 0.07696747034788132, + "learning_rate": 3.94257167289257e-05, + "loss": 0.2087, + "step": 40616 + }, + { + "epoch": 3.2904244977316917, + "grad_norm": 0.0815991535782814, + "learning_rate": 3.942121607633107e-05, + "loss": 0.2441, + "step": 40617 + }, + { + "epoch": 3.29050550874919, + "grad_norm": 0.07421345263719559, + "learning_rate": 3.9416715423736444e-05, + "loss": 0.2168, + "step": 40618 + }, + { + "epoch": 3.290586519766688, + "grad_norm": 0.07157240062952042, + "learning_rate": 3.941221477114182e-05, + "loss": 0.2335, + "step": 40619 + }, + { + "epoch": 3.290667530784187, + "grad_norm": 
0.08413759618997574, + "learning_rate": 3.940771411854719e-05, + "loss": 0.2532, + "step": 40620 + }, + { + "epoch": 3.290748541801685, + "grad_norm": 0.08253608644008636, + "learning_rate": 3.9403213465952565e-05, + "loss": 0.2379, + "step": 40621 + }, + { + "epoch": 3.2908295528191833, + "grad_norm": 0.07659203559160233, + "learning_rate": 3.939871281335794e-05, + "loss": 0.241, + "step": 40622 + }, + { + "epoch": 3.2909105638366816, + "grad_norm": 0.06601595133543015, + "learning_rate": 3.939421216076331e-05, + "loss": 0.2507, + "step": 40623 + }, + { + "epoch": 3.2909915748541803, + "grad_norm": 0.06105021387338638, + "learning_rate": 3.9389711508168686e-05, + "loss": 0.2026, + "step": 40624 + }, + { + "epoch": 3.2910725858716785, + "grad_norm": 0.06365049630403519, + "learning_rate": 3.938521085557406e-05, + "loss": 0.2026, + "step": 40625 + }, + { + "epoch": 3.2911535968891767, + "grad_norm": 0.0851701870560646, + "learning_rate": 3.938071020297943e-05, + "loss": 0.2263, + "step": 40626 + }, + { + "epoch": 3.2912346079066754, + "grad_norm": 0.06830804795026779, + "learning_rate": 3.9376209550384806e-05, + "loss": 0.2102, + "step": 40627 + }, + { + "epoch": 3.2913156189241737, + "grad_norm": 0.06827913969755173, + "learning_rate": 3.937170889779018e-05, + "loss": 0.1907, + "step": 40628 + }, + { + "epoch": 3.291396629941672, + "grad_norm": 0.06598909199237823, + "learning_rate": 3.9367208245195554e-05, + "loss": 0.2028, + "step": 40629 + }, + { + "epoch": 3.2914776409591706, + "grad_norm": 0.0733598917722702, + "learning_rate": 3.936270759260093e-05, + "loss": 0.2264, + "step": 40630 + }, + { + "epoch": 3.291558651976669, + "grad_norm": 0.07068891823291779, + "learning_rate": 3.93582069400063e-05, + "loss": 0.2373, + "step": 40631 + }, + { + "epoch": 3.291639662994167, + "grad_norm": 0.08565499633550644, + "learning_rate": 3.935370628741168e-05, + "loss": 0.2537, + "step": 40632 + }, + { + "epoch": 3.291720674011666, + "grad_norm": 0.08677096664905548, + "learning_rate": 3.934920563481705e-05, + "loss": 0.2212, + "step": 40633 + }, + { + "epoch": 3.291801685029164, + "grad_norm": 0.06935994327068329, + "learning_rate": 3.934470498222242e-05, + "loss": 0.2052, + "step": 40634 + }, + { + "epoch": 3.2918826960466623, + "grad_norm": 0.06672733277082443, + "learning_rate": 3.93402043296278e-05, + "loss": 0.2098, + "step": 40635 + }, + { + "epoch": 3.291963707064161, + "grad_norm": 0.09355857968330383, + "learning_rate": 3.933570367703317e-05, + "loss": 0.256, + "step": 40636 + }, + { + "epoch": 3.292044718081659, + "grad_norm": 0.06566230952739716, + "learning_rate": 3.933120302443854e-05, + "loss": 0.2027, + "step": 40637 + }, + { + "epoch": 3.2921257290991575, + "grad_norm": 0.08503949642181396, + "learning_rate": 3.932670237184392e-05, + "loss": 0.23, + "step": 40638 + }, + { + "epoch": 3.2922067401166557, + "grad_norm": 0.07533713430166245, + "learning_rate": 3.932220171924929e-05, + "loss": 0.2462, + "step": 40639 + }, + { + "epoch": 3.2922877511341544, + "grad_norm": 0.07142367959022522, + "learning_rate": 3.931770106665466e-05, + "loss": 0.2276, + "step": 40640 + }, + { + "epoch": 3.2923687621516526, + "grad_norm": 0.07090745121240616, + "learning_rate": 3.9313200414060044e-05, + "loss": 0.2312, + "step": 40641 + }, + { + "epoch": 3.292449773169151, + "grad_norm": 0.07475874572992325, + "learning_rate": 3.930869976146541e-05, + "loss": 0.2095, + "step": 40642 + }, + { + "epoch": 3.2925307841866496, + "grad_norm": 0.07840246707201004, + "learning_rate": 3.9304199108870784e-05, + 
"loss": 0.2488, + "step": 40643 + }, + { + "epoch": 3.292611795204148, + "grad_norm": 0.07238543778657913, + "learning_rate": 3.9299698456276164e-05, + "loss": 0.21, + "step": 40644 + }, + { + "epoch": 3.292692806221646, + "grad_norm": 0.07699737697839737, + "learning_rate": 3.929519780368154e-05, + "loss": 0.2449, + "step": 40645 + }, + { + "epoch": 3.2927738172391443, + "grad_norm": 0.07308366894721985, + "learning_rate": 3.9290697151086905e-05, + "loss": 0.2296, + "step": 40646 + }, + { + "epoch": 3.292854828256643, + "grad_norm": 0.09641941636800766, + "learning_rate": 3.9286196498492285e-05, + "loss": 0.2019, + "step": 40647 + }, + { + "epoch": 3.2929358392741412, + "grad_norm": 0.07292996346950531, + "learning_rate": 3.928169584589766e-05, + "loss": 0.218, + "step": 40648 + }, + { + "epoch": 3.2930168502916395, + "grad_norm": 0.0700625404715538, + "learning_rate": 3.927719519330303e-05, + "loss": 0.2398, + "step": 40649 + }, + { + "epoch": 3.293097861309138, + "grad_norm": 0.05972727760672569, + "learning_rate": 3.9272694540708406e-05, + "loss": 0.2302, + "step": 40650 + }, + { + "epoch": 3.2931788723266364, + "grad_norm": 0.06765510886907578, + "learning_rate": 3.926819388811378e-05, + "loss": 0.2193, + "step": 40651 + }, + { + "epoch": 3.2932598833441347, + "grad_norm": 0.09009873867034912, + "learning_rate": 3.926369323551915e-05, + "loss": 0.2183, + "step": 40652 + }, + { + "epoch": 3.2933408943616334, + "grad_norm": 0.08619244396686554, + "learning_rate": 3.925919258292453e-05, + "loss": 0.213, + "step": 40653 + }, + { + "epoch": 3.2934219053791316, + "grad_norm": 0.0694895014166832, + "learning_rate": 3.92546919303299e-05, + "loss": 0.2265, + "step": 40654 + }, + { + "epoch": 3.29350291639663, + "grad_norm": 0.055945686995983124, + "learning_rate": 3.9250191277735274e-05, + "loss": 0.1879, + "step": 40655 + }, + { + "epoch": 3.2935839274141285, + "grad_norm": 0.07787401229143143, + "learning_rate": 3.924569062514065e-05, + "loss": 0.2311, + "step": 40656 + }, + { + "epoch": 3.2936649384316268, + "grad_norm": 0.0769646093249321, + "learning_rate": 3.924118997254602e-05, + "loss": 0.2234, + "step": 40657 + }, + { + "epoch": 3.293745949449125, + "grad_norm": 0.06488602608442307, + "learning_rate": 3.9236689319951395e-05, + "loss": 0.2414, + "step": 40658 + }, + { + "epoch": 3.2938269604666237, + "grad_norm": 0.07599738240242004, + "learning_rate": 3.923218866735677e-05, + "loss": 0.2041, + "step": 40659 + }, + { + "epoch": 3.293907971484122, + "grad_norm": 0.07230847328901291, + "learning_rate": 3.922768801476214e-05, + "loss": 0.2485, + "step": 40660 + }, + { + "epoch": 3.29398898250162, + "grad_norm": 0.06198279932141304, + "learning_rate": 3.9223187362167516e-05, + "loss": 0.2238, + "step": 40661 + }, + { + "epoch": 3.2940699935191184, + "grad_norm": 0.07815616577863693, + "learning_rate": 3.921868670957289e-05, + "loss": 0.2235, + "step": 40662 + }, + { + "epoch": 3.294151004536617, + "grad_norm": 0.07445298880338669, + "learning_rate": 3.921418605697826e-05, + "loss": 0.2592, + "step": 40663 + }, + { + "epoch": 3.2942320155541154, + "grad_norm": 0.06103277578949928, + "learning_rate": 3.9209685404383636e-05, + "loss": 0.2375, + "step": 40664 + }, + { + "epoch": 3.2943130265716136, + "grad_norm": 0.07844781875610352, + "learning_rate": 3.920518475178901e-05, + "loss": 0.1955, + "step": 40665 + }, + { + "epoch": 3.2943940375891123, + "grad_norm": 0.0629831850528717, + "learning_rate": 3.9200684099194384e-05, + "loss": 0.2317, + "step": 40666 + }, + { + "epoch": 
3.2944750486066106, + "grad_norm": 0.07163090258836746, + "learning_rate": 3.919618344659976e-05, + "loss": 0.2178, + "step": 40667 + }, + { + "epoch": 3.294556059624109, + "grad_norm": 0.09038316458463669, + "learning_rate": 3.919168279400513e-05, + "loss": 0.2388, + "step": 40668 + }, + { + "epoch": 3.294637070641607, + "grad_norm": 0.0876893624663353, + "learning_rate": 3.9187182141410504e-05, + "loss": 0.2163, + "step": 40669 + }, + { + "epoch": 3.2947180816591057, + "grad_norm": 0.05798359215259552, + "learning_rate": 3.918268148881588e-05, + "loss": 0.2172, + "step": 40670 + }, + { + "epoch": 3.294799092676604, + "grad_norm": 0.08011886477470398, + "learning_rate": 3.917818083622126e-05, + "loss": 0.2366, + "step": 40671 + }, + { + "epoch": 3.2948801036941022, + "grad_norm": 0.08385702222585678, + "learning_rate": 3.9173680183626625e-05, + "loss": 0.203, + "step": 40672 + }, + { + "epoch": 3.294961114711601, + "grad_norm": 0.08129411935806274, + "learning_rate": 3.9169179531032e-05, + "loss": 0.2321, + "step": 40673 + }, + { + "epoch": 3.295042125729099, + "grad_norm": 0.06942925602197647, + "learning_rate": 3.916467887843738e-05, + "loss": 0.2214, + "step": 40674 + }, + { + "epoch": 3.2951231367465974, + "grad_norm": 0.07936610281467438, + "learning_rate": 3.9160178225842746e-05, + "loss": 0.2299, + "step": 40675 + }, + { + "epoch": 3.295204147764096, + "grad_norm": 0.06970565766096115, + "learning_rate": 3.915567757324812e-05, + "loss": 0.2492, + "step": 40676 + }, + { + "epoch": 3.2952851587815943, + "grad_norm": 0.07343757152557373, + "learning_rate": 3.91511769206535e-05, + "loss": 0.1815, + "step": 40677 + }, + { + "epoch": 3.2953661697990926, + "grad_norm": 0.09622737020254135, + "learning_rate": 3.914667626805887e-05, + "loss": 0.2402, + "step": 40678 + }, + { + "epoch": 3.2954471808165913, + "grad_norm": 0.08356326073408127, + "learning_rate": 3.914217561546424e-05, + "loss": 0.2072, + "step": 40679 + }, + { + "epoch": 3.2955281918340895, + "grad_norm": 0.06472276896238327, + "learning_rate": 3.913767496286962e-05, + "loss": 0.2038, + "step": 40680 + }, + { + "epoch": 3.2956092028515878, + "grad_norm": 0.07029642909765244, + "learning_rate": 3.913317431027499e-05, + "loss": 0.2132, + "step": 40681 + }, + { + "epoch": 3.295690213869086, + "grad_norm": 0.07775336503982544, + "learning_rate": 3.912867365768037e-05, + "loss": 0.2136, + "step": 40682 + }, + { + "epoch": 3.2957712248865847, + "grad_norm": 0.07341821491718292, + "learning_rate": 3.912417300508574e-05, + "loss": 0.235, + "step": 40683 + }, + { + "epoch": 3.295852235904083, + "grad_norm": 0.06530492007732391, + "learning_rate": 3.9119672352491115e-05, + "loss": 0.2277, + "step": 40684 + }, + { + "epoch": 3.295933246921581, + "grad_norm": 0.07276744395494461, + "learning_rate": 3.911517169989649e-05, + "loss": 0.2489, + "step": 40685 + }, + { + "epoch": 3.29601425793908, + "grad_norm": 0.0678088366985321, + "learning_rate": 3.911067104730186e-05, + "loss": 0.2075, + "step": 40686 + }, + { + "epoch": 3.296095268956578, + "grad_norm": 0.07170473784208298, + "learning_rate": 3.9106170394707236e-05, + "loss": 0.2811, + "step": 40687 + }, + { + "epoch": 3.2961762799740764, + "grad_norm": 0.08175055682659149, + "learning_rate": 3.910166974211261e-05, + "loss": 0.2341, + "step": 40688 + }, + { + "epoch": 3.2962572909915746, + "grad_norm": 0.08029843121767044, + "learning_rate": 3.909716908951798e-05, + "loss": 0.2256, + "step": 40689 + }, + { + "epoch": 3.2963383020090733, + "grad_norm": 0.09754343330860138, + 
"learning_rate": 3.909266843692336e-05, + "loss": 0.2308, + "step": 40690 + }, + { + "epoch": 3.2964193130265715, + "grad_norm": 0.06310459226369858, + "learning_rate": 3.908816778432873e-05, + "loss": 0.2222, + "step": 40691 + }, + { + "epoch": 3.29650032404407, + "grad_norm": 0.06087411940097809, + "learning_rate": 3.9083667131734104e-05, + "loss": 0.2274, + "step": 40692 + }, + { + "epoch": 3.2965813350615685, + "grad_norm": 0.08009161055088043, + "learning_rate": 3.907916647913948e-05, + "loss": 0.2568, + "step": 40693 + }, + { + "epoch": 3.2966623460790667, + "grad_norm": 0.06610788404941559, + "learning_rate": 3.907466582654485e-05, + "loss": 0.2114, + "step": 40694 + }, + { + "epoch": 3.296743357096565, + "grad_norm": 0.07393686473369598, + "learning_rate": 3.9070165173950225e-05, + "loss": 0.2408, + "step": 40695 + }, + { + "epoch": 3.2968243681140637, + "grad_norm": 0.07974232733249664, + "learning_rate": 3.90656645213556e-05, + "loss": 0.2261, + "step": 40696 + }, + { + "epoch": 3.296905379131562, + "grad_norm": 0.0787620022892952, + "learning_rate": 3.906116386876097e-05, + "loss": 0.2442, + "step": 40697 + }, + { + "epoch": 3.29698639014906, + "grad_norm": 0.12104468047618866, + "learning_rate": 3.9056663216166346e-05, + "loss": 0.2038, + "step": 40698 + }, + { + "epoch": 3.297067401166559, + "grad_norm": 0.07490499317646027, + "learning_rate": 3.905216256357172e-05, + "loss": 0.186, + "step": 40699 + }, + { + "epoch": 3.297148412184057, + "grad_norm": 0.07450409978628159, + "learning_rate": 3.904766191097709e-05, + "loss": 0.1863, + "step": 40700 + }, + { + "epoch": 3.2972294232015553, + "grad_norm": 0.08557897061109543, + "learning_rate": 3.9043161258382467e-05, + "loss": 0.2356, + "step": 40701 + }, + { + "epoch": 3.297310434219054, + "grad_norm": 0.0652608573436737, + "learning_rate": 3.903866060578784e-05, + "loss": 0.2056, + "step": 40702 + }, + { + "epoch": 3.2973914452365523, + "grad_norm": 0.07898595929145813, + "learning_rate": 3.9034159953193214e-05, + "loss": 0.294, + "step": 40703 + }, + { + "epoch": 3.2974724562540505, + "grad_norm": 0.08902262151241302, + "learning_rate": 3.902965930059859e-05, + "loss": 0.1963, + "step": 40704 + }, + { + "epoch": 3.2975534672715487, + "grad_norm": 0.06819375604391098, + "learning_rate": 3.902515864800396e-05, + "loss": 0.2214, + "step": 40705 + }, + { + "epoch": 3.2976344782890474, + "grad_norm": 0.08384226262569427, + "learning_rate": 3.9020657995409335e-05, + "loss": 0.218, + "step": 40706 + }, + { + "epoch": 3.2977154893065457, + "grad_norm": 0.06585050374269485, + "learning_rate": 3.901615734281471e-05, + "loss": 0.2238, + "step": 40707 + }, + { + "epoch": 3.297796500324044, + "grad_norm": 0.06808405369520187, + "learning_rate": 3.901165669022008e-05, + "loss": 0.2746, + "step": 40708 + }, + { + "epoch": 3.2978775113415426, + "grad_norm": 0.05338800698518753, + "learning_rate": 3.9007156037625455e-05, + "loss": 0.2007, + "step": 40709 + }, + { + "epoch": 3.297958522359041, + "grad_norm": 0.07853983342647552, + "learning_rate": 3.9002655385030836e-05, + "loss": 0.2175, + "step": 40710 + }, + { + "epoch": 3.298039533376539, + "grad_norm": 0.0728057250380516, + "learning_rate": 3.89981547324362e-05, + "loss": 0.2283, + "step": 40711 + }, + { + "epoch": 3.2981205443940373, + "grad_norm": 0.06241932511329651, + "learning_rate": 3.8993654079841576e-05, + "loss": 0.2096, + "step": 40712 + }, + { + "epoch": 3.298201555411536, + "grad_norm": 0.07981060445308685, + "learning_rate": 3.8989153427246957e-05, + "loss": 0.208, + "step": 
40713 + }, + { + "epoch": 3.2982825664290343, + "grad_norm": 0.0637211948633194, + "learning_rate": 3.8984652774652323e-05, + "loss": 0.2124, + "step": 40714 + }, + { + "epoch": 3.2983635774465325, + "grad_norm": 0.058054119348526, + "learning_rate": 3.89801521220577e-05, + "loss": 0.2283, + "step": 40715 + }, + { + "epoch": 3.298444588464031, + "grad_norm": 0.086003877222538, + "learning_rate": 3.897565146946308e-05, + "loss": 0.2738, + "step": 40716 + }, + { + "epoch": 3.2985255994815295, + "grad_norm": 0.06300433725118637, + "learning_rate": 3.8971150816868444e-05, + "loss": 0.208, + "step": 40717 + }, + { + "epoch": 3.2986066104990277, + "grad_norm": 0.0810168907046318, + "learning_rate": 3.8966650164273825e-05, + "loss": 0.2374, + "step": 40718 + }, + { + "epoch": 3.2986876215165264, + "grad_norm": 0.07268819212913513, + "learning_rate": 3.89621495116792e-05, + "loss": 0.2419, + "step": 40719 + }, + { + "epoch": 3.2987686325340246, + "grad_norm": 0.06387283653020859, + "learning_rate": 3.8957648859084565e-05, + "loss": 0.2113, + "step": 40720 + }, + { + "epoch": 3.298849643551523, + "grad_norm": 0.08623532950878143, + "learning_rate": 3.8953148206489945e-05, + "loss": 0.255, + "step": 40721 + }, + { + "epoch": 3.2989306545690216, + "grad_norm": 0.0770018920302391, + "learning_rate": 3.894864755389532e-05, + "loss": 0.2395, + "step": 40722 + }, + { + "epoch": 3.29901166558652, + "grad_norm": 0.06887763738632202, + "learning_rate": 3.894414690130069e-05, + "loss": 0.2648, + "step": 40723 + }, + { + "epoch": 3.299092676604018, + "grad_norm": 0.06291645020246506, + "learning_rate": 3.8939646248706066e-05, + "loss": 0.1935, + "step": 40724 + }, + { + "epoch": 3.2991736876215167, + "grad_norm": 0.06266850233078003, + "learning_rate": 3.893514559611144e-05, + "loss": 0.2232, + "step": 40725 + }, + { + "epoch": 3.299254698639015, + "grad_norm": 0.07828648388385773, + "learning_rate": 3.8930644943516813e-05, + "loss": 0.2509, + "step": 40726 + }, + { + "epoch": 3.2993357096565132, + "grad_norm": 0.07096115499734879, + "learning_rate": 3.892614429092219e-05, + "loss": 0.1945, + "step": 40727 + }, + { + "epoch": 3.2994167206740115, + "grad_norm": 0.05984153971076012, + "learning_rate": 3.892164363832756e-05, + "loss": 0.2376, + "step": 40728 + }, + { + "epoch": 3.29949773169151, + "grad_norm": 0.06432328373193741, + "learning_rate": 3.8917142985732934e-05, + "loss": 0.2067, + "step": 40729 + }, + { + "epoch": 3.2995787427090084, + "grad_norm": 0.06568169593811035, + "learning_rate": 3.891264233313831e-05, + "loss": 0.216, + "step": 40730 + }, + { + "epoch": 3.2996597537265067, + "grad_norm": 0.08853048086166382, + "learning_rate": 3.890814168054368e-05, + "loss": 0.2494, + "step": 40731 + }, + { + "epoch": 3.2997407647440054, + "grad_norm": 0.08453629165887833, + "learning_rate": 3.8903641027949055e-05, + "loss": 0.2407, + "step": 40732 + }, + { + "epoch": 3.2998217757615036, + "grad_norm": 0.0698961466550827, + "learning_rate": 3.889914037535443e-05, + "loss": 0.2613, + "step": 40733 + }, + { + "epoch": 3.299902786779002, + "grad_norm": 0.07776353508234024, + "learning_rate": 3.88946397227598e-05, + "loss": 0.2194, + "step": 40734 + }, + { + "epoch": 3.2999837977965, + "grad_norm": 0.06141514703631401, + "learning_rate": 3.8890139070165176e-05, + "loss": 0.2003, + "step": 40735 + }, + { + "epoch": 3.3000648088139988, + "grad_norm": 0.06544289737939835, + "learning_rate": 3.888563841757055e-05, + "loss": 0.2205, + "step": 40736 + }, + { + "epoch": 3.300145819831497, + "grad_norm": 
0.06846226751804352, + "learning_rate": 3.888113776497592e-05, + "loss": 0.2053, + "step": 40737 + }, + { + "epoch": 3.3002268308489953, + "grad_norm": 0.06529880315065384, + "learning_rate": 3.88766371123813e-05, + "loss": 0.2147, + "step": 40738 + }, + { + "epoch": 3.300307841866494, + "grad_norm": 0.07303272187709808, + "learning_rate": 3.887213645978667e-05, + "loss": 0.2141, + "step": 40739 + }, + { + "epoch": 3.300388852883992, + "grad_norm": 0.0753205344080925, + "learning_rate": 3.8867635807192044e-05, + "loss": 0.2377, + "step": 40740 + }, + { + "epoch": 3.3004698639014904, + "grad_norm": 0.06747289001941681, + "learning_rate": 3.886313515459742e-05, + "loss": 0.2327, + "step": 40741 + }, + { + "epoch": 3.300550874918989, + "grad_norm": 0.08344721794128418, + "learning_rate": 3.885863450200279e-05, + "loss": 0.2464, + "step": 40742 + }, + { + "epoch": 3.3006318859364874, + "grad_norm": 0.08885085582733154, + "learning_rate": 3.8854133849408165e-05, + "loss": 0.2519, + "step": 40743 + }, + { + "epoch": 3.3007128969539856, + "grad_norm": 0.06508319079875946, + "learning_rate": 3.884963319681354e-05, + "loss": 0.2035, + "step": 40744 + }, + { + "epoch": 3.3007939079714843, + "grad_norm": 0.052353620529174805, + "learning_rate": 3.884513254421891e-05, + "loss": 0.2429, + "step": 40745 + }, + { + "epoch": 3.3008749189889826, + "grad_norm": 0.052554935216903687, + "learning_rate": 3.8840631891624285e-05, + "loss": 0.2111, + "step": 40746 + }, + { + "epoch": 3.300955930006481, + "grad_norm": 0.0698976144194603, + "learning_rate": 3.883613123902966e-05, + "loss": 0.2467, + "step": 40747 + }, + { + "epoch": 3.3010369410239795, + "grad_norm": 0.0756709948182106, + "learning_rate": 3.883163058643503e-05, + "loss": 0.2193, + "step": 40748 + }, + { + "epoch": 3.3011179520414777, + "grad_norm": 0.08819136023521423, + "learning_rate": 3.8827129933840406e-05, + "loss": 0.2592, + "step": 40749 + }, + { + "epoch": 3.301198963058976, + "grad_norm": 0.06856835633516312, + "learning_rate": 3.882262928124578e-05, + "loss": 0.2521, + "step": 40750 + }, + { + "epoch": 3.301279974076474, + "grad_norm": 0.0643986389040947, + "learning_rate": 3.881812862865116e-05, + "loss": 0.1904, + "step": 40751 + }, + { + "epoch": 3.301360985093973, + "grad_norm": 0.060446128249168396, + "learning_rate": 3.8813627976056534e-05, + "loss": 0.1933, + "step": 40752 + }, + { + "epoch": 3.301441996111471, + "grad_norm": 0.07070574164390564, + "learning_rate": 3.88091273234619e-05, + "loss": 0.2103, + "step": 40753 + }, + { + "epoch": 3.3015230071289694, + "grad_norm": 0.06437844783067703, + "learning_rate": 3.880462667086728e-05, + "loss": 0.2391, + "step": 40754 + }, + { + "epoch": 3.301604018146468, + "grad_norm": 0.04855308309197426, + "learning_rate": 3.8800126018272655e-05, + "loss": 0.2116, + "step": 40755 + }, + { + "epoch": 3.3016850291639663, + "grad_norm": 0.06874486058950424, + "learning_rate": 3.879562536567802e-05, + "loss": 0.2446, + "step": 40756 + }, + { + "epoch": 3.3017660401814646, + "grad_norm": 0.08705892413854599, + "learning_rate": 3.87911247130834e-05, + "loss": 0.2385, + "step": 40757 + }, + { + "epoch": 3.301847051198963, + "grad_norm": 0.0645841583609581, + "learning_rate": 3.8786624060488776e-05, + "loss": 0.2009, + "step": 40758 + }, + { + "epoch": 3.3019280622164615, + "grad_norm": 0.07083959132432938, + "learning_rate": 3.878212340789414e-05, + "loss": 0.2311, + "step": 40759 + }, + { + "epoch": 3.3020090732339598, + "grad_norm": 0.07376854866743088, + "learning_rate": 3.877762275529952e-05, + 
"loss": 0.1957, + "step": 40760 + }, + { + "epoch": 3.302090084251458, + "grad_norm": 0.059838343411684036, + "learning_rate": 3.8773122102704896e-05, + "loss": 0.205, + "step": 40761 + }, + { + "epoch": 3.3021710952689567, + "grad_norm": 0.05825914815068245, + "learning_rate": 3.876862145011026e-05, + "loss": 0.1942, + "step": 40762 + }, + { + "epoch": 3.302252106286455, + "grad_norm": 0.08400315791368484, + "learning_rate": 3.8764120797515644e-05, + "loss": 0.2344, + "step": 40763 + }, + { + "epoch": 3.302333117303953, + "grad_norm": 0.06611832231283188, + "learning_rate": 3.875962014492102e-05, + "loss": 0.2043, + "step": 40764 + }, + { + "epoch": 3.302414128321452, + "grad_norm": 0.08515192568302155, + "learning_rate": 3.875511949232639e-05, + "loss": 0.2245, + "step": 40765 + }, + { + "epoch": 3.30249513933895, + "grad_norm": 0.06319393962621689, + "learning_rate": 3.8750618839731764e-05, + "loss": 0.2129, + "step": 40766 + }, + { + "epoch": 3.3025761503564484, + "grad_norm": 0.0669749304652214, + "learning_rate": 3.874611818713714e-05, + "loss": 0.2245, + "step": 40767 + }, + { + "epoch": 3.302657161373947, + "grad_norm": 0.06981271505355835, + "learning_rate": 3.874161753454251e-05, + "loss": 0.2412, + "step": 40768 + }, + { + "epoch": 3.3027381723914453, + "grad_norm": 0.09386993944644928, + "learning_rate": 3.8737116881947885e-05, + "loss": 0.2234, + "step": 40769 + }, + { + "epoch": 3.3028191834089435, + "grad_norm": 0.0718587264418602, + "learning_rate": 3.873261622935326e-05, + "loss": 0.2105, + "step": 40770 + }, + { + "epoch": 3.3029001944264422, + "grad_norm": 0.06613459438085556, + "learning_rate": 3.872811557675863e-05, + "loss": 0.2457, + "step": 40771 + }, + { + "epoch": 3.3029812054439405, + "grad_norm": 0.05869852751493454, + "learning_rate": 3.8723614924164006e-05, + "loss": 0.2081, + "step": 40772 + }, + { + "epoch": 3.3030622164614387, + "grad_norm": 0.07454083859920502, + "learning_rate": 3.871911427156938e-05, + "loss": 0.2027, + "step": 40773 + }, + { + "epoch": 3.303143227478937, + "grad_norm": 0.07137610018253326, + "learning_rate": 3.871461361897475e-05, + "loss": 0.2281, + "step": 40774 + }, + { + "epoch": 3.3032242384964356, + "grad_norm": 0.07652374356985092, + "learning_rate": 3.871011296638013e-05, + "loss": 0.2299, + "step": 40775 + }, + { + "epoch": 3.303305249513934, + "grad_norm": 0.0735517367720604, + "learning_rate": 3.87056123137855e-05, + "loss": 0.2249, + "step": 40776 + }, + { + "epoch": 3.303386260531432, + "grad_norm": 0.06907539069652557, + "learning_rate": 3.8701111661190874e-05, + "loss": 0.2501, + "step": 40777 + }, + { + "epoch": 3.303467271548931, + "grad_norm": 0.0660337433218956, + "learning_rate": 3.869661100859625e-05, + "loss": 0.2071, + "step": 40778 + }, + { + "epoch": 3.303548282566429, + "grad_norm": 0.09033715724945068, + "learning_rate": 3.869211035600162e-05, + "loss": 0.2094, + "step": 40779 + }, + { + "epoch": 3.3036292935839273, + "grad_norm": 0.06318441033363342, + "learning_rate": 3.8687609703406995e-05, + "loss": 0.2159, + "step": 40780 + }, + { + "epoch": 3.3037103046014256, + "grad_norm": 0.067022405564785, + "learning_rate": 3.868310905081237e-05, + "loss": 0.2768, + "step": 40781 + }, + { + "epoch": 3.3037913156189243, + "grad_norm": 0.08411199599504471, + "learning_rate": 3.867860839821774e-05, + "loss": 0.2385, + "step": 40782 + }, + { + "epoch": 3.3038723266364225, + "grad_norm": 0.08436904102563858, + "learning_rate": 3.8674107745623116e-05, + "loss": 0.2055, + "step": 40783 + }, + { + "epoch": 
3.3039533376539207, + "grad_norm": 0.054027408361434937, + "learning_rate": 3.8669607093028496e-05, + "loss": 0.2361, + "step": 40784 + }, + { + "epoch": 3.3040343486714194, + "grad_norm": 0.06735050678253174, + "learning_rate": 3.866510644043386e-05, + "loss": 0.1889, + "step": 40785 + }, + { + "epoch": 3.3041153596889177, + "grad_norm": 0.0733322873711586, + "learning_rate": 3.8660605787839236e-05, + "loss": 0.2393, + "step": 40786 + }, + { + "epoch": 3.304196370706416, + "grad_norm": 0.06890393793582916, + "learning_rate": 3.865610513524462e-05, + "loss": 0.2059, + "step": 40787 + }, + { + "epoch": 3.3042773817239146, + "grad_norm": 0.08290769159793854, + "learning_rate": 3.8651604482649984e-05, + "loss": 0.2401, + "step": 40788 + }, + { + "epoch": 3.304358392741413, + "grad_norm": 0.07497076690196991, + "learning_rate": 3.864710383005536e-05, + "loss": 0.2217, + "step": 40789 + }, + { + "epoch": 3.304439403758911, + "grad_norm": 0.06635642796754837, + "learning_rate": 3.864260317746074e-05, + "loss": 0.2094, + "step": 40790 + }, + { + "epoch": 3.30452041477641, + "grad_norm": 0.0692463219165802, + "learning_rate": 3.863810252486611e-05, + "loss": 0.2341, + "step": 40791 + }, + { + "epoch": 3.304601425793908, + "grad_norm": 0.08122988045215607, + "learning_rate": 3.863360187227148e-05, + "loss": 0.2077, + "step": 40792 + }, + { + "epoch": 3.3046824368114063, + "grad_norm": 0.0768023282289505, + "learning_rate": 3.862910121967686e-05, + "loss": 0.2513, + "step": 40793 + }, + { + "epoch": 3.304763447828905, + "grad_norm": 0.0745762512087822, + "learning_rate": 3.862460056708223e-05, + "loss": 0.2296, + "step": 40794 + }, + { + "epoch": 3.304844458846403, + "grad_norm": 0.0761742889881134, + "learning_rate": 3.86200999144876e-05, + "loss": 0.2314, + "step": 40795 + }, + { + "epoch": 3.3049254698639015, + "grad_norm": 0.07914459705352783, + "learning_rate": 3.861559926189298e-05, + "loss": 0.2257, + "step": 40796 + }, + { + "epoch": 3.3050064808813997, + "grad_norm": 0.0910719782114029, + "learning_rate": 3.861109860929835e-05, + "loss": 0.3021, + "step": 40797 + }, + { + "epoch": 3.3050874918988984, + "grad_norm": 0.07248116284608841, + "learning_rate": 3.860659795670372e-05, + "loss": 0.2256, + "step": 40798 + }, + { + "epoch": 3.3051685029163966, + "grad_norm": 0.07484246790409088, + "learning_rate": 3.86020973041091e-05, + "loss": 0.2063, + "step": 40799 + }, + { + "epoch": 3.305249513933895, + "grad_norm": 0.08107568323612213, + "learning_rate": 3.8597596651514474e-05, + "loss": 0.2075, + "step": 40800 + }, + { + "epoch": 3.3053305249513936, + "grad_norm": 0.07556001842021942, + "learning_rate": 3.859309599891984e-05, + "loss": 0.2149, + "step": 40801 + }, + { + "epoch": 3.305411535968892, + "grad_norm": 0.07405587285757065, + "learning_rate": 3.858859534632522e-05, + "loss": 0.2081, + "step": 40802 + }, + { + "epoch": 3.30549254698639, + "grad_norm": 0.05908035486936569, + "learning_rate": 3.8584094693730594e-05, + "loss": 0.2221, + "step": 40803 + }, + { + "epoch": 3.3055735580038883, + "grad_norm": 0.09235452115535736, + "learning_rate": 3.857959404113597e-05, + "loss": 0.2333, + "step": 40804 + }, + { + "epoch": 3.305654569021387, + "grad_norm": 0.06490433216094971, + "learning_rate": 3.857509338854134e-05, + "loss": 0.191, + "step": 40805 + }, + { + "epoch": 3.3057355800388852, + "grad_norm": 0.07053831964731216, + "learning_rate": 3.8570592735946715e-05, + "loss": 0.2508, + "step": 40806 + }, + { + "epoch": 3.3058165910563835, + "grad_norm": 0.06669102609157562, + 
"learning_rate": 3.856609208335209e-05, + "loss": 0.2089, + "step": 40807 + }, + { + "epoch": 3.305897602073882, + "grad_norm": 0.07495436817407608, + "learning_rate": 3.856159143075746e-05, + "loss": 0.212, + "step": 40808 + }, + { + "epoch": 3.3059786130913804, + "grad_norm": 0.06892978399991989, + "learning_rate": 3.8557090778162836e-05, + "loss": 0.2598, + "step": 40809 + }, + { + "epoch": 3.3060596241088787, + "grad_norm": 0.0675312802195549, + "learning_rate": 3.855259012556821e-05, + "loss": 0.2283, + "step": 40810 + }, + { + "epoch": 3.3061406351263773, + "grad_norm": 0.07323117554187775, + "learning_rate": 3.854808947297358e-05, + "loss": 0.2133, + "step": 40811 + }, + { + "epoch": 3.3062216461438756, + "grad_norm": 0.06845073401927948, + "learning_rate": 3.854358882037896e-05, + "loss": 0.2194, + "step": 40812 + }, + { + "epoch": 3.306302657161374, + "grad_norm": 0.09151905030012131, + "learning_rate": 3.853908816778433e-05, + "loss": 0.2426, + "step": 40813 + }, + { + "epoch": 3.3063836681788725, + "grad_norm": 0.08238843083381653, + "learning_rate": 3.8534587515189704e-05, + "loss": 0.2228, + "step": 40814 + }, + { + "epoch": 3.3064646791963708, + "grad_norm": 0.06988435983657837, + "learning_rate": 3.853008686259508e-05, + "loss": 0.2416, + "step": 40815 + }, + { + "epoch": 3.306545690213869, + "grad_norm": 0.06417524069547653, + "learning_rate": 3.852558621000045e-05, + "loss": 0.2228, + "step": 40816 + }, + { + "epoch": 3.3066267012313677, + "grad_norm": 0.06799071282148361, + "learning_rate": 3.8521085557405825e-05, + "loss": 0.2113, + "step": 40817 + }, + { + "epoch": 3.306707712248866, + "grad_norm": 0.07856892049312592, + "learning_rate": 3.85165849048112e-05, + "loss": 0.2194, + "step": 40818 + }, + { + "epoch": 3.306788723266364, + "grad_norm": 0.09208235144615173, + "learning_rate": 3.851208425221657e-05, + "loss": 0.2542, + "step": 40819 + }, + { + "epoch": 3.3068697342838624, + "grad_norm": 0.06966143846511841, + "learning_rate": 3.850758359962195e-05, + "loss": 0.2285, + "step": 40820 + }, + { + "epoch": 3.306950745301361, + "grad_norm": 0.0819198414683342, + "learning_rate": 3.850308294702732e-05, + "loss": 0.2144, + "step": 40821 + }, + { + "epoch": 3.3070317563188594, + "grad_norm": 0.07809050381183624, + "learning_rate": 3.849858229443269e-05, + "loss": 0.2435, + "step": 40822 + }, + { + "epoch": 3.3071127673363576, + "grad_norm": 0.0795537456870079, + "learning_rate": 3.849408164183807e-05, + "loss": 0.2093, + "step": 40823 + }, + { + "epoch": 3.3071937783538563, + "grad_norm": 0.08147698640823364, + "learning_rate": 3.848958098924344e-05, + "loss": 0.2786, + "step": 40824 + }, + { + "epoch": 3.3072747893713546, + "grad_norm": 0.05749053508043289, + "learning_rate": 3.8485080336648814e-05, + "loss": 0.2318, + "step": 40825 + }, + { + "epoch": 3.307355800388853, + "grad_norm": 0.07302997261285782, + "learning_rate": 3.8480579684054194e-05, + "loss": 0.2358, + "step": 40826 + }, + { + "epoch": 3.307436811406351, + "grad_norm": 0.0854649692773819, + "learning_rate": 3.847607903145956e-05, + "loss": 0.246, + "step": 40827 + }, + { + "epoch": 3.3075178224238497, + "grad_norm": 0.06140410900115967, + "learning_rate": 3.8471578378864935e-05, + "loss": 0.275, + "step": 40828 + }, + { + "epoch": 3.307598833441348, + "grad_norm": 0.07026208937168121, + "learning_rate": 3.8467077726270315e-05, + "loss": 0.2614, + "step": 40829 + }, + { + "epoch": 3.307679844458846, + "grad_norm": 0.05768724903464317, + "learning_rate": 3.846257707367569e-05, + "loss": 0.2325, + "step": 
40830 + }, + { + "epoch": 3.307760855476345, + "grad_norm": 0.0600363127887249, + "learning_rate": 3.8458076421081055e-05, + "loss": 0.1755, + "step": 40831 + }, + { + "epoch": 3.307841866493843, + "grad_norm": 0.08797100186347961, + "learning_rate": 3.8453575768486436e-05, + "loss": 0.2295, + "step": 40832 + }, + { + "epoch": 3.3079228775113414, + "grad_norm": 0.07335330545902252, + "learning_rate": 3.844907511589181e-05, + "loss": 0.2101, + "step": 40833 + }, + { + "epoch": 3.30800388852884, + "grad_norm": 0.07781440019607544, + "learning_rate": 3.8444574463297176e-05, + "loss": 0.216, + "step": 40834 + }, + { + "epoch": 3.3080848995463383, + "grad_norm": 0.08074433356523514, + "learning_rate": 3.8440073810702557e-05, + "loss": 0.2502, + "step": 40835 + }, + { + "epoch": 3.3081659105638366, + "grad_norm": 0.05874902382493019, + "learning_rate": 3.843557315810793e-05, + "loss": 0.2076, + "step": 40836 + }, + { + "epoch": 3.3082469215813353, + "grad_norm": 0.06767843663692474, + "learning_rate": 3.84310725055133e-05, + "loss": 0.2357, + "step": 40837 + }, + { + "epoch": 3.3083279325988335, + "grad_norm": 0.07101554423570633, + "learning_rate": 3.842657185291868e-05, + "loss": 0.2101, + "step": 40838 + }, + { + "epoch": 3.3084089436163318, + "grad_norm": 0.06890977919101715, + "learning_rate": 3.842207120032405e-05, + "loss": 0.2384, + "step": 40839 + }, + { + "epoch": 3.3084899546338304, + "grad_norm": 0.06549681723117828, + "learning_rate": 3.841757054772942e-05, + "loss": 0.226, + "step": 40840 + }, + { + "epoch": 3.3085709656513287, + "grad_norm": 0.06217101216316223, + "learning_rate": 3.84130698951348e-05, + "loss": 0.2338, + "step": 40841 + }, + { + "epoch": 3.308651976668827, + "grad_norm": 0.07235632836818695, + "learning_rate": 3.840856924254017e-05, + "loss": 0.2122, + "step": 40842 + }, + { + "epoch": 3.308732987686325, + "grad_norm": 0.06837397813796997, + "learning_rate": 3.8404068589945545e-05, + "loss": 0.2475, + "step": 40843 + }, + { + "epoch": 3.308813998703824, + "grad_norm": 0.0652531161904335, + "learning_rate": 3.839956793735092e-05, + "loss": 0.2657, + "step": 40844 + }, + { + "epoch": 3.308895009721322, + "grad_norm": 0.06572028994560242, + "learning_rate": 3.839506728475629e-05, + "loss": 0.2221, + "step": 40845 + }, + { + "epoch": 3.3089760207388204, + "grad_norm": 0.07158073782920837, + "learning_rate": 3.8390566632161666e-05, + "loss": 0.2385, + "step": 40846 + }, + { + "epoch": 3.309057031756319, + "grad_norm": 0.07187211513519287, + "learning_rate": 3.838606597956704e-05, + "loss": 0.2762, + "step": 40847 + }, + { + "epoch": 3.3091380427738173, + "grad_norm": 0.06367867439985275, + "learning_rate": 3.838156532697241e-05, + "loss": 0.192, + "step": 40848 + }, + { + "epoch": 3.3092190537913155, + "grad_norm": 0.0679791048169136, + "learning_rate": 3.837706467437779e-05, + "loss": 0.2284, + "step": 40849 + }, + { + "epoch": 3.309300064808814, + "grad_norm": 0.061335284262895584, + "learning_rate": 3.837256402178316e-05, + "loss": 0.2441, + "step": 40850 + }, + { + "epoch": 3.3093810758263125, + "grad_norm": 0.06405185163021088, + "learning_rate": 3.8368063369188534e-05, + "loss": 0.2051, + "step": 40851 + }, + { + "epoch": 3.3094620868438107, + "grad_norm": 0.07147097587585449, + "learning_rate": 3.836356271659391e-05, + "loss": 0.2153, + "step": 40852 + }, + { + "epoch": 3.309543097861309, + "grad_norm": 0.07569832354784012, + "learning_rate": 3.835906206399928e-05, + "loss": 0.2225, + "step": 40853 + }, + { + "epoch": 3.3096241088788076, + "grad_norm": 
0.08858956396579742, + "learning_rate": 3.8354561411404655e-05, + "loss": 0.2254, + "step": 40854 + }, + { + "epoch": 3.309705119896306, + "grad_norm": 0.06722646206617355, + "learning_rate": 3.835006075881003e-05, + "loss": 0.2188, + "step": 40855 + }, + { + "epoch": 3.309786130913804, + "grad_norm": 0.06068732589483261, + "learning_rate": 3.834556010621541e-05, + "loss": 0.234, + "step": 40856 + }, + { + "epoch": 3.309867141931303, + "grad_norm": 0.0778256431221962, + "learning_rate": 3.8341059453620776e-05, + "loss": 0.2329, + "step": 40857 + }, + { + "epoch": 3.309948152948801, + "grad_norm": 0.08266248553991318, + "learning_rate": 3.833655880102615e-05, + "loss": 0.2201, + "step": 40858 + }, + { + "epoch": 3.3100291639662993, + "grad_norm": 0.07404722273349762, + "learning_rate": 3.833205814843153e-05, + "loss": 0.2013, + "step": 40859 + }, + { + "epoch": 3.310110174983798, + "grad_norm": 0.08323337882757187, + "learning_rate": 3.8327557495836897e-05, + "loss": 0.2271, + "step": 40860 + }, + { + "epoch": 3.3101911860012962, + "grad_norm": 0.059005096554756165, + "learning_rate": 3.832305684324227e-05, + "loss": 0.1849, + "step": 40861 + }, + { + "epoch": 3.3102721970187945, + "grad_norm": 0.08371363580226898, + "learning_rate": 3.831855619064765e-05, + "loss": 0.2213, + "step": 40862 + }, + { + "epoch": 3.310353208036293, + "grad_norm": 0.07347278296947479, + "learning_rate": 3.831405553805302e-05, + "loss": 0.2097, + "step": 40863 + }, + { + "epoch": 3.3104342190537914, + "grad_norm": 0.07287754863500595, + "learning_rate": 3.830955488545839e-05, + "loss": 0.2723, + "step": 40864 + }, + { + "epoch": 3.3105152300712897, + "grad_norm": 0.06831778585910797, + "learning_rate": 3.830505423286377e-05, + "loss": 0.2293, + "step": 40865 + }, + { + "epoch": 3.310596241088788, + "grad_norm": 0.08180266618728638, + "learning_rate": 3.830055358026914e-05, + "loss": 0.2155, + "step": 40866 + }, + { + "epoch": 3.3106772521062866, + "grad_norm": 0.0707252249121666, + "learning_rate": 3.829605292767451e-05, + "loss": 0.2041, + "step": 40867 + }, + { + "epoch": 3.310758263123785, + "grad_norm": 0.06609929352998734, + "learning_rate": 3.829155227507989e-05, + "loss": 0.2128, + "step": 40868 + }, + { + "epoch": 3.310839274141283, + "grad_norm": 0.08587994426488876, + "learning_rate": 3.8287051622485266e-05, + "loss": 0.2455, + "step": 40869 + }, + { + "epoch": 3.310920285158782, + "grad_norm": 0.07224773615598679, + "learning_rate": 3.828255096989063e-05, + "loss": 0.1964, + "step": 40870 + }, + { + "epoch": 3.31100129617628, + "grad_norm": 0.07214035838842392, + "learning_rate": 3.827805031729601e-05, + "loss": 0.2122, + "step": 40871 + }, + { + "epoch": 3.3110823071937783, + "grad_norm": 0.06749515980482101, + "learning_rate": 3.827354966470139e-05, + "loss": 0.2266, + "step": 40872 + }, + { + "epoch": 3.3111633182112765, + "grad_norm": 0.07175476849079132, + "learning_rate": 3.8269049012106753e-05, + "loss": 0.2451, + "step": 40873 + }, + { + "epoch": 3.311244329228775, + "grad_norm": 0.05612906441092491, + "learning_rate": 3.8264548359512134e-05, + "loss": 0.2184, + "step": 40874 + }, + { + "epoch": 3.3113253402462735, + "grad_norm": 0.06670593470335007, + "learning_rate": 3.826004770691751e-05, + "loss": 0.2125, + "step": 40875 + }, + { + "epoch": 3.3114063512637717, + "grad_norm": 0.07357032597064972, + "learning_rate": 3.8255547054322874e-05, + "loss": 0.2017, + "step": 40876 + }, + { + "epoch": 3.3114873622812704, + "grad_norm": 0.06423185765743256, + "learning_rate": 3.8251046401728255e-05, + 
"loss": 0.2121, + "step": 40877 + }, + { + "epoch": 3.3115683732987686, + "grad_norm": 0.06728553026914597, + "learning_rate": 3.824654574913363e-05, + "loss": 0.2274, + "step": 40878 + }, + { + "epoch": 3.311649384316267, + "grad_norm": 0.08222640305757523, + "learning_rate": 3.8242045096538995e-05, + "loss": 0.2713, + "step": 40879 + }, + { + "epoch": 3.3117303953337656, + "grad_norm": 0.06782739609479904, + "learning_rate": 3.8237544443944375e-05, + "loss": 0.2146, + "step": 40880 + }, + { + "epoch": 3.311811406351264, + "grad_norm": 0.07442223280668259, + "learning_rate": 3.823304379134975e-05, + "loss": 0.2592, + "step": 40881 + }, + { + "epoch": 3.311892417368762, + "grad_norm": 0.07705773413181305, + "learning_rate": 3.822854313875512e-05, + "loss": 0.2149, + "step": 40882 + }, + { + "epoch": 3.3119734283862607, + "grad_norm": 0.05849326029419899, + "learning_rate": 3.8224042486160496e-05, + "loss": 0.2335, + "step": 40883 + }, + { + "epoch": 3.312054439403759, + "grad_norm": 0.06957482546567917, + "learning_rate": 3.821954183356587e-05, + "loss": 0.2482, + "step": 40884 + }, + { + "epoch": 3.3121354504212572, + "grad_norm": 0.0883265808224678, + "learning_rate": 3.8215041180971243e-05, + "loss": 0.2015, + "step": 40885 + }, + { + "epoch": 3.3122164614387555, + "grad_norm": 0.0719548687338829, + "learning_rate": 3.821054052837662e-05, + "loss": 0.1951, + "step": 40886 + }, + { + "epoch": 3.312297472456254, + "grad_norm": 0.09112759679555893, + "learning_rate": 3.820603987578199e-05, + "loss": 0.2224, + "step": 40887 + }, + { + "epoch": 3.3123784834737524, + "grad_norm": 0.068062424659729, + "learning_rate": 3.8201539223187364e-05, + "loss": 0.2267, + "step": 40888 + }, + { + "epoch": 3.3124594944912507, + "grad_norm": 0.06875590234994888, + "learning_rate": 3.819703857059274e-05, + "loss": 0.237, + "step": 40889 + }, + { + "epoch": 3.3125405055087493, + "grad_norm": 0.06804489344358444, + "learning_rate": 3.819253791799811e-05, + "loss": 0.179, + "step": 40890 + }, + { + "epoch": 3.3126215165262476, + "grad_norm": 0.05216558277606964, + "learning_rate": 3.8188037265403485e-05, + "loss": 0.2078, + "step": 40891 + }, + { + "epoch": 3.312702527543746, + "grad_norm": 0.08079401403665543, + "learning_rate": 3.818353661280886e-05, + "loss": 0.2436, + "step": 40892 + }, + { + "epoch": 3.3127835385612445, + "grad_norm": 0.07252391427755356, + "learning_rate": 3.817903596021423e-05, + "loss": 0.1987, + "step": 40893 + }, + { + "epoch": 3.3128645495787428, + "grad_norm": 0.06319592148065567, + "learning_rate": 3.8174535307619606e-05, + "loss": 0.2297, + "step": 40894 + }, + { + "epoch": 3.312945560596241, + "grad_norm": 0.07093706727027893, + "learning_rate": 3.8170034655024986e-05, + "loss": 0.2044, + "step": 40895 + }, + { + "epoch": 3.3130265716137393, + "grad_norm": 0.06530580669641495, + "learning_rate": 3.816553400243035e-05, + "loss": 0.2554, + "step": 40896 + }, + { + "epoch": 3.313107582631238, + "grad_norm": 0.08208633214235306, + "learning_rate": 3.816103334983573e-05, + "loss": 0.2081, + "step": 40897 + }, + { + "epoch": 3.313188593648736, + "grad_norm": 0.07316944003105164, + "learning_rate": 3.815653269724111e-05, + "loss": 0.22, + "step": 40898 + }, + { + "epoch": 3.3132696046662344, + "grad_norm": 0.07189015299081802, + "learning_rate": 3.8152032044646474e-05, + "loss": 0.2151, + "step": 40899 + }, + { + "epoch": 3.313350615683733, + "grad_norm": 0.0698755532503128, + "learning_rate": 3.814753139205185e-05, + "loss": 0.2235, + "step": 40900 + }, + { + "epoch": 
3.3134316267012314, + "grad_norm": 0.0643271952867508, + "learning_rate": 3.814303073945723e-05, + "loss": 0.2635, + "step": 40901 + }, + { + "epoch": 3.3135126377187296, + "grad_norm": 0.08225993067026138, + "learning_rate": 3.8138530086862595e-05, + "loss": 0.2349, + "step": 40902 + }, + { + "epoch": 3.3135936487362283, + "grad_norm": 0.07047945261001587, + "learning_rate": 3.813402943426797e-05, + "loss": 0.2403, + "step": 40903 + }, + { + "epoch": 3.3136746597537265, + "grad_norm": 0.06076255068182945, + "learning_rate": 3.812952878167335e-05, + "loss": 0.2344, + "step": 40904 + }, + { + "epoch": 3.313755670771225, + "grad_norm": 0.08206469565629959, + "learning_rate": 3.8125028129078716e-05, + "loss": 0.2581, + "step": 40905 + }, + { + "epoch": 3.3138366817887235, + "grad_norm": 0.05962511897087097, + "learning_rate": 3.812052747648409e-05, + "loss": 0.2062, + "step": 40906 + }, + { + "epoch": 3.3139176928062217, + "grad_norm": 0.08725281804800034, + "learning_rate": 3.811602682388947e-05, + "loss": 0.1962, + "step": 40907 + }, + { + "epoch": 3.31399870382372, + "grad_norm": 0.0772843286395073, + "learning_rate": 3.8111526171294836e-05, + "loss": 0.2083, + "step": 40908 + }, + { + "epoch": 3.314079714841218, + "grad_norm": 0.06500102579593658, + "learning_rate": 3.810702551870021e-05, + "loss": 0.2145, + "step": 40909 + }, + { + "epoch": 3.314160725858717, + "grad_norm": 0.0693049356341362, + "learning_rate": 3.810252486610559e-05, + "loss": 0.2264, + "step": 40910 + }, + { + "epoch": 3.314241736876215, + "grad_norm": 0.0698850080370903, + "learning_rate": 3.8098024213510964e-05, + "loss": 0.2062, + "step": 40911 + }, + { + "epoch": 3.3143227478937134, + "grad_norm": 0.07052836567163467, + "learning_rate": 3.809352356091633e-05, + "loss": 0.2382, + "step": 40912 + }, + { + "epoch": 3.314403758911212, + "grad_norm": 0.0574011467397213, + "learning_rate": 3.808902290832171e-05, + "loss": 0.21, + "step": 40913 + }, + { + "epoch": 3.3144847699287103, + "grad_norm": 0.07263054698705673, + "learning_rate": 3.8084522255727085e-05, + "loss": 0.2556, + "step": 40914 + }, + { + "epoch": 3.3145657809462086, + "grad_norm": 0.07969239354133606, + "learning_rate": 3.808002160313245e-05, + "loss": 0.2419, + "step": 40915 + }, + { + "epoch": 3.314646791963707, + "grad_norm": 0.09571804851293564, + "learning_rate": 3.807552095053783e-05, + "loss": 0.2307, + "step": 40916 + }, + { + "epoch": 3.3147278029812055, + "grad_norm": 0.06808076053857803, + "learning_rate": 3.8071020297943206e-05, + "loss": 0.2652, + "step": 40917 + }, + { + "epoch": 3.3148088139987038, + "grad_norm": 0.08558756858110428, + "learning_rate": 3.806651964534857e-05, + "loss": 0.2409, + "step": 40918 + }, + { + "epoch": 3.314889825016202, + "grad_norm": 0.0665164515376091, + "learning_rate": 3.806201899275395e-05, + "loss": 0.1753, + "step": 40919 + }, + { + "epoch": 3.3149708360337007, + "grad_norm": 0.07719418406486511, + "learning_rate": 3.8057518340159326e-05, + "loss": 0.2568, + "step": 40920 + }, + { + "epoch": 3.315051847051199, + "grad_norm": 0.0784536600112915, + "learning_rate": 3.805301768756469e-05, + "loss": 0.2139, + "step": 40921 + }, + { + "epoch": 3.315132858068697, + "grad_norm": 0.07965029031038284, + "learning_rate": 3.8048517034970074e-05, + "loss": 0.2379, + "step": 40922 + }, + { + "epoch": 3.315213869086196, + "grad_norm": 0.08881150186061859, + "learning_rate": 3.804401638237545e-05, + "loss": 0.2192, + "step": 40923 + }, + { + "epoch": 3.315294880103694, + "grad_norm": 0.0813041478395462, + 
"learning_rate": 3.803951572978082e-05, + "loss": 0.2427, + "step": 40924 + }, + { + "epoch": 3.3153758911211924, + "grad_norm": 0.06555255502462387, + "learning_rate": 3.8035015077186194e-05, + "loss": 0.2041, + "step": 40925 + }, + { + "epoch": 3.315456902138691, + "grad_norm": 0.08366473764181137, + "learning_rate": 3.803051442459157e-05, + "loss": 0.2269, + "step": 40926 + }, + { + "epoch": 3.3155379131561893, + "grad_norm": 0.08231698721647263, + "learning_rate": 3.802601377199694e-05, + "loss": 0.2259, + "step": 40927 + }, + { + "epoch": 3.3156189241736875, + "grad_norm": 0.0702444314956665, + "learning_rate": 3.8021513119402315e-05, + "loss": 0.2219, + "step": 40928 + }, + { + "epoch": 3.315699935191186, + "grad_norm": 0.07622160017490387, + "learning_rate": 3.801701246680769e-05, + "loss": 0.2459, + "step": 40929 + }, + { + "epoch": 3.3157809462086845, + "grad_norm": 0.06236666068434715, + "learning_rate": 3.801251181421306e-05, + "loss": 0.2388, + "step": 40930 + }, + { + "epoch": 3.3158619572261827, + "grad_norm": 0.07520124316215515, + "learning_rate": 3.8008011161618436e-05, + "loss": 0.2309, + "step": 40931 + }, + { + "epoch": 3.315942968243681, + "grad_norm": 0.06214138865470886, + "learning_rate": 3.800351050902381e-05, + "loss": 0.2032, + "step": 40932 + }, + { + "epoch": 3.3160239792611796, + "grad_norm": 0.08071856945753098, + "learning_rate": 3.799900985642918e-05, + "loss": 0.239, + "step": 40933 + }, + { + "epoch": 3.316104990278678, + "grad_norm": 0.06837823987007141, + "learning_rate": 3.799450920383456e-05, + "loss": 0.1834, + "step": 40934 + }, + { + "epoch": 3.316186001296176, + "grad_norm": 0.06929964572191238, + "learning_rate": 3.799000855123993e-05, + "loss": 0.2254, + "step": 40935 + }, + { + "epoch": 3.316267012313675, + "grad_norm": 0.058290932327508926, + "learning_rate": 3.7985507898645304e-05, + "loss": 0.1865, + "step": 40936 + }, + { + "epoch": 3.316348023331173, + "grad_norm": 0.06676318496465683, + "learning_rate": 3.7981007246050684e-05, + "loss": 0.2082, + "step": 40937 + }, + { + "epoch": 3.3164290343486713, + "grad_norm": 0.0737726241350174, + "learning_rate": 3.797650659345605e-05, + "loss": 0.1919, + "step": 40938 + }, + { + "epoch": 3.3165100453661696, + "grad_norm": 0.06238668039441109, + "learning_rate": 3.7972005940861425e-05, + "loss": 0.204, + "step": 40939 + }, + { + "epoch": 3.3165910563836682, + "grad_norm": 0.07773415744304657, + "learning_rate": 3.7967505288266805e-05, + "loss": 0.2373, + "step": 40940 + }, + { + "epoch": 3.3166720674011665, + "grad_norm": 0.06476813554763794, + "learning_rate": 3.796300463567217e-05, + "loss": 0.2138, + "step": 40941 + }, + { + "epoch": 3.3167530784186647, + "grad_norm": 0.06933821737766266, + "learning_rate": 3.7958503983077546e-05, + "loss": 0.2367, + "step": 40942 + }, + { + "epoch": 3.3168340894361634, + "grad_norm": 0.0789467841386795, + "learning_rate": 3.7954003330482926e-05, + "loss": 0.234, + "step": 40943 + }, + { + "epoch": 3.3169151004536617, + "grad_norm": 0.0722360908985138, + "learning_rate": 3.794950267788829e-05, + "loss": 0.2598, + "step": 40944 + }, + { + "epoch": 3.31699611147116, + "grad_norm": 0.07831466943025589, + "learning_rate": 3.7945002025293666e-05, + "loss": 0.2247, + "step": 40945 + }, + { + "epoch": 3.3170771224886586, + "grad_norm": 0.07463128119707108, + "learning_rate": 3.794050137269905e-05, + "loss": 0.2139, + "step": 40946 + }, + { + "epoch": 3.317158133506157, + "grad_norm": 0.060892872512340546, + "learning_rate": 3.7936000720104414e-05, + "loss": 0.2258, + 
"step": 40947 + }, + { + "epoch": 3.317239144523655, + "grad_norm": 0.07231955975294113, + "learning_rate": 3.793150006750979e-05, + "loss": 0.237, + "step": 40948 + }, + { + "epoch": 3.317320155541154, + "grad_norm": 0.07078664749860764, + "learning_rate": 3.792699941491517e-05, + "loss": 0.2176, + "step": 40949 + }, + { + "epoch": 3.317401166558652, + "grad_norm": 0.07049831002950668, + "learning_rate": 3.792249876232054e-05, + "loss": 0.2014, + "step": 40950 + }, + { + "epoch": 3.3174821775761503, + "grad_norm": 0.06473879516124725, + "learning_rate": 3.791799810972591e-05, + "loss": 0.2045, + "step": 40951 + }, + { + "epoch": 3.317563188593649, + "grad_norm": 0.07700739055871964, + "learning_rate": 3.791349745713129e-05, + "loss": 0.2151, + "step": 40952 + }, + { + "epoch": 3.317644199611147, + "grad_norm": 0.06077568233013153, + "learning_rate": 3.790899680453666e-05, + "loss": 0.2349, + "step": 40953 + }, + { + "epoch": 3.3177252106286454, + "grad_norm": 0.07367191463708878, + "learning_rate": 3.790449615194203e-05, + "loss": 0.1973, + "step": 40954 + }, + { + "epoch": 3.3178062216461437, + "grad_norm": 0.08809377253055573, + "learning_rate": 3.789999549934741e-05, + "loss": 0.2156, + "step": 40955 + }, + { + "epoch": 3.3178872326636424, + "grad_norm": 0.08473682403564453, + "learning_rate": 3.789549484675278e-05, + "loss": 0.242, + "step": 40956 + }, + { + "epoch": 3.3179682436811406, + "grad_norm": 0.06702679395675659, + "learning_rate": 3.789099419415815e-05, + "loss": 0.2388, + "step": 40957 + }, + { + "epoch": 3.318049254698639, + "grad_norm": 0.07082941383123398, + "learning_rate": 3.788649354156353e-05, + "loss": 0.2484, + "step": 40958 + }, + { + "epoch": 3.3181302657161376, + "grad_norm": 0.06634097546339035, + "learning_rate": 3.7881992888968904e-05, + "loss": 0.2554, + "step": 40959 + }, + { + "epoch": 3.318211276733636, + "grad_norm": 0.08151954412460327, + "learning_rate": 3.787749223637427e-05, + "loss": 0.2588, + "step": 40960 + }, + { + "epoch": 3.318292287751134, + "grad_norm": 0.06060457602143288, + "learning_rate": 3.787299158377965e-05, + "loss": 0.2046, + "step": 40961 + }, + { + "epoch": 3.3183732987686323, + "grad_norm": 0.07713788747787476, + "learning_rate": 3.7868490931185024e-05, + "loss": 0.2384, + "step": 40962 + }, + { + "epoch": 3.318454309786131, + "grad_norm": 0.05473591759800911, + "learning_rate": 3.78639902785904e-05, + "loss": 0.1844, + "step": 40963 + }, + { + "epoch": 3.3185353208036292, + "grad_norm": 0.06427813321352005, + "learning_rate": 3.785948962599577e-05, + "loss": 0.1996, + "step": 40964 + }, + { + "epoch": 3.3186163318211275, + "grad_norm": 0.05841223523020744, + "learning_rate": 3.7854988973401145e-05, + "loss": 0.2162, + "step": 40965 + }, + { + "epoch": 3.318697342838626, + "grad_norm": 0.07398484647274017, + "learning_rate": 3.785048832080652e-05, + "loss": 0.2155, + "step": 40966 + }, + { + "epoch": 3.3187783538561244, + "grad_norm": 0.07369926571846008, + "learning_rate": 3.784598766821189e-05, + "loss": 0.2326, + "step": 40967 + }, + { + "epoch": 3.3188593648736227, + "grad_norm": 0.05897458642721176, + "learning_rate": 3.7841487015617266e-05, + "loss": 0.1878, + "step": 40968 + }, + { + "epoch": 3.3189403758911213, + "grad_norm": 0.08481781929731369, + "learning_rate": 3.783698636302264e-05, + "loss": 0.2356, + "step": 40969 + }, + { + "epoch": 3.3190213869086196, + "grad_norm": 0.07468573749065399, + "learning_rate": 3.783248571042801e-05, + "loss": 0.2138, + "step": 40970 + }, + { + "epoch": 3.319102397926118, + "grad_norm": 
0.07561396807432175, + "learning_rate": 3.782798505783339e-05, + "loss": 0.2199, + "step": 40971 + }, + { + "epoch": 3.3191834089436165, + "grad_norm": 0.08415020257234573, + "learning_rate": 3.782348440523876e-05, + "loss": 0.2468, + "step": 40972 + }, + { + "epoch": 3.3192644199611148, + "grad_norm": 0.06797681003808975, + "learning_rate": 3.7818983752644134e-05, + "loss": 0.2558, + "step": 40973 + }, + { + "epoch": 3.319345430978613, + "grad_norm": 0.06277844309806824, + "learning_rate": 3.781448310004951e-05, + "loss": 0.2838, + "step": 40974 + }, + { + "epoch": 3.3194264419961117, + "grad_norm": 0.06896187365055084, + "learning_rate": 3.780998244745488e-05, + "loss": 0.2197, + "step": 40975 + }, + { + "epoch": 3.31950745301361, + "grad_norm": 0.06196269765496254, + "learning_rate": 3.780548179486026e-05, + "loss": 0.24, + "step": 40976 + }, + { + "epoch": 3.319588464031108, + "grad_norm": 0.056846845895051956, + "learning_rate": 3.780098114226563e-05, + "loss": 0.2185, + "step": 40977 + }, + { + "epoch": 3.3196694750486064, + "grad_norm": 0.05916735529899597, + "learning_rate": 3.7796480489671e-05, + "loss": 0.2281, + "step": 40978 + }, + { + "epoch": 3.319750486066105, + "grad_norm": 0.08651582896709442, + "learning_rate": 3.779197983707638e-05, + "loss": 0.2281, + "step": 40979 + }, + { + "epoch": 3.3198314970836034, + "grad_norm": 0.07499776780605316, + "learning_rate": 3.778747918448175e-05, + "loss": 0.2055, + "step": 40980 + }, + { + "epoch": 3.3199125081011016, + "grad_norm": 0.08028426021337509, + "learning_rate": 3.778297853188712e-05, + "loss": 0.2084, + "step": 40981 + }, + { + "epoch": 3.3199935191186003, + "grad_norm": 0.06728916615247726, + "learning_rate": 3.77784778792925e-05, + "loss": 0.2382, + "step": 40982 + }, + { + "epoch": 3.3200745301360985, + "grad_norm": 0.07473582774400711, + "learning_rate": 3.777397722669787e-05, + "loss": 0.2539, + "step": 40983 + }, + { + "epoch": 3.320155541153597, + "grad_norm": 0.06037028506398201, + "learning_rate": 3.7769476574103244e-05, + "loss": 0.2172, + "step": 40984 + }, + { + "epoch": 3.320236552171095, + "grad_norm": 0.06607451289892197, + "learning_rate": 3.7764975921508624e-05, + "loss": 0.2263, + "step": 40985 + }, + { + "epoch": 3.3203175631885937, + "grad_norm": 0.08410777896642685, + "learning_rate": 3.776047526891399e-05, + "loss": 0.2462, + "step": 40986 + }, + { + "epoch": 3.320398574206092, + "grad_norm": 0.06139102205634117, + "learning_rate": 3.7755974616319365e-05, + "loss": 0.2257, + "step": 40987 + }, + { + "epoch": 3.32047958522359, + "grad_norm": 0.06923295557498932, + "learning_rate": 3.7751473963724745e-05, + "loss": 0.2722, + "step": 40988 + }, + { + "epoch": 3.320560596241089, + "grad_norm": 0.06262877583503723, + "learning_rate": 3.774697331113012e-05, + "loss": 0.2082, + "step": 40989 + }, + { + "epoch": 3.320641607258587, + "grad_norm": 0.06936154514551163, + "learning_rate": 3.7742472658535485e-05, + "loss": 0.2164, + "step": 40990 + }, + { + "epoch": 3.3207226182760854, + "grad_norm": 0.06605419516563416, + "learning_rate": 3.7737972005940866e-05, + "loss": 0.2194, + "step": 40991 + }, + { + "epoch": 3.320803629293584, + "grad_norm": 0.07005484402179718, + "learning_rate": 3.773347135334624e-05, + "loss": 0.2101, + "step": 40992 + }, + { + "epoch": 3.3208846403110823, + "grad_norm": 0.07076304405927658, + "learning_rate": 3.7728970700751606e-05, + "loss": 0.2131, + "step": 40993 + }, + { + "epoch": 3.3209656513285806, + "grad_norm": 0.08401350677013397, + "learning_rate": 3.7724470048156987e-05, + 
"loss": 0.2526, + "step": 40994 + }, + { + "epoch": 3.3210466623460793, + "grad_norm": 0.07341130822896957, + "learning_rate": 3.771996939556236e-05, + "loss": 0.2047, + "step": 40995 + }, + { + "epoch": 3.3211276733635775, + "grad_norm": 0.061423882842063904, + "learning_rate": 3.771546874296773e-05, + "loss": 0.1962, + "step": 40996 + }, + { + "epoch": 3.3212086843810757, + "grad_norm": 0.06588398665189743, + "learning_rate": 3.771096809037311e-05, + "loss": 0.1883, + "step": 40997 + }, + { + "epoch": 3.3212896953985744, + "grad_norm": 0.07813892513513565, + "learning_rate": 3.770646743777848e-05, + "loss": 0.2181, + "step": 40998 + }, + { + "epoch": 3.3213707064160727, + "grad_norm": 0.08592596650123596, + "learning_rate": 3.770196678518385e-05, + "loss": 0.2369, + "step": 40999 + }, + { + "epoch": 3.321451717433571, + "grad_norm": 0.06253305077552795, + "learning_rate": 3.769746613258923e-05, + "loss": 0.2143, + "step": 41000 + }, + { + "epoch": 3.321532728451069, + "grad_norm": 0.07199438661336899, + "learning_rate": 3.76929654799946e-05, + "loss": 0.191, + "step": 41001 + }, + { + "epoch": 3.321613739468568, + "grad_norm": 0.08037972450256348, + "learning_rate": 3.7688464827399975e-05, + "loss": 0.2358, + "step": 41002 + }, + { + "epoch": 3.321694750486066, + "grad_norm": 0.07553397119045258, + "learning_rate": 3.768396417480535e-05, + "loss": 0.2193, + "step": 41003 + }, + { + "epoch": 3.3217757615035644, + "grad_norm": 0.07229546457529068, + "learning_rate": 3.767946352221072e-05, + "loss": 0.2347, + "step": 41004 + }, + { + "epoch": 3.321856772521063, + "grad_norm": 0.06054199859499931, + "learning_rate": 3.7674962869616096e-05, + "loss": 0.2467, + "step": 41005 + }, + { + "epoch": 3.3219377835385613, + "grad_norm": 0.0800226703286171, + "learning_rate": 3.767046221702147e-05, + "loss": 0.2404, + "step": 41006 + }, + { + "epoch": 3.3220187945560595, + "grad_norm": 0.06743893027305603, + "learning_rate": 3.7665961564426843e-05, + "loss": 0.2017, + "step": 41007 + }, + { + "epoch": 3.3220998055735578, + "grad_norm": 0.07700297236442566, + "learning_rate": 3.766146091183222e-05, + "loss": 0.2492, + "step": 41008 + }, + { + "epoch": 3.3221808165910565, + "grad_norm": 0.07854170352220535, + "learning_rate": 3.765696025923759e-05, + "loss": 0.219, + "step": 41009 + }, + { + "epoch": 3.3222618276085547, + "grad_norm": 0.07884832471609116, + "learning_rate": 3.7652459606642964e-05, + "loss": 0.2217, + "step": 41010 + }, + { + "epoch": 3.322342838626053, + "grad_norm": 0.07336324453353882, + "learning_rate": 3.764795895404834e-05, + "loss": 0.2226, + "step": 41011 + }, + { + "epoch": 3.3224238496435516, + "grad_norm": 0.065107062458992, + "learning_rate": 3.764345830145371e-05, + "loss": 0.1805, + "step": 41012 + }, + { + "epoch": 3.32250486066105, + "grad_norm": 0.06360862404108047, + "learning_rate": 3.7638957648859085e-05, + "loss": 0.2039, + "step": 41013 + }, + { + "epoch": 3.322585871678548, + "grad_norm": 0.08214043825864792, + "learning_rate": 3.763445699626446e-05, + "loss": 0.2392, + "step": 41014 + }, + { + "epoch": 3.322666882696047, + "grad_norm": 0.07553614675998688, + "learning_rate": 3.762995634366984e-05, + "loss": 0.2276, + "step": 41015 + }, + { + "epoch": 3.322747893713545, + "grad_norm": 0.06392810493707657, + "learning_rate": 3.7625455691075206e-05, + "loss": 0.2248, + "step": 41016 + }, + { + "epoch": 3.3228289047310433, + "grad_norm": 0.06898233294487, + "learning_rate": 3.762095503848058e-05, + "loss": 0.239, + "step": 41017 + }, + { + "epoch": 3.322909915748542, + 
"grad_norm": 0.07211174070835114, + "learning_rate": 3.761645438588596e-05, + "loss": 0.1974, + "step": 41018 + }, + { + "epoch": 3.3229909267660402, + "grad_norm": 0.07649188488721848, + "learning_rate": 3.761195373329133e-05, + "loss": 0.2391, + "step": 41019 + }, + { + "epoch": 3.3230719377835385, + "grad_norm": 0.06290451437234879, + "learning_rate": 3.76074530806967e-05, + "loss": 0.2278, + "step": 41020 + }, + { + "epoch": 3.323152948801037, + "grad_norm": 0.07498381286859512, + "learning_rate": 3.760295242810208e-05, + "loss": 0.2493, + "step": 41021 + }, + { + "epoch": 3.3232339598185354, + "grad_norm": 0.06386851519346237, + "learning_rate": 3.759845177550745e-05, + "loss": 0.2204, + "step": 41022 + }, + { + "epoch": 3.3233149708360337, + "grad_norm": 0.08008598536252975, + "learning_rate": 3.759395112291282e-05, + "loss": 0.2542, + "step": 41023 + }, + { + "epoch": 3.323395981853532, + "grad_norm": 0.0818173736333847, + "learning_rate": 3.75894504703182e-05, + "loss": 0.2217, + "step": 41024 + }, + { + "epoch": 3.3234769928710306, + "grad_norm": 0.07507924735546112, + "learning_rate": 3.758494981772357e-05, + "loss": 0.2582, + "step": 41025 + }, + { + "epoch": 3.323558003888529, + "grad_norm": 0.08730128407478333, + "learning_rate": 3.758044916512894e-05, + "loss": 0.2248, + "step": 41026 + }, + { + "epoch": 3.323639014906027, + "grad_norm": 0.07072755694389343, + "learning_rate": 3.757594851253432e-05, + "loss": 0.2268, + "step": 41027 + }, + { + "epoch": 3.323720025923526, + "grad_norm": 0.08612517267465591, + "learning_rate": 3.7571447859939696e-05, + "loss": 0.2493, + "step": 41028 + }, + { + "epoch": 3.323801036941024, + "grad_norm": 0.09255276620388031, + "learning_rate": 3.756694720734506e-05, + "loss": 0.2155, + "step": 41029 + }, + { + "epoch": 3.3238820479585223, + "grad_norm": 0.07072529196739197, + "learning_rate": 3.756244655475044e-05, + "loss": 0.2385, + "step": 41030 + }, + { + "epoch": 3.3239630589760205, + "grad_norm": 0.05725023150444031, + "learning_rate": 3.755794590215582e-05, + "loss": 0.217, + "step": 41031 + }, + { + "epoch": 3.324044069993519, + "grad_norm": 0.07195749878883362, + "learning_rate": 3.7553445249561183e-05, + "loss": 0.2235, + "step": 41032 + }, + { + "epoch": 3.3241250810110174, + "grad_norm": 0.08406466245651245, + "learning_rate": 3.7548944596966564e-05, + "loss": 0.2072, + "step": 41033 + }, + { + "epoch": 3.3242060920285157, + "grad_norm": 0.07305304706096649, + "learning_rate": 3.754444394437194e-05, + "loss": 0.2314, + "step": 41034 + }, + { + "epoch": 3.3242871030460144, + "grad_norm": 0.07687455415725708, + "learning_rate": 3.7539943291777304e-05, + "loss": 0.2263, + "step": 41035 + }, + { + "epoch": 3.3243681140635126, + "grad_norm": 0.06038808450102806, + "learning_rate": 3.7535442639182685e-05, + "loss": 0.1994, + "step": 41036 + }, + { + "epoch": 3.324449125081011, + "grad_norm": 0.07261287420988083, + "learning_rate": 3.753094198658806e-05, + "loss": 0.2009, + "step": 41037 + }, + { + "epoch": 3.3245301360985096, + "grad_norm": 0.07470063120126724, + "learning_rate": 3.7526441333993425e-05, + "loss": 0.233, + "step": 41038 + }, + { + "epoch": 3.324611147116008, + "grad_norm": 0.15417388081550598, + "learning_rate": 3.7521940681398805e-05, + "loss": 0.2311, + "step": 41039 + }, + { + "epoch": 3.324692158133506, + "grad_norm": 0.07478626072406769, + "learning_rate": 3.751744002880418e-05, + "loss": 0.2175, + "step": 41040 + }, + { + "epoch": 3.3247731691510047, + "grad_norm": 0.06912209838628769, + "learning_rate": 
3.751293937620955e-05, + "loss": 0.2272, + "step": 41041 + }, + { + "epoch": 3.324854180168503, + "grad_norm": 0.06295725703239441, + "learning_rate": 3.7508438723614926e-05, + "loss": 0.2332, + "step": 41042 + }, + { + "epoch": 3.3249351911860012, + "grad_norm": 0.0667472630739212, + "learning_rate": 3.75039380710203e-05, + "loss": 0.2394, + "step": 41043 + }, + { + "epoch": 3.3250162022035, + "grad_norm": 0.06752104312181473, + "learning_rate": 3.7499437418425673e-05, + "loss": 0.2683, + "step": 41044 + }, + { + "epoch": 3.325097213220998, + "grad_norm": 0.06008462980389595, + "learning_rate": 3.749493676583105e-05, + "loss": 0.2224, + "step": 41045 + }, + { + "epoch": 3.3251782242384964, + "grad_norm": 0.061989929527044296, + "learning_rate": 3.749043611323642e-05, + "loss": 0.2097, + "step": 41046 + }, + { + "epoch": 3.3252592352559946, + "grad_norm": 0.0873345360159874, + "learning_rate": 3.7485935460641794e-05, + "loss": 0.2514, + "step": 41047 + }, + { + "epoch": 3.3253402462734933, + "grad_norm": 0.059997983276844025, + "learning_rate": 3.748143480804717e-05, + "loss": 0.2436, + "step": 41048 + }, + { + "epoch": 3.3254212572909916, + "grad_norm": 0.08212314546108246, + "learning_rate": 3.747693415545254e-05, + "loss": 0.2218, + "step": 41049 + }, + { + "epoch": 3.32550226830849, + "grad_norm": 0.0879613533616066, + "learning_rate": 3.7472433502857915e-05, + "loss": 0.2378, + "step": 41050 + }, + { + "epoch": 3.3255832793259885, + "grad_norm": 0.06860115379095078, + "learning_rate": 3.746793285026329e-05, + "loss": 0.2199, + "step": 41051 + }, + { + "epoch": 3.3256642903434868, + "grad_norm": 0.10232620686292648, + "learning_rate": 3.746343219766866e-05, + "loss": 0.2238, + "step": 41052 + }, + { + "epoch": 3.325745301360985, + "grad_norm": 0.06840762495994568, + "learning_rate": 3.7458931545074036e-05, + "loss": 0.2312, + "step": 41053 + }, + { + "epoch": 3.3258263123784833, + "grad_norm": 0.07458966225385666, + "learning_rate": 3.7454430892479416e-05, + "loss": 0.2178, + "step": 41054 + }, + { + "epoch": 3.325907323395982, + "grad_norm": 0.07784730195999146, + "learning_rate": 3.744993023988478e-05, + "loss": 0.2165, + "step": 41055 + }, + { + "epoch": 3.32598833441348, + "grad_norm": 0.07316484302282333, + "learning_rate": 3.744542958729016e-05, + "loss": 0.206, + "step": 41056 + }, + { + "epoch": 3.3260693454309784, + "grad_norm": 0.06419821083545685, + "learning_rate": 3.744092893469554e-05, + "loss": 0.2106, + "step": 41057 + }, + { + "epoch": 3.326150356448477, + "grad_norm": 0.07934359461069107, + "learning_rate": 3.7436428282100904e-05, + "loss": 0.2298, + "step": 41058 + }, + { + "epoch": 3.3262313674659754, + "grad_norm": 0.07370511442422867, + "learning_rate": 3.743192762950628e-05, + "loss": 0.2486, + "step": 41059 + }, + { + "epoch": 3.3263123784834736, + "grad_norm": 0.0726943090558052, + "learning_rate": 3.742742697691166e-05, + "loss": 0.1943, + "step": 41060 + }, + { + "epoch": 3.3263933895009723, + "grad_norm": 0.0706993043422699, + "learning_rate": 3.7422926324317025e-05, + "loss": 0.2054, + "step": 41061 + }, + { + "epoch": 3.3264744005184705, + "grad_norm": 0.08410005271434784, + "learning_rate": 3.74184256717224e-05, + "loss": 0.225, + "step": 41062 + }, + { + "epoch": 3.326555411535969, + "grad_norm": 0.06017423793673515, + "learning_rate": 3.741392501912778e-05, + "loss": 0.2526, + "step": 41063 + }, + { + "epoch": 3.3266364225534675, + "grad_norm": 0.07366826385259628, + "learning_rate": 3.7409424366533146e-05, + "loss": 0.2335, + "step": 41064 + }, + { + 
"epoch": 3.3267174335709657, + "grad_norm": 0.06251514703035355, + "learning_rate": 3.740492371393852e-05, + "loss": 0.2121, + "step": 41065 + }, + { + "epoch": 3.326798444588464, + "grad_norm": 0.10737292468547821, + "learning_rate": 3.74004230613439e-05, + "loss": 0.2723, + "step": 41066 + }, + { + "epoch": 3.3268794556059627, + "grad_norm": 0.07564511150121689, + "learning_rate": 3.7395922408749266e-05, + "loss": 0.2048, + "step": 41067 + }, + { + "epoch": 3.326960466623461, + "grad_norm": 0.06839878857135773, + "learning_rate": 3.739142175615464e-05, + "loss": 0.1951, + "step": 41068 + }, + { + "epoch": 3.327041477640959, + "grad_norm": 0.06925508379936218, + "learning_rate": 3.738692110356002e-05, + "loss": 0.2168, + "step": 41069 + }, + { + "epoch": 3.3271224886584574, + "grad_norm": 0.08146601915359497, + "learning_rate": 3.7382420450965394e-05, + "loss": 0.2619, + "step": 41070 + }, + { + "epoch": 3.327203499675956, + "grad_norm": 0.08915166556835175, + "learning_rate": 3.737791979837076e-05, + "loss": 0.2065, + "step": 41071 + }, + { + "epoch": 3.3272845106934543, + "grad_norm": 0.0576382540166378, + "learning_rate": 3.737341914577614e-05, + "loss": 0.1842, + "step": 41072 + }, + { + "epoch": 3.3273655217109526, + "grad_norm": 0.07133428007364273, + "learning_rate": 3.7368918493181515e-05, + "loss": 0.2147, + "step": 41073 + }, + { + "epoch": 3.3274465327284513, + "grad_norm": 0.07418784499168396, + "learning_rate": 3.736441784058689e-05, + "loss": 0.2181, + "step": 41074 + }, + { + "epoch": 3.3275275437459495, + "grad_norm": 0.08585530519485474, + "learning_rate": 3.735991718799226e-05, + "loss": 0.2321, + "step": 41075 + }, + { + "epoch": 3.3276085547634477, + "grad_norm": 0.08041617274284363, + "learning_rate": 3.7355416535397636e-05, + "loss": 0.2512, + "step": 41076 + }, + { + "epoch": 3.327689565780946, + "grad_norm": 0.0663224533200264, + "learning_rate": 3.735091588280301e-05, + "loss": 0.2015, + "step": 41077 + }, + { + "epoch": 3.3277705767984447, + "grad_norm": 0.05717449635267258, + "learning_rate": 3.734641523020838e-05, + "loss": 0.1653, + "step": 41078 + }, + { + "epoch": 3.327851587815943, + "grad_norm": 0.071390800178051, + "learning_rate": 3.7341914577613756e-05, + "loss": 0.2119, + "step": 41079 + }, + { + "epoch": 3.327932598833441, + "grad_norm": 0.060663558542728424, + "learning_rate": 3.733741392501913e-05, + "loss": 0.2232, + "step": 41080 + }, + { + "epoch": 3.32801360985094, + "grad_norm": 0.05562737211585045, + "learning_rate": 3.7332913272424504e-05, + "loss": 0.2543, + "step": 41081 + }, + { + "epoch": 3.328094620868438, + "grad_norm": 0.07524958997964859, + "learning_rate": 3.732841261982988e-05, + "loss": 0.2296, + "step": 41082 + }, + { + "epoch": 3.3281756318859363, + "grad_norm": 0.09169891476631165, + "learning_rate": 3.732391196723525e-05, + "loss": 0.2707, + "step": 41083 + }, + { + "epoch": 3.328256642903435, + "grad_norm": 0.07389675825834274, + "learning_rate": 3.7319411314640624e-05, + "loss": 0.2375, + "step": 41084 + }, + { + "epoch": 3.3283376539209333, + "grad_norm": 0.08331091701984406, + "learning_rate": 3.7314910662046e-05, + "loss": 0.2336, + "step": 41085 + }, + { + "epoch": 3.3284186649384315, + "grad_norm": 0.07018539309501648, + "learning_rate": 3.731041000945137e-05, + "loss": 0.1999, + "step": 41086 + }, + { + "epoch": 3.32849967595593, + "grad_norm": 0.06626079976558685, + "learning_rate": 3.7305909356856745e-05, + "loss": 0.2342, + "step": 41087 + }, + { + "epoch": 3.3285806869734285, + "grad_norm": 0.07911474257707596, + 
"learning_rate": 3.730140870426212e-05, + "loss": 0.2656, + "step": 41088 + }, + { + "epoch": 3.3286616979909267, + "grad_norm": 0.07544355094432831, + "learning_rate": 3.729690805166749e-05, + "loss": 0.2379, + "step": 41089 + }, + { + "epoch": 3.3287427090084254, + "grad_norm": 0.07290852069854736, + "learning_rate": 3.7292407399072866e-05, + "loss": 0.1906, + "step": 41090 + }, + { + "epoch": 3.3288237200259236, + "grad_norm": 0.07008974999189377, + "learning_rate": 3.728790674647824e-05, + "loss": 0.2076, + "step": 41091 + }, + { + "epoch": 3.328904731043422, + "grad_norm": 0.07178022712469101, + "learning_rate": 3.728340609388361e-05, + "loss": 0.2148, + "step": 41092 + }, + { + "epoch": 3.32898574206092, + "grad_norm": 0.06797996908426285, + "learning_rate": 3.727890544128899e-05, + "loss": 0.2439, + "step": 41093 + }, + { + "epoch": 3.329066753078419, + "grad_norm": 0.06787247955799103, + "learning_rate": 3.727440478869436e-05, + "loss": 0.2219, + "step": 41094 + }, + { + "epoch": 3.329147764095917, + "grad_norm": 0.08076586574316025, + "learning_rate": 3.7269904136099734e-05, + "loss": 0.2366, + "step": 41095 + }, + { + "epoch": 3.3292287751134153, + "grad_norm": 0.0767500177025795, + "learning_rate": 3.7265403483505114e-05, + "loss": 0.2551, + "step": 41096 + }, + { + "epoch": 3.329309786130914, + "grad_norm": 0.09102772176265717, + "learning_rate": 3.726090283091048e-05, + "loss": 0.251, + "step": 41097 + }, + { + "epoch": 3.3293907971484122, + "grad_norm": 0.09674298018217087, + "learning_rate": 3.7256402178315855e-05, + "loss": 0.27, + "step": 41098 + }, + { + "epoch": 3.3294718081659105, + "grad_norm": 0.06335968524217606, + "learning_rate": 3.7251901525721235e-05, + "loss": 0.2358, + "step": 41099 + }, + { + "epoch": 3.3295528191834087, + "grad_norm": 0.07425528764724731, + "learning_rate": 3.72474008731266e-05, + "loss": 0.229, + "step": 41100 + }, + { + "epoch": 3.3296338302009074, + "grad_norm": 0.07559535652399063, + "learning_rate": 3.7242900220531976e-05, + "loss": 0.3009, + "step": 41101 + }, + { + "epoch": 3.3297148412184057, + "grad_norm": 0.06772898137569427, + "learning_rate": 3.7238399567937356e-05, + "loss": 0.2082, + "step": 41102 + }, + { + "epoch": 3.329795852235904, + "grad_norm": 0.07053738087415695, + "learning_rate": 3.723389891534272e-05, + "loss": 0.2121, + "step": 41103 + }, + { + "epoch": 3.3298768632534026, + "grad_norm": 0.09082667529582977, + "learning_rate": 3.7229398262748096e-05, + "loss": 0.2083, + "step": 41104 + }, + { + "epoch": 3.329957874270901, + "grad_norm": 0.0763431042432785, + "learning_rate": 3.722489761015348e-05, + "loss": 0.2415, + "step": 41105 + }, + { + "epoch": 3.330038885288399, + "grad_norm": 0.08213107287883759, + "learning_rate": 3.7220396957558844e-05, + "loss": 0.2301, + "step": 41106 + }, + { + "epoch": 3.3301198963058978, + "grad_norm": 0.06864434480667114, + "learning_rate": 3.721589630496422e-05, + "loss": 0.2031, + "step": 41107 + }, + { + "epoch": 3.330200907323396, + "grad_norm": 0.06650760769844055, + "learning_rate": 3.72113956523696e-05, + "loss": 0.2658, + "step": 41108 + }, + { + "epoch": 3.3302819183408943, + "grad_norm": 0.06854722648859024, + "learning_rate": 3.720689499977497e-05, + "loss": 0.2103, + "step": 41109 + }, + { + "epoch": 3.330362929358393, + "grad_norm": 0.06579779833555222, + "learning_rate": 3.7202394347180345e-05, + "loss": 0.2229, + "step": 41110 + }, + { + "epoch": 3.330443940375891, + "grad_norm": 0.08219362795352936, + "learning_rate": 3.719789369458572e-05, + "loss": 0.2209, + "step": 
41111 + }, + { + "epoch": 3.3305249513933894, + "grad_norm": 0.07643623650074005, + "learning_rate": 3.719339304199109e-05, + "loss": 0.2432, + "step": 41112 + }, + { + "epoch": 3.3306059624108877, + "grad_norm": 0.07173693925142288, + "learning_rate": 3.7188892389396466e-05, + "loss": 0.2432, + "step": 41113 + }, + { + "epoch": 3.3306869734283864, + "grad_norm": 0.07207030057907104, + "learning_rate": 3.718439173680184e-05, + "loss": 0.2274, + "step": 41114 + }, + { + "epoch": 3.3307679844458846, + "grad_norm": 0.06946907937526703, + "learning_rate": 3.717989108420721e-05, + "loss": 0.2349, + "step": 41115 + }, + { + "epoch": 3.330848995463383, + "grad_norm": 0.06256712228059769, + "learning_rate": 3.7175390431612586e-05, + "loss": 0.1854, + "step": 41116 + }, + { + "epoch": 3.3309300064808816, + "grad_norm": 0.06501569598913193, + "learning_rate": 3.717088977901796e-05, + "loss": 0.2151, + "step": 41117 + }, + { + "epoch": 3.33101101749838, + "grad_norm": 0.07814828306436539, + "learning_rate": 3.7166389126423334e-05, + "loss": 0.2147, + "step": 41118 + }, + { + "epoch": 3.331092028515878, + "grad_norm": 0.07279353588819504, + "learning_rate": 3.716188847382871e-05, + "loss": 0.2258, + "step": 41119 + }, + { + "epoch": 3.3311730395333763, + "grad_norm": 0.07200466841459274, + "learning_rate": 3.715738782123408e-05, + "loss": 0.2295, + "step": 41120 + }, + { + "epoch": 3.331254050550875, + "grad_norm": 0.08193326741456985, + "learning_rate": 3.7152887168639455e-05, + "loss": 0.2487, + "step": 41121 + }, + { + "epoch": 3.3313350615683732, + "grad_norm": 0.0763644427061081, + "learning_rate": 3.714838651604483e-05, + "loss": 0.2199, + "step": 41122 + }, + { + "epoch": 3.3314160725858715, + "grad_norm": 0.0799073725938797, + "learning_rate": 3.71438858634502e-05, + "loss": 0.2549, + "step": 41123 + }, + { + "epoch": 3.33149708360337, + "grad_norm": 0.08127576112747192, + "learning_rate": 3.7139385210855575e-05, + "loss": 0.225, + "step": 41124 + }, + { + "epoch": 3.3315780946208684, + "grad_norm": 0.06715318560600281, + "learning_rate": 3.713488455826095e-05, + "loss": 0.2203, + "step": 41125 + }, + { + "epoch": 3.3316591056383666, + "grad_norm": 0.06862284243106842, + "learning_rate": 3.713038390566632e-05, + "loss": 0.1744, + "step": 41126 + }, + { + "epoch": 3.3317401166558653, + "grad_norm": 0.07042937725782394, + "learning_rate": 3.7125883253071696e-05, + "loss": 0.222, + "step": 41127 + }, + { + "epoch": 3.3318211276733636, + "grad_norm": 0.07680152356624603, + "learning_rate": 3.712138260047707e-05, + "loss": 0.2143, + "step": 41128 + }, + { + "epoch": 3.331902138690862, + "grad_norm": 0.06801113486289978, + "learning_rate": 3.711688194788244e-05, + "loss": 0.2006, + "step": 41129 + }, + { + "epoch": 3.3319831497083605, + "grad_norm": 0.07561274617910385, + "learning_rate": 3.711238129528782e-05, + "loss": 0.2103, + "step": 41130 + }, + { + "epoch": 3.3320641607258588, + "grad_norm": 0.08124644309282303, + "learning_rate": 3.710788064269319e-05, + "loss": 0.2161, + "step": 41131 + }, + { + "epoch": 3.332145171743357, + "grad_norm": 0.06947558373212814, + "learning_rate": 3.7103379990098564e-05, + "loss": 0.2009, + "step": 41132 + }, + { + "epoch": 3.3322261827608557, + "grad_norm": 0.06849096715450287, + "learning_rate": 3.709887933750394e-05, + "loss": 0.2314, + "step": 41133 + }, + { + "epoch": 3.332307193778354, + "grad_norm": 0.08039727807044983, + "learning_rate": 3.709437868490931e-05, + "loss": 0.2216, + "step": 41134 + }, + { + "epoch": 3.332388204795852, + "grad_norm": 
0.06369374692440033, + "learning_rate": 3.708987803231469e-05, + "loss": 0.2547, + "step": 41135 + }, + { + "epoch": 3.3324692158133504, + "grad_norm": 0.0763283222913742, + "learning_rate": 3.708537737972006e-05, + "loss": 0.2719, + "step": 41136 + }, + { + "epoch": 3.332550226830849, + "grad_norm": 0.0758407711982727, + "learning_rate": 3.708087672712543e-05, + "loss": 0.2192, + "step": 41137 + }, + { + "epoch": 3.3326312378483474, + "grad_norm": 0.0780859962105751, + "learning_rate": 3.707637607453081e-05, + "loss": 0.2424, + "step": 41138 + }, + { + "epoch": 3.3327122488658456, + "grad_norm": 0.06562550365924835, + "learning_rate": 3.707187542193618e-05, + "loss": 0.2204, + "step": 41139 + }, + { + "epoch": 3.3327932598833443, + "grad_norm": 0.07291289418935776, + "learning_rate": 3.706737476934155e-05, + "loss": 0.1928, + "step": 41140 + }, + { + "epoch": 3.3328742709008425, + "grad_norm": 0.07883249223232269, + "learning_rate": 3.706287411674693e-05, + "loss": 0.2355, + "step": 41141 + }, + { + "epoch": 3.332955281918341, + "grad_norm": 0.07805757224559784, + "learning_rate": 3.70583734641523e-05, + "loss": 0.1924, + "step": 41142 + }, + { + "epoch": 3.333036292935839, + "grad_norm": 0.0664428249001503, + "learning_rate": 3.705387281155768e-05, + "loss": 0.2141, + "step": 41143 + }, + { + "epoch": 3.3331173039533377, + "grad_norm": 0.07167979329824448, + "learning_rate": 3.7049372158963054e-05, + "loss": 0.2545, + "step": 41144 + }, + { + "epoch": 3.333198314970836, + "grad_norm": 0.06236296892166138, + "learning_rate": 3.704487150636842e-05, + "loss": 0.1999, + "step": 41145 + }, + { + "epoch": 3.333279325988334, + "grad_norm": 0.054489415138959885, + "learning_rate": 3.70403708537738e-05, + "loss": 0.2022, + "step": 41146 + }, + { + "epoch": 3.333360337005833, + "grad_norm": 0.07027623802423477, + "learning_rate": 3.7035870201179175e-05, + "loss": 0.2133, + "step": 41147 + }, + { + "epoch": 3.333441348023331, + "grad_norm": 0.06870032846927643, + "learning_rate": 3.703136954858455e-05, + "loss": 0.2308, + "step": 41148 + }, + { + "epoch": 3.3335223590408294, + "grad_norm": 0.06924640387296677, + "learning_rate": 3.702686889598992e-05, + "loss": 0.1967, + "step": 41149 + }, + { + "epoch": 3.333603370058328, + "grad_norm": 0.07043696194887161, + "learning_rate": 3.7022368243395296e-05, + "loss": 0.2301, + "step": 41150 + }, + { + "epoch": 3.3336843810758263, + "grad_norm": 0.07276766002178192, + "learning_rate": 3.701786759080067e-05, + "loss": 0.2369, + "step": 41151 + }, + { + "epoch": 3.3337653920933246, + "grad_norm": 0.09626363217830658, + "learning_rate": 3.701336693820604e-05, + "loss": 0.2637, + "step": 41152 + }, + { + "epoch": 3.3338464031108233, + "grad_norm": 0.09370481967926025, + "learning_rate": 3.7008866285611417e-05, + "loss": 0.2779, + "step": 41153 + }, + { + "epoch": 3.3339274141283215, + "grad_norm": 0.09347948431968689, + "learning_rate": 3.700436563301679e-05, + "loss": 0.2279, + "step": 41154 + }, + { + "epoch": 3.3340084251458197, + "grad_norm": 0.0766562893986702, + "learning_rate": 3.6999864980422164e-05, + "loss": 0.2301, + "step": 41155 + }, + { + "epoch": 3.3340894361633184, + "grad_norm": 0.08153241127729416, + "learning_rate": 3.699536432782754e-05, + "loss": 0.223, + "step": 41156 + }, + { + "epoch": 3.3341704471808167, + "grad_norm": 0.08658970892429352, + "learning_rate": 3.699086367523291e-05, + "loss": 0.2521, + "step": 41157 + }, + { + "epoch": 3.334251458198315, + "grad_norm": 0.07518699765205383, + "learning_rate": 3.6986363022638285e-05, + 
"loss": 0.2222, + "step": 41158 + }, + { + "epoch": 3.334332469215813, + "grad_norm": 0.07649372518062592, + "learning_rate": 3.698186237004366e-05, + "loss": 0.2444, + "step": 41159 + }, + { + "epoch": 3.334413480233312, + "grad_norm": 0.07010925561189651, + "learning_rate": 3.697736171744903e-05, + "loss": 0.2491, + "step": 41160 + }, + { + "epoch": 3.33449449125081, + "grad_norm": 0.06980877369642258, + "learning_rate": 3.6972861064854405e-05, + "loss": 0.2384, + "step": 41161 + }, + { + "epoch": 3.3345755022683083, + "grad_norm": 0.06655539572238922, + "learning_rate": 3.696836041225978e-05, + "loss": 0.1995, + "step": 41162 + }, + { + "epoch": 3.334656513285807, + "grad_norm": 0.09244280308485031, + "learning_rate": 3.696385975966515e-05, + "loss": 0.3013, + "step": 41163 + }, + { + "epoch": 3.3347375243033053, + "grad_norm": 0.08281079679727554, + "learning_rate": 3.6959359107070526e-05, + "loss": 0.2368, + "step": 41164 + }, + { + "epoch": 3.3348185353208035, + "grad_norm": 0.07828114181756973, + "learning_rate": 3.69548584544759e-05, + "loss": 0.2452, + "step": 41165 + }, + { + "epoch": 3.3348995463383018, + "grad_norm": 0.06749124079942703, + "learning_rate": 3.6950357801881273e-05, + "loss": 0.1937, + "step": 41166 + }, + { + "epoch": 3.3349805573558005, + "grad_norm": 0.0752006322145462, + "learning_rate": 3.694585714928665e-05, + "loss": 0.2321, + "step": 41167 + }, + { + "epoch": 3.3350615683732987, + "grad_norm": 0.08764395862817764, + "learning_rate": 3.694135649669202e-05, + "loss": 0.2135, + "step": 41168 + }, + { + "epoch": 3.335142579390797, + "grad_norm": 0.07645867764949799, + "learning_rate": 3.6936855844097394e-05, + "loss": 0.2145, + "step": 41169 + }, + { + "epoch": 3.3352235904082956, + "grad_norm": 0.06811700761318207, + "learning_rate": 3.693235519150277e-05, + "loss": 0.2022, + "step": 41170 + }, + { + "epoch": 3.335304601425794, + "grad_norm": 0.06362304091453552, + "learning_rate": 3.692785453890814e-05, + "loss": 0.2218, + "step": 41171 + }, + { + "epoch": 3.335385612443292, + "grad_norm": 0.05618695542216301, + "learning_rate": 3.6923353886313515e-05, + "loss": 0.1979, + "step": 41172 + }, + { + "epoch": 3.335466623460791, + "grad_norm": 0.06092815101146698, + "learning_rate": 3.691885323371889e-05, + "loss": 0.2119, + "step": 41173 + }, + { + "epoch": 3.335547634478289, + "grad_norm": 0.06255250424146652, + "learning_rate": 3.691435258112427e-05, + "loss": 0.2096, + "step": 41174 + }, + { + "epoch": 3.3356286454957873, + "grad_norm": 0.06754370778799057, + "learning_rate": 3.6909851928529636e-05, + "loss": 0.2277, + "step": 41175 + }, + { + "epoch": 3.335709656513286, + "grad_norm": 0.0938117578625679, + "learning_rate": 3.690535127593501e-05, + "loss": 0.272, + "step": 41176 + }, + { + "epoch": 3.3357906675307842, + "grad_norm": 0.08052606135606766, + "learning_rate": 3.690085062334039e-05, + "loss": 0.1952, + "step": 41177 + }, + { + "epoch": 3.3358716785482825, + "grad_norm": 0.06511618942022324, + "learning_rate": 3.689634997074576e-05, + "loss": 0.231, + "step": 41178 + }, + { + "epoch": 3.335952689565781, + "grad_norm": 0.05862768739461899, + "learning_rate": 3.689184931815114e-05, + "loss": 0.1946, + "step": 41179 + }, + { + "epoch": 3.3360337005832794, + "grad_norm": 0.06720541417598724, + "learning_rate": 3.688734866555651e-05, + "loss": 0.2034, + "step": 41180 + }, + { + "epoch": 3.3361147116007777, + "grad_norm": 0.0854513868689537, + "learning_rate": 3.688284801296188e-05, + "loss": 0.2326, + "step": 41181 + }, + { + "epoch": 3.336195722618276, 
+ "grad_norm": 0.08422550559043884, + "learning_rate": 3.687834736036726e-05, + "loss": 0.236, + "step": 41182 + }, + { + "epoch": 3.3362767336357746, + "grad_norm": 0.06728208065032959, + "learning_rate": 3.687384670777263e-05, + "loss": 0.2049, + "step": 41183 + }, + { + "epoch": 3.336357744653273, + "grad_norm": 0.06178736686706543, + "learning_rate": 3.6869346055178e-05, + "loss": 0.2077, + "step": 41184 + }, + { + "epoch": 3.336438755670771, + "grad_norm": 0.06015453487634659, + "learning_rate": 3.686484540258338e-05, + "loss": 0.2575, + "step": 41185 + }, + { + "epoch": 3.3365197666882698, + "grad_norm": 0.07950470596551895, + "learning_rate": 3.686034474998875e-05, + "loss": 0.2076, + "step": 41186 + }, + { + "epoch": 3.336600777705768, + "grad_norm": 0.08321350067853928, + "learning_rate": 3.6855844097394126e-05, + "loss": 0.2115, + "step": 41187 + }, + { + "epoch": 3.3366817887232663, + "grad_norm": 0.07731655240058899, + "learning_rate": 3.68513434447995e-05, + "loss": 0.212, + "step": 41188 + }, + { + "epoch": 3.3367627997407645, + "grad_norm": 0.08558639883995056, + "learning_rate": 3.684684279220487e-05, + "loss": 0.254, + "step": 41189 + }, + { + "epoch": 3.336843810758263, + "grad_norm": 0.07424626499414444, + "learning_rate": 3.684234213961025e-05, + "loss": 0.2648, + "step": 41190 + }, + { + "epoch": 3.3369248217757614, + "grad_norm": 0.06944981217384338, + "learning_rate": 3.683784148701562e-05, + "loss": 0.2381, + "step": 41191 + }, + { + "epoch": 3.3370058327932597, + "grad_norm": 0.07112687081098557, + "learning_rate": 3.6833340834420994e-05, + "loss": 0.2338, + "step": 41192 + }, + { + "epoch": 3.3370868438107584, + "grad_norm": 0.07397808879613876, + "learning_rate": 3.682884018182637e-05, + "loss": 0.2292, + "step": 41193 + }, + { + "epoch": 3.3371678548282566, + "grad_norm": 0.07461774349212646, + "learning_rate": 3.682433952923174e-05, + "loss": 0.2209, + "step": 41194 + }, + { + "epoch": 3.337248865845755, + "grad_norm": 0.06668192148208618, + "learning_rate": 3.6819838876637115e-05, + "loss": 0.2313, + "step": 41195 + }, + { + "epoch": 3.3373298768632536, + "grad_norm": 0.08236929029226303, + "learning_rate": 3.681533822404249e-05, + "loss": 0.211, + "step": 41196 + }, + { + "epoch": 3.337410887880752, + "grad_norm": 0.08950665593147278, + "learning_rate": 3.681083757144786e-05, + "loss": 0.2696, + "step": 41197 + }, + { + "epoch": 3.33749189889825, + "grad_norm": 0.06646319478750229, + "learning_rate": 3.6806336918853236e-05, + "loss": 0.2307, + "step": 41198 + }, + { + "epoch": 3.3375729099157487, + "grad_norm": 0.0690191239118576, + "learning_rate": 3.680183626625861e-05, + "loss": 0.2302, + "step": 41199 + }, + { + "epoch": 3.337653920933247, + "grad_norm": 0.06904525309801102, + "learning_rate": 3.679733561366398e-05, + "loss": 0.1905, + "step": 41200 + }, + { + "epoch": 3.337734931950745, + "grad_norm": 0.06719229370355606, + "learning_rate": 3.6792834961069356e-05, + "loss": 0.2102, + "step": 41201 + }, + { + "epoch": 3.337815942968244, + "grad_norm": 0.07356458902359009, + "learning_rate": 3.678833430847473e-05, + "loss": 0.1997, + "step": 41202 + }, + { + "epoch": 3.337896953985742, + "grad_norm": 0.06217246130108833, + "learning_rate": 3.6783833655880104e-05, + "loss": 0.2282, + "step": 41203 + }, + { + "epoch": 3.3379779650032404, + "grad_norm": 0.06710822880268097, + "learning_rate": 3.677933300328548e-05, + "loss": 0.2238, + "step": 41204 + }, + { + "epoch": 3.3380589760207386, + "grad_norm": 0.07744985073804855, + "learning_rate": 
3.677483235069085e-05, + "loss": 0.2082, + "step": 41205 + }, + { + "epoch": 3.3381399870382373, + "grad_norm": 0.07299521565437317, + "learning_rate": 3.6770331698096224e-05, + "loss": 0.2265, + "step": 41206 + }, + { + "epoch": 3.3382209980557356, + "grad_norm": 0.06934568285942078, + "learning_rate": 3.67658310455016e-05, + "loss": 0.2083, + "step": 41207 + }, + { + "epoch": 3.338302009073234, + "grad_norm": 0.06951799243688583, + "learning_rate": 3.676133039290697e-05, + "loss": 0.2486, + "step": 41208 + }, + { + "epoch": 3.3383830200907325, + "grad_norm": 0.08006034046411514, + "learning_rate": 3.6756829740312345e-05, + "loss": 0.2156, + "step": 41209 + }, + { + "epoch": 3.3384640311082308, + "grad_norm": 0.07656034082174301, + "learning_rate": 3.675232908771772e-05, + "loss": 0.2395, + "step": 41210 + }, + { + "epoch": 3.338545042125729, + "grad_norm": 0.08563342690467834, + "learning_rate": 3.674782843512309e-05, + "loss": 0.2559, + "step": 41211 + }, + { + "epoch": 3.3386260531432272, + "grad_norm": 0.09620586782693863, + "learning_rate": 3.674332778252847e-05, + "loss": 0.245, + "step": 41212 + }, + { + "epoch": 3.338707064160726, + "grad_norm": 0.05890323594212532, + "learning_rate": 3.6738827129933846e-05, + "loss": 0.2105, + "step": 41213 + }, + { + "epoch": 3.338788075178224, + "grad_norm": 0.07511017471551895, + "learning_rate": 3.673432647733921e-05, + "loss": 0.2239, + "step": 41214 + }, + { + "epoch": 3.3388690861957224, + "grad_norm": 0.05438505485653877, + "learning_rate": 3.6729825824744594e-05, + "loss": 0.2074, + "step": 41215 + }, + { + "epoch": 3.338950097213221, + "grad_norm": 0.05938766524195671, + "learning_rate": 3.672532517214997e-05, + "loss": 0.2266, + "step": 41216 + }, + { + "epoch": 3.3390311082307194, + "grad_norm": 0.06936460733413696, + "learning_rate": 3.6720824519555334e-05, + "loss": 0.2513, + "step": 41217 + }, + { + "epoch": 3.3391121192482176, + "grad_norm": 0.06176231801509857, + "learning_rate": 3.6716323866960714e-05, + "loss": 0.2257, + "step": 41218 + }, + { + "epoch": 3.3391931302657163, + "grad_norm": 0.0678444430232048, + "learning_rate": 3.671182321436609e-05, + "loss": 0.2048, + "step": 41219 + }, + { + "epoch": 3.3392741412832145, + "grad_norm": 0.07848968356847763, + "learning_rate": 3.6707322561771455e-05, + "loss": 0.2374, + "step": 41220 + }, + { + "epoch": 3.339355152300713, + "grad_norm": 0.08253312110900879, + "learning_rate": 3.6702821909176835e-05, + "loss": 0.2313, + "step": 41221 + }, + { + "epoch": 3.3394361633182115, + "grad_norm": 0.09484583139419556, + "learning_rate": 3.669832125658221e-05, + "loss": 0.274, + "step": 41222 + }, + { + "epoch": 3.3395171743357097, + "grad_norm": 0.06523586064577103, + "learning_rate": 3.6693820603987576e-05, + "loss": 0.2287, + "step": 41223 + }, + { + "epoch": 3.339598185353208, + "grad_norm": 0.05800995975732803, + "learning_rate": 3.6689319951392956e-05, + "loss": 0.2129, + "step": 41224 + }, + { + "epoch": 3.3396791963707066, + "grad_norm": 0.07219946384429932, + "learning_rate": 3.668481929879833e-05, + "loss": 0.2564, + "step": 41225 + }, + { + "epoch": 3.339760207388205, + "grad_norm": 0.06518752127885818, + "learning_rate": 3.66803186462037e-05, + "loss": 0.2316, + "step": 41226 + }, + { + "epoch": 3.339841218405703, + "grad_norm": 0.06252483278512955, + "learning_rate": 3.667581799360908e-05, + "loss": 0.2457, + "step": 41227 + }, + { + "epoch": 3.3399222294232014, + "grad_norm": 0.07003217935562134, + "learning_rate": 3.667131734101445e-05, + "loss": 0.2106, + "step": 41228 + }, 
+ { + "epoch": 3.3400032404407, + "grad_norm": 0.07570035755634308, + "learning_rate": 3.6666816688419824e-05, + "loss": 0.2339, + "step": 41229 + }, + { + "epoch": 3.3400842514581983, + "grad_norm": 0.08694077283143997, + "learning_rate": 3.66623160358252e-05, + "loss": 0.206, + "step": 41230 + }, + { + "epoch": 3.3401652624756966, + "grad_norm": 0.059254713356494904, + "learning_rate": 3.665781538323057e-05, + "loss": 0.2416, + "step": 41231 + }, + { + "epoch": 3.3402462734931953, + "grad_norm": 0.06904411315917969, + "learning_rate": 3.6653314730635945e-05, + "loss": 0.2027, + "step": 41232 + }, + { + "epoch": 3.3403272845106935, + "grad_norm": 0.05732307955622673, + "learning_rate": 3.664881407804132e-05, + "loss": 0.2343, + "step": 41233 + }, + { + "epoch": 3.3404082955281917, + "grad_norm": 0.07104742527008057, + "learning_rate": 3.664431342544669e-05, + "loss": 0.247, + "step": 41234 + }, + { + "epoch": 3.34048930654569, + "grad_norm": 0.0745600238442421, + "learning_rate": 3.6639812772852066e-05, + "loss": 0.2089, + "step": 41235 + }, + { + "epoch": 3.3405703175631887, + "grad_norm": 0.07792531698942184, + "learning_rate": 3.663531212025744e-05, + "loss": 0.2823, + "step": 41236 + }, + { + "epoch": 3.340651328580687, + "grad_norm": 0.06312679499387741, + "learning_rate": 3.663081146766281e-05, + "loss": 0.2054, + "step": 41237 + }, + { + "epoch": 3.340732339598185, + "grad_norm": 0.08045652508735657, + "learning_rate": 3.6626310815068186e-05, + "loss": 0.2163, + "step": 41238 + }, + { + "epoch": 3.340813350615684, + "grad_norm": 0.08712105453014374, + "learning_rate": 3.662181016247356e-05, + "loss": 0.28, + "step": 41239 + }, + { + "epoch": 3.340894361633182, + "grad_norm": 0.06204592064023018, + "learning_rate": 3.6617309509878934e-05, + "loss": 0.2088, + "step": 41240 + }, + { + "epoch": 3.3409753726506803, + "grad_norm": 0.06538501381874084, + "learning_rate": 3.661280885728431e-05, + "loss": 0.2466, + "step": 41241 + }, + { + "epoch": 3.341056383668179, + "grad_norm": 0.07625439763069153, + "learning_rate": 3.660830820468968e-05, + "loss": 0.248, + "step": 41242 + }, + { + "epoch": 3.3411373946856773, + "grad_norm": 0.06579206883907318, + "learning_rate": 3.6603807552095054e-05, + "loss": 0.1921, + "step": 41243 + }, + { + "epoch": 3.3412184057031755, + "grad_norm": 0.07133506238460541, + "learning_rate": 3.659930689950043e-05, + "loss": 0.2243, + "step": 41244 + }, + { + "epoch": 3.341299416720674, + "grad_norm": 0.07682822644710541, + "learning_rate": 3.659480624690581e-05, + "loss": 0.2282, + "step": 41245 + }, + { + "epoch": 3.3413804277381725, + "grad_norm": 0.07200987637042999, + "learning_rate": 3.6590305594311175e-05, + "loss": 0.2188, + "step": 41246 + }, + { + "epoch": 3.3414614387556707, + "grad_norm": 0.06509634107351303, + "learning_rate": 3.658580494171655e-05, + "loss": 0.2635, + "step": 41247 + }, + { + "epoch": 3.3415424497731694, + "grad_norm": 0.09091950207948685, + "learning_rate": 3.658130428912193e-05, + "loss": 0.2183, + "step": 41248 + }, + { + "epoch": 3.3416234607906676, + "grad_norm": 0.06340814381837845, + "learning_rate": 3.6576803636527296e-05, + "loss": 0.2341, + "step": 41249 + }, + { + "epoch": 3.341704471808166, + "grad_norm": 0.0767722800374031, + "learning_rate": 3.657230298393267e-05, + "loss": 0.2223, + "step": 41250 + }, + { + "epoch": 3.341785482825664, + "grad_norm": 0.07288037985563278, + "learning_rate": 3.656780233133805e-05, + "loss": 0.2096, + "step": 41251 + }, + { + "epoch": 3.341866493843163, + "grad_norm": 0.0707254484295845, + 
"learning_rate": 3.656330167874342e-05, + "loss": 0.213, + "step": 41252 + }, + { + "epoch": 3.341947504860661, + "grad_norm": 0.07349371910095215, + "learning_rate": 3.655880102614879e-05, + "loss": 0.21, + "step": 41253 + }, + { + "epoch": 3.3420285158781593, + "grad_norm": 0.07877951860427856, + "learning_rate": 3.655430037355417e-05, + "loss": 0.256, + "step": 41254 + }, + { + "epoch": 3.342109526895658, + "grad_norm": 0.07902669906616211, + "learning_rate": 3.6549799720959544e-05, + "loss": 0.2357, + "step": 41255 + }, + { + "epoch": 3.3421905379131562, + "grad_norm": 0.06486064195632935, + "learning_rate": 3.654529906836491e-05, + "loss": 0.2164, + "step": 41256 + }, + { + "epoch": 3.3422715489306545, + "grad_norm": 0.07269563525915146, + "learning_rate": 3.654079841577029e-05, + "loss": 0.2057, + "step": 41257 + }, + { + "epoch": 3.3423525599481527, + "grad_norm": 0.06843418627977371, + "learning_rate": 3.6536297763175665e-05, + "loss": 0.2091, + "step": 41258 + }, + { + "epoch": 3.3424335709656514, + "grad_norm": 0.06407822668552399, + "learning_rate": 3.653179711058103e-05, + "loss": 0.1805, + "step": 41259 + }, + { + "epoch": 3.3425145819831497, + "grad_norm": 0.06240003928542137, + "learning_rate": 3.652729645798641e-05, + "loss": 0.2194, + "step": 41260 + }, + { + "epoch": 3.342595593000648, + "grad_norm": 0.07912351191043854, + "learning_rate": 3.6522795805391786e-05, + "loss": 0.2346, + "step": 41261 + }, + { + "epoch": 3.3426766040181466, + "grad_norm": 0.08149971812963486, + "learning_rate": 3.651829515279715e-05, + "loss": 0.2143, + "step": 41262 + }, + { + "epoch": 3.342757615035645, + "grad_norm": 0.0701875165104866, + "learning_rate": 3.651379450020253e-05, + "loss": 0.2038, + "step": 41263 + }, + { + "epoch": 3.342838626053143, + "grad_norm": 0.07117495685815811, + "learning_rate": 3.650929384760791e-05, + "loss": 0.2272, + "step": 41264 + }, + { + "epoch": 3.3429196370706418, + "grad_norm": 0.09499608725309372, + "learning_rate": 3.6504793195013274e-05, + "loss": 0.2179, + "step": 41265 + }, + { + "epoch": 3.34300064808814, + "grad_norm": 0.08293692022562027, + "learning_rate": 3.6500292542418654e-05, + "loss": 0.2042, + "step": 41266 + }, + { + "epoch": 3.3430816591056383, + "grad_norm": 0.08190654963254929, + "learning_rate": 3.649579188982403e-05, + "loss": 0.1921, + "step": 41267 + }, + { + "epoch": 3.343162670123137, + "grad_norm": 0.07378261536359787, + "learning_rate": 3.64912912372294e-05, + "loss": 0.2202, + "step": 41268 + }, + { + "epoch": 3.343243681140635, + "grad_norm": 0.0729857087135315, + "learning_rate": 3.6486790584634775e-05, + "loss": 0.2402, + "step": 41269 + }, + { + "epoch": 3.3433246921581334, + "grad_norm": 0.07511337101459503, + "learning_rate": 3.648228993204015e-05, + "loss": 0.2423, + "step": 41270 + }, + { + "epoch": 3.343405703175632, + "grad_norm": 0.058995719999074936, + "learning_rate": 3.647778927944552e-05, + "loss": 0.2525, + "step": 41271 + }, + { + "epoch": 3.3434867141931304, + "grad_norm": 0.06570830196142197, + "learning_rate": 3.6473288626850896e-05, + "loss": 0.2253, + "step": 41272 + }, + { + "epoch": 3.3435677252106286, + "grad_norm": 0.07327885925769806, + "learning_rate": 3.646878797425627e-05, + "loss": 0.245, + "step": 41273 + }, + { + "epoch": 3.343648736228127, + "grad_norm": 0.08053727447986603, + "learning_rate": 3.646428732166164e-05, + "loss": 0.2137, + "step": 41274 + }, + { + "epoch": 3.3437297472456255, + "grad_norm": 0.07759438455104828, + "learning_rate": 3.6459786669067017e-05, + "loss": 0.2264, + "step": 
41275 + }, + { + "epoch": 3.343810758263124, + "grad_norm": 0.05452917516231537, + "learning_rate": 3.645528601647239e-05, + "loss": 0.2342, + "step": 41276 + }, + { + "epoch": 3.343891769280622, + "grad_norm": 0.0786796435713768, + "learning_rate": 3.6450785363877764e-05, + "loss": 0.2138, + "step": 41277 + }, + { + "epoch": 3.3439727802981207, + "grad_norm": 0.08116442710161209, + "learning_rate": 3.644628471128314e-05, + "loss": 0.2576, + "step": 41278 + }, + { + "epoch": 3.344053791315619, + "grad_norm": 0.07618381828069687, + "learning_rate": 3.644178405868851e-05, + "loss": 0.2346, + "step": 41279 + }, + { + "epoch": 3.344134802333117, + "grad_norm": 0.08092188835144043, + "learning_rate": 3.6437283406093885e-05, + "loss": 0.2129, + "step": 41280 + }, + { + "epoch": 3.3442158133506155, + "grad_norm": 0.07491854578256607, + "learning_rate": 3.6432782753499265e-05, + "loss": 0.1987, + "step": 41281 + }, + { + "epoch": 3.344296824368114, + "grad_norm": 0.08945416659116745, + "learning_rate": 3.642828210090463e-05, + "loss": 0.2392, + "step": 41282 + }, + { + "epoch": 3.3443778353856124, + "grad_norm": 0.07511164247989655, + "learning_rate": 3.6423781448310005e-05, + "loss": 0.2361, + "step": 41283 + }, + { + "epoch": 3.3444588464031106, + "grad_norm": 0.07991006970405579, + "learning_rate": 3.6419280795715386e-05, + "loss": 0.2121, + "step": 41284 + }, + { + "epoch": 3.3445398574206093, + "grad_norm": 0.06376335024833679, + "learning_rate": 3.641478014312075e-05, + "loss": 0.2125, + "step": 41285 + }, + { + "epoch": 3.3446208684381076, + "grad_norm": 0.08875862509012222, + "learning_rate": 3.6410279490526126e-05, + "loss": 0.2539, + "step": 41286 + }, + { + "epoch": 3.344701879455606, + "grad_norm": 0.0659111961722374, + "learning_rate": 3.6405778837931507e-05, + "loss": 0.2217, + "step": 41287 + }, + { + "epoch": 3.3447828904731045, + "grad_norm": 0.0760120376944542, + "learning_rate": 3.640127818533687e-05, + "loss": 0.2192, + "step": 41288 + }, + { + "epoch": 3.3448639014906028, + "grad_norm": 0.08299257606267929, + "learning_rate": 3.639677753274225e-05, + "loss": 0.2269, + "step": 41289 + }, + { + "epoch": 3.344944912508101, + "grad_norm": 0.07124077528715134, + "learning_rate": 3.639227688014763e-05, + "loss": 0.2344, + "step": 41290 + }, + { + "epoch": 3.3450259235255997, + "grad_norm": 0.08746058493852615, + "learning_rate": 3.6387776227552994e-05, + "loss": 0.222, + "step": 41291 + }, + { + "epoch": 3.345106934543098, + "grad_norm": 0.07460647821426392, + "learning_rate": 3.638327557495837e-05, + "loss": 0.2359, + "step": 41292 + }, + { + "epoch": 3.345187945560596, + "grad_norm": 0.08213378489017487, + "learning_rate": 3.637877492236375e-05, + "loss": 0.206, + "step": 41293 + }, + { + "epoch": 3.345268956578095, + "grad_norm": 0.06644278764724731, + "learning_rate": 3.637427426976912e-05, + "loss": 0.2118, + "step": 41294 + }, + { + "epoch": 3.345349967595593, + "grad_norm": 0.08180107176303864, + "learning_rate": 3.636977361717449e-05, + "loss": 0.2258, + "step": 41295 + }, + { + "epoch": 3.3454309786130914, + "grad_norm": 0.06995628029108047, + "learning_rate": 3.636527296457987e-05, + "loss": 0.2511, + "step": 41296 + }, + { + "epoch": 3.3455119896305896, + "grad_norm": 0.08584023267030716, + "learning_rate": 3.636077231198524e-05, + "loss": 0.2358, + "step": 41297 + }, + { + "epoch": 3.3455930006480883, + "grad_norm": 0.07145940512418747, + "learning_rate": 3.635627165939061e-05, + "loss": 0.2136, + "step": 41298 + }, + { + "epoch": 3.3456740116655865, + "grad_norm": 
0.07166958600282669, + "learning_rate": 3.635177100679599e-05, + "loss": 0.226, + "step": 41299 + }, + { + "epoch": 3.345755022683085, + "grad_norm": 0.07758349180221558, + "learning_rate": 3.634727035420136e-05, + "loss": 0.2007, + "step": 41300 + }, + { + "epoch": 3.3458360337005835, + "grad_norm": 0.0829416960477829, + "learning_rate": 3.634276970160673e-05, + "loss": 0.2428, + "step": 41301 + }, + { + "epoch": 3.3459170447180817, + "grad_norm": 0.07670668512582779, + "learning_rate": 3.633826904901211e-05, + "loss": 0.208, + "step": 41302 + }, + { + "epoch": 3.34599805573558, + "grad_norm": 0.06386036425828934, + "learning_rate": 3.6333768396417484e-05, + "loss": 0.2278, + "step": 41303 + }, + { + "epoch": 3.346079066753078, + "grad_norm": 0.05563236027956009, + "learning_rate": 3.632926774382285e-05, + "loss": 0.2044, + "step": 41304 + }, + { + "epoch": 3.346160077770577, + "grad_norm": 0.07629813998937607, + "learning_rate": 3.632476709122823e-05, + "loss": 0.2544, + "step": 41305 + }, + { + "epoch": 3.346241088788075, + "grad_norm": 0.08975273370742798, + "learning_rate": 3.6320266438633605e-05, + "loss": 0.1986, + "step": 41306 + }, + { + "epoch": 3.3463220998055734, + "grad_norm": 0.05901889130473137, + "learning_rate": 3.631576578603898e-05, + "loss": 0.2419, + "step": 41307 + }, + { + "epoch": 3.346403110823072, + "grad_norm": 0.06731534749269485, + "learning_rate": 3.631126513344435e-05, + "loss": 0.2171, + "step": 41308 + }, + { + "epoch": 3.3464841218405703, + "grad_norm": 0.0790141150355339, + "learning_rate": 3.6306764480849726e-05, + "loss": 0.1912, + "step": 41309 + }, + { + "epoch": 3.3465651328580686, + "grad_norm": 0.06635841727256775, + "learning_rate": 3.63022638282551e-05, + "loss": 0.1908, + "step": 41310 + }, + { + "epoch": 3.3466461438755672, + "grad_norm": 0.09180153161287308, + "learning_rate": 3.629776317566047e-05, + "loss": 0.2472, + "step": 41311 + }, + { + "epoch": 3.3467271548930655, + "grad_norm": 0.07255294919013977, + "learning_rate": 3.629326252306585e-05, + "loss": 0.1972, + "step": 41312 + }, + { + "epoch": 3.3468081659105637, + "grad_norm": 0.06764529645442963, + "learning_rate": 3.628876187047122e-05, + "loss": 0.2226, + "step": 41313 + }, + { + "epoch": 3.3468891769280624, + "grad_norm": 0.07115866243839264, + "learning_rate": 3.6284261217876594e-05, + "loss": 0.2627, + "step": 41314 + }, + { + "epoch": 3.3469701879455607, + "grad_norm": 0.07019352167844772, + "learning_rate": 3.627976056528197e-05, + "loss": 0.1983, + "step": 41315 + }, + { + "epoch": 3.347051198963059, + "grad_norm": 0.06258615106344223, + "learning_rate": 3.627525991268734e-05, + "loss": 0.2197, + "step": 41316 + }, + { + "epoch": 3.3471322099805576, + "grad_norm": 0.05955258011817932, + "learning_rate": 3.6270759260092715e-05, + "loss": 0.2273, + "step": 41317 + }, + { + "epoch": 3.347213220998056, + "grad_norm": 0.08046617358922958, + "learning_rate": 3.626625860749809e-05, + "loss": 0.2609, + "step": 41318 + }, + { + "epoch": 3.347294232015554, + "grad_norm": 0.07196833193302155, + "learning_rate": 3.626175795490346e-05, + "loss": 0.2844, + "step": 41319 + }, + { + "epoch": 3.3473752430330523, + "grad_norm": 0.0785125195980072, + "learning_rate": 3.625725730230884e-05, + "loss": 0.1894, + "step": 41320 + }, + { + "epoch": 3.347456254050551, + "grad_norm": 0.06598301231861115, + "learning_rate": 3.625275664971421e-05, + "loss": 0.2484, + "step": 41321 + }, + { + "epoch": 3.3475372650680493, + "grad_norm": 0.0736595019698143, + "learning_rate": 3.624825599711958e-05, + 
"loss": 0.2623, + "step": 41322 + }, + { + "epoch": 3.3476182760855475, + "grad_norm": 0.0954742580652237, + "learning_rate": 3.624375534452496e-05, + "loss": 0.2415, + "step": 41323 + }, + { + "epoch": 3.347699287103046, + "grad_norm": 0.07454533874988556, + "learning_rate": 3.623925469193033e-05, + "loss": 0.2065, + "step": 41324 + }, + { + "epoch": 3.3477802981205445, + "grad_norm": 0.06970658898353577, + "learning_rate": 3.6234754039335703e-05, + "loss": 0.205, + "step": 41325 + }, + { + "epoch": 3.3478613091380427, + "grad_norm": 0.07173918187618256, + "learning_rate": 3.6230253386741084e-05, + "loss": 0.2361, + "step": 41326 + }, + { + "epoch": 3.347942320155541, + "grad_norm": 0.08432280272245407, + "learning_rate": 3.622575273414645e-05, + "loss": 0.2616, + "step": 41327 + }, + { + "epoch": 3.3480233311730396, + "grad_norm": 0.06053037941455841, + "learning_rate": 3.6221252081551824e-05, + "loss": 0.2301, + "step": 41328 + }, + { + "epoch": 3.348104342190538, + "grad_norm": 0.05914249271154404, + "learning_rate": 3.6216751428957205e-05, + "loss": 0.2211, + "step": 41329 + }, + { + "epoch": 3.348185353208036, + "grad_norm": 0.08348757028579712, + "learning_rate": 3.621225077636257e-05, + "loss": 0.2667, + "step": 41330 + }, + { + "epoch": 3.348266364225535, + "grad_norm": 0.07757681608200073, + "learning_rate": 3.6207750123767945e-05, + "loss": 0.2267, + "step": 41331 + }, + { + "epoch": 3.348347375243033, + "grad_norm": 0.08246699720621109, + "learning_rate": 3.6203249471173325e-05, + "loss": 0.2224, + "step": 41332 + }, + { + "epoch": 3.3484283862605313, + "grad_norm": 0.06282779574394226, + "learning_rate": 3.61987488185787e-05, + "loss": 0.2222, + "step": 41333 + }, + { + "epoch": 3.34850939727803, + "grad_norm": 0.06982121616601944, + "learning_rate": 3.6194248165984066e-05, + "loss": 0.2306, + "step": 41334 + }, + { + "epoch": 3.3485904082955282, + "grad_norm": 0.06591107696294785, + "learning_rate": 3.6189747513389446e-05, + "loss": 0.2275, + "step": 41335 + }, + { + "epoch": 3.3486714193130265, + "grad_norm": 0.06394707411527634, + "learning_rate": 3.618524686079482e-05, + "loss": 0.2253, + "step": 41336 + }, + { + "epoch": 3.348752430330525, + "grad_norm": 0.08156972378492355, + "learning_rate": 3.618074620820019e-05, + "loss": 0.2386, + "step": 41337 + }, + { + "epoch": 3.3488334413480234, + "grad_norm": 0.08084411174058914, + "learning_rate": 3.617624555560557e-05, + "loss": 0.2309, + "step": 41338 + }, + { + "epoch": 3.3489144523655217, + "grad_norm": 0.06958631426095963, + "learning_rate": 3.617174490301094e-05, + "loss": 0.2317, + "step": 41339 + }, + { + "epoch": 3.34899546338302, + "grad_norm": 0.06364104896783829, + "learning_rate": 3.616724425041631e-05, + "loss": 0.222, + "step": 41340 + }, + { + "epoch": 3.3490764744005186, + "grad_norm": 0.06435956805944443, + "learning_rate": 3.616274359782169e-05, + "loss": 0.2074, + "step": 41341 + }, + { + "epoch": 3.349157485418017, + "grad_norm": 0.07496075332164764, + "learning_rate": 3.615824294522706e-05, + "loss": 0.2481, + "step": 41342 + }, + { + "epoch": 3.349238496435515, + "grad_norm": 0.08156944811344147, + "learning_rate": 3.615374229263243e-05, + "loss": 0.2184, + "step": 41343 + }, + { + "epoch": 3.3493195074530138, + "grad_norm": 0.08139017224311829, + "learning_rate": 3.614924164003781e-05, + "loss": 0.2405, + "step": 41344 + }, + { + "epoch": 3.349400518470512, + "grad_norm": 0.09298984706401825, + "learning_rate": 3.614474098744318e-05, + "loss": 0.2581, + "step": 41345 + }, + { + "epoch": 
3.3494815294880103, + "grad_norm": 0.08675330132246017, + "learning_rate": 3.6140240334848556e-05, + "loss": 0.2504, + "step": 41346 + }, + { + "epoch": 3.3495625405055085, + "grad_norm": 0.07219713181257248, + "learning_rate": 3.613573968225393e-05, + "loss": 0.2393, + "step": 41347 + }, + { + "epoch": 3.349643551523007, + "grad_norm": 0.06823533028364182, + "learning_rate": 3.61312390296593e-05, + "loss": 0.2374, + "step": 41348 + }, + { + "epoch": 3.3497245625405054, + "grad_norm": 0.07592404633760452, + "learning_rate": 3.612673837706468e-05, + "loss": 0.2181, + "step": 41349 + }, + { + "epoch": 3.3498055735580037, + "grad_norm": 0.07518725842237473, + "learning_rate": 3.612223772447005e-05, + "loss": 0.2485, + "step": 41350 + }, + { + "epoch": 3.3498865845755024, + "grad_norm": 0.07888790220022202, + "learning_rate": 3.6117737071875424e-05, + "loss": 0.2276, + "step": 41351 + }, + { + "epoch": 3.3499675955930006, + "grad_norm": 0.07398751378059387, + "learning_rate": 3.61132364192808e-05, + "loss": 0.2346, + "step": 41352 + }, + { + "epoch": 3.350048606610499, + "grad_norm": 0.07038160413503647, + "learning_rate": 3.610873576668617e-05, + "loss": 0.2045, + "step": 41353 + }, + { + "epoch": 3.3501296176279975, + "grad_norm": 0.06793700158596039, + "learning_rate": 3.6104235114091545e-05, + "loss": 0.2057, + "step": 41354 + }, + { + "epoch": 3.350210628645496, + "grad_norm": 0.0781884640455246, + "learning_rate": 3.609973446149692e-05, + "loss": 0.21, + "step": 41355 + }, + { + "epoch": 3.350291639662994, + "grad_norm": 0.06680549681186676, + "learning_rate": 3.609523380890229e-05, + "loss": 0.2291, + "step": 41356 + }, + { + "epoch": 3.3503726506804927, + "grad_norm": 0.05507110431790352, + "learning_rate": 3.6090733156307666e-05, + "loss": 0.2048, + "step": 41357 + }, + { + "epoch": 3.350453661697991, + "grad_norm": 0.05875886231660843, + "learning_rate": 3.608623250371304e-05, + "loss": 0.225, + "step": 41358 + }, + { + "epoch": 3.350534672715489, + "grad_norm": 0.0633646696805954, + "learning_rate": 3.608173185111842e-05, + "loss": 0.197, + "step": 41359 + }, + { + "epoch": 3.350615683732988, + "grad_norm": 0.07051599025726318, + "learning_rate": 3.6077231198523786e-05, + "loss": 0.2266, + "step": 41360 + }, + { + "epoch": 3.350696694750486, + "grad_norm": 0.07944469153881073, + "learning_rate": 3.607273054592916e-05, + "loss": 0.2241, + "step": 41361 + }, + { + "epoch": 3.3507777057679844, + "grad_norm": 0.06918632984161377, + "learning_rate": 3.606822989333454e-05, + "loss": 0.2248, + "step": 41362 + }, + { + "epoch": 3.3508587167854826, + "grad_norm": 0.07134364545345306, + "learning_rate": 3.606372924073991e-05, + "loss": 0.2256, + "step": 41363 + }, + { + "epoch": 3.3509397278029813, + "grad_norm": 0.08421452343463898, + "learning_rate": 3.605922858814528e-05, + "loss": 0.214, + "step": 41364 + }, + { + "epoch": 3.3510207388204796, + "grad_norm": 0.08210983872413635, + "learning_rate": 3.605472793555066e-05, + "loss": 0.2212, + "step": 41365 + }, + { + "epoch": 3.351101749837978, + "grad_norm": 0.07449344545602798, + "learning_rate": 3.605022728295603e-05, + "loss": 0.2424, + "step": 41366 + }, + { + "epoch": 3.3511827608554765, + "grad_norm": 0.08094337582588196, + "learning_rate": 3.60457266303614e-05, + "loss": 0.2194, + "step": 41367 + }, + { + "epoch": 3.3512637718729748, + "grad_norm": 0.07408936321735382, + "learning_rate": 3.604122597776678e-05, + "loss": 0.2512, + "step": 41368 + }, + { + "epoch": 3.351344782890473, + "grad_norm": 0.0715651735663414, + "learning_rate": 
3.603672532517215e-05, + "loss": 0.1979, + "step": 41369 + }, + { + "epoch": 3.3514257939079712, + "grad_norm": 0.07386700809001923, + "learning_rate": 3.603222467257752e-05, + "loss": 0.2566, + "step": 41370 + }, + { + "epoch": 3.35150680492547, + "grad_norm": 0.06657444685697556, + "learning_rate": 3.60277240199829e-05, + "loss": 0.2044, + "step": 41371 + }, + { + "epoch": 3.351587815942968, + "grad_norm": 0.0708659291267395, + "learning_rate": 3.6023223367388276e-05, + "loss": 0.2211, + "step": 41372 + }, + { + "epoch": 3.3516688269604664, + "grad_norm": 0.0607280470430851, + "learning_rate": 3.601872271479364e-05, + "loss": 0.2116, + "step": 41373 + }, + { + "epoch": 3.351749837977965, + "grad_norm": 0.08024850487709045, + "learning_rate": 3.6014222062199024e-05, + "loss": 0.1993, + "step": 41374 + }, + { + "epoch": 3.3518308489954634, + "grad_norm": 0.06023063883185387, + "learning_rate": 3.60097214096044e-05, + "loss": 0.2228, + "step": 41375 + }, + { + "epoch": 3.3519118600129616, + "grad_norm": 0.08129986375570297, + "learning_rate": 3.6005220757009764e-05, + "loss": 0.2486, + "step": 41376 + }, + { + "epoch": 3.3519928710304603, + "grad_norm": 0.09041114151477814, + "learning_rate": 3.6000720104415144e-05, + "loss": 0.2713, + "step": 41377 + }, + { + "epoch": 3.3520738820479585, + "grad_norm": 0.06871502846479416, + "learning_rate": 3.599621945182052e-05, + "loss": 0.2315, + "step": 41378 + }, + { + "epoch": 3.3521548930654568, + "grad_norm": 0.08983299881219864, + "learning_rate": 3.5991718799225885e-05, + "loss": 0.2357, + "step": 41379 + }, + { + "epoch": 3.3522359040829555, + "grad_norm": 0.08149930089712143, + "learning_rate": 3.5987218146631265e-05, + "loss": 0.2141, + "step": 41380 + }, + { + "epoch": 3.3523169151004537, + "grad_norm": 0.051490750163793564, + "learning_rate": 3.598271749403664e-05, + "loss": 0.242, + "step": 41381 + }, + { + "epoch": 3.352397926117952, + "grad_norm": 0.06942000240087509, + "learning_rate": 3.5978216841442006e-05, + "loss": 0.1977, + "step": 41382 + }, + { + "epoch": 3.3524789371354506, + "grad_norm": 0.08682406693696976, + "learning_rate": 3.5973716188847386e-05, + "loss": 0.2316, + "step": 41383 + }, + { + "epoch": 3.352559948152949, + "grad_norm": 0.07400385290384293, + "learning_rate": 3.596921553625276e-05, + "loss": 0.2648, + "step": 41384 + }, + { + "epoch": 3.352640959170447, + "grad_norm": 0.06687964498996735, + "learning_rate": 3.596471488365813e-05, + "loss": 0.2522, + "step": 41385 + }, + { + "epoch": 3.3527219701879454, + "grad_norm": 0.05889909341931343, + "learning_rate": 3.596021423106351e-05, + "loss": 0.1898, + "step": 41386 + }, + { + "epoch": 3.352802981205444, + "grad_norm": 0.10397472977638245, + "learning_rate": 3.595571357846888e-05, + "loss": 0.2495, + "step": 41387 + }, + { + "epoch": 3.3528839922229423, + "grad_norm": 0.06285160779953003, + "learning_rate": 3.5951212925874254e-05, + "loss": 0.2641, + "step": 41388 + }, + { + "epoch": 3.3529650032404406, + "grad_norm": 0.08349147439002991, + "learning_rate": 3.594671227327963e-05, + "loss": 0.2406, + "step": 41389 + }, + { + "epoch": 3.3530460142579392, + "grad_norm": 0.07242781668901443, + "learning_rate": 3.5942211620685e-05, + "loss": 0.2258, + "step": 41390 + }, + { + "epoch": 3.3531270252754375, + "grad_norm": 0.06428693979978561, + "learning_rate": 3.5937710968090375e-05, + "loss": 0.2019, + "step": 41391 + }, + { + "epoch": 3.3532080362929357, + "grad_norm": 0.060086168348789215, + "learning_rate": 3.593321031549575e-05, + "loss": 0.1769, + "step": 41392 + }, 
+ { + "epoch": 3.353289047310434, + "grad_norm": 0.07465967535972595, + "learning_rate": 3.592870966290112e-05, + "loss": 0.2187, + "step": 41393 + }, + { + "epoch": 3.3533700583279327, + "grad_norm": 0.08104344457387924, + "learning_rate": 3.5924209010306496e-05, + "loss": 0.2183, + "step": 41394 + }, + { + "epoch": 3.353451069345431, + "grad_norm": 0.0797872468829155, + "learning_rate": 3.591970835771187e-05, + "loss": 0.2251, + "step": 41395 + }, + { + "epoch": 3.353532080362929, + "grad_norm": 0.07382632046937943, + "learning_rate": 3.591520770511724e-05, + "loss": 0.2221, + "step": 41396 + }, + { + "epoch": 3.353613091380428, + "grad_norm": 0.06717702001333237, + "learning_rate": 3.5910707052522616e-05, + "loss": 0.2045, + "step": 41397 + }, + { + "epoch": 3.353694102397926, + "grad_norm": 0.09043001383543015, + "learning_rate": 3.590620639992799e-05, + "loss": 0.2273, + "step": 41398 + }, + { + "epoch": 3.3537751134154243, + "grad_norm": 0.07683952152729034, + "learning_rate": 3.5901705747333364e-05, + "loss": 0.2321, + "step": 41399 + }, + { + "epoch": 3.353856124432923, + "grad_norm": 0.060896776616573334, + "learning_rate": 3.589720509473874e-05, + "loss": 0.2016, + "step": 41400 + }, + { + "epoch": 3.3539371354504213, + "grad_norm": 0.06365719437599182, + "learning_rate": 3.589270444214412e-05, + "loss": 0.2161, + "step": 41401 + }, + { + "epoch": 3.3540181464679195, + "grad_norm": 0.08571047335863113, + "learning_rate": 3.5888203789549484e-05, + "loss": 0.226, + "step": 41402 + }, + { + "epoch": 3.354099157485418, + "grad_norm": 0.06898649781942368, + "learning_rate": 3.588370313695486e-05, + "loss": 0.2279, + "step": 41403 + }, + { + "epoch": 3.3541801685029164, + "grad_norm": 0.07108738273382187, + "learning_rate": 3.587920248436024e-05, + "loss": 0.2603, + "step": 41404 + }, + { + "epoch": 3.3542611795204147, + "grad_norm": 0.07938271015882492, + "learning_rate": 3.5874701831765605e-05, + "loss": 0.2429, + "step": 41405 + }, + { + "epoch": 3.3543421905379134, + "grad_norm": 0.08200475573539734, + "learning_rate": 3.587020117917098e-05, + "loss": 0.2634, + "step": 41406 + }, + { + "epoch": 3.3544232015554116, + "grad_norm": 0.0656493604183197, + "learning_rate": 3.586570052657636e-05, + "loss": 0.2378, + "step": 41407 + }, + { + "epoch": 3.35450421257291, + "grad_norm": 0.08324894309043884, + "learning_rate": 3.5861199873981726e-05, + "loss": 0.2226, + "step": 41408 + }, + { + "epoch": 3.354585223590408, + "grad_norm": 0.07772985845804214, + "learning_rate": 3.58566992213871e-05, + "loss": 0.234, + "step": 41409 + }, + { + "epoch": 3.354666234607907, + "grad_norm": 0.07231470197439194, + "learning_rate": 3.585219856879248e-05, + "loss": 0.269, + "step": 41410 + }, + { + "epoch": 3.354747245625405, + "grad_norm": 0.07173066586256027, + "learning_rate": 3.584769791619785e-05, + "loss": 0.2278, + "step": 41411 + }, + { + "epoch": 3.3548282566429033, + "grad_norm": 0.0752212256193161, + "learning_rate": 3.584319726360322e-05, + "loss": 0.2057, + "step": 41412 + }, + { + "epoch": 3.354909267660402, + "grad_norm": 0.06340886652469635, + "learning_rate": 3.58386966110086e-05, + "loss": 0.2561, + "step": 41413 + }, + { + "epoch": 3.3549902786779002, + "grad_norm": 0.06413735449314117, + "learning_rate": 3.5834195958413974e-05, + "loss": 0.2133, + "step": 41414 + }, + { + "epoch": 3.3550712896953985, + "grad_norm": 0.07090727984905243, + "learning_rate": 3.582969530581934e-05, + "loss": 0.2272, + "step": 41415 + }, + { + "epoch": 3.3551523007128967, + "grad_norm": 0.05886656045913696, 
+ "learning_rate": 3.582519465322472e-05, + "loss": 0.206, + "step": 41416 + }, + { + "epoch": 3.3552333117303954, + "grad_norm": 0.15847265720367432, + "learning_rate": 3.5820694000630095e-05, + "loss": 0.2223, + "step": 41417 + }, + { + "epoch": 3.3553143227478937, + "grad_norm": 0.06966357678174973, + "learning_rate": 3.581619334803546e-05, + "loss": 0.2201, + "step": 41418 + }, + { + "epoch": 3.355395333765392, + "grad_norm": 0.06600771099328995, + "learning_rate": 3.581169269544084e-05, + "loss": 0.2487, + "step": 41419 + }, + { + "epoch": 3.3554763447828906, + "grad_norm": 0.078952357172966, + "learning_rate": 3.5807192042846216e-05, + "loss": 0.208, + "step": 41420 + }, + { + "epoch": 3.355557355800389, + "grad_norm": 0.06853339076042175, + "learning_rate": 3.580269139025158e-05, + "loss": 0.2166, + "step": 41421 + }, + { + "epoch": 3.355638366817887, + "grad_norm": 0.07810437679290771, + "learning_rate": 3.579819073765696e-05, + "loss": 0.2307, + "step": 41422 + }, + { + "epoch": 3.3557193778353858, + "grad_norm": 0.06687109917402267, + "learning_rate": 3.579369008506234e-05, + "loss": 0.2272, + "step": 41423 + }, + { + "epoch": 3.355800388852884, + "grad_norm": 0.06530658155679703, + "learning_rate": 3.5789189432467704e-05, + "loss": 0.2347, + "step": 41424 + }, + { + "epoch": 3.3558813998703823, + "grad_norm": 0.060046806931495667, + "learning_rate": 3.5784688779873084e-05, + "loss": 0.2189, + "step": 41425 + }, + { + "epoch": 3.355962410887881, + "grad_norm": 0.057740457355976105, + "learning_rate": 3.578018812727846e-05, + "loss": 0.213, + "step": 41426 + }, + { + "epoch": 3.356043421905379, + "grad_norm": 0.08021904528141022, + "learning_rate": 3.577568747468383e-05, + "loss": 0.2271, + "step": 41427 + }, + { + "epoch": 3.3561244329228774, + "grad_norm": 0.06996602565050125, + "learning_rate": 3.5771186822089205e-05, + "loss": 0.225, + "step": 41428 + }, + { + "epoch": 3.356205443940376, + "grad_norm": 0.08081180602312088, + "learning_rate": 3.576668616949458e-05, + "loss": 0.2193, + "step": 41429 + }, + { + "epoch": 3.3562864549578744, + "grad_norm": 0.07503603398799896, + "learning_rate": 3.576218551689995e-05, + "loss": 0.2441, + "step": 41430 + }, + { + "epoch": 3.3563674659753726, + "grad_norm": 0.06850998848676682, + "learning_rate": 3.5757684864305326e-05, + "loss": 0.2138, + "step": 41431 + }, + { + "epoch": 3.356448476992871, + "grad_norm": 0.0689677894115448, + "learning_rate": 3.57531842117107e-05, + "loss": 0.2448, + "step": 41432 + }, + { + "epoch": 3.3565294880103695, + "grad_norm": 0.08548016846179962, + "learning_rate": 3.574868355911607e-05, + "loss": 0.2585, + "step": 41433 + }, + { + "epoch": 3.356610499027868, + "grad_norm": 0.06984124332666397, + "learning_rate": 3.5744182906521447e-05, + "loss": 0.2284, + "step": 41434 + }, + { + "epoch": 3.356691510045366, + "grad_norm": 0.06978527456521988, + "learning_rate": 3.573968225392682e-05, + "loss": 0.2139, + "step": 41435 + }, + { + "epoch": 3.3567725210628647, + "grad_norm": 0.0799226388335228, + "learning_rate": 3.5735181601332194e-05, + "loss": 0.2263, + "step": 41436 + }, + { + "epoch": 3.356853532080363, + "grad_norm": 0.08357705920934677, + "learning_rate": 3.573068094873757e-05, + "loss": 0.2084, + "step": 41437 + }, + { + "epoch": 3.356934543097861, + "grad_norm": 0.07593013346195221, + "learning_rate": 3.572618029614294e-05, + "loss": 0.1942, + "step": 41438 + }, + { + "epoch": 3.3570155541153595, + "grad_norm": 0.08029653877019882, + "learning_rate": 3.5721679643548315e-05, + "loss": 0.2392, + 
"step": 41439 + }, + { + "epoch": 3.357096565132858, + "grad_norm": 0.062494512647390366, + "learning_rate": 3.5717178990953695e-05, + "loss": 0.2074, + "step": 41440 + }, + { + "epoch": 3.3571775761503564, + "grad_norm": 0.07503077387809753, + "learning_rate": 3.571267833835906e-05, + "loss": 0.2082, + "step": 41441 + }, + { + "epoch": 3.3572585871678546, + "grad_norm": 0.07346827536821365, + "learning_rate": 3.5708177685764435e-05, + "loss": 0.2568, + "step": 41442 + }, + { + "epoch": 3.3573395981853533, + "grad_norm": 0.07334146648645401, + "learning_rate": 3.5703677033169816e-05, + "loss": 0.2611, + "step": 41443 + }, + { + "epoch": 3.3574206092028516, + "grad_norm": 0.06703248620033264, + "learning_rate": 3.569917638057518e-05, + "loss": 0.2296, + "step": 41444 + }, + { + "epoch": 3.35750162022035, + "grad_norm": 0.07055786997079849, + "learning_rate": 3.5694675727980556e-05, + "loss": 0.2277, + "step": 41445 + }, + { + "epoch": 3.3575826312378485, + "grad_norm": 0.08582204580307007, + "learning_rate": 3.5690175075385937e-05, + "loss": 0.2566, + "step": 41446 + }, + { + "epoch": 3.3576636422553467, + "grad_norm": 0.07949892431497574, + "learning_rate": 3.5685674422791303e-05, + "loss": 0.2025, + "step": 41447 + }, + { + "epoch": 3.357744653272845, + "grad_norm": 0.0567675344645977, + "learning_rate": 3.568117377019668e-05, + "loss": 0.2358, + "step": 41448 + }, + { + "epoch": 3.3578256642903437, + "grad_norm": 0.07468612492084503, + "learning_rate": 3.567667311760206e-05, + "loss": 0.239, + "step": 41449 + }, + { + "epoch": 3.357906675307842, + "grad_norm": 0.07461059838533401, + "learning_rate": 3.5672172465007424e-05, + "loss": 0.2381, + "step": 41450 + }, + { + "epoch": 3.35798768632534, + "grad_norm": 0.05967641994357109, + "learning_rate": 3.56676718124128e-05, + "loss": 0.2433, + "step": 41451 + }, + { + "epoch": 3.358068697342839, + "grad_norm": 0.07766906172037125, + "learning_rate": 3.566317115981818e-05, + "loss": 0.2155, + "step": 41452 + }, + { + "epoch": 3.358149708360337, + "grad_norm": 0.07030050456523895, + "learning_rate": 3.565867050722355e-05, + "loss": 0.1952, + "step": 41453 + }, + { + "epoch": 3.3582307193778353, + "grad_norm": 0.08457501977682114, + "learning_rate": 3.565416985462892e-05, + "loss": 0.2148, + "step": 41454 + }, + { + "epoch": 3.3583117303953336, + "grad_norm": 0.07360049337148666, + "learning_rate": 3.56496692020343e-05, + "loss": 0.213, + "step": 41455 + }, + { + "epoch": 3.3583927414128323, + "grad_norm": 0.06616782397031784, + "learning_rate": 3.564516854943967e-05, + "loss": 0.2151, + "step": 41456 + }, + { + "epoch": 3.3584737524303305, + "grad_norm": 0.07079682499170303, + "learning_rate": 3.564066789684504e-05, + "loss": 0.2243, + "step": 41457 + }, + { + "epoch": 3.3585547634478288, + "grad_norm": 0.057243358343839645, + "learning_rate": 3.563616724425042e-05, + "loss": 0.2123, + "step": 41458 + }, + { + "epoch": 3.3586357744653275, + "grad_norm": 0.06692282855510712, + "learning_rate": 3.5631666591655793e-05, + "loss": 0.2198, + "step": 41459 + }, + { + "epoch": 3.3587167854828257, + "grad_norm": 0.07716784626245499, + "learning_rate": 3.562716593906116e-05, + "loss": 0.2282, + "step": 41460 + }, + { + "epoch": 3.358797796500324, + "grad_norm": 0.07021788507699966, + "learning_rate": 3.562266528646654e-05, + "loss": 0.2209, + "step": 41461 + }, + { + "epoch": 3.358878807517822, + "grad_norm": 0.07160168141126633, + "learning_rate": 3.5618164633871914e-05, + "loss": 0.2052, + "step": 41462 + }, + { + "epoch": 3.358959818535321, + 
"grad_norm": 0.07971487939357758, + "learning_rate": 3.561366398127728e-05, + "loss": 0.2111, + "step": 41463 + }, + { + "epoch": 3.359040829552819, + "grad_norm": 0.06069502979516983, + "learning_rate": 3.560916332868266e-05, + "loss": 0.2042, + "step": 41464 + }, + { + "epoch": 3.3591218405703174, + "grad_norm": 0.07485620677471161, + "learning_rate": 3.5604662676088035e-05, + "loss": 0.2355, + "step": 41465 + }, + { + "epoch": 3.359202851587816, + "grad_norm": 0.0703471377491951, + "learning_rate": 3.560016202349341e-05, + "loss": 0.2445, + "step": 41466 + }, + { + "epoch": 3.3592838626053143, + "grad_norm": 0.05236637964844704, + "learning_rate": 3.559566137089878e-05, + "loss": 0.1918, + "step": 41467 + }, + { + "epoch": 3.3593648736228126, + "grad_norm": 0.06970858573913574, + "learning_rate": 3.5591160718304156e-05, + "loss": 0.1843, + "step": 41468 + }, + { + "epoch": 3.3594458846403112, + "grad_norm": 0.09697674959897995, + "learning_rate": 3.558666006570953e-05, + "loss": 0.2533, + "step": 41469 + }, + { + "epoch": 3.3595268956578095, + "grad_norm": 0.05919763445854187, + "learning_rate": 3.55821594131149e-05, + "loss": 0.1918, + "step": 41470 + }, + { + "epoch": 3.3596079066753077, + "grad_norm": 0.07003922760486603, + "learning_rate": 3.557765876052028e-05, + "loss": 0.2213, + "step": 41471 + }, + { + "epoch": 3.3596889176928064, + "grad_norm": 0.05733330547809601, + "learning_rate": 3.557315810792565e-05, + "loss": 0.187, + "step": 41472 + }, + { + "epoch": 3.3597699287103047, + "grad_norm": 0.06819011270999908, + "learning_rate": 3.5568657455331024e-05, + "loss": 0.2577, + "step": 41473 + }, + { + "epoch": 3.359850939727803, + "grad_norm": 0.061745062470436096, + "learning_rate": 3.55641568027364e-05, + "loss": 0.2011, + "step": 41474 + }, + { + "epoch": 3.3599319507453016, + "grad_norm": 0.0705268457531929, + "learning_rate": 3.555965615014177e-05, + "loss": 0.2203, + "step": 41475 + }, + { + "epoch": 3.3600129617628, + "grad_norm": 0.07620300352573395, + "learning_rate": 3.5555155497547145e-05, + "loss": 0.2044, + "step": 41476 + }, + { + "epoch": 3.360093972780298, + "grad_norm": 0.07525172829627991, + "learning_rate": 3.555065484495252e-05, + "loss": 0.2413, + "step": 41477 + }, + { + "epoch": 3.3601749837977963, + "grad_norm": 0.07072553038597107, + "learning_rate": 3.554615419235789e-05, + "loss": 0.2422, + "step": 41478 + }, + { + "epoch": 3.360255994815295, + "grad_norm": 0.07391635328531265, + "learning_rate": 3.554165353976327e-05, + "loss": 0.2313, + "step": 41479 + }, + { + "epoch": 3.3603370058327933, + "grad_norm": 0.06640076637268066, + "learning_rate": 3.553715288716864e-05, + "loss": 0.2207, + "step": 41480 + }, + { + "epoch": 3.3604180168502915, + "grad_norm": 0.06782186031341553, + "learning_rate": 3.553265223457401e-05, + "loss": 0.2443, + "step": 41481 + }, + { + "epoch": 3.36049902786779, + "grad_norm": 0.07683203369379044, + "learning_rate": 3.552815158197939e-05, + "loss": 0.2067, + "step": 41482 + }, + { + "epoch": 3.3605800388852884, + "grad_norm": 0.07086572796106339, + "learning_rate": 3.552365092938476e-05, + "loss": 0.2251, + "step": 41483 + }, + { + "epoch": 3.3606610499027867, + "grad_norm": 0.0764269083738327, + "learning_rate": 3.5519150276790133e-05, + "loss": 0.2153, + "step": 41484 + }, + { + "epoch": 3.360742060920285, + "grad_norm": 0.08067718893289566, + "learning_rate": 3.5514649624195514e-05, + "loss": 0.2365, + "step": 41485 + }, + { + "epoch": 3.3608230719377836, + "grad_norm": 0.0750790387392044, + "learning_rate": 
3.551014897160088e-05, + "loss": 0.2577, + "step": 41486 + }, + { + "epoch": 3.360904082955282, + "grad_norm": 0.08475764095783234, + "learning_rate": 3.5505648319006254e-05, + "loss": 0.2226, + "step": 41487 + }, + { + "epoch": 3.36098509397278, + "grad_norm": 0.0630546286702156, + "learning_rate": 3.5501147666411635e-05, + "loss": 0.2075, + "step": 41488 + }, + { + "epoch": 3.361066104990279, + "grad_norm": 0.06775683164596558, + "learning_rate": 3.5496647013817e-05, + "loss": 0.2018, + "step": 41489 + }, + { + "epoch": 3.361147116007777, + "grad_norm": 0.07136673480272293, + "learning_rate": 3.5492146361222375e-05, + "loss": 0.2322, + "step": 41490 + }, + { + "epoch": 3.3612281270252753, + "grad_norm": 0.08858048915863037, + "learning_rate": 3.5487645708627755e-05, + "loss": 0.3272, + "step": 41491 + }, + { + "epoch": 3.361309138042774, + "grad_norm": 0.0807926133275032, + "learning_rate": 3.548314505603313e-05, + "loss": 0.2342, + "step": 41492 + }, + { + "epoch": 3.3613901490602722, + "grad_norm": 0.07923466712236404, + "learning_rate": 3.5478644403438496e-05, + "loss": 0.237, + "step": 41493 + }, + { + "epoch": 3.3614711600777705, + "grad_norm": 0.06726489961147308, + "learning_rate": 3.5474143750843876e-05, + "loss": 0.2268, + "step": 41494 + }, + { + "epoch": 3.361552171095269, + "grad_norm": 0.07273104041814804, + "learning_rate": 3.546964309824925e-05, + "loss": 0.2161, + "step": 41495 + }, + { + "epoch": 3.3616331821127674, + "grad_norm": 0.07782911509275436, + "learning_rate": 3.546514244565462e-05, + "loss": 0.2321, + "step": 41496 + }, + { + "epoch": 3.3617141931302656, + "grad_norm": 0.06289485096931458, + "learning_rate": 3.546064179306e-05, + "loss": 0.2091, + "step": 41497 + }, + { + "epoch": 3.3617952041477643, + "grad_norm": 0.06988698989152908, + "learning_rate": 3.545614114046537e-05, + "loss": 0.2265, + "step": 41498 + }, + { + "epoch": 3.3618762151652626, + "grad_norm": 0.08457628637552261, + "learning_rate": 3.545164048787074e-05, + "loss": 0.2137, + "step": 41499 + }, + { + "epoch": 3.361957226182761, + "grad_norm": 0.0646219551563263, + "learning_rate": 3.544713983527612e-05, + "loss": 0.2145, + "step": 41500 + }, + { + "epoch": 3.362038237200259, + "grad_norm": 0.0836854949593544, + "learning_rate": 3.544263918268149e-05, + "loss": 0.2149, + "step": 41501 + }, + { + "epoch": 3.3621192482177578, + "grad_norm": 0.06724182516336441, + "learning_rate": 3.5438138530086865e-05, + "loss": 0.1955, + "step": 41502 + }, + { + "epoch": 3.362200259235256, + "grad_norm": 0.08123544603586197, + "learning_rate": 3.543363787749224e-05, + "loss": 0.2378, + "step": 41503 + }, + { + "epoch": 3.3622812702527543, + "grad_norm": 0.08634920418262482, + "learning_rate": 3.542913722489761e-05, + "loss": 0.2397, + "step": 41504 + }, + { + "epoch": 3.362362281270253, + "grad_norm": 0.07229456305503845, + "learning_rate": 3.5424636572302986e-05, + "loss": 0.1964, + "step": 41505 + }, + { + "epoch": 3.362443292287751, + "grad_norm": 0.07100922614336014, + "learning_rate": 3.542013591970836e-05, + "loss": 0.2391, + "step": 41506 + }, + { + "epoch": 3.3625243033052494, + "grad_norm": 0.06853149086236954, + "learning_rate": 3.541563526711373e-05, + "loss": 0.2236, + "step": 41507 + }, + { + "epoch": 3.3626053143227477, + "grad_norm": 0.07335706055164337, + "learning_rate": 3.541113461451911e-05, + "loss": 0.2181, + "step": 41508 + }, + { + "epoch": 3.3626863253402464, + "grad_norm": 0.062075987458229065, + "learning_rate": 3.540663396192448e-05, + "loss": 0.2111, + "step": 41509 + }, + { + 
"epoch": 3.3627673363577446, + "grad_norm": 0.06849157810211182, + "learning_rate": 3.5402133309329854e-05, + "loss": 0.2474, + "step": 41510 + }, + { + "epoch": 3.362848347375243, + "grad_norm": 0.07038149982690811, + "learning_rate": 3.539763265673523e-05, + "loss": 0.2402, + "step": 41511 + }, + { + "epoch": 3.3629293583927415, + "grad_norm": 0.08231181651353836, + "learning_rate": 3.53931320041406e-05, + "loss": 0.2719, + "step": 41512 + }, + { + "epoch": 3.36301036941024, + "grad_norm": 0.07193673402070999, + "learning_rate": 3.5388631351545975e-05, + "loss": 0.2559, + "step": 41513 + }, + { + "epoch": 3.363091380427738, + "grad_norm": 0.07190826535224915, + "learning_rate": 3.538413069895135e-05, + "loss": 0.254, + "step": 41514 + }, + { + "epoch": 3.3631723914452367, + "grad_norm": 0.059861920773983, + "learning_rate": 3.537963004635672e-05, + "loss": 0.2133, + "step": 41515 + }, + { + "epoch": 3.363253402462735, + "grad_norm": 0.09745169430971146, + "learning_rate": 3.5375129393762096e-05, + "loss": 0.2472, + "step": 41516 + }, + { + "epoch": 3.363334413480233, + "grad_norm": 0.08337296545505524, + "learning_rate": 3.537062874116747e-05, + "loss": 0.2274, + "step": 41517 + }, + { + "epoch": 3.363415424497732, + "grad_norm": 0.08769182115793228, + "learning_rate": 3.536612808857285e-05, + "loss": 0.2502, + "step": 41518 + }, + { + "epoch": 3.36349643551523, + "grad_norm": 0.07994231581687927, + "learning_rate": 3.5361627435978216e-05, + "loss": 0.2367, + "step": 41519 + }, + { + "epoch": 3.3635774465327284, + "grad_norm": 0.06313978880643845, + "learning_rate": 3.535712678338359e-05, + "loss": 0.2276, + "step": 41520 + }, + { + "epoch": 3.363658457550227, + "grad_norm": 0.09761510044336319, + "learning_rate": 3.535262613078897e-05, + "loss": 0.2405, + "step": 41521 + }, + { + "epoch": 3.3637394685677253, + "grad_norm": 0.06474805623292923, + "learning_rate": 3.534812547819434e-05, + "loss": 0.2163, + "step": 41522 + }, + { + "epoch": 3.3638204795852236, + "grad_norm": 0.061308603733778, + "learning_rate": 3.534362482559971e-05, + "loss": 0.2052, + "step": 41523 + }, + { + "epoch": 3.363901490602722, + "grad_norm": 0.06479856371879578, + "learning_rate": 3.533912417300509e-05, + "loss": 0.2339, + "step": 41524 + }, + { + "epoch": 3.3639825016202205, + "grad_norm": 0.0835852101445198, + "learning_rate": 3.533462352041046e-05, + "loss": 0.2184, + "step": 41525 + }, + { + "epoch": 3.3640635126377187, + "grad_norm": 0.0681370347738266, + "learning_rate": 3.533012286781583e-05, + "loss": 0.2783, + "step": 41526 + }, + { + "epoch": 3.364144523655217, + "grad_norm": 0.08533981442451477, + "learning_rate": 3.532562221522121e-05, + "loss": 0.2507, + "step": 41527 + }, + { + "epoch": 3.3642255346727157, + "grad_norm": 0.06170447915792465, + "learning_rate": 3.532112156262658e-05, + "loss": 0.2043, + "step": 41528 + }, + { + "epoch": 3.364306545690214, + "grad_norm": 0.06911799311637878, + "learning_rate": 3.531662091003195e-05, + "loss": 0.2144, + "step": 41529 + }, + { + "epoch": 3.364387556707712, + "grad_norm": 0.0771351158618927, + "learning_rate": 3.531212025743733e-05, + "loss": 0.2393, + "step": 41530 + }, + { + "epoch": 3.3644685677252104, + "grad_norm": 0.09463619440793991, + "learning_rate": 3.5307619604842706e-05, + "loss": 0.2541, + "step": 41531 + }, + { + "epoch": 3.364549578742709, + "grad_norm": 0.08270283043384552, + "learning_rate": 3.530311895224807e-05, + "loss": 0.2583, + "step": 41532 + }, + { + "epoch": 3.3646305897602073, + "grad_norm": 0.08258695900440216, + 
"learning_rate": 3.5298618299653454e-05, + "loss": 0.2765, + "step": 41533 + }, + { + "epoch": 3.3647116007777056, + "grad_norm": 0.07178051024675369, + "learning_rate": 3.529411764705883e-05, + "loss": 0.2605, + "step": 41534 + }, + { + "epoch": 3.3647926117952043, + "grad_norm": 0.06768299639225006, + "learning_rate": 3.52896169944642e-05, + "loss": 0.203, + "step": 41535 + }, + { + "epoch": 3.3648736228127025, + "grad_norm": 0.060938406735658646, + "learning_rate": 3.5285116341869574e-05, + "loss": 0.2189, + "step": 41536 + }, + { + "epoch": 3.3649546338302008, + "grad_norm": 0.056721530854701996, + "learning_rate": 3.528061568927495e-05, + "loss": 0.2198, + "step": 41537 + }, + { + "epoch": 3.3650356448476995, + "grad_norm": 0.08980550616979599, + "learning_rate": 3.527611503668032e-05, + "loss": 0.2538, + "step": 41538 + }, + { + "epoch": 3.3651166558651977, + "grad_norm": 0.0845855101943016, + "learning_rate": 3.5271614384085695e-05, + "loss": 0.2587, + "step": 41539 + }, + { + "epoch": 3.365197666882696, + "grad_norm": 0.06626113504171371, + "learning_rate": 3.526711373149107e-05, + "loss": 0.2197, + "step": 41540 + }, + { + "epoch": 3.3652786779001946, + "grad_norm": 0.07001606374979019, + "learning_rate": 3.526261307889644e-05, + "loss": 0.257, + "step": 41541 + }, + { + "epoch": 3.365359688917693, + "grad_norm": 0.07456422597169876, + "learning_rate": 3.5258112426301816e-05, + "loss": 0.1906, + "step": 41542 + }, + { + "epoch": 3.365440699935191, + "grad_norm": 0.06453201174736023, + "learning_rate": 3.525361177370719e-05, + "loss": 0.1989, + "step": 41543 + }, + { + "epoch": 3.3655217109526894, + "grad_norm": 0.07249526679515839, + "learning_rate": 3.524911112111256e-05, + "loss": 0.216, + "step": 41544 + }, + { + "epoch": 3.365602721970188, + "grad_norm": 0.06700129806995392, + "learning_rate": 3.524461046851794e-05, + "loss": 0.2105, + "step": 41545 + }, + { + "epoch": 3.3656837329876863, + "grad_norm": 0.06668738275766373, + "learning_rate": 3.524010981592331e-05, + "loss": 0.2296, + "step": 41546 + }, + { + "epoch": 3.3657647440051845, + "grad_norm": 0.0665898472070694, + "learning_rate": 3.5235609163328684e-05, + "loss": 0.2088, + "step": 41547 + }, + { + "epoch": 3.3658457550226832, + "grad_norm": 0.07802686095237732, + "learning_rate": 3.523110851073406e-05, + "loss": 0.2559, + "step": 41548 + }, + { + "epoch": 3.3659267660401815, + "grad_norm": 0.09978897869586945, + "learning_rate": 3.522660785813943e-05, + "loss": 0.2663, + "step": 41549 + }, + { + "epoch": 3.3660077770576797, + "grad_norm": 0.0862162709236145, + "learning_rate": 3.5222107205544805e-05, + "loss": 0.2249, + "step": 41550 + }, + { + "epoch": 3.3660887880751784, + "grad_norm": 0.06248806044459343, + "learning_rate": 3.521760655295018e-05, + "loss": 0.2346, + "step": 41551 + }, + { + "epoch": 3.3661697990926767, + "grad_norm": 0.07725557684898376, + "learning_rate": 3.521310590035555e-05, + "loss": 0.2258, + "step": 41552 + }, + { + "epoch": 3.366250810110175, + "grad_norm": 0.08579937368631363, + "learning_rate": 3.5208605247760926e-05, + "loss": 0.2167, + "step": 41553 + }, + { + "epoch": 3.366331821127673, + "grad_norm": 0.07212509959936142, + "learning_rate": 3.52041045951663e-05, + "loss": 0.2356, + "step": 41554 + }, + { + "epoch": 3.366412832145172, + "grad_norm": 0.08952637761831284, + "learning_rate": 3.519960394257167e-05, + "loss": 0.2534, + "step": 41555 + }, + { + "epoch": 3.36649384316267, + "grad_norm": 0.05379028245806694, + "learning_rate": 3.5195103289977046e-05, + "loss": 0.2075, + 
"step": 41556 + }, + { + "epoch": 3.3665748541801683, + "grad_norm": 0.07278160005807877, + "learning_rate": 3.519060263738243e-05, + "loss": 0.2261, + "step": 41557 + }, + { + "epoch": 3.366655865197667, + "grad_norm": 0.07294684648513794, + "learning_rate": 3.5186101984787794e-05, + "loss": 0.2228, + "step": 41558 + }, + { + "epoch": 3.3667368762151653, + "grad_norm": 0.07547971606254578, + "learning_rate": 3.518160133219317e-05, + "loss": 0.24, + "step": 41559 + }, + { + "epoch": 3.3668178872326635, + "grad_norm": 0.08458521217107773, + "learning_rate": 3.517710067959855e-05, + "loss": 0.2331, + "step": 41560 + }, + { + "epoch": 3.366898898250162, + "grad_norm": 0.07553742080926895, + "learning_rate": 3.5172600027003915e-05, + "loss": 0.2154, + "step": 41561 + }, + { + "epoch": 3.3669799092676604, + "grad_norm": 0.07026293128728867, + "learning_rate": 3.516809937440929e-05, + "loss": 0.221, + "step": 41562 + }, + { + "epoch": 3.3670609202851587, + "grad_norm": 0.06618590652942657, + "learning_rate": 3.516359872181467e-05, + "loss": 0.2013, + "step": 41563 + }, + { + "epoch": 3.3671419313026574, + "grad_norm": 0.08520275354385376, + "learning_rate": 3.5159098069220035e-05, + "loss": 0.2444, + "step": 41564 + }, + { + "epoch": 3.3672229423201556, + "grad_norm": 0.07353122532367706, + "learning_rate": 3.515459741662541e-05, + "loss": 0.2403, + "step": 41565 + }, + { + "epoch": 3.367303953337654, + "grad_norm": 0.06767702102661133, + "learning_rate": 3.515009676403079e-05, + "loss": 0.2166, + "step": 41566 + }, + { + "epoch": 3.367384964355152, + "grad_norm": 0.062399983406066895, + "learning_rate": 3.5145596111436156e-05, + "loss": 0.2013, + "step": 41567 + }, + { + "epoch": 3.367465975372651, + "grad_norm": 0.06242172420024872, + "learning_rate": 3.514109545884153e-05, + "loss": 0.2193, + "step": 41568 + }, + { + "epoch": 3.367546986390149, + "grad_norm": 0.07391718775033951, + "learning_rate": 3.513659480624691e-05, + "loss": 0.2046, + "step": 41569 + }, + { + "epoch": 3.3676279974076473, + "grad_norm": 0.0698646679520607, + "learning_rate": 3.513209415365228e-05, + "loss": 0.3122, + "step": 41570 + }, + { + "epoch": 3.367709008425146, + "grad_norm": 0.07431770116090775, + "learning_rate": 3.512759350105766e-05, + "loss": 0.2247, + "step": 41571 + }, + { + "epoch": 3.3677900194426442, + "grad_norm": 0.0731852650642395, + "learning_rate": 3.512309284846303e-05, + "loss": 0.2467, + "step": 41572 + }, + { + "epoch": 3.3678710304601425, + "grad_norm": 0.06502462178468704, + "learning_rate": 3.5118592195868405e-05, + "loss": 0.2045, + "step": 41573 + }, + { + "epoch": 3.3679520414776407, + "grad_norm": 0.06441786885261536, + "learning_rate": 3.511409154327378e-05, + "loss": 0.2253, + "step": 41574 + }, + { + "epoch": 3.3680330524951394, + "grad_norm": 0.07801219075918198, + "learning_rate": 3.510959089067915e-05, + "loss": 0.2156, + "step": 41575 + }, + { + "epoch": 3.3681140635126376, + "grad_norm": 0.06515826284885406, + "learning_rate": 3.5105090238084525e-05, + "loss": 0.2164, + "step": 41576 + }, + { + "epoch": 3.368195074530136, + "grad_norm": 0.08127927780151367, + "learning_rate": 3.51005895854899e-05, + "loss": 0.2367, + "step": 41577 + }, + { + "epoch": 3.3682760855476346, + "grad_norm": 0.07091443240642548, + "learning_rate": 3.509608893289527e-05, + "loss": 0.222, + "step": 41578 + }, + { + "epoch": 3.368357096565133, + "grad_norm": 0.06333775073289871, + "learning_rate": 3.5091588280300646e-05, + "loss": 0.243, + "step": 41579 + }, + { + "epoch": 3.368438107582631, + "grad_norm": 
0.06674503535032272, + "learning_rate": 3.508708762770602e-05, + "loss": 0.22, + "step": 41580 + }, + { + "epoch": 3.3685191186001298, + "grad_norm": 0.07354476302862167, + "learning_rate": 3.508258697511139e-05, + "loss": 0.2317, + "step": 41581 + }, + { + "epoch": 3.368600129617628, + "grad_norm": 0.07980510592460632, + "learning_rate": 3.507808632251677e-05, + "loss": 0.2393, + "step": 41582 + }, + { + "epoch": 3.3686811406351262, + "grad_norm": 0.0695723295211792, + "learning_rate": 3.507358566992214e-05, + "loss": 0.2219, + "step": 41583 + }, + { + "epoch": 3.368762151652625, + "grad_norm": 0.09566250443458557, + "learning_rate": 3.5069085017327514e-05, + "loss": 0.2341, + "step": 41584 + }, + { + "epoch": 3.368843162670123, + "grad_norm": 0.08105815201997757, + "learning_rate": 3.506458436473289e-05, + "loss": 0.2185, + "step": 41585 + }, + { + "epoch": 3.3689241736876214, + "grad_norm": 0.05623576045036316, + "learning_rate": 3.506008371213826e-05, + "loss": 0.2123, + "step": 41586 + }, + { + "epoch": 3.36900518470512, + "grad_norm": 0.07283440977334976, + "learning_rate": 3.5055583059543635e-05, + "loss": 0.2149, + "step": 41587 + }, + { + "epoch": 3.3690861957226184, + "grad_norm": 0.1059572845697403, + "learning_rate": 3.505108240694901e-05, + "loss": 0.3047, + "step": 41588 + }, + { + "epoch": 3.3691672067401166, + "grad_norm": 0.08058130741119385, + "learning_rate": 3.504658175435438e-05, + "loss": 0.2359, + "step": 41589 + }, + { + "epoch": 3.369248217757615, + "grad_norm": 0.07778458297252655, + "learning_rate": 3.5042081101759756e-05, + "loss": 0.2128, + "step": 41590 + }, + { + "epoch": 3.3693292287751135, + "grad_norm": 0.06973801553249359, + "learning_rate": 3.503758044916513e-05, + "loss": 0.2097, + "step": 41591 + }, + { + "epoch": 3.369410239792612, + "grad_norm": 0.07245367765426636, + "learning_rate": 3.50330797965705e-05, + "loss": 0.2282, + "step": 41592 + }, + { + "epoch": 3.36949125081011, + "grad_norm": 0.0660821944475174, + "learning_rate": 3.5028579143975877e-05, + "loss": 0.2259, + "step": 41593 + }, + { + "epoch": 3.3695722618276087, + "grad_norm": 0.08014898747205734, + "learning_rate": 3.502407849138125e-05, + "loss": 0.2232, + "step": 41594 + }, + { + "epoch": 3.369653272845107, + "grad_norm": 0.07944493740797043, + "learning_rate": 3.5019577838786624e-05, + "loss": 0.2678, + "step": 41595 + }, + { + "epoch": 3.369734283862605, + "grad_norm": 0.06187062710523605, + "learning_rate": 3.5015077186192e-05, + "loss": 0.2129, + "step": 41596 + }, + { + "epoch": 3.3698152948801035, + "grad_norm": 0.0689127966761589, + "learning_rate": 3.501057653359737e-05, + "loss": 0.2384, + "step": 41597 + }, + { + "epoch": 3.369896305897602, + "grad_norm": 0.07030554860830307, + "learning_rate": 3.5006075881002745e-05, + "loss": 0.2134, + "step": 41598 + }, + { + "epoch": 3.3699773169151004, + "grad_norm": 0.057273104786872864, + "learning_rate": 3.5001575228408125e-05, + "loss": 0.192, + "step": 41599 + }, + { + "epoch": 3.3700583279325986, + "grad_norm": 0.07547903805971146, + "learning_rate": 3.499707457581349e-05, + "loss": 0.2211, + "step": 41600 + }, + { + "epoch": 3.3701393389500973, + "grad_norm": 0.06552055478096008, + "learning_rate": 3.4992573923218865e-05, + "loss": 0.2295, + "step": 41601 + }, + { + "epoch": 3.3702203499675956, + "grad_norm": 0.06693745404481888, + "learning_rate": 3.4988073270624246e-05, + "loss": 0.2524, + "step": 41602 + }, + { + "epoch": 3.370301360985094, + "grad_norm": 0.06710077077150345, + "learning_rate": 3.498357261802961e-05, + 
"loss": 0.2026, + "step": 41603 + }, + { + "epoch": 3.3703823720025925, + "grad_norm": 0.07833865284919739, + "learning_rate": 3.497907196543499e-05, + "loss": 0.2409, + "step": 41604 + }, + { + "epoch": 3.3704633830200907, + "grad_norm": 0.06898022443056107, + "learning_rate": 3.4974571312840367e-05, + "loss": 0.2081, + "step": 41605 + }, + { + "epoch": 3.370544394037589, + "grad_norm": 0.08391853421926498, + "learning_rate": 3.4970070660245733e-05, + "loss": 0.2388, + "step": 41606 + }, + { + "epoch": 3.3706254050550877, + "grad_norm": 0.06046483665704727, + "learning_rate": 3.4965570007651114e-05, + "loss": 0.2022, + "step": 41607 + }, + { + "epoch": 3.370706416072586, + "grad_norm": 0.07326920330524445, + "learning_rate": 3.496106935505649e-05, + "loss": 0.2629, + "step": 41608 + }, + { + "epoch": 3.370787427090084, + "grad_norm": 0.061868514865636826, + "learning_rate": 3.4956568702461854e-05, + "loss": 0.2276, + "step": 41609 + }, + { + "epoch": 3.370868438107583, + "grad_norm": 0.05633601173758507, + "learning_rate": 3.4952068049867235e-05, + "loss": 0.2445, + "step": 41610 + }, + { + "epoch": 3.370949449125081, + "grad_norm": 0.07111150026321411, + "learning_rate": 3.494756739727261e-05, + "loss": 0.2166, + "step": 41611 + }, + { + "epoch": 3.3710304601425793, + "grad_norm": 0.0728144571185112, + "learning_rate": 3.494306674467798e-05, + "loss": 0.203, + "step": 41612 + }, + { + "epoch": 3.3711114711600776, + "grad_norm": 0.08260970562696457, + "learning_rate": 3.4938566092083355e-05, + "loss": 0.2125, + "step": 41613 + }, + { + "epoch": 3.3711924821775763, + "grad_norm": 0.09042169153690338, + "learning_rate": 3.493406543948873e-05, + "loss": 0.2402, + "step": 41614 + }, + { + "epoch": 3.3712734931950745, + "grad_norm": 0.09613018482923508, + "learning_rate": 3.49295647868941e-05, + "loss": 0.2167, + "step": 41615 + }, + { + "epoch": 3.3713545042125728, + "grad_norm": 0.0749058797955513, + "learning_rate": 3.4925064134299476e-05, + "loss": 0.2764, + "step": 41616 + }, + { + "epoch": 3.3714355152300715, + "grad_norm": 0.07069750875234604, + "learning_rate": 3.492056348170485e-05, + "loss": 0.2284, + "step": 41617 + }, + { + "epoch": 3.3715165262475697, + "grad_norm": 0.07239100337028503, + "learning_rate": 3.4916062829110223e-05, + "loss": 0.19, + "step": 41618 + }, + { + "epoch": 3.371597537265068, + "grad_norm": 0.0704842135310173, + "learning_rate": 3.49115621765156e-05, + "loss": 0.2203, + "step": 41619 + }, + { + "epoch": 3.371678548282566, + "grad_norm": 0.05835307016968727, + "learning_rate": 3.490706152392097e-05, + "loss": 0.2111, + "step": 41620 + }, + { + "epoch": 3.371759559300065, + "grad_norm": 0.08694245666265488, + "learning_rate": 3.4902560871326344e-05, + "loss": 0.233, + "step": 41621 + }, + { + "epoch": 3.371840570317563, + "grad_norm": 0.06228380650281906, + "learning_rate": 3.489806021873172e-05, + "loss": 0.235, + "step": 41622 + }, + { + "epoch": 3.3719215813350614, + "grad_norm": 0.07264664769172668, + "learning_rate": 3.489355956613709e-05, + "loss": 0.2427, + "step": 41623 + }, + { + "epoch": 3.37200259235256, + "grad_norm": 0.08153603225946426, + "learning_rate": 3.4889058913542465e-05, + "loss": 0.2491, + "step": 41624 + }, + { + "epoch": 3.3720836033700583, + "grad_norm": 0.08536143600940704, + "learning_rate": 3.488455826094784e-05, + "loss": 0.2419, + "step": 41625 + }, + { + "epoch": 3.3721646143875565, + "grad_norm": 0.0751512423157692, + "learning_rate": 3.488005760835321e-05, + "loss": 0.2224, + "step": 41626 + }, + { + "epoch": 
3.3722456254050552, + "grad_norm": 0.07146193832159042, + "learning_rate": 3.4875556955758586e-05, + "loss": 0.2429, + "step": 41627 + }, + { + "epoch": 3.3723266364225535, + "grad_norm": 0.07874619960784912, + "learning_rate": 3.487105630316396e-05, + "loss": 0.2047, + "step": 41628 + }, + { + "epoch": 3.3724076474400517, + "grad_norm": 0.08496353775262833, + "learning_rate": 3.486655565056933e-05, + "loss": 0.2629, + "step": 41629 + }, + { + "epoch": 3.3724886584575504, + "grad_norm": 0.08023679256439209, + "learning_rate": 3.486205499797471e-05, + "loss": 0.2057, + "step": 41630 + }, + { + "epoch": 3.3725696694750487, + "grad_norm": 0.06785769015550613, + "learning_rate": 3.485755434538008e-05, + "loss": 0.2537, + "step": 41631 + }, + { + "epoch": 3.372650680492547, + "grad_norm": 0.0767764076590538, + "learning_rate": 3.4853053692785454e-05, + "loss": 0.2433, + "step": 41632 + }, + { + "epoch": 3.3727316915100456, + "grad_norm": 0.07684066146612167, + "learning_rate": 3.484855304019083e-05, + "loss": 0.2284, + "step": 41633 + }, + { + "epoch": 3.372812702527544, + "grad_norm": 0.06635360419750214, + "learning_rate": 3.48440523875962e-05, + "loss": 0.2054, + "step": 41634 + }, + { + "epoch": 3.372893713545042, + "grad_norm": 0.07786551862955093, + "learning_rate": 3.4839551735001575e-05, + "loss": 0.2492, + "step": 41635 + }, + { + "epoch": 3.3729747245625403, + "grad_norm": 0.08272644132375717, + "learning_rate": 3.483505108240695e-05, + "loss": 0.231, + "step": 41636 + }, + { + "epoch": 3.373055735580039, + "grad_norm": 0.05650690197944641, + "learning_rate": 3.483055042981233e-05, + "loss": 0.2144, + "step": 41637 + }, + { + "epoch": 3.3731367465975373, + "grad_norm": 0.09380374103784561, + "learning_rate": 3.48260497772177e-05, + "loss": 0.2343, + "step": 41638 + }, + { + "epoch": 3.3732177576150355, + "grad_norm": 0.0768943652510643, + "learning_rate": 3.482154912462307e-05, + "loss": 0.2448, + "step": 41639 + }, + { + "epoch": 3.373298768632534, + "grad_norm": 0.0706968829035759, + "learning_rate": 3.481704847202845e-05, + "loss": 0.2335, + "step": 41640 + }, + { + "epoch": 3.3733797796500324, + "grad_norm": 0.07146643102169037, + "learning_rate": 3.481254781943382e-05, + "loss": 0.2399, + "step": 41641 + }, + { + "epoch": 3.3734607906675307, + "grad_norm": 0.07195454090833664, + "learning_rate": 3.480804716683919e-05, + "loss": 0.2083, + "step": 41642 + }, + { + "epoch": 3.373541801685029, + "grad_norm": 0.07778682559728622, + "learning_rate": 3.480354651424457e-05, + "loss": 0.212, + "step": 41643 + }, + { + "epoch": 3.3736228127025276, + "grad_norm": 0.07331526279449463, + "learning_rate": 3.4799045861649944e-05, + "loss": 0.2222, + "step": 41644 + }, + { + "epoch": 3.373703823720026, + "grad_norm": 0.06694495677947998, + "learning_rate": 3.479454520905531e-05, + "loss": 0.2364, + "step": 41645 + }, + { + "epoch": 3.373784834737524, + "grad_norm": 0.07718578726053238, + "learning_rate": 3.479004455646069e-05, + "loss": 0.2363, + "step": 41646 + }, + { + "epoch": 3.373865845755023, + "grad_norm": 0.0727149173617363, + "learning_rate": 3.4785543903866065e-05, + "loss": 0.2431, + "step": 41647 + }, + { + "epoch": 3.373946856772521, + "grad_norm": 0.07330166548490524, + "learning_rate": 3.478104325127143e-05, + "loss": 0.2347, + "step": 41648 + }, + { + "epoch": 3.3740278677900193, + "grad_norm": 0.0676233097910881, + "learning_rate": 3.477654259867681e-05, + "loss": 0.2062, + "step": 41649 + }, + { + "epoch": 3.374108878807518, + "grad_norm": 0.0689397007226944, + "learning_rate": 
3.4772041946082186e-05, + "loss": 0.236, + "step": 41650 + }, + { + "epoch": 3.374189889825016, + "grad_norm": 0.06201153248548508, + "learning_rate": 3.476754129348756e-05, + "loss": 0.2163, + "step": 41651 + }, + { + "epoch": 3.3742709008425145, + "grad_norm": 0.064112089574337, + "learning_rate": 3.476304064089293e-05, + "loss": 0.2089, + "step": 41652 + }, + { + "epoch": 3.374351911860013, + "grad_norm": 0.07973623275756836, + "learning_rate": 3.4758539988298306e-05, + "loss": 0.2264, + "step": 41653 + }, + { + "epoch": 3.3744329228775114, + "grad_norm": 0.06909281015396118, + "learning_rate": 3.475403933570368e-05, + "loss": 0.1831, + "step": 41654 + }, + { + "epoch": 3.3745139338950096, + "grad_norm": 0.06308528780937195, + "learning_rate": 3.4749538683109054e-05, + "loss": 0.1768, + "step": 41655 + }, + { + "epoch": 3.3745949449125083, + "grad_norm": 0.08469226956367493, + "learning_rate": 3.474503803051443e-05, + "loss": 0.2607, + "step": 41656 + }, + { + "epoch": 3.3746759559300066, + "grad_norm": 0.07713216543197632, + "learning_rate": 3.47405373779198e-05, + "loss": 0.2268, + "step": 41657 + }, + { + "epoch": 3.374756966947505, + "grad_norm": 0.0848437175154686, + "learning_rate": 3.4736036725325174e-05, + "loss": 0.2055, + "step": 41658 + }, + { + "epoch": 3.374837977965003, + "grad_norm": 0.0557897612452507, + "learning_rate": 3.473153607273055e-05, + "loss": 0.228, + "step": 41659 + }, + { + "epoch": 3.3749189889825018, + "grad_norm": 0.07862678170204163, + "learning_rate": 3.472703542013592e-05, + "loss": 0.241, + "step": 41660 + }, + { + "epoch": 3.375, + "grad_norm": 0.09499645978212357, + "learning_rate": 3.4722534767541295e-05, + "loss": 0.1893, + "step": 41661 + }, + { + "epoch": 3.3750810110174982, + "grad_norm": 0.0774679183959961, + "learning_rate": 3.471803411494667e-05, + "loss": 0.23, + "step": 41662 + }, + { + "epoch": 3.375162022034997, + "grad_norm": 0.07637955993413925, + "learning_rate": 3.471353346235204e-05, + "loss": 0.2347, + "step": 41663 + }, + { + "epoch": 3.375243033052495, + "grad_norm": 0.0803007259964943, + "learning_rate": 3.4709032809757416e-05, + "loss": 0.2368, + "step": 41664 + }, + { + "epoch": 3.3753240440699934, + "grad_norm": 0.0648980364203453, + "learning_rate": 3.470453215716279e-05, + "loss": 0.2305, + "step": 41665 + }, + { + "epoch": 3.3754050550874917, + "grad_norm": 0.08036522567272186, + "learning_rate": 3.470003150456816e-05, + "loss": 0.2259, + "step": 41666 + }, + { + "epoch": 3.3754860661049904, + "grad_norm": 0.06768336892127991, + "learning_rate": 3.469553085197354e-05, + "loss": 0.2213, + "step": 41667 + }, + { + "epoch": 3.3755670771224886, + "grad_norm": 0.09597831964492798, + "learning_rate": 3.469103019937891e-05, + "loss": 0.2566, + "step": 41668 + }, + { + "epoch": 3.375648088139987, + "grad_norm": 0.06598563492298126, + "learning_rate": 3.4686529546784284e-05, + "loss": 0.2495, + "step": 41669 + }, + { + "epoch": 3.3757290991574855, + "grad_norm": 0.06711921095848083, + "learning_rate": 3.468202889418966e-05, + "loss": 0.2652, + "step": 41670 + }, + { + "epoch": 3.375810110174984, + "grad_norm": 0.061890557408332825, + "learning_rate": 3.467752824159503e-05, + "loss": 0.2294, + "step": 41671 + }, + { + "epoch": 3.375891121192482, + "grad_norm": 0.0685076043009758, + "learning_rate": 3.4673027589000405e-05, + "loss": 0.2186, + "step": 41672 + }, + { + "epoch": 3.3759721322099807, + "grad_norm": 0.07758630812168121, + "learning_rate": 3.4668526936405785e-05, + "loss": 0.194, + "step": 41673 + }, + { + "epoch": 
3.376053143227479, + "grad_norm": 0.07140947878360748, + "learning_rate": 3.466402628381115e-05, + "loss": 0.2029, + "step": 41674 + }, + { + "epoch": 3.376134154244977, + "grad_norm": 0.0828029066324234, + "learning_rate": 3.4659525631216526e-05, + "loss": 0.2559, + "step": 41675 + }, + { + "epoch": 3.376215165262476, + "grad_norm": 0.07145032286643982, + "learning_rate": 3.4655024978621906e-05, + "loss": 0.2119, + "step": 41676 + }, + { + "epoch": 3.376296176279974, + "grad_norm": 0.06601813435554504, + "learning_rate": 3.465052432602728e-05, + "loss": 0.2438, + "step": 41677 + }, + { + "epoch": 3.3763771872974724, + "grad_norm": 0.059962522238492966, + "learning_rate": 3.4646023673432646e-05, + "loss": 0.2036, + "step": 41678 + }, + { + "epoch": 3.376458198314971, + "grad_norm": 0.06766490638256073, + "learning_rate": 3.464152302083803e-05, + "loss": 0.2306, + "step": 41679 + }, + { + "epoch": 3.3765392093324693, + "grad_norm": 0.07134569436311722, + "learning_rate": 3.46370223682434e-05, + "loss": 0.2413, + "step": 41680 + }, + { + "epoch": 3.3766202203499676, + "grad_norm": 0.06784982234239578, + "learning_rate": 3.463252171564877e-05, + "loss": 0.2278, + "step": 41681 + }, + { + "epoch": 3.376701231367466, + "grad_norm": 0.07382599264383316, + "learning_rate": 3.462802106305415e-05, + "loss": 0.2438, + "step": 41682 + }, + { + "epoch": 3.3767822423849645, + "grad_norm": 0.09235008805990219, + "learning_rate": 3.462352041045952e-05, + "loss": 0.2344, + "step": 41683 + }, + { + "epoch": 3.3768632534024627, + "grad_norm": 0.08357124775648117, + "learning_rate": 3.461901975786489e-05, + "loss": 0.2648, + "step": 41684 + }, + { + "epoch": 3.376944264419961, + "grad_norm": 0.06024469807744026, + "learning_rate": 3.461451910527027e-05, + "loss": 0.227, + "step": 41685 + }, + { + "epoch": 3.3770252754374597, + "grad_norm": 0.07447897642850876, + "learning_rate": 3.461001845267564e-05, + "loss": 0.2067, + "step": 41686 + }, + { + "epoch": 3.377106286454958, + "grad_norm": 0.06601405888795853, + "learning_rate": 3.460551780008101e-05, + "loss": 0.2243, + "step": 41687 + }, + { + "epoch": 3.377187297472456, + "grad_norm": 0.06824536621570587, + "learning_rate": 3.460101714748639e-05, + "loss": 0.2331, + "step": 41688 + }, + { + "epoch": 3.3772683084899544, + "grad_norm": 0.06827297061681747, + "learning_rate": 3.459651649489176e-05, + "loss": 0.1986, + "step": 41689 + }, + { + "epoch": 3.377349319507453, + "grad_norm": 0.08309777826070786, + "learning_rate": 3.4592015842297136e-05, + "loss": 0.2556, + "step": 41690 + }, + { + "epoch": 3.3774303305249513, + "grad_norm": 0.08225461095571518, + "learning_rate": 3.458751518970251e-05, + "loss": 0.2468, + "step": 41691 + }, + { + "epoch": 3.3775113415424496, + "grad_norm": 0.07561355829238892, + "learning_rate": 3.4583014537107884e-05, + "loss": 0.2368, + "step": 41692 + }, + { + "epoch": 3.3775923525599483, + "grad_norm": 0.055802978575229645, + "learning_rate": 3.457851388451326e-05, + "loss": 0.239, + "step": 41693 + }, + { + "epoch": 3.3776733635774465, + "grad_norm": 0.06992287933826447, + "learning_rate": 3.457401323191863e-05, + "loss": 0.2551, + "step": 41694 + }, + { + "epoch": 3.3777543745949448, + "grad_norm": 0.0677470713853836, + "learning_rate": 3.4569512579324004e-05, + "loss": 0.2556, + "step": 41695 + }, + { + "epoch": 3.3778353856124435, + "grad_norm": 0.07533083856105804, + "learning_rate": 3.456501192672938e-05, + "loss": 0.2256, + "step": 41696 + }, + { + "epoch": 3.3779163966299417, + "grad_norm": 0.08108577132225037, + 
"learning_rate": 3.456051127413475e-05, + "loss": 0.2122, + "step": 41697 + }, + { + "epoch": 3.37799740764744, + "grad_norm": 0.06752969324588776, + "learning_rate": 3.4556010621540125e-05, + "loss": 0.2487, + "step": 41698 + }, + { + "epoch": 3.3780784186649386, + "grad_norm": 0.08408936858177185, + "learning_rate": 3.45515099689455e-05, + "loss": 0.225, + "step": 41699 + }, + { + "epoch": 3.378159429682437, + "grad_norm": 0.060827113687992096, + "learning_rate": 3.454700931635087e-05, + "loss": 0.2148, + "step": 41700 + }, + { + "epoch": 3.378240440699935, + "grad_norm": 0.07266361266374588, + "learning_rate": 3.4542508663756246e-05, + "loss": 0.2348, + "step": 41701 + }, + { + "epoch": 3.378321451717434, + "grad_norm": 0.07331918925046921, + "learning_rate": 3.453800801116162e-05, + "loss": 0.2312, + "step": 41702 + }, + { + "epoch": 3.378402462734932, + "grad_norm": 0.06699402630329132, + "learning_rate": 3.453350735856699e-05, + "loss": 0.2344, + "step": 41703 + }, + { + "epoch": 3.3784834737524303, + "grad_norm": 0.07557890564203262, + "learning_rate": 3.452900670597237e-05, + "loss": 0.2351, + "step": 41704 + }, + { + "epoch": 3.3785644847699285, + "grad_norm": 0.07931483536958694, + "learning_rate": 3.452450605337774e-05, + "loss": 0.2256, + "step": 41705 + }, + { + "epoch": 3.3786454957874272, + "grad_norm": 0.06666845083236694, + "learning_rate": 3.452000540078312e-05, + "loss": 0.2349, + "step": 41706 + }, + { + "epoch": 3.3787265068049255, + "grad_norm": 0.0748247355222702, + "learning_rate": 3.451550474818849e-05, + "loss": 0.2221, + "step": 41707 + }, + { + "epoch": 3.3788075178224237, + "grad_norm": 0.08108245581388474, + "learning_rate": 3.451100409559386e-05, + "loss": 0.2018, + "step": 41708 + }, + { + "epoch": 3.3788885288399224, + "grad_norm": 0.058831606060266495, + "learning_rate": 3.450650344299924e-05, + "loss": 0.2051, + "step": 41709 + }, + { + "epoch": 3.3789695398574207, + "grad_norm": 0.0735163614153862, + "learning_rate": 3.450200279040461e-05, + "loss": 0.2383, + "step": 41710 + }, + { + "epoch": 3.379050550874919, + "grad_norm": 0.0725238099694252, + "learning_rate": 3.449750213780998e-05, + "loss": 0.1845, + "step": 41711 + }, + { + "epoch": 3.379131561892417, + "grad_norm": 0.07492050528526306, + "learning_rate": 3.449300148521536e-05, + "loss": 0.2192, + "step": 41712 + }, + { + "epoch": 3.379212572909916, + "grad_norm": 0.06349498778581619, + "learning_rate": 3.448850083262073e-05, + "loss": 0.2394, + "step": 41713 + }, + { + "epoch": 3.379293583927414, + "grad_norm": 0.07114043831825256, + "learning_rate": 3.44840001800261e-05, + "loss": 0.2025, + "step": 41714 + }, + { + "epoch": 3.3793745949449123, + "grad_norm": 0.05922449752688408, + "learning_rate": 3.447949952743148e-05, + "loss": 0.2159, + "step": 41715 + }, + { + "epoch": 3.379455605962411, + "grad_norm": 0.07918506860733032, + "learning_rate": 3.447499887483686e-05, + "loss": 0.2496, + "step": 41716 + }, + { + "epoch": 3.3795366169799093, + "grad_norm": 0.07430548965930939, + "learning_rate": 3.4470498222242224e-05, + "loss": 0.2284, + "step": 41717 + }, + { + "epoch": 3.3796176279974075, + "grad_norm": 0.05439443141222, + "learning_rate": 3.4465997569647604e-05, + "loss": 0.2187, + "step": 41718 + }, + { + "epoch": 3.379698639014906, + "grad_norm": 0.05918572098016739, + "learning_rate": 3.446149691705298e-05, + "loss": 0.1868, + "step": 41719 + }, + { + "epoch": 3.3797796500324044, + "grad_norm": 0.06798598915338516, + "learning_rate": 3.4456996264458345e-05, + "loss": 0.1913, + "step": 
41720 + }, + { + "epoch": 3.3798606610499027, + "grad_norm": 0.07275962084531784, + "learning_rate": 3.4452495611863725e-05, + "loss": 0.2509, + "step": 41721 + }, + { + "epoch": 3.3799416720674014, + "grad_norm": 0.08497357368469238, + "learning_rate": 3.44479949592691e-05, + "loss": 0.1995, + "step": 41722 + }, + { + "epoch": 3.3800226830848996, + "grad_norm": 0.04967997223138809, + "learning_rate": 3.4443494306674465e-05, + "loss": 0.2066, + "step": 41723 + }, + { + "epoch": 3.380103694102398, + "grad_norm": 0.07163437455892563, + "learning_rate": 3.4438993654079846e-05, + "loss": 0.22, + "step": 41724 + }, + { + "epoch": 3.3801847051198965, + "grad_norm": 0.06914282590150833, + "learning_rate": 3.443449300148522e-05, + "loss": 0.2032, + "step": 41725 + }, + { + "epoch": 3.380265716137395, + "grad_norm": 0.09515602886676788, + "learning_rate": 3.4429992348890586e-05, + "loss": 0.2449, + "step": 41726 + }, + { + "epoch": 3.380346727154893, + "grad_norm": 0.06940874457359314, + "learning_rate": 3.4425491696295967e-05, + "loss": 0.2025, + "step": 41727 + }, + { + "epoch": 3.3804277381723913, + "grad_norm": 0.08153418451547623, + "learning_rate": 3.442099104370134e-05, + "loss": 0.2202, + "step": 41728 + }, + { + "epoch": 3.38050874918989, + "grad_norm": 0.06321097910404205, + "learning_rate": 3.441649039110671e-05, + "loss": 0.2001, + "step": 41729 + }, + { + "epoch": 3.380589760207388, + "grad_norm": 0.07211658358573914, + "learning_rate": 3.441198973851209e-05, + "loss": 0.2133, + "step": 41730 + }, + { + "epoch": 3.3806707712248865, + "grad_norm": 0.07158449292182922, + "learning_rate": 3.440748908591746e-05, + "loss": 0.2732, + "step": 41731 + }, + { + "epoch": 3.380751782242385, + "grad_norm": 0.06440142542123795, + "learning_rate": 3.4402988433322835e-05, + "loss": 0.2198, + "step": 41732 + }, + { + "epoch": 3.3808327932598834, + "grad_norm": 0.06171039864420891, + "learning_rate": 3.439848778072821e-05, + "loss": 0.2142, + "step": 41733 + }, + { + "epoch": 3.3809138042773816, + "grad_norm": 0.06435028463602066, + "learning_rate": 3.439398712813358e-05, + "loss": 0.1972, + "step": 41734 + }, + { + "epoch": 3.38099481529488, + "grad_norm": 0.058364566415548325, + "learning_rate": 3.4389486475538955e-05, + "loss": 0.2247, + "step": 41735 + }, + { + "epoch": 3.3810758263123786, + "grad_norm": 0.06774954497814178, + "learning_rate": 3.438498582294433e-05, + "loss": 0.2277, + "step": 41736 + }, + { + "epoch": 3.381156837329877, + "grad_norm": 0.06185242161154747, + "learning_rate": 3.43804851703497e-05, + "loss": 0.1978, + "step": 41737 + }, + { + "epoch": 3.381237848347375, + "grad_norm": 0.07757459580898285, + "learning_rate": 3.4375984517755076e-05, + "loss": 0.2111, + "step": 41738 + }, + { + "epoch": 3.3813188593648738, + "grad_norm": 0.07607049494981766, + "learning_rate": 3.437148386516045e-05, + "loss": 0.227, + "step": 41739 + }, + { + "epoch": 3.381399870382372, + "grad_norm": 0.07231981307268143, + "learning_rate": 3.436698321256582e-05, + "loss": 0.2336, + "step": 41740 + }, + { + "epoch": 3.3814808813998702, + "grad_norm": 0.06916042417287827, + "learning_rate": 3.43624825599712e-05, + "loss": 0.2382, + "step": 41741 + }, + { + "epoch": 3.381561892417369, + "grad_norm": 0.08472224324941635, + "learning_rate": 3.435798190737657e-05, + "loss": 0.2729, + "step": 41742 + }, + { + "epoch": 3.381642903434867, + "grad_norm": 0.06384436786174774, + "learning_rate": 3.4353481254781944e-05, + "loss": 0.1799, + "step": 41743 + }, + { + "epoch": 3.3817239144523654, + "grad_norm": 
0.07143661379814148, + "learning_rate": 3.434898060218732e-05, + "loss": 0.2198, + "step": 41744 + }, + { + "epoch": 3.381804925469864, + "grad_norm": 0.0806695744395256, + "learning_rate": 3.43444799495927e-05, + "loss": 0.243, + "step": 41745 + }, + { + "epoch": 3.3818859364873624, + "grad_norm": 0.07502878457307816, + "learning_rate": 3.4339979296998065e-05, + "loss": 0.2412, + "step": 41746 + }, + { + "epoch": 3.3819669475048606, + "grad_norm": 0.07038351148366928, + "learning_rate": 3.433547864440344e-05, + "loss": 0.2576, + "step": 41747 + }, + { + "epoch": 3.3820479585223593, + "grad_norm": 0.0637182965874672, + "learning_rate": 3.433097799180882e-05, + "loss": 0.2341, + "step": 41748 + }, + { + "epoch": 3.3821289695398575, + "grad_norm": 0.06181885302066803, + "learning_rate": 3.4326477339214186e-05, + "loss": 0.2047, + "step": 41749 + }, + { + "epoch": 3.3822099805573558, + "grad_norm": 0.06934962421655655, + "learning_rate": 3.432197668661956e-05, + "loss": 0.1977, + "step": 41750 + }, + { + "epoch": 3.382290991574854, + "grad_norm": 0.05986544117331505, + "learning_rate": 3.431747603402494e-05, + "loss": 0.2148, + "step": 41751 + }, + { + "epoch": 3.3823720025923527, + "grad_norm": 0.06989867985248566, + "learning_rate": 3.431297538143031e-05, + "loss": 0.2379, + "step": 41752 + }, + { + "epoch": 3.382453013609851, + "grad_norm": 0.07669881731271744, + "learning_rate": 3.430847472883568e-05, + "loss": 0.2418, + "step": 41753 + }, + { + "epoch": 3.382534024627349, + "grad_norm": 0.06954249739646912, + "learning_rate": 3.430397407624106e-05, + "loss": 0.2336, + "step": 41754 + }, + { + "epoch": 3.382615035644848, + "grad_norm": 0.0775119811296463, + "learning_rate": 3.429947342364643e-05, + "loss": 0.191, + "step": 41755 + }, + { + "epoch": 3.382696046662346, + "grad_norm": 0.0706276074051857, + "learning_rate": 3.42949727710518e-05, + "loss": 0.2577, + "step": 41756 + }, + { + "epoch": 3.3827770576798444, + "grad_norm": 0.08283329755067825, + "learning_rate": 3.429047211845718e-05, + "loss": 0.235, + "step": 41757 + }, + { + "epoch": 3.3828580686973426, + "grad_norm": 0.07096508890390396, + "learning_rate": 3.4285971465862555e-05, + "loss": 0.2305, + "step": 41758 + }, + { + "epoch": 3.3829390797148413, + "grad_norm": 0.07957513630390167, + "learning_rate": 3.428147081326792e-05, + "loss": 0.21, + "step": 41759 + }, + { + "epoch": 3.3830200907323396, + "grad_norm": 0.07610480487346649, + "learning_rate": 3.42769701606733e-05, + "loss": 0.2155, + "step": 41760 + }, + { + "epoch": 3.383101101749838, + "grad_norm": 0.07064136117696762, + "learning_rate": 3.4272469508078676e-05, + "loss": 0.2277, + "step": 41761 + }, + { + "epoch": 3.3831821127673365, + "grad_norm": 0.10831783711910248, + "learning_rate": 3.426796885548404e-05, + "loss": 0.2739, + "step": 41762 + }, + { + "epoch": 3.3832631237848347, + "grad_norm": 0.0787920281291008, + "learning_rate": 3.426346820288942e-05, + "loss": 0.2118, + "step": 41763 + }, + { + "epoch": 3.383344134802333, + "grad_norm": 0.06703635305166245, + "learning_rate": 3.42589675502948e-05, + "loss": 0.2023, + "step": 41764 + }, + { + "epoch": 3.3834251458198317, + "grad_norm": 0.06550689041614532, + "learning_rate": 3.4254466897700163e-05, + "loss": 0.2192, + "step": 41765 + }, + { + "epoch": 3.38350615683733, + "grad_norm": 0.08957525342702866, + "learning_rate": 3.4249966245105544e-05, + "loss": 0.2328, + "step": 41766 + }, + { + "epoch": 3.383587167854828, + "grad_norm": 0.07353270053863525, + "learning_rate": 3.424546559251092e-05, + "loss": 
0.2321, + "step": 41767 + }, + { + "epoch": 3.383668178872327, + "grad_norm": 0.06502358615398407, + "learning_rate": 3.4240964939916284e-05, + "loss": 0.2528, + "step": 41768 + }, + { + "epoch": 3.383749189889825, + "grad_norm": 0.061305753886699677, + "learning_rate": 3.4236464287321665e-05, + "loss": 0.1764, + "step": 41769 + }, + { + "epoch": 3.3838302009073233, + "grad_norm": 0.07350903749465942, + "learning_rate": 3.423196363472704e-05, + "loss": 0.2103, + "step": 41770 + }, + { + "epoch": 3.3839112119248216, + "grad_norm": 0.07233782857656479, + "learning_rate": 3.422746298213241e-05, + "loss": 0.2348, + "step": 41771 + }, + { + "epoch": 3.3839922229423203, + "grad_norm": 0.06870030611753464, + "learning_rate": 3.4222962329537785e-05, + "loss": 0.2086, + "step": 41772 + }, + { + "epoch": 3.3840732339598185, + "grad_norm": 0.06497982144355774, + "learning_rate": 3.421846167694316e-05, + "loss": 0.1936, + "step": 41773 + }, + { + "epoch": 3.3841542449773168, + "grad_norm": 0.06733326613903046, + "learning_rate": 3.421396102434853e-05, + "loss": 0.1935, + "step": 41774 + }, + { + "epoch": 3.3842352559948155, + "grad_norm": 0.07830017805099487, + "learning_rate": 3.4209460371753906e-05, + "loss": 0.1875, + "step": 41775 + }, + { + "epoch": 3.3843162670123137, + "grad_norm": 0.06126867234706879, + "learning_rate": 3.420495971915928e-05, + "loss": 0.2175, + "step": 41776 + }, + { + "epoch": 3.384397278029812, + "grad_norm": 0.06090138480067253, + "learning_rate": 3.4200459066564653e-05, + "loss": 0.2057, + "step": 41777 + }, + { + "epoch": 3.3844782890473106, + "grad_norm": 0.07315081357955933, + "learning_rate": 3.419595841397003e-05, + "loss": 0.22, + "step": 41778 + }, + { + "epoch": 3.384559300064809, + "grad_norm": 0.07009439915418625, + "learning_rate": 3.41914577613754e-05, + "loss": 0.1932, + "step": 41779 + }, + { + "epoch": 3.384640311082307, + "grad_norm": 0.0699535682797432, + "learning_rate": 3.4186957108780774e-05, + "loss": 0.2874, + "step": 41780 + }, + { + "epoch": 3.3847213220998054, + "grad_norm": 0.08572935312986374, + "learning_rate": 3.418245645618615e-05, + "loss": 0.233, + "step": 41781 + }, + { + "epoch": 3.384802333117304, + "grad_norm": 0.06120715290307999, + "learning_rate": 3.417795580359152e-05, + "loss": 0.1902, + "step": 41782 + }, + { + "epoch": 3.3848833441348023, + "grad_norm": 0.07531646639108658, + "learning_rate": 3.4173455150996895e-05, + "loss": 0.2209, + "step": 41783 + }, + { + "epoch": 3.3849643551523005, + "grad_norm": 0.08339875936508179, + "learning_rate": 3.4168954498402275e-05, + "loss": 0.2598, + "step": 41784 + }, + { + "epoch": 3.3850453661697992, + "grad_norm": 0.0988910049200058, + "learning_rate": 3.416445384580764e-05, + "loss": 0.2628, + "step": 41785 + }, + { + "epoch": 3.3851263771872975, + "grad_norm": 0.08542412519454956, + "learning_rate": 3.4159953193213016e-05, + "loss": 0.2276, + "step": 41786 + }, + { + "epoch": 3.3852073882047957, + "grad_norm": 0.0794190838932991, + "learning_rate": 3.4155452540618396e-05, + "loss": 0.251, + "step": 41787 + }, + { + "epoch": 3.3852883992222944, + "grad_norm": 0.05835704505443573, + "learning_rate": 3.415095188802376e-05, + "loss": 0.1812, + "step": 41788 + }, + { + "epoch": 3.3853694102397927, + "grad_norm": 0.08290042728185654, + "learning_rate": 3.414645123542914e-05, + "loss": 0.2212, + "step": 41789 + }, + { + "epoch": 3.385450421257291, + "grad_norm": 0.07653053849935532, + "learning_rate": 3.414195058283452e-05, + "loss": 0.2371, + "step": 41790 + }, + { + "epoch": 
3.3855314322747896, + "grad_norm": 0.07488921284675598, + "learning_rate": 3.4137449930239884e-05, + "loss": 0.2309, + "step": 41791 + }, + { + "epoch": 3.385612443292288, + "grad_norm": 0.06250148266553879, + "learning_rate": 3.413294927764526e-05, + "loss": 0.1861, + "step": 41792 + }, + { + "epoch": 3.385693454309786, + "grad_norm": 0.07396798580884933, + "learning_rate": 3.412844862505064e-05, + "loss": 0.2872, + "step": 41793 + }, + { + "epoch": 3.3857744653272843, + "grad_norm": 0.09425674378871918, + "learning_rate": 3.4123947972456005e-05, + "loss": 0.2424, + "step": 41794 + }, + { + "epoch": 3.385855476344783, + "grad_norm": 0.07500012218952179, + "learning_rate": 3.411944731986138e-05, + "loss": 0.2288, + "step": 41795 + }, + { + "epoch": 3.3859364873622813, + "grad_norm": 0.060686469078063965, + "learning_rate": 3.411494666726676e-05, + "loss": 0.2192, + "step": 41796 + }, + { + "epoch": 3.3860174983797795, + "grad_norm": 0.06987955421209335, + "learning_rate": 3.411044601467213e-05, + "loss": 0.2451, + "step": 41797 + }, + { + "epoch": 3.386098509397278, + "grad_norm": 0.056687429547309875, + "learning_rate": 3.41059453620775e-05, + "loss": 0.2562, + "step": 41798 + }, + { + "epoch": 3.3861795204147764, + "grad_norm": 0.07379105687141418, + "learning_rate": 3.410144470948288e-05, + "loss": 0.2316, + "step": 41799 + }, + { + "epoch": 3.3862605314322747, + "grad_norm": 0.0667329877614975, + "learning_rate": 3.409694405688825e-05, + "loss": 0.2222, + "step": 41800 + }, + { + "epoch": 3.386341542449773, + "grad_norm": 0.07343947142362595, + "learning_rate": 3.409244340429362e-05, + "loss": 0.2404, + "step": 41801 + }, + { + "epoch": 3.3864225534672716, + "grad_norm": 0.07047673314809799, + "learning_rate": 3.4087942751699e-05, + "loss": 0.1889, + "step": 41802 + }, + { + "epoch": 3.38650356448477, + "grad_norm": 0.07641629129648209, + "learning_rate": 3.4083442099104374e-05, + "loss": 0.2693, + "step": 41803 + }, + { + "epoch": 3.386584575502268, + "grad_norm": 0.0820465087890625, + "learning_rate": 3.407894144650974e-05, + "loss": 0.2291, + "step": 41804 + }, + { + "epoch": 3.386665586519767, + "grad_norm": 0.062225062400102615, + "learning_rate": 3.407444079391512e-05, + "loss": 0.2055, + "step": 41805 + }, + { + "epoch": 3.386746597537265, + "grad_norm": 0.06488920748233795, + "learning_rate": 3.4069940141320495e-05, + "loss": 0.2022, + "step": 41806 + }, + { + "epoch": 3.3868276085547633, + "grad_norm": 0.08183945715427399, + "learning_rate": 3.406543948872586e-05, + "loss": 0.2388, + "step": 41807 + }, + { + "epoch": 3.386908619572262, + "grad_norm": 0.08639533072710037, + "learning_rate": 3.406093883613124e-05, + "loss": 0.2438, + "step": 41808 + }, + { + "epoch": 3.38698963058976, + "grad_norm": 0.07558859139680862, + "learning_rate": 3.4056438183536616e-05, + "loss": 0.2301, + "step": 41809 + }, + { + "epoch": 3.3870706416072585, + "grad_norm": 0.07002075016498566, + "learning_rate": 3.405193753094199e-05, + "loss": 0.2043, + "step": 41810 + }, + { + "epoch": 3.387151652624757, + "grad_norm": 0.06354018300771713, + "learning_rate": 3.404743687834736e-05, + "loss": 0.1763, + "step": 41811 + }, + { + "epoch": 3.3872326636422554, + "grad_norm": 0.0757867842912674, + "learning_rate": 3.4042936225752736e-05, + "loss": 0.2062, + "step": 41812 + }, + { + "epoch": 3.3873136746597536, + "grad_norm": 0.07359560579061508, + "learning_rate": 3.403843557315811e-05, + "loss": 0.2227, + "step": 41813 + }, + { + "epoch": 3.3873946856772523, + "grad_norm": 0.08458617329597473, + 
"learning_rate": 3.4033934920563484e-05, + "loss": 0.3023, + "step": 41814 + }, + { + "epoch": 3.3874756966947506, + "grad_norm": 0.07034522294998169, + "learning_rate": 3.402943426796886e-05, + "loss": 0.2235, + "step": 41815 + }, + { + "epoch": 3.387556707712249, + "grad_norm": 0.07200920581817627, + "learning_rate": 3.402493361537423e-05, + "loss": 0.2339, + "step": 41816 + }, + { + "epoch": 3.387637718729747, + "grad_norm": 0.06833247095346451, + "learning_rate": 3.4020432962779604e-05, + "loss": 0.2089, + "step": 41817 + }, + { + "epoch": 3.3877187297472457, + "grad_norm": 0.08015815168619156, + "learning_rate": 3.401593231018498e-05, + "loss": 0.219, + "step": 41818 + }, + { + "epoch": 3.387799740764744, + "grad_norm": 0.059349171817302704, + "learning_rate": 3.401143165759035e-05, + "loss": 0.2029, + "step": 41819 + }, + { + "epoch": 3.3878807517822422, + "grad_norm": 0.07790016382932663, + "learning_rate": 3.4006931004995725e-05, + "loss": 0.2399, + "step": 41820 + }, + { + "epoch": 3.387961762799741, + "grad_norm": 0.057797957211732864, + "learning_rate": 3.40024303524011e-05, + "loss": 0.2152, + "step": 41821 + }, + { + "epoch": 3.388042773817239, + "grad_norm": 0.07001172006130219, + "learning_rate": 3.399792969980647e-05, + "loss": 0.2168, + "step": 41822 + }, + { + "epoch": 3.3881237848347374, + "grad_norm": 0.06563981622457504, + "learning_rate": 3.399342904721185e-05, + "loss": 0.2015, + "step": 41823 + }, + { + "epoch": 3.3882047958522357, + "grad_norm": 0.0651637613773346, + "learning_rate": 3.398892839461722e-05, + "loss": 0.2154, + "step": 41824 + }, + { + "epoch": 3.3882858068697344, + "grad_norm": 0.08092446625232697, + "learning_rate": 3.398442774202259e-05, + "loss": 0.2017, + "step": 41825 + }, + { + "epoch": 3.3883668178872326, + "grad_norm": 0.07722778618335724, + "learning_rate": 3.3979927089427974e-05, + "loss": 0.212, + "step": 41826 + }, + { + "epoch": 3.388447828904731, + "grad_norm": 0.06523314863443375, + "learning_rate": 3.397542643683334e-05, + "loss": 0.2102, + "step": 41827 + }, + { + "epoch": 3.3885288399222295, + "grad_norm": 0.0650385171175003, + "learning_rate": 3.3970925784238714e-05, + "loss": 0.2051, + "step": 41828 + }, + { + "epoch": 3.3886098509397278, + "grad_norm": 0.06373479217290878, + "learning_rate": 3.3966425131644094e-05, + "loss": 0.2232, + "step": 41829 + }, + { + "epoch": 3.388690861957226, + "grad_norm": 0.071293905377388, + "learning_rate": 3.396192447904946e-05, + "loss": 0.2273, + "step": 41830 + }, + { + "epoch": 3.3887718729747247, + "grad_norm": 0.0766294002532959, + "learning_rate": 3.3957423826454835e-05, + "loss": 0.2549, + "step": 41831 + }, + { + "epoch": 3.388852883992223, + "grad_norm": 0.06483791768550873, + "learning_rate": 3.3952923173860215e-05, + "loss": 0.2059, + "step": 41832 + }, + { + "epoch": 3.388933895009721, + "grad_norm": 0.07027419656515121, + "learning_rate": 3.394842252126558e-05, + "loss": 0.2068, + "step": 41833 + }, + { + "epoch": 3.38901490602722, + "grad_norm": 0.058067139238119125, + "learning_rate": 3.3943921868670956e-05, + "loss": 0.198, + "step": 41834 + }, + { + "epoch": 3.389095917044718, + "grad_norm": 0.06929459422826767, + "learning_rate": 3.3939421216076336e-05, + "loss": 0.2262, + "step": 41835 + }, + { + "epoch": 3.3891769280622164, + "grad_norm": 0.07518819719552994, + "learning_rate": 3.393492056348171e-05, + "loss": 0.2761, + "step": 41836 + }, + { + "epoch": 3.389257939079715, + "grad_norm": 0.0849183201789856, + "learning_rate": 3.3930419910887076e-05, + "loss": 0.2318, + "step": 
41837 + }, + { + "epoch": 3.3893389500972133, + "grad_norm": 0.07857996225357056, + "learning_rate": 3.392591925829246e-05, + "loss": 0.211, + "step": 41838 + }, + { + "epoch": 3.3894199611147116, + "grad_norm": 0.06797852367162704, + "learning_rate": 3.392141860569783e-05, + "loss": 0.2219, + "step": 41839 + }, + { + "epoch": 3.38950097213221, + "grad_norm": 0.05962246283888817, + "learning_rate": 3.39169179531032e-05, + "loss": 0.1953, + "step": 41840 + }, + { + "epoch": 3.3895819831497085, + "grad_norm": 0.07180878520011902, + "learning_rate": 3.391241730050858e-05, + "loss": 0.2332, + "step": 41841 + }, + { + "epoch": 3.3896629941672067, + "grad_norm": 0.07170886546373367, + "learning_rate": 3.390791664791395e-05, + "loss": 0.2289, + "step": 41842 + }, + { + "epoch": 3.389744005184705, + "grad_norm": 0.059607286006212234, + "learning_rate": 3.390341599531932e-05, + "loss": 0.2315, + "step": 41843 + }, + { + "epoch": 3.3898250162022037, + "grad_norm": 0.08054961264133453, + "learning_rate": 3.38989153427247e-05, + "loss": 0.2367, + "step": 41844 + }, + { + "epoch": 3.389906027219702, + "grad_norm": 0.05538221821188927, + "learning_rate": 3.389441469013007e-05, + "loss": 0.1927, + "step": 41845 + }, + { + "epoch": 3.3899870382372, + "grad_norm": 0.0739208310842514, + "learning_rate": 3.388991403753544e-05, + "loss": 0.2381, + "step": 41846 + }, + { + "epoch": 3.3900680492546984, + "grad_norm": 0.070955790579319, + "learning_rate": 3.388541338494082e-05, + "loss": 0.2299, + "step": 41847 + }, + { + "epoch": 3.390149060272197, + "grad_norm": 0.09719790518283844, + "learning_rate": 3.388091273234619e-05, + "loss": 0.2684, + "step": 41848 + }, + { + "epoch": 3.3902300712896953, + "grad_norm": 0.08513044565916061, + "learning_rate": 3.3876412079751566e-05, + "loss": 0.2464, + "step": 41849 + }, + { + "epoch": 3.3903110823071936, + "grad_norm": 0.07656481117010117, + "learning_rate": 3.387191142715694e-05, + "loss": 0.2528, + "step": 41850 + }, + { + "epoch": 3.3903920933246923, + "grad_norm": 0.07512432336807251, + "learning_rate": 3.3867410774562314e-05, + "loss": 0.2255, + "step": 41851 + }, + { + "epoch": 3.3904731043421905, + "grad_norm": 0.07700356096029282, + "learning_rate": 3.386291012196769e-05, + "loss": 0.2007, + "step": 41852 + }, + { + "epoch": 3.3905541153596888, + "grad_norm": 0.07253925502300262, + "learning_rate": 3.385840946937306e-05, + "loss": 0.239, + "step": 41853 + }, + { + "epoch": 3.3906351263771874, + "grad_norm": 0.06891423463821411, + "learning_rate": 3.3853908816778434e-05, + "loss": 0.2072, + "step": 41854 + }, + { + "epoch": 3.3907161373946857, + "grad_norm": 0.0744556412100792, + "learning_rate": 3.384940816418381e-05, + "loss": 0.2432, + "step": 41855 + }, + { + "epoch": 3.390797148412184, + "grad_norm": 0.07903169840574265, + "learning_rate": 3.384490751158918e-05, + "loss": 0.2118, + "step": 41856 + }, + { + "epoch": 3.3908781594296826, + "grad_norm": 0.0659727156162262, + "learning_rate": 3.3840406858994555e-05, + "loss": 0.2289, + "step": 41857 + }, + { + "epoch": 3.390959170447181, + "grad_norm": 0.06620805710554123, + "learning_rate": 3.383590620639993e-05, + "loss": 0.2178, + "step": 41858 + }, + { + "epoch": 3.391040181464679, + "grad_norm": 0.0583527609705925, + "learning_rate": 3.38314055538053e-05, + "loss": 0.1915, + "step": 41859 + }, + { + "epoch": 3.391121192482178, + "grad_norm": 0.06404381990432739, + "learning_rate": 3.3826904901210676e-05, + "loss": 0.2422, + "step": 41860 + }, + { + "epoch": 3.391202203499676, + "grad_norm": 
0.06776855140924454, + "learning_rate": 3.382240424861605e-05, + "loss": 0.2118, + "step": 41861 + }, + { + "epoch": 3.3912832145171743, + "grad_norm": 0.07318595796823502, + "learning_rate": 3.381790359602143e-05, + "loss": 0.258, + "step": 41862 + }, + { + "epoch": 3.3913642255346725, + "grad_norm": 0.06983473151922226, + "learning_rate": 3.38134029434268e-05, + "loss": 0.2543, + "step": 41863 + }, + { + "epoch": 3.3914452365521712, + "grad_norm": 0.0680810734629631, + "learning_rate": 3.380890229083217e-05, + "loss": 0.2119, + "step": 41864 + }, + { + "epoch": 3.3915262475696695, + "grad_norm": 0.08268336206674576, + "learning_rate": 3.380440163823755e-05, + "loss": 0.2675, + "step": 41865 + }, + { + "epoch": 3.3916072585871677, + "grad_norm": 0.08607517182826996, + "learning_rate": 3.379990098564292e-05, + "loss": 0.2608, + "step": 41866 + }, + { + "epoch": 3.3916882696046664, + "grad_norm": 0.07086408883333206, + "learning_rate": 3.379540033304829e-05, + "loss": 0.202, + "step": 41867 + }, + { + "epoch": 3.3917692806221647, + "grad_norm": 0.08508818596601486, + "learning_rate": 3.379089968045367e-05, + "loss": 0.2167, + "step": 41868 + }, + { + "epoch": 3.391850291639663, + "grad_norm": 0.07000960409641266, + "learning_rate": 3.378639902785904e-05, + "loss": 0.2311, + "step": 41869 + }, + { + "epoch": 3.391931302657161, + "grad_norm": 0.06851160526275635, + "learning_rate": 3.378189837526441e-05, + "loss": 0.2337, + "step": 41870 + }, + { + "epoch": 3.39201231367466, + "grad_norm": 0.07425341755151749, + "learning_rate": 3.377739772266979e-05, + "loss": 0.2318, + "step": 41871 + }, + { + "epoch": 3.392093324692158, + "grad_norm": 0.08614206314086914, + "learning_rate": 3.377289707007516e-05, + "loss": 0.2477, + "step": 41872 + }, + { + "epoch": 3.3921743357096563, + "grad_norm": 0.06568558514118195, + "learning_rate": 3.376839641748053e-05, + "loss": 0.2143, + "step": 41873 + }, + { + "epoch": 3.392255346727155, + "grad_norm": 0.07687494158744812, + "learning_rate": 3.376389576488591e-05, + "loss": 0.2093, + "step": 41874 + }, + { + "epoch": 3.3923363577446533, + "grad_norm": 0.08374509960412979, + "learning_rate": 3.375939511229129e-05, + "loss": 0.2257, + "step": 41875 + }, + { + "epoch": 3.3924173687621515, + "grad_norm": 0.07319393008947372, + "learning_rate": 3.3754894459696654e-05, + "loss": 0.2314, + "step": 41876 + }, + { + "epoch": 3.39249837977965, + "grad_norm": 0.07026918232440948, + "learning_rate": 3.3750393807102034e-05, + "loss": 0.2239, + "step": 41877 + }, + { + "epoch": 3.3925793907971484, + "grad_norm": 0.07389682531356812, + "learning_rate": 3.374589315450741e-05, + "loss": 0.2606, + "step": 41878 + }, + { + "epoch": 3.3926604018146467, + "grad_norm": 0.07238435745239258, + "learning_rate": 3.3741392501912775e-05, + "loss": 0.2271, + "step": 41879 + }, + { + "epoch": 3.3927414128321454, + "grad_norm": 0.09210827946662903, + "learning_rate": 3.3736891849318155e-05, + "loss": 0.2411, + "step": 41880 + }, + { + "epoch": 3.3928224238496436, + "grad_norm": 0.06820737570524216, + "learning_rate": 3.373239119672353e-05, + "loss": 0.2363, + "step": 41881 + }, + { + "epoch": 3.392903434867142, + "grad_norm": 0.07121577113866806, + "learning_rate": 3.3727890544128895e-05, + "loss": 0.2266, + "step": 41882 + }, + { + "epoch": 3.3929844458846405, + "grad_norm": 0.07507503777742386, + "learning_rate": 3.3723389891534276e-05, + "loss": 0.2798, + "step": 41883 + }, + { + "epoch": 3.393065456902139, + "grad_norm": 0.07295151799917221, + "learning_rate": 3.371888923893965e-05, + 
"loss": 0.2719, + "step": 41884 + }, + { + "epoch": 3.393146467919637, + "grad_norm": 0.06200207769870758, + "learning_rate": 3.3714388586345016e-05, + "loss": 0.2045, + "step": 41885 + }, + { + "epoch": 3.3932274789371353, + "grad_norm": 0.07828261703252792, + "learning_rate": 3.3709887933750397e-05, + "loss": 0.2602, + "step": 41886 + }, + { + "epoch": 3.393308489954634, + "grad_norm": 0.07141652703285217, + "learning_rate": 3.370538728115577e-05, + "loss": 0.2113, + "step": 41887 + }, + { + "epoch": 3.393389500972132, + "grad_norm": 0.08014405518770218, + "learning_rate": 3.370088662856114e-05, + "loss": 0.23, + "step": 41888 + }, + { + "epoch": 3.3934705119896305, + "grad_norm": 0.06970424205064774, + "learning_rate": 3.369638597596652e-05, + "loss": 0.2211, + "step": 41889 + }, + { + "epoch": 3.393551523007129, + "grad_norm": 0.07382189482450485, + "learning_rate": 3.369188532337189e-05, + "loss": 0.244, + "step": 41890 + }, + { + "epoch": 3.3936325340246274, + "grad_norm": 0.11215295642614365, + "learning_rate": 3.3687384670777265e-05, + "loss": 0.2367, + "step": 41891 + }, + { + "epoch": 3.3937135450421256, + "grad_norm": 0.06525477766990662, + "learning_rate": 3.368288401818264e-05, + "loss": 0.2217, + "step": 41892 + }, + { + "epoch": 3.393794556059624, + "grad_norm": 0.07368913292884827, + "learning_rate": 3.367838336558801e-05, + "loss": 0.2218, + "step": 41893 + }, + { + "epoch": 3.3938755670771226, + "grad_norm": 0.06014920398592949, + "learning_rate": 3.3673882712993385e-05, + "loss": 0.2072, + "step": 41894 + }, + { + "epoch": 3.393956578094621, + "grad_norm": 0.06717801094055176, + "learning_rate": 3.366938206039876e-05, + "loss": 0.2289, + "step": 41895 + }, + { + "epoch": 3.394037589112119, + "grad_norm": 0.0629340410232544, + "learning_rate": 3.366488140780413e-05, + "loss": 0.2115, + "step": 41896 + }, + { + "epoch": 3.3941186001296177, + "grad_norm": 0.07047463953495026, + "learning_rate": 3.3660380755209506e-05, + "loss": 0.2017, + "step": 41897 + }, + { + "epoch": 3.394199611147116, + "grad_norm": 0.07417044043540955, + "learning_rate": 3.365588010261488e-05, + "loss": 0.2439, + "step": 41898 + }, + { + "epoch": 3.3942806221646142, + "grad_norm": 0.0626869797706604, + "learning_rate": 3.3651379450020253e-05, + "loss": 0.2207, + "step": 41899 + }, + { + "epoch": 3.394361633182113, + "grad_norm": 0.08169003576040268, + "learning_rate": 3.364687879742563e-05, + "loss": 0.2167, + "step": 41900 + }, + { + "epoch": 3.394442644199611, + "grad_norm": 0.07657211273908615, + "learning_rate": 3.3642378144831e-05, + "loss": 0.2304, + "step": 41901 + }, + { + "epoch": 3.3945236552171094, + "grad_norm": 0.07610435783863068, + "learning_rate": 3.3637877492236374e-05, + "loss": 0.2185, + "step": 41902 + }, + { + "epoch": 3.394604666234608, + "grad_norm": 0.07222923636436462, + "learning_rate": 3.363337683964175e-05, + "loss": 0.2256, + "step": 41903 + }, + { + "epoch": 3.3946856772521063, + "grad_norm": 0.061299409717321396, + "learning_rate": 3.362887618704713e-05, + "loss": 0.203, + "step": 41904 + }, + { + "epoch": 3.3947666882696046, + "grad_norm": 0.053315408527851105, + "learning_rate": 3.3624375534452495e-05, + "loss": 0.1838, + "step": 41905 + }, + { + "epoch": 3.3948476992871033, + "grad_norm": 0.06681319326162338, + "learning_rate": 3.361987488185787e-05, + "loss": 0.2455, + "step": 41906 + }, + { + "epoch": 3.3949287103046015, + "grad_norm": 0.05498621612787247, + "learning_rate": 3.361537422926325e-05, + "loss": 0.1759, + "step": 41907 + }, + { + "epoch": 
3.3950097213220998, + "grad_norm": 0.06275831907987595, + "learning_rate": 3.3610873576668616e-05, + "loss": 0.2654, + "step": 41908 + }, + { + "epoch": 3.395090732339598, + "grad_norm": 0.07026954740285873, + "learning_rate": 3.360637292407399e-05, + "loss": 0.2193, + "step": 41909 + }, + { + "epoch": 3.3951717433570967, + "grad_norm": 0.0646926537156105, + "learning_rate": 3.360187227147937e-05, + "loss": 0.22, + "step": 41910 + }, + { + "epoch": 3.395252754374595, + "grad_norm": 0.0680764839053154, + "learning_rate": 3.359737161888474e-05, + "loss": 0.2178, + "step": 41911 + }, + { + "epoch": 3.395333765392093, + "grad_norm": 0.06526144593954086, + "learning_rate": 3.359287096629011e-05, + "loss": 0.2329, + "step": 41912 + }, + { + "epoch": 3.395414776409592, + "grad_norm": 0.07439577579498291, + "learning_rate": 3.358837031369549e-05, + "loss": 0.2322, + "step": 41913 + }, + { + "epoch": 3.39549578742709, + "grad_norm": 0.05572812259197235, + "learning_rate": 3.358386966110086e-05, + "loss": 0.2094, + "step": 41914 + }, + { + "epoch": 3.3955767984445884, + "grad_norm": 0.07370764017105103, + "learning_rate": 3.357936900850623e-05, + "loss": 0.2234, + "step": 41915 + }, + { + "epoch": 3.3956578094620866, + "grad_norm": 0.07162272930145264, + "learning_rate": 3.357486835591161e-05, + "loss": 0.2457, + "step": 41916 + }, + { + "epoch": 3.3957388204795853, + "grad_norm": 0.06438523530960083, + "learning_rate": 3.3570367703316985e-05, + "loss": 0.2496, + "step": 41917 + }, + { + "epoch": 3.3958198314970836, + "grad_norm": 0.06554609537124634, + "learning_rate": 3.356586705072235e-05, + "loss": 0.1977, + "step": 41918 + }, + { + "epoch": 3.395900842514582, + "grad_norm": 0.06860709190368652, + "learning_rate": 3.356136639812773e-05, + "loss": 0.2366, + "step": 41919 + }, + { + "epoch": 3.3959818535320805, + "grad_norm": 0.059882767498493195, + "learning_rate": 3.3556865745533106e-05, + "loss": 0.2326, + "step": 41920 + }, + { + "epoch": 3.3960628645495787, + "grad_norm": 0.06205613911151886, + "learning_rate": 3.355236509293847e-05, + "loss": 0.2359, + "step": 41921 + }, + { + "epoch": 3.396143875567077, + "grad_norm": 0.08020277321338654, + "learning_rate": 3.354786444034385e-05, + "loss": 0.2034, + "step": 41922 + }, + { + "epoch": 3.3962248865845757, + "grad_norm": 0.07167024910449982, + "learning_rate": 3.354336378774923e-05, + "loss": 0.2411, + "step": 41923 + }, + { + "epoch": 3.396305897602074, + "grad_norm": 0.06740758568048477, + "learning_rate": 3.3538863135154593e-05, + "loss": 0.2538, + "step": 41924 + }, + { + "epoch": 3.396386908619572, + "grad_norm": 0.07286737859249115, + "learning_rate": 3.3534362482559974e-05, + "loss": 0.2211, + "step": 41925 + }, + { + "epoch": 3.396467919637071, + "grad_norm": 0.09053151309490204, + "learning_rate": 3.352986182996535e-05, + "loss": 0.2463, + "step": 41926 + }, + { + "epoch": 3.396548930654569, + "grad_norm": 0.07128996402025223, + "learning_rate": 3.3525361177370714e-05, + "loss": 0.2289, + "step": 41927 + }, + { + "epoch": 3.3966299416720673, + "grad_norm": 0.07041050493717194, + "learning_rate": 3.3520860524776095e-05, + "loss": 0.2312, + "step": 41928 + }, + { + "epoch": 3.396710952689566, + "grad_norm": 0.07474519312381744, + "learning_rate": 3.351635987218147e-05, + "loss": 0.256, + "step": 41929 + }, + { + "epoch": 3.3967919637070643, + "grad_norm": 0.07732457667589188, + "learning_rate": 3.351185921958684e-05, + "loss": 0.2071, + "step": 41930 + }, + { + "epoch": 3.3968729747245625, + "grad_norm": 0.06436406075954437, + 
"learning_rate": 3.3507358566992215e-05, + "loss": 0.2461, + "step": 41931 + }, + { + "epoch": 3.3969539857420608, + "grad_norm": 0.07016737014055252, + "learning_rate": 3.350285791439759e-05, + "loss": 0.2261, + "step": 41932 + }, + { + "epoch": 3.3970349967595594, + "grad_norm": 0.06618059426546097, + "learning_rate": 3.349835726180296e-05, + "loss": 0.1887, + "step": 41933 + }, + { + "epoch": 3.3971160077770577, + "grad_norm": 0.07330776005983353, + "learning_rate": 3.3493856609208336e-05, + "loss": 0.2334, + "step": 41934 + }, + { + "epoch": 3.397197018794556, + "grad_norm": 0.08016584813594818, + "learning_rate": 3.348935595661371e-05, + "loss": 0.1919, + "step": 41935 + }, + { + "epoch": 3.3972780298120546, + "grad_norm": 0.07798706740140915, + "learning_rate": 3.3484855304019084e-05, + "loss": 0.1995, + "step": 41936 + }, + { + "epoch": 3.397359040829553, + "grad_norm": 0.06129057705402374, + "learning_rate": 3.348035465142446e-05, + "loss": 0.1945, + "step": 41937 + }, + { + "epoch": 3.397440051847051, + "grad_norm": 0.08152814209461212, + "learning_rate": 3.347585399882983e-05, + "loss": 0.2431, + "step": 41938 + }, + { + "epoch": 3.3975210628645494, + "grad_norm": 0.06842713803052902, + "learning_rate": 3.3471353346235204e-05, + "loss": 0.2326, + "step": 41939 + }, + { + "epoch": 3.397602073882048, + "grad_norm": 0.06839247792959213, + "learning_rate": 3.346685269364058e-05, + "loss": 0.2144, + "step": 41940 + }, + { + "epoch": 3.3976830848995463, + "grad_norm": 0.06789159774780273, + "learning_rate": 3.346235204104595e-05, + "loss": 0.2215, + "step": 41941 + }, + { + "epoch": 3.3977640959170445, + "grad_norm": 0.07566022872924805, + "learning_rate": 3.3457851388451325e-05, + "loss": 0.2042, + "step": 41942 + }, + { + "epoch": 3.3978451069345432, + "grad_norm": 0.08345601707696915, + "learning_rate": 3.3453350735856706e-05, + "loss": 0.2257, + "step": 41943 + }, + { + "epoch": 3.3979261179520415, + "grad_norm": 0.08608893305063248, + "learning_rate": 3.344885008326207e-05, + "loss": 0.2301, + "step": 41944 + }, + { + "epoch": 3.3980071289695397, + "grad_norm": 0.066611148416996, + "learning_rate": 3.3444349430667446e-05, + "loss": 0.2341, + "step": 41945 + }, + { + "epoch": 3.3980881399870384, + "grad_norm": 0.07658085972070694, + "learning_rate": 3.3439848778072826e-05, + "loss": 0.2327, + "step": 41946 + }, + { + "epoch": 3.3981691510045366, + "grad_norm": 0.08586086332798004, + "learning_rate": 3.343534812547819e-05, + "loss": 0.2459, + "step": 41947 + }, + { + "epoch": 3.398250162022035, + "grad_norm": 0.06390102207660675, + "learning_rate": 3.343084747288357e-05, + "loss": 0.2175, + "step": 41948 + }, + { + "epoch": 3.3983311730395336, + "grad_norm": 0.061589255928993225, + "learning_rate": 3.342634682028895e-05, + "loss": 0.204, + "step": 41949 + }, + { + "epoch": 3.398412184057032, + "grad_norm": 0.069948211312294, + "learning_rate": 3.3421846167694314e-05, + "loss": 0.2128, + "step": 41950 + }, + { + "epoch": 3.39849319507453, + "grad_norm": 0.08519947528839111, + "learning_rate": 3.341734551509969e-05, + "loss": 0.2675, + "step": 41951 + }, + { + "epoch": 3.3985742060920288, + "grad_norm": 0.057270586490631104, + "learning_rate": 3.341284486250507e-05, + "loss": 0.2281, + "step": 41952 + }, + { + "epoch": 3.398655217109527, + "grad_norm": 0.0729655846953392, + "learning_rate": 3.3408344209910435e-05, + "loss": 0.2319, + "step": 41953 + }, + { + "epoch": 3.3987362281270252, + "grad_norm": 0.0660521537065506, + "learning_rate": 3.340384355731581e-05, + "loss": 0.229, + 
"step": 41954 + }, + { + "epoch": 3.3988172391445235, + "grad_norm": 0.0661383792757988, + "learning_rate": 3.339934290472119e-05, + "loss": 0.2346, + "step": 41955 + }, + { + "epoch": 3.398898250162022, + "grad_norm": 0.08104848861694336, + "learning_rate": 3.339484225212656e-05, + "loss": 0.2119, + "step": 41956 + }, + { + "epoch": 3.3989792611795204, + "grad_norm": 0.07406584918498993, + "learning_rate": 3.339034159953193e-05, + "loss": 0.2148, + "step": 41957 + }, + { + "epoch": 3.3990602721970187, + "grad_norm": 0.07290060818195343, + "learning_rate": 3.338584094693731e-05, + "loss": 0.2254, + "step": 41958 + }, + { + "epoch": 3.3991412832145174, + "grad_norm": 0.07462384551763535, + "learning_rate": 3.338134029434268e-05, + "loss": 0.2184, + "step": 41959 + }, + { + "epoch": 3.3992222942320156, + "grad_norm": 0.08314083516597748, + "learning_rate": 3.337683964174805e-05, + "loss": 0.2084, + "step": 41960 + }, + { + "epoch": 3.399303305249514, + "grad_norm": 0.08253400027751923, + "learning_rate": 3.337233898915343e-05, + "loss": 0.2203, + "step": 41961 + }, + { + "epoch": 3.399384316267012, + "grad_norm": 0.07560184597969055, + "learning_rate": 3.3367838336558804e-05, + "loss": 0.2147, + "step": 41962 + }, + { + "epoch": 3.399465327284511, + "grad_norm": 0.07779417186975479, + "learning_rate": 3.336333768396418e-05, + "loss": 0.2544, + "step": 41963 + }, + { + "epoch": 3.399546338302009, + "grad_norm": 0.06740397214889526, + "learning_rate": 3.335883703136955e-05, + "loss": 0.2268, + "step": 41964 + }, + { + "epoch": 3.3996273493195073, + "grad_norm": 0.07044196128845215, + "learning_rate": 3.3354336378774925e-05, + "loss": 0.2096, + "step": 41965 + }, + { + "epoch": 3.399708360337006, + "grad_norm": 0.0865856185555458, + "learning_rate": 3.33498357261803e-05, + "loss": 0.21, + "step": 41966 + }, + { + "epoch": 3.399789371354504, + "grad_norm": 0.05665763095021248, + "learning_rate": 3.334533507358567e-05, + "loss": 0.2054, + "step": 41967 + }, + { + "epoch": 3.3998703823720025, + "grad_norm": 0.07386376708745956, + "learning_rate": 3.3340834420991046e-05, + "loss": 0.2552, + "step": 41968 + }, + { + "epoch": 3.399951393389501, + "grad_norm": 0.06082529574632645, + "learning_rate": 3.333633376839642e-05, + "loss": 0.2202, + "step": 41969 + }, + { + "epoch": 3.4000324044069994, + "grad_norm": 0.07679502665996552, + "learning_rate": 3.333183311580179e-05, + "loss": 0.2321, + "step": 41970 + }, + { + "epoch": 3.4001134154244976, + "grad_norm": 0.07322760671377182, + "learning_rate": 3.3327332463207166e-05, + "loss": 0.2666, + "step": 41971 + }, + { + "epoch": 3.4001944264419963, + "grad_norm": 0.06411641091108322, + "learning_rate": 3.332283181061254e-05, + "loss": 0.2117, + "step": 41972 + }, + { + "epoch": 3.4002754374594946, + "grad_norm": 0.07164231687784195, + "learning_rate": 3.3318331158017914e-05, + "loss": 0.2492, + "step": 41973 + }, + { + "epoch": 3.400356448476993, + "grad_norm": 0.06656774133443832, + "learning_rate": 3.331383050542329e-05, + "loss": 0.2498, + "step": 41974 + }, + { + "epoch": 3.4004374594944915, + "grad_norm": 0.07032846659421921, + "learning_rate": 3.330932985282866e-05, + "loss": 0.2298, + "step": 41975 + }, + { + "epoch": 3.4005184705119897, + "grad_norm": 0.07302508503198624, + "learning_rate": 3.3304829200234034e-05, + "loss": 0.2349, + "step": 41976 + }, + { + "epoch": 3.400599481529488, + "grad_norm": 0.06317179650068283, + "learning_rate": 3.330032854763941e-05, + "loss": 0.2036, + "step": 41977 + }, + { + "epoch": 3.4006804925469862, + 
"grad_norm": 0.08565467596054077, + "learning_rate": 3.329582789504478e-05, + "loss": 0.2776, + "step": 41978 + }, + { + "epoch": 3.400761503564485, + "grad_norm": 0.06535403430461884, + "learning_rate": 3.3291327242450155e-05, + "loss": 0.1986, + "step": 41979 + }, + { + "epoch": 3.400842514581983, + "grad_norm": 0.0668690949678421, + "learning_rate": 3.328682658985553e-05, + "loss": 0.1971, + "step": 41980 + }, + { + "epoch": 3.4009235255994814, + "grad_norm": 0.07167673110961914, + "learning_rate": 3.32823259372609e-05, + "loss": 0.2251, + "step": 41981 + }, + { + "epoch": 3.40100453661698, + "grad_norm": 0.07527213543653488, + "learning_rate": 3.327782528466628e-05, + "loss": 0.2217, + "step": 41982 + }, + { + "epoch": 3.4010855476344783, + "grad_norm": 0.06892262399196625, + "learning_rate": 3.327332463207165e-05, + "loss": 0.2242, + "step": 41983 + }, + { + "epoch": 3.4011665586519766, + "grad_norm": 0.0705183818936348, + "learning_rate": 3.326882397947702e-05, + "loss": 0.2061, + "step": 41984 + }, + { + "epoch": 3.401247569669475, + "grad_norm": 0.0694856271147728, + "learning_rate": 3.3264323326882404e-05, + "loss": 0.2324, + "step": 41985 + }, + { + "epoch": 3.4013285806869735, + "grad_norm": 0.06926519423723221, + "learning_rate": 3.325982267428777e-05, + "loss": 0.2091, + "step": 41986 + }, + { + "epoch": 3.4014095917044718, + "grad_norm": 0.0646764412522316, + "learning_rate": 3.3255322021693144e-05, + "loss": 0.2188, + "step": 41987 + }, + { + "epoch": 3.40149060272197, + "grad_norm": 0.09222811460494995, + "learning_rate": 3.3250821369098524e-05, + "loss": 0.2191, + "step": 41988 + }, + { + "epoch": 3.4015716137394687, + "grad_norm": 0.06459737569093704, + "learning_rate": 3.324632071650389e-05, + "loss": 0.2464, + "step": 41989 + }, + { + "epoch": 3.401652624756967, + "grad_norm": 0.07572778314352036, + "learning_rate": 3.3241820063909265e-05, + "loss": 0.2692, + "step": 41990 + }, + { + "epoch": 3.401733635774465, + "grad_norm": 0.0832890048623085, + "learning_rate": 3.3237319411314645e-05, + "loss": 0.2134, + "step": 41991 + }, + { + "epoch": 3.401814646791964, + "grad_norm": 0.07352113723754883, + "learning_rate": 3.323281875872001e-05, + "loss": 0.2078, + "step": 41992 + }, + { + "epoch": 3.401895657809462, + "grad_norm": 0.08216522634029388, + "learning_rate": 3.3228318106125386e-05, + "loss": 0.2582, + "step": 41993 + }, + { + "epoch": 3.4019766688269604, + "grad_norm": 0.07835206389427185, + "learning_rate": 3.3223817453530766e-05, + "loss": 0.2371, + "step": 41994 + }, + { + "epoch": 3.402057679844459, + "grad_norm": 0.06377358734607697, + "learning_rate": 3.321931680093614e-05, + "loss": 0.2443, + "step": 41995 + }, + { + "epoch": 3.4021386908619573, + "grad_norm": 0.0716206505894661, + "learning_rate": 3.321481614834151e-05, + "loss": 0.2213, + "step": 41996 + }, + { + "epoch": 3.4022197018794555, + "grad_norm": 0.0773889496922493, + "learning_rate": 3.321031549574689e-05, + "loss": 0.2053, + "step": 41997 + }, + { + "epoch": 3.402300712896954, + "grad_norm": 0.05962640419602394, + "learning_rate": 3.320581484315226e-05, + "loss": 0.2136, + "step": 41998 + }, + { + "epoch": 3.4023817239144525, + "grad_norm": 0.07761330157518387, + "learning_rate": 3.3201314190557634e-05, + "loss": 0.2927, + "step": 41999 + }, + { + "epoch": 3.4024627349319507, + "grad_norm": 0.06332354247570038, + "learning_rate": 3.319681353796301e-05, + "loss": 0.2187, + "step": 42000 + }, + { + "epoch": 3.402543745949449, + "grad_norm": 0.06515365839004517, + "learning_rate": 
3.319231288536838e-05, + "loss": 0.2044, + "step": 42001 + }, + { + "epoch": 3.4026247569669477, + "grad_norm": 0.07421457767486572, + "learning_rate": 3.3187812232773755e-05, + "loss": 0.2322, + "step": 42002 + }, + { + "epoch": 3.402705767984446, + "grad_norm": 0.06750653684139252, + "learning_rate": 3.318331158017913e-05, + "loss": 0.2325, + "step": 42003 + }, + { + "epoch": 3.402786779001944, + "grad_norm": 0.0802840143442154, + "learning_rate": 3.31788109275845e-05, + "loss": 0.2274, + "step": 42004 + }, + { + "epoch": 3.4028677900194424, + "grad_norm": 0.054571714252233505, + "learning_rate": 3.3174310274989876e-05, + "loss": 0.2, + "step": 42005 + }, + { + "epoch": 3.402948801036941, + "grad_norm": 0.05173730105161667, + "learning_rate": 3.316980962239525e-05, + "loss": 0.2173, + "step": 42006 + }, + { + "epoch": 3.4030298120544393, + "grad_norm": 0.07291796058416367, + "learning_rate": 3.316530896980062e-05, + "loss": 0.2111, + "step": 42007 + }, + { + "epoch": 3.4031108230719376, + "grad_norm": 0.07727206498384476, + "learning_rate": 3.3160808317205997e-05, + "loss": 0.218, + "step": 42008 + }, + { + "epoch": 3.4031918340894363, + "grad_norm": 0.07800658047199249, + "learning_rate": 3.315630766461137e-05, + "loss": 0.2317, + "step": 42009 + }, + { + "epoch": 3.4032728451069345, + "grad_norm": 0.07881782948970795, + "learning_rate": 3.3151807012016744e-05, + "loss": 0.2173, + "step": 42010 + }, + { + "epoch": 3.4033538561244328, + "grad_norm": 0.06382521986961365, + "learning_rate": 3.314730635942212e-05, + "loss": 0.2202, + "step": 42011 + }, + { + "epoch": 3.4034348671419314, + "grad_norm": 0.05812549963593483, + "learning_rate": 3.314280570682749e-05, + "loss": 0.2114, + "step": 42012 + }, + { + "epoch": 3.4035158781594297, + "grad_norm": 0.08771507441997528, + "learning_rate": 3.3138305054232865e-05, + "loss": 0.2637, + "step": 42013 + }, + { + "epoch": 3.403596889176928, + "grad_norm": 0.07088574022054672, + "learning_rate": 3.313380440163824e-05, + "loss": 0.2169, + "step": 42014 + }, + { + "epoch": 3.4036779001944266, + "grad_norm": 0.07294151932001114, + "learning_rate": 3.312930374904361e-05, + "loss": 0.2226, + "step": 42015 + }, + { + "epoch": 3.403758911211925, + "grad_norm": 0.06775092333555222, + "learning_rate": 3.3124803096448985e-05, + "loss": 0.2253, + "step": 42016 + }, + { + "epoch": 3.403839922229423, + "grad_norm": 0.07559601962566376, + "learning_rate": 3.312030244385436e-05, + "loss": 0.2339, + "step": 42017 + }, + { + "epoch": 3.403920933246922, + "grad_norm": 0.0761316642165184, + "learning_rate": 3.311580179125973e-05, + "loss": 0.213, + "step": 42018 + }, + { + "epoch": 3.40400194426442, + "grad_norm": 0.0763896033167839, + "learning_rate": 3.3111301138665106e-05, + "loss": 0.191, + "step": 42019 + }, + { + "epoch": 3.4040829552819183, + "grad_norm": 0.05668618530035019, + "learning_rate": 3.310680048607048e-05, + "loss": 0.202, + "step": 42020 + }, + { + "epoch": 3.4041639662994165, + "grad_norm": 0.07589863985776901, + "learning_rate": 3.310229983347586e-05, + "loss": 0.2269, + "step": 42021 + }, + { + "epoch": 3.404244977316915, + "grad_norm": 0.06405602395534515, + "learning_rate": 3.309779918088123e-05, + "loss": 0.1936, + "step": 42022 + }, + { + "epoch": 3.4043259883344135, + "grad_norm": 0.06749610602855682, + "learning_rate": 3.30932985282866e-05, + "loss": 0.2179, + "step": 42023 + }, + { + "epoch": 3.4044069993519117, + "grad_norm": 0.11305932700634003, + "learning_rate": 3.308879787569198e-05, + "loss": 0.259, + "step": 42024 + }, + { + 
"epoch": 3.4044880103694104, + "grad_norm": 0.07221528142690659, + "learning_rate": 3.308429722309735e-05, + "loss": 0.2161, + "step": 42025 + }, + { + "epoch": 3.4045690213869086, + "grad_norm": 0.08941753208637238, + "learning_rate": 3.307979657050272e-05, + "loss": 0.2061, + "step": 42026 + }, + { + "epoch": 3.404650032404407, + "grad_norm": 0.052454691380262375, + "learning_rate": 3.30752959179081e-05, + "loss": 0.1872, + "step": 42027 + }, + { + "epoch": 3.404731043421905, + "grad_norm": 0.07510961592197418, + "learning_rate": 3.307079526531347e-05, + "loss": 0.2377, + "step": 42028 + }, + { + "epoch": 3.404812054439404, + "grad_norm": 0.07032472640275955, + "learning_rate": 3.306629461271884e-05, + "loss": 0.2375, + "step": 42029 + }, + { + "epoch": 3.404893065456902, + "grad_norm": 0.082360178232193, + "learning_rate": 3.306179396012422e-05, + "loss": 0.2331, + "step": 42030 + }, + { + "epoch": 3.4049740764744003, + "grad_norm": 0.07889021933078766, + "learning_rate": 3.305729330752959e-05, + "loss": 0.2524, + "step": 42031 + }, + { + "epoch": 3.405055087491899, + "grad_norm": 0.0628356859087944, + "learning_rate": 3.305279265493497e-05, + "loss": 0.2124, + "step": 42032 + }, + { + "epoch": 3.4051360985093972, + "grad_norm": 0.07455432415008545, + "learning_rate": 3.304829200234034e-05, + "loss": 0.2268, + "step": 42033 + }, + { + "epoch": 3.4052171095268955, + "grad_norm": 0.06572439521551132, + "learning_rate": 3.304379134974572e-05, + "loss": 0.1964, + "step": 42034 + }, + { + "epoch": 3.405298120544394, + "grad_norm": 0.0840260237455368, + "learning_rate": 3.303929069715109e-05, + "loss": 0.2326, + "step": 42035 + }, + { + "epoch": 3.4053791315618924, + "grad_norm": 0.08687842637300491, + "learning_rate": 3.3034790044556464e-05, + "loss": 0.264, + "step": 42036 + }, + { + "epoch": 3.4054601425793907, + "grad_norm": 0.06921391189098358, + "learning_rate": 3.303028939196184e-05, + "loss": 0.2377, + "step": 42037 + }, + { + "epoch": 3.4055411535968894, + "grad_norm": 0.08886100351810455, + "learning_rate": 3.302578873936721e-05, + "loss": 0.2083, + "step": 42038 + }, + { + "epoch": 3.4056221646143876, + "grad_norm": 0.06395532190799713, + "learning_rate": 3.3021288086772585e-05, + "loss": 0.2289, + "step": 42039 + }, + { + "epoch": 3.405703175631886, + "grad_norm": 0.0698009729385376, + "learning_rate": 3.301678743417796e-05, + "loss": 0.2304, + "step": 42040 + }, + { + "epoch": 3.4057841866493845, + "grad_norm": 0.05952319875359535, + "learning_rate": 3.301228678158333e-05, + "loss": 0.2267, + "step": 42041 + }, + { + "epoch": 3.405865197666883, + "grad_norm": 0.08061492443084717, + "learning_rate": 3.3007786128988706e-05, + "loss": 0.2407, + "step": 42042 + }, + { + "epoch": 3.405946208684381, + "grad_norm": 0.0819898247718811, + "learning_rate": 3.300328547639408e-05, + "loss": 0.2488, + "step": 42043 + }, + { + "epoch": 3.4060272197018793, + "grad_norm": 0.08814553171396255, + "learning_rate": 3.299878482379945e-05, + "loss": 0.2529, + "step": 42044 + }, + { + "epoch": 3.406108230719378, + "grad_norm": 0.056524164974689484, + "learning_rate": 3.2994284171204827e-05, + "loss": 0.2292, + "step": 42045 + }, + { + "epoch": 3.406189241736876, + "grad_norm": 0.07974979281425476, + "learning_rate": 3.29897835186102e-05, + "loss": 0.2689, + "step": 42046 + }, + { + "epoch": 3.4062702527543745, + "grad_norm": 0.073820099234581, + "learning_rate": 3.2985282866015574e-05, + "loss": 0.1867, + "step": 42047 + }, + { + "epoch": 3.406351263771873, + "grad_norm": 0.08438839763402939, + 
"learning_rate": 3.298078221342095e-05, + "loss": 0.2103, + "step": 42048 + }, + { + "epoch": 3.4064322747893714, + "grad_norm": 0.08996589481830597, + "learning_rate": 3.297628156082632e-05, + "loss": 0.2489, + "step": 42049 + }, + { + "epoch": 3.4065132858068696, + "grad_norm": 0.07115452736616135, + "learning_rate": 3.2971780908231695e-05, + "loss": 0.2511, + "step": 42050 + }, + { + "epoch": 3.406594296824368, + "grad_norm": 0.057320065796375275, + "learning_rate": 3.296728025563707e-05, + "loss": 0.2197, + "step": 42051 + }, + { + "epoch": 3.4066753078418666, + "grad_norm": 0.07681119441986084, + "learning_rate": 3.296277960304244e-05, + "loss": 0.2344, + "step": 42052 + }, + { + "epoch": 3.406756318859365, + "grad_norm": 0.07666799426078796, + "learning_rate": 3.2958278950447815e-05, + "loss": 0.227, + "step": 42053 + }, + { + "epoch": 3.406837329876863, + "grad_norm": 0.13270337879657745, + "learning_rate": 3.295377829785319e-05, + "loss": 0.186, + "step": 42054 + }, + { + "epoch": 3.4069183408943617, + "grad_norm": 0.06277471780776978, + "learning_rate": 3.294927764525856e-05, + "loss": 0.2208, + "step": 42055 + }, + { + "epoch": 3.40699935191186, + "grad_norm": 0.07769570499658585, + "learning_rate": 3.2944776992663936e-05, + "loss": 0.221, + "step": 42056 + }, + { + "epoch": 3.4070803629293582, + "grad_norm": 0.08962923288345337, + "learning_rate": 3.294027634006931e-05, + "loss": 0.2116, + "step": 42057 + }, + { + "epoch": 3.407161373946857, + "grad_norm": 0.08468111604452133, + "learning_rate": 3.2935775687474683e-05, + "loss": 0.2549, + "step": 42058 + }, + { + "epoch": 3.407242384964355, + "grad_norm": 0.07267171889543533, + "learning_rate": 3.293127503488006e-05, + "loss": 0.2008, + "step": 42059 + }, + { + "epoch": 3.4073233959818534, + "grad_norm": 0.09492810070514679, + "learning_rate": 3.292677438228543e-05, + "loss": 0.2306, + "step": 42060 + }, + { + "epoch": 3.407404406999352, + "grad_norm": 0.06584914028644562, + "learning_rate": 3.2922273729690804e-05, + "loss": 0.1956, + "step": 42061 + }, + { + "epoch": 3.4074854180168503, + "grad_norm": 0.06724611669778824, + "learning_rate": 3.291777307709618e-05, + "loss": 0.2572, + "step": 42062 + }, + { + "epoch": 3.4075664290343486, + "grad_norm": 0.066348135471344, + "learning_rate": 3.291327242450156e-05, + "loss": 0.2384, + "step": 42063 + }, + { + "epoch": 3.4076474400518473, + "grad_norm": 0.08316512405872345, + "learning_rate": 3.2908771771906925e-05, + "loss": 0.2129, + "step": 42064 + }, + { + "epoch": 3.4077284510693455, + "grad_norm": 0.0932237058877945, + "learning_rate": 3.2904271119312305e-05, + "loss": 0.2358, + "step": 42065 + }, + { + "epoch": 3.4078094620868438, + "grad_norm": 0.07332948595285416, + "learning_rate": 3.289977046671768e-05, + "loss": 0.2339, + "step": 42066 + }, + { + "epoch": 3.407890473104342, + "grad_norm": 0.07958756387233734, + "learning_rate": 3.2895269814123046e-05, + "loss": 0.2409, + "step": 42067 + }, + { + "epoch": 3.4079714841218407, + "grad_norm": 0.08175527304410934, + "learning_rate": 3.2890769161528426e-05, + "loss": 0.2372, + "step": 42068 + }, + { + "epoch": 3.408052495139339, + "grad_norm": 0.08566399663686752, + "learning_rate": 3.28862685089338e-05, + "loss": 0.2355, + "step": 42069 + }, + { + "epoch": 3.408133506156837, + "grad_norm": 0.07212253659963608, + "learning_rate": 3.288176785633917e-05, + "loss": 0.2183, + "step": 42070 + }, + { + "epoch": 3.408214517174336, + "grad_norm": 0.08052658289670944, + "learning_rate": 3.287726720374455e-05, + "loss": 0.215, + "step": 
42071 + }, + { + "epoch": 3.408295528191834, + "grad_norm": 0.07595360279083252, + "learning_rate": 3.287276655114992e-05, + "loss": 0.2222, + "step": 42072 + }, + { + "epoch": 3.4083765392093324, + "grad_norm": 0.08452226221561432, + "learning_rate": 3.286826589855529e-05, + "loss": 0.2687, + "step": 42073 + }, + { + "epoch": 3.4084575502268306, + "grad_norm": 0.06990109384059906, + "learning_rate": 3.286376524596067e-05, + "loss": 0.2341, + "step": 42074 + }, + { + "epoch": 3.4085385612443293, + "grad_norm": 0.0811610296368599, + "learning_rate": 3.285926459336604e-05, + "loss": 0.2131, + "step": 42075 + }, + { + "epoch": 3.4086195722618275, + "grad_norm": 0.0643961951136589, + "learning_rate": 3.2854763940771415e-05, + "loss": 0.2123, + "step": 42076 + }, + { + "epoch": 3.408700583279326, + "grad_norm": 0.08913237601518631, + "learning_rate": 3.285026328817679e-05, + "loss": 0.2205, + "step": 42077 + }, + { + "epoch": 3.4087815942968245, + "grad_norm": 0.07711271941661835, + "learning_rate": 3.284576263558216e-05, + "loss": 0.2441, + "step": 42078 + }, + { + "epoch": 3.4088626053143227, + "grad_norm": 0.0935467854142189, + "learning_rate": 3.2841261982987536e-05, + "loss": 0.2372, + "step": 42079 + }, + { + "epoch": 3.408943616331821, + "grad_norm": 0.06729929149150848, + "learning_rate": 3.283676133039291e-05, + "loss": 0.1943, + "step": 42080 + }, + { + "epoch": 3.4090246273493197, + "grad_norm": 0.07701172679662704, + "learning_rate": 3.283226067779828e-05, + "loss": 0.2222, + "step": 42081 + }, + { + "epoch": 3.409105638366818, + "grad_norm": 0.06777993589639664, + "learning_rate": 3.282776002520366e-05, + "loss": 0.2144, + "step": 42082 + }, + { + "epoch": 3.409186649384316, + "grad_norm": 0.06507458537817001, + "learning_rate": 3.282325937260903e-05, + "loss": 0.22, + "step": 42083 + }, + { + "epoch": 3.409267660401815, + "grad_norm": 0.07021746039390564, + "learning_rate": 3.2818758720014404e-05, + "loss": 0.2211, + "step": 42084 + }, + { + "epoch": 3.409348671419313, + "grad_norm": 0.07517733424901962, + "learning_rate": 3.281425806741978e-05, + "loss": 0.23, + "step": 42085 + }, + { + "epoch": 3.4094296824368113, + "grad_norm": 0.06435801088809967, + "learning_rate": 3.280975741482515e-05, + "loss": 0.2325, + "step": 42086 + }, + { + "epoch": 3.40951069345431, + "grad_norm": 0.09303963929414749, + "learning_rate": 3.2805256762230525e-05, + "loss": 0.2481, + "step": 42087 + }, + { + "epoch": 3.4095917044718083, + "grad_norm": 0.06462705135345459, + "learning_rate": 3.28007561096359e-05, + "loss": 0.2269, + "step": 42088 + }, + { + "epoch": 3.4096727154893065, + "grad_norm": 0.07319168001413345, + "learning_rate": 3.279625545704127e-05, + "loss": 0.2431, + "step": 42089 + }, + { + "epoch": 3.4097537265068047, + "grad_norm": 0.07174596190452576, + "learning_rate": 3.2791754804446646e-05, + "loss": 0.2157, + "step": 42090 + }, + { + "epoch": 3.4098347375243034, + "grad_norm": 0.07358871400356293, + "learning_rate": 3.278725415185202e-05, + "loss": 0.206, + "step": 42091 + }, + { + "epoch": 3.4099157485418017, + "grad_norm": 0.07019001245498657, + "learning_rate": 3.278275349925739e-05, + "loss": 0.1846, + "step": 42092 + }, + { + "epoch": 3.4099967595593, + "grad_norm": 0.08530454337596893, + "learning_rate": 3.2778252846662766e-05, + "loss": 0.2406, + "step": 42093 + }, + { + "epoch": 3.4100777705767986, + "grad_norm": 0.0668715164065361, + "learning_rate": 3.277375219406814e-05, + "loss": 0.2663, + "step": 42094 + }, + { + "epoch": 3.410158781594297, + "grad_norm": 
0.07468143850564957, + "learning_rate": 3.2769251541473514e-05, + "loss": 0.2462, + "step": 42095 + }, + { + "epoch": 3.410239792611795, + "grad_norm": 0.06530489772558212, + "learning_rate": 3.276475088887889e-05, + "loss": 0.2158, + "step": 42096 + }, + { + "epoch": 3.4103208036292934, + "grad_norm": 0.07643679529428482, + "learning_rate": 3.276025023628426e-05, + "loss": 0.2148, + "step": 42097 + }, + { + "epoch": 3.410401814646792, + "grad_norm": 0.08456093817949295, + "learning_rate": 3.275574958368964e-05, + "loss": 0.2449, + "step": 42098 + }, + { + "epoch": 3.4104828256642903, + "grad_norm": 0.07500334829092026, + "learning_rate": 3.275124893109501e-05, + "loss": 0.2554, + "step": 42099 + }, + { + "epoch": 3.4105638366817885, + "grad_norm": 0.05971658602356911, + "learning_rate": 3.274674827850038e-05, + "loss": 0.2061, + "step": 42100 + }, + { + "epoch": 3.410644847699287, + "grad_norm": 0.06988444924354553, + "learning_rate": 3.274224762590576e-05, + "loss": 0.239, + "step": 42101 + }, + { + "epoch": 3.4107258587167855, + "grad_norm": 0.0700775682926178, + "learning_rate": 3.2737746973311136e-05, + "loss": 0.2653, + "step": 42102 + }, + { + "epoch": 3.4108068697342837, + "grad_norm": 0.08620148152112961, + "learning_rate": 3.27332463207165e-05, + "loss": 0.209, + "step": 42103 + }, + { + "epoch": 3.4108878807517824, + "grad_norm": 0.06676038354635239, + "learning_rate": 3.272874566812188e-05, + "loss": 0.2513, + "step": 42104 + }, + { + "epoch": 3.4109688917692806, + "grad_norm": 0.07150786370038986, + "learning_rate": 3.2724245015527256e-05, + "loss": 0.2662, + "step": 42105 + }, + { + "epoch": 3.411049902786779, + "grad_norm": 0.07146339863538742, + "learning_rate": 3.271974436293262e-05, + "loss": 0.2214, + "step": 42106 + }, + { + "epoch": 3.4111309138042776, + "grad_norm": 0.07464755326509476, + "learning_rate": 3.2715243710338004e-05, + "loss": 0.2759, + "step": 42107 + }, + { + "epoch": 3.411211924821776, + "grad_norm": 0.08268515020608902, + "learning_rate": 3.271074305774338e-05, + "loss": 0.2658, + "step": 42108 + }, + { + "epoch": 3.411292935839274, + "grad_norm": 0.05757330358028412, + "learning_rate": 3.2706242405148744e-05, + "loss": 0.2068, + "step": 42109 + }, + { + "epoch": 3.4113739468567728, + "grad_norm": 0.06498200446367264, + "learning_rate": 3.2701741752554124e-05, + "loss": 0.2338, + "step": 42110 + }, + { + "epoch": 3.411454957874271, + "grad_norm": 0.08416248857975006, + "learning_rate": 3.26972410999595e-05, + "loss": 0.232, + "step": 42111 + }, + { + "epoch": 3.4115359688917692, + "grad_norm": 0.07173217833042145, + "learning_rate": 3.2692740447364865e-05, + "loss": 0.2071, + "step": 42112 + }, + { + "epoch": 3.4116169799092675, + "grad_norm": 0.06545745581388474, + "learning_rate": 3.2688239794770245e-05, + "loss": 0.2342, + "step": 42113 + }, + { + "epoch": 3.411697990926766, + "grad_norm": 0.0655227079987526, + "learning_rate": 3.268373914217562e-05, + "loss": 0.2479, + "step": 42114 + }, + { + "epoch": 3.4117790019442644, + "grad_norm": 0.07191194593906403, + "learning_rate": 3.267923848958099e-05, + "loss": 0.2217, + "step": 42115 + }, + { + "epoch": 3.4118600129617627, + "grad_norm": 0.07084406167268753, + "learning_rate": 3.2674737836986366e-05, + "loss": 0.224, + "step": 42116 + }, + { + "epoch": 3.4119410239792614, + "grad_norm": 0.06499455124139786, + "learning_rate": 3.267023718439174e-05, + "loss": 0.2151, + "step": 42117 + }, + { + "epoch": 3.4120220349967596, + "grad_norm": 0.06535441428422928, + "learning_rate": 3.266573653179711e-05, + 
"loss": 0.2235, + "step": 42118 + }, + { + "epoch": 3.412103046014258, + "grad_norm": 0.064212866127491, + "learning_rate": 3.266123587920249e-05, + "loss": 0.2189, + "step": 42119 + }, + { + "epoch": 3.412184057031756, + "grad_norm": 0.06990158557891846, + "learning_rate": 3.265673522660786e-05, + "loss": 0.2222, + "step": 42120 + }, + { + "epoch": 3.412265068049255, + "grad_norm": 0.06559919565916061, + "learning_rate": 3.2652234574013234e-05, + "loss": 0.2046, + "step": 42121 + }, + { + "epoch": 3.412346079066753, + "grad_norm": 0.08060586452484131, + "learning_rate": 3.264773392141861e-05, + "loss": 0.2186, + "step": 42122 + }, + { + "epoch": 3.4124270900842513, + "grad_norm": 0.07627402991056442, + "learning_rate": 3.264323326882398e-05, + "loss": 0.231, + "step": 42123 + }, + { + "epoch": 3.41250810110175, + "grad_norm": 0.07456567883491516, + "learning_rate": 3.2638732616229355e-05, + "loss": 0.2277, + "step": 42124 + }, + { + "epoch": 3.412589112119248, + "grad_norm": 0.0751354917883873, + "learning_rate": 3.263423196363473e-05, + "loss": 0.2283, + "step": 42125 + }, + { + "epoch": 3.4126701231367464, + "grad_norm": 0.08349383622407913, + "learning_rate": 3.26297313110401e-05, + "loss": 0.207, + "step": 42126 + }, + { + "epoch": 3.412751134154245, + "grad_norm": 0.07395781576633453, + "learning_rate": 3.2625230658445476e-05, + "loss": 0.2362, + "step": 42127 + }, + { + "epoch": 3.4128321451717434, + "grad_norm": 0.06975575536489487, + "learning_rate": 3.262073000585085e-05, + "loss": 0.209, + "step": 42128 + }, + { + "epoch": 3.4129131561892416, + "grad_norm": 0.060393065214157104, + "learning_rate": 3.261622935325622e-05, + "loss": 0.2266, + "step": 42129 + }, + { + "epoch": 3.4129941672067403, + "grad_norm": 0.08148559927940369, + "learning_rate": 3.2611728700661596e-05, + "loss": 0.2365, + "step": 42130 + }, + { + "epoch": 3.4130751782242386, + "grad_norm": 0.07058338075876236, + "learning_rate": 3.260722804806697e-05, + "loss": 0.2393, + "step": 42131 + }, + { + "epoch": 3.413156189241737, + "grad_norm": 0.09528659284114838, + "learning_rate": 3.2602727395472344e-05, + "loss": 0.2786, + "step": 42132 + }, + { + "epoch": 3.4132372002592355, + "grad_norm": 0.08425279706716537, + "learning_rate": 3.259822674287772e-05, + "loss": 0.2412, + "step": 42133 + }, + { + "epoch": 3.4133182112767337, + "grad_norm": 0.07566172629594803, + "learning_rate": 3.25937260902831e-05, + "loss": 0.2079, + "step": 42134 + }, + { + "epoch": 3.413399222294232, + "grad_norm": 0.08536390215158463, + "learning_rate": 3.2589225437688464e-05, + "loss": 0.245, + "step": 42135 + }, + { + "epoch": 3.4134802333117302, + "grad_norm": 0.05408225953578949, + "learning_rate": 3.258472478509384e-05, + "loss": 0.1951, + "step": 42136 + }, + { + "epoch": 3.413561244329229, + "grad_norm": 0.08090090751647949, + "learning_rate": 3.258022413249922e-05, + "loss": 0.2493, + "step": 42137 + }, + { + "epoch": 3.413642255346727, + "grad_norm": 0.07815258949995041, + "learning_rate": 3.2575723479904585e-05, + "loss": 0.2374, + "step": 42138 + }, + { + "epoch": 3.4137232663642254, + "grad_norm": 0.07899107038974762, + "learning_rate": 3.257122282730996e-05, + "loss": 0.2339, + "step": 42139 + }, + { + "epoch": 3.413804277381724, + "grad_norm": 0.08084166795015335, + "learning_rate": 3.256672217471534e-05, + "loss": 0.2064, + "step": 42140 + }, + { + "epoch": 3.4138852883992223, + "grad_norm": 0.06659887731075287, + "learning_rate": 3.256222152212071e-05, + "loss": 0.2341, + "step": 42141 + }, + { + "epoch": 3.4139662994167206, 
+ "grad_norm": 0.06368932873010635, + "learning_rate": 3.255772086952608e-05, + "loss": 0.2005, + "step": 42142 + }, + { + "epoch": 3.414047310434219, + "grad_norm": 0.07554011791944504, + "learning_rate": 3.255322021693146e-05, + "loss": 0.2267, + "step": 42143 + }, + { + "epoch": 3.4141283214517175, + "grad_norm": 0.07116712629795074, + "learning_rate": 3.2548719564336834e-05, + "loss": 0.2392, + "step": 42144 + }, + { + "epoch": 3.4142093324692158, + "grad_norm": 0.07155577838420868, + "learning_rate": 3.25442189117422e-05, + "loss": 0.2459, + "step": 42145 + }, + { + "epoch": 3.414290343486714, + "grad_norm": 0.07143085449934006, + "learning_rate": 3.253971825914758e-05, + "loss": 0.2184, + "step": 42146 + }, + { + "epoch": 3.4143713545042127, + "grad_norm": 0.07936885952949524, + "learning_rate": 3.2535217606552954e-05, + "loss": 0.2474, + "step": 42147 + }, + { + "epoch": 3.414452365521711, + "grad_norm": 0.06565649062395096, + "learning_rate": 3.253071695395832e-05, + "loss": 0.2152, + "step": 42148 + }, + { + "epoch": 3.414533376539209, + "grad_norm": 0.0671452209353447, + "learning_rate": 3.25262163013637e-05, + "loss": 0.2304, + "step": 42149 + }, + { + "epoch": 3.414614387556708, + "grad_norm": 0.08637391775846481, + "learning_rate": 3.2521715648769075e-05, + "loss": 0.209, + "step": 42150 + }, + { + "epoch": 3.414695398574206, + "grad_norm": 0.082486093044281, + "learning_rate": 3.251721499617444e-05, + "loss": 0.2421, + "step": 42151 + }, + { + "epoch": 3.4147764095917044, + "grad_norm": 0.0916626825928688, + "learning_rate": 3.251271434357982e-05, + "loss": 0.2255, + "step": 42152 + }, + { + "epoch": 3.414857420609203, + "grad_norm": 0.08788851648569107, + "learning_rate": 3.2508213690985196e-05, + "loss": 0.246, + "step": 42153 + }, + { + "epoch": 3.4149384316267013, + "grad_norm": 0.06302738189697266, + "learning_rate": 3.250371303839057e-05, + "loss": 0.2306, + "step": 42154 + }, + { + "epoch": 3.4150194426441995, + "grad_norm": 0.0750952884554863, + "learning_rate": 3.249921238579594e-05, + "loss": 0.2855, + "step": 42155 + }, + { + "epoch": 3.4151004536616982, + "grad_norm": 0.06570327281951904, + "learning_rate": 3.249471173320132e-05, + "loss": 0.1777, + "step": 42156 + }, + { + "epoch": 3.4151814646791965, + "grad_norm": 0.0844057947397232, + "learning_rate": 3.249021108060669e-05, + "loss": 0.2406, + "step": 42157 + }, + { + "epoch": 3.4152624756966947, + "grad_norm": 0.0791000947356224, + "learning_rate": 3.2485710428012064e-05, + "loss": 0.2303, + "step": 42158 + }, + { + "epoch": 3.415343486714193, + "grad_norm": 0.07193057239055634, + "learning_rate": 3.248120977541744e-05, + "loss": 0.2337, + "step": 42159 + }, + { + "epoch": 3.4154244977316917, + "grad_norm": 0.06310512125492096, + "learning_rate": 3.247670912282281e-05, + "loss": 0.2387, + "step": 42160 + }, + { + "epoch": 3.41550550874919, + "grad_norm": 0.06961341202259064, + "learning_rate": 3.2472208470228185e-05, + "loss": 0.1972, + "step": 42161 + }, + { + "epoch": 3.415586519766688, + "grad_norm": 0.08259893208742142, + "learning_rate": 3.246770781763356e-05, + "loss": 0.2519, + "step": 42162 + }, + { + "epoch": 3.415667530784187, + "grad_norm": 0.07711710780858994, + "learning_rate": 3.246320716503893e-05, + "loss": 0.2417, + "step": 42163 + }, + { + "epoch": 3.415748541801685, + "grad_norm": 0.07411278784275055, + "learning_rate": 3.2458706512444306e-05, + "loss": 0.2201, + "step": 42164 + }, + { + "epoch": 3.4158295528191833, + "grad_norm": 0.06382159888744354, + "learning_rate": 
3.245420585984968e-05, + "loss": 0.2448, + "step": 42165 + }, + { + "epoch": 3.4159105638366816, + "grad_norm": 0.07473917305469513, + "learning_rate": 3.244970520725505e-05, + "loss": 0.2149, + "step": 42166 + }, + { + "epoch": 3.4159915748541803, + "grad_norm": 0.06263245642185211, + "learning_rate": 3.244520455466043e-05, + "loss": 0.2017, + "step": 42167 + }, + { + "epoch": 3.4160725858716785, + "grad_norm": 0.08167140185832977, + "learning_rate": 3.24407039020658e-05, + "loss": 0.2018, + "step": 42168 + }, + { + "epoch": 3.4161535968891767, + "grad_norm": 0.06857883185148239, + "learning_rate": 3.2436203249471174e-05, + "loss": 0.2309, + "step": 42169 + }, + { + "epoch": 3.4162346079066754, + "grad_norm": 0.06811191141605377, + "learning_rate": 3.2431702596876554e-05, + "loss": 0.2043, + "step": 42170 + }, + { + "epoch": 3.4163156189241737, + "grad_norm": 0.06153199449181557, + "learning_rate": 3.242720194428192e-05, + "loss": 0.2259, + "step": 42171 + }, + { + "epoch": 3.416396629941672, + "grad_norm": 0.07126566022634506, + "learning_rate": 3.2422701291687295e-05, + "loss": 0.2328, + "step": 42172 + }, + { + "epoch": 3.4164776409591706, + "grad_norm": 0.06919670104980469, + "learning_rate": 3.2418200639092675e-05, + "loss": 0.2584, + "step": 42173 + }, + { + "epoch": 3.416558651976669, + "grad_norm": 0.07292457669973373, + "learning_rate": 3.241369998649804e-05, + "loss": 0.2119, + "step": 42174 + }, + { + "epoch": 3.416639662994167, + "grad_norm": 0.08359368145465851, + "learning_rate": 3.2409199333903415e-05, + "loss": 0.2248, + "step": 42175 + }, + { + "epoch": 3.416720674011666, + "grad_norm": 0.06527353078126907, + "learning_rate": 3.2404698681308796e-05, + "loss": 0.2011, + "step": 42176 + }, + { + "epoch": 3.416801685029164, + "grad_norm": 0.06260892003774643, + "learning_rate": 3.240019802871416e-05, + "loss": 0.2024, + "step": 42177 + }, + { + "epoch": 3.4168826960466623, + "grad_norm": 0.06110214442014694, + "learning_rate": 3.2395697376119536e-05, + "loss": 0.195, + "step": 42178 + }, + { + "epoch": 3.416963707064161, + "grad_norm": 0.08721933513879776, + "learning_rate": 3.2391196723524917e-05, + "loss": 0.2181, + "step": 42179 + }, + { + "epoch": 3.417044718081659, + "grad_norm": 0.07511059194803238, + "learning_rate": 3.238669607093029e-05, + "loss": 0.2377, + "step": 42180 + }, + { + "epoch": 3.4171257290991575, + "grad_norm": 0.06601770967245102, + "learning_rate": 3.238219541833566e-05, + "loss": 0.2181, + "step": 42181 + }, + { + "epoch": 3.4172067401166557, + "grad_norm": 0.07815555483102798, + "learning_rate": 3.237769476574104e-05, + "loss": 0.2616, + "step": 42182 + }, + { + "epoch": 3.4172877511341544, + "grad_norm": 0.08321576565504074, + "learning_rate": 3.237319411314641e-05, + "loss": 0.2319, + "step": 42183 + }, + { + "epoch": 3.4173687621516526, + "grad_norm": 0.06954724341630936, + "learning_rate": 3.236869346055178e-05, + "loss": 0.2003, + "step": 42184 + }, + { + "epoch": 3.417449773169151, + "grad_norm": 0.07065138220787048, + "learning_rate": 3.236419280795716e-05, + "loss": 0.205, + "step": 42185 + }, + { + "epoch": 3.4175307841866496, + "grad_norm": 0.07739468663930893, + "learning_rate": 3.235969215536253e-05, + "loss": 0.2019, + "step": 42186 + }, + { + "epoch": 3.417611795204148, + "grad_norm": 0.06493613123893738, + "learning_rate": 3.23551915027679e-05, + "loss": 0.238, + "step": 42187 + }, + { + "epoch": 3.417692806221646, + "grad_norm": 0.07237594574689865, + "learning_rate": 3.235069085017328e-05, + "loss": 0.2355, + "step": 42188 + }, + { 
+ "epoch": 3.4177738172391443, + "grad_norm": 0.08828700333833694, + "learning_rate": 3.234619019757865e-05, + "loss": 0.2311, + "step": 42189 + }, + { + "epoch": 3.417854828256643, + "grad_norm": 0.09129707515239716, + "learning_rate": 3.234168954498402e-05, + "loss": 0.237, + "step": 42190 + }, + { + "epoch": 3.4179358392741412, + "grad_norm": 0.06474599242210388, + "learning_rate": 3.23371888923894e-05, + "loss": 0.2354, + "step": 42191 + }, + { + "epoch": 3.4180168502916395, + "grad_norm": 0.07153916358947754, + "learning_rate": 3.2332688239794773e-05, + "loss": 0.1957, + "step": 42192 + }, + { + "epoch": 3.418097861309138, + "grad_norm": 0.074933722615242, + "learning_rate": 3.232818758720015e-05, + "loss": 0.282, + "step": 42193 + }, + { + "epoch": 3.4181788723266364, + "grad_norm": 0.07410770654678345, + "learning_rate": 3.232368693460552e-05, + "loss": 0.2521, + "step": 42194 + }, + { + "epoch": 3.4182598833441347, + "grad_norm": 0.0762971043586731, + "learning_rate": 3.2319186282010894e-05, + "loss": 0.252, + "step": 42195 + }, + { + "epoch": 3.4183408943616334, + "grad_norm": 0.06339481472969055, + "learning_rate": 3.231468562941627e-05, + "loss": 0.2324, + "step": 42196 + }, + { + "epoch": 3.4184219053791316, + "grad_norm": 0.0860566571354866, + "learning_rate": 3.231018497682164e-05, + "loss": 0.2395, + "step": 42197 + }, + { + "epoch": 3.41850291639663, + "grad_norm": 0.07654612511396408, + "learning_rate": 3.2305684324227015e-05, + "loss": 0.2552, + "step": 42198 + }, + { + "epoch": 3.4185839274141285, + "grad_norm": 0.06669507175683975, + "learning_rate": 3.230118367163239e-05, + "loss": 0.2329, + "step": 42199 + }, + { + "epoch": 3.4186649384316268, + "grad_norm": 0.06842575967311859, + "learning_rate": 3.229668301903776e-05, + "loss": 0.2053, + "step": 42200 + }, + { + "epoch": 3.418745949449125, + "grad_norm": 0.07196977734565735, + "learning_rate": 3.2292182366443136e-05, + "loss": 0.2686, + "step": 42201 + }, + { + "epoch": 3.4188269604666237, + "grad_norm": 0.06731472909450531, + "learning_rate": 3.228768171384851e-05, + "loss": 0.2441, + "step": 42202 + }, + { + "epoch": 3.418907971484122, + "grad_norm": 0.09737128764390945, + "learning_rate": 3.228318106125388e-05, + "loss": 0.2306, + "step": 42203 + }, + { + "epoch": 3.41898898250162, + "grad_norm": 0.06198712810873985, + "learning_rate": 3.227868040865926e-05, + "loss": 0.2055, + "step": 42204 + }, + { + "epoch": 3.4190699935191184, + "grad_norm": 0.0738559439778328, + "learning_rate": 3.227417975606463e-05, + "loss": 0.2272, + "step": 42205 + }, + { + "epoch": 3.419151004536617, + "grad_norm": 0.07987718284130096, + "learning_rate": 3.226967910347001e-05, + "loss": 0.2554, + "step": 42206 + }, + { + "epoch": 3.4192320155541154, + "grad_norm": 0.07169181108474731, + "learning_rate": 3.226517845087538e-05, + "loss": 0.1866, + "step": 42207 + }, + { + "epoch": 3.4193130265716136, + "grad_norm": 0.0694207176566124, + "learning_rate": 3.226067779828075e-05, + "loss": 0.2508, + "step": 42208 + }, + { + "epoch": 3.4193940375891123, + "grad_norm": 0.07899581640958786, + "learning_rate": 3.225617714568613e-05, + "loss": 0.2186, + "step": 42209 + }, + { + "epoch": 3.4194750486066106, + "grad_norm": 0.05652213841676712, + "learning_rate": 3.22516764930915e-05, + "loss": 0.211, + "step": 42210 + }, + { + "epoch": 3.419556059624109, + "grad_norm": 0.06238555163145065, + "learning_rate": 3.224717584049687e-05, + "loss": 0.2145, + "step": 42211 + }, + { + "epoch": 3.419637070641607, + "grad_norm": 0.07030345499515533, + 
"learning_rate": 3.224267518790225e-05, + "loss": 0.2142, + "step": 42212 + }, + { + "epoch": 3.4197180816591057, + "grad_norm": 0.06452500075101852, + "learning_rate": 3.223817453530762e-05, + "loss": 0.2548, + "step": 42213 + }, + { + "epoch": 3.419799092676604, + "grad_norm": 0.06940393894910812, + "learning_rate": 3.223367388271299e-05, + "loss": 0.2664, + "step": 42214 + }, + { + "epoch": 3.4198801036941022, + "grad_norm": 0.06870554387569427, + "learning_rate": 3.222917323011837e-05, + "loss": 0.1984, + "step": 42215 + }, + { + "epoch": 3.419961114711601, + "grad_norm": 0.0725892186164856, + "learning_rate": 3.222467257752374e-05, + "loss": 0.2112, + "step": 42216 + }, + { + "epoch": 3.420042125729099, + "grad_norm": 0.08166629821062088, + "learning_rate": 3.2220171924929113e-05, + "loss": 0.2612, + "step": 42217 + }, + { + "epoch": 3.4201231367465974, + "grad_norm": 0.09648881107568741, + "learning_rate": 3.2215671272334494e-05, + "loss": 0.2564, + "step": 42218 + }, + { + "epoch": 3.420204147764096, + "grad_norm": 0.07916789501905441, + "learning_rate": 3.221117061973986e-05, + "loss": 0.2157, + "step": 42219 + }, + { + "epoch": 3.4202851587815943, + "grad_norm": 0.07626831531524658, + "learning_rate": 3.2206669967145234e-05, + "loss": 0.2415, + "step": 42220 + }, + { + "epoch": 3.4203661697990926, + "grad_norm": 0.06276006996631622, + "learning_rate": 3.2202169314550615e-05, + "loss": 0.1931, + "step": 42221 + }, + { + "epoch": 3.4204471808165913, + "grad_norm": 0.06504850834608078, + "learning_rate": 3.219766866195599e-05, + "loss": 0.2402, + "step": 42222 + }, + { + "epoch": 3.4205281918340895, + "grad_norm": 0.07947579771280289, + "learning_rate": 3.2193168009361355e-05, + "loss": 0.2329, + "step": 42223 + }, + { + "epoch": 3.4206092028515878, + "grad_norm": 0.06318650394678116, + "learning_rate": 3.2188667356766735e-05, + "loss": 0.2145, + "step": 42224 + }, + { + "epoch": 3.420690213869086, + "grad_norm": 0.07059568166732788, + "learning_rate": 3.218416670417211e-05, + "loss": 0.2133, + "step": 42225 + }, + { + "epoch": 3.4207712248865847, + "grad_norm": 0.06386543810367584, + "learning_rate": 3.2179666051577476e-05, + "loss": 0.2222, + "step": 42226 + }, + { + "epoch": 3.420852235904083, + "grad_norm": 0.061109598726034164, + "learning_rate": 3.2175165398982856e-05, + "loss": 0.2319, + "step": 42227 + }, + { + "epoch": 3.420933246921581, + "grad_norm": 0.06750620156526566, + "learning_rate": 3.217066474638823e-05, + "loss": 0.2535, + "step": 42228 + }, + { + "epoch": 3.42101425793908, + "grad_norm": 0.06807658076286316, + "learning_rate": 3.21661640937936e-05, + "loss": 0.2076, + "step": 42229 + }, + { + "epoch": 3.421095268956578, + "grad_norm": 0.06978774815797806, + "learning_rate": 3.216166344119898e-05, + "loss": 0.2202, + "step": 42230 + }, + { + "epoch": 3.4211762799740764, + "grad_norm": 0.07130514085292816, + "learning_rate": 3.215716278860435e-05, + "loss": 0.2647, + "step": 42231 + }, + { + "epoch": 3.4212572909915746, + "grad_norm": 0.04566334933042526, + "learning_rate": 3.215266213600972e-05, + "loss": 0.2001, + "step": 42232 + }, + { + "epoch": 3.4213383020090733, + "grad_norm": 0.06830474734306335, + "learning_rate": 3.21481614834151e-05, + "loss": 0.2452, + "step": 42233 + }, + { + "epoch": 3.4214193130265715, + "grad_norm": 0.0593663826584816, + "learning_rate": 3.214366083082047e-05, + "loss": 0.2232, + "step": 42234 + }, + { + "epoch": 3.42150032404407, + "grad_norm": 0.06897901743650436, + "learning_rate": 3.2139160178225845e-05, + "loss": 0.259, + 
"step": 42235 + }, + { + "epoch": 3.4215813350615685, + "grad_norm": 0.06421735137701035, + "learning_rate": 3.213465952563122e-05, + "loss": 0.2263, + "step": 42236 + }, + { + "epoch": 3.4216623460790667, + "grad_norm": 0.0799030214548111, + "learning_rate": 3.213015887303659e-05, + "loss": 0.2761, + "step": 42237 + }, + { + "epoch": 3.421743357096565, + "grad_norm": 0.07802151888608932, + "learning_rate": 3.2125658220441966e-05, + "loss": 0.219, + "step": 42238 + }, + { + "epoch": 3.4218243681140637, + "grad_norm": 0.08840132504701614, + "learning_rate": 3.212115756784734e-05, + "loss": 0.2507, + "step": 42239 + }, + { + "epoch": 3.421905379131562, + "grad_norm": 0.09409713000059128, + "learning_rate": 3.211665691525271e-05, + "loss": 0.2447, + "step": 42240 + }, + { + "epoch": 3.42198639014906, + "grad_norm": 0.08871884644031525, + "learning_rate": 3.211215626265809e-05, + "loss": 0.2194, + "step": 42241 + }, + { + "epoch": 3.422067401166559, + "grad_norm": 0.06486831605434418, + "learning_rate": 3.210765561006346e-05, + "loss": 0.2718, + "step": 42242 + }, + { + "epoch": 3.422148412184057, + "grad_norm": 0.07197066396474838, + "learning_rate": 3.2103154957468834e-05, + "loss": 0.2166, + "step": 42243 + }, + { + "epoch": 3.4222294232015553, + "grad_norm": 0.0681011825799942, + "learning_rate": 3.209865430487421e-05, + "loss": 0.2244, + "step": 42244 + }, + { + "epoch": 3.422310434219054, + "grad_norm": 0.07793204486370087, + "learning_rate": 3.209415365227958e-05, + "loss": 0.2591, + "step": 42245 + }, + { + "epoch": 3.4223914452365523, + "grad_norm": 0.08512086421251297, + "learning_rate": 3.2089652999684955e-05, + "loss": 0.2473, + "step": 42246 + }, + { + "epoch": 3.4224724562540505, + "grad_norm": 0.07522961497306824, + "learning_rate": 3.208515234709033e-05, + "loss": 0.2238, + "step": 42247 + }, + { + "epoch": 3.4225534672715487, + "grad_norm": 0.059873390942811966, + "learning_rate": 3.208065169449571e-05, + "loss": 0.2099, + "step": 42248 + }, + { + "epoch": 3.4226344782890474, + "grad_norm": 0.07078446447849274, + "learning_rate": 3.2076151041901076e-05, + "loss": 0.2062, + "step": 42249 + }, + { + "epoch": 3.4227154893065457, + "grad_norm": 0.08134453743696213, + "learning_rate": 3.207165038930645e-05, + "loss": 0.2315, + "step": 42250 + }, + { + "epoch": 3.422796500324044, + "grad_norm": 0.057578153908252716, + "learning_rate": 3.206714973671183e-05, + "loss": 0.2245, + "step": 42251 + }, + { + "epoch": 3.4228775113415426, + "grad_norm": 0.08758700639009476, + "learning_rate": 3.2062649084117196e-05, + "loss": 0.2325, + "step": 42252 + }, + { + "epoch": 3.422958522359041, + "grad_norm": 0.07909160107374191, + "learning_rate": 3.205814843152257e-05, + "loss": 0.2182, + "step": 42253 + }, + { + "epoch": 3.423039533376539, + "grad_norm": 0.07638631761074066, + "learning_rate": 3.205364777892795e-05, + "loss": 0.2338, + "step": 42254 + }, + { + "epoch": 3.4231205443940373, + "grad_norm": 0.0887717455625534, + "learning_rate": 3.204914712633332e-05, + "loss": 0.2464, + "step": 42255 + }, + { + "epoch": 3.423201555411536, + "grad_norm": 0.07865258306264877, + "learning_rate": 3.204464647373869e-05, + "loss": 0.2144, + "step": 42256 + }, + { + "epoch": 3.4232825664290343, + "grad_norm": 0.07845400273799896, + "learning_rate": 3.204014582114407e-05, + "loss": 0.223, + "step": 42257 + }, + { + "epoch": 3.4233635774465325, + "grad_norm": 0.0784381777048111, + "learning_rate": 3.203564516854944e-05, + "loss": 0.2613, + "step": 42258 + }, + { + "epoch": 3.423444588464031, + "grad_norm": 
0.07567717880010605, + "learning_rate": 3.203114451595481e-05, + "loss": 0.2165, + "step": 42259 + }, + { + "epoch": 3.4235255994815295, + "grad_norm": 0.07774611562490463, + "learning_rate": 3.202664386336019e-05, + "loss": 0.2537, + "step": 42260 + }, + { + "epoch": 3.4236066104990277, + "grad_norm": 0.05703600496053696, + "learning_rate": 3.2022143210765566e-05, + "loss": 0.1936, + "step": 42261 + }, + { + "epoch": 3.4236876215165264, + "grad_norm": 0.061656661331653595, + "learning_rate": 3.201764255817093e-05, + "loss": 0.219, + "step": 42262 + }, + { + "epoch": 3.4237686325340246, + "grad_norm": 0.06856316328048706, + "learning_rate": 3.201314190557631e-05, + "loss": 0.2273, + "step": 42263 + }, + { + "epoch": 3.423849643551523, + "grad_norm": 0.08919894695281982, + "learning_rate": 3.2008641252981686e-05, + "loss": 0.2685, + "step": 42264 + }, + { + "epoch": 3.4239306545690216, + "grad_norm": 0.09014753997325897, + "learning_rate": 3.200414060038705e-05, + "loss": 0.2524, + "step": 42265 + }, + { + "epoch": 3.42401166558652, + "grad_norm": 0.057525500655174255, + "learning_rate": 3.1999639947792434e-05, + "loss": 0.2013, + "step": 42266 + }, + { + "epoch": 3.424092676604018, + "grad_norm": 0.07143286615610123, + "learning_rate": 3.199513929519781e-05, + "loss": 0.2349, + "step": 42267 + }, + { + "epoch": 3.4241736876215167, + "grad_norm": 0.06419500708580017, + "learning_rate": 3.1990638642603174e-05, + "loss": 0.2281, + "step": 42268 + }, + { + "epoch": 3.424254698639015, + "grad_norm": 0.05905544385313988, + "learning_rate": 3.1986137990008554e-05, + "loss": 0.2045, + "step": 42269 + }, + { + "epoch": 3.4243357096565132, + "grad_norm": 0.06558793783187866, + "learning_rate": 3.198163733741393e-05, + "loss": 0.2118, + "step": 42270 + }, + { + "epoch": 3.4244167206740115, + "grad_norm": 0.06668176501989365, + "learning_rate": 3.1977136684819295e-05, + "loss": 0.2378, + "step": 42271 + }, + { + "epoch": 3.42449773169151, + "grad_norm": 0.05425681918859482, + "learning_rate": 3.1972636032224675e-05, + "loss": 0.1832, + "step": 42272 + }, + { + "epoch": 3.4245787427090084, + "grad_norm": 0.05808640643954277, + "learning_rate": 3.196813537963005e-05, + "loss": 0.2174, + "step": 42273 + }, + { + "epoch": 3.4246597537265067, + "grad_norm": 0.08048558980226517, + "learning_rate": 3.196363472703542e-05, + "loss": 0.2616, + "step": 42274 + }, + { + "epoch": 3.4247407647440054, + "grad_norm": 0.07026704400777817, + "learning_rate": 3.1959134074440796e-05, + "loss": 0.2646, + "step": 42275 + }, + { + "epoch": 3.4248217757615036, + "grad_norm": 0.0638585314154625, + "learning_rate": 3.195463342184617e-05, + "loss": 0.2102, + "step": 42276 + }, + { + "epoch": 3.424902786779002, + "grad_norm": 0.06224946305155754, + "learning_rate": 3.195013276925154e-05, + "loss": 0.2134, + "step": 42277 + }, + { + "epoch": 3.4249837977965, + "grad_norm": 0.07324585318565369, + "learning_rate": 3.194563211665692e-05, + "loss": 0.2348, + "step": 42278 + }, + { + "epoch": 3.4250648088139988, + "grad_norm": 0.07813339680433273, + "learning_rate": 3.194113146406229e-05, + "loss": 0.2311, + "step": 42279 + }, + { + "epoch": 3.425145819831497, + "grad_norm": 0.07276789844036102, + "learning_rate": 3.1936630811467664e-05, + "loss": 0.2428, + "step": 42280 + }, + { + "epoch": 3.4252268308489953, + "grad_norm": 0.06303577870130539, + "learning_rate": 3.193213015887304e-05, + "loss": 0.1835, + "step": 42281 + }, + { + "epoch": 3.425307841866494, + "grad_norm": 0.06274379789829254, + "learning_rate": 3.192762950627841e-05, 
+ "loss": 0.2052, + "step": 42282 + }, + { + "epoch": 3.425388852883992, + "grad_norm": 0.0668783113360405, + "learning_rate": 3.1923128853683785e-05, + "loss": 0.2421, + "step": 42283 + }, + { + "epoch": 3.4254698639014904, + "grad_norm": 0.08327113837003708, + "learning_rate": 3.191862820108916e-05, + "loss": 0.2334, + "step": 42284 + }, + { + "epoch": 3.425550874918989, + "grad_norm": 0.06803330034017563, + "learning_rate": 3.191412754849453e-05, + "loss": 0.1938, + "step": 42285 + }, + { + "epoch": 3.4256318859364874, + "grad_norm": 0.06721960753202438, + "learning_rate": 3.1909626895899906e-05, + "loss": 0.2019, + "step": 42286 + }, + { + "epoch": 3.4257128969539856, + "grad_norm": 0.07778126001358032, + "learning_rate": 3.1905126243305286e-05, + "loss": 0.2143, + "step": 42287 + }, + { + "epoch": 3.4257939079714843, + "grad_norm": 0.08955970406532288, + "learning_rate": 3.190062559071065e-05, + "loss": 0.2377, + "step": 42288 + }, + { + "epoch": 3.4258749189889826, + "grad_norm": 0.07158930599689484, + "learning_rate": 3.1896124938116026e-05, + "loss": 0.1812, + "step": 42289 + }, + { + "epoch": 3.425955930006481, + "grad_norm": 0.06083231419324875, + "learning_rate": 3.189162428552141e-05, + "loss": 0.1819, + "step": 42290 + }, + { + "epoch": 3.4260369410239795, + "grad_norm": 0.08491310477256775, + "learning_rate": 3.1887123632926774e-05, + "loss": 0.258, + "step": 42291 + }, + { + "epoch": 3.4261179520414777, + "grad_norm": 0.06328444182872772, + "learning_rate": 3.188262298033215e-05, + "loss": 0.2139, + "step": 42292 + }, + { + "epoch": 3.426198963058976, + "grad_norm": 0.07530766725540161, + "learning_rate": 3.187812232773753e-05, + "loss": 0.201, + "step": 42293 + }, + { + "epoch": 3.426279974076474, + "grad_norm": 0.07389926165342331, + "learning_rate": 3.1873621675142894e-05, + "loss": 0.2153, + "step": 42294 + }, + { + "epoch": 3.426360985093973, + "grad_norm": 0.0851617231965065, + "learning_rate": 3.186912102254827e-05, + "loss": 0.2333, + "step": 42295 + }, + { + "epoch": 3.426441996111471, + "grad_norm": 0.09308183193206787, + "learning_rate": 3.186462036995365e-05, + "loss": 0.2593, + "step": 42296 + }, + { + "epoch": 3.4265230071289694, + "grad_norm": 0.06928377598524094, + "learning_rate": 3.1860119717359015e-05, + "loss": 0.2096, + "step": 42297 + }, + { + "epoch": 3.426604018146468, + "grad_norm": 0.06693492829799652, + "learning_rate": 3.185561906476439e-05, + "loss": 0.2151, + "step": 42298 + }, + { + "epoch": 3.4266850291639663, + "grad_norm": 0.06244364008307457, + "learning_rate": 3.185111841216977e-05, + "loss": 0.2239, + "step": 42299 + }, + { + "epoch": 3.4267660401814646, + "grad_norm": 0.07289474457502365, + "learning_rate": 3.184661775957514e-05, + "loss": 0.2284, + "step": 42300 + }, + { + "epoch": 3.426847051198963, + "grad_norm": 0.08296742290258408, + "learning_rate": 3.184211710698051e-05, + "loss": 0.2659, + "step": 42301 + }, + { + "epoch": 3.4269280622164615, + "grad_norm": 0.08548368513584137, + "learning_rate": 3.183761645438589e-05, + "loss": 0.2224, + "step": 42302 + }, + { + "epoch": 3.4270090732339598, + "grad_norm": 0.07132833451032639, + "learning_rate": 3.1833115801791264e-05, + "loss": 0.2214, + "step": 42303 + }, + { + "epoch": 3.427090084251458, + "grad_norm": 0.08157029747962952, + "learning_rate": 3.182861514919663e-05, + "loss": 0.2263, + "step": 42304 + }, + { + "epoch": 3.4271710952689567, + "grad_norm": 0.07904928177595139, + "learning_rate": 3.182411449660201e-05, + "loss": 0.2624, + "step": 42305 + }, + { + "epoch": 
3.427252106286455, + "grad_norm": 0.09257368743419647, + "learning_rate": 3.1819613844007384e-05, + "loss": 0.2928, + "step": 42306 + }, + { + "epoch": 3.427333117303953, + "grad_norm": 0.07520698755979538, + "learning_rate": 3.181511319141275e-05, + "loss": 0.211, + "step": 42307 + }, + { + "epoch": 3.427414128321452, + "grad_norm": 0.06931690126657486, + "learning_rate": 3.181061253881813e-05, + "loss": 0.2588, + "step": 42308 + }, + { + "epoch": 3.42749513933895, + "grad_norm": 0.07930534332990646, + "learning_rate": 3.1806111886223505e-05, + "loss": 0.2293, + "step": 42309 + }, + { + "epoch": 3.4275761503564484, + "grad_norm": 0.08392751216888428, + "learning_rate": 3.180161123362887e-05, + "loss": 0.2733, + "step": 42310 + }, + { + "epoch": 3.427657161373947, + "grad_norm": 0.06310337781906128, + "learning_rate": 3.179711058103425e-05, + "loss": 0.2267, + "step": 42311 + }, + { + "epoch": 3.4277381723914453, + "grad_norm": 0.08746607601642609, + "learning_rate": 3.1792609928439626e-05, + "loss": 0.1982, + "step": 42312 + }, + { + "epoch": 3.4278191834089435, + "grad_norm": 0.059589460492134094, + "learning_rate": 3.1788109275845e-05, + "loss": 0.2267, + "step": 42313 + }, + { + "epoch": 3.4279001944264422, + "grad_norm": 0.07890890538692474, + "learning_rate": 3.178360862325037e-05, + "loss": 0.2439, + "step": 42314 + }, + { + "epoch": 3.4279812054439405, + "grad_norm": 0.07259047776460648, + "learning_rate": 3.177910797065575e-05, + "loss": 0.2305, + "step": 42315 + }, + { + "epoch": 3.4280622164614387, + "grad_norm": 0.09121621400117874, + "learning_rate": 3.177460731806112e-05, + "loss": 0.241, + "step": 42316 + }, + { + "epoch": 3.428143227478937, + "grad_norm": 0.0720064714550972, + "learning_rate": 3.1770106665466494e-05, + "loss": 0.1922, + "step": 42317 + }, + { + "epoch": 3.4282242384964356, + "grad_norm": 0.07883750647306442, + "learning_rate": 3.176560601287187e-05, + "loss": 0.2434, + "step": 42318 + }, + { + "epoch": 3.428305249513934, + "grad_norm": 0.07507482171058655, + "learning_rate": 3.176110536027724e-05, + "loss": 0.2453, + "step": 42319 + }, + { + "epoch": 3.428386260531432, + "grad_norm": 0.09118028730154037, + "learning_rate": 3.1756604707682615e-05, + "loss": 0.251, + "step": 42320 + }, + { + "epoch": 3.428467271548931, + "grad_norm": 0.08633448928594589, + "learning_rate": 3.175210405508799e-05, + "loss": 0.229, + "step": 42321 + }, + { + "epoch": 3.428548282566429, + "grad_norm": 0.07962105423212051, + "learning_rate": 3.174760340249336e-05, + "loss": 0.1936, + "step": 42322 + }, + { + "epoch": 3.4286292935839273, + "grad_norm": 0.0843440443277359, + "learning_rate": 3.1743102749898736e-05, + "loss": 0.2523, + "step": 42323 + }, + { + "epoch": 3.4287103046014256, + "grad_norm": 0.06754980981349945, + "learning_rate": 3.173860209730411e-05, + "loss": 0.2247, + "step": 42324 + }, + { + "epoch": 3.4287913156189243, + "grad_norm": 0.0783877968788147, + "learning_rate": 3.173410144470948e-05, + "loss": 0.2428, + "step": 42325 + }, + { + "epoch": 3.4288723266364225, + "grad_norm": 0.06497252732515335, + "learning_rate": 3.172960079211486e-05, + "loss": 0.2004, + "step": 42326 + }, + { + "epoch": 3.4289533376539207, + "grad_norm": 0.08234023302793503, + "learning_rate": 3.172510013952023e-05, + "loss": 0.2052, + "step": 42327 + }, + { + "epoch": 3.4290343486714194, + "grad_norm": 0.06907796114683151, + "learning_rate": 3.1720599486925604e-05, + "loss": 0.2391, + "step": 42328 + }, + { + "epoch": 3.4291153596889177, + "grad_norm": 0.06292956322431564, + 
"learning_rate": 3.1716098834330984e-05, + "loss": 0.2225, + "step": 42329 + }, + { + "epoch": 3.429196370706416, + "grad_norm": 0.08327022939920425, + "learning_rate": 3.171159818173635e-05, + "loss": 0.2288, + "step": 42330 + }, + { + "epoch": 3.4292773817239146, + "grad_norm": 0.07891655713319778, + "learning_rate": 3.1707097529141725e-05, + "loss": 0.25, + "step": 42331 + }, + { + "epoch": 3.429358392741413, + "grad_norm": 0.08023487776517868, + "learning_rate": 3.1702596876547105e-05, + "loss": 0.2491, + "step": 42332 + }, + { + "epoch": 3.429439403758911, + "grad_norm": 0.07012294977903366, + "learning_rate": 3.169809622395247e-05, + "loss": 0.2195, + "step": 42333 + }, + { + "epoch": 3.42952041477641, + "grad_norm": 0.06975067406892776, + "learning_rate": 3.1693595571357845e-05, + "loss": 0.2312, + "step": 42334 + }, + { + "epoch": 3.429601425793908, + "grad_norm": 0.05552361533045769, + "learning_rate": 3.1689094918763226e-05, + "loss": 0.2132, + "step": 42335 + }, + { + "epoch": 3.4296824368114063, + "grad_norm": 0.08807866275310516, + "learning_rate": 3.168459426616859e-05, + "loss": 0.2089, + "step": 42336 + }, + { + "epoch": 3.429763447828905, + "grad_norm": 0.05967969819903374, + "learning_rate": 3.1680093613573966e-05, + "loss": 0.2422, + "step": 42337 + }, + { + "epoch": 3.429844458846403, + "grad_norm": 0.07136420905590057, + "learning_rate": 3.1675592960979347e-05, + "loss": 0.2563, + "step": 42338 + }, + { + "epoch": 3.4299254698639015, + "grad_norm": 0.06767350435256958, + "learning_rate": 3.167109230838472e-05, + "loss": 0.2122, + "step": 42339 + }, + { + "epoch": 3.4300064808813997, + "grad_norm": 0.07169254124164581, + "learning_rate": 3.166659165579009e-05, + "loss": 0.2215, + "step": 42340 + }, + { + "epoch": 3.4300874918988984, + "grad_norm": 0.07639869302511215, + "learning_rate": 3.166209100319547e-05, + "loss": 0.2776, + "step": 42341 + }, + { + "epoch": 3.4301685029163966, + "grad_norm": 0.057808637619018555, + "learning_rate": 3.165759035060084e-05, + "loss": 0.1903, + "step": 42342 + }, + { + "epoch": 3.430249513933895, + "grad_norm": 0.07123946398496628, + "learning_rate": 3.165308969800621e-05, + "loss": 0.2204, + "step": 42343 + }, + { + "epoch": 3.4303305249513936, + "grad_norm": 0.08755022287368774, + "learning_rate": 3.164858904541159e-05, + "loss": 0.2224, + "step": 42344 + }, + { + "epoch": 3.430411535968892, + "grad_norm": 0.07249519228935242, + "learning_rate": 3.164408839281696e-05, + "loss": 0.2239, + "step": 42345 + }, + { + "epoch": 3.43049254698639, + "grad_norm": 0.08545974642038345, + "learning_rate": 3.163958774022233e-05, + "loss": 0.2506, + "step": 42346 + }, + { + "epoch": 3.4305735580038883, + "grad_norm": 0.08462615311145782, + "learning_rate": 3.163508708762771e-05, + "loss": 0.2436, + "step": 42347 + }, + { + "epoch": 3.430654569021387, + "grad_norm": 0.06520073860883713, + "learning_rate": 3.163058643503308e-05, + "loss": 0.226, + "step": 42348 + }, + { + "epoch": 3.4307355800388852, + "grad_norm": 0.08690381050109863, + "learning_rate": 3.162608578243845e-05, + "loss": 0.2199, + "step": 42349 + }, + { + "epoch": 3.4308165910563835, + "grad_norm": 0.07840859144926071, + "learning_rate": 3.162158512984383e-05, + "loss": 0.2489, + "step": 42350 + }, + { + "epoch": 3.430897602073882, + "grad_norm": 0.07008511573076248, + "learning_rate": 3.1617084477249203e-05, + "loss": 0.2561, + "step": 42351 + }, + { + "epoch": 3.4309786130913804, + "grad_norm": 0.05991966649889946, + "learning_rate": 3.161258382465458e-05, + "loss": 0.2487, + 
"step": 42352 + }, + { + "epoch": 3.4310596241088787, + "grad_norm": 0.06471392512321472, + "learning_rate": 3.160808317205995e-05, + "loss": 0.2362, + "step": 42353 + }, + { + "epoch": 3.4311406351263773, + "grad_norm": 0.07750452309846878, + "learning_rate": 3.1603582519465324e-05, + "loss": 0.2639, + "step": 42354 + }, + { + "epoch": 3.4312216461438756, + "grad_norm": 0.07131041586399078, + "learning_rate": 3.15990818668707e-05, + "loss": 0.2227, + "step": 42355 + }, + { + "epoch": 3.431302657161374, + "grad_norm": 0.07483736425638199, + "learning_rate": 3.159458121427607e-05, + "loss": 0.2224, + "step": 42356 + }, + { + "epoch": 3.4313836681788725, + "grad_norm": 0.06175358220934868, + "learning_rate": 3.1590080561681445e-05, + "loss": 0.2631, + "step": 42357 + }, + { + "epoch": 3.4314646791963708, + "grad_norm": 0.07144345343112946, + "learning_rate": 3.158557990908682e-05, + "loss": 0.2151, + "step": 42358 + }, + { + "epoch": 3.431545690213869, + "grad_norm": 0.07201637327671051, + "learning_rate": 3.158107925649219e-05, + "loss": 0.2097, + "step": 42359 + }, + { + "epoch": 3.4316267012313677, + "grad_norm": 0.06086525321006775, + "learning_rate": 3.1576578603897566e-05, + "loss": 0.2005, + "step": 42360 + }, + { + "epoch": 3.431707712248866, + "grad_norm": 0.062122467905282974, + "learning_rate": 3.157207795130294e-05, + "loss": 0.2261, + "step": 42361 + }, + { + "epoch": 3.431788723266364, + "grad_norm": 0.06578835099935532, + "learning_rate": 3.156757729870831e-05, + "loss": 0.2348, + "step": 42362 + }, + { + "epoch": 3.4318697342838624, + "grad_norm": 0.06421864032745361, + "learning_rate": 3.156307664611369e-05, + "loss": 0.218, + "step": 42363 + }, + { + "epoch": 3.431950745301361, + "grad_norm": 0.069000244140625, + "learning_rate": 3.155857599351906e-05, + "loss": 0.2665, + "step": 42364 + }, + { + "epoch": 3.4320317563188594, + "grad_norm": 0.07800819724798203, + "learning_rate": 3.155407534092444e-05, + "loss": 0.2314, + "step": 42365 + }, + { + "epoch": 3.4321127673363576, + "grad_norm": 0.06821920722723007, + "learning_rate": 3.154957468832981e-05, + "loss": 0.2119, + "step": 42366 + }, + { + "epoch": 3.4321937783538563, + "grad_norm": 0.05828966572880745, + "learning_rate": 3.154507403573518e-05, + "loss": 0.2396, + "step": 42367 + }, + { + "epoch": 3.4322747893713546, + "grad_norm": 0.08024381101131439, + "learning_rate": 3.154057338314056e-05, + "loss": 0.2105, + "step": 42368 + }, + { + "epoch": 3.432355800388853, + "grad_norm": 0.0659569799900055, + "learning_rate": 3.153607273054593e-05, + "loss": 0.2102, + "step": 42369 + }, + { + "epoch": 3.432436811406351, + "grad_norm": 0.08766531944274902, + "learning_rate": 3.15315720779513e-05, + "loss": 0.2241, + "step": 42370 + }, + { + "epoch": 3.4325178224238497, + "grad_norm": 0.08283737301826477, + "learning_rate": 3.152707142535668e-05, + "loss": 0.2663, + "step": 42371 + }, + { + "epoch": 3.432598833441348, + "grad_norm": 0.07166048884391785, + "learning_rate": 3.152257077276205e-05, + "loss": 0.2281, + "step": 42372 + }, + { + "epoch": 3.432679844458846, + "grad_norm": 0.08123382180929184, + "learning_rate": 3.151807012016742e-05, + "loss": 0.2408, + "step": 42373 + }, + { + "epoch": 3.432760855476345, + "grad_norm": 0.07305345684289932, + "learning_rate": 3.15135694675728e-05, + "loss": 0.2455, + "step": 42374 + }, + { + "epoch": 3.432841866493843, + "grad_norm": 0.0648297443985939, + "learning_rate": 3.150906881497817e-05, + "loss": 0.2104, + "step": 42375 + }, + { + "epoch": 3.4329228775113414, + "grad_norm": 
0.0737859457731247, + "learning_rate": 3.1504568162383544e-05, + "loss": 0.2275, + "step": 42376 + }, + { + "epoch": 3.43300388852884, + "grad_norm": 0.07172111421823502, + "learning_rate": 3.1500067509788924e-05, + "loss": 0.2426, + "step": 42377 + }, + { + "epoch": 3.4330848995463383, + "grad_norm": 0.06762681901454926, + "learning_rate": 3.14955668571943e-05, + "loss": 0.2571, + "step": 42378 + }, + { + "epoch": 3.4331659105638366, + "grad_norm": 0.07657989114522934, + "learning_rate": 3.1491066204599664e-05, + "loss": 0.1949, + "step": 42379 + }, + { + "epoch": 3.4332469215813353, + "grad_norm": 0.07408314198255539, + "learning_rate": 3.1486565552005045e-05, + "loss": 0.2221, + "step": 42380 + }, + { + "epoch": 3.4333279325988335, + "grad_norm": 0.07041918486356735, + "learning_rate": 3.148206489941042e-05, + "loss": 0.2076, + "step": 42381 + }, + { + "epoch": 3.4334089436163318, + "grad_norm": 0.0844322219491005, + "learning_rate": 3.1477564246815785e-05, + "loss": 0.225, + "step": 42382 + }, + { + "epoch": 3.4334899546338304, + "grad_norm": 0.0668899193406105, + "learning_rate": 3.1473063594221166e-05, + "loss": 0.2101, + "step": 42383 + }, + { + "epoch": 3.4335709656513287, + "grad_norm": 0.08455423265695572, + "learning_rate": 3.146856294162654e-05, + "loss": 0.2501, + "step": 42384 + }, + { + "epoch": 3.433651976668827, + "grad_norm": 0.07682643830776215, + "learning_rate": 3.1464062289031906e-05, + "loss": 0.2301, + "step": 42385 + }, + { + "epoch": 3.433732987686325, + "grad_norm": 0.08980892598628998, + "learning_rate": 3.1459561636437286e-05, + "loss": 0.2484, + "step": 42386 + }, + { + "epoch": 3.433813998703824, + "grad_norm": 0.07740815728902817, + "learning_rate": 3.145506098384266e-05, + "loss": 0.2276, + "step": 42387 + }, + { + "epoch": 3.433895009721322, + "grad_norm": 0.0819552093744278, + "learning_rate": 3.1450560331248034e-05, + "loss": 0.2466, + "step": 42388 + }, + { + "epoch": 3.4339760207388204, + "grad_norm": 0.06842090934515, + "learning_rate": 3.144605967865341e-05, + "loss": 0.2285, + "step": 42389 + }, + { + "epoch": 3.434057031756319, + "grad_norm": 0.08851540088653564, + "learning_rate": 3.144155902605878e-05, + "loss": 0.2125, + "step": 42390 + }, + { + "epoch": 3.4341380427738173, + "grad_norm": 0.07082171738147736, + "learning_rate": 3.1437058373464154e-05, + "loss": 0.2133, + "step": 42391 + }, + { + "epoch": 3.4342190537913155, + "grad_norm": 0.06632804870605469, + "learning_rate": 3.143255772086953e-05, + "loss": 0.2391, + "step": 42392 + }, + { + "epoch": 3.434300064808814, + "grad_norm": 0.08337467908859253, + "learning_rate": 3.14280570682749e-05, + "loss": 0.2033, + "step": 42393 + }, + { + "epoch": 3.4343810758263125, + "grad_norm": 0.07331390678882599, + "learning_rate": 3.1423556415680275e-05, + "loss": 0.2282, + "step": 42394 + }, + { + "epoch": 3.4344620868438107, + "grad_norm": 0.09076662361621857, + "learning_rate": 3.141905576308565e-05, + "loss": 0.265, + "step": 42395 + }, + { + "epoch": 3.434543097861309, + "grad_norm": 0.07595141977071762, + "learning_rate": 3.141455511049102e-05, + "loss": 0.2433, + "step": 42396 + }, + { + "epoch": 3.4346241088788076, + "grad_norm": 0.06144767627120018, + "learning_rate": 3.1410054457896396e-05, + "loss": 0.2033, + "step": 42397 + }, + { + "epoch": 3.434705119896306, + "grad_norm": 0.08792851865291595, + "learning_rate": 3.140555380530177e-05, + "loss": 0.2632, + "step": 42398 + }, + { + "epoch": 3.434786130913804, + "grad_norm": 0.05918162316083908, + "learning_rate": 3.140105315270714e-05, + 
"loss": 0.2102, + "step": 42399 + }, + { + "epoch": 3.434867141931303, + "grad_norm": 0.0852525532245636, + "learning_rate": 3.139655250011252e-05, + "loss": 0.2373, + "step": 42400 + }, + { + "epoch": 3.434948152948801, + "grad_norm": 0.07010507583618164, + "learning_rate": 3.139205184751789e-05, + "loss": 0.2158, + "step": 42401 + }, + { + "epoch": 3.4350291639662993, + "grad_norm": 0.07169964164495468, + "learning_rate": 3.1387551194923264e-05, + "loss": 0.215, + "step": 42402 + }, + { + "epoch": 3.435110174983798, + "grad_norm": 0.0834985077381134, + "learning_rate": 3.138305054232864e-05, + "loss": 0.2279, + "step": 42403 + }, + { + "epoch": 3.4351911860012962, + "grad_norm": 0.07278002798557281, + "learning_rate": 3.137854988973401e-05, + "loss": 0.2085, + "step": 42404 + }, + { + "epoch": 3.4352721970187945, + "grad_norm": 0.062233611941337585, + "learning_rate": 3.1374049237139385e-05, + "loss": 0.2187, + "step": 42405 + }, + { + "epoch": 3.435353208036293, + "grad_norm": 0.08605244755744934, + "learning_rate": 3.136954858454476e-05, + "loss": 0.2261, + "step": 42406 + }, + { + "epoch": 3.4354342190537914, + "grad_norm": 0.09702229499816895, + "learning_rate": 3.136504793195014e-05, + "loss": 0.2434, + "step": 42407 + }, + { + "epoch": 3.4355152300712897, + "grad_norm": 0.06307677924633026, + "learning_rate": 3.1360547279355506e-05, + "loss": 0.213, + "step": 42408 + }, + { + "epoch": 3.435596241088788, + "grad_norm": 0.06760883331298828, + "learning_rate": 3.135604662676088e-05, + "loss": 0.1945, + "step": 42409 + }, + { + "epoch": 3.4356772521062866, + "grad_norm": 0.06725229322910309, + "learning_rate": 3.135154597416626e-05, + "loss": 0.2248, + "step": 42410 + }, + { + "epoch": 3.435758263123785, + "grad_norm": 0.07328040897846222, + "learning_rate": 3.1347045321571626e-05, + "loss": 0.2482, + "step": 42411 + }, + { + "epoch": 3.435839274141283, + "grad_norm": 0.07860074937343597, + "learning_rate": 3.1342544668977e-05, + "loss": 0.1963, + "step": 42412 + }, + { + "epoch": 3.435920285158782, + "grad_norm": 0.07504335045814514, + "learning_rate": 3.133804401638238e-05, + "loss": 0.2351, + "step": 42413 + }, + { + "epoch": 3.43600129617628, + "grad_norm": 0.06371016055345535, + "learning_rate": 3.133354336378775e-05, + "loss": 0.2212, + "step": 42414 + }, + { + "epoch": 3.4360823071937783, + "grad_norm": 0.06477545201778412, + "learning_rate": 3.132904271119312e-05, + "loss": 0.1911, + "step": 42415 + }, + { + "epoch": 3.4361633182112765, + "grad_norm": 0.07808870077133179, + "learning_rate": 3.13245420585985e-05, + "loss": 0.2528, + "step": 42416 + }, + { + "epoch": 3.436244329228775, + "grad_norm": 0.06825084239244461, + "learning_rate": 3.132004140600387e-05, + "loss": 0.2441, + "step": 42417 + }, + { + "epoch": 3.4363253402462735, + "grad_norm": 0.07942578196525574, + "learning_rate": 3.131554075340924e-05, + "loss": 0.2501, + "step": 42418 + }, + { + "epoch": 3.4364063512637717, + "grad_norm": 0.07507924735546112, + "learning_rate": 3.131104010081462e-05, + "loss": 0.2097, + "step": 42419 + }, + { + "epoch": 3.4364873622812704, + "grad_norm": 0.08393153548240662, + "learning_rate": 3.1306539448219996e-05, + "loss": 0.2316, + "step": 42420 + }, + { + "epoch": 3.4365683732987686, + "grad_norm": 0.07208391278982162, + "learning_rate": 3.130203879562536e-05, + "loss": 0.2284, + "step": 42421 + }, + { + "epoch": 3.436649384316267, + "grad_norm": 0.0677122250199318, + "learning_rate": 3.129753814303074e-05, + "loss": 0.2468, + "step": 42422 + }, + { + "epoch": 3.4367303953337656, 
+ "grad_norm": 0.07315247505903244, + "learning_rate": 3.1293037490436116e-05, + "loss": 0.2323, + "step": 42423 + }, + { + "epoch": 3.436811406351264, + "grad_norm": 0.07844259589910507, + "learning_rate": 3.128853683784149e-05, + "loss": 0.2493, + "step": 42424 + }, + { + "epoch": 3.436892417368762, + "grad_norm": 0.09640884399414062, + "learning_rate": 3.1284036185246864e-05, + "loss": 0.2681, + "step": 42425 + }, + { + "epoch": 3.4369734283862607, + "grad_norm": 0.07181743532419205, + "learning_rate": 3.127953553265224e-05, + "loss": 0.2238, + "step": 42426 + }, + { + "epoch": 3.437054439403759, + "grad_norm": 0.07064533233642578, + "learning_rate": 3.127503488005761e-05, + "loss": 0.2211, + "step": 42427 + }, + { + "epoch": 3.4371354504212572, + "grad_norm": 0.06328742206096649, + "learning_rate": 3.1270534227462984e-05, + "loss": 0.2169, + "step": 42428 + }, + { + "epoch": 3.4372164614387555, + "grad_norm": 0.08524394035339355, + "learning_rate": 3.126603357486836e-05, + "loss": 0.2512, + "step": 42429 + }, + { + "epoch": 3.437297472456254, + "grad_norm": 0.07788166403770447, + "learning_rate": 3.126153292227373e-05, + "loss": 0.1988, + "step": 42430 + }, + { + "epoch": 3.4373784834737524, + "grad_norm": 0.07452834397554398, + "learning_rate": 3.1257032269679105e-05, + "loss": 0.229, + "step": 42431 + }, + { + "epoch": 3.4374594944912507, + "grad_norm": 0.06433499604463577, + "learning_rate": 3.125253161708448e-05, + "loss": 0.2086, + "step": 42432 + }, + { + "epoch": 3.4375405055087493, + "grad_norm": 0.07874725759029388, + "learning_rate": 3.124803096448985e-05, + "loss": 0.2252, + "step": 42433 + }, + { + "epoch": 3.4376215165262476, + "grad_norm": 0.06409906595945358, + "learning_rate": 3.1243530311895226e-05, + "loss": 0.1908, + "step": 42434 + }, + { + "epoch": 3.437702527543746, + "grad_norm": 0.07005061954259872, + "learning_rate": 3.12390296593006e-05, + "loss": 0.2266, + "step": 42435 + }, + { + "epoch": 3.4377835385612445, + "grad_norm": 0.0679619088768959, + "learning_rate": 3.123452900670597e-05, + "loss": 0.1863, + "step": 42436 + }, + { + "epoch": 3.4378645495787428, + "grad_norm": 0.07140480726957321, + "learning_rate": 3.123002835411135e-05, + "loss": 0.2588, + "step": 42437 + }, + { + "epoch": 3.437945560596241, + "grad_norm": 0.07242782413959503, + "learning_rate": 3.122552770151672e-05, + "loss": 0.2201, + "step": 42438 + }, + { + "epoch": 3.4380265716137393, + "grad_norm": 0.05919842794537544, + "learning_rate": 3.1221027048922094e-05, + "loss": 0.251, + "step": 42439 + }, + { + "epoch": 3.438107582631238, + "grad_norm": 0.08086790144443512, + "learning_rate": 3.121652639632747e-05, + "loss": 0.2448, + "step": 42440 + }, + { + "epoch": 3.438188593648736, + "grad_norm": 0.0732114315032959, + "learning_rate": 3.121202574373284e-05, + "loss": 0.2167, + "step": 42441 + }, + { + "epoch": 3.4382696046662344, + "grad_norm": 0.07175198942422867, + "learning_rate": 3.1207525091138215e-05, + "loss": 0.1999, + "step": 42442 + }, + { + "epoch": 3.438350615683733, + "grad_norm": 0.0730404332280159, + "learning_rate": 3.120302443854359e-05, + "loss": 0.2388, + "step": 42443 + }, + { + "epoch": 3.4384316267012314, + "grad_norm": 0.07166948169469833, + "learning_rate": 3.119852378594896e-05, + "loss": 0.2328, + "step": 42444 + }, + { + "epoch": 3.4385126377187296, + "grad_norm": 0.0726485624909401, + "learning_rate": 3.1194023133354336e-05, + "loss": 0.2126, + "step": 42445 + }, + { + "epoch": 3.4385936487362283, + "grad_norm": 0.09775371849536896, + "learning_rate": 
3.1189522480759716e-05, + "loss": 0.2205, + "step": 42446 + }, + { + "epoch": 3.4386746597537265, + "grad_norm": 0.07552385330200195, + "learning_rate": 3.118502182816508e-05, + "loss": 0.2254, + "step": 42447 + }, + { + "epoch": 3.438755670771225, + "grad_norm": 0.06823941320180893, + "learning_rate": 3.1180521175570456e-05, + "loss": 0.2337, + "step": 42448 + }, + { + "epoch": 3.4388366817887235, + "grad_norm": 0.06638476252555847, + "learning_rate": 3.117602052297584e-05, + "loss": 0.2607, + "step": 42449 + }, + { + "epoch": 3.4389176928062217, + "grad_norm": 0.06269118934869766, + "learning_rate": 3.1171519870381204e-05, + "loss": 0.194, + "step": 42450 + }, + { + "epoch": 3.43899870382372, + "grad_norm": 0.07186885923147202, + "learning_rate": 3.116701921778658e-05, + "loss": 0.209, + "step": 42451 + }, + { + "epoch": 3.439079714841218, + "grad_norm": 0.06401878595352173, + "learning_rate": 3.116251856519196e-05, + "loss": 0.2364, + "step": 42452 + }, + { + "epoch": 3.439160725858717, + "grad_norm": 0.08331364393234253, + "learning_rate": 3.1158017912597325e-05, + "loss": 0.2699, + "step": 42453 + }, + { + "epoch": 3.439241736876215, + "grad_norm": 0.07670908421278, + "learning_rate": 3.11535172600027e-05, + "loss": 0.2253, + "step": 42454 + }, + { + "epoch": 3.4393227478937134, + "grad_norm": 0.06328018009662628, + "learning_rate": 3.114901660740808e-05, + "loss": 0.2341, + "step": 42455 + }, + { + "epoch": 3.439403758911212, + "grad_norm": 0.06630222499370575, + "learning_rate": 3.1144515954813445e-05, + "loss": 0.2109, + "step": 42456 + }, + { + "epoch": 3.4394847699287103, + "grad_norm": 0.07363289594650269, + "learning_rate": 3.1140015302218826e-05, + "loss": 0.258, + "step": 42457 + }, + { + "epoch": 3.4395657809462086, + "grad_norm": 0.07480932027101517, + "learning_rate": 3.11355146496242e-05, + "loss": 0.208, + "step": 42458 + }, + { + "epoch": 3.439646791963707, + "grad_norm": 0.07937634736299515, + "learning_rate": 3.113101399702957e-05, + "loss": 0.2327, + "step": 42459 + }, + { + "epoch": 3.4397278029812055, + "grad_norm": 0.07142050564289093, + "learning_rate": 3.1126513344434947e-05, + "loss": 0.216, + "step": 42460 + }, + { + "epoch": 3.4398088139987038, + "grad_norm": 0.09153217822313309, + "learning_rate": 3.112201269184032e-05, + "loss": 0.2545, + "step": 42461 + }, + { + "epoch": 3.439889825016202, + "grad_norm": 0.07688187062740326, + "learning_rate": 3.1117512039245694e-05, + "loss": 0.2388, + "step": 42462 + }, + { + "epoch": 3.4399708360337007, + "grad_norm": 0.07844226807355881, + "learning_rate": 3.111301138665107e-05, + "loss": 0.2126, + "step": 42463 + }, + { + "epoch": 3.440051847051199, + "grad_norm": 0.06151213496923447, + "learning_rate": 3.110851073405644e-05, + "loss": 0.2331, + "step": 42464 + }, + { + "epoch": 3.440132858068697, + "grad_norm": 0.06838814169168472, + "learning_rate": 3.1104010081461815e-05, + "loss": 0.2185, + "step": 42465 + }, + { + "epoch": 3.440213869086196, + "grad_norm": 0.09179142862558365, + "learning_rate": 3.109950942886719e-05, + "loss": 0.2498, + "step": 42466 + }, + { + "epoch": 3.440294880103694, + "grad_norm": 0.089014932513237, + "learning_rate": 3.109500877627256e-05, + "loss": 0.271, + "step": 42467 + }, + { + "epoch": 3.4403758911211924, + "grad_norm": 0.07677599787712097, + "learning_rate": 3.1090508123677935e-05, + "loss": 0.2306, + "step": 42468 + }, + { + "epoch": 3.440456902138691, + "grad_norm": 0.0653160959482193, + "learning_rate": 3.108600747108331e-05, + "loss": 0.2216, + "step": 42469 + }, + { + "epoch": 
3.4405379131561893, + "grad_norm": 0.06036091968417168, + "learning_rate": 3.108150681848868e-05, + "loss": 0.2405, + "step": 42470 + }, + { + "epoch": 3.4406189241736875, + "grad_norm": 0.0778571143746376, + "learning_rate": 3.1077006165894056e-05, + "loss": 0.2298, + "step": 42471 + }, + { + "epoch": 3.440699935191186, + "grad_norm": 0.07820609211921692, + "learning_rate": 3.107250551329943e-05, + "loss": 0.2263, + "step": 42472 + }, + { + "epoch": 3.4407809462086845, + "grad_norm": 0.07398509234189987, + "learning_rate": 3.10680048607048e-05, + "loss": 0.238, + "step": 42473 + }, + { + "epoch": 3.4408619572261827, + "grad_norm": 0.06346040219068527, + "learning_rate": 3.106350420811018e-05, + "loss": 0.1943, + "step": 42474 + }, + { + "epoch": 3.440942968243681, + "grad_norm": 0.08498956263065338, + "learning_rate": 3.105900355551555e-05, + "loss": 0.242, + "step": 42475 + }, + { + "epoch": 3.4410239792611796, + "grad_norm": 0.08488177508115768, + "learning_rate": 3.1054502902920924e-05, + "loss": 0.2226, + "step": 42476 + }, + { + "epoch": 3.441104990278678, + "grad_norm": 0.08173447102308273, + "learning_rate": 3.10500022503263e-05, + "loss": 0.2493, + "step": 42477 + }, + { + "epoch": 3.441186001296176, + "grad_norm": 0.08201484382152557, + "learning_rate": 3.104550159773167e-05, + "loss": 0.2469, + "step": 42478 + }, + { + "epoch": 3.441267012313675, + "grad_norm": 0.06878629326820374, + "learning_rate": 3.1041000945137045e-05, + "loss": 0.2157, + "step": 42479 + }, + { + "epoch": 3.441348023331173, + "grad_norm": 0.08908562362194061, + "learning_rate": 3.103650029254242e-05, + "loss": 0.2142, + "step": 42480 + }, + { + "epoch": 3.4414290343486713, + "grad_norm": 0.06992624700069427, + "learning_rate": 3.103199963994779e-05, + "loss": 0.2487, + "step": 42481 + }, + { + "epoch": 3.4415100453661696, + "grad_norm": 0.08812936395406723, + "learning_rate": 3.1027498987353166e-05, + "loss": 0.2578, + "step": 42482 + }, + { + "epoch": 3.4415910563836682, + "grad_norm": 0.05788000300526619, + "learning_rate": 3.102299833475854e-05, + "loss": 0.212, + "step": 42483 + }, + { + "epoch": 3.4416720674011665, + "grad_norm": 0.07030339539051056, + "learning_rate": 3.101849768216391e-05, + "loss": 0.2193, + "step": 42484 + }, + { + "epoch": 3.4417530784186647, + "grad_norm": 0.07213641703128815, + "learning_rate": 3.101399702956929e-05, + "loss": 0.2363, + "step": 42485 + }, + { + "epoch": 3.4418340894361634, + "grad_norm": 0.07877527922391891, + "learning_rate": 3.100949637697466e-05, + "loss": 0.2461, + "step": 42486 + }, + { + "epoch": 3.4419151004536617, + "grad_norm": 0.07526003569364548, + "learning_rate": 3.1004995724380034e-05, + "loss": 0.2308, + "step": 42487 + }, + { + "epoch": 3.44199611147116, + "grad_norm": 0.05849730223417282, + "learning_rate": 3.1000495071785414e-05, + "loss": 0.2031, + "step": 42488 + }, + { + "epoch": 3.4420771224886586, + "grad_norm": 0.08493011444807053, + "learning_rate": 3.099599441919078e-05, + "loss": 0.2673, + "step": 42489 + }, + { + "epoch": 3.442158133506157, + "grad_norm": 0.07385018467903137, + "learning_rate": 3.0991493766596155e-05, + "loss": 0.2149, + "step": 42490 + }, + { + "epoch": 3.442239144523655, + "grad_norm": 0.08337543904781342, + "learning_rate": 3.0986993114001535e-05, + "loss": 0.2323, + "step": 42491 + }, + { + "epoch": 3.442320155541154, + "grad_norm": 0.0729101151227951, + "learning_rate": 3.09824924614069e-05, + "loss": 0.2447, + "step": 42492 + }, + { + "epoch": 3.442401166558652, + "grad_norm": 0.0771954283118248, + 
"learning_rate": 3.097799180881228e-05, + "loss": 0.2047, + "step": 42493 + }, + { + "epoch": 3.4424821775761503, + "grad_norm": 0.06417211145162582, + "learning_rate": 3.0973491156217656e-05, + "loss": 0.1951, + "step": 42494 + }, + { + "epoch": 3.442563188593649, + "grad_norm": 0.07637476921081543, + "learning_rate": 3.096899050362302e-05, + "loss": 0.2544, + "step": 42495 + }, + { + "epoch": 3.442644199611147, + "grad_norm": 0.08546575158834457, + "learning_rate": 3.09644898510284e-05, + "loss": 0.2199, + "step": 42496 + }, + { + "epoch": 3.4427252106286454, + "grad_norm": 0.074388287961483, + "learning_rate": 3.095998919843378e-05, + "loss": 0.2207, + "step": 42497 + }, + { + "epoch": 3.4428062216461437, + "grad_norm": 0.07267254590988159, + "learning_rate": 3.095548854583915e-05, + "loss": 0.2217, + "step": 42498 + }, + { + "epoch": 3.4428872326636424, + "grad_norm": 0.06971371173858643, + "learning_rate": 3.0950987893244524e-05, + "loss": 0.2058, + "step": 42499 + }, + { + "epoch": 3.4429682436811406, + "grad_norm": 0.06479272991418839, + "learning_rate": 3.09464872406499e-05, + "loss": 0.23, + "step": 42500 + }, + { + "epoch": 3.443049254698639, + "grad_norm": 0.06856130063533783, + "learning_rate": 3.094198658805527e-05, + "loss": 0.2244, + "step": 42501 + }, + { + "epoch": 3.4431302657161376, + "grad_norm": 0.070685975253582, + "learning_rate": 3.0937485935460645e-05, + "loss": 0.2489, + "step": 42502 + }, + { + "epoch": 3.443211276733636, + "grad_norm": 0.07081031054258347, + "learning_rate": 3.093298528286602e-05, + "loss": 0.2575, + "step": 42503 + }, + { + "epoch": 3.443292287751134, + "grad_norm": 0.08072758466005325, + "learning_rate": 3.092848463027139e-05, + "loss": 0.2277, + "step": 42504 + }, + { + "epoch": 3.4433732987686323, + "grad_norm": 0.05815386399626732, + "learning_rate": 3.0923983977676765e-05, + "loss": 0.2051, + "step": 42505 + }, + { + "epoch": 3.443454309786131, + "grad_norm": 0.07763691991567612, + "learning_rate": 3.091948332508214e-05, + "loss": 0.2027, + "step": 42506 + }, + { + "epoch": 3.4435353208036292, + "grad_norm": 0.0745823010802269, + "learning_rate": 3.091498267248751e-05, + "loss": 0.2735, + "step": 42507 + }, + { + "epoch": 3.4436163318211275, + "grad_norm": 0.07333028316497803, + "learning_rate": 3.0910482019892886e-05, + "loss": 0.218, + "step": 42508 + }, + { + "epoch": 3.443697342838626, + "grad_norm": 0.08793089538812637, + "learning_rate": 3.090598136729826e-05, + "loss": 0.2198, + "step": 42509 + }, + { + "epoch": 3.4437783538561244, + "grad_norm": 0.06536927819252014, + "learning_rate": 3.0901480714703633e-05, + "loss": 0.232, + "step": 42510 + }, + { + "epoch": 3.4438593648736227, + "grad_norm": 0.07713781297206879, + "learning_rate": 3.089698006210901e-05, + "loss": 0.1961, + "step": 42511 + }, + { + "epoch": 3.4439403758911213, + "grad_norm": 0.08600252866744995, + "learning_rate": 3.089247940951438e-05, + "loss": 0.2401, + "step": 42512 + }, + { + "epoch": 3.4440213869086196, + "grad_norm": 0.0582621768116951, + "learning_rate": 3.0887978756919754e-05, + "loss": 0.2505, + "step": 42513 + }, + { + "epoch": 3.444102397926118, + "grad_norm": 0.06772822141647339, + "learning_rate": 3.088347810432513e-05, + "loss": 0.2127, + "step": 42514 + }, + { + "epoch": 3.4441834089436165, + "grad_norm": 0.08750282973051071, + "learning_rate": 3.08789774517305e-05, + "loss": 0.2538, + "step": 42515 + }, + { + "epoch": 3.4442644199611148, + "grad_norm": 0.07079533487558365, + "learning_rate": 3.0874476799135875e-05, + "loss": 0.25, + "step": 42516 
+ }, + { + "epoch": 3.444345430978613, + "grad_norm": 0.08859393000602722, + "learning_rate": 3.086997614654125e-05, + "loss": 0.221, + "step": 42517 + }, + { + "epoch": 3.4444264419961117, + "grad_norm": 0.07913894951343536, + "learning_rate": 3.086547549394662e-05, + "loss": 0.1933, + "step": 42518 + }, + { + "epoch": 3.44450745301361, + "grad_norm": 0.06824357062578201, + "learning_rate": 3.0860974841351996e-05, + "loss": 0.1972, + "step": 42519 + }, + { + "epoch": 3.444588464031108, + "grad_norm": 0.07941274344921112, + "learning_rate": 3.085647418875737e-05, + "loss": 0.2063, + "step": 42520 + }, + { + "epoch": 3.4446694750486064, + "grad_norm": 0.06322202831506729, + "learning_rate": 3.085197353616274e-05, + "loss": 0.2167, + "step": 42521 + }, + { + "epoch": 3.444750486066105, + "grad_norm": 0.07784758508205414, + "learning_rate": 3.084747288356812e-05, + "loss": 0.2057, + "step": 42522 + }, + { + "epoch": 3.4448314970836034, + "grad_norm": 0.07789554446935654, + "learning_rate": 3.084297223097349e-05, + "loss": 0.2247, + "step": 42523 + }, + { + "epoch": 3.4449125081011016, + "grad_norm": 0.07606974244117737, + "learning_rate": 3.083847157837887e-05, + "loss": 0.2362, + "step": 42524 + }, + { + "epoch": 3.4449935191186003, + "grad_norm": 0.05988244712352753, + "learning_rate": 3.083397092578424e-05, + "loss": 0.1871, + "step": 42525 + }, + { + "epoch": 3.4450745301360985, + "grad_norm": 0.07931521534919739, + "learning_rate": 3.082947027318962e-05, + "loss": 0.2101, + "step": 42526 + }, + { + "epoch": 3.445155541153597, + "grad_norm": 0.07285697013139725, + "learning_rate": 3.082496962059499e-05, + "loss": 0.2001, + "step": 42527 + }, + { + "epoch": 3.445236552171095, + "grad_norm": 0.07134629040956497, + "learning_rate": 3.082046896800036e-05, + "loss": 0.2359, + "step": 42528 + }, + { + "epoch": 3.4453175631885937, + "grad_norm": 0.0749007910490036, + "learning_rate": 3.081596831540574e-05, + "loss": 0.2506, + "step": 42529 + }, + { + "epoch": 3.445398574206092, + "grad_norm": 0.08527930825948715, + "learning_rate": 3.081146766281111e-05, + "loss": 0.2467, + "step": 42530 + }, + { + "epoch": 3.44547958522359, + "grad_norm": 0.08154378086328506, + "learning_rate": 3.080696701021648e-05, + "loss": 0.2353, + "step": 42531 + }, + { + "epoch": 3.445560596241089, + "grad_norm": 0.07114233076572418, + "learning_rate": 3.080246635762186e-05, + "loss": 0.2078, + "step": 42532 + }, + { + "epoch": 3.445641607258587, + "grad_norm": 0.07162083685398102, + "learning_rate": 3.079796570502723e-05, + "loss": 0.2133, + "step": 42533 + }, + { + "epoch": 3.4457226182760854, + "grad_norm": 0.0754132941365242, + "learning_rate": 3.07934650524326e-05, + "loss": 0.1714, + "step": 42534 + }, + { + "epoch": 3.445803629293584, + "grad_norm": 0.07492772489786148, + "learning_rate": 3.078896439983798e-05, + "loss": 0.2537, + "step": 42535 + }, + { + "epoch": 3.4458846403110823, + "grad_norm": 0.0607149600982666, + "learning_rate": 3.0784463747243354e-05, + "loss": 0.2311, + "step": 42536 + }, + { + "epoch": 3.4459656513285806, + "grad_norm": 0.06479800492525101, + "learning_rate": 3.077996309464873e-05, + "loss": 0.2059, + "step": 42537 + }, + { + "epoch": 3.4460466623460793, + "grad_norm": 0.07805703580379486, + "learning_rate": 3.07754624420541e-05, + "loss": 0.2178, + "step": 42538 + }, + { + "epoch": 3.4461276733635775, + "grad_norm": 0.07639947533607483, + "learning_rate": 3.0770961789459475e-05, + "loss": 0.2371, + "step": 42539 + }, + { + "epoch": 3.4462086843810757, + "grad_norm": 
0.06437902897596359, + "learning_rate": 3.076646113686485e-05, + "loss": 0.2374, + "step": 42540 + }, + { + "epoch": 3.4462896953985744, + "grad_norm": 0.06914235651493073, + "learning_rate": 3.076196048427022e-05, + "loss": 0.2035, + "step": 42541 + }, + { + "epoch": 3.4463707064160727, + "grad_norm": 0.0812096893787384, + "learning_rate": 3.0757459831675596e-05, + "loss": 0.2375, + "step": 42542 + }, + { + "epoch": 3.446451717433571, + "grad_norm": 0.07674495875835419, + "learning_rate": 3.075295917908097e-05, + "loss": 0.2185, + "step": 42543 + }, + { + "epoch": 3.446532728451069, + "grad_norm": 0.09959354251623154, + "learning_rate": 3.074845852648634e-05, + "loss": 0.2403, + "step": 42544 + }, + { + "epoch": 3.446613739468568, + "grad_norm": 0.07620353251695633, + "learning_rate": 3.0743957873891716e-05, + "loss": 0.2011, + "step": 42545 + }, + { + "epoch": 3.446694750486066, + "grad_norm": 0.07273352146148682, + "learning_rate": 3.073945722129709e-05, + "loss": 0.2212, + "step": 42546 + }, + { + "epoch": 3.4467757615035644, + "grad_norm": 0.0625937357544899, + "learning_rate": 3.0734956568702464e-05, + "loss": 0.2321, + "step": 42547 + }, + { + "epoch": 3.446856772521063, + "grad_norm": 0.06092469394207001, + "learning_rate": 3.073045591610784e-05, + "loss": 0.2243, + "step": 42548 + }, + { + "epoch": 3.4469377835385613, + "grad_norm": 0.06320256739854813, + "learning_rate": 3.072595526351321e-05, + "loss": 0.2125, + "step": 42549 + }, + { + "epoch": 3.4470187945560595, + "grad_norm": 0.07723885774612427, + "learning_rate": 3.0721454610918584e-05, + "loss": 0.1935, + "step": 42550 + }, + { + "epoch": 3.4470998055735578, + "grad_norm": 0.07916872948408127, + "learning_rate": 3.071695395832396e-05, + "loss": 0.2217, + "step": 42551 + }, + { + "epoch": 3.4471808165910565, + "grad_norm": 0.07740025222301483, + "learning_rate": 3.071245330572933e-05, + "loss": 0.2333, + "step": 42552 + }, + { + "epoch": 3.4472618276085547, + "grad_norm": 0.08805128186941147, + "learning_rate": 3.0707952653134705e-05, + "loss": 0.2696, + "step": 42553 + }, + { + "epoch": 3.447342838626053, + "grad_norm": 0.07534347474575043, + "learning_rate": 3.070345200054008e-05, + "loss": 0.2199, + "step": 42554 + }, + { + "epoch": 3.4474238496435516, + "grad_norm": 0.0770130380988121, + "learning_rate": 3.069895134794545e-05, + "loss": 0.2316, + "step": 42555 + }, + { + "epoch": 3.44750486066105, + "grad_norm": 0.07147173583507538, + "learning_rate": 3.0694450695350826e-05, + "loss": 0.2115, + "step": 42556 + }, + { + "epoch": 3.447585871678548, + "grad_norm": 0.07185516506433487, + "learning_rate": 3.06899500427562e-05, + "loss": 0.232, + "step": 42557 + }, + { + "epoch": 3.447666882696047, + "grad_norm": 0.07114926725625992, + "learning_rate": 3.068544939016157e-05, + "loss": 0.2137, + "step": 42558 + }, + { + "epoch": 3.447747893713545, + "grad_norm": 0.07756253331899643, + "learning_rate": 3.0680948737566954e-05, + "loss": 0.2644, + "step": 42559 + }, + { + "epoch": 3.4478289047310433, + "grad_norm": 0.07162553817033768, + "learning_rate": 3.067644808497232e-05, + "loss": 0.2374, + "step": 42560 + }, + { + "epoch": 3.447909915748542, + "grad_norm": 0.0673312172293663, + "learning_rate": 3.0671947432377694e-05, + "loss": 0.2171, + "step": 42561 + }, + { + "epoch": 3.4479909267660402, + "grad_norm": 0.09096536785364151, + "learning_rate": 3.0667446779783074e-05, + "loss": 0.2637, + "step": 42562 + }, + { + "epoch": 3.4480719377835385, + "grad_norm": 0.0750901997089386, + "learning_rate": 3.066294612718844e-05, + 
"loss": 0.2274, + "step": 42563 + }, + { + "epoch": 3.448152948801037, + "grad_norm": 0.06524889916181564, + "learning_rate": 3.0658445474593815e-05, + "loss": 0.2416, + "step": 42564 + }, + { + "epoch": 3.4482339598185354, + "grad_norm": 0.08983972668647766, + "learning_rate": 3.0653944821999195e-05, + "loss": 0.2537, + "step": 42565 + }, + { + "epoch": 3.4483149708360337, + "grad_norm": 0.06232641637325287, + "learning_rate": 3.064944416940457e-05, + "loss": 0.1841, + "step": 42566 + }, + { + "epoch": 3.448395981853532, + "grad_norm": 0.06145894527435303, + "learning_rate": 3.0644943516809936e-05, + "loss": 0.1862, + "step": 42567 + }, + { + "epoch": 3.4484769928710306, + "grad_norm": 0.054291676729917526, + "learning_rate": 3.0640442864215316e-05, + "loss": 0.2503, + "step": 42568 + }, + { + "epoch": 3.448558003888529, + "grad_norm": 0.06751246005296707, + "learning_rate": 3.063594221162069e-05, + "loss": 0.2346, + "step": 42569 + }, + { + "epoch": 3.448639014906027, + "grad_norm": 0.0678846463561058, + "learning_rate": 3.0631441559026056e-05, + "loss": 0.2143, + "step": 42570 + }, + { + "epoch": 3.448720025923526, + "grad_norm": 0.07107531279325485, + "learning_rate": 3.062694090643144e-05, + "loss": 0.2345, + "step": 42571 + }, + { + "epoch": 3.448801036941024, + "grad_norm": 0.08533624559640884, + "learning_rate": 3.062244025383681e-05, + "loss": 0.2227, + "step": 42572 + }, + { + "epoch": 3.4488820479585223, + "grad_norm": 0.07481777667999268, + "learning_rate": 3.061793960124218e-05, + "loss": 0.3018, + "step": 42573 + }, + { + "epoch": 3.4489630589760205, + "grad_norm": 0.07119978964328766, + "learning_rate": 3.061343894864756e-05, + "loss": 0.2134, + "step": 42574 + }, + { + "epoch": 3.449044069993519, + "grad_norm": 0.06313995271921158, + "learning_rate": 3.060893829605293e-05, + "loss": 0.2476, + "step": 42575 + }, + { + "epoch": 3.4491250810110174, + "grad_norm": 0.086004838347435, + "learning_rate": 3.06044376434583e-05, + "loss": 0.2488, + "step": 42576 + }, + { + "epoch": 3.4492060920285157, + "grad_norm": 0.08241745084524155, + "learning_rate": 3.059993699086368e-05, + "loss": 0.2519, + "step": 42577 + }, + { + "epoch": 3.4492871030460144, + "grad_norm": 0.08791528642177582, + "learning_rate": 3.059543633826905e-05, + "loss": 0.2179, + "step": 42578 + }, + { + "epoch": 3.4493681140635126, + "grad_norm": 0.06407441943883896, + "learning_rate": 3.0590935685674426e-05, + "loss": 0.2079, + "step": 42579 + }, + { + "epoch": 3.449449125081011, + "grad_norm": 0.06448465585708618, + "learning_rate": 3.05864350330798e-05, + "loss": 0.1946, + "step": 42580 + }, + { + "epoch": 3.4495301360985096, + "grad_norm": 0.06738763302564621, + "learning_rate": 3.058193438048517e-05, + "loss": 0.2527, + "step": 42581 + }, + { + "epoch": 3.449611147116008, + "grad_norm": 0.07119040191173553, + "learning_rate": 3.0577433727890546e-05, + "loss": 0.2128, + "step": 42582 + }, + { + "epoch": 3.449692158133506, + "grad_norm": 0.08521244674921036, + "learning_rate": 3.057293307529592e-05, + "loss": 0.2353, + "step": 42583 + }, + { + "epoch": 3.4497731691510047, + "grad_norm": 0.056287288665771484, + "learning_rate": 3.0568432422701294e-05, + "loss": 0.1935, + "step": 42584 + }, + { + "epoch": 3.449854180168503, + "grad_norm": 0.07885636389255524, + "learning_rate": 3.056393177010667e-05, + "loss": 0.2544, + "step": 42585 + }, + { + "epoch": 3.4499351911860012, + "grad_norm": 0.07785996794700623, + "learning_rate": 3.055943111751204e-05, + "loss": 0.2056, + "step": 42586 + }, + { + "epoch": 
3.4500162022035, + "grad_norm": 0.07002022117376328, + "learning_rate": 3.0554930464917414e-05, + "loss": 0.254, + "step": 42587 + }, + { + "epoch": 3.450097213220998, + "grad_norm": 0.0954364538192749, + "learning_rate": 3.055042981232279e-05, + "loss": 0.2299, + "step": 42588 + }, + { + "epoch": 3.4501782242384964, + "grad_norm": 0.07318907231092453, + "learning_rate": 3.054592915972816e-05, + "loss": 0.2806, + "step": 42589 + }, + { + "epoch": 3.4502592352559946, + "grad_norm": 0.08387971669435501, + "learning_rate": 3.0541428507133535e-05, + "loss": 0.2068, + "step": 42590 + }, + { + "epoch": 3.4503402462734933, + "grad_norm": 0.06505469977855682, + "learning_rate": 3.053692785453891e-05, + "loss": 0.2197, + "step": 42591 + }, + { + "epoch": 3.4504212572909916, + "grad_norm": 0.06631197035312653, + "learning_rate": 3.053242720194428e-05, + "loss": 0.2046, + "step": 42592 + }, + { + "epoch": 3.45050226830849, + "grad_norm": 0.06794437021017075, + "learning_rate": 3.0527926549349656e-05, + "loss": 0.2153, + "step": 42593 + }, + { + "epoch": 3.4505832793259885, + "grad_norm": 0.07331722229719162, + "learning_rate": 3.052342589675503e-05, + "loss": 0.2154, + "step": 42594 + }, + { + "epoch": 3.4506642903434868, + "grad_norm": 0.07726750522851944, + "learning_rate": 3.051892524416041e-05, + "loss": 0.21, + "step": 42595 + }, + { + "epoch": 3.450745301360985, + "grad_norm": 0.06050015985965729, + "learning_rate": 3.0514424591565777e-05, + "loss": 0.1939, + "step": 42596 + }, + { + "epoch": 3.4508263123784833, + "grad_norm": 0.10080495476722717, + "learning_rate": 3.0509923938971154e-05, + "loss": 0.2516, + "step": 42597 + }, + { + "epoch": 3.450907323395982, + "grad_norm": 0.07157032191753387, + "learning_rate": 3.0505423286376527e-05, + "loss": 0.1925, + "step": 42598 + }, + { + "epoch": 3.45098833441348, + "grad_norm": 0.05965049937367439, + "learning_rate": 3.0500922633781898e-05, + "loss": 0.2142, + "step": 42599 + }, + { + "epoch": 3.4510693454309784, + "grad_norm": 0.08102771639823914, + "learning_rate": 3.0496421981187275e-05, + "loss": 0.2468, + "step": 42600 + }, + { + "epoch": 3.451150356448477, + "grad_norm": 0.07460810244083405, + "learning_rate": 3.0491921328592648e-05, + "loss": 0.2373, + "step": 42601 + }, + { + "epoch": 3.4512313674659754, + "grad_norm": 0.08648321032524109, + "learning_rate": 3.048742067599802e-05, + "loss": 0.2106, + "step": 42602 + }, + { + "epoch": 3.4513123784834736, + "grad_norm": 0.07186142355203629, + "learning_rate": 3.0482920023403396e-05, + "loss": 0.2229, + "step": 42603 + }, + { + "epoch": 3.4513933895009723, + "grad_norm": 0.06909585744142532, + "learning_rate": 3.047841937080877e-05, + "loss": 0.1977, + "step": 42604 + }, + { + "epoch": 3.4514744005184705, + "grad_norm": 0.08842332661151886, + "learning_rate": 3.0473918718214146e-05, + "loss": 0.2428, + "step": 42605 + }, + { + "epoch": 3.451555411535969, + "grad_norm": 0.07989372313022614, + "learning_rate": 3.0469418065619516e-05, + "loss": 0.2748, + "step": 42606 + }, + { + "epoch": 3.4516364225534675, + "grad_norm": 0.06668754667043686, + "learning_rate": 3.046491741302489e-05, + "loss": 0.2262, + "step": 42607 + }, + { + "epoch": 3.4517174335709657, + "grad_norm": 0.06295004487037659, + "learning_rate": 3.0460416760430267e-05, + "loss": 0.2197, + "step": 42608 + }, + { + "epoch": 3.451798444588464, + "grad_norm": 0.08387071639299393, + "learning_rate": 3.0455916107835637e-05, + "loss": 0.2375, + "step": 42609 + }, + { + "epoch": 3.4518794556059627, + "grad_norm": 0.07910163700580597, + 
"learning_rate": 3.045141545524101e-05, + "loss": 0.2017, + "step": 42610 + }, + { + "epoch": 3.451960466623461, + "grad_norm": 0.07617625594139099, + "learning_rate": 3.0446914802646388e-05, + "loss": 0.2246, + "step": 42611 + }, + { + "epoch": 3.452041477640959, + "grad_norm": 0.07298461347818375, + "learning_rate": 3.0442414150051758e-05, + "loss": 0.2318, + "step": 42612 + }, + { + "epoch": 3.4521224886584574, + "grad_norm": 0.07805348932743073, + "learning_rate": 3.043791349745713e-05, + "loss": 0.2166, + "step": 42613 + }, + { + "epoch": 3.452203499675956, + "grad_norm": 0.07293111830949783, + "learning_rate": 3.043341284486251e-05, + "loss": 0.2259, + "step": 42614 + }, + { + "epoch": 3.4522845106934543, + "grad_norm": 0.08659076690673828, + "learning_rate": 3.042891219226788e-05, + "loss": 0.1841, + "step": 42615 + }, + { + "epoch": 3.4523655217109526, + "grad_norm": 0.07256795465946198, + "learning_rate": 3.0424411539673252e-05, + "loss": 0.2487, + "step": 42616 + }, + { + "epoch": 3.4524465327284513, + "grad_norm": 0.07146996259689331, + "learning_rate": 3.041991088707863e-05, + "loss": 0.2341, + "step": 42617 + }, + { + "epoch": 3.4525275437459495, + "grad_norm": 0.06735842674970627, + "learning_rate": 3.0415410234484003e-05, + "loss": 0.2418, + "step": 42618 + }, + { + "epoch": 3.4526085547634477, + "grad_norm": 0.08807352930307388, + "learning_rate": 3.0410909581889373e-05, + "loss": 0.232, + "step": 42619 + }, + { + "epoch": 3.452689565780946, + "grad_norm": 0.07156084477901459, + "learning_rate": 3.040640892929475e-05, + "loss": 0.2337, + "step": 42620 + }, + { + "epoch": 3.4527705767984447, + "grad_norm": 0.06210574135184288, + "learning_rate": 3.0401908276700124e-05, + "loss": 0.2258, + "step": 42621 + }, + { + "epoch": 3.452851587815943, + "grad_norm": 0.0830637589097023, + "learning_rate": 3.0397407624105494e-05, + "loss": 0.1914, + "step": 42622 + }, + { + "epoch": 3.452932598833441, + "grad_norm": 0.0659496858716011, + "learning_rate": 3.039290697151087e-05, + "loss": 0.1955, + "step": 42623 + }, + { + "epoch": 3.45301360985094, + "grad_norm": 0.08773655444383621, + "learning_rate": 3.0388406318916245e-05, + "loss": 0.2396, + "step": 42624 + }, + { + "epoch": 3.453094620868438, + "grad_norm": 0.0867881178855896, + "learning_rate": 3.0383905666321615e-05, + "loss": 0.2583, + "step": 42625 + }, + { + "epoch": 3.4531756318859363, + "grad_norm": 0.06408640742301941, + "learning_rate": 3.0379405013726992e-05, + "loss": 0.2227, + "step": 42626 + }, + { + "epoch": 3.453256642903435, + "grad_norm": 0.06799597293138504, + "learning_rate": 3.0374904361132365e-05, + "loss": 0.2251, + "step": 42627 + }, + { + "epoch": 3.4533376539209333, + "grad_norm": 0.0695396289229393, + "learning_rate": 3.0370403708537736e-05, + "loss": 0.2336, + "step": 42628 + }, + { + "epoch": 3.4534186649384315, + "grad_norm": 0.07411754876375198, + "learning_rate": 3.0365903055943113e-05, + "loss": 0.2278, + "step": 42629 + }, + { + "epoch": 3.45349967595593, + "grad_norm": 0.09416381269693375, + "learning_rate": 3.0361402403348486e-05, + "loss": 0.27, + "step": 42630 + }, + { + "epoch": 3.4535806869734285, + "grad_norm": 0.07336270809173584, + "learning_rate": 3.0356901750753863e-05, + "loss": 0.2177, + "step": 42631 + }, + { + "epoch": 3.4536616979909267, + "grad_norm": 0.0843655914068222, + "learning_rate": 3.0352401098159233e-05, + "loss": 0.2416, + "step": 42632 + }, + { + "epoch": 3.4537427090084254, + "grad_norm": 0.06593769788742065, + "learning_rate": 3.034790044556461e-05, + "loss": 0.1886, + 
"step": 42633 + }, + { + "epoch": 3.4538237200259236, + "grad_norm": 0.06238039955496788, + "learning_rate": 3.0343399792969984e-05, + "loss": 0.2111, + "step": 42634 + }, + { + "epoch": 3.453904731043422, + "grad_norm": 0.07243414968252182, + "learning_rate": 3.0338899140375354e-05, + "loss": 0.2137, + "step": 42635 + }, + { + "epoch": 3.45398574206092, + "grad_norm": 0.06280165165662766, + "learning_rate": 3.033439848778073e-05, + "loss": 0.2483, + "step": 42636 + }, + { + "epoch": 3.454066753078419, + "grad_norm": 0.0687374547123909, + "learning_rate": 3.0329897835186105e-05, + "loss": 0.2537, + "step": 42637 + }, + { + "epoch": 3.454147764095917, + "grad_norm": 0.060395676642656326, + "learning_rate": 3.0325397182591475e-05, + "loss": 0.2205, + "step": 42638 + }, + { + "epoch": 3.4542287751134153, + "grad_norm": 0.06727840006351471, + "learning_rate": 3.0320896529996852e-05, + "loss": 0.226, + "step": 42639 + }, + { + "epoch": 3.454309786130914, + "grad_norm": 0.07435212284326553, + "learning_rate": 3.0316395877402226e-05, + "loss": 0.2189, + "step": 42640 + }, + { + "epoch": 3.4543907971484122, + "grad_norm": 0.08039019256830215, + "learning_rate": 3.0311895224807596e-05, + "loss": 0.2404, + "step": 42641 + }, + { + "epoch": 3.4544718081659105, + "grad_norm": 0.08695101737976074, + "learning_rate": 3.0307394572212973e-05, + "loss": 0.2338, + "step": 42642 + }, + { + "epoch": 3.4545528191834087, + "grad_norm": 0.06483660638332367, + "learning_rate": 3.0302893919618346e-05, + "loss": 0.2118, + "step": 42643 + }, + { + "epoch": 3.4546338302009074, + "grad_norm": 0.06342937052249908, + "learning_rate": 3.0298393267023723e-05, + "loss": 0.24, + "step": 42644 + }, + { + "epoch": 3.4547148412184057, + "grad_norm": 0.06715777516365051, + "learning_rate": 3.0293892614429094e-05, + "loss": 0.2376, + "step": 42645 + }, + { + "epoch": 3.454795852235904, + "grad_norm": 0.0639479011297226, + "learning_rate": 3.0289391961834467e-05, + "loss": 0.2272, + "step": 42646 + }, + { + "epoch": 3.4548768632534026, + "grad_norm": 0.0595063641667366, + "learning_rate": 3.0284891309239844e-05, + "loss": 0.218, + "step": 42647 + }, + { + "epoch": 3.454957874270901, + "grad_norm": 0.07153860479593277, + "learning_rate": 3.0280390656645214e-05, + "loss": 0.2089, + "step": 42648 + }, + { + "epoch": 3.455038885288399, + "grad_norm": 0.07404875010251999, + "learning_rate": 3.0275890004050588e-05, + "loss": 0.2961, + "step": 42649 + }, + { + "epoch": 3.4551198963058978, + "grad_norm": 0.07781454920768738, + "learning_rate": 3.0271389351455965e-05, + "loss": 0.209, + "step": 42650 + }, + { + "epoch": 3.455200907323396, + "grad_norm": 0.0751013308763504, + "learning_rate": 3.0266888698861335e-05, + "loss": 0.2487, + "step": 42651 + }, + { + "epoch": 3.4552819183408943, + "grad_norm": 0.07977674901485443, + "learning_rate": 3.026238804626671e-05, + "loss": 0.2522, + "step": 42652 + }, + { + "epoch": 3.455362929358393, + "grad_norm": 0.07325046509504318, + "learning_rate": 3.0257887393672086e-05, + "loss": 0.2277, + "step": 42653 + }, + { + "epoch": 3.455443940375891, + "grad_norm": 0.07782243192195892, + "learning_rate": 3.0253386741077456e-05, + "loss": 0.1968, + "step": 42654 + }, + { + "epoch": 3.4555249513933894, + "grad_norm": 0.06764143705368042, + "learning_rate": 3.024888608848283e-05, + "loss": 0.2293, + "step": 42655 + }, + { + "epoch": 3.4556059624108877, + "grad_norm": 0.0683240294456482, + "learning_rate": 3.0244385435888207e-05, + "loss": 0.2326, + "step": 42656 + }, + { + "epoch": 3.4556869734283864, + 
"grad_norm": 0.07400035113096237, + "learning_rate": 3.023988478329358e-05, + "loss": 0.2471, + "step": 42657 + }, + { + "epoch": 3.4557679844458846, + "grad_norm": 0.08437030017375946, + "learning_rate": 3.023538413069895e-05, + "loss": 0.2547, + "step": 42658 + }, + { + "epoch": 3.455848995463383, + "grad_norm": 0.06690854579210281, + "learning_rate": 3.0230883478104327e-05, + "loss": 0.2286, + "step": 42659 + }, + { + "epoch": 3.4559300064808816, + "grad_norm": 0.06564860790967941, + "learning_rate": 3.02263828255097e-05, + "loss": 0.2177, + "step": 42660 + }, + { + "epoch": 3.45601101749838, + "grad_norm": 0.0681687518954277, + "learning_rate": 3.022188217291507e-05, + "loss": 0.2529, + "step": 42661 + }, + { + "epoch": 3.456092028515878, + "grad_norm": 0.07344760745763779, + "learning_rate": 3.0217381520320448e-05, + "loss": 0.249, + "step": 42662 + }, + { + "epoch": 3.4561730395333763, + "grad_norm": 0.07287075370550156, + "learning_rate": 3.0212880867725822e-05, + "loss": 0.2255, + "step": 42663 + }, + { + "epoch": 3.456254050550875, + "grad_norm": 0.06859376281499863, + "learning_rate": 3.0208380215131192e-05, + "loss": 0.2455, + "step": 42664 + }, + { + "epoch": 3.4563350615683732, + "grad_norm": 0.07294625788927078, + "learning_rate": 3.020387956253657e-05, + "loss": 0.2292, + "step": 42665 + }, + { + "epoch": 3.4564160725858715, + "grad_norm": 0.07278914749622345, + "learning_rate": 3.0199378909941946e-05, + "loss": 0.2337, + "step": 42666 + }, + { + "epoch": 3.45649708360337, + "grad_norm": 0.07642342150211334, + "learning_rate": 3.0194878257347313e-05, + "loss": 0.2148, + "step": 42667 + }, + { + "epoch": 3.4565780946208684, + "grad_norm": 0.07237835973501205, + "learning_rate": 3.019037760475269e-05, + "loss": 0.2142, + "step": 42668 + }, + { + "epoch": 3.4566591056383666, + "grad_norm": 0.07787852734327316, + "learning_rate": 3.0185876952158067e-05, + "loss": 0.2466, + "step": 42669 + }, + { + "epoch": 3.4567401166558653, + "grad_norm": 0.06031728535890579, + "learning_rate": 3.018137629956344e-05, + "loss": 0.1905, + "step": 42670 + }, + { + "epoch": 3.4568211276733636, + "grad_norm": 0.07175780087709427, + "learning_rate": 3.017687564696881e-05, + "loss": 0.2108, + "step": 42671 + }, + { + "epoch": 3.456902138690862, + "grad_norm": 0.07543084025382996, + "learning_rate": 3.0172374994374188e-05, + "loss": 0.2642, + "step": 42672 + }, + { + "epoch": 3.4569831497083605, + "grad_norm": 0.06601322442293167, + "learning_rate": 3.016787434177956e-05, + "loss": 0.2427, + "step": 42673 + }, + { + "epoch": 3.4570641607258588, + "grad_norm": 0.06880936771631241, + "learning_rate": 3.016337368918493e-05, + "loss": 0.2351, + "step": 42674 + }, + { + "epoch": 3.457145171743357, + "grad_norm": 0.06941180676221848, + "learning_rate": 3.015887303659031e-05, + "loss": 0.2167, + "step": 42675 + }, + { + "epoch": 3.4572261827608557, + "grad_norm": 0.08035728335380554, + "learning_rate": 3.0154372383995682e-05, + "loss": 0.2554, + "step": 42676 + }, + { + "epoch": 3.457307193778354, + "grad_norm": 0.0804125964641571, + "learning_rate": 3.0149871731401052e-05, + "loss": 0.2234, + "step": 42677 + }, + { + "epoch": 3.457388204795852, + "grad_norm": 0.05431007593870163, + "learning_rate": 3.014537107880643e-05, + "loss": 0.2322, + "step": 42678 + }, + { + "epoch": 3.4574692158133504, + "grad_norm": 0.06997882574796677, + "learning_rate": 3.0140870426211803e-05, + "loss": 0.2212, + "step": 42679 + }, + { + "epoch": 3.457550226830849, + "grad_norm": 0.07095611095428467, + "learning_rate": 
3.0136369773617173e-05, + "loss": 0.2024, + "step": 42680 + }, + { + "epoch": 3.4576312378483474, + "grad_norm": 0.0627041757106781, + "learning_rate": 3.013186912102255e-05, + "loss": 0.233, + "step": 42681 + }, + { + "epoch": 3.4577122488658456, + "grad_norm": 0.059571947902441025, + "learning_rate": 3.0127368468427924e-05, + "loss": 0.2446, + "step": 42682 + }, + { + "epoch": 3.4577932598833443, + "grad_norm": 0.06569252908229828, + "learning_rate": 3.01228678158333e-05, + "loss": 0.234, + "step": 42683 + }, + { + "epoch": 3.4578742709008425, + "grad_norm": 0.06776624172925949, + "learning_rate": 3.011836716323867e-05, + "loss": 0.215, + "step": 42684 + }, + { + "epoch": 3.457955281918341, + "grad_norm": 0.06598973274230957, + "learning_rate": 3.0113866510644045e-05, + "loss": 0.1994, + "step": 42685 + }, + { + "epoch": 3.458036292935839, + "grad_norm": 0.06201748922467232, + "learning_rate": 3.010936585804942e-05, + "loss": 0.2279, + "step": 42686 + }, + { + "epoch": 3.4581173039533377, + "grad_norm": 0.08058135211467743, + "learning_rate": 3.0104865205454792e-05, + "loss": 0.2569, + "step": 42687 + }, + { + "epoch": 3.458198314970836, + "grad_norm": 0.06606631726026535, + "learning_rate": 3.0100364552860165e-05, + "loss": 0.2738, + "step": 42688 + }, + { + "epoch": 3.458279325988334, + "grad_norm": 0.06655286252498627, + "learning_rate": 3.0095863900265542e-05, + "loss": 0.2433, + "step": 42689 + }, + { + "epoch": 3.458360337005833, + "grad_norm": 0.08217357099056244, + "learning_rate": 3.0091363247670913e-05, + "loss": 0.2057, + "step": 42690 + }, + { + "epoch": 3.458441348023331, + "grad_norm": 0.06689063459634781, + "learning_rate": 3.0086862595076286e-05, + "loss": 0.2438, + "step": 42691 + }, + { + "epoch": 3.4585223590408294, + "grad_norm": 0.07152974605560303, + "learning_rate": 3.0082361942481663e-05, + "loss": 0.2558, + "step": 42692 + }, + { + "epoch": 3.458603370058328, + "grad_norm": 0.0755951926112175, + "learning_rate": 3.0077861289887033e-05, + "loss": 0.2117, + "step": 42693 + }, + { + "epoch": 3.4586843810758263, + "grad_norm": 0.08308055996894836, + "learning_rate": 3.0073360637292407e-05, + "loss": 0.2148, + "step": 42694 + }, + { + "epoch": 3.4587653920933246, + "grad_norm": 0.07032884657382965, + "learning_rate": 3.0068859984697784e-05, + "loss": 0.222, + "step": 42695 + }, + { + "epoch": 3.4588464031108233, + "grad_norm": 0.05964222922921181, + "learning_rate": 3.0064359332103158e-05, + "loss": 0.2174, + "step": 42696 + }, + { + "epoch": 3.4589274141283215, + "grad_norm": 0.07548788189888, + "learning_rate": 3.0059858679508528e-05, + "loss": 0.2334, + "step": 42697 + }, + { + "epoch": 3.4590084251458197, + "grad_norm": 0.07545255869626999, + "learning_rate": 3.0055358026913905e-05, + "loss": 0.2385, + "step": 42698 + }, + { + "epoch": 3.4590894361633184, + "grad_norm": 0.06462171673774719, + "learning_rate": 3.005085737431928e-05, + "loss": 0.2, + "step": 42699 + }, + { + "epoch": 3.4591704471808167, + "grad_norm": 0.07996838539838791, + "learning_rate": 3.004635672172465e-05, + "loss": 0.2185, + "step": 42700 + }, + { + "epoch": 3.459251458198315, + "grad_norm": 0.0730762779712677, + "learning_rate": 3.0041856069130026e-05, + "loss": 0.2399, + "step": 42701 + }, + { + "epoch": 3.459332469215813, + "grad_norm": 0.06021992489695549, + "learning_rate": 3.0037355416535403e-05, + "loss": 0.2486, + "step": 42702 + }, + { + "epoch": 3.459413480233312, + "grad_norm": 0.06835592538118362, + "learning_rate": 3.003285476394077e-05, + "loss": 0.2186, + "step": 42703 + }, + 
{ + "epoch": 3.45949449125081, + "grad_norm": 0.06691218912601471, + "learning_rate": 3.0028354111346146e-05, + "loss": 0.239, + "step": 42704 + }, + { + "epoch": 3.4595755022683083, + "grad_norm": 0.07505834847688675, + "learning_rate": 3.0023853458751523e-05, + "loss": 0.2934, + "step": 42705 + }, + { + "epoch": 3.459656513285807, + "grad_norm": 0.06843426823616028, + "learning_rate": 3.001935280615689e-05, + "loss": 0.2292, + "step": 42706 + }, + { + "epoch": 3.4597375243033053, + "grad_norm": 0.07168073207139969, + "learning_rate": 3.0014852153562267e-05, + "loss": 0.2144, + "step": 42707 + }, + { + "epoch": 3.4598185353208035, + "grad_norm": 0.07357820868492126, + "learning_rate": 3.0010351500967644e-05, + "loss": 0.1994, + "step": 42708 + }, + { + "epoch": 3.4598995463383018, + "grad_norm": 0.06848014891147614, + "learning_rate": 3.0005850848373018e-05, + "loss": 0.2146, + "step": 42709 + }, + { + "epoch": 3.4599805573558005, + "grad_norm": 0.07098592072725296, + "learning_rate": 3.0001350195778388e-05, + "loss": 0.2236, + "step": 42710 + }, + { + "epoch": 3.4600615683732987, + "grad_norm": 0.06135905534029007, + "learning_rate": 2.9996849543183765e-05, + "loss": 0.2137, + "step": 42711 + }, + { + "epoch": 3.460142579390797, + "grad_norm": 0.07563517242670059, + "learning_rate": 2.999234889058914e-05, + "loss": 0.2131, + "step": 42712 + }, + { + "epoch": 3.4602235904082956, + "grad_norm": 0.07762432843446732, + "learning_rate": 2.998784823799451e-05, + "loss": 0.2029, + "step": 42713 + }, + { + "epoch": 3.460304601425794, + "grad_norm": 0.06432337313890457, + "learning_rate": 2.9983347585399886e-05, + "loss": 0.2375, + "step": 42714 + }, + { + "epoch": 3.460385612443292, + "grad_norm": 0.07871246337890625, + "learning_rate": 2.997884693280526e-05, + "loss": 0.269, + "step": 42715 + }, + { + "epoch": 3.460466623460791, + "grad_norm": 0.07545270025730133, + "learning_rate": 2.997434628021063e-05, + "loss": 0.2148, + "step": 42716 + }, + { + "epoch": 3.460547634478289, + "grad_norm": 0.09875917434692383, + "learning_rate": 2.9969845627616007e-05, + "loss": 0.2502, + "step": 42717 + }, + { + "epoch": 3.4606286454957873, + "grad_norm": 0.06610118597745895, + "learning_rate": 2.996534497502138e-05, + "loss": 0.1822, + "step": 42718 + }, + { + "epoch": 3.460709656513286, + "grad_norm": 0.07913530617952347, + "learning_rate": 2.996084432242675e-05, + "loss": 0.2197, + "step": 42719 + }, + { + "epoch": 3.4607906675307842, + "grad_norm": 0.06806863844394684, + "learning_rate": 2.9956343669832127e-05, + "loss": 0.2239, + "step": 42720 + }, + { + "epoch": 3.4608716785482825, + "grad_norm": 0.06233429163694382, + "learning_rate": 2.99518430172375e-05, + "loss": 0.2375, + "step": 42721 + }, + { + "epoch": 3.460952689565781, + "grad_norm": 0.08534996211528778, + "learning_rate": 2.994734236464287e-05, + "loss": 0.2351, + "step": 42722 + }, + { + "epoch": 3.4610337005832794, + "grad_norm": 0.0827810987830162, + "learning_rate": 2.9942841712048248e-05, + "loss": 0.2308, + "step": 42723 + }, + { + "epoch": 3.4611147116007777, + "grad_norm": 0.09024398773908615, + "learning_rate": 2.9938341059453622e-05, + "loss": 0.2467, + "step": 42724 + }, + { + "epoch": 3.461195722618276, + "grad_norm": 0.07115122675895691, + "learning_rate": 2.9933840406859e-05, + "loss": 0.2086, + "step": 42725 + }, + { + "epoch": 3.4612767336357746, + "grad_norm": 0.08028433471918106, + "learning_rate": 2.992933975426437e-05, + "loss": 0.2335, + "step": 42726 + }, + { + "epoch": 3.461357744653273, + "grad_norm": 
0.0784420445561409, + "learning_rate": 2.9924839101669743e-05, + "loss": 0.2203, + "step": 42727 + }, + { + "epoch": 3.461438755670771, + "grad_norm": 0.07200008630752563, + "learning_rate": 2.992033844907512e-05, + "loss": 0.2139, + "step": 42728 + }, + { + "epoch": 3.4615197666882698, + "grad_norm": 0.06650616973638535, + "learning_rate": 2.991583779648049e-05, + "loss": 0.2014, + "step": 42729 + }, + { + "epoch": 3.461600777705768, + "grad_norm": 0.07266805320978165, + "learning_rate": 2.9911337143885863e-05, + "loss": 0.2083, + "step": 42730 + }, + { + "epoch": 3.4616817887232663, + "grad_norm": 0.0538320429623127, + "learning_rate": 2.990683649129124e-05, + "loss": 0.2266, + "step": 42731 + }, + { + "epoch": 3.4617627997407645, + "grad_norm": 0.07561052590608597, + "learning_rate": 2.990233583869661e-05, + "loss": 0.2478, + "step": 42732 + }, + { + "epoch": 3.461843810758263, + "grad_norm": 0.07131976634263992, + "learning_rate": 2.9897835186101984e-05, + "loss": 0.2351, + "step": 42733 + }, + { + "epoch": 3.4619248217757614, + "grad_norm": 0.07233821600675583, + "learning_rate": 2.989333453350736e-05, + "loss": 0.2601, + "step": 42734 + }, + { + "epoch": 3.4620058327932597, + "grad_norm": 0.07123769819736481, + "learning_rate": 2.988883388091273e-05, + "loss": 0.2259, + "step": 42735 + }, + { + "epoch": 3.4620868438107584, + "grad_norm": 0.08232641220092773, + "learning_rate": 2.9884333228318105e-05, + "loss": 0.2919, + "step": 42736 + }, + { + "epoch": 3.4621678548282566, + "grad_norm": 0.07963483780622482, + "learning_rate": 2.9879832575723482e-05, + "loss": 0.2462, + "step": 42737 + }, + { + "epoch": 3.462248865845755, + "grad_norm": 0.06596418470144272, + "learning_rate": 2.987533192312886e-05, + "loss": 0.2181, + "step": 42738 + }, + { + "epoch": 3.4623298768632536, + "grad_norm": 0.07897919416427612, + "learning_rate": 2.9870831270534226e-05, + "loss": 0.222, + "step": 42739 + }, + { + "epoch": 3.462410887880752, + "grad_norm": 0.07320919632911682, + "learning_rate": 2.9866330617939603e-05, + "loss": 0.2362, + "step": 42740 + }, + { + "epoch": 3.46249189889825, + "grad_norm": 0.08484750986099243, + "learning_rate": 2.986182996534498e-05, + "loss": 0.2283, + "step": 42741 + }, + { + "epoch": 3.4625729099157487, + "grad_norm": 0.05763061344623566, + "learning_rate": 2.985732931275035e-05, + "loss": 0.2019, + "step": 42742 + }, + { + "epoch": 3.462653920933247, + "grad_norm": 0.06858108192682266, + "learning_rate": 2.9852828660155724e-05, + "loss": 0.24, + "step": 42743 + }, + { + "epoch": 3.462734931950745, + "grad_norm": 0.0650968849658966, + "learning_rate": 2.98483280075611e-05, + "loss": 0.2225, + "step": 42744 + }, + { + "epoch": 3.462815942968244, + "grad_norm": 0.0646698996424675, + "learning_rate": 2.984382735496647e-05, + "loss": 0.2295, + "step": 42745 + }, + { + "epoch": 3.462896953985742, + "grad_norm": 0.08138510584831238, + "learning_rate": 2.9839326702371844e-05, + "loss": 0.2516, + "step": 42746 + }, + { + "epoch": 3.4629779650032404, + "grad_norm": 0.07128766179084778, + "learning_rate": 2.983482604977722e-05, + "loss": 0.2226, + "step": 42747 + }, + { + "epoch": 3.4630589760207386, + "grad_norm": 0.05778047442436218, + "learning_rate": 2.9830325397182592e-05, + "loss": 0.2015, + "step": 42748 + }, + { + "epoch": 3.4631399870382373, + "grad_norm": 0.06885864585638046, + "learning_rate": 2.9825824744587965e-05, + "loss": 0.2254, + "step": 42749 + }, + { + "epoch": 3.4632209980557356, + "grad_norm": 0.07103770226240158, + "learning_rate": 2.9821324091993342e-05, + 
"loss": 0.229, + "step": 42750 + }, + { + "epoch": 3.463302009073234, + "grad_norm": 0.08950777351856232, + "learning_rate": 2.9816823439398716e-05, + "loss": 0.2434, + "step": 42751 + }, + { + "epoch": 3.4633830200907325, + "grad_norm": 0.06826742738485336, + "learning_rate": 2.9812322786804086e-05, + "loss": 0.2148, + "step": 42752 + }, + { + "epoch": 3.4634640311082308, + "grad_norm": 0.07242395728826523, + "learning_rate": 2.9807822134209463e-05, + "loss": 0.2436, + "step": 42753 + }, + { + "epoch": 3.463545042125729, + "grad_norm": 0.07269168645143509, + "learning_rate": 2.9803321481614837e-05, + "loss": 0.2314, + "step": 42754 + }, + { + "epoch": 3.4636260531432272, + "grad_norm": 0.06759374588727951, + "learning_rate": 2.9798820829020207e-05, + "loss": 0.2461, + "step": 42755 + }, + { + "epoch": 3.463707064160726, + "grad_norm": 0.09427613019943237, + "learning_rate": 2.9794320176425584e-05, + "loss": 0.2412, + "step": 42756 + }, + { + "epoch": 3.463788075178224, + "grad_norm": 0.06852936744689941, + "learning_rate": 2.9789819523830958e-05, + "loss": 0.2618, + "step": 42757 + }, + { + "epoch": 3.4638690861957224, + "grad_norm": 0.07090926915407181, + "learning_rate": 2.9785318871236328e-05, + "loss": 0.258, + "step": 42758 + }, + { + "epoch": 3.463950097213221, + "grad_norm": 0.06704474985599518, + "learning_rate": 2.9780818218641705e-05, + "loss": 0.2101, + "step": 42759 + }, + { + "epoch": 3.4640311082307194, + "grad_norm": 0.06343290954828262, + "learning_rate": 2.977631756604708e-05, + "loss": 0.2247, + "step": 42760 + }, + { + "epoch": 3.4641121192482176, + "grad_norm": 0.07604165375232697, + "learning_rate": 2.977181691345245e-05, + "loss": 0.2268, + "step": 42761 + }, + { + "epoch": 3.4641931302657163, + "grad_norm": 0.0740513950586319, + "learning_rate": 2.9767316260857826e-05, + "loss": 0.2323, + "step": 42762 + }, + { + "epoch": 3.4642741412832145, + "grad_norm": 0.07418538630008698, + "learning_rate": 2.97628156082632e-05, + "loss": 0.2016, + "step": 42763 + }, + { + "epoch": 3.464355152300713, + "grad_norm": 0.07796464115381241, + "learning_rate": 2.9758314955668576e-05, + "loss": 0.2353, + "step": 42764 + }, + { + "epoch": 3.4644361633182115, + "grad_norm": 0.0578065924346447, + "learning_rate": 2.9753814303073946e-05, + "loss": 0.226, + "step": 42765 + }, + { + "epoch": 3.4645171743357097, + "grad_norm": 0.060822539031505585, + "learning_rate": 2.974931365047932e-05, + "loss": 0.2199, + "step": 42766 + }, + { + "epoch": 3.464598185353208, + "grad_norm": 0.068793386220932, + "learning_rate": 2.9744812997884697e-05, + "loss": 0.2505, + "step": 42767 + }, + { + "epoch": 3.4646791963707066, + "grad_norm": 0.08377143740653992, + "learning_rate": 2.9740312345290067e-05, + "loss": 0.2449, + "step": 42768 + }, + { + "epoch": 3.464760207388205, + "grad_norm": 0.08305362612009048, + "learning_rate": 2.973581169269544e-05, + "loss": 0.2428, + "step": 42769 + }, + { + "epoch": 3.464841218405703, + "grad_norm": 0.07506440579891205, + "learning_rate": 2.9731311040100818e-05, + "loss": 0.2282, + "step": 42770 + }, + { + "epoch": 3.4649222294232014, + "grad_norm": 0.056740131229162216, + "learning_rate": 2.9726810387506188e-05, + "loss": 0.1754, + "step": 42771 + }, + { + "epoch": 3.4650032404407, + "grad_norm": 0.06253889203071594, + "learning_rate": 2.972230973491156e-05, + "loss": 0.2289, + "step": 42772 + }, + { + "epoch": 3.4650842514581983, + "grad_norm": 0.06224200129508972, + "learning_rate": 2.971780908231694e-05, + "loss": 0.2004, + "step": 42773 + }, + { + "epoch": 
3.4651652624756966, + "grad_norm": 0.06049289181828499, + "learning_rate": 2.971330842972231e-05, + "loss": 0.2276, + "step": 42774 + }, + { + "epoch": 3.4652462734931953, + "grad_norm": 0.07351897656917572, + "learning_rate": 2.9708807777127682e-05, + "loss": 0.2244, + "step": 42775 + }, + { + "epoch": 3.4653272845106935, + "grad_norm": 0.07887002825737, + "learning_rate": 2.970430712453306e-05, + "loss": 0.2632, + "step": 42776 + }, + { + "epoch": 3.4654082955281917, + "grad_norm": 0.07231858372688293, + "learning_rate": 2.9699806471938436e-05, + "loss": 0.2121, + "step": 42777 + }, + { + "epoch": 3.46548930654569, + "grad_norm": 0.05875080078840256, + "learning_rate": 2.9695305819343807e-05, + "loss": 0.2003, + "step": 42778 + }, + { + "epoch": 3.4655703175631887, + "grad_norm": 0.05856175720691681, + "learning_rate": 2.969080516674918e-05, + "loss": 0.205, + "step": 42779 + }, + { + "epoch": 3.465651328580687, + "grad_norm": 0.10160711407661438, + "learning_rate": 2.9686304514154557e-05, + "loss": 0.234, + "step": 42780 + }, + { + "epoch": 3.465732339598185, + "grad_norm": 0.08699193596839905, + "learning_rate": 2.9681803861559927e-05, + "loss": 0.2177, + "step": 42781 + }, + { + "epoch": 3.465813350615684, + "grad_norm": 0.07056750357151031, + "learning_rate": 2.96773032089653e-05, + "loss": 0.2105, + "step": 42782 + }, + { + "epoch": 3.465894361633182, + "grad_norm": 0.08346658945083618, + "learning_rate": 2.9672802556370678e-05, + "loss": 0.238, + "step": 42783 + }, + { + "epoch": 3.4659753726506803, + "grad_norm": 0.08231760561466217, + "learning_rate": 2.9668301903776048e-05, + "loss": 0.2705, + "step": 42784 + }, + { + "epoch": 3.466056383668179, + "grad_norm": 0.08830294013023376, + "learning_rate": 2.9663801251181422e-05, + "loss": 0.21, + "step": 42785 + }, + { + "epoch": 3.4661373946856773, + "grad_norm": 0.1320062279701233, + "learning_rate": 2.96593005985868e-05, + "loss": 0.2115, + "step": 42786 + }, + { + "epoch": 3.4662184057031755, + "grad_norm": 0.06545694917440414, + "learning_rate": 2.965479994599217e-05, + "loss": 0.2369, + "step": 42787 + }, + { + "epoch": 3.466299416720674, + "grad_norm": 0.07637354731559753, + "learning_rate": 2.9650299293397543e-05, + "loss": 0.2218, + "step": 42788 + }, + { + "epoch": 3.4663804277381725, + "grad_norm": 0.0652073547244072, + "learning_rate": 2.964579864080292e-05, + "loss": 0.2212, + "step": 42789 + }, + { + "epoch": 3.4664614387556707, + "grad_norm": 0.08444193005561829, + "learning_rate": 2.9641297988208293e-05, + "loss": 0.2438, + "step": 42790 + }, + { + "epoch": 3.4665424497731694, + "grad_norm": 0.06109117716550827, + "learning_rate": 2.9636797335613663e-05, + "loss": 0.2048, + "step": 42791 + }, + { + "epoch": 3.4666234607906676, + "grad_norm": 0.07534314692020416, + "learning_rate": 2.963229668301904e-05, + "loss": 0.2061, + "step": 42792 + }, + { + "epoch": 3.466704471808166, + "grad_norm": 0.072405144572258, + "learning_rate": 2.9627796030424414e-05, + "loss": 0.2275, + "step": 42793 + }, + { + "epoch": 3.466785482825664, + "grad_norm": 0.07258030027151108, + "learning_rate": 2.9623295377829784e-05, + "loss": 0.2439, + "step": 42794 + }, + { + "epoch": 3.466866493843163, + "grad_norm": 0.08834215998649597, + "learning_rate": 2.961879472523516e-05, + "loss": 0.2243, + "step": 42795 + }, + { + "epoch": 3.466947504860661, + "grad_norm": 0.09430413693189621, + "learning_rate": 2.9614294072640535e-05, + "loss": 0.2272, + "step": 42796 + }, + { + "epoch": 3.4670285158781593, + "grad_norm": 0.0866408497095108, + 
"learning_rate": 2.9609793420045905e-05, + "loss": 0.2357, + "step": 42797 + }, + { + "epoch": 3.467109526895658, + "grad_norm": 0.06997352838516235, + "learning_rate": 2.9605292767451282e-05, + "loss": 0.207, + "step": 42798 + }, + { + "epoch": 3.4671905379131562, + "grad_norm": 0.0892815962433815, + "learning_rate": 2.9600792114856656e-05, + "loss": 0.2432, + "step": 42799 + }, + { + "epoch": 3.4672715489306545, + "grad_norm": 0.05779581516981125, + "learning_rate": 2.9596291462262026e-05, + "loss": 0.242, + "step": 42800 + }, + { + "epoch": 3.4673525599481527, + "grad_norm": 0.07875336706638336, + "learning_rate": 2.9591790809667403e-05, + "loss": 0.2588, + "step": 42801 + }, + { + "epoch": 3.4674335709656514, + "grad_norm": 0.06230032816529274, + "learning_rate": 2.9587290157072776e-05, + "loss": 0.2007, + "step": 42802 + }, + { + "epoch": 3.4675145819831497, + "grad_norm": 0.0884285494685173, + "learning_rate": 2.9582789504478153e-05, + "loss": 0.2151, + "step": 42803 + }, + { + "epoch": 3.467595593000648, + "grad_norm": 0.07607505470514297, + "learning_rate": 2.9578288851883524e-05, + "loss": 0.2268, + "step": 42804 + }, + { + "epoch": 3.4676766040181466, + "grad_norm": 0.06472492963075638, + "learning_rate": 2.9573788199288897e-05, + "loss": 0.2265, + "step": 42805 + }, + { + "epoch": 3.467757615035645, + "grad_norm": 0.06241315230727196, + "learning_rate": 2.9569287546694274e-05, + "loss": 0.2371, + "step": 42806 + }, + { + "epoch": 3.467838626053143, + "grad_norm": 0.0674692764878273, + "learning_rate": 2.9564786894099644e-05, + "loss": 0.2064, + "step": 42807 + }, + { + "epoch": 3.4679196370706418, + "grad_norm": 0.06604752689599991, + "learning_rate": 2.9560286241505018e-05, + "loss": 0.2242, + "step": 42808 + }, + { + "epoch": 3.46800064808814, + "grad_norm": 0.07254812121391296, + "learning_rate": 2.9555785588910395e-05, + "loss": 0.237, + "step": 42809 + }, + { + "epoch": 3.4680816591056383, + "grad_norm": 0.05818432196974754, + "learning_rate": 2.9551284936315765e-05, + "loss": 0.2357, + "step": 42810 + }, + { + "epoch": 3.468162670123137, + "grad_norm": 0.06459543853998184, + "learning_rate": 2.9546784283721142e-05, + "loss": 0.2142, + "step": 42811 + }, + { + "epoch": 3.468243681140635, + "grad_norm": 0.06363950669765472, + "learning_rate": 2.9542283631126516e-05, + "loss": 0.2252, + "step": 42812 + }, + { + "epoch": 3.4683246921581334, + "grad_norm": 0.07214115560054779, + "learning_rate": 2.9537782978531886e-05, + "loss": 0.1997, + "step": 42813 + }, + { + "epoch": 3.468405703175632, + "grad_norm": 0.0833434909582138, + "learning_rate": 2.9533282325937263e-05, + "loss": 0.2266, + "step": 42814 + }, + { + "epoch": 3.4684867141931304, + "grad_norm": 0.060628198087215424, + "learning_rate": 2.9528781673342637e-05, + "loss": 0.2256, + "step": 42815 + }, + { + "epoch": 3.4685677252106286, + "grad_norm": 0.07649362087249756, + "learning_rate": 2.9524281020748014e-05, + "loss": 0.2218, + "step": 42816 + }, + { + "epoch": 3.468648736228127, + "grad_norm": 0.05994151905179024, + "learning_rate": 2.9519780368153384e-05, + "loss": 0.1972, + "step": 42817 + }, + { + "epoch": 3.4687297472456255, + "grad_norm": 0.05603544041514397, + "learning_rate": 2.9515279715558757e-05, + "loss": 0.1852, + "step": 42818 + }, + { + "epoch": 3.468810758263124, + "grad_norm": 0.0875602439045906, + "learning_rate": 2.9510779062964134e-05, + "loss": 0.277, + "step": 42819 + }, + { + "epoch": 3.468891769280622, + "grad_norm": 0.06956169009208679, + "learning_rate": 2.9506278410369505e-05, + "loss": 
0.2408, + "step": 42820 + }, + { + "epoch": 3.4689727802981207, + "grad_norm": 0.06525658071041107, + "learning_rate": 2.9501777757774878e-05, + "loss": 0.2127, + "step": 42821 + }, + { + "epoch": 3.469053791315619, + "grad_norm": 0.053318217396736145, + "learning_rate": 2.9497277105180255e-05, + "loss": 0.2058, + "step": 42822 + }, + { + "epoch": 3.469134802333117, + "grad_norm": 0.05750075727701187, + "learning_rate": 2.9492776452585626e-05, + "loss": 0.1984, + "step": 42823 + }, + { + "epoch": 3.4692158133506155, + "grad_norm": 0.055032555013895035, + "learning_rate": 2.9488275799991e-05, + "loss": 0.2495, + "step": 42824 + }, + { + "epoch": 3.469296824368114, + "grad_norm": 0.06822854280471802, + "learning_rate": 2.9483775147396376e-05, + "loss": 0.2897, + "step": 42825 + }, + { + "epoch": 3.4693778353856124, + "grad_norm": 0.06751962006092072, + "learning_rate": 2.9479274494801746e-05, + "loss": 0.2156, + "step": 42826 + }, + { + "epoch": 3.4694588464031106, + "grad_norm": 0.06314662098884583, + "learning_rate": 2.947477384220712e-05, + "loss": 0.2023, + "step": 42827 + }, + { + "epoch": 3.4695398574206093, + "grad_norm": 0.07605284452438354, + "learning_rate": 2.9470273189612497e-05, + "loss": 0.2359, + "step": 42828 + }, + { + "epoch": 3.4696208684381076, + "grad_norm": 0.07924285531044006, + "learning_rate": 2.946577253701787e-05, + "loss": 0.2344, + "step": 42829 + }, + { + "epoch": 3.469701879455606, + "grad_norm": 0.07829609513282776, + "learning_rate": 2.946127188442324e-05, + "loss": 0.2456, + "step": 42830 + }, + { + "epoch": 3.4697828904731045, + "grad_norm": 0.07626158744096756, + "learning_rate": 2.9456771231828618e-05, + "loss": 0.2074, + "step": 42831 + }, + { + "epoch": 3.4698639014906028, + "grad_norm": 0.0745619535446167, + "learning_rate": 2.945227057923399e-05, + "loss": 0.2177, + "step": 42832 + }, + { + "epoch": 3.469944912508101, + "grad_norm": 0.06465175747871399, + "learning_rate": 2.944776992663936e-05, + "loss": 0.207, + "step": 42833 + }, + { + "epoch": 3.4700259235255997, + "grad_norm": 0.06583263725042343, + "learning_rate": 2.944326927404474e-05, + "loss": 0.2152, + "step": 42834 + }, + { + "epoch": 3.470106934543098, + "grad_norm": 0.07880668342113495, + "learning_rate": 2.9438768621450112e-05, + "loss": 0.2204, + "step": 42835 + }, + { + "epoch": 3.470187945560596, + "grad_norm": 0.08511168509721756, + "learning_rate": 2.9434267968855482e-05, + "loss": 0.2511, + "step": 42836 + }, + { + "epoch": 3.470268956578095, + "grad_norm": 0.0641389861702919, + "learning_rate": 2.942976731626086e-05, + "loss": 0.2086, + "step": 42837 + }, + { + "epoch": 3.470349967595593, + "grad_norm": 0.06406936794519424, + "learning_rate": 2.9425266663666233e-05, + "loss": 0.2147, + "step": 42838 + }, + { + "epoch": 3.4704309786130914, + "grad_norm": 0.1066378578543663, + "learning_rate": 2.9420766011071603e-05, + "loss": 0.2282, + "step": 42839 + }, + { + "epoch": 3.4705119896305896, + "grad_norm": 0.10814861208200455, + "learning_rate": 2.941626535847698e-05, + "loss": 0.2357, + "step": 42840 + }, + { + "epoch": 3.4705930006480883, + "grad_norm": 0.06493860483169556, + "learning_rate": 2.9411764705882354e-05, + "loss": 0.1892, + "step": 42841 + }, + { + "epoch": 3.4706740116655865, + "grad_norm": 0.06477928161621094, + "learning_rate": 2.940726405328773e-05, + "loss": 0.2015, + "step": 42842 + }, + { + "epoch": 3.470755022683085, + "grad_norm": 0.057188913226127625, + "learning_rate": 2.94027634006931e-05, + "loss": 0.2055, + "step": 42843 + }, + { + "epoch": 
3.4708360337005835, + "grad_norm": 0.06894127279520035, + "learning_rate": 2.9398262748098475e-05, + "loss": 0.241, + "step": 42844 + }, + { + "epoch": 3.4709170447180817, + "grad_norm": 0.07011944055557251, + "learning_rate": 2.939376209550385e-05, + "loss": 0.2181, + "step": 42845 + }, + { + "epoch": 3.47099805573558, + "grad_norm": 0.07745971530675888, + "learning_rate": 2.9389261442909222e-05, + "loss": 0.2004, + "step": 42846 + }, + { + "epoch": 3.471079066753078, + "grad_norm": 0.07370658218860626, + "learning_rate": 2.93847607903146e-05, + "loss": 0.2229, + "step": 42847 + }, + { + "epoch": 3.471160077770577, + "grad_norm": 0.07393095642328262, + "learning_rate": 2.9380260137719972e-05, + "loss": 0.2218, + "step": 42848 + }, + { + "epoch": 3.471241088788075, + "grad_norm": 0.07989127933979034, + "learning_rate": 2.9375759485125343e-05, + "loss": 0.2318, + "step": 42849 + }, + { + "epoch": 3.4713220998055734, + "grad_norm": 0.07488720864057541, + "learning_rate": 2.937125883253072e-05, + "loss": 0.2146, + "step": 42850 + }, + { + "epoch": 3.471403110823072, + "grad_norm": 0.057390645146369934, + "learning_rate": 2.9366758179936093e-05, + "loss": 0.2061, + "step": 42851 + }, + { + "epoch": 3.4714841218405703, + "grad_norm": 0.05493978410959244, + "learning_rate": 2.9362257527341463e-05, + "loss": 0.2559, + "step": 42852 + }, + { + "epoch": 3.4715651328580686, + "grad_norm": 0.056346312165260315, + "learning_rate": 2.935775687474684e-05, + "loss": 0.2003, + "step": 42853 + }, + { + "epoch": 3.4716461438755672, + "grad_norm": 0.07699105143547058, + "learning_rate": 2.9353256222152214e-05, + "loss": 0.2161, + "step": 42854 + }, + { + "epoch": 3.4717271548930655, + "grad_norm": 0.06203935667872429, + "learning_rate": 2.934875556955759e-05, + "loss": 0.2139, + "step": 42855 + }, + { + "epoch": 3.4718081659105637, + "grad_norm": 0.06947055459022522, + "learning_rate": 2.934425491696296e-05, + "loss": 0.2353, + "step": 42856 + }, + { + "epoch": 3.4718891769280624, + "grad_norm": 0.07427409291267395, + "learning_rate": 2.9339754264368335e-05, + "loss": 0.2506, + "step": 42857 + }, + { + "epoch": 3.4719701879455607, + "grad_norm": 0.0813642218708992, + "learning_rate": 2.9335253611773712e-05, + "loss": 0.2438, + "step": 42858 + }, + { + "epoch": 3.472051198963059, + "grad_norm": 0.07532806694507599, + "learning_rate": 2.9330752959179082e-05, + "loss": 0.2136, + "step": 42859 + }, + { + "epoch": 3.4721322099805576, + "grad_norm": 0.07544232904911041, + "learning_rate": 2.9326252306584456e-05, + "loss": 0.252, + "step": 42860 + }, + { + "epoch": 3.472213220998056, + "grad_norm": 0.07405072450637817, + "learning_rate": 2.9321751653989833e-05, + "loss": 0.2137, + "step": 42861 + }, + { + "epoch": 3.472294232015554, + "grad_norm": 0.07314892113208771, + "learning_rate": 2.9317251001395203e-05, + "loss": 0.2255, + "step": 42862 + }, + { + "epoch": 3.4723752430330523, + "grad_norm": 0.0625855028629303, + "learning_rate": 2.9312750348800576e-05, + "loss": 0.2336, + "step": 42863 + }, + { + "epoch": 3.472456254050551, + "grad_norm": 0.05610349029302597, + "learning_rate": 2.9308249696205953e-05, + "loss": 0.2248, + "step": 42864 + }, + { + "epoch": 3.4725372650680493, + "grad_norm": 0.058661460876464844, + "learning_rate": 2.9303749043611324e-05, + "loss": 0.1841, + "step": 42865 + }, + { + "epoch": 3.4726182760855475, + "grad_norm": 0.06500184535980225, + "learning_rate": 2.9299248391016697e-05, + "loss": 0.2377, + "step": 42866 + }, + { + "epoch": 3.472699287103046, + "grad_norm": 
0.07732019573450089, + "learning_rate": 2.9294747738422074e-05, + "loss": 0.2616, + "step": 42867 + }, + { + "epoch": 3.4727802981205445, + "grad_norm": 0.07218549400568008, + "learning_rate": 2.9290247085827448e-05, + "loss": 0.2203, + "step": 42868 + }, + { + "epoch": 3.4728613091380427, + "grad_norm": 0.07090730220079422, + "learning_rate": 2.9285746433232818e-05, + "loss": 0.2377, + "step": 42869 + }, + { + "epoch": 3.472942320155541, + "grad_norm": 0.0621664933860302, + "learning_rate": 2.9281245780638195e-05, + "loss": 0.238, + "step": 42870 + }, + { + "epoch": 3.4730233311730396, + "grad_norm": 0.08462394773960114, + "learning_rate": 2.927674512804357e-05, + "loss": 0.2387, + "step": 42871 + }, + { + "epoch": 3.473104342190538, + "grad_norm": 0.0961620882153511, + "learning_rate": 2.927224447544894e-05, + "loss": 0.2399, + "step": 42872 + }, + { + "epoch": 3.473185353208036, + "grad_norm": 0.07041720300912857, + "learning_rate": 2.9267743822854316e-05, + "loss": 0.1945, + "step": 42873 + }, + { + "epoch": 3.473266364225535, + "grad_norm": 0.08104278892278671, + "learning_rate": 2.926324317025969e-05, + "loss": 0.2402, + "step": 42874 + }, + { + "epoch": 3.473347375243033, + "grad_norm": 0.08936689049005508, + "learning_rate": 2.925874251766506e-05, + "loss": 0.2431, + "step": 42875 + }, + { + "epoch": 3.4734283862605313, + "grad_norm": 0.057427264750003815, + "learning_rate": 2.9254241865070437e-05, + "loss": 0.1859, + "step": 42876 + }, + { + "epoch": 3.47350939727803, + "grad_norm": 0.06501757353544235, + "learning_rate": 2.924974121247581e-05, + "loss": 0.2515, + "step": 42877 + }, + { + "epoch": 3.4735904082955282, + "grad_norm": 0.06856517493724823, + "learning_rate": 2.924524055988118e-05, + "loss": 0.2219, + "step": 42878 + }, + { + "epoch": 3.4736714193130265, + "grad_norm": 0.08182412385940552, + "learning_rate": 2.9240739907286557e-05, + "loss": 0.2265, + "step": 42879 + }, + { + "epoch": 3.473752430330525, + "grad_norm": 0.08065382391214371, + "learning_rate": 2.9236239254691934e-05, + "loss": 0.2306, + "step": 42880 + }, + { + "epoch": 3.4738334413480234, + "grad_norm": 0.08492909371852875, + "learning_rate": 2.92317386020973e-05, + "loss": 0.2243, + "step": 42881 + }, + { + "epoch": 3.4739144523655217, + "grad_norm": 0.07409442961215973, + "learning_rate": 2.9227237949502678e-05, + "loss": 0.2079, + "step": 42882 + }, + { + "epoch": 3.47399546338302, + "grad_norm": 0.06843043118715286, + "learning_rate": 2.9222737296908055e-05, + "loss": 0.2137, + "step": 42883 + }, + { + "epoch": 3.4740764744005186, + "grad_norm": 0.08158732205629349, + "learning_rate": 2.921823664431343e-05, + "loss": 0.218, + "step": 42884 + }, + { + "epoch": 3.474157485418017, + "grad_norm": 0.06149592250585556, + "learning_rate": 2.92137359917188e-05, + "loss": 0.2288, + "step": 42885 + }, + { + "epoch": 3.474238496435515, + "grad_norm": 0.08045820891857147, + "learning_rate": 2.9209235339124176e-05, + "loss": 0.2193, + "step": 42886 + }, + { + "epoch": 3.4743195074530138, + "grad_norm": 0.07448042184114456, + "learning_rate": 2.920473468652955e-05, + "loss": 0.2279, + "step": 42887 + }, + { + "epoch": 3.474400518470512, + "grad_norm": 0.06777680665254593, + "learning_rate": 2.920023403393492e-05, + "loss": 0.2414, + "step": 42888 + }, + { + "epoch": 3.4744815294880103, + "grad_norm": 0.07255148887634277, + "learning_rate": 2.9195733381340297e-05, + "loss": 0.2281, + "step": 42889 + }, + { + "epoch": 3.4745625405055085, + "grad_norm": 0.0675332099199295, + "learning_rate": 2.919123272874567e-05, + 
"loss": 0.1977, + "step": 42890 + }, + { + "epoch": 3.474643551523007, + "grad_norm": 0.07232514023780823, + "learning_rate": 2.918673207615104e-05, + "loss": 0.2118, + "step": 42891 + }, + { + "epoch": 3.4747245625405054, + "grad_norm": 0.07749462872743607, + "learning_rate": 2.9182231423556418e-05, + "loss": 0.2486, + "step": 42892 + }, + { + "epoch": 3.4748055735580037, + "grad_norm": 0.06873171776533127, + "learning_rate": 2.917773077096179e-05, + "loss": 0.2243, + "step": 42893 + }, + { + "epoch": 3.4748865845755024, + "grad_norm": 0.06732004880905151, + "learning_rate": 2.917323011836716e-05, + "loss": 0.2039, + "step": 42894 + }, + { + "epoch": 3.4749675955930006, + "grad_norm": 0.0846250131726265, + "learning_rate": 2.916872946577254e-05, + "loss": 0.2103, + "step": 42895 + }, + { + "epoch": 3.475048606610499, + "grad_norm": 0.07390343397855759, + "learning_rate": 2.9164228813177912e-05, + "loss": 0.1858, + "step": 42896 + }, + { + "epoch": 3.4751296176279975, + "grad_norm": 0.06746575236320496, + "learning_rate": 2.915972816058329e-05, + "loss": 0.2331, + "step": 42897 + }, + { + "epoch": 3.475210628645496, + "grad_norm": 0.08225786685943604, + "learning_rate": 2.915522750798866e-05, + "loss": 0.2094, + "step": 42898 + }, + { + "epoch": 3.475291639662994, + "grad_norm": 0.06650905311107635, + "learning_rate": 2.9150726855394033e-05, + "loss": 0.2149, + "step": 42899 + }, + { + "epoch": 3.4753726506804927, + "grad_norm": 0.06720145791769028, + "learning_rate": 2.914622620279941e-05, + "loss": 0.2508, + "step": 42900 + }, + { + "epoch": 3.475453661697991, + "grad_norm": 0.06689774245023727, + "learning_rate": 2.914172555020478e-05, + "loss": 0.2164, + "step": 42901 + }, + { + "epoch": 3.475534672715489, + "grad_norm": 0.06267905980348587, + "learning_rate": 2.9137224897610154e-05, + "loss": 0.2223, + "step": 42902 + }, + { + "epoch": 3.475615683732988, + "grad_norm": 0.0679597407579422, + "learning_rate": 2.913272424501553e-05, + "loss": 0.208, + "step": 42903 + }, + { + "epoch": 3.475696694750486, + "grad_norm": 0.07159747183322906, + "learning_rate": 2.91282235924209e-05, + "loss": 0.2249, + "step": 42904 + }, + { + "epoch": 3.4757777057679844, + "grad_norm": 0.06298567354679108, + "learning_rate": 2.9123722939826275e-05, + "loss": 0.2318, + "step": 42905 + }, + { + "epoch": 3.4758587167854826, + "grad_norm": 0.0710611492395401, + "learning_rate": 2.911922228723165e-05, + "loss": 0.2177, + "step": 42906 + }, + { + "epoch": 3.4759397278029813, + "grad_norm": 0.05858425050973892, + "learning_rate": 2.9114721634637022e-05, + "loss": 0.2387, + "step": 42907 + }, + { + "epoch": 3.4760207388204796, + "grad_norm": 0.07267910987138748, + "learning_rate": 2.9110220982042395e-05, + "loss": 0.2312, + "step": 42908 + }, + { + "epoch": 3.476101749837978, + "grad_norm": 0.06025031954050064, + "learning_rate": 2.9105720329447772e-05, + "loss": 0.2027, + "step": 42909 + }, + { + "epoch": 3.4761827608554765, + "grad_norm": 0.0847097858786583, + "learning_rate": 2.9101219676853146e-05, + "loss": 0.2421, + "step": 42910 + }, + { + "epoch": 3.4762637718729748, + "grad_norm": 0.08266802877187729, + "learning_rate": 2.9096719024258516e-05, + "loss": 0.2149, + "step": 42911 + }, + { + "epoch": 3.476344782890473, + "grad_norm": 0.06554539501667023, + "learning_rate": 2.9092218371663893e-05, + "loss": 0.2361, + "step": 42912 + }, + { + "epoch": 3.4764257939079712, + "grad_norm": 0.06395899504423141, + "learning_rate": 2.908771771906927e-05, + "loss": 0.2227, + "step": 42913 + }, + { + "epoch": 
3.47650680492547, + "grad_norm": 0.06811178475618362, + "learning_rate": 2.9083217066474637e-05, + "loss": 0.2145, + "step": 42914 + }, + { + "epoch": 3.476587815942968, + "grad_norm": 0.06188002973794937, + "learning_rate": 2.9078716413880014e-05, + "loss": 0.2076, + "step": 42915 + }, + { + "epoch": 3.4766688269604664, + "grad_norm": 0.08311402797698975, + "learning_rate": 2.907421576128539e-05, + "loss": 0.2615, + "step": 42916 + }, + { + "epoch": 3.476749837977965, + "grad_norm": 0.07584516704082489, + "learning_rate": 2.9069715108690758e-05, + "loss": 0.232, + "step": 42917 + }, + { + "epoch": 3.4768308489954634, + "grad_norm": 0.0709504559636116, + "learning_rate": 2.9065214456096135e-05, + "loss": 0.2111, + "step": 42918 + }, + { + "epoch": 3.4769118600129616, + "grad_norm": 0.07124251127243042, + "learning_rate": 2.9060713803501512e-05, + "loss": 0.2293, + "step": 42919 + }, + { + "epoch": 3.4769928710304603, + "grad_norm": 0.06938184797763824, + "learning_rate": 2.905621315090688e-05, + "loss": 0.214, + "step": 42920 + }, + { + "epoch": 3.4770738820479585, + "grad_norm": 0.06147418171167374, + "learning_rate": 2.9051712498312256e-05, + "loss": 0.2575, + "step": 42921 + }, + { + "epoch": 3.4771548930654568, + "grad_norm": 0.0724051296710968, + "learning_rate": 2.9047211845717633e-05, + "loss": 0.2184, + "step": 42922 + }, + { + "epoch": 3.4772359040829555, + "grad_norm": 0.08725505322217941, + "learning_rate": 2.9042711193123006e-05, + "loss": 0.2466, + "step": 42923 + }, + { + "epoch": 3.4773169151004537, + "grad_norm": 0.07237666845321655, + "learning_rate": 2.9038210540528376e-05, + "loss": 0.2514, + "step": 42924 + }, + { + "epoch": 3.477397926117952, + "grad_norm": 0.06182453781366348, + "learning_rate": 2.9033709887933753e-05, + "loss": 0.2208, + "step": 42925 + }, + { + "epoch": 3.4774789371354506, + "grad_norm": 0.07542017102241516, + "learning_rate": 2.9029209235339127e-05, + "loss": 0.261, + "step": 42926 + }, + { + "epoch": 3.477559948152949, + "grad_norm": 0.06871870160102844, + "learning_rate": 2.9024708582744497e-05, + "loss": 0.2279, + "step": 42927 + }, + { + "epoch": 3.477640959170447, + "grad_norm": 0.061738964170217514, + "learning_rate": 2.9020207930149874e-05, + "loss": 0.2213, + "step": 42928 + }, + { + "epoch": 3.4777219701879454, + "grad_norm": 0.06986607611179352, + "learning_rate": 2.9015707277555248e-05, + "loss": 0.2009, + "step": 42929 + }, + { + "epoch": 3.477802981205444, + "grad_norm": 0.10162478685379028, + "learning_rate": 2.9011206624960618e-05, + "loss": 0.2752, + "step": 42930 + }, + { + "epoch": 3.4778839922229423, + "grad_norm": 0.07489980012178421, + "learning_rate": 2.9006705972365995e-05, + "loss": 0.2353, + "step": 42931 + }, + { + "epoch": 3.4779650032404406, + "grad_norm": 0.0648399144411087, + "learning_rate": 2.900220531977137e-05, + "loss": 0.2228, + "step": 42932 + }, + { + "epoch": 3.4780460142579392, + "grad_norm": 0.09491732716560364, + "learning_rate": 2.899770466717674e-05, + "loss": 0.2694, + "step": 42933 + }, + { + "epoch": 3.4781270252754375, + "grad_norm": 0.07386920601129532, + "learning_rate": 2.8993204014582116e-05, + "loss": 0.2206, + "step": 42934 + }, + { + "epoch": 3.4782080362929357, + "grad_norm": 0.08212792128324509, + "learning_rate": 2.898870336198749e-05, + "loss": 0.2421, + "step": 42935 + }, + { + "epoch": 3.478289047310434, + "grad_norm": 0.0611363910138607, + "learning_rate": 2.8984202709392866e-05, + "loss": 0.2237, + "step": 42936 + }, + { + "epoch": 3.4783700583279327, + "grad_norm": 0.07726096361875534, 
+ "learning_rate": 2.8979702056798237e-05, + "loss": 0.277, + "step": 42937 + }, + { + "epoch": 3.478451069345431, + "grad_norm": 0.05522330850362778, + "learning_rate": 2.897520140420361e-05, + "loss": 0.1983, + "step": 42938 + }, + { + "epoch": 3.478532080362929, + "grad_norm": 0.07946809381246567, + "learning_rate": 2.8970700751608987e-05, + "loss": 0.1909, + "step": 42939 + }, + { + "epoch": 3.478613091380428, + "grad_norm": 0.06915449351072311, + "learning_rate": 2.8966200099014357e-05, + "loss": 0.2124, + "step": 42940 + }, + { + "epoch": 3.478694102397926, + "grad_norm": 0.0850968137383461, + "learning_rate": 2.896169944641973e-05, + "loss": 0.2116, + "step": 42941 + }, + { + "epoch": 3.4787751134154243, + "grad_norm": 0.07038792967796326, + "learning_rate": 2.8957198793825108e-05, + "loss": 0.2848, + "step": 42942 + }, + { + "epoch": 3.478856124432923, + "grad_norm": 0.06604676693677902, + "learning_rate": 2.8952698141230478e-05, + "loss": 0.1977, + "step": 42943 + }, + { + "epoch": 3.4789371354504213, + "grad_norm": 0.09424113482236862, + "learning_rate": 2.8948197488635852e-05, + "loss": 0.2524, + "step": 42944 + }, + { + "epoch": 3.4790181464679195, + "grad_norm": 0.08853418380022049, + "learning_rate": 2.894369683604123e-05, + "loss": 0.2285, + "step": 42945 + }, + { + "epoch": 3.479099157485418, + "grad_norm": 0.0836792141199112, + "learning_rate": 2.89391961834466e-05, + "loss": 0.2105, + "step": 42946 + }, + { + "epoch": 3.4791801685029164, + "grad_norm": 0.07371016591787338, + "learning_rate": 2.8934695530851973e-05, + "loss": 0.2769, + "step": 42947 + }, + { + "epoch": 3.4792611795204147, + "grad_norm": 0.08501225709915161, + "learning_rate": 2.893019487825735e-05, + "loss": 0.2182, + "step": 42948 + }, + { + "epoch": 3.4793421905379134, + "grad_norm": 0.07889309525489807, + "learning_rate": 2.8925694225662727e-05, + "loss": 0.2577, + "step": 42949 + }, + { + "epoch": 3.4794232015554116, + "grad_norm": 0.07091579586267471, + "learning_rate": 2.8921193573068093e-05, + "loss": 0.2193, + "step": 42950 + }, + { + "epoch": 3.47950421257291, + "grad_norm": 0.07333512604236603, + "learning_rate": 2.891669292047347e-05, + "loss": 0.1897, + "step": 42951 + }, + { + "epoch": 3.479585223590408, + "grad_norm": 0.061077289283275604, + "learning_rate": 2.8912192267878847e-05, + "loss": 0.2442, + "step": 42952 + }, + { + "epoch": 3.479666234607907, + "grad_norm": 0.0764218419790268, + "learning_rate": 2.8907691615284214e-05, + "loss": 0.2307, + "step": 42953 + }, + { + "epoch": 3.479747245625405, + "grad_norm": 0.05201823636889458, + "learning_rate": 2.890319096268959e-05, + "loss": 0.2369, + "step": 42954 + }, + { + "epoch": 3.4798282566429033, + "grad_norm": 0.07234396040439606, + "learning_rate": 2.8898690310094968e-05, + "loss": 0.223, + "step": 42955 + }, + { + "epoch": 3.479909267660402, + "grad_norm": 0.07794514298439026, + "learning_rate": 2.889418965750034e-05, + "loss": 0.2579, + "step": 42956 + }, + { + "epoch": 3.4799902786779002, + "grad_norm": 0.09208627790212631, + "learning_rate": 2.8889689004905712e-05, + "loss": 0.222, + "step": 42957 + }, + { + "epoch": 3.4800712896953985, + "grad_norm": 0.09116863459348679, + "learning_rate": 2.888518835231109e-05, + "loss": 0.2219, + "step": 42958 + }, + { + "epoch": 3.4801523007128967, + "grad_norm": 0.10785874724388123, + "learning_rate": 2.888068769971646e-05, + "loss": 0.238, + "step": 42959 + }, + { + "epoch": 3.4802333117303954, + "grad_norm": 0.07596461474895477, + "learning_rate": 2.8876187047121833e-05, + "loss": 0.2486, + 
"step": 42960 + }, + { + "epoch": 3.4803143227478937, + "grad_norm": 0.07258222997188568, + "learning_rate": 2.887168639452721e-05, + "loss": 0.2499, + "step": 42961 + }, + { + "epoch": 3.480395333765392, + "grad_norm": 0.07630549371242523, + "learning_rate": 2.8867185741932583e-05, + "loss": 0.2674, + "step": 42962 + }, + { + "epoch": 3.4804763447828906, + "grad_norm": 0.0784044861793518, + "learning_rate": 2.8862685089337954e-05, + "loss": 0.205, + "step": 42963 + }, + { + "epoch": 3.480557355800389, + "grad_norm": 0.06193476542830467, + "learning_rate": 2.885818443674333e-05, + "loss": 0.1964, + "step": 42964 + }, + { + "epoch": 3.480638366817887, + "grad_norm": 0.07275792956352234, + "learning_rate": 2.8853683784148704e-05, + "loss": 0.2368, + "step": 42965 + }, + { + "epoch": 3.4807193778353858, + "grad_norm": 0.07585861533880234, + "learning_rate": 2.8849183131554074e-05, + "loss": 0.2616, + "step": 42966 + }, + { + "epoch": 3.480800388852884, + "grad_norm": 0.06550201028585434, + "learning_rate": 2.884468247895945e-05, + "loss": 0.2009, + "step": 42967 + }, + { + "epoch": 3.4808813998703823, + "grad_norm": 0.0743786096572876, + "learning_rate": 2.8840181826364825e-05, + "loss": 0.2461, + "step": 42968 + }, + { + "epoch": 3.480962410887881, + "grad_norm": 0.0645429864525795, + "learning_rate": 2.8835681173770195e-05, + "loss": 0.2319, + "step": 42969 + }, + { + "epoch": 3.481043421905379, + "grad_norm": 0.07213232666254044, + "learning_rate": 2.8831180521175572e-05, + "loss": 0.2039, + "step": 42970 + }, + { + "epoch": 3.4811244329228774, + "grad_norm": 0.06800594180822372, + "learning_rate": 2.8826679868580946e-05, + "loss": 0.2208, + "step": 42971 + }, + { + "epoch": 3.481205443940376, + "grad_norm": 0.06744236499071121, + "learning_rate": 2.8822179215986316e-05, + "loss": 0.2511, + "step": 42972 + }, + { + "epoch": 3.4812864549578744, + "grad_norm": 0.07173527777194977, + "learning_rate": 2.8817678563391693e-05, + "loss": 0.2388, + "step": 42973 + }, + { + "epoch": 3.4813674659753726, + "grad_norm": 0.07332679629325867, + "learning_rate": 2.8813177910797067e-05, + "loss": 0.2502, + "step": 42974 + }, + { + "epoch": 3.481448476992871, + "grad_norm": 0.0669303610920906, + "learning_rate": 2.8808677258202444e-05, + "loss": 0.2514, + "step": 42975 + }, + { + "epoch": 3.4815294880103695, + "grad_norm": 0.07635664939880371, + "learning_rate": 2.8804176605607814e-05, + "loss": 0.2145, + "step": 42976 + }, + { + "epoch": 3.481610499027868, + "grad_norm": 0.06254716217517853, + "learning_rate": 2.8799675953013188e-05, + "loss": 0.2254, + "step": 42977 + }, + { + "epoch": 3.481691510045366, + "grad_norm": 0.0803760215640068, + "learning_rate": 2.8795175300418565e-05, + "loss": 0.2156, + "step": 42978 + }, + { + "epoch": 3.4817725210628647, + "grad_norm": 0.07645880430936813, + "learning_rate": 2.8790674647823935e-05, + "loss": 0.2409, + "step": 42979 + }, + { + "epoch": 3.481853532080363, + "grad_norm": 0.06853979080915451, + "learning_rate": 2.878617399522931e-05, + "loss": 0.2102, + "step": 42980 + }, + { + "epoch": 3.481934543097861, + "grad_norm": 0.07574337720870972, + "learning_rate": 2.8781673342634685e-05, + "loss": 0.2162, + "step": 42981 + }, + { + "epoch": 3.4820155541153595, + "grad_norm": 0.07896383106708527, + "learning_rate": 2.8777172690040056e-05, + "loss": 0.2535, + "step": 42982 + }, + { + "epoch": 3.482096565132858, + "grad_norm": 0.07682851701974869, + "learning_rate": 2.877267203744543e-05, + "loss": 0.2731, + "step": 42983 + }, + { + "epoch": 3.4821775761503564, + 
"grad_norm": 0.07519050687551498, + "learning_rate": 2.8768171384850806e-05, + "loss": 0.2795, + "step": 42984 + }, + { + "epoch": 3.4822585871678546, + "grad_norm": 0.07776078581809998, + "learning_rate": 2.8763670732256176e-05, + "loss": 0.2527, + "step": 42985 + }, + { + "epoch": 3.4823395981853533, + "grad_norm": 0.07055551558732986, + "learning_rate": 2.875917007966155e-05, + "loss": 0.2383, + "step": 42986 + }, + { + "epoch": 3.4824206092028516, + "grad_norm": 0.06520364433526993, + "learning_rate": 2.8754669427066927e-05, + "loss": 0.2551, + "step": 42987 + }, + { + "epoch": 3.48250162022035, + "grad_norm": 0.08810622990131378, + "learning_rate": 2.8750168774472304e-05, + "loss": 0.2372, + "step": 42988 + }, + { + "epoch": 3.4825826312378485, + "grad_norm": 0.08560158312320709, + "learning_rate": 2.874566812187767e-05, + "loss": 0.2731, + "step": 42989 + }, + { + "epoch": 3.4826636422553467, + "grad_norm": 0.07116416841745377, + "learning_rate": 2.8741167469283048e-05, + "loss": 0.2183, + "step": 42990 + }, + { + "epoch": 3.482744653272845, + "grad_norm": 0.08705449104309082, + "learning_rate": 2.8736666816688425e-05, + "loss": 0.2435, + "step": 42991 + }, + { + "epoch": 3.4828256642903437, + "grad_norm": 0.07582302391529083, + "learning_rate": 2.8732166164093795e-05, + "loss": 0.2204, + "step": 42992 + }, + { + "epoch": 3.482906675307842, + "grad_norm": 0.05476190522313118, + "learning_rate": 2.872766551149917e-05, + "loss": 0.1792, + "step": 42993 + }, + { + "epoch": 3.48298768632534, + "grad_norm": 0.07866176217794418, + "learning_rate": 2.8723164858904546e-05, + "loss": 0.2561, + "step": 42994 + }, + { + "epoch": 3.483068697342839, + "grad_norm": 0.0897442102432251, + "learning_rate": 2.8718664206309916e-05, + "loss": 0.2552, + "step": 42995 + }, + { + "epoch": 3.483149708360337, + "grad_norm": 0.07748270034790039, + "learning_rate": 2.871416355371529e-05, + "loss": 0.246, + "step": 42996 + }, + { + "epoch": 3.4832307193778353, + "grad_norm": 0.07693302631378174, + "learning_rate": 2.8709662901120666e-05, + "loss": 0.2287, + "step": 42997 + }, + { + "epoch": 3.4833117303953336, + "grad_norm": 0.07699127495288849, + "learning_rate": 2.8705162248526037e-05, + "loss": 0.2412, + "step": 42998 + }, + { + "epoch": 3.4833927414128323, + "grad_norm": 0.0830434113740921, + "learning_rate": 2.870066159593141e-05, + "loss": 0.2375, + "step": 42999 + }, + { + "epoch": 3.4834737524303305, + "grad_norm": 0.08280211687088013, + "learning_rate": 2.8696160943336787e-05, + "loss": 0.2231, + "step": 43000 + }, + { + "epoch": 3.4835547634478288, + "grad_norm": 0.07986078411340714, + "learning_rate": 2.869166029074216e-05, + "loss": 0.2447, + "step": 43001 + }, + { + "epoch": 3.4836357744653275, + "grad_norm": 0.061485256999731064, + "learning_rate": 2.868715963814753e-05, + "loss": 0.212, + "step": 43002 + }, + { + "epoch": 3.4837167854828257, + "grad_norm": 0.06983110308647156, + "learning_rate": 2.8682658985552908e-05, + "loss": 0.2147, + "step": 43003 + }, + { + "epoch": 3.483797796500324, + "grad_norm": 0.07056194543838501, + "learning_rate": 2.867815833295828e-05, + "loss": 0.245, + "step": 43004 + }, + { + "epoch": 3.483878807517822, + "grad_norm": 0.06419837474822998, + "learning_rate": 2.8673657680363652e-05, + "loss": 0.1985, + "step": 43005 + }, + { + "epoch": 3.483959818535321, + "grad_norm": 0.08814400434494019, + "learning_rate": 2.866915702776903e-05, + "loss": 0.2252, + "step": 43006 + }, + { + "epoch": 3.484040829552819, + "grad_norm": 0.051433440297842026, + "learning_rate": 
2.8664656375174402e-05, + "loss": 0.2354, + "step": 43007 + }, + { + "epoch": 3.4841218405703174, + "grad_norm": 0.0714108794927597, + "learning_rate": 2.8660155722579773e-05, + "loss": 0.26, + "step": 43008 + }, + { + "epoch": 3.484202851587816, + "grad_norm": 0.07795058935880661, + "learning_rate": 2.865565506998515e-05, + "loss": 0.2651, + "step": 43009 + }, + { + "epoch": 3.4842838626053143, + "grad_norm": 0.08348504453897476, + "learning_rate": 2.8651154417390523e-05, + "loss": 0.2096, + "step": 43010 + }, + { + "epoch": 3.4843648736228126, + "grad_norm": 0.07615520060062408, + "learning_rate": 2.8646653764795893e-05, + "loss": 0.2437, + "step": 43011 + }, + { + "epoch": 3.4844458846403112, + "grad_norm": 0.061513423919677734, + "learning_rate": 2.864215311220127e-05, + "loss": 0.237, + "step": 43012 + }, + { + "epoch": 3.4845268956578095, + "grad_norm": 0.06726951152086258, + "learning_rate": 2.8637652459606644e-05, + "loss": 0.2385, + "step": 43013 + }, + { + "epoch": 3.4846079066753077, + "grad_norm": 0.05101010948419571, + "learning_rate": 2.863315180701202e-05, + "loss": 0.167, + "step": 43014 + }, + { + "epoch": 3.4846889176928064, + "grad_norm": 0.07691899687051773, + "learning_rate": 2.862865115441739e-05, + "loss": 0.2779, + "step": 43015 + }, + { + "epoch": 3.4847699287103047, + "grad_norm": 0.06863217055797577, + "learning_rate": 2.8624150501822765e-05, + "loss": 0.252, + "step": 43016 + }, + { + "epoch": 3.484850939727803, + "grad_norm": 0.06520593166351318, + "learning_rate": 2.8619649849228142e-05, + "loss": 0.2115, + "step": 43017 + }, + { + "epoch": 3.4849319507453016, + "grad_norm": 0.07795144617557526, + "learning_rate": 2.8615149196633512e-05, + "loss": 0.24, + "step": 43018 + }, + { + "epoch": 3.4850129617628, + "grad_norm": 0.06358601897954941, + "learning_rate": 2.8610648544038886e-05, + "loss": 0.2066, + "step": 43019 + }, + { + "epoch": 3.485093972780298, + "grad_norm": 0.06517867743968964, + "learning_rate": 2.8606147891444263e-05, + "loss": 0.2033, + "step": 43020 + }, + { + "epoch": 3.4851749837977963, + "grad_norm": 0.07494281232357025, + "learning_rate": 2.8601647238849633e-05, + "loss": 0.2197, + "step": 43021 + }, + { + "epoch": 3.485255994815295, + "grad_norm": 0.062105692923069, + "learning_rate": 2.8597146586255006e-05, + "loss": 0.1766, + "step": 43022 + }, + { + "epoch": 3.4853370058327933, + "grad_norm": 0.07513219118118286, + "learning_rate": 2.8592645933660383e-05, + "loss": 0.2344, + "step": 43023 + }, + { + "epoch": 3.4854180168502915, + "grad_norm": 0.06747517734766006, + "learning_rate": 2.8588145281065754e-05, + "loss": 0.1994, + "step": 43024 + }, + { + "epoch": 3.48549902786779, + "grad_norm": 0.08939173817634583, + "learning_rate": 2.858364462847113e-05, + "loss": 0.2326, + "step": 43025 + }, + { + "epoch": 3.4855800388852884, + "grad_norm": 0.0805366039276123, + "learning_rate": 2.8579143975876504e-05, + "loss": 0.2367, + "step": 43026 + }, + { + "epoch": 3.4856610499027867, + "grad_norm": 0.07593267410993576, + "learning_rate": 2.857464332328188e-05, + "loss": 0.2276, + "step": 43027 + }, + { + "epoch": 3.485742060920285, + "grad_norm": 0.07195086777210236, + "learning_rate": 2.857014267068725e-05, + "loss": 0.24, + "step": 43028 + }, + { + "epoch": 3.4858230719377836, + "grad_norm": 0.07583434879779816, + "learning_rate": 2.8565642018092625e-05, + "loss": 0.2191, + "step": 43029 + }, + { + "epoch": 3.485904082955282, + "grad_norm": 0.08318190276622772, + "learning_rate": 2.8561141365498002e-05, + "loss": 0.2208, + "step": 43030 + }, + 
{ + "epoch": 3.48598509397278, + "grad_norm": 0.06685812771320343, + "learning_rate": 2.8556640712903372e-05, + "loss": 0.2368, + "step": 43031 + }, + { + "epoch": 3.486066104990279, + "grad_norm": 0.062124937772750854, + "learning_rate": 2.8552140060308746e-05, + "loss": 0.2145, + "step": 43032 + }, + { + "epoch": 3.486147116007777, + "grad_norm": 0.07486393302679062, + "learning_rate": 2.8547639407714123e-05, + "loss": 0.2283, + "step": 43033 + }, + { + "epoch": 3.4862281270252753, + "grad_norm": 0.07032036036252975, + "learning_rate": 2.8543138755119493e-05, + "loss": 0.2012, + "step": 43034 + }, + { + "epoch": 3.486309138042774, + "grad_norm": 0.07372421026229858, + "learning_rate": 2.8538638102524867e-05, + "loss": 0.2715, + "step": 43035 + }, + { + "epoch": 3.4863901490602722, + "grad_norm": 0.06832101941108704, + "learning_rate": 2.8534137449930244e-05, + "loss": 0.2423, + "step": 43036 + }, + { + "epoch": 3.4864711600777705, + "grad_norm": 0.07774647325277328, + "learning_rate": 2.8529636797335614e-05, + "loss": 0.1948, + "step": 43037 + }, + { + "epoch": 3.486552171095269, + "grad_norm": 0.07262618094682693, + "learning_rate": 2.8525136144740987e-05, + "loss": 0.2192, + "step": 43038 + }, + { + "epoch": 3.4866331821127674, + "grad_norm": 0.07207204401493073, + "learning_rate": 2.8520635492146364e-05, + "loss": 0.2241, + "step": 43039 + }, + { + "epoch": 3.4867141931302656, + "grad_norm": 0.07984203100204468, + "learning_rate": 2.8516134839551735e-05, + "loss": 0.2085, + "step": 43040 + }, + { + "epoch": 3.4867952041477643, + "grad_norm": 0.07520009577274323, + "learning_rate": 2.8511634186957108e-05, + "loss": 0.2282, + "step": 43041 + }, + { + "epoch": 3.4868762151652626, + "grad_norm": 0.07236367464065552, + "learning_rate": 2.8507133534362485e-05, + "loss": 0.2311, + "step": 43042 + }, + { + "epoch": 3.486957226182761, + "grad_norm": 0.09670182317495346, + "learning_rate": 2.850263288176786e-05, + "loss": 0.2533, + "step": 43043 + }, + { + "epoch": 3.487038237200259, + "grad_norm": 0.060888949781656265, + "learning_rate": 2.849813222917323e-05, + "loss": 0.223, + "step": 43044 + }, + { + "epoch": 3.4871192482177578, + "grad_norm": 0.07887989282608032, + "learning_rate": 2.8493631576578606e-05, + "loss": 0.2212, + "step": 43045 + }, + { + "epoch": 3.487200259235256, + "grad_norm": 0.06833384931087494, + "learning_rate": 2.848913092398398e-05, + "loss": 0.1889, + "step": 43046 + }, + { + "epoch": 3.4872812702527543, + "grad_norm": 0.06222878396511078, + "learning_rate": 2.848463027138935e-05, + "loss": 0.1947, + "step": 43047 + }, + { + "epoch": 3.487362281270253, + "grad_norm": 0.0773792639374733, + "learning_rate": 2.8480129618794727e-05, + "loss": 0.2441, + "step": 43048 + }, + { + "epoch": 3.487443292287751, + "grad_norm": 0.06242474913597107, + "learning_rate": 2.84756289662001e-05, + "loss": 0.2325, + "step": 43049 + }, + { + "epoch": 3.4875243033052494, + "grad_norm": 0.06994125247001648, + "learning_rate": 2.847112831360547e-05, + "loss": 0.204, + "step": 43050 + }, + { + "epoch": 3.4876053143227477, + "grad_norm": 0.07311102002859116, + "learning_rate": 2.8466627661010848e-05, + "loss": 0.2739, + "step": 43051 + }, + { + "epoch": 3.4876863253402464, + "grad_norm": 0.08544214069843292, + "learning_rate": 2.846212700841622e-05, + "loss": 0.2583, + "step": 43052 + }, + { + "epoch": 3.4877673363577446, + "grad_norm": 0.07062707096338272, + "learning_rate": 2.845762635582159e-05, + "loss": 0.2504, + "step": 43053 + }, + { + "epoch": 3.487848347375243, + "grad_norm": 
0.076682910323143, + "learning_rate": 2.845312570322697e-05, + "loss": 0.2503, + "step": 43054 + }, + { + "epoch": 3.4879293583927415, + "grad_norm": 0.07357282936573029, + "learning_rate": 2.8448625050632342e-05, + "loss": 0.2166, + "step": 43055 + }, + { + "epoch": 3.48801036941024, + "grad_norm": 0.059687837958335876, + "learning_rate": 2.844412439803772e-05, + "loss": 0.2396, + "step": 43056 + }, + { + "epoch": 3.488091380427738, + "grad_norm": 0.06864503026008606, + "learning_rate": 2.843962374544309e-05, + "loss": 0.237, + "step": 43057 + }, + { + "epoch": 3.4881723914452367, + "grad_norm": 0.05564036965370178, + "learning_rate": 2.8435123092848466e-05, + "loss": 0.2285, + "step": 43058 + }, + { + "epoch": 3.488253402462735, + "grad_norm": 0.07747586816549301, + "learning_rate": 2.843062244025384e-05, + "loss": 0.2396, + "step": 43059 + }, + { + "epoch": 3.488334413480233, + "grad_norm": 0.08911630511283875, + "learning_rate": 2.842612178765921e-05, + "loss": 0.2395, + "step": 43060 + }, + { + "epoch": 3.488415424497732, + "grad_norm": 0.08266628533601761, + "learning_rate": 2.8421621135064587e-05, + "loss": 0.2303, + "step": 43061 + }, + { + "epoch": 3.48849643551523, + "grad_norm": 0.0728074312210083, + "learning_rate": 2.841712048246996e-05, + "loss": 0.2531, + "step": 43062 + }, + { + "epoch": 3.4885774465327284, + "grad_norm": 0.07155589014291763, + "learning_rate": 2.841261982987533e-05, + "loss": 0.2534, + "step": 43063 + }, + { + "epoch": 3.488658457550227, + "grad_norm": 0.07294462621212006, + "learning_rate": 2.8408119177280708e-05, + "loss": 0.2265, + "step": 43064 + }, + { + "epoch": 3.4887394685677253, + "grad_norm": 0.06267351657152176, + "learning_rate": 2.840361852468608e-05, + "loss": 0.1909, + "step": 43065 + }, + { + "epoch": 3.4888204795852236, + "grad_norm": 0.07813186943531036, + "learning_rate": 2.8399117872091452e-05, + "loss": 0.265, + "step": 43066 + }, + { + "epoch": 3.488901490602722, + "grad_norm": 0.055375028401613235, + "learning_rate": 2.839461721949683e-05, + "loss": 0.1829, + "step": 43067 + }, + { + "epoch": 3.4889825016202205, + "grad_norm": 0.07581090927124023, + "learning_rate": 2.8390116566902202e-05, + "loss": 0.2428, + "step": 43068 + }, + { + "epoch": 3.4890635126377187, + "grad_norm": 0.07282670587301254, + "learning_rate": 2.838561591430758e-05, + "loss": 0.2169, + "step": 43069 + }, + { + "epoch": 3.489144523655217, + "grad_norm": 0.08286745846271515, + "learning_rate": 2.838111526171295e-05, + "loss": 0.2237, + "step": 43070 + }, + { + "epoch": 3.4892255346727157, + "grad_norm": 0.07731908559799194, + "learning_rate": 2.8376614609118323e-05, + "loss": 0.1856, + "step": 43071 + }, + { + "epoch": 3.489306545690214, + "grad_norm": 0.07564588636159897, + "learning_rate": 2.83721139565237e-05, + "loss": 0.2365, + "step": 43072 + }, + { + "epoch": 3.489387556707712, + "grad_norm": 0.07171668857336044, + "learning_rate": 2.836761330392907e-05, + "loss": 0.2576, + "step": 43073 + }, + { + "epoch": 3.4894685677252104, + "grad_norm": 0.06701047718524933, + "learning_rate": 2.8363112651334444e-05, + "loss": 0.2432, + "step": 43074 + }, + { + "epoch": 3.489549578742709, + "grad_norm": 0.0751776471734047, + "learning_rate": 2.835861199873982e-05, + "loss": 0.2124, + "step": 43075 + }, + { + "epoch": 3.4896305897602073, + "grad_norm": 0.06368733197450638, + "learning_rate": 2.835411134614519e-05, + "loss": 0.2255, + "step": 43076 + }, + { + "epoch": 3.4897116007777056, + "grad_norm": 0.06492561846971512, + "learning_rate": 2.8349610693550565e-05, + 
"loss": 0.1783, + "step": 43077 + }, + { + "epoch": 3.4897926117952043, + "grad_norm": 0.0625244602560997, + "learning_rate": 2.8345110040955942e-05, + "loss": 0.1977, + "step": 43078 + }, + { + "epoch": 3.4898736228127025, + "grad_norm": 0.07491852343082428, + "learning_rate": 2.8340609388361312e-05, + "loss": 0.2121, + "step": 43079 + }, + { + "epoch": 3.4899546338302008, + "grad_norm": 0.06318768858909607, + "learning_rate": 2.8336108735766686e-05, + "loss": 0.191, + "step": 43080 + }, + { + "epoch": 3.4900356448476995, + "grad_norm": 0.07628298550844193, + "learning_rate": 2.8331608083172063e-05, + "loss": 0.2084, + "step": 43081 + }, + { + "epoch": 3.4901166558651977, + "grad_norm": 0.09012982994318008, + "learning_rate": 2.8327107430577436e-05, + "loss": 0.2461, + "step": 43082 + }, + { + "epoch": 3.490197666882696, + "grad_norm": 0.07702943682670593, + "learning_rate": 2.8322606777982806e-05, + "loss": 0.2149, + "step": 43083 + }, + { + "epoch": 3.4902786779001946, + "grad_norm": 0.06603427976369858, + "learning_rate": 2.8318106125388183e-05, + "loss": 0.2223, + "step": 43084 + }, + { + "epoch": 3.490359688917693, + "grad_norm": 0.06976785510778427, + "learning_rate": 2.8313605472793557e-05, + "loss": 0.2085, + "step": 43085 + }, + { + "epoch": 3.490440699935191, + "grad_norm": 0.05959775671362877, + "learning_rate": 2.8309104820198927e-05, + "loss": 0.2169, + "step": 43086 + }, + { + "epoch": 3.4905217109526894, + "grad_norm": 0.06909187883138657, + "learning_rate": 2.8304604167604304e-05, + "loss": 0.218, + "step": 43087 + }, + { + "epoch": 3.490602721970188, + "grad_norm": 0.08458421379327774, + "learning_rate": 2.8300103515009678e-05, + "loss": 0.2573, + "step": 43088 + }, + { + "epoch": 3.4906837329876863, + "grad_norm": 0.06887239217758179, + "learning_rate": 2.8295602862415048e-05, + "loss": 0.2332, + "step": 43089 + }, + { + "epoch": 3.4907647440051845, + "grad_norm": 0.08373768627643585, + "learning_rate": 2.8291102209820425e-05, + "loss": 0.2235, + "step": 43090 + }, + { + "epoch": 3.4908457550226832, + "grad_norm": 0.0646241307258606, + "learning_rate": 2.82866015572258e-05, + "loss": 0.1888, + "step": 43091 + }, + { + "epoch": 3.4909267660401815, + "grad_norm": 0.06446173042058945, + "learning_rate": 2.828210090463117e-05, + "loss": 0.1878, + "step": 43092 + }, + { + "epoch": 3.4910077770576797, + "grad_norm": 0.07939767837524414, + "learning_rate": 2.8277600252036546e-05, + "loss": 0.2235, + "step": 43093 + }, + { + "epoch": 3.4910887880751784, + "grad_norm": 0.07496839016675949, + "learning_rate": 2.8273099599441923e-05, + "loss": 0.1852, + "step": 43094 + }, + { + "epoch": 3.4911697990926767, + "grad_norm": 0.08608748018741608, + "learning_rate": 2.8268598946847296e-05, + "loss": 0.2398, + "step": 43095 + }, + { + "epoch": 3.491250810110175, + "grad_norm": 0.07695958018302917, + "learning_rate": 2.8264098294252667e-05, + "loss": 0.2398, + "step": 43096 + }, + { + "epoch": 3.491331821127673, + "grad_norm": 0.07370971143245697, + "learning_rate": 2.8259597641658044e-05, + "loss": 0.2336, + "step": 43097 + }, + { + "epoch": 3.491412832145172, + "grad_norm": 0.08485149592161179, + "learning_rate": 2.8255096989063417e-05, + "loss": 0.2637, + "step": 43098 + }, + { + "epoch": 3.49149384316267, + "grad_norm": 0.07185976207256317, + "learning_rate": 2.8250596336468787e-05, + "loss": 0.238, + "step": 43099 + }, + { + "epoch": 3.4915748541801683, + "grad_norm": 0.06783389300107956, + "learning_rate": 2.8246095683874164e-05, + "loss": 0.1895, + "step": 43100 + }, + { + "epoch": 
3.491655865197667, + "grad_norm": 0.07479429244995117, + "learning_rate": 2.8241595031279538e-05, + "loss": 0.2595, + "step": 43101 + }, + { + "epoch": 3.4917368762151653, + "grad_norm": 0.08388431370258331, + "learning_rate": 2.8237094378684908e-05, + "loss": 0.2347, + "step": 43102 + }, + { + "epoch": 3.4918178872326635, + "grad_norm": 0.07217047363519669, + "learning_rate": 2.8232593726090285e-05, + "loss": 0.2323, + "step": 43103 + }, + { + "epoch": 3.491898898250162, + "grad_norm": 0.07123685628175735, + "learning_rate": 2.822809307349566e-05, + "loss": 0.3135, + "step": 43104 + }, + { + "epoch": 3.4919799092676604, + "grad_norm": 0.06716802716255188, + "learning_rate": 2.822359242090103e-05, + "loss": 0.1935, + "step": 43105 + }, + { + "epoch": 3.4920609202851587, + "grad_norm": 0.06284788250923157, + "learning_rate": 2.8219091768306406e-05, + "loss": 0.204, + "step": 43106 + }, + { + "epoch": 3.4921419313026574, + "grad_norm": 0.0679827407002449, + "learning_rate": 2.821459111571178e-05, + "loss": 0.2349, + "step": 43107 + }, + { + "epoch": 3.4922229423201556, + "grad_norm": 0.07266151905059814, + "learning_rate": 2.8210090463117157e-05, + "loss": 0.2226, + "step": 43108 + }, + { + "epoch": 3.492303953337654, + "grad_norm": 0.08697673678398132, + "learning_rate": 2.8205589810522527e-05, + "loss": 0.255, + "step": 43109 + }, + { + "epoch": 3.492384964355152, + "grad_norm": 0.0750865638256073, + "learning_rate": 2.82010891579279e-05, + "loss": 0.2417, + "step": 43110 + }, + { + "epoch": 3.492465975372651, + "grad_norm": 0.07341280579566956, + "learning_rate": 2.8196588505333277e-05, + "loss": 0.2504, + "step": 43111 + }, + { + "epoch": 3.492546986390149, + "grad_norm": 0.07102368026971817, + "learning_rate": 2.8192087852738648e-05, + "loss": 0.2217, + "step": 43112 + }, + { + "epoch": 3.4926279974076473, + "grad_norm": 0.06863326579332352, + "learning_rate": 2.818758720014402e-05, + "loss": 0.1962, + "step": 43113 + }, + { + "epoch": 3.492709008425146, + "grad_norm": 0.0682842880487442, + "learning_rate": 2.8183086547549398e-05, + "loss": 0.2033, + "step": 43114 + }, + { + "epoch": 3.4927900194426442, + "grad_norm": 0.07887216657400131, + "learning_rate": 2.817858589495477e-05, + "loss": 0.2345, + "step": 43115 + }, + { + "epoch": 3.4928710304601425, + "grad_norm": 0.08103575557470322, + "learning_rate": 2.8174085242360142e-05, + "loss": 0.2348, + "step": 43116 + }, + { + "epoch": 3.4929520414776407, + "grad_norm": 0.07701174169778824, + "learning_rate": 2.816958458976552e-05, + "loss": 0.2695, + "step": 43117 + }, + { + "epoch": 3.4930330524951394, + "grad_norm": 0.06716512143611908, + "learning_rate": 2.816508393717089e-05, + "loss": 0.224, + "step": 43118 + }, + { + "epoch": 3.4931140635126376, + "grad_norm": 0.08580772578716278, + "learning_rate": 2.8160583284576263e-05, + "loss": 0.2444, + "step": 43119 + }, + { + "epoch": 3.493195074530136, + "grad_norm": 0.06725557148456573, + "learning_rate": 2.815608263198164e-05, + "loss": 0.2023, + "step": 43120 + }, + { + "epoch": 3.4932760855476346, + "grad_norm": 0.06230286508798599, + "learning_rate": 2.8151581979387014e-05, + "loss": 0.2167, + "step": 43121 + }, + { + "epoch": 3.493357096565133, + "grad_norm": 0.08195045590400696, + "learning_rate": 2.8147081326792384e-05, + "loss": 0.2101, + "step": 43122 + }, + { + "epoch": 3.493438107582631, + "grad_norm": 0.06331407278776169, + "learning_rate": 2.814258067419776e-05, + "loss": 0.2322, + "step": 43123 + }, + { + "epoch": 3.4935191186001298, + "grad_norm": 0.07344713062047958, + 
"learning_rate": 2.8138080021603134e-05, + "loss": 0.2243, + "step": 43124 + }, + { + "epoch": 3.493600129617628, + "grad_norm": 0.07891742140054703, + "learning_rate": 2.8133579369008505e-05, + "loss": 0.2219, + "step": 43125 + }, + { + "epoch": 3.4936811406351262, + "grad_norm": 0.06340258568525314, + "learning_rate": 2.812907871641388e-05, + "loss": 0.23, + "step": 43126 + }, + { + "epoch": 3.493762151652625, + "grad_norm": 0.07598544657230377, + "learning_rate": 2.812457806381926e-05, + "loss": 0.247, + "step": 43127 + }, + { + "epoch": 3.493843162670123, + "grad_norm": 0.07023898512125015, + "learning_rate": 2.8120077411224625e-05, + "loss": 0.2427, + "step": 43128 + }, + { + "epoch": 3.4939241736876214, + "grad_norm": 0.06609073281288147, + "learning_rate": 2.8115576758630002e-05, + "loss": 0.1982, + "step": 43129 + }, + { + "epoch": 3.49400518470512, + "grad_norm": 0.0786251425743103, + "learning_rate": 2.811107610603538e-05, + "loss": 0.2497, + "step": 43130 + }, + { + "epoch": 3.4940861957226184, + "grad_norm": 0.0619148351252079, + "learning_rate": 2.8106575453440746e-05, + "loss": 0.2363, + "step": 43131 + }, + { + "epoch": 3.4941672067401166, + "grad_norm": 0.06817974150180817, + "learning_rate": 2.8102074800846123e-05, + "loss": 0.2371, + "step": 43132 + }, + { + "epoch": 3.494248217757615, + "grad_norm": 0.053594715893268585, + "learning_rate": 2.80975741482515e-05, + "loss": 0.1976, + "step": 43133 + }, + { + "epoch": 3.4943292287751135, + "grad_norm": 0.07943201065063477, + "learning_rate": 2.8093073495656874e-05, + "loss": 0.2033, + "step": 43134 + }, + { + "epoch": 3.494410239792612, + "grad_norm": 0.07399984449148178, + "learning_rate": 2.8088572843062244e-05, + "loss": 0.2317, + "step": 43135 + }, + { + "epoch": 3.49449125081011, + "grad_norm": 0.07094179093837738, + "learning_rate": 2.808407219046762e-05, + "loss": 0.2202, + "step": 43136 + }, + { + "epoch": 3.4945722618276087, + "grad_norm": 0.08724545687437057, + "learning_rate": 2.8079571537872995e-05, + "loss": 0.253, + "step": 43137 + }, + { + "epoch": 3.494653272845107, + "grad_norm": 0.07042510062456131, + "learning_rate": 2.8075070885278365e-05, + "loss": 0.1907, + "step": 43138 + }, + { + "epoch": 3.494734283862605, + "grad_norm": 0.08044608682394028, + "learning_rate": 2.8070570232683742e-05, + "loss": 0.243, + "step": 43139 + }, + { + "epoch": 3.4948152948801035, + "grad_norm": 0.062232423573732376, + "learning_rate": 2.8066069580089115e-05, + "loss": 0.2121, + "step": 43140 + }, + { + "epoch": 3.494896305897602, + "grad_norm": 0.07762039452791214, + "learning_rate": 2.8061568927494486e-05, + "loss": 0.2481, + "step": 43141 + }, + { + "epoch": 3.4949773169151004, + "grad_norm": 0.0662919357419014, + "learning_rate": 2.8057068274899863e-05, + "loss": 0.2062, + "step": 43142 + }, + { + "epoch": 3.4950583279325986, + "grad_norm": 0.08298102766275406, + "learning_rate": 2.8052567622305236e-05, + "loss": 0.2362, + "step": 43143 + }, + { + "epoch": 3.4951393389500973, + "grad_norm": 0.08471732586622238, + "learning_rate": 2.8048066969710606e-05, + "loss": 0.2174, + "step": 43144 + }, + { + "epoch": 3.4952203499675956, + "grad_norm": 0.06788258254528046, + "learning_rate": 2.8043566317115983e-05, + "loss": 0.2386, + "step": 43145 + }, + { + "epoch": 3.495301360985094, + "grad_norm": 0.07684889435768127, + "learning_rate": 2.8039065664521357e-05, + "loss": 0.2072, + "step": 43146 + }, + { + "epoch": 3.4953823720025925, + "grad_norm": 0.07309133559465408, + "learning_rate": 2.8034565011926734e-05, + "loss": 0.2119, + 
"step": 43147 + }, + { + "epoch": 3.4954633830200907, + "grad_norm": 0.08018827438354492, + "learning_rate": 2.8030064359332104e-05, + "loss": 0.2341, + "step": 43148 + }, + { + "epoch": 3.495544394037589, + "grad_norm": 0.0783516988158226, + "learning_rate": 2.8025563706737478e-05, + "loss": 0.2104, + "step": 43149 + }, + { + "epoch": 3.4956254050550877, + "grad_norm": 0.07192262262105942, + "learning_rate": 2.8021063054142855e-05, + "loss": 0.2282, + "step": 43150 + }, + { + "epoch": 3.495706416072586, + "grad_norm": 0.07899194210767746, + "learning_rate": 2.8016562401548225e-05, + "loss": 0.2352, + "step": 43151 + }, + { + "epoch": 3.495787427090084, + "grad_norm": 0.07733827829360962, + "learning_rate": 2.80120617489536e-05, + "loss": 0.2129, + "step": 43152 + }, + { + "epoch": 3.495868438107583, + "grad_norm": 0.0802890807390213, + "learning_rate": 2.8007561096358976e-05, + "loss": 0.202, + "step": 43153 + }, + { + "epoch": 3.495949449125081, + "grad_norm": 0.08393776416778564, + "learning_rate": 2.8003060443764346e-05, + "loss": 0.22, + "step": 43154 + }, + { + "epoch": 3.4960304601425793, + "grad_norm": 0.06426830589771271, + "learning_rate": 2.799855979116972e-05, + "loss": 0.2162, + "step": 43155 + }, + { + "epoch": 3.4961114711600776, + "grad_norm": 0.06880275160074234, + "learning_rate": 2.7994059138575096e-05, + "loss": 0.245, + "step": 43156 + }, + { + "epoch": 3.4961924821775763, + "grad_norm": 0.05830361321568489, + "learning_rate": 2.7989558485980467e-05, + "loss": 0.1964, + "step": 43157 + }, + { + "epoch": 3.4962734931950745, + "grad_norm": 0.08080387115478516, + "learning_rate": 2.798505783338584e-05, + "loss": 0.2145, + "step": 43158 + }, + { + "epoch": 3.4963545042125728, + "grad_norm": 0.056177180260419846, + "learning_rate": 2.7980557180791217e-05, + "loss": 0.1757, + "step": 43159 + }, + { + "epoch": 3.4964355152300715, + "grad_norm": 0.07218663394451141, + "learning_rate": 2.7976056528196594e-05, + "loss": 0.2102, + "step": 43160 + }, + { + "epoch": 3.4965165262475697, + "grad_norm": 0.061893776059150696, + "learning_rate": 2.797155587560196e-05, + "loss": 0.2437, + "step": 43161 + }, + { + "epoch": 3.496597537265068, + "grad_norm": 0.0635591372847557, + "learning_rate": 2.7967055223007338e-05, + "loss": 0.2191, + "step": 43162 + }, + { + "epoch": 3.496678548282566, + "grad_norm": 0.07298033684492111, + "learning_rate": 2.7962554570412715e-05, + "loss": 0.2531, + "step": 43163 + }, + { + "epoch": 3.496759559300065, + "grad_norm": 0.07126526534557343, + "learning_rate": 2.7958053917818082e-05, + "loss": 0.2309, + "step": 43164 + }, + { + "epoch": 3.496840570317563, + "grad_norm": 0.06935898214578629, + "learning_rate": 2.795355326522346e-05, + "loss": 0.2039, + "step": 43165 + }, + { + "epoch": 3.4969215813350614, + "grad_norm": 0.07336989790201187, + "learning_rate": 2.7949052612628836e-05, + "loss": 0.2602, + "step": 43166 + }, + { + "epoch": 3.49700259235256, + "grad_norm": 0.07263325154781342, + "learning_rate": 2.7944551960034203e-05, + "loss": 0.2771, + "step": 43167 + }, + { + "epoch": 3.4970836033700583, + "grad_norm": 0.0619543083012104, + "learning_rate": 2.794005130743958e-05, + "loss": 0.2135, + "step": 43168 + }, + { + "epoch": 3.4971646143875565, + "grad_norm": 0.07282905280590057, + "learning_rate": 2.7935550654844957e-05, + "loss": 0.213, + "step": 43169 + }, + { + "epoch": 3.4972456254050552, + "grad_norm": 0.0697537288069725, + "learning_rate": 2.7931050002250327e-05, + "loss": 0.2358, + "step": 43170 + }, + { + "epoch": 3.4973266364225535, + 
"grad_norm": 0.06539995968341827, + "learning_rate": 2.79265493496557e-05, + "loss": 0.2095, + "step": 43171 + }, + { + "epoch": 3.4974076474400517, + "grad_norm": 0.07993856072425842, + "learning_rate": 2.7922048697061077e-05, + "loss": 0.222, + "step": 43172 + }, + { + "epoch": 3.4974886584575504, + "grad_norm": 0.06634227186441422, + "learning_rate": 2.791754804446645e-05, + "loss": 0.2141, + "step": 43173 + }, + { + "epoch": 3.4975696694750487, + "grad_norm": 0.07012040168046951, + "learning_rate": 2.791304739187182e-05, + "loss": 0.2357, + "step": 43174 + }, + { + "epoch": 3.497650680492547, + "grad_norm": 0.08014750480651855, + "learning_rate": 2.7908546739277198e-05, + "loss": 0.206, + "step": 43175 + }, + { + "epoch": 3.4977316915100456, + "grad_norm": 0.0736326202750206, + "learning_rate": 2.7904046086682572e-05, + "loss": 0.245, + "step": 43176 + }, + { + "epoch": 3.497812702527544, + "grad_norm": 0.08184340596199036, + "learning_rate": 2.7899545434087942e-05, + "loss": 0.2249, + "step": 43177 + }, + { + "epoch": 3.497893713545042, + "grad_norm": 0.09063094854354858, + "learning_rate": 2.789504478149332e-05, + "loss": 0.2004, + "step": 43178 + }, + { + "epoch": 3.4979747245625403, + "grad_norm": 0.07432245463132858, + "learning_rate": 2.7890544128898693e-05, + "loss": 0.2286, + "step": 43179 + }, + { + "epoch": 3.498055735580039, + "grad_norm": 0.05675497651100159, + "learning_rate": 2.7886043476304063e-05, + "loss": 0.1832, + "step": 43180 + }, + { + "epoch": 3.4981367465975373, + "grad_norm": 0.07396359741687775, + "learning_rate": 2.788154282370944e-05, + "loss": 0.2235, + "step": 43181 + }, + { + "epoch": 3.4982177576150355, + "grad_norm": 0.0632675364613533, + "learning_rate": 2.7877042171114813e-05, + "loss": 0.2125, + "step": 43182 + }, + { + "epoch": 3.498298768632534, + "grad_norm": 0.0689416453242302, + "learning_rate": 2.7872541518520184e-05, + "loss": 0.2066, + "step": 43183 + }, + { + "epoch": 3.4983797796500324, + "grad_norm": 0.06183299794793129, + "learning_rate": 2.786804086592556e-05, + "loss": 0.2191, + "step": 43184 + }, + { + "epoch": 3.4984607906675307, + "grad_norm": 0.07677946984767914, + "learning_rate": 2.7863540213330934e-05, + "loss": 0.2463, + "step": 43185 + }, + { + "epoch": 3.498541801685029, + "grad_norm": 0.06671515852212906, + "learning_rate": 2.785903956073631e-05, + "loss": 0.2442, + "step": 43186 + }, + { + "epoch": 3.4986228127025276, + "grad_norm": 0.06701928377151489, + "learning_rate": 2.785453890814168e-05, + "loss": 0.2063, + "step": 43187 + }, + { + "epoch": 3.498703823720026, + "grad_norm": 0.07234100997447968, + "learning_rate": 2.7850038255547055e-05, + "loss": 0.2092, + "step": 43188 + }, + { + "epoch": 3.498784834737524, + "grad_norm": 0.07436953485012054, + "learning_rate": 2.7845537602952432e-05, + "loss": 0.2384, + "step": 43189 + }, + { + "epoch": 3.498865845755023, + "grad_norm": 0.07147400081157684, + "learning_rate": 2.7841036950357802e-05, + "loss": 0.2286, + "step": 43190 + }, + { + "epoch": 3.498946856772521, + "grad_norm": 0.0703418031334877, + "learning_rate": 2.7836536297763176e-05, + "loss": 0.2395, + "step": 43191 + }, + { + "epoch": 3.4990278677900193, + "grad_norm": 0.08759965747594833, + "learning_rate": 2.7832035645168553e-05, + "loss": 0.219, + "step": 43192 + }, + { + "epoch": 3.499108878807518, + "grad_norm": 0.08855955302715302, + "learning_rate": 2.7827534992573923e-05, + "loss": 0.2187, + "step": 43193 + }, + { + "epoch": 3.499189889825016, + "grad_norm": 0.07268314808607101, + "learning_rate": 
2.7823034339979297e-05, + "loss": 0.2551, + "step": 43194 + }, + { + "epoch": 3.4992709008425145, + "grad_norm": 0.07844178378582001, + "learning_rate": 2.7818533687384674e-05, + "loss": 0.2519, + "step": 43195 + }, + { + "epoch": 3.499351911860013, + "grad_norm": 0.0641513243317604, + "learning_rate": 2.7814033034790044e-05, + "loss": 0.2273, + "step": 43196 + }, + { + "epoch": 3.4994329228775114, + "grad_norm": 0.06389816105365753, + "learning_rate": 2.7809532382195418e-05, + "loss": 0.2312, + "step": 43197 + }, + { + "epoch": 3.4995139338950096, + "grad_norm": 0.09214530140161514, + "learning_rate": 2.7805031729600795e-05, + "loss": 0.2504, + "step": 43198 + }, + { + "epoch": 3.4995949449125083, + "grad_norm": 0.08582887053489685, + "learning_rate": 2.780053107700617e-05, + "loss": 0.2388, + "step": 43199 + }, + { + "epoch": 3.4996759559300066, + "grad_norm": 0.07315149903297424, + "learning_rate": 2.779603042441154e-05, + "loss": 0.2116, + "step": 43200 + }, + { + "epoch": 3.499756966947505, + "grad_norm": 0.07323189824819565, + "learning_rate": 2.7791529771816915e-05, + "loss": 0.2109, + "step": 43201 + }, + { + "epoch": 3.499837977965003, + "grad_norm": 0.07631869614124298, + "learning_rate": 2.7787029119222292e-05, + "loss": 0.2315, + "step": 43202 + }, + { + "epoch": 3.4999189889825018, + "grad_norm": 0.06518463045358658, + "learning_rate": 2.7782528466627663e-05, + "loss": 0.2602, + "step": 43203 + }, + { + "epoch": 3.5, + "grad_norm": 0.06721894443035126, + "learning_rate": 2.7778027814033036e-05, + "loss": 0.2188, + "step": 43204 + }, + { + "epoch": 3.5000810110174982, + "grad_norm": 0.07444871962070465, + "learning_rate": 2.7773527161438413e-05, + "loss": 0.2278, + "step": 43205 + }, + { + "epoch": 3.5001620220349965, + "grad_norm": 0.07968197762966156, + "learning_rate": 2.7769026508843783e-05, + "loss": 0.2442, + "step": 43206 + }, + { + "epoch": 3.500243033052495, + "grad_norm": 0.07547379285097122, + "learning_rate": 2.7764525856249157e-05, + "loss": 0.221, + "step": 43207 + }, + { + "epoch": 3.5003240440699934, + "grad_norm": 0.0792350247502327, + "learning_rate": 2.7760025203654534e-05, + "loss": 0.2134, + "step": 43208 + }, + { + "epoch": 3.5004050550874917, + "grad_norm": 0.054382745176553726, + "learning_rate": 2.7755524551059904e-05, + "loss": 0.2152, + "step": 43209 + }, + { + "epoch": 3.5004860661049904, + "grad_norm": 0.07119036465883255, + "learning_rate": 2.7751023898465278e-05, + "loss": 0.2673, + "step": 43210 + }, + { + "epoch": 3.5005670771224886, + "grad_norm": 0.07480863481760025, + "learning_rate": 2.7746523245870655e-05, + "loss": 0.2604, + "step": 43211 + }, + { + "epoch": 3.500648088139987, + "grad_norm": 0.08311598002910614, + "learning_rate": 2.7742022593276025e-05, + "loss": 0.2295, + "step": 43212 + }, + { + "epoch": 3.5007290991574855, + "grad_norm": 0.07891567796468735, + "learning_rate": 2.77375219406814e-05, + "loss": 0.2204, + "step": 43213 + }, + { + "epoch": 3.500810110174984, + "grad_norm": 0.07064730674028397, + "learning_rate": 2.7733021288086776e-05, + "loss": 0.2329, + "step": 43214 + }, + { + "epoch": 3.500891121192482, + "grad_norm": 0.05172451213002205, + "learning_rate": 2.772852063549215e-05, + "loss": 0.2341, + "step": 43215 + }, + { + "epoch": 3.5009721322099807, + "grad_norm": 0.07222268730401993, + "learning_rate": 2.772401998289752e-05, + "loss": 0.2346, + "step": 43216 + }, + { + "epoch": 3.501053143227479, + "grad_norm": 0.08112135529518127, + "learning_rate": 2.7719519330302896e-05, + "loss": 0.2293, + "step": 43217 + }, + { 
+ "epoch": 3.501134154244977, + "grad_norm": 0.06870246678590775, + "learning_rate": 2.771501867770827e-05, + "loss": 0.2145, + "step": 43218 + }, + { + "epoch": 3.501215165262476, + "grad_norm": 0.08160283416509628, + "learning_rate": 2.771051802511364e-05, + "loss": 0.2234, + "step": 43219 + }, + { + "epoch": 3.501296176279974, + "grad_norm": 0.07019760459661484, + "learning_rate": 2.7706017372519017e-05, + "loss": 0.2204, + "step": 43220 + }, + { + "epoch": 3.5013771872974724, + "grad_norm": 0.07481992244720459, + "learning_rate": 2.770151671992439e-05, + "loss": 0.2439, + "step": 43221 + }, + { + "epoch": 3.501458198314971, + "grad_norm": 0.07392154633998871, + "learning_rate": 2.769701606732976e-05, + "loss": 0.2486, + "step": 43222 + }, + { + "epoch": 3.5015392093324693, + "grad_norm": 0.08620406687259674, + "learning_rate": 2.7692515414735138e-05, + "loss": 0.2821, + "step": 43223 + }, + { + "epoch": 3.5016202203499676, + "grad_norm": 0.05955655127763748, + "learning_rate": 2.768801476214051e-05, + "loss": 0.2428, + "step": 43224 + }, + { + "epoch": 3.5017012313674662, + "grad_norm": 0.09394501894712448, + "learning_rate": 2.7683514109545882e-05, + "loss": 0.266, + "step": 43225 + }, + { + "epoch": 3.5017822423849645, + "grad_norm": 0.06005888059735298, + "learning_rate": 2.767901345695126e-05, + "loss": 0.2169, + "step": 43226 + }, + { + "epoch": 3.5018632534024627, + "grad_norm": 0.07512757182121277, + "learning_rate": 2.7674512804356632e-05, + "loss": 0.2282, + "step": 43227 + }, + { + "epoch": 3.501944264419961, + "grad_norm": 0.06217007339000702, + "learning_rate": 2.767001215176201e-05, + "loss": 0.2056, + "step": 43228 + }, + { + "epoch": 3.5020252754374592, + "grad_norm": 0.07704039663076401, + "learning_rate": 2.766551149916738e-05, + "loss": 0.229, + "step": 43229 + }, + { + "epoch": 3.502106286454958, + "grad_norm": 0.09659653156995773, + "learning_rate": 2.7661010846572753e-05, + "loss": 0.2121, + "step": 43230 + }, + { + "epoch": 3.502187297472456, + "grad_norm": 0.07554148137569427, + "learning_rate": 2.765651019397813e-05, + "loss": 0.2298, + "step": 43231 + }, + { + "epoch": 3.5022683084899544, + "grad_norm": 0.08793529123067856, + "learning_rate": 2.76520095413835e-05, + "loss": 0.2959, + "step": 43232 + }, + { + "epoch": 3.502349319507453, + "grad_norm": 0.07671716064214706, + "learning_rate": 2.7647508888788874e-05, + "loss": 0.2248, + "step": 43233 + }, + { + "epoch": 3.5024303305249513, + "grad_norm": 0.07639263570308685, + "learning_rate": 2.764300823619425e-05, + "loss": 0.2581, + "step": 43234 + }, + { + "epoch": 3.5025113415424496, + "grad_norm": 0.06445121765136719, + "learning_rate": 2.763850758359962e-05, + "loss": 0.172, + "step": 43235 + }, + { + "epoch": 3.5025923525599483, + "grad_norm": 0.059592366218566895, + "learning_rate": 2.7634006931004995e-05, + "loss": 0.2377, + "step": 43236 + }, + { + "epoch": 3.5026733635774465, + "grad_norm": 0.06346636265516281, + "learning_rate": 2.7629506278410372e-05, + "loss": 0.2086, + "step": 43237 + }, + { + "epoch": 3.5027543745949448, + "grad_norm": 0.07458571344614029, + "learning_rate": 2.7625005625815742e-05, + "loss": 0.2029, + "step": 43238 + }, + { + "epoch": 3.5028353856124435, + "grad_norm": 0.08330317586660385, + "learning_rate": 2.762050497322112e-05, + "loss": 0.2488, + "step": 43239 + }, + { + "epoch": 3.5029163966299417, + "grad_norm": 0.06813330948352814, + "learning_rate": 2.7616004320626493e-05, + "loss": 0.2231, + "step": 43240 + }, + { + "epoch": 3.50299740764744, + "grad_norm": 
0.05927650257945061, + "learning_rate": 2.761150366803187e-05, + "loss": 0.2718, + "step": 43241 + }, + { + "epoch": 3.5030784186649386, + "grad_norm": 0.07224024087190628, + "learning_rate": 2.760700301543724e-05, + "loss": 0.2205, + "step": 43242 + }, + { + "epoch": 3.503159429682437, + "grad_norm": 0.08098310232162476, + "learning_rate": 2.7602502362842613e-05, + "loss": 0.2277, + "step": 43243 + }, + { + "epoch": 3.503240440699935, + "grad_norm": 0.07933811098337173, + "learning_rate": 2.759800171024799e-05, + "loss": 0.2331, + "step": 43244 + }, + { + "epoch": 3.503321451717434, + "grad_norm": 0.07267658412456512, + "learning_rate": 2.759350105765336e-05, + "loss": 0.2026, + "step": 43245 + }, + { + "epoch": 3.503402462734932, + "grad_norm": 0.06561614573001862, + "learning_rate": 2.7589000405058734e-05, + "loss": 0.2246, + "step": 43246 + }, + { + "epoch": 3.5034834737524303, + "grad_norm": 0.07257362455129623, + "learning_rate": 2.758449975246411e-05, + "loss": 0.211, + "step": 43247 + }, + { + "epoch": 3.5035644847699285, + "grad_norm": 0.08041809499263763, + "learning_rate": 2.757999909986948e-05, + "loss": 0.2334, + "step": 43248 + }, + { + "epoch": 3.5036454957874272, + "grad_norm": 0.06276113539934158, + "learning_rate": 2.7575498447274855e-05, + "loss": 0.2153, + "step": 43249 + }, + { + "epoch": 3.5037265068049255, + "grad_norm": 0.07225324958562851, + "learning_rate": 2.7570997794680232e-05, + "loss": 0.2183, + "step": 43250 + }, + { + "epoch": 3.5038075178224237, + "grad_norm": 0.06255420297384262, + "learning_rate": 2.7566497142085602e-05, + "loss": 0.1954, + "step": 43251 + }, + { + "epoch": 3.503888528839922, + "grad_norm": 0.07800949364900589, + "learning_rate": 2.7561996489490976e-05, + "loss": 0.2321, + "step": 43252 + }, + { + "epoch": 3.5039695398574207, + "grad_norm": 0.06854311376810074, + "learning_rate": 2.7557495836896353e-05, + "loss": 0.2122, + "step": 43253 + }, + { + "epoch": 3.504050550874919, + "grad_norm": 0.07762733846902847, + "learning_rate": 2.7552995184301726e-05, + "loss": 0.2467, + "step": 43254 + }, + { + "epoch": 3.504131561892417, + "grad_norm": 0.07714968174695969, + "learning_rate": 2.7548494531707097e-05, + "loss": 0.2438, + "step": 43255 + }, + { + "epoch": 3.504212572909916, + "grad_norm": 0.09110550582408905, + "learning_rate": 2.7543993879112474e-05, + "loss": 0.2134, + "step": 43256 + }, + { + "epoch": 3.504293583927414, + "grad_norm": 0.07318945974111557, + "learning_rate": 2.7539493226517847e-05, + "loss": 0.2475, + "step": 43257 + }, + { + "epoch": 3.5043745949449123, + "grad_norm": 0.06953822076320648, + "learning_rate": 2.7534992573923217e-05, + "loss": 0.2478, + "step": 43258 + }, + { + "epoch": 3.504455605962411, + "grad_norm": 0.0579572394490242, + "learning_rate": 2.7530491921328594e-05, + "loss": 0.234, + "step": 43259 + }, + { + "epoch": 3.5045366169799093, + "grad_norm": 0.06732886284589767, + "learning_rate": 2.7525991268733968e-05, + "loss": 0.1999, + "step": 43260 + }, + { + "epoch": 3.5046176279974075, + "grad_norm": 0.0854315236210823, + "learning_rate": 2.7521490616139338e-05, + "loss": 0.1762, + "step": 43261 + }, + { + "epoch": 3.504698639014906, + "grad_norm": 0.0729583278298378, + "learning_rate": 2.7516989963544715e-05, + "loss": 0.2347, + "step": 43262 + }, + { + "epoch": 3.5047796500324044, + "grad_norm": 0.08315985649824142, + "learning_rate": 2.751248931095009e-05, + "loss": 0.2299, + "step": 43263 + }, + { + "epoch": 3.5048606610499027, + "grad_norm": 0.07033509016036987, + "learning_rate": 
2.750798865835546e-05, + "loss": 0.2541, + "step": 43264 + }, + { + "epoch": 3.5049416720674014, + "grad_norm": 0.09127448499202728, + "learning_rate": 2.7503488005760836e-05, + "loss": 0.2135, + "step": 43265 + }, + { + "epoch": 3.5050226830848996, + "grad_norm": 0.07316549867391586, + "learning_rate": 2.749898735316621e-05, + "loss": 0.2027, + "step": 43266 + }, + { + "epoch": 3.505103694102398, + "grad_norm": 0.07362278550863266, + "learning_rate": 2.7494486700571587e-05, + "loss": 0.2349, + "step": 43267 + }, + { + "epoch": 3.5051847051198965, + "grad_norm": 0.07606764882802963, + "learning_rate": 2.7489986047976957e-05, + "loss": 0.2129, + "step": 43268 + }, + { + "epoch": 3.505265716137395, + "grad_norm": 0.06892552971839905, + "learning_rate": 2.748548539538233e-05, + "loss": 0.2179, + "step": 43269 + }, + { + "epoch": 3.505346727154893, + "grad_norm": 0.07052438706159592, + "learning_rate": 2.7480984742787708e-05, + "loss": 0.2269, + "step": 43270 + }, + { + "epoch": 3.5054277381723913, + "grad_norm": 0.06831594556570053, + "learning_rate": 2.7476484090193078e-05, + "loss": 0.22, + "step": 43271 + }, + { + "epoch": 3.50550874918989, + "grad_norm": 0.06844569742679596, + "learning_rate": 2.7471983437598455e-05, + "loss": 0.2113, + "step": 43272 + }, + { + "epoch": 3.505589760207388, + "grad_norm": 0.06712247431278229, + "learning_rate": 2.746748278500383e-05, + "loss": 0.255, + "step": 43273 + }, + { + "epoch": 3.5056707712248865, + "grad_norm": 0.06350905448198318, + "learning_rate": 2.74629821324092e-05, + "loss": 0.2123, + "step": 43274 + }, + { + "epoch": 3.5057517822423847, + "grad_norm": 0.06915382295846939, + "learning_rate": 2.7458481479814576e-05, + "loss": 0.246, + "step": 43275 + }, + { + "epoch": 3.5058327932598834, + "grad_norm": 0.061453476548194885, + "learning_rate": 2.745398082721995e-05, + "loss": 0.2213, + "step": 43276 + }, + { + "epoch": 3.5059138042773816, + "grad_norm": 0.07296491414308548, + "learning_rate": 2.744948017462532e-05, + "loss": 0.2745, + "step": 43277 + }, + { + "epoch": 3.50599481529488, + "grad_norm": 0.066260926425457, + "learning_rate": 2.7444979522030696e-05, + "loss": 0.224, + "step": 43278 + }, + { + "epoch": 3.5060758263123786, + "grad_norm": 0.0701618492603302, + "learning_rate": 2.744047886943607e-05, + "loss": 0.2119, + "step": 43279 + }, + { + "epoch": 3.506156837329877, + "grad_norm": 0.07789304852485657, + "learning_rate": 2.7435978216841447e-05, + "loss": 0.2391, + "step": 43280 + }, + { + "epoch": 3.506237848347375, + "grad_norm": 0.07035718113183975, + "learning_rate": 2.7431477564246817e-05, + "loss": 0.2296, + "step": 43281 + }, + { + "epoch": 3.5063188593648738, + "grad_norm": 0.08160745352506638, + "learning_rate": 2.742697691165219e-05, + "loss": 0.295, + "step": 43282 + }, + { + "epoch": 3.506399870382372, + "grad_norm": 0.07453840225934982, + "learning_rate": 2.7422476259057568e-05, + "loss": 0.2016, + "step": 43283 + }, + { + "epoch": 3.5064808813998702, + "grad_norm": 0.07481218129396439, + "learning_rate": 2.7417975606462938e-05, + "loss": 0.1983, + "step": 43284 + }, + { + "epoch": 3.506561892417369, + "grad_norm": 0.07593325525522232, + "learning_rate": 2.741347495386831e-05, + "loss": 0.1987, + "step": 43285 + }, + { + "epoch": 3.506642903434867, + "grad_norm": 0.07317264378070831, + "learning_rate": 2.740897430127369e-05, + "loss": 0.2239, + "step": 43286 + }, + { + "epoch": 3.5067239144523654, + "grad_norm": 0.07845775038003922, + "learning_rate": 2.740447364867906e-05, + "loss": 0.2323, + "step": 43287 + }, + { + 
"epoch": 3.506804925469864, + "grad_norm": 0.06483782827854156, + "learning_rate": 2.7399972996084432e-05, + "loss": 0.2071, + "step": 43288 + }, + { + "epoch": 3.5068859364873624, + "grad_norm": 0.06465201079845428, + "learning_rate": 2.739547234348981e-05, + "loss": 0.2113, + "step": 43289 + }, + { + "epoch": 3.5069669475048606, + "grad_norm": 0.07280424237251282, + "learning_rate": 2.739097169089518e-05, + "loss": 0.2113, + "step": 43290 + }, + { + "epoch": 3.5070479585223593, + "grad_norm": 0.07811792194843292, + "learning_rate": 2.7386471038300553e-05, + "loss": 0.2373, + "step": 43291 + }, + { + "epoch": 3.5071289695398575, + "grad_norm": 0.07018356025218964, + "learning_rate": 2.738197038570593e-05, + "loss": 0.213, + "step": 43292 + }, + { + "epoch": 3.5072099805573558, + "grad_norm": 0.06678864359855652, + "learning_rate": 2.7377469733111304e-05, + "loss": 0.1966, + "step": 43293 + }, + { + "epoch": 3.507290991574854, + "grad_norm": 0.07585125416517258, + "learning_rate": 2.7372969080516674e-05, + "loss": 0.1918, + "step": 43294 + }, + { + "epoch": 3.5073720025923527, + "grad_norm": 0.07252201437950134, + "learning_rate": 2.736846842792205e-05, + "loss": 0.209, + "step": 43295 + }, + { + "epoch": 3.507453013609851, + "grad_norm": 0.07959860563278198, + "learning_rate": 2.7363967775327425e-05, + "loss": 0.1957, + "step": 43296 + }, + { + "epoch": 3.507534024627349, + "grad_norm": 0.0687057226896286, + "learning_rate": 2.7359467122732795e-05, + "loss": 0.2158, + "step": 43297 + }, + { + "epoch": 3.5076150356448474, + "grad_norm": 0.08582094311714172, + "learning_rate": 2.7354966470138172e-05, + "loss": 0.2366, + "step": 43298 + }, + { + "epoch": 3.507696046662346, + "grad_norm": 0.06664074212312698, + "learning_rate": 2.7350465817543545e-05, + "loss": 0.2462, + "step": 43299 + }, + { + "epoch": 3.5077770576798444, + "grad_norm": 0.07581663131713867, + "learning_rate": 2.7345965164948916e-05, + "loss": 0.2236, + "step": 43300 + }, + { + "epoch": 3.5078580686973426, + "grad_norm": 0.07287465035915375, + "learning_rate": 2.7341464512354293e-05, + "loss": 0.2285, + "step": 43301 + }, + { + "epoch": 3.5079390797148413, + "grad_norm": 0.09149542450904846, + "learning_rate": 2.7336963859759666e-05, + "loss": 0.223, + "step": 43302 + }, + { + "epoch": 3.5080200907323396, + "grad_norm": 0.06924710422754288, + "learning_rate": 2.7332463207165036e-05, + "loss": 0.2115, + "step": 43303 + }, + { + "epoch": 3.508101101749838, + "grad_norm": 0.05233272537589073, + "learning_rate": 2.7327962554570413e-05, + "loss": 0.184, + "step": 43304 + }, + { + "epoch": 3.5081821127673365, + "grad_norm": 0.07800295948982239, + "learning_rate": 2.732346190197579e-05, + "loss": 0.2365, + "step": 43305 + }, + { + "epoch": 3.5082631237848347, + "grad_norm": 0.06714732199907303, + "learning_rate": 2.7318961249381164e-05, + "loss": 0.2179, + "step": 43306 + }, + { + "epoch": 3.508344134802333, + "grad_norm": 0.08950652927160263, + "learning_rate": 2.7314460596786534e-05, + "loss": 0.2463, + "step": 43307 + }, + { + "epoch": 3.5084251458198317, + "grad_norm": 0.06976626068353653, + "learning_rate": 2.730995994419191e-05, + "loss": 0.2343, + "step": 43308 + }, + { + "epoch": 3.50850615683733, + "grad_norm": 0.06422611325979233, + "learning_rate": 2.7305459291597285e-05, + "loss": 0.1899, + "step": 43309 + }, + { + "epoch": 3.508587167854828, + "grad_norm": 0.06161141023039818, + "learning_rate": 2.7300958639002655e-05, + "loss": 0.235, + "step": 43310 + }, + { + "epoch": 3.508668178872327, + "grad_norm": 
0.06769879907369614, + "learning_rate": 2.7296457986408032e-05, + "loss": 0.2243, + "step": 43311 + }, + { + "epoch": 3.508749189889825, + "grad_norm": 0.08198713511228561, + "learning_rate": 2.7291957333813406e-05, + "loss": 0.2227, + "step": 43312 + }, + { + "epoch": 3.5088302009073233, + "grad_norm": 0.06843384355306625, + "learning_rate": 2.7287456681218776e-05, + "loss": 0.226, + "step": 43313 + }, + { + "epoch": 3.508911211924822, + "grad_norm": 0.07348859310150146, + "learning_rate": 2.7282956028624153e-05, + "loss": 0.2449, + "step": 43314 + }, + { + "epoch": 3.5089922229423203, + "grad_norm": 0.06632348895072937, + "learning_rate": 2.7278455376029526e-05, + "loss": 0.2497, + "step": 43315 + }, + { + "epoch": 3.5090732339598185, + "grad_norm": 0.0708722472190857, + "learning_rate": 2.7273954723434897e-05, + "loss": 0.2472, + "step": 43316 + }, + { + "epoch": 3.5091542449773168, + "grad_norm": 0.05557356774806976, + "learning_rate": 2.7269454070840274e-05, + "loss": 0.1889, + "step": 43317 + }, + { + "epoch": 3.5092352559948155, + "grad_norm": 0.07586564868688583, + "learning_rate": 2.7264953418245647e-05, + "loss": 0.2314, + "step": 43318 + }, + { + "epoch": 3.5093162670123137, + "grad_norm": 0.0712403655052185, + "learning_rate": 2.7260452765651024e-05, + "loss": 0.2492, + "step": 43319 + }, + { + "epoch": 3.509397278029812, + "grad_norm": 0.06740592420101166, + "learning_rate": 2.7255952113056394e-05, + "loss": 0.1801, + "step": 43320 + }, + { + "epoch": 3.50947828904731, + "grad_norm": 0.07101822644472122, + "learning_rate": 2.7251451460461768e-05, + "loss": 0.214, + "step": 43321 + }, + { + "epoch": 3.509559300064809, + "grad_norm": 0.07239147275686264, + "learning_rate": 2.7246950807867145e-05, + "loss": 0.2501, + "step": 43322 + }, + { + "epoch": 3.509640311082307, + "grad_norm": 0.08746582269668579, + "learning_rate": 2.7242450155272515e-05, + "loss": 0.2814, + "step": 43323 + }, + { + "epoch": 3.5097213220998054, + "grad_norm": 0.06535092741250992, + "learning_rate": 2.723794950267789e-05, + "loss": 0.1951, + "step": 43324 + }, + { + "epoch": 3.509802333117304, + "grad_norm": 0.07915400713682175, + "learning_rate": 2.7233448850083266e-05, + "loss": 0.2454, + "step": 43325 + }, + { + "epoch": 3.5098833441348023, + "grad_norm": 0.07169001549482346, + "learning_rate": 2.7228948197488636e-05, + "loss": 0.2505, + "step": 43326 + }, + { + "epoch": 3.5099643551523005, + "grad_norm": 0.06705567240715027, + "learning_rate": 2.722444754489401e-05, + "loss": 0.2369, + "step": 43327 + }, + { + "epoch": 3.5100453661697992, + "grad_norm": 0.07073170691728592, + "learning_rate": 2.7219946892299387e-05, + "loss": 0.2528, + "step": 43328 + }, + { + "epoch": 3.5101263771872975, + "grad_norm": 0.0736280083656311, + "learning_rate": 2.7215446239704757e-05, + "loss": 0.2452, + "step": 43329 + }, + { + "epoch": 3.5102073882047957, + "grad_norm": 0.06382174044847488, + "learning_rate": 2.721094558711013e-05, + "loss": 0.2075, + "step": 43330 + }, + { + "epoch": 3.5102883992222944, + "grad_norm": 0.0716504231095314, + "learning_rate": 2.7206444934515507e-05, + "loss": 0.2617, + "step": 43331 + }, + { + "epoch": 3.5103694102397927, + "grad_norm": 0.06723946332931519, + "learning_rate": 2.720194428192088e-05, + "loss": 0.225, + "step": 43332 + }, + { + "epoch": 3.510450421257291, + "grad_norm": 0.0614875927567482, + "learning_rate": 2.719744362932625e-05, + "loss": 0.2012, + "step": 43333 + }, + { + "epoch": 3.5105314322747896, + "grad_norm": 0.053596943616867065, + "learning_rate": 
2.7192942976731628e-05, + "loss": 0.2283, + "step": 43334 + }, + { + "epoch": 3.510612443292288, + "grad_norm": 0.08914054930210114, + "learning_rate": 2.7188442324137002e-05, + "loss": 0.2425, + "step": 43335 + }, + { + "epoch": 3.510693454309786, + "grad_norm": 0.08104723691940308, + "learning_rate": 2.7183941671542372e-05, + "loss": 0.2473, + "step": 43336 + }, + { + "epoch": 3.5107744653272848, + "grad_norm": 0.06316747516393661, + "learning_rate": 2.717944101894775e-05, + "loss": 0.2088, + "step": 43337 + }, + { + "epoch": 3.510855476344783, + "grad_norm": 0.07501699030399323, + "learning_rate": 2.7174940366353123e-05, + "loss": 0.2572, + "step": 43338 + }, + { + "epoch": 3.5109364873622813, + "grad_norm": 0.06786436587572098, + "learning_rate": 2.7170439713758493e-05, + "loss": 0.2194, + "step": 43339 + }, + { + "epoch": 3.5110174983797795, + "grad_norm": 0.06833315640687943, + "learning_rate": 2.716593906116387e-05, + "loss": 0.2295, + "step": 43340 + }, + { + "epoch": 3.511098509397278, + "grad_norm": 0.08341936022043228, + "learning_rate": 2.7161438408569247e-05, + "loss": 0.2208, + "step": 43341 + }, + { + "epoch": 3.5111795204147764, + "grad_norm": 0.06512777507305145, + "learning_rate": 2.7156937755974614e-05, + "loss": 0.2167, + "step": 43342 + }, + { + "epoch": 3.5112605314322747, + "grad_norm": 0.06528540700674057, + "learning_rate": 2.715243710337999e-05, + "loss": 0.2188, + "step": 43343 + }, + { + "epoch": 3.511341542449773, + "grad_norm": 0.084648996591568, + "learning_rate": 2.7147936450785368e-05, + "loss": 0.2484, + "step": 43344 + }, + { + "epoch": 3.5114225534672716, + "grad_norm": 0.06107432767748833, + "learning_rate": 2.714343579819074e-05, + "loss": 0.2201, + "step": 43345 + }, + { + "epoch": 3.51150356448477, + "grad_norm": 0.0834152102470398, + "learning_rate": 2.713893514559611e-05, + "loss": 0.2239, + "step": 43346 + }, + { + "epoch": 3.511584575502268, + "grad_norm": 0.07597782462835312, + "learning_rate": 2.713443449300149e-05, + "loss": 0.2435, + "step": 43347 + }, + { + "epoch": 3.511665586519767, + "grad_norm": 0.07404717057943344, + "learning_rate": 2.7129933840406862e-05, + "loss": 0.236, + "step": 43348 + }, + { + "epoch": 3.511746597537265, + "grad_norm": 0.06116487458348274, + "learning_rate": 2.7125433187812232e-05, + "loss": 0.2352, + "step": 43349 + }, + { + "epoch": 3.5118276085547633, + "grad_norm": 0.069146066904068, + "learning_rate": 2.712093253521761e-05, + "loss": 0.2009, + "step": 43350 + }, + { + "epoch": 3.511908619572262, + "grad_norm": 0.07820738852024078, + "learning_rate": 2.7116431882622983e-05, + "loss": 0.2373, + "step": 43351 + }, + { + "epoch": 3.51198963058976, + "grad_norm": 0.0879845917224884, + "learning_rate": 2.7111931230028353e-05, + "loss": 0.2038, + "step": 43352 + }, + { + "epoch": 3.5120706416072585, + "grad_norm": 0.0692344680428505, + "learning_rate": 2.710743057743373e-05, + "loss": 0.2639, + "step": 43353 + }, + { + "epoch": 3.512151652624757, + "grad_norm": 0.08497405797243118, + "learning_rate": 2.7102929924839104e-05, + "loss": 0.2285, + "step": 43354 + }, + { + "epoch": 3.5122326636422554, + "grad_norm": 0.07552628964185715, + "learning_rate": 2.7098429272244474e-05, + "loss": 0.2117, + "step": 43355 + }, + { + "epoch": 3.5123136746597536, + "grad_norm": 0.07671625167131424, + "learning_rate": 2.709392861964985e-05, + "loss": 0.2403, + "step": 43356 + }, + { + "epoch": 3.5123946856772523, + "grad_norm": 0.05206482112407684, + "learning_rate": 2.7089427967055225e-05, + "loss": 0.2331, + "step": 43357 + }, + { 
+ "epoch": 3.5124756966947506, + "grad_norm": 0.05869055166840553, + "learning_rate": 2.70849273144606e-05, + "loss": 0.2128, + "step": 43358 + }, + { + "epoch": 3.512556707712249, + "grad_norm": 0.07022881507873535, + "learning_rate": 2.7080426661865972e-05, + "loss": 0.2038, + "step": 43359 + }, + { + "epoch": 3.5126377187297475, + "grad_norm": 0.09413991868495941, + "learning_rate": 2.7075926009271345e-05, + "loss": 0.2002, + "step": 43360 + }, + { + "epoch": 3.5127187297472457, + "grad_norm": 0.073529914021492, + "learning_rate": 2.7071425356676722e-05, + "loss": 0.2242, + "step": 43361 + }, + { + "epoch": 3.512799740764744, + "grad_norm": 0.07193224877119064, + "learning_rate": 2.7066924704082093e-05, + "loss": 0.2196, + "step": 43362 + }, + { + "epoch": 3.5128807517822422, + "grad_norm": 0.07748138159513474, + "learning_rate": 2.7062424051487466e-05, + "loss": 0.1935, + "step": 43363 + }, + { + "epoch": 3.512961762799741, + "grad_norm": 0.09035055339336395, + "learning_rate": 2.7057923398892843e-05, + "loss": 0.244, + "step": 43364 + }, + { + "epoch": 3.513042773817239, + "grad_norm": 0.09117685258388519, + "learning_rate": 2.7053422746298213e-05, + "loss": 0.2753, + "step": 43365 + }, + { + "epoch": 3.5131237848347374, + "grad_norm": 0.06288989633321762, + "learning_rate": 2.7048922093703587e-05, + "loss": 0.2242, + "step": 43366 + }, + { + "epoch": 3.5132047958522357, + "grad_norm": 0.05563490465283394, + "learning_rate": 2.7044421441108964e-05, + "loss": 0.2217, + "step": 43367 + }, + { + "epoch": 3.5132858068697344, + "grad_norm": 0.06172913312911987, + "learning_rate": 2.7039920788514334e-05, + "loss": 0.2102, + "step": 43368 + }, + { + "epoch": 3.5133668178872326, + "grad_norm": 0.05635415017604828, + "learning_rate": 2.7035420135919708e-05, + "loss": 0.199, + "step": 43369 + }, + { + "epoch": 3.513447828904731, + "grad_norm": 0.0707196295261383, + "learning_rate": 2.7030919483325085e-05, + "loss": 0.2203, + "step": 43370 + }, + { + "epoch": 3.5135288399222295, + "grad_norm": 0.07637009769678116, + "learning_rate": 2.7026418830730455e-05, + "loss": 0.2486, + "step": 43371 + }, + { + "epoch": 3.5136098509397278, + "grad_norm": 0.07130232453346252, + "learning_rate": 2.702191817813583e-05, + "loss": 0.2888, + "step": 43372 + }, + { + "epoch": 3.513690861957226, + "grad_norm": 0.07822450250387192, + "learning_rate": 2.7017417525541206e-05, + "loss": 0.2172, + "step": 43373 + }, + { + "epoch": 3.5137718729747247, + "grad_norm": 0.07567783445119858, + "learning_rate": 2.7012916872946583e-05, + "loss": 0.2101, + "step": 43374 + }, + { + "epoch": 3.513852883992223, + "grad_norm": 0.08278835564851761, + "learning_rate": 2.700841622035195e-05, + "loss": 0.2311, + "step": 43375 + }, + { + "epoch": 3.513933895009721, + "grad_norm": 0.07226582616567612, + "learning_rate": 2.7003915567757326e-05, + "loss": 0.1846, + "step": 43376 + }, + { + "epoch": 3.51401490602722, + "grad_norm": 0.09501677006483078, + "learning_rate": 2.6999414915162703e-05, + "loss": 0.2464, + "step": 43377 + }, + { + "epoch": 3.514095917044718, + "grad_norm": 0.0690857470035553, + "learning_rate": 2.699491426256807e-05, + "loss": 0.198, + "step": 43378 + }, + { + "epoch": 3.5141769280622164, + "grad_norm": 0.0640997588634491, + "learning_rate": 2.6990413609973447e-05, + "loss": 0.2525, + "step": 43379 + }, + { + "epoch": 3.514257939079715, + "grad_norm": 0.08295673131942749, + "learning_rate": 2.6985912957378824e-05, + "loss": 0.2663, + "step": 43380 + }, + { + "epoch": 3.5143389500972133, + "grad_norm": 
0.05473535507917404, + "learning_rate": 2.698141230478419e-05, + "loss": 0.2107, + "step": 43381 + }, + { + "epoch": 3.5144199611147116, + "grad_norm": 0.0696151927113533, + "learning_rate": 2.6976911652189568e-05, + "loss": 0.262, + "step": 43382 + }, + { + "epoch": 3.5145009721322102, + "grad_norm": 0.10080565512180328, + "learning_rate": 2.6972410999594945e-05, + "loss": 0.2504, + "step": 43383 + }, + { + "epoch": 3.5145819831497085, + "grad_norm": 0.07367312163114548, + "learning_rate": 2.6967910347000315e-05, + "loss": 0.2, + "step": 43384 + }, + { + "epoch": 3.5146629941672067, + "grad_norm": 0.0623532272875309, + "learning_rate": 2.696340969440569e-05, + "loss": 0.2122, + "step": 43385 + }, + { + "epoch": 3.514744005184705, + "grad_norm": 0.06241362541913986, + "learning_rate": 2.6958909041811066e-05, + "loss": 0.1914, + "step": 43386 + }, + { + "epoch": 3.5148250162022032, + "grad_norm": 0.07094134390354156, + "learning_rate": 2.695440838921644e-05, + "loss": 0.2357, + "step": 43387 + }, + { + "epoch": 3.514906027219702, + "grad_norm": 0.07751740515232086, + "learning_rate": 2.694990773662181e-05, + "loss": 0.2395, + "step": 43388 + }, + { + "epoch": 3.5149870382372, + "grad_norm": 0.08845694363117218, + "learning_rate": 2.6945407084027187e-05, + "loss": 0.2242, + "step": 43389 + }, + { + "epoch": 3.5150680492546984, + "grad_norm": 0.07656298577785492, + "learning_rate": 2.694090643143256e-05, + "loss": 0.2185, + "step": 43390 + }, + { + "epoch": 3.515149060272197, + "grad_norm": 0.058742620050907135, + "learning_rate": 2.693640577883793e-05, + "loss": 0.1817, + "step": 43391 + }, + { + "epoch": 3.5152300712896953, + "grad_norm": 0.06844554841518402, + "learning_rate": 2.6931905126243307e-05, + "loss": 0.2017, + "step": 43392 + }, + { + "epoch": 3.5153110823071936, + "grad_norm": 0.06278139352798462, + "learning_rate": 2.692740447364868e-05, + "loss": 0.2204, + "step": 43393 + }, + { + "epoch": 3.5153920933246923, + "grad_norm": 0.06582430005073547, + "learning_rate": 2.692290382105405e-05, + "loss": 0.2341, + "step": 43394 + }, + { + "epoch": 3.5154731043421905, + "grad_norm": 0.070784792304039, + "learning_rate": 2.6918403168459428e-05, + "loss": 0.2349, + "step": 43395 + }, + { + "epoch": 3.5155541153596888, + "grad_norm": 0.06411685794591904, + "learning_rate": 2.6913902515864802e-05, + "loss": 0.2045, + "step": 43396 + }, + { + "epoch": 3.5156351263771874, + "grad_norm": 0.08133114874362946, + "learning_rate": 2.6909401863270172e-05, + "loss": 0.2236, + "step": 43397 + }, + { + "epoch": 3.5157161373946857, + "grad_norm": 0.06460542976856232, + "learning_rate": 2.690490121067555e-05, + "loss": 0.2263, + "step": 43398 + }, + { + "epoch": 3.515797148412184, + "grad_norm": 0.07408098876476288, + "learning_rate": 2.6900400558080923e-05, + "loss": 0.19, + "step": 43399 + }, + { + "epoch": 3.5158781594296826, + "grad_norm": 0.08495966345071793, + "learning_rate": 2.68958999054863e-05, + "loss": 0.2577, + "step": 43400 + }, + { + "epoch": 3.515959170447181, + "grad_norm": 0.0630396381020546, + "learning_rate": 2.689139925289167e-05, + "loss": 0.1878, + "step": 43401 + }, + { + "epoch": 3.516040181464679, + "grad_norm": 0.07125577330589294, + "learning_rate": 2.6886898600297043e-05, + "loss": 0.2404, + "step": 43402 + }, + { + "epoch": 3.516121192482178, + "grad_norm": 0.07317827641963959, + "learning_rate": 2.688239794770242e-05, + "loss": 0.2285, + "step": 43403 + }, + { + "epoch": 3.516202203499676, + "grad_norm": 0.07188070565462112, + "learning_rate": 2.687789729510779e-05, + 
"loss": 0.2356, + "step": 43404 + }, + { + "epoch": 3.5162832145171743, + "grad_norm": 0.06981553882360458, + "learning_rate": 2.6873396642513164e-05, + "loss": 0.2491, + "step": 43405 + }, + { + "epoch": 3.516364225534673, + "grad_norm": 0.08644460141658783, + "learning_rate": 2.686889598991854e-05, + "loss": 0.2656, + "step": 43406 + }, + { + "epoch": 3.5164452365521712, + "grad_norm": 0.08687439560890198, + "learning_rate": 2.686439533732391e-05, + "loss": 0.2114, + "step": 43407 + }, + { + "epoch": 3.5165262475696695, + "grad_norm": 0.06241988390684128, + "learning_rate": 2.6859894684729285e-05, + "loss": 0.2051, + "step": 43408 + }, + { + "epoch": 3.5166072585871677, + "grad_norm": 0.08557629585266113, + "learning_rate": 2.6855394032134662e-05, + "loss": 0.2346, + "step": 43409 + }, + { + "epoch": 3.516688269604666, + "grad_norm": 0.07166421413421631, + "learning_rate": 2.6850893379540032e-05, + "loss": 0.201, + "step": 43410 + }, + { + "epoch": 3.5167692806221647, + "grad_norm": 0.08565226197242737, + "learning_rate": 2.6846392726945406e-05, + "loss": 0.2258, + "step": 43411 + }, + { + "epoch": 3.516850291639663, + "grad_norm": 0.07092215865850449, + "learning_rate": 2.6841892074350783e-05, + "loss": 0.2296, + "step": 43412 + }, + { + "epoch": 3.516931302657161, + "grad_norm": 0.07098492980003357, + "learning_rate": 2.683739142175616e-05, + "loss": 0.2104, + "step": 43413 + }, + { + "epoch": 3.51701231367466, + "grad_norm": 0.06782843917608261, + "learning_rate": 2.6832890769161527e-05, + "loss": 0.196, + "step": 43414 + }, + { + "epoch": 3.517093324692158, + "grad_norm": 0.07023775577545166, + "learning_rate": 2.6828390116566904e-05, + "loss": 0.246, + "step": 43415 + }, + { + "epoch": 3.5171743357096563, + "grad_norm": 0.0655292496085167, + "learning_rate": 2.682388946397228e-05, + "loss": 0.2372, + "step": 43416 + }, + { + "epoch": 3.517255346727155, + "grad_norm": 0.10283497720956802, + "learning_rate": 2.681938881137765e-05, + "loss": 0.2896, + "step": 43417 + }, + { + "epoch": 3.5173363577446533, + "grad_norm": 0.10663864761590958, + "learning_rate": 2.6814888158783025e-05, + "loss": 0.2179, + "step": 43418 + }, + { + "epoch": 3.5174173687621515, + "grad_norm": 0.07355464994907379, + "learning_rate": 2.68103875061884e-05, + "loss": 0.2129, + "step": 43419 + }, + { + "epoch": 3.51749837977965, + "grad_norm": 0.07699501514434814, + "learning_rate": 2.6805886853593772e-05, + "loss": 0.2353, + "step": 43420 + }, + { + "epoch": 3.5175793907971484, + "grad_norm": 0.0732915848493576, + "learning_rate": 2.6801386200999145e-05, + "loss": 0.2296, + "step": 43421 + }, + { + "epoch": 3.5176604018146467, + "grad_norm": 0.07379954308271408, + "learning_rate": 2.6796885548404522e-05, + "loss": 0.2263, + "step": 43422 + }, + { + "epoch": 3.5177414128321454, + "grad_norm": 0.11059697717428207, + "learning_rate": 2.6792384895809893e-05, + "loss": 0.2475, + "step": 43423 + }, + { + "epoch": 3.5178224238496436, + "grad_norm": 0.0744595155119896, + "learning_rate": 2.6787884243215266e-05, + "loss": 0.2472, + "step": 43424 + }, + { + "epoch": 3.517903434867142, + "grad_norm": 0.0822412520647049, + "learning_rate": 2.6783383590620643e-05, + "loss": 0.2009, + "step": 43425 + }, + { + "epoch": 3.5179844458846405, + "grad_norm": 0.0705241933465004, + "learning_rate": 2.6778882938026017e-05, + "loss": 0.2339, + "step": 43426 + }, + { + "epoch": 3.518065456902139, + "grad_norm": 0.05770557373762131, + "learning_rate": 2.6774382285431387e-05, + "loss": 0.21, + "step": 43427 + }, + { + "epoch": 
3.518146467919637, + "grad_norm": 0.06801163405179977, + "learning_rate": 2.6769881632836764e-05, + "loss": 0.2209, + "step": 43428 + }, + { + "epoch": 3.5182274789371357, + "grad_norm": 0.0825352743268013, + "learning_rate": 2.6765380980242138e-05, + "loss": 0.2544, + "step": 43429 + }, + { + "epoch": 3.518308489954634, + "grad_norm": 0.07521167397499084, + "learning_rate": 2.6760880327647508e-05, + "loss": 0.2237, + "step": 43430 + }, + { + "epoch": 3.518389500972132, + "grad_norm": 0.06605440378189087, + "learning_rate": 2.6756379675052885e-05, + "loss": 0.2292, + "step": 43431 + }, + { + "epoch": 3.5184705119896305, + "grad_norm": 0.06706567108631134, + "learning_rate": 2.675187902245826e-05, + "loss": 0.2135, + "step": 43432 + }, + { + "epoch": 3.5185515230071287, + "grad_norm": 0.058007679879665375, + "learning_rate": 2.674737836986363e-05, + "loss": 0.2274, + "step": 43433 + }, + { + "epoch": 3.5186325340246274, + "grad_norm": 0.07411615550518036, + "learning_rate": 2.6742877717269006e-05, + "loss": 0.2037, + "step": 43434 + }, + { + "epoch": 3.5187135450421256, + "grad_norm": 0.061007410287857056, + "learning_rate": 2.673837706467438e-05, + "loss": 0.1991, + "step": 43435 + }, + { + "epoch": 3.518794556059624, + "grad_norm": 0.06855777651071548, + "learning_rate": 2.673387641207975e-05, + "loss": 0.2241, + "step": 43436 + }, + { + "epoch": 3.5188755670771226, + "grad_norm": 0.08268239349126816, + "learning_rate": 2.6729375759485126e-05, + "loss": 0.2109, + "step": 43437 + }, + { + "epoch": 3.518956578094621, + "grad_norm": 0.06946399062871933, + "learning_rate": 2.67248751068905e-05, + "loss": 0.2634, + "step": 43438 + }, + { + "epoch": 3.519037589112119, + "grad_norm": 0.06716107577085495, + "learning_rate": 2.6720374454295877e-05, + "loss": 0.2045, + "step": 43439 + }, + { + "epoch": 3.5191186001296177, + "grad_norm": 0.07133271545171738, + "learning_rate": 2.6715873801701247e-05, + "loss": 0.2299, + "step": 43440 + }, + { + "epoch": 3.519199611147116, + "grad_norm": 0.09417259693145752, + "learning_rate": 2.671137314910662e-05, + "loss": 0.2444, + "step": 43441 + }, + { + "epoch": 3.5192806221646142, + "grad_norm": 0.06738929450511932, + "learning_rate": 2.6706872496511998e-05, + "loss": 0.2101, + "step": 43442 + }, + { + "epoch": 3.519361633182113, + "grad_norm": 0.07788601517677307, + "learning_rate": 2.6702371843917368e-05, + "loss": 0.2391, + "step": 43443 + }, + { + "epoch": 3.519442644199611, + "grad_norm": 0.07475084066390991, + "learning_rate": 2.669787119132274e-05, + "loss": 0.2092, + "step": 43444 + }, + { + "epoch": 3.5195236552171094, + "grad_norm": 0.06062447652220726, + "learning_rate": 2.669337053872812e-05, + "loss": 0.1965, + "step": 43445 + }, + { + "epoch": 3.519604666234608, + "grad_norm": 0.057632926851511, + "learning_rate": 2.668886988613349e-05, + "loss": 0.1866, + "step": 43446 + }, + { + "epoch": 3.5196856772521063, + "grad_norm": 0.08677884191274643, + "learning_rate": 2.6684369233538862e-05, + "loss": 0.2396, + "step": 43447 + }, + { + "epoch": 3.5197666882696046, + "grad_norm": 0.07600312680006027, + "learning_rate": 2.667986858094424e-05, + "loss": 0.1992, + "step": 43448 + }, + { + "epoch": 3.5198476992871033, + "grad_norm": 0.08222731947898865, + "learning_rate": 2.667536792834961e-05, + "loss": 0.2397, + "step": 43449 + }, + { + "epoch": 3.5199287103046015, + "grad_norm": 0.07942266762256622, + "learning_rate": 2.6670867275754983e-05, + "loss": 0.225, + "step": 43450 + }, + { + "epoch": 3.5200097213220998, + "grad_norm": 0.09600380808115005, + 
"learning_rate": 2.666636662316036e-05, + "loss": 0.2707, + "step": 43451 + }, + { + "epoch": 3.5200907323395985, + "grad_norm": 0.08181590586900711, + "learning_rate": 2.6661865970565737e-05, + "loss": 0.2215, + "step": 43452 + }, + { + "epoch": 3.5201717433570967, + "grad_norm": 0.06803275644779205, + "learning_rate": 2.6657365317971107e-05, + "loss": 0.2304, + "step": 43453 + }, + { + "epoch": 3.520252754374595, + "grad_norm": 0.06476163119077682, + "learning_rate": 2.665286466537648e-05, + "loss": 0.2317, + "step": 43454 + }, + { + "epoch": 3.520333765392093, + "grad_norm": 0.06914833933115005, + "learning_rate": 2.6648364012781858e-05, + "loss": 0.2051, + "step": 43455 + }, + { + "epoch": 3.5204147764095914, + "grad_norm": 0.07922517508268356, + "learning_rate": 2.6643863360187228e-05, + "loss": 0.2147, + "step": 43456 + }, + { + "epoch": 3.52049578742709, + "grad_norm": 0.0863863155245781, + "learning_rate": 2.6639362707592602e-05, + "loss": 0.2379, + "step": 43457 + }, + { + "epoch": 3.5205767984445884, + "grad_norm": 0.0803600624203682, + "learning_rate": 2.663486205499798e-05, + "loss": 0.2228, + "step": 43458 + }, + { + "epoch": 3.5206578094620866, + "grad_norm": 0.0711086243391037, + "learning_rate": 2.663036140240335e-05, + "loss": 0.2155, + "step": 43459 + }, + { + "epoch": 3.5207388204795853, + "grad_norm": 0.07211203128099442, + "learning_rate": 2.6625860749808723e-05, + "loss": 0.2323, + "step": 43460 + }, + { + "epoch": 3.5208198314970836, + "grad_norm": 0.06690596044063568, + "learning_rate": 2.66213600972141e-05, + "loss": 0.217, + "step": 43461 + }, + { + "epoch": 3.520900842514582, + "grad_norm": 0.06400663405656815, + "learning_rate": 2.661685944461947e-05, + "loss": 0.2124, + "step": 43462 + }, + { + "epoch": 3.5209818535320805, + "grad_norm": 0.06587471812963486, + "learning_rate": 2.6612358792024843e-05, + "loss": 0.207, + "step": 43463 + }, + { + "epoch": 3.5210628645495787, + "grad_norm": 0.07347527891397476, + "learning_rate": 2.660785813943022e-05, + "loss": 0.2551, + "step": 43464 + }, + { + "epoch": 3.521143875567077, + "grad_norm": 0.0675148069858551, + "learning_rate": 2.6603357486835594e-05, + "loss": 0.2076, + "step": 43465 + }, + { + "epoch": 3.5212248865845757, + "grad_norm": 0.06692413985729218, + "learning_rate": 2.6598856834240964e-05, + "loss": 0.2183, + "step": 43466 + }, + { + "epoch": 3.521305897602074, + "grad_norm": 0.06547009944915771, + "learning_rate": 2.659435618164634e-05, + "loss": 0.2368, + "step": 43467 + }, + { + "epoch": 3.521386908619572, + "grad_norm": 0.06786834448575974, + "learning_rate": 2.6589855529051715e-05, + "loss": 0.2102, + "step": 43468 + }, + { + "epoch": 3.521467919637071, + "grad_norm": 0.07979714125394821, + "learning_rate": 2.6585354876457085e-05, + "loss": 0.2283, + "step": 43469 + }, + { + "epoch": 3.521548930654569, + "grad_norm": 0.08165741711854935, + "learning_rate": 2.6580854223862462e-05, + "loss": 0.2312, + "step": 43470 + }, + { + "epoch": 3.5216299416720673, + "grad_norm": 0.06408173590898514, + "learning_rate": 2.6576353571267836e-05, + "loss": 0.2192, + "step": 43471 + }, + { + "epoch": 3.521710952689566, + "grad_norm": 0.0637429803609848, + "learning_rate": 2.6571852918673206e-05, + "loss": 0.2311, + "step": 43472 + }, + { + "epoch": 3.5217919637070643, + "grad_norm": 0.09980673342943192, + "learning_rate": 2.6567352266078583e-05, + "loss": 0.2742, + "step": 43473 + }, + { + "epoch": 3.5218729747245625, + "grad_norm": 0.07004711031913757, + "learning_rate": 2.6562851613483956e-05, + "loss": 0.2271, + 
"step": 43474 + }, + { + "epoch": 3.5219539857420608, + "grad_norm": 0.07771208882331848, + "learning_rate": 2.6558350960889327e-05, + "loss": 0.256, + "step": 43475 + }, + { + "epoch": 3.5220349967595594, + "grad_norm": 0.06684209406375885, + "learning_rate": 2.6553850308294704e-05, + "loss": 0.226, + "step": 43476 + }, + { + "epoch": 3.5221160077770577, + "grad_norm": 0.07327425479888916, + "learning_rate": 2.6549349655700077e-05, + "loss": 0.2919, + "step": 43477 + }, + { + "epoch": 3.522197018794556, + "grad_norm": 0.05451936274766922, + "learning_rate": 2.6544849003105454e-05, + "loss": 0.1945, + "step": 43478 + }, + { + "epoch": 3.522278029812054, + "grad_norm": 0.06572496891021729, + "learning_rate": 2.6540348350510824e-05, + "loss": 0.212, + "step": 43479 + }, + { + "epoch": 3.522359040829553, + "grad_norm": 0.08500397950410843, + "learning_rate": 2.6535847697916198e-05, + "loss": 0.2106, + "step": 43480 + }, + { + "epoch": 3.522440051847051, + "grad_norm": 0.07888907194137573, + "learning_rate": 2.6531347045321575e-05, + "loss": 0.212, + "step": 43481 + }, + { + "epoch": 3.5225210628645494, + "grad_norm": 0.07335029542446136, + "learning_rate": 2.6526846392726945e-05, + "loss": 0.2303, + "step": 43482 + }, + { + "epoch": 3.522602073882048, + "grad_norm": 0.06485271453857422, + "learning_rate": 2.652234574013232e-05, + "loss": 0.1977, + "step": 43483 + }, + { + "epoch": 3.5226830848995463, + "grad_norm": 0.07313435524702072, + "learning_rate": 2.6517845087537696e-05, + "loss": 0.233, + "step": 43484 + }, + { + "epoch": 3.5227640959170445, + "grad_norm": 0.07400259375572205, + "learning_rate": 2.6513344434943066e-05, + "loss": 0.2333, + "step": 43485 + }, + { + "epoch": 3.5228451069345432, + "grad_norm": 0.07800918072462082, + "learning_rate": 2.6508843782348443e-05, + "loss": 0.2451, + "step": 43486 + }, + { + "epoch": 3.5229261179520415, + "grad_norm": 0.0771111249923706, + "learning_rate": 2.6504343129753817e-05, + "loss": 0.21, + "step": 43487 + }, + { + "epoch": 3.5230071289695397, + "grad_norm": 0.07032516598701477, + "learning_rate": 2.6499842477159187e-05, + "loss": 0.2256, + "step": 43488 + }, + { + "epoch": 3.5230881399870384, + "grad_norm": 0.0727558583021164, + "learning_rate": 2.6495341824564564e-05, + "loss": 0.2364, + "step": 43489 + }, + { + "epoch": 3.5231691510045366, + "grad_norm": 0.056780096143484116, + "learning_rate": 2.6490841171969938e-05, + "loss": 0.2018, + "step": 43490 + }, + { + "epoch": 3.523250162022035, + "grad_norm": 0.07738316059112549, + "learning_rate": 2.6486340519375314e-05, + "loss": 0.2495, + "step": 43491 + }, + { + "epoch": 3.5233311730395336, + "grad_norm": 0.07734917104244232, + "learning_rate": 2.6481839866780685e-05, + "loss": 0.211, + "step": 43492 + }, + { + "epoch": 3.523412184057032, + "grad_norm": 0.05843658745288849, + "learning_rate": 2.647733921418606e-05, + "loss": 0.2026, + "step": 43493 + }, + { + "epoch": 3.52349319507453, + "grad_norm": 0.06908644735813141, + "learning_rate": 2.6472838561591435e-05, + "loss": 0.2264, + "step": 43494 + }, + { + "epoch": 3.5235742060920288, + "grad_norm": 0.07848634570837021, + "learning_rate": 2.6468337908996806e-05, + "loss": 0.1974, + "step": 43495 + }, + { + "epoch": 3.523655217109527, + "grad_norm": 0.06460677087306976, + "learning_rate": 2.646383725640218e-05, + "loss": 0.2461, + "step": 43496 + }, + { + "epoch": 3.5237362281270252, + "grad_norm": 0.07518939673900604, + "learning_rate": 2.6459336603807556e-05, + "loss": 0.2351, + "step": 43497 + }, + { + "epoch": 3.5238172391445235, + 
"grad_norm": 0.06146230548620224, + "learning_rate": 2.6454835951212926e-05, + "loss": 0.2307, + "step": 43498 + }, + { + "epoch": 3.523898250162022, + "grad_norm": 0.08571221679449081, + "learning_rate": 2.64503352986183e-05, + "loss": 0.2154, + "step": 43499 + }, + { + "epoch": 3.5239792611795204, + "grad_norm": 0.08313552290201187, + "learning_rate": 2.6445834646023677e-05, + "loss": 0.2433, + "step": 43500 + }, + { + "epoch": 3.5240602721970187, + "grad_norm": 0.08192568272352219, + "learning_rate": 2.6441333993429047e-05, + "loss": 0.2015, + "step": 43501 + }, + { + "epoch": 3.524141283214517, + "grad_norm": 0.08400919288396835, + "learning_rate": 2.643683334083442e-05, + "loss": 0.2592, + "step": 43502 + }, + { + "epoch": 3.5242222942320156, + "grad_norm": 0.06450191885232925, + "learning_rate": 2.6432332688239798e-05, + "loss": 0.2239, + "step": 43503 + }, + { + "epoch": 3.524303305249514, + "grad_norm": 0.09003297984600067, + "learning_rate": 2.642783203564517e-05, + "loss": 0.2483, + "step": 43504 + }, + { + "epoch": 3.524384316267012, + "grad_norm": 0.07139277458190918, + "learning_rate": 2.642333138305054e-05, + "loss": 0.2072, + "step": 43505 + }, + { + "epoch": 3.524465327284511, + "grad_norm": 0.08002956211566925, + "learning_rate": 2.641883073045592e-05, + "loss": 0.2629, + "step": 43506 + }, + { + "epoch": 3.524546338302009, + "grad_norm": 0.07315345108509064, + "learning_rate": 2.6414330077861292e-05, + "loss": 0.2488, + "step": 43507 + }, + { + "epoch": 3.5246273493195073, + "grad_norm": 0.06532798707485199, + "learning_rate": 2.6409829425266662e-05, + "loss": 0.2092, + "step": 43508 + }, + { + "epoch": 3.524708360337006, + "grad_norm": 0.07230593264102936, + "learning_rate": 2.640532877267204e-05, + "loss": 0.2302, + "step": 43509 + }, + { + "epoch": 3.524789371354504, + "grad_norm": 0.06936454027891159, + "learning_rate": 2.6400828120077413e-05, + "loss": 0.2251, + "step": 43510 + }, + { + "epoch": 3.5248703823720025, + "grad_norm": 0.07026039808988571, + "learning_rate": 2.6396327467482783e-05, + "loss": 0.2332, + "step": 43511 + }, + { + "epoch": 3.524951393389501, + "grad_norm": 0.07848269492387772, + "learning_rate": 2.639182681488816e-05, + "loss": 0.224, + "step": 43512 + }, + { + "epoch": 3.5250324044069994, + "grad_norm": 0.07015900313854218, + "learning_rate": 2.6387326162293534e-05, + "loss": 0.2114, + "step": 43513 + }, + { + "epoch": 3.5251134154244976, + "grad_norm": 0.0776764303445816, + "learning_rate": 2.6382825509698904e-05, + "loss": 0.2402, + "step": 43514 + }, + { + "epoch": 3.5251944264419963, + "grad_norm": 0.06507275998592377, + "learning_rate": 2.637832485710428e-05, + "loss": 0.2209, + "step": 43515 + }, + { + "epoch": 3.5252754374594946, + "grad_norm": 0.09480078518390656, + "learning_rate": 2.6373824204509655e-05, + "loss": 0.2453, + "step": 43516 + }, + { + "epoch": 3.525356448476993, + "grad_norm": 0.057359907776117325, + "learning_rate": 2.636932355191503e-05, + "loss": 0.233, + "step": 43517 + }, + { + "epoch": 3.5254374594944915, + "grad_norm": 0.06667699664831161, + "learning_rate": 2.6364822899320402e-05, + "loss": 0.2, + "step": 43518 + }, + { + "epoch": 3.5255184705119897, + "grad_norm": 0.06882564723491669, + "learning_rate": 2.636032224672578e-05, + "loss": 0.2355, + "step": 43519 + }, + { + "epoch": 3.525599481529488, + "grad_norm": 0.06481310725212097, + "learning_rate": 2.6355821594131152e-05, + "loss": 0.2272, + "step": 43520 + }, + { + "epoch": 3.5256804925469862, + "grad_norm": 0.07352931797504425, + "learning_rate": 
2.6351320941536523e-05, + "loss": 0.2435, + "step": 43521 + }, + { + "epoch": 3.525761503564485, + "grad_norm": 0.07422930747270584, + "learning_rate": 2.63468202889419e-05, + "loss": 0.2068, + "step": 43522 + }, + { + "epoch": 3.525842514581983, + "grad_norm": 0.07219050079584122, + "learning_rate": 2.6342319636347273e-05, + "loss": 0.2251, + "step": 43523 + }, + { + "epoch": 3.5259235255994814, + "grad_norm": 0.07107728719711304, + "learning_rate": 2.6337818983752643e-05, + "loss": 0.2168, + "step": 43524 + }, + { + "epoch": 3.5260045366169797, + "grad_norm": 0.07279662787914276, + "learning_rate": 2.633331833115802e-05, + "loss": 0.2369, + "step": 43525 + }, + { + "epoch": 3.5260855476344783, + "grad_norm": 0.07390367239713669, + "learning_rate": 2.6328817678563394e-05, + "loss": 0.2059, + "step": 43526 + }, + { + "epoch": 3.5261665586519766, + "grad_norm": 0.07024620473384857, + "learning_rate": 2.6324317025968764e-05, + "loss": 0.2107, + "step": 43527 + }, + { + "epoch": 3.526247569669475, + "grad_norm": 0.06905306130647659, + "learning_rate": 2.631981637337414e-05, + "loss": 0.2138, + "step": 43528 + }, + { + "epoch": 3.5263285806869735, + "grad_norm": 0.07863122969865799, + "learning_rate": 2.6315315720779515e-05, + "loss": 0.2485, + "step": 43529 + }, + { + "epoch": 3.5264095917044718, + "grad_norm": 0.06559717655181885, + "learning_rate": 2.6310815068184892e-05, + "loss": 0.2174, + "step": 43530 + }, + { + "epoch": 3.52649060272197, + "grad_norm": 0.06733140349388123, + "learning_rate": 2.6306314415590262e-05, + "loss": 0.2276, + "step": 43531 + }, + { + "epoch": 3.5265716137394687, + "grad_norm": 0.05722331628203392, + "learning_rate": 2.6301813762995636e-05, + "loss": 0.226, + "step": 43532 + }, + { + "epoch": 3.526652624756967, + "grad_norm": 0.09110940247774124, + "learning_rate": 2.6297313110401013e-05, + "loss": 0.2204, + "step": 43533 + }, + { + "epoch": 3.526733635774465, + "grad_norm": 0.08864796161651611, + "learning_rate": 2.6292812457806383e-05, + "loss": 0.2645, + "step": 43534 + }, + { + "epoch": 3.526814646791964, + "grad_norm": 0.08280379325151443, + "learning_rate": 2.6288311805211756e-05, + "loss": 0.2228, + "step": 43535 + }, + { + "epoch": 3.526895657809462, + "grad_norm": 0.07867808640003204, + "learning_rate": 2.6283811152617133e-05, + "loss": 0.261, + "step": 43536 + }, + { + "epoch": 3.5269766688269604, + "grad_norm": 0.10088161379098892, + "learning_rate": 2.6279310500022504e-05, + "loss": 0.2678, + "step": 43537 + }, + { + "epoch": 3.527057679844459, + "grad_norm": 0.0921441987156868, + "learning_rate": 2.6274809847427877e-05, + "loss": 0.2104, + "step": 43538 + }, + { + "epoch": 3.5271386908619573, + "grad_norm": 0.0665295198559761, + "learning_rate": 2.6270309194833254e-05, + "loss": 0.2285, + "step": 43539 + }, + { + "epoch": 3.5272197018794555, + "grad_norm": 0.08360760658979416, + "learning_rate": 2.6265808542238624e-05, + "loss": 0.2474, + "step": 43540 + }, + { + "epoch": 3.5273007128969542, + "grad_norm": 0.0686190202832222, + "learning_rate": 2.6261307889643998e-05, + "loss": 0.1958, + "step": 43541 + }, + { + "epoch": 3.5273817239144525, + "grad_norm": 0.06415877491235733, + "learning_rate": 2.6256807237049375e-05, + "loss": 0.2363, + "step": 43542 + }, + { + "epoch": 3.5274627349319507, + "grad_norm": 0.0608087033033371, + "learning_rate": 2.6252306584454745e-05, + "loss": 0.2145, + "step": 43543 + }, + { + "epoch": 3.527543745949449, + "grad_norm": 0.07166855782270432, + "learning_rate": 2.624780593186012e-05, + "loss": 0.2506, + "step": 43544 
+ }, + { + "epoch": 3.5276247569669477, + "grad_norm": 0.06283847987651825, + "learning_rate": 2.6243305279265496e-05, + "loss": 0.2147, + "step": 43545 + }, + { + "epoch": 3.527705767984446, + "grad_norm": 0.05699127912521362, + "learning_rate": 2.623880462667087e-05, + "loss": 0.2245, + "step": 43546 + }, + { + "epoch": 3.527786779001944, + "grad_norm": 0.08262984454631805, + "learning_rate": 2.623430397407624e-05, + "loss": 0.2268, + "step": 43547 + }, + { + "epoch": 3.5278677900194424, + "grad_norm": 0.07195358723402023, + "learning_rate": 2.6229803321481617e-05, + "loss": 0.212, + "step": 43548 + }, + { + "epoch": 3.527948801036941, + "grad_norm": 0.048632893711328506, + "learning_rate": 2.622530266888699e-05, + "loss": 0.2073, + "step": 43549 + }, + { + "epoch": 3.5280298120544393, + "grad_norm": 0.07497584819793701, + "learning_rate": 2.622080201629236e-05, + "loss": 0.2036, + "step": 43550 + }, + { + "epoch": 3.5281108230719376, + "grad_norm": 0.06233372911810875, + "learning_rate": 2.6216301363697737e-05, + "loss": 0.1991, + "step": 43551 + }, + { + "epoch": 3.5281918340894363, + "grad_norm": 0.0687880665063858, + "learning_rate": 2.621180071110311e-05, + "loss": 0.1973, + "step": 43552 + }, + { + "epoch": 3.5282728451069345, + "grad_norm": 0.0592275932431221, + "learning_rate": 2.620730005850848e-05, + "loss": 0.211, + "step": 43553 + }, + { + "epoch": 3.5283538561244328, + "grad_norm": 0.07839930057525635, + "learning_rate": 2.6202799405913858e-05, + "loss": 0.2164, + "step": 43554 + }, + { + "epoch": 3.5284348671419314, + "grad_norm": 0.07318031042814255, + "learning_rate": 2.6198298753319235e-05, + "loss": 0.2062, + "step": 43555 + }, + { + "epoch": 3.5285158781594297, + "grad_norm": 0.07681858539581299, + "learning_rate": 2.6193798100724602e-05, + "loss": 0.2095, + "step": 43556 + }, + { + "epoch": 3.528596889176928, + "grad_norm": 0.0828225240111351, + "learning_rate": 2.618929744812998e-05, + "loss": 0.2307, + "step": 43557 + }, + { + "epoch": 3.5286779001944266, + "grad_norm": 0.07809402793645859, + "learning_rate": 2.6184796795535356e-05, + "loss": 0.2422, + "step": 43558 + }, + { + "epoch": 3.528758911211925, + "grad_norm": 0.06778375059366226, + "learning_rate": 2.618029614294073e-05, + "loss": 0.186, + "step": 43559 + }, + { + "epoch": 3.528839922229423, + "grad_norm": 0.07688381522893906, + "learning_rate": 2.61757954903461e-05, + "loss": 0.2154, + "step": 43560 + }, + { + "epoch": 3.528920933246922, + "grad_norm": 0.06942246109247208, + "learning_rate": 2.6171294837751477e-05, + "loss": 0.2178, + "step": 43561 + }, + { + "epoch": 3.52900194426442, + "grad_norm": 0.07926230132579803, + "learning_rate": 2.616679418515685e-05, + "loss": 0.2406, + "step": 43562 + }, + { + "epoch": 3.5290829552819183, + "grad_norm": 0.05696360394358635, + "learning_rate": 2.616229353256222e-05, + "loss": 0.2173, + "step": 43563 + }, + { + "epoch": 3.529163966299417, + "grad_norm": 0.07202611863613129, + "learning_rate": 2.6157792879967598e-05, + "loss": 0.2429, + "step": 43564 + }, + { + "epoch": 3.529244977316915, + "grad_norm": 0.07968437671661377, + "learning_rate": 2.615329222737297e-05, + "loss": 0.2496, + "step": 43565 + }, + { + "epoch": 3.5293259883344135, + "grad_norm": 0.06798778474330902, + "learning_rate": 2.614879157477834e-05, + "loss": 0.1897, + "step": 43566 + }, + { + "epoch": 3.5294069993519117, + "grad_norm": 0.06220000609755516, + "learning_rate": 2.614429092218372e-05, + "loss": 0.181, + "step": 43567 + }, + { + "epoch": 3.5294880103694104, + "grad_norm": 
0.06689947098493576, + "learning_rate": 2.6139790269589092e-05, + "loss": 0.213, + "step": 43568 + }, + { + "epoch": 3.5295690213869086, + "grad_norm": 0.08405417203903198, + "learning_rate": 2.6135289616994462e-05, + "loss": 0.2039, + "step": 43569 + }, + { + "epoch": 3.529650032404407, + "grad_norm": 0.0697874054312706, + "learning_rate": 2.613078896439984e-05, + "loss": 0.196, + "step": 43570 + }, + { + "epoch": 3.529731043421905, + "grad_norm": 0.06842770427465439, + "learning_rate": 2.6126288311805213e-05, + "loss": 0.1947, + "step": 43571 + }, + { + "epoch": 3.529812054439404, + "grad_norm": 0.056984636932611465, + "learning_rate": 2.612178765921059e-05, + "loss": 0.2377, + "step": 43572 + }, + { + "epoch": 3.529893065456902, + "grad_norm": 0.094151571393013, + "learning_rate": 2.611728700661596e-05, + "loss": 0.2242, + "step": 43573 + }, + { + "epoch": 3.5299740764744003, + "grad_norm": 0.07004104554653168, + "learning_rate": 2.6112786354021334e-05, + "loss": 0.241, + "step": 43574 + }, + { + "epoch": 3.530055087491899, + "grad_norm": 0.07609971612691879, + "learning_rate": 2.610828570142671e-05, + "loss": 0.2417, + "step": 43575 + }, + { + "epoch": 3.5301360985093972, + "grad_norm": 0.06777872145175934, + "learning_rate": 2.610378504883208e-05, + "loss": 0.2393, + "step": 43576 + }, + { + "epoch": 3.5302171095268955, + "grad_norm": 0.07987414300441742, + "learning_rate": 2.6099284396237455e-05, + "loss": 0.2029, + "step": 43577 + }, + { + "epoch": 3.530298120544394, + "grad_norm": 0.0721912682056427, + "learning_rate": 2.609478374364283e-05, + "loss": 0.2173, + "step": 43578 + }, + { + "epoch": 3.5303791315618924, + "grad_norm": 0.06779767572879791, + "learning_rate": 2.6090283091048202e-05, + "loss": 0.1965, + "step": 43579 + }, + { + "epoch": 3.5304601425793907, + "grad_norm": 0.08945897966623306, + "learning_rate": 2.6085782438453575e-05, + "loss": 0.246, + "step": 43580 + }, + { + "epoch": 3.5305411535968894, + "grad_norm": 0.054115235805511475, + "learning_rate": 2.6081281785858952e-05, + "loss": 0.2269, + "step": 43581 + }, + { + "epoch": 3.5306221646143876, + "grad_norm": 0.06920191645622253, + "learning_rate": 2.6076781133264323e-05, + "loss": 0.2241, + "step": 43582 + }, + { + "epoch": 3.530703175631886, + "grad_norm": 0.06602834910154343, + "learning_rate": 2.6072280480669696e-05, + "loss": 0.25, + "step": 43583 + }, + { + "epoch": 3.5307841866493845, + "grad_norm": 0.07330073416233063, + "learning_rate": 2.6067779828075073e-05, + "loss": 0.2399, + "step": 43584 + }, + { + "epoch": 3.530865197666883, + "grad_norm": 0.07389242202043533, + "learning_rate": 2.6063279175480447e-05, + "loss": 0.2061, + "step": 43585 + }, + { + "epoch": 3.530946208684381, + "grad_norm": 0.06531089544296265, + "learning_rate": 2.6058778522885817e-05, + "loss": 0.2389, + "step": 43586 + }, + { + "epoch": 3.5310272197018797, + "grad_norm": 0.06592812389135361, + "learning_rate": 2.6054277870291194e-05, + "loss": 0.2346, + "step": 43587 + }, + { + "epoch": 3.531108230719378, + "grad_norm": 0.08015481382608414, + "learning_rate": 2.604977721769657e-05, + "loss": 0.1921, + "step": 43588 + }, + { + "epoch": 3.531189241736876, + "grad_norm": 0.0655452087521553, + "learning_rate": 2.6045276565101938e-05, + "loss": 0.2252, + "step": 43589 + }, + { + "epoch": 3.5312702527543745, + "grad_norm": 0.06559097021818161, + "learning_rate": 2.6040775912507315e-05, + "loss": 0.2479, + "step": 43590 + }, + { + "epoch": 3.531351263771873, + "grad_norm": 0.07290159165859222, + "learning_rate": 2.6036275259912692e-05, 
+ "loss": 0.2125, + "step": 43591 + }, + { + "epoch": 3.5314322747893714, + "grad_norm": 0.08643457293510437, + "learning_rate": 2.603177460731806e-05, + "loss": 0.1838, + "step": 43592 + }, + { + "epoch": 3.5315132858068696, + "grad_norm": 0.08853253722190857, + "learning_rate": 2.6027273954723436e-05, + "loss": 0.2626, + "step": 43593 + }, + { + "epoch": 3.531594296824368, + "grad_norm": 0.07214788347482681, + "learning_rate": 2.6022773302128813e-05, + "loss": 0.2144, + "step": 43594 + }, + { + "epoch": 3.5316753078418666, + "grad_norm": 0.07845582067966461, + "learning_rate": 2.601827264953418e-05, + "loss": 0.2285, + "step": 43595 + }, + { + "epoch": 3.531756318859365, + "grad_norm": 0.08354339003562927, + "learning_rate": 2.6013771996939556e-05, + "loss": 0.2341, + "step": 43596 + }, + { + "epoch": 3.531837329876863, + "grad_norm": 0.07601010799407959, + "learning_rate": 2.6009271344344933e-05, + "loss": 0.224, + "step": 43597 + }, + { + "epoch": 3.5319183408943617, + "grad_norm": 0.07792460918426514, + "learning_rate": 2.6004770691750307e-05, + "loss": 0.2374, + "step": 43598 + }, + { + "epoch": 3.53199935191186, + "grad_norm": 0.07806096225976944, + "learning_rate": 2.6000270039155677e-05, + "loss": 0.2141, + "step": 43599 + }, + { + "epoch": 3.5320803629293582, + "grad_norm": 0.07231029868125916, + "learning_rate": 2.5995769386561054e-05, + "loss": 0.2236, + "step": 43600 + }, + { + "epoch": 3.532161373946857, + "grad_norm": 0.07012427598237991, + "learning_rate": 2.5991268733966428e-05, + "loss": 0.2853, + "step": 43601 + }, + { + "epoch": 3.532242384964355, + "grad_norm": 0.06293465197086334, + "learning_rate": 2.5986768081371798e-05, + "loss": 0.2437, + "step": 43602 + }, + { + "epoch": 3.5323233959818534, + "grad_norm": 0.057620417326688766, + "learning_rate": 2.5982267428777175e-05, + "loss": 0.1869, + "step": 43603 + }, + { + "epoch": 3.532404406999352, + "grad_norm": 0.08230112493038177, + "learning_rate": 2.597776677618255e-05, + "loss": 0.2458, + "step": 43604 + }, + { + "epoch": 3.5324854180168503, + "grad_norm": 0.0726366639137268, + "learning_rate": 2.597326612358792e-05, + "loss": 0.2376, + "step": 43605 + }, + { + "epoch": 3.5325664290343486, + "grad_norm": 0.07008222490549088, + "learning_rate": 2.5968765470993296e-05, + "loss": 0.2074, + "step": 43606 + }, + { + "epoch": 3.5326474400518473, + "grad_norm": 0.05460607632994652, + "learning_rate": 2.596426481839867e-05, + "loss": 0.2169, + "step": 43607 + }, + { + "epoch": 3.5327284510693455, + "grad_norm": 0.07533252239227295, + "learning_rate": 2.595976416580404e-05, + "loss": 0.2299, + "step": 43608 + }, + { + "epoch": 3.5328094620868438, + "grad_norm": 0.07314232736825943, + "learning_rate": 2.5955263513209417e-05, + "loss": 0.2462, + "step": 43609 + }, + { + "epoch": 3.5328904731043425, + "grad_norm": 0.08383066952228546, + "learning_rate": 2.595076286061479e-05, + "loss": 0.2775, + "step": 43610 + }, + { + "epoch": 3.5329714841218407, + "grad_norm": 0.05928299203515053, + "learning_rate": 2.5946262208020167e-05, + "loss": 0.2412, + "step": 43611 + }, + { + "epoch": 3.533052495139339, + "grad_norm": 0.09445102512836456, + "learning_rate": 2.5941761555425537e-05, + "loss": 0.2609, + "step": 43612 + }, + { + "epoch": 3.533133506156837, + "grad_norm": 0.0719989463686943, + "learning_rate": 2.593726090283091e-05, + "loss": 0.2429, + "step": 43613 + }, + { + "epoch": 3.5332145171743354, + "grad_norm": 0.07715129107236862, + "learning_rate": 2.5932760250236288e-05, + "loss": 0.251, + "step": 43614 + }, + { + "epoch": 
3.533295528191834, + "grad_norm": 0.06272150576114655, + "learning_rate": 2.5928259597641658e-05, + "loss": 0.2152, + "step": 43615 + }, + { + "epoch": 3.5333765392093324, + "grad_norm": 0.06173498556017876, + "learning_rate": 2.5923758945047032e-05, + "loss": 0.2567, + "step": 43616 + }, + { + "epoch": 3.5334575502268306, + "grad_norm": 0.060172874480485916, + "learning_rate": 2.591925829245241e-05, + "loss": 0.2426, + "step": 43617 + }, + { + "epoch": 3.5335385612443293, + "grad_norm": 0.06284452229738235, + "learning_rate": 2.591475763985778e-05, + "loss": 0.1975, + "step": 43618 + }, + { + "epoch": 3.5336195722618275, + "grad_norm": 0.07622663676738739, + "learning_rate": 2.5910256987263153e-05, + "loss": 0.2322, + "step": 43619 + }, + { + "epoch": 3.533700583279326, + "grad_norm": 0.0806276723742485, + "learning_rate": 2.590575633466853e-05, + "loss": 0.2461, + "step": 43620 + }, + { + "epoch": 3.5337815942968245, + "grad_norm": 0.07501956820487976, + "learning_rate": 2.59012556820739e-05, + "loss": 0.2145, + "step": 43621 + }, + { + "epoch": 3.5338626053143227, + "grad_norm": 0.06754549592733383, + "learning_rate": 2.5896755029479273e-05, + "loss": 0.2066, + "step": 43622 + }, + { + "epoch": 3.533943616331821, + "grad_norm": 0.06591545790433884, + "learning_rate": 2.589225437688465e-05, + "loss": 0.2396, + "step": 43623 + }, + { + "epoch": 3.5340246273493197, + "grad_norm": 0.0630035549402237, + "learning_rate": 2.5887753724290027e-05, + "loss": 0.2335, + "step": 43624 + }, + { + "epoch": 3.534105638366818, + "grad_norm": 0.06425105035305023, + "learning_rate": 2.5883253071695394e-05, + "loss": 0.2341, + "step": 43625 + }, + { + "epoch": 3.534186649384316, + "grad_norm": 0.0694139301776886, + "learning_rate": 2.587875241910077e-05, + "loss": 0.2123, + "step": 43626 + }, + { + "epoch": 3.534267660401815, + "grad_norm": 0.09128300100564957, + "learning_rate": 2.5874251766506148e-05, + "loss": 0.2328, + "step": 43627 + }, + { + "epoch": 3.534348671419313, + "grad_norm": 0.0733460932970047, + "learning_rate": 2.5869751113911515e-05, + "loss": 0.2387, + "step": 43628 + }, + { + "epoch": 3.5344296824368113, + "grad_norm": 0.06711853295564651, + "learning_rate": 2.5865250461316892e-05, + "loss": 0.2522, + "step": 43629 + }, + { + "epoch": 3.53451069345431, + "grad_norm": 0.058701638132333755, + "learning_rate": 2.586074980872227e-05, + "loss": 0.1884, + "step": 43630 + }, + { + "epoch": 3.5345917044718083, + "grad_norm": 0.07320652157068253, + "learning_rate": 2.585624915612764e-05, + "loss": 0.2047, + "step": 43631 + }, + { + "epoch": 3.5346727154893065, + "grad_norm": 0.05535358190536499, + "learning_rate": 2.5851748503533013e-05, + "loss": 0.1983, + "step": 43632 + }, + { + "epoch": 3.534753726506805, + "grad_norm": 0.08350860327482224, + "learning_rate": 2.584724785093839e-05, + "loss": 0.2057, + "step": 43633 + }, + { + "epoch": 3.5348347375243034, + "grad_norm": 0.0721396878361702, + "learning_rate": 2.584274719834376e-05, + "loss": 0.2156, + "step": 43634 + }, + { + "epoch": 3.5349157485418017, + "grad_norm": 0.07066772878170013, + "learning_rate": 2.5838246545749134e-05, + "loss": 0.2474, + "step": 43635 + }, + { + "epoch": 3.5349967595593, + "grad_norm": 0.07263556122779846, + "learning_rate": 2.583374589315451e-05, + "loss": 0.2433, + "step": 43636 + }, + { + "epoch": 3.535077770576798, + "grad_norm": 0.07293055951595306, + "learning_rate": 2.5829245240559884e-05, + "loss": 0.2511, + "step": 43637 + }, + { + "epoch": 3.535158781594297, + "grad_norm": 0.06933064013719559, + 
"learning_rate": 2.5824744587965255e-05, + "loss": 0.2128, + "step": 43638 + }, + { + "epoch": 3.535239792611795, + "grad_norm": 0.08220583200454712, + "learning_rate": 2.582024393537063e-05, + "loss": 0.2472, + "step": 43639 + }, + { + "epoch": 3.5353208036292934, + "grad_norm": 0.06579196453094482, + "learning_rate": 2.5815743282776005e-05, + "loss": 0.2268, + "step": 43640 + }, + { + "epoch": 3.535401814646792, + "grad_norm": 0.05709078907966614, + "learning_rate": 2.5811242630181375e-05, + "loss": 0.2105, + "step": 43641 + }, + { + "epoch": 3.5354828256642903, + "grad_norm": 0.07426831126213074, + "learning_rate": 2.5806741977586752e-05, + "loss": 0.2047, + "step": 43642 + }, + { + "epoch": 3.5355638366817885, + "grad_norm": 0.06896337866783142, + "learning_rate": 2.5802241324992126e-05, + "loss": 0.1821, + "step": 43643 + }, + { + "epoch": 3.535644847699287, + "grad_norm": 0.061936162412166595, + "learning_rate": 2.5797740672397496e-05, + "loss": 0.2188, + "step": 43644 + }, + { + "epoch": 3.5357258587167855, + "grad_norm": 0.07506543397903442, + "learning_rate": 2.5793240019802873e-05, + "loss": 0.2282, + "step": 43645 + }, + { + "epoch": 3.5358068697342837, + "grad_norm": 0.07365099340677261, + "learning_rate": 2.5788739367208247e-05, + "loss": 0.2455, + "step": 43646 + }, + { + "epoch": 3.5358878807517824, + "grad_norm": 0.07197272032499313, + "learning_rate": 2.5784238714613617e-05, + "loss": 0.2295, + "step": 43647 + }, + { + "epoch": 3.5359688917692806, + "grad_norm": 0.07811829447746277, + "learning_rate": 2.5779738062018994e-05, + "loss": 0.2401, + "step": 43648 + }, + { + "epoch": 3.536049902786779, + "grad_norm": 0.07260419428348541, + "learning_rate": 2.5775237409424368e-05, + "loss": 0.2405, + "step": 43649 + }, + { + "epoch": 3.5361309138042776, + "grad_norm": 0.05985453352332115, + "learning_rate": 2.5770736756829745e-05, + "loss": 0.2362, + "step": 43650 + }, + { + "epoch": 3.536211924821776, + "grad_norm": 0.07213029265403748, + "learning_rate": 2.5766236104235115e-05, + "loss": 0.1907, + "step": 43651 + }, + { + "epoch": 3.536292935839274, + "grad_norm": 0.08837267756462097, + "learning_rate": 2.576173545164049e-05, + "loss": 0.2326, + "step": 43652 + }, + { + "epoch": 3.5363739468567728, + "grad_norm": 0.09892171621322632, + "learning_rate": 2.5757234799045865e-05, + "loss": 0.2106, + "step": 43653 + }, + { + "epoch": 3.536454957874271, + "grad_norm": 0.07501991838216782, + "learning_rate": 2.5752734146451236e-05, + "loss": 0.2417, + "step": 43654 + }, + { + "epoch": 3.5365359688917692, + "grad_norm": 0.09497696161270142, + "learning_rate": 2.574823349385661e-05, + "loss": 0.2545, + "step": 43655 + }, + { + "epoch": 3.536616979909268, + "grad_norm": 0.06474223732948303, + "learning_rate": 2.5743732841261986e-05, + "loss": 0.2293, + "step": 43656 + }, + { + "epoch": 3.536697990926766, + "grad_norm": 0.0834692195057869, + "learning_rate": 2.5739232188667356e-05, + "loss": 0.229, + "step": 43657 + }, + { + "epoch": 3.5367790019442644, + "grad_norm": 0.0605698861181736, + "learning_rate": 2.573473153607273e-05, + "loss": 0.1839, + "step": 43658 + }, + { + "epoch": 3.5368600129617627, + "grad_norm": 0.06913433223962784, + "learning_rate": 2.5730230883478107e-05, + "loss": 0.2196, + "step": 43659 + }, + { + "epoch": 3.536941023979261, + "grad_norm": 0.067677341401577, + "learning_rate": 2.5725730230883477e-05, + "loss": 0.2393, + "step": 43660 + }, + { + "epoch": 3.5370220349967596, + "grad_norm": 0.06225701421499252, + "learning_rate": 2.572122957828885e-05, + "loss": 
0.2052, + "step": 43661 + }, + { + "epoch": 3.537103046014258, + "grad_norm": 0.07869463413953781, + "learning_rate": 2.5716728925694228e-05, + "loss": 0.2149, + "step": 43662 + }, + { + "epoch": 3.537184057031756, + "grad_norm": 0.08113308250904083, + "learning_rate": 2.5712228273099605e-05, + "loss": 0.2209, + "step": 43663 + }, + { + "epoch": 3.537265068049255, + "grad_norm": 0.06685000658035278, + "learning_rate": 2.5707727620504975e-05, + "loss": 0.2358, + "step": 43664 + }, + { + "epoch": 3.537346079066753, + "grad_norm": 0.06469444185495377, + "learning_rate": 2.570322696791035e-05, + "loss": 0.1878, + "step": 43665 + }, + { + "epoch": 3.5374270900842513, + "grad_norm": 0.07345057278871536, + "learning_rate": 2.5698726315315726e-05, + "loss": 0.2159, + "step": 43666 + }, + { + "epoch": 3.53750810110175, + "grad_norm": 0.07360540330410004, + "learning_rate": 2.5694225662721096e-05, + "loss": 0.2452, + "step": 43667 + }, + { + "epoch": 3.537589112119248, + "grad_norm": 0.07926061004400253, + "learning_rate": 2.568972501012647e-05, + "loss": 0.2357, + "step": 43668 + }, + { + "epoch": 3.5376701231367464, + "grad_norm": 0.07265284657478333, + "learning_rate": 2.5685224357531846e-05, + "loss": 0.2303, + "step": 43669 + }, + { + "epoch": 3.537751134154245, + "grad_norm": 0.06345818936824799, + "learning_rate": 2.5680723704937217e-05, + "loss": 0.2334, + "step": 43670 + }, + { + "epoch": 3.5378321451717434, + "grad_norm": 0.08022277802228928, + "learning_rate": 2.567622305234259e-05, + "loss": 0.2569, + "step": 43671 + }, + { + "epoch": 3.5379131561892416, + "grad_norm": 0.0716053694486618, + "learning_rate": 2.5671722399747967e-05, + "loss": 0.2109, + "step": 43672 + }, + { + "epoch": 3.5379941672067403, + "grad_norm": 0.0784161239862442, + "learning_rate": 2.5667221747153337e-05, + "loss": 0.2048, + "step": 43673 + }, + { + "epoch": 3.5380751782242386, + "grad_norm": 0.07868270576000214, + "learning_rate": 2.566272109455871e-05, + "loss": 0.2192, + "step": 43674 + }, + { + "epoch": 3.538156189241737, + "grad_norm": 0.06503627449274063, + "learning_rate": 2.5658220441964088e-05, + "loss": 0.2216, + "step": 43675 + }, + { + "epoch": 3.5382372002592355, + "grad_norm": 0.06809493899345398, + "learning_rate": 2.565371978936946e-05, + "loss": 0.2062, + "step": 43676 + }, + { + "epoch": 3.5383182112767337, + "grad_norm": 0.06095467135310173, + "learning_rate": 2.5649219136774832e-05, + "loss": 0.1965, + "step": 43677 + }, + { + "epoch": 3.538399222294232, + "grad_norm": 0.08310698717832565, + "learning_rate": 2.564471848418021e-05, + "loss": 0.2011, + "step": 43678 + }, + { + "epoch": 3.5384802333117307, + "grad_norm": 0.06410634517669678, + "learning_rate": 2.5640217831585582e-05, + "loss": 0.2179, + "step": 43679 + }, + { + "epoch": 3.538561244329229, + "grad_norm": 0.07264720648527145, + "learning_rate": 2.5635717178990953e-05, + "loss": 0.2191, + "step": 43680 + }, + { + "epoch": 3.538642255346727, + "grad_norm": 0.06960725039243698, + "learning_rate": 2.563121652639633e-05, + "loss": 0.1936, + "step": 43681 + }, + { + "epoch": 3.5387232663642254, + "grad_norm": 0.06291031092405319, + "learning_rate": 2.5626715873801703e-05, + "loss": 0.1727, + "step": 43682 + }, + { + "epoch": 3.5388042773817237, + "grad_norm": 0.07345648854970932, + "learning_rate": 2.5622215221207073e-05, + "loss": 0.2459, + "step": 43683 + }, + { + "epoch": 3.5388852883992223, + "grad_norm": 0.05809454992413521, + "learning_rate": 2.561771456861245e-05, + "loss": 0.217, + "step": 43684 + }, + { + "epoch": 
3.5389662994167206, + "grad_norm": 0.06693921983242035, + "learning_rate": 2.5613213916017824e-05, + "loss": 0.2741, + "step": 43685 + }, + { + "epoch": 3.539047310434219, + "grad_norm": 0.07190316915512085, + "learning_rate": 2.5608713263423194e-05, + "loss": 0.2191, + "step": 43686 + }, + { + "epoch": 3.5391283214517175, + "grad_norm": 0.06388852745294571, + "learning_rate": 2.560421261082857e-05, + "loss": 0.2488, + "step": 43687 + }, + { + "epoch": 3.5392093324692158, + "grad_norm": 0.07432588189840317, + "learning_rate": 2.5599711958233945e-05, + "loss": 0.192, + "step": 43688 + }, + { + "epoch": 3.539290343486714, + "grad_norm": 0.07199820131063461, + "learning_rate": 2.5595211305639322e-05, + "loss": 0.2664, + "step": 43689 + }, + { + "epoch": 3.5393713545042127, + "grad_norm": 0.07324870675802231, + "learning_rate": 2.5590710653044692e-05, + "loss": 0.2255, + "step": 43690 + }, + { + "epoch": 3.539452365521711, + "grad_norm": 0.06241829693317413, + "learning_rate": 2.5586210000450066e-05, + "loss": 0.1873, + "step": 43691 + }, + { + "epoch": 3.539533376539209, + "grad_norm": 0.06450480967760086, + "learning_rate": 2.5581709347855443e-05, + "loss": 0.2215, + "step": 43692 + }, + { + "epoch": 3.539614387556708, + "grad_norm": 0.09523190557956696, + "learning_rate": 2.5577208695260813e-05, + "loss": 0.1804, + "step": 43693 + }, + { + "epoch": 3.539695398574206, + "grad_norm": 0.07226721197366714, + "learning_rate": 2.5572708042666186e-05, + "loss": 0.1974, + "step": 43694 + }, + { + "epoch": 3.5397764095917044, + "grad_norm": 0.07698127627372742, + "learning_rate": 2.5568207390071563e-05, + "loss": 0.2496, + "step": 43695 + }, + { + "epoch": 3.539857420609203, + "grad_norm": 0.09337909519672394, + "learning_rate": 2.5563706737476934e-05, + "loss": 0.2504, + "step": 43696 + }, + { + "epoch": 3.5399384316267013, + "grad_norm": 0.06675826013088226, + "learning_rate": 2.5559206084882307e-05, + "loss": 0.2234, + "step": 43697 + }, + { + "epoch": 3.5400194426441995, + "grad_norm": 0.07172457128763199, + "learning_rate": 2.5554705432287684e-05, + "loss": 0.2552, + "step": 43698 + }, + { + "epoch": 3.5401004536616982, + "grad_norm": 0.06760606169700623, + "learning_rate": 2.5550204779693054e-05, + "loss": 0.1864, + "step": 43699 + }, + { + "epoch": 3.5401814646791965, + "grad_norm": 0.07144549489021301, + "learning_rate": 2.554570412709843e-05, + "loss": 0.2419, + "step": 43700 + }, + { + "epoch": 3.5402624756966947, + "grad_norm": 0.08824021369218826, + "learning_rate": 2.5541203474503805e-05, + "loss": 0.2427, + "step": 43701 + }, + { + "epoch": 3.540343486714193, + "grad_norm": 0.07417906075716019, + "learning_rate": 2.5536702821909175e-05, + "loss": 0.2503, + "step": 43702 + }, + { + "epoch": 3.5404244977316917, + "grad_norm": 0.06739521026611328, + "learning_rate": 2.5532202169314552e-05, + "loss": 0.2274, + "step": 43703 + }, + { + "epoch": 3.54050550874919, + "grad_norm": 0.08891397714614868, + "learning_rate": 2.5527701516719926e-05, + "loss": 0.2285, + "step": 43704 + }, + { + "epoch": 3.540586519766688, + "grad_norm": 0.060347575694322586, + "learning_rate": 2.5523200864125303e-05, + "loss": 0.1862, + "step": 43705 + }, + { + "epoch": 3.5406675307841864, + "grad_norm": 0.06603474169969559, + "learning_rate": 2.5518700211530673e-05, + "loss": 0.249, + "step": 43706 + }, + { + "epoch": 3.540748541801685, + "grad_norm": 0.08578146994113922, + "learning_rate": 2.5514199558936047e-05, + "loss": 0.2211, + "step": 43707 + }, + { + "epoch": 3.5408295528191833, + "grad_norm": 
0.0664646178483963, + "learning_rate": 2.5509698906341424e-05, + "loss": 0.2244, + "step": 43708 + }, + { + "epoch": 3.5409105638366816, + "grad_norm": 0.06386777758598328, + "learning_rate": 2.5505198253746794e-05, + "loss": 0.2435, + "step": 43709 + }, + { + "epoch": 3.5409915748541803, + "grad_norm": 0.06778024137020111, + "learning_rate": 2.5500697601152168e-05, + "loss": 0.245, + "step": 43710 + }, + { + "epoch": 3.5410725858716785, + "grad_norm": 0.07709016650915146, + "learning_rate": 2.5496196948557544e-05, + "loss": 0.2463, + "step": 43711 + }, + { + "epoch": 3.5411535968891767, + "grad_norm": 0.06232065334916115, + "learning_rate": 2.5491696295962915e-05, + "loss": 0.2052, + "step": 43712 + }, + { + "epoch": 3.5412346079066754, + "grad_norm": 0.07343726605176926, + "learning_rate": 2.548719564336829e-05, + "loss": 0.2242, + "step": 43713 + }, + { + "epoch": 3.5413156189241737, + "grad_norm": 0.07775748521089554, + "learning_rate": 2.5482694990773665e-05, + "loss": 0.2272, + "step": 43714 + }, + { + "epoch": 3.541396629941672, + "grad_norm": 0.08513243496417999, + "learning_rate": 2.5478194338179036e-05, + "loss": 0.2375, + "step": 43715 + }, + { + "epoch": 3.5414776409591706, + "grad_norm": 0.07626286894083023, + "learning_rate": 2.547369368558441e-05, + "loss": 0.2435, + "step": 43716 + }, + { + "epoch": 3.541558651976669, + "grad_norm": 0.07494010031223297, + "learning_rate": 2.5469193032989786e-05, + "loss": 0.2179, + "step": 43717 + }, + { + "epoch": 3.541639662994167, + "grad_norm": 0.07805442065000534, + "learning_rate": 2.546469238039516e-05, + "loss": 0.2086, + "step": 43718 + }, + { + "epoch": 3.541720674011666, + "grad_norm": 0.08477230370044708, + "learning_rate": 2.546019172780053e-05, + "loss": 0.2189, + "step": 43719 + }, + { + "epoch": 3.541801685029164, + "grad_norm": 0.06619860976934433, + "learning_rate": 2.5455691075205907e-05, + "loss": 0.2509, + "step": 43720 + }, + { + "epoch": 3.5418826960466623, + "grad_norm": 0.09968457370996475, + "learning_rate": 2.545119042261128e-05, + "loss": 0.2586, + "step": 43721 + }, + { + "epoch": 3.541963707064161, + "grad_norm": 0.07434304803609848, + "learning_rate": 2.544668977001665e-05, + "loss": 0.2438, + "step": 43722 + }, + { + "epoch": 3.542044718081659, + "grad_norm": 0.08019180595874786, + "learning_rate": 2.5442189117422028e-05, + "loss": 0.2362, + "step": 43723 + }, + { + "epoch": 3.5421257290991575, + "grad_norm": 0.06282759457826614, + "learning_rate": 2.54376884648274e-05, + "loss": 0.2246, + "step": 43724 + }, + { + "epoch": 3.5422067401166557, + "grad_norm": 0.07077057659626007, + "learning_rate": 2.543318781223277e-05, + "loss": 0.2214, + "step": 43725 + }, + { + "epoch": 3.5422877511341544, + "grad_norm": 0.07170132547616959, + "learning_rate": 2.542868715963815e-05, + "loss": 0.1954, + "step": 43726 + }, + { + "epoch": 3.5423687621516526, + "grad_norm": 0.07327235490083694, + "learning_rate": 2.5424186507043522e-05, + "loss": 0.2473, + "step": 43727 + }, + { + "epoch": 3.542449773169151, + "grad_norm": 0.07679017633199692, + "learning_rate": 2.5419685854448892e-05, + "loss": 0.2075, + "step": 43728 + }, + { + "epoch": 3.542530784186649, + "grad_norm": 0.07175929844379425, + "learning_rate": 2.541518520185427e-05, + "loss": 0.2231, + "step": 43729 + }, + { + "epoch": 3.542611795204148, + "grad_norm": 0.07727406173944473, + "learning_rate": 2.5410684549259643e-05, + "loss": 0.233, + "step": 43730 + }, + { + "epoch": 3.542692806221646, + "grad_norm": 0.08105980604887009, + "learning_rate": 
2.540618389666502e-05, + "loss": 0.2394, + "step": 43731 + }, + { + "epoch": 3.5427738172391443, + "grad_norm": 0.0716603472828865, + "learning_rate": 2.540168324407039e-05, + "loss": 0.2573, + "step": 43732 + }, + { + "epoch": 3.542854828256643, + "grad_norm": 0.07205747067928314, + "learning_rate": 2.5397182591475767e-05, + "loss": 0.2198, + "step": 43733 + }, + { + "epoch": 3.5429358392741412, + "grad_norm": 0.07168762385845184, + "learning_rate": 2.539268193888114e-05, + "loss": 0.2072, + "step": 43734 + }, + { + "epoch": 3.5430168502916395, + "grad_norm": 0.07003463804721832, + "learning_rate": 2.538818128628651e-05, + "loss": 0.2305, + "step": 43735 + }, + { + "epoch": 3.543097861309138, + "grad_norm": 0.06664938479661942, + "learning_rate": 2.5383680633691888e-05, + "loss": 0.1891, + "step": 43736 + }, + { + "epoch": 3.5431788723266364, + "grad_norm": 0.0784919485449791, + "learning_rate": 2.537917998109726e-05, + "loss": 0.2537, + "step": 43737 + }, + { + "epoch": 3.5432598833441347, + "grad_norm": 0.060001879930496216, + "learning_rate": 2.5374679328502632e-05, + "loss": 0.212, + "step": 43738 + }, + { + "epoch": 3.5433408943616334, + "grad_norm": 0.0571327805519104, + "learning_rate": 2.537017867590801e-05, + "loss": 0.2013, + "step": 43739 + }, + { + "epoch": 3.5434219053791316, + "grad_norm": 0.06848404556512833, + "learning_rate": 2.5365678023313382e-05, + "loss": 0.2151, + "step": 43740 + }, + { + "epoch": 3.54350291639663, + "grad_norm": 0.0731678456068039, + "learning_rate": 2.5361177370718753e-05, + "loss": 0.243, + "step": 43741 + }, + { + "epoch": 3.5435839274141285, + "grad_norm": 0.06608740985393524, + "learning_rate": 2.535667671812413e-05, + "loss": 0.2395, + "step": 43742 + }, + { + "epoch": 3.5436649384316268, + "grad_norm": 0.07724490016698837, + "learning_rate": 2.5352176065529503e-05, + "loss": 0.2537, + "step": 43743 + }, + { + "epoch": 3.543745949449125, + "grad_norm": 0.07840392738580704, + "learning_rate": 2.534767541293488e-05, + "loss": 0.2642, + "step": 43744 + }, + { + "epoch": 3.5438269604666237, + "grad_norm": 0.06792247295379639, + "learning_rate": 2.534317476034025e-05, + "loss": 0.2715, + "step": 43745 + }, + { + "epoch": 3.543907971484122, + "grad_norm": 0.08233068883419037, + "learning_rate": 2.5338674107745624e-05, + "loss": 0.3045, + "step": 43746 + }, + { + "epoch": 3.54398898250162, + "grad_norm": 0.06983482837677002, + "learning_rate": 2.5334173455151e-05, + "loss": 0.2062, + "step": 43747 + }, + { + "epoch": 3.5440699935191184, + "grad_norm": 0.07004276663064957, + "learning_rate": 2.532967280255637e-05, + "loss": 0.2199, + "step": 43748 + }, + { + "epoch": 3.544151004536617, + "grad_norm": 0.0649484321475029, + "learning_rate": 2.5325172149961745e-05, + "loss": 0.1992, + "step": 43749 + }, + { + "epoch": 3.5442320155541154, + "grad_norm": 0.06359495967626572, + "learning_rate": 2.5320671497367122e-05, + "loss": 0.207, + "step": 43750 + }, + { + "epoch": 3.5443130265716136, + "grad_norm": 0.07534097880125046, + "learning_rate": 2.5316170844772492e-05, + "loss": 0.2242, + "step": 43751 + }, + { + "epoch": 3.544394037589112, + "grad_norm": 0.07306455075740814, + "learning_rate": 2.5311670192177866e-05, + "loss": 0.2349, + "step": 43752 + }, + { + "epoch": 3.5444750486066106, + "grad_norm": 0.07966720312833786, + "learning_rate": 2.5307169539583243e-05, + "loss": 0.2856, + "step": 43753 + }, + { + "epoch": 3.544556059624109, + "grad_norm": 0.06478086858987808, + "learning_rate": 2.5302668886988613e-05, + "loss": 0.2692, + "step": 43754 + }, + { 
+ "epoch": 3.544637070641607, + "grad_norm": 0.06660716980695724, + "learning_rate": 2.5298168234393986e-05, + "loss": 0.2397, + "step": 43755 + }, + { + "epoch": 3.5447180816591057, + "grad_norm": 0.06570249795913696, + "learning_rate": 2.5293667581799363e-05, + "loss": 0.2331, + "step": 43756 + }, + { + "epoch": 3.544799092676604, + "grad_norm": 0.07750461995601654, + "learning_rate": 2.5289166929204737e-05, + "loss": 0.2162, + "step": 43757 + }, + { + "epoch": 3.5448801036941022, + "grad_norm": 0.07073743641376495, + "learning_rate": 2.5284666276610107e-05, + "loss": 0.2553, + "step": 43758 + }, + { + "epoch": 3.544961114711601, + "grad_norm": 0.09300247579813004, + "learning_rate": 2.5280165624015484e-05, + "loss": 0.2662, + "step": 43759 + }, + { + "epoch": 3.545042125729099, + "grad_norm": 0.07046713680028915, + "learning_rate": 2.5275664971420858e-05, + "loss": 0.2389, + "step": 43760 + }, + { + "epoch": 3.5451231367465974, + "grad_norm": 0.05945826694369316, + "learning_rate": 2.5271164318826228e-05, + "loss": 0.2003, + "step": 43761 + }, + { + "epoch": 3.545204147764096, + "grad_norm": 0.052805542945861816, + "learning_rate": 2.5266663666231605e-05, + "loss": 0.2011, + "step": 43762 + }, + { + "epoch": 3.5452851587815943, + "grad_norm": 0.07712626457214355, + "learning_rate": 2.526216301363698e-05, + "loss": 0.2103, + "step": 43763 + }, + { + "epoch": 3.5453661697990926, + "grad_norm": 0.0709281861782074, + "learning_rate": 2.525766236104235e-05, + "loss": 0.2384, + "step": 43764 + }, + { + "epoch": 3.5454471808165913, + "grad_norm": 0.06912218034267426, + "learning_rate": 2.5253161708447726e-05, + "loss": 0.2296, + "step": 43765 + }, + { + "epoch": 3.5455281918340895, + "grad_norm": 0.056670188903808594, + "learning_rate": 2.5248661055853103e-05, + "loss": 0.2324, + "step": 43766 + }, + { + "epoch": 3.5456092028515878, + "grad_norm": 0.058022577315568924, + "learning_rate": 2.524416040325847e-05, + "loss": 0.2044, + "step": 43767 + }, + { + "epoch": 3.5456902138690864, + "grad_norm": 0.0642026960849762, + "learning_rate": 2.5239659750663847e-05, + "loss": 0.248, + "step": 43768 + }, + { + "epoch": 3.5457712248865847, + "grad_norm": 0.07394513487815857, + "learning_rate": 2.5235159098069224e-05, + "loss": 0.2265, + "step": 43769 + }, + { + "epoch": 3.545852235904083, + "grad_norm": 0.07777998596429825, + "learning_rate": 2.5230658445474597e-05, + "loss": 0.214, + "step": 43770 + }, + { + "epoch": 3.545933246921581, + "grad_norm": 0.06933100521564484, + "learning_rate": 2.5226157792879967e-05, + "loss": 0.243, + "step": 43771 + }, + { + "epoch": 3.54601425793908, + "grad_norm": 0.07766921818256378, + "learning_rate": 2.5221657140285344e-05, + "loss": 0.2295, + "step": 43772 + }, + { + "epoch": 3.546095268956578, + "grad_norm": 0.06734801083803177, + "learning_rate": 2.5217156487690718e-05, + "loss": 0.2374, + "step": 43773 + }, + { + "epoch": 3.5461762799740764, + "grad_norm": 0.060280006378889084, + "learning_rate": 2.5212655835096088e-05, + "loss": 0.1809, + "step": 43774 + }, + { + "epoch": 3.5462572909915746, + "grad_norm": 0.07323633134365082, + "learning_rate": 2.5208155182501465e-05, + "loss": 0.2311, + "step": 43775 + }, + { + "epoch": 3.5463383020090733, + "grad_norm": 0.07583745568990707, + "learning_rate": 2.520365452990684e-05, + "loss": 0.2303, + "step": 43776 + }, + { + "epoch": 3.5464193130265715, + "grad_norm": 0.04876444861292839, + "learning_rate": 2.519915387731221e-05, + "loss": 0.1779, + "step": 43777 + }, + { + "epoch": 3.54650032404407, + "grad_norm": 
0.06728420406579971, + "learning_rate": 2.5194653224717586e-05, + "loss": 0.2362, + "step": 43778 + }, + { + "epoch": 3.5465813350615685, + "grad_norm": 0.07322170585393906, + "learning_rate": 2.519015257212296e-05, + "loss": 0.2153, + "step": 43779 + }, + { + "epoch": 3.5466623460790667, + "grad_norm": 0.08688737452030182, + "learning_rate": 2.518565191952833e-05, + "loss": 0.2691, + "step": 43780 + }, + { + "epoch": 3.546743357096565, + "grad_norm": 0.060829900205135345, + "learning_rate": 2.5181151266933707e-05, + "loss": 0.2532, + "step": 43781 + }, + { + "epoch": 3.5468243681140637, + "grad_norm": 0.07676567882299423, + "learning_rate": 2.517665061433908e-05, + "loss": 0.246, + "step": 43782 + }, + { + "epoch": 3.546905379131562, + "grad_norm": 0.062333088368177414, + "learning_rate": 2.5172149961744457e-05, + "loss": 0.2144, + "step": 43783 + }, + { + "epoch": 3.54698639014906, + "grad_norm": 0.07637736201286316, + "learning_rate": 2.5167649309149828e-05, + "loss": 0.3052, + "step": 43784 + }, + { + "epoch": 3.547067401166559, + "grad_norm": 0.1071089506149292, + "learning_rate": 2.51631486565552e-05, + "loss": 0.2472, + "step": 43785 + }, + { + "epoch": 3.547148412184057, + "grad_norm": 0.07975732535123825, + "learning_rate": 2.5158648003960578e-05, + "loss": 0.2158, + "step": 43786 + }, + { + "epoch": 3.5472294232015553, + "grad_norm": 0.08271283656358719, + "learning_rate": 2.515414735136595e-05, + "loss": 0.2287, + "step": 43787 + }, + { + "epoch": 3.547310434219054, + "grad_norm": 0.08874785155057907, + "learning_rate": 2.5149646698771322e-05, + "loss": 0.2349, + "step": 43788 + }, + { + "epoch": 3.5473914452365523, + "grad_norm": 0.06711436808109283, + "learning_rate": 2.51451460461767e-05, + "loss": 0.2569, + "step": 43789 + }, + { + "epoch": 3.5474724562540505, + "grad_norm": 0.06952887773513794, + "learning_rate": 2.514064539358207e-05, + "loss": 0.2033, + "step": 43790 + }, + { + "epoch": 3.547553467271549, + "grad_norm": 0.08235787600278854, + "learning_rate": 2.5136144740987443e-05, + "loss": 0.2402, + "step": 43791 + }, + { + "epoch": 3.5476344782890474, + "grad_norm": 0.09327933937311172, + "learning_rate": 2.513164408839282e-05, + "loss": 0.2153, + "step": 43792 + }, + { + "epoch": 3.5477154893065457, + "grad_norm": 0.06308753788471222, + "learning_rate": 2.512714343579819e-05, + "loss": 0.2094, + "step": 43793 + }, + { + "epoch": 3.547796500324044, + "grad_norm": 0.083448126912117, + "learning_rate": 2.5122642783203564e-05, + "loss": 0.2456, + "step": 43794 + }, + { + "epoch": 3.5478775113415426, + "grad_norm": 0.09958131611347198, + "learning_rate": 2.511814213060894e-05, + "loss": 0.2039, + "step": 43795 + }, + { + "epoch": 3.547958522359041, + "grad_norm": 0.06535204499959946, + "learning_rate": 2.5113641478014314e-05, + "loss": 0.2398, + "step": 43796 + }, + { + "epoch": 3.548039533376539, + "grad_norm": 0.08059913665056229, + "learning_rate": 2.5109140825419685e-05, + "loss": 0.1909, + "step": 43797 + }, + { + "epoch": 3.5481205443940373, + "grad_norm": 0.07172200083732605, + "learning_rate": 2.510464017282506e-05, + "loss": 0.2171, + "step": 43798 + }, + { + "epoch": 3.548201555411536, + "grad_norm": 0.07303963601589203, + "learning_rate": 2.5100139520230435e-05, + "loss": 0.2011, + "step": 43799 + }, + { + "epoch": 3.5482825664290343, + "grad_norm": 0.06889622658491135, + "learning_rate": 2.5095638867635805e-05, + "loss": 0.1848, + "step": 43800 + }, + { + "epoch": 3.5483635774465325, + "grad_norm": 0.06545989215373993, + "learning_rate": 
2.5091138215041182e-05, + "loss": 0.2037, + "step": 43801 + }, + { + "epoch": 3.548444588464031, + "grad_norm": 0.07705027610063553, + "learning_rate": 2.508663756244656e-05, + "loss": 0.2038, + "step": 43802 + }, + { + "epoch": 3.5485255994815295, + "grad_norm": 0.08232953399419785, + "learning_rate": 2.5082136909851926e-05, + "loss": 0.2164, + "step": 43803 + }, + { + "epoch": 3.5486066104990277, + "grad_norm": 0.0697472095489502, + "learning_rate": 2.5077636257257303e-05, + "loss": 0.2141, + "step": 43804 + }, + { + "epoch": 3.5486876215165264, + "grad_norm": 0.07026585936546326, + "learning_rate": 2.507313560466268e-05, + "loss": 0.2203, + "step": 43805 + }, + { + "epoch": 3.5487686325340246, + "grad_norm": 0.06228466331958771, + "learning_rate": 2.5068634952068047e-05, + "loss": 0.2645, + "step": 43806 + }, + { + "epoch": 3.548849643551523, + "grad_norm": 0.065281443297863, + "learning_rate": 2.5064134299473424e-05, + "loss": 0.2255, + "step": 43807 + }, + { + "epoch": 3.5489306545690216, + "grad_norm": 0.07093493640422821, + "learning_rate": 2.50596336468788e-05, + "loss": 0.2631, + "step": 43808 + }, + { + "epoch": 3.54901166558652, + "grad_norm": 0.06182321906089783, + "learning_rate": 2.5055132994284175e-05, + "loss": 0.2215, + "step": 43809 + }, + { + "epoch": 3.549092676604018, + "grad_norm": 0.0823003426194191, + "learning_rate": 2.5050632341689545e-05, + "loss": 0.241, + "step": 43810 + }, + { + "epoch": 3.5491736876215167, + "grad_norm": 0.07107570767402649, + "learning_rate": 2.5046131689094922e-05, + "loss": 0.2367, + "step": 43811 + }, + { + "epoch": 3.549254698639015, + "grad_norm": 0.07341833412647247, + "learning_rate": 2.5041631036500295e-05, + "loss": 0.2445, + "step": 43812 + }, + { + "epoch": 3.5493357096565132, + "grad_norm": 0.05725165829062462, + "learning_rate": 2.5037130383905666e-05, + "loss": 0.2779, + "step": 43813 + }, + { + "epoch": 3.549416720674012, + "grad_norm": 0.06485152244567871, + "learning_rate": 2.5032629731311043e-05, + "loss": 0.2184, + "step": 43814 + }, + { + "epoch": 3.54949773169151, + "grad_norm": 0.07398539036512375, + "learning_rate": 2.5028129078716416e-05, + "loss": 0.2186, + "step": 43815 + }, + { + "epoch": 3.5495787427090084, + "grad_norm": 0.06690250337123871, + "learning_rate": 2.5023628426121786e-05, + "loss": 0.213, + "step": 43816 + }, + { + "epoch": 3.5496597537265067, + "grad_norm": 0.07127828896045685, + "learning_rate": 2.5019127773527163e-05, + "loss": 0.2559, + "step": 43817 + }, + { + "epoch": 3.5497407647440054, + "grad_norm": 0.06582987308502197, + "learning_rate": 2.5014627120932537e-05, + "loss": 0.221, + "step": 43818 + }, + { + "epoch": 3.5498217757615036, + "grad_norm": 0.06993681192398071, + "learning_rate": 2.5010126468337907e-05, + "loss": 0.2363, + "step": 43819 + }, + { + "epoch": 3.549902786779002, + "grad_norm": 0.08037863671779633, + "learning_rate": 2.5005625815743284e-05, + "loss": 0.2695, + "step": 43820 + }, + { + "epoch": 3.5499837977965, + "grad_norm": 0.06816840171813965, + "learning_rate": 2.5001125163148658e-05, + "loss": 0.2376, + "step": 43821 + }, + { + "epoch": 3.5500648088139988, + "grad_norm": 0.0628628358244896, + "learning_rate": 2.499662451055403e-05, + "loss": 0.2551, + "step": 43822 + }, + { + "epoch": 3.550145819831497, + "grad_norm": 0.060273364186286926, + "learning_rate": 2.4992123857959405e-05, + "loss": 0.2097, + "step": 43823 + }, + { + "epoch": 3.5502268308489953, + "grad_norm": 0.06653022766113281, + "learning_rate": 2.498762320536478e-05, + "loss": 0.2405, + "step": 43824 + }, 
+ { + "epoch": 3.550307841866494, + "grad_norm": 0.06800541281700134, + "learning_rate": 2.4983122552770152e-05, + "loss": 0.2408, + "step": 43825 + }, + { + "epoch": 3.550388852883992, + "grad_norm": 0.07112803310155869, + "learning_rate": 2.497862190017553e-05, + "loss": 0.2259, + "step": 43826 + }, + { + "epoch": 3.5504698639014904, + "grad_norm": 0.07241787761449814, + "learning_rate": 2.49741212475809e-05, + "loss": 0.19, + "step": 43827 + }, + { + "epoch": 3.550550874918989, + "grad_norm": 0.06978960335254669, + "learning_rate": 2.4969620594986273e-05, + "loss": 0.2314, + "step": 43828 + }, + { + "epoch": 3.5506318859364874, + "grad_norm": 0.07811637222766876, + "learning_rate": 2.496511994239165e-05, + "loss": 0.2636, + "step": 43829 + }, + { + "epoch": 3.5507128969539856, + "grad_norm": 0.07929955422878265, + "learning_rate": 2.496061928979702e-05, + "loss": 0.214, + "step": 43830 + }, + { + "epoch": 3.5507939079714843, + "grad_norm": 0.059971027076244354, + "learning_rate": 2.4956118637202394e-05, + "loss": 0.1951, + "step": 43831 + }, + { + "epoch": 3.5508749189889826, + "grad_norm": 0.06560038775205612, + "learning_rate": 2.495161798460777e-05, + "loss": 0.2191, + "step": 43832 + }, + { + "epoch": 3.550955930006481, + "grad_norm": 0.06976732611656189, + "learning_rate": 2.494711733201314e-05, + "loss": 0.2603, + "step": 43833 + }, + { + "epoch": 3.5510369410239795, + "grad_norm": 0.06167634576559067, + "learning_rate": 2.4942616679418518e-05, + "loss": 0.2475, + "step": 43834 + }, + { + "epoch": 3.5511179520414777, + "grad_norm": 0.06971214711666107, + "learning_rate": 2.493811602682389e-05, + "loss": 0.2262, + "step": 43835 + }, + { + "epoch": 3.551198963058976, + "grad_norm": 0.07255706936120987, + "learning_rate": 2.4933615374229262e-05, + "loss": 0.233, + "step": 43836 + }, + { + "epoch": 3.5512799740764747, + "grad_norm": 0.05798391252756119, + "learning_rate": 2.492911472163464e-05, + "loss": 0.2406, + "step": 43837 + }, + { + "epoch": 3.551360985093973, + "grad_norm": 0.09059900790452957, + "learning_rate": 2.4924614069040012e-05, + "loss": 0.2585, + "step": 43838 + }, + { + "epoch": 3.551441996111471, + "grad_norm": 0.07024770975112915, + "learning_rate": 2.4920113416445386e-05, + "loss": 0.2306, + "step": 43839 + }, + { + "epoch": 3.5515230071289694, + "grad_norm": 0.07047968357801437, + "learning_rate": 2.491561276385076e-05, + "loss": 0.2022, + "step": 43840 + }, + { + "epoch": 3.5516040181464676, + "grad_norm": 0.07896172255277634, + "learning_rate": 2.4911112111256133e-05, + "loss": 0.2118, + "step": 43841 + }, + { + "epoch": 3.5516850291639663, + "grad_norm": 0.06975747644901276, + "learning_rate": 2.4906611458661507e-05, + "loss": 0.2167, + "step": 43842 + }, + { + "epoch": 3.5517660401814646, + "grad_norm": 0.06797535717487335, + "learning_rate": 2.490211080606688e-05, + "loss": 0.2196, + "step": 43843 + }, + { + "epoch": 3.551847051198963, + "grad_norm": 0.06500260531902313, + "learning_rate": 2.4897610153472254e-05, + "loss": 0.2352, + "step": 43844 + }, + { + "epoch": 3.5519280622164615, + "grad_norm": 0.08648168295621872, + "learning_rate": 2.4893109500877628e-05, + "loss": 0.2259, + "step": 43845 + }, + { + "epoch": 3.5520090732339598, + "grad_norm": 0.0645139291882515, + "learning_rate": 2.4888608848283e-05, + "loss": 0.1921, + "step": 43846 + }, + { + "epoch": 3.552090084251458, + "grad_norm": 0.07572876662015915, + "learning_rate": 2.4884108195688378e-05, + "loss": 0.2005, + "step": 43847 + }, + { + "epoch": 3.5521710952689567, + "grad_norm": 
0.07796064019203186, + "learning_rate": 2.487960754309375e-05, + "loss": 0.2647, + "step": 43848 + }, + { + "epoch": 3.552252106286455, + "grad_norm": 0.06862769275903702, + "learning_rate": 2.4875106890499122e-05, + "loss": 0.1952, + "step": 43849 + }, + { + "epoch": 3.552333117303953, + "grad_norm": 0.06902038305997849, + "learning_rate": 2.48706062379045e-05, + "loss": 0.239, + "step": 43850 + }, + { + "epoch": 3.552414128321452, + "grad_norm": 0.06956470757722855, + "learning_rate": 2.486610558530987e-05, + "loss": 0.2132, + "step": 43851 + }, + { + "epoch": 3.55249513933895, + "grad_norm": 0.0691932812333107, + "learning_rate": 2.4861604932715246e-05, + "loss": 0.1995, + "step": 43852 + }, + { + "epoch": 3.5525761503564484, + "grad_norm": 0.05597129836678505, + "learning_rate": 2.485710428012062e-05, + "loss": 0.2127, + "step": 43853 + }, + { + "epoch": 3.552657161373947, + "grad_norm": 0.06926069408655167, + "learning_rate": 2.485260362752599e-05, + "loss": 0.2278, + "step": 43854 + }, + { + "epoch": 3.5527381723914453, + "grad_norm": 0.06039774417877197, + "learning_rate": 2.4848102974931367e-05, + "loss": 0.2404, + "step": 43855 + }, + { + "epoch": 3.5528191834089435, + "grad_norm": 0.07916494458913803, + "learning_rate": 2.484360232233674e-05, + "loss": 0.2635, + "step": 43856 + }, + { + "epoch": 3.5529001944264422, + "grad_norm": 0.06923363357782364, + "learning_rate": 2.483910166974211e-05, + "loss": 0.2183, + "step": 43857 + }, + { + "epoch": 3.5529812054439405, + "grad_norm": 0.07305173575878143, + "learning_rate": 2.4834601017147488e-05, + "loss": 0.2313, + "step": 43858 + }, + { + "epoch": 3.5530622164614387, + "grad_norm": 0.08665354549884796, + "learning_rate": 2.483010036455286e-05, + "loss": 0.2055, + "step": 43859 + }, + { + "epoch": 3.5531432274789374, + "grad_norm": 0.07427648454904556, + "learning_rate": 2.4825599711958235e-05, + "loss": 0.2022, + "step": 43860 + }, + { + "epoch": 3.5532242384964356, + "grad_norm": 0.053579483181238174, + "learning_rate": 2.482109905936361e-05, + "loss": 0.1787, + "step": 43861 + }, + { + "epoch": 3.553305249513934, + "grad_norm": 0.08276137709617615, + "learning_rate": 2.4816598406768982e-05, + "loss": 0.2655, + "step": 43862 + }, + { + "epoch": 3.553386260531432, + "grad_norm": 0.07999289780855179, + "learning_rate": 2.4812097754174356e-05, + "loss": 0.2256, + "step": 43863 + }, + { + "epoch": 3.5534672715489304, + "grad_norm": 0.058108363300561905, + "learning_rate": 2.480759710157973e-05, + "loss": 0.2014, + "step": 43864 + }, + { + "epoch": 3.553548282566429, + "grad_norm": 0.06204414367675781, + "learning_rate": 2.4803096448985107e-05, + "loss": 0.2261, + "step": 43865 + }, + { + "epoch": 3.5536292935839273, + "grad_norm": 0.07165088504552841, + "learning_rate": 2.4798595796390477e-05, + "loss": 0.2246, + "step": 43866 + }, + { + "epoch": 3.5537103046014256, + "grad_norm": 0.07575501501560211, + "learning_rate": 2.479409514379585e-05, + "loss": 0.2191, + "step": 43867 + }, + { + "epoch": 3.5537913156189243, + "grad_norm": 0.07097320258617401, + "learning_rate": 2.4789594491201227e-05, + "loss": 0.2366, + "step": 43868 + }, + { + "epoch": 3.5538723266364225, + "grad_norm": 0.06804100424051285, + "learning_rate": 2.4785093838606598e-05, + "loss": 0.2409, + "step": 43869 + }, + { + "epoch": 3.5539533376539207, + "grad_norm": 0.100010946393013, + "learning_rate": 2.478059318601197e-05, + "loss": 0.2307, + "step": 43870 + }, + { + "epoch": 3.5540343486714194, + "grad_norm": 0.06930720061063766, + "learning_rate": 
2.4776092533417348e-05, + "loss": 0.2133, + "step": 43871 + }, + { + "epoch": 3.5541153596889177, + "grad_norm": 0.0644577294588089, + "learning_rate": 2.477159188082272e-05, + "loss": 0.2123, + "step": 43872 + }, + { + "epoch": 3.554196370706416, + "grad_norm": 0.07674825936555862, + "learning_rate": 2.4767091228228095e-05, + "loss": 0.2602, + "step": 43873 + }, + { + "epoch": 3.5542773817239146, + "grad_norm": 0.06586476415395737, + "learning_rate": 2.476259057563347e-05, + "loss": 0.2086, + "step": 43874 + }, + { + "epoch": 3.554358392741413, + "grad_norm": 0.07297840714454651, + "learning_rate": 2.475808992303884e-05, + "loss": 0.2148, + "step": 43875 + }, + { + "epoch": 3.554439403758911, + "grad_norm": 0.08475550264120102, + "learning_rate": 2.4753589270444216e-05, + "loss": 0.2141, + "step": 43876 + }, + { + "epoch": 3.55452041477641, + "grad_norm": 0.06303331255912781, + "learning_rate": 2.474908861784959e-05, + "loss": 0.2065, + "step": 43877 + }, + { + "epoch": 3.554601425793908, + "grad_norm": 0.08519791066646576, + "learning_rate": 2.4744587965254963e-05, + "loss": 0.2199, + "step": 43878 + }, + { + "epoch": 3.5546824368114063, + "grad_norm": 0.0769948959350586, + "learning_rate": 2.4740087312660337e-05, + "loss": 0.2515, + "step": 43879 + }, + { + "epoch": 3.554763447828905, + "grad_norm": 0.0675671175122261, + "learning_rate": 2.473558666006571e-05, + "loss": 0.2318, + "step": 43880 + }, + { + "epoch": 3.554844458846403, + "grad_norm": 0.09687194973230362, + "learning_rate": 2.4731086007471084e-05, + "loss": 0.22, + "step": 43881 + }, + { + "epoch": 3.5549254698639015, + "grad_norm": 0.07729329913854599, + "learning_rate": 2.4726585354876458e-05, + "loss": 0.2081, + "step": 43882 + }, + { + "epoch": 3.5550064808814, + "grad_norm": 0.06892815232276917, + "learning_rate": 2.472208470228183e-05, + "loss": 0.219, + "step": 43883 + }, + { + "epoch": 3.5550874918988984, + "grad_norm": 0.08760858327150345, + "learning_rate": 2.4717584049687205e-05, + "loss": 0.2182, + "step": 43884 + }, + { + "epoch": 3.5551685029163966, + "grad_norm": 0.08728528022766113, + "learning_rate": 2.471308339709258e-05, + "loss": 0.242, + "step": 43885 + }, + { + "epoch": 3.555249513933895, + "grad_norm": 0.060018256306648254, + "learning_rate": 2.4708582744497956e-05, + "loss": 0.2011, + "step": 43886 + }, + { + "epoch": 3.555330524951393, + "grad_norm": 0.06998419761657715, + "learning_rate": 2.4704082091903326e-05, + "loss": 0.2274, + "step": 43887 + }, + { + "epoch": 3.555411535968892, + "grad_norm": 0.06843650341033936, + "learning_rate": 2.46995814393087e-05, + "loss": 0.2482, + "step": 43888 + }, + { + "epoch": 3.55549254698639, + "grad_norm": 0.06852348148822784, + "learning_rate": 2.4695080786714076e-05, + "loss": 0.2321, + "step": 43889 + }, + { + "epoch": 3.5555735580038883, + "grad_norm": 0.07453927397727966, + "learning_rate": 2.4690580134119447e-05, + "loss": 0.2313, + "step": 43890 + }, + { + "epoch": 3.555654569021387, + "grad_norm": 0.05931844562292099, + "learning_rate": 2.4686079481524824e-05, + "loss": 0.2634, + "step": 43891 + }, + { + "epoch": 3.5557355800388852, + "grad_norm": 0.08144770562648773, + "learning_rate": 2.4681578828930197e-05, + "loss": 0.2287, + "step": 43892 + }, + { + "epoch": 3.5558165910563835, + "grad_norm": 0.07383275032043457, + "learning_rate": 2.4677078176335567e-05, + "loss": 0.2394, + "step": 43893 + }, + { + "epoch": 3.555897602073882, + "grad_norm": 0.06248151510953903, + "learning_rate": 2.4672577523740944e-05, + "loss": 0.1873, + "step": 43894 + }, + { + 
"epoch": 3.5559786130913804, + "grad_norm": 0.06529638916254044, + "learning_rate": 2.4668076871146318e-05, + "loss": 0.2171, + "step": 43895 + }, + { + "epoch": 3.5560596241088787, + "grad_norm": 0.07343272864818573, + "learning_rate": 2.466357621855169e-05, + "loss": 0.2316, + "step": 43896 + }, + { + "epoch": 3.5561406351263773, + "grad_norm": 0.0745483785867691, + "learning_rate": 2.4659075565957065e-05, + "loss": 0.2576, + "step": 43897 + }, + { + "epoch": 3.5562216461438756, + "grad_norm": 0.06922461092472076, + "learning_rate": 2.465457491336244e-05, + "loss": 0.2411, + "step": 43898 + }, + { + "epoch": 3.556302657161374, + "grad_norm": 0.07483216375112534, + "learning_rate": 2.4650074260767812e-05, + "loss": 0.2621, + "step": 43899 + }, + { + "epoch": 3.5563836681788725, + "grad_norm": 0.06790409982204437, + "learning_rate": 2.4645573608173186e-05, + "loss": 0.2419, + "step": 43900 + }, + { + "epoch": 3.5564646791963708, + "grad_norm": 0.0795658528804779, + "learning_rate": 2.464107295557856e-05, + "loss": 0.1978, + "step": 43901 + }, + { + "epoch": 3.556545690213869, + "grad_norm": 0.07117760181427002, + "learning_rate": 2.4636572302983933e-05, + "loss": 0.2248, + "step": 43902 + }, + { + "epoch": 3.5566267012313677, + "grad_norm": 0.07886912673711777, + "learning_rate": 2.4632071650389307e-05, + "loss": 0.2056, + "step": 43903 + }, + { + "epoch": 3.556707712248866, + "grad_norm": 0.06801647692918777, + "learning_rate": 2.462757099779468e-05, + "loss": 0.2231, + "step": 43904 + }, + { + "epoch": 3.556788723266364, + "grad_norm": 0.06421799212694168, + "learning_rate": 2.4623070345200054e-05, + "loss": 0.2224, + "step": 43905 + }, + { + "epoch": 3.556869734283863, + "grad_norm": 0.06796405464410782, + "learning_rate": 2.4618569692605428e-05, + "loss": 0.1922, + "step": 43906 + }, + { + "epoch": 3.556950745301361, + "grad_norm": 0.05750831589102745, + "learning_rate": 2.4614069040010805e-05, + "loss": 0.2175, + "step": 43907 + }, + { + "epoch": 3.5570317563188594, + "grad_norm": 0.07587180286645889, + "learning_rate": 2.4609568387416175e-05, + "loss": 0.2163, + "step": 43908 + }, + { + "epoch": 3.5571127673363576, + "grad_norm": 0.06804661452770233, + "learning_rate": 2.460506773482155e-05, + "loss": 0.2091, + "step": 43909 + }, + { + "epoch": 3.557193778353856, + "grad_norm": 0.0663495659828186, + "learning_rate": 2.4600567082226925e-05, + "loss": 0.2252, + "step": 43910 + }, + { + "epoch": 3.5572747893713546, + "grad_norm": 0.07366745918989182, + "learning_rate": 2.45960664296323e-05, + "loss": 0.2124, + "step": 43911 + }, + { + "epoch": 3.557355800388853, + "grad_norm": 0.07139547169208527, + "learning_rate": 2.4591565777037673e-05, + "loss": 0.2407, + "step": 43912 + }, + { + "epoch": 3.557436811406351, + "grad_norm": 0.06392621994018555, + "learning_rate": 2.4587065124443046e-05, + "loss": 0.2014, + "step": 43913 + }, + { + "epoch": 3.5575178224238497, + "grad_norm": 0.08841899037361145, + "learning_rate": 2.458256447184842e-05, + "loss": 0.2268, + "step": 43914 + }, + { + "epoch": 3.557598833441348, + "grad_norm": 0.08579627424478531, + "learning_rate": 2.4578063819253793e-05, + "loss": 0.2625, + "step": 43915 + }, + { + "epoch": 3.557679844458846, + "grad_norm": 0.06458761543035507, + "learning_rate": 2.4573563166659167e-05, + "loss": 0.21, + "step": 43916 + }, + { + "epoch": 3.557760855476345, + "grad_norm": 0.07201027870178223, + "learning_rate": 2.456906251406454e-05, + "loss": 0.2427, + "step": 43917 + }, + { + "epoch": 3.557841866493843, + "grad_norm": 
0.07014687359333038, + "learning_rate": 2.4564561861469914e-05, + "loss": 0.2211, + "step": 43918 + }, + { + "epoch": 3.5579228775113414, + "grad_norm": 0.06852081418037415, + "learning_rate": 2.4560061208875288e-05, + "loss": 0.2001, + "step": 43919 + }, + { + "epoch": 3.55800388852884, + "grad_norm": 0.07407641410827637, + "learning_rate": 2.455556055628066e-05, + "loss": 0.2404, + "step": 43920 + }, + { + "epoch": 3.5580848995463383, + "grad_norm": 0.07178988307714462, + "learning_rate": 2.4551059903686035e-05, + "loss": 0.2773, + "step": 43921 + }, + { + "epoch": 3.5581659105638366, + "grad_norm": 0.07537788897752762, + "learning_rate": 2.454655925109141e-05, + "loss": 0.2245, + "step": 43922 + }, + { + "epoch": 3.5582469215813353, + "grad_norm": 0.07685130834579468, + "learning_rate": 2.4542058598496782e-05, + "loss": 0.2201, + "step": 43923 + }, + { + "epoch": 3.5583279325988335, + "grad_norm": 0.07150780409574509, + "learning_rate": 2.4537557945902156e-05, + "loss": 0.2516, + "step": 43924 + }, + { + "epoch": 3.5584089436163318, + "grad_norm": 0.08379261195659637, + "learning_rate": 2.4533057293307533e-05, + "loss": 0.2137, + "step": 43925 + }, + { + "epoch": 3.5584899546338304, + "grad_norm": 0.08677427470684052, + "learning_rate": 2.4528556640712903e-05, + "loss": 0.2589, + "step": 43926 + }, + { + "epoch": 3.5585709656513287, + "grad_norm": 0.08629120886325836, + "learning_rate": 2.4524055988118277e-05, + "loss": 0.2187, + "step": 43927 + }, + { + "epoch": 3.558651976668827, + "grad_norm": 0.06367433816194534, + "learning_rate": 2.4519555335523654e-05, + "loss": 0.2112, + "step": 43928 + }, + { + "epoch": 3.558732987686325, + "grad_norm": 0.08857046067714691, + "learning_rate": 2.4515054682929027e-05, + "loss": 0.2795, + "step": 43929 + }, + { + "epoch": 3.558813998703824, + "grad_norm": 0.0720190778374672, + "learning_rate": 2.4510554030334397e-05, + "loss": 0.2604, + "step": 43930 + }, + { + "epoch": 3.558895009721322, + "grad_norm": 0.07167576253414154, + "learning_rate": 2.4506053377739774e-05, + "loss": 0.189, + "step": 43931 + }, + { + "epoch": 3.5589760207388204, + "grad_norm": 0.059410810470581055, + "learning_rate": 2.4501552725145148e-05, + "loss": 0.1943, + "step": 43932 + }, + { + "epoch": 3.5590570317563186, + "grad_norm": 0.07639388740062714, + "learning_rate": 2.4497052072550522e-05, + "loss": 0.2547, + "step": 43933 + }, + { + "epoch": 3.5591380427738173, + "grad_norm": 0.07768415659666061, + "learning_rate": 2.4492551419955895e-05, + "loss": 0.2425, + "step": 43934 + }, + { + "epoch": 3.5592190537913155, + "grad_norm": 0.08310689777135849, + "learning_rate": 2.448805076736127e-05, + "loss": 0.2488, + "step": 43935 + }, + { + "epoch": 3.559300064808814, + "grad_norm": 0.07589665800333023, + "learning_rate": 2.4483550114766643e-05, + "loss": 0.2233, + "step": 43936 + }, + { + "epoch": 3.5593810758263125, + "grad_norm": 0.06544729322195053, + "learning_rate": 2.4479049462172016e-05, + "loss": 0.185, + "step": 43937 + }, + { + "epoch": 3.5594620868438107, + "grad_norm": 0.09256394952535629, + "learning_rate": 2.447454880957739e-05, + "loss": 0.2256, + "step": 43938 + }, + { + "epoch": 3.559543097861309, + "grad_norm": 0.07318930327892303, + "learning_rate": 2.4470048156982763e-05, + "loss": 0.2212, + "step": 43939 + }, + { + "epoch": 3.5596241088788076, + "grad_norm": 0.05668797716498375, + "learning_rate": 2.4465547504388137e-05, + "loss": 0.1985, + "step": 43940 + }, + { + "epoch": 3.559705119896306, + "grad_norm": 0.06777235120534897, + "learning_rate": 
2.446104685179351e-05, + "loss": 0.2098, + "step": 43941 + }, + { + "epoch": 3.559786130913804, + "grad_norm": 0.0696289986371994, + "learning_rate": 2.4456546199198884e-05, + "loss": 0.2132, + "step": 43942 + }, + { + "epoch": 3.559867141931303, + "grad_norm": 0.0714426040649414, + "learning_rate": 2.4452045546604258e-05, + "loss": 0.2044, + "step": 43943 + }, + { + "epoch": 3.559948152948801, + "grad_norm": 0.09928544610738754, + "learning_rate": 2.444754489400963e-05, + "loss": 0.2655, + "step": 43944 + }, + { + "epoch": 3.5600291639662993, + "grad_norm": 0.07626913487911224, + "learning_rate": 2.4443044241415005e-05, + "loss": 0.1946, + "step": 43945 + }, + { + "epoch": 3.560110174983798, + "grad_norm": 0.10116627812385559, + "learning_rate": 2.4438543588820382e-05, + "loss": 0.1954, + "step": 43946 + }, + { + "epoch": 3.5601911860012962, + "grad_norm": 0.06626923382282257, + "learning_rate": 2.4434042936225756e-05, + "loss": 0.2106, + "step": 43947 + }, + { + "epoch": 3.5602721970187945, + "grad_norm": 0.08282341063022614, + "learning_rate": 2.4429542283631126e-05, + "loss": 0.2175, + "step": 43948 + }, + { + "epoch": 3.560353208036293, + "grad_norm": 0.0847616046667099, + "learning_rate": 2.4425041631036503e-05, + "loss": 0.2036, + "step": 43949 + }, + { + "epoch": 3.5604342190537914, + "grad_norm": 0.07245887815952301, + "learning_rate": 2.4420540978441876e-05, + "loss": 0.2534, + "step": 43950 + }, + { + "epoch": 3.5605152300712897, + "grad_norm": 0.0700652077794075, + "learning_rate": 2.441604032584725e-05, + "loss": 0.2381, + "step": 43951 + }, + { + "epoch": 3.560596241088788, + "grad_norm": 0.06606948375701904, + "learning_rate": 2.4411539673252624e-05, + "loss": 0.2612, + "step": 43952 + }, + { + "epoch": 3.5606772521062866, + "grad_norm": 0.08605430275201797, + "learning_rate": 2.4407039020657997e-05, + "loss": 0.2221, + "step": 43953 + }, + { + "epoch": 3.560758263123785, + "grad_norm": 0.08067008852958679, + "learning_rate": 2.440253836806337e-05, + "loss": 0.2909, + "step": 43954 + }, + { + "epoch": 3.560839274141283, + "grad_norm": 0.08161859959363937, + "learning_rate": 2.4398037715468744e-05, + "loss": 0.213, + "step": 43955 + }, + { + "epoch": 3.5609202851587813, + "grad_norm": 0.07555124908685684, + "learning_rate": 2.4393537062874118e-05, + "loss": 0.1861, + "step": 43956 + }, + { + "epoch": 3.56100129617628, + "grad_norm": 0.08395944535732269, + "learning_rate": 2.438903641027949e-05, + "loss": 0.2521, + "step": 43957 + }, + { + "epoch": 3.5610823071937783, + "grad_norm": 0.08223508298397064, + "learning_rate": 2.4384535757684865e-05, + "loss": 0.2328, + "step": 43958 + }, + { + "epoch": 3.5611633182112765, + "grad_norm": 0.06962686032056808, + "learning_rate": 2.438003510509024e-05, + "loss": 0.2079, + "step": 43959 + }, + { + "epoch": 3.561244329228775, + "grad_norm": 0.07295332103967667, + "learning_rate": 2.4375534452495612e-05, + "loss": 0.2196, + "step": 43960 + }, + { + "epoch": 3.5613253402462735, + "grad_norm": 0.058311235159635544, + "learning_rate": 2.4371033799900986e-05, + "loss": 0.2089, + "step": 43961 + }, + { + "epoch": 3.5614063512637717, + "grad_norm": 0.06865894049406052, + "learning_rate": 2.436653314730636e-05, + "loss": 0.2078, + "step": 43962 + }, + { + "epoch": 3.5614873622812704, + "grad_norm": 0.06817828863859177, + "learning_rate": 2.4362032494711733e-05, + "loss": 0.2547, + "step": 43963 + }, + { + "epoch": 3.5615683732987686, + "grad_norm": 0.05765335261821747, + "learning_rate": 2.435753184211711e-05, + "loss": 0.1979, + "step": 43964 
+ }, + { + "epoch": 3.561649384316267, + "grad_norm": 0.08066023141145706, + "learning_rate": 2.4353031189522484e-05, + "loss": 0.2031, + "step": 43965 + }, + { + "epoch": 3.5617303953337656, + "grad_norm": 0.07500291615724564, + "learning_rate": 2.4348530536927854e-05, + "loss": 0.2186, + "step": 43966 + }, + { + "epoch": 3.561811406351264, + "grad_norm": 0.07075216621160507, + "learning_rate": 2.434402988433323e-05, + "loss": 0.2086, + "step": 43967 + }, + { + "epoch": 3.561892417368762, + "grad_norm": 0.06981159001588821, + "learning_rate": 2.4339529231738605e-05, + "loss": 0.231, + "step": 43968 + }, + { + "epoch": 3.5619734283862607, + "grad_norm": 0.07031860947608948, + "learning_rate": 2.4335028579143975e-05, + "loss": 0.2415, + "step": 43969 + }, + { + "epoch": 3.562054439403759, + "grad_norm": 0.06598281115293503, + "learning_rate": 2.4330527926549352e-05, + "loss": 0.2555, + "step": 43970 + }, + { + "epoch": 3.5621354504212572, + "grad_norm": 0.06520348787307739, + "learning_rate": 2.4326027273954725e-05, + "loss": 0.2192, + "step": 43971 + }, + { + "epoch": 3.562216461438756, + "grad_norm": 0.08133994787931442, + "learning_rate": 2.43215266213601e-05, + "loss": 0.2251, + "step": 43972 + }, + { + "epoch": 3.562297472456254, + "grad_norm": 0.09172660857439041, + "learning_rate": 2.4317025968765473e-05, + "loss": 0.2615, + "step": 43973 + }, + { + "epoch": 3.5623784834737524, + "grad_norm": 0.08263950794935226, + "learning_rate": 2.4312525316170846e-05, + "loss": 0.234, + "step": 43974 + }, + { + "epoch": 3.5624594944912507, + "grad_norm": 0.07384882867336273, + "learning_rate": 2.430802466357622e-05, + "loss": 0.2309, + "step": 43975 + }, + { + "epoch": 3.5625405055087493, + "grad_norm": 0.0683225616812706, + "learning_rate": 2.4303524010981593e-05, + "loss": 0.2085, + "step": 43976 + }, + { + "epoch": 3.5626215165262476, + "grad_norm": 0.06126277148723602, + "learning_rate": 2.4299023358386967e-05, + "loss": 0.1956, + "step": 43977 + }, + { + "epoch": 3.562702527543746, + "grad_norm": 0.07429927587509155, + "learning_rate": 2.429452270579234e-05, + "loss": 0.1984, + "step": 43978 + }, + { + "epoch": 3.562783538561244, + "grad_norm": 0.06797944009304047, + "learning_rate": 2.4290022053197714e-05, + "loss": 0.2082, + "step": 43979 + }, + { + "epoch": 3.5628645495787428, + "grad_norm": 0.07353696972131729, + "learning_rate": 2.428552140060309e-05, + "loss": 0.213, + "step": 43980 + }, + { + "epoch": 3.562945560596241, + "grad_norm": 0.06893346458673477, + "learning_rate": 2.428102074800846e-05, + "loss": 0.1905, + "step": 43981 + }, + { + "epoch": 3.5630265716137393, + "grad_norm": 0.0654299408197403, + "learning_rate": 2.4276520095413835e-05, + "loss": 0.2123, + "step": 43982 + }, + { + "epoch": 3.563107582631238, + "grad_norm": 0.06341482698917389, + "learning_rate": 2.4272019442819212e-05, + "loss": 0.2231, + "step": 43983 + }, + { + "epoch": 3.563188593648736, + "grad_norm": 0.06760953366756439, + "learning_rate": 2.4267518790224582e-05, + "loss": 0.2135, + "step": 43984 + }, + { + "epoch": 3.5632696046662344, + "grad_norm": 0.0738074854016304, + "learning_rate": 2.426301813762996e-05, + "loss": 0.2402, + "step": 43985 + }, + { + "epoch": 3.563350615683733, + "grad_norm": 0.08491325378417969, + "learning_rate": 2.4258517485035333e-05, + "loss": 0.2373, + "step": 43986 + }, + { + "epoch": 3.5634316267012314, + "grad_norm": 0.08582518249750137, + "learning_rate": 2.4254016832440703e-05, + "loss": 0.1853, + "step": 43987 + }, + { + "epoch": 3.5635126377187296, + "grad_norm": 
0.07919377833604813, + "learning_rate": 2.424951617984608e-05, + "loss": 0.2097, + "step": 43988 + }, + { + "epoch": 3.5635936487362283, + "grad_norm": 0.06492700427770615, + "learning_rate": 2.4245015527251454e-05, + "loss": 0.2123, + "step": 43989 + }, + { + "epoch": 3.5636746597537265, + "grad_norm": 0.07696332037448883, + "learning_rate": 2.4240514874656824e-05, + "loss": 0.2316, + "step": 43990 + }, + { + "epoch": 3.563755670771225, + "grad_norm": 0.06934797763824463, + "learning_rate": 2.42360142220622e-05, + "loss": 0.2295, + "step": 43991 + }, + { + "epoch": 3.5638366817887235, + "grad_norm": 0.06783980876207352, + "learning_rate": 2.4231513569467574e-05, + "loss": 0.2046, + "step": 43992 + }, + { + "epoch": 3.5639176928062217, + "grad_norm": 0.06434223800897598, + "learning_rate": 2.4227012916872948e-05, + "loss": 0.2079, + "step": 43993 + }, + { + "epoch": 3.56399870382372, + "grad_norm": 0.06556785106658936, + "learning_rate": 2.422251226427832e-05, + "loss": 0.2694, + "step": 43994 + }, + { + "epoch": 3.5640797148412187, + "grad_norm": 0.09260761737823486, + "learning_rate": 2.4218011611683695e-05, + "loss": 0.2398, + "step": 43995 + }, + { + "epoch": 3.564160725858717, + "grad_norm": 0.09623724967241287, + "learning_rate": 2.421351095908907e-05, + "loss": 0.2155, + "step": 43996 + }, + { + "epoch": 3.564241736876215, + "grad_norm": 0.06439271569252014, + "learning_rate": 2.4209010306494442e-05, + "loss": 0.2079, + "step": 43997 + }, + { + "epoch": 3.5643227478937134, + "grad_norm": 0.07777006179094315, + "learning_rate": 2.420450965389982e-05, + "loss": 0.2026, + "step": 43998 + }, + { + "epoch": 3.564403758911212, + "grad_norm": 0.06641001999378204, + "learning_rate": 2.420000900130519e-05, + "loss": 0.2298, + "step": 43999 + }, + { + "epoch": 3.5644847699287103, + "grad_norm": 0.07535912096500397, + "learning_rate": 2.4195508348710563e-05, + "loss": 0.2629, + "step": 44000 + }, + { + "epoch": 3.5645657809462086, + "grad_norm": 0.07521260529756546, + "learning_rate": 2.419100769611594e-05, + "loss": 0.1767, + "step": 44001 + }, + { + "epoch": 3.564646791963707, + "grad_norm": 0.0752941370010376, + "learning_rate": 2.418650704352131e-05, + "loss": 0.1997, + "step": 44002 + }, + { + "epoch": 3.5647278029812055, + "grad_norm": 0.0634898692369461, + "learning_rate": 2.4182006390926684e-05, + "loss": 0.2246, + "step": 44003 + }, + { + "epoch": 3.5648088139987038, + "grad_norm": 0.08196897059679031, + "learning_rate": 2.417750573833206e-05, + "loss": 0.2227, + "step": 44004 + }, + { + "epoch": 3.564889825016202, + "grad_norm": 0.06975170969963074, + "learning_rate": 2.417300508573743e-05, + "loss": 0.2097, + "step": 44005 + }, + { + "epoch": 3.5649708360337007, + "grad_norm": 0.061027590185403824, + "learning_rate": 2.4168504433142808e-05, + "loss": 0.2033, + "step": 44006 + }, + { + "epoch": 3.565051847051199, + "grad_norm": 0.06183544546365738, + "learning_rate": 2.4164003780548182e-05, + "loss": 0.2268, + "step": 44007 + }, + { + "epoch": 3.565132858068697, + "grad_norm": 0.06074502319097519, + "learning_rate": 2.4159503127953552e-05, + "loss": 0.1866, + "step": 44008 + }, + { + "epoch": 3.565213869086196, + "grad_norm": 0.06271813064813614, + "learning_rate": 2.415500247535893e-05, + "loss": 0.2076, + "step": 44009 + }, + { + "epoch": 3.565294880103694, + "grad_norm": 0.08766993880271912, + "learning_rate": 2.4150501822764303e-05, + "loss": 0.2547, + "step": 44010 + }, + { + "epoch": 3.5653758911211924, + "grad_norm": 0.06195381283760071, + "learning_rate": 
2.4146001170169676e-05, + "loss": 0.2211, + "step": 44011 + }, + { + "epoch": 3.565456902138691, + "grad_norm": 0.0778651088476181, + "learning_rate": 2.414150051757505e-05, + "loss": 0.2419, + "step": 44012 + }, + { + "epoch": 3.5655379131561893, + "grad_norm": 0.07984745502471924, + "learning_rate": 2.4136999864980424e-05, + "loss": 0.2673, + "step": 44013 + }, + { + "epoch": 3.5656189241736875, + "grad_norm": 0.07894036918878555, + "learning_rate": 2.4132499212385797e-05, + "loss": 0.2124, + "step": 44014 + }, + { + "epoch": 3.565699935191186, + "grad_norm": 0.06846199929714203, + "learning_rate": 2.412799855979117e-05, + "loss": 0.1949, + "step": 44015 + }, + { + "epoch": 3.5657809462086845, + "grad_norm": 0.07311907410621643, + "learning_rate": 2.4123497907196544e-05, + "loss": 0.2675, + "step": 44016 + }, + { + "epoch": 3.5658619572261827, + "grad_norm": 0.09844955801963806, + "learning_rate": 2.4118997254601918e-05, + "loss": 0.2622, + "step": 44017 + }, + { + "epoch": 3.5659429682436814, + "grad_norm": 0.07872521877288818, + "learning_rate": 2.411449660200729e-05, + "loss": 0.1942, + "step": 44018 + }, + { + "epoch": 3.5660239792611796, + "grad_norm": 0.07679609209299088, + "learning_rate": 2.410999594941267e-05, + "loss": 0.2533, + "step": 44019 + }, + { + "epoch": 3.566104990278678, + "grad_norm": 0.06353718787431717, + "learning_rate": 2.410549529681804e-05, + "loss": 0.2144, + "step": 44020 + }, + { + "epoch": 3.566186001296176, + "grad_norm": 0.06551864743232727, + "learning_rate": 2.4100994644223412e-05, + "loss": 0.231, + "step": 44021 + }, + { + "epoch": 3.566267012313675, + "grad_norm": 0.05590434372425079, + "learning_rate": 2.409649399162879e-05, + "loss": 0.2214, + "step": 44022 + }, + { + "epoch": 3.566348023331173, + "grad_norm": 0.06585326045751572, + "learning_rate": 2.409199333903416e-05, + "loss": 0.2054, + "step": 44023 + }, + { + "epoch": 3.5664290343486713, + "grad_norm": 0.06935842335224152, + "learning_rate": 2.4087492686439537e-05, + "loss": 0.1959, + "step": 44024 + }, + { + "epoch": 3.5665100453661696, + "grad_norm": 0.07737302035093307, + "learning_rate": 2.408299203384491e-05, + "loss": 0.2492, + "step": 44025 + }, + { + "epoch": 3.5665910563836682, + "grad_norm": 0.07547137141227722, + "learning_rate": 2.407849138125028e-05, + "loss": 0.2421, + "step": 44026 + }, + { + "epoch": 3.5666720674011665, + "grad_norm": 0.06433389335870743, + "learning_rate": 2.4073990728655657e-05, + "loss": 0.2068, + "step": 44027 + }, + { + "epoch": 3.5667530784186647, + "grad_norm": 0.08710397034883499, + "learning_rate": 2.406949007606103e-05, + "loss": 0.2787, + "step": 44028 + }, + { + "epoch": 3.5668340894361634, + "grad_norm": 0.09563450515270233, + "learning_rate": 2.40649894234664e-05, + "loss": 0.2418, + "step": 44029 + }, + { + "epoch": 3.5669151004536617, + "grad_norm": 0.06833392381668091, + "learning_rate": 2.4060488770871778e-05, + "loss": 0.2281, + "step": 44030 + }, + { + "epoch": 3.56699611147116, + "grad_norm": 0.05998595058917999, + "learning_rate": 2.4055988118277152e-05, + "loss": 0.2187, + "step": 44031 + }, + { + "epoch": 3.5670771224886586, + "grad_norm": 0.06386803090572357, + "learning_rate": 2.4051487465682525e-05, + "loss": 0.2263, + "step": 44032 + }, + { + "epoch": 3.567158133506157, + "grad_norm": 0.07443685829639435, + "learning_rate": 2.40469868130879e-05, + "loss": 0.2222, + "step": 44033 + }, + { + "epoch": 3.567239144523655, + "grad_norm": 0.08675825595855713, + "learning_rate": 2.4042486160493273e-05, + "loss": 0.2485, + "step": 44034 + 
}, + { + "epoch": 3.567320155541154, + "grad_norm": 0.06265902519226074, + "learning_rate": 2.4037985507898646e-05, + "loss": 0.1981, + "step": 44035 + }, + { + "epoch": 3.567401166558652, + "grad_norm": 0.08795081079006195, + "learning_rate": 2.403348485530402e-05, + "loss": 0.2301, + "step": 44036 + }, + { + "epoch": 3.5674821775761503, + "grad_norm": 0.07681228965520859, + "learning_rate": 2.4028984202709397e-05, + "loss": 0.2507, + "step": 44037 + }, + { + "epoch": 3.567563188593649, + "grad_norm": 0.04987798258662224, + "learning_rate": 2.4024483550114767e-05, + "loss": 0.1904, + "step": 44038 + }, + { + "epoch": 3.567644199611147, + "grad_norm": 0.07152686268091202, + "learning_rate": 2.401998289752014e-05, + "loss": 0.2213, + "step": 44039 + }, + { + "epoch": 3.5677252106286454, + "grad_norm": 0.09124844521284103, + "learning_rate": 2.4015482244925518e-05, + "loss": 0.27, + "step": 44040 + }, + { + "epoch": 3.567806221646144, + "grad_norm": 0.07459650933742523, + "learning_rate": 2.4010981592330888e-05, + "loss": 0.2502, + "step": 44041 + }, + { + "epoch": 3.5678872326636424, + "grad_norm": 0.06212278828024864, + "learning_rate": 2.400648093973626e-05, + "loss": 0.193, + "step": 44042 + }, + { + "epoch": 3.5679682436811406, + "grad_norm": 0.061736591160297394, + "learning_rate": 2.400198028714164e-05, + "loss": 0.2338, + "step": 44043 + }, + { + "epoch": 3.568049254698639, + "grad_norm": 0.10944293439388275, + "learning_rate": 2.399747963454701e-05, + "loss": 0.2443, + "step": 44044 + }, + { + "epoch": 3.568130265716137, + "grad_norm": 0.07717695832252502, + "learning_rate": 2.3992978981952386e-05, + "loss": 0.212, + "step": 44045 + }, + { + "epoch": 3.568211276733636, + "grad_norm": 0.07111458480358124, + "learning_rate": 2.398847832935776e-05, + "loss": 0.187, + "step": 44046 + }, + { + "epoch": 3.568292287751134, + "grad_norm": 0.07641852647066116, + "learning_rate": 2.398397767676313e-05, + "loss": 0.2275, + "step": 44047 + }, + { + "epoch": 3.5683732987686323, + "grad_norm": 0.07449782639741898, + "learning_rate": 2.3979477024168506e-05, + "loss": 0.2465, + "step": 44048 + }, + { + "epoch": 3.568454309786131, + "grad_norm": 0.06614088267087936, + "learning_rate": 2.397497637157388e-05, + "loss": 0.1915, + "step": 44049 + }, + { + "epoch": 3.5685353208036292, + "grad_norm": 0.0939965546131134, + "learning_rate": 2.3970475718979254e-05, + "loss": 0.2293, + "step": 44050 + }, + { + "epoch": 3.5686163318211275, + "grad_norm": 0.08085455745458603, + "learning_rate": 2.3965975066384627e-05, + "loss": 0.23, + "step": 44051 + }, + { + "epoch": 3.568697342838626, + "grad_norm": 0.07873645424842834, + "learning_rate": 2.396147441379e-05, + "loss": 0.2147, + "step": 44052 + }, + { + "epoch": 3.5687783538561244, + "grad_norm": 0.0769469141960144, + "learning_rate": 2.3956973761195374e-05, + "loss": 0.2378, + "step": 44053 + }, + { + "epoch": 3.5688593648736227, + "grad_norm": 0.0792095884680748, + "learning_rate": 2.3952473108600748e-05, + "loss": 0.2376, + "step": 44054 + }, + { + "epoch": 3.5689403758911213, + "grad_norm": 0.06182113662362099, + "learning_rate": 2.394797245600612e-05, + "loss": 0.2373, + "step": 44055 + }, + { + "epoch": 3.5690213869086196, + "grad_norm": 0.06288308650255203, + "learning_rate": 2.3943471803411495e-05, + "loss": 0.2071, + "step": 44056 + }, + { + "epoch": 3.569102397926118, + "grad_norm": 0.06363116204738617, + "learning_rate": 2.393897115081687e-05, + "loss": 0.2536, + "step": 44057 + }, + { + "epoch": 3.5691834089436165, + "grad_norm": 
0.07311567664146423, + "learning_rate": 2.3934470498222246e-05, + "loss": 0.229, + "step": 44058 + }, + { + "epoch": 3.5692644199611148, + "grad_norm": 0.0979669988155365, + "learning_rate": 2.3929969845627616e-05, + "loss": 0.2419, + "step": 44059 + }, + { + "epoch": 3.569345430978613, + "grad_norm": 0.0624614879488945, + "learning_rate": 2.392546919303299e-05, + "loss": 0.227, + "step": 44060 + }, + { + "epoch": 3.5694264419961117, + "grad_norm": 0.06651511788368225, + "learning_rate": 2.3920968540438367e-05, + "loss": 0.2525, + "step": 44061 + }, + { + "epoch": 3.56950745301361, + "grad_norm": 0.06575529277324677, + "learning_rate": 2.3916467887843737e-05, + "loss": 0.2151, + "step": 44062 + }, + { + "epoch": 3.569588464031108, + "grad_norm": 0.062389180064201355, + "learning_rate": 2.3911967235249114e-05, + "loss": 0.1888, + "step": 44063 + }, + { + "epoch": 3.569669475048607, + "grad_norm": 0.06860721856355667, + "learning_rate": 2.3907466582654487e-05, + "loss": 0.2299, + "step": 44064 + }, + { + "epoch": 3.569750486066105, + "grad_norm": 0.07936166226863861, + "learning_rate": 2.3902965930059858e-05, + "loss": 0.2001, + "step": 44065 + }, + { + "epoch": 3.5698314970836034, + "grad_norm": 0.06096033751964569, + "learning_rate": 2.3898465277465235e-05, + "loss": 0.1796, + "step": 44066 + }, + { + "epoch": 3.5699125081011016, + "grad_norm": 0.07629892230033875, + "learning_rate": 2.3893964624870608e-05, + "loss": 0.2292, + "step": 44067 + }, + { + "epoch": 3.5699935191186, + "grad_norm": 0.08087896555662155, + "learning_rate": 2.388946397227598e-05, + "loss": 0.2092, + "step": 44068 + }, + { + "epoch": 3.5700745301360985, + "grad_norm": 0.08344534784555435, + "learning_rate": 2.3884963319681355e-05, + "loss": 0.2359, + "step": 44069 + }, + { + "epoch": 3.570155541153597, + "grad_norm": 0.07347863167524338, + "learning_rate": 2.388046266708673e-05, + "loss": 0.2435, + "step": 44070 + }, + { + "epoch": 3.570236552171095, + "grad_norm": 0.08139392733573914, + "learning_rate": 2.3875962014492103e-05, + "loss": 0.2332, + "step": 44071 + }, + { + "epoch": 3.5703175631885937, + "grad_norm": 0.06455414742231369, + "learning_rate": 2.3871461361897476e-05, + "loss": 0.2216, + "step": 44072 + }, + { + "epoch": 3.570398574206092, + "grad_norm": 0.07281234860420227, + "learning_rate": 2.386696070930285e-05, + "loss": 0.2162, + "step": 44073 + }, + { + "epoch": 3.57047958522359, + "grad_norm": 0.06973182410001755, + "learning_rate": 2.3862460056708223e-05, + "loss": 0.2096, + "step": 44074 + }, + { + "epoch": 3.570560596241089, + "grad_norm": 0.06923564523458481, + "learning_rate": 2.3857959404113597e-05, + "loss": 0.1975, + "step": 44075 + }, + { + "epoch": 3.570641607258587, + "grad_norm": 0.0641985535621643, + "learning_rate": 2.385345875151897e-05, + "loss": 0.1835, + "step": 44076 + }, + { + "epoch": 3.5707226182760854, + "grad_norm": 0.06189914047718048, + "learning_rate": 2.3848958098924344e-05, + "loss": 0.2339, + "step": 44077 + }, + { + "epoch": 3.570803629293584, + "grad_norm": 0.06003765016794205, + "learning_rate": 2.3844457446329718e-05, + "loss": 0.2617, + "step": 44078 + }, + { + "epoch": 3.5708846403110823, + "grad_norm": 0.07658020406961441, + "learning_rate": 2.3839956793735095e-05, + "loss": 0.2086, + "step": 44079 + }, + { + "epoch": 3.5709656513285806, + "grad_norm": 0.07172147929668427, + "learning_rate": 2.3835456141140465e-05, + "loss": 0.2026, + "step": 44080 + }, + { + "epoch": 3.5710466623460793, + "grad_norm": 0.06534678488969803, + "learning_rate": 
2.383095548854584e-05, + "loss": 0.206, + "step": 44081 + }, + { + "epoch": 3.5711276733635775, + "grad_norm": 0.0649232566356659, + "learning_rate": 2.3826454835951216e-05, + "loss": 0.2246, + "step": 44082 + }, + { + "epoch": 3.5712086843810757, + "grad_norm": 0.07970141619443893, + "learning_rate": 2.3821954183356586e-05, + "loss": 0.216, + "step": 44083 + }, + { + "epoch": 3.5712896953985744, + "grad_norm": 0.06200230494141579, + "learning_rate": 2.3817453530761963e-05, + "loss": 0.199, + "step": 44084 + }, + { + "epoch": 3.5713707064160727, + "grad_norm": 0.0675581619143486, + "learning_rate": 2.3812952878167337e-05, + "loss": 0.2419, + "step": 44085 + }, + { + "epoch": 3.571451717433571, + "grad_norm": 0.09428895264863968, + "learning_rate": 2.3808452225572707e-05, + "loss": 0.2572, + "step": 44086 + }, + { + "epoch": 3.5715327284510696, + "grad_norm": 0.060645028948783875, + "learning_rate": 2.3803951572978084e-05, + "loss": 0.2153, + "step": 44087 + }, + { + "epoch": 3.571613739468568, + "grad_norm": 0.07589592039585114, + "learning_rate": 2.3799450920383457e-05, + "loss": 0.2024, + "step": 44088 + }, + { + "epoch": 3.571694750486066, + "grad_norm": 0.07095162570476532, + "learning_rate": 2.3794950267788828e-05, + "loss": 0.2182, + "step": 44089 + }, + { + "epoch": 3.5717757615035644, + "grad_norm": 0.07006850838661194, + "learning_rate": 2.3790449615194205e-05, + "loss": 0.2062, + "step": 44090 + }, + { + "epoch": 3.5718567725210626, + "grad_norm": 0.0720515325665474, + "learning_rate": 2.3785948962599578e-05, + "loss": 0.2267, + "step": 44091 + }, + { + "epoch": 3.5719377835385613, + "grad_norm": 0.06832864880561829, + "learning_rate": 2.3781448310004952e-05, + "loss": 0.2431, + "step": 44092 + }, + { + "epoch": 3.5720187945560595, + "grad_norm": 0.0733826756477356, + "learning_rate": 2.3776947657410325e-05, + "loss": 0.2541, + "step": 44093 + }, + { + "epoch": 3.5720998055735578, + "grad_norm": 0.0713753029704094, + "learning_rate": 2.37724470048157e-05, + "loss": 0.23, + "step": 44094 + }, + { + "epoch": 3.5721808165910565, + "grad_norm": 0.06897611171007156, + "learning_rate": 2.3767946352221073e-05, + "loss": 0.2151, + "step": 44095 + }, + { + "epoch": 3.5722618276085547, + "grad_norm": 0.06955330073833466, + "learning_rate": 2.3763445699626446e-05, + "loss": 0.2112, + "step": 44096 + }, + { + "epoch": 3.572342838626053, + "grad_norm": 0.07778653502464294, + "learning_rate": 2.3758945047031823e-05, + "loss": 0.2102, + "step": 44097 + }, + { + "epoch": 3.5724238496435516, + "grad_norm": 0.07717503607273102, + "learning_rate": 2.3754444394437193e-05, + "loss": 0.2321, + "step": 44098 + }, + { + "epoch": 3.57250486066105, + "grad_norm": 0.06059728562831879, + "learning_rate": 2.3749943741842567e-05, + "loss": 0.2139, + "step": 44099 + }, + { + "epoch": 3.572585871678548, + "grad_norm": 0.0687459334731102, + "learning_rate": 2.3745443089247944e-05, + "loss": 0.22, + "step": 44100 + }, + { + "epoch": 3.572666882696047, + "grad_norm": 0.07666652649641037, + "learning_rate": 2.3740942436653314e-05, + "loss": 0.2173, + "step": 44101 + }, + { + "epoch": 3.572747893713545, + "grad_norm": 0.07414747774600983, + "learning_rate": 2.3736441784058688e-05, + "loss": 0.2318, + "step": 44102 + }, + { + "epoch": 3.5728289047310433, + "grad_norm": 0.0756225511431694, + "learning_rate": 2.3731941131464065e-05, + "loss": 0.2279, + "step": 44103 + }, + { + "epoch": 3.572909915748542, + "grad_norm": 0.07715298235416412, + "learning_rate": 2.3727440478869435e-05, + "loss": 0.2352, + "step": 44104 + 
}, + { + "epoch": 3.5729909267660402, + "grad_norm": 0.0866784080862999, + "learning_rate": 2.3722939826274812e-05, + "loss": 0.1829, + "step": 44105 + }, + { + "epoch": 3.5730719377835385, + "grad_norm": 0.0714133158326149, + "learning_rate": 2.3718439173680186e-05, + "loss": 0.2257, + "step": 44106 + }, + { + "epoch": 3.573152948801037, + "grad_norm": 0.07470285892486572, + "learning_rate": 2.3713938521085556e-05, + "loss": 0.1881, + "step": 44107 + }, + { + "epoch": 3.5732339598185354, + "grad_norm": 0.07752804458141327, + "learning_rate": 2.3709437868490933e-05, + "loss": 0.2229, + "step": 44108 + }, + { + "epoch": 3.5733149708360337, + "grad_norm": 0.06709716469049454, + "learning_rate": 2.3704937215896306e-05, + "loss": 0.2067, + "step": 44109 + }, + { + "epoch": 3.5733959818535324, + "grad_norm": 0.079058438539505, + "learning_rate": 2.370043656330168e-05, + "loss": 0.2218, + "step": 44110 + }, + { + "epoch": 3.5734769928710306, + "grad_norm": 0.07981273531913757, + "learning_rate": 2.3695935910707054e-05, + "loss": 0.2185, + "step": 44111 + }, + { + "epoch": 3.573558003888529, + "grad_norm": 0.06988833099603653, + "learning_rate": 2.3691435258112427e-05, + "loss": 0.2125, + "step": 44112 + }, + { + "epoch": 3.573639014906027, + "grad_norm": 0.07461047917604446, + "learning_rate": 2.36869346055178e-05, + "loss": 0.2626, + "step": 44113 + }, + { + "epoch": 3.5737200259235253, + "grad_norm": 0.07636402547359467, + "learning_rate": 2.3682433952923174e-05, + "loss": 0.2106, + "step": 44114 + }, + { + "epoch": 3.573801036941024, + "grad_norm": 0.06755087524652481, + "learning_rate": 2.3677933300328548e-05, + "loss": 0.2217, + "step": 44115 + }, + { + "epoch": 3.5738820479585223, + "grad_norm": 0.07610853016376495, + "learning_rate": 2.367343264773392e-05, + "loss": 0.2001, + "step": 44116 + }, + { + "epoch": 3.5739630589760205, + "grad_norm": 0.07375774532556534, + "learning_rate": 2.3668931995139295e-05, + "loss": 0.2158, + "step": 44117 + }, + { + "epoch": 3.574044069993519, + "grad_norm": 0.07939080893993378, + "learning_rate": 2.3664431342544672e-05, + "loss": 0.235, + "step": 44118 + }, + { + "epoch": 3.5741250810110174, + "grad_norm": 0.07518904656171799, + "learning_rate": 2.3659930689950042e-05, + "loss": 0.2475, + "step": 44119 + }, + { + "epoch": 3.5742060920285157, + "grad_norm": 0.06927220523357391, + "learning_rate": 2.3655430037355416e-05, + "loss": 0.2243, + "step": 44120 + }, + { + "epoch": 3.5742871030460144, + "grad_norm": 0.0808010846376419, + "learning_rate": 2.3650929384760793e-05, + "loss": 0.2474, + "step": 44121 + }, + { + "epoch": 3.5743681140635126, + "grad_norm": 0.06906265020370483, + "learning_rate": 2.3646428732166163e-05, + "loss": 0.2232, + "step": 44122 + }, + { + "epoch": 3.574449125081011, + "grad_norm": 0.06592061370611191, + "learning_rate": 2.364192807957154e-05, + "loss": 0.2287, + "step": 44123 + }, + { + "epoch": 3.5745301360985096, + "grad_norm": 0.07803937792778015, + "learning_rate": 2.3637427426976914e-05, + "loss": 0.209, + "step": 44124 + }, + { + "epoch": 3.574611147116008, + "grad_norm": 0.09058477729558945, + "learning_rate": 2.3632926774382287e-05, + "loss": 0.2473, + "step": 44125 + }, + { + "epoch": 3.574692158133506, + "grad_norm": 0.06751037389039993, + "learning_rate": 2.362842612178766e-05, + "loss": 0.2327, + "step": 44126 + }, + { + "epoch": 3.5747731691510047, + "grad_norm": 0.07758664339780807, + "learning_rate": 2.3623925469193035e-05, + "loss": 0.2296, + "step": 44127 + }, + { + "epoch": 3.574854180168503, + "grad_norm": 
0.057286325842142105, + "learning_rate": 2.3619424816598408e-05, + "loss": 0.2316, + "step": 44128 + }, + { + "epoch": 3.5749351911860012, + "grad_norm": 0.08673517405986786, + "learning_rate": 2.3614924164003782e-05, + "loss": 0.2512, + "step": 44129 + }, + { + "epoch": 3.5750162022035, + "grad_norm": 0.07720606029033661, + "learning_rate": 2.3610423511409155e-05, + "loss": 0.2192, + "step": 44130 + }, + { + "epoch": 3.575097213220998, + "grad_norm": 0.07449165731668472, + "learning_rate": 2.360592285881453e-05, + "loss": 0.2266, + "step": 44131 + }, + { + "epoch": 3.5751782242384964, + "grad_norm": 0.06918004900217056, + "learning_rate": 2.3601422206219903e-05, + "loss": 0.2318, + "step": 44132 + }, + { + "epoch": 3.5752592352559946, + "grad_norm": 0.10162968188524246, + "learning_rate": 2.3596921553625276e-05, + "loss": 0.2452, + "step": 44133 + }, + { + "epoch": 3.5753402462734933, + "grad_norm": 0.05789915472269058, + "learning_rate": 2.359242090103065e-05, + "loss": 0.2582, + "step": 44134 + }, + { + "epoch": 3.5754212572909916, + "grad_norm": 0.08747924119234085, + "learning_rate": 2.3587920248436023e-05, + "loss": 0.221, + "step": 44135 + }, + { + "epoch": 3.57550226830849, + "grad_norm": 0.06346782296895981, + "learning_rate": 2.35834195958414e-05, + "loss": 0.1718, + "step": 44136 + }, + { + "epoch": 3.575583279325988, + "grad_norm": 0.06199439615011215, + "learning_rate": 2.357891894324677e-05, + "loss": 0.2305, + "step": 44137 + }, + { + "epoch": 3.5756642903434868, + "grad_norm": 0.07988022267818451, + "learning_rate": 2.3574418290652144e-05, + "loss": 0.2175, + "step": 44138 + }, + { + "epoch": 3.575745301360985, + "grad_norm": 0.06792055815458298, + "learning_rate": 2.356991763805752e-05, + "loss": 0.2659, + "step": 44139 + }, + { + "epoch": 3.5758263123784833, + "grad_norm": 0.06658469885587692, + "learning_rate": 2.356541698546289e-05, + "loss": 0.2286, + "step": 44140 + }, + { + "epoch": 3.575907323395982, + "grad_norm": 0.08243236690759659, + "learning_rate": 2.3560916332868265e-05, + "loss": 0.2586, + "step": 44141 + }, + { + "epoch": 3.57598833441348, + "grad_norm": 0.07459936290979385, + "learning_rate": 2.3556415680273642e-05, + "loss": 0.2582, + "step": 44142 + }, + { + "epoch": 3.5760693454309784, + "grad_norm": 0.07289506494998932, + "learning_rate": 2.3551915027679016e-05, + "loss": 0.2031, + "step": 44143 + }, + { + "epoch": 3.576150356448477, + "grad_norm": 0.07211292535066605, + "learning_rate": 2.354741437508439e-05, + "loss": 0.2232, + "step": 44144 + }, + { + "epoch": 3.5762313674659754, + "grad_norm": 0.06191423907876015, + "learning_rate": 2.3542913722489763e-05, + "loss": 0.2123, + "step": 44145 + }, + { + "epoch": 3.5763123784834736, + "grad_norm": 0.07505661994218826, + "learning_rate": 2.3538413069895136e-05, + "loss": 0.211, + "step": 44146 + }, + { + "epoch": 3.5763933895009723, + "grad_norm": 0.07626402378082275, + "learning_rate": 2.353391241730051e-05, + "loss": 0.2534, + "step": 44147 + }, + { + "epoch": 3.5764744005184705, + "grad_norm": 0.07256495207548141, + "learning_rate": 2.3529411764705884e-05, + "loss": 0.2379, + "step": 44148 + }, + { + "epoch": 3.576555411535969, + "grad_norm": 0.07416626811027527, + "learning_rate": 2.3524911112111257e-05, + "loss": 0.2224, + "step": 44149 + }, + { + "epoch": 3.5766364225534675, + "grad_norm": 0.07687810063362122, + "learning_rate": 2.352041045951663e-05, + "loss": 0.2388, + "step": 44150 + }, + { + "epoch": 3.5767174335709657, + "grad_norm": 0.06605061888694763, + "learning_rate": 
2.3515909806922004e-05, + "loss": 0.225, + "step": 44151 + }, + { + "epoch": 3.576798444588464, + "grad_norm": 0.07485217601060867, + "learning_rate": 2.3511409154327378e-05, + "loss": 0.2517, + "step": 44152 + }, + { + "epoch": 3.5768794556059627, + "grad_norm": 0.05736643448472023, + "learning_rate": 2.3506908501732752e-05, + "loss": 0.2449, + "step": 44153 + }, + { + "epoch": 3.576960466623461, + "grad_norm": 0.06579489260911942, + "learning_rate": 2.3502407849138125e-05, + "loss": 0.2649, + "step": 44154 + }, + { + "epoch": 3.577041477640959, + "grad_norm": 0.06708400696516037, + "learning_rate": 2.34979071965435e-05, + "loss": 0.249, + "step": 44155 + }, + { + "epoch": 3.5771224886584574, + "grad_norm": 0.05705248564481735, + "learning_rate": 2.3493406543948873e-05, + "loss": 0.2476, + "step": 44156 + }, + { + "epoch": 3.577203499675956, + "grad_norm": 0.05666022002696991, + "learning_rate": 2.348890589135425e-05, + "loss": 0.195, + "step": 44157 + }, + { + "epoch": 3.5772845106934543, + "grad_norm": 0.06512381881475449, + "learning_rate": 2.348440523875962e-05, + "loss": 0.235, + "step": 44158 + }, + { + "epoch": 3.5773655217109526, + "grad_norm": 0.07135576754808426, + "learning_rate": 2.3479904586164993e-05, + "loss": 0.1818, + "step": 44159 + }, + { + "epoch": 3.577446532728451, + "grad_norm": 0.05715243145823479, + "learning_rate": 2.347540393357037e-05, + "loss": 0.2448, + "step": 44160 + }, + { + "epoch": 3.5775275437459495, + "grad_norm": 0.07935239374637604, + "learning_rate": 2.3470903280975744e-05, + "loss": 0.2379, + "step": 44161 + }, + { + "epoch": 3.5776085547634477, + "grad_norm": 0.05796222388744354, + "learning_rate": 2.3466402628381114e-05, + "loss": 0.199, + "step": 44162 + }, + { + "epoch": 3.577689565780946, + "grad_norm": 0.09846629947423935, + "learning_rate": 2.346190197578649e-05, + "loss": 0.2093, + "step": 44163 + }, + { + "epoch": 3.5777705767984447, + "grad_norm": 0.07785390317440033, + "learning_rate": 2.3457401323191865e-05, + "loss": 0.2151, + "step": 44164 + }, + { + "epoch": 3.577851587815943, + "grad_norm": 0.06434359401464462, + "learning_rate": 2.345290067059724e-05, + "loss": 0.2295, + "step": 44165 + }, + { + "epoch": 3.577932598833441, + "grad_norm": 0.06758967787027359, + "learning_rate": 2.3448400018002612e-05, + "loss": 0.2413, + "step": 44166 + }, + { + "epoch": 3.57801360985094, + "grad_norm": 0.08527732640504837, + "learning_rate": 2.3443899365407986e-05, + "loss": 0.2458, + "step": 44167 + }, + { + "epoch": 3.578094620868438, + "grad_norm": 0.07337276637554169, + "learning_rate": 2.343939871281336e-05, + "loss": 0.2312, + "step": 44168 + }, + { + "epoch": 3.5781756318859363, + "grad_norm": 0.07388912886381149, + "learning_rate": 2.3434898060218733e-05, + "loss": 0.259, + "step": 44169 + }, + { + "epoch": 3.578256642903435, + "grad_norm": 0.06941523402929306, + "learning_rate": 2.3430397407624106e-05, + "loss": 0.2328, + "step": 44170 + }, + { + "epoch": 3.5783376539209333, + "grad_norm": 0.07117350399494171, + "learning_rate": 2.342589675502948e-05, + "loss": 0.218, + "step": 44171 + }, + { + "epoch": 3.5784186649384315, + "grad_norm": 0.0704994648694992, + "learning_rate": 2.3421396102434854e-05, + "loss": 0.2159, + "step": 44172 + }, + { + "epoch": 3.57849967595593, + "grad_norm": 0.0686672106385231, + "learning_rate": 2.3416895449840227e-05, + "loss": 0.2176, + "step": 44173 + }, + { + "epoch": 3.5785806869734285, + "grad_norm": 0.07033158093690872, + "learning_rate": 2.34123947972456e-05, + "loss": 0.2051, + "step": 44174 + }, + { + 
"epoch": 3.5786616979909267, + "grad_norm": 0.05757991597056389, + "learning_rate": 2.3407894144650974e-05, + "loss": 0.1934, + "step": 44175 + }, + { + "epoch": 3.5787427090084254, + "grad_norm": 0.07229195535182953, + "learning_rate": 2.340339349205635e-05, + "loss": 0.2095, + "step": 44176 + }, + { + "epoch": 3.5788237200259236, + "grad_norm": 0.07896924018859863, + "learning_rate": 2.339889283946172e-05, + "loss": 0.1986, + "step": 44177 + }, + { + "epoch": 3.578904731043422, + "grad_norm": 0.08921925723552704, + "learning_rate": 2.33943921868671e-05, + "loss": 0.2219, + "step": 44178 + }, + { + "epoch": 3.57898574206092, + "grad_norm": 0.06069227308034897, + "learning_rate": 2.3389891534272472e-05, + "loss": 0.1983, + "step": 44179 + }, + { + "epoch": 3.579066753078419, + "grad_norm": 0.06449750065803528, + "learning_rate": 2.3385390881677842e-05, + "loss": 0.1994, + "step": 44180 + }, + { + "epoch": 3.579147764095917, + "grad_norm": 0.06284713745117188, + "learning_rate": 2.338089022908322e-05, + "loss": 0.2365, + "step": 44181 + }, + { + "epoch": 3.5792287751134153, + "grad_norm": 0.08436598628759384, + "learning_rate": 2.3376389576488593e-05, + "loss": 0.2234, + "step": 44182 + }, + { + "epoch": 3.5793097861309136, + "grad_norm": 0.0867995098233223, + "learning_rate": 2.3371888923893967e-05, + "loss": 0.2313, + "step": 44183 + }, + { + "epoch": 3.5793907971484122, + "grad_norm": 0.07555483281612396, + "learning_rate": 2.336738827129934e-05, + "loss": 0.2104, + "step": 44184 + }, + { + "epoch": 3.5794718081659105, + "grad_norm": 0.0693432092666626, + "learning_rate": 2.3362887618704714e-05, + "loss": 0.172, + "step": 44185 + }, + { + "epoch": 3.5795528191834087, + "grad_norm": 0.07421540468931198, + "learning_rate": 2.3358386966110087e-05, + "loss": 0.2119, + "step": 44186 + }, + { + "epoch": 3.5796338302009074, + "grad_norm": 0.07303255051374435, + "learning_rate": 2.335388631351546e-05, + "loss": 0.234, + "step": 44187 + }, + { + "epoch": 3.5797148412184057, + "grad_norm": 0.06863290816545486, + "learning_rate": 2.3349385660920835e-05, + "loss": 0.2207, + "step": 44188 + }, + { + "epoch": 3.579795852235904, + "grad_norm": 0.06763945519924164, + "learning_rate": 2.3344885008326208e-05, + "loss": 0.2082, + "step": 44189 + }, + { + "epoch": 3.5798768632534026, + "grad_norm": 0.0775524452328682, + "learning_rate": 2.3340384355731582e-05, + "loss": 0.2449, + "step": 44190 + }, + { + "epoch": 3.579957874270901, + "grad_norm": 0.08569817990064621, + "learning_rate": 2.3335883703136955e-05, + "loss": 0.2156, + "step": 44191 + }, + { + "epoch": 3.580038885288399, + "grad_norm": 0.09265130013227463, + "learning_rate": 2.333138305054233e-05, + "loss": 0.2372, + "step": 44192 + }, + { + "epoch": 3.5801198963058978, + "grad_norm": 0.06884811818599701, + "learning_rate": 2.3326882397947703e-05, + "loss": 0.257, + "step": 44193 + }, + { + "epoch": 3.580200907323396, + "grad_norm": 0.08053863048553467, + "learning_rate": 2.332238174535308e-05, + "loss": 0.2232, + "step": 44194 + }, + { + "epoch": 3.5802819183408943, + "grad_norm": 0.08468924462795258, + "learning_rate": 2.331788109275845e-05, + "loss": 0.2075, + "step": 44195 + }, + { + "epoch": 3.580362929358393, + "grad_norm": 0.06398703157901764, + "learning_rate": 2.3313380440163827e-05, + "loss": 0.1933, + "step": 44196 + }, + { + "epoch": 3.580443940375891, + "grad_norm": 0.06383037567138672, + "learning_rate": 2.33088797875692e-05, + "loss": 0.2233, + "step": 44197 + }, + { + "epoch": 3.5805249513933894, + "grad_norm": 0.06736244261264801, 
+ "learning_rate": 2.330437913497457e-05, + "loss": 0.2269, + "step": 44198 + }, + { + "epoch": 3.580605962410888, + "grad_norm": 0.09929540008306503, + "learning_rate": 2.3299878482379948e-05, + "loss": 0.2354, + "step": 44199 + }, + { + "epoch": 3.5806869734283864, + "grad_norm": 0.08501783013343811, + "learning_rate": 2.329537782978532e-05, + "loss": 0.2693, + "step": 44200 + }, + { + "epoch": 3.5807679844458846, + "grad_norm": 0.0647391527891159, + "learning_rate": 2.329087717719069e-05, + "loss": 0.2211, + "step": 44201 + }, + { + "epoch": 3.580848995463383, + "grad_norm": 0.07122602313756943, + "learning_rate": 2.328637652459607e-05, + "loss": 0.2434, + "step": 44202 + }, + { + "epoch": 3.5809300064808816, + "grad_norm": 0.07307540625333786, + "learning_rate": 2.3281875872001442e-05, + "loss": 0.2233, + "step": 44203 + }, + { + "epoch": 3.58101101749838, + "grad_norm": 0.06686873733997345, + "learning_rate": 2.3277375219406816e-05, + "loss": 0.1938, + "step": 44204 + }, + { + "epoch": 3.581092028515878, + "grad_norm": 0.08013508468866348, + "learning_rate": 2.327287456681219e-05, + "loss": 0.2141, + "step": 44205 + }, + { + "epoch": 3.5811730395333763, + "grad_norm": 0.07765527069568634, + "learning_rate": 2.3268373914217563e-05, + "loss": 0.2094, + "step": 44206 + }, + { + "epoch": 3.581254050550875, + "grad_norm": 0.06879838556051254, + "learning_rate": 2.3263873261622936e-05, + "loss": 0.2456, + "step": 44207 + }, + { + "epoch": 3.5813350615683732, + "grad_norm": 0.07718323916196823, + "learning_rate": 2.325937260902831e-05, + "loss": 0.2246, + "step": 44208 + }, + { + "epoch": 3.5814160725858715, + "grad_norm": 0.07552476227283478, + "learning_rate": 2.3254871956433684e-05, + "loss": 0.2333, + "step": 44209 + }, + { + "epoch": 3.58149708360337, + "grad_norm": 0.06853868067264557, + "learning_rate": 2.3250371303839057e-05, + "loss": 0.2295, + "step": 44210 + }, + { + "epoch": 3.5815780946208684, + "grad_norm": 0.05678297579288483, + "learning_rate": 2.324587065124443e-05, + "loss": 0.2204, + "step": 44211 + }, + { + "epoch": 3.5816591056383666, + "grad_norm": 0.0712512880563736, + "learning_rate": 2.3241369998649808e-05, + "loss": 0.2131, + "step": 44212 + }, + { + "epoch": 3.5817401166558653, + "grad_norm": 0.0786905288696289, + "learning_rate": 2.3236869346055178e-05, + "loss": 0.2262, + "step": 44213 + }, + { + "epoch": 3.5818211276733636, + "grad_norm": 0.05377919599413872, + "learning_rate": 2.323236869346055e-05, + "loss": 0.1908, + "step": 44214 + }, + { + "epoch": 3.581902138690862, + "grad_norm": 0.08514394611120224, + "learning_rate": 2.322786804086593e-05, + "loss": 0.2215, + "step": 44215 + }, + { + "epoch": 3.5819831497083605, + "grad_norm": 0.07618945837020874, + "learning_rate": 2.32233673882713e-05, + "loss": 0.2145, + "step": 44216 + }, + { + "epoch": 3.5820641607258588, + "grad_norm": 0.07072417438030243, + "learning_rate": 2.3218866735676676e-05, + "loss": 0.2386, + "step": 44217 + }, + { + "epoch": 3.582145171743357, + "grad_norm": 0.07065349072217941, + "learning_rate": 2.321436608308205e-05, + "loss": 0.2201, + "step": 44218 + }, + { + "epoch": 3.5822261827608557, + "grad_norm": 0.06873950362205505, + "learning_rate": 2.320986543048742e-05, + "loss": 0.1726, + "step": 44219 + }, + { + "epoch": 3.582307193778354, + "grad_norm": 0.07127075642347336, + "learning_rate": 2.3205364777892797e-05, + "loss": 0.2155, + "step": 44220 + }, + { + "epoch": 3.582388204795852, + "grad_norm": 0.08185587078332901, + "learning_rate": 2.320086412529817e-05, + "loss": 0.1968, + 
"step": 44221 + }, + { + "epoch": 3.582469215813351, + "grad_norm": 0.07481518387794495, + "learning_rate": 2.3196363472703544e-05, + "loss": 0.2179, + "step": 44222 + }, + { + "epoch": 3.582550226830849, + "grad_norm": 0.0727316364645958, + "learning_rate": 2.3191862820108917e-05, + "loss": 0.2648, + "step": 44223 + }, + { + "epoch": 3.5826312378483474, + "grad_norm": 0.06189148873090744, + "learning_rate": 2.318736216751429e-05, + "loss": 0.2095, + "step": 44224 + }, + { + "epoch": 3.5827122488658456, + "grad_norm": 0.0742558166384697, + "learning_rate": 2.3182861514919665e-05, + "loss": 0.2582, + "step": 44225 + }, + { + "epoch": 3.5827932598833443, + "grad_norm": 0.07165959477424622, + "learning_rate": 2.3178360862325038e-05, + "loss": 0.2193, + "step": 44226 + }, + { + "epoch": 3.5828742709008425, + "grad_norm": 0.069723941385746, + "learning_rate": 2.3173860209730412e-05, + "loss": 0.2064, + "step": 44227 + }, + { + "epoch": 3.582955281918341, + "grad_norm": 0.09108491241931915, + "learning_rate": 2.3169359557135785e-05, + "loss": 0.2345, + "step": 44228 + }, + { + "epoch": 3.583036292935839, + "grad_norm": 0.07429561018943787, + "learning_rate": 2.316485890454116e-05, + "loss": 0.1973, + "step": 44229 + }, + { + "epoch": 3.5831173039533377, + "grad_norm": 0.08311036229133606, + "learning_rate": 2.3160358251946536e-05, + "loss": 0.2343, + "step": 44230 + }, + { + "epoch": 3.583198314970836, + "grad_norm": 0.06876590847969055, + "learning_rate": 2.3155857599351906e-05, + "loss": 0.2563, + "step": 44231 + }, + { + "epoch": 3.583279325988334, + "grad_norm": 0.06580781936645508, + "learning_rate": 2.315135694675728e-05, + "loss": 0.21, + "step": 44232 + }, + { + "epoch": 3.583360337005833, + "grad_norm": 0.07575564086437225, + "learning_rate": 2.3146856294162657e-05, + "loss": 0.1992, + "step": 44233 + }, + { + "epoch": 3.583441348023331, + "grad_norm": 0.06198256090283394, + "learning_rate": 2.3142355641568027e-05, + "loss": 0.2285, + "step": 44234 + }, + { + "epoch": 3.5835223590408294, + "grad_norm": 0.06165856122970581, + "learning_rate": 2.31378549889734e-05, + "loss": 0.2144, + "step": 44235 + }, + { + "epoch": 3.583603370058328, + "grad_norm": 0.06248553469777107, + "learning_rate": 2.3133354336378778e-05, + "loss": 0.1968, + "step": 44236 + }, + { + "epoch": 3.5836843810758263, + "grad_norm": 0.06722322851419449, + "learning_rate": 2.3128853683784148e-05, + "loss": 0.2275, + "step": 44237 + }, + { + "epoch": 3.5837653920933246, + "grad_norm": 0.07760576903820038, + "learning_rate": 2.3124353031189525e-05, + "loss": 0.2138, + "step": 44238 + }, + { + "epoch": 3.5838464031108233, + "grad_norm": 0.08306940644979477, + "learning_rate": 2.31198523785949e-05, + "loss": 0.2209, + "step": 44239 + }, + { + "epoch": 3.5839274141283215, + "grad_norm": 0.09010027348995209, + "learning_rate": 2.311535172600027e-05, + "loss": 0.2129, + "step": 44240 + }, + { + "epoch": 3.5840084251458197, + "grad_norm": 0.07222206145524979, + "learning_rate": 2.3110851073405646e-05, + "loss": 0.2182, + "step": 44241 + }, + { + "epoch": 3.5840894361633184, + "grad_norm": 0.08256030827760696, + "learning_rate": 2.310635042081102e-05, + "loss": 0.2499, + "step": 44242 + }, + { + "epoch": 3.5841704471808167, + "grad_norm": 0.08997172117233276, + "learning_rate": 2.3101849768216393e-05, + "loss": 0.2351, + "step": 44243 + }, + { + "epoch": 3.584251458198315, + "grad_norm": 0.07810519635677338, + "learning_rate": 2.3097349115621767e-05, + "loss": 0.2291, + "step": 44244 + }, + { + "epoch": 3.5843324692158136, + 
"grad_norm": 0.08510446548461914, + "learning_rate": 2.309284846302714e-05, + "loss": 0.2035, + "step": 44245 + }, + { + "epoch": 3.584413480233312, + "grad_norm": 0.07003391534090042, + "learning_rate": 2.3088347810432514e-05, + "loss": 0.2134, + "step": 44246 + }, + { + "epoch": 3.58449449125081, + "grad_norm": 0.059189002960920334, + "learning_rate": 2.3083847157837887e-05, + "loss": 0.2268, + "step": 44247 + }, + { + "epoch": 3.5845755022683083, + "grad_norm": 0.07633404433727264, + "learning_rate": 2.307934650524326e-05, + "loss": 0.2364, + "step": 44248 + }, + { + "epoch": 3.584656513285807, + "grad_norm": 0.0753362625837326, + "learning_rate": 2.3074845852648635e-05, + "loss": 0.2082, + "step": 44249 + }, + { + "epoch": 3.5847375243033053, + "grad_norm": 0.07483591139316559, + "learning_rate": 2.3070345200054008e-05, + "loss": 0.2038, + "step": 44250 + }, + { + "epoch": 3.5848185353208035, + "grad_norm": 0.07827362418174744, + "learning_rate": 2.3065844547459385e-05, + "loss": 0.2267, + "step": 44251 + }, + { + "epoch": 3.5848995463383018, + "grad_norm": 0.08125095814466476, + "learning_rate": 2.3061343894864755e-05, + "loss": 0.2367, + "step": 44252 + }, + { + "epoch": 3.5849805573558005, + "grad_norm": 0.08842773735523224, + "learning_rate": 2.305684324227013e-05, + "loss": 0.2295, + "step": 44253 + }, + { + "epoch": 3.5850615683732987, + "grad_norm": 0.08604532480239868, + "learning_rate": 2.3052342589675506e-05, + "loss": 0.2305, + "step": 44254 + }, + { + "epoch": 3.585142579390797, + "grad_norm": 0.07639449834823608, + "learning_rate": 2.3047841937080876e-05, + "loss": 0.2107, + "step": 44255 + }, + { + "epoch": 3.5852235904082956, + "grad_norm": 0.07172799855470657, + "learning_rate": 2.3043341284486253e-05, + "loss": 0.2284, + "step": 44256 + }, + { + "epoch": 3.585304601425794, + "grad_norm": 0.08766242116689682, + "learning_rate": 2.3038840631891627e-05, + "loss": 0.2502, + "step": 44257 + }, + { + "epoch": 3.585385612443292, + "grad_norm": 0.05565030872821808, + "learning_rate": 2.3034339979296997e-05, + "loss": 0.2103, + "step": 44258 + }, + { + "epoch": 3.585466623460791, + "grad_norm": 0.08138461410999298, + "learning_rate": 2.3029839326702374e-05, + "loss": 0.2241, + "step": 44259 + }, + { + "epoch": 3.585547634478289, + "grad_norm": 0.08481591194868088, + "learning_rate": 2.3025338674107748e-05, + "loss": 0.236, + "step": 44260 + }, + { + "epoch": 3.5856286454957873, + "grad_norm": 0.0756368413567543, + "learning_rate": 2.3020838021513118e-05, + "loss": 0.2314, + "step": 44261 + }, + { + "epoch": 3.585709656513286, + "grad_norm": 0.06131047382950783, + "learning_rate": 2.3016337368918495e-05, + "loss": 0.2096, + "step": 44262 + }, + { + "epoch": 3.5857906675307842, + "grad_norm": 0.06640103459358215, + "learning_rate": 2.301183671632387e-05, + "loss": 0.2001, + "step": 44263 + }, + { + "epoch": 3.5858716785482825, + "grad_norm": 0.06853748112916946, + "learning_rate": 2.3007336063729242e-05, + "loss": 0.2255, + "step": 44264 + }, + { + "epoch": 3.585952689565781, + "grad_norm": 0.07723915576934814, + "learning_rate": 2.3002835411134616e-05, + "loss": 0.2287, + "step": 44265 + }, + { + "epoch": 3.5860337005832794, + "grad_norm": 0.06693840026855469, + "learning_rate": 2.299833475853999e-05, + "loss": 0.2201, + "step": 44266 + }, + { + "epoch": 3.5861147116007777, + "grad_norm": 0.08216645568609238, + "learning_rate": 2.2993834105945363e-05, + "loss": 0.2183, + "step": 44267 + }, + { + "epoch": 3.5861957226182763, + "grad_norm": 0.07527400553226471, + "learning_rate": 
2.2989333453350736e-05, + "loss": 0.2227, + "step": 44268 + }, + { + "epoch": 3.5862767336357746, + "grad_norm": 0.06903623789548874, + "learning_rate": 2.2984832800756113e-05, + "loss": 0.2249, + "step": 44269 + }, + { + "epoch": 3.586357744653273, + "grad_norm": 0.08289389312267303, + "learning_rate": 2.2980332148161484e-05, + "loss": 0.2302, + "step": 44270 + }, + { + "epoch": 3.586438755670771, + "grad_norm": 0.07931365072727203, + "learning_rate": 2.2975831495566857e-05, + "loss": 0.2304, + "step": 44271 + }, + { + "epoch": 3.5865197666882693, + "grad_norm": 0.08668045699596405, + "learning_rate": 2.2971330842972234e-05, + "loss": 0.2336, + "step": 44272 + }, + { + "epoch": 3.586600777705768, + "grad_norm": 0.07321757823228836, + "learning_rate": 2.2966830190377604e-05, + "loss": 0.2303, + "step": 44273 + }, + { + "epoch": 3.5866817887232663, + "grad_norm": 0.0703120231628418, + "learning_rate": 2.2962329537782978e-05, + "loss": 0.2143, + "step": 44274 + }, + { + "epoch": 3.5867627997407645, + "grad_norm": 0.06930385529994965, + "learning_rate": 2.2957828885188355e-05, + "loss": 0.2813, + "step": 44275 + }, + { + "epoch": 3.586843810758263, + "grad_norm": 0.058735087513923645, + "learning_rate": 2.2953328232593725e-05, + "loss": 0.2308, + "step": 44276 + }, + { + "epoch": 3.5869248217757614, + "grad_norm": 0.06936381757259369, + "learning_rate": 2.2948827579999102e-05, + "loss": 0.2089, + "step": 44277 + }, + { + "epoch": 3.5870058327932597, + "grad_norm": 0.059996139258146286, + "learning_rate": 2.2944326927404476e-05, + "loss": 0.2222, + "step": 44278 + }, + { + "epoch": 3.5870868438107584, + "grad_norm": 0.058839645236730576, + "learning_rate": 2.2939826274809846e-05, + "loss": 0.2038, + "step": 44279 + }, + { + "epoch": 3.5871678548282566, + "grad_norm": 0.06004242226481438, + "learning_rate": 2.2935325622215223e-05, + "loss": 0.226, + "step": 44280 + }, + { + "epoch": 3.587248865845755, + "grad_norm": 0.06038869544863701, + "learning_rate": 2.2930824969620597e-05, + "loss": 0.1797, + "step": 44281 + }, + { + "epoch": 3.5873298768632536, + "grad_norm": 0.07474492490291595, + "learning_rate": 2.292632431702597e-05, + "loss": 0.2361, + "step": 44282 + }, + { + "epoch": 3.587410887880752, + "grad_norm": 0.08312978595495224, + "learning_rate": 2.2921823664431344e-05, + "loss": 0.2651, + "step": 44283 + }, + { + "epoch": 3.58749189889825, + "grad_norm": 0.06367320567369461, + "learning_rate": 2.2917323011836717e-05, + "loss": 0.1975, + "step": 44284 + }, + { + "epoch": 3.5875729099157487, + "grad_norm": 0.06153202801942825, + "learning_rate": 2.291282235924209e-05, + "loss": 0.2128, + "step": 44285 + }, + { + "epoch": 3.587653920933247, + "grad_norm": 0.06177050247788429, + "learning_rate": 2.2908321706647465e-05, + "loss": 0.21, + "step": 44286 + }, + { + "epoch": 3.587734931950745, + "grad_norm": 0.07861583679914474, + "learning_rate": 2.2903821054052838e-05, + "loss": 0.3053, + "step": 44287 + }, + { + "epoch": 3.587815942968244, + "grad_norm": 0.07128918915987015, + "learning_rate": 2.2899320401458212e-05, + "loss": 0.228, + "step": 44288 + }, + { + "epoch": 3.587896953985742, + "grad_norm": 0.0694175511598587, + "learning_rate": 2.2894819748863585e-05, + "loss": 0.2329, + "step": 44289 + }, + { + "epoch": 3.5879779650032404, + "grad_norm": 0.07123053073883057, + "learning_rate": 2.2890319096268962e-05, + "loss": 0.2514, + "step": 44290 + }, + { + "epoch": 3.588058976020739, + "grad_norm": 0.07652550935745239, + "learning_rate": 2.2885818443674333e-05, + "loss": 0.2417, + "step": 
44291 + }, + { + "epoch": 3.5881399870382373, + "grad_norm": 0.07941815257072449, + "learning_rate": 2.2881317791079706e-05, + "loss": 0.231, + "step": 44292 + }, + { + "epoch": 3.5882209980557356, + "grad_norm": 0.08026675879955292, + "learning_rate": 2.2876817138485083e-05, + "loss": 0.2154, + "step": 44293 + }, + { + "epoch": 3.588302009073234, + "grad_norm": 0.07086445391178131, + "learning_rate": 2.2872316485890453e-05, + "loss": 0.1999, + "step": 44294 + }, + { + "epoch": 3.588383020090732, + "grad_norm": 0.07486844062805176, + "learning_rate": 2.286781583329583e-05, + "loss": 0.2385, + "step": 44295 + }, + { + "epoch": 3.5884640311082308, + "grad_norm": 0.08761772513389587, + "learning_rate": 2.2863315180701204e-05, + "loss": 0.2555, + "step": 44296 + }, + { + "epoch": 3.588545042125729, + "grad_norm": 0.08862952888011932, + "learning_rate": 2.2858814528106574e-05, + "loss": 0.2583, + "step": 44297 + }, + { + "epoch": 3.5886260531432272, + "grad_norm": 0.07825177162885666, + "learning_rate": 2.285431387551195e-05, + "loss": 0.2298, + "step": 44298 + }, + { + "epoch": 3.588707064160726, + "grad_norm": 0.06612584739923477, + "learning_rate": 2.2849813222917325e-05, + "loss": 0.197, + "step": 44299 + }, + { + "epoch": 3.588788075178224, + "grad_norm": 0.08789026737213135, + "learning_rate": 2.2845312570322695e-05, + "loss": 0.2412, + "step": 44300 + }, + { + "epoch": 3.5888690861957224, + "grad_norm": 0.06065966933965683, + "learning_rate": 2.2840811917728072e-05, + "loss": 0.227, + "step": 44301 + }, + { + "epoch": 3.588950097213221, + "grad_norm": 0.07560954242944717, + "learning_rate": 2.2836311265133446e-05, + "loss": 0.2034, + "step": 44302 + }, + { + "epoch": 3.5890311082307194, + "grad_norm": 0.0544184185564518, + "learning_rate": 2.283181061253882e-05, + "loss": 0.2168, + "step": 44303 + }, + { + "epoch": 3.5891121192482176, + "grad_norm": 0.06866674870252609, + "learning_rate": 2.2827309959944193e-05, + "loss": 0.2074, + "step": 44304 + }, + { + "epoch": 3.5891931302657163, + "grad_norm": 0.06643018871545792, + "learning_rate": 2.2822809307349567e-05, + "loss": 0.2423, + "step": 44305 + }, + { + "epoch": 3.5892741412832145, + "grad_norm": 0.055913008749485016, + "learning_rate": 2.281830865475494e-05, + "loss": 0.2113, + "step": 44306 + }, + { + "epoch": 3.589355152300713, + "grad_norm": 0.05831152945756912, + "learning_rate": 2.2813808002160314e-05, + "loss": 0.1939, + "step": 44307 + }, + { + "epoch": 3.5894361633182115, + "grad_norm": 0.06937964260578156, + "learning_rate": 2.280930734956569e-05, + "loss": 0.2448, + "step": 44308 + }, + { + "epoch": 3.5895171743357097, + "grad_norm": 0.08315081894397736, + "learning_rate": 2.280480669697106e-05, + "loss": 0.2412, + "step": 44309 + }, + { + "epoch": 3.589598185353208, + "grad_norm": 0.0808417946100235, + "learning_rate": 2.2800306044376435e-05, + "loss": 0.2078, + "step": 44310 + }, + { + "epoch": 3.5896791963707066, + "grad_norm": 0.055978160351514816, + "learning_rate": 2.279580539178181e-05, + "loss": 0.185, + "step": 44311 + }, + { + "epoch": 3.589760207388205, + "grad_norm": 0.06848358362913132, + "learning_rate": 2.2791304739187182e-05, + "loss": 0.1972, + "step": 44312 + }, + { + "epoch": 3.589841218405703, + "grad_norm": 0.07640522718429565, + "learning_rate": 2.2786804086592555e-05, + "loss": 0.2048, + "step": 44313 + }, + { + "epoch": 3.589922229423202, + "grad_norm": 0.06737855076789856, + "learning_rate": 2.2782303433997932e-05, + "loss": 0.2057, + "step": 44314 + }, + { + "epoch": 3.5900032404407, + "grad_norm": 
0.08041815459728241, + "learning_rate": 2.2777802781403303e-05, + "loss": 0.2278, + "step": 44315 + }, + { + "epoch": 3.5900842514581983, + "grad_norm": 0.06357849389314651, + "learning_rate": 2.277330212880868e-05, + "loss": 0.2005, + "step": 44316 + }, + { + "epoch": 3.5901652624756966, + "grad_norm": 0.05959125980734825, + "learning_rate": 2.2768801476214053e-05, + "loss": 0.1978, + "step": 44317 + }, + { + "epoch": 3.590246273493195, + "grad_norm": 0.06926389038562775, + "learning_rate": 2.2764300823619423e-05, + "loss": 0.211, + "step": 44318 + }, + { + "epoch": 3.5903272845106935, + "grad_norm": 0.07221662253141403, + "learning_rate": 2.27598001710248e-05, + "loss": 0.2416, + "step": 44319 + }, + { + "epoch": 3.5904082955281917, + "grad_norm": 0.08942881226539612, + "learning_rate": 2.2755299518430174e-05, + "loss": 0.2856, + "step": 44320 + }, + { + "epoch": 3.59048930654569, + "grad_norm": 0.0759754478931427, + "learning_rate": 2.2750798865835548e-05, + "loss": 0.2244, + "step": 44321 + }, + { + "epoch": 3.5905703175631887, + "grad_norm": 0.07223279029130936, + "learning_rate": 2.274629821324092e-05, + "loss": 0.2351, + "step": 44322 + }, + { + "epoch": 3.590651328580687, + "grad_norm": 0.08292268961668015, + "learning_rate": 2.2741797560646295e-05, + "loss": 0.1953, + "step": 44323 + }, + { + "epoch": 3.590732339598185, + "grad_norm": 0.06477510929107666, + "learning_rate": 2.273729690805167e-05, + "loss": 0.1859, + "step": 44324 + }, + { + "epoch": 3.590813350615684, + "grad_norm": 0.08031325787305832, + "learning_rate": 2.2732796255457042e-05, + "loss": 0.2224, + "step": 44325 + }, + { + "epoch": 3.590894361633182, + "grad_norm": 0.07873866707086563, + "learning_rate": 2.2728295602862416e-05, + "loss": 0.2035, + "step": 44326 + }, + { + "epoch": 3.5909753726506803, + "grad_norm": 0.05760648846626282, + "learning_rate": 2.272379495026779e-05, + "loss": 0.1932, + "step": 44327 + }, + { + "epoch": 3.591056383668179, + "grad_norm": 0.06344673037528992, + "learning_rate": 2.2719294297673163e-05, + "loss": 0.1984, + "step": 44328 + }, + { + "epoch": 3.5911373946856773, + "grad_norm": 0.050335634499788284, + "learning_rate": 2.271479364507854e-05, + "loss": 0.2229, + "step": 44329 + }, + { + "epoch": 3.5912184057031755, + "grad_norm": 0.06632192432880402, + "learning_rate": 2.271029299248391e-05, + "loss": 0.2532, + "step": 44330 + }, + { + "epoch": 3.591299416720674, + "grad_norm": 0.05906035751104355, + "learning_rate": 2.2705792339889284e-05, + "loss": 0.1707, + "step": 44331 + }, + { + "epoch": 3.5913804277381725, + "grad_norm": 0.059910062700510025, + "learning_rate": 2.270129168729466e-05, + "loss": 0.2044, + "step": 44332 + }, + { + "epoch": 3.5914614387556707, + "grad_norm": 0.08762989193201065, + "learning_rate": 2.269679103470003e-05, + "loss": 0.2484, + "step": 44333 + }, + { + "epoch": 3.5915424497731694, + "grad_norm": 0.07183953374624252, + "learning_rate": 2.2692290382105404e-05, + "loss": 0.2337, + "step": 44334 + }, + { + "epoch": 3.5916234607906676, + "grad_norm": 0.05912299081683159, + "learning_rate": 2.268778972951078e-05, + "loss": 0.2325, + "step": 44335 + }, + { + "epoch": 3.591704471808166, + "grad_norm": 0.09255866706371307, + "learning_rate": 2.268328907691615e-05, + "loss": 0.2686, + "step": 44336 + }, + { + "epoch": 3.5917854828256646, + "grad_norm": 0.08099143952131271, + "learning_rate": 2.267878842432153e-05, + "loss": 0.2208, + "step": 44337 + }, + { + "epoch": 3.591866493843163, + "grad_norm": 0.08440107107162476, + "learning_rate": 
2.2674287771726902e-05, + "loss": 0.2394, + "step": 44338 + }, + { + "epoch": 3.591947504860661, + "grad_norm": 0.07279936969280243, + "learning_rate": 2.2669787119132276e-05, + "loss": 0.2285, + "step": 44339 + }, + { + "epoch": 3.5920285158781593, + "grad_norm": 0.07071520388126373, + "learning_rate": 2.266528646653765e-05, + "loss": 0.2205, + "step": 44340 + }, + { + "epoch": 3.5921095268956575, + "grad_norm": 0.09466453641653061, + "learning_rate": 2.2660785813943023e-05, + "loss": 0.2376, + "step": 44341 + }, + { + "epoch": 3.5921905379131562, + "grad_norm": 0.06887936592102051, + "learning_rate": 2.2656285161348397e-05, + "loss": 0.1952, + "step": 44342 + }, + { + "epoch": 3.5922715489306545, + "grad_norm": 0.0704905167222023, + "learning_rate": 2.265178450875377e-05, + "loss": 0.2008, + "step": 44343 + }, + { + "epoch": 3.5923525599481527, + "grad_norm": 0.07464319467544556, + "learning_rate": 2.2647283856159144e-05, + "loss": 0.2208, + "step": 44344 + }, + { + "epoch": 3.5924335709656514, + "grad_norm": 0.058817021548748016, + "learning_rate": 2.2642783203564517e-05, + "loss": 0.2325, + "step": 44345 + }, + { + "epoch": 3.5925145819831497, + "grad_norm": 0.08308090269565582, + "learning_rate": 2.263828255096989e-05, + "loss": 0.1987, + "step": 44346 + }, + { + "epoch": 3.592595593000648, + "grad_norm": 0.08090384304523468, + "learning_rate": 2.2633781898375265e-05, + "loss": 0.2153, + "step": 44347 + }, + { + "epoch": 3.5926766040181466, + "grad_norm": 0.06967276334762573, + "learning_rate": 2.2629281245780638e-05, + "loss": 0.2359, + "step": 44348 + }, + { + "epoch": 3.592757615035645, + "grad_norm": 0.06994897127151489, + "learning_rate": 2.2624780593186012e-05, + "loss": 0.2142, + "step": 44349 + }, + { + "epoch": 3.592838626053143, + "grad_norm": 0.0665275827050209, + "learning_rate": 2.262027994059139e-05, + "loss": 0.21, + "step": 44350 + }, + { + "epoch": 3.5929196370706418, + "grad_norm": 0.0679289698600769, + "learning_rate": 2.261577928799676e-05, + "loss": 0.2478, + "step": 44351 + }, + { + "epoch": 3.59300064808814, + "grad_norm": 0.06700301915407181, + "learning_rate": 2.2611278635402133e-05, + "loss": 0.2499, + "step": 44352 + }, + { + "epoch": 3.5930816591056383, + "grad_norm": 0.06900755316019058, + "learning_rate": 2.260677798280751e-05, + "loss": 0.2314, + "step": 44353 + }, + { + "epoch": 3.593162670123137, + "grad_norm": 0.07342938333749771, + "learning_rate": 2.260227733021288e-05, + "loss": 0.1744, + "step": 44354 + }, + { + "epoch": 3.593243681140635, + "grad_norm": 0.08509702235460281, + "learning_rate": 2.2597776677618257e-05, + "loss": 0.2156, + "step": 44355 + }, + { + "epoch": 3.5933246921581334, + "grad_norm": 0.06477095931768417, + "learning_rate": 2.259327602502363e-05, + "loss": 0.2191, + "step": 44356 + }, + { + "epoch": 3.593405703175632, + "grad_norm": 0.0756278708577156, + "learning_rate": 2.2588775372429004e-05, + "loss": 0.2166, + "step": 44357 + }, + { + "epoch": 3.5934867141931304, + "grad_norm": 0.07322674989700317, + "learning_rate": 2.2584274719834378e-05, + "loss": 0.2112, + "step": 44358 + }, + { + "epoch": 3.5935677252106286, + "grad_norm": 0.07110437750816345, + "learning_rate": 2.257977406723975e-05, + "loss": 0.2165, + "step": 44359 + }, + { + "epoch": 3.593648736228127, + "grad_norm": 0.07313544303178787, + "learning_rate": 2.2575273414645125e-05, + "loss": 0.2229, + "step": 44360 + }, + { + "epoch": 3.5937297472456255, + "grad_norm": 0.06659787148237228, + "learning_rate": 2.25707727620505e-05, + "loss": 0.2132, + "step": 44361 + 
}, + { + "epoch": 3.593810758263124, + "grad_norm": 0.06736768782138824, + "learning_rate": 2.2566272109455872e-05, + "loss": 0.2004, + "step": 44362 + }, + { + "epoch": 3.593891769280622, + "grad_norm": 0.0768749788403511, + "learning_rate": 2.2561771456861246e-05, + "loss": 0.2116, + "step": 44363 + }, + { + "epoch": 3.5939727802981203, + "grad_norm": 0.06405490636825562, + "learning_rate": 2.255727080426662e-05, + "loss": 0.2244, + "step": 44364 + }, + { + "epoch": 3.594053791315619, + "grad_norm": 0.07622356712818146, + "learning_rate": 2.2552770151671993e-05, + "loss": 0.1906, + "step": 44365 + }, + { + "epoch": 3.594134802333117, + "grad_norm": 0.08193568140268326, + "learning_rate": 2.2548269499077366e-05, + "loss": 0.2702, + "step": 44366 + }, + { + "epoch": 3.5942158133506155, + "grad_norm": 0.06567830592393875, + "learning_rate": 2.254376884648274e-05, + "loss": 0.2097, + "step": 44367 + }, + { + "epoch": 3.594296824368114, + "grad_norm": 0.06329605728387833, + "learning_rate": 2.2539268193888117e-05, + "loss": 0.249, + "step": 44368 + }, + { + "epoch": 3.5943778353856124, + "grad_norm": 0.09080575406551361, + "learning_rate": 2.2534767541293487e-05, + "loss": 0.252, + "step": 44369 + }, + { + "epoch": 3.5944588464031106, + "grad_norm": 0.09907004982233047, + "learning_rate": 2.253026688869886e-05, + "loss": 0.2272, + "step": 44370 + }, + { + "epoch": 3.5945398574206093, + "grad_norm": 0.05679634213447571, + "learning_rate": 2.2525766236104238e-05, + "loss": 0.2007, + "step": 44371 + }, + { + "epoch": 3.5946208684381076, + "grad_norm": 0.07971812784671783, + "learning_rate": 2.252126558350961e-05, + "loss": 0.2106, + "step": 44372 + }, + { + "epoch": 3.594701879455606, + "grad_norm": 0.07787297666072845, + "learning_rate": 2.2516764930914982e-05, + "loss": 0.2243, + "step": 44373 + }, + { + "epoch": 3.5947828904731045, + "grad_norm": 0.0740995705127716, + "learning_rate": 2.251226427832036e-05, + "loss": 0.226, + "step": 44374 + }, + { + "epoch": 3.5948639014906028, + "grad_norm": 0.08992590010166168, + "learning_rate": 2.2507763625725732e-05, + "loss": 0.2217, + "step": 44375 + }, + { + "epoch": 3.594944912508101, + "grad_norm": 0.05741897225379944, + "learning_rate": 2.2503262973131106e-05, + "loss": 0.241, + "step": 44376 + }, + { + "epoch": 3.5950259235255997, + "grad_norm": 0.0739036351442337, + "learning_rate": 2.249876232053648e-05, + "loss": 0.2389, + "step": 44377 + }, + { + "epoch": 3.595106934543098, + "grad_norm": 0.06869626045227051, + "learning_rate": 2.2494261667941853e-05, + "loss": 0.2281, + "step": 44378 + }, + { + "epoch": 3.595187945560596, + "grad_norm": 0.07012733072042465, + "learning_rate": 2.2489761015347227e-05, + "loss": 0.2321, + "step": 44379 + }, + { + "epoch": 3.595268956578095, + "grad_norm": 0.07144074887037277, + "learning_rate": 2.24852603627526e-05, + "loss": 0.2223, + "step": 44380 + }, + { + "epoch": 3.595349967595593, + "grad_norm": 0.06714019179344177, + "learning_rate": 2.2480759710157974e-05, + "loss": 0.2119, + "step": 44381 + }, + { + "epoch": 3.5954309786130914, + "grad_norm": 0.068583644926548, + "learning_rate": 2.2476259057563348e-05, + "loss": 0.189, + "step": 44382 + }, + { + "epoch": 3.5955119896305896, + "grad_norm": 0.07480515539646149, + "learning_rate": 2.247175840496872e-05, + "loss": 0.2178, + "step": 44383 + }, + { + "epoch": 3.5955930006480883, + "grad_norm": 0.06940307468175888, + "learning_rate": 2.2467257752374095e-05, + "loss": 0.2033, + "step": 44384 + }, + { + "epoch": 3.5956740116655865, + "grad_norm": 
0.058958880603313446, + "learning_rate": 2.246275709977947e-05, + "loss": 0.2293, + "step": 44385 + }, + { + "epoch": 3.595755022683085, + "grad_norm": 0.09506436437368393, + "learning_rate": 2.2458256447184842e-05, + "loss": 0.2159, + "step": 44386 + }, + { + "epoch": 3.595836033700583, + "grad_norm": 0.06890156120061874, + "learning_rate": 2.2453755794590216e-05, + "loss": 0.2072, + "step": 44387 + }, + { + "epoch": 3.5959170447180817, + "grad_norm": 0.07868379354476929, + "learning_rate": 2.244925514199559e-05, + "loss": 0.2522, + "step": 44388 + }, + { + "epoch": 3.59599805573558, + "grad_norm": 0.08256050199270248, + "learning_rate": 2.2444754489400966e-05, + "loss": 0.2084, + "step": 44389 + }, + { + "epoch": 3.596079066753078, + "grad_norm": 0.07400922477245331, + "learning_rate": 2.244025383680634e-05, + "loss": 0.2198, + "step": 44390 + }, + { + "epoch": 3.596160077770577, + "grad_norm": 0.0915006697177887, + "learning_rate": 2.243575318421171e-05, + "loss": 0.248, + "step": 44391 + }, + { + "epoch": 3.596241088788075, + "grad_norm": 0.09188992530107498, + "learning_rate": 2.2431252531617087e-05, + "loss": 0.2574, + "step": 44392 + }, + { + "epoch": 3.5963220998055734, + "grad_norm": 0.07915124297142029, + "learning_rate": 2.242675187902246e-05, + "loss": 0.2752, + "step": 44393 + }, + { + "epoch": 3.596403110823072, + "grad_norm": 0.062227100133895874, + "learning_rate": 2.2422251226427834e-05, + "loss": 0.2161, + "step": 44394 + }, + { + "epoch": 3.5964841218405703, + "grad_norm": 0.06798578798770905, + "learning_rate": 2.2417750573833208e-05, + "loss": 0.2092, + "step": 44395 + }, + { + "epoch": 3.5965651328580686, + "grad_norm": 0.06951133906841278, + "learning_rate": 2.241324992123858e-05, + "loss": 0.19, + "step": 44396 + }, + { + "epoch": 3.5966461438755672, + "grad_norm": 0.06292098015546799, + "learning_rate": 2.2408749268643955e-05, + "loss": 0.2326, + "step": 44397 + }, + { + "epoch": 3.5967271548930655, + "grad_norm": 0.0799744501709938, + "learning_rate": 2.240424861604933e-05, + "loss": 0.2192, + "step": 44398 + }, + { + "epoch": 3.5968081659105637, + "grad_norm": 0.0672338604927063, + "learning_rate": 2.2399747963454702e-05, + "loss": 0.2101, + "step": 44399 + }, + { + "epoch": 3.5968891769280624, + "grad_norm": 0.08367151021957397, + "learning_rate": 2.2395247310860076e-05, + "loss": 0.2245, + "step": 44400 + }, + { + "epoch": 3.5969701879455607, + "grad_norm": 0.07287932187318802, + "learning_rate": 2.239074665826545e-05, + "loss": 0.2278, + "step": 44401 + }, + { + "epoch": 3.597051198963059, + "grad_norm": 0.06872007995843887, + "learning_rate": 2.2386246005670823e-05, + "loss": 0.2436, + "step": 44402 + }, + { + "epoch": 3.5971322099805576, + "grad_norm": 0.08028792589902878, + "learning_rate": 2.2381745353076197e-05, + "loss": 0.2507, + "step": 44403 + }, + { + "epoch": 3.597213220998056, + "grad_norm": 0.08446816354990005, + "learning_rate": 2.237724470048157e-05, + "loss": 0.2359, + "step": 44404 + }, + { + "epoch": 3.597294232015554, + "grad_norm": 0.06247890740633011, + "learning_rate": 2.2372744047886944e-05, + "loss": 0.1942, + "step": 44405 + }, + { + "epoch": 3.5973752430330523, + "grad_norm": 0.07593336701393127, + "learning_rate": 2.2368243395292317e-05, + "loss": 0.2303, + "step": 44406 + }, + { + "epoch": 3.597456254050551, + "grad_norm": 0.06649833917617798, + "learning_rate": 2.236374274269769e-05, + "loss": 0.2258, + "step": 44407 + }, + { + "epoch": 3.5975372650680493, + "grad_norm": 0.060069162398576736, + "learning_rate": 
2.2359242090103068e-05, + "loss": 0.241, + "step": 44408 + }, + { + "epoch": 3.5976182760855475, + "grad_norm": 0.0727672204375267, + "learning_rate": 2.2354741437508438e-05, + "loss": 0.2355, + "step": 44409 + }, + { + "epoch": 3.5976992871030458, + "grad_norm": 0.06563632190227509, + "learning_rate": 2.2350240784913815e-05, + "loss": 0.2434, + "step": 44410 + }, + { + "epoch": 3.5977802981205445, + "grad_norm": 0.06771665811538696, + "learning_rate": 2.234574013231919e-05, + "loss": 0.2109, + "step": 44411 + }, + { + "epoch": 3.5978613091380427, + "grad_norm": 0.07404633611440659, + "learning_rate": 2.234123947972456e-05, + "loss": 0.2942, + "step": 44412 + }, + { + "epoch": 3.597942320155541, + "grad_norm": 0.0690755620598793, + "learning_rate": 2.2336738827129936e-05, + "loss": 0.237, + "step": 44413 + }, + { + "epoch": 3.5980233311730396, + "grad_norm": 0.06853065639734268, + "learning_rate": 2.233223817453531e-05, + "loss": 0.1816, + "step": 44414 + }, + { + "epoch": 3.598104342190538, + "grad_norm": 0.07963988929986954, + "learning_rate": 2.2327737521940683e-05, + "loss": 0.2502, + "step": 44415 + }, + { + "epoch": 3.598185353208036, + "grad_norm": 0.07584356516599655, + "learning_rate": 2.2323236869346057e-05, + "loss": 0.2542, + "step": 44416 + }, + { + "epoch": 3.598266364225535, + "grad_norm": 0.07289121299982071, + "learning_rate": 2.231873621675143e-05, + "loss": 0.2325, + "step": 44417 + }, + { + "epoch": 3.598347375243033, + "grad_norm": 0.0739700198173523, + "learning_rate": 2.2314235564156804e-05, + "loss": 0.2507, + "step": 44418 + }, + { + "epoch": 3.5984283862605313, + "grad_norm": 0.07991982251405716, + "learning_rate": 2.2309734911562178e-05, + "loss": 0.2284, + "step": 44419 + }, + { + "epoch": 3.59850939727803, + "grad_norm": 0.08472296595573425, + "learning_rate": 2.230523425896755e-05, + "loss": 0.2609, + "step": 44420 + }, + { + "epoch": 3.5985904082955282, + "grad_norm": 0.06942620128393173, + "learning_rate": 2.2300733606372925e-05, + "loss": 0.2495, + "step": 44421 + }, + { + "epoch": 3.5986714193130265, + "grad_norm": 0.05914752930402756, + "learning_rate": 2.22962329537783e-05, + "loss": 0.2258, + "step": 44422 + }, + { + "epoch": 3.598752430330525, + "grad_norm": 0.06569509953260422, + "learning_rate": 2.2291732301183675e-05, + "loss": 0.2095, + "step": 44423 + }, + { + "epoch": 3.5988334413480234, + "grad_norm": 0.06737083941698074, + "learning_rate": 2.2287231648589046e-05, + "loss": 0.1989, + "step": 44424 + }, + { + "epoch": 3.5989144523655217, + "grad_norm": 0.07976718991994858, + "learning_rate": 2.228273099599442e-05, + "loss": 0.2179, + "step": 44425 + }, + { + "epoch": 3.5989954633830203, + "grad_norm": 0.06348604708909988, + "learning_rate": 2.2278230343399796e-05, + "loss": 0.2015, + "step": 44426 + }, + { + "epoch": 3.5990764744005186, + "grad_norm": 0.08161592483520508, + "learning_rate": 2.2273729690805166e-05, + "loss": 0.1966, + "step": 44427 + }, + { + "epoch": 3.599157485418017, + "grad_norm": 0.0814145877957344, + "learning_rate": 2.2269229038210543e-05, + "loss": 0.236, + "step": 44428 + }, + { + "epoch": 3.599238496435515, + "grad_norm": 0.07275941967964172, + "learning_rate": 2.2264728385615917e-05, + "loss": 0.2234, + "step": 44429 + }, + { + "epoch": 3.5993195074530138, + "grad_norm": 0.07201056182384491, + "learning_rate": 2.2260227733021287e-05, + "loss": 0.2436, + "step": 44430 + }, + { + "epoch": 3.599400518470512, + "grad_norm": 0.06446519494056702, + "learning_rate": 2.2255727080426664e-05, + "loss": 0.1978, + "step": 44431 + 
}, + { + "epoch": 3.5994815294880103, + "grad_norm": 0.0709887146949768, + "learning_rate": 2.2251226427832038e-05, + "loss": 0.2143, + "step": 44432 + }, + { + "epoch": 3.5995625405055085, + "grad_norm": 0.06211159750819206, + "learning_rate": 2.2246725775237408e-05, + "loss": 0.226, + "step": 44433 + }, + { + "epoch": 3.599643551523007, + "grad_norm": 0.0790424570441246, + "learning_rate": 2.2242225122642785e-05, + "loss": 0.2229, + "step": 44434 + }, + { + "epoch": 3.5997245625405054, + "grad_norm": 0.06854367256164551, + "learning_rate": 2.223772447004816e-05, + "loss": 0.2276, + "step": 44435 + }, + { + "epoch": 3.5998055735580037, + "grad_norm": 0.08491768687963486, + "learning_rate": 2.2233223817453532e-05, + "loss": 0.2221, + "step": 44436 + }, + { + "epoch": 3.5998865845755024, + "grad_norm": 0.08224175870418549, + "learning_rate": 2.2228723164858906e-05, + "loss": 0.2581, + "step": 44437 + }, + { + "epoch": 3.5999675955930006, + "grad_norm": 0.07533169537782669, + "learning_rate": 2.222422251226428e-05, + "loss": 0.2383, + "step": 44438 + }, + { + "epoch": 3.600048606610499, + "grad_norm": 0.06689838320016861, + "learning_rate": 2.2219721859669653e-05, + "loss": 0.225, + "step": 44439 + }, + { + "epoch": 3.6001296176279975, + "grad_norm": 0.07553958892822266, + "learning_rate": 2.2215221207075027e-05, + "loss": 0.2053, + "step": 44440 + }, + { + "epoch": 3.600210628645496, + "grad_norm": 0.05749347805976868, + "learning_rate": 2.2210720554480404e-05, + "loss": 0.2049, + "step": 44441 + }, + { + "epoch": 3.600291639662994, + "grad_norm": 0.0675349161028862, + "learning_rate": 2.2206219901885774e-05, + "loss": 0.2247, + "step": 44442 + }, + { + "epoch": 3.6003726506804927, + "grad_norm": 0.06793300807476044, + "learning_rate": 2.2201719249291147e-05, + "loss": 0.1925, + "step": 44443 + }, + { + "epoch": 3.600453661697991, + "grad_norm": 0.07396746426820755, + "learning_rate": 2.2197218596696524e-05, + "loss": 0.2137, + "step": 44444 + }, + { + "epoch": 3.600534672715489, + "grad_norm": 0.07258230447769165, + "learning_rate": 2.2192717944101895e-05, + "loss": 0.2222, + "step": 44445 + }, + { + "epoch": 3.600615683732988, + "grad_norm": 0.06613799929618835, + "learning_rate": 2.2188217291507268e-05, + "loss": 0.2336, + "step": 44446 + }, + { + "epoch": 3.600696694750486, + "grad_norm": 0.095277801156044, + "learning_rate": 2.2183716638912645e-05, + "loss": 0.2383, + "step": 44447 + }, + { + "epoch": 3.6007777057679844, + "grad_norm": 0.08406014740467072, + "learning_rate": 2.2179215986318015e-05, + "loss": 0.2457, + "step": 44448 + }, + { + "epoch": 3.600858716785483, + "grad_norm": 0.062288954854011536, + "learning_rate": 2.2174715333723392e-05, + "loss": 0.2496, + "step": 44449 + }, + { + "epoch": 3.6009397278029813, + "grad_norm": 0.07066863775253296, + "learning_rate": 2.2170214681128766e-05, + "loss": 0.2206, + "step": 44450 + }, + { + "epoch": 3.6010207388204796, + "grad_norm": 0.08926728367805481, + "learning_rate": 2.2165714028534136e-05, + "loss": 0.2167, + "step": 44451 + }, + { + "epoch": 3.601101749837978, + "grad_norm": 0.06768003851175308, + "learning_rate": 2.2161213375939513e-05, + "loss": 0.2062, + "step": 44452 + }, + { + "epoch": 3.6011827608554765, + "grad_norm": 0.07586654275655746, + "learning_rate": 2.2156712723344887e-05, + "loss": 0.2718, + "step": 44453 + }, + { + "epoch": 3.6012637718729748, + "grad_norm": 0.07940798252820969, + "learning_rate": 2.215221207075026e-05, + "loss": 0.2498, + "step": 44454 + }, + { + "epoch": 3.601344782890473, + "grad_norm": 
0.07156102359294891, + "learning_rate": 2.2147711418155634e-05, + "loss": 0.2055, + "step": 44455 + }, + { + "epoch": 3.6014257939079712, + "grad_norm": 0.06990113854408264, + "learning_rate": 2.2143210765561008e-05, + "loss": 0.2067, + "step": 44456 + }, + { + "epoch": 3.60150680492547, + "grad_norm": 0.06797709316015244, + "learning_rate": 2.213871011296638e-05, + "loss": 0.2087, + "step": 44457 + }, + { + "epoch": 3.601587815942968, + "grad_norm": 0.06196364760398865, + "learning_rate": 2.2134209460371755e-05, + "loss": 0.2331, + "step": 44458 + }, + { + "epoch": 3.6016688269604664, + "grad_norm": 0.07308321446180344, + "learning_rate": 2.212970880777713e-05, + "loss": 0.2462, + "step": 44459 + }, + { + "epoch": 3.601749837977965, + "grad_norm": 0.08339798450469971, + "learning_rate": 2.2125208155182502e-05, + "loss": 0.1993, + "step": 44460 + }, + { + "epoch": 3.6018308489954634, + "grad_norm": 0.07711708545684814, + "learning_rate": 2.2120707502587876e-05, + "loss": 0.181, + "step": 44461 + }, + { + "epoch": 3.6019118600129616, + "grad_norm": 0.06854071468114853, + "learning_rate": 2.2116206849993253e-05, + "loss": 0.1748, + "step": 44462 + }, + { + "epoch": 3.6019928710304603, + "grad_norm": 0.07287586480379105, + "learning_rate": 2.2111706197398623e-05, + "loss": 0.2096, + "step": 44463 + }, + { + "epoch": 3.6020738820479585, + "grad_norm": 0.07109333574771881, + "learning_rate": 2.2107205544803997e-05, + "loss": 0.2754, + "step": 44464 + }, + { + "epoch": 3.6021548930654568, + "grad_norm": 0.06433144956827164, + "learning_rate": 2.2102704892209374e-05, + "loss": 0.2237, + "step": 44465 + }, + { + "epoch": 3.6022359040829555, + "grad_norm": 0.07228867709636688, + "learning_rate": 2.2098204239614744e-05, + "loss": 0.2134, + "step": 44466 + }, + { + "epoch": 3.6023169151004537, + "grad_norm": 0.09589815884828568, + "learning_rate": 2.209370358702012e-05, + "loss": 0.209, + "step": 44467 + }, + { + "epoch": 3.602397926117952, + "grad_norm": 0.08793746680021286, + "learning_rate": 2.2089202934425494e-05, + "loss": 0.2464, + "step": 44468 + }, + { + "epoch": 3.6024789371354506, + "grad_norm": 0.07410049438476562, + "learning_rate": 2.2084702281830865e-05, + "loss": 0.2563, + "step": 44469 + }, + { + "epoch": 3.602559948152949, + "grad_norm": 0.07637574523687363, + "learning_rate": 2.208020162923624e-05, + "loss": 0.2466, + "step": 44470 + }, + { + "epoch": 3.602640959170447, + "grad_norm": 0.06692693382501602, + "learning_rate": 2.2075700976641615e-05, + "loss": 0.2339, + "step": 44471 + }, + { + "epoch": 3.602721970187946, + "grad_norm": 0.06131626293063164, + "learning_rate": 2.2071200324046985e-05, + "loss": 0.2251, + "step": 44472 + }, + { + "epoch": 3.602802981205444, + "grad_norm": 0.06866336613893509, + "learning_rate": 2.2066699671452362e-05, + "loss": 0.2066, + "step": 44473 + }, + { + "epoch": 3.6028839922229423, + "grad_norm": 0.05573740229010582, + "learning_rate": 2.2062199018857736e-05, + "loss": 0.193, + "step": 44474 + }, + { + "epoch": 3.6029650032404406, + "grad_norm": 0.06605105847120285, + "learning_rate": 2.205769836626311e-05, + "loss": 0.2499, + "step": 44475 + }, + { + "epoch": 3.6030460142579392, + "grad_norm": 0.06071867421269417, + "learning_rate": 2.2053197713668483e-05, + "loss": 0.2128, + "step": 44476 + }, + { + "epoch": 3.6031270252754375, + "grad_norm": 0.06595513224601746, + "learning_rate": 2.2048697061073857e-05, + "loss": 0.1721, + "step": 44477 + }, + { + "epoch": 3.6032080362929357, + "grad_norm": 0.07105343788862228, + "learning_rate": 
2.204419640847923e-05, + "loss": 0.2182, + "step": 44478 + }, + { + "epoch": 3.603289047310434, + "grad_norm": 0.06703602522611618, + "learning_rate": 2.2039695755884604e-05, + "loss": 0.2671, + "step": 44479 + }, + { + "epoch": 3.6033700583279327, + "grad_norm": 0.06646741926670074, + "learning_rate": 2.2035195103289978e-05, + "loss": 0.1988, + "step": 44480 + }, + { + "epoch": 3.603451069345431, + "grad_norm": 0.07492316514253616, + "learning_rate": 2.203069445069535e-05, + "loss": 0.2097, + "step": 44481 + }, + { + "epoch": 3.603532080362929, + "grad_norm": 0.08570502698421478, + "learning_rate": 2.2026193798100725e-05, + "loss": 0.2202, + "step": 44482 + }, + { + "epoch": 3.603613091380428, + "grad_norm": 0.07124827057123184, + "learning_rate": 2.2021693145506102e-05, + "loss": 0.2481, + "step": 44483 + }, + { + "epoch": 3.603694102397926, + "grad_norm": 0.0758836641907692, + "learning_rate": 2.2017192492911472e-05, + "loss": 0.1894, + "step": 44484 + }, + { + "epoch": 3.6037751134154243, + "grad_norm": 0.07663784176111221, + "learning_rate": 2.2012691840316846e-05, + "loss": 0.253, + "step": 44485 + }, + { + "epoch": 3.603856124432923, + "grad_norm": 0.07608118653297424, + "learning_rate": 2.2008191187722223e-05, + "loss": 0.1728, + "step": 44486 + }, + { + "epoch": 3.6039371354504213, + "grad_norm": 0.08834954351186752, + "learning_rate": 2.2003690535127593e-05, + "loss": 0.2255, + "step": 44487 + }, + { + "epoch": 3.6040181464679195, + "grad_norm": 0.09142038226127625, + "learning_rate": 2.199918988253297e-05, + "loss": 0.2572, + "step": 44488 + }, + { + "epoch": 3.604099157485418, + "grad_norm": 0.07551602274179459, + "learning_rate": 2.1994689229938343e-05, + "loss": 0.2013, + "step": 44489 + }, + { + "epoch": 3.6041801685029164, + "grad_norm": 0.07208868861198425, + "learning_rate": 2.1990188577343714e-05, + "loss": 0.2482, + "step": 44490 + }, + { + "epoch": 3.6042611795204147, + "grad_norm": 0.0773419663310051, + "learning_rate": 2.198568792474909e-05, + "loss": 0.2167, + "step": 44491 + }, + { + "epoch": 3.6043421905379134, + "grad_norm": 0.0722963884472847, + "learning_rate": 2.1981187272154464e-05, + "loss": 0.2747, + "step": 44492 + }, + { + "epoch": 3.6044232015554116, + "grad_norm": 0.08947137743234634, + "learning_rate": 2.1976686619559834e-05, + "loss": 0.1939, + "step": 44493 + }, + { + "epoch": 3.60450421257291, + "grad_norm": 0.07143185287714005, + "learning_rate": 2.197218596696521e-05, + "loss": 0.2449, + "step": 44494 + }, + { + "epoch": 3.6045852235904086, + "grad_norm": 0.08382023870944977, + "learning_rate": 2.1967685314370585e-05, + "loss": 0.2386, + "step": 44495 + }, + { + "epoch": 3.604666234607907, + "grad_norm": 0.07039798051118851, + "learning_rate": 2.196318466177596e-05, + "loss": 0.2348, + "step": 44496 + }, + { + "epoch": 3.604747245625405, + "grad_norm": 0.08140499144792557, + "learning_rate": 2.1958684009181332e-05, + "loss": 0.2135, + "step": 44497 + }, + { + "epoch": 3.6048282566429033, + "grad_norm": 0.0793289989233017, + "learning_rate": 2.1954183356586706e-05, + "loss": 0.2307, + "step": 44498 + }, + { + "epoch": 3.6049092676604015, + "grad_norm": 0.066397525370121, + "learning_rate": 2.194968270399208e-05, + "loss": 0.2205, + "step": 44499 + }, + { + "epoch": 3.6049902786779002, + "grad_norm": 0.07063345611095428, + "learning_rate": 2.1945182051397453e-05, + "loss": 0.2415, + "step": 44500 + }, + { + "epoch": 3.6050712896953985, + "grad_norm": 0.07360769063234329, + "learning_rate": 2.194068139880283e-05, + "loss": 0.2323, + "step": 44501 + 
}, + { + "epoch": 3.6051523007128967, + "grad_norm": 0.0656171664595604, + "learning_rate": 2.19361807462082e-05, + "loss": 0.2111, + "step": 44502 + }, + { + "epoch": 3.6052333117303954, + "grad_norm": 0.08094649016857147, + "learning_rate": 2.1931680093613574e-05, + "loss": 0.223, + "step": 44503 + }, + { + "epoch": 3.6053143227478937, + "grad_norm": 0.0766524225473404, + "learning_rate": 2.192717944101895e-05, + "loss": 0.2723, + "step": 44504 + }, + { + "epoch": 3.605395333765392, + "grad_norm": 0.057995811104774475, + "learning_rate": 2.192267878842432e-05, + "loss": 0.2477, + "step": 44505 + }, + { + "epoch": 3.6054763447828906, + "grad_norm": 0.07008778303861618, + "learning_rate": 2.1918178135829695e-05, + "loss": 0.2284, + "step": 44506 + }, + { + "epoch": 3.605557355800389, + "grad_norm": 0.06211728975176811, + "learning_rate": 2.191367748323507e-05, + "loss": 0.2335, + "step": 44507 + }, + { + "epoch": 3.605638366817887, + "grad_norm": 0.08289367705583572, + "learning_rate": 2.1909176830640442e-05, + "loss": 0.2493, + "step": 44508 + }, + { + "epoch": 3.6057193778353858, + "grad_norm": 0.06763249635696411, + "learning_rate": 2.190467617804582e-05, + "loss": 0.194, + "step": 44509 + }, + { + "epoch": 3.605800388852884, + "grad_norm": 0.07699555903673172, + "learning_rate": 2.1900175525451192e-05, + "loss": 0.2236, + "step": 44510 + }, + { + "epoch": 3.6058813998703823, + "grad_norm": 0.07511116564273834, + "learning_rate": 2.1895674872856563e-05, + "loss": 0.2775, + "step": 44511 + }, + { + "epoch": 3.605962410887881, + "grad_norm": 0.07855696231126785, + "learning_rate": 2.189117422026194e-05, + "loss": 0.2078, + "step": 44512 + }, + { + "epoch": 3.606043421905379, + "grad_norm": 0.07785475254058838, + "learning_rate": 2.1886673567667313e-05, + "loss": 0.2025, + "step": 44513 + }, + { + "epoch": 3.6061244329228774, + "grad_norm": 0.050321418792009354, + "learning_rate": 2.1882172915072687e-05, + "loss": 0.1905, + "step": 44514 + }, + { + "epoch": 3.606205443940376, + "grad_norm": 0.07907307147979736, + "learning_rate": 2.187767226247806e-05, + "loss": 0.2162, + "step": 44515 + }, + { + "epoch": 3.6062864549578744, + "grad_norm": 0.08303670585155487, + "learning_rate": 2.1873171609883434e-05, + "loss": 0.2084, + "step": 44516 + }, + { + "epoch": 3.6063674659753726, + "grad_norm": 0.07435812801122665, + "learning_rate": 2.1868670957288808e-05, + "loss": 0.2426, + "step": 44517 + }, + { + "epoch": 3.6064484769928713, + "grad_norm": 0.06999189406633377, + "learning_rate": 2.186417030469418e-05, + "loss": 0.2199, + "step": 44518 + }, + { + "epoch": 3.6065294880103695, + "grad_norm": 0.07246110588312149, + "learning_rate": 2.1859669652099555e-05, + "loss": 0.2025, + "step": 44519 + }, + { + "epoch": 3.606610499027868, + "grad_norm": 0.06282434612512589, + "learning_rate": 2.185516899950493e-05, + "loss": 0.1905, + "step": 44520 + }, + { + "epoch": 3.606691510045366, + "grad_norm": 0.08029942214488983, + "learning_rate": 2.1850668346910302e-05, + "loss": 0.248, + "step": 44521 + }, + { + "epoch": 3.6067725210628643, + "grad_norm": 0.06799287348985672, + "learning_rate": 2.184616769431568e-05, + "loss": 0.2039, + "step": 44522 + }, + { + "epoch": 3.606853532080363, + "grad_norm": 0.06617872416973114, + "learning_rate": 2.184166704172105e-05, + "loss": 0.2185, + "step": 44523 + }, + { + "epoch": 3.606934543097861, + "grad_norm": 0.06510534137487411, + "learning_rate": 2.1837166389126423e-05, + "loss": 0.2132, + "step": 44524 + }, + { + "epoch": 3.6070155541153595, + "grad_norm": 
0.08161192387342453, + "learning_rate": 2.18326657365318e-05, + "loss": 0.2299, + "step": 44525 + }, + { + "epoch": 3.607096565132858, + "grad_norm": 0.0706726610660553, + "learning_rate": 2.182816508393717e-05, + "loss": 0.2386, + "step": 44526 + }, + { + "epoch": 3.6071775761503564, + "grad_norm": 0.08002948760986328, + "learning_rate": 2.1823664431342547e-05, + "loss": 0.2566, + "step": 44527 + }, + { + "epoch": 3.6072585871678546, + "grad_norm": 0.0903329849243164, + "learning_rate": 2.181916377874792e-05, + "loss": 0.2025, + "step": 44528 + }, + { + "epoch": 3.6073395981853533, + "grad_norm": 0.0838218405842781, + "learning_rate": 2.181466312615329e-05, + "loss": 0.2238, + "step": 44529 + }, + { + "epoch": 3.6074206092028516, + "grad_norm": 0.05963883921504021, + "learning_rate": 2.1810162473558668e-05, + "loss": 0.2197, + "step": 44530 + }, + { + "epoch": 3.60750162022035, + "grad_norm": 0.06153491139411926, + "learning_rate": 2.180566182096404e-05, + "loss": 0.2145, + "step": 44531 + }, + { + "epoch": 3.6075826312378485, + "grad_norm": 0.09146939218044281, + "learning_rate": 2.1801161168369412e-05, + "loss": 0.1886, + "step": 44532 + }, + { + "epoch": 3.6076636422553467, + "grad_norm": 0.0709371566772461, + "learning_rate": 2.179666051577479e-05, + "loss": 0.2091, + "step": 44533 + }, + { + "epoch": 3.607744653272845, + "grad_norm": 0.0718485489487648, + "learning_rate": 2.1792159863180162e-05, + "loss": 0.1836, + "step": 44534 + }, + { + "epoch": 3.6078256642903437, + "grad_norm": 0.07745910435914993, + "learning_rate": 2.1787659210585536e-05, + "loss": 0.2554, + "step": 44535 + }, + { + "epoch": 3.607906675307842, + "grad_norm": 0.0641883909702301, + "learning_rate": 2.178315855799091e-05, + "loss": 0.2313, + "step": 44536 + }, + { + "epoch": 3.60798768632534, + "grad_norm": 0.07381313294172287, + "learning_rate": 2.1778657905396283e-05, + "loss": 0.2247, + "step": 44537 + }, + { + "epoch": 3.608068697342839, + "grad_norm": 0.08087126165628433, + "learning_rate": 2.1774157252801657e-05, + "loss": 0.262, + "step": 44538 + }, + { + "epoch": 3.608149708360337, + "grad_norm": 0.08404241502285004, + "learning_rate": 2.176965660020703e-05, + "loss": 0.2329, + "step": 44539 + }, + { + "epoch": 3.6082307193778353, + "grad_norm": 0.06949184834957123, + "learning_rate": 2.1765155947612407e-05, + "loss": 0.2318, + "step": 44540 + }, + { + "epoch": 3.608311730395334, + "grad_norm": 0.05711599066853523, + "learning_rate": 2.1760655295017778e-05, + "loss": 0.2243, + "step": 44541 + }, + { + "epoch": 3.6083927414128323, + "grad_norm": 0.06719649583101273, + "learning_rate": 2.175615464242315e-05, + "loss": 0.2183, + "step": 44542 + }, + { + "epoch": 3.6084737524303305, + "grad_norm": 0.07708943635225296, + "learning_rate": 2.1751653989828528e-05, + "loss": 0.2206, + "step": 44543 + }, + { + "epoch": 3.6085547634478288, + "grad_norm": 0.06825980544090271, + "learning_rate": 2.17471533372339e-05, + "loss": 0.2231, + "step": 44544 + }, + { + "epoch": 3.608635774465327, + "grad_norm": 0.06791453808546066, + "learning_rate": 2.1742652684639272e-05, + "loss": 0.2441, + "step": 44545 + }, + { + "epoch": 3.6087167854828257, + "grad_norm": 0.06769877672195435, + "learning_rate": 2.173815203204465e-05, + "loss": 0.2064, + "step": 44546 + }, + { + "epoch": 3.608797796500324, + "grad_norm": 0.091277115046978, + "learning_rate": 2.173365137945002e-05, + "loss": 0.2503, + "step": 44547 + }, + { + "epoch": 3.608878807517822, + "grad_norm": 0.08346903324127197, + "learning_rate": 2.1729150726855396e-05, + 
"loss": 0.2338, + "step": 44548 + }, + { + "epoch": 3.608959818535321, + "grad_norm": 0.07855582237243652, + "learning_rate": 2.172465007426077e-05, + "loss": 0.2353, + "step": 44549 + }, + { + "epoch": 3.609040829552819, + "grad_norm": 0.07885964959859848, + "learning_rate": 2.172014942166614e-05, + "loss": 0.2253, + "step": 44550 + }, + { + "epoch": 3.6091218405703174, + "grad_norm": 0.08205586671829224, + "learning_rate": 2.1715648769071517e-05, + "loss": 0.2436, + "step": 44551 + }, + { + "epoch": 3.609202851587816, + "grad_norm": 0.07758751511573792, + "learning_rate": 2.171114811647689e-05, + "loss": 0.2608, + "step": 44552 + }, + { + "epoch": 3.6092838626053143, + "grad_norm": 0.08244801312685013, + "learning_rate": 2.1706647463882264e-05, + "loss": 0.2469, + "step": 44553 + }, + { + "epoch": 3.6093648736228126, + "grad_norm": 0.07393813878297806, + "learning_rate": 2.1702146811287638e-05, + "loss": 0.2355, + "step": 44554 + }, + { + "epoch": 3.6094458846403112, + "grad_norm": 0.07184799015522003, + "learning_rate": 2.169764615869301e-05, + "loss": 0.2044, + "step": 44555 + }, + { + "epoch": 3.6095268956578095, + "grad_norm": 0.08496291935443878, + "learning_rate": 2.1693145506098385e-05, + "loss": 0.2218, + "step": 44556 + }, + { + "epoch": 3.6096079066753077, + "grad_norm": 0.06793703883886337, + "learning_rate": 2.168864485350376e-05, + "loss": 0.2193, + "step": 44557 + }, + { + "epoch": 3.6096889176928064, + "grad_norm": 0.07540687918663025, + "learning_rate": 2.1684144200909132e-05, + "loss": 0.2394, + "step": 44558 + }, + { + "epoch": 3.6097699287103047, + "grad_norm": 0.08064840734004974, + "learning_rate": 2.1679643548314506e-05, + "loss": 0.2562, + "step": 44559 + }, + { + "epoch": 3.609850939727803, + "grad_norm": 0.07052040100097656, + "learning_rate": 2.167514289571988e-05, + "loss": 0.2064, + "step": 44560 + }, + { + "epoch": 3.6099319507453016, + "grad_norm": 0.07243770360946655, + "learning_rate": 2.1670642243125256e-05, + "loss": 0.2232, + "step": 44561 + }, + { + "epoch": 3.6100129617628, + "grad_norm": 0.08791134506464005, + "learning_rate": 2.1666141590530627e-05, + "loss": 0.2072, + "step": 44562 + }, + { + "epoch": 3.610093972780298, + "grad_norm": 0.07799384742975235, + "learning_rate": 2.1661640937936e-05, + "loss": 0.2253, + "step": 44563 + }, + { + "epoch": 3.6101749837977968, + "grad_norm": 0.07295829802751541, + "learning_rate": 2.1657140285341377e-05, + "loss": 0.2282, + "step": 44564 + }, + { + "epoch": 3.610255994815295, + "grad_norm": 0.06831017136573792, + "learning_rate": 2.1652639632746747e-05, + "loss": 0.1993, + "step": 44565 + }, + { + "epoch": 3.6103370058327933, + "grad_norm": 0.07613605260848999, + "learning_rate": 2.164813898015212e-05, + "loss": 0.1975, + "step": 44566 + }, + { + "epoch": 3.6104180168502915, + "grad_norm": 0.06792711466550827, + "learning_rate": 2.1643638327557498e-05, + "loss": 0.2095, + "step": 44567 + }, + { + "epoch": 3.6104990278677898, + "grad_norm": 0.06990785151720047, + "learning_rate": 2.163913767496287e-05, + "loss": 0.2229, + "step": 44568 + }, + { + "epoch": 3.6105800388852884, + "grad_norm": 0.06692981719970703, + "learning_rate": 2.1634637022368245e-05, + "loss": 0.194, + "step": 44569 + }, + { + "epoch": 3.6106610499027867, + "grad_norm": 0.061009567230939865, + "learning_rate": 2.163013636977362e-05, + "loss": 0.2055, + "step": 44570 + }, + { + "epoch": 3.610742060920285, + "grad_norm": 0.08093198388814926, + "learning_rate": 2.1625635717178992e-05, + "loss": 0.2446, + "step": 44571 + }, + { + "epoch": 
3.6108230719377836, + "grad_norm": 0.07283247262239456, + "learning_rate": 2.1621135064584366e-05, + "loss": 0.2425, + "step": 44572 + }, + { + "epoch": 3.610904082955282, + "grad_norm": 0.07472264766693115, + "learning_rate": 2.161663441198974e-05, + "loss": 0.2406, + "step": 44573 + }, + { + "epoch": 3.61098509397278, + "grad_norm": 0.06830692291259766, + "learning_rate": 2.1612133759395113e-05, + "loss": 0.2285, + "step": 44574 + }, + { + "epoch": 3.611066104990279, + "grad_norm": 0.07120934873819351, + "learning_rate": 2.1607633106800487e-05, + "loss": 0.2393, + "step": 44575 + }, + { + "epoch": 3.611147116007777, + "grad_norm": 0.07623013108968735, + "learning_rate": 2.160313245420586e-05, + "loss": 0.2172, + "step": 44576 + }, + { + "epoch": 3.6112281270252753, + "grad_norm": 0.06323181092739105, + "learning_rate": 2.1598631801611234e-05, + "loss": 0.2188, + "step": 44577 + }, + { + "epoch": 3.611309138042774, + "grad_norm": 0.07442563772201538, + "learning_rate": 2.1594131149016608e-05, + "loss": 0.2233, + "step": 44578 + }, + { + "epoch": 3.6113901490602722, + "grad_norm": 0.07695084065198898, + "learning_rate": 2.158963049642198e-05, + "loss": 0.2235, + "step": 44579 + }, + { + "epoch": 3.6114711600777705, + "grad_norm": 0.04874782636761665, + "learning_rate": 2.1585129843827355e-05, + "loss": 0.2131, + "step": 44580 + }, + { + "epoch": 3.611552171095269, + "grad_norm": 0.05301962047815323, + "learning_rate": 2.158062919123273e-05, + "loss": 0.229, + "step": 44581 + }, + { + "epoch": 3.6116331821127674, + "grad_norm": 0.07020232826471329, + "learning_rate": 2.1576128538638105e-05, + "loss": 0.224, + "step": 44582 + }, + { + "epoch": 3.6117141931302656, + "grad_norm": 0.06658996641635895, + "learning_rate": 2.1571627886043476e-05, + "loss": 0.2266, + "step": 44583 + }, + { + "epoch": 3.6117952041477643, + "grad_norm": 0.059480443596839905, + "learning_rate": 2.156712723344885e-05, + "loss": 0.1787, + "step": 44584 + }, + { + "epoch": 3.6118762151652626, + "grad_norm": 0.06132667139172554, + "learning_rate": 2.1562626580854226e-05, + "loss": 0.2423, + "step": 44585 + }, + { + "epoch": 3.611957226182761, + "grad_norm": 0.06131211668252945, + "learning_rate": 2.15581259282596e-05, + "loss": 0.2182, + "step": 44586 + }, + { + "epoch": 3.612038237200259, + "grad_norm": 0.07730801403522491, + "learning_rate": 2.1553625275664973e-05, + "loss": 0.2484, + "step": 44587 + }, + { + "epoch": 3.6121192482177578, + "grad_norm": 0.07488203793764114, + "learning_rate": 2.1549124623070347e-05, + "loss": 0.2381, + "step": 44588 + }, + { + "epoch": 3.612200259235256, + "grad_norm": 0.06727041304111481, + "learning_rate": 2.154462397047572e-05, + "loss": 0.1783, + "step": 44589 + }, + { + "epoch": 3.6122812702527543, + "grad_norm": 0.084172323346138, + "learning_rate": 2.1540123317881094e-05, + "loss": 0.2574, + "step": 44590 + }, + { + "epoch": 3.6123622812702525, + "grad_norm": 0.07354126125574112, + "learning_rate": 2.1535622665286468e-05, + "loss": 0.2057, + "step": 44591 + }, + { + "epoch": 3.612443292287751, + "grad_norm": 0.08954668790102005, + "learning_rate": 2.153112201269184e-05, + "loss": 0.2535, + "step": 44592 + }, + { + "epoch": 3.6125243033052494, + "grad_norm": 0.07390826940536499, + "learning_rate": 2.1526621360097215e-05, + "loss": 0.1942, + "step": 44593 + }, + { + "epoch": 3.6126053143227477, + "grad_norm": 0.07131165266036987, + "learning_rate": 2.152212070750259e-05, + "loss": 0.2369, + "step": 44594 + }, + { + "epoch": 3.6126863253402464, + "grad_norm": 0.07621818780899048, + 
"learning_rate": 2.1517620054907962e-05, + "loss": 0.2525, + "step": 44595 + }, + { + "epoch": 3.6127673363577446, + "grad_norm": 0.06234192103147507, + "learning_rate": 2.1513119402313336e-05, + "loss": 0.2274, + "step": 44596 + }, + { + "epoch": 3.612848347375243, + "grad_norm": 0.0673263743519783, + "learning_rate": 2.150861874971871e-05, + "loss": 0.2239, + "step": 44597 + }, + { + "epoch": 3.6129293583927415, + "grad_norm": 0.08373507857322693, + "learning_rate": 2.1504118097124083e-05, + "loss": 0.2688, + "step": 44598 + }, + { + "epoch": 3.61301036941024, + "grad_norm": 0.0701875388622284, + "learning_rate": 2.1499617444529457e-05, + "loss": 0.2286, + "step": 44599 + }, + { + "epoch": 3.613091380427738, + "grad_norm": 0.0799219161272049, + "learning_rate": 2.1495116791934834e-05, + "loss": 0.2227, + "step": 44600 + }, + { + "epoch": 3.6131723914452367, + "grad_norm": 0.07932981103658676, + "learning_rate": 2.1490616139340204e-05, + "loss": 0.2621, + "step": 44601 + }, + { + "epoch": 3.613253402462735, + "grad_norm": 0.07419607788324356, + "learning_rate": 2.1486115486745578e-05, + "loss": 0.2149, + "step": 44602 + }, + { + "epoch": 3.613334413480233, + "grad_norm": 0.0710701197385788, + "learning_rate": 2.1481614834150955e-05, + "loss": 0.2112, + "step": 44603 + }, + { + "epoch": 3.613415424497732, + "grad_norm": 0.0597371831536293, + "learning_rate": 2.1477114181556328e-05, + "loss": 0.1734, + "step": 44604 + }, + { + "epoch": 3.61349643551523, + "grad_norm": 0.07174075394868851, + "learning_rate": 2.14726135289617e-05, + "loss": 0.1975, + "step": 44605 + }, + { + "epoch": 3.6135774465327284, + "grad_norm": 0.06435782462358475, + "learning_rate": 2.1468112876367075e-05, + "loss": 0.2274, + "step": 44606 + }, + { + "epoch": 3.613658457550227, + "grad_norm": 0.09974440187215805, + "learning_rate": 2.146361222377245e-05, + "loss": 0.3172, + "step": 44607 + }, + { + "epoch": 3.6137394685677253, + "grad_norm": 0.07410252839326859, + "learning_rate": 2.1459111571177823e-05, + "loss": 0.2305, + "step": 44608 + }, + { + "epoch": 3.6138204795852236, + "grad_norm": 0.06656160205602646, + "learning_rate": 2.1454610918583196e-05, + "loss": 0.2305, + "step": 44609 + }, + { + "epoch": 3.613901490602722, + "grad_norm": 0.07200726866722107, + "learning_rate": 2.145011026598857e-05, + "loss": 0.2414, + "step": 44610 + }, + { + "epoch": 3.6139825016202205, + "grad_norm": 0.0850789025425911, + "learning_rate": 2.1445609613393943e-05, + "loss": 0.2371, + "step": 44611 + }, + { + "epoch": 3.6140635126377187, + "grad_norm": 0.08625289052724838, + "learning_rate": 2.1441108960799317e-05, + "loss": 0.2245, + "step": 44612 + }, + { + "epoch": 3.614144523655217, + "grad_norm": 0.07585910707712173, + "learning_rate": 2.143660830820469e-05, + "loss": 0.2979, + "step": 44613 + }, + { + "epoch": 3.6142255346727152, + "grad_norm": 0.07107369601726532, + "learning_rate": 2.1432107655610064e-05, + "loss": 0.1882, + "step": 44614 + }, + { + "epoch": 3.614306545690214, + "grad_norm": 0.07323051989078522, + "learning_rate": 2.1427607003015438e-05, + "loss": 0.2188, + "step": 44615 + }, + { + "epoch": 3.614387556707712, + "grad_norm": 0.07132337987422943, + "learning_rate": 2.142310635042081e-05, + "loss": 0.2036, + "step": 44616 + }, + { + "epoch": 3.6144685677252104, + "grad_norm": 0.07494372874498367, + "learning_rate": 2.1418605697826185e-05, + "loss": 0.2321, + "step": 44617 + }, + { + "epoch": 3.614549578742709, + "grad_norm": 0.08323635160923004, + "learning_rate": 2.141410504523156e-05, + "loss": 0.2297, + 
"step": 44618 + }, + { + "epoch": 3.6146305897602073, + "grad_norm": 0.061273686587810516, + "learning_rate": 2.1409604392636932e-05, + "loss": 0.2212, + "step": 44619 + }, + { + "epoch": 3.6147116007777056, + "grad_norm": 0.09287270903587341, + "learning_rate": 2.1405103740042306e-05, + "loss": 0.2586, + "step": 44620 + }, + { + "epoch": 3.6147926117952043, + "grad_norm": 0.07976991683244705, + "learning_rate": 2.1400603087447683e-05, + "loss": 0.2058, + "step": 44621 + }, + { + "epoch": 3.6148736228127025, + "grad_norm": 0.06702376157045364, + "learning_rate": 2.1396102434853056e-05, + "loss": 0.2451, + "step": 44622 + }, + { + "epoch": 3.6149546338302008, + "grad_norm": 0.08406377583742142, + "learning_rate": 2.1391601782258427e-05, + "loss": 0.2201, + "step": 44623 + }, + { + "epoch": 3.6150356448476995, + "grad_norm": 0.06391263008117676, + "learning_rate": 2.1387101129663804e-05, + "loss": 0.2116, + "step": 44624 + }, + { + "epoch": 3.6151166558651977, + "grad_norm": 0.07209109514951706, + "learning_rate": 2.1382600477069177e-05, + "loss": 0.2178, + "step": 44625 + }, + { + "epoch": 3.615197666882696, + "grad_norm": 0.05859759449958801, + "learning_rate": 2.137809982447455e-05, + "loss": 0.1891, + "step": 44626 + }, + { + "epoch": 3.6152786779001946, + "grad_norm": 0.06623557209968567, + "learning_rate": 2.1373599171879924e-05, + "loss": 0.2343, + "step": 44627 + }, + { + "epoch": 3.615359688917693, + "grad_norm": 0.08955825120210648, + "learning_rate": 2.1369098519285298e-05, + "loss": 0.269, + "step": 44628 + }, + { + "epoch": 3.615440699935191, + "grad_norm": 0.08716186881065369, + "learning_rate": 2.136459786669067e-05, + "loss": 0.2382, + "step": 44629 + }, + { + "epoch": 3.61552171095269, + "grad_norm": 0.06881950795650482, + "learning_rate": 2.1360097214096045e-05, + "loss": 0.2266, + "step": 44630 + }, + { + "epoch": 3.615602721970188, + "grad_norm": 0.06224439665675163, + "learning_rate": 2.135559656150142e-05, + "loss": 0.1997, + "step": 44631 + }, + { + "epoch": 3.6156837329876863, + "grad_norm": 0.06367988884449005, + "learning_rate": 2.1351095908906792e-05, + "loss": 0.2104, + "step": 44632 + }, + { + "epoch": 3.6157647440051845, + "grad_norm": 0.07206551730632782, + "learning_rate": 2.1346595256312166e-05, + "loss": 0.2131, + "step": 44633 + }, + { + "epoch": 3.6158457550226832, + "grad_norm": 0.08080728352069855, + "learning_rate": 2.134209460371754e-05, + "loss": 0.2064, + "step": 44634 + }, + { + "epoch": 3.6159267660401815, + "grad_norm": 0.09997903555631638, + "learning_rate": 2.1337593951122913e-05, + "loss": 0.2489, + "step": 44635 + }, + { + "epoch": 3.6160077770576797, + "grad_norm": 0.07835156470537186, + "learning_rate": 2.1333093298528287e-05, + "loss": 0.2242, + "step": 44636 + }, + { + "epoch": 3.616088788075178, + "grad_norm": 0.06274702399969101, + "learning_rate": 2.1328592645933664e-05, + "loss": 0.243, + "step": 44637 + }, + { + "epoch": 3.6161697990926767, + "grad_norm": 0.08270495384931564, + "learning_rate": 2.1324091993339034e-05, + "loss": 0.2314, + "step": 44638 + }, + { + "epoch": 3.616250810110175, + "grad_norm": 0.0683775395154953, + "learning_rate": 2.131959134074441e-05, + "loss": 0.1944, + "step": 44639 + }, + { + "epoch": 3.616331821127673, + "grad_norm": 0.08411401510238647, + "learning_rate": 2.1315090688149785e-05, + "loss": 0.2662, + "step": 44640 + }, + { + "epoch": 3.616412832145172, + "grad_norm": 0.07783766090869904, + "learning_rate": 2.1310590035555155e-05, + "loss": 0.239, + "step": 44641 + }, + { + "epoch": 3.61649384316267, + 
"grad_norm": 0.06915144622325897, + "learning_rate": 2.1306089382960532e-05, + "loss": 0.2263, + "step": 44642 + }, + { + "epoch": 3.6165748541801683, + "grad_norm": 0.06283178925514221, + "learning_rate": 2.1301588730365905e-05, + "loss": 0.2191, + "step": 44643 + }, + { + "epoch": 3.616655865197667, + "grad_norm": 0.07214689999818802, + "learning_rate": 2.1297088077771276e-05, + "loss": 0.2167, + "step": 44644 + }, + { + "epoch": 3.6167368762151653, + "grad_norm": 0.07811303436756134, + "learning_rate": 2.1292587425176653e-05, + "loss": 0.228, + "step": 44645 + }, + { + "epoch": 3.6168178872326635, + "grad_norm": 0.06575002521276474, + "learning_rate": 2.1288086772582026e-05, + "loss": 0.2228, + "step": 44646 + }, + { + "epoch": 3.616898898250162, + "grad_norm": 0.0618891641497612, + "learning_rate": 2.12835861199874e-05, + "loss": 0.2402, + "step": 44647 + }, + { + "epoch": 3.6169799092676604, + "grad_norm": 0.06803794205188751, + "learning_rate": 2.1279085467392773e-05, + "loss": 0.2241, + "step": 44648 + }, + { + "epoch": 3.6170609202851587, + "grad_norm": 0.09507738053798676, + "learning_rate": 2.1274584814798147e-05, + "loss": 0.2172, + "step": 44649 + }, + { + "epoch": 3.6171419313026574, + "grad_norm": 0.07402309775352478, + "learning_rate": 2.127008416220352e-05, + "loss": 0.2451, + "step": 44650 + }, + { + "epoch": 3.6172229423201556, + "grad_norm": 0.0824049636721611, + "learning_rate": 2.1265583509608894e-05, + "loss": 0.2152, + "step": 44651 + }, + { + "epoch": 3.617303953337654, + "grad_norm": 0.0745280459523201, + "learning_rate": 2.1261082857014268e-05, + "loss": 0.2252, + "step": 44652 + }, + { + "epoch": 3.6173849643551526, + "grad_norm": 0.10456760972738266, + "learning_rate": 2.125658220441964e-05, + "loss": 0.2137, + "step": 44653 + }, + { + "epoch": 3.617465975372651, + "grad_norm": 0.08100705593824387, + "learning_rate": 2.1252081551825015e-05, + "loss": 0.246, + "step": 44654 + }, + { + "epoch": 3.617546986390149, + "grad_norm": 0.07619575411081314, + "learning_rate": 2.1247580899230392e-05, + "loss": 0.2351, + "step": 44655 + }, + { + "epoch": 3.6176279974076473, + "grad_norm": 0.06565256416797638, + "learning_rate": 2.1243080246635762e-05, + "loss": 0.2236, + "step": 44656 + }, + { + "epoch": 3.617709008425146, + "grad_norm": 0.07464781403541565, + "learning_rate": 2.1238579594041136e-05, + "loss": 0.2282, + "step": 44657 + }, + { + "epoch": 3.6177900194426442, + "grad_norm": 0.07250156253576279, + "learning_rate": 2.1234078941446513e-05, + "loss": 0.2558, + "step": 44658 + }, + { + "epoch": 3.6178710304601425, + "grad_norm": 0.07820381969213486, + "learning_rate": 2.1229578288851883e-05, + "loss": 0.2276, + "step": 44659 + }, + { + "epoch": 3.6179520414776407, + "grad_norm": 0.07306982576847076, + "learning_rate": 2.122507763625726e-05, + "loss": 0.2289, + "step": 44660 + }, + { + "epoch": 3.6180330524951394, + "grad_norm": 0.08155608177185059, + "learning_rate": 2.1220576983662634e-05, + "loss": 0.2199, + "step": 44661 + }, + { + "epoch": 3.6181140635126376, + "grad_norm": 0.062416404485702515, + "learning_rate": 2.1216076331068004e-05, + "loss": 0.2089, + "step": 44662 + }, + { + "epoch": 3.618195074530136, + "grad_norm": 0.09483791142702103, + "learning_rate": 2.121157567847338e-05, + "loss": 0.2483, + "step": 44663 + }, + { + "epoch": 3.6182760855476346, + "grad_norm": 0.07709171622991562, + "learning_rate": 2.1207075025878754e-05, + "loss": 0.1957, + "step": 44664 + }, + { + "epoch": 3.618357096565133, + "grad_norm": 0.06335264444351196, + "learning_rate": 
2.1202574373284125e-05, + "loss": 0.2168, + "step": 44665 + }, + { + "epoch": 3.618438107582631, + "grad_norm": 0.07865205407142639, + "learning_rate": 2.11980737206895e-05, + "loss": 0.2134, + "step": 44666 + }, + { + "epoch": 3.6185191186001298, + "grad_norm": 0.09624006599187851, + "learning_rate": 2.1193573068094875e-05, + "loss": 0.2437, + "step": 44667 + }, + { + "epoch": 3.618600129617628, + "grad_norm": 0.07597095519304276, + "learning_rate": 2.118907241550025e-05, + "loss": 0.2212, + "step": 44668 + }, + { + "epoch": 3.6186811406351262, + "grad_norm": 0.06700621545314789, + "learning_rate": 2.1184571762905622e-05, + "loss": 0.2013, + "step": 44669 + }, + { + "epoch": 3.618762151652625, + "grad_norm": 0.07764141261577606, + "learning_rate": 2.1180071110310996e-05, + "loss": 0.2652, + "step": 44670 + }, + { + "epoch": 3.618843162670123, + "grad_norm": 0.08569448441267014, + "learning_rate": 2.117557045771637e-05, + "loss": 0.2149, + "step": 44671 + }, + { + "epoch": 3.6189241736876214, + "grad_norm": 0.08548179268836975, + "learning_rate": 2.1171069805121743e-05, + "loss": 0.2404, + "step": 44672 + }, + { + "epoch": 3.61900518470512, + "grad_norm": 0.06488697230815887, + "learning_rate": 2.116656915252712e-05, + "loss": 0.2117, + "step": 44673 + }, + { + "epoch": 3.6190861957226184, + "grad_norm": 0.06554609537124634, + "learning_rate": 2.116206849993249e-05, + "loss": 0.2499, + "step": 44674 + }, + { + "epoch": 3.6191672067401166, + "grad_norm": 0.05791258439421654, + "learning_rate": 2.1157567847337864e-05, + "loss": 0.2058, + "step": 44675 + }, + { + "epoch": 3.6192482177576153, + "grad_norm": 0.06734529137611389, + "learning_rate": 2.115306719474324e-05, + "loss": 0.2256, + "step": 44676 + }, + { + "epoch": 3.6193292287751135, + "grad_norm": 0.06773965060710907, + "learning_rate": 2.114856654214861e-05, + "loss": 0.2161, + "step": 44677 + }, + { + "epoch": 3.619410239792612, + "grad_norm": 0.06876647472381592, + "learning_rate": 2.1144065889553985e-05, + "loss": 0.2414, + "step": 44678 + }, + { + "epoch": 3.61949125081011, + "grad_norm": 0.06989677995443344, + "learning_rate": 2.1139565236959362e-05, + "loss": 0.2746, + "step": 44679 + }, + { + "epoch": 3.6195722618276087, + "grad_norm": 0.06485997885465622, + "learning_rate": 2.1135064584364732e-05, + "loss": 0.1826, + "step": 44680 + }, + { + "epoch": 3.619653272845107, + "grad_norm": 0.0713554248213768, + "learning_rate": 2.113056393177011e-05, + "loss": 0.2168, + "step": 44681 + }, + { + "epoch": 3.619734283862605, + "grad_norm": 0.09202663600444794, + "learning_rate": 2.1126063279175483e-05, + "loss": 0.2209, + "step": 44682 + }, + { + "epoch": 3.6198152948801035, + "grad_norm": 0.06724567711353302, + "learning_rate": 2.1121562626580853e-05, + "loss": 0.1997, + "step": 44683 + }, + { + "epoch": 3.619896305897602, + "grad_norm": 0.0922047421336174, + "learning_rate": 2.111706197398623e-05, + "loss": 0.2221, + "step": 44684 + }, + { + "epoch": 3.6199773169151004, + "grad_norm": 0.053619880229234695, + "learning_rate": 2.1112561321391604e-05, + "loss": 0.2036, + "step": 44685 + }, + { + "epoch": 3.6200583279325986, + "grad_norm": 0.07134517282247543, + "learning_rate": 2.1108060668796977e-05, + "loss": 0.2364, + "step": 44686 + }, + { + "epoch": 3.6201393389500973, + "grad_norm": 0.07642362266778946, + "learning_rate": 2.110356001620235e-05, + "loss": 0.2016, + "step": 44687 + }, + { + "epoch": 3.6202203499675956, + "grad_norm": 0.07684290409088135, + "learning_rate": 2.1099059363607724e-05, + "loss": 0.2285, + "step": 44688 + 
}, + { + "epoch": 3.620301360985094, + "grad_norm": 0.08038019388914108, + "learning_rate": 2.1094558711013098e-05, + "loss": 0.241, + "step": 44689 + }, + { + "epoch": 3.6203823720025925, + "grad_norm": 0.06323223561048508, + "learning_rate": 2.109005805841847e-05, + "loss": 0.1871, + "step": 44690 + }, + { + "epoch": 3.6204633830200907, + "grad_norm": 0.053566351532936096, + "learning_rate": 2.1085557405823845e-05, + "loss": 0.1933, + "step": 44691 + }, + { + "epoch": 3.620544394037589, + "grad_norm": 0.0744895339012146, + "learning_rate": 2.108105675322922e-05, + "loss": 0.2402, + "step": 44692 + }, + { + "epoch": 3.6206254050550877, + "grad_norm": 0.07990087568759918, + "learning_rate": 2.1076556100634592e-05, + "loss": 0.2582, + "step": 44693 + }, + { + "epoch": 3.620706416072586, + "grad_norm": 0.06844060868024826, + "learning_rate": 2.107205544803997e-05, + "loss": 0.2262, + "step": 44694 + }, + { + "epoch": 3.620787427090084, + "grad_norm": 0.058736447244882584, + "learning_rate": 2.106755479544534e-05, + "loss": 0.1874, + "step": 44695 + }, + { + "epoch": 3.620868438107583, + "grad_norm": 0.06610717624425888, + "learning_rate": 2.1063054142850713e-05, + "loss": 0.1953, + "step": 44696 + }, + { + "epoch": 3.620949449125081, + "grad_norm": 0.06829269230365753, + "learning_rate": 2.105855349025609e-05, + "loss": 0.219, + "step": 44697 + }, + { + "epoch": 3.6210304601425793, + "grad_norm": 0.07212626934051514, + "learning_rate": 2.105405283766146e-05, + "loss": 0.2185, + "step": 44698 + }, + { + "epoch": 3.621111471160078, + "grad_norm": 0.07561732083559036, + "learning_rate": 2.1049552185066837e-05, + "loss": 0.2205, + "step": 44699 + }, + { + "epoch": 3.6211924821775763, + "grad_norm": 0.0672653540968895, + "learning_rate": 2.104505153247221e-05, + "loss": 0.2368, + "step": 44700 + }, + { + "epoch": 3.6212734931950745, + "grad_norm": 0.07278675585985184, + "learning_rate": 2.104055087987758e-05, + "loss": 0.2466, + "step": 44701 + }, + { + "epoch": 3.6213545042125728, + "grad_norm": 0.07003212720155716, + "learning_rate": 2.1036050227282958e-05, + "loss": 0.2314, + "step": 44702 + }, + { + "epoch": 3.6214355152300715, + "grad_norm": 0.07009841501712799, + "learning_rate": 2.1031549574688332e-05, + "loss": 0.2049, + "step": 44703 + }, + { + "epoch": 3.6215165262475697, + "grad_norm": 0.07427027076482773, + "learning_rate": 2.1027048922093702e-05, + "loss": 0.2016, + "step": 44704 + }, + { + "epoch": 3.621597537265068, + "grad_norm": 0.06905297935009003, + "learning_rate": 2.102254826949908e-05, + "loss": 0.2347, + "step": 44705 + }, + { + "epoch": 3.621678548282566, + "grad_norm": 0.06712187081575394, + "learning_rate": 2.1018047616904453e-05, + "loss": 0.2052, + "step": 44706 + }, + { + "epoch": 3.621759559300065, + "grad_norm": 0.08001432567834854, + "learning_rate": 2.1013546964309826e-05, + "loss": 0.2733, + "step": 44707 + }, + { + "epoch": 3.621840570317563, + "grad_norm": 0.05815906822681427, + "learning_rate": 2.10090463117152e-05, + "loss": 0.1974, + "step": 44708 + }, + { + "epoch": 3.6219215813350614, + "grad_norm": 0.0719711035490036, + "learning_rate": 2.1004545659120573e-05, + "loss": 0.2166, + "step": 44709 + }, + { + "epoch": 3.62200259235256, + "grad_norm": 0.07093847543001175, + "learning_rate": 2.1000045006525947e-05, + "loss": 0.2192, + "step": 44710 + }, + { + "epoch": 3.6220836033700583, + "grad_norm": 0.07297228276729584, + "learning_rate": 2.099554435393132e-05, + "loss": 0.2077, + "step": 44711 + }, + { + "epoch": 3.6221646143875565, + "grad_norm": 
0.09079265594482422, + "learning_rate": 2.0991043701336698e-05, + "loss": 0.2513, + "step": 44712 + }, + { + "epoch": 3.6222456254050552, + "grad_norm": 0.07077782601118088, + "learning_rate": 2.0986543048742068e-05, + "loss": 0.2004, + "step": 44713 + }, + { + "epoch": 3.6223266364225535, + "grad_norm": 0.0672997534275055, + "learning_rate": 2.098204239614744e-05, + "loss": 0.2092, + "step": 44714 + }, + { + "epoch": 3.6224076474400517, + "grad_norm": 0.06267137825489044, + "learning_rate": 2.097754174355282e-05, + "loss": 0.1994, + "step": 44715 + }, + { + "epoch": 3.6224886584575504, + "grad_norm": 0.06340795010328293, + "learning_rate": 2.097304109095819e-05, + "loss": 0.2151, + "step": 44716 + }, + { + "epoch": 3.6225696694750487, + "grad_norm": 0.06249105557799339, + "learning_rate": 2.0968540438363562e-05, + "loss": 0.2197, + "step": 44717 + }, + { + "epoch": 3.622650680492547, + "grad_norm": 0.07884196192026138, + "learning_rate": 2.096403978576894e-05, + "loss": 0.2554, + "step": 44718 + }, + { + "epoch": 3.6227316915100456, + "grad_norm": 0.10559750348329544, + "learning_rate": 2.095953913317431e-05, + "loss": 0.2637, + "step": 44719 + }, + { + "epoch": 3.622812702527544, + "grad_norm": 0.06595143675804138, + "learning_rate": 2.0955038480579686e-05, + "loss": 0.1985, + "step": 44720 + }, + { + "epoch": 3.622893713545042, + "grad_norm": 0.0710291638970375, + "learning_rate": 2.095053782798506e-05, + "loss": 0.2555, + "step": 44721 + }, + { + "epoch": 3.6229747245625408, + "grad_norm": 0.07778099924325943, + "learning_rate": 2.094603717539043e-05, + "loss": 0.2483, + "step": 44722 + }, + { + "epoch": 3.623055735580039, + "grad_norm": 0.08554724603891373, + "learning_rate": 2.0941536522795807e-05, + "loss": 0.2449, + "step": 44723 + }, + { + "epoch": 3.6231367465975373, + "grad_norm": 0.08926980197429657, + "learning_rate": 2.093703587020118e-05, + "loss": 0.2237, + "step": 44724 + }, + { + "epoch": 3.6232177576150355, + "grad_norm": 0.0684860572218895, + "learning_rate": 2.0932535217606554e-05, + "loss": 0.2221, + "step": 44725 + }, + { + "epoch": 3.6232987686325338, + "grad_norm": 0.08324017375707626, + "learning_rate": 2.0928034565011928e-05, + "loss": 0.1966, + "step": 44726 + }, + { + "epoch": 3.6233797796500324, + "grad_norm": 0.07645716518163681, + "learning_rate": 2.09235339124173e-05, + "loss": 0.222, + "step": 44727 + }, + { + "epoch": 3.6234607906675307, + "grad_norm": 0.06555616855621338, + "learning_rate": 2.0919033259822675e-05, + "loss": 0.2671, + "step": 44728 + }, + { + "epoch": 3.623541801685029, + "grad_norm": 0.07931575179100037, + "learning_rate": 2.091453260722805e-05, + "loss": 0.2121, + "step": 44729 + }, + { + "epoch": 3.6236228127025276, + "grad_norm": 0.08958851546049118, + "learning_rate": 2.0910031954633422e-05, + "loss": 0.2072, + "step": 44730 + }, + { + "epoch": 3.623703823720026, + "grad_norm": 0.08067791163921356, + "learning_rate": 2.0905531302038796e-05, + "loss": 0.2278, + "step": 44731 + }, + { + "epoch": 3.623784834737524, + "grad_norm": 0.07220610231161118, + "learning_rate": 2.090103064944417e-05, + "loss": 0.2264, + "step": 44732 + }, + { + "epoch": 3.623865845755023, + "grad_norm": 0.0624765083193779, + "learning_rate": 2.0896529996849547e-05, + "loss": 0.1901, + "step": 44733 + }, + { + "epoch": 3.623946856772521, + "grad_norm": 0.08418891578912735, + "learning_rate": 2.0892029344254917e-05, + "loss": 0.2468, + "step": 44734 + }, + { + "epoch": 3.6240278677900193, + "grad_norm": 0.0655638724565506, + "learning_rate": 2.088752869166029e-05, 
+ "loss": 0.2259, + "step": 44735 + }, + { + "epoch": 3.624108878807518, + "grad_norm": 0.08135916292667389, + "learning_rate": 2.0883028039065667e-05, + "loss": 0.2149, + "step": 44736 + }, + { + "epoch": 3.624189889825016, + "grad_norm": 0.07866069674491882, + "learning_rate": 2.0878527386471038e-05, + "loss": 0.2547, + "step": 44737 + }, + { + "epoch": 3.6242709008425145, + "grad_norm": 0.062272604554891586, + "learning_rate": 2.087402673387641e-05, + "loss": 0.2237, + "step": 44738 + }, + { + "epoch": 3.624351911860013, + "grad_norm": 0.06917754560709, + "learning_rate": 2.0869526081281788e-05, + "loss": 0.2302, + "step": 44739 + }, + { + "epoch": 3.6244329228775114, + "grad_norm": 0.08508767187595367, + "learning_rate": 2.086502542868716e-05, + "loss": 0.2321, + "step": 44740 + }, + { + "epoch": 3.6245139338950096, + "grad_norm": 0.06781996786594391, + "learning_rate": 2.0860524776092535e-05, + "loss": 0.2626, + "step": 44741 + }, + { + "epoch": 3.6245949449125083, + "grad_norm": 0.07191802561283112, + "learning_rate": 2.085602412349791e-05, + "loss": 0.2429, + "step": 44742 + }, + { + "epoch": 3.6246759559300066, + "grad_norm": 0.06998195499181747, + "learning_rate": 2.085152347090328e-05, + "loss": 0.2048, + "step": 44743 + }, + { + "epoch": 3.624756966947505, + "grad_norm": 0.06424444913864136, + "learning_rate": 2.0847022818308656e-05, + "loss": 0.2005, + "step": 44744 + }, + { + "epoch": 3.6248379779650035, + "grad_norm": 0.06656464189291, + "learning_rate": 2.084252216571403e-05, + "loss": 0.2245, + "step": 44745 + }, + { + "epoch": 3.6249189889825018, + "grad_norm": 0.08617588877677917, + "learning_rate": 2.0838021513119403e-05, + "loss": 0.2295, + "step": 44746 + }, + { + "epoch": 3.625, + "grad_norm": 0.07731680572032928, + "learning_rate": 2.0833520860524777e-05, + "loss": 0.2244, + "step": 44747 + }, + { + "epoch": 3.6250810110174982, + "grad_norm": 0.06422511488199234, + "learning_rate": 2.082902020793015e-05, + "loss": 0.2516, + "step": 44748 + }, + { + "epoch": 3.6251620220349965, + "grad_norm": 0.08240256458520889, + "learning_rate": 2.0824519555335524e-05, + "loss": 0.2162, + "step": 44749 + }, + { + "epoch": 3.625243033052495, + "grad_norm": 0.0808911994099617, + "learning_rate": 2.0820018902740898e-05, + "loss": 0.2721, + "step": 44750 + }, + { + "epoch": 3.6253240440699934, + "grad_norm": 0.06711307168006897, + "learning_rate": 2.081551825014627e-05, + "loss": 0.2269, + "step": 44751 + }, + { + "epoch": 3.6254050550874917, + "grad_norm": 0.06413264572620392, + "learning_rate": 2.0811017597551645e-05, + "loss": 0.2177, + "step": 44752 + }, + { + "epoch": 3.6254860661049904, + "grad_norm": 0.06592356413602829, + "learning_rate": 2.080651694495702e-05, + "loss": 0.228, + "step": 44753 + }, + { + "epoch": 3.6255670771224886, + "grad_norm": 0.07257553189992905, + "learning_rate": 2.0802016292362396e-05, + "loss": 0.2527, + "step": 44754 + }, + { + "epoch": 3.625648088139987, + "grad_norm": 0.05981922522187233, + "learning_rate": 2.0797515639767766e-05, + "loss": 0.2331, + "step": 44755 + }, + { + "epoch": 3.6257290991574855, + "grad_norm": 0.08513333648443222, + "learning_rate": 2.079301498717314e-05, + "loss": 0.2322, + "step": 44756 + }, + { + "epoch": 3.625810110174984, + "grad_norm": 0.062274035066366196, + "learning_rate": 2.0788514334578517e-05, + "loss": 0.2174, + "step": 44757 + }, + { + "epoch": 3.625891121192482, + "grad_norm": 0.08532512187957764, + "learning_rate": 2.0784013681983887e-05, + "loss": 0.2387, + "step": 44758 + }, + { + "epoch": 
3.6259721322099807, + "grad_norm": 0.08419835567474365, + "learning_rate": 2.0779513029389264e-05, + "loss": 0.2474, + "step": 44759 + }, + { + "epoch": 3.626053143227479, + "grad_norm": 0.06810888648033142, + "learning_rate": 2.0775012376794637e-05, + "loss": 0.2138, + "step": 44760 + }, + { + "epoch": 3.626134154244977, + "grad_norm": 0.07005812227725983, + "learning_rate": 2.0770511724200008e-05, + "loss": 0.2416, + "step": 44761 + }, + { + "epoch": 3.626215165262476, + "grad_norm": 0.0587514191865921, + "learning_rate": 2.0766011071605385e-05, + "loss": 0.1943, + "step": 44762 + }, + { + "epoch": 3.626296176279974, + "grad_norm": 0.08654726296663284, + "learning_rate": 2.0761510419010758e-05, + "loss": 0.2777, + "step": 44763 + }, + { + "epoch": 3.6263771872974724, + "grad_norm": 0.07134860754013062, + "learning_rate": 2.075700976641613e-05, + "loss": 0.1994, + "step": 44764 + }, + { + "epoch": 3.626458198314971, + "grad_norm": 0.07229122519493103, + "learning_rate": 2.0752509113821505e-05, + "loss": 0.2185, + "step": 44765 + }, + { + "epoch": 3.6265392093324693, + "grad_norm": 0.07302133738994598, + "learning_rate": 2.074800846122688e-05, + "loss": 0.2114, + "step": 44766 + }, + { + "epoch": 3.6266202203499676, + "grad_norm": 0.08091484010219574, + "learning_rate": 2.0743507808632253e-05, + "loss": 0.2138, + "step": 44767 + }, + { + "epoch": 3.6267012313674662, + "grad_norm": 0.07142447680234909, + "learning_rate": 2.0739007156037626e-05, + "loss": 0.2521, + "step": 44768 + }, + { + "epoch": 3.6267822423849645, + "grad_norm": 0.06590250134468079, + "learning_rate": 2.0734506503443e-05, + "loss": 0.2086, + "step": 44769 + }, + { + "epoch": 3.6268632534024627, + "grad_norm": 0.07560182362794876, + "learning_rate": 2.0730005850848373e-05, + "loss": 0.2083, + "step": 44770 + }, + { + "epoch": 3.626944264419961, + "grad_norm": 0.06524951756000519, + "learning_rate": 2.0725505198253747e-05, + "loss": 0.2077, + "step": 44771 + }, + { + "epoch": 3.6270252754374592, + "grad_norm": 0.07804036140441895, + "learning_rate": 2.0721004545659124e-05, + "loss": 0.2133, + "step": 44772 + }, + { + "epoch": 3.627106286454958, + "grad_norm": 0.06527618318796158, + "learning_rate": 2.0716503893064494e-05, + "loss": 0.2157, + "step": 44773 + }, + { + "epoch": 3.627187297472456, + "grad_norm": 0.07573771476745605, + "learning_rate": 2.0712003240469868e-05, + "loss": 0.2472, + "step": 44774 + }, + { + "epoch": 3.6272683084899544, + "grad_norm": 0.08255860209465027, + "learning_rate": 2.0707502587875245e-05, + "loss": 0.2381, + "step": 44775 + }, + { + "epoch": 3.627349319507453, + "grad_norm": 0.08117613196372986, + "learning_rate": 2.0703001935280615e-05, + "loss": 0.2388, + "step": 44776 + }, + { + "epoch": 3.6274303305249513, + "grad_norm": 0.07737639546394348, + "learning_rate": 2.069850128268599e-05, + "loss": 0.2194, + "step": 44777 + }, + { + "epoch": 3.6275113415424496, + "grad_norm": 0.07773453742265701, + "learning_rate": 2.0694000630091366e-05, + "loss": 0.2374, + "step": 44778 + }, + { + "epoch": 3.6275923525599483, + "grad_norm": 0.06742957979440689, + "learning_rate": 2.0689499977496736e-05, + "loss": 0.2675, + "step": 44779 + }, + { + "epoch": 3.6276733635774465, + "grad_norm": 0.07162263989448547, + "learning_rate": 2.0684999324902113e-05, + "loss": 0.238, + "step": 44780 + }, + { + "epoch": 3.6277543745949448, + "grad_norm": 0.07228162884712219, + "learning_rate": 2.0680498672307486e-05, + "loss": 0.2255, + "step": 44781 + }, + { + "epoch": 3.6278353856124435, + "grad_norm": 
0.0600263848900795, + "learning_rate": 2.067599801971286e-05, + "loss": 0.2185, + "step": 44782 + }, + { + "epoch": 3.6279163966299417, + "grad_norm": 0.0768987163901329, + "learning_rate": 2.0671497367118234e-05, + "loss": 0.2082, + "step": 44783 + }, + { + "epoch": 3.62799740764744, + "grad_norm": 0.067320816218853, + "learning_rate": 2.0666996714523607e-05, + "loss": 0.2136, + "step": 44784 + }, + { + "epoch": 3.6280784186649386, + "grad_norm": 0.06339792907238007, + "learning_rate": 2.066249606192898e-05, + "loss": 0.2404, + "step": 44785 + }, + { + "epoch": 3.628159429682437, + "grad_norm": 0.079399473965168, + "learning_rate": 2.0657995409334354e-05, + "loss": 0.245, + "step": 44786 + }, + { + "epoch": 3.628240440699935, + "grad_norm": 0.08469132333993912, + "learning_rate": 2.0653494756739728e-05, + "loss": 0.2405, + "step": 44787 + }, + { + "epoch": 3.628321451717434, + "grad_norm": 0.06696237623691559, + "learning_rate": 2.06489941041451e-05, + "loss": 0.2162, + "step": 44788 + }, + { + "epoch": 3.628402462734932, + "grad_norm": 0.06755819171667099, + "learning_rate": 2.0644493451550475e-05, + "loss": 0.2242, + "step": 44789 + }, + { + "epoch": 3.6284834737524303, + "grad_norm": 0.07293059676885605, + "learning_rate": 2.063999279895585e-05, + "loss": 0.2206, + "step": 44790 + }, + { + "epoch": 3.6285644847699285, + "grad_norm": 0.07739424705505371, + "learning_rate": 2.0635492146361222e-05, + "loss": 0.2512, + "step": 44791 + }, + { + "epoch": 3.6286454957874272, + "grad_norm": 0.061704766005277634, + "learning_rate": 2.0630991493766596e-05, + "loss": 0.2304, + "step": 44792 + }, + { + "epoch": 3.6287265068049255, + "grad_norm": 0.089394211769104, + "learning_rate": 2.0626490841171973e-05, + "loss": 0.2322, + "step": 44793 + }, + { + "epoch": 3.6288075178224237, + "grad_norm": 0.06811096519231796, + "learning_rate": 2.0621990188577343e-05, + "loss": 0.2104, + "step": 44794 + }, + { + "epoch": 3.628888528839922, + "grad_norm": 0.08393049985170364, + "learning_rate": 2.0617489535982717e-05, + "loss": 0.2022, + "step": 44795 + }, + { + "epoch": 3.6289695398574207, + "grad_norm": 0.0783284604549408, + "learning_rate": 2.0612988883388094e-05, + "loss": 0.2322, + "step": 44796 + }, + { + "epoch": 3.629050550874919, + "grad_norm": 0.0840248391032219, + "learning_rate": 2.0608488230793464e-05, + "loss": 0.2561, + "step": 44797 + }, + { + "epoch": 3.629131561892417, + "grad_norm": 0.07756663858890533, + "learning_rate": 2.060398757819884e-05, + "loss": 0.2345, + "step": 44798 + }, + { + "epoch": 3.629212572909916, + "grad_norm": 0.08550877869129181, + "learning_rate": 2.0599486925604215e-05, + "loss": 0.241, + "step": 44799 + }, + { + "epoch": 3.629293583927414, + "grad_norm": 0.0756603330373764, + "learning_rate": 2.0594986273009588e-05, + "loss": 0.2138, + "step": 44800 + }, + { + "epoch": 3.6293745949449123, + "grad_norm": 0.07740037888288498, + "learning_rate": 2.0590485620414962e-05, + "loss": 0.2471, + "step": 44801 + }, + { + "epoch": 3.629455605962411, + "grad_norm": 0.07371073216199875, + "learning_rate": 2.0585984967820335e-05, + "loss": 0.212, + "step": 44802 + }, + { + "epoch": 3.6295366169799093, + "grad_norm": 0.07205826789140701, + "learning_rate": 2.058148431522571e-05, + "loss": 0.2403, + "step": 44803 + }, + { + "epoch": 3.6296176279974075, + "grad_norm": 0.07409129291772842, + "learning_rate": 2.0576983662631083e-05, + "loss": 0.231, + "step": 44804 + }, + { + "epoch": 3.629698639014906, + "grad_norm": 0.06722891330718994, + "learning_rate": 2.0572483010036456e-05, + 
"loss": 0.2325, + "step": 44805 + }, + { + "epoch": 3.6297796500324044, + "grad_norm": 0.10076708346605301, + "learning_rate": 2.056798235744183e-05, + "loss": 0.2289, + "step": 44806 + }, + { + "epoch": 3.6298606610499027, + "grad_norm": 0.07837366312742233, + "learning_rate": 2.0563481704847203e-05, + "loss": 0.2313, + "step": 44807 + }, + { + "epoch": 3.6299416720674014, + "grad_norm": 0.06908228248357773, + "learning_rate": 2.0558981052252577e-05, + "loss": 0.1936, + "step": 44808 + }, + { + "epoch": 3.6300226830848996, + "grad_norm": 0.08010173588991165, + "learning_rate": 2.055448039965795e-05, + "loss": 0.2201, + "step": 44809 + }, + { + "epoch": 3.630103694102398, + "grad_norm": 0.0739486813545227, + "learning_rate": 2.0549979747063324e-05, + "loss": 0.1951, + "step": 44810 + }, + { + "epoch": 3.6301847051198965, + "grad_norm": 0.07065515965223312, + "learning_rate": 2.0545479094468698e-05, + "loss": 0.2685, + "step": 44811 + }, + { + "epoch": 3.630265716137395, + "grad_norm": 0.07847166061401367, + "learning_rate": 2.054097844187407e-05, + "loss": 0.2361, + "step": 44812 + }, + { + "epoch": 3.630346727154893, + "grad_norm": 0.07692558318376541, + "learning_rate": 2.0536477789279445e-05, + "loss": 0.2746, + "step": 44813 + }, + { + "epoch": 3.6304277381723913, + "grad_norm": 0.05729920044541359, + "learning_rate": 2.0531977136684822e-05, + "loss": 0.2212, + "step": 44814 + }, + { + "epoch": 3.63050874918989, + "grad_norm": 0.07073798030614853, + "learning_rate": 2.0527476484090192e-05, + "loss": 0.2149, + "step": 44815 + }, + { + "epoch": 3.630589760207388, + "grad_norm": 0.07140544056892395, + "learning_rate": 2.0522975831495566e-05, + "loss": 0.2652, + "step": 44816 + }, + { + "epoch": 3.6306707712248865, + "grad_norm": 0.0805303305387497, + "learning_rate": 2.0518475178900943e-05, + "loss": 0.231, + "step": 44817 + }, + { + "epoch": 3.6307517822423847, + "grad_norm": 0.0681585893034935, + "learning_rate": 2.0513974526306316e-05, + "loss": 0.2598, + "step": 44818 + }, + { + "epoch": 3.6308327932598834, + "grad_norm": 0.0736590027809143, + "learning_rate": 2.050947387371169e-05, + "loss": 0.1995, + "step": 44819 + }, + { + "epoch": 3.6309138042773816, + "grad_norm": 0.07716482877731323, + "learning_rate": 2.0504973221117064e-05, + "loss": 0.2254, + "step": 44820 + }, + { + "epoch": 3.63099481529488, + "grad_norm": 0.08331847190856934, + "learning_rate": 2.0500472568522437e-05, + "loss": 0.244, + "step": 44821 + }, + { + "epoch": 3.6310758263123786, + "grad_norm": 0.07626423239707947, + "learning_rate": 2.049597191592781e-05, + "loss": 0.2124, + "step": 44822 + }, + { + "epoch": 3.631156837329877, + "grad_norm": 0.07067970186471939, + "learning_rate": 2.0491471263333185e-05, + "loss": 0.2237, + "step": 44823 + }, + { + "epoch": 3.631237848347375, + "grad_norm": 0.06633996218442917, + "learning_rate": 2.0486970610738558e-05, + "loss": 0.2167, + "step": 44824 + }, + { + "epoch": 3.6313188593648738, + "grad_norm": 0.06839239597320557, + "learning_rate": 2.0482469958143932e-05, + "loss": 0.1972, + "step": 44825 + }, + { + "epoch": 3.631399870382372, + "grad_norm": 0.07643083482980728, + "learning_rate": 2.0477969305549305e-05, + "loss": 0.2385, + "step": 44826 + }, + { + "epoch": 3.6314808813998702, + "grad_norm": 0.08145930618047714, + "learning_rate": 2.047346865295468e-05, + "loss": 0.2476, + "step": 44827 + }, + { + "epoch": 3.631561892417369, + "grad_norm": 0.08592727780342102, + "learning_rate": 2.0468968000360053e-05, + "loss": 0.2525, + "step": 44828 + }, + { + "epoch": 
3.631642903434867, + "grad_norm": 0.06574290245771408, + "learning_rate": 2.0464467347765426e-05, + "loss": 0.2167, + "step": 44829 + }, + { + "epoch": 3.6317239144523654, + "grad_norm": 0.08025723695755005, + "learning_rate": 2.04599666951708e-05, + "loss": 0.246, + "step": 44830 + }, + { + "epoch": 3.631804925469864, + "grad_norm": 0.08162990212440491, + "learning_rate": 2.0455466042576173e-05, + "loss": 0.2437, + "step": 44831 + }, + { + "epoch": 3.6318859364873624, + "grad_norm": 0.07384198904037476, + "learning_rate": 2.045096538998155e-05, + "loss": 0.2257, + "step": 44832 + }, + { + "epoch": 3.6319669475048606, + "grad_norm": 0.07432828843593597, + "learning_rate": 2.0446464737386924e-05, + "loss": 0.2154, + "step": 44833 + }, + { + "epoch": 3.6320479585223593, + "grad_norm": 0.0657813623547554, + "learning_rate": 2.0441964084792294e-05, + "loss": 0.2132, + "step": 44834 + }, + { + "epoch": 3.6321289695398575, + "grad_norm": 0.06444726884365082, + "learning_rate": 2.043746343219767e-05, + "loss": 0.2374, + "step": 44835 + }, + { + "epoch": 3.6322099805573558, + "grad_norm": 0.06306802481412888, + "learning_rate": 2.0432962779603045e-05, + "loss": 0.2344, + "step": 44836 + }, + { + "epoch": 3.632290991574854, + "grad_norm": 0.07714951783418655, + "learning_rate": 2.0428462127008415e-05, + "loss": 0.2334, + "step": 44837 + }, + { + "epoch": 3.6323720025923527, + "grad_norm": 0.05176575109362602, + "learning_rate": 2.0423961474413792e-05, + "loss": 0.2235, + "step": 44838 + }, + { + "epoch": 3.632453013609851, + "grad_norm": 0.07315241545438766, + "learning_rate": 2.0419460821819166e-05, + "loss": 0.2138, + "step": 44839 + }, + { + "epoch": 3.632534024627349, + "grad_norm": 0.0775456428527832, + "learning_rate": 2.041496016922454e-05, + "loss": 0.272, + "step": 44840 + }, + { + "epoch": 3.6326150356448474, + "grad_norm": 0.07913167774677277, + "learning_rate": 2.0410459516629913e-05, + "loss": 0.2366, + "step": 44841 + }, + { + "epoch": 3.632696046662346, + "grad_norm": 0.07824068516492844, + "learning_rate": 2.0405958864035286e-05, + "loss": 0.2362, + "step": 44842 + }, + { + "epoch": 3.6327770576798444, + "grad_norm": 0.07198026031255722, + "learning_rate": 2.040145821144066e-05, + "loss": 0.2302, + "step": 44843 + }, + { + "epoch": 3.6328580686973426, + "grad_norm": 0.058635443449020386, + "learning_rate": 2.0396957558846034e-05, + "loss": 0.2049, + "step": 44844 + }, + { + "epoch": 3.6329390797148413, + "grad_norm": 0.06972447782754898, + "learning_rate": 2.0392456906251407e-05, + "loss": 0.2108, + "step": 44845 + }, + { + "epoch": 3.6330200907323396, + "grad_norm": 0.0633692741394043, + "learning_rate": 2.038795625365678e-05, + "loss": 0.2304, + "step": 44846 + }, + { + "epoch": 3.633101101749838, + "grad_norm": 0.06997337937355042, + "learning_rate": 2.0383455601062154e-05, + "loss": 0.2454, + "step": 44847 + }, + { + "epoch": 3.6331821127673365, + "grad_norm": 0.06759506464004517, + "learning_rate": 2.0378954948467528e-05, + "loss": 0.2419, + "step": 44848 + }, + { + "epoch": 3.6332631237848347, + "grad_norm": 0.09851627051830292, + "learning_rate": 2.03744542958729e-05, + "loss": 0.2466, + "step": 44849 + }, + { + "epoch": 3.633344134802333, + "grad_norm": 0.06574554741382599, + "learning_rate": 2.0369953643278275e-05, + "loss": 0.2095, + "step": 44850 + }, + { + "epoch": 3.6334251458198317, + "grad_norm": 0.07231997698545456, + "learning_rate": 2.0365452990683652e-05, + "loss": 0.2424, + "step": 44851 + }, + { + "epoch": 3.63350615683733, + "grad_norm": 0.07679183781147003, + 
"learning_rate": 2.0360952338089022e-05, + "loss": 0.2449, + "step": 44852 + }, + { + "epoch": 3.633587167854828, + "grad_norm": 0.07032614201307297, + "learning_rate": 2.03564516854944e-05, + "loss": 0.2703, + "step": 44853 + }, + { + "epoch": 3.633668178872327, + "grad_norm": 0.07566996663808823, + "learning_rate": 2.0351951032899773e-05, + "loss": 0.2033, + "step": 44854 + }, + { + "epoch": 3.633749189889825, + "grad_norm": 0.07417989522218704, + "learning_rate": 2.0347450380305143e-05, + "loss": 0.1984, + "step": 44855 + }, + { + "epoch": 3.6338302009073233, + "grad_norm": 0.07153971493244171, + "learning_rate": 2.034294972771052e-05, + "loss": 0.1825, + "step": 44856 + }, + { + "epoch": 3.633911211924822, + "grad_norm": 0.08344955742359161, + "learning_rate": 2.0338449075115894e-05, + "loss": 0.2373, + "step": 44857 + }, + { + "epoch": 3.6339922229423203, + "grad_norm": 0.0726422518491745, + "learning_rate": 2.0333948422521267e-05, + "loss": 0.222, + "step": 44858 + }, + { + "epoch": 3.6340732339598185, + "grad_norm": 0.07534100860357285, + "learning_rate": 2.032944776992664e-05, + "loss": 0.2417, + "step": 44859 + }, + { + "epoch": 3.6341542449773168, + "grad_norm": 0.07408151030540466, + "learning_rate": 2.0324947117332015e-05, + "loss": 0.2375, + "step": 44860 + }, + { + "epoch": 3.6342352559948155, + "grad_norm": 0.07573696970939636, + "learning_rate": 2.0320446464737388e-05, + "loss": 0.1934, + "step": 44861 + }, + { + "epoch": 3.6343162670123137, + "grad_norm": 0.0753345638513565, + "learning_rate": 2.0315945812142762e-05, + "loss": 0.2019, + "step": 44862 + }, + { + "epoch": 3.634397278029812, + "grad_norm": 0.07289306819438934, + "learning_rate": 2.0311445159548135e-05, + "loss": 0.265, + "step": 44863 + }, + { + "epoch": 3.63447828904731, + "grad_norm": 0.0770828127861023, + "learning_rate": 2.030694450695351e-05, + "loss": 0.2307, + "step": 44864 + }, + { + "epoch": 3.634559300064809, + "grad_norm": 0.07336337864398956, + "learning_rate": 2.0302443854358883e-05, + "loss": 0.2183, + "step": 44865 + }, + { + "epoch": 3.634640311082307, + "grad_norm": 0.06365802139043808, + "learning_rate": 2.0297943201764256e-05, + "loss": 0.2023, + "step": 44866 + }, + { + "epoch": 3.6347213220998054, + "grad_norm": 0.08312965929508209, + "learning_rate": 2.029344254916963e-05, + "loss": 0.2136, + "step": 44867 + }, + { + "epoch": 3.634802333117304, + "grad_norm": 0.07333937287330627, + "learning_rate": 2.0288941896575003e-05, + "loss": 0.2503, + "step": 44868 + }, + { + "epoch": 3.6348833441348023, + "grad_norm": 0.06631088256835938, + "learning_rate": 2.028444124398038e-05, + "loss": 0.2232, + "step": 44869 + }, + { + "epoch": 3.6349643551523005, + "grad_norm": 0.08965929597616196, + "learning_rate": 2.027994059138575e-05, + "loss": 0.2516, + "step": 44870 + }, + { + "epoch": 3.6350453661697992, + "grad_norm": 0.06641999632120132, + "learning_rate": 2.0275439938791128e-05, + "loss": 0.2144, + "step": 44871 + }, + { + "epoch": 3.6351263771872975, + "grad_norm": 0.06216254085302353, + "learning_rate": 2.02709392861965e-05, + "loss": 0.2177, + "step": 44872 + }, + { + "epoch": 3.6352073882047957, + "grad_norm": 0.07060118019580841, + "learning_rate": 2.026643863360187e-05, + "loss": 0.2131, + "step": 44873 + }, + { + "epoch": 3.6352883992222944, + "grad_norm": 0.055240385234355927, + "learning_rate": 2.026193798100725e-05, + "loss": 0.172, + "step": 44874 + }, + { + "epoch": 3.6353694102397927, + "grad_norm": 0.091565802693367, + "learning_rate": 2.0257437328412622e-05, + "loss": 0.2922, + 
"step": 44875 + }, + { + "epoch": 3.635450421257291, + "grad_norm": 0.06432320922613144, + "learning_rate": 2.0252936675817992e-05, + "loss": 0.2116, + "step": 44876 + }, + { + "epoch": 3.6355314322747896, + "grad_norm": 0.08700551092624664, + "learning_rate": 2.024843602322337e-05, + "loss": 0.2635, + "step": 44877 + }, + { + "epoch": 3.635612443292288, + "grad_norm": 0.062044791877269745, + "learning_rate": 2.0243935370628743e-05, + "loss": 0.2336, + "step": 44878 + }, + { + "epoch": 3.635693454309786, + "grad_norm": 0.07158152014017105, + "learning_rate": 2.0239434718034116e-05, + "loss": 0.2155, + "step": 44879 + }, + { + "epoch": 3.6357744653272848, + "grad_norm": 0.07280169427394867, + "learning_rate": 2.023493406543949e-05, + "loss": 0.2107, + "step": 44880 + }, + { + "epoch": 3.635855476344783, + "grad_norm": 0.06739906966686249, + "learning_rate": 2.0230433412844864e-05, + "loss": 0.1984, + "step": 44881 + }, + { + "epoch": 3.6359364873622813, + "grad_norm": 0.06984557956457138, + "learning_rate": 2.0225932760250237e-05, + "loss": 0.2176, + "step": 44882 + }, + { + "epoch": 3.6360174983797795, + "grad_norm": 0.0813283696770668, + "learning_rate": 2.022143210765561e-05, + "loss": 0.2505, + "step": 44883 + }, + { + "epoch": 3.636098509397278, + "grad_norm": 0.0551692433655262, + "learning_rate": 2.0216931455060988e-05, + "loss": 0.2228, + "step": 44884 + }, + { + "epoch": 3.6361795204147764, + "grad_norm": 0.06857820600271225, + "learning_rate": 2.0212430802466358e-05, + "loss": 0.2151, + "step": 44885 + }, + { + "epoch": 3.6362605314322747, + "grad_norm": 0.07335050404071808, + "learning_rate": 2.020793014987173e-05, + "loss": 0.2334, + "step": 44886 + }, + { + "epoch": 3.636341542449773, + "grad_norm": 0.09249746799468994, + "learning_rate": 2.020342949727711e-05, + "loss": 0.2194, + "step": 44887 + }, + { + "epoch": 3.6364225534672716, + "grad_norm": 0.06488305330276489, + "learning_rate": 2.019892884468248e-05, + "loss": 0.2164, + "step": 44888 + }, + { + "epoch": 3.63650356448477, + "grad_norm": 0.0670824721455574, + "learning_rate": 2.0194428192087852e-05, + "loss": 0.233, + "step": 44889 + }, + { + "epoch": 3.636584575502268, + "grad_norm": 0.06511478871107101, + "learning_rate": 2.018992753949323e-05, + "loss": 0.2027, + "step": 44890 + }, + { + "epoch": 3.636665586519767, + "grad_norm": 0.08423558622598648, + "learning_rate": 2.01854268868986e-05, + "loss": 0.233, + "step": 44891 + }, + { + "epoch": 3.636746597537265, + "grad_norm": 0.06754543632268906, + "learning_rate": 2.0180926234303977e-05, + "loss": 0.234, + "step": 44892 + }, + { + "epoch": 3.6368276085547633, + "grad_norm": 0.07313832640647888, + "learning_rate": 2.017642558170935e-05, + "loss": 0.2247, + "step": 44893 + }, + { + "epoch": 3.636908619572262, + "grad_norm": 0.07286649197340012, + "learning_rate": 2.017192492911472e-05, + "loss": 0.201, + "step": 44894 + }, + { + "epoch": 3.63698963058976, + "grad_norm": 0.09970583766698837, + "learning_rate": 2.0167424276520097e-05, + "loss": 0.2809, + "step": 44895 + }, + { + "epoch": 3.6370706416072585, + "grad_norm": 0.0816020667552948, + "learning_rate": 2.016292362392547e-05, + "loss": 0.2441, + "step": 44896 + }, + { + "epoch": 3.637151652624757, + "grad_norm": 0.0763312503695488, + "learning_rate": 2.015842297133084e-05, + "loss": 0.2461, + "step": 44897 + }, + { + "epoch": 3.6372326636422554, + "grad_norm": 0.08208746463060379, + "learning_rate": 2.0153922318736218e-05, + "loss": 0.228, + "step": 44898 + }, + { + "epoch": 3.6373136746597536, + "grad_norm": 
0.06755077093839645, + "learning_rate": 2.0149421666141592e-05, + "loss": 0.2177, + "step": 44899 + }, + { + "epoch": 3.6373946856772523, + "grad_norm": 0.06575210392475128, + "learning_rate": 2.0144921013546966e-05, + "loss": 0.2265, + "step": 44900 + }, + { + "epoch": 3.6374756966947506, + "grad_norm": 0.08353587239980698, + "learning_rate": 2.014042036095234e-05, + "loss": 0.2128, + "step": 44901 + }, + { + "epoch": 3.637556707712249, + "grad_norm": 0.07195543497800827, + "learning_rate": 2.0135919708357713e-05, + "loss": 0.1838, + "step": 44902 + }, + { + "epoch": 3.6376377187297475, + "grad_norm": 0.0775909349322319, + "learning_rate": 2.0131419055763086e-05, + "loss": 0.2209, + "step": 44903 + }, + { + "epoch": 3.6377187297472457, + "grad_norm": 0.06664683669805527, + "learning_rate": 2.012691840316846e-05, + "loss": 0.2363, + "step": 44904 + }, + { + "epoch": 3.637799740764744, + "grad_norm": 0.08048176020383835, + "learning_rate": 2.0122417750573837e-05, + "loss": 0.2668, + "step": 44905 + }, + { + "epoch": 3.6378807517822422, + "grad_norm": 0.07442770153284073, + "learning_rate": 2.0117917097979207e-05, + "loss": 0.2433, + "step": 44906 + }, + { + "epoch": 3.637961762799741, + "grad_norm": 0.07577343285083771, + "learning_rate": 2.011341644538458e-05, + "loss": 0.2512, + "step": 44907 + }, + { + "epoch": 3.638042773817239, + "grad_norm": 0.08586008101701736, + "learning_rate": 2.0108915792789958e-05, + "loss": 0.2508, + "step": 44908 + }, + { + "epoch": 3.6381237848347374, + "grad_norm": 0.07474494725465775, + "learning_rate": 2.0104415140195328e-05, + "loss": 0.2261, + "step": 44909 + }, + { + "epoch": 3.6382047958522357, + "grad_norm": 0.061617929488420486, + "learning_rate": 2.00999144876007e-05, + "loss": 0.2242, + "step": 44910 + }, + { + "epoch": 3.6382858068697344, + "grad_norm": 0.07338300347328186, + "learning_rate": 2.009541383500608e-05, + "loss": 0.2138, + "step": 44911 + }, + { + "epoch": 3.6383668178872326, + "grad_norm": 0.07744680345058441, + "learning_rate": 2.009091318241145e-05, + "loss": 0.2349, + "step": 44912 + }, + { + "epoch": 3.638447828904731, + "grad_norm": 0.06322570890188217, + "learning_rate": 2.0086412529816826e-05, + "loss": 0.214, + "step": 44913 + }, + { + "epoch": 3.6385288399222295, + "grad_norm": 0.06235915422439575, + "learning_rate": 2.00819118772222e-05, + "loss": 0.2225, + "step": 44914 + }, + { + "epoch": 3.6386098509397278, + "grad_norm": 0.06920929253101349, + "learning_rate": 2.007741122462757e-05, + "loss": 0.2169, + "step": 44915 + }, + { + "epoch": 3.638690861957226, + "grad_norm": 0.07871360331773758, + "learning_rate": 2.0072910572032947e-05, + "loss": 0.2381, + "step": 44916 + }, + { + "epoch": 3.6387718729747247, + "grad_norm": 0.06010260805487633, + "learning_rate": 2.006840991943832e-05, + "loss": 0.2352, + "step": 44917 + }, + { + "epoch": 3.638852883992223, + "grad_norm": 0.07746662944555283, + "learning_rate": 2.0063909266843694e-05, + "loss": 0.2243, + "step": 44918 + }, + { + "epoch": 3.638933895009721, + "grad_norm": 0.0740174651145935, + "learning_rate": 2.0059408614249067e-05, + "loss": 0.2233, + "step": 44919 + }, + { + "epoch": 3.63901490602722, + "grad_norm": 0.07685885578393936, + "learning_rate": 2.005490796165444e-05, + "loss": 0.204, + "step": 44920 + }, + { + "epoch": 3.639095917044718, + "grad_norm": 0.07137834280729294, + "learning_rate": 2.0050407309059815e-05, + "loss": 0.2065, + "step": 44921 + }, + { + "epoch": 3.6391769280622164, + "grad_norm": 0.08067955821752548, + "learning_rate": 
2.0045906656465188e-05, + "loss": 0.2387, + "step": 44922 + }, + { + "epoch": 3.639257939079715, + "grad_norm": 0.06330350041389465, + "learning_rate": 2.0041406003870562e-05, + "loss": 0.2375, + "step": 44923 + }, + { + "epoch": 3.6393389500972133, + "grad_norm": 0.0679686963558197, + "learning_rate": 2.0036905351275935e-05, + "loss": 0.2031, + "step": 44924 + }, + { + "epoch": 3.6394199611147116, + "grad_norm": 0.08646734058856964, + "learning_rate": 2.003240469868131e-05, + "loss": 0.2335, + "step": 44925 + }, + { + "epoch": 3.6395009721322102, + "grad_norm": 0.08247414231300354, + "learning_rate": 2.0027904046086686e-05, + "loss": 0.2442, + "step": 44926 + }, + { + "epoch": 3.6395819831497085, + "grad_norm": 0.08173704147338867, + "learning_rate": 2.0023403393492056e-05, + "loss": 0.2308, + "step": 44927 + }, + { + "epoch": 3.6396629941672067, + "grad_norm": 0.06407459825277328, + "learning_rate": 2.001890274089743e-05, + "loss": 0.1961, + "step": 44928 + }, + { + "epoch": 3.639744005184705, + "grad_norm": 0.07259169965982437, + "learning_rate": 2.0014402088302807e-05, + "loss": 0.2154, + "step": 44929 + }, + { + "epoch": 3.6398250162022032, + "grad_norm": 0.07888376712799072, + "learning_rate": 2.0009901435708177e-05, + "loss": 0.2135, + "step": 44930 + }, + { + "epoch": 3.639906027219702, + "grad_norm": 0.06632392108440399, + "learning_rate": 2.0005400783113554e-05, + "loss": 0.2069, + "step": 44931 + }, + { + "epoch": 3.6399870382372, + "grad_norm": 0.08187742531299591, + "learning_rate": 2.0000900130518928e-05, + "loss": 0.1845, + "step": 44932 + }, + { + "epoch": 3.6400680492546984, + "grad_norm": 0.07673794031143188, + "learning_rate": 1.9996399477924298e-05, + "loss": 0.2302, + "step": 44933 + }, + { + "epoch": 3.640149060272197, + "grad_norm": 0.06572210788726807, + "learning_rate": 1.9991898825329675e-05, + "loss": 0.2324, + "step": 44934 + }, + { + "epoch": 3.6402300712896953, + "grad_norm": 0.05955757200717926, + "learning_rate": 1.998739817273505e-05, + "loss": 0.2286, + "step": 44935 + }, + { + "epoch": 3.6403110823071936, + "grad_norm": 0.08277115225791931, + "learning_rate": 1.998289752014042e-05, + "loss": 0.2379, + "step": 44936 + }, + { + "epoch": 3.6403920933246923, + "grad_norm": 0.06754428893327713, + "learning_rate": 1.9978396867545796e-05, + "loss": 0.2311, + "step": 44937 + }, + { + "epoch": 3.6404731043421905, + "grad_norm": 0.06916896998882294, + "learning_rate": 1.997389621495117e-05, + "loss": 0.206, + "step": 44938 + }, + { + "epoch": 3.6405541153596888, + "grad_norm": 0.07250376790761948, + "learning_rate": 1.9969395562356543e-05, + "loss": 0.2244, + "step": 44939 + }, + { + "epoch": 3.6406351263771874, + "grad_norm": 0.0685395821928978, + "learning_rate": 1.9964894909761916e-05, + "loss": 0.2167, + "step": 44940 + }, + { + "epoch": 3.6407161373946857, + "grad_norm": 0.07498252391815186, + "learning_rate": 1.996039425716729e-05, + "loss": 0.2003, + "step": 44941 + }, + { + "epoch": 3.640797148412184, + "grad_norm": 0.07298076897859573, + "learning_rate": 1.9955893604572664e-05, + "loss": 0.2286, + "step": 44942 + }, + { + "epoch": 3.6408781594296826, + "grad_norm": 0.07976315170526505, + "learning_rate": 1.9951392951978037e-05, + "loss": 0.1953, + "step": 44943 + }, + { + "epoch": 3.640959170447181, + "grad_norm": 0.08286472409963608, + "learning_rate": 1.9946892299383414e-05, + "loss": 0.2218, + "step": 44944 + }, + { + "epoch": 3.641040181464679, + "grad_norm": 0.07928077131509781, + "learning_rate": 1.9942391646788784e-05, + "loss": 0.2501, + "step": 
44945 + }, + { + "epoch": 3.641121192482178, + "grad_norm": 0.07734663784503937, + "learning_rate": 1.9937890994194158e-05, + "loss": 0.2303, + "step": 44946 + }, + { + "epoch": 3.641202203499676, + "grad_norm": 0.07841353863477707, + "learning_rate": 1.9933390341599535e-05, + "loss": 0.2627, + "step": 44947 + }, + { + "epoch": 3.6412832145171743, + "grad_norm": 0.08440117537975311, + "learning_rate": 1.9928889689004905e-05, + "loss": 0.212, + "step": 44948 + }, + { + "epoch": 3.641364225534673, + "grad_norm": 0.06597091257572174, + "learning_rate": 1.992438903641028e-05, + "loss": 0.205, + "step": 44949 + }, + { + "epoch": 3.6414452365521712, + "grad_norm": 0.055673886090517044, + "learning_rate": 1.9919888383815656e-05, + "loss": 0.2006, + "step": 44950 + }, + { + "epoch": 3.6415262475696695, + "grad_norm": 0.07033570110797882, + "learning_rate": 1.9915387731221026e-05, + "loss": 0.2611, + "step": 44951 + }, + { + "epoch": 3.6416072585871677, + "grad_norm": 0.06624488532543182, + "learning_rate": 1.9910887078626403e-05, + "loss": 0.2301, + "step": 44952 + }, + { + "epoch": 3.641688269604666, + "grad_norm": 0.07321982830762863, + "learning_rate": 1.9906386426031777e-05, + "loss": 0.2513, + "step": 44953 + }, + { + "epoch": 3.6417692806221647, + "grad_norm": 0.07193963974714279, + "learning_rate": 1.9901885773437147e-05, + "loss": 0.2259, + "step": 44954 + }, + { + "epoch": 3.641850291639663, + "grad_norm": 0.07336988300085068, + "learning_rate": 1.9897385120842524e-05, + "loss": 0.232, + "step": 44955 + }, + { + "epoch": 3.641931302657161, + "grad_norm": 0.09553751349449158, + "learning_rate": 1.9892884468247897e-05, + "loss": 0.2257, + "step": 44956 + }, + { + "epoch": 3.64201231367466, + "grad_norm": 0.06732214242219925, + "learning_rate": 1.988838381565327e-05, + "loss": 0.2179, + "step": 44957 + }, + { + "epoch": 3.642093324692158, + "grad_norm": 0.05843079090118408, + "learning_rate": 1.9883883163058645e-05, + "loss": 0.1975, + "step": 44958 + }, + { + "epoch": 3.6421743357096563, + "grad_norm": 0.052807748317718506, + "learning_rate": 1.9879382510464018e-05, + "loss": 0.2034, + "step": 44959 + }, + { + "epoch": 3.642255346727155, + "grad_norm": 0.05599704384803772, + "learning_rate": 1.9874881857869392e-05, + "loss": 0.2139, + "step": 44960 + }, + { + "epoch": 3.6423363577446533, + "grad_norm": 0.07424107939004898, + "learning_rate": 1.9870381205274765e-05, + "loss": 0.2366, + "step": 44961 + }, + { + "epoch": 3.6424173687621515, + "grad_norm": 0.08620074391365051, + "learning_rate": 1.986588055268014e-05, + "loss": 0.2823, + "step": 44962 + }, + { + "epoch": 3.64249837977965, + "grad_norm": 0.07798013836145401, + "learning_rate": 1.9861379900085513e-05, + "loss": 0.2068, + "step": 44963 + }, + { + "epoch": 3.6425793907971484, + "grad_norm": 0.0656246617436409, + "learning_rate": 1.9856879247490886e-05, + "loss": 0.2024, + "step": 44964 + }, + { + "epoch": 3.6426604018146467, + "grad_norm": 0.08685047924518585, + "learning_rate": 1.9852378594896263e-05, + "loss": 0.2391, + "step": 44965 + }, + { + "epoch": 3.6427414128321454, + "grad_norm": 0.07338472455739975, + "learning_rate": 1.9847877942301633e-05, + "loss": 0.2311, + "step": 44966 + }, + { + "epoch": 3.6428224238496436, + "grad_norm": 0.072905994951725, + "learning_rate": 1.9843377289707007e-05, + "loss": 0.2356, + "step": 44967 + }, + { + "epoch": 3.642903434867142, + "grad_norm": 0.08200103044509888, + "learning_rate": 1.9838876637112384e-05, + "loss": 0.2411, + "step": 44968 + }, + { + "epoch": 3.6429844458846405, + 
"grad_norm": 0.07015134394168854, + "learning_rate": 1.9834375984517754e-05, + "loss": 0.209, + "step": 44969 + }, + { + "epoch": 3.643065456902139, + "grad_norm": 0.07618094980716705, + "learning_rate": 1.982987533192313e-05, + "loss": 0.2246, + "step": 44970 + }, + { + "epoch": 3.643146467919637, + "grad_norm": 0.07204940170049667, + "learning_rate": 1.9825374679328505e-05, + "loss": 0.231, + "step": 44971 + }, + { + "epoch": 3.6432274789371357, + "grad_norm": 0.06646368652582169, + "learning_rate": 1.9820874026733875e-05, + "loss": 0.2465, + "step": 44972 + }, + { + "epoch": 3.643308489954634, + "grad_norm": 0.06563836336135864, + "learning_rate": 1.9816373374139252e-05, + "loss": 0.2139, + "step": 44973 + }, + { + "epoch": 3.643389500972132, + "grad_norm": 0.07280927151441574, + "learning_rate": 1.9811872721544626e-05, + "loss": 0.2233, + "step": 44974 + }, + { + "epoch": 3.6434705119896305, + "grad_norm": 0.07386231422424316, + "learning_rate": 1.9807372068949996e-05, + "loss": 0.1915, + "step": 44975 + }, + { + "epoch": 3.6435515230071287, + "grad_norm": 0.06035052239894867, + "learning_rate": 1.9802871416355373e-05, + "loss": 0.2234, + "step": 44976 + }, + { + "epoch": 3.6436325340246274, + "grad_norm": 0.06255166232585907, + "learning_rate": 1.9798370763760747e-05, + "loss": 0.2322, + "step": 44977 + }, + { + "epoch": 3.6437135450421256, + "grad_norm": 0.06946348398923874, + "learning_rate": 1.979387011116612e-05, + "loss": 0.2437, + "step": 44978 + }, + { + "epoch": 3.643794556059624, + "grad_norm": 0.06545937806367874, + "learning_rate": 1.9789369458571494e-05, + "loss": 0.2059, + "step": 44979 + }, + { + "epoch": 3.6438755670771226, + "grad_norm": 0.07262036204338074, + "learning_rate": 1.9784868805976867e-05, + "loss": 0.2138, + "step": 44980 + }, + { + "epoch": 3.643956578094621, + "grad_norm": 0.06998401880264282, + "learning_rate": 1.978036815338224e-05, + "loss": 0.2435, + "step": 44981 + }, + { + "epoch": 3.644037589112119, + "grad_norm": 0.07168823480606079, + "learning_rate": 1.9775867500787615e-05, + "loss": 0.2259, + "step": 44982 + }, + { + "epoch": 3.6441186001296177, + "grad_norm": 0.06272865831851959, + "learning_rate": 1.9771366848192988e-05, + "loss": 0.228, + "step": 44983 + }, + { + "epoch": 3.644199611147116, + "grad_norm": 0.060105741024017334, + "learning_rate": 1.9766866195598362e-05, + "loss": 0.2234, + "step": 44984 + }, + { + "epoch": 3.6442806221646142, + "grad_norm": 0.07992798089981079, + "learning_rate": 1.9762365543003735e-05, + "loss": 0.2194, + "step": 44985 + }, + { + "epoch": 3.644361633182113, + "grad_norm": 0.0626491978764534, + "learning_rate": 1.9757864890409112e-05, + "loss": 0.2328, + "step": 44986 + }, + { + "epoch": 3.644442644199611, + "grad_norm": 0.07601386308670044, + "learning_rate": 1.9753364237814483e-05, + "loss": 0.205, + "step": 44987 + }, + { + "epoch": 3.6445236552171094, + "grad_norm": 0.06073329225182533, + "learning_rate": 1.9748863585219856e-05, + "loss": 0.2212, + "step": 44988 + }, + { + "epoch": 3.644604666234608, + "grad_norm": 0.09335854649543762, + "learning_rate": 1.9744362932625233e-05, + "loss": 0.2202, + "step": 44989 + }, + { + "epoch": 3.6446856772521063, + "grad_norm": 0.08301527053117752, + "learning_rate": 1.9739862280030603e-05, + "loss": 0.2568, + "step": 44990 + }, + { + "epoch": 3.6447666882696046, + "grad_norm": 0.06001551076769829, + "learning_rate": 1.973536162743598e-05, + "loss": 0.1837, + "step": 44991 + }, + { + "epoch": 3.6448476992871033, + "grad_norm": 0.07624075561761856, + "learning_rate": 
1.9730860974841354e-05, + "loss": 0.2352, + "step": 44992 + }, + { + "epoch": 3.6449287103046015, + "grad_norm": 0.0669778510928154, + "learning_rate": 1.9726360322246724e-05, + "loss": 0.2127, + "step": 44993 + }, + { + "epoch": 3.6450097213220998, + "grad_norm": 0.0638890415430069, + "learning_rate": 1.97218596696521e-05, + "loss": 0.2323, + "step": 44994 + }, + { + "epoch": 3.6450907323395985, + "grad_norm": 0.06809230148792267, + "learning_rate": 1.9717359017057475e-05, + "loss": 0.2385, + "step": 44995 + }, + { + "epoch": 3.6451717433570967, + "grad_norm": 0.06901613622903824, + "learning_rate": 1.971285836446285e-05, + "loss": 0.2086, + "step": 44996 + }, + { + "epoch": 3.645252754374595, + "grad_norm": 0.07063661515712738, + "learning_rate": 1.9708357711868222e-05, + "loss": 0.2189, + "step": 44997 + }, + { + "epoch": 3.645333765392093, + "grad_norm": 0.06737226247787476, + "learning_rate": 1.9703857059273596e-05, + "loss": 0.2521, + "step": 44998 + }, + { + "epoch": 3.6454147764095914, + "grad_norm": 0.06689809262752533, + "learning_rate": 1.969935640667897e-05, + "loss": 0.2213, + "step": 44999 + }, + { + "epoch": 3.64549578742709, + "grad_norm": 0.07541733980178833, + "learning_rate": 1.9694855754084343e-05, + "loss": 0.2074, + "step": 45000 + }, + { + "epoch": 3.6455767984445884, + "grad_norm": 0.06847432255744934, + "learning_rate": 1.9690355101489716e-05, + "loss": 0.2044, + "step": 45001 + }, + { + "epoch": 3.6456578094620866, + "grad_norm": 0.0679175928235054, + "learning_rate": 1.968585444889509e-05, + "loss": 0.22, + "step": 45002 + }, + { + "epoch": 3.6457388204795853, + "grad_norm": 0.07706435024738312, + "learning_rate": 1.9681353796300464e-05, + "loss": 0.199, + "step": 45003 + }, + { + "epoch": 3.6458198314970836, + "grad_norm": 0.07787571847438812, + "learning_rate": 1.967685314370584e-05, + "loss": 0.2034, + "step": 45004 + }, + { + "epoch": 3.645900842514582, + "grad_norm": 0.0687023401260376, + "learning_rate": 1.967235249111121e-05, + "loss": 0.2242, + "step": 45005 + }, + { + "epoch": 3.6459818535320805, + "grad_norm": 0.07250736653804779, + "learning_rate": 1.9667851838516584e-05, + "loss": 0.2299, + "step": 45006 + }, + { + "epoch": 3.6460628645495787, + "grad_norm": 0.05602612718939781, + "learning_rate": 1.966335118592196e-05, + "loss": 0.1778, + "step": 45007 + }, + { + "epoch": 3.646143875567077, + "grad_norm": 0.0714431181550026, + "learning_rate": 1.965885053332733e-05, + "loss": 0.23, + "step": 45008 + }, + { + "epoch": 3.6462248865845757, + "grad_norm": 0.06440123170614243, + "learning_rate": 1.9654349880732705e-05, + "loss": 0.2232, + "step": 45009 + }, + { + "epoch": 3.646305897602074, + "grad_norm": 0.06279275566339493, + "learning_rate": 1.9649849228138082e-05, + "loss": 0.2395, + "step": 45010 + }, + { + "epoch": 3.646386908619572, + "grad_norm": 0.06747173517942429, + "learning_rate": 1.9645348575543452e-05, + "loss": 0.2352, + "step": 45011 + }, + { + "epoch": 3.646467919637071, + "grad_norm": 0.06958412379026413, + "learning_rate": 1.964084792294883e-05, + "loss": 0.2271, + "step": 45012 + }, + { + "epoch": 3.646548930654569, + "grad_norm": 0.08571343868970871, + "learning_rate": 1.9636347270354203e-05, + "loss": 0.2349, + "step": 45013 + }, + { + "epoch": 3.6466299416720673, + "grad_norm": 0.06197122111916542, + "learning_rate": 1.9631846617759577e-05, + "loss": 0.2058, + "step": 45014 + }, + { + "epoch": 3.646710952689566, + "grad_norm": 0.06550417840480804, + "learning_rate": 1.962734596516495e-05, + "loss": 0.229, + "step": 45015 + }, + { + 
"epoch": 3.6467919637070643, + "grad_norm": 0.0715557187795639, + "learning_rate": 1.9622845312570324e-05, + "loss": 0.2679, + "step": 45016 + }, + { + "epoch": 3.6468729747245625, + "grad_norm": 0.06895950436592102, + "learning_rate": 1.9618344659975697e-05, + "loss": 0.1977, + "step": 45017 + }, + { + "epoch": 3.6469539857420608, + "grad_norm": 0.06980966776609421, + "learning_rate": 1.961384400738107e-05, + "loss": 0.249, + "step": 45018 + }, + { + "epoch": 3.6470349967595594, + "grad_norm": 0.06876849383115768, + "learning_rate": 1.9609343354786445e-05, + "loss": 0.2313, + "step": 45019 + }, + { + "epoch": 3.6471160077770577, + "grad_norm": 0.07544594258069992, + "learning_rate": 1.9604842702191818e-05, + "loss": 0.2278, + "step": 45020 + }, + { + "epoch": 3.647197018794556, + "grad_norm": 0.08122597634792328, + "learning_rate": 1.9600342049597192e-05, + "loss": 0.2362, + "step": 45021 + }, + { + "epoch": 3.647278029812054, + "grad_norm": 0.06714284420013428, + "learning_rate": 1.9595841397002565e-05, + "loss": 0.1946, + "step": 45022 + }, + { + "epoch": 3.647359040829553, + "grad_norm": 0.06398520618677139, + "learning_rate": 1.959134074440794e-05, + "loss": 0.2302, + "step": 45023 + }, + { + "epoch": 3.647440051847051, + "grad_norm": 0.07198784500360489, + "learning_rate": 1.9586840091813313e-05, + "loss": 0.2032, + "step": 45024 + }, + { + "epoch": 3.6475210628645494, + "grad_norm": 0.06664520502090454, + "learning_rate": 1.958233943921869e-05, + "loss": 0.2328, + "step": 45025 + }, + { + "epoch": 3.647602073882048, + "grad_norm": 0.06397178024053574, + "learning_rate": 1.957783878662406e-05, + "loss": 0.213, + "step": 45026 + }, + { + "epoch": 3.6476830848995463, + "grad_norm": 0.058583084493875504, + "learning_rate": 1.9573338134029433e-05, + "loss": 0.203, + "step": 45027 + }, + { + "epoch": 3.6477640959170445, + "grad_norm": 0.06231728196144104, + "learning_rate": 1.956883748143481e-05, + "loss": 0.2317, + "step": 45028 + }, + { + "epoch": 3.6478451069345432, + "grad_norm": 0.08897008001804352, + "learning_rate": 1.9564336828840184e-05, + "loss": 0.2107, + "step": 45029 + }, + { + "epoch": 3.6479261179520415, + "grad_norm": 0.08053074032068253, + "learning_rate": 1.9559836176245558e-05, + "loss": 0.1974, + "step": 45030 + }, + { + "epoch": 3.6480071289695397, + "grad_norm": 0.07111253589391708, + "learning_rate": 1.955533552365093e-05, + "loss": 0.2069, + "step": 45031 + }, + { + "epoch": 3.6480881399870384, + "grad_norm": 0.07549881190061569, + "learning_rate": 1.9550834871056305e-05, + "loss": 0.2328, + "step": 45032 + }, + { + "epoch": 3.6481691510045366, + "grad_norm": 0.058063820004463196, + "learning_rate": 1.954633421846168e-05, + "loss": 0.1701, + "step": 45033 + }, + { + "epoch": 3.648250162022035, + "grad_norm": 0.09114842861890793, + "learning_rate": 1.9541833565867052e-05, + "loss": 0.2278, + "step": 45034 + }, + { + "epoch": 3.6483311730395336, + "grad_norm": 0.08632674813270569, + "learning_rate": 1.9537332913272426e-05, + "loss": 0.2388, + "step": 45035 + }, + { + "epoch": 3.648412184057032, + "grad_norm": 0.07076793164014816, + "learning_rate": 1.95328322606778e-05, + "loss": 0.2131, + "step": 45036 + }, + { + "epoch": 3.64849319507453, + "grad_norm": 0.08358518034219742, + "learning_rate": 1.9528331608083173e-05, + "loss": 0.2379, + "step": 45037 + }, + { + "epoch": 3.6485742060920288, + "grad_norm": 0.058723390102386475, + "learning_rate": 1.9523830955488546e-05, + "loss": 0.2178, + "step": 45038 + }, + { + "epoch": 3.648655217109527, + "grad_norm": 
0.09624432027339935, + "learning_rate": 1.951933030289392e-05, + "loss": 0.2327, + "step": 45039 + }, + { + "epoch": 3.6487362281270252, + "grad_norm": 0.0518026165664196, + "learning_rate": 1.9514829650299294e-05, + "loss": 0.22, + "step": 45040 + }, + { + "epoch": 3.6488172391445235, + "grad_norm": 0.07486674934625626, + "learning_rate": 1.9510328997704667e-05, + "loss": 0.1927, + "step": 45041 + }, + { + "epoch": 3.648898250162022, + "grad_norm": 0.0642525851726532, + "learning_rate": 1.950582834511004e-05, + "loss": 0.195, + "step": 45042 + }, + { + "epoch": 3.6489792611795204, + "grad_norm": 0.0849383994936943, + "learning_rate": 1.9501327692515418e-05, + "loss": 0.2345, + "step": 45043 + }, + { + "epoch": 3.6490602721970187, + "grad_norm": 0.05372621491551399, + "learning_rate": 1.9496827039920788e-05, + "loss": 0.2072, + "step": 45044 + }, + { + "epoch": 3.649141283214517, + "grad_norm": 0.07009585201740265, + "learning_rate": 1.9492326387326162e-05, + "loss": 0.2236, + "step": 45045 + }, + { + "epoch": 3.6492222942320156, + "grad_norm": 0.07817225158214569, + "learning_rate": 1.948782573473154e-05, + "loss": 0.1953, + "step": 45046 + }, + { + "epoch": 3.649303305249514, + "grad_norm": 0.06566929817199707, + "learning_rate": 1.9483325082136912e-05, + "loss": 0.2251, + "step": 45047 + }, + { + "epoch": 3.649384316267012, + "grad_norm": 0.0773925706744194, + "learning_rate": 1.9478824429542283e-05, + "loss": 0.2595, + "step": 45048 + }, + { + "epoch": 3.649465327284511, + "grad_norm": 0.08280571550130844, + "learning_rate": 1.947432377694766e-05, + "loss": 0.234, + "step": 45049 + }, + { + "epoch": 3.649546338302009, + "grad_norm": 0.060975659638643265, + "learning_rate": 1.9469823124353033e-05, + "loss": 0.175, + "step": 45050 + }, + { + "epoch": 3.6496273493195073, + "grad_norm": 0.0727422684431076, + "learning_rate": 1.9465322471758407e-05, + "loss": 0.2361, + "step": 45051 + }, + { + "epoch": 3.649708360337006, + "grad_norm": 0.06734203547239304, + "learning_rate": 1.946082181916378e-05, + "loss": 0.2283, + "step": 45052 + }, + { + "epoch": 3.649789371354504, + "grad_norm": 0.0899474248290062, + "learning_rate": 1.9456321166569154e-05, + "loss": 0.2251, + "step": 45053 + }, + { + "epoch": 3.6498703823720025, + "grad_norm": 0.05988341569900513, + "learning_rate": 1.9451820513974528e-05, + "loss": 0.2213, + "step": 45054 + }, + { + "epoch": 3.649951393389501, + "grad_norm": 0.0768548995256424, + "learning_rate": 1.94473198613799e-05, + "loss": 0.2439, + "step": 45055 + }, + { + "epoch": 3.6500324044069994, + "grad_norm": 0.0809468999505043, + "learning_rate": 1.9442819208785275e-05, + "loss": 0.2152, + "step": 45056 + }, + { + "epoch": 3.6501134154244976, + "grad_norm": 0.08236753940582275, + "learning_rate": 1.943831855619065e-05, + "loss": 0.234, + "step": 45057 + }, + { + "epoch": 3.6501944264419963, + "grad_norm": 0.05949265882372856, + "learning_rate": 1.9433817903596022e-05, + "loss": 0.1956, + "step": 45058 + }, + { + "epoch": 3.6502754374594946, + "grad_norm": 0.07996305078268051, + "learning_rate": 1.9429317251001396e-05, + "loss": 0.2295, + "step": 45059 + }, + { + "epoch": 3.650356448476993, + "grad_norm": 0.0879431888461113, + "learning_rate": 1.942481659840677e-05, + "loss": 0.2368, + "step": 45060 + }, + { + "epoch": 3.6504374594944915, + "grad_norm": 0.09156335890293121, + "learning_rate": 1.9420315945812143e-05, + "loss": 0.2713, + "step": 45061 + }, + { + "epoch": 3.6505184705119897, + "grad_norm": 0.08023006469011307, + "learning_rate": 1.9415815293217516e-05, + 
"loss": 0.2367, + "step": 45062 + }, + { + "epoch": 3.650599481529488, + "grad_norm": 0.054662078619003296, + "learning_rate": 1.941131464062289e-05, + "loss": 0.1989, + "step": 45063 + }, + { + "epoch": 3.6506804925469862, + "grad_norm": 0.08293486386537552, + "learning_rate": 1.9406813988028267e-05, + "loss": 0.2082, + "step": 45064 + }, + { + "epoch": 3.650761503564485, + "grad_norm": 0.06276928633451462, + "learning_rate": 1.940231333543364e-05, + "loss": 0.2343, + "step": 45065 + }, + { + "epoch": 3.650842514581983, + "grad_norm": 0.0665750503540039, + "learning_rate": 1.939781268283901e-05, + "loss": 0.2015, + "step": 45066 + }, + { + "epoch": 3.6509235255994814, + "grad_norm": 0.08064727485179901, + "learning_rate": 1.9393312030244388e-05, + "loss": 0.2151, + "step": 45067 + }, + { + "epoch": 3.6510045366169797, + "grad_norm": 0.08324388414621353, + "learning_rate": 1.938881137764976e-05, + "loss": 0.2388, + "step": 45068 + }, + { + "epoch": 3.6510855476344783, + "grad_norm": 0.0781935527920723, + "learning_rate": 1.938431072505513e-05, + "loss": 0.2498, + "step": 45069 + }, + { + "epoch": 3.6511665586519766, + "grad_norm": 0.07621213793754578, + "learning_rate": 1.937981007246051e-05, + "loss": 0.1805, + "step": 45070 + }, + { + "epoch": 3.651247569669475, + "grad_norm": 0.08255457133054733, + "learning_rate": 1.9375309419865882e-05, + "loss": 0.2374, + "step": 45071 + }, + { + "epoch": 3.6513285806869735, + "grad_norm": 0.07902143150568008, + "learning_rate": 1.9370808767271256e-05, + "loss": 0.2346, + "step": 45072 + }, + { + "epoch": 3.6514095917044718, + "grad_norm": 0.09876828640699387, + "learning_rate": 1.936630811467663e-05, + "loss": 0.2745, + "step": 45073 + }, + { + "epoch": 3.65149060272197, + "grad_norm": 0.05991111323237419, + "learning_rate": 1.9361807462082003e-05, + "loss": 0.2077, + "step": 45074 + }, + { + "epoch": 3.6515716137394687, + "grad_norm": 0.07003296166658401, + "learning_rate": 1.9357306809487377e-05, + "loss": 0.2011, + "step": 45075 + }, + { + "epoch": 3.651652624756967, + "grad_norm": 0.07214841991662979, + "learning_rate": 1.935280615689275e-05, + "loss": 0.2229, + "step": 45076 + }, + { + "epoch": 3.651733635774465, + "grad_norm": 0.07503457367420197, + "learning_rate": 1.9348305504298124e-05, + "loss": 0.2282, + "step": 45077 + }, + { + "epoch": 3.651814646791964, + "grad_norm": 0.08666056394577026, + "learning_rate": 1.9343804851703497e-05, + "loss": 0.2401, + "step": 45078 + }, + { + "epoch": 3.651895657809462, + "grad_norm": 0.0745568796992302, + "learning_rate": 1.933930419910887e-05, + "loss": 0.2199, + "step": 45079 + }, + { + "epoch": 3.6519766688269604, + "grad_norm": 0.06492049247026443, + "learning_rate": 1.9334803546514248e-05, + "loss": 0.2247, + "step": 45080 + }, + { + "epoch": 3.652057679844459, + "grad_norm": 0.07261046767234802, + "learning_rate": 1.9330302893919618e-05, + "loss": 0.2182, + "step": 45081 + }, + { + "epoch": 3.6521386908619573, + "grad_norm": 0.07351624965667725, + "learning_rate": 1.9325802241324992e-05, + "loss": 0.2581, + "step": 45082 + }, + { + "epoch": 3.6522197018794555, + "grad_norm": 0.06611928343772888, + "learning_rate": 1.932130158873037e-05, + "loss": 0.2083, + "step": 45083 + }, + { + "epoch": 3.6523007128969542, + "grad_norm": 0.06513310968875885, + "learning_rate": 1.931680093613574e-05, + "loss": 0.2314, + "step": 45084 + }, + { + "epoch": 3.6523817239144525, + "grad_norm": 0.0680699497461319, + "learning_rate": 1.9312300283541116e-05, + "loss": 0.2342, + "step": 45085 + }, + { + "epoch": 
3.6524627349319507, + "grad_norm": 0.07630762457847595, + "learning_rate": 1.930779963094649e-05, + "loss": 0.2275, + "step": 45086 + }, + { + "epoch": 3.652543745949449, + "grad_norm": 0.06224377825856209, + "learning_rate": 1.930329897835186e-05, + "loss": 0.2298, + "step": 45087 + }, + { + "epoch": 3.6526247569669477, + "grad_norm": 0.06828049570322037, + "learning_rate": 1.9298798325757237e-05, + "loss": 0.2415, + "step": 45088 + }, + { + "epoch": 3.652705767984446, + "grad_norm": 0.07400552183389664, + "learning_rate": 1.929429767316261e-05, + "loss": 0.1922, + "step": 45089 + }, + { + "epoch": 3.652786779001944, + "grad_norm": 0.0771031454205513, + "learning_rate": 1.9289797020567984e-05, + "loss": 0.2652, + "step": 45090 + }, + { + "epoch": 3.6528677900194424, + "grad_norm": 0.06022996827960014, + "learning_rate": 1.9285296367973358e-05, + "loss": 0.2384, + "step": 45091 + }, + { + "epoch": 3.652948801036941, + "grad_norm": 0.06557485461235046, + "learning_rate": 1.928079571537873e-05, + "loss": 0.1857, + "step": 45092 + }, + { + "epoch": 3.6530298120544393, + "grad_norm": 0.06742022931575775, + "learning_rate": 1.9276295062784105e-05, + "loss": 0.1977, + "step": 45093 + }, + { + "epoch": 3.6531108230719376, + "grad_norm": 0.05643368512392044, + "learning_rate": 1.927179441018948e-05, + "loss": 0.2088, + "step": 45094 + }, + { + "epoch": 3.6531918340894363, + "grad_norm": 0.06223396584391594, + "learning_rate": 1.9267293757594852e-05, + "loss": 0.2098, + "step": 45095 + }, + { + "epoch": 3.6532728451069345, + "grad_norm": 0.06473883986473083, + "learning_rate": 1.9262793105000226e-05, + "loss": 0.1957, + "step": 45096 + }, + { + "epoch": 3.6533538561244328, + "grad_norm": 0.07481816411018372, + "learning_rate": 1.92582924524056e-05, + "loss": 0.2152, + "step": 45097 + }, + { + "epoch": 3.6534348671419314, + "grad_norm": 0.07294804602861404, + "learning_rate": 1.9253791799810976e-05, + "loss": 0.1953, + "step": 45098 + }, + { + "epoch": 3.6535158781594297, + "grad_norm": 0.08691900223493576, + "learning_rate": 1.9249291147216346e-05, + "loss": 0.2632, + "step": 45099 + }, + { + "epoch": 3.653596889176928, + "grad_norm": 0.07867283374071121, + "learning_rate": 1.924479049462172e-05, + "loss": 0.2454, + "step": 45100 + }, + { + "epoch": 3.6536779001944266, + "grad_norm": 0.07713471353054047, + "learning_rate": 1.9240289842027097e-05, + "loss": 0.2289, + "step": 45101 + }, + { + "epoch": 3.653758911211925, + "grad_norm": 0.06068172678351402, + "learning_rate": 1.9235789189432467e-05, + "loss": 0.2252, + "step": 45102 + }, + { + "epoch": 3.653839922229423, + "grad_norm": 0.07757342606782913, + "learning_rate": 1.9231288536837844e-05, + "loss": 0.2017, + "step": 45103 + }, + { + "epoch": 3.653920933246922, + "grad_norm": 0.07680576294660568, + "learning_rate": 1.9226787884243218e-05, + "loss": 0.2258, + "step": 45104 + }, + { + "epoch": 3.65400194426442, + "grad_norm": 0.056338656693696976, + "learning_rate": 1.9222287231648588e-05, + "loss": 0.2216, + "step": 45105 + }, + { + "epoch": 3.6540829552819183, + "grad_norm": 0.09292469918727875, + "learning_rate": 1.9217786579053965e-05, + "loss": 0.2781, + "step": 45106 + }, + { + "epoch": 3.654163966299417, + "grad_norm": 0.09539378434419632, + "learning_rate": 1.921328592645934e-05, + "loss": 0.2077, + "step": 45107 + }, + { + "epoch": 3.654244977316915, + "grad_norm": 0.06527606397867203, + "learning_rate": 1.920878527386471e-05, + "loss": 0.2205, + "step": 45108 + }, + { + "epoch": 3.6543259883344135, + "grad_norm": 0.07580121606588364, + 
"learning_rate": 1.9204284621270086e-05, + "loss": 0.2244, + "step": 45109 + }, + { + "epoch": 3.6544069993519117, + "grad_norm": 0.07603007555007935, + "learning_rate": 1.919978396867546e-05, + "loss": 0.1822, + "step": 45110 + }, + { + "epoch": 3.6544880103694104, + "grad_norm": 0.08321841061115265, + "learning_rate": 1.9195283316080833e-05, + "loss": 0.2211, + "step": 45111 + }, + { + "epoch": 3.6545690213869086, + "grad_norm": 0.07010239362716675, + "learning_rate": 1.9190782663486207e-05, + "loss": 0.1972, + "step": 45112 + }, + { + "epoch": 3.654650032404407, + "grad_norm": 0.07282821834087372, + "learning_rate": 1.918628201089158e-05, + "loss": 0.2093, + "step": 45113 + }, + { + "epoch": 3.654731043421905, + "grad_norm": 0.07374906539916992, + "learning_rate": 1.9181781358296954e-05, + "loss": 0.2246, + "step": 45114 + }, + { + "epoch": 3.654812054439404, + "grad_norm": 0.0666646808385849, + "learning_rate": 1.9177280705702327e-05, + "loss": 0.2218, + "step": 45115 + }, + { + "epoch": 3.654893065456902, + "grad_norm": 0.06913021951913834, + "learning_rate": 1.9172780053107704e-05, + "loss": 0.2354, + "step": 45116 + }, + { + "epoch": 3.6549740764744003, + "grad_norm": 0.06549743562936783, + "learning_rate": 1.9168279400513075e-05, + "loss": 0.2382, + "step": 45117 + }, + { + "epoch": 3.655055087491899, + "grad_norm": 0.07928057760000229, + "learning_rate": 1.9163778747918448e-05, + "loss": 0.208, + "step": 45118 + }, + { + "epoch": 3.6551360985093972, + "grad_norm": 0.07727041840553284, + "learning_rate": 1.9159278095323825e-05, + "loss": 0.1919, + "step": 45119 + }, + { + "epoch": 3.6552171095268955, + "grad_norm": 0.05975821986794472, + "learning_rate": 1.9154777442729196e-05, + "loss": 0.2016, + "step": 45120 + }, + { + "epoch": 3.655298120544394, + "grad_norm": 0.06781353056430817, + "learning_rate": 1.915027679013457e-05, + "loss": 0.1934, + "step": 45121 + }, + { + "epoch": 3.6553791315618924, + "grad_norm": 0.06860139220952988, + "learning_rate": 1.9145776137539946e-05, + "loss": 0.2195, + "step": 45122 + }, + { + "epoch": 3.6554601425793907, + "grad_norm": 0.07479297369718552, + "learning_rate": 1.9141275484945316e-05, + "loss": 0.264, + "step": 45123 + }, + { + "epoch": 3.6555411535968894, + "grad_norm": 0.05585840344429016, + "learning_rate": 1.9136774832350693e-05, + "loss": 0.2115, + "step": 45124 + }, + { + "epoch": 3.6556221646143876, + "grad_norm": 0.07544923573732376, + "learning_rate": 1.9132274179756067e-05, + "loss": 0.2128, + "step": 45125 + }, + { + "epoch": 3.655703175631886, + "grad_norm": 0.08912745863199234, + "learning_rate": 1.9127773527161437e-05, + "loss": 0.2231, + "step": 45126 + }, + { + "epoch": 3.6557841866493845, + "grad_norm": 0.0821533054113388, + "learning_rate": 1.9123272874566814e-05, + "loss": 0.232, + "step": 45127 + }, + { + "epoch": 3.655865197666883, + "grad_norm": 0.07527308911085129, + "learning_rate": 1.9118772221972188e-05, + "loss": 0.2264, + "step": 45128 + }, + { + "epoch": 3.655946208684381, + "grad_norm": 0.06770501285791397, + "learning_rate": 1.911427156937756e-05, + "loss": 0.2187, + "step": 45129 + }, + { + "epoch": 3.6560272197018797, + "grad_norm": 0.07356284558773041, + "learning_rate": 1.9109770916782935e-05, + "loss": 0.2079, + "step": 45130 + }, + { + "epoch": 3.656108230719378, + "grad_norm": 0.07901307195425034, + "learning_rate": 1.910527026418831e-05, + "loss": 0.2404, + "step": 45131 + }, + { + "epoch": 3.656189241736876, + "grad_norm": 0.07238209992647171, + "learning_rate": 1.9100769611593682e-05, + "loss": 
0.2047, + "step": 45132 + }, + { + "epoch": 3.6562702527543745, + "grad_norm": 0.07739842683076859, + "learning_rate": 1.9096268958999056e-05, + "loss": 0.2203, + "step": 45133 + }, + { + "epoch": 3.656351263771873, + "grad_norm": 0.07297014445066452, + "learning_rate": 1.909176830640443e-05, + "loss": 0.215, + "step": 45134 + }, + { + "epoch": 3.6564322747893714, + "grad_norm": 0.07107474654912949, + "learning_rate": 1.9087267653809803e-05, + "loss": 0.2489, + "step": 45135 + }, + { + "epoch": 3.6565132858068696, + "grad_norm": 0.07162254303693771, + "learning_rate": 1.9082767001215177e-05, + "loss": 0.226, + "step": 45136 + }, + { + "epoch": 3.656594296824368, + "grad_norm": 0.08277851343154907, + "learning_rate": 1.9078266348620554e-05, + "loss": 0.2264, + "step": 45137 + }, + { + "epoch": 3.6566753078418666, + "grad_norm": 0.07660046964883804, + "learning_rate": 1.9073765696025924e-05, + "loss": 0.2185, + "step": 45138 + }, + { + "epoch": 3.656756318859365, + "grad_norm": 0.05719228461384773, + "learning_rate": 1.9069265043431297e-05, + "loss": 0.1851, + "step": 45139 + }, + { + "epoch": 3.656837329876863, + "grad_norm": 0.07676658034324646, + "learning_rate": 1.9064764390836674e-05, + "loss": 0.2416, + "step": 45140 + }, + { + "epoch": 3.6569183408943617, + "grad_norm": 0.08541837334632874, + "learning_rate": 1.9060263738242045e-05, + "loss": 0.2255, + "step": 45141 + }, + { + "epoch": 3.65699935191186, + "grad_norm": 0.08627410978078842, + "learning_rate": 1.9055763085647418e-05, + "loss": 0.238, + "step": 45142 + }, + { + "epoch": 3.6570803629293582, + "grad_norm": 0.06201465427875519, + "learning_rate": 1.9051262433052795e-05, + "loss": 0.1987, + "step": 45143 + }, + { + "epoch": 3.657161373946857, + "grad_norm": 0.06707758456468582, + "learning_rate": 1.9046761780458165e-05, + "loss": 0.2494, + "step": 45144 + }, + { + "epoch": 3.657242384964355, + "grad_norm": 0.05820019543170929, + "learning_rate": 1.9042261127863542e-05, + "loss": 0.1994, + "step": 45145 + }, + { + "epoch": 3.6573233959818534, + "grad_norm": 0.0598413459956646, + "learning_rate": 1.9037760475268916e-05, + "loss": 0.2162, + "step": 45146 + }, + { + "epoch": 3.657404406999352, + "grad_norm": 0.06863729655742645, + "learning_rate": 1.9033259822674286e-05, + "loss": 0.2072, + "step": 45147 + }, + { + "epoch": 3.6574854180168503, + "grad_norm": 0.09531652182340622, + "learning_rate": 1.9028759170079663e-05, + "loss": 0.2424, + "step": 45148 + }, + { + "epoch": 3.6575664290343486, + "grad_norm": 0.10843674093484879, + "learning_rate": 1.9024258517485037e-05, + "loss": 0.2341, + "step": 45149 + }, + { + "epoch": 3.6576474400518473, + "grad_norm": 0.06694623082876205, + "learning_rate": 1.901975786489041e-05, + "loss": 0.232, + "step": 45150 + }, + { + "epoch": 3.6577284510693455, + "grad_norm": 0.07675240933895111, + "learning_rate": 1.9015257212295784e-05, + "loss": 0.2522, + "step": 45151 + }, + { + "epoch": 3.6578094620868438, + "grad_norm": 0.05678509548306465, + "learning_rate": 1.9010756559701158e-05, + "loss": 0.2597, + "step": 45152 + }, + { + "epoch": 3.6578904731043425, + "grad_norm": 0.06341982632875443, + "learning_rate": 1.900625590710653e-05, + "loss": 0.2404, + "step": 45153 + }, + { + "epoch": 3.6579714841218407, + "grad_norm": 0.0864272266626358, + "learning_rate": 1.9001755254511905e-05, + "loss": 0.2441, + "step": 45154 + }, + { + "epoch": 3.658052495139339, + "grad_norm": 0.06379323452711105, + "learning_rate": 1.899725460191728e-05, + "loss": 0.2627, + "step": 45155 + }, + { + "epoch": 
3.658133506156837, + "grad_norm": 0.06465981900691986, + "learning_rate": 1.8992753949322652e-05, + "loss": 0.2306, + "step": 45156 + }, + { + "epoch": 3.6582145171743354, + "grad_norm": 0.07326333969831467, + "learning_rate": 1.8988253296728026e-05, + "loss": 0.244, + "step": 45157 + }, + { + "epoch": 3.658295528191834, + "grad_norm": 0.07888638228178024, + "learning_rate": 1.8983752644133403e-05, + "loss": 0.2279, + "step": 45158 + }, + { + "epoch": 3.6583765392093324, + "grad_norm": 0.06322069466114044, + "learning_rate": 1.8979251991538773e-05, + "loss": 0.2152, + "step": 45159 + }, + { + "epoch": 3.6584575502268306, + "grad_norm": 0.08328167349100113, + "learning_rate": 1.8974751338944146e-05, + "loss": 0.2313, + "step": 45160 + }, + { + "epoch": 3.6585385612443293, + "grad_norm": 0.07311400771141052, + "learning_rate": 1.8970250686349523e-05, + "loss": 0.2291, + "step": 45161 + }, + { + "epoch": 3.6586195722618275, + "grad_norm": 0.07768473774194717, + "learning_rate": 1.8965750033754894e-05, + "loss": 0.2011, + "step": 45162 + }, + { + "epoch": 3.658700583279326, + "grad_norm": 0.07638784497976303, + "learning_rate": 1.896124938116027e-05, + "loss": 0.2685, + "step": 45163 + }, + { + "epoch": 3.6587815942968245, + "grad_norm": 0.07639626413583755, + "learning_rate": 1.8956748728565644e-05, + "loss": 0.2492, + "step": 45164 + }, + { + "epoch": 3.6588626053143227, + "grad_norm": 0.06662027537822723, + "learning_rate": 1.8952248075971014e-05, + "loss": 0.2351, + "step": 45165 + }, + { + "epoch": 3.658943616331821, + "grad_norm": 0.06432883441448212, + "learning_rate": 1.894774742337639e-05, + "loss": 0.2176, + "step": 45166 + }, + { + "epoch": 3.6590246273493197, + "grad_norm": 0.08156169950962067, + "learning_rate": 1.8943246770781765e-05, + "loss": 0.2194, + "step": 45167 + }, + { + "epoch": 3.659105638366818, + "grad_norm": 0.07922739535570145, + "learning_rate": 1.8938746118187135e-05, + "loss": 0.2315, + "step": 45168 + }, + { + "epoch": 3.659186649384316, + "grad_norm": 0.0675259456038475, + "learning_rate": 1.8934245465592512e-05, + "loss": 0.2337, + "step": 45169 + }, + { + "epoch": 3.659267660401815, + "grad_norm": 0.07149109244346619, + "learning_rate": 1.8929744812997886e-05, + "loss": 0.1945, + "step": 45170 + }, + { + "epoch": 3.659348671419313, + "grad_norm": 0.09225580841302872, + "learning_rate": 1.892524416040326e-05, + "loss": 0.2229, + "step": 45171 + }, + { + "epoch": 3.6594296824368113, + "grad_norm": 0.0599265918135643, + "learning_rate": 1.8920743507808633e-05, + "loss": 0.1962, + "step": 45172 + }, + { + "epoch": 3.65951069345431, + "grad_norm": 0.08336354047060013, + "learning_rate": 1.8916242855214007e-05, + "loss": 0.2145, + "step": 45173 + }, + { + "epoch": 3.6595917044718083, + "grad_norm": 0.08190027624368668, + "learning_rate": 1.891174220261938e-05, + "loss": 0.2341, + "step": 45174 + }, + { + "epoch": 3.6596727154893065, + "grad_norm": 0.08267060667276382, + "learning_rate": 1.8907241550024754e-05, + "loss": 0.2275, + "step": 45175 + }, + { + "epoch": 3.659753726506805, + "grad_norm": 0.06503156572580338, + "learning_rate": 1.890274089743013e-05, + "loss": 0.2179, + "step": 45176 + }, + { + "epoch": 3.6598347375243034, + "grad_norm": 0.08606370538473129, + "learning_rate": 1.88982402448355e-05, + "loss": 0.2286, + "step": 45177 + }, + { + "epoch": 3.6599157485418017, + "grad_norm": 0.06882736086845398, + "learning_rate": 1.8893739592240875e-05, + "loss": 0.2437, + "step": 45178 + }, + { + "epoch": 3.6599967595593, + "grad_norm": 0.06398408859968185, + 
"learning_rate": 1.888923893964625e-05, + "loss": 0.1888, + "step": 45179 + }, + { + "epoch": 3.660077770576798, + "grad_norm": 0.06924501061439514, + "learning_rate": 1.8884738287051622e-05, + "loss": 0.2021, + "step": 45180 + }, + { + "epoch": 3.660158781594297, + "grad_norm": 0.07223118096590042, + "learning_rate": 1.8880237634456995e-05, + "loss": 0.2387, + "step": 45181 + }, + { + "epoch": 3.660239792611795, + "grad_norm": 0.07276555150747299, + "learning_rate": 1.8875736981862372e-05, + "loss": 0.2485, + "step": 45182 + }, + { + "epoch": 3.6603208036292934, + "grad_norm": 0.06627250462770462, + "learning_rate": 1.8871236329267743e-05, + "loss": 0.2028, + "step": 45183 + }, + { + "epoch": 3.660401814646792, + "grad_norm": 0.07248107343912125, + "learning_rate": 1.886673567667312e-05, + "loss": 0.2226, + "step": 45184 + }, + { + "epoch": 3.6604828256642903, + "grad_norm": 0.0946134552359581, + "learning_rate": 1.8862235024078493e-05, + "loss": 0.2456, + "step": 45185 + }, + { + "epoch": 3.6605638366817885, + "grad_norm": 0.08283112943172455, + "learning_rate": 1.8857734371483863e-05, + "loss": 0.2349, + "step": 45186 + }, + { + "epoch": 3.660644847699287, + "grad_norm": 0.0703137218952179, + "learning_rate": 1.885323371888924e-05, + "loss": 0.2016, + "step": 45187 + }, + { + "epoch": 3.6607258587167855, + "grad_norm": 0.0719614326953888, + "learning_rate": 1.8848733066294614e-05, + "loss": 0.2342, + "step": 45188 + }, + { + "epoch": 3.6608068697342837, + "grad_norm": 0.07138366997241974, + "learning_rate": 1.8844232413699988e-05, + "loss": 0.225, + "step": 45189 + }, + { + "epoch": 3.6608878807517824, + "grad_norm": 0.10369545966386795, + "learning_rate": 1.883973176110536e-05, + "loss": 0.239, + "step": 45190 + }, + { + "epoch": 3.6609688917692806, + "grad_norm": 0.06715277582406998, + "learning_rate": 1.8835231108510735e-05, + "loss": 0.2064, + "step": 45191 + }, + { + "epoch": 3.661049902786779, + "grad_norm": 0.07604720443487167, + "learning_rate": 1.883073045591611e-05, + "loss": 0.2189, + "step": 45192 + }, + { + "epoch": 3.6611309138042776, + "grad_norm": 0.08600092679262161, + "learning_rate": 1.8826229803321482e-05, + "loss": 0.245, + "step": 45193 + }, + { + "epoch": 3.661211924821776, + "grad_norm": 0.07176908105611801, + "learning_rate": 1.8821729150726856e-05, + "loss": 0.2597, + "step": 45194 + }, + { + "epoch": 3.661292935839274, + "grad_norm": 0.08605284988880157, + "learning_rate": 1.881722849813223e-05, + "loss": 0.2165, + "step": 45195 + }, + { + "epoch": 3.6613739468567728, + "grad_norm": 0.0742536336183548, + "learning_rate": 1.8812727845537603e-05, + "loss": 0.2322, + "step": 45196 + }, + { + "epoch": 3.661454957874271, + "grad_norm": 0.07331538945436478, + "learning_rate": 1.880822719294298e-05, + "loss": 0.2215, + "step": 45197 + }, + { + "epoch": 3.6615359688917692, + "grad_norm": 0.07890629023313522, + "learning_rate": 1.880372654034835e-05, + "loss": 0.233, + "step": 45198 + }, + { + "epoch": 3.661616979909268, + "grad_norm": 0.06850796937942505, + "learning_rate": 1.8799225887753724e-05, + "loss": 0.203, + "step": 45199 + }, + { + "epoch": 3.661697990926766, + "grad_norm": 0.06160985678434372, + "learning_rate": 1.87947252351591e-05, + "loss": 0.201, + "step": 45200 + }, + { + "epoch": 3.6617790019442644, + "grad_norm": 0.0723324865102768, + "learning_rate": 1.879022458256447e-05, + "loss": 0.206, + "step": 45201 + }, + { + "epoch": 3.6618600129617627, + "grad_norm": 0.07769309729337692, + "learning_rate": 1.8785723929969848e-05, + "loss": 0.2628, + "step": 
45202 + }, + { + "epoch": 3.661941023979261, + "grad_norm": 0.0801674947142601, + "learning_rate": 1.878122327737522e-05, + "loss": 0.2018, + "step": 45203 + }, + { + "epoch": 3.6620220349967596, + "grad_norm": 0.06425900757312775, + "learning_rate": 1.8776722624780592e-05, + "loss": 0.2094, + "step": 45204 + }, + { + "epoch": 3.662103046014258, + "grad_norm": 0.0725947692990303, + "learning_rate": 1.877222197218597e-05, + "loss": 0.2432, + "step": 45205 + }, + { + "epoch": 3.662184057031756, + "grad_norm": 0.07440198957920074, + "learning_rate": 1.8767721319591342e-05, + "loss": 0.2038, + "step": 45206 + }, + { + "epoch": 3.662265068049255, + "grad_norm": 0.07127074152231216, + "learning_rate": 1.8763220666996713e-05, + "loss": 0.214, + "step": 45207 + }, + { + "epoch": 3.662346079066753, + "grad_norm": 0.061125315725803375, + "learning_rate": 1.875872001440209e-05, + "loss": 0.1954, + "step": 45208 + }, + { + "epoch": 3.6624270900842513, + "grad_norm": 0.06937211006879807, + "learning_rate": 1.8754219361807463e-05, + "loss": 0.2045, + "step": 45209 + }, + { + "epoch": 3.66250810110175, + "grad_norm": 0.0728205218911171, + "learning_rate": 1.8749718709212837e-05, + "loss": 0.238, + "step": 45210 + }, + { + "epoch": 3.662589112119248, + "grad_norm": 0.09144295752048492, + "learning_rate": 1.874521805661821e-05, + "loss": 0.2327, + "step": 45211 + }, + { + "epoch": 3.6626701231367464, + "grad_norm": 0.0762929916381836, + "learning_rate": 1.8740717404023584e-05, + "loss": 0.2427, + "step": 45212 + }, + { + "epoch": 3.662751134154245, + "grad_norm": 0.07527792453765869, + "learning_rate": 1.8736216751428958e-05, + "loss": 0.2229, + "step": 45213 + }, + { + "epoch": 3.6628321451717434, + "grad_norm": 0.06261378526687622, + "learning_rate": 1.873171609883433e-05, + "loss": 0.2333, + "step": 45214 + }, + { + "epoch": 3.6629131561892416, + "grad_norm": 0.0688212513923645, + "learning_rate": 1.8727215446239708e-05, + "loss": 0.2074, + "step": 45215 + }, + { + "epoch": 3.6629941672067403, + "grad_norm": 0.07364526391029358, + "learning_rate": 1.872271479364508e-05, + "loss": 0.261, + "step": 45216 + }, + { + "epoch": 3.6630751782242386, + "grad_norm": 0.07287118583917618, + "learning_rate": 1.8718214141050452e-05, + "loss": 0.2361, + "step": 45217 + }, + { + "epoch": 3.663156189241737, + "grad_norm": 0.07098571956157684, + "learning_rate": 1.871371348845583e-05, + "loss": 0.2548, + "step": 45218 + }, + { + "epoch": 3.6632372002592355, + "grad_norm": 0.0735175609588623, + "learning_rate": 1.87092128358612e-05, + "loss": 0.2525, + "step": 45219 + }, + { + "epoch": 3.6633182112767337, + "grad_norm": 0.07706687599420547, + "learning_rate": 1.8704712183266573e-05, + "loss": 0.2814, + "step": 45220 + }, + { + "epoch": 3.663399222294232, + "grad_norm": 0.07573897391557693, + "learning_rate": 1.870021153067195e-05, + "loss": 0.2213, + "step": 45221 + }, + { + "epoch": 3.6634802333117307, + "grad_norm": 0.06459297239780426, + "learning_rate": 1.869571087807732e-05, + "loss": 0.2569, + "step": 45222 + }, + { + "epoch": 3.663561244329229, + "grad_norm": 0.06621073186397552, + "learning_rate": 1.8691210225482697e-05, + "loss": 0.2595, + "step": 45223 + }, + { + "epoch": 3.663642255346727, + "grad_norm": 0.07406672835350037, + "learning_rate": 1.868670957288807e-05, + "loss": 0.211, + "step": 45224 + }, + { + "epoch": 3.6637232663642254, + "grad_norm": 0.05716541036963463, + "learning_rate": 1.8682208920293444e-05, + "loss": 0.2225, + "step": 45225 + }, + { + "epoch": 3.6638042773817237, + "grad_norm": 
0.061790212988853455, + "learning_rate": 1.8677708267698818e-05, + "loss": 0.2116, + "step": 45226 + }, + { + "epoch": 3.6638852883992223, + "grad_norm": 0.06380802392959595, + "learning_rate": 1.867320761510419e-05, + "loss": 0.2551, + "step": 45227 + }, + { + "epoch": 3.6639662994167206, + "grad_norm": 0.06814054399728775, + "learning_rate": 1.8668706962509565e-05, + "loss": 0.2179, + "step": 45228 + }, + { + "epoch": 3.664047310434219, + "grad_norm": 0.08436217904090881, + "learning_rate": 1.866420630991494e-05, + "loss": 0.2342, + "step": 45229 + }, + { + "epoch": 3.6641283214517175, + "grad_norm": 0.07426043599843979, + "learning_rate": 1.8659705657320312e-05, + "loss": 0.2463, + "step": 45230 + }, + { + "epoch": 3.6642093324692158, + "grad_norm": 0.0674593448638916, + "learning_rate": 1.8655205004725686e-05, + "loss": 0.2012, + "step": 45231 + }, + { + "epoch": 3.664290343486714, + "grad_norm": 0.08109057694673538, + "learning_rate": 1.865070435213106e-05, + "loss": 0.2572, + "step": 45232 + }, + { + "epoch": 3.6643713545042127, + "grad_norm": 0.07163460552692413, + "learning_rate": 1.8646203699536433e-05, + "loss": 0.211, + "step": 45233 + }, + { + "epoch": 3.664452365521711, + "grad_norm": 0.07475652545690536, + "learning_rate": 1.8641703046941807e-05, + "loss": 0.2136, + "step": 45234 + }, + { + "epoch": 3.664533376539209, + "grad_norm": 0.07576707750558853, + "learning_rate": 1.863720239434718e-05, + "loss": 0.206, + "step": 45235 + }, + { + "epoch": 3.664614387556708, + "grad_norm": 0.067570261657238, + "learning_rate": 1.8632701741752557e-05, + "loss": 0.2316, + "step": 45236 + }, + { + "epoch": 3.664695398574206, + "grad_norm": 0.057533759623765945, + "learning_rate": 1.8628201089157927e-05, + "loss": 0.2106, + "step": 45237 + }, + { + "epoch": 3.6647764095917044, + "grad_norm": 0.0835278183221817, + "learning_rate": 1.86237004365633e-05, + "loss": 0.2861, + "step": 45238 + }, + { + "epoch": 3.664857420609203, + "grad_norm": 0.07156947255134583, + "learning_rate": 1.8619199783968678e-05, + "loss": 0.2521, + "step": 45239 + }, + { + "epoch": 3.6649384316267013, + "grad_norm": 0.06274692714214325, + "learning_rate": 1.8614699131374048e-05, + "loss": 0.2358, + "step": 45240 + }, + { + "epoch": 3.6650194426441995, + "grad_norm": 0.07600396871566772, + "learning_rate": 1.8610198478779422e-05, + "loss": 0.2267, + "step": 45241 + }, + { + "epoch": 3.6651004536616982, + "grad_norm": 0.08536671102046967, + "learning_rate": 1.86056978261848e-05, + "loss": 0.2078, + "step": 45242 + }, + { + "epoch": 3.6651814646791965, + "grad_norm": 0.08486076444387436, + "learning_rate": 1.8601197173590172e-05, + "loss": 0.2428, + "step": 45243 + }, + { + "epoch": 3.6652624756966947, + "grad_norm": 0.0657820925116539, + "learning_rate": 1.8596696520995546e-05, + "loss": 0.2172, + "step": 45244 + }, + { + "epoch": 3.665343486714193, + "grad_norm": 0.08165675401687622, + "learning_rate": 1.859219586840092e-05, + "loss": 0.2448, + "step": 45245 + }, + { + "epoch": 3.6654244977316917, + "grad_norm": 0.07033119350671768, + "learning_rate": 1.8587695215806293e-05, + "loss": 0.2219, + "step": 45246 + }, + { + "epoch": 3.66550550874919, + "grad_norm": 0.07771047949790955, + "learning_rate": 1.8583194563211667e-05, + "loss": 0.252, + "step": 45247 + }, + { + "epoch": 3.665586519766688, + "grad_norm": 0.06715802103281021, + "learning_rate": 1.857869391061704e-05, + "loss": 0.2521, + "step": 45248 + }, + { + "epoch": 3.6656675307841864, + "grad_norm": 0.088836170732975, + "learning_rate": 1.8574193258022414e-05, 
+ "loss": 0.2596, + "step": 45249 + }, + { + "epoch": 3.665748541801685, + "grad_norm": 0.06336906552314758, + "learning_rate": 1.8569692605427788e-05, + "loss": 0.2287, + "step": 45250 + }, + { + "epoch": 3.6658295528191833, + "grad_norm": 0.07023075222969055, + "learning_rate": 1.856519195283316e-05, + "loss": 0.2233, + "step": 45251 + }, + { + "epoch": 3.6659105638366816, + "grad_norm": 0.05971920117735863, + "learning_rate": 1.8560691300238535e-05, + "loss": 0.1959, + "step": 45252 + }, + { + "epoch": 3.6659915748541803, + "grad_norm": 0.06513191014528275, + "learning_rate": 1.855619064764391e-05, + "loss": 0.2232, + "step": 45253 + }, + { + "epoch": 3.6660725858716785, + "grad_norm": 0.08198903501033783, + "learning_rate": 1.8551689995049282e-05, + "loss": 0.1973, + "step": 45254 + }, + { + "epoch": 3.6661535968891767, + "grad_norm": 0.056683849543333054, + "learning_rate": 1.8547189342454656e-05, + "loss": 0.2219, + "step": 45255 + }, + { + "epoch": 3.6662346079066754, + "grad_norm": 0.08191985636949539, + "learning_rate": 1.854268868986003e-05, + "loss": 0.2283, + "step": 45256 + }, + { + "epoch": 3.6663156189241737, + "grad_norm": 0.08315394073724747, + "learning_rate": 1.8538188037265406e-05, + "loss": 0.2311, + "step": 45257 + }, + { + "epoch": 3.666396629941672, + "grad_norm": 0.06751669198274612, + "learning_rate": 1.8533687384670776e-05, + "loss": 0.2159, + "step": 45258 + }, + { + "epoch": 3.6664776409591706, + "grad_norm": 0.06708385795354843, + "learning_rate": 1.852918673207615e-05, + "loss": 0.203, + "step": 45259 + }, + { + "epoch": 3.666558651976669, + "grad_norm": 0.07147734612226486, + "learning_rate": 1.8524686079481527e-05, + "loss": 0.2724, + "step": 45260 + }, + { + "epoch": 3.666639662994167, + "grad_norm": 0.06914891302585602, + "learning_rate": 1.85201854268869e-05, + "loss": 0.2364, + "step": 45261 + }, + { + "epoch": 3.666720674011666, + "grad_norm": 0.06511449813842773, + "learning_rate": 1.8515684774292274e-05, + "loss": 0.2261, + "step": 45262 + }, + { + "epoch": 3.666801685029164, + "grad_norm": 0.0676787719130516, + "learning_rate": 1.8511184121697648e-05, + "loss": 0.2022, + "step": 45263 + }, + { + "epoch": 3.6668826960466623, + "grad_norm": 0.09487228840589523, + "learning_rate": 1.850668346910302e-05, + "loss": 0.2466, + "step": 45264 + }, + { + "epoch": 3.666963707064161, + "grad_norm": 0.07777979969978333, + "learning_rate": 1.8502182816508395e-05, + "loss": 0.2491, + "step": 45265 + }, + { + "epoch": 3.667044718081659, + "grad_norm": 0.07166828215122223, + "learning_rate": 1.849768216391377e-05, + "loss": 0.1753, + "step": 45266 + }, + { + "epoch": 3.6671257290991575, + "grad_norm": 0.06455834209918976, + "learning_rate": 1.8493181511319142e-05, + "loss": 0.1964, + "step": 45267 + }, + { + "epoch": 3.6672067401166557, + "grad_norm": 0.07258601486682892, + "learning_rate": 1.8488680858724516e-05, + "loss": 0.2158, + "step": 45268 + }, + { + "epoch": 3.6672877511341544, + "grad_norm": 0.0714699998497963, + "learning_rate": 1.848418020612989e-05, + "loss": 0.2503, + "step": 45269 + }, + { + "epoch": 3.6673687621516526, + "grad_norm": 0.0788547545671463, + "learning_rate": 1.8479679553535263e-05, + "loss": 0.2433, + "step": 45270 + }, + { + "epoch": 3.667449773169151, + "grad_norm": 0.08070094883441925, + "learning_rate": 1.8475178900940637e-05, + "loss": 0.2471, + "step": 45271 + }, + { + "epoch": 3.667530784186649, + "grad_norm": 0.09129707515239716, + "learning_rate": 1.847067824834601e-05, + "loss": 0.2362, + "step": 45272 + }, + { + "epoch": 
3.667611795204148, + "grad_norm": 0.08472602814435959, + "learning_rate": 1.8466177595751384e-05, + "loss": 0.2054, + "step": 45273 + }, + { + "epoch": 3.667692806221646, + "grad_norm": 0.0719936415553093, + "learning_rate": 1.8461676943156758e-05, + "loss": 0.2079, + "step": 45274 + }, + { + "epoch": 3.6677738172391443, + "grad_norm": 0.07852540165185928, + "learning_rate": 1.8457176290562135e-05, + "loss": 0.2535, + "step": 45275 + }, + { + "epoch": 3.667854828256643, + "grad_norm": 0.08097584545612335, + "learning_rate": 1.8452675637967505e-05, + "loss": 0.2581, + "step": 45276 + }, + { + "epoch": 3.6679358392741412, + "grad_norm": 0.07683178037405014, + "learning_rate": 1.844817498537288e-05, + "loss": 0.2382, + "step": 45277 + }, + { + "epoch": 3.6680168502916395, + "grad_norm": 0.06702481955289841, + "learning_rate": 1.8443674332778255e-05, + "loss": 0.2282, + "step": 45278 + }, + { + "epoch": 3.668097861309138, + "grad_norm": 0.07527042180299759, + "learning_rate": 1.843917368018363e-05, + "loss": 0.186, + "step": 45279 + }, + { + "epoch": 3.6681788723266364, + "grad_norm": 0.06990735977888107, + "learning_rate": 1.8434673027589e-05, + "loss": 0.2354, + "step": 45280 + }, + { + "epoch": 3.6682598833441347, + "grad_norm": 0.08135319501161575, + "learning_rate": 1.8430172374994376e-05, + "loss": 0.228, + "step": 45281 + }, + { + "epoch": 3.6683408943616334, + "grad_norm": 0.07040378451347351, + "learning_rate": 1.842567172239975e-05, + "loss": 0.2158, + "step": 45282 + }, + { + "epoch": 3.6684219053791316, + "grad_norm": 0.09205630421638489, + "learning_rate": 1.8421171069805123e-05, + "loss": 0.2466, + "step": 45283 + }, + { + "epoch": 3.66850291639663, + "grad_norm": 0.06662517786026001, + "learning_rate": 1.8416670417210497e-05, + "loss": 0.2095, + "step": 45284 + }, + { + "epoch": 3.6685839274141285, + "grad_norm": 0.07559280097484589, + "learning_rate": 1.841216976461587e-05, + "loss": 0.2235, + "step": 45285 + }, + { + "epoch": 3.6686649384316268, + "grad_norm": 0.08209975808858871, + "learning_rate": 1.8407669112021244e-05, + "loss": 0.2599, + "step": 45286 + }, + { + "epoch": 3.668745949449125, + "grad_norm": 0.07545511424541473, + "learning_rate": 1.8403168459426618e-05, + "loss": 0.229, + "step": 45287 + }, + { + "epoch": 3.6688269604666237, + "grad_norm": 0.05978429690003395, + "learning_rate": 1.839866780683199e-05, + "loss": 0.2133, + "step": 45288 + }, + { + "epoch": 3.668907971484122, + "grad_norm": 0.07269906252622604, + "learning_rate": 1.8394167154237365e-05, + "loss": 0.2143, + "step": 45289 + }, + { + "epoch": 3.66898898250162, + "grad_norm": 0.0637771263718605, + "learning_rate": 1.838966650164274e-05, + "loss": 0.1974, + "step": 45290 + }, + { + "epoch": 3.6690699935191184, + "grad_norm": 0.08513235300779343, + "learning_rate": 1.8385165849048112e-05, + "loss": 0.2258, + "step": 45291 + }, + { + "epoch": 3.669151004536617, + "grad_norm": 0.08326667547225952, + "learning_rate": 1.8380665196453486e-05, + "loss": 0.2099, + "step": 45292 + }, + { + "epoch": 3.6692320155541154, + "grad_norm": 0.07560274749994278, + "learning_rate": 1.837616454385886e-05, + "loss": 0.2221, + "step": 45293 + }, + { + "epoch": 3.6693130265716136, + "grad_norm": 0.07517266273498535, + "learning_rate": 1.8371663891264236e-05, + "loss": 0.2181, + "step": 45294 + }, + { + "epoch": 3.669394037589112, + "grad_norm": 0.07194165140390396, + "learning_rate": 1.8367163238669607e-05, + "loss": 0.2318, + "step": 45295 + }, + { + "epoch": 3.6694750486066106, + "grad_norm": 0.07393741607666016, + 
"learning_rate": 1.8362662586074984e-05, + "loss": 0.2431, + "step": 45296 + }, + { + "epoch": 3.669556059624109, + "grad_norm": 0.0838971957564354, + "learning_rate": 1.8358161933480357e-05, + "loss": 0.2495, + "step": 45297 + }, + { + "epoch": 3.669637070641607, + "grad_norm": 0.08950010687112808, + "learning_rate": 1.8353661280885727e-05, + "loss": 0.2088, + "step": 45298 + }, + { + "epoch": 3.6697180816591057, + "grad_norm": 0.06360174715518951, + "learning_rate": 1.8349160628291104e-05, + "loss": 0.2234, + "step": 45299 + }, + { + "epoch": 3.669799092676604, + "grad_norm": 0.07285406440496445, + "learning_rate": 1.8344659975696478e-05, + "loss": 0.2441, + "step": 45300 + }, + { + "epoch": 3.6698801036941022, + "grad_norm": 0.06920921802520752, + "learning_rate": 1.834015932310185e-05, + "loss": 0.2242, + "step": 45301 + }, + { + "epoch": 3.669961114711601, + "grad_norm": 0.07369273155927658, + "learning_rate": 1.8335658670507225e-05, + "loss": 0.2412, + "step": 45302 + }, + { + "epoch": 3.670042125729099, + "grad_norm": 0.08306174725294113, + "learning_rate": 1.83311580179126e-05, + "loss": 0.2479, + "step": 45303 + }, + { + "epoch": 3.6701231367465974, + "grad_norm": 0.071095310151577, + "learning_rate": 1.8326657365317972e-05, + "loss": 0.2298, + "step": 45304 + }, + { + "epoch": 3.670204147764096, + "grad_norm": 0.07723124325275421, + "learning_rate": 1.8322156712723346e-05, + "loss": 0.2093, + "step": 45305 + }, + { + "epoch": 3.6702851587815943, + "grad_norm": 0.06528105586767197, + "learning_rate": 1.831765606012872e-05, + "loss": 0.227, + "step": 45306 + }, + { + "epoch": 3.6703661697990926, + "grad_norm": 0.07839982211589813, + "learning_rate": 1.8313155407534093e-05, + "loss": 0.1812, + "step": 45307 + }, + { + "epoch": 3.6704471808165913, + "grad_norm": 0.06877532601356506, + "learning_rate": 1.8308654754939467e-05, + "loss": 0.2531, + "step": 45308 + }, + { + "epoch": 3.6705281918340895, + "grad_norm": 0.06471942365169525, + "learning_rate": 1.830415410234484e-05, + "loss": 0.2218, + "step": 45309 + }, + { + "epoch": 3.6706092028515878, + "grad_norm": 0.06157661974430084, + "learning_rate": 1.8299653449750214e-05, + "loss": 0.2209, + "step": 45310 + }, + { + "epoch": 3.6706902138690864, + "grad_norm": 0.05959887430071831, + "learning_rate": 1.8295152797155588e-05, + "loss": 0.2317, + "step": 45311 + }, + { + "epoch": 3.6707712248865847, + "grad_norm": 0.09365382045507431, + "learning_rate": 1.8290652144560965e-05, + "loss": 0.2105, + "step": 45312 + }, + { + "epoch": 3.670852235904083, + "grad_norm": 0.059252869337797165, + "learning_rate": 1.8286151491966335e-05, + "loss": 0.2384, + "step": 45313 + }, + { + "epoch": 3.670933246921581, + "grad_norm": 0.07261160761117935, + "learning_rate": 1.828165083937171e-05, + "loss": 0.2129, + "step": 45314 + }, + { + "epoch": 3.67101425793908, + "grad_norm": 0.07140535116195679, + "learning_rate": 1.8277150186777085e-05, + "loss": 0.2484, + "step": 45315 + }, + { + "epoch": 3.671095268956578, + "grad_norm": 0.06605353951454163, + "learning_rate": 1.8272649534182456e-05, + "loss": 0.2345, + "step": 45316 + }, + { + "epoch": 3.6711762799740764, + "grad_norm": 0.0692841038107872, + "learning_rate": 1.8268148881587833e-05, + "loss": 0.2096, + "step": 45317 + }, + { + "epoch": 3.6712572909915746, + "grad_norm": 0.07458701729774475, + "learning_rate": 1.8263648228993206e-05, + "loss": 0.2243, + "step": 45318 + }, + { + "epoch": 3.6713383020090733, + "grad_norm": 0.07618386298418045, + "learning_rate": 1.8259147576398576e-05, + "loss": 
0.2843, + "step": 45319 + }, + { + "epoch": 3.6714193130265715, + "grad_norm": 0.06535560637712479, + "learning_rate": 1.8254646923803953e-05, + "loss": 0.2373, + "step": 45320 + }, + { + "epoch": 3.67150032404407, + "grad_norm": 0.07870413362979889, + "learning_rate": 1.8250146271209327e-05, + "loss": 0.2164, + "step": 45321 + }, + { + "epoch": 3.6715813350615685, + "grad_norm": 0.06687940657138824, + "learning_rate": 1.82456456186147e-05, + "loss": 0.2201, + "step": 45322 + }, + { + "epoch": 3.6716623460790667, + "grad_norm": 0.05875231698155403, + "learning_rate": 1.8241144966020074e-05, + "loss": 0.2444, + "step": 45323 + }, + { + "epoch": 3.671743357096565, + "grad_norm": 0.07024704664945602, + "learning_rate": 1.8236644313425448e-05, + "loss": 0.2528, + "step": 45324 + }, + { + "epoch": 3.6718243681140637, + "grad_norm": 0.07065678387880325, + "learning_rate": 1.823214366083082e-05, + "loss": 0.2109, + "step": 45325 + }, + { + "epoch": 3.671905379131562, + "grad_norm": 0.07673554122447968, + "learning_rate": 1.8227643008236195e-05, + "loss": 0.2085, + "step": 45326 + }, + { + "epoch": 3.67198639014906, + "grad_norm": 0.08821848034858704, + "learning_rate": 1.822314235564157e-05, + "loss": 0.266, + "step": 45327 + }, + { + "epoch": 3.672067401166559, + "grad_norm": 0.09151603281497955, + "learning_rate": 1.8218641703046942e-05, + "loss": 0.2364, + "step": 45328 + }, + { + "epoch": 3.672148412184057, + "grad_norm": 0.07013548910617828, + "learning_rate": 1.8214141050452316e-05, + "loss": 0.2041, + "step": 45329 + }, + { + "epoch": 3.6722294232015553, + "grad_norm": 0.07981032878160477, + "learning_rate": 1.8209640397857693e-05, + "loss": 0.2669, + "step": 45330 + }, + { + "epoch": 3.672310434219054, + "grad_norm": 0.0923733338713646, + "learning_rate": 1.8205139745263063e-05, + "loss": 0.1997, + "step": 45331 + }, + { + "epoch": 3.6723914452365523, + "grad_norm": 0.0668359100818634, + "learning_rate": 1.8200639092668437e-05, + "loss": 0.2247, + "step": 45332 + }, + { + "epoch": 3.6724724562540505, + "grad_norm": 0.05784869194030762, + "learning_rate": 1.8196138440073814e-05, + "loss": 0.244, + "step": 45333 + }, + { + "epoch": 3.672553467271549, + "grad_norm": 0.06569833308458328, + "learning_rate": 1.8191637787479184e-05, + "loss": 0.2083, + "step": 45334 + }, + { + "epoch": 3.6726344782890474, + "grad_norm": 0.07707594335079193, + "learning_rate": 1.818713713488456e-05, + "loss": 0.2344, + "step": 45335 + }, + { + "epoch": 3.6727154893065457, + "grad_norm": 0.06917215138673782, + "learning_rate": 1.8182636482289934e-05, + "loss": 0.1973, + "step": 45336 + }, + { + "epoch": 3.672796500324044, + "grad_norm": 0.061087533831596375, + "learning_rate": 1.8178135829695305e-05, + "loss": 0.233, + "step": 45337 + }, + { + "epoch": 3.6728775113415426, + "grad_norm": 0.07393475621938705, + "learning_rate": 1.817363517710068e-05, + "loss": 0.2184, + "step": 45338 + }, + { + "epoch": 3.672958522359041, + "grad_norm": 0.08241662383079529, + "learning_rate": 1.8169134524506055e-05, + "loss": 0.2612, + "step": 45339 + }, + { + "epoch": 3.673039533376539, + "grad_norm": 0.06417885422706604, + "learning_rate": 1.8164633871911426e-05, + "loss": 0.2379, + "step": 45340 + }, + { + "epoch": 3.6731205443940373, + "grad_norm": 0.07297766953706741, + "learning_rate": 1.8160133219316803e-05, + "loss": 0.2243, + "step": 45341 + }, + { + "epoch": 3.673201555411536, + "grad_norm": 0.07211649417877197, + "learning_rate": 1.8155632566722176e-05, + "loss": 0.2282, + "step": 45342 + }, + { + "epoch": 
3.6732825664290343, + "grad_norm": 0.07847045361995697, + "learning_rate": 1.815113191412755e-05, + "loss": 0.258, + "step": 45343 + }, + { + "epoch": 3.6733635774465325, + "grad_norm": 0.07077674567699432, + "learning_rate": 1.8146631261532923e-05, + "loss": 0.2415, + "step": 45344 + }, + { + "epoch": 3.673444588464031, + "grad_norm": 0.07668691873550415, + "learning_rate": 1.8142130608938297e-05, + "loss": 0.2154, + "step": 45345 + }, + { + "epoch": 3.6735255994815295, + "grad_norm": 0.066067636013031, + "learning_rate": 1.813762995634367e-05, + "loss": 0.2505, + "step": 45346 + }, + { + "epoch": 3.6736066104990277, + "grad_norm": 0.06294489651918411, + "learning_rate": 1.8133129303749044e-05, + "loss": 0.2048, + "step": 45347 + }, + { + "epoch": 3.6736876215165264, + "grad_norm": 0.07754607498645782, + "learning_rate": 1.812862865115442e-05, + "loss": 0.224, + "step": 45348 + }, + { + "epoch": 3.6737686325340246, + "grad_norm": 0.07878319174051285, + "learning_rate": 1.812412799855979e-05, + "loss": 0.2153, + "step": 45349 + }, + { + "epoch": 3.673849643551523, + "grad_norm": 0.07131267338991165, + "learning_rate": 1.8119627345965165e-05, + "loss": 0.2618, + "step": 45350 + }, + { + "epoch": 3.6739306545690216, + "grad_norm": 0.08097372204065323, + "learning_rate": 1.8115126693370542e-05, + "loss": 0.2629, + "step": 45351 + }, + { + "epoch": 3.67401166558652, + "grad_norm": 0.07331506907939911, + "learning_rate": 1.8110626040775912e-05, + "loss": 0.2477, + "step": 45352 + }, + { + "epoch": 3.674092676604018, + "grad_norm": 0.07750751078128815, + "learning_rate": 1.8106125388181286e-05, + "loss": 0.2604, + "step": 45353 + }, + { + "epoch": 3.6741736876215167, + "grad_norm": 0.06813794374465942, + "learning_rate": 1.8101624735586663e-05, + "loss": 0.2211, + "step": 45354 + }, + { + "epoch": 3.674254698639015, + "grad_norm": 0.06250642240047455, + "learning_rate": 1.8097124082992033e-05, + "loss": 0.2105, + "step": 45355 + }, + { + "epoch": 3.6743357096565132, + "grad_norm": 0.06140798330307007, + "learning_rate": 1.809262343039741e-05, + "loss": 0.2277, + "step": 45356 + }, + { + "epoch": 3.674416720674012, + "grad_norm": 0.061542950570583344, + "learning_rate": 1.8088122777802784e-05, + "loss": 0.208, + "step": 45357 + }, + { + "epoch": 3.67449773169151, + "grad_norm": 0.07049670815467834, + "learning_rate": 1.8083622125208154e-05, + "loss": 0.1901, + "step": 45358 + }, + { + "epoch": 3.6745787427090084, + "grad_norm": 0.08683442324399948, + "learning_rate": 1.807912147261353e-05, + "loss": 0.2543, + "step": 45359 + }, + { + "epoch": 3.6746597537265067, + "grad_norm": 0.060977209359407425, + "learning_rate": 1.8074620820018904e-05, + "loss": 0.2064, + "step": 45360 + }, + { + "epoch": 3.6747407647440054, + "grad_norm": 0.06796345859766006, + "learning_rate": 1.8070120167424278e-05, + "loss": 0.2111, + "step": 45361 + }, + { + "epoch": 3.6748217757615036, + "grad_norm": 0.06472915410995483, + "learning_rate": 1.806561951482965e-05, + "loss": 0.2397, + "step": 45362 + }, + { + "epoch": 3.674902786779002, + "grad_norm": 0.06002800911664963, + "learning_rate": 1.8061118862235025e-05, + "loss": 0.2346, + "step": 45363 + }, + { + "epoch": 3.6749837977965, + "grad_norm": 0.06987227499485016, + "learning_rate": 1.80566182096404e-05, + "loss": 0.1946, + "step": 45364 + }, + { + "epoch": 3.6750648088139988, + "grad_norm": 0.0703306794166565, + "learning_rate": 1.8052117557045772e-05, + "loss": 0.2144, + "step": 45365 + }, + { + "epoch": 3.675145819831497, + "grad_norm": 0.07858558744192123, + 
"learning_rate": 1.8047616904451146e-05, + "loss": 0.2181, + "step": 45366 + }, + { + "epoch": 3.6752268308489953, + "grad_norm": 0.0637904480099678, + "learning_rate": 1.804311625185652e-05, + "loss": 0.1904, + "step": 45367 + }, + { + "epoch": 3.675307841866494, + "grad_norm": 0.0818314254283905, + "learning_rate": 1.8038615599261893e-05, + "loss": 0.2519, + "step": 45368 + }, + { + "epoch": 3.675388852883992, + "grad_norm": 0.067131906747818, + "learning_rate": 1.803411494666727e-05, + "loss": 0.234, + "step": 45369 + }, + { + "epoch": 3.6754698639014904, + "grad_norm": 0.07786626368761063, + "learning_rate": 1.802961429407264e-05, + "loss": 0.2228, + "step": 45370 + }, + { + "epoch": 3.675550874918989, + "grad_norm": 0.0704607367515564, + "learning_rate": 1.8025113641478014e-05, + "loss": 0.2205, + "step": 45371 + }, + { + "epoch": 3.6756318859364874, + "grad_norm": 0.06771155446767807, + "learning_rate": 1.802061298888339e-05, + "loss": 0.1937, + "step": 45372 + }, + { + "epoch": 3.6757128969539856, + "grad_norm": 0.07659830898046494, + "learning_rate": 1.801611233628876e-05, + "loss": 0.2322, + "step": 45373 + }, + { + "epoch": 3.6757939079714843, + "grad_norm": 0.08131053298711777, + "learning_rate": 1.8011611683694138e-05, + "loss": 0.2694, + "step": 45374 + }, + { + "epoch": 3.6758749189889826, + "grad_norm": 0.06618554890155792, + "learning_rate": 1.8007111031099512e-05, + "loss": 0.2322, + "step": 45375 + }, + { + "epoch": 3.675955930006481, + "grad_norm": 0.07669448107481003, + "learning_rate": 1.8002610378504882e-05, + "loss": 0.229, + "step": 45376 + }, + { + "epoch": 3.6760369410239795, + "grad_norm": 0.07624157518148422, + "learning_rate": 1.799810972591026e-05, + "loss": 0.2206, + "step": 45377 + }, + { + "epoch": 3.6761179520414777, + "grad_norm": 0.06907004117965698, + "learning_rate": 1.7993609073315633e-05, + "loss": 0.2133, + "step": 45378 + }, + { + "epoch": 3.676198963058976, + "grad_norm": 0.09575460106134415, + "learning_rate": 1.7989108420721003e-05, + "loss": 0.2484, + "step": 45379 + }, + { + "epoch": 3.6762799740764747, + "grad_norm": 0.07262253761291504, + "learning_rate": 1.798460776812638e-05, + "loss": 0.2409, + "step": 45380 + }, + { + "epoch": 3.676360985093973, + "grad_norm": 0.06717146933078766, + "learning_rate": 1.7980107115531753e-05, + "loss": 0.2061, + "step": 45381 + }, + { + "epoch": 3.676441996111471, + "grad_norm": 0.0591311901807785, + "learning_rate": 1.7975606462937127e-05, + "loss": 0.2206, + "step": 45382 + }, + { + "epoch": 3.6765230071289694, + "grad_norm": 0.0643257200717926, + "learning_rate": 1.79711058103425e-05, + "loss": 0.2219, + "step": 45383 + }, + { + "epoch": 3.6766040181464676, + "grad_norm": 0.07790892571210861, + "learning_rate": 1.7966605157747874e-05, + "loss": 0.2244, + "step": 45384 + }, + { + "epoch": 3.6766850291639663, + "grad_norm": 0.08132940530776978, + "learning_rate": 1.7962104505153248e-05, + "loss": 0.2378, + "step": 45385 + }, + { + "epoch": 3.6767660401814646, + "grad_norm": 0.0682375505566597, + "learning_rate": 1.795760385255862e-05, + "loss": 0.2069, + "step": 45386 + }, + { + "epoch": 3.676847051198963, + "grad_norm": 0.0725887343287468, + "learning_rate": 1.7953103199963995e-05, + "loss": 0.2119, + "step": 45387 + }, + { + "epoch": 3.6769280622164615, + "grad_norm": 0.07341375201940536, + "learning_rate": 1.794860254736937e-05, + "loss": 0.1964, + "step": 45388 + }, + { + "epoch": 3.6770090732339598, + "grad_norm": 0.07751181721687317, + "learning_rate": 1.7944101894774742e-05, + "loss": 0.2544, + 
"step": 45389 + }, + { + "epoch": 3.677090084251458, + "grad_norm": 0.08196587860584259, + "learning_rate": 1.793960124218012e-05, + "loss": 0.2033, + "step": 45390 + }, + { + "epoch": 3.6771710952689567, + "grad_norm": 0.07888181507587433, + "learning_rate": 1.793510058958549e-05, + "loss": 0.2222, + "step": 45391 + }, + { + "epoch": 3.677252106286455, + "grad_norm": 0.07049840688705444, + "learning_rate": 1.7930599936990863e-05, + "loss": 0.2432, + "step": 45392 + }, + { + "epoch": 3.677333117303953, + "grad_norm": 0.07308931648731232, + "learning_rate": 1.792609928439624e-05, + "loss": 0.2591, + "step": 45393 + }, + { + "epoch": 3.677414128321452, + "grad_norm": 0.06875211000442505, + "learning_rate": 1.792159863180161e-05, + "loss": 0.2524, + "step": 45394 + }, + { + "epoch": 3.67749513933895, + "grad_norm": 0.0841333344578743, + "learning_rate": 1.7917097979206987e-05, + "loss": 0.2547, + "step": 45395 + }, + { + "epoch": 3.6775761503564484, + "grad_norm": 0.0642288327217102, + "learning_rate": 1.791259732661236e-05, + "loss": 0.2641, + "step": 45396 + }, + { + "epoch": 3.677657161373947, + "grad_norm": 0.07159867137670517, + "learning_rate": 1.790809667401773e-05, + "loss": 0.2248, + "step": 45397 + }, + { + "epoch": 3.6777381723914453, + "grad_norm": 0.06401391327381134, + "learning_rate": 1.7903596021423108e-05, + "loss": 0.2335, + "step": 45398 + }, + { + "epoch": 3.6778191834089435, + "grad_norm": 0.07349655032157898, + "learning_rate": 1.789909536882848e-05, + "loss": 0.245, + "step": 45399 + }, + { + "epoch": 3.6779001944264422, + "grad_norm": 0.08266490697860718, + "learning_rate": 1.7894594716233852e-05, + "loss": 0.2315, + "step": 45400 + }, + { + "epoch": 3.6779812054439405, + "grad_norm": 0.06793235242366791, + "learning_rate": 1.789009406363923e-05, + "loss": 0.2172, + "step": 45401 + }, + { + "epoch": 3.6780622164614387, + "grad_norm": 0.08288303017616272, + "learning_rate": 1.7885593411044602e-05, + "loss": 0.244, + "step": 45402 + }, + { + "epoch": 3.6781432274789374, + "grad_norm": 0.07585656642913818, + "learning_rate": 1.7881092758449976e-05, + "loss": 0.2338, + "step": 45403 + }, + { + "epoch": 3.6782242384964356, + "grad_norm": 0.07964345812797546, + "learning_rate": 1.787659210585535e-05, + "loss": 0.2392, + "step": 45404 + }, + { + "epoch": 3.678305249513934, + "grad_norm": 0.06705871969461441, + "learning_rate": 1.7872091453260723e-05, + "loss": 0.2142, + "step": 45405 + }, + { + "epoch": 3.678386260531432, + "grad_norm": 0.06813385337591171, + "learning_rate": 1.7867590800666097e-05, + "loss": 0.2099, + "step": 45406 + }, + { + "epoch": 3.6784672715489304, + "grad_norm": 0.08102632313966751, + "learning_rate": 1.786309014807147e-05, + "loss": 0.2217, + "step": 45407 + }, + { + "epoch": 3.678548282566429, + "grad_norm": 0.06885459274053574, + "learning_rate": 1.7858589495476847e-05, + "loss": 0.2094, + "step": 45408 + }, + { + "epoch": 3.6786292935839273, + "grad_norm": 0.06557776778936386, + "learning_rate": 1.7854088842882218e-05, + "loss": 0.2062, + "step": 45409 + }, + { + "epoch": 3.6787103046014256, + "grad_norm": 0.06176763400435448, + "learning_rate": 1.784958819028759e-05, + "loss": 0.2385, + "step": 45410 + }, + { + "epoch": 3.6787913156189243, + "grad_norm": 0.06020414084196091, + "learning_rate": 1.7845087537692968e-05, + "loss": 0.2034, + "step": 45411 + }, + { + "epoch": 3.6788723266364225, + "grad_norm": 0.09392343461513519, + "learning_rate": 1.784058688509834e-05, + "loss": 0.2268, + "step": 45412 + }, + { + "epoch": 3.6789533376539207, + 
"grad_norm": 0.06853241473436356, + "learning_rate": 1.7836086232503712e-05, + "loss": 0.2186, + "step": 45413 + }, + { + "epoch": 3.6790343486714194, + "grad_norm": 0.05744815245270729, + "learning_rate": 1.783158557990909e-05, + "loss": 0.2195, + "step": 45414 + }, + { + "epoch": 3.6791153596889177, + "grad_norm": 0.06485334038734436, + "learning_rate": 1.782708492731446e-05, + "loss": 0.1801, + "step": 45415 + }, + { + "epoch": 3.679196370706416, + "grad_norm": 0.06390785425901413, + "learning_rate": 1.7822584274719836e-05, + "loss": 0.2253, + "step": 45416 + }, + { + "epoch": 3.6792773817239146, + "grad_norm": 0.07828384637832642, + "learning_rate": 1.781808362212521e-05, + "loss": 0.2429, + "step": 45417 + }, + { + "epoch": 3.679358392741413, + "grad_norm": 0.06546802073717117, + "learning_rate": 1.781358296953058e-05, + "loss": 0.2563, + "step": 45418 + }, + { + "epoch": 3.679439403758911, + "grad_norm": 0.06929804384708405, + "learning_rate": 1.7809082316935957e-05, + "loss": 0.2069, + "step": 45419 + }, + { + "epoch": 3.67952041477641, + "grad_norm": 0.06710172444581985, + "learning_rate": 1.780458166434133e-05, + "loss": 0.1934, + "step": 45420 + }, + { + "epoch": 3.679601425793908, + "grad_norm": 0.08107573539018631, + "learning_rate": 1.7800081011746704e-05, + "loss": 0.2343, + "step": 45421 + }, + { + "epoch": 3.6796824368114063, + "grad_norm": 0.062125902622938156, + "learning_rate": 1.7795580359152078e-05, + "loss": 0.177, + "step": 45422 + }, + { + "epoch": 3.679763447828905, + "grad_norm": 0.07452069222927094, + "learning_rate": 1.779107970655745e-05, + "loss": 0.1993, + "step": 45423 + }, + { + "epoch": 3.679844458846403, + "grad_norm": 0.07609012722969055, + "learning_rate": 1.7786579053962825e-05, + "loss": 0.3101, + "step": 45424 + }, + { + "epoch": 3.6799254698639015, + "grad_norm": 0.06842630356550217, + "learning_rate": 1.77820784013682e-05, + "loss": 0.2318, + "step": 45425 + }, + { + "epoch": 3.6800064808814, + "grad_norm": 0.05769021809101105, + "learning_rate": 1.7777577748773572e-05, + "loss": 0.1899, + "step": 45426 + }, + { + "epoch": 3.6800874918988984, + "grad_norm": 0.0755428820848465, + "learning_rate": 1.7773077096178946e-05, + "loss": 0.2317, + "step": 45427 + }, + { + "epoch": 3.6801685029163966, + "grad_norm": 0.0577971413731575, + "learning_rate": 1.776857644358432e-05, + "loss": 0.1978, + "step": 45428 + }, + { + "epoch": 3.680249513933895, + "grad_norm": 0.0823320671916008, + "learning_rate": 1.7764075790989697e-05, + "loss": 0.2494, + "step": 45429 + }, + { + "epoch": 3.680330524951393, + "grad_norm": 0.0709446594119072, + "learning_rate": 1.7759575138395067e-05, + "loss": 0.2439, + "step": 45430 + }, + { + "epoch": 3.680411535968892, + "grad_norm": 0.08453939855098724, + "learning_rate": 1.775507448580044e-05, + "loss": 0.2154, + "step": 45431 + }, + { + "epoch": 3.68049254698639, + "grad_norm": 0.06712406128644943, + "learning_rate": 1.7750573833205817e-05, + "loss": 0.2371, + "step": 45432 + }, + { + "epoch": 3.6805735580038883, + "grad_norm": 0.09066680818796158, + "learning_rate": 1.7746073180611188e-05, + "loss": 0.2732, + "step": 45433 + }, + { + "epoch": 3.680654569021387, + "grad_norm": 0.08785214275121689, + "learning_rate": 1.7741572528016565e-05, + "loss": 0.2374, + "step": 45434 + }, + { + "epoch": 3.6807355800388852, + "grad_norm": 0.06791125982999802, + "learning_rate": 1.7737071875421938e-05, + "loss": 0.1922, + "step": 45435 + }, + { + "epoch": 3.6808165910563835, + "grad_norm": 0.06534958630800247, + "learning_rate": 
1.773257122282731e-05, + "loss": 0.2348, + "step": 45436 + }, + { + "epoch": 3.680897602073882, + "grad_norm": 0.06191590055823326, + "learning_rate": 1.7728070570232685e-05, + "loss": 0.2252, + "step": 45437 + }, + { + "epoch": 3.6809786130913804, + "grad_norm": 0.07369256019592285, + "learning_rate": 1.772356991763806e-05, + "loss": 0.236, + "step": 45438 + }, + { + "epoch": 3.6810596241088787, + "grad_norm": 0.07055247575044632, + "learning_rate": 1.7719069265043433e-05, + "loss": 0.2381, + "step": 45439 + }, + { + "epoch": 3.6811406351263773, + "grad_norm": 0.06439191102981567, + "learning_rate": 1.7714568612448806e-05, + "loss": 0.2122, + "step": 45440 + }, + { + "epoch": 3.6812216461438756, + "grad_norm": 0.08796586841344833, + "learning_rate": 1.771006795985418e-05, + "loss": 0.2482, + "step": 45441 + }, + { + "epoch": 3.681302657161374, + "grad_norm": 0.07205065339803696, + "learning_rate": 1.7705567307259553e-05, + "loss": 0.2202, + "step": 45442 + }, + { + "epoch": 3.6813836681788725, + "grad_norm": 0.0916776955127716, + "learning_rate": 1.7701066654664927e-05, + "loss": 0.2375, + "step": 45443 + }, + { + "epoch": 3.6814646791963708, + "grad_norm": 0.08529650419950485, + "learning_rate": 1.76965660020703e-05, + "loss": 0.2101, + "step": 45444 + }, + { + "epoch": 3.681545690213869, + "grad_norm": 0.07912997901439667, + "learning_rate": 1.7692065349475674e-05, + "loss": 0.2204, + "step": 45445 + }, + { + "epoch": 3.6816267012313677, + "grad_norm": 0.0730692520737648, + "learning_rate": 1.7687564696881048e-05, + "loss": 0.2319, + "step": 45446 + }, + { + "epoch": 3.681707712248866, + "grad_norm": 0.09680872410535812, + "learning_rate": 1.7683064044286425e-05, + "loss": 0.2498, + "step": 45447 + }, + { + "epoch": 3.681788723266364, + "grad_norm": 0.07463949173688889, + "learning_rate": 1.7678563391691795e-05, + "loss": 0.2261, + "step": 45448 + }, + { + "epoch": 3.681869734283863, + "grad_norm": 0.06920517235994339, + "learning_rate": 1.767406273909717e-05, + "loss": 0.2513, + "step": 45449 + }, + { + "epoch": 3.681950745301361, + "grad_norm": 0.07529915124177933, + "learning_rate": 1.7669562086502546e-05, + "loss": 0.2541, + "step": 45450 + }, + { + "epoch": 3.6820317563188594, + "grad_norm": 0.058915991336107254, + "learning_rate": 1.7665061433907916e-05, + "loss": 0.1903, + "step": 45451 + }, + { + "epoch": 3.6821127673363576, + "grad_norm": 0.0809895247220993, + "learning_rate": 1.766056078131329e-05, + "loss": 0.2207, + "step": 45452 + }, + { + "epoch": 3.682193778353856, + "grad_norm": 0.0563754104077816, + "learning_rate": 1.7656060128718666e-05, + "loss": 0.227, + "step": 45453 + }, + { + "epoch": 3.6822747893713546, + "grad_norm": 0.09436178207397461, + "learning_rate": 1.7651559476124037e-05, + "loss": 0.2434, + "step": 45454 + }, + { + "epoch": 3.682355800388853, + "grad_norm": 0.08379881083965302, + "learning_rate": 1.7647058823529414e-05, + "loss": 0.2354, + "step": 45455 + }, + { + "epoch": 3.682436811406351, + "grad_norm": 0.06248372420668602, + "learning_rate": 1.7642558170934787e-05, + "loss": 0.2303, + "step": 45456 + }, + { + "epoch": 3.6825178224238497, + "grad_norm": 0.0720660537481308, + "learning_rate": 1.763805751834016e-05, + "loss": 0.2351, + "step": 45457 + }, + { + "epoch": 3.682598833441348, + "grad_norm": 0.07073300331830978, + "learning_rate": 1.7633556865745534e-05, + "loss": 0.2239, + "step": 45458 + }, + { + "epoch": 3.682679844458846, + "grad_norm": 0.0651804730296135, + "learning_rate": 1.7629056213150908e-05, + "loss": 0.2001, + "step": 45459 + }, 
+ { + "epoch": 3.682760855476345, + "grad_norm": 0.06839761137962341, + "learning_rate": 1.762455556055628e-05, + "loss": 0.192, + "step": 45460 + }, + { + "epoch": 3.682841866493843, + "grad_norm": 0.06647450476884842, + "learning_rate": 1.7620054907961655e-05, + "loss": 0.2151, + "step": 45461 + }, + { + "epoch": 3.6829228775113414, + "grad_norm": 0.06537552177906036, + "learning_rate": 1.761555425536703e-05, + "loss": 0.2231, + "step": 45462 + }, + { + "epoch": 3.68300388852884, + "grad_norm": 0.0682782530784607, + "learning_rate": 1.7611053602772402e-05, + "loss": 0.1934, + "step": 45463 + }, + { + "epoch": 3.6830848995463383, + "grad_norm": 0.07463079690933228, + "learning_rate": 1.7606552950177776e-05, + "loss": 0.2753, + "step": 45464 + }, + { + "epoch": 3.6831659105638366, + "grad_norm": 0.06640588492155075, + "learning_rate": 1.760205229758315e-05, + "loss": 0.2724, + "step": 45465 + }, + { + "epoch": 3.6832469215813353, + "grad_norm": 0.06108155474066734, + "learning_rate": 1.7597551644988523e-05, + "loss": 0.2318, + "step": 45466 + }, + { + "epoch": 3.6833279325988335, + "grad_norm": 0.07797868549823761, + "learning_rate": 1.7593050992393897e-05, + "loss": 0.2367, + "step": 45467 + }, + { + "epoch": 3.6834089436163318, + "grad_norm": 0.06545042991638184, + "learning_rate": 1.7588550339799274e-05, + "loss": 0.2378, + "step": 45468 + }, + { + "epoch": 3.6834899546338304, + "grad_norm": 0.06987342238426208, + "learning_rate": 1.7584049687204644e-05, + "loss": 0.2367, + "step": 45469 + }, + { + "epoch": 3.6835709656513287, + "grad_norm": 0.0632578432559967, + "learning_rate": 1.7579549034610018e-05, + "loss": 0.2142, + "step": 45470 + }, + { + "epoch": 3.683651976668827, + "grad_norm": 0.058642685413360596, + "learning_rate": 1.7575048382015395e-05, + "loss": 0.1965, + "step": 45471 + }, + { + "epoch": 3.683732987686325, + "grad_norm": 0.07923772931098938, + "learning_rate": 1.7570547729420765e-05, + "loss": 0.2682, + "step": 45472 + }, + { + "epoch": 3.683813998703824, + "grad_norm": 0.05873147025704384, + "learning_rate": 1.756604707682614e-05, + "loss": 0.1975, + "step": 45473 + }, + { + "epoch": 3.683895009721322, + "grad_norm": 0.07675875723361969, + "learning_rate": 1.7561546424231515e-05, + "loss": 0.2062, + "step": 45474 + }, + { + "epoch": 3.6839760207388204, + "grad_norm": 0.07482752203941345, + "learning_rate": 1.755704577163689e-05, + "loss": 0.2373, + "step": 45475 + }, + { + "epoch": 3.6840570317563186, + "grad_norm": 0.075741246342659, + "learning_rate": 1.7552545119042263e-05, + "loss": 0.2375, + "step": 45476 + }, + { + "epoch": 3.6841380427738173, + "grad_norm": 0.07882252335548401, + "learning_rate": 1.7548044466447636e-05, + "loss": 0.2564, + "step": 45477 + }, + { + "epoch": 3.6842190537913155, + "grad_norm": 0.07127309590578079, + "learning_rate": 1.754354381385301e-05, + "loss": 0.2293, + "step": 45478 + }, + { + "epoch": 3.684300064808814, + "grad_norm": 0.09695709496736526, + "learning_rate": 1.7539043161258383e-05, + "loss": 0.2266, + "step": 45479 + }, + { + "epoch": 3.6843810758263125, + "grad_norm": 0.06037114933133125, + "learning_rate": 1.7534542508663757e-05, + "loss": 0.2087, + "step": 45480 + }, + { + "epoch": 3.6844620868438107, + "grad_norm": 0.08382045477628708, + "learning_rate": 1.753004185606913e-05, + "loss": 0.2198, + "step": 45481 + }, + { + "epoch": 3.684543097861309, + "grad_norm": 0.08488260954618454, + "learning_rate": 1.7525541203474504e-05, + "loss": 0.2517, + "step": 45482 + }, + { + "epoch": 3.6846241088788076, + "grad_norm": 
0.08701542764902115, + "learning_rate": 1.7521040550879878e-05, + "loss": 0.2522, + "step": 45483 + }, + { + "epoch": 3.684705119896306, + "grad_norm": 0.06933901458978653, + "learning_rate": 1.751653989828525e-05, + "loss": 0.2533, + "step": 45484 + }, + { + "epoch": 3.684786130913804, + "grad_norm": 0.07655087113380432, + "learning_rate": 1.7512039245690625e-05, + "loss": 0.2582, + "step": 45485 + }, + { + "epoch": 3.684867141931303, + "grad_norm": 0.07492398470640182, + "learning_rate": 1.7507538593096e-05, + "loss": 0.2234, + "step": 45486 + }, + { + "epoch": 3.684948152948801, + "grad_norm": 0.06873263418674469, + "learning_rate": 1.7503037940501372e-05, + "loss": 0.2216, + "step": 45487 + }, + { + "epoch": 3.6850291639662993, + "grad_norm": 0.07171261310577393, + "learning_rate": 1.7498537287906746e-05, + "loss": 0.2238, + "step": 45488 + }, + { + "epoch": 3.685110174983798, + "grad_norm": 0.08120883256196976, + "learning_rate": 1.7494036635312123e-05, + "loss": 0.2817, + "step": 45489 + }, + { + "epoch": 3.6851911860012962, + "grad_norm": 0.0914805680513382, + "learning_rate": 1.7489535982717496e-05, + "loss": 0.2555, + "step": 45490 + }, + { + "epoch": 3.6852721970187945, + "grad_norm": 0.06005210801959038, + "learning_rate": 1.7485035330122867e-05, + "loss": 0.1913, + "step": 45491 + }, + { + "epoch": 3.685353208036293, + "grad_norm": 0.0790003314614296, + "learning_rate": 1.7480534677528244e-05, + "loss": 0.2199, + "step": 45492 + }, + { + "epoch": 3.6854342190537914, + "grad_norm": 0.09822786599397659, + "learning_rate": 1.7476034024933617e-05, + "loss": 0.2557, + "step": 45493 + }, + { + "epoch": 3.6855152300712897, + "grad_norm": 0.08229862153530121, + "learning_rate": 1.747153337233899e-05, + "loss": 0.2445, + "step": 45494 + }, + { + "epoch": 3.685596241088788, + "grad_norm": 0.07014301419258118, + "learning_rate": 1.7467032719744365e-05, + "loss": 0.2522, + "step": 45495 + }, + { + "epoch": 3.6856772521062866, + "grad_norm": 0.06729548424482346, + "learning_rate": 1.7462532067149738e-05, + "loss": 0.216, + "step": 45496 + }, + { + "epoch": 3.685758263123785, + "grad_norm": 0.06353531032800674, + "learning_rate": 1.7458031414555112e-05, + "loss": 0.2392, + "step": 45497 + }, + { + "epoch": 3.685839274141283, + "grad_norm": 0.06674017757177353, + "learning_rate": 1.7453530761960485e-05, + "loss": 0.2371, + "step": 45498 + }, + { + "epoch": 3.6859202851587813, + "grad_norm": 0.07018964737653732, + "learning_rate": 1.744903010936586e-05, + "loss": 0.2566, + "step": 45499 + }, + { + "epoch": 3.68600129617628, + "grad_norm": 0.06647379696369171, + "learning_rate": 1.7444529456771233e-05, + "loss": 0.2034, + "step": 45500 + } + ], + "logging_steps": 1, + "max_steps": 49376, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.985306826203359e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}