{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6678476527900797, "eval_steps": 500, "global_step": 754, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 15.385598322874909, "learning_rate": 5.000000000000001e-07, "loss": 0.7627, "step": 1 }, { "epoch": 0.0, "grad_norm": 15.42812332406859, "learning_rate": 1.0000000000000002e-06, "loss": 0.794, "step": 2 }, { "epoch": 0.0, "grad_norm": 13.76934599903778, "learning_rate": 1.5e-06, "loss": 0.7894, "step": 3 }, { "epoch": 0.0, "grad_norm": 7.9055471186770685, "learning_rate": 2.0000000000000003e-06, "loss": 0.7346, "step": 4 }, { "epoch": 0.0, "grad_norm": 8.624179170790118, "learning_rate": 2.5e-06, "loss": 0.7458, "step": 5 }, { "epoch": 0.01, "grad_norm": 37.14544394485457, "learning_rate": 3e-06, "loss": 0.8249, "step": 6 }, { "epoch": 0.01, "grad_norm": 13.413192499879626, "learning_rate": 3.5e-06, "loss": 0.7692, "step": 7 }, { "epoch": 0.01, "grad_norm": 11.194156755277431, "learning_rate": 4.000000000000001e-06, "loss": 0.7724, "step": 8 }, { "epoch": 0.01, "grad_norm": 8.569279640169995, "learning_rate": 4.5e-06, "loss": 0.7851, "step": 9 }, { "epoch": 0.01, "grad_norm": 18.113903622060178, "learning_rate": 5e-06, "loss": 0.7874, "step": 10 }, { "epoch": 0.01, "grad_norm": 7.486914001687124, "learning_rate": 4.999997558722919e-06, "loss": 0.7553, "step": 11 }, { "epoch": 0.01, "grad_norm": 7.280219682440894, "learning_rate": 4.999990234896445e-06, "loss": 0.7095, "step": 12 }, { "epoch": 0.01, "grad_norm": 4.3413734180304155, "learning_rate": 4.99997802853488e-06, "loss": 0.6916, "step": 13 }, { "epoch": 0.01, "grad_norm": 5.756315245615391, "learning_rate": 4.999960939662063e-06, "loss": 0.7407, "step": 14 }, { "epoch": 0.01, "grad_norm": 5.090553047874293, "learning_rate": 4.999938968311371e-06, "loss": 0.7387, "step": 15 }, { "epoch": 0.01, "grad_norm": 5.8370558847287075, "learning_rate": 4.9999121145257126e-06, "loss": 0.7051, "step": 16 }, { "epoch": 0.02, "grad_norm": 3.986658012877664, "learning_rate": 4.999880378357535e-06, "loss": 0.6871, "step": 17 }, { "epoch": 0.02, "grad_norm": 4.141716122521651, "learning_rate": 4.9998437598688195e-06, "loss": 0.6694, "step": 18 }, { "epoch": 0.02, "grad_norm": 4.729722439630604, "learning_rate": 4.9998022591310815e-06, "loss": 0.716, "step": 19 }, { "epoch": 0.02, "grad_norm": 2.9486336901615497, "learning_rate": 4.999755876225375e-06, "loss": 0.6387, "step": 20 }, { "epoch": 0.02, "grad_norm": 2.8336874650575745, "learning_rate": 4.999704611242285e-06, "loss": 0.6542, "step": 21 }, { "epoch": 0.02, "grad_norm": 3.6724374918638905, "learning_rate": 4.999648464281934e-06, "loss": 0.6617, "step": 22 }, { "epoch": 0.02, "grad_norm": 2.941494127880678, "learning_rate": 4.999587435453979e-06, "loss": 0.6687, "step": 23 }, { "epoch": 0.02, "grad_norm": 2.6261822206464744, "learning_rate": 4.999521524877608e-06, "loss": 0.6634, "step": 24 }, { "epoch": 0.02, "grad_norm": 2.8059947014946305, "learning_rate": 4.999450732681549e-06, "loss": 0.6901, "step": 25 }, { "epoch": 0.02, "grad_norm": 3.131537494217822, "learning_rate": 4.999375059004058e-06, "loss": 0.6407, "step": 26 }, { "epoch": 0.02, "grad_norm": 2.7893212245465837, "learning_rate": 4.99929450399293e-06, "loss": 0.6638, "step": 27 }, { "epoch": 0.02, "grad_norm": 2.4411586751746, "learning_rate": 4.999209067805487e-06, "loss": 0.6196, "step": 28 }, { "epoch": 0.03, "grad_norm": 2.8807261299944082, "learning_rate": 4.999118750608591e-06, "loss": 0.6839, "step": 29 }, { "epoch": 0.03, "grad_norm": 2.879993804839069, "learning_rate": 4.9990235525786326e-06, "loss": 0.6484, "step": 30 }, { "epoch": 0.03, "grad_norm": 2.604360711268946, "learning_rate": 4.998923473901535e-06, "loss": 0.6313, "step": 31 }, { "epoch": 0.03, "grad_norm": 2.403225544767816, "learning_rate": 4.9988185147727544e-06, "loss": 0.6209, "step": 32 }, { "epoch": 0.03, "grad_norm": 2.669567772543462, "learning_rate": 4.998708675397278e-06, "loss": 0.6068, "step": 33 }, { "epoch": 0.03, "grad_norm": 2.443946495915797, "learning_rate": 4.998593955989626e-06, "loss": 0.6731, "step": 34 }, { "epoch": 0.03, "grad_norm": 2.2104680876118317, "learning_rate": 4.998474356773845e-06, "loss": 0.6243, "step": 35 }, { "epoch": 0.03, "grad_norm": 2.3602199264043957, "learning_rate": 4.9983498779835175e-06, "loss": 0.6649, "step": 36 }, { "epoch": 0.03, "grad_norm": 2.4676911263240844, "learning_rate": 4.998220519861752e-06, "loss": 0.6174, "step": 37 }, { "epoch": 0.03, "grad_norm": 2.3419026099030282, "learning_rate": 4.998086282661188e-06, "loss": 0.6123, "step": 38 }, { "epoch": 0.03, "grad_norm": 2.14900736954254, "learning_rate": 4.997947166643993e-06, "loss": 0.63, "step": 39 }, { "epoch": 0.04, "grad_norm": 2.570907426799795, "learning_rate": 4.997803172081864e-06, "loss": 0.6249, "step": 40 }, { "epoch": 0.04, "grad_norm": 2.516952735669967, "learning_rate": 4.997654299256026e-06, "loss": 0.6727, "step": 41 }, { "epoch": 0.04, "grad_norm": 2.1600457198543874, "learning_rate": 4.997500548457231e-06, "loss": 0.6719, "step": 42 }, { "epoch": 0.04, "grad_norm": 2.2177572033934743, "learning_rate": 4.997341919985756e-06, "loss": 0.6148, "step": 43 }, { "epoch": 0.04, "grad_norm": 2.397105205209689, "learning_rate": 4.997178414151409e-06, "loss": 0.6167, "step": 44 }, { "epoch": 0.04, "grad_norm": 2.1254940534972167, "learning_rate": 4.997010031273517e-06, "loss": 0.6446, "step": 45 }, { "epoch": 0.04, "grad_norm": 2.2113023791837194, "learning_rate": 4.996836771680937e-06, "loss": 0.6304, "step": 46 }, { "epoch": 0.04, "grad_norm": 2.386446316275664, "learning_rate": 4.99665863571205e-06, "loss": 0.6621, "step": 47 }, { "epoch": 0.04, "grad_norm": 2.1838934384314483, "learning_rate": 4.996475623714756e-06, "loss": 0.6214, "step": 48 }, { "epoch": 0.04, "grad_norm": 2.2047933657923586, "learning_rate": 4.996287736046485e-06, "loss": 0.6478, "step": 49 }, { "epoch": 0.04, "grad_norm": 2.208809457983808, "learning_rate": 4.996094973074183e-06, "loss": 0.6097, "step": 50 }, { "epoch": 0.05, "grad_norm": 2.1318377198138267, "learning_rate": 4.995897335174322e-06, "loss": 0.622, "step": 51 }, { "epoch": 0.05, "grad_norm": 2.0673034122993537, "learning_rate": 4.995694822732893e-06, "loss": 0.6036, "step": 52 }, { "epoch": 0.05, "grad_norm": 2.195105312645423, "learning_rate": 4.9954874361454055e-06, "loss": 0.6052, "step": 53 }, { "epoch": 0.05, "grad_norm": 2.157855029176061, "learning_rate": 4.995275175816892e-06, "loss": 0.6455, "step": 54 }, { "epoch": 0.05, "grad_norm": 2.0500405783991043, "learning_rate": 4.9950580421619e-06, "loss": 0.6353, "step": 55 }, { "epoch": 0.05, "grad_norm": 2.199629904296075, "learning_rate": 4.9948360356044965e-06, "loss": 0.6122, "step": 56 }, { "epoch": 0.05, "grad_norm": 2.186847580161491, "learning_rate": 4.994609156578267e-06, "loss": 0.6073, "step": 57 }, { "epoch": 0.05, "grad_norm": 2.0207512037097835, "learning_rate": 4.994377405526308e-06, "loss": 0.61, "step": 58 }, { "epoch": 0.05, "grad_norm": 2.3170193964114976, "learning_rate": 4.994140782901237e-06, "loss": 0.6322, "step": 59 }, { "epoch": 0.05, "grad_norm": 2.014785890436746, "learning_rate": 4.9938992891651825e-06, "loss": 0.6205, "step": 60 }, { "epoch": 0.05, "grad_norm": 1.9538385063221935, "learning_rate": 4.9936529247897854e-06, "loss": 0.5992, "step": 61 }, { "epoch": 0.05, "grad_norm": 2.084943826856202, "learning_rate": 4.993401690256203e-06, "loss": 0.6148, "step": 62 }, { "epoch": 0.06, "grad_norm": 2.135158856581583, "learning_rate": 4.9931455860551e-06, "loss": 0.5937, "step": 63 }, { "epoch": 0.06, "grad_norm": 1.982621418518698, "learning_rate": 4.992884612686655e-06, "loss": 0.6091, "step": 64 }, { "epoch": 0.06, "grad_norm": 2.1030931953494956, "learning_rate": 4.992618770660553e-06, "loss": 0.6034, "step": 65 }, { "epoch": 0.06, "grad_norm": 2.1994634556563994, "learning_rate": 4.992348060495989e-06, "loss": 0.5846, "step": 66 }, { "epoch": 0.06, "grad_norm": 2.410691403277427, "learning_rate": 4.992072482721669e-06, "loss": 0.6294, "step": 67 }, { "epoch": 0.06, "grad_norm": 1.9720494401999067, "learning_rate": 4.991792037875799e-06, "loss": 0.591, "step": 68 }, { "epoch": 0.06, "grad_norm": 2.147504025949435, "learning_rate": 4.991506726506094e-06, "loss": 0.5689, "step": 69 }, { "epoch": 0.06, "grad_norm": 2.1837702519904223, "learning_rate": 4.991216549169776e-06, "loss": 0.6422, "step": 70 }, { "epoch": 0.06, "grad_norm": 2.0883865330274958, "learning_rate": 4.9909215064335655e-06, "loss": 0.6076, "step": 71 }, { "epoch": 0.06, "grad_norm": 2.20727863923846, "learning_rate": 4.990621598873687e-06, "loss": 0.5974, "step": 72 }, { "epoch": 0.06, "grad_norm": 2.0735330806418464, "learning_rate": 4.990316827075868e-06, "loss": 0.6809, "step": 73 }, { "epoch": 0.07, "grad_norm": 2.0203203347538774, "learning_rate": 4.990007191635334e-06, "loss": 0.6107, "step": 74 }, { "epoch": 0.07, "grad_norm": 2.234889365362174, "learning_rate": 4.989692693156809e-06, "loss": 0.6218, "step": 75 }, { "epoch": 0.07, "grad_norm": 1.9902503343433904, "learning_rate": 4.989373332254516e-06, "loss": 0.6257, "step": 76 }, { "epoch": 0.07, "grad_norm": 2.1041971507252466, "learning_rate": 4.989049109552173e-06, "loss": 0.5888, "step": 77 }, { "epoch": 0.07, "grad_norm": 2.1151685783302123, "learning_rate": 4.988720025682995e-06, "loss": 0.6333, "step": 78 }, { "epoch": 0.07, "grad_norm": 1.9223819269893592, "learning_rate": 4.988386081289689e-06, "loss": 0.6442, "step": 79 }, { "epoch": 0.07, "grad_norm": 2.139676463756265, "learning_rate": 4.988047277024456e-06, "loss": 0.5966, "step": 80 }, { "epoch": 0.07, "grad_norm": 2.1665820212993068, "learning_rate": 4.987703613548988e-06, "loss": 0.603, "step": 81 }, { "epoch": 0.07, "grad_norm": 1.931456975470041, "learning_rate": 4.987355091534467e-06, "loss": 0.6122, "step": 82 }, { "epoch": 0.07, "grad_norm": 2.134995092135601, "learning_rate": 4.987001711661566e-06, "loss": 0.6213, "step": 83 }, { "epoch": 0.07, "grad_norm": 2.0173352657570818, "learning_rate": 4.98664347462044e-06, "loss": 0.5966, "step": 84 }, { "epoch": 0.08, "grad_norm": 2.0816939924571183, "learning_rate": 4.986280381110737e-06, "loss": 0.5575, "step": 85 }, { "epoch": 0.08, "grad_norm": 2.0072477771163357, "learning_rate": 4.985912431841584e-06, "loss": 0.6225, "step": 86 }, { "epoch": 0.08, "grad_norm": 2.1895945454214507, "learning_rate": 4.985539627531596e-06, "loss": 0.6169, "step": 87 }, { "epoch": 0.08, "grad_norm": 2.84518214074801, "learning_rate": 4.985161968908866e-06, "loss": 0.6317, "step": 88 }, { "epoch": 0.08, "grad_norm": 2.194209857089938, "learning_rate": 4.984779456710971e-06, "loss": 0.6205, "step": 89 }, { "epoch": 0.08, "grad_norm": 2.1604595364123083, "learning_rate": 4.9843920916849645e-06, "loss": 0.6176, "step": 90 }, { "epoch": 0.08, "grad_norm": 2.039087518829079, "learning_rate": 4.9839998745873795e-06, "loss": 0.5842, "step": 91 }, { "epoch": 0.08, "grad_norm": 2.0148570016863334, "learning_rate": 4.983602806184225e-06, "loss": 0.5936, "step": 92 }, { "epoch": 0.08, "grad_norm": 2.073137159272384, "learning_rate": 4.983200887250982e-06, "loss": 0.6317, "step": 93 }, { "epoch": 0.08, "grad_norm": 2.045469602089007, "learning_rate": 4.9827941185726095e-06, "loss": 0.5338, "step": 94 }, { "epoch": 0.08, "grad_norm": 2.1201743116757417, "learning_rate": 4.982382500943533e-06, "loss": 0.6133, "step": 95 }, { "epoch": 0.09, "grad_norm": 2.0637214917996363, "learning_rate": 4.981966035167654e-06, "loss": 0.6483, "step": 96 }, { "epoch": 0.09, "grad_norm": 2.155574452675582, "learning_rate": 4.981544722058336e-06, "loss": 0.6001, "step": 97 }, { "epoch": 0.09, "grad_norm": 1.9347601392775928, "learning_rate": 4.981118562438414e-06, "loss": 0.5954, "step": 98 }, { "epoch": 0.09, "grad_norm": 2.3054537863874756, "learning_rate": 4.980687557140187e-06, "loss": 0.6338, "step": 99 }, { "epoch": 0.09, "grad_norm": 2.0421104909837338, "learning_rate": 4.980251707005417e-06, "loss": 0.6166, "step": 100 }, { "epoch": 0.09, "grad_norm": 2.023167301994367, "learning_rate": 4.979811012885329e-06, "loss": 0.5682, "step": 101 }, { "epoch": 0.09, "grad_norm": 2.0583654213007967, "learning_rate": 4.979365475640609e-06, "loss": 0.5759, "step": 102 }, { "epoch": 0.09, "grad_norm": 2.008917223929121, "learning_rate": 4.9789150961414e-06, "loss": 0.6324, "step": 103 }, { "epoch": 0.09, "grad_norm": 2.1111479338304306, "learning_rate": 4.978459875267303e-06, "loss": 0.5821, "step": 104 }, { "epoch": 0.09, "grad_norm": 2.400366962461983, "learning_rate": 4.977999813907375e-06, "loss": 0.5699, "step": 105 }, { "epoch": 0.09, "grad_norm": 2.090668061316384, "learning_rate": 4.977534912960124e-06, "loss": 0.5754, "step": 106 }, { "epoch": 0.09, "grad_norm": 2.2103419288491466, "learning_rate": 4.977065173333515e-06, "loss": 0.6005, "step": 107 }, { "epoch": 0.1, "grad_norm": 2.1332380447628294, "learning_rate": 4.9765905959449565e-06, "loss": 0.6178, "step": 108 }, { "epoch": 0.1, "grad_norm": 2.1372224949542464, "learning_rate": 4.976111181721309e-06, "loss": 0.6021, "step": 109 }, { "epoch": 0.1, "grad_norm": 2.636052326949506, "learning_rate": 4.97562693159888e-06, "loss": 0.6418, "step": 110 }, { "epoch": 0.1, "grad_norm": 2.1234423477493443, "learning_rate": 4.975137846523419e-06, "loss": 0.6231, "step": 111 }, { "epoch": 0.1, "grad_norm": 2.2817790529425315, "learning_rate": 4.974643927450121e-06, "loss": 0.5681, "step": 112 }, { "epoch": 0.1, "grad_norm": 2.2605060344304713, "learning_rate": 4.9741451753436205e-06, "loss": 0.5803, "step": 113 }, { "epoch": 0.1, "grad_norm": 2.0355236974665876, "learning_rate": 4.973641591177991e-06, "loss": 0.6003, "step": 114 }, { "epoch": 0.1, "grad_norm": 2.4343221170301415, "learning_rate": 4.973133175936743e-06, "loss": 0.5882, "step": 115 }, { "epoch": 0.1, "grad_norm": 2.2135760843199734, "learning_rate": 4.972619930612822e-06, "loss": 0.5886, "step": 116 }, { "epoch": 0.1, "grad_norm": 2.161909448676307, "learning_rate": 4.972101856208609e-06, "loss": 0.5792, "step": 117 }, { "epoch": 0.1, "grad_norm": 2.0871148781401927, "learning_rate": 4.9715789537359126e-06, "loss": 0.6383, "step": 118 }, { "epoch": 0.11, "grad_norm": 2.1159018206478626, "learning_rate": 4.971051224215973e-06, "loss": 0.5865, "step": 119 }, { "epoch": 0.11, "grad_norm": 2.2036428070670375, "learning_rate": 4.970518668679459e-06, "loss": 0.5905, "step": 120 }, { "epoch": 0.11, "grad_norm": 2.22262007661876, "learning_rate": 4.969981288166461e-06, "loss": 0.5951, "step": 121 }, { "epoch": 0.11, "grad_norm": 2.0713458839382786, "learning_rate": 4.969439083726496e-06, "loss": 0.6011, "step": 122 }, { "epoch": 0.11, "grad_norm": 2.0686060725186897, "learning_rate": 4.9688920564185e-06, "loss": 0.6038, "step": 123 }, { "epoch": 0.11, "grad_norm": 2.1825376161159964, "learning_rate": 4.968340207310832e-06, "loss": 0.6098, "step": 124 }, { "epoch": 0.11, "grad_norm": 2.142436541976576, "learning_rate": 4.967783537481262e-06, "loss": 0.6119, "step": 125 }, { "epoch": 0.11, "grad_norm": 2.330044622755397, "learning_rate": 4.967222048016979e-06, "loss": 0.6057, "step": 126 }, { "epoch": 0.11, "grad_norm": 2.109116942854107, "learning_rate": 4.966655740014585e-06, "loss": 0.5958, "step": 127 }, { "epoch": 0.11, "grad_norm": 2.174219068914296, "learning_rate": 4.9660846145800914e-06, "loss": 0.6276, "step": 128 }, { "epoch": 0.11, "grad_norm": 2.135736248304593, "learning_rate": 4.965508672828918e-06, "loss": 0.6309, "step": 129 }, { "epoch": 0.12, "grad_norm": 2.2339234058672885, "learning_rate": 4.964927915885893e-06, "loss": 0.5879, "step": 130 }, { "epoch": 0.12, "grad_norm": 2.0960660335616224, "learning_rate": 4.9643423448852455e-06, "loss": 0.6218, "step": 131 }, { "epoch": 0.12, "grad_norm": 1.9468729925472703, "learning_rate": 4.963751960970609e-06, "loss": 0.5998, "step": 132 }, { "epoch": 0.12, "grad_norm": 2.1623168252289915, "learning_rate": 4.9631567652950164e-06, "loss": 0.6885, "step": 133 }, { "epoch": 0.12, "grad_norm": 2.084420579583794, "learning_rate": 4.962556759020898e-06, "loss": 0.5758, "step": 134 }, { "epoch": 0.12, "grad_norm": 2.1082890389844713, "learning_rate": 4.961951943320078e-06, "loss": 0.6116, "step": 135 }, { "epoch": 0.12, "grad_norm": 2.006123424806457, "learning_rate": 4.9613423193737754e-06, "loss": 0.5708, "step": 136 }, { "epoch": 0.12, "grad_norm": 2.309431970929405, "learning_rate": 4.960727888372599e-06, "loss": 0.621, "step": 137 }, { "epoch": 0.12, "grad_norm": 2.226488524758773, "learning_rate": 4.9601086515165456e-06, "loss": 0.5896, "step": 138 }, { "epoch": 0.12, "grad_norm": 2.1242070778655253, "learning_rate": 4.959484610014997e-06, "loss": 0.624, "step": 139 }, { "epoch": 0.12, "grad_norm": 2.2147491445730516, "learning_rate": 4.958855765086722e-06, "loss": 0.6064, "step": 140 }, { "epoch": 0.12, "grad_norm": 2.1818004600393, "learning_rate": 4.958222117959868e-06, "loss": 0.6252, "step": 141 }, { "epoch": 0.13, "grad_norm": 2.1094535889409696, "learning_rate": 4.95758366987196e-06, "loss": 0.5779, "step": 142 }, { "epoch": 0.13, "grad_norm": 2.2043056809252577, "learning_rate": 4.9569404220699025e-06, "loss": 0.6156, "step": 143 }, { "epoch": 0.13, "grad_norm": 2.158056342799238, "learning_rate": 4.956292375809971e-06, "loss": 0.5662, "step": 144 }, { "epoch": 0.13, "grad_norm": 1.987581635345228, "learning_rate": 4.955639532357815e-06, "loss": 0.6148, "step": 145 }, { "epoch": 0.13, "grad_norm": 2.266145451051948, "learning_rate": 4.954981892988451e-06, "loss": 0.5867, "step": 146 }, { "epoch": 0.13, "grad_norm": 2.071082600205798, "learning_rate": 4.954319458986264e-06, "loss": 0.5976, "step": 147 }, { "epoch": 0.13, "grad_norm": 2.1615342548575374, "learning_rate": 4.953652231645002e-06, "loss": 0.5643, "step": 148 }, { "epoch": 0.13, "grad_norm": 2.145126231371731, "learning_rate": 4.952980212267773e-06, "loss": 0.5592, "step": 149 }, { "epoch": 0.13, "grad_norm": 1.9161750244434461, "learning_rate": 4.952303402167047e-06, "loss": 0.5547, "step": 150 }, { "epoch": 0.13, "grad_norm": 2.234370958372018, "learning_rate": 4.9516218026646475e-06, "loss": 0.578, "step": 151 }, { "epoch": 0.13, "grad_norm": 2.149553338429868, "learning_rate": 4.950935415091753e-06, "loss": 0.5952, "step": 152 }, { "epoch": 0.14, "grad_norm": 2.1021801657048016, "learning_rate": 4.950244240788895e-06, "loss": 0.573, "step": 153 }, { "epoch": 0.14, "grad_norm": 2.488711367210497, "learning_rate": 4.949548281105951e-06, "loss": 0.5776, "step": 154 }, { "epoch": 0.14, "grad_norm": 2.0302393290147167, "learning_rate": 4.948847537402145e-06, "loss": 0.5685, "step": 155 }, { "epoch": 0.14, "grad_norm": 2.1563261797248043, "learning_rate": 4.948142011046044e-06, "loss": 0.6185, "step": 156 }, { "epoch": 0.14, "grad_norm": 2.1308303224609997, "learning_rate": 4.947431703415558e-06, "loss": 0.6229, "step": 157 }, { "epoch": 0.14, "grad_norm": 2.0988414912992273, "learning_rate": 4.946716615897932e-06, "loss": 0.6167, "step": 158 }, { "epoch": 0.14, "grad_norm": 2.3558302474583095, "learning_rate": 4.9459967498897485e-06, "loss": 0.5903, "step": 159 }, { "epoch": 0.14, "grad_norm": 2.1505555405055223, "learning_rate": 4.945272106796919e-06, "loss": 0.5709, "step": 160 }, { "epoch": 0.14, "grad_norm": 2.0604140956574635, "learning_rate": 4.94454268803469e-06, "loss": 0.635, "step": 161 }, { "epoch": 0.14, "grad_norm": 2.3699836246614696, "learning_rate": 4.943808495027631e-06, "loss": 0.581, "step": 162 }, { "epoch": 0.14, "grad_norm": 1.9809907136859368, "learning_rate": 4.9430695292096365e-06, "loss": 0.5703, "step": 163 }, { "epoch": 0.15, "grad_norm": 2.213101907296851, "learning_rate": 4.942325792023922e-06, "loss": 0.5915, "step": 164 }, { "epoch": 0.15, "grad_norm": 2.3778783149383944, "learning_rate": 4.941577284923025e-06, "loss": 0.537, "step": 165 }, { "epoch": 0.15, "grad_norm": 1.9283694807512721, "learning_rate": 4.9408240093687934e-06, "loss": 0.579, "step": 166 }, { "epoch": 0.15, "grad_norm": 2.083087334039033, "learning_rate": 4.940065966832392e-06, "loss": 0.5612, "step": 167 }, { "epoch": 0.15, "grad_norm": 2.314684793845775, "learning_rate": 4.939303158794294e-06, "loss": 0.6001, "step": 168 }, { "epoch": 0.15, "grad_norm": 2.131977461745334, "learning_rate": 4.93853558674428e-06, "loss": 0.5809, "step": 169 }, { "epoch": 0.15, "grad_norm": 2.1291924932946755, "learning_rate": 4.937763252181434e-06, "loss": 0.6216, "step": 170 }, { "epoch": 0.15, "grad_norm": 1.9366549866764742, "learning_rate": 4.936986156614144e-06, "loss": 0.5888, "step": 171 }, { "epoch": 0.15, "grad_norm": 2.231889540095555, "learning_rate": 4.9362043015600934e-06, "loss": 0.6437, "step": 172 }, { "epoch": 0.15, "grad_norm": 2.0696023557568233, "learning_rate": 4.9354176885462626e-06, "loss": 0.5951, "step": 173 }, { "epoch": 0.15, "grad_norm": 2.10974806039572, "learning_rate": 4.934626319108923e-06, "loss": 0.5817, "step": 174 }, { "epoch": 0.16, "grad_norm": 2.0633698321381946, "learning_rate": 4.933830194793636e-06, "loss": 0.5692, "step": 175 }, { "epoch": 0.16, "grad_norm": 2.0163693967733423, "learning_rate": 4.933029317155251e-06, "loss": 0.5322, "step": 176 }, { "epoch": 0.16, "grad_norm": 2.1118176135699813, "learning_rate": 4.932223687757899e-06, "loss": 0.5809, "step": 177 }, { "epoch": 0.16, "grad_norm": 2.181431947183138, "learning_rate": 4.9314133081749906e-06, "loss": 0.5444, "step": 178 }, { "epoch": 0.16, "grad_norm": 2.2055197469621386, "learning_rate": 4.930598179989215e-06, "loss": 0.6063, "step": 179 }, { "epoch": 0.16, "grad_norm": 2.1103699877035638, "learning_rate": 4.929778304792537e-06, "loss": 0.5908, "step": 180 }, { "epoch": 0.16, "grad_norm": 2.01692648335164, "learning_rate": 4.928953684186189e-06, "loss": 0.5729, "step": 181 }, { "epoch": 0.16, "grad_norm": 1.990744003423107, "learning_rate": 4.928124319780673e-06, "loss": 0.5935, "step": 182 }, { "epoch": 0.16, "grad_norm": 1.9898687560952446, "learning_rate": 4.9272902131957555e-06, "loss": 0.6008, "step": 183 }, { "epoch": 0.16, "grad_norm": 1.9499116832570582, "learning_rate": 4.926451366060465e-06, "loss": 0.5731, "step": 184 }, { "epoch": 0.16, "grad_norm": 1.8933258467243923, "learning_rate": 4.925607780013088e-06, "loss": 0.5822, "step": 185 }, { "epoch": 0.16, "grad_norm": 1.9711936623837691, "learning_rate": 4.924759456701167e-06, "loss": 0.5433, "step": 186 }, { "epoch": 0.17, "grad_norm": 1.9981254191144715, "learning_rate": 4.923906397781495e-06, "loss": 0.5603, "step": 187 }, { "epoch": 0.17, "grad_norm": 1.9489584101682442, "learning_rate": 4.923048604920115e-06, "loss": 0.592, "step": 188 }, { "epoch": 0.17, "grad_norm": 2.14587896098926, "learning_rate": 4.922186079792315e-06, "loss": 0.5861, "step": 189 }, { "epoch": 0.17, "grad_norm": 2.093505234897306, "learning_rate": 4.921318824082625e-06, "loss": 0.5756, "step": 190 }, { "epoch": 0.17, "grad_norm": 1.9726924068956073, "learning_rate": 4.920446839484814e-06, "loss": 0.5954, "step": 191 }, { "epoch": 0.17, "grad_norm": 2.0009011296035886, "learning_rate": 4.919570127701888e-06, "loss": 0.5185, "step": 192 }, { "epoch": 0.17, "grad_norm": 2.0801246171281993, "learning_rate": 4.9186886904460826e-06, "loss": 0.5788, "step": 193 }, { "epoch": 0.17, "grad_norm": 2.7712602468155096, "learning_rate": 4.917802529438865e-06, "loss": 0.6637, "step": 194 }, { "epoch": 0.17, "grad_norm": 1.9721040372060654, "learning_rate": 4.916911646410926e-06, "loss": 0.5926, "step": 195 }, { "epoch": 0.17, "grad_norm": 2.1199089061376855, "learning_rate": 4.91601604310218e-06, "loss": 0.5854, "step": 196 }, { "epoch": 0.17, "grad_norm": 1.9518281461372036, "learning_rate": 4.915115721261759e-06, "loss": 0.5456, "step": 197 }, { "epoch": 0.18, "grad_norm": 2.1537515435847734, "learning_rate": 4.9142106826480114e-06, "loss": 0.6152, "step": 198 }, { "epoch": 0.18, "grad_norm": 2.3461320565666344, "learning_rate": 4.913300929028498e-06, "loss": 0.617, "step": 199 }, { "epoch": 0.18, "grad_norm": 1.9789785575462193, "learning_rate": 4.912386462179987e-06, "loss": 0.5845, "step": 200 }, { "epoch": 0.18, "grad_norm": 2.0705337307209253, "learning_rate": 4.9114672838884515e-06, "loss": 0.6062, "step": 201 }, { "epoch": 0.18, "grad_norm": 1.9972918925367322, "learning_rate": 4.910543395949066e-06, "loss": 0.6318, "step": 202 }, { "epoch": 0.18, "grad_norm": 2.03173534028091, "learning_rate": 4.9096148001662055e-06, "loss": 0.64, "step": 203 }, { "epoch": 0.18, "grad_norm": 2.0861416304602356, "learning_rate": 4.908681498353436e-06, "loss": 0.5859, "step": 204 }, { "epoch": 0.18, "grad_norm": 1.932510611788884, "learning_rate": 4.907743492333517e-06, "loss": 0.5483, "step": 205 }, { "epoch": 0.18, "grad_norm": 1.9618471764126828, "learning_rate": 4.906800783938395e-06, "loss": 0.5767, "step": 206 }, { "epoch": 0.18, "grad_norm": 2.3557796360921786, "learning_rate": 4.905853375009198e-06, "loss": 0.5934, "step": 207 }, { "epoch": 0.18, "grad_norm": 2.0993364379712784, "learning_rate": 4.9049012673962385e-06, "loss": 0.5879, "step": 208 }, { "epoch": 0.19, "grad_norm": 2.2015612636555155, "learning_rate": 4.903944462959001e-06, "loss": 0.5598, "step": 209 }, { "epoch": 0.19, "grad_norm": 2.0374544745406062, "learning_rate": 4.902982963566147e-06, "loss": 0.577, "step": 210 }, { "epoch": 0.19, "grad_norm": 2.194866218807, "learning_rate": 4.902016771095506e-06, "loss": 0.5848, "step": 211 }, { "epoch": 0.19, "grad_norm": 2.2545375351308614, "learning_rate": 4.901045887434072e-06, "loss": 0.5846, "step": 212 }, { "epoch": 0.19, "grad_norm": 2.017012770131601, "learning_rate": 4.900070314478001e-06, "loss": 0.5651, "step": 213 }, { "epoch": 0.19, "grad_norm": 2.150900326654639, "learning_rate": 4.899090054132609e-06, "loss": 0.568, "step": 214 }, { "epoch": 0.19, "grad_norm": 2.0404886979870454, "learning_rate": 4.898105108312366e-06, "loss": 0.5277, "step": 215 }, { "epoch": 0.19, "grad_norm": 2.036614254190257, "learning_rate": 4.897115478940892e-06, "loss": 0.5754, "step": 216 }, { "epoch": 0.19, "grad_norm": 2.041133008809928, "learning_rate": 4.896121167950954e-06, "loss": 0.6294, "step": 217 }, { "epoch": 0.19, "grad_norm": 2.0029503409054885, "learning_rate": 4.895122177284465e-06, "loss": 0.5531, "step": 218 }, { "epoch": 0.19, "grad_norm": 2.0303439698174754, "learning_rate": 4.894118508892474e-06, "loss": 0.6008, "step": 219 }, { "epoch": 0.19, "grad_norm": 1.899982778272908, "learning_rate": 4.893110164735167e-06, "loss": 0.6076, "step": 220 }, { "epoch": 0.2, "grad_norm": 2.170640326694132, "learning_rate": 4.892097146781862e-06, "loss": 0.5806, "step": 221 }, { "epoch": 0.2, "grad_norm": 1.961802557992624, "learning_rate": 4.8910794570110055e-06, "loss": 0.5456, "step": 222 }, { "epoch": 0.2, "grad_norm": 2.1149182672715807, "learning_rate": 4.890057097410167e-06, "loss": 0.5683, "step": 223 }, { "epoch": 0.2, "grad_norm": 1.9988574008443096, "learning_rate": 4.889030069976038e-06, "loss": 0.5603, "step": 224 }, { "epoch": 0.2, "grad_norm": 2.137840782586502, "learning_rate": 4.887998376714424e-06, "loss": 0.5713, "step": 225 }, { "epoch": 0.2, "grad_norm": 2.2956357234771634, "learning_rate": 4.886962019640244e-06, "loss": 0.5635, "step": 226 }, { "epoch": 0.2, "grad_norm": 2.2175517801056346, "learning_rate": 4.885921000777528e-06, "loss": 0.631, "step": 227 }, { "epoch": 0.2, "grad_norm": 2.0861966792656546, "learning_rate": 4.884875322159407e-06, "loss": 0.5521, "step": 228 }, { "epoch": 0.2, "grad_norm": 2.170862650134145, "learning_rate": 4.883824985828114e-06, "loss": 0.5953, "step": 229 }, { "epoch": 0.2, "grad_norm": 2.016871028914906, "learning_rate": 4.882769993834978e-06, "loss": 0.5745, "step": 230 }, { "epoch": 0.2, "grad_norm": 2.4069309610367107, "learning_rate": 4.8817103482404236e-06, "loss": 0.5752, "step": 231 }, { "epoch": 0.21, "grad_norm": 1.9834780557891722, "learning_rate": 4.880646051113959e-06, "loss": 0.5619, "step": 232 }, { "epoch": 0.21, "grad_norm": 2.1221686040256005, "learning_rate": 4.87957710453418e-06, "loss": 0.561, "step": 233 }, { "epoch": 0.21, "grad_norm": 2.1497751964139002, "learning_rate": 4.878503510588764e-06, "loss": 0.5754, "step": 234 }, { "epoch": 0.21, "grad_norm": 1.8535318318419167, "learning_rate": 4.877425271374462e-06, "loss": 0.5551, "step": 235 }, { "epoch": 0.21, "grad_norm": 2.1537345489224404, "learning_rate": 4.876342388997099e-06, "loss": 0.544, "step": 236 }, { "epoch": 0.21, "grad_norm": 1.9695512744073471, "learning_rate": 4.875254865571567e-06, "loss": 0.6003, "step": 237 }, { "epoch": 0.21, "grad_norm": 2.2550853928957193, "learning_rate": 4.874162703221823e-06, "loss": 0.5968, "step": 238 }, { "epoch": 0.21, "grad_norm": 2.0658630166795917, "learning_rate": 4.873065904080884e-06, "loss": 0.5658, "step": 239 }, { "epoch": 0.21, "grad_norm": 2.0821280326495524, "learning_rate": 4.871964470290823e-06, "loss": 0.5711, "step": 240 }, { "epoch": 0.21, "grad_norm": 1.9833074137024158, "learning_rate": 4.8708584040027636e-06, "loss": 0.5899, "step": 241 }, { "epoch": 0.21, "grad_norm": 2.0288963441502195, "learning_rate": 4.869747707376877e-06, "loss": 0.5601, "step": 242 }, { "epoch": 0.22, "grad_norm": 2.0970435875726463, "learning_rate": 4.868632382582378e-06, "loss": 0.6381, "step": 243 }, { "epoch": 0.22, "grad_norm": 2.1303280408644194, "learning_rate": 4.86751243179752e-06, "loss": 0.5495, "step": 244 }, { "epoch": 0.22, "grad_norm": 2.0851781018580584, "learning_rate": 4.866387857209591e-06, "loss": 0.5901, "step": 245 }, { "epoch": 0.22, "grad_norm": 1.8310760160854438, "learning_rate": 4.86525866101491e-06, "loss": 0.5513, "step": 246 }, { "epoch": 0.22, "grad_norm": 2.199726167537497, "learning_rate": 4.8641248454188205e-06, "loss": 0.5873, "step": 247 }, { "epoch": 0.22, "grad_norm": 1.9776691221978735, "learning_rate": 4.862986412635691e-06, "loss": 0.6143, "step": 248 }, { "epoch": 0.22, "grad_norm": 2.0663231641830873, "learning_rate": 4.8618433648889034e-06, "loss": 0.5937, "step": 249 }, { "epoch": 0.22, "grad_norm": 2.170520506577784, "learning_rate": 4.860695704410856e-06, "loss": 0.5374, "step": 250 }, { "epoch": 0.22, "grad_norm": 1.9685756224067419, "learning_rate": 4.8595434334429535e-06, "loss": 0.5139, "step": 251 }, { "epoch": 0.22, "grad_norm": 1.9668205539999677, "learning_rate": 4.8583865542356065e-06, "loss": 0.5459, "step": 252 }, { "epoch": 0.22, "grad_norm": 2.0793578279258704, "learning_rate": 4.857225069048226e-06, "loss": 0.593, "step": 253 }, { "epoch": 0.22, "grad_norm": 1.9265474492849337, "learning_rate": 4.8560589801492165e-06, "loss": 0.5559, "step": 254 }, { "epoch": 0.23, "grad_norm": 2.8555278122830696, "learning_rate": 4.854888289815976e-06, "loss": 0.5949, "step": 255 }, { "epoch": 0.23, "grad_norm": 2.063838630196542, "learning_rate": 4.853713000334887e-06, "loss": 0.5712, "step": 256 }, { "epoch": 0.23, "grad_norm": 2.168668910730517, "learning_rate": 4.852533114001316e-06, "loss": 0.5475, "step": 257 }, { "epoch": 0.23, "grad_norm": 2.064042820960706, "learning_rate": 4.8513486331196055e-06, "loss": 0.5616, "step": 258 }, { "epoch": 0.23, "grad_norm": 2.026751060346143, "learning_rate": 4.850159560003074e-06, "loss": 0.5997, "step": 259 }, { "epoch": 0.23, "grad_norm": 2.1228129299875254, "learning_rate": 4.848965896974006e-06, "loss": 0.5622, "step": 260 }, { "epoch": 0.23, "grad_norm": 1.9418510365881214, "learning_rate": 4.847767646363652e-06, "loss": 0.5741, "step": 261 }, { "epoch": 0.23, "grad_norm": 2.070611833895483, "learning_rate": 4.846564810512221e-06, "loss": 0.5729, "step": 262 }, { "epoch": 0.23, "grad_norm": 1.8833621440375596, "learning_rate": 4.845357391768877e-06, "loss": 0.5503, "step": 263 }, { "epoch": 0.23, "grad_norm": 2.1022924907055387, "learning_rate": 4.844145392491735e-06, "loss": 0.6204, "step": 264 }, { "epoch": 0.23, "grad_norm": 2.024625007813473, "learning_rate": 4.842928815047856e-06, "loss": 0.5776, "step": 265 }, { "epoch": 0.24, "grad_norm": 1.9123739071371275, "learning_rate": 4.8417076618132434e-06, "loss": 0.5417, "step": 266 }, { "epoch": 0.24, "grad_norm": 2.062879186086598, "learning_rate": 4.8404819351728336e-06, "loss": 0.5387, "step": 267 }, { "epoch": 0.24, "grad_norm": 1.9944627549250884, "learning_rate": 4.8392516375204986e-06, "loss": 0.5731, "step": 268 }, { "epoch": 0.24, "grad_norm": 1.9859912626846585, "learning_rate": 4.838016771259037e-06, "loss": 0.5969, "step": 269 }, { "epoch": 0.24, "grad_norm": 2.043069520519082, "learning_rate": 4.836777338800168e-06, "loss": 0.6217, "step": 270 }, { "epoch": 0.24, "grad_norm": 1.913212451622778, "learning_rate": 4.835533342564531e-06, "loss": 0.5527, "step": 271 }, { "epoch": 0.24, "grad_norm": 1.978858281238778, "learning_rate": 4.834284784981678e-06, "loss": 0.5997, "step": 272 }, { "epoch": 0.24, "grad_norm": 2.004628826916504, "learning_rate": 4.833031668490067e-06, "loss": 0.551, "step": 273 }, { "epoch": 0.24, "grad_norm": 2.164370107566024, "learning_rate": 4.8317739955370645e-06, "loss": 0.5537, "step": 274 }, { "epoch": 0.24, "grad_norm": 1.891772326146366, "learning_rate": 4.83051176857893e-06, "loss": 0.6075, "step": 275 }, { "epoch": 0.24, "grad_norm": 2.0553128913886645, "learning_rate": 4.8292449900808216e-06, "loss": 0.5854, "step": 276 }, { "epoch": 0.25, "grad_norm": 2.009000622167072, "learning_rate": 4.827973662516786e-06, "loss": 0.5503, "step": 277 }, { "epoch": 0.25, "grad_norm": 1.9385043396652537, "learning_rate": 4.826697788369752e-06, "loss": 0.5704, "step": 278 }, { "epoch": 0.25, "grad_norm": 2.3263786060073826, "learning_rate": 4.8254173701315295e-06, "loss": 0.5604, "step": 279 }, { "epoch": 0.25, "grad_norm": 1.9251504140774536, "learning_rate": 4.8241324103028055e-06, "loss": 0.5647, "step": 280 }, { "epoch": 0.25, "grad_norm": 1.9714117964729747, "learning_rate": 4.822842911393131e-06, "loss": 0.604, "step": 281 }, { "epoch": 0.25, "grad_norm": 2.034372279161665, "learning_rate": 4.821548875920927e-06, "loss": 0.5803, "step": 282 }, { "epoch": 0.25, "grad_norm": 1.9849114644945505, "learning_rate": 4.8202503064134725e-06, "loss": 0.5854, "step": 283 }, { "epoch": 0.25, "grad_norm": 2.3435998455971343, "learning_rate": 4.818947205406902e-06, "loss": 0.4988, "step": 284 }, { "epoch": 0.25, "grad_norm": 2.0672779732760924, "learning_rate": 4.8176395754462e-06, "loss": 0.5734, "step": 285 }, { "epoch": 0.25, "grad_norm": 2.1206384205127544, "learning_rate": 4.816327419085197e-06, "loss": 0.563, "step": 286 }, { "epoch": 0.25, "grad_norm": 2.1105254841893095, "learning_rate": 4.815010738886561e-06, "loss": 0.5765, "step": 287 }, { "epoch": 0.26, "grad_norm": 2.072546090747287, "learning_rate": 4.813689537421798e-06, "loss": 0.6003, "step": 288 }, { "epoch": 0.26, "grad_norm": 2.1131138426394442, "learning_rate": 4.812363817271243e-06, "loss": 0.6097, "step": 289 }, { "epoch": 0.26, "grad_norm": 1.9218545344238502, "learning_rate": 4.811033581024056e-06, "loss": 0.6272, "step": 290 }, { "epoch": 0.26, "grad_norm": 2.235420687671868, "learning_rate": 4.809698831278217e-06, "loss": 0.5519, "step": 291 }, { "epoch": 0.26, "grad_norm": 1.8915062282224397, "learning_rate": 4.808359570640522e-06, "loss": 0.5832, "step": 292 }, { "epoch": 0.26, "grad_norm": 1.9185231023206675, "learning_rate": 4.8070158017265755e-06, "loss": 0.5854, "step": 293 }, { "epoch": 0.26, "grad_norm": 2.086526046887808, "learning_rate": 4.805667527160788e-06, "loss": 0.5314, "step": 294 }, { "epoch": 0.26, "grad_norm": 1.9995370937944454, "learning_rate": 4.804314749576368e-06, "loss": 0.5749, "step": 295 }, { "epoch": 0.26, "grad_norm": 2.099313489806141, "learning_rate": 4.802957471615319e-06, "loss": 0.5173, "step": 296 }, { "epoch": 0.26, "grad_norm": 2.067736275086448, "learning_rate": 4.8015956959284346e-06, "loss": 0.5434, "step": 297 }, { "epoch": 0.26, "grad_norm": 2.005525416579935, "learning_rate": 4.800229425175294e-06, "loss": 0.5589, "step": 298 }, { "epoch": 0.26, "grad_norm": 2.172708847484724, "learning_rate": 4.7988586620242515e-06, "loss": 0.5919, "step": 299 }, { "epoch": 0.27, "grad_norm": 2.0010542748493823, "learning_rate": 4.797483409152438e-06, "loss": 0.5803, "step": 300 }, { "epoch": 0.27, "grad_norm": 2.1169505971764506, "learning_rate": 4.7961036692457516e-06, "loss": 0.5763, "step": 301 }, { "epoch": 0.27, "grad_norm": 2.202849419501746, "learning_rate": 4.794719444998856e-06, "loss": 0.5691, "step": 302 }, { "epoch": 0.27, "grad_norm": 1.9765013761990564, "learning_rate": 4.793330739115169e-06, "loss": 0.5657, "step": 303 }, { "epoch": 0.27, "grad_norm": 2.0404392238791136, "learning_rate": 4.791937554306863e-06, "loss": 0.5648, "step": 304 }, { "epoch": 0.27, "grad_norm": 2.0298920886210516, "learning_rate": 4.790539893294861e-06, "loss": 0.5353, "step": 305 }, { "epoch": 0.27, "grad_norm": 2.03157486915788, "learning_rate": 4.789137758808823e-06, "loss": 0.5716, "step": 306 }, { "epoch": 0.27, "grad_norm": 2.060346338513047, "learning_rate": 4.787731153587149e-06, "loss": 0.5502, "step": 307 }, { "epoch": 0.27, "grad_norm": 1.9286831590091769, "learning_rate": 4.786320080376968e-06, "loss": 0.5646, "step": 308 }, { "epoch": 0.27, "grad_norm": 2.042346254905274, "learning_rate": 4.7849045419341376e-06, "loss": 0.6085, "step": 309 }, { "epoch": 0.27, "grad_norm": 2.0758243469708293, "learning_rate": 4.7834845410232356e-06, "loss": 0.5452, "step": 310 }, { "epoch": 0.28, "grad_norm": 2.0454965773706553, "learning_rate": 4.782060080417553e-06, "loss": 0.514, "step": 311 }, { "epoch": 0.28, "grad_norm": 2.073931876222572, "learning_rate": 4.780631162899094e-06, "loss": 0.5884, "step": 312 }, { "epoch": 0.28, "grad_norm": 1.9699688248650635, "learning_rate": 4.7791977912585645e-06, "loss": 0.529, "step": 313 }, { "epoch": 0.28, "grad_norm": 1.9886162974888701, "learning_rate": 4.7777599682953696e-06, "loss": 0.5796, "step": 314 }, { "epoch": 0.28, "grad_norm": 1.9579685029739566, "learning_rate": 4.7763176968176106e-06, "loss": 0.5553, "step": 315 }, { "epoch": 0.28, "grad_norm": 2.2181861411036086, "learning_rate": 4.7748709796420735e-06, "loss": 0.5806, "step": 316 }, { "epoch": 0.28, "grad_norm": 2.0345738930041777, "learning_rate": 4.773419819594228e-06, "loss": 0.6059, "step": 317 }, { "epoch": 0.28, "grad_norm": 2.0710385535524902, "learning_rate": 4.7719642195082224e-06, "loss": 0.5539, "step": 318 }, { "epoch": 0.28, "grad_norm": 2.1239710444371442, "learning_rate": 4.770504182226875e-06, "loss": 0.5655, "step": 319 }, { "epoch": 0.28, "grad_norm": 1.9564631444382952, "learning_rate": 4.769039710601669e-06, "loss": 0.5914, "step": 320 }, { "epoch": 0.28, "grad_norm": 1.9969926160116234, "learning_rate": 4.767570807492752e-06, "loss": 0.55, "step": 321 }, { "epoch": 0.29, "grad_norm": 1.9650736880864492, "learning_rate": 4.766097475768919e-06, "loss": 0.5804, "step": 322 }, { "epoch": 0.29, "grad_norm": 2.1946368157969194, "learning_rate": 4.7646197183076236e-06, "loss": 0.5631, "step": 323 }, { "epoch": 0.29, "grad_norm": 1.9834181085585831, "learning_rate": 4.763137537994955e-06, "loss": 0.5779, "step": 324 }, { "epoch": 0.29, "grad_norm": 2.1081651164417057, "learning_rate": 4.7616509377256445e-06, "loss": 0.5375, "step": 325 }, { "epoch": 0.29, "grad_norm": 1.9972027344990544, "learning_rate": 4.760159920403055e-06, "loss": 0.5608, "step": 326 }, { "epoch": 0.29, "grad_norm": 1.9554967826543683, "learning_rate": 4.758664488939174e-06, "loss": 0.5613, "step": 327 }, { "epoch": 0.29, "grad_norm": 2.211716512822424, "learning_rate": 4.757164646254614e-06, "loss": 0.5863, "step": 328 }, { "epoch": 0.29, "grad_norm": 1.9203184200502181, "learning_rate": 4.755660395278598e-06, "loss": 0.5275, "step": 329 }, { "epoch": 0.29, "grad_norm": 2.0355308159742505, "learning_rate": 4.7541517389489626e-06, "loss": 0.5304, "step": 330 }, { "epoch": 0.29, "grad_norm": 2.005680103405306, "learning_rate": 4.752638680212145e-06, "loss": 0.5782, "step": 331 }, { "epoch": 0.29, "grad_norm": 1.9930094995522492, "learning_rate": 4.751121222023183e-06, "loss": 0.5197, "step": 332 }, { "epoch": 0.29, "grad_norm": 2.130907347619711, "learning_rate": 4.749599367345703e-06, "loss": 0.5453, "step": 333 }, { "epoch": 0.3, "grad_norm": 2.0380649677356715, "learning_rate": 4.748073119151923e-06, "loss": 0.5394, "step": 334 }, { "epoch": 0.3, "grad_norm": 2.02655053696048, "learning_rate": 4.7465424804226366e-06, "loss": 0.5359, "step": 335 }, { "epoch": 0.3, "grad_norm": 2.108255877778432, "learning_rate": 4.745007454147215e-06, "loss": 0.5262, "step": 336 }, { "epoch": 0.3, "grad_norm": 1.8422966312136684, "learning_rate": 4.7434680433235986e-06, "loss": 0.529, "step": 337 }, { "epoch": 0.3, "grad_norm": 2.1387816386921004, "learning_rate": 4.741924250958289e-06, "loss": 0.5599, "step": 338 }, { "epoch": 0.3, "grad_norm": 2.2063774820548794, "learning_rate": 4.740376080066346e-06, "loss": 0.6014, "step": 339 }, { "epoch": 0.3, "grad_norm": 1.917696303327652, "learning_rate": 4.738823533671383e-06, "loss": 0.615, "step": 340 }, { "epoch": 0.3, "grad_norm": 2.0283765999277916, "learning_rate": 4.737266614805554e-06, "loss": 0.5802, "step": 341 }, { "epoch": 0.3, "grad_norm": 2.0340264609590437, "learning_rate": 4.7357053265095575e-06, "loss": 0.5331, "step": 342 }, { "epoch": 0.3, "grad_norm": 2.102037194450825, "learning_rate": 4.734139671832622e-06, "loss": 0.5534, "step": 343 }, { "epoch": 0.3, "grad_norm": 2.4389875670618113, "learning_rate": 4.732569653832505e-06, "loss": 0.5637, "step": 344 }, { "epoch": 0.31, "grad_norm": 2.1143521053252012, "learning_rate": 4.730995275575486e-06, "loss": 0.6539, "step": 345 }, { "epoch": 0.31, "grad_norm": 2.6240136232872064, "learning_rate": 4.7294165401363616e-06, "loss": 0.5515, "step": 346 }, { "epoch": 0.31, "grad_norm": 2.037602072097695, "learning_rate": 4.727833450598433e-06, "loss": 0.5609, "step": 347 }, { "epoch": 0.31, "grad_norm": 2.10711733636797, "learning_rate": 4.72624601005351e-06, "loss": 0.5719, "step": 348 }, { "epoch": 0.31, "grad_norm": 2.277613433738313, "learning_rate": 4.724654221601899e-06, "loss": 0.5815, "step": 349 }, { "epoch": 0.31, "grad_norm": 2.0082624113337824, "learning_rate": 4.7230580883523955e-06, "loss": 0.5524, "step": 350 }, { "epoch": 0.31, "grad_norm": 1.8922591374161477, "learning_rate": 4.721457613422285e-06, "loss": 0.5981, "step": 351 }, { "epoch": 0.31, "grad_norm": 2.108229047424278, "learning_rate": 4.7198527999373266e-06, "loss": 0.57, "step": 352 }, { "epoch": 0.31, "grad_norm": 2.152965480400126, "learning_rate": 4.718243651031759e-06, "loss": 0.5996, "step": 353 }, { "epoch": 0.31, "grad_norm": 1.8885994019827148, "learning_rate": 4.716630169848282e-06, "loss": 0.5543, "step": 354 }, { "epoch": 0.31, "grad_norm": 2.221396082747074, "learning_rate": 4.715012359538062e-06, "loss": 0.5423, "step": 355 }, { "epoch": 0.32, "grad_norm": 2.247525651087526, "learning_rate": 4.7133902232607145e-06, "loss": 0.6049, "step": 356 }, { "epoch": 0.32, "grad_norm": 1.905837742487114, "learning_rate": 4.711763764184309e-06, "loss": 0.5523, "step": 357 }, { "epoch": 0.32, "grad_norm": 2.117965067814315, "learning_rate": 4.710132985485355e-06, "loss": 0.5682, "step": 358 }, { "epoch": 0.32, "grad_norm": 2.1530948606389373, "learning_rate": 4.7084978903487985e-06, "loss": 0.5506, "step": 359 }, { "epoch": 0.32, "grad_norm": 1.8738866858316863, "learning_rate": 4.706858481968017e-06, "loss": 0.5426, "step": 360 }, { "epoch": 0.32, "grad_norm": 1.9967053512246618, "learning_rate": 4.705214763544806e-06, "loss": 0.5555, "step": 361 }, { "epoch": 0.32, "grad_norm": 2.352080896364055, "learning_rate": 4.703566738289389e-06, "loss": 0.587, "step": 362 }, { "epoch": 0.32, "grad_norm": 2.031696719881503, "learning_rate": 4.701914409420392e-06, "loss": 0.6088, "step": 363 }, { "epoch": 0.32, "grad_norm": 2.140107830595095, "learning_rate": 4.700257780164849e-06, "loss": 0.5596, "step": 364 }, { "epoch": 0.32, "grad_norm": 2.125236417141067, "learning_rate": 4.698596853758194e-06, "loss": 0.5513, "step": 365 }, { "epoch": 0.32, "grad_norm": 1.8878623518397697, "learning_rate": 4.696931633444251e-06, "loss": 0.5557, "step": 366 }, { "epoch": 0.33, "grad_norm": 1.9523463678463824, "learning_rate": 4.695262122475232e-06, "loss": 0.5317, "step": 367 }, { "epoch": 0.33, "grad_norm": 2.3748547328434455, "learning_rate": 4.6935883241117286e-06, "loss": 0.5733, "step": 368 }, { "epoch": 0.33, "grad_norm": 1.9248854873148575, "learning_rate": 4.691910241622704e-06, "loss": 0.5523, "step": 369 }, { "epoch": 0.33, "grad_norm": 2.1731794693383923, "learning_rate": 4.69022787828549e-06, "loss": 0.6489, "step": 370 }, { "epoch": 0.33, "grad_norm": 1.996570702327501, "learning_rate": 4.688541237385781e-06, "loss": 0.584, "step": 371 }, { "epoch": 0.33, "grad_norm": 2.0272036390008097, "learning_rate": 4.68685032221762e-06, "loss": 0.554, "step": 372 }, { "epoch": 0.33, "grad_norm": 1.9986403184037858, "learning_rate": 4.685155136083401e-06, "loss": 0.5798, "step": 373 }, { "epoch": 0.33, "grad_norm": 2.24642442330448, "learning_rate": 4.683455682293863e-06, "loss": 0.5486, "step": 374 }, { "epoch": 0.33, "grad_norm": 2.916261956844043, "learning_rate": 4.681751964168071e-06, "loss": 0.5678, "step": 375 }, { "epoch": 0.33, "grad_norm": 2.1597492287443396, "learning_rate": 4.680043985033427e-06, "loss": 0.5801, "step": 376 }, { "epoch": 0.33, "grad_norm": 1.9634034606261326, "learning_rate": 4.6783317482256506e-06, "loss": 0.5412, "step": 377 }, { "epoch": 0.33, "grad_norm": 2.0128604293697263, "learning_rate": 4.676615257088777e-06, "loss": 0.5538, "step": 378 }, { "epoch": 0.34, "grad_norm": 2.2205659530523976, "learning_rate": 4.674894514975149e-06, "loss": 0.494, "step": 379 }, { "epoch": 0.34, "grad_norm": 2.000557085172021, "learning_rate": 4.673169525245416e-06, "loss": 0.5459, "step": 380 }, { "epoch": 0.34, "grad_norm": 2.0089256125274826, "learning_rate": 4.671440291268518e-06, "loss": 0.5729, "step": 381 }, { "epoch": 0.34, "grad_norm": 2.076112293053539, "learning_rate": 4.66970681642169e-06, "loss": 0.5277, "step": 382 }, { "epoch": 0.34, "grad_norm": 1.996445627957894, "learning_rate": 4.667969104090441e-06, "loss": 0.5879, "step": 383 }, { "epoch": 0.34, "grad_norm": 2.379165029211644, "learning_rate": 4.666227157668564e-06, "loss": 0.5924, "step": 384 }, { "epoch": 0.34, "grad_norm": 2.101190475222136, "learning_rate": 4.664480980558118e-06, "loss": 0.6466, "step": 385 }, { "epoch": 0.34, "grad_norm": 2.035159570620747, "learning_rate": 4.662730576169423e-06, "loss": 0.5979, "step": 386 }, { "epoch": 0.34, "grad_norm": 2.1034174780447814, "learning_rate": 4.660975947921058e-06, "loss": 0.5635, "step": 387 }, { "epoch": 0.34, "grad_norm": 2.131573174129039, "learning_rate": 4.65921709923985e-06, "loss": 0.5602, "step": 388 }, { "epoch": 0.34, "grad_norm": 1.9282515780121203, "learning_rate": 4.657454033560868e-06, "loss": 0.5292, "step": 389 }, { "epoch": 0.35, "grad_norm": 1.922997066030009, "learning_rate": 4.655686754327419e-06, "loss": 0.5475, "step": 390 }, { "epoch": 0.35, "grad_norm": 1.9692624098665525, "learning_rate": 4.653915264991035e-06, "loss": 0.5529, "step": 391 }, { "epoch": 0.35, "grad_norm": 1.976011234185068, "learning_rate": 4.652139569011475e-06, "loss": 0.5439, "step": 392 }, { "epoch": 0.35, "grad_norm": 1.909657950321316, "learning_rate": 4.650359669856711e-06, "loss": 0.5558, "step": 393 }, { "epoch": 0.35, "grad_norm": 1.9134183734362904, "learning_rate": 4.648575571002926e-06, "loss": 0.5428, "step": 394 }, { "epoch": 0.35, "grad_norm": 2.067168876792994, "learning_rate": 4.646787275934501e-06, "loss": 0.6261, "step": 395 }, { "epoch": 0.35, "grad_norm": 1.9358304010171785, "learning_rate": 4.644994788144017e-06, "loss": 0.5698, "step": 396 }, { "epoch": 0.35, "grad_norm": 1.9671634072657547, "learning_rate": 4.643198111132241e-06, "loss": 0.5345, "step": 397 }, { "epoch": 0.35, "grad_norm": 2.0176052011599133, "learning_rate": 4.641397248408122e-06, "loss": 0.5028, "step": 398 }, { "epoch": 0.35, "grad_norm": 1.9960700483606102, "learning_rate": 4.639592203488784e-06, "loss": 0.5253, "step": 399 }, { "epoch": 0.35, "grad_norm": 1.9329472749401087, "learning_rate": 4.63778297989952e-06, "loss": 0.615, "step": 400 }, { "epoch": 0.36, "grad_norm": 1.9689526846990402, "learning_rate": 4.6359695811737805e-06, "loss": 0.5558, "step": 401 }, { "epoch": 0.36, "grad_norm": 2.043494453339269, "learning_rate": 4.634152010853175e-06, "loss": 0.5955, "step": 402 }, { "epoch": 0.36, "grad_norm": 1.9251519214200417, "learning_rate": 4.632330272487455e-06, "loss": 0.5587, "step": 403 }, { "epoch": 0.36, "grad_norm": 2.2049650629169495, "learning_rate": 4.6305043696345175e-06, "loss": 0.5633, "step": 404 }, { "epoch": 0.36, "grad_norm": 1.8971004366601951, "learning_rate": 4.628674305860389e-06, "loss": 0.5147, "step": 405 }, { "epoch": 0.36, "grad_norm": 1.958131978242853, "learning_rate": 4.626840084739224e-06, "loss": 0.558, "step": 406 }, { "epoch": 0.36, "grad_norm": 1.8809187299789303, "learning_rate": 4.625001709853296e-06, "loss": 0.6029, "step": 407 }, { "epoch": 0.36, "grad_norm": 2.07376704403877, "learning_rate": 4.623159184792992e-06, "loss": 0.5985, "step": 408 }, { "epoch": 0.36, "grad_norm": 1.9773215118384355, "learning_rate": 4.621312513156801e-06, "loss": 0.5592, "step": 409 }, { "epoch": 0.36, "grad_norm": 2.2454931529711373, "learning_rate": 4.6194616985513144e-06, "loss": 0.5265, "step": 410 }, { "epoch": 0.36, "grad_norm": 1.917266484743525, "learning_rate": 4.617606744591214e-06, "loss": 0.5579, "step": 411 }, { "epoch": 0.36, "grad_norm": 1.9196448264725143, "learning_rate": 4.615747654899263e-06, "loss": 0.5345, "step": 412 }, { "epoch": 0.37, "grad_norm": 1.9733157447209138, "learning_rate": 4.613884433106306e-06, "loss": 0.528, "step": 413 }, { "epoch": 0.37, "grad_norm": 1.994664364309963, "learning_rate": 4.612017082851253e-06, "loss": 0.5489, "step": 414 }, { "epoch": 0.37, "grad_norm": 1.8266904473141898, "learning_rate": 4.610145607781081e-06, "loss": 0.5411, "step": 415 }, { "epoch": 0.37, "grad_norm": 2.0294108873934364, "learning_rate": 4.608270011550823e-06, "loss": 0.5963, "step": 416 }, { "epoch": 0.37, "grad_norm": 1.9735002273071562, "learning_rate": 4.606390297823555e-06, "loss": 0.5858, "step": 417 }, { "epoch": 0.37, "grad_norm": 1.8987568737188125, "learning_rate": 4.604506470270403e-06, "loss": 0.493, "step": 418 }, { "epoch": 0.37, "grad_norm": 1.9371998611194052, "learning_rate": 4.6026185325705195e-06, "loss": 0.521, "step": 419 }, { "epoch": 0.37, "grad_norm": 1.8926221916061328, "learning_rate": 4.60072648841109e-06, "loss": 0.4922, "step": 420 }, { "epoch": 0.37, "grad_norm": 1.8759546163633927, "learning_rate": 4.598830341487317e-06, "loss": 0.5487, "step": 421 }, { "epoch": 0.37, "grad_norm": 1.9425705301229708, "learning_rate": 4.596930095502416e-06, "loss": 0.5155, "step": 422 }, { "epoch": 0.37, "grad_norm": 1.8718904454318124, "learning_rate": 4.59502575416761e-06, "loss": 0.5372, "step": 423 }, { "epoch": 0.38, "grad_norm": 1.8361742824749525, "learning_rate": 4.593117321202117e-06, "loss": 0.556, "step": 424 }, { "epoch": 0.38, "grad_norm": 1.8520540031413573, "learning_rate": 4.59120480033315e-06, "loss": 0.6213, "step": 425 }, { "epoch": 0.38, "grad_norm": 1.9670746741442957, "learning_rate": 4.5892881952959015e-06, "loss": 0.5685, "step": 426 }, { "epoch": 0.38, "grad_norm": 1.969557039139786, "learning_rate": 4.587367509833543e-06, "loss": 0.5472, "step": 427 }, { "epoch": 0.38, "grad_norm": 1.9873217018861624, "learning_rate": 4.585442747697218e-06, "loss": 0.5419, "step": 428 }, { "epoch": 0.38, "grad_norm": 1.9508580236237527, "learning_rate": 4.5835139126460234e-06, "loss": 0.566, "step": 429 }, { "epoch": 0.38, "grad_norm": 1.8929503262145966, "learning_rate": 4.58158100844702e-06, "loss": 0.5526, "step": 430 }, { "epoch": 0.38, "grad_norm": 1.9394545018501204, "learning_rate": 4.57964403887521e-06, "loss": 0.5469, "step": 431 }, { "epoch": 0.38, "grad_norm": 2.1045619298179927, "learning_rate": 4.577703007713538e-06, "loss": 0.5397, "step": 432 }, { "epoch": 0.38, "grad_norm": 1.8886665443222683, "learning_rate": 4.575757918752879e-06, "loss": 0.5174, "step": 433 }, { "epoch": 0.38, "grad_norm": 1.849256286655662, "learning_rate": 4.573808775792033e-06, "loss": 0.558, "step": 434 }, { "epoch": 0.39, "grad_norm": 1.89537230772545, "learning_rate": 4.5718555826377195e-06, "loss": 0.6155, "step": 435 }, { "epoch": 0.39, "grad_norm": 2.028600611269796, "learning_rate": 4.569898343104568e-06, "loss": 0.5639, "step": 436 }, { "epoch": 0.39, "grad_norm": 2.1153787641168273, "learning_rate": 4.567937061015107e-06, "loss": 0.5883, "step": 437 }, { "epoch": 0.39, "grad_norm": 2.0217937777574075, "learning_rate": 4.5659717401997655e-06, "loss": 0.5936, "step": 438 }, { "epoch": 0.39, "grad_norm": 2.248716610859176, "learning_rate": 4.564002384496856e-06, "loss": 0.5539, "step": 439 }, { "epoch": 0.39, "grad_norm": 1.9689879082294663, "learning_rate": 4.562028997752574e-06, "loss": 0.5636, "step": 440 }, { "epoch": 0.39, "grad_norm": 1.763292547062648, "learning_rate": 4.560051583820987e-06, "loss": 0.5402, "step": 441 }, { "epoch": 0.39, "grad_norm": 2.129235681815295, "learning_rate": 4.558070146564025e-06, "loss": 0.5279, "step": 442 }, { "epoch": 0.39, "grad_norm": 1.987329959970642, "learning_rate": 4.55608468985148e-06, "loss": 0.5597, "step": 443 }, { "epoch": 0.39, "grad_norm": 1.8223595251951752, "learning_rate": 4.554095217560991e-06, "loss": 0.5523, "step": 444 }, { "epoch": 0.39, "grad_norm": 1.8945373677348296, "learning_rate": 4.55210173357804e-06, "loss": 0.5611, "step": 445 }, { "epoch": 0.4, "grad_norm": 1.8010628987468362, "learning_rate": 4.550104241795946e-06, "loss": 0.5406, "step": 446 }, { "epoch": 0.4, "grad_norm": 1.7680591979019162, "learning_rate": 4.548102746115852e-06, "loss": 0.5392, "step": 447 }, { "epoch": 0.4, "grad_norm": 1.9894409183828397, "learning_rate": 4.546097250446724e-06, "loss": 0.568, "step": 448 }, { "epoch": 0.4, "grad_norm": 1.9527217933389673, "learning_rate": 4.544087758705338e-06, "loss": 0.5616, "step": 449 }, { "epoch": 0.4, "grad_norm": 1.8813970745759399, "learning_rate": 4.5420742748162735e-06, "loss": 0.5857, "step": 450 }, { "epoch": 0.4, "grad_norm": 1.9697471415378363, "learning_rate": 4.540056802711911e-06, "loss": 0.5563, "step": 451 }, { "epoch": 0.4, "grad_norm": 1.8610261764458738, "learning_rate": 4.5380353463324135e-06, "loss": 0.5414, "step": 452 }, { "epoch": 0.4, "grad_norm": 2.0760585222699075, "learning_rate": 4.536009909625733e-06, "loss": 0.6113, "step": 453 }, { "epoch": 0.4, "grad_norm": 1.9376608369819073, "learning_rate": 4.533980496547588e-06, "loss": 0.5567, "step": 454 }, { "epoch": 0.4, "grad_norm": 1.9360208325717025, "learning_rate": 4.5319471110614676e-06, "loss": 0.5637, "step": 455 }, { "epoch": 0.4, "grad_norm": 1.9103146510774847, "learning_rate": 4.529909757138619e-06, "loss": 0.5049, "step": 456 }, { "epoch": 0.4, "grad_norm": 1.9645365532954322, "learning_rate": 4.5278684387580356e-06, "loss": 0.5424, "step": 457 }, { "epoch": 0.41, "grad_norm": 2.0430691701895065, "learning_rate": 4.52582315990646e-06, "loss": 0.547, "step": 458 }, { "epoch": 0.41, "grad_norm": 1.995685349345533, "learning_rate": 4.523773924578362e-06, "loss": 0.6005, "step": 459 }, { "epoch": 0.41, "grad_norm": 1.9830544751269077, "learning_rate": 4.521720736775947e-06, "loss": 0.5563, "step": 460 }, { "epoch": 0.41, "grad_norm": 1.8473463212841006, "learning_rate": 4.519663600509131e-06, "loss": 0.5913, "step": 461 }, { "epoch": 0.41, "grad_norm": 1.8993140839815026, "learning_rate": 4.5176025197955495e-06, "loss": 0.5653, "step": 462 }, { "epoch": 0.41, "grad_norm": 1.8179551662772986, "learning_rate": 4.515537498660535e-06, "loss": 0.5485, "step": 463 }, { "epoch": 0.41, "grad_norm": 1.9275228062086758, "learning_rate": 4.51346854113712e-06, "loss": 0.5248, "step": 464 }, { "epoch": 0.41, "grad_norm": 1.9668428438048349, "learning_rate": 4.511395651266023e-06, "loss": 0.5939, "step": 465 }, { "epoch": 0.41, "grad_norm": 1.9602042152930792, "learning_rate": 4.509318833095642e-06, "loss": 0.5452, "step": 466 }, { "epoch": 0.41, "grad_norm": 1.8348566721600683, "learning_rate": 4.507238090682049e-06, "loss": 0.5514, "step": 467 }, { "epoch": 0.41, "grad_norm": 1.938525142403929, "learning_rate": 4.505153428088979e-06, "loss": 0.5822, "step": 468 }, { "epoch": 0.42, "grad_norm": 2.008973560332548, "learning_rate": 4.503064849387822e-06, "loss": 0.5765, "step": 469 }, { "epoch": 0.42, "grad_norm": 1.8911779425902009, "learning_rate": 4.500972358657618e-06, "loss": 0.5465, "step": 470 }, { "epoch": 0.42, "grad_norm": 1.9224818772820709, "learning_rate": 4.4988759599850485e-06, "loss": 0.5897, "step": 471 }, { "epoch": 0.42, "grad_norm": 1.990817812633161, "learning_rate": 4.496775657464423e-06, "loss": 0.5505, "step": 472 }, { "epoch": 0.42, "grad_norm": 1.9167562026803746, "learning_rate": 4.4946714551976795e-06, "loss": 0.5779, "step": 473 }, { "epoch": 0.42, "grad_norm": 1.9388400892712594, "learning_rate": 4.492563357294369e-06, "loss": 0.574, "step": 474 }, { "epoch": 0.42, "grad_norm": 2.0140312788131762, "learning_rate": 4.490451367871655e-06, "loss": 0.4928, "step": 475 }, { "epoch": 0.42, "grad_norm": 2.074902721101316, "learning_rate": 4.488335491054296e-06, "loss": 0.5366, "step": 476 }, { "epoch": 0.42, "grad_norm": 1.8245504149698855, "learning_rate": 4.486215730974646e-06, "loss": 0.581, "step": 477 }, { "epoch": 0.42, "grad_norm": 2.1100306515160656, "learning_rate": 4.4840920917726425e-06, "loss": 0.5677, "step": 478 }, { "epoch": 0.42, "grad_norm": 1.9560380000004616, "learning_rate": 4.4819645775958e-06, "loss": 0.5426, "step": 479 }, { "epoch": 0.43, "grad_norm": 1.721267171163405, "learning_rate": 4.479833192599198e-06, "loss": 0.5868, "step": 480 }, { "epoch": 0.43, "grad_norm": 2.0001169229847124, "learning_rate": 4.477697940945478e-06, "loss": 0.5667, "step": 481 }, { "epoch": 0.43, "grad_norm": 2.0111322894409134, "learning_rate": 4.475558826804833e-06, "loss": 0.5707, "step": 482 }, { "epoch": 0.43, "grad_norm": 1.8179588699061133, "learning_rate": 4.473415854355e-06, "loss": 0.5484, "step": 483 }, { "epoch": 0.43, "grad_norm": 2.0491236128150345, "learning_rate": 4.47126902778125e-06, "loss": 0.5575, "step": 484 }, { "epoch": 0.43, "grad_norm": 2.049676347036571, "learning_rate": 4.469118351276381e-06, "loss": 0.5807, "step": 485 }, { "epoch": 0.43, "grad_norm": 1.8999028972772445, "learning_rate": 4.4669638290407115e-06, "loss": 0.5447, "step": 486 }, { "epoch": 0.43, "grad_norm": 2.0754807768031687, "learning_rate": 4.464805465282071e-06, "loss": 0.503, "step": 487 }, { "epoch": 0.43, "grad_norm": 1.9532719169013661, "learning_rate": 4.462643264215789e-06, "loss": 0.5304, "step": 488 }, { "epoch": 0.43, "grad_norm": 2.038547881198709, "learning_rate": 4.460477230064693e-06, "loss": 0.6116, "step": 489 }, { "epoch": 0.43, "grad_norm": 2.1342568039197136, "learning_rate": 4.458307367059092e-06, "loss": 0.5632, "step": 490 }, { "epoch": 0.43, "grad_norm": 1.9267024509918977, "learning_rate": 4.456133679436778e-06, "loss": 0.5574, "step": 491 }, { "epoch": 0.44, "grad_norm": 1.795213135692931, "learning_rate": 4.453956171443008e-06, "loss": 0.5737, "step": 492 }, { "epoch": 0.44, "grad_norm": 1.9428252328171443, "learning_rate": 4.451774847330505e-06, "loss": 0.5685, "step": 493 }, { "epoch": 0.44, "grad_norm": 1.7903749800219122, "learning_rate": 4.449589711359439e-06, "loss": 0.5214, "step": 494 }, { "epoch": 0.44, "grad_norm": 2.111615491479605, "learning_rate": 4.447400767797429e-06, "loss": 0.5329, "step": 495 }, { "epoch": 0.44, "grad_norm": 1.936578332165912, "learning_rate": 4.445208020919531e-06, "loss": 0.543, "step": 496 }, { "epoch": 0.44, "grad_norm": 2.0005145681262473, "learning_rate": 4.4430114750082246e-06, "loss": 0.5593, "step": 497 }, { "epoch": 0.44, "grad_norm": 1.9720912009242426, "learning_rate": 4.4408111343534125e-06, "loss": 0.5812, "step": 498 }, { "epoch": 0.44, "grad_norm": 2.0486055586452787, "learning_rate": 4.4386070032524085e-06, "loss": 0.5563, "step": 499 }, { "epoch": 0.44, "grad_norm": 1.8043262288689983, "learning_rate": 4.436399086009928e-06, "loss": 0.4905, "step": 500 }, { "epoch": 0.44, "grad_norm": 1.9608580808640215, "learning_rate": 4.43418738693808e-06, "loss": 0.5548, "step": 501 }, { "epoch": 0.44, "grad_norm": 2.008548225584814, "learning_rate": 4.431971910356363e-06, "loss": 0.5955, "step": 502 }, { "epoch": 0.45, "grad_norm": 1.8974274240345173, "learning_rate": 4.429752660591648e-06, "loss": 0.5742, "step": 503 }, { "epoch": 0.45, "grad_norm": 1.8257689605722616, "learning_rate": 4.427529641978181e-06, "loss": 0.6177, "step": 504 }, { "epoch": 0.45, "grad_norm": 2.0327301577551764, "learning_rate": 4.425302858857563e-06, "loss": 0.5872, "step": 505 }, { "epoch": 0.45, "grad_norm": 1.9539661576324254, "learning_rate": 4.42307231557875e-06, "loss": 0.5728, "step": 506 }, { "epoch": 0.45, "grad_norm": 1.9346302819034207, "learning_rate": 4.420838016498043e-06, "loss": 0.6019, "step": 507 }, { "epoch": 0.45, "grad_norm": 2.1255667417446054, "learning_rate": 4.418599965979074e-06, "loss": 0.5981, "step": 508 }, { "epoch": 0.45, "grad_norm": 1.8293805714793054, "learning_rate": 4.416358168392806e-06, "loss": 0.5497, "step": 509 }, { "epoch": 0.45, "grad_norm": 1.929762647152706, "learning_rate": 4.414112628117518e-06, "loss": 0.5655, "step": 510 }, { "epoch": 0.45, "grad_norm": 1.9808758258773635, "learning_rate": 4.411863349538798e-06, "loss": 0.5465, "step": 511 }, { "epoch": 0.45, "grad_norm": 2.0413084054198647, "learning_rate": 4.409610337049537e-06, "loss": 0.5264, "step": 512 }, { "epoch": 0.45, "grad_norm": 1.9506473664088613, "learning_rate": 4.4073535950499155e-06, "loss": 0.5284, "step": 513 }, { "epoch": 0.46, "grad_norm": 1.7875399190820846, "learning_rate": 4.405093127947402e-06, "loss": 0.5406, "step": 514 }, { "epoch": 0.46, "grad_norm": 1.9594159192262046, "learning_rate": 4.402828940156735e-06, "loss": 0.573, "step": 515 }, { "epoch": 0.46, "grad_norm": 2.025943836966642, "learning_rate": 4.400561036099924e-06, "loss": 0.5227, "step": 516 }, { "epoch": 0.46, "grad_norm": 1.9439140060564322, "learning_rate": 4.398289420206235e-06, "loss": 0.5802, "step": 517 }, { "epoch": 0.46, "grad_norm": 1.891060025336787, "learning_rate": 4.396014096912182e-06, "loss": 0.55, "step": 518 }, { "epoch": 0.46, "grad_norm": 1.9575594944193413, "learning_rate": 4.393735070661521e-06, "loss": 0.5213, "step": 519 }, { "epoch": 0.46, "grad_norm": 2.024463679893138, "learning_rate": 4.391452345905239e-06, "loss": 0.5354, "step": 520 }, { "epoch": 0.46, "grad_norm": 1.825359223217947, "learning_rate": 4.389165927101549e-06, "loss": 0.5506, "step": 521 }, { "epoch": 0.46, "grad_norm": 2.0284690208197484, "learning_rate": 4.386875818715875e-06, "loss": 0.5763, "step": 522 }, { "epoch": 0.46, "grad_norm": 1.9021830177238082, "learning_rate": 4.3845820252208476e-06, "loss": 0.5596, "step": 523 }, { "epoch": 0.46, "grad_norm": 2.0000504821060203, "learning_rate": 4.3822845510962966e-06, "loss": 0.5701, "step": 524 }, { "epoch": 0.47, "grad_norm": 1.7341340075311633, "learning_rate": 4.379983400829237e-06, "loss": 0.5315, "step": 525 }, { "epoch": 0.47, "grad_norm": 1.9297447671947465, "learning_rate": 4.377678578913868e-06, "loss": 0.5798, "step": 526 }, { "epoch": 0.47, "grad_norm": 1.9233069620366818, "learning_rate": 4.375370089851554e-06, "loss": 0.5391, "step": 527 }, { "epoch": 0.47, "grad_norm": 1.976671700063146, "learning_rate": 4.3730579381508254e-06, "loss": 0.5674, "step": 528 }, { "epoch": 0.47, "grad_norm": 1.914097057045113, "learning_rate": 4.3707421283273645e-06, "loss": 0.5367, "step": 529 }, { "epoch": 0.47, "grad_norm": 1.8477362806445459, "learning_rate": 4.368422664903997e-06, "loss": 0.5349, "step": 530 }, { "epoch": 0.47, "grad_norm": 1.9704477099484594, "learning_rate": 4.366099552410686e-06, "loss": 0.501, "step": 531 }, { "epoch": 0.47, "grad_norm": 1.9297086500071385, "learning_rate": 4.363772795384522e-06, "loss": 0.5352, "step": 532 }, { "epoch": 0.47, "grad_norm": 1.9090996748848685, "learning_rate": 4.36144239836971e-06, "loss": 0.5457, "step": 533 }, { "epoch": 0.47, "grad_norm": 1.905870882711107, "learning_rate": 4.3591083659175655e-06, "loss": 0.5685, "step": 534 }, { "epoch": 0.47, "grad_norm": 1.968618442539214, "learning_rate": 4.356770702586506e-06, "loss": 0.5476, "step": 535 }, { "epoch": 0.47, "grad_norm": 1.9431218136805426, "learning_rate": 4.354429412942038e-06, "loss": 0.5719, "step": 536 }, { "epoch": 0.48, "grad_norm": 2.0756451350956215, "learning_rate": 4.3520845015567495e-06, "loss": 0.5502, "step": 537 }, { "epoch": 0.48, "grad_norm": 1.8350117686217275, "learning_rate": 4.349735973010306e-06, "loss": 0.5417, "step": 538 }, { "epoch": 0.48, "grad_norm": 2.03495920394236, "learning_rate": 4.3473838318894324e-06, "loss": 0.545, "step": 539 }, { "epoch": 0.48, "grad_norm": 1.7864245375307775, "learning_rate": 4.3450280827879125e-06, "loss": 0.5242, "step": 540 }, { "epoch": 0.48, "grad_norm": 1.9018530036883652, "learning_rate": 4.342668730306575e-06, "loss": 0.554, "step": 541 }, { "epoch": 0.48, "grad_norm": 1.8575071370513128, "learning_rate": 4.340305779053286e-06, "loss": 0.5287, "step": 542 }, { "epoch": 0.48, "grad_norm": 1.8480049595126469, "learning_rate": 4.33793923364294e-06, "loss": 0.5554, "step": 543 }, { "epoch": 0.48, "grad_norm": 2.103039565778625, "learning_rate": 4.335569098697454e-06, "loss": 0.5526, "step": 544 }, { "epoch": 0.48, "grad_norm": 1.8712145108160219, "learning_rate": 4.33319537884575e-06, "loss": 0.5472, "step": 545 }, { "epoch": 0.48, "grad_norm": 1.9271972466285336, "learning_rate": 4.330818078723756e-06, "loss": 0.5827, "step": 546 }, { "epoch": 0.48, "grad_norm": 1.954438973741856, "learning_rate": 4.328437202974389e-06, "loss": 0.5433, "step": 547 }, { "epoch": 0.49, "grad_norm": 2.0467264178153726, "learning_rate": 4.326052756247553e-06, "loss": 0.5981, "step": 548 }, { "epoch": 0.49, "grad_norm": 1.9418055408636266, "learning_rate": 4.323664743200123e-06, "loss": 0.5832, "step": 549 }, { "epoch": 0.49, "grad_norm": 2.444044603553196, "learning_rate": 4.32127316849594e-06, "loss": 0.5638, "step": 550 }, { "epoch": 0.49, "grad_norm": 1.8791947879326414, "learning_rate": 4.318878036805802e-06, "loss": 0.5864, "step": 551 }, { "epoch": 0.49, "grad_norm": 1.872356245946924, "learning_rate": 4.3164793528074525e-06, "loss": 0.5337, "step": 552 }, { "epoch": 0.49, "grad_norm": 2.025493213646544, "learning_rate": 4.3140771211855725e-06, "loss": 0.5401, "step": 553 }, { "epoch": 0.49, "grad_norm": 1.9845857759145742, "learning_rate": 4.3116713466317745e-06, "loss": 0.5712, "step": 554 }, { "epoch": 0.49, "grad_norm": 1.9091874317608197, "learning_rate": 4.309262033844587e-06, "loss": 0.5337, "step": 555 }, { "epoch": 0.49, "grad_norm": 1.926646558220673, "learning_rate": 4.30684918752945e-06, "loss": 0.5787, "step": 556 }, { "epoch": 0.49, "grad_norm": 2.0450560123448165, "learning_rate": 4.304432812398704e-06, "loss": 0.5704, "step": 557 }, { "epoch": 0.49, "grad_norm": 1.915800332391142, "learning_rate": 4.302012913171584e-06, "loss": 0.5194, "step": 558 }, { "epoch": 0.5, "grad_norm": 1.9050588229807015, "learning_rate": 4.299589494574204e-06, "loss": 0.5104, "step": 559 }, { "epoch": 0.5, "grad_norm": 1.9241714112001687, "learning_rate": 4.297162561339554e-06, "loss": 0.5388, "step": 560 }, { "epoch": 0.5, "grad_norm": 1.8520273210081386, "learning_rate": 4.294732118207486e-06, "loss": 0.5363, "step": 561 }, { "epoch": 0.5, "grad_norm": 2.0240180827444205, "learning_rate": 4.292298169924709e-06, "loss": 0.5632, "step": 562 }, { "epoch": 0.5, "grad_norm": 1.8385436745856445, "learning_rate": 4.289860721244776e-06, "loss": 0.542, "step": 563 }, { "epoch": 0.5, "grad_norm": 1.9260618068482396, "learning_rate": 4.287419776928078e-06, "loss": 0.5555, "step": 564 }, { "epoch": 0.5, "grad_norm": 3.155290692386073, "learning_rate": 4.284975341741833e-06, "loss": 0.5336, "step": 565 }, { "epoch": 0.5, "grad_norm": 2.461077264148098, "learning_rate": 4.282527420460073e-06, "loss": 0.5794, "step": 566 }, { "epoch": 0.5, "grad_norm": 1.8539810703173831, "learning_rate": 4.280076017863643e-06, "loss": 0.5298, "step": 567 }, { "epoch": 0.5, "grad_norm": 1.981150552962984, "learning_rate": 4.277621138740185e-06, "loss": 0.5862, "step": 568 }, { "epoch": 0.5, "grad_norm": 1.8768796036679432, "learning_rate": 4.275162787884132e-06, "loss": 0.5255, "step": 569 }, { "epoch": 0.5, "grad_norm": 2.022795676637582, "learning_rate": 4.272700970096696e-06, "loss": 0.5984, "step": 570 }, { "epoch": 0.51, "grad_norm": 1.835618231704385, "learning_rate": 4.27023569018586e-06, "loss": 0.5297, "step": 571 }, { "epoch": 0.51, "grad_norm": 1.853495005213679, "learning_rate": 4.267766952966369e-06, "loss": 0.5188, "step": 572 }, { "epoch": 0.51, "grad_norm": 1.8841750183665413, "learning_rate": 4.265294763259721e-06, "loss": 0.5678, "step": 573 }, { "epoch": 0.51, "grad_norm": 1.8013177249236558, "learning_rate": 4.262819125894156e-06, "loss": 0.5286, "step": 574 }, { "epoch": 0.51, "grad_norm": 1.8320928495052518, "learning_rate": 4.2603400457046476e-06, "loss": 0.5341, "step": 575 }, { "epoch": 0.51, "grad_norm": 1.8323864124122828, "learning_rate": 4.257857527532891e-06, "loss": 0.5283, "step": 576 }, { "epoch": 0.51, "grad_norm": 1.9487038959665601, "learning_rate": 4.255371576227301e-06, "loss": 0.5418, "step": 577 }, { "epoch": 0.51, "grad_norm": 1.7875154296015772, "learning_rate": 4.252882196642993e-06, "loss": 0.5065, "step": 578 }, { "epoch": 0.51, "grad_norm": 2.089827238376911, "learning_rate": 4.250389393641778e-06, "loss": 0.5919, "step": 579 }, { "epoch": 0.51, "grad_norm": 1.9078348658003164, "learning_rate": 4.247893172092157e-06, "loss": 0.5212, "step": 580 }, { "epoch": 0.51, "grad_norm": 1.9952457072102052, "learning_rate": 4.245393536869303e-06, "loss": 0.5284, "step": 581 }, { "epoch": 0.52, "grad_norm": 2.0728561008210384, "learning_rate": 4.242890492855056e-06, "loss": 0.5214, "step": 582 }, { "epoch": 0.52, "grad_norm": 1.97825451090628, "learning_rate": 4.240384044937919e-06, "loss": 0.5586, "step": 583 }, { "epoch": 0.52, "grad_norm": 1.85380003580073, "learning_rate": 4.237874198013037e-06, "loss": 0.6078, "step": 584 }, { "epoch": 0.52, "grad_norm": 1.8198051628607304, "learning_rate": 4.235360956982196e-06, "loss": 0.5677, "step": 585 }, { "epoch": 0.52, "grad_norm": 2.1343351043013183, "learning_rate": 4.23284432675381e-06, "loss": 0.5706, "step": 586 }, { "epoch": 0.52, "grad_norm": 2.0294462862804896, "learning_rate": 4.230324312242911e-06, "loss": 0.5399, "step": 587 }, { "epoch": 0.52, "grad_norm": 1.9618881336969853, "learning_rate": 4.227800918371145e-06, "loss": 0.5292, "step": 588 }, { "epoch": 0.52, "grad_norm": 1.9665398714083597, "learning_rate": 4.225274150066752e-06, "loss": 0.5414, "step": 589 }, { "epoch": 0.52, "grad_norm": 2.0976099857689268, "learning_rate": 4.222744012264567e-06, "loss": 0.5204, "step": 590 }, { "epoch": 0.52, "grad_norm": 1.968032018982793, "learning_rate": 4.220210509906002e-06, "loss": 0.5622, "step": 591 }, { "epoch": 0.52, "grad_norm": 2.0055542027073523, "learning_rate": 4.217673647939044e-06, "loss": 0.5723, "step": 592 }, { "epoch": 0.53, "grad_norm": 2.031612125247833, "learning_rate": 4.215133431318239e-06, "loss": 0.5727, "step": 593 }, { "epoch": 0.53, "grad_norm": 2.04253552367063, "learning_rate": 4.212589865004684e-06, "loss": 0.5676, "step": 594 }, { "epoch": 0.53, "grad_norm": 1.9143447724555291, "learning_rate": 4.2100429539660205e-06, "loss": 0.5452, "step": 595 }, { "epoch": 0.53, "grad_norm": 2.1284999811605334, "learning_rate": 4.20749270317642e-06, "loss": 0.5679, "step": 596 }, { "epoch": 0.53, "grad_norm": 1.9726237378545723, "learning_rate": 4.204939117616578e-06, "loss": 0.5514, "step": 597 }, { "epoch": 0.53, "grad_norm": 2.0537722291479583, "learning_rate": 4.202382202273702e-06, "loss": 0.5979, "step": 598 }, { "epoch": 0.53, "grad_norm": 1.9695944675405062, "learning_rate": 4.1998219621415035e-06, "loss": 0.5519, "step": 599 }, { "epoch": 0.53, "grad_norm": 2.1175148159531196, "learning_rate": 4.197258402220187e-06, "loss": 0.5437, "step": 600 }, { "epoch": 0.53, "grad_norm": 1.9698920488340708, "learning_rate": 4.19469152751644e-06, "loss": 0.5765, "step": 601 }, { "epoch": 0.53, "grad_norm": 1.879379971551763, "learning_rate": 4.192121343043424e-06, "loss": 0.5219, "step": 602 }, { "epoch": 0.53, "grad_norm": 1.9668215341266202, "learning_rate": 4.189547853820767e-06, "loss": 0.4967, "step": 603 }, { "epoch": 0.53, "grad_norm": 2.0264415648360723, "learning_rate": 4.186971064874547e-06, "loss": 0.5591, "step": 604 }, { "epoch": 0.54, "grad_norm": 1.9996711001240413, "learning_rate": 4.18439098123729e-06, "loss": 0.5909, "step": 605 }, { "epoch": 0.54, "grad_norm": 1.9209919754307736, "learning_rate": 4.181807607947954e-06, "loss": 0.5516, "step": 606 }, { "epoch": 0.54, "grad_norm": 1.8120062816345244, "learning_rate": 4.1792209500519245e-06, "loss": 0.5112, "step": 607 }, { "epoch": 0.54, "grad_norm": 1.9265993932694714, "learning_rate": 4.176631012601e-06, "loss": 0.5716, "step": 608 }, { "epoch": 0.54, "grad_norm": 1.7951063568824173, "learning_rate": 4.1740378006533835e-06, "loss": 0.5546, "step": 609 }, { "epoch": 0.54, "grad_norm": 1.9478736935670538, "learning_rate": 4.1714413192736756e-06, "loss": 0.5137, "step": 610 }, { "epoch": 0.54, "grad_norm": 1.9166713700159672, "learning_rate": 4.168841573532859e-06, "loss": 0.5285, "step": 611 }, { "epoch": 0.54, "grad_norm": 1.903061790874867, "learning_rate": 4.166238568508294e-06, "loss": 0.5643, "step": 612 }, { "epoch": 0.54, "grad_norm": 1.8709574261812854, "learning_rate": 4.1636323092837065e-06, "loss": 0.5531, "step": 613 }, { "epoch": 0.54, "grad_norm": 1.891374469060374, "learning_rate": 4.161022800949177e-06, "loss": 0.5386, "step": 614 }, { "epoch": 0.54, "grad_norm": 1.8621023435008923, "learning_rate": 4.1584100486011315e-06, "loss": 0.5472, "step": 615 }, { "epoch": 0.55, "grad_norm": 1.8927480615848256, "learning_rate": 4.155794057342333e-06, "loss": 0.567, "step": 616 }, { "epoch": 0.55, "grad_norm": 1.9157957155248084, "learning_rate": 4.153174832281867e-06, "loss": 0.5295, "step": 617 }, { "epoch": 0.55, "grad_norm": 1.7900976303440275, "learning_rate": 4.150552378535138e-06, "loss": 0.5374, "step": 618 }, { "epoch": 0.55, "grad_norm": 1.9233860209522704, "learning_rate": 4.1479267012238555e-06, "loss": 0.5673, "step": 619 }, { "epoch": 0.55, "grad_norm": 1.904244620695313, "learning_rate": 4.145297805476023e-06, "loss": 0.5674, "step": 620 }, { "epoch": 0.55, "grad_norm": 1.8633100020518014, "learning_rate": 4.142665696425932e-06, "loss": 0.5717, "step": 621 }, { "epoch": 0.55, "grad_norm": 2.0449274851229764, "learning_rate": 4.140030379214147e-06, "loss": 0.5382, "step": 622 }, { "epoch": 0.55, "grad_norm": 1.8437126524936716, "learning_rate": 4.137391858987502e-06, "loss": 0.5635, "step": 623 }, { "epoch": 0.55, "grad_norm": 1.9476300616110815, "learning_rate": 4.134750140899082e-06, "loss": 0.5354, "step": 624 }, { "epoch": 0.55, "grad_norm": 1.8187836169409277, "learning_rate": 4.132105230108221e-06, "loss": 0.5678, "step": 625 }, { "epoch": 0.55, "grad_norm": 1.8325255303792565, "learning_rate": 4.1294571317804854e-06, "loss": 0.5497, "step": 626 }, { "epoch": 0.56, "grad_norm": 1.947073088948294, "learning_rate": 4.12680585108767e-06, "loss": 0.6005, "step": 627 }, { "epoch": 0.56, "grad_norm": 1.9094602677105208, "learning_rate": 4.1241513932077835e-06, "loss": 0.5442, "step": 628 }, { "epoch": 0.56, "grad_norm": 1.9308069577521967, "learning_rate": 4.121493763325039e-06, "loss": 0.4952, "step": 629 }, { "epoch": 0.56, "grad_norm": 1.955225453108231, "learning_rate": 4.118832966629847e-06, "loss": 0.5161, "step": 630 }, { "epoch": 0.56, "grad_norm": 1.8884686835300686, "learning_rate": 4.116169008318798e-06, "loss": 0.5834, "step": 631 }, { "epoch": 0.56, "grad_norm": 1.851971220446282, "learning_rate": 4.113501893594662e-06, "loss": 0.5762, "step": 632 }, { "epoch": 0.56, "grad_norm": 1.982231343732386, "learning_rate": 4.110831627666372e-06, "loss": 0.5043, "step": 633 }, { "epoch": 0.56, "grad_norm": 1.8783480932058496, "learning_rate": 4.108158215749014e-06, "loss": 0.5202, "step": 634 }, { "epoch": 0.56, "grad_norm": 1.7472053862830499, "learning_rate": 4.105481663063821e-06, "loss": 0.5064, "step": 635 }, { "epoch": 0.56, "grad_norm": 4.71435326799849, "learning_rate": 4.102801974838158e-06, "loss": 0.5808, "step": 636 }, { "epoch": 0.56, "grad_norm": 1.9383972995582568, "learning_rate": 4.100119156305514e-06, "loss": 0.5268, "step": 637 }, { "epoch": 0.57, "grad_norm": 1.7165619283230378, "learning_rate": 4.097433212705492e-06, "loss": 0.5376, "step": 638 }, { "epoch": 0.57, "grad_norm": 1.8524888535442023, "learning_rate": 4.094744149283796e-06, "loss": 0.5388, "step": 639 }, { "epoch": 0.57, "grad_norm": 1.958121956311822, "learning_rate": 4.092051971292228e-06, "loss": 0.5273, "step": 640 }, { "epoch": 0.57, "grad_norm": 1.8752806971174674, "learning_rate": 4.089356683988668e-06, "loss": 0.5283, "step": 641 }, { "epoch": 0.57, "grad_norm": 2.4399117721583465, "learning_rate": 4.086658292637072e-06, "loss": 0.5643, "step": 642 }, { "epoch": 0.57, "grad_norm": 1.897865148445396, "learning_rate": 4.083956802507456e-06, "loss": 0.5432, "step": 643 }, { "epoch": 0.57, "grad_norm": 2.0947253224544826, "learning_rate": 4.0812522188758874e-06, "loss": 0.6738, "step": 644 }, { "epoch": 0.57, "grad_norm": 1.8801252766945993, "learning_rate": 4.078544547024479e-06, "loss": 0.5516, "step": 645 }, { "epoch": 0.57, "grad_norm": 1.884681207915535, "learning_rate": 4.075833792241371e-06, "loss": 0.5521, "step": 646 }, { "epoch": 0.57, "grad_norm": 1.911314829964074, "learning_rate": 4.073119959820728e-06, "loss": 0.5279, "step": 647 }, { "epoch": 0.57, "grad_norm": 1.860637117587055, "learning_rate": 4.070403055062721e-06, "loss": 0.5543, "step": 648 }, { "epoch": 0.57, "grad_norm": 2.0453601596603157, "learning_rate": 4.0676830832735245e-06, "loss": 0.5757, "step": 649 }, { "epoch": 0.58, "grad_norm": 1.8114060321351384, "learning_rate": 4.064960049765304e-06, "loss": 0.5049, "step": 650 }, { "epoch": 0.58, "grad_norm": 1.959305167631277, "learning_rate": 4.062233959856202e-06, "loss": 0.5378, "step": 651 }, { "epoch": 0.58, "grad_norm": 1.8509512649844786, "learning_rate": 4.059504818870332e-06, "loss": 0.5695, "step": 652 }, { "epoch": 0.58, "grad_norm": 2.0120311393374677, "learning_rate": 4.056772632137762e-06, "loss": 0.5548, "step": 653 }, { "epoch": 0.58, "grad_norm": 2.185006431209757, "learning_rate": 4.054037404994516e-06, "loss": 0.5796, "step": 654 }, { "epoch": 0.58, "grad_norm": 1.8639659087725635, "learning_rate": 4.05129914278255e-06, "loss": 0.503, "step": 655 }, { "epoch": 0.58, "grad_norm": 2.0128366658538726, "learning_rate": 4.048557850849749e-06, "loss": 0.5543, "step": 656 }, { "epoch": 0.58, "grad_norm": 2.0493127075126467, "learning_rate": 4.045813534549917e-06, "loss": 0.5971, "step": 657 }, { "epoch": 0.58, "grad_norm": 1.8943877873256292, "learning_rate": 4.043066199242762e-06, "loss": 0.5512, "step": 658 }, { "epoch": 0.58, "grad_norm": 1.8607643797927613, "learning_rate": 4.04031585029389e-06, "loss": 0.5755, "step": 659 }, { "epoch": 0.58, "grad_norm": 1.933467010931308, "learning_rate": 4.037562493074792e-06, "loss": 0.546, "step": 660 }, { "epoch": 0.59, "grad_norm": 1.870898209604796, "learning_rate": 4.034806132962834e-06, "loss": 0.5101, "step": 661 }, { "epoch": 0.59, "grad_norm": 1.7765005525064146, "learning_rate": 4.032046775341247e-06, "loss": 0.535, "step": 662 }, { "epoch": 0.59, "grad_norm": 1.808388020113739, "learning_rate": 4.029284425599116e-06, "loss": 0.5532, "step": 663 }, { "epoch": 0.59, "grad_norm": 1.9444426383785842, "learning_rate": 4.026519089131371e-06, "loss": 0.5804, "step": 664 }, { "epoch": 0.59, "grad_norm": 1.8810929458792174, "learning_rate": 4.023750771338774e-06, "loss": 0.5023, "step": 665 }, { "epoch": 0.59, "grad_norm": 1.7587173598023012, "learning_rate": 4.020979477627907e-06, "loss": 0.588, "step": 666 }, { "epoch": 0.59, "grad_norm": 1.8616544736960938, "learning_rate": 4.018205213411169e-06, "loss": 0.5604, "step": 667 }, { "epoch": 0.59, "grad_norm": 1.8517363531329913, "learning_rate": 4.015427984106759e-06, "loss": 0.5503, "step": 668 }, { "epoch": 0.59, "grad_norm": 1.7164279131663547, "learning_rate": 4.012647795138664e-06, "loss": 0.5353, "step": 669 }, { "epoch": 0.59, "grad_norm": 1.8490922932257532, "learning_rate": 4.009864651936653e-06, "loss": 0.5527, "step": 670 }, { "epoch": 0.59, "grad_norm": 1.9222471762582807, "learning_rate": 4.007078559936268e-06, "loss": 0.5449, "step": 671 }, { "epoch": 0.6, "grad_norm": 1.7126406752680576, "learning_rate": 4.0042895245788035e-06, "loss": 0.5102, "step": 672 }, { "epoch": 0.6, "grad_norm": 1.7999692875631594, "learning_rate": 4.001497551311308e-06, "loss": 0.514, "step": 673 }, { "epoch": 0.6, "grad_norm": 1.8482521644616647, "learning_rate": 3.998702645586565e-06, "loss": 0.546, "step": 674 }, { "epoch": 0.6, "grad_norm": 1.8124842120343776, "learning_rate": 3.995904812863086e-06, "loss": 0.5432, "step": 675 }, { "epoch": 0.6, "grad_norm": 1.9053654350943952, "learning_rate": 3.993104058605099e-06, "loss": 0.6222, "step": 676 }, { "epoch": 0.6, "grad_norm": 1.851530834120678, "learning_rate": 3.9903003882825396e-06, "loss": 0.5069, "step": 677 }, { "epoch": 0.6, "grad_norm": 1.824612938648448, "learning_rate": 3.987493807371033e-06, "loss": 0.5279, "step": 678 }, { "epoch": 0.6, "grad_norm": 1.8322983038942529, "learning_rate": 3.984684321351895e-06, "loss": 0.504, "step": 679 }, { "epoch": 0.6, "grad_norm": 2.1601679247075105, "learning_rate": 3.981871935712112e-06, "loss": 0.5448, "step": 680 }, { "epoch": 0.6, "grad_norm": 1.9324323412240167, "learning_rate": 3.979056655944335e-06, "loss": 0.5696, "step": 681 }, { "epoch": 0.6, "grad_norm": 1.8887222870071794, "learning_rate": 3.9762384875468645e-06, "loss": 0.5147, "step": 682 }, { "epoch": 0.6, "grad_norm": 1.9025483031058836, "learning_rate": 3.973417436023646e-06, "loss": 0.5322, "step": 683 }, { "epoch": 0.61, "grad_norm": 1.944754689874286, "learning_rate": 3.970593506884254e-06, "loss": 0.564, "step": 684 }, { "epoch": 0.61, "grad_norm": 1.8782062559948918, "learning_rate": 3.9677667056438824e-06, "loss": 0.5179, "step": 685 }, { "epoch": 0.61, "grad_norm": 1.7615090001622373, "learning_rate": 3.964937037823337e-06, "loss": 0.52, "step": 686 }, { "epoch": 0.61, "grad_norm": 1.877979446527034, "learning_rate": 3.962104508949018e-06, "loss": 0.5611, "step": 687 }, { "epoch": 0.61, "grad_norm": 1.8668900126580097, "learning_rate": 3.9592691245529174e-06, "loss": 0.5398, "step": 688 }, { "epoch": 0.61, "grad_norm": 2.0467424748632395, "learning_rate": 3.9564308901726016e-06, "loss": 0.5429, "step": 689 }, { "epoch": 0.61, "grad_norm": 1.7523480652481473, "learning_rate": 3.9535898113512046e-06, "loss": 0.5456, "step": 690 }, { "epoch": 0.61, "grad_norm": 1.9384307177445268, "learning_rate": 3.950745893637414e-06, "loss": 0.5298, "step": 691 }, { "epoch": 0.61, "grad_norm": 2.0200307543606266, "learning_rate": 3.947899142585464e-06, "loss": 0.5813, "step": 692 }, { "epoch": 0.61, "grad_norm": 1.8825594318661294, "learning_rate": 3.945049563755119e-06, "loss": 0.5843, "step": 693 }, { "epoch": 0.61, "grad_norm": 1.801304483173922, "learning_rate": 3.94219716271167e-06, "loss": 0.5332, "step": 694 }, { "epoch": 0.62, "grad_norm": 1.789336412692842, "learning_rate": 3.939341945025918e-06, "loss": 0.5712, "step": 695 }, { "epoch": 0.62, "grad_norm": 1.6764596672056864, "learning_rate": 3.936483916274163e-06, "loss": 0.5471, "step": 696 }, { "epoch": 0.62, "grad_norm": 1.8160991340297739, "learning_rate": 3.933623082038199e-06, "loss": 0.5172, "step": 697 }, { "epoch": 0.62, "grad_norm": 1.9958719154660882, "learning_rate": 3.930759447905298e-06, "loss": 0.5243, "step": 698 }, { "epoch": 0.62, "grad_norm": 1.7844190098902166, "learning_rate": 3.927893019468196e-06, "loss": 0.5679, "step": 699 }, { "epoch": 0.62, "grad_norm": 1.8231700761644845, "learning_rate": 3.925023802325094e-06, "loss": 0.5415, "step": 700 }, { "epoch": 0.62, "grad_norm": 1.8577751348591511, "learning_rate": 3.922151802079633e-06, "loss": 0.5451, "step": 701 }, { "epoch": 0.62, "grad_norm": 1.872268020286279, "learning_rate": 3.919277024340891e-06, "loss": 0.5805, "step": 702 }, { "epoch": 0.62, "grad_norm": 1.956916033214976, "learning_rate": 3.916399474723373e-06, "loss": 0.5142, "step": 703 }, { "epoch": 0.62, "grad_norm": 1.8690696320721123, "learning_rate": 3.913519158846994e-06, "loss": 0.5377, "step": 704 }, { "epoch": 0.62, "grad_norm": 1.8932224298053513, "learning_rate": 3.910636082337076e-06, "loss": 0.5174, "step": 705 }, { "epoch": 0.63, "grad_norm": 1.7671002724508906, "learning_rate": 3.907750250824327e-06, "loss": 0.5227, "step": 706 }, { "epoch": 0.63, "grad_norm": 1.8537234882936333, "learning_rate": 3.904861669944839e-06, "loss": 0.5672, "step": 707 }, { "epoch": 0.63, "grad_norm": 1.8993796687475375, "learning_rate": 3.901970345340075e-06, "loss": 0.5131, "step": 708 }, { "epoch": 0.63, "grad_norm": 1.8118617206389966, "learning_rate": 3.899076282656853e-06, "loss": 0.5243, "step": 709 }, { "epoch": 0.63, "grad_norm": 1.8195324114535576, "learning_rate": 3.89617948754734e-06, "loss": 0.5255, "step": 710 }, { "epoch": 0.63, "grad_norm": 1.777076552111516, "learning_rate": 3.89327996566904e-06, "loss": 0.5482, "step": 711 }, { "epoch": 0.63, "grad_norm": 1.7960584295638569, "learning_rate": 3.890377722684782e-06, "loss": 0.5232, "step": 712 }, { "epoch": 0.63, "grad_norm": 2.0180517293259777, "learning_rate": 3.887472764262709e-06, "loss": 0.4988, "step": 713 }, { "epoch": 0.63, "grad_norm": 1.7698597985590767, "learning_rate": 3.884565096076269e-06, "loss": 0.4934, "step": 714 }, { "epoch": 0.63, "grad_norm": 1.9593013419554524, "learning_rate": 3.8816547238042e-06, "loss": 0.554, "step": 715 }, { "epoch": 0.63, "grad_norm": 1.803176799671639, "learning_rate": 3.878741653130521e-06, "loss": 0.5058, "step": 716 }, { "epoch": 0.64, "grad_norm": 1.8739139669777212, "learning_rate": 3.875825889744525e-06, "loss": 0.5291, "step": 717 }, { "epoch": 0.64, "grad_norm": 1.7425957572489872, "learning_rate": 3.872907439340758e-06, "loss": 0.5132, "step": 718 }, { "epoch": 0.64, "grad_norm": 1.7880023308134785, "learning_rate": 3.86998630761902e-06, "loss": 0.5388, "step": 719 }, { "epoch": 0.64, "grad_norm": 2.035324802689225, "learning_rate": 3.867062500284342e-06, "loss": 0.5225, "step": 720 }, { "epoch": 0.64, "grad_norm": 1.7720228048563502, "learning_rate": 3.864136023046984e-06, "loss": 0.5535, "step": 721 }, { "epoch": 0.64, "grad_norm": 1.893636721431615, "learning_rate": 3.861206881622419e-06, "loss": 0.5445, "step": 722 }, { "epoch": 0.64, "grad_norm": 1.9975882991420841, "learning_rate": 3.8582750817313245e-06, "loss": 0.498, "step": 723 }, { "epoch": 0.64, "grad_norm": 1.8894358056153195, "learning_rate": 3.855340629099568e-06, "loss": 0.5262, "step": 724 }, { "epoch": 0.64, "grad_norm": 1.8226831631189866, "learning_rate": 3.852403529458199e-06, "loss": 0.5289, "step": 725 }, { "epoch": 0.64, "grad_norm": 1.9219589460322386, "learning_rate": 3.84946378854344e-06, "loss": 0.5828, "step": 726 }, { "epoch": 0.64, "grad_norm": 1.9524000874112546, "learning_rate": 3.846521412096665e-06, "loss": 0.5755, "step": 727 }, { "epoch": 0.64, "grad_norm": 1.7855988589662195, "learning_rate": 3.8435764058643994e-06, "loss": 0.508, "step": 728 }, { "epoch": 0.65, "grad_norm": 1.7556968697529176, "learning_rate": 3.840628775598306e-06, "loss": 0.5038, "step": 729 }, { "epoch": 0.65, "grad_norm": 1.8615629845007688, "learning_rate": 3.837678527055168e-06, "loss": 0.5658, "step": 730 }, { "epoch": 0.65, "grad_norm": 3.355106616980178, "learning_rate": 3.834725665996889e-06, "loss": 0.6255, "step": 731 }, { "epoch": 0.65, "grad_norm": 2.057901705133853, "learning_rate": 3.8317701981904655e-06, "loss": 0.5009, "step": 732 }, { "epoch": 0.65, "grad_norm": 1.8144866213511652, "learning_rate": 3.828812129407994e-06, "loss": 0.5378, "step": 733 }, { "epoch": 0.65, "grad_norm": 1.895740992214761, "learning_rate": 3.825851465426643e-06, "loss": 0.5414, "step": 734 }, { "epoch": 0.65, "grad_norm": 1.7690202691648218, "learning_rate": 3.822888212028658e-06, "loss": 0.5782, "step": 735 }, { "epoch": 0.65, "grad_norm": 1.9910212850942313, "learning_rate": 3.819922375001334e-06, "loss": 0.538, "step": 736 }, { "epoch": 0.65, "grad_norm": 2.022977401775343, "learning_rate": 3.816953960137017e-06, "loss": 0.5265, "step": 737 }, { "epoch": 0.65, "grad_norm": 2.18942238369997, "learning_rate": 3.8139829732330833e-06, "loss": 0.5419, "step": 738 }, { "epoch": 0.65, "grad_norm": 2.0143145051916487, "learning_rate": 3.8110094200919356e-06, "loss": 0.5396, "step": 739 }, { "epoch": 0.66, "grad_norm": 1.8684895296380082, "learning_rate": 3.8080333065209885e-06, "loss": 0.5285, "step": 740 }, { "epoch": 0.66, "grad_norm": 1.899758991227905, "learning_rate": 3.8050546383326546e-06, "loss": 0.5392, "step": 741 }, { "epoch": 0.66, "grad_norm": 1.7830347822365242, "learning_rate": 3.8020734213443392e-06, "loss": 0.5395, "step": 742 }, { "epoch": 0.66, "grad_norm": 1.9688219937316351, "learning_rate": 3.799089661378423e-06, "loss": 0.5832, "step": 743 }, { "epoch": 0.66, "grad_norm": 1.8380061964557934, "learning_rate": 3.7961033642622536e-06, "loss": 0.5182, "step": 744 }, { "epoch": 0.66, "grad_norm": 1.9752769027783192, "learning_rate": 3.793114535828134e-06, "loss": 0.5189, "step": 745 }, { "epoch": 0.66, "grad_norm": 1.9908258845677271, "learning_rate": 3.7901231819133104e-06, "loss": 0.5863, "step": 746 }, { "epoch": 0.66, "grad_norm": 1.8419144313470388, "learning_rate": 3.787129308359963e-06, "loss": 0.5596, "step": 747 }, { "epoch": 0.66, "grad_norm": 1.8578409208981632, "learning_rate": 3.7841329210151905e-06, "loss": 0.5757, "step": 748 }, { "epoch": 0.66, "grad_norm": 1.8125362585272666, "learning_rate": 3.7811340257310036e-06, "loss": 0.5625, "step": 749 }, { "epoch": 0.66, "grad_norm": 1.8266843142853604, "learning_rate": 3.778132628364309e-06, "loss": 0.5121, "step": 750 }, { "epoch": 0.67, "grad_norm": 1.9286747700189457, "learning_rate": 3.7751287347769006e-06, "loss": 0.5856, "step": 751 }, { "epoch": 0.67, "grad_norm": 1.8358169963837994, "learning_rate": 3.772122350835447e-06, "loss": 0.5363, "step": 752 }, { "epoch": 0.67, "grad_norm": 1.8751145280860322, "learning_rate": 3.769113482411483e-06, "loss": 0.5435, "step": 753 }, { "epoch": 0.67, "grad_norm": 1.7372022137266947, "learning_rate": 3.766102135381393e-06, "loss": 0.5114, "step": 754 } ], "logging_steps": 1, "max_steps": 2258, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 377, "total_flos": 355094809804800.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }