{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0017714791851196, "eval_steps": 500, "global_step": 1131, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 15.385598322874909, "learning_rate": 5.000000000000001e-07, "loss": 0.7627, "step": 1 }, { "epoch": 0.0, "grad_norm": 15.42812332406859, "learning_rate": 1.0000000000000002e-06, "loss": 0.794, "step": 2 }, { "epoch": 0.0, "grad_norm": 13.76934599903778, "learning_rate": 1.5e-06, "loss": 0.7894, "step": 3 }, { "epoch": 0.0, "grad_norm": 7.9055471186770685, "learning_rate": 2.0000000000000003e-06, "loss": 0.7346, "step": 4 }, { "epoch": 0.0, "grad_norm": 8.624179170790118, "learning_rate": 2.5e-06, "loss": 0.7458, "step": 5 }, { "epoch": 0.01, "grad_norm": 37.14544394485457, "learning_rate": 3e-06, "loss": 0.8249, "step": 6 }, { "epoch": 0.01, "grad_norm": 13.413192499879626, "learning_rate": 3.5e-06, "loss": 0.7692, "step": 7 }, { "epoch": 0.01, "grad_norm": 11.194156755277431, "learning_rate": 4.000000000000001e-06, "loss": 0.7724, "step": 8 }, { "epoch": 0.01, "grad_norm": 8.569279640169995, "learning_rate": 4.5e-06, "loss": 0.7851, "step": 9 }, { "epoch": 0.01, "grad_norm": 18.113903622060178, "learning_rate": 5e-06, "loss": 0.7874, "step": 10 }, { "epoch": 0.01, "grad_norm": 7.486914001687124, "learning_rate": 4.999997558722919e-06, "loss": 0.7553, "step": 11 }, { "epoch": 0.01, "grad_norm": 7.280219682440894, "learning_rate": 4.999990234896445e-06, "loss": 0.7095, "step": 12 }, { "epoch": 0.01, "grad_norm": 4.3413734180304155, "learning_rate": 4.99997802853488e-06, "loss": 0.6916, "step": 13 }, { "epoch": 0.01, "grad_norm": 5.756315245615391, "learning_rate": 4.999960939662063e-06, "loss": 0.7407, "step": 14 }, { "epoch": 0.01, "grad_norm": 5.090553047874293, "learning_rate": 4.999938968311371e-06, "loss": 0.7387, "step": 15 }, { "epoch": 0.01, "grad_norm": 5.8370558847287075, "learning_rate": 4.9999121145257126e-06, "loss": 0.7051, "step": 16 }, { "epoch": 0.02, "grad_norm": 3.986658012877664, "learning_rate": 4.999880378357535e-06, "loss": 0.6871, "step": 17 }, { "epoch": 0.02, "grad_norm": 4.141716122521651, "learning_rate": 4.9998437598688195e-06, "loss": 0.6694, "step": 18 }, { "epoch": 0.02, "grad_norm": 4.729722439630604, "learning_rate": 4.9998022591310815e-06, "loss": 0.716, "step": 19 }, { "epoch": 0.02, "grad_norm": 2.9486336901615497, "learning_rate": 4.999755876225375e-06, "loss": 0.6387, "step": 20 }, { "epoch": 0.02, "grad_norm": 2.8336874650575745, "learning_rate": 4.999704611242285e-06, "loss": 0.6542, "step": 21 }, { "epoch": 0.02, "grad_norm": 3.6724374918638905, "learning_rate": 4.999648464281934e-06, "loss": 0.6617, "step": 22 }, { "epoch": 0.02, "grad_norm": 2.941494127880678, "learning_rate": 4.999587435453979e-06, "loss": 0.6687, "step": 23 }, { "epoch": 0.02, "grad_norm": 2.6261822206464744, "learning_rate": 4.999521524877608e-06, "loss": 0.6634, "step": 24 }, { "epoch": 0.02, "grad_norm": 2.8059947014946305, "learning_rate": 4.999450732681549e-06, "loss": 0.6901, "step": 25 }, { "epoch": 0.02, "grad_norm": 3.131537494217822, "learning_rate": 4.999375059004058e-06, "loss": 0.6407, "step": 26 }, { "epoch": 0.02, "grad_norm": 2.7893212245465837, "learning_rate": 4.99929450399293e-06, "loss": 0.6638, "step": 27 }, { "epoch": 0.02, "grad_norm": 2.4411586751746, "learning_rate": 4.999209067805487e-06, "loss": 0.6196, "step": 28 }, { "epoch": 0.03, "grad_norm": 2.8807261299944082, "learning_rate": 4.999118750608591e-06, "loss": 0.6839, "step": 29 }, { "epoch": 0.03, "grad_norm": 2.879993804839069, "learning_rate": 4.9990235525786326e-06, "loss": 0.6484, "step": 30 }, { "epoch": 0.03, "grad_norm": 2.604360711268946, "learning_rate": 4.998923473901535e-06, "loss": 0.6313, "step": 31 }, { "epoch": 0.03, "grad_norm": 2.403225544767816, "learning_rate": 4.9988185147727544e-06, "loss": 0.6209, "step": 32 }, { "epoch": 0.03, "grad_norm": 2.669567772543462, "learning_rate": 4.998708675397278e-06, "loss": 0.6068, "step": 33 }, { "epoch": 0.03, "grad_norm": 2.443946495915797, "learning_rate": 4.998593955989626e-06, "loss": 0.6731, "step": 34 }, { "epoch": 0.03, "grad_norm": 2.2104680876118317, "learning_rate": 4.998474356773845e-06, "loss": 0.6243, "step": 35 }, { "epoch": 0.03, "grad_norm": 2.3602199264043957, "learning_rate": 4.9983498779835175e-06, "loss": 0.6649, "step": 36 }, { "epoch": 0.03, "grad_norm": 2.4676911263240844, "learning_rate": 4.998220519861752e-06, "loss": 0.6174, "step": 37 }, { "epoch": 0.03, "grad_norm": 2.3419026099030282, "learning_rate": 4.998086282661188e-06, "loss": 0.6123, "step": 38 }, { "epoch": 0.03, "grad_norm": 2.14900736954254, "learning_rate": 4.997947166643993e-06, "loss": 0.63, "step": 39 }, { "epoch": 0.04, "grad_norm": 2.570907426799795, "learning_rate": 4.997803172081864e-06, "loss": 0.6249, "step": 40 }, { "epoch": 0.04, "grad_norm": 2.516952735669967, "learning_rate": 4.997654299256026e-06, "loss": 0.6727, "step": 41 }, { "epoch": 0.04, "grad_norm": 2.1600457198543874, "learning_rate": 4.997500548457231e-06, "loss": 0.6719, "step": 42 }, { "epoch": 0.04, "grad_norm": 2.2177572033934743, "learning_rate": 4.997341919985756e-06, "loss": 0.6148, "step": 43 }, { "epoch": 0.04, "grad_norm": 2.397105205209689, "learning_rate": 4.997178414151409e-06, "loss": 0.6167, "step": 44 }, { "epoch": 0.04, "grad_norm": 2.1254940534972167, "learning_rate": 4.997010031273517e-06, "loss": 0.6446, "step": 45 }, { "epoch": 0.04, "grad_norm": 2.2113023791837194, "learning_rate": 4.996836771680937e-06, "loss": 0.6304, "step": 46 }, { "epoch": 0.04, "grad_norm": 2.386446316275664, "learning_rate": 4.99665863571205e-06, "loss": 0.6621, "step": 47 }, { "epoch": 0.04, "grad_norm": 2.1838934384314483, "learning_rate": 4.996475623714756e-06, "loss": 0.6214, "step": 48 }, { "epoch": 0.04, "grad_norm": 2.2047933657923586, "learning_rate": 4.996287736046485e-06, "loss": 0.6478, "step": 49 }, { "epoch": 0.04, "grad_norm": 2.208809457983808, "learning_rate": 4.996094973074183e-06, "loss": 0.6097, "step": 50 }, { "epoch": 0.05, "grad_norm": 2.1318377198138267, "learning_rate": 4.995897335174322e-06, "loss": 0.622, "step": 51 }, { "epoch": 0.05, "grad_norm": 2.0673034122993537, "learning_rate": 4.995694822732893e-06, "loss": 0.6036, "step": 52 }, { "epoch": 0.05, "grad_norm": 2.195105312645423, "learning_rate": 4.9954874361454055e-06, "loss": 0.6052, "step": 53 }, { "epoch": 0.05, "grad_norm": 2.157855029176061, "learning_rate": 4.995275175816892e-06, "loss": 0.6455, "step": 54 }, { "epoch": 0.05, "grad_norm": 2.0500405783991043, "learning_rate": 4.9950580421619e-06, "loss": 0.6353, "step": 55 }, { "epoch": 0.05, "grad_norm": 2.199629904296075, "learning_rate": 4.9948360356044965e-06, "loss": 0.6122, "step": 56 }, { "epoch": 0.05, "grad_norm": 2.186847580161491, "learning_rate": 4.994609156578267e-06, "loss": 0.6073, "step": 57 }, { "epoch": 0.05, "grad_norm": 2.0207512037097835, "learning_rate": 4.994377405526308e-06, "loss": 0.61, "step": 58 }, { "epoch": 0.05, "grad_norm": 2.3170193964114976, "learning_rate": 4.994140782901237e-06, "loss": 0.6322, "step": 59 }, { "epoch": 0.05, "grad_norm": 2.014785890436746, "learning_rate": 4.9938992891651825e-06, "loss": 0.6205, "step": 60 }, { "epoch": 0.05, "grad_norm": 1.9538385063221935, "learning_rate": 4.9936529247897854e-06, "loss": 0.5992, "step": 61 }, { "epoch": 0.05, "grad_norm": 2.084943826856202, "learning_rate": 4.993401690256203e-06, "loss": 0.6148, "step": 62 }, { "epoch": 0.06, "grad_norm": 2.135158856581583, "learning_rate": 4.9931455860551e-06, "loss": 0.5937, "step": 63 }, { "epoch": 0.06, "grad_norm": 1.982621418518698, "learning_rate": 4.992884612686655e-06, "loss": 0.6091, "step": 64 }, { "epoch": 0.06, "grad_norm": 2.1030931953494956, "learning_rate": 4.992618770660553e-06, "loss": 0.6034, "step": 65 }, { "epoch": 0.06, "grad_norm": 2.1994634556563994, "learning_rate": 4.992348060495989e-06, "loss": 0.5846, "step": 66 }, { "epoch": 0.06, "grad_norm": 2.410691403277427, "learning_rate": 4.992072482721669e-06, "loss": 0.6294, "step": 67 }, { "epoch": 0.06, "grad_norm": 1.9720494401999067, "learning_rate": 4.991792037875799e-06, "loss": 0.591, "step": 68 }, { "epoch": 0.06, "grad_norm": 2.147504025949435, "learning_rate": 4.991506726506094e-06, "loss": 0.5689, "step": 69 }, { "epoch": 0.06, "grad_norm": 2.1837702519904223, "learning_rate": 4.991216549169776e-06, "loss": 0.6422, "step": 70 }, { "epoch": 0.06, "grad_norm": 2.0883865330274958, "learning_rate": 4.9909215064335655e-06, "loss": 0.6076, "step": 71 }, { "epoch": 0.06, "grad_norm": 2.20727863923846, "learning_rate": 4.990621598873687e-06, "loss": 0.5974, "step": 72 }, { "epoch": 0.06, "grad_norm": 2.0735330806418464, "learning_rate": 4.990316827075868e-06, "loss": 0.6809, "step": 73 }, { "epoch": 0.07, "grad_norm": 2.0203203347538774, "learning_rate": 4.990007191635334e-06, "loss": 0.6107, "step": 74 }, { "epoch": 0.07, "grad_norm": 2.234889365362174, "learning_rate": 4.989692693156809e-06, "loss": 0.6218, "step": 75 }, { "epoch": 0.07, "grad_norm": 1.9902503343433904, "learning_rate": 4.989373332254516e-06, "loss": 0.6257, "step": 76 }, { "epoch": 0.07, "grad_norm": 2.1041971507252466, "learning_rate": 4.989049109552173e-06, "loss": 0.5888, "step": 77 }, { "epoch": 0.07, "grad_norm": 2.1151685783302123, "learning_rate": 4.988720025682995e-06, "loss": 0.6333, "step": 78 }, { "epoch": 0.07, "grad_norm": 1.9223819269893592, "learning_rate": 4.988386081289689e-06, "loss": 0.6442, "step": 79 }, { "epoch": 0.07, "grad_norm": 2.139676463756265, "learning_rate": 4.988047277024456e-06, "loss": 0.5966, "step": 80 }, { "epoch": 0.07, "grad_norm": 2.1665820212993068, "learning_rate": 4.987703613548988e-06, "loss": 0.603, "step": 81 }, { "epoch": 0.07, "grad_norm": 1.931456975470041, "learning_rate": 4.987355091534467e-06, "loss": 0.6122, "step": 82 }, { "epoch": 0.07, "grad_norm": 2.134995092135601, "learning_rate": 4.987001711661566e-06, "loss": 0.6213, "step": 83 }, { "epoch": 0.07, "grad_norm": 2.0173352657570818, "learning_rate": 4.98664347462044e-06, "loss": 0.5966, "step": 84 }, { "epoch": 0.08, "grad_norm": 2.0816939924571183, "learning_rate": 4.986280381110737e-06, "loss": 0.5575, "step": 85 }, { "epoch": 0.08, "grad_norm": 2.0072477771163357, "learning_rate": 4.985912431841584e-06, "loss": 0.6225, "step": 86 }, { "epoch": 0.08, "grad_norm": 2.1895945454214507, "learning_rate": 4.985539627531596e-06, "loss": 0.6169, "step": 87 }, { "epoch": 0.08, "grad_norm": 2.84518214074801, "learning_rate": 4.985161968908866e-06, "loss": 0.6317, "step": 88 }, { "epoch": 0.08, "grad_norm": 2.194209857089938, "learning_rate": 4.984779456710971e-06, "loss": 0.6205, "step": 89 }, { "epoch": 0.08, "grad_norm": 2.1604595364123083, "learning_rate": 4.9843920916849645e-06, "loss": 0.6176, "step": 90 }, { "epoch": 0.08, "grad_norm": 2.039087518829079, "learning_rate": 4.9839998745873795e-06, "loss": 0.5842, "step": 91 }, { "epoch": 0.08, "grad_norm": 2.0148570016863334, "learning_rate": 4.983602806184225e-06, "loss": 0.5936, "step": 92 }, { "epoch": 0.08, "grad_norm": 2.073137159272384, "learning_rate": 4.983200887250982e-06, "loss": 0.6317, "step": 93 }, { "epoch": 0.08, "grad_norm": 2.045469602089007, "learning_rate": 4.9827941185726095e-06, "loss": 0.5338, "step": 94 }, { "epoch": 0.08, "grad_norm": 2.1201743116757417, "learning_rate": 4.982382500943533e-06, "loss": 0.6133, "step": 95 }, { "epoch": 0.09, "grad_norm": 2.0637214917996363, "learning_rate": 4.981966035167654e-06, "loss": 0.6483, "step": 96 }, { "epoch": 0.09, "grad_norm": 2.155574452675582, "learning_rate": 4.981544722058336e-06, "loss": 0.6001, "step": 97 }, { "epoch": 0.09, "grad_norm": 1.9347601392775928, "learning_rate": 4.981118562438414e-06, "loss": 0.5954, "step": 98 }, { "epoch": 0.09, "grad_norm": 2.3054537863874756, "learning_rate": 4.980687557140187e-06, "loss": 0.6338, "step": 99 }, { "epoch": 0.09, "grad_norm": 2.0421104909837338, "learning_rate": 4.980251707005417e-06, "loss": 0.6166, "step": 100 }, { "epoch": 0.09, "grad_norm": 2.023167301994367, "learning_rate": 4.979811012885329e-06, "loss": 0.5682, "step": 101 }, { "epoch": 0.09, "grad_norm": 2.0583654213007967, "learning_rate": 4.979365475640609e-06, "loss": 0.5759, "step": 102 }, { "epoch": 0.09, "grad_norm": 2.008917223929121, "learning_rate": 4.9789150961414e-06, "loss": 0.6324, "step": 103 }, { "epoch": 0.09, "grad_norm": 2.1111479338304306, "learning_rate": 4.978459875267303e-06, "loss": 0.5821, "step": 104 }, { "epoch": 0.09, "grad_norm": 2.400366962461983, "learning_rate": 4.977999813907375e-06, "loss": 0.5699, "step": 105 }, { "epoch": 0.09, "grad_norm": 2.090668061316384, "learning_rate": 4.977534912960124e-06, "loss": 0.5754, "step": 106 }, { "epoch": 0.09, "grad_norm": 2.2103419288491466, "learning_rate": 4.977065173333515e-06, "loss": 0.6005, "step": 107 }, { "epoch": 0.1, "grad_norm": 2.1332380447628294, "learning_rate": 4.9765905959449565e-06, "loss": 0.6178, "step": 108 }, { "epoch": 0.1, "grad_norm": 2.1372224949542464, "learning_rate": 4.976111181721309e-06, "loss": 0.6021, "step": 109 }, { "epoch": 0.1, "grad_norm": 2.636052326949506, "learning_rate": 4.97562693159888e-06, "loss": 0.6418, "step": 110 }, { "epoch": 0.1, "grad_norm": 2.1234423477493443, "learning_rate": 4.975137846523419e-06, "loss": 0.6231, "step": 111 }, { "epoch": 0.1, "grad_norm": 2.2817790529425315, "learning_rate": 4.974643927450121e-06, "loss": 0.5681, "step": 112 }, { "epoch": 0.1, "grad_norm": 2.2605060344304713, "learning_rate": 4.9741451753436205e-06, "loss": 0.5803, "step": 113 }, { "epoch": 0.1, "grad_norm": 2.0355236974665876, "learning_rate": 4.973641591177991e-06, "loss": 0.6003, "step": 114 }, { "epoch": 0.1, "grad_norm": 2.4343221170301415, "learning_rate": 4.973133175936743e-06, "loss": 0.5882, "step": 115 }, { "epoch": 0.1, "grad_norm": 2.2135760843199734, "learning_rate": 4.972619930612822e-06, "loss": 0.5886, "step": 116 }, { "epoch": 0.1, "grad_norm": 2.161909448676307, "learning_rate": 4.972101856208609e-06, "loss": 0.5792, "step": 117 }, { "epoch": 0.1, "grad_norm": 2.0871148781401927, "learning_rate": 4.9715789537359126e-06, "loss": 0.6383, "step": 118 }, { "epoch": 0.11, "grad_norm": 2.1159018206478626, "learning_rate": 4.971051224215973e-06, "loss": 0.5865, "step": 119 }, { "epoch": 0.11, "grad_norm": 2.2036428070670375, "learning_rate": 4.970518668679459e-06, "loss": 0.5905, "step": 120 }, { "epoch": 0.11, "grad_norm": 2.22262007661876, "learning_rate": 4.969981288166461e-06, "loss": 0.5951, "step": 121 }, { "epoch": 0.11, "grad_norm": 2.0713458839382786, "learning_rate": 4.969439083726496e-06, "loss": 0.6011, "step": 122 }, { "epoch": 0.11, "grad_norm": 2.0686060725186897, "learning_rate": 4.9688920564185e-06, "loss": 0.6038, "step": 123 }, { "epoch": 0.11, "grad_norm": 2.1825376161159964, "learning_rate": 4.968340207310832e-06, "loss": 0.6098, "step": 124 }, { "epoch": 0.11, "grad_norm": 2.142436541976576, "learning_rate": 4.967783537481262e-06, "loss": 0.6119, "step": 125 }, { "epoch": 0.11, "grad_norm": 2.330044622755397, "learning_rate": 4.967222048016979e-06, "loss": 0.6057, "step": 126 }, { "epoch": 0.11, "grad_norm": 2.109116942854107, "learning_rate": 4.966655740014585e-06, "loss": 0.5958, "step": 127 }, { "epoch": 0.11, "grad_norm": 2.174219068914296, "learning_rate": 4.9660846145800914e-06, "loss": 0.6276, "step": 128 }, { "epoch": 0.11, "grad_norm": 2.135736248304593, "learning_rate": 4.965508672828918e-06, "loss": 0.6309, "step": 129 }, { "epoch": 0.12, "grad_norm": 2.2339234058672885, "learning_rate": 4.964927915885893e-06, "loss": 0.5879, "step": 130 }, { "epoch": 0.12, "grad_norm": 2.0960660335616224, "learning_rate": 4.9643423448852455e-06, "loss": 0.6218, "step": 131 }, { "epoch": 0.12, "grad_norm": 1.9468729925472703, "learning_rate": 4.963751960970609e-06, "loss": 0.5998, "step": 132 }, { "epoch": 0.12, "grad_norm": 2.1623168252289915, "learning_rate": 4.9631567652950164e-06, "loss": 0.6885, "step": 133 }, { "epoch": 0.12, "grad_norm": 2.084420579583794, "learning_rate": 4.962556759020898e-06, "loss": 0.5758, "step": 134 }, { "epoch": 0.12, "grad_norm": 2.1082890389844713, "learning_rate": 4.961951943320078e-06, "loss": 0.6116, "step": 135 }, { "epoch": 0.12, "grad_norm": 2.006123424806457, "learning_rate": 4.9613423193737754e-06, "loss": 0.5708, "step": 136 }, { "epoch": 0.12, "grad_norm": 2.309431970929405, "learning_rate": 4.960727888372599e-06, "loss": 0.621, "step": 137 }, { "epoch": 0.12, "grad_norm": 2.226488524758773, "learning_rate": 4.9601086515165456e-06, "loss": 0.5896, "step": 138 }, { "epoch": 0.12, "grad_norm": 2.1242070778655253, "learning_rate": 4.959484610014997e-06, "loss": 0.624, "step": 139 }, { "epoch": 0.12, "grad_norm": 2.2147491445730516, "learning_rate": 4.958855765086722e-06, "loss": 0.6064, "step": 140 }, { "epoch": 0.12, "grad_norm": 2.1818004600393, "learning_rate": 4.958222117959868e-06, "loss": 0.6252, "step": 141 }, { "epoch": 0.13, "grad_norm": 2.1094535889409696, "learning_rate": 4.95758366987196e-06, "loss": 0.5779, "step": 142 }, { "epoch": 0.13, "grad_norm": 2.2043056809252577, "learning_rate": 4.9569404220699025e-06, "loss": 0.6156, "step": 143 }, { "epoch": 0.13, "grad_norm": 2.158056342799238, "learning_rate": 4.956292375809971e-06, "loss": 0.5662, "step": 144 }, { "epoch": 0.13, "grad_norm": 1.987581635345228, "learning_rate": 4.955639532357815e-06, "loss": 0.6148, "step": 145 }, { "epoch": 0.13, "grad_norm": 2.266145451051948, "learning_rate": 4.954981892988451e-06, "loss": 0.5867, "step": 146 }, { "epoch": 0.13, "grad_norm": 2.071082600205798, "learning_rate": 4.954319458986264e-06, "loss": 0.5976, "step": 147 }, { "epoch": 0.13, "grad_norm": 2.1615342548575374, "learning_rate": 4.953652231645002e-06, "loss": 0.5643, "step": 148 }, { "epoch": 0.13, "grad_norm": 2.145126231371731, "learning_rate": 4.952980212267773e-06, "loss": 0.5592, "step": 149 }, { "epoch": 0.13, "grad_norm": 1.9161750244434461, "learning_rate": 4.952303402167047e-06, "loss": 0.5547, "step": 150 }, { "epoch": 0.13, "grad_norm": 2.234370958372018, "learning_rate": 4.9516218026646475e-06, "loss": 0.578, "step": 151 }, { "epoch": 0.13, "grad_norm": 2.149553338429868, "learning_rate": 4.950935415091753e-06, "loss": 0.5952, "step": 152 }, { "epoch": 0.14, "grad_norm": 2.1021801657048016, "learning_rate": 4.950244240788895e-06, "loss": 0.573, "step": 153 }, { "epoch": 0.14, "grad_norm": 2.488711367210497, "learning_rate": 4.949548281105951e-06, "loss": 0.5776, "step": 154 }, { "epoch": 0.14, "grad_norm": 2.0302393290147167, "learning_rate": 4.948847537402145e-06, "loss": 0.5685, "step": 155 }, { "epoch": 0.14, "grad_norm": 2.1563261797248043, "learning_rate": 4.948142011046044e-06, "loss": 0.6185, "step": 156 }, { "epoch": 0.14, "grad_norm": 2.1308303224609997, "learning_rate": 4.947431703415558e-06, "loss": 0.6229, "step": 157 }, { "epoch": 0.14, "grad_norm": 2.0988414912992273, "learning_rate": 4.946716615897932e-06, "loss": 0.6167, "step": 158 }, { "epoch": 0.14, "grad_norm": 2.3558302474583095, "learning_rate": 4.9459967498897485e-06, "loss": 0.5903, "step": 159 }, { "epoch": 0.14, "grad_norm": 2.1505555405055223, "learning_rate": 4.945272106796919e-06, "loss": 0.5709, "step": 160 }, { "epoch": 0.14, "grad_norm": 2.0604140956574635, "learning_rate": 4.94454268803469e-06, "loss": 0.635, "step": 161 }, { "epoch": 0.14, "grad_norm": 2.3699836246614696, "learning_rate": 4.943808495027631e-06, "loss": 0.581, "step": 162 }, { "epoch": 0.14, "grad_norm": 1.9809907136859368, "learning_rate": 4.9430695292096365e-06, "loss": 0.5703, "step": 163 }, { "epoch": 0.15, "grad_norm": 2.213101907296851, "learning_rate": 4.942325792023922e-06, "loss": 0.5915, "step": 164 }, { "epoch": 0.15, "grad_norm": 2.3778783149383944, "learning_rate": 4.941577284923025e-06, "loss": 0.537, "step": 165 }, { "epoch": 0.15, "grad_norm": 1.9283694807512721, "learning_rate": 4.9408240093687934e-06, "loss": 0.579, "step": 166 }, { "epoch": 0.15, "grad_norm": 2.083087334039033, "learning_rate": 4.940065966832392e-06, "loss": 0.5612, "step": 167 }, { "epoch": 0.15, "grad_norm": 2.314684793845775, "learning_rate": 4.939303158794294e-06, "loss": 0.6001, "step": 168 }, { "epoch": 0.15, "grad_norm": 2.131977461745334, "learning_rate": 4.93853558674428e-06, "loss": 0.5809, "step": 169 }, { "epoch": 0.15, "grad_norm": 2.1291924932946755, "learning_rate": 4.937763252181434e-06, "loss": 0.6216, "step": 170 }, { "epoch": 0.15, "grad_norm": 1.9366549866764742, "learning_rate": 4.936986156614144e-06, "loss": 0.5888, "step": 171 }, { "epoch": 0.15, "grad_norm": 2.231889540095555, "learning_rate": 4.9362043015600934e-06, "loss": 0.6437, "step": 172 }, { "epoch": 0.15, "grad_norm": 2.0696023557568233, "learning_rate": 4.9354176885462626e-06, "loss": 0.5951, "step": 173 }, { "epoch": 0.15, "grad_norm": 2.10974806039572, "learning_rate": 4.934626319108923e-06, "loss": 0.5817, "step": 174 }, { "epoch": 0.16, "grad_norm": 2.0633698321381946, "learning_rate": 4.933830194793636e-06, "loss": 0.5692, "step": 175 }, { "epoch": 0.16, "grad_norm": 2.0163693967733423, "learning_rate": 4.933029317155251e-06, "loss": 0.5322, "step": 176 }, { "epoch": 0.16, "grad_norm": 2.1118176135699813, "learning_rate": 4.932223687757899e-06, "loss": 0.5809, "step": 177 }, { "epoch": 0.16, "grad_norm": 2.181431947183138, "learning_rate": 4.9314133081749906e-06, "loss": 0.5444, "step": 178 }, { "epoch": 0.16, "grad_norm": 2.2055197469621386, "learning_rate": 4.930598179989215e-06, "loss": 0.6063, "step": 179 }, { "epoch": 0.16, "grad_norm": 2.1103699877035638, "learning_rate": 4.929778304792537e-06, "loss": 0.5908, "step": 180 }, { "epoch": 0.16, "grad_norm": 2.01692648335164, "learning_rate": 4.928953684186189e-06, "loss": 0.5729, "step": 181 }, { "epoch": 0.16, "grad_norm": 1.990744003423107, "learning_rate": 4.928124319780673e-06, "loss": 0.5935, "step": 182 }, { "epoch": 0.16, "grad_norm": 1.9898687560952446, "learning_rate": 4.9272902131957555e-06, "loss": 0.6008, "step": 183 }, { "epoch": 0.16, "grad_norm": 1.9499116832570582, "learning_rate": 4.926451366060465e-06, "loss": 0.5731, "step": 184 }, { "epoch": 0.16, "grad_norm": 1.8933258467243923, "learning_rate": 4.925607780013088e-06, "loss": 0.5822, "step": 185 }, { "epoch": 0.16, "grad_norm": 1.9711936623837691, "learning_rate": 4.924759456701167e-06, "loss": 0.5433, "step": 186 }, { "epoch": 0.17, "grad_norm": 1.9981254191144715, "learning_rate": 4.923906397781495e-06, "loss": 0.5603, "step": 187 }, { "epoch": 0.17, "grad_norm": 1.9489584101682442, "learning_rate": 4.923048604920115e-06, "loss": 0.592, "step": 188 }, { "epoch": 0.17, "grad_norm": 2.14587896098926, "learning_rate": 4.922186079792315e-06, "loss": 0.5861, "step": 189 }, { "epoch": 0.17, "grad_norm": 2.093505234897306, "learning_rate": 4.921318824082625e-06, "loss": 0.5756, "step": 190 }, { "epoch": 0.17, "grad_norm": 1.9726924068956073, "learning_rate": 4.920446839484814e-06, "loss": 0.5954, "step": 191 }, { "epoch": 0.17, "grad_norm": 2.0009011296035886, "learning_rate": 4.919570127701888e-06, "loss": 0.5185, "step": 192 }, { "epoch": 0.17, "grad_norm": 2.0801246171281993, "learning_rate": 4.9186886904460826e-06, "loss": 0.5788, "step": 193 }, { "epoch": 0.17, "grad_norm": 2.7712602468155096, "learning_rate": 4.917802529438865e-06, "loss": 0.6637, "step": 194 }, { "epoch": 0.17, "grad_norm": 1.9721040372060654, "learning_rate": 4.916911646410926e-06, "loss": 0.5926, "step": 195 }, { "epoch": 0.17, "grad_norm": 2.1199089061376855, "learning_rate": 4.91601604310218e-06, "loss": 0.5854, "step": 196 }, { "epoch": 0.17, "grad_norm": 1.9518281461372036, "learning_rate": 4.915115721261759e-06, "loss": 0.5456, "step": 197 }, { "epoch": 0.18, "grad_norm": 2.1537515435847734, "learning_rate": 4.9142106826480114e-06, "loss": 0.6152, "step": 198 }, { "epoch": 0.18, "grad_norm": 2.3461320565666344, "learning_rate": 4.913300929028498e-06, "loss": 0.617, "step": 199 }, { "epoch": 0.18, "grad_norm": 1.9789785575462193, "learning_rate": 4.912386462179987e-06, "loss": 0.5845, "step": 200 }, { "epoch": 0.18, "grad_norm": 2.0705337307209253, "learning_rate": 4.9114672838884515e-06, "loss": 0.6062, "step": 201 }, { "epoch": 0.18, "grad_norm": 1.9972918925367322, "learning_rate": 4.910543395949066e-06, "loss": 0.6318, "step": 202 }, { "epoch": 0.18, "grad_norm": 2.03173534028091, "learning_rate": 4.9096148001662055e-06, "loss": 0.64, "step": 203 }, { "epoch": 0.18, "grad_norm": 2.0861416304602356, "learning_rate": 4.908681498353436e-06, "loss": 0.5859, "step": 204 }, { "epoch": 0.18, "grad_norm": 1.932510611788884, "learning_rate": 4.907743492333517e-06, "loss": 0.5483, "step": 205 }, { "epoch": 0.18, "grad_norm": 1.9618471764126828, "learning_rate": 4.906800783938395e-06, "loss": 0.5767, "step": 206 }, { "epoch": 0.18, "grad_norm": 2.3557796360921786, "learning_rate": 4.905853375009198e-06, "loss": 0.5934, "step": 207 }, { "epoch": 0.18, "grad_norm": 2.0993364379712784, "learning_rate": 4.9049012673962385e-06, "loss": 0.5879, "step": 208 }, { "epoch": 0.19, "grad_norm": 2.2015612636555155, "learning_rate": 4.903944462959001e-06, "loss": 0.5598, "step": 209 }, { "epoch": 0.19, "grad_norm": 2.0374544745406062, "learning_rate": 4.902982963566147e-06, "loss": 0.577, "step": 210 }, { "epoch": 0.19, "grad_norm": 2.194866218807, "learning_rate": 4.902016771095506e-06, "loss": 0.5848, "step": 211 }, { "epoch": 0.19, "grad_norm": 2.2545375351308614, "learning_rate": 4.901045887434072e-06, "loss": 0.5846, "step": 212 }, { "epoch": 0.19, "grad_norm": 2.017012770131601, "learning_rate": 4.900070314478001e-06, "loss": 0.5651, "step": 213 }, { "epoch": 0.19, "grad_norm": 2.150900326654639, "learning_rate": 4.899090054132609e-06, "loss": 0.568, "step": 214 }, { "epoch": 0.19, "grad_norm": 2.0404886979870454, "learning_rate": 4.898105108312366e-06, "loss": 0.5277, "step": 215 }, { "epoch": 0.19, "grad_norm": 2.036614254190257, "learning_rate": 4.897115478940892e-06, "loss": 0.5754, "step": 216 }, { "epoch": 0.19, "grad_norm": 2.041133008809928, "learning_rate": 4.896121167950954e-06, "loss": 0.6294, "step": 217 }, { "epoch": 0.19, "grad_norm": 2.0029503409054885, "learning_rate": 4.895122177284465e-06, "loss": 0.5531, "step": 218 }, { "epoch": 0.19, "grad_norm": 2.0303439698174754, "learning_rate": 4.894118508892474e-06, "loss": 0.6008, "step": 219 }, { "epoch": 0.19, "grad_norm": 1.899982778272908, "learning_rate": 4.893110164735167e-06, "loss": 0.6076, "step": 220 }, { "epoch": 0.2, "grad_norm": 2.170640326694132, "learning_rate": 4.892097146781862e-06, "loss": 0.5806, "step": 221 }, { "epoch": 0.2, "grad_norm": 1.961802557992624, "learning_rate": 4.8910794570110055e-06, "loss": 0.5456, "step": 222 }, { "epoch": 0.2, "grad_norm": 2.1149182672715807, "learning_rate": 4.890057097410167e-06, "loss": 0.5683, "step": 223 }, { "epoch": 0.2, "grad_norm": 1.9988574008443096, "learning_rate": 4.889030069976038e-06, "loss": 0.5603, "step": 224 }, { "epoch": 0.2, "grad_norm": 2.137840782586502, "learning_rate": 4.887998376714424e-06, "loss": 0.5713, "step": 225 }, { "epoch": 0.2, "grad_norm": 2.2956357234771634, "learning_rate": 4.886962019640244e-06, "loss": 0.5635, "step": 226 }, { "epoch": 0.2, "grad_norm": 2.2175517801056346, "learning_rate": 4.885921000777528e-06, "loss": 0.631, "step": 227 }, { "epoch": 0.2, "grad_norm": 2.0861966792656546, "learning_rate": 4.884875322159407e-06, "loss": 0.5521, "step": 228 }, { "epoch": 0.2, "grad_norm": 2.170862650134145, "learning_rate": 4.883824985828114e-06, "loss": 0.5953, "step": 229 }, { "epoch": 0.2, "grad_norm": 2.016871028914906, "learning_rate": 4.882769993834978e-06, "loss": 0.5745, "step": 230 }, { "epoch": 0.2, "grad_norm": 2.4069309610367107, "learning_rate": 4.8817103482404236e-06, "loss": 0.5752, "step": 231 }, { "epoch": 0.21, "grad_norm": 1.9834780557891722, "learning_rate": 4.880646051113959e-06, "loss": 0.5619, "step": 232 }, { "epoch": 0.21, "grad_norm": 2.1221686040256005, "learning_rate": 4.87957710453418e-06, "loss": 0.561, "step": 233 }, { "epoch": 0.21, "grad_norm": 2.1497751964139002, "learning_rate": 4.878503510588764e-06, "loss": 0.5754, "step": 234 }, { "epoch": 0.21, "grad_norm": 1.8535318318419167, "learning_rate": 4.877425271374462e-06, "loss": 0.5551, "step": 235 }, { "epoch": 0.21, "grad_norm": 2.1537345489224404, "learning_rate": 4.876342388997099e-06, "loss": 0.544, "step": 236 }, { "epoch": 0.21, "grad_norm": 1.9695512744073471, "learning_rate": 4.875254865571567e-06, "loss": 0.6003, "step": 237 }, { "epoch": 0.21, "grad_norm": 2.2550853928957193, "learning_rate": 4.874162703221823e-06, "loss": 0.5968, "step": 238 }, { "epoch": 0.21, "grad_norm": 2.0658630166795917, "learning_rate": 4.873065904080884e-06, "loss": 0.5658, "step": 239 }, { "epoch": 0.21, "grad_norm": 2.0821280326495524, "learning_rate": 4.871964470290823e-06, "loss": 0.5711, "step": 240 }, { "epoch": 0.21, "grad_norm": 1.9833074137024158, "learning_rate": 4.8708584040027636e-06, "loss": 0.5899, "step": 241 }, { "epoch": 0.21, "grad_norm": 2.0288963441502195, "learning_rate": 4.869747707376877e-06, "loss": 0.5601, "step": 242 }, { "epoch": 0.22, "grad_norm": 2.0970435875726463, "learning_rate": 4.868632382582378e-06, "loss": 0.6381, "step": 243 }, { "epoch": 0.22, "grad_norm": 2.1303280408644194, "learning_rate": 4.86751243179752e-06, "loss": 0.5495, "step": 244 }, { "epoch": 0.22, "grad_norm": 2.0851781018580584, "learning_rate": 4.866387857209591e-06, "loss": 0.5901, "step": 245 }, { "epoch": 0.22, "grad_norm": 1.8310760160854438, "learning_rate": 4.86525866101491e-06, "loss": 0.5513, "step": 246 }, { "epoch": 0.22, "grad_norm": 2.199726167537497, "learning_rate": 4.8641248454188205e-06, "loss": 0.5873, "step": 247 }, { "epoch": 0.22, "grad_norm": 1.9776691221978735, "learning_rate": 4.862986412635691e-06, "loss": 0.6143, "step": 248 }, { "epoch": 0.22, "grad_norm": 2.0663231641830873, "learning_rate": 4.8618433648889034e-06, "loss": 0.5937, "step": 249 }, { "epoch": 0.22, "grad_norm": 2.170520506577784, "learning_rate": 4.860695704410856e-06, "loss": 0.5374, "step": 250 }, { "epoch": 0.22, "grad_norm": 1.9685756224067419, "learning_rate": 4.8595434334429535e-06, "loss": 0.5139, "step": 251 }, { "epoch": 0.22, "grad_norm": 1.9668205539999677, "learning_rate": 4.8583865542356065e-06, "loss": 0.5459, "step": 252 }, { "epoch": 0.22, "grad_norm": 2.0793578279258704, "learning_rate": 4.857225069048226e-06, "loss": 0.593, "step": 253 }, { "epoch": 0.22, "grad_norm": 1.9265474492849337, "learning_rate": 4.8560589801492165e-06, "loss": 0.5559, "step": 254 }, { "epoch": 0.23, "grad_norm": 2.8555278122830696, "learning_rate": 4.854888289815976e-06, "loss": 0.5949, "step": 255 }, { "epoch": 0.23, "grad_norm": 2.063838630196542, "learning_rate": 4.853713000334887e-06, "loss": 0.5712, "step": 256 }, { "epoch": 0.23, "grad_norm": 2.168668910730517, "learning_rate": 4.852533114001316e-06, "loss": 0.5475, "step": 257 }, { "epoch": 0.23, "grad_norm": 2.064042820960706, "learning_rate": 4.8513486331196055e-06, "loss": 0.5616, "step": 258 }, { "epoch": 0.23, "grad_norm": 2.026751060346143, "learning_rate": 4.850159560003074e-06, "loss": 0.5997, "step": 259 }, { "epoch": 0.23, "grad_norm": 2.1228129299875254, "learning_rate": 4.848965896974006e-06, "loss": 0.5622, "step": 260 }, { "epoch": 0.23, "grad_norm": 1.9418510365881214, "learning_rate": 4.847767646363652e-06, "loss": 0.5741, "step": 261 }, { "epoch": 0.23, "grad_norm": 2.070611833895483, "learning_rate": 4.846564810512221e-06, "loss": 0.5729, "step": 262 }, { "epoch": 0.23, "grad_norm": 1.8833621440375596, "learning_rate": 4.845357391768877e-06, "loss": 0.5503, "step": 263 }, { "epoch": 0.23, "grad_norm": 2.1022924907055387, "learning_rate": 4.844145392491735e-06, "loss": 0.6204, "step": 264 }, { "epoch": 0.23, "grad_norm": 2.024625007813473, "learning_rate": 4.842928815047856e-06, "loss": 0.5776, "step": 265 }, { "epoch": 0.24, "grad_norm": 1.9123739071371275, "learning_rate": 4.8417076618132434e-06, "loss": 0.5417, "step": 266 }, { "epoch": 0.24, "grad_norm": 2.062879186086598, "learning_rate": 4.8404819351728336e-06, "loss": 0.5387, "step": 267 }, { "epoch": 0.24, "grad_norm": 1.9944627549250884, "learning_rate": 4.8392516375204986e-06, "loss": 0.5731, "step": 268 }, { "epoch": 0.24, "grad_norm": 1.9859912626846585, "learning_rate": 4.838016771259037e-06, "loss": 0.5969, "step": 269 }, { "epoch": 0.24, "grad_norm": 2.043069520519082, "learning_rate": 4.836777338800168e-06, "loss": 0.6217, "step": 270 }, { "epoch": 0.24, "grad_norm": 1.913212451622778, "learning_rate": 4.835533342564531e-06, "loss": 0.5527, "step": 271 }, { "epoch": 0.24, "grad_norm": 1.978858281238778, "learning_rate": 4.834284784981678e-06, "loss": 0.5997, "step": 272 }, { "epoch": 0.24, "grad_norm": 2.004628826916504, "learning_rate": 4.833031668490067e-06, "loss": 0.551, "step": 273 }, { "epoch": 0.24, "grad_norm": 2.164370107566024, "learning_rate": 4.8317739955370645e-06, "loss": 0.5537, "step": 274 }, { "epoch": 0.24, "grad_norm": 1.891772326146366, "learning_rate": 4.83051176857893e-06, "loss": 0.6075, "step": 275 }, { "epoch": 0.24, "grad_norm": 2.0553128913886645, "learning_rate": 4.8292449900808216e-06, "loss": 0.5854, "step": 276 }, { "epoch": 0.25, "grad_norm": 2.009000622167072, "learning_rate": 4.827973662516786e-06, "loss": 0.5503, "step": 277 }, { "epoch": 0.25, "grad_norm": 1.9385043396652537, "learning_rate": 4.826697788369752e-06, "loss": 0.5704, "step": 278 }, { "epoch": 0.25, "grad_norm": 2.3263786060073826, "learning_rate": 4.8254173701315295e-06, "loss": 0.5604, "step": 279 }, { "epoch": 0.25, "grad_norm": 1.9251504140774536, "learning_rate": 4.8241324103028055e-06, "loss": 0.5647, "step": 280 }, { "epoch": 0.25, "grad_norm": 1.9714117964729747, "learning_rate": 4.822842911393131e-06, "loss": 0.604, "step": 281 }, { "epoch": 0.25, "grad_norm": 2.034372279161665, "learning_rate": 4.821548875920927e-06, "loss": 0.5803, "step": 282 }, { "epoch": 0.25, "grad_norm": 1.9849114644945505, "learning_rate": 4.8202503064134725e-06, "loss": 0.5854, "step": 283 }, { "epoch": 0.25, "grad_norm": 2.3435998455971343, "learning_rate": 4.818947205406902e-06, "loss": 0.4988, "step": 284 }, { "epoch": 0.25, "grad_norm": 2.0672779732760924, "learning_rate": 4.8176395754462e-06, "loss": 0.5734, "step": 285 }, { "epoch": 0.25, "grad_norm": 2.1206384205127544, "learning_rate": 4.816327419085197e-06, "loss": 0.563, "step": 286 }, { "epoch": 0.25, "grad_norm": 2.1105254841893095, "learning_rate": 4.815010738886561e-06, "loss": 0.5765, "step": 287 }, { "epoch": 0.26, "grad_norm": 2.072546090747287, "learning_rate": 4.813689537421798e-06, "loss": 0.6003, "step": 288 }, { "epoch": 0.26, "grad_norm": 2.1131138426394442, "learning_rate": 4.812363817271243e-06, "loss": 0.6097, "step": 289 }, { "epoch": 0.26, "grad_norm": 1.9218545344238502, "learning_rate": 4.811033581024056e-06, "loss": 0.6272, "step": 290 }, { "epoch": 0.26, "grad_norm": 2.235420687671868, "learning_rate": 4.809698831278217e-06, "loss": 0.5519, "step": 291 }, { "epoch": 0.26, "grad_norm": 1.8915062282224397, "learning_rate": 4.808359570640522e-06, "loss": 0.5832, "step": 292 }, { "epoch": 0.26, "grad_norm": 1.9185231023206675, "learning_rate": 4.8070158017265755e-06, "loss": 0.5854, "step": 293 }, { "epoch": 0.26, "grad_norm": 2.086526046887808, "learning_rate": 4.805667527160788e-06, "loss": 0.5314, "step": 294 }, { "epoch": 0.26, "grad_norm": 1.9995370937944454, "learning_rate": 4.804314749576368e-06, "loss": 0.5749, "step": 295 }, { "epoch": 0.26, "grad_norm": 2.099313489806141, "learning_rate": 4.802957471615319e-06, "loss": 0.5173, "step": 296 }, { "epoch": 0.26, "grad_norm": 2.067736275086448, "learning_rate": 4.8015956959284346e-06, "loss": 0.5434, "step": 297 }, { "epoch": 0.26, "grad_norm": 2.005525416579935, "learning_rate": 4.800229425175294e-06, "loss": 0.5589, "step": 298 }, { "epoch": 0.26, "grad_norm": 2.172708847484724, "learning_rate": 4.7988586620242515e-06, "loss": 0.5919, "step": 299 }, { "epoch": 0.27, "grad_norm": 2.0010542748493823, "learning_rate": 4.797483409152438e-06, "loss": 0.5803, "step": 300 }, { "epoch": 0.27, "grad_norm": 2.1169505971764506, "learning_rate": 4.7961036692457516e-06, "loss": 0.5763, "step": 301 }, { "epoch": 0.27, "grad_norm": 2.202849419501746, "learning_rate": 4.794719444998856e-06, "loss": 0.5691, "step": 302 }, { "epoch": 0.27, "grad_norm": 1.9765013761990564, "learning_rate": 4.793330739115169e-06, "loss": 0.5657, "step": 303 }, { "epoch": 0.27, "grad_norm": 2.0404392238791136, "learning_rate": 4.791937554306863e-06, "loss": 0.5648, "step": 304 }, { "epoch": 0.27, "grad_norm": 2.0298920886210516, "learning_rate": 4.790539893294861e-06, "loss": 0.5353, "step": 305 }, { "epoch": 0.27, "grad_norm": 2.03157486915788, "learning_rate": 4.789137758808823e-06, "loss": 0.5716, "step": 306 }, { "epoch": 0.27, "grad_norm": 2.060346338513047, "learning_rate": 4.787731153587149e-06, "loss": 0.5502, "step": 307 }, { "epoch": 0.27, "grad_norm": 1.9286831590091769, "learning_rate": 4.786320080376968e-06, "loss": 0.5646, "step": 308 }, { "epoch": 0.27, "grad_norm": 2.042346254905274, "learning_rate": 4.7849045419341376e-06, "loss": 0.6085, "step": 309 }, { "epoch": 0.27, "grad_norm": 2.0758243469708293, "learning_rate": 4.7834845410232356e-06, "loss": 0.5452, "step": 310 }, { "epoch": 0.28, "grad_norm": 2.0454965773706553, "learning_rate": 4.782060080417553e-06, "loss": 0.514, "step": 311 }, { "epoch": 0.28, "grad_norm": 2.073931876222572, "learning_rate": 4.780631162899094e-06, "loss": 0.5884, "step": 312 }, { "epoch": 0.28, "grad_norm": 1.9699688248650635, "learning_rate": 4.7791977912585645e-06, "loss": 0.529, "step": 313 }, { "epoch": 0.28, "grad_norm": 1.9886162974888701, "learning_rate": 4.7777599682953696e-06, "loss": 0.5796, "step": 314 }, { "epoch": 0.28, "grad_norm": 1.9579685029739566, "learning_rate": 4.7763176968176106e-06, "loss": 0.5553, "step": 315 }, { "epoch": 0.28, "grad_norm": 2.2181861411036086, "learning_rate": 4.7748709796420735e-06, "loss": 0.5806, "step": 316 }, { "epoch": 0.28, "grad_norm": 2.0345738930041777, "learning_rate": 4.773419819594228e-06, "loss": 0.6059, "step": 317 }, { "epoch": 0.28, "grad_norm": 2.0710385535524902, "learning_rate": 4.7719642195082224e-06, "loss": 0.5539, "step": 318 }, { "epoch": 0.28, "grad_norm": 2.1239710444371442, "learning_rate": 4.770504182226875e-06, "loss": 0.5655, "step": 319 }, { "epoch": 0.28, "grad_norm": 1.9564631444382952, "learning_rate": 4.769039710601669e-06, "loss": 0.5914, "step": 320 }, { "epoch": 0.28, "grad_norm": 1.9969926160116234, "learning_rate": 4.767570807492752e-06, "loss": 0.55, "step": 321 }, { "epoch": 0.29, "grad_norm": 1.9650736880864492, "learning_rate": 4.766097475768919e-06, "loss": 0.5804, "step": 322 }, { "epoch": 0.29, "grad_norm": 2.1946368157969194, "learning_rate": 4.7646197183076236e-06, "loss": 0.5631, "step": 323 }, { "epoch": 0.29, "grad_norm": 1.9834181085585831, "learning_rate": 4.763137537994955e-06, "loss": 0.5779, "step": 324 }, { "epoch": 0.29, "grad_norm": 2.1081651164417057, "learning_rate": 4.7616509377256445e-06, "loss": 0.5375, "step": 325 }, { "epoch": 0.29, "grad_norm": 1.9972027344990544, "learning_rate": 4.760159920403055e-06, "loss": 0.5608, "step": 326 }, { "epoch": 0.29, "grad_norm": 1.9554967826543683, "learning_rate": 4.758664488939174e-06, "loss": 0.5613, "step": 327 }, { "epoch": 0.29, "grad_norm": 2.211716512822424, "learning_rate": 4.757164646254614e-06, "loss": 0.5863, "step": 328 }, { "epoch": 0.29, "grad_norm": 1.9203184200502181, "learning_rate": 4.755660395278598e-06, "loss": 0.5275, "step": 329 }, { "epoch": 0.29, "grad_norm": 2.0355308159742505, "learning_rate": 4.7541517389489626e-06, "loss": 0.5304, "step": 330 }, { "epoch": 0.29, "grad_norm": 2.005680103405306, "learning_rate": 4.752638680212145e-06, "loss": 0.5782, "step": 331 }, { "epoch": 0.29, "grad_norm": 1.9930094995522492, "learning_rate": 4.751121222023183e-06, "loss": 0.5197, "step": 332 }, { "epoch": 0.29, "grad_norm": 2.130907347619711, "learning_rate": 4.749599367345703e-06, "loss": 0.5453, "step": 333 }, { "epoch": 0.3, "grad_norm": 2.0380649677356715, "learning_rate": 4.748073119151923e-06, "loss": 0.5394, "step": 334 }, { "epoch": 0.3, "grad_norm": 2.02655053696048, "learning_rate": 4.7465424804226366e-06, "loss": 0.5359, "step": 335 }, { "epoch": 0.3, "grad_norm": 2.108255877778432, "learning_rate": 4.745007454147215e-06, "loss": 0.5262, "step": 336 }, { "epoch": 0.3, "grad_norm": 1.8422966312136684, "learning_rate": 4.7434680433235986e-06, "loss": 0.529, "step": 337 }, { "epoch": 0.3, "grad_norm": 2.1387816386921004, "learning_rate": 4.741924250958289e-06, "loss": 0.5599, "step": 338 }, { "epoch": 0.3, "grad_norm": 2.2063774820548794, "learning_rate": 4.740376080066346e-06, "loss": 0.6014, "step": 339 }, { "epoch": 0.3, "grad_norm": 1.917696303327652, "learning_rate": 4.738823533671383e-06, "loss": 0.615, "step": 340 }, { "epoch": 0.3, "grad_norm": 2.0283765999277916, "learning_rate": 4.737266614805554e-06, "loss": 0.5802, "step": 341 }, { "epoch": 0.3, "grad_norm": 2.0340264609590437, "learning_rate": 4.7357053265095575e-06, "loss": 0.5331, "step": 342 }, { "epoch": 0.3, "grad_norm": 2.102037194450825, "learning_rate": 4.734139671832622e-06, "loss": 0.5534, "step": 343 }, { "epoch": 0.3, "grad_norm": 2.4389875670618113, "learning_rate": 4.732569653832505e-06, "loss": 0.5637, "step": 344 }, { "epoch": 0.31, "grad_norm": 2.1143521053252012, "learning_rate": 4.730995275575486e-06, "loss": 0.6539, "step": 345 }, { "epoch": 0.31, "grad_norm": 2.6240136232872064, "learning_rate": 4.7294165401363616e-06, "loss": 0.5515, "step": 346 }, { "epoch": 0.31, "grad_norm": 2.037602072097695, "learning_rate": 4.727833450598433e-06, "loss": 0.5609, "step": 347 }, { "epoch": 0.31, "grad_norm": 2.10711733636797, "learning_rate": 4.72624601005351e-06, "loss": 0.5719, "step": 348 }, { "epoch": 0.31, "grad_norm": 2.277613433738313, "learning_rate": 4.724654221601899e-06, "loss": 0.5815, "step": 349 }, { "epoch": 0.31, "grad_norm": 2.0082624113337824, "learning_rate": 4.7230580883523955e-06, "loss": 0.5524, "step": 350 }, { "epoch": 0.31, "grad_norm": 1.8922591374161477, "learning_rate": 4.721457613422285e-06, "loss": 0.5981, "step": 351 }, { "epoch": 0.31, "grad_norm": 2.108229047424278, "learning_rate": 4.7198527999373266e-06, "loss": 0.57, "step": 352 }, { "epoch": 0.31, "grad_norm": 2.152965480400126, "learning_rate": 4.718243651031759e-06, "loss": 0.5996, "step": 353 }, { "epoch": 0.31, "grad_norm": 1.8885994019827148, "learning_rate": 4.716630169848282e-06, "loss": 0.5543, "step": 354 }, { "epoch": 0.31, "grad_norm": 2.221396082747074, "learning_rate": 4.715012359538062e-06, "loss": 0.5423, "step": 355 }, { "epoch": 0.32, "grad_norm": 2.247525651087526, "learning_rate": 4.7133902232607145e-06, "loss": 0.6049, "step": 356 }, { "epoch": 0.32, "grad_norm": 1.905837742487114, "learning_rate": 4.711763764184309e-06, "loss": 0.5523, "step": 357 }, { "epoch": 0.32, "grad_norm": 2.117965067814315, "learning_rate": 4.710132985485355e-06, "loss": 0.5682, "step": 358 }, { "epoch": 0.32, "grad_norm": 2.1530948606389373, "learning_rate": 4.7084978903487985e-06, "loss": 0.5506, "step": 359 }, { "epoch": 0.32, "grad_norm": 1.8738866858316863, "learning_rate": 4.706858481968017e-06, "loss": 0.5426, "step": 360 }, { "epoch": 0.32, "grad_norm": 1.9967053512246618, "learning_rate": 4.705214763544806e-06, "loss": 0.5555, "step": 361 }, { "epoch": 0.32, "grad_norm": 2.352080896364055, "learning_rate": 4.703566738289389e-06, "loss": 0.587, "step": 362 }, { "epoch": 0.32, "grad_norm": 2.031696719881503, "learning_rate": 4.701914409420392e-06, "loss": 0.6088, "step": 363 }, { "epoch": 0.32, "grad_norm": 2.140107830595095, "learning_rate": 4.700257780164849e-06, "loss": 0.5596, "step": 364 }, { "epoch": 0.32, "grad_norm": 2.125236417141067, "learning_rate": 4.698596853758194e-06, "loss": 0.5513, "step": 365 }, { "epoch": 0.32, "grad_norm": 1.8878623518397697, "learning_rate": 4.696931633444251e-06, "loss": 0.5557, "step": 366 }, { "epoch": 0.33, "grad_norm": 1.9523463678463824, "learning_rate": 4.695262122475232e-06, "loss": 0.5317, "step": 367 }, { "epoch": 0.33, "grad_norm": 2.3748547328434455, "learning_rate": 4.6935883241117286e-06, "loss": 0.5733, "step": 368 }, { "epoch": 0.33, "grad_norm": 1.9248854873148575, "learning_rate": 4.691910241622704e-06, "loss": 0.5523, "step": 369 }, { "epoch": 0.33, "grad_norm": 2.1731794693383923, "learning_rate": 4.69022787828549e-06, "loss": 0.6489, "step": 370 }, { "epoch": 0.33, "grad_norm": 1.996570702327501, "learning_rate": 4.688541237385781e-06, "loss": 0.584, "step": 371 }, { "epoch": 0.33, "grad_norm": 2.0272036390008097, "learning_rate": 4.68685032221762e-06, "loss": 0.554, "step": 372 }, { "epoch": 0.33, "grad_norm": 1.9986403184037858, "learning_rate": 4.685155136083401e-06, "loss": 0.5798, "step": 373 }, { "epoch": 0.33, "grad_norm": 2.24642442330448, "learning_rate": 4.683455682293863e-06, "loss": 0.5486, "step": 374 }, { "epoch": 0.33, "grad_norm": 2.916261956844043, "learning_rate": 4.681751964168071e-06, "loss": 0.5678, "step": 375 }, { "epoch": 0.33, "grad_norm": 2.1597492287443396, "learning_rate": 4.680043985033427e-06, "loss": 0.5801, "step": 376 }, { "epoch": 0.33, "grad_norm": 1.9634034606261326, "learning_rate": 4.6783317482256506e-06, "loss": 0.5412, "step": 377 }, { "epoch": 0.33, "grad_norm": 2.0128604293697263, "learning_rate": 4.676615257088777e-06, "loss": 0.5538, "step": 378 }, { "epoch": 0.34, "grad_norm": 2.2205659530523976, "learning_rate": 4.674894514975149e-06, "loss": 0.494, "step": 379 }, { "epoch": 0.34, "grad_norm": 2.000557085172021, "learning_rate": 4.673169525245416e-06, "loss": 0.5459, "step": 380 }, { "epoch": 0.34, "grad_norm": 2.0089256125274826, "learning_rate": 4.671440291268518e-06, "loss": 0.5729, "step": 381 }, { "epoch": 0.34, "grad_norm": 2.076112293053539, "learning_rate": 4.66970681642169e-06, "loss": 0.5277, "step": 382 }, { "epoch": 0.34, "grad_norm": 1.996445627957894, "learning_rate": 4.667969104090441e-06, "loss": 0.5879, "step": 383 }, { "epoch": 0.34, "grad_norm": 2.379165029211644, "learning_rate": 4.666227157668564e-06, "loss": 0.5924, "step": 384 }, { "epoch": 0.34, "grad_norm": 2.101190475222136, "learning_rate": 4.664480980558118e-06, "loss": 0.6466, "step": 385 }, { "epoch": 0.34, "grad_norm": 2.035159570620747, "learning_rate": 4.662730576169423e-06, "loss": 0.5979, "step": 386 }, { "epoch": 0.34, "grad_norm": 2.1034174780447814, "learning_rate": 4.660975947921058e-06, "loss": 0.5635, "step": 387 }, { "epoch": 0.34, "grad_norm": 2.131573174129039, "learning_rate": 4.65921709923985e-06, "loss": 0.5602, "step": 388 }, { "epoch": 0.34, "grad_norm": 1.9282515780121203, "learning_rate": 4.657454033560868e-06, "loss": 0.5292, "step": 389 }, { "epoch": 0.35, "grad_norm": 1.922997066030009, "learning_rate": 4.655686754327419e-06, "loss": 0.5475, "step": 390 }, { "epoch": 0.35, "grad_norm": 1.9692624098665525, "learning_rate": 4.653915264991035e-06, "loss": 0.5529, "step": 391 }, { "epoch": 0.35, "grad_norm": 1.976011234185068, "learning_rate": 4.652139569011475e-06, "loss": 0.5439, "step": 392 }, { "epoch": 0.35, "grad_norm": 1.909657950321316, "learning_rate": 4.650359669856711e-06, "loss": 0.5558, "step": 393 }, { "epoch": 0.35, "grad_norm": 1.9134183734362904, "learning_rate": 4.648575571002926e-06, "loss": 0.5428, "step": 394 }, { "epoch": 0.35, "grad_norm": 2.067168876792994, "learning_rate": 4.646787275934501e-06, "loss": 0.6261, "step": 395 }, { "epoch": 0.35, "grad_norm": 1.9358304010171785, "learning_rate": 4.644994788144017e-06, "loss": 0.5698, "step": 396 }, { "epoch": 0.35, "grad_norm": 1.9671634072657547, "learning_rate": 4.643198111132241e-06, "loss": 0.5345, "step": 397 }, { "epoch": 0.35, "grad_norm": 2.0176052011599133, "learning_rate": 4.641397248408122e-06, "loss": 0.5028, "step": 398 }, { "epoch": 0.35, "grad_norm": 1.9960700483606102, "learning_rate": 4.639592203488784e-06, "loss": 0.5253, "step": 399 }, { "epoch": 0.35, "grad_norm": 1.9329472749401087, "learning_rate": 4.63778297989952e-06, "loss": 0.615, "step": 400 }, { "epoch": 0.36, "grad_norm": 1.9689526846990402, "learning_rate": 4.6359695811737805e-06, "loss": 0.5558, "step": 401 }, { "epoch": 0.36, "grad_norm": 2.043494453339269, "learning_rate": 4.634152010853175e-06, "loss": 0.5955, "step": 402 }, { "epoch": 0.36, "grad_norm": 1.9251519214200417, "learning_rate": 4.632330272487455e-06, "loss": 0.5587, "step": 403 }, { "epoch": 0.36, "grad_norm": 2.2049650629169495, "learning_rate": 4.6305043696345175e-06, "loss": 0.5633, "step": 404 }, { "epoch": 0.36, "grad_norm": 1.8971004366601951, "learning_rate": 4.628674305860389e-06, "loss": 0.5147, "step": 405 }, { "epoch": 0.36, "grad_norm": 1.958131978242853, "learning_rate": 4.626840084739224e-06, "loss": 0.558, "step": 406 }, { "epoch": 0.36, "grad_norm": 1.8809187299789303, "learning_rate": 4.625001709853296e-06, "loss": 0.6029, "step": 407 }, { "epoch": 0.36, "grad_norm": 2.07376704403877, "learning_rate": 4.623159184792992e-06, "loss": 0.5985, "step": 408 }, { "epoch": 0.36, "grad_norm": 1.9773215118384355, "learning_rate": 4.621312513156801e-06, "loss": 0.5592, "step": 409 }, { "epoch": 0.36, "grad_norm": 2.2454931529711373, "learning_rate": 4.6194616985513144e-06, "loss": 0.5265, "step": 410 }, { "epoch": 0.36, "grad_norm": 1.917266484743525, "learning_rate": 4.617606744591214e-06, "loss": 0.5579, "step": 411 }, { "epoch": 0.36, "grad_norm": 1.9196448264725143, "learning_rate": 4.615747654899263e-06, "loss": 0.5345, "step": 412 }, { "epoch": 0.37, "grad_norm": 1.9733157447209138, "learning_rate": 4.613884433106306e-06, "loss": 0.528, "step": 413 }, { "epoch": 0.37, "grad_norm": 1.994664364309963, "learning_rate": 4.612017082851253e-06, "loss": 0.5489, "step": 414 }, { "epoch": 0.37, "grad_norm": 1.8266904473141898, "learning_rate": 4.610145607781081e-06, "loss": 0.5411, "step": 415 }, { "epoch": 0.37, "grad_norm": 2.0294108873934364, "learning_rate": 4.608270011550823e-06, "loss": 0.5963, "step": 416 }, { "epoch": 0.37, "grad_norm": 1.9735002273071562, "learning_rate": 4.606390297823555e-06, "loss": 0.5858, "step": 417 }, { "epoch": 0.37, "grad_norm": 1.8987568737188125, "learning_rate": 4.604506470270403e-06, "loss": 0.493, "step": 418 }, { "epoch": 0.37, "grad_norm": 1.9371998611194052, "learning_rate": 4.6026185325705195e-06, "loss": 0.521, "step": 419 }, { "epoch": 0.37, "grad_norm": 1.8926221916061328, "learning_rate": 4.60072648841109e-06, "loss": 0.4922, "step": 420 }, { "epoch": 0.37, "grad_norm": 1.8759546163633927, "learning_rate": 4.598830341487317e-06, "loss": 0.5487, "step": 421 }, { "epoch": 0.37, "grad_norm": 1.9425705301229708, "learning_rate": 4.596930095502416e-06, "loss": 0.5155, "step": 422 }, { "epoch": 0.37, "grad_norm": 1.8718904454318124, "learning_rate": 4.59502575416761e-06, "loss": 0.5372, "step": 423 }, { "epoch": 0.38, "grad_norm": 1.8361742824749525, "learning_rate": 4.593117321202117e-06, "loss": 0.556, "step": 424 }, { "epoch": 0.38, "grad_norm": 1.8520540031413573, "learning_rate": 4.59120480033315e-06, "loss": 0.6213, "step": 425 }, { "epoch": 0.38, "grad_norm": 1.9670746741442957, "learning_rate": 4.5892881952959015e-06, "loss": 0.5685, "step": 426 }, { "epoch": 0.38, "grad_norm": 1.969557039139786, "learning_rate": 4.587367509833543e-06, "loss": 0.5472, "step": 427 }, { "epoch": 0.38, "grad_norm": 1.9873217018861624, "learning_rate": 4.585442747697218e-06, "loss": 0.5419, "step": 428 }, { "epoch": 0.38, "grad_norm": 1.9508580236237527, "learning_rate": 4.5835139126460234e-06, "loss": 0.566, "step": 429 }, { "epoch": 0.38, "grad_norm": 1.8929503262145966, "learning_rate": 4.58158100844702e-06, "loss": 0.5526, "step": 430 }, { "epoch": 0.38, "grad_norm": 1.9394545018501204, "learning_rate": 4.57964403887521e-06, "loss": 0.5469, "step": 431 }, { "epoch": 0.38, "grad_norm": 2.1045619298179927, "learning_rate": 4.577703007713538e-06, "loss": 0.5397, "step": 432 }, { "epoch": 0.38, "grad_norm": 1.8886665443222683, "learning_rate": 4.575757918752879e-06, "loss": 0.5174, "step": 433 }, { "epoch": 0.38, "grad_norm": 1.849256286655662, "learning_rate": 4.573808775792033e-06, "loss": 0.558, "step": 434 }, { "epoch": 0.39, "grad_norm": 1.89537230772545, "learning_rate": 4.5718555826377195e-06, "loss": 0.6155, "step": 435 }, { "epoch": 0.39, "grad_norm": 2.028600611269796, "learning_rate": 4.569898343104568e-06, "loss": 0.5639, "step": 436 }, { "epoch": 0.39, "grad_norm": 2.1153787641168273, "learning_rate": 4.567937061015107e-06, "loss": 0.5883, "step": 437 }, { "epoch": 0.39, "grad_norm": 2.0217937777574075, "learning_rate": 4.5659717401997655e-06, "loss": 0.5936, "step": 438 }, { "epoch": 0.39, "grad_norm": 2.248716610859176, "learning_rate": 4.564002384496856e-06, "loss": 0.5539, "step": 439 }, { "epoch": 0.39, "grad_norm": 1.9689879082294663, "learning_rate": 4.562028997752574e-06, "loss": 0.5636, "step": 440 }, { "epoch": 0.39, "grad_norm": 1.763292547062648, "learning_rate": 4.560051583820987e-06, "loss": 0.5402, "step": 441 }, { "epoch": 0.39, "grad_norm": 2.129235681815295, "learning_rate": 4.558070146564025e-06, "loss": 0.5279, "step": 442 }, { "epoch": 0.39, "grad_norm": 1.987329959970642, "learning_rate": 4.55608468985148e-06, "loss": 0.5597, "step": 443 }, { "epoch": 0.39, "grad_norm": 1.8223595251951752, "learning_rate": 4.554095217560991e-06, "loss": 0.5523, "step": 444 }, { "epoch": 0.39, "grad_norm": 1.8945373677348296, "learning_rate": 4.55210173357804e-06, "loss": 0.5611, "step": 445 }, { "epoch": 0.4, "grad_norm": 1.8010628987468362, "learning_rate": 4.550104241795946e-06, "loss": 0.5406, "step": 446 }, { "epoch": 0.4, "grad_norm": 1.7680591979019162, "learning_rate": 4.548102746115852e-06, "loss": 0.5392, "step": 447 }, { "epoch": 0.4, "grad_norm": 1.9894409183828397, "learning_rate": 4.546097250446724e-06, "loss": 0.568, "step": 448 }, { "epoch": 0.4, "grad_norm": 1.9527217933389673, "learning_rate": 4.544087758705338e-06, "loss": 0.5616, "step": 449 }, { "epoch": 0.4, "grad_norm": 1.8813970745759399, "learning_rate": 4.5420742748162735e-06, "loss": 0.5857, "step": 450 }, { "epoch": 0.4, "grad_norm": 1.9697471415378363, "learning_rate": 4.540056802711911e-06, "loss": 0.5563, "step": 451 }, { "epoch": 0.4, "grad_norm": 1.8610261764458738, "learning_rate": 4.5380353463324135e-06, "loss": 0.5414, "step": 452 }, { "epoch": 0.4, "grad_norm": 2.0760585222699075, "learning_rate": 4.536009909625733e-06, "loss": 0.6113, "step": 453 }, { "epoch": 0.4, "grad_norm": 1.9376608369819073, "learning_rate": 4.533980496547588e-06, "loss": 0.5567, "step": 454 }, { "epoch": 0.4, "grad_norm": 1.9360208325717025, "learning_rate": 4.5319471110614676e-06, "loss": 0.5637, "step": 455 }, { "epoch": 0.4, "grad_norm": 1.9103146510774847, "learning_rate": 4.529909757138619e-06, "loss": 0.5049, "step": 456 }, { "epoch": 0.4, "grad_norm": 1.9645365532954322, "learning_rate": 4.5278684387580356e-06, "loss": 0.5424, "step": 457 }, { "epoch": 0.41, "grad_norm": 2.0430691701895065, "learning_rate": 4.52582315990646e-06, "loss": 0.547, "step": 458 }, { "epoch": 0.41, "grad_norm": 1.995685349345533, "learning_rate": 4.523773924578362e-06, "loss": 0.6005, "step": 459 }, { "epoch": 0.41, "grad_norm": 1.9830544751269077, "learning_rate": 4.521720736775947e-06, "loss": 0.5563, "step": 460 }, { "epoch": 0.41, "grad_norm": 1.8473463212841006, "learning_rate": 4.519663600509131e-06, "loss": 0.5913, "step": 461 }, { "epoch": 0.41, "grad_norm": 1.8993140839815026, "learning_rate": 4.5176025197955495e-06, "loss": 0.5653, "step": 462 }, { "epoch": 0.41, "grad_norm": 1.8179551662772986, "learning_rate": 4.515537498660535e-06, "loss": 0.5485, "step": 463 }, { "epoch": 0.41, "grad_norm": 1.9275228062086758, "learning_rate": 4.51346854113712e-06, "loss": 0.5248, "step": 464 }, { "epoch": 0.41, "grad_norm": 1.9668428438048349, "learning_rate": 4.511395651266023e-06, "loss": 0.5939, "step": 465 }, { "epoch": 0.41, "grad_norm": 1.9602042152930792, "learning_rate": 4.509318833095642e-06, "loss": 0.5452, "step": 466 }, { "epoch": 0.41, "grad_norm": 1.8348566721600683, "learning_rate": 4.507238090682049e-06, "loss": 0.5514, "step": 467 }, { "epoch": 0.41, "grad_norm": 1.938525142403929, "learning_rate": 4.505153428088979e-06, "loss": 0.5822, "step": 468 }, { "epoch": 0.42, "grad_norm": 2.008973560332548, "learning_rate": 4.503064849387822e-06, "loss": 0.5765, "step": 469 }, { "epoch": 0.42, "grad_norm": 1.8911779425902009, "learning_rate": 4.500972358657618e-06, "loss": 0.5465, "step": 470 }, { "epoch": 0.42, "grad_norm": 1.9224818772820709, "learning_rate": 4.4988759599850485e-06, "loss": 0.5897, "step": 471 }, { "epoch": 0.42, "grad_norm": 1.990817812633161, "learning_rate": 4.496775657464423e-06, "loss": 0.5505, "step": 472 }, { "epoch": 0.42, "grad_norm": 1.9167562026803746, "learning_rate": 4.4946714551976795e-06, "loss": 0.5779, "step": 473 }, { "epoch": 0.42, "grad_norm": 1.9388400892712594, "learning_rate": 4.492563357294369e-06, "loss": 0.574, "step": 474 }, { "epoch": 0.42, "grad_norm": 2.0140312788131762, "learning_rate": 4.490451367871655e-06, "loss": 0.4928, "step": 475 }, { "epoch": 0.42, "grad_norm": 2.074902721101316, "learning_rate": 4.488335491054296e-06, "loss": 0.5366, "step": 476 }, { "epoch": 0.42, "grad_norm": 1.8245504149698855, "learning_rate": 4.486215730974646e-06, "loss": 0.581, "step": 477 }, { "epoch": 0.42, "grad_norm": 2.1100306515160656, "learning_rate": 4.4840920917726425e-06, "loss": 0.5677, "step": 478 }, { "epoch": 0.42, "grad_norm": 1.9560380000004616, "learning_rate": 4.4819645775958e-06, "loss": 0.5426, "step": 479 }, { "epoch": 0.43, "grad_norm": 1.721267171163405, "learning_rate": 4.479833192599198e-06, "loss": 0.5868, "step": 480 }, { "epoch": 0.43, "grad_norm": 2.0001169229847124, "learning_rate": 4.477697940945478e-06, "loss": 0.5667, "step": 481 }, { "epoch": 0.43, "grad_norm": 2.0111322894409134, "learning_rate": 4.475558826804833e-06, "loss": 0.5707, "step": 482 }, { "epoch": 0.43, "grad_norm": 1.8179588699061133, "learning_rate": 4.473415854355e-06, "loss": 0.5484, "step": 483 }, { "epoch": 0.43, "grad_norm": 2.0491236128150345, "learning_rate": 4.47126902778125e-06, "loss": 0.5575, "step": 484 }, { "epoch": 0.43, "grad_norm": 2.049676347036571, "learning_rate": 4.469118351276381e-06, "loss": 0.5807, "step": 485 }, { "epoch": 0.43, "grad_norm": 1.8999028972772445, "learning_rate": 4.4669638290407115e-06, "loss": 0.5447, "step": 486 }, { "epoch": 0.43, "grad_norm": 2.0754807768031687, "learning_rate": 4.464805465282071e-06, "loss": 0.503, "step": 487 }, { "epoch": 0.43, "grad_norm": 1.9532719169013661, "learning_rate": 4.462643264215789e-06, "loss": 0.5304, "step": 488 }, { "epoch": 0.43, "grad_norm": 2.038547881198709, "learning_rate": 4.460477230064693e-06, "loss": 0.6116, "step": 489 }, { "epoch": 0.43, "grad_norm": 2.1342568039197136, "learning_rate": 4.458307367059092e-06, "loss": 0.5632, "step": 490 }, { "epoch": 0.43, "grad_norm": 1.9267024509918977, "learning_rate": 4.456133679436778e-06, "loss": 0.5574, "step": 491 }, { "epoch": 0.44, "grad_norm": 1.795213135692931, "learning_rate": 4.453956171443008e-06, "loss": 0.5737, "step": 492 }, { "epoch": 0.44, "grad_norm": 1.9428252328171443, "learning_rate": 4.451774847330505e-06, "loss": 0.5685, "step": 493 }, { "epoch": 0.44, "grad_norm": 1.7903749800219122, "learning_rate": 4.449589711359439e-06, "loss": 0.5214, "step": 494 }, { "epoch": 0.44, "grad_norm": 2.111615491479605, "learning_rate": 4.447400767797429e-06, "loss": 0.5329, "step": 495 }, { "epoch": 0.44, "grad_norm": 1.936578332165912, "learning_rate": 4.445208020919531e-06, "loss": 0.543, "step": 496 }, { "epoch": 0.44, "grad_norm": 2.0005145681262473, "learning_rate": 4.4430114750082246e-06, "loss": 0.5593, "step": 497 }, { "epoch": 0.44, "grad_norm": 1.9720912009242426, "learning_rate": 4.4408111343534125e-06, "loss": 0.5812, "step": 498 }, { "epoch": 0.44, "grad_norm": 2.0486055586452787, "learning_rate": 4.4386070032524085e-06, "loss": 0.5563, "step": 499 }, { "epoch": 0.44, "grad_norm": 1.8043262288689983, "learning_rate": 4.436399086009928e-06, "loss": 0.4905, "step": 500 }, { "epoch": 0.44, "grad_norm": 1.9608580808640215, "learning_rate": 4.43418738693808e-06, "loss": 0.5548, "step": 501 }, { "epoch": 0.44, "grad_norm": 2.008548225584814, "learning_rate": 4.431971910356363e-06, "loss": 0.5955, "step": 502 }, { "epoch": 0.45, "grad_norm": 1.8974274240345173, "learning_rate": 4.429752660591648e-06, "loss": 0.5742, "step": 503 }, { "epoch": 0.45, "grad_norm": 1.8257689605722616, "learning_rate": 4.427529641978181e-06, "loss": 0.6177, "step": 504 }, { "epoch": 0.45, "grad_norm": 2.0327301577551764, "learning_rate": 4.425302858857563e-06, "loss": 0.5872, "step": 505 }, { "epoch": 0.45, "grad_norm": 1.9539661576324254, "learning_rate": 4.42307231557875e-06, "loss": 0.5728, "step": 506 }, { "epoch": 0.45, "grad_norm": 1.9346302819034207, "learning_rate": 4.420838016498043e-06, "loss": 0.6019, "step": 507 }, { "epoch": 0.45, "grad_norm": 2.1255667417446054, "learning_rate": 4.418599965979074e-06, "loss": 0.5981, "step": 508 }, { "epoch": 0.45, "grad_norm": 1.8293805714793054, "learning_rate": 4.416358168392806e-06, "loss": 0.5497, "step": 509 }, { "epoch": 0.45, "grad_norm": 1.929762647152706, "learning_rate": 4.414112628117518e-06, "loss": 0.5655, "step": 510 }, { "epoch": 0.45, "grad_norm": 1.9808758258773635, "learning_rate": 4.411863349538798e-06, "loss": 0.5465, "step": 511 }, { "epoch": 0.45, "grad_norm": 2.0413084054198647, "learning_rate": 4.409610337049537e-06, "loss": 0.5264, "step": 512 }, { "epoch": 0.45, "grad_norm": 1.9506473664088613, "learning_rate": 4.4073535950499155e-06, "loss": 0.5284, "step": 513 }, { "epoch": 0.46, "grad_norm": 1.7875399190820846, "learning_rate": 4.405093127947402e-06, "loss": 0.5406, "step": 514 }, { "epoch": 0.46, "grad_norm": 1.9594159192262046, "learning_rate": 4.402828940156735e-06, "loss": 0.573, "step": 515 }, { "epoch": 0.46, "grad_norm": 2.025943836966642, "learning_rate": 4.400561036099924e-06, "loss": 0.5227, "step": 516 }, { "epoch": 0.46, "grad_norm": 1.9439140060564322, "learning_rate": 4.398289420206235e-06, "loss": 0.5802, "step": 517 }, { "epoch": 0.46, "grad_norm": 1.891060025336787, "learning_rate": 4.396014096912182e-06, "loss": 0.55, "step": 518 }, { "epoch": 0.46, "grad_norm": 1.9575594944193413, "learning_rate": 4.393735070661521e-06, "loss": 0.5213, "step": 519 }, { "epoch": 0.46, "grad_norm": 2.024463679893138, "learning_rate": 4.391452345905239e-06, "loss": 0.5354, "step": 520 }, { "epoch": 0.46, "grad_norm": 1.825359223217947, "learning_rate": 4.389165927101549e-06, "loss": 0.5506, "step": 521 }, { "epoch": 0.46, "grad_norm": 2.0284690208197484, "learning_rate": 4.386875818715875e-06, "loss": 0.5763, "step": 522 }, { "epoch": 0.46, "grad_norm": 1.9021830177238082, "learning_rate": 4.3845820252208476e-06, "loss": 0.5596, "step": 523 }, { "epoch": 0.46, "grad_norm": 2.0000504821060203, "learning_rate": 4.3822845510962966e-06, "loss": 0.5701, "step": 524 }, { "epoch": 0.47, "grad_norm": 1.7341340075311633, "learning_rate": 4.379983400829237e-06, "loss": 0.5315, "step": 525 }, { "epoch": 0.47, "grad_norm": 1.9297447671947465, "learning_rate": 4.377678578913868e-06, "loss": 0.5798, "step": 526 }, { "epoch": 0.47, "grad_norm": 1.9233069620366818, "learning_rate": 4.375370089851554e-06, "loss": 0.5391, "step": 527 }, { "epoch": 0.47, "grad_norm": 1.976671700063146, "learning_rate": 4.3730579381508254e-06, "loss": 0.5674, "step": 528 }, { "epoch": 0.47, "grad_norm": 1.914097057045113, "learning_rate": 4.3707421283273645e-06, "loss": 0.5367, "step": 529 }, { "epoch": 0.47, "grad_norm": 1.8477362806445459, "learning_rate": 4.368422664903997e-06, "loss": 0.5349, "step": 530 }, { "epoch": 0.47, "grad_norm": 1.9704477099484594, "learning_rate": 4.366099552410686e-06, "loss": 0.501, "step": 531 }, { "epoch": 0.47, "grad_norm": 1.9297086500071385, "learning_rate": 4.363772795384522e-06, "loss": 0.5352, "step": 532 }, { "epoch": 0.47, "grad_norm": 1.9090996748848685, "learning_rate": 4.36144239836971e-06, "loss": 0.5457, "step": 533 }, { "epoch": 0.47, "grad_norm": 1.905870882711107, "learning_rate": 4.3591083659175655e-06, "loss": 0.5685, "step": 534 }, { "epoch": 0.47, "grad_norm": 1.968618442539214, "learning_rate": 4.356770702586506e-06, "loss": 0.5476, "step": 535 }, { "epoch": 0.47, "grad_norm": 1.9431218136805426, "learning_rate": 4.354429412942038e-06, "loss": 0.5719, "step": 536 }, { "epoch": 0.48, "grad_norm": 2.0756451350956215, "learning_rate": 4.3520845015567495e-06, "loss": 0.5502, "step": 537 }, { "epoch": 0.48, "grad_norm": 1.8350117686217275, "learning_rate": 4.349735973010306e-06, "loss": 0.5417, "step": 538 }, { "epoch": 0.48, "grad_norm": 2.03495920394236, "learning_rate": 4.3473838318894324e-06, "loss": 0.545, "step": 539 }, { "epoch": 0.48, "grad_norm": 1.7864245375307775, "learning_rate": 4.3450280827879125e-06, "loss": 0.5242, "step": 540 }, { "epoch": 0.48, "grad_norm": 1.9018530036883652, "learning_rate": 4.342668730306575e-06, "loss": 0.554, "step": 541 }, { "epoch": 0.48, "grad_norm": 1.8575071370513128, "learning_rate": 4.340305779053286e-06, "loss": 0.5287, "step": 542 }, { "epoch": 0.48, "grad_norm": 1.8480049595126469, "learning_rate": 4.33793923364294e-06, "loss": 0.5554, "step": 543 }, { "epoch": 0.48, "grad_norm": 2.103039565778625, "learning_rate": 4.335569098697454e-06, "loss": 0.5526, "step": 544 }, { "epoch": 0.48, "grad_norm": 1.8712145108160219, "learning_rate": 4.33319537884575e-06, "loss": 0.5472, "step": 545 }, { "epoch": 0.48, "grad_norm": 1.9271972466285336, "learning_rate": 4.330818078723756e-06, "loss": 0.5827, "step": 546 }, { "epoch": 0.48, "grad_norm": 1.954438973741856, "learning_rate": 4.328437202974389e-06, "loss": 0.5433, "step": 547 }, { "epoch": 0.49, "grad_norm": 2.0467264178153726, "learning_rate": 4.326052756247553e-06, "loss": 0.5981, "step": 548 }, { "epoch": 0.49, "grad_norm": 1.9418055408636266, "learning_rate": 4.323664743200123e-06, "loss": 0.5832, "step": 549 }, { "epoch": 0.49, "grad_norm": 2.444044603553196, "learning_rate": 4.32127316849594e-06, "loss": 0.5638, "step": 550 }, { "epoch": 0.49, "grad_norm": 1.8791947879326414, "learning_rate": 4.318878036805802e-06, "loss": 0.5864, "step": 551 }, { "epoch": 0.49, "grad_norm": 1.872356245946924, "learning_rate": 4.3164793528074525e-06, "loss": 0.5337, "step": 552 }, { "epoch": 0.49, "grad_norm": 2.025493213646544, "learning_rate": 4.3140771211855725e-06, "loss": 0.5401, "step": 553 }, { "epoch": 0.49, "grad_norm": 1.9845857759145742, "learning_rate": 4.3116713466317745e-06, "loss": 0.5712, "step": 554 }, { "epoch": 0.49, "grad_norm": 1.9091874317608197, "learning_rate": 4.309262033844587e-06, "loss": 0.5337, "step": 555 }, { "epoch": 0.49, "grad_norm": 1.926646558220673, "learning_rate": 4.30684918752945e-06, "loss": 0.5787, "step": 556 }, { "epoch": 0.49, "grad_norm": 2.0450560123448165, "learning_rate": 4.304432812398704e-06, "loss": 0.5704, "step": 557 }, { "epoch": 0.49, "grad_norm": 1.915800332391142, "learning_rate": 4.302012913171584e-06, "loss": 0.5194, "step": 558 }, { "epoch": 0.5, "grad_norm": 1.9050588229807015, "learning_rate": 4.299589494574204e-06, "loss": 0.5104, "step": 559 }, { "epoch": 0.5, "grad_norm": 1.9241714112001687, "learning_rate": 4.297162561339554e-06, "loss": 0.5388, "step": 560 }, { "epoch": 0.5, "grad_norm": 1.8520273210081386, "learning_rate": 4.294732118207486e-06, "loss": 0.5363, "step": 561 }, { "epoch": 0.5, "grad_norm": 2.0240180827444205, "learning_rate": 4.292298169924709e-06, "loss": 0.5632, "step": 562 }, { "epoch": 0.5, "grad_norm": 1.8385436745856445, "learning_rate": 4.289860721244776e-06, "loss": 0.542, "step": 563 }, { "epoch": 0.5, "grad_norm": 1.9260618068482396, "learning_rate": 4.287419776928078e-06, "loss": 0.5555, "step": 564 }, { "epoch": 0.5, "grad_norm": 3.155290692386073, "learning_rate": 4.284975341741833e-06, "loss": 0.5336, "step": 565 }, { "epoch": 0.5, "grad_norm": 2.461077264148098, "learning_rate": 4.282527420460073e-06, "loss": 0.5794, "step": 566 }, { "epoch": 0.5, "grad_norm": 1.8539810703173831, "learning_rate": 4.280076017863643e-06, "loss": 0.5298, "step": 567 }, { "epoch": 0.5, "grad_norm": 1.981150552962984, "learning_rate": 4.277621138740185e-06, "loss": 0.5862, "step": 568 }, { "epoch": 0.5, "grad_norm": 1.8768796036679432, "learning_rate": 4.275162787884132e-06, "loss": 0.5255, "step": 569 }, { "epoch": 0.5, "grad_norm": 2.022795676637582, "learning_rate": 4.272700970096696e-06, "loss": 0.5984, "step": 570 }, { "epoch": 0.51, "grad_norm": 1.835618231704385, "learning_rate": 4.27023569018586e-06, "loss": 0.5297, "step": 571 }, { "epoch": 0.51, "grad_norm": 1.853495005213679, "learning_rate": 4.267766952966369e-06, "loss": 0.5188, "step": 572 }, { "epoch": 0.51, "grad_norm": 1.8841750183665413, "learning_rate": 4.265294763259721e-06, "loss": 0.5678, "step": 573 }, { "epoch": 0.51, "grad_norm": 1.8013177249236558, "learning_rate": 4.262819125894156e-06, "loss": 0.5286, "step": 574 }, { "epoch": 0.51, "grad_norm": 1.8320928495052518, "learning_rate": 4.2603400457046476e-06, "loss": 0.5341, "step": 575 }, { "epoch": 0.51, "grad_norm": 1.8323864124122828, "learning_rate": 4.257857527532891e-06, "loss": 0.5283, "step": 576 }, { "epoch": 0.51, "grad_norm": 1.9487038959665601, "learning_rate": 4.255371576227301e-06, "loss": 0.5418, "step": 577 }, { "epoch": 0.51, "grad_norm": 1.7875154296015772, "learning_rate": 4.252882196642993e-06, "loss": 0.5065, "step": 578 }, { "epoch": 0.51, "grad_norm": 2.089827238376911, "learning_rate": 4.250389393641778e-06, "loss": 0.5919, "step": 579 }, { "epoch": 0.51, "grad_norm": 1.9078348658003164, "learning_rate": 4.247893172092157e-06, "loss": 0.5212, "step": 580 }, { "epoch": 0.51, "grad_norm": 1.9952457072102052, "learning_rate": 4.245393536869303e-06, "loss": 0.5284, "step": 581 }, { "epoch": 0.52, "grad_norm": 2.0728561008210384, "learning_rate": 4.242890492855056e-06, "loss": 0.5214, "step": 582 }, { "epoch": 0.52, "grad_norm": 1.97825451090628, "learning_rate": 4.240384044937919e-06, "loss": 0.5586, "step": 583 }, { "epoch": 0.52, "grad_norm": 1.85380003580073, "learning_rate": 4.237874198013037e-06, "loss": 0.6078, "step": 584 }, { "epoch": 0.52, "grad_norm": 1.8198051628607304, "learning_rate": 4.235360956982196e-06, "loss": 0.5677, "step": 585 }, { "epoch": 0.52, "grad_norm": 2.1343351043013183, "learning_rate": 4.23284432675381e-06, "loss": 0.5706, "step": 586 }, { "epoch": 0.52, "grad_norm": 2.0294462862804896, "learning_rate": 4.230324312242911e-06, "loss": 0.5399, "step": 587 }, { "epoch": 0.52, "grad_norm": 1.9618881336969853, "learning_rate": 4.227800918371145e-06, "loss": 0.5292, "step": 588 }, { "epoch": 0.52, "grad_norm": 1.9665398714083597, "learning_rate": 4.225274150066752e-06, "loss": 0.5414, "step": 589 }, { "epoch": 0.52, "grad_norm": 2.0976099857689268, "learning_rate": 4.222744012264567e-06, "loss": 0.5204, "step": 590 }, { "epoch": 0.52, "grad_norm": 1.968032018982793, "learning_rate": 4.220210509906002e-06, "loss": 0.5622, "step": 591 }, { "epoch": 0.52, "grad_norm": 2.0055542027073523, "learning_rate": 4.217673647939044e-06, "loss": 0.5723, "step": 592 }, { "epoch": 0.53, "grad_norm": 2.031612125247833, "learning_rate": 4.215133431318239e-06, "loss": 0.5727, "step": 593 }, { "epoch": 0.53, "grad_norm": 2.04253552367063, "learning_rate": 4.212589865004684e-06, "loss": 0.5676, "step": 594 }, { "epoch": 0.53, "grad_norm": 1.9143447724555291, "learning_rate": 4.2100429539660205e-06, "loss": 0.5452, "step": 595 }, { "epoch": 0.53, "grad_norm": 2.1284999811605334, "learning_rate": 4.20749270317642e-06, "loss": 0.5679, "step": 596 }, { "epoch": 0.53, "grad_norm": 1.9726237378545723, "learning_rate": 4.204939117616578e-06, "loss": 0.5514, "step": 597 }, { "epoch": 0.53, "grad_norm": 2.0537722291479583, "learning_rate": 4.202382202273702e-06, "loss": 0.5979, "step": 598 }, { "epoch": 0.53, "grad_norm": 1.9695944675405062, "learning_rate": 4.1998219621415035e-06, "loss": 0.5519, "step": 599 }, { "epoch": 0.53, "grad_norm": 2.1175148159531196, "learning_rate": 4.197258402220187e-06, "loss": 0.5437, "step": 600 }, { "epoch": 0.53, "grad_norm": 1.9698920488340708, "learning_rate": 4.19469152751644e-06, "loss": 0.5765, "step": 601 }, { "epoch": 0.53, "grad_norm": 1.879379971551763, "learning_rate": 4.192121343043424e-06, "loss": 0.5219, "step": 602 }, { "epoch": 0.53, "grad_norm": 1.9668215341266202, "learning_rate": 4.189547853820767e-06, "loss": 0.4967, "step": 603 }, { "epoch": 0.53, "grad_norm": 2.0264415648360723, "learning_rate": 4.186971064874547e-06, "loss": 0.5591, "step": 604 }, { "epoch": 0.54, "grad_norm": 1.9996711001240413, "learning_rate": 4.18439098123729e-06, "loss": 0.5909, "step": 605 }, { "epoch": 0.54, "grad_norm": 1.9209919754307736, "learning_rate": 4.181807607947954e-06, "loss": 0.5516, "step": 606 }, { "epoch": 0.54, "grad_norm": 1.8120062816345244, "learning_rate": 4.1792209500519245e-06, "loss": 0.5112, "step": 607 }, { "epoch": 0.54, "grad_norm": 1.9265993932694714, "learning_rate": 4.176631012601e-06, "loss": 0.5716, "step": 608 }, { "epoch": 0.54, "grad_norm": 1.7951063568824173, "learning_rate": 4.1740378006533835e-06, "loss": 0.5546, "step": 609 }, { "epoch": 0.54, "grad_norm": 1.9478736935670538, "learning_rate": 4.1714413192736756e-06, "loss": 0.5137, "step": 610 }, { "epoch": 0.54, "grad_norm": 1.9166713700159672, "learning_rate": 4.168841573532859e-06, "loss": 0.5285, "step": 611 }, { "epoch": 0.54, "grad_norm": 1.903061790874867, "learning_rate": 4.166238568508294e-06, "loss": 0.5643, "step": 612 }, { "epoch": 0.54, "grad_norm": 1.8709574261812854, "learning_rate": 4.1636323092837065e-06, "loss": 0.5531, "step": 613 }, { "epoch": 0.54, "grad_norm": 1.891374469060374, "learning_rate": 4.161022800949177e-06, "loss": 0.5386, "step": 614 }, { "epoch": 0.54, "grad_norm": 1.8621023435008923, "learning_rate": 4.1584100486011315e-06, "loss": 0.5472, "step": 615 }, { "epoch": 0.55, "grad_norm": 1.8927480615848256, "learning_rate": 4.155794057342333e-06, "loss": 0.567, "step": 616 }, { "epoch": 0.55, "grad_norm": 1.9157957155248084, "learning_rate": 4.153174832281867e-06, "loss": 0.5295, "step": 617 }, { "epoch": 0.55, "grad_norm": 1.7900976303440275, "learning_rate": 4.150552378535138e-06, "loss": 0.5374, "step": 618 }, { "epoch": 0.55, "grad_norm": 1.9233860209522704, "learning_rate": 4.1479267012238555e-06, "loss": 0.5673, "step": 619 }, { "epoch": 0.55, "grad_norm": 1.904244620695313, "learning_rate": 4.145297805476023e-06, "loss": 0.5674, "step": 620 }, { "epoch": 0.55, "grad_norm": 1.8633100020518014, "learning_rate": 4.142665696425932e-06, "loss": 0.5717, "step": 621 }, { "epoch": 0.55, "grad_norm": 2.0449274851229764, "learning_rate": 4.140030379214147e-06, "loss": 0.5382, "step": 622 }, { "epoch": 0.55, "grad_norm": 1.8437126524936716, "learning_rate": 4.137391858987502e-06, "loss": 0.5635, "step": 623 }, { "epoch": 0.55, "grad_norm": 1.9476300616110815, "learning_rate": 4.134750140899082e-06, "loss": 0.5354, "step": 624 }, { "epoch": 0.55, "grad_norm": 1.8187836169409277, "learning_rate": 4.132105230108221e-06, "loss": 0.5678, "step": 625 }, { "epoch": 0.55, "grad_norm": 1.8325255303792565, "learning_rate": 4.1294571317804854e-06, "loss": 0.5497, "step": 626 }, { "epoch": 0.56, "grad_norm": 1.947073088948294, "learning_rate": 4.12680585108767e-06, "loss": 0.6005, "step": 627 }, { "epoch": 0.56, "grad_norm": 1.9094602677105208, "learning_rate": 4.1241513932077835e-06, "loss": 0.5442, "step": 628 }, { "epoch": 0.56, "grad_norm": 1.9308069577521967, "learning_rate": 4.121493763325039e-06, "loss": 0.4952, "step": 629 }, { "epoch": 0.56, "grad_norm": 1.955225453108231, "learning_rate": 4.118832966629847e-06, "loss": 0.5161, "step": 630 }, { "epoch": 0.56, "grad_norm": 1.8884686835300686, "learning_rate": 4.116169008318798e-06, "loss": 0.5834, "step": 631 }, { "epoch": 0.56, "grad_norm": 1.851971220446282, "learning_rate": 4.113501893594662e-06, "loss": 0.5762, "step": 632 }, { "epoch": 0.56, "grad_norm": 1.982231343732386, "learning_rate": 4.110831627666372e-06, "loss": 0.5043, "step": 633 }, { "epoch": 0.56, "grad_norm": 1.8783480932058496, "learning_rate": 4.108158215749014e-06, "loss": 0.5202, "step": 634 }, { "epoch": 0.56, "grad_norm": 1.7472053862830499, "learning_rate": 4.105481663063821e-06, "loss": 0.5064, "step": 635 }, { "epoch": 0.56, "grad_norm": 4.71435326799849, "learning_rate": 4.102801974838158e-06, "loss": 0.5808, "step": 636 }, { "epoch": 0.56, "grad_norm": 1.9383972995582568, "learning_rate": 4.100119156305514e-06, "loss": 0.5268, "step": 637 }, { "epoch": 0.57, "grad_norm": 1.7165619283230378, "learning_rate": 4.097433212705492e-06, "loss": 0.5376, "step": 638 }, { "epoch": 0.57, "grad_norm": 1.8524888535442023, "learning_rate": 4.094744149283796e-06, "loss": 0.5388, "step": 639 }, { "epoch": 0.57, "grad_norm": 1.958121956311822, "learning_rate": 4.092051971292228e-06, "loss": 0.5273, "step": 640 }, { "epoch": 0.57, "grad_norm": 1.8752806971174674, "learning_rate": 4.089356683988668e-06, "loss": 0.5283, "step": 641 }, { "epoch": 0.57, "grad_norm": 2.4399117721583465, "learning_rate": 4.086658292637072e-06, "loss": 0.5643, "step": 642 }, { "epoch": 0.57, "grad_norm": 1.897865148445396, "learning_rate": 4.083956802507456e-06, "loss": 0.5432, "step": 643 }, { "epoch": 0.57, "grad_norm": 2.0947253224544826, "learning_rate": 4.0812522188758874e-06, "loss": 0.6738, "step": 644 }, { "epoch": 0.57, "grad_norm": 1.8801252766945993, "learning_rate": 4.078544547024479e-06, "loss": 0.5516, "step": 645 }, { "epoch": 0.57, "grad_norm": 1.884681207915535, "learning_rate": 4.075833792241371e-06, "loss": 0.5521, "step": 646 }, { "epoch": 0.57, "grad_norm": 1.911314829964074, "learning_rate": 4.073119959820728e-06, "loss": 0.5279, "step": 647 }, { "epoch": 0.57, "grad_norm": 1.860637117587055, "learning_rate": 4.070403055062721e-06, "loss": 0.5543, "step": 648 }, { "epoch": 0.57, "grad_norm": 2.0453601596603157, "learning_rate": 4.0676830832735245e-06, "loss": 0.5757, "step": 649 }, { "epoch": 0.58, "grad_norm": 1.8114060321351384, "learning_rate": 4.064960049765304e-06, "loss": 0.5049, "step": 650 }, { "epoch": 0.58, "grad_norm": 1.959305167631277, "learning_rate": 4.062233959856202e-06, "loss": 0.5378, "step": 651 }, { "epoch": 0.58, "grad_norm": 1.8509512649844786, "learning_rate": 4.059504818870332e-06, "loss": 0.5695, "step": 652 }, { "epoch": 0.58, "grad_norm": 2.0120311393374677, "learning_rate": 4.056772632137762e-06, "loss": 0.5548, "step": 653 }, { "epoch": 0.58, "grad_norm": 2.185006431209757, "learning_rate": 4.054037404994516e-06, "loss": 0.5796, "step": 654 }, { "epoch": 0.58, "grad_norm": 1.8639659087725635, "learning_rate": 4.05129914278255e-06, "loss": 0.503, "step": 655 }, { "epoch": 0.58, "grad_norm": 2.0128366658538726, "learning_rate": 4.048557850849749e-06, "loss": 0.5543, "step": 656 }, { "epoch": 0.58, "grad_norm": 2.0493127075126467, "learning_rate": 4.045813534549917e-06, "loss": 0.5971, "step": 657 }, { "epoch": 0.58, "grad_norm": 1.8943877873256292, "learning_rate": 4.043066199242762e-06, "loss": 0.5512, "step": 658 }, { "epoch": 0.58, "grad_norm": 1.8607643797927613, "learning_rate": 4.04031585029389e-06, "loss": 0.5755, "step": 659 }, { "epoch": 0.58, "grad_norm": 1.933467010931308, "learning_rate": 4.037562493074792e-06, "loss": 0.546, "step": 660 }, { "epoch": 0.59, "grad_norm": 1.870898209604796, "learning_rate": 4.034806132962834e-06, "loss": 0.5101, "step": 661 }, { "epoch": 0.59, "grad_norm": 1.7765005525064146, "learning_rate": 4.032046775341247e-06, "loss": 0.535, "step": 662 }, { "epoch": 0.59, "grad_norm": 1.808388020113739, "learning_rate": 4.029284425599116e-06, "loss": 0.5532, "step": 663 }, { "epoch": 0.59, "grad_norm": 1.9444426383785842, "learning_rate": 4.026519089131371e-06, "loss": 0.5804, "step": 664 }, { "epoch": 0.59, "grad_norm": 1.8810929458792174, "learning_rate": 4.023750771338774e-06, "loss": 0.5023, "step": 665 }, { "epoch": 0.59, "grad_norm": 1.7587173598023012, "learning_rate": 4.020979477627907e-06, "loss": 0.588, "step": 666 }, { "epoch": 0.59, "grad_norm": 1.8616544736960938, "learning_rate": 4.018205213411169e-06, "loss": 0.5604, "step": 667 }, { "epoch": 0.59, "grad_norm": 1.8517363531329913, "learning_rate": 4.015427984106759e-06, "loss": 0.5503, "step": 668 }, { "epoch": 0.59, "grad_norm": 1.7164279131663547, "learning_rate": 4.012647795138664e-06, "loss": 0.5353, "step": 669 }, { "epoch": 0.59, "grad_norm": 1.8490922932257532, "learning_rate": 4.009864651936653e-06, "loss": 0.5527, "step": 670 }, { "epoch": 0.59, "grad_norm": 1.9222471762582807, "learning_rate": 4.007078559936268e-06, "loss": 0.5449, "step": 671 }, { "epoch": 0.6, "grad_norm": 1.7126406752680576, "learning_rate": 4.0042895245788035e-06, "loss": 0.5102, "step": 672 }, { "epoch": 0.6, "grad_norm": 1.7999692875631594, "learning_rate": 4.001497551311308e-06, "loss": 0.514, "step": 673 }, { "epoch": 0.6, "grad_norm": 1.8482521644616647, "learning_rate": 3.998702645586565e-06, "loss": 0.546, "step": 674 }, { "epoch": 0.6, "grad_norm": 1.8124842120343776, "learning_rate": 3.995904812863086e-06, "loss": 0.5432, "step": 675 }, { "epoch": 0.6, "grad_norm": 1.9053654350943952, "learning_rate": 3.993104058605099e-06, "loss": 0.6222, "step": 676 }, { "epoch": 0.6, "grad_norm": 1.851530834120678, "learning_rate": 3.9903003882825396e-06, "loss": 0.5069, "step": 677 }, { "epoch": 0.6, "grad_norm": 1.824612938648448, "learning_rate": 3.987493807371033e-06, "loss": 0.5279, "step": 678 }, { "epoch": 0.6, "grad_norm": 1.8322983038942529, "learning_rate": 3.984684321351895e-06, "loss": 0.504, "step": 679 }, { "epoch": 0.6, "grad_norm": 2.1601679247075105, "learning_rate": 3.981871935712112e-06, "loss": 0.5448, "step": 680 }, { "epoch": 0.6, "grad_norm": 1.9324323412240167, "learning_rate": 3.979056655944335e-06, "loss": 0.5696, "step": 681 }, { "epoch": 0.6, "grad_norm": 1.8887222870071794, "learning_rate": 3.9762384875468645e-06, "loss": 0.5147, "step": 682 }, { "epoch": 0.6, "grad_norm": 1.9025483031058836, "learning_rate": 3.973417436023646e-06, "loss": 0.5322, "step": 683 }, { "epoch": 0.61, "grad_norm": 1.944754689874286, "learning_rate": 3.970593506884254e-06, "loss": 0.564, "step": 684 }, { "epoch": 0.61, "grad_norm": 1.8782062559948918, "learning_rate": 3.9677667056438824e-06, "loss": 0.5179, "step": 685 }, { "epoch": 0.61, "grad_norm": 1.7615090001622373, "learning_rate": 3.964937037823337e-06, "loss": 0.52, "step": 686 }, { "epoch": 0.61, "grad_norm": 1.877979446527034, "learning_rate": 3.962104508949018e-06, "loss": 0.5611, "step": 687 }, { "epoch": 0.61, "grad_norm": 1.8668900126580097, "learning_rate": 3.9592691245529174e-06, "loss": 0.5398, "step": 688 }, { "epoch": 0.61, "grad_norm": 2.0467424748632395, "learning_rate": 3.9564308901726016e-06, "loss": 0.5429, "step": 689 }, { "epoch": 0.61, "grad_norm": 1.7523480652481473, "learning_rate": 3.9535898113512046e-06, "loss": 0.5456, "step": 690 }, { "epoch": 0.61, "grad_norm": 1.9384307177445268, "learning_rate": 3.950745893637414e-06, "loss": 0.5298, "step": 691 }, { "epoch": 0.61, "grad_norm": 2.0200307543606266, "learning_rate": 3.947899142585464e-06, "loss": 0.5813, "step": 692 }, { "epoch": 0.61, "grad_norm": 1.8825594318661294, "learning_rate": 3.945049563755119e-06, "loss": 0.5843, "step": 693 }, { "epoch": 0.61, "grad_norm": 1.801304483173922, "learning_rate": 3.94219716271167e-06, "loss": 0.5332, "step": 694 }, { "epoch": 0.62, "grad_norm": 1.789336412692842, "learning_rate": 3.939341945025918e-06, "loss": 0.5712, "step": 695 }, { "epoch": 0.62, "grad_norm": 1.6764596672056864, "learning_rate": 3.936483916274163e-06, "loss": 0.5471, "step": 696 }, { "epoch": 0.62, "grad_norm": 1.8160991340297739, "learning_rate": 3.933623082038199e-06, "loss": 0.5172, "step": 697 }, { "epoch": 0.62, "grad_norm": 1.9958719154660882, "learning_rate": 3.930759447905298e-06, "loss": 0.5243, "step": 698 }, { "epoch": 0.62, "grad_norm": 1.7844190098902166, "learning_rate": 3.927893019468196e-06, "loss": 0.5679, "step": 699 }, { "epoch": 0.62, "grad_norm": 1.8231700761644845, "learning_rate": 3.925023802325094e-06, "loss": 0.5415, "step": 700 }, { "epoch": 0.62, "grad_norm": 1.8577751348591511, "learning_rate": 3.922151802079633e-06, "loss": 0.5451, "step": 701 }, { "epoch": 0.62, "grad_norm": 1.872268020286279, "learning_rate": 3.919277024340891e-06, "loss": 0.5805, "step": 702 }, { "epoch": 0.62, "grad_norm": 1.956916033214976, "learning_rate": 3.916399474723373e-06, "loss": 0.5142, "step": 703 }, { "epoch": 0.62, "grad_norm": 1.8690696320721123, "learning_rate": 3.913519158846994e-06, "loss": 0.5377, "step": 704 }, { "epoch": 0.62, "grad_norm": 1.8932224298053513, "learning_rate": 3.910636082337076e-06, "loss": 0.5174, "step": 705 }, { "epoch": 0.63, "grad_norm": 1.7671002724508906, "learning_rate": 3.907750250824327e-06, "loss": 0.5227, "step": 706 }, { "epoch": 0.63, "grad_norm": 1.8537234882936333, "learning_rate": 3.904861669944839e-06, "loss": 0.5672, "step": 707 }, { "epoch": 0.63, "grad_norm": 1.8993796687475375, "learning_rate": 3.901970345340075e-06, "loss": 0.5131, "step": 708 }, { "epoch": 0.63, "grad_norm": 1.8118617206389966, "learning_rate": 3.899076282656853e-06, "loss": 0.5243, "step": 709 }, { "epoch": 0.63, "grad_norm": 1.8195324114535576, "learning_rate": 3.89617948754734e-06, "loss": 0.5255, "step": 710 }, { "epoch": 0.63, "grad_norm": 1.777076552111516, "learning_rate": 3.89327996566904e-06, "loss": 0.5482, "step": 711 }, { "epoch": 0.63, "grad_norm": 1.7960584295638569, "learning_rate": 3.890377722684782e-06, "loss": 0.5232, "step": 712 }, { "epoch": 0.63, "grad_norm": 2.0180517293259777, "learning_rate": 3.887472764262709e-06, "loss": 0.4988, "step": 713 }, { "epoch": 0.63, "grad_norm": 1.7698597985590767, "learning_rate": 3.884565096076269e-06, "loss": 0.4934, "step": 714 }, { "epoch": 0.63, "grad_norm": 1.9593013419554524, "learning_rate": 3.8816547238042e-06, "loss": 0.554, "step": 715 }, { "epoch": 0.63, "grad_norm": 1.803176799671639, "learning_rate": 3.878741653130521e-06, "loss": 0.5058, "step": 716 }, { "epoch": 0.64, "grad_norm": 1.8739139669777212, "learning_rate": 3.875825889744525e-06, "loss": 0.5291, "step": 717 }, { "epoch": 0.64, "grad_norm": 1.7425957572489872, "learning_rate": 3.872907439340758e-06, "loss": 0.5132, "step": 718 }, { "epoch": 0.64, "grad_norm": 1.7880023308134785, "learning_rate": 3.86998630761902e-06, "loss": 0.5388, "step": 719 }, { "epoch": 0.64, "grad_norm": 2.035324802689225, "learning_rate": 3.867062500284342e-06, "loss": 0.5225, "step": 720 }, { "epoch": 0.64, "grad_norm": 1.7720228048563502, "learning_rate": 3.864136023046984e-06, "loss": 0.5535, "step": 721 }, { "epoch": 0.64, "grad_norm": 1.893636721431615, "learning_rate": 3.861206881622419e-06, "loss": 0.5445, "step": 722 }, { "epoch": 0.64, "grad_norm": 1.9975882991420841, "learning_rate": 3.8582750817313245e-06, "loss": 0.498, "step": 723 }, { "epoch": 0.64, "grad_norm": 1.8894358056153195, "learning_rate": 3.855340629099568e-06, "loss": 0.5262, "step": 724 }, { "epoch": 0.64, "grad_norm": 1.8226831631189866, "learning_rate": 3.852403529458199e-06, "loss": 0.5289, "step": 725 }, { "epoch": 0.64, "grad_norm": 1.9219589460322386, "learning_rate": 3.84946378854344e-06, "loss": 0.5828, "step": 726 }, { "epoch": 0.64, "grad_norm": 1.9524000874112546, "learning_rate": 3.846521412096665e-06, "loss": 0.5755, "step": 727 }, { "epoch": 0.64, "grad_norm": 1.7855988589662195, "learning_rate": 3.8435764058643994e-06, "loss": 0.508, "step": 728 }, { "epoch": 0.65, "grad_norm": 1.7556968697529176, "learning_rate": 3.840628775598306e-06, "loss": 0.5038, "step": 729 }, { "epoch": 0.65, "grad_norm": 1.8615629845007688, "learning_rate": 3.837678527055168e-06, "loss": 0.5658, "step": 730 }, { "epoch": 0.65, "grad_norm": 3.355106616980178, "learning_rate": 3.834725665996889e-06, "loss": 0.6255, "step": 731 }, { "epoch": 0.65, "grad_norm": 2.057901705133853, "learning_rate": 3.8317701981904655e-06, "loss": 0.5009, "step": 732 }, { "epoch": 0.65, "grad_norm": 1.8144866213511652, "learning_rate": 3.828812129407994e-06, "loss": 0.5378, "step": 733 }, { "epoch": 0.65, "grad_norm": 1.895740992214761, "learning_rate": 3.825851465426643e-06, "loss": 0.5414, "step": 734 }, { "epoch": 0.65, "grad_norm": 1.7690202691648218, "learning_rate": 3.822888212028658e-06, "loss": 0.5782, "step": 735 }, { "epoch": 0.65, "grad_norm": 1.9910212850942313, "learning_rate": 3.819922375001334e-06, "loss": 0.538, "step": 736 }, { "epoch": 0.65, "grad_norm": 2.022977401775343, "learning_rate": 3.816953960137017e-06, "loss": 0.5265, "step": 737 }, { "epoch": 0.65, "grad_norm": 2.18942238369997, "learning_rate": 3.8139829732330833e-06, "loss": 0.5419, "step": 738 }, { "epoch": 0.65, "grad_norm": 2.0143145051916487, "learning_rate": 3.8110094200919356e-06, "loss": 0.5396, "step": 739 }, { "epoch": 0.66, "grad_norm": 1.8684895296380082, "learning_rate": 3.8080333065209885e-06, "loss": 0.5285, "step": 740 }, { "epoch": 0.66, "grad_norm": 1.899758991227905, "learning_rate": 3.8050546383326546e-06, "loss": 0.5392, "step": 741 }, { "epoch": 0.66, "grad_norm": 1.7830347822365242, "learning_rate": 3.8020734213443392e-06, "loss": 0.5395, "step": 742 }, { "epoch": 0.66, "grad_norm": 1.9688219937316351, "learning_rate": 3.799089661378423e-06, "loss": 0.5832, "step": 743 }, { "epoch": 0.66, "grad_norm": 1.8380061964557934, "learning_rate": 3.7961033642622536e-06, "loss": 0.5182, "step": 744 }, { "epoch": 0.66, "grad_norm": 1.9752769027783192, "learning_rate": 3.793114535828134e-06, "loss": 0.5189, "step": 745 }, { "epoch": 0.66, "grad_norm": 1.9908258845677271, "learning_rate": 3.7901231819133104e-06, "loss": 0.5863, "step": 746 }, { "epoch": 0.66, "grad_norm": 1.8419144313470388, "learning_rate": 3.787129308359963e-06, "loss": 0.5596, "step": 747 }, { "epoch": 0.66, "grad_norm": 1.8578409208981632, "learning_rate": 3.7841329210151905e-06, "loss": 0.5757, "step": 748 }, { "epoch": 0.66, "grad_norm": 1.8125362585272666, "learning_rate": 3.7811340257310036e-06, "loss": 0.5625, "step": 749 }, { "epoch": 0.66, "grad_norm": 1.8266843142853604, "learning_rate": 3.778132628364309e-06, "loss": 0.5121, "step": 750 }, { "epoch": 0.67, "grad_norm": 1.9286747700189457, "learning_rate": 3.7751287347769006e-06, "loss": 0.5856, "step": 751 }, { "epoch": 0.67, "grad_norm": 1.8358169963837994, "learning_rate": 3.772122350835447e-06, "loss": 0.5363, "step": 752 }, { "epoch": 0.67, "grad_norm": 1.8751145280860322, "learning_rate": 3.769113482411483e-06, "loss": 0.5435, "step": 753 }, { "epoch": 0.67, "grad_norm": 1.7372022137266947, "learning_rate": 3.766102135381393e-06, "loss": 0.5114, "step": 754 }, { "epoch": 0.67, "grad_norm": 1.848532567966691, "learning_rate": 3.763088315626402e-06, "loss": 0.4887, "step": 755 }, { "epoch": 0.67, "grad_norm": 1.8724024281108291, "learning_rate": 3.7600720290325666e-06, "loss": 0.5681, "step": 756 }, { "epoch": 0.67, "grad_norm": 1.7564274203136065, "learning_rate": 3.757053281490759e-06, "loss": 0.5365, "step": 757 }, { "epoch": 0.67, "grad_norm": 1.7090468035537372, "learning_rate": 3.75403207889666e-06, "loss": 0.4976, "step": 758 }, { "epoch": 0.67, "grad_norm": 1.8628034310476902, "learning_rate": 3.7510084271507417e-06, "loss": 0.5908, "step": 759 }, { "epoch": 0.67, "grad_norm": 1.8673457440060792, "learning_rate": 3.7479823321582624e-06, "loss": 0.5641, "step": 760 }, { "epoch": 0.67, "grad_norm": 1.8378062191959523, "learning_rate": 3.744953799829252e-06, "loss": 0.5175, "step": 761 }, { "epoch": 0.67, "grad_norm": 1.779154712157358, "learning_rate": 3.7419228360784987e-06, "loss": 0.5539, "step": 762 }, { "epoch": 0.68, "grad_norm": 2.1820639181555315, "learning_rate": 3.73888944682554e-06, "loss": 0.5247, "step": 763 }, { "epoch": 0.68, "grad_norm": 1.927216958283792, "learning_rate": 3.735853637994652e-06, "loss": 0.5851, "step": 764 }, { "epoch": 0.68, "grad_norm": 1.7670365768745326, "learning_rate": 3.732815415514834e-06, "loss": 0.5829, "step": 765 }, { "epoch": 0.68, "grad_norm": 1.825202964363253, "learning_rate": 3.729774785319801e-06, "loss": 0.5257, "step": 766 }, { "epoch": 0.68, "grad_norm": 1.8200852022234557, "learning_rate": 3.72673175334797e-06, "loss": 0.55, "step": 767 }, { "epoch": 0.68, "grad_norm": 1.9436493930137209, "learning_rate": 3.723686325542448e-06, "loss": 0.5583, "step": 768 }, { "epoch": 0.68, "grad_norm": 1.7581670709714554, "learning_rate": 3.7206385078510204e-06, "loss": 0.5267, "step": 769 }, { "epoch": 0.68, "grad_norm": 1.9439324051591973, "learning_rate": 3.717588306226143e-06, "loss": 0.5686, "step": 770 }, { "epoch": 0.68, "grad_norm": 1.8154349894294908, "learning_rate": 3.7145357266249248e-06, "loss": 0.5668, "step": 771 }, { "epoch": 0.68, "grad_norm": 1.829602382975092, "learning_rate": 3.7114807750091198e-06, "loss": 0.5096, "step": 772 }, { "epoch": 0.68, "grad_norm": 1.7902487805325054, "learning_rate": 3.7084234573451145e-06, "loss": 0.5387, "step": 773 }, { "epoch": 0.69, "grad_norm": 1.7734778927084154, "learning_rate": 3.7053637796039173e-06, "loss": 0.5227, "step": 774 }, { "epoch": 0.69, "grad_norm": 1.8359664701196194, "learning_rate": 3.7023017477611444e-06, "loss": 0.5183, "step": 775 }, { "epoch": 0.69, "grad_norm": 2.049839823780983, "learning_rate": 3.699237367797011e-06, "loss": 0.5158, "step": 776 }, { "epoch": 0.69, "grad_norm": 1.839740383172249, "learning_rate": 3.6961706456963166e-06, "loss": 0.509, "step": 777 }, { "epoch": 0.69, "grad_norm": 1.7742492301936488, "learning_rate": 3.693101587448436e-06, "loss": 0.547, "step": 778 }, { "epoch": 0.69, "grad_norm": 1.7626686489679533, "learning_rate": 3.6900301990473074e-06, "loss": 0.5501, "step": 779 }, { "epoch": 0.69, "grad_norm": 1.81358040457354, "learning_rate": 3.686956486491419e-06, "loss": 0.5258, "step": 780 }, { "epoch": 0.69, "grad_norm": 1.8446309626844912, "learning_rate": 3.6838804557837972e-06, "loss": 0.5438, "step": 781 }, { "epoch": 0.69, "grad_norm": 1.8020540676799555, "learning_rate": 3.680802112931996e-06, "loss": 0.5333, "step": 782 }, { "epoch": 0.69, "grad_norm": 1.8177001575706107, "learning_rate": 3.677721463948087e-06, "loss": 0.5194, "step": 783 }, { "epoch": 0.69, "grad_norm": 1.7662648614084315, "learning_rate": 3.6746385148486437e-06, "loss": 0.5229, "step": 784 }, { "epoch": 0.7, "grad_norm": 1.7914748738808024, "learning_rate": 3.6715532716547325e-06, "loss": 0.5443, "step": 785 }, { "epoch": 0.7, "grad_norm": 1.6582914688424026, "learning_rate": 3.6684657403919005e-06, "loss": 0.4672, "step": 786 }, { "epoch": 0.7, "grad_norm": 1.8779379042503213, "learning_rate": 3.6653759270901634e-06, "loss": 0.5361, "step": 787 }, { "epoch": 0.7, "grad_norm": 1.843796012903189, "learning_rate": 3.6622838377839927e-06, "loss": 0.5903, "step": 788 }, { "epoch": 0.7, "grad_norm": 1.7389903959091482, "learning_rate": 3.6591894785123065e-06, "loss": 0.5232, "step": 789 }, { "epoch": 0.7, "grad_norm": 2.1531271375101912, "learning_rate": 3.6560928553184556e-06, "loss": 0.5811, "step": 790 }, { "epoch": 0.7, "grad_norm": 1.8744519871212226, "learning_rate": 3.6529939742502114e-06, "loss": 0.5094, "step": 791 }, { "epoch": 0.7, "grad_norm": 2.1796693544184405, "learning_rate": 3.649892841359756e-06, "loss": 0.5324, "step": 792 }, { "epoch": 0.7, "grad_norm": 1.7983464824305884, "learning_rate": 3.6467894627036697e-06, "loss": 0.5406, "step": 793 }, { "epoch": 0.7, "grad_norm": 1.8385213368207254, "learning_rate": 3.6436838443429177e-06, "loss": 0.5116, "step": 794 }, { "epoch": 0.7, "grad_norm": 1.8303911353695022, "learning_rate": 3.64057599234284e-06, "loss": 0.5032, "step": 795 }, { "epoch": 0.71, "grad_norm": 1.8212476470235475, "learning_rate": 3.6374659127731394e-06, "loss": 0.4977, "step": 796 }, { "epoch": 0.71, "grad_norm": 1.788273127421183, "learning_rate": 3.6343536117078674e-06, "loss": 0.5132, "step": 797 }, { "epoch": 0.71, "grad_norm": 1.792471501776643, "learning_rate": 3.631239095225417e-06, "loss": 0.6034, "step": 798 }, { "epoch": 0.71, "grad_norm": 1.7180614128401976, "learning_rate": 3.6281223694085055e-06, "loss": 0.5125, "step": 799 }, { "epoch": 0.71, "grad_norm": 1.968143388774121, "learning_rate": 3.625003440344166e-06, "loss": 0.5192, "step": 800 }, { "epoch": 0.71, "grad_norm": 1.78698643398069, "learning_rate": 3.6218823141237346e-06, "loss": 0.5389, "step": 801 }, { "epoch": 0.71, "grad_norm": 1.7360516235744345, "learning_rate": 3.6187589968428388e-06, "loss": 0.55, "step": 802 }, { "epoch": 0.71, "grad_norm": 1.8926768947040113, "learning_rate": 3.6156334946013844e-06, "loss": 0.5402, "step": 803 }, { "epoch": 0.71, "grad_norm": 1.7341073776764506, "learning_rate": 3.612505813503545e-06, "loss": 0.5156, "step": 804 }, { "epoch": 0.71, "grad_norm": 1.9037532755321576, "learning_rate": 3.6093759596577493e-06, "loss": 0.5035, "step": 805 }, { "epoch": 0.71, "grad_norm": 1.768026916515408, "learning_rate": 3.60624393917667e-06, "loss": 0.5317, "step": 806 }, { "epoch": 0.71, "grad_norm": 1.8195441338851683, "learning_rate": 3.6031097581772123e-06, "loss": 0.5173, "step": 807 }, { "epoch": 0.72, "grad_norm": 1.7451524294172138, "learning_rate": 3.599973422780497e-06, "loss": 0.5447, "step": 808 }, { "epoch": 0.72, "grad_norm": 1.7994216931464604, "learning_rate": 3.5968349391118573e-06, "loss": 0.5468, "step": 809 }, { "epoch": 0.72, "grad_norm": 1.7827289419599717, "learning_rate": 3.5936943133008183e-06, "loss": 0.5036, "step": 810 }, { "epoch": 0.72, "grad_norm": 1.9123669337349365, "learning_rate": 3.590551551481091e-06, "loss": 0.4994, "step": 811 }, { "epoch": 0.72, "grad_norm": 1.909151918011393, "learning_rate": 3.5874066597905573e-06, "loss": 0.5437, "step": 812 }, { "epoch": 0.72, "grad_norm": 1.8314316190947115, "learning_rate": 3.5842596443712586e-06, "loss": 0.5327, "step": 813 }, { "epoch": 0.72, "grad_norm": 2.025080353968657, "learning_rate": 3.581110511369384e-06, "loss": 0.5207, "step": 814 }, { "epoch": 0.72, "grad_norm": 1.719830501688002, "learning_rate": 3.5779592669352588e-06, "loss": 0.5043, "step": 815 }, { "epoch": 0.72, "grad_norm": 1.8856042934205883, "learning_rate": 3.574805917223332e-06, "loss": 0.534, "step": 816 }, { "epoch": 0.72, "grad_norm": 1.8669902777268896, "learning_rate": 3.5716504683921626e-06, "loss": 0.5487, "step": 817 }, { "epoch": 0.72, "grad_norm": 1.8420217203623648, "learning_rate": 3.568492926604412e-06, "loss": 0.4655, "step": 818 }, { "epoch": 0.73, "grad_norm": 1.8587298766263622, "learning_rate": 3.5653332980268267e-06, "loss": 0.5308, "step": 819 }, { "epoch": 0.73, "grad_norm": 1.8329162913986954, "learning_rate": 3.562171588830231e-06, "loss": 0.5061, "step": 820 }, { "epoch": 0.73, "grad_norm": 1.7226245016695787, "learning_rate": 3.5590078051895105e-06, "loss": 0.5022, "step": 821 }, { "epoch": 0.73, "grad_norm": 1.7947516408265423, "learning_rate": 3.555841953283603e-06, "loss": 0.5059, "step": 822 }, { "epoch": 0.73, "grad_norm": 1.7754650010913384, "learning_rate": 3.552674039295486e-06, "loss": 0.5183, "step": 823 }, { "epoch": 0.73, "grad_norm": 2.0058342412884267, "learning_rate": 3.5495040694121644e-06, "loss": 0.5717, "step": 824 }, { "epoch": 0.73, "grad_norm": 1.8536876200790606, "learning_rate": 3.546332049824659e-06, "loss": 0.5445, "step": 825 }, { "epoch": 0.73, "grad_norm": 1.9446394955278312, "learning_rate": 3.543157986727991e-06, "loss": 0.5778, "step": 826 }, { "epoch": 0.73, "grad_norm": 1.7769561446293407, "learning_rate": 3.5399818863211747e-06, "loss": 0.5209, "step": 827 }, { "epoch": 0.73, "grad_norm": 1.7847626696288204, "learning_rate": 3.5368037548072042e-06, "loss": 0.5684, "step": 828 }, { "epoch": 0.73, "grad_norm": 1.856855628494933, "learning_rate": 3.5336235983930383e-06, "loss": 0.5277, "step": 829 }, { "epoch": 0.74, "grad_norm": 1.799135122090622, "learning_rate": 3.530441423289591e-06, "loss": 0.53, "step": 830 }, { "epoch": 0.74, "grad_norm": 1.7372348199564838, "learning_rate": 3.5272572357117208e-06, "loss": 0.5082, "step": 831 }, { "epoch": 0.74, "grad_norm": 1.7713730143331359, "learning_rate": 3.5240710418782137e-06, "loss": 0.5127, "step": 832 }, { "epoch": 0.74, "grad_norm": 1.808116845193293, "learning_rate": 3.520882848011775e-06, "loss": 0.5339, "step": 833 }, { "epoch": 0.74, "grad_norm": 1.8168585745209507, "learning_rate": 3.5176926603390176e-06, "loss": 0.5773, "step": 834 }, { "epoch": 0.74, "grad_norm": 1.8433472787266432, "learning_rate": 3.514500485090446e-06, "loss": 0.5446, "step": 835 }, { "epoch": 0.74, "grad_norm": 1.7473743951502463, "learning_rate": 3.511306328500449e-06, "loss": 0.5182, "step": 836 }, { "epoch": 0.74, "grad_norm": 1.9068925551475813, "learning_rate": 3.5081101968072818e-06, "loss": 0.5428, "step": 837 }, { "epoch": 0.74, "grad_norm": 1.8621077674572017, "learning_rate": 3.5049120962530608e-06, "loss": 0.5783, "step": 838 }, { "epoch": 0.74, "grad_norm": 1.8188442080835585, "learning_rate": 3.501712033083744e-06, "loss": 0.559, "step": 839 }, { "epoch": 0.74, "grad_norm": 1.9008658249988244, "learning_rate": 3.4985100135491245e-06, "loss": 0.5322, "step": 840 }, { "epoch": 0.74, "grad_norm": 1.8107617898563186, "learning_rate": 3.495306043902817e-06, "loss": 0.592, "step": 841 }, { "epoch": 0.75, "grad_norm": 1.8972175021059394, "learning_rate": 3.4921001304022422e-06, "loss": 0.527, "step": 842 }, { "epoch": 0.75, "grad_norm": 1.773730752308571, "learning_rate": 3.4888922793086192e-06, "loss": 0.5422, "step": 843 }, { "epoch": 0.75, "grad_norm": 1.8207201600566427, "learning_rate": 3.4856824968869506e-06, "loss": 0.5463, "step": 844 }, { "epoch": 0.75, "grad_norm": 1.7825701352278942, "learning_rate": 3.4824707894060108e-06, "loss": 0.5376, "step": 845 }, { "epoch": 0.75, "grad_norm": 1.8186780308546509, "learning_rate": 3.4792571631383345e-06, "loss": 0.5448, "step": 846 }, { "epoch": 0.75, "grad_norm": 1.7196535770637023, "learning_rate": 3.4760416243602034e-06, "loss": 0.5719, "step": 847 }, { "epoch": 0.75, "grad_norm": 1.7996950762262636, "learning_rate": 3.4728241793516345e-06, "loss": 0.575, "step": 848 }, { "epoch": 0.75, "grad_norm": 1.8460755337411012, "learning_rate": 3.4696048343963667e-06, "loss": 0.5303, "step": 849 }, { "epoch": 0.75, "grad_norm": 1.8518850346827596, "learning_rate": 3.4663835957818515e-06, "loss": 0.5294, "step": 850 }, { "epoch": 0.75, "grad_norm": 1.761477307422264, "learning_rate": 3.463160469799237e-06, "loss": 0.5303, "step": 851 }, { "epoch": 0.75, "grad_norm": 1.8476905525063971, "learning_rate": 3.459935462743359e-06, "loss": 0.5365, "step": 852 }, { "epoch": 0.76, "grad_norm": 1.7748738324934357, "learning_rate": 3.4567085809127247e-06, "loss": 0.5581, "step": 853 }, { "epoch": 0.76, "grad_norm": 1.69994493873254, "learning_rate": 3.4534798306095054e-06, "loss": 0.5142, "step": 854 }, { "epoch": 0.76, "grad_norm": 1.7867273775159276, "learning_rate": 3.45024921813952e-06, "loss": 0.5397, "step": 855 }, { "epoch": 0.76, "grad_norm": 1.8894059211718275, "learning_rate": 3.4470167498122253e-06, "loss": 0.5327, "step": 856 }, { "epoch": 0.76, "grad_norm": 1.8759154191563252, "learning_rate": 3.4437824319407003e-06, "loss": 0.5091, "step": 857 }, { "epoch": 0.76, "grad_norm": 1.7992806971923871, "learning_rate": 3.4405462708416393e-06, "loss": 0.5206, "step": 858 }, { "epoch": 0.76, "grad_norm": 1.8238604800708562, "learning_rate": 3.437308272835335e-06, "loss": 0.5452, "step": 859 }, { "epoch": 0.76, "grad_norm": 1.8504559231955047, "learning_rate": 3.4340684442456673e-06, "loss": 0.4953, "step": 860 }, { "epoch": 0.76, "grad_norm": 1.754272242495459, "learning_rate": 3.4308267914000915e-06, "loss": 0.5897, "step": 861 }, { "epoch": 0.76, "grad_norm": 1.8733571713304673, "learning_rate": 3.427583320629626e-06, "loss": 0.4897, "step": 862 }, { "epoch": 0.76, "grad_norm": 1.8284259921968489, "learning_rate": 3.4243380382688395e-06, "loss": 0.5285, "step": 863 }, { "epoch": 0.77, "grad_norm": 2.4115721951019933, "learning_rate": 3.4210909506558383e-06, "loss": 0.5327, "step": 864 }, { "epoch": 0.77, "grad_norm": 1.827035801006768, "learning_rate": 3.4178420641322564e-06, "loss": 0.5959, "step": 865 }, { "epoch": 0.77, "grad_norm": 1.7275971455556467, "learning_rate": 3.414591385043237e-06, "loss": 0.5378, "step": 866 }, { "epoch": 0.77, "grad_norm": 1.8399392023051784, "learning_rate": 3.411338919737429e-06, "loss": 0.4737, "step": 867 }, { "epoch": 0.77, "grad_norm": 1.768301025681768, "learning_rate": 3.408084674566967e-06, "loss": 0.5237, "step": 868 }, { "epoch": 0.77, "grad_norm": 1.7940606795442973, "learning_rate": 3.404828655887462e-06, "loss": 0.5199, "step": 869 }, { "epoch": 0.77, "grad_norm": 1.736302967715387, "learning_rate": 3.4015708700579893e-06, "loss": 0.5103, "step": 870 }, { "epoch": 0.77, "grad_norm": 1.864705554020529, "learning_rate": 3.398311323441075e-06, "loss": 0.5456, "step": 871 }, { "epoch": 0.77, "grad_norm": 1.7371337216784375, "learning_rate": 3.3950500224026838e-06, "loss": 0.54, "step": 872 }, { "epoch": 0.77, "grad_norm": 1.7936602187941955, "learning_rate": 3.3917869733122082e-06, "loss": 0.5079, "step": 873 }, { "epoch": 0.77, "grad_norm": 1.78627252413609, "learning_rate": 3.3885221825424535e-06, "loss": 0.5272, "step": 874 }, { "epoch": 0.78, "grad_norm": 2.0255442379828588, "learning_rate": 3.385255656469627e-06, "loss": 0.5451, "step": 875 }, { "epoch": 0.78, "grad_norm": 1.8151205951225127, "learning_rate": 3.3819874014733245e-06, "loss": 0.545, "step": 876 }, { "epoch": 0.78, "grad_norm": 1.7644602173142565, "learning_rate": 3.3787174239365183e-06, "loss": 0.5021, "step": 877 }, { "epoch": 0.78, "grad_norm": 1.88690726704404, "learning_rate": 3.3754457302455464e-06, "loss": 0.5518, "step": 878 }, { "epoch": 0.78, "grad_norm": 1.9466161438131033, "learning_rate": 3.372172326790097e-06, "loss": 0.5499, "step": 879 }, { "epoch": 0.78, "grad_norm": 1.7759200801637758, "learning_rate": 3.3688972199631974e-06, "loss": 0.5165, "step": 880 }, { "epoch": 0.78, "grad_norm": 1.7404813059594972, "learning_rate": 3.365620416161204e-06, "loss": 0.4914, "step": 881 }, { "epoch": 0.78, "grad_norm": 1.7186493344503415, "learning_rate": 3.3623419217837836e-06, "loss": 0.4742, "step": 882 }, { "epoch": 0.78, "grad_norm": 1.688196680775216, "learning_rate": 3.3590617432339077e-06, "loss": 0.4973, "step": 883 }, { "epoch": 0.78, "grad_norm": 1.9998510596311416, "learning_rate": 3.355779886917836e-06, "loss": 0.4844, "step": 884 }, { "epoch": 0.78, "grad_norm": 1.9138346820930676, "learning_rate": 3.3524963592451048e-06, "loss": 0.5767, "step": 885 }, { "epoch": 0.78, "grad_norm": 1.8240977441306703, "learning_rate": 3.349211166628515e-06, "loss": 0.5535, "step": 886 }, { "epoch": 0.79, "grad_norm": 1.866188876988342, "learning_rate": 3.3459243154841194e-06, "loss": 0.5293, "step": 887 }, { "epoch": 0.79, "grad_norm": 1.8428560106324356, "learning_rate": 3.342635812231208e-06, "loss": 0.5545, "step": 888 }, { "epoch": 0.79, "grad_norm": 1.946339663223573, "learning_rate": 3.3393456632922997e-06, "loss": 0.5662, "step": 889 }, { "epoch": 0.79, "grad_norm": 1.7835322668971936, "learning_rate": 3.3360538750931277e-06, "loss": 0.5343, "step": 890 }, { "epoch": 0.79, "grad_norm": 1.8985737358987655, "learning_rate": 3.3327604540626245e-06, "loss": 0.4882, "step": 891 }, { "epoch": 0.79, "grad_norm": 1.7452799601454962, "learning_rate": 3.3294654066329125e-06, "loss": 0.4847, "step": 892 }, { "epoch": 0.79, "grad_norm": 1.8001237054125527, "learning_rate": 3.3261687392392917e-06, "loss": 0.5294, "step": 893 }, { "epoch": 0.79, "grad_norm": 1.878202857326882, "learning_rate": 3.3228704583202244e-06, "loss": 0.5506, "step": 894 }, { "epoch": 0.79, "grad_norm": 1.9555722164046163, "learning_rate": 3.319570570317324e-06, "loss": 0.5675, "step": 895 }, { "epoch": 0.79, "grad_norm": 1.842178231242227, "learning_rate": 3.316269081675345e-06, "loss": 0.507, "step": 896 }, { "epoch": 0.79, "grad_norm": 1.7925971037996111, "learning_rate": 3.3129659988421646e-06, "loss": 0.544, "step": 897 }, { "epoch": 0.8, "grad_norm": 1.8448861762114805, "learning_rate": 3.309661328268776e-06, "loss": 0.5547, "step": 898 }, { "epoch": 0.8, "grad_norm": 1.8798388041152536, "learning_rate": 3.3063550764092722e-06, "loss": 0.5535, "step": 899 }, { "epoch": 0.8, "grad_norm": 2.111205651077239, "learning_rate": 3.3030472497208354e-06, "loss": 0.5372, "step": 900 }, { "epoch": 0.8, "grad_norm": 1.9023950174091275, "learning_rate": 3.2997378546637217e-06, "loss": 0.5183, "step": 901 }, { "epoch": 0.8, "grad_norm": 1.828168427249714, "learning_rate": 3.296426897701251e-06, "loss": 0.5139, "step": 902 }, { "epoch": 0.8, "grad_norm": 1.752269482139502, "learning_rate": 3.293114385299795e-06, "loss": 0.4977, "step": 903 }, { "epoch": 0.8, "grad_norm": 1.8319951115110833, "learning_rate": 3.2898003239287626e-06, "loss": 0.4762, "step": 904 }, { "epoch": 0.8, "grad_norm": 1.9203452380089554, "learning_rate": 3.2864847200605864e-06, "loss": 0.5328, "step": 905 }, { "epoch": 0.8, "grad_norm": 1.9603318007718882, "learning_rate": 3.2831675801707126e-06, "loss": 0.5114, "step": 906 }, { "epoch": 0.8, "grad_norm": 1.772386222577394, "learning_rate": 3.2798489107375875e-06, "loss": 0.5365, "step": 907 }, { "epoch": 0.8, "grad_norm": 1.7664388279000272, "learning_rate": 3.2765287182426445e-06, "loss": 0.5218, "step": 908 }, { "epoch": 0.81, "grad_norm": 1.705238499414661, "learning_rate": 3.2732070091702928e-06, "loss": 0.515, "step": 909 }, { "epoch": 0.81, "grad_norm": 1.8346490363510246, "learning_rate": 3.2698837900078995e-06, "loss": 0.5032, "step": 910 }, { "epoch": 0.81, "grad_norm": 2.1169074366870504, "learning_rate": 3.2665590672457853e-06, "loss": 0.5463, "step": 911 }, { "epoch": 0.81, "grad_norm": 1.9794978557420737, "learning_rate": 3.263232847377205e-06, "loss": 0.5556, "step": 912 }, { "epoch": 0.81, "grad_norm": 1.8775372141713855, "learning_rate": 3.2599051368983393e-06, "loss": 0.5479, "step": 913 }, { "epoch": 0.81, "grad_norm": 1.9608965084656977, "learning_rate": 3.256575942308278e-06, "loss": 0.4934, "step": 914 }, { "epoch": 0.81, "grad_norm": 1.9035969324400404, "learning_rate": 3.2532452701090107e-06, "loss": 0.494, "step": 915 }, { "epoch": 0.81, "grad_norm": 1.8348725792159002, "learning_rate": 3.2499131268054114e-06, "loss": 0.5101, "step": 916 }, { "epoch": 0.81, "grad_norm": 1.837442323872043, "learning_rate": 3.2465795189052283e-06, "loss": 0.5028, "step": 917 }, { "epoch": 0.81, "grad_norm": 2.0588580347681114, "learning_rate": 3.2432444529190714e-06, "loss": 0.5572, "step": 918 }, { "epoch": 0.81, "grad_norm": 1.800197863385395, "learning_rate": 3.2399079353603958e-06, "loss": 0.5456, "step": 919 }, { "epoch": 0.81, "grad_norm": 1.8642409261562531, "learning_rate": 3.236569972745492e-06, "loss": 0.4677, "step": 920 }, { "epoch": 0.82, "grad_norm": 1.8605177191737032, "learning_rate": 3.2332305715934735e-06, "loss": 0.5086, "step": 921 }, { "epoch": 0.82, "grad_norm": 1.8779408638935786, "learning_rate": 3.229889738426264e-06, "loss": 0.4576, "step": 922 }, { "epoch": 0.82, "grad_norm": 1.8069917958596904, "learning_rate": 3.226547479768582e-06, "loss": 0.4847, "step": 923 }, { "epoch": 0.82, "grad_norm": 1.949377976689351, "learning_rate": 3.2232038021479317e-06, "loss": 0.5095, "step": 924 }, { "epoch": 0.82, "grad_norm": 1.9043326063097796, "learning_rate": 3.2198587120945878e-06, "loss": 0.5382, "step": 925 }, { "epoch": 0.82, "grad_norm": 1.8420984644699558, "learning_rate": 3.2165122161415844e-06, "loss": 0.5354, "step": 926 }, { "epoch": 0.82, "grad_norm": 1.9159042477860826, "learning_rate": 3.2131643208246994e-06, "loss": 0.5676, "step": 927 }, { "epoch": 0.82, "grad_norm": 1.8091292349745058, "learning_rate": 3.209815032682445e-06, "loss": 0.5152, "step": 928 }, { "epoch": 0.82, "grad_norm": 1.9172852365194688, "learning_rate": 3.206464358256054e-06, "loss": 0.4965, "step": 929 }, { "epoch": 0.82, "grad_norm": 1.8611473653995623, "learning_rate": 3.2031123040894658e-06, "loss": 0.5222, "step": 930 }, { "epoch": 0.82, "grad_norm": 2.0718827285873, "learning_rate": 3.1997588767293146e-06, "loss": 0.5512, "step": 931 }, { "epoch": 0.83, "grad_norm": 1.8367854431958046, "learning_rate": 3.196404082724918e-06, "loss": 0.522, "step": 932 }, { "epoch": 0.83, "grad_norm": 1.9326854247843166, "learning_rate": 3.19304792862826e-06, "loss": 0.5262, "step": 933 }, { "epoch": 0.83, "grad_norm": 1.8127395054303974, "learning_rate": 3.1896904209939827e-06, "loss": 0.4792, "step": 934 }, { "epoch": 0.83, "grad_norm": 1.7562676297882738, "learning_rate": 3.1863315663793715e-06, "loss": 0.5132, "step": 935 }, { "epoch": 0.83, "grad_norm": 2.1115973982625826, "learning_rate": 3.182971371344342e-06, "loss": 0.5431, "step": 936 }, { "epoch": 0.83, "grad_norm": 1.9125267865316575, "learning_rate": 3.179609842451428e-06, "loss": 0.5049, "step": 937 }, { "epoch": 0.83, "grad_norm": 1.8084301603852846, "learning_rate": 3.1762469862657673e-06, "loss": 0.5057, "step": 938 }, { "epoch": 0.83, "grad_norm": 1.979887599791109, "learning_rate": 3.172882809355092e-06, "loss": 0.5076, "step": 939 }, { "epoch": 0.83, "grad_norm": 1.8023843851685244, "learning_rate": 3.1695173182897126e-06, "loss": 0.507, "step": 940 }, { "epoch": 0.83, "grad_norm": 1.894018453771296, "learning_rate": 3.166150519642506e-06, "loss": 0.4892, "step": 941 }, { "epoch": 0.83, "grad_norm": 2.085200027059979, "learning_rate": 3.162782419988901e-06, "loss": 0.5109, "step": 942 }, { "epoch": 0.84, "grad_norm": 1.9145317338940404, "learning_rate": 3.1594130259068723e-06, "loss": 0.5597, "step": 943 }, { "epoch": 0.84, "grad_norm": 2.6898725390450196, "learning_rate": 3.1560423439769173e-06, "loss": 0.5364, "step": 944 }, { "epoch": 0.84, "grad_norm": 1.8953702977370355, "learning_rate": 3.152670380782052e-06, "loss": 0.5402, "step": 945 }, { "epoch": 0.84, "grad_norm": 1.8989394358006901, "learning_rate": 3.1492971429077924e-06, "loss": 0.499, "step": 946 }, { "epoch": 0.84, "grad_norm": 1.8295299154755171, "learning_rate": 3.1459226369421465e-06, "loss": 0.5133, "step": 947 }, { "epoch": 0.84, "grad_norm": 1.9849867895935545, "learning_rate": 3.1425468694755968e-06, "loss": 0.5173, "step": 948 }, { "epoch": 0.84, "grad_norm": 1.7806451045050948, "learning_rate": 3.13916984710109e-06, "loss": 0.5314, "step": 949 }, { "epoch": 0.84, "grad_norm": 1.8227836319972825, "learning_rate": 3.1357915764140247e-06, "loss": 0.5413, "step": 950 }, { "epoch": 0.84, "grad_norm": 1.873012898370893, "learning_rate": 3.1324120640122362e-06, "loss": 0.5582, "step": 951 }, { "epoch": 0.84, "grad_norm": 1.7312834865810094, "learning_rate": 3.129031316495986e-06, "loss": 0.4969, "step": 952 }, { "epoch": 0.84, "grad_norm": 1.850102780247153, "learning_rate": 3.1256493404679468e-06, "loss": 0.4981, "step": 953 }, { "epoch": 0.84, "grad_norm": 1.85121227661343, "learning_rate": 3.122266142533191e-06, "loss": 0.4926, "step": 954 }, { "epoch": 0.85, "grad_norm": 1.911516866472808, "learning_rate": 3.118881729299178e-06, "loss": 0.5141, "step": 955 }, { "epoch": 0.85, "grad_norm": 1.9562838385609387, "learning_rate": 3.1154961073757388e-06, "loss": 0.5119, "step": 956 }, { "epoch": 0.85, "grad_norm": 1.9792813407411627, "learning_rate": 3.1121092833750684e-06, "loss": 0.5379, "step": 957 }, { "epoch": 0.85, "grad_norm": 2.02442320634539, "learning_rate": 3.1087212639117057e-06, "loss": 0.5516, "step": 958 }, { "epoch": 0.85, "grad_norm": 1.9139240600717167, "learning_rate": 3.1053320556025272e-06, "loss": 0.5035, "step": 959 }, { "epoch": 0.85, "grad_norm": 1.6820068229198286, "learning_rate": 3.10194166506673e-06, "loss": 0.5082, "step": 960 }, { "epoch": 0.85, "grad_norm": 1.837945615423465, "learning_rate": 3.098550098925819e-06, "loss": 0.5301, "step": 961 }, { "epoch": 0.85, "grad_norm": 1.8297516724958631, "learning_rate": 3.095157363803598e-06, "loss": 0.531, "step": 962 }, { "epoch": 0.85, "grad_norm": 1.8057255627930757, "learning_rate": 3.091763466326152e-06, "loss": 0.4962, "step": 963 }, { "epoch": 0.85, "grad_norm": 1.8568993199742134, "learning_rate": 3.0883684131218356e-06, "loss": 0.5555, "step": 964 }, { "epoch": 0.85, "grad_norm": 1.7537389006494144, "learning_rate": 3.084972210821261e-06, "loss": 0.4783, "step": 965 }, { "epoch": 0.86, "grad_norm": 1.936835841446932, "learning_rate": 3.0815748660572856e-06, "loss": 0.5696, "step": 966 }, { "epoch": 0.86, "grad_norm": 1.818312553754802, "learning_rate": 3.078176385464997e-06, "loss": 0.5125, "step": 967 }, { "epoch": 0.86, "grad_norm": 1.9098144545445246, "learning_rate": 3.074776775681702e-06, "loss": 0.5472, "step": 968 }, { "epoch": 0.86, "grad_norm": 1.8530900425697827, "learning_rate": 3.071376043346912e-06, "loss": 0.5387, "step": 969 }, { "epoch": 0.86, "grad_norm": 1.734080732564932, "learning_rate": 3.0679741951023302e-06, "loss": 0.5082, "step": 970 }, { "epoch": 0.86, "grad_norm": 1.7157271380716255, "learning_rate": 3.06457123759184e-06, "loss": 0.5057, "step": 971 }, { "epoch": 0.86, "grad_norm": 1.8615941154610314, "learning_rate": 3.061167177461492e-06, "loss": 0.5326, "step": 972 }, { "epoch": 0.86, "grad_norm": 1.8820053895933144, "learning_rate": 3.0577620213594888e-06, "loss": 0.5446, "step": 973 }, { "epoch": 0.86, "grad_norm": 1.8157963098312144, "learning_rate": 3.0543557759361735e-06, "loss": 0.5627, "step": 974 }, { "epoch": 0.86, "grad_norm": 1.7642611841801312, "learning_rate": 3.0509484478440187e-06, "loss": 0.5062, "step": 975 }, { "epoch": 0.86, "grad_norm": 1.6839843509551078, "learning_rate": 3.047540043737609e-06, "loss": 0.526, "step": 976 }, { "epoch": 0.87, "grad_norm": 1.9004464286881788, "learning_rate": 3.0441305702736314e-06, "loss": 0.5617, "step": 977 }, { "epoch": 0.87, "grad_norm": 1.9767954561122347, "learning_rate": 3.0407200341108618e-06, "loss": 0.5077, "step": 978 }, { "epoch": 0.87, "grad_norm": 1.825193444039661, "learning_rate": 3.0373084419101506e-06, "loss": 0.5097, "step": 979 }, { "epoch": 0.87, "grad_norm": 1.6810496770660706, "learning_rate": 3.0338958003344115e-06, "loss": 0.4993, "step": 980 }, { "epoch": 0.87, "grad_norm": 1.7411591022211208, "learning_rate": 3.0304821160486086e-06, "loss": 0.4789, "step": 981 }, { "epoch": 0.87, "grad_norm": 1.7580191857406102, "learning_rate": 3.0270673957197393e-06, "loss": 0.5225, "step": 982 }, { "epoch": 0.87, "grad_norm": 1.7440391739784626, "learning_rate": 3.023651646016828e-06, "loss": 0.5281, "step": 983 }, { "epoch": 0.87, "grad_norm": 1.8458326991098015, "learning_rate": 3.0202348736109074e-06, "loss": 0.5419, "step": 984 }, { "epoch": 0.87, "grad_norm": 1.7105130101397825, "learning_rate": 3.0168170851750077e-06, "loss": 0.5113, "step": 985 }, { "epoch": 0.87, "grad_norm": 1.74741112552671, "learning_rate": 3.013398287384144e-06, "loss": 0.5389, "step": 986 }, { "epoch": 0.87, "grad_norm": 1.7962043049830843, "learning_rate": 3.009978486915302e-06, "loss": 0.5212, "step": 987 }, { "epoch": 0.88, "grad_norm": 1.698744627764944, "learning_rate": 3.006557690447427e-06, "loss": 0.508, "step": 988 }, { "epoch": 0.88, "grad_norm": 1.852219826000981, "learning_rate": 3.0031359046614073e-06, "loss": 0.5491, "step": 989 }, { "epoch": 0.88, "grad_norm": 1.8471065567470235, "learning_rate": 2.9997131362400666e-06, "loss": 0.4937, "step": 990 }, { "epoch": 0.88, "grad_norm": 1.7925416653935446, "learning_rate": 2.996289391868144e-06, "loss": 0.4691, "step": 991 }, { "epoch": 0.88, "grad_norm": 1.8399091219230026, "learning_rate": 2.9928646782322875e-06, "loss": 0.5317, "step": 992 }, { "epoch": 0.88, "grad_norm": 1.7232111222956334, "learning_rate": 2.989439002021036e-06, "loss": 0.5152, "step": 993 }, { "epoch": 0.88, "grad_norm": 1.8514924128683583, "learning_rate": 2.986012369924811e-06, "loss": 0.573, "step": 994 }, { "epoch": 0.88, "grad_norm": 1.9226274135737127, "learning_rate": 2.982584788635897e-06, "loss": 0.5168, "step": 995 }, { "epoch": 0.88, "grad_norm": 1.7726209925124323, "learning_rate": 2.979156264848437e-06, "loss": 0.5157, "step": 996 }, { "epoch": 0.88, "grad_norm": 1.9564580777074403, "learning_rate": 2.9757268052584097e-06, "loss": 0.5693, "step": 997 }, { "epoch": 0.88, "grad_norm": 1.959067937570625, "learning_rate": 2.9722964165636263e-06, "loss": 0.5151, "step": 998 }, { "epoch": 0.88, "grad_norm": 1.9170985685573452, "learning_rate": 2.9688651054637086e-06, "loss": 0.5944, "step": 999 }, { "epoch": 0.89, "grad_norm": 1.8773917767941883, "learning_rate": 2.9654328786600823e-06, "loss": 0.5128, "step": 1000 }, { "epoch": 0.89, "grad_norm": 1.8184350092212354, "learning_rate": 2.96199974285596e-06, "loss": 0.5052, "step": 1001 }, { "epoch": 0.89, "grad_norm": 1.9785501833054495, "learning_rate": 2.9585657047563314e-06, "loss": 0.5794, "step": 1002 }, { "epoch": 0.89, "grad_norm": 1.8252999737390432, "learning_rate": 2.9551307710679467e-06, "loss": 0.5657, "step": 1003 }, { "epoch": 0.89, "grad_norm": 1.8050415950517775, "learning_rate": 2.9516949484993055e-06, "loss": 0.5054, "step": 1004 }, { "epoch": 0.89, "grad_norm": 1.7751399822789855, "learning_rate": 2.9482582437606445e-06, "loss": 0.5025, "step": 1005 }, { "epoch": 0.89, "grad_norm": 1.7388276457967873, "learning_rate": 2.9448206635639213e-06, "loss": 0.48, "step": 1006 }, { "epoch": 0.89, "grad_norm": 1.9401107131003557, "learning_rate": 2.941382214622806e-06, "loss": 0.5503, "step": 1007 }, { "epoch": 0.89, "grad_norm": 1.8055033222058048, "learning_rate": 2.937942903652663e-06, "loss": 0.5589, "step": 1008 }, { "epoch": 0.89, "grad_norm": 1.8833337691151302, "learning_rate": 2.93450273737054e-06, "loss": 0.5395, "step": 1009 }, { "epoch": 0.89, "grad_norm": 1.875491652961695, "learning_rate": 2.9310617224951594e-06, "loss": 0.5316, "step": 1010 }, { "epoch": 0.9, "grad_norm": 1.801842376116382, "learning_rate": 2.9276198657468947e-06, "loss": 0.5369, "step": 1011 }, { "epoch": 0.9, "grad_norm": 1.7434378005034878, "learning_rate": 2.9241771738477686e-06, "loss": 0.5345, "step": 1012 }, { "epoch": 0.9, "grad_norm": 1.840106192598806, "learning_rate": 2.920733653521432e-06, "loss": 0.5391, "step": 1013 }, { "epoch": 0.9, "grad_norm": 1.847462115860291, "learning_rate": 2.917289311493155e-06, "loss": 0.5176, "step": 1014 }, { "epoch": 0.9, "grad_norm": 1.7647442625122556, "learning_rate": 2.9138441544898123e-06, "loss": 0.502, "step": 1015 }, { "epoch": 0.9, "grad_norm": 1.7981764340288842, "learning_rate": 2.9103981892398698e-06, "loss": 0.5422, "step": 1016 }, { "epoch": 0.9, "grad_norm": 1.8491619302175528, "learning_rate": 2.9069514224733725e-06, "loss": 0.4993, "step": 1017 }, { "epoch": 0.9, "grad_norm": 1.8345458812848932, "learning_rate": 2.903503860921931e-06, "loss": 0.5322, "step": 1018 }, { "epoch": 0.9, "grad_norm": 1.9341637425072102, "learning_rate": 2.900055511318707e-06, "loss": 0.5338, "step": 1019 }, { "epoch": 0.9, "grad_norm": 1.8629389822642988, "learning_rate": 2.896606380398402e-06, "loss": 0.538, "step": 1020 }, { "epoch": 0.9, "grad_norm": 1.8190212821977385, "learning_rate": 2.8931564748972446e-06, "loss": 0.5417, "step": 1021 }, { "epoch": 0.91, "grad_norm": 1.808827329636345, "learning_rate": 2.8897058015529734e-06, "loss": 0.5142, "step": 1022 }, { "epoch": 0.91, "grad_norm": 1.8593710361108637, "learning_rate": 2.8862543671048288e-06, "loss": 0.5148, "step": 1023 }, { "epoch": 0.91, "grad_norm": 1.9421214945128942, "learning_rate": 2.882802178293538e-06, "loss": 0.5375, "step": 1024 }, { "epoch": 0.91, "grad_norm": 1.8337412539689857, "learning_rate": 2.879349241861299e-06, "loss": 0.5179, "step": 1025 }, { "epoch": 0.91, "grad_norm": 1.8368160375080673, "learning_rate": 2.8758955645517724e-06, "loss": 0.5404, "step": 1026 }, { "epoch": 0.91, "grad_norm": 1.8549078592919745, "learning_rate": 2.8724411531100642e-06, "loss": 0.5668, "step": 1027 }, { "epoch": 0.91, "grad_norm": 1.7783870646526379, "learning_rate": 2.8689860142827153e-06, "loss": 0.5556, "step": 1028 }, { "epoch": 0.91, "grad_norm": 1.8409533014441846, "learning_rate": 2.865530154817687e-06, "loss": 0.4876, "step": 1029 }, { "epoch": 0.91, "grad_norm": 1.786131185447664, "learning_rate": 2.8620735814643467e-06, "loss": 0.5503, "step": 1030 }, { "epoch": 0.91, "grad_norm": 1.8105893579918746, "learning_rate": 2.858616300973458e-06, "loss": 0.4895, "step": 1031 }, { "epoch": 0.91, "grad_norm": 1.8569751090297868, "learning_rate": 2.8551583200971638e-06, "loss": 0.5826, "step": 1032 }, { "epoch": 0.91, "grad_norm": 1.9048981456653071, "learning_rate": 2.8516996455889763e-06, "loss": 0.5319, "step": 1033 }, { "epoch": 0.92, "grad_norm": 1.7483385624776147, "learning_rate": 2.8482402842037615e-06, "loss": 0.4664, "step": 1034 }, { "epoch": 0.92, "grad_norm": 1.9686504696650498, "learning_rate": 2.844780242697727e-06, "loss": 0.5459, "step": 1035 }, { "epoch": 0.92, "grad_norm": 1.7882376390021062, "learning_rate": 2.8413195278284084e-06, "loss": 0.5272, "step": 1036 }, { "epoch": 0.92, "grad_norm": 1.9426980086335028, "learning_rate": 2.8378581463546578e-06, "loss": 0.4785, "step": 1037 }, { "epoch": 0.92, "grad_norm": 1.7027235807049006, "learning_rate": 2.8343961050366275e-06, "loss": 0.5295, "step": 1038 }, { "epoch": 0.92, "grad_norm": 1.7896557139712275, "learning_rate": 2.8309334106357606e-06, "loss": 0.4917, "step": 1039 }, { "epoch": 0.92, "grad_norm": 1.9559071265127668, "learning_rate": 2.827470069914772e-06, "loss": 0.4813, "step": 1040 }, { "epoch": 0.92, "grad_norm": 1.8624058139739532, "learning_rate": 2.8240060896376425e-06, "loss": 0.5173, "step": 1041 }, { "epoch": 0.92, "grad_norm": 1.8517426107017696, "learning_rate": 2.8205414765696005e-06, "loss": 0.5022, "step": 1042 }, { "epoch": 0.92, "grad_norm": 1.8669678553165407, "learning_rate": 2.817076237477111e-06, "loss": 0.5153, "step": 1043 }, { "epoch": 0.92, "grad_norm": 1.765641529260409, "learning_rate": 2.8136103791278597e-06, "loss": 0.5459, "step": 1044 }, { "epoch": 0.93, "grad_norm": 1.7951199125059072, "learning_rate": 2.8101439082907432e-06, "loss": 0.5556, "step": 1045 }, { "epoch": 0.93, "grad_norm": 1.809753719361155, "learning_rate": 2.806676831735855e-06, "loss": 0.5082, "step": 1046 }, { "epoch": 0.93, "grad_norm": 2.022714382944429, "learning_rate": 2.8032091562344704e-06, "loss": 0.5079, "step": 1047 }, { "epoch": 0.93, "grad_norm": 1.7851810141612734, "learning_rate": 2.7997408885590355e-06, "loss": 0.5044, "step": 1048 }, { "epoch": 0.93, "grad_norm": 1.6942393347489977, "learning_rate": 2.7962720354831507e-06, "loss": 0.4845, "step": 1049 }, { "epoch": 0.93, "grad_norm": 1.8154999007296455, "learning_rate": 2.792802603781562e-06, "loss": 0.5039, "step": 1050 }, { "epoch": 0.93, "grad_norm": 1.909875578351421, "learning_rate": 2.7893326002301446e-06, "loss": 0.5081, "step": 1051 }, { "epoch": 0.93, "grad_norm": 1.8216639768552991, "learning_rate": 2.785862031605891e-06, "loss": 0.5022, "step": 1052 }, { "epoch": 0.93, "grad_norm": 1.968304394575798, "learning_rate": 2.7823909046868957e-06, "loss": 0.5217, "step": 1053 }, { "epoch": 0.93, "grad_norm": 2.269471892811331, "learning_rate": 2.778919226252346e-06, "loss": 0.5526, "step": 1054 }, { "epoch": 0.93, "grad_norm": 1.8562637541083824, "learning_rate": 2.775447003082505e-06, "loss": 0.5686, "step": 1055 }, { "epoch": 0.94, "grad_norm": 2.2263991007598114, "learning_rate": 2.7719742419586998e-06, "loss": 0.5402, "step": 1056 }, { "epoch": 0.94, "grad_norm": 1.6883177707586092, "learning_rate": 2.7685009496633075e-06, "loss": 0.5033, "step": 1057 }, { "epoch": 0.94, "grad_norm": 1.8528118029727803, "learning_rate": 2.765027132979743e-06, "loss": 0.5544, "step": 1058 }, { "epoch": 0.94, "grad_norm": 1.9477833558101318, "learning_rate": 2.761552798692446e-06, "loss": 0.5255, "step": 1059 }, { "epoch": 0.94, "grad_norm": 1.8332232845916867, "learning_rate": 2.7580779535868675e-06, "loss": 0.5296, "step": 1060 }, { "epoch": 0.94, "grad_norm": 1.7825492914819279, "learning_rate": 2.754602604449454e-06, "loss": 0.5071, "step": 1061 }, { "epoch": 0.94, "grad_norm": 1.8629044416803393, "learning_rate": 2.7511267580676382e-06, "loss": 0.5242, "step": 1062 }, { "epoch": 0.94, "grad_norm": 1.7488315230717792, "learning_rate": 2.7476504212298233e-06, "loss": 0.5252, "step": 1063 }, { "epoch": 0.94, "grad_norm": 1.8087959812729764, "learning_rate": 2.7441736007253705e-06, "loss": 0.4935, "step": 1064 }, { "epoch": 0.94, "grad_norm": 1.805794397151574, "learning_rate": 2.740696303344585e-06, "loss": 0.5819, "step": 1065 }, { "epoch": 0.94, "grad_norm": 1.7460467388665153, "learning_rate": 2.737218535878705e-06, "loss": 0.5411, "step": 1066 }, { "epoch": 0.95, "grad_norm": 1.9047016227341778, "learning_rate": 2.7337403051198846e-06, "loss": 0.4755, "step": 1067 }, { "epoch": 0.95, "grad_norm": 1.839222519978661, "learning_rate": 2.730261617861185e-06, "loss": 0.4855, "step": 1068 }, { "epoch": 0.95, "grad_norm": 1.862929087519683, "learning_rate": 2.726782480896557e-06, "loss": 0.5431, "step": 1069 }, { "epoch": 0.95, "grad_norm": 1.7864340196228758, "learning_rate": 2.723302901020831e-06, "loss": 0.5108, "step": 1070 }, { "epoch": 0.95, "grad_norm": 1.827547647278096, "learning_rate": 2.719822885029701e-06, "loss": 0.5029, "step": 1071 }, { "epoch": 0.95, "grad_norm": 1.6624898067287452, "learning_rate": 2.716342439719714e-06, "loss": 0.4861, "step": 1072 }, { "epoch": 0.95, "grad_norm": 1.8852871442731454, "learning_rate": 2.7128615718882554e-06, "loss": 0.5053, "step": 1073 }, { "epoch": 0.95, "grad_norm": 1.9534449028119654, "learning_rate": 2.7093802883335357e-06, "loss": 0.5654, "step": 1074 }, { "epoch": 0.95, "grad_norm": 1.7249582061537097, "learning_rate": 2.7058985958545765e-06, "loss": 0.5002, "step": 1075 }, { "epoch": 0.95, "grad_norm": 1.7562844672053906, "learning_rate": 2.702416501251199e-06, "loss": 0.5436, "step": 1076 }, { "epoch": 0.95, "grad_norm": 1.986901333170154, "learning_rate": 2.6989340113240087e-06, "loss": 0.527, "step": 1077 }, { "epoch": 0.95, "grad_norm": 1.7825768030688796, "learning_rate": 2.695451132874385e-06, "loss": 0.525, "step": 1078 }, { "epoch": 0.96, "grad_norm": 1.6606555374476397, "learning_rate": 2.691967872704464e-06, "loss": 0.476, "step": 1079 }, { "epoch": 0.96, "grad_norm": 1.7825083810087277, "learning_rate": 2.688484237617129e-06, "loss": 0.477, "step": 1080 }, { "epoch": 0.96, "grad_norm": 1.9686430333958531, "learning_rate": 2.6850002344159943e-06, "loss": 0.5434, "step": 1081 }, { "epoch": 0.96, "grad_norm": 1.717636244450827, "learning_rate": 2.6815158699053935e-06, "loss": 0.5794, "step": 1082 }, { "epoch": 0.96, "grad_norm": 1.704099898400831, "learning_rate": 2.6780311508903673e-06, "loss": 0.5107, "step": 1083 }, { "epoch": 0.96, "grad_norm": 1.7521786436297433, "learning_rate": 2.6745460841766456e-06, "loss": 0.543, "step": 1084 }, { "epoch": 0.96, "grad_norm": 1.8288817857074091, "learning_rate": 2.67106067657064e-06, "loss": 0.4888, "step": 1085 }, { "epoch": 0.96, "grad_norm": 1.7606668312978737, "learning_rate": 2.6675749348794273e-06, "loss": 0.5438, "step": 1086 }, { "epoch": 0.96, "grad_norm": 1.8974007350921405, "learning_rate": 2.6640888659107355e-06, "loss": 0.5103, "step": 1087 }, { "epoch": 0.96, "grad_norm": 1.8366492024047152, "learning_rate": 2.660602476472935e-06, "loss": 0.5211, "step": 1088 }, { "epoch": 0.96, "grad_norm": 1.813780419169426, "learning_rate": 2.657115773375018e-06, "loss": 0.4786, "step": 1089 }, { "epoch": 0.97, "grad_norm": 1.7751633927736221, "learning_rate": 2.6536287634265918e-06, "loss": 0.5456, "step": 1090 }, { "epoch": 0.97, "grad_norm": 1.8112706588151493, "learning_rate": 2.6501414534378616e-06, "loss": 0.536, "step": 1091 }, { "epoch": 0.97, "grad_norm": 1.7362256266600447, "learning_rate": 2.646653850219621e-06, "loss": 0.5266, "step": 1092 }, { "epoch": 0.97, "grad_norm": 1.7746963075577837, "learning_rate": 2.643165960583233e-06, "loss": 0.4845, "step": 1093 }, { "epoch": 0.97, "grad_norm": 1.7798827281249407, "learning_rate": 2.6396777913406228e-06, "loss": 0.457, "step": 1094 }, { "epoch": 0.97, "grad_norm": 1.801038100374856, "learning_rate": 2.6361893493042594e-06, "loss": 0.5093, "step": 1095 }, { "epoch": 0.97, "grad_norm": 1.70079798226535, "learning_rate": 2.632700641287147e-06, "loss": 0.5093, "step": 1096 }, { "epoch": 0.97, "grad_norm": 1.7688921140375633, "learning_rate": 2.6292116741028073e-06, "loss": 0.4999, "step": 1097 }, { "epoch": 0.97, "grad_norm": 1.7535762670261703, "learning_rate": 2.6257224545652688e-06, "loss": 0.5292, "step": 1098 }, { "epoch": 0.97, "grad_norm": 1.6511600588345345, "learning_rate": 2.622232989489052e-06, "loss": 0.5098, "step": 1099 }, { "epoch": 0.97, "grad_norm": 1.8226418118893164, "learning_rate": 2.6187432856891585e-06, "loss": 0.4995, "step": 1100 }, { "epoch": 0.98, "grad_norm": 1.7453251697809469, "learning_rate": 2.6152533499810567e-06, "loss": 0.5324, "step": 1101 }, { "epoch": 0.98, "grad_norm": 3.854764123671763, "learning_rate": 2.611763189180665e-06, "loss": 0.55, "step": 1102 }, { "epoch": 0.98, "grad_norm": 1.8822045822720739, "learning_rate": 2.608272810104343e-06, "loss": 0.4948, "step": 1103 }, { "epoch": 0.98, "grad_norm": 1.7655712435117557, "learning_rate": 2.6047822195688775e-06, "loss": 0.5361, "step": 1104 }, { "epoch": 0.98, "grad_norm": 1.7601014248725226, "learning_rate": 2.6012914243914667e-06, "loss": 0.4455, "step": 1105 }, { "epoch": 0.98, "grad_norm": 1.7832576894792285, "learning_rate": 2.5978004313897104e-06, "loss": 0.5356, "step": 1106 }, { "epoch": 0.98, "grad_norm": 1.7379093473470857, "learning_rate": 2.5943092473815922e-06, "loss": 0.4881, "step": 1107 }, { "epoch": 0.98, "grad_norm": 1.9126265402481486, "learning_rate": 2.590817879185471e-06, "loss": 0.4768, "step": 1108 }, { "epoch": 0.98, "grad_norm": 1.9554559921329315, "learning_rate": 2.5873263336200636e-06, "loss": 0.572, "step": 1109 }, { "epoch": 0.98, "grad_norm": 1.7969024529150637, "learning_rate": 2.5838346175044355e-06, "loss": 0.4894, "step": 1110 }, { "epoch": 0.98, "grad_norm": 1.7234439211809627, "learning_rate": 2.5803427376579824e-06, "loss": 0.4926, "step": 1111 }, { "epoch": 0.98, "grad_norm": 1.9809564443971945, "learning_rate": 2.5768507009004224e-06, "loss": 0.5677, "step": 1112 }, { "epoch": 0.99, "grad_norm": 1.7861662896762107, "learning_rate": 2.573358514051779e-06, "loss": 0.5283, "step": 1113 }, { "epoch": 0.99, "grad_norm": 1.7627618988801406, "learning_rate": 2.569866183932368e-06, "loss": 0.5366, "step": 1114 }, { "epoch": 0.99, "grad_norm": 1.870448428328825, "learning_rate": 2.5663737173627863e-06, "loss": 0.4864, "step": 1115 }, { "epoch": 0.99, "grad_norm": 1.7537459242093265, "learning_rate": 2.5628811211638967e-06, "loss": 0.5091, "step": 1116 }, { "epoch": 0.99, "grad_norm": 1.7948869258749416, "learning_rate": 2.5593884021568143e-06, "loss": 0.4851, "step": 1117 }, { "epoch": 0.99, "grad_norm": 1.7948580243638423, "learning_rate": 2.5558955671628964e-06, "loss": 0.5038, "step": 1118 }, { "epoch": 0.99, "grad_norm": 1.7071321162179267, "learning_rate": 2.552402623003726e-06, "loss": 0.5172, "step": 1119 }, { "epoch": 0.99, "grad_norm": 1.9189581718497948, "learning_rate": 2.548909576501096e-06, "loss": 0.5421, "step": 1120 }, { "epoch": 0.99, "grad_norm": 1.80649128798113, "learning_rate": 2.5454164344770044e-06, "loss": 0.5418, "step": 1121 }, { "epoch": 0.99, "grad_norm": 1.7691828394369862, "learning_rate": 2.5419232037536316e-06, "loss": 0.5103, "step": 1122 }, { "epoch": 0.99, "grad_norm": 1.7960539293917688, "learning_rate": 2.5384298911533344e-06, "loss": 0.5318, "step": 1123 }, { "epoch": 1.0, "grad_norm": 1.8973140103512256, "learning_rate": 2.5349365034986267e-06, "loss": 0.5705, "step": 1124 }, { "epoch": 1.0, "grad_norm": 1.821467182784847, "learning_rate": 2.531443047612171e-06, "loss": 0.5195, "step": 1125 }, { "epoch": 1.0, "grad_norm": 1.951851802158412, "learning_rate": 2.527949530316762e-06, "loss": 0.5033, "step": 1126 }, { "epoch": 1.0, "grad_norm": 1.866099609048635, "learning_rate": 2.5244559584353146e-06, "loss": 0.5482, "step": 1127 }, { "epoch": 1.0, "grad_norm": 1.764005332766906, "learning_rate": 2.520962338790851e-06, "loss": 0.4973, "step": 1128 }, { "epoch": 1.0, "grad_norm": 1.863408871226844, "learning_rate": 2.517468678206485e-06, "loss": 0.5249, "step": 1129 }, { "epoch": 1.0, "grad_norm": 1.7459173200826197, "learning_rate": 2.5139749835054123e-06, "loss": 0.4845, "step": 1130 }, { "epoch": 1.0, "grad_norm": 1.9910802418986058, "learning_rate": 2.5104812615108943e-06, "loss": 0.5702, "step": 1131 } ], "logging_steps": 1, "max_steps": 2258, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 377, "total_flos": 532701102735360.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }