{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9980022197558268, "eval_steps": 500, "global_step": 281, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.9832738637924194, "learning_rate": 1e-05, "loss": 0.5668, "step": 1 }, { "epoch": 0.01, "grad_norm": 0.9458171129226685, "learning_rate": 1e-05, "loss": 0.527, "step": 2 }, { "epoch": 0.01, "grad_norm": 0.8415082097053528, "learning_rate": 1e-05, "loss": 0.5078, "step": 3 }, { "epoch": 0.01, "grad_norm": 0.839102029800415, "learning_rate": 1e-05, "loss": 0.5188, "step": 4 }, { "epoch": 0.02, "grad_norm": 2.0685036182403564, "learning_rate": 1e-05, "loss": 0.5227, "step": 5 }, { "epoch": 0.02, "grad_norm": 0.8744383454322815, "learning_rate": 1e-05, "loss": 0.5149, "step": 6 }, { "epoch": 0.02, "grad_norm": 0.6805229187011719, "learning_rate": 1e-05, "loss": 0.4921, "step": 7 }, { "epoch": 0.03, "grad_norm": 1.070035696029663, "learning_rate": 1e-05, "loss": 0.4625, "step": 8 }, { "epoch": 0.03, "grad_norm": 0.6167441010475159, "learning_rate": 1e-05, "loss": 0.4819, "step": 9 }, { "epoch": 0.04, "grad_norm": 0.6767921447753906, "learning_rate": 1e-05, "loss": 0.4884, "step": 10 }, { "epoch": 0.04, "grad_norm": 1.1164625883102417, "learning_rate": 1e-05, "loss": 0.4896, "step": 11 }, { "epoch": 0.04, "grad_norm": 0.8692913055419922, "learning_rate": 1e-05, "loss": 0.4893, "step": 12 }, { "epoch": 0.05, "grad_norm": 0.41812172532081604, "learning_rate": 1e-05, "loss": 0.493, "step": 13 }, { "epoch": 0.05, "grad_norm": 0.49972113966941833, "learning_rate": 1e-05, "loss": 0.5021, "step": 14 }, { "epoch": 0.05, "grad_norm": 0.5513176321983337, "learning_rate": 1e-05, "loss": 0.4859, "step": 15 }, { "epoch": 0.06, "grad_norm": 0.7295736074447632, "learning_rate": 1e-05, "loss": 0.4439, "step": 16 }, { "epoch": 0.06, "grad_norm": 0.45257988572120667, "learning_rate": 1e-05, "loss": 0.4719, "step": 17 }, { "epoch": 0.06, "grad_norm": 0.535362720489502, "learning_rate": 1e-05, "loss": 0.4823, "step": 18 }, { "epoch": 0.07, "grad_norm": 0.5812283158302307, "learning_rate": 1e-05, "loss": 0.4773, "step": 19 }, { "epoch": 0.07, "grad_norm": 1.1126588582992554, "learning_rate": 1e-05, "loss": 0.4477, "step": 20 }, { "epoch": 0.07, "grad_norm": 0.44774916768074036, "learning_rate": 1e-05, "loss": 0.4627, "step": 21 }, { "epoch": 0.08, "grad_norm": 0.4647807478904724, "learning_rate": 1e-05, "loss": 0.4783, "step": 22 }, { "epoch": 0.08, "grad_norm": 0.7982929944992065, "learning_rate": 1e-05, "loss": 0.4654, "step": 23 }, { "epoch": 0.09, "grad_norm": 0.758514940738678, "learning_rate": 1e-05, "loss": 0.4453, "step": 24 }, { "epoch": 0.09, "grad_norm": 0.47228220105171204, "learning_rate": 1e-05, "loss": 0.4799, "step": 25 }, { "epoch": 0.09, "grad_norm": 0.5305970907211304, "learning_rate": 1e-05, "loss": 0.476, "step": 26 }, { "epoch": 0.1, "grad_norm": 0.6885311603546143, "learning_rate": 1e-05, "loss": 0.4396, "step": 27 }, { "epoch": 0.1, "grad_norm": 0.9302653074264526, "learning_rate": 1e-05, "loss": 0.4251, "step": 28 }, { "epoch": 0.1, "grad_norm": 0.44655555486679077, "learning_rate": 1e-05, "loss": 0.4753, "step": 29 }, { "epoch": 0.11, "grad_norm": 0.5989087820053101, "learning_rate": 1e-05, "loss": 0.4533, "step": 30 }, { "epoch": 0.11, "grad_norm": 0.5649559497833252, "learning_rate": 1e-05, "loss": 0.4757, "step": 31 }, { "epoch": 0.11, "grad_norm": 0.6152830719947815, "learning_rate": 1e-05, "loss": 0.4625, "step": 32 }, { "epoch": 0.12, "grad_norm": 0.550224244594574, "learning_rate": 1e-05, "loss": 0.4634, "step": 33 }, { "epoch": 0.12, "grad_norm": 0.6088584661483765, "learning_rate": 1e-05, "loss": 0.4617, "step": 34 }, { "epoch": 0.12, "grad_norm": 0.7076464891433716, "learning_rate": 1e-05, "loss": 0.4538, "step": 35 }, { "epoch": 0.13, "grad_norm": 1.5110942125320435, "learning_rate": 1e-05, "loss": 0.4279, "step": 36 }, { "epoch": 0.13, "grad_norm": 0.9704209566116333, "learning_rate": 1e-05, "loss": 0.4765, "step": 37 }, { "epoch": 0.13, "grad_norm": 0.5762418508529663, "learning_rate": 1e-05, "loss": 0.4633, "step": 38 }, { "epoch": 0.14, "grad_norm": 0.6088821291923523, "learning_rate": 1e-05, "loss": 0.4441, "step": 39 }, { "epoch": 0.14, "grad_norm": 0.6568654775619507, "learning_rate": 1e-05, "loss": 0.4464, "step": 40 }, { "epoch": 0.15, "grad_norm": 0.5699493885040283, "learning_rate": 1e-05, "loss": 0.4626, "step": 41 }, { "epoch": 0.15, "grad_norm": 0.4806917905807495, "learning_rate": 1e-05, "loss": 0.4769, "step": 42 }, { "epoch": 0.15, "grad_norm": 0.5842737555503845, "learning_rate": 1e-05, "loss": 0.4506, "step": 43 }, { "epoch": 0.16, "grad_norm": 0.7217497825622559, "learning_rate": 1e-05, "loss": 0.444, "step": 44 }, { "epoch": 0.16, "grad_norm": 0.4894813597202301, "learning_rate": 1e-05, "loss": 0.473, "step": 45 }, { "epoch": 0.16, "grad_norm": 0.4453926086425781, "learning_rate": 1e-05, "loss": 0.4619, "step": 46 }, { "epoch": 0.17, "grad_norm": 0.7311206459999084, "learning_rate": 1e-05, "loss": 0.4492, "step": 47 }, { "epoch": 0.17, "grad_norm": 1.1100599765777588, "learning_rate": 1e-05, "loss": 0.4111, "step": 48 }, { "epoch": 0.17, "grad_norm": 0.5209776163101196, "learning_rate": 1e-05, "loss": 0.458, "step": 49 }, { "epoch": 0.18, "grad_norm": 0.5144385099411011, "learning_rate": 1e-05, "loss": 0.4591, "step": 50 }, { "epoch": 0.18, "grad_norm": 0.505511462688446, "learning_rate": 1e-05, "loss": 0.4557, "step": 51 }, { "epoch": 0.18, "grad_norm": 0.8851682543754578, "learning_rate": 1e-05, "loss": 0.4229, "step": 52 }, { "epoch": 0.19, "grad_norm": 0.43110543489456177, "learning_rate": 1e-05, "loss": 0.4799, "step": 53 }, { "epoch": 0.19, "grad_norm": 0.7971346378326416, "learning_rate": 1e-05, "loss": 0.4519, "step": 54 }, { "epoch": 0.2, "grad_norm": 0.8832608461380005, "learning_rate": 1e-05, "loss": 0.4339, "step": 55 }, { "epoch": 0.2, "grad_norm": 1.1655290126800537, "learning_rate": 1e-05, "loss": 0.3927, "step": 56 }, { "epoch": 0.2, "grad_norm": 0.42177918553352356, "learning_rate": 1e-05, "loss": 0.441, "step": 57 }, { "epoch": 0.21, "grad_norm": 0.520730197429657, "learning_rate": 1e-05, "loss": 0.4636, "step": 58 }, { "epoch": 0.21, "grad_norm": 0.585159420967102, "learning_rate": 1e-05, "loss": 0.4649, "step": 59 }, { "epoch": 0.21, "grad_norm": 1.1990737915039062, "learning_rate": 1e-05, "loss": 0.4255, "step": 60 }, { "epoch": 0.22, "grad_norm": 0.6422901749610901, "learning_rate": 1e-05, "loss": 0.4672, "step": 61 }, { "epoch": 0.22, "grad_norm": 0.4904385507106781, "learning_rate": 1e-05, "loss": 0.4679, "step": 62 }, { "epoch": 0.22, "grad_norm": 0.6044018864631653, "learning_rate": 1e-05, "loss": 0.4524, "step": 63 }, { "epoch": 0.23, "grad_norm": 0.6931694746017456, "learning_rate": 1e-05, "loss": 0.4267, "step": 64 }, { "epoch": 0.23, "grad_norm": 0.42534416913986206, "learning_rate": 1e-05, "loss": 0.4733, "step": 65 }, { "epoch": 0.23, "grad_norm": 0.509138286113739, "learning_rate": 1e-05, "loss": 0.442, "step": 66 }, { "epoch": 0.24, "grad_norm": 0.6317541003227234, "learning_rate": 1e-05, "loss": 0.4461, "step": 67 }, { "epoch": 0.24, "grad_norm": 0.7053780555725098, "learning_rate": 1e-05, "loss": 0.4328, "step": 68 }, { "epoch": 0.25, "grad_norm": 0.4144704043865204, "learning_rate": 1e-05, "loss": 0.4713, "step": 69 }, { "epoch": 0.25, "grad_norm": 0.6023023724555969, "learning_rate": 1e-05, "loss": 0.4686, "step": 70 }, { "epoch": 0.25, "grad_norm": 0.6286914348602295, "learning_rate": 1e-05, "loss": 0.43, "step": 71 }, { "epoch": 0.26, "grad_norm": 1.7562507390975952, "learning_rate": 1e-05, "loss": 0.4354, "step": 72 }, { "epoch": 0.26, "grad_norm": 0.4561932682991028, "learning_rate": 1e-05, "loss": 0.454, "step": 73 }, { "epoch": 0.26, "grad_norm": 0.7432600855827332, "learning_rate": 1e-05, "loss": 0.4556, "step": 74 }, { "epoch": 0.27, "grad_norm": 0.6152967810630798, "learning_rate": 1e-05, "loss": 0.4367, "step": 75 }, { "epoch": 0.27, "grad_norm": 1.1905455589294434, "learning_rate": 1e-05, "loss": 0.45, "step": 76 }, { "epoch": 0.27, "grad_norm": 0.46641895174980164, "learning_rate": 1e-05, "loss": 0.4584, "step": 77 }, { "epoch": 0.28, "grad_norm": 0.5764038562774658, "learning_rate": 1e-05, "loss": 0.4461, "step": 78 }, { "epoch": 0.28, "grad_norm": 0.5731326341629028, "learning_rate": 1e-05, "loss": 0.4566, "step": 79 }, { "epoch": 0.28, "grad_norm": 0.9596810936927795, "learning_rate": 1e-05, "loss": 0.4488, "step": 80 }, { "epoch": 0.29, "grad_norm": 0.42653772234916687, "learning_rate": 1e-05, "loss": 0.466, "step": 81 }, { "epoch": 0.29, "grad_norm": 0.4516827464103699, "learning_rate": 1e-05, "loss": 0.4555, "step": 82 }, { "epoch": 0.29, "grad_norm": 0.5996358394622803, "learning_rate": 1e-05, "loss": 0.444, "step": 83 }, { "epoch": 0.3, "grad_norm": 0.9187763929367065, "learning_rate": 1e-05, "loss": 0.3769, "step": 84 }, { "epoch": 0.3, "grad_norm": 0.4854133725166321, "learning_rate": 1e-05, "loss": 0.4711, "step": 85 }, { "epoch": 0.31, "grad_norm": 0.4637454152107239, "learning_rate": 1e-05, "loss": 0.4636, "step": 86 }, { "epoch": 0.31, "grad_norm": 0.5341816544532776, "learning_rate": 1e-05, "loss": 0.4358, "step": 87 }, { "epoch": 0.31, "grad_norm": 0.7789736390113831, "learning_rate": 1e-05, "loss": 0.4088, "step": 88 }, { "epoch": 0.32, "grad_norm": 0.3757864534854889, "learning_rate": 1e-05, "loss": 0.4654, "step": 89 }, { "epoch": 0.32, "grad_norm": 0.5197014808654785, "learning_rate": 1e-05, "loss": 0.4434, "step": 90 }, { "epoch": 0.32, "grad_norm": 0.5166840553283691, "learning_rate": 1e-05, "loss": 0.4474, "step": 91 }, { "epoch": 0.33, "grad_norm": 0.6799307465553284, "learning_rate": 1e-05, "loss": 0.4435, "step": 92 }, { "epoch": 0.33, "grad_norm": 0.49623581767082214, "learning_rate": 1e-05, "loss": 0.4636, "step": 93 }, { "epoch": 0.33, "grad_norm": 0.5105348229408264, "learning_rate": 1e-05, "loss": 0.4297, "step": 94 }, { "epoch": 0.34, "grad_norm": 0.5134614706039429, "learning_rate": 1e-05, "loss": 0.4345, "step": 95 }, { "epoch": 0.34, "grad_norm": 0.6770723462104797, "learning_rate": 1e-05, "loss": 0.4181, "step": 96 }, { "epoch": 0.34, "grad_norm": 0.580609917640686, "learning_rate": 1e-05, "loss": 0.4648, "step": 97 }, { "epoch": 0.35, "grad_norm": 0.5286965370178223, "learning_rate": 1e-05, "loss": 0.4329, "step": 98 }, { "epoch": 0.35, "grad_norm": 0.5784552693367004, "learning_rate": 1e-05, "loss": 0.4512, "step": 99 }, { "epoch": 0.36, "grad_norm": 1.0502313375473022, "learning_rate": 1e-05, "loss": 0.4105, "step": 100 }, { "epoch": 0.36, "grad_norm": 0.40381279587745667, "learning_rate": 1e-05, "loss": 0.4508, "step": 101 }, { "epoch": 0.36, "grad_norm": 0.5827537775039673, "learning_rate": 1e-05, "loss": 0.4464, "step": 102 }, { "epoch": 0.37, "grad_norm": 0.536665141582489, "learning_rate": 1e-05, "loss": 0.4382, "step": 103 }, { "epoch": 0.37, "grad_norm": 0.6119434237480164, "learning_rate": 1e-05, "loss": 0.4246, "step": 104 }, { "epoch": 0.37, "grad_norm": 0.3719892203807831, "learning_rate": 1e-05, "loss": 0.4596, "step": 105 }, { "epoch": 0.38, "grad_norm": 0.4976269006729126, "learning_rate": 1e-05, "loss": 0.4563, "step": 106 }, { "epoch": 0.38, "grad_norm": 0.6612491011619568, "learning_rate": 1e-05, "loss": 0.4364, "step": 107 }, { "epoch": 0.38, "grad_norm": 0.6775332093238831, "learning_rate": 1e-05, "loss": 0.4193, "step": 108 }, { "epoch": 0.39, "grad_norm": 0.8782187700271606, "learning_rate": 1e-05, "loss": 0.4464, "step": 109 }, { "epoch": 0.39, "grad_norm": 0.5137239694595337, "learning_rate": 1e-05, "loss": 0.4595, "step": 110 }, { "epoch": 0.39, "grad_norm": 0.5837895274162292, "learning_rate": 1e-05, "loss": 0.4553, "step": 111 }, { "epoch": 0.4, "grad_norm": 2.0835506916046143, "learning_rate": 1e-05, "loss": 0.4225, "step": 112 }, { "epoch": 0.4, "grad_norm": 0.4510575234889984, "learning_rate": 1e-05, "loss": 0.4524, "step": 113 }, { "epoch": 0.4, "grad_norm": 0.5319203734397888, "learning_rate": 1e-05, "loss": 0.4589, "step": 114 }, { "epoch": 0.41, "grad_norm": 0.48096978664398193, "learning_rate": 1e-05, "loss": 0.4504, "step": 115 }, { "epoch": 0.41, "grad_norm": 0.7882823944091797, "learning_rate": 1e-05, "loss": 0.4282, "step": 116 }, { "epoch": 0.42, "grad_norm": 0.4668346643447876, "learning_rate": 1e-05, "loss": 0.4649, "step": 117 }, { "epoch": 0.42, "grad_norm": 0.523875892162323, "learning_rate": 1e-05, "loss": 0.4399, "step": 118 }, { "epoch": 0.42, "grad_norm": 0.7798515558242798, "learning_rate": 1e-05, "loss": 0.4293, "step": 119 }, { "epoch": 0.43, "grad_norm": 1.6797575950622559, "learning_rate": 1e-05, "loss": 0.4334, "step": 120 }, { "epoch": 0.43, "grad_norm": 0.372816801071167, "learning_rate": 1e-05, "loss": 0.4552, "step": 121 }, { "epoch": 0.43, "grad_norm": 0.43806058168411255, "learning_rate": 1e-05, "loss": 0.4533, "step": 122 }, { "epoch": 0.44, "grad_norm": 0.5015602111816406, "learning_rate": 1e-05, "loss": 0.452, "step": 123 }, { "epoch": 0.44, "grad_norm": 0.8157485127449036, "learning_rate": 1e-05, "loss": 0.4266, "step": 124 }, { "epoch": 0.44, "grad_norm": 0.5614244937896729, "learning_rate": 1e-05, "loss": 0.4498, "step": 125 }, { "epoch": 0.45, "grad_norm": 0.5229978561401367, "learning_rate": 1e-05, "loss": 0.4435, "step": 126 }, { "epoch": 0.45, "grad_norm": 0.5133040547370911, "learning_rate": 1e-05, "loss": 0.4365, "step": 127 }, { "epoch": 0.45, "grad_norm": 0.9675442576408386, "learning_rate": 1e-05, "loss": 0.3885, "step": 128 }, { "epoch": 0.46, "grad_norm": 0.3860965669155121, "learning_rate": 1e-05, "loss": 0.449, "step": 129 }, { "epoch": 0.46, "grad_norm": 0.49970802664756775, "learning_rate": 1e-05, "loss": 0.43, "step": 130 }, { "epoch": 0.47, "grad_norm": 0.5704831480979919, "learning_rate": 1e-05, "loss": 0.447, "step": 131 }, { "epoch": 0.47, "grad_norm": 0.9462538361549377, "learning_rate": 1e-05, "loss": 0.4076, "step": 132 }, { "epoch": 0.47, "grad_norm": 0.4560747742652893, "learning_rate": 1e-05, "loss": 0.449, "step": 133 }, { "epoch": 0.48, "grad_norm": 0.4721296429634094, "learning_rate": 1e-05, "loss": 0.4324, "step": 134 }, { "epoch": 0.48, "grad_norm": 0.5056577920913696, "learning_rate": 1e-05, "loss": 0.4406, "step": 135 }, { "epoch": 0.48, "grad_norm": 0.5873216390609741, "learning_rate": 1e-05, "loss": 0.4111, "step": 136 }, { "epoch": 0.49, "grad_norm": 0.3777410686016083, "learning_rate": 1e-05, "loss": 0.4656, "step": 137 }, { "epoch": 0.49, "grad_norm": 0.4282991588115692, "learning_rate": 1e-05, "loss": 0.4189, "step": 138 }, { "epoch": 0.49, "grad_norm": 0.4305042624473572, "learning_rate": 1e-05, "loss": 0.4523, "step": 139 }, { "epoch": 0.5, "grad_norm": 0.6944953203201294, "learning_rate": 1e-05, "loss": 0.4284, "step": 140 }, { "epoch": 0.5, "grad_norm": 0.4139988124370575, "learning_rate": 1e-05, "loss": 0.4431, "step": 141 }, { "epoch": 0.5, "grad_norm": 0.5378341674804688, "learning_rate": 1e-05, "loss": 0.4527, "step": 142 }, { "epoch": 0.51, "grad_norm": 0.5123686790466309, "learning_rate": 1e-05, "loss": 0.4361, "step": 143 }, { "epoch": 0.51, "grad_norm": 0.7670745849609375, "learning_rate": 1e-05, "loss": 0.4221, "step": 144 }, { "epoch": 0.51, "grad_norm": 0.7429537773132324, "learning_rate": 1e-05, "loss": 0.438, "step": 145 }, { "epoch": 0.52, "grad_norm": 0.5465981960296631, "learning_rate": 1e-05, "loss": 0.4515, "step": 146 }, { "epoch": 0.52, "grad_norm": 0.5833389163017273, "learning_rate": 1e-05, "loss": 0.4331, "step": 147 }, { "epoch": 0.53, "grad_norm": 0.8879693746566772, "learning_rate": 1e-05, "loss": 0.4137, "step": 148 }, { "epoch": 0.53, "grad_norm": 0.40366512537002563, "learning_rate": 1e-05, "loss": 0.4505, "step": 149 }, { "epoch": 0.53, "grad_norm": 0.775814414024353, "learning_rate": 1e-05, "loss": 0.4291, "step": 150 }, { "epoch": 0.54, "grad_norm": 0.5580259561538696, "learning_rate": 1e-05, "loss": 0.4311, "step": 151 }, { "epoch": 0.54, "grad_norm": 1.0347977876663208, "learning_rate": 1e-05, "loss": 0.4054, "step": 152 }, { "epoch": 0.54, "grad_norm": 0.4463992714881897, "learning_rate": 1e-05, "loss": 0.4631, "step": 153 }, { "epoch": 0.55, "grad_norm": 0.5559613108634949, "learning_rate": 1e-05, "loss": 0.4339, "step": 154 }, { "epoch": 0.55, "grad_norm": 0.4996100068092346, "learning_rate": 1e-05, "loss": 0.4522, "step": 155 }, { "epoch": 0.55, "grad_norm": 0.639824390411377, "learning_rate": 1e-05, "loss": 0.432, "step": 156 }, { "epoch": 0.56, "grad_norm": 0.38671043515205383, "learning_rate": 1e-05, "loss": 0.4665, "step": 157 }, { "epoch": 0.56, "grad_norm": 0.4905610978603363, "learning_rate": 1e-05, "loss": 0.4532, "step": 158 }, { "epoch": 0.56, "grad_norm": 0.6270061731338501, "learning_rate": 1e-05, "loss": 0.4406, "step": 159 }, { "epoch": 0.57, "grad_norm": 0.789944589138031, "learning_rate": 1e-05, "loss": 0.3897, "step": 160 }, { "epoch": 0.57, "grad_norm": 0.83070969581604, "learning_rate": 1e-05, "loss": 0.4747, "step": 161 }, { "epoch": 0.58, "grad_norm": 0.4273388087749481, "learning_rate": 1e-05, "loss": 0.4414, "step": 162 }, { "epoch": 0.58, "grad_norm": 0.5557335019111633, "learning_rate": 1e-05, "loss": 0.4322, "step": 163 }, { "epoch": 0.58, "grad_norm": 0.5714684724807739, "learning_rate": 1e-05, "loss": 0.4293, "step": 164 }, { "epoch": 0.59, "grad_norm": 0.3697482645511627, "learning_rate": 1e-05, "loss": 0.4584, "step": 165 }, { "epoch": 0.59, "grad_norm": 0.45178842544555664, "learning_rate": 1e-05, "loss": 0.4481, "step": 166 }, { "epoch": 0.59, "grad_norm": 0.5978602170944214, "learning_rate": 1e-05, "loss": 0.4082, "step": 167 }, { "epoch": 0.6, "grad_norm": 0.6311948895454407, "learning_rate": 1e-05, "loss": 0.422, "step": 168 }, { "epoch": 0.6, "grad_norm": 0.3639450669288635, "learning_rate": 1e-05, "loss": 0.4388, "step": 169 }, { "epoch": 0.6, "grad_norm": 0.49596938490867615, "learning_rate": 1e-05, "loss": 0.4453, "step": 170 }, { "epoch": 0.61, "grad_norm": 0.49753278493881226, "learning_rate": 1e-05, "loss": 0.4276, "step": 171 }, { "epoch": 0.61, "grad_norm": 0.770423948764801, "learning_rate": 1e-05, "loss": 0.4273, "step": 172 }, { "epoch": 0.61, "grad_norm": 0.43721652030944824, "learning_rate": 1e-05, "loss": 0.4556, "step": 173 }, { "epoch": 0.62, "grad_norm": 0.44143715500831604, "learning_rate": 1e-05, "loss": 0.4579, "step": 174 }, { "epoch": 0.62, "grad_norm": 0.6291133165359497, "learning_rate": 1e-05, "loss": 0.4237, "step": 175 }, { "epoch": 0.63, "grad_norm": 0.909862220287323, "learning_rate": 1e-05, "loss": 0.3763, "step": 176 }, { "epoch": 0.63, "grad_norm": 0.43771499395370483, "learning_rate": 1e-05, "loss": 0.4402, "step": 177 }, { "epoch": 0.63, "grad_norm": 0.5810952186584473, "learning_rate": 1e-05, "loss": 0.4359, "step": 178 }, { "epoch": 0.64, "grad_norm": 0.48108798265457153, "learning_rate": 1e-05, "loss": 0.4488, "step": 179 }, { "epoch": 0.64, "grad_norm": 0.7238831520080566, "learning_rate": 1e-05, "loss": 0.4207, "step": 180 }, { "epoch": 0.64, "grad_norm": 0.4471929371356964, "learning_rate": 1e-05, "loss": 0.4391, "step": 181 }, { "epoch": 0.65, "grad_norm": 0.46182355284690857, "learning_rate": 1e-05, "loss": 0.4405, "step": 182 }, { "epoch": 0.65, "grad_norm": 0.5112327337265015, "learning_rate": 1e-05, "loss": 0.4404, "step": 183 }, { "epoch": 0.65, "grad_norm": 0.7985268235206604, "learning_rate": 1e-05, "loss": 0.3994, "step": 184 }, { "epoch": 0.66, "grad_norm": 0.4040653705596924, "learning_rate": 1e-05, "loss": 0.451, "step": 185 }, { "epoch": 0.66, "grad_norm": 0.4301965832710266, "learning_rate": 1e-05, "loss": 0.4472, "step": 186 }, { "epoch": 0.66, "grad_norm": 0.6474922895431519, "learning_rate": 1e-05, "loss": 0.4331, "step": 187 }, { "epoch": 0.67, "grad_norm": 0.8670556545257568, "learning_rate": 1e-05, "loss": 0.4318, "step": 188 }, { "epoch": 0.67, "grad_norm": 0.5550181269645691, "learning_rate": 1e-05, "loss": 0.4618, "step": 189 }, { "epoch": 0.67, "grad_norm": 0.44763821363449097, "learning_rate": 1e-05, "loss": 0.4515, "step": 190 }, { "epoch": 0.68, "grad_norm": 0.5281996726989746, "learning_rate": 1e-05, "loss": 0.4094, "step": 191 }, { "epoch": 0.68, "grad_norm": 0.6521373987197876, "learning_rate": 1e-05, "loss": 0.4202, "step": 192 }, { "epoch": 0.69, "grad_norm": 0.4100494682788849, "learning_rate": 1e-05, "loss": 0.4454, "step": 193 }, { "epoch": 0.69, "grad_norm": 0.5752792954444885, "learning_rate": 1e-05, "loss": 0.4488, "step": 194 }, { "epoch": 0.69, "grad_norm": 0.6012833118438721, "learning_rate": 1e-05, "loss": 0.4329, "step": 195 }, { "epoch": 0.7, "grad_norm": 0.6597912311553955, "learning_rate": 1e-05, "loss": 0.4157, "step": 196 }, { "epoch": 0.7, "grad_norm": 0.4282044768333435, "learning_rate": 1e-05, "loss": 0.434, "step": 197 }, { "epoch": 0.7, "grad_norm": 0.5571615099906921, "learning_rate": 1e-05, "loss": 0.4482, "step": 198 }, { "epoch": 0.71, "grad_norm": 0.5163079500198364, "learning_rate": 1e-05, "loss": 0.4304, "step": 199 }, { "epoch": 0.71, "grad_norm": 0.8305063843727112, "learning_rate": 1e-05, "loss": 0.4022, "step": 200 }, { "epoch": 0.71, "grad_norm": 0.4464957118034363, "learning_rate": 1e-05, "loss": 0.4503, "step": 201 }, { "epoch": 0.72, "grad_norm": 0.46648305654525757, "learning_rate": 1e-05, "loss": 0.4489, "step": 202 }, { "epoch": 0.72, "grad_norm": 0.6516060829162598, "learning_rate": 1e-05, "loss": 0.4342, "step": 203 }, { "epoch": 0.72, "grad_norm": 0.6265027523040771, "learning_rate": 1e-05, "loss": 0.4175, "step": 204 }, { "epoch": 0.73, "grad_norm": 0.4391736686229706, "learning_rate": 1e-05, "loss": 0.4586, "step": 205 }, { "epoch": 0.73, "grad_norm": 0.4877251386642456, "learning_rate": 1e-05, "loss": 0.4369, "step": 206 }, { "epoch": 0.74, "grad_norm": 0.5316377282142639, "learning_rate": 1e-05, "loss": 0.4319, "step": 207 }, { "epoch": 0.74, "grad_norm": 1.0411840677261353, "learning_rate": 1e-05, "loss": 0.4256, "step": 208 }, { "epoch": 0.74, "grad_norm": 0.5061484575271606, "learning_rate": 1e-05, "loss": 0.4319, "step": 209 }, { "epoch": 0.75, "grad_norm": 2.133131980895996, "learning_rate": 1e-05, "loss": 0.4485, "step": 210 }, { "epoch": 0.75, "grad_norm": 0.5447837114334106, "learning_rate": 1e-05, "loss": 0.4416, "step": 211 }, { "epoch": 0.75, "grad_norm": 0.7947729825973511, "learning_rate": 1e-05, "loss": 0.4075, "step": 212 }, { "epoch": 0.76, "grad_norm": 0.44168147444725037, "learning_rate": 1e-05, "loss": 0.4526, "step": 213 }, { "epoch": 0.76, "grad_norm": 0.4681871235370636, "learning_rate": 1e-05, "loss": 0.4384, "step": 214 }, { "epoch": 0.76, "grad_norm": 0.568696141242981, "learning_rate": 1e-05, "loss": 0.4309, "step": 215 }, { "epoch": 0.77, "grad_norm": 0.676292896270752, "learning_rate": 1e-05, "loss": 0.3851, "step": 216 }, { "epoch": 0.77, "grad_norm": 0.42382004857063293, "learning_rate": 1e-05, "loss": 0.4657, "step": 217 }, { "epoch": 0.77, "grad_norm": 1.7760436534881592, "learning_rate": 1e-05, "loss": 0.4471, "step": 218 }, { "epoch": 0.78, "grad_norm": 0.570806622505188, "learning_rate": 1e-05, "loss": 0.4307, "step": 219 }, { "epoch": 0.78, "grad_norm": 0.913851261138916, "learning_rate": 1e-05, "loss": 0.4132, "step": 220 }, { "epoch": 0.78, "grad_norm": 0.4583609700202942, "learning_rate": 1e-05, "loss": 0.4508, "step": 221 }, { "epoch": 0.79, "grad_norm": 0.5516847968101501, "learning_rate": 1e-05, "loss": 0.4521, "step": 222 }, { "epoch": 0.79, "grad_norm": 0.526192843914032, "learning_rate": 1e-05, "loss": 0.4473, "step": 223 }, { "epoch": 0.8, "grad_norm": 0.852449357509613, "learning_rate": 1e-05, "loss": 0.4297, "step": 224 }, { "epoch": 0.8, "grad_norm": 0.8044112920761108, "learning_rate": 1e-05, "loss": 0.4392, "step": 225 }, { "epoch": 0.8, "grad_norm": 0.4642798602581024, "learning_rate": 1e-05, "loss": 0.4188, "step": 226 }, { "epoch": 0.81, "grad_norm": 0.5552318692207336, "learning_rate": 1e-05, "loss": 0.4447, "step": 227 }, { "epoch": 0.81, "grad_norm": 0.7442976832389832, "learning_rate": 1e-05, "loss": 0.404, "step": 228 }, { "epoch": 0.81, "grad_norm": 0.5462561845779419, "learning_rate": 1e-05, "loss": 0.4434, "step": 229 }, { "epoch": 0.82, "grad_norm": 0.4776632785797119, "learning_rate": 1e-05, "loss": 0.4487, "step": 230 }, { "epoch": 0.82, "grad_norm": 0.8107818365097046, "learning_rate": 1e-05, "loss": 0.4256, "step": 231 }, { "epoch": 0.82, "grad_norm": 0.7552162408828735, "learning_rate": 1e-05, "loss": 0.4116, "step": 232 }, { "epoch": 0.83, "grad_norm": 0.5497348308563232, "learning_rate": 1e-05, "loss": 0.4397, "step": 233 }, { "epoch": 0.83, "grad_norm": 0.5682913661003113, "learning_rate": 1e-05, "loss": 0.4432, "step": 234 }, { "epoch": 0.83, "grad_norm": 0.6614949703216553, "learning_rate": 1e-05, "loss": 0.4408, "step": 235 }, { "epoch": 0.84, "grad_norm": 0.8305519223213196, "learning_rate": 1e-05, "loss": 0.4217, "step": 236 }, { "epoch": 0.84, "grad_norm": 0.42773041129112244, "learning_rate": 1e-05, "loss": 0.4517, "step": 237 }, { "epoch": 0.85, "grad_norm": 0.771364688873291, "learning_rate": 1e-05, "loss": 0.4258, "step": 238 }, { "epoch": 0.85, "grad_norm": 0.7548272013664246, "learning_rate": 1e-05, "loss": 0.4329, "step": 239 }, { "epoch": 0.85, "grad_norm": 0.673877477645874, "learning_rate": 1e-05, "loss": 0.4241, "step": 240 }, { "epoch": 0.86, "grad_norm": 0.42514756321907043, "learning_rate": 1e-05, "loss": 0.45, "step": 241 }, { "epoch": 0.86, "grad_norm": 0.48240309953689575, "learning_rate": 1e-05, "loss": 0.4253, "step": 242 }, { "epoch": 0.86, "grad_norm": 0.5749714970588684, "learning_rate": 1e-05, "loss": 0.4459, "step": 243 }, { "epoch": 0.87, "grad_norm": 0.6530050039291382, "learning_rate": 1e-05, "loss": 0.412, "step": 244 }, { "epoch": 0.87, "grad_norm": 0.690308690071106, "learning_rate": 1e-05, "loss": 0.4205, "step": 245 }, { "epoch": 0.87, "grad_norm": 0.40998125076293945, "learning_rate": 1e-05, "loss": 0.4478, "step": 246 }, { "epoch": 0.88, "grad_norm": 0.586068332195282, "learning_rate": 1e-05, "loss": 0.4115, "step": 247 }, { "epoch": 0.88, "grad_norm": 0.6974151730537415, "learning_rate": 1e-05, "loss": 0.3864, "step": 248 }, { "epoch": 0.88, "grad_norm": 0.4335140883922577, "learning_rate": 1e-05, "loss": 0.4669, "step": 249 }, { "epoch": 0.89, "grad_norm": 0.44653356075286865, "learning_rate": 1e-05, "loss": 0.4139, "step": 250 }, { "epoch": 0.89, "grad_norm": 0.750583291053772, "learning_rate": 1e-05, "loss": 0.4152, "step": 251 }, { "epoch": 0.9, "grad_norm": 0.7355042099952698, "learning_rate": 1e-05, "loss": 0.4149, "step": 252 }, { "epoch": 0.9, "grad_norm": 0.5112454295158386, "learning_rate": 1e-05, "loss": 0.4444, "step": 253 }, { "epoch": 0.9, "grad_norm": 0.7759450674057007, "learning_rate": 1e-05, "loss": 0.4474, "step": 254 }, { "epoch": 0.91, "grad_norm": 0.6432544589042664, "learning_rate": 1e-05, "loss": 0.4555, "step": 255 }, { "epoch": 0.91, "grad_norm": 0.723268985748291, "learning_rate": 1e-05, "loss": 0.3851, "step": 256 }, { "epoch": 0.91, "grad_norm": 0.4089478850364685, "learning_rate": 1e-05, "loss": 0.447, "step": 257 }, { "epoch": 0.92, "grad_norm": 0.5243247747421265, "learning_rate": 1e-05, "loss": 0.4565, "step": 258 }, { "epoch": 0.92, "grad_norm": 0.776333212852478, "learning_rate": 1e-05, "loss": 0.4145, "step": 259 }, { "epoch": 0.92, "grad_norm": 0.4910255968570709, "learning_rate": 1e-05, "loss": 0.4163, "step": 260 }, { "epoch": 0.93, "grad_norm": 0.46245935559272766, "learning_rate": 1e-05, "loss": 0.4669, "step": 261 }, { "epoch": 0.93, "grad_norm": 0.5398200154304504, "learning_rate": 1e-05, "loss": 0.4321, "step": 262 }, { "epoch": 0.93, "grad_norm": 0.8473610281944275, "learning_rate": 1e-05, "loss": 0.3985, "step": 263 }, { "epoch": 0.94, "grad_norm": 0.8328168988227844, "learning_rate": 1e-05, "loss": 0.3997, "step": 264 }, { "epoch": 0.94, "grad_norm": 0.5752345323562622, "learning_rate": 1e-05, "loss": 0.4513, "step": 265 }, { "epoch": 0.94, "grad_norm": 0.45842844247817993, "learning_rate": 1e-05, "loss": 0.4268, "step": 266 }, { "epoch": 0.95, "grad_norm": 0.5272755026817322, "learning_rate": 1e-05, "loss": 0.4205, "step": 267 }, { "epoch": 0.95, "grad_norm": 0.6478350758552551, "learning_rate": 1e-05, "loss": 0.417, "step": 268 }, { "epoch": 0.96, "grad_norm": 0.4239070415496826, "learning_rate": 1e-05, "loss": 0.4369, "step": 269 }, { "epoch": 0.96, "grad_norm": 0.48916614055633545, "learning_rate": 1e-05, "loss": 0.443, "step": 270 }, { "epoch": 0.96, "grad_norm": 0.6267674565315247, "learning_rate": 1e-05, "loss": 0.4152, "step": 271 }, { "epoch": 0.97, "grad_norm": 0.5838894844055176, "learning_rate": 1e-05, "loss": 0.4177, "step": 272 }, { "epoch": 0.97, "grad_norm": 0.40867578983306885, "learning_rate": 1e-05, "loss": 0.445, "step": 273 }, { "epoch": 0.97, "grad_norm": 0.6627911925315857, "learning_rate": 1e-05, "loss": 0.4379, "step": 274 }, { "epoch": 0.98, "grad_norm": 0.5618704557418823, "learning_rate": 1e-05, "loss": 0.4188, "step": 275 }, { "epoch": 0.98, "grad_norm": 0.64666748046875, "learning_rate": 1e-05, "loss": 0.4159, "step": 276 }, { "epoch": 0.98, "grad_norm": 0.5590734481811523, "learning_rate": 1e-05, "loss": 0.4416, "step": 277 }, { "epoch": 0.99, "grad_norm": 0.4989932179450989, "learning_rate": 1e-05, "loss": 0.4428, "step": 278 }, { "epoch": 0.99, "grad_norm": 0.6639497876167297, "learning_rate": 1e-05, "loss": 0.4064, "step": 279 }, { "epoch": 0.99, "grad_norm": 0.6805359125137329, "learning_rate": 1e-05, "loss": 0.4119, "step": 280 }, { "epoch": 1.0, "grad_norm": 0.6433411836624146, "learning_rate": 1e-05, "loss": 0.4374, "step": 281 }, { "epoch": 1.0, "step": 281, "total_flos": 423312888037376.0, "train_loss": 0.44328434908517317, "train_runtime": 88939.3662, "train_samples_per_second": 0.051, "train_steps_per_second": 0.003 } ], "logging_steps": 1.0, "max_steps": 281, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 30, "total_flos": 423312888037376.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }