{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990627928772259, "eval_steps": 134, "global_step": 533, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4.000000000000001e-06, "loss": 3.2302, "step": 1 }, { "epoch": 0.0, "eval_loss": 3.179551601409912, "eval_runtime": 55.4081, "eval_samples_per_second": 24.329, "eval_steps_per_second": 1.029, "step": 1 }, { "epoch": 0.0, "learning_rate": 8.000000000000001e-06, "loss": 3.1024, "step": 2 }, { "epoch": 0.01, "learning_rate": 1.2e-05, "loss": 3.2198, "step": 3 }, { "epoch": 0.01, "learning_rate": 1.6000000000000003e-05, "loss": 3.0828, "step": 4 }, { "epoch": 0.01, "learning_rate": 2e-05, "loss": 2.9847, "step": 5 }, { "epoch": 0.01, "learning_rate": 2.4e-05, "loss": 2.7929, "step": 6 }, { "epoch": 0.01, "learning_rate": 2.8000000000000003e-05, "loss": 2.7878, "step": 7 }, { "epoch": 0.01, "learning_rate": 3.2000000000000005e-05, "loss": 2.6629, "step": 8 }, { "epoch": 0.02, "learning_rate": 3.6e-05, "loss": 2.6465, "step": 9 }, { "epoch": 0.02, "learning_rate": 4e-05, "loss": 2.5353, "step": 10 }, { "epoch": 0.02, "learning_rate": 4.4000000000000006e-05, "loss": 2.5154, "step": 11 }, { "epoch": 0.02, "learning_rate": 4.8e-05, "loss": 2.498, "step": 12 }, { "epoch": 0.02, "learning_rate": 5.2000000000000004e-05, "loss": 2.4848, "step": 13 }, { "epoch": 0.03, "learning_rate": 5.6000000000000006e-05, "loss": 2.4794, "step": 14 }, { "epoch": 0.03, "learning_rate": 6e-05, "loss": 2.4636, "step": 15 }, { "epoch": 0.03, "learning_rate": 6.400000000000001e-05, "loss": 2.4932, "step": 16 }, { "epoch": 0.03, "learning_rate": 6.800000000000001e-05, "loss": 2.4167, "step": 17 }, { "epoch": 0.03, "learning_rate": 7.2e-05, "loss": 2.4665, "step": 18 }, { "epoch": 0.04, "learning_rate": 7.6e-05, "loss": 2.3863, "step": 19 }, { "epoch": 0.04, "learning_rate": 8e-05, "loss": 2.4089, "step": 20 }, { "epoch": 0.04, "learning_rate": 8.4e-05, "loss": 2.4105, "step": 21 }, { "epoch": 0.04, "learning_rate": 8.800000000000001e-05, "loss": 2.4187, "step": 22 }, { "epoch": 0.04, "learning_rate": 9.200000000000001e-05, "loss": 2.3987, "step": 23 }, { "epoch": 0.04, "learning_rate": 9.6e-05, "loss": 2.3898, "step": 24 }, { "epoch": 0.05, "learning_rate": 0.0001, "loss": 2.4524, "step": 25 }, { "epoch": 0.05, "learning_rate": 0.00010400000000000001, "loss": 2.4081, "step": 26 }, { "epoch": 0.05, "learning_rate": 0.00010800000000000001, "loss": 2.4231, "step": 27 }, { "epoch": 0.05, "learning_rate": 0.00011200000000000001, "loss": 2.3854, "step": 28 }, { "epoch": 0.05, "learning_rate": 0.000116, "loss": 2.4511, "step": 29 }, { "epoch": 0.06, "learning_rate": 0.00012, "loss": 2.4137, "step": 30 }, { "epoch": 0.06, "learning_rate": 0.000124, "loss": 2.4189, "step": 31 }, { "epoch": 0.06, "learning_rate": 0.00012800000000000002, "loss": 2.4123, "step": 32 }, { "epoch": 0.06, "learning_rate": 0.000132, "loss": 2.3896, "step": 33 }, { "epoch": 0.06, "learning_rate": 0.00013600000000000003, "loss": 2.4448, "step": 34 }, { "epoch": 0.07, "learning_rate": 0.00014, "loss": 2.3936, "step": 35 }, { "epoch": 0.07, "learning_rate": 0.000144, "loss": 2.4505, "step": 36 }, { "epoch": 0.07, "learning_rate": 0.000148, "loss": 2.4321, "step": 37 }, { "epoch": 0.07, "learning_rate": 0.000152, "loss": 2.4273, "step": 38 }, { "epoch": 0.07, "learning_rate": 0.00015600000000000002, "loss": 2.4394, "step": 39 }, { "epoch": 0.07, "learning_rate": 0.00016, "loss": 2.4661, "step": 40 }, { "epoch": 0.08, "learning_rate": 0.000164, "loss": 2.4693, "step": 41 }, { "epoch": 0.08, "learning_rate": 0.000168, "loss": 2.4416, "step": 42 }, { "epoch": 0.08, "learning_rate": 0.000172, "loss": 2.4219, "step": 43 }, { "epoch": 0.08, "learning_rate": 0.00017600000000000002, "loss": 2.4929, "step": 44 }, { "epoch": 0.08, "learning_rate": 0.00018, "loss": 2.5098, "step": 45 }, { "epoch": 0.09, "learning_rate": 0.00018400000000000003, "loss": 2.5142, "step": 46 }, { "epoch": 0.09, "learning_rate": 0.000188, "loss": 2.4719, "step": 47 }, { "epoch": 0.09, "learning_rate": 0.000192, "loss": 2.4686, "step": 48 }, { "epoch": 0.09, "learning_rate": 0.000196, "loss": 2.4731, "step": 49 }, { "epoch": 0.09, "learning_rate": 0.0002, "loss": 2.4735, "step": 50 }, { "epoch": 0.1, "learning_rate": 0.00019999788469031855, "loss": 2.5597, "step": 51 }, { "epoch": 0.1, "learning_rate": 0.00019999153885076487, "loss": 2.4634, "step": 52 }, { "epoch": 0.1, "learning_rate": 0.00019998096274980728, "loss": 2.4727, "step": 53 }, { "epoch": 0.1, "learning_rate": 0.00019996615683488039, "loss": 2.4603, "step": 54 }, { "epoch": 0.1, "learning_rate": 0.00019994712173236604, "loss": 2.4999, "step": 55 }, { "epoch": 0.1, "learning_rate": 0.000199923858247567, "loss": 2.4886, "step": 56 }, { "epoch": 0.11, "learning_rate": 0.00019989636736467278, "loss": 2.5067, "step": 57 }, { "epoch": 0.11, "learning_rate": 0.00019986465024671794, "loss": 2.4929, "step": 58 }, { "epoch": 0.11, "learning_rate": 0.00019982870823553308, "loss": 2.4504, "step": 59 }, { "epoch": 0.11, "learning_rate": 0.00019978854285168784, "loss": 2.4733, "step": 60 }, { "epoch": 0.11, "learning_rate": 0.00019974415579442675, "loss": 2.5515, "step": 61 }, { "epoch": 0.12, "learning_rate": 0.00019969554894159723, "loss": 2.4798, "step": 62 }, { "epoch": 0.12, "learning_rate": 0.00019964272434957022, "loss": 2.5098, "step": 63 }, { "epoch": 0.12, "learning_rate": 0.00019958568425315314, "loss": 2.5603, "step": 64 }, { "epoch": 0.12, "learning_rate": 0.00019952443106549533, "loss": 2.68, "step": 65 }, { "epoch": 0.12, "learning_rate": 0.00019945896737798603, "loss": 2.7329, "step": 66 }, { "epoch": 0.13, "learning_rate": 0.0001993892959601447, "loss": 2.5781, "step": 67 }, { "epoch": 0.13, "learning_rate": 0.00019931541975950378, "loss": 2.5786, "step": 68 }, { "epoch": 0.13, "learning_rate": 0.00019923734190148418, "loss": 2.5429, "step": 69 }, { "epoch": 0.13, "learning_rate": 0.0001991550656892628, "loss": 2.4816, "step": 70 }, { "epoch": 0.13, "learning_rate": 0.00019906859460363307, "loss": 2.5149, "step": 71 }, { "epoch": 0.13, "learning_rate": 0.00019897793230285748, "loss": 2.5982, "step": 72 }, { "epoch": 0.14, "learning_rate": 0.00019888308262251285, "loss": 2.5852, "step": 73 }, { "epoch": 0.14, "learning_rate": 0.00019878404957532814, "loss": 2.5715, "step": 74 }, { "epoch": 0.14, "learning_rate": 0.00019868083735101463, "loss": 2.52, "step": 75 }, { "epoch": 0.14, "learning_rate": 0.00019857345031608868, "loss": 2.5562, "step": 76 }, { "epoch": 0.14, "learning_rate": 0.0001984618930136869, "loss": 2.4902, "step": 77 }, { "epoch": 0.15, "learning_rate": 0.0001983461701633742, "loss": 2.4734, "step": 78 }, { "epoch": 0.15, "learning_rate": 0.0001982262866609439, "loss": 2.4944, "step": 79 }, { "epoch": 0.15, "learning_rate": 0.00019810224757821064, "loss": 2.4315, "step": 80 }, { "epoch": 0.15, "learning_rate": 0.00019797405816279585, "loss": 2.4392, "step": 81 }, { "epoch": 0.15, "learning_rate": 0.00019784172383790582, "loss": 2.4658, "step": 82 }, { "epoch": 0.16, "learning_rate": 0.00019770525020210204, "loss": 2.4779, "step": 83 }, { "epoch": 0.16, "learning_rate": 0.00019756464302906465, "loss": 2.4132, "step": 84 }, { "epoch": 0.16, "learning_rate": 0.00019741990826734794, "loss": 2.4443, "step": 85 }, { "epoch": 0.16, "learning_rate": 0.0001972710520401287, "loss": 2.5367, "step": 86 }, { "epoch": 0.16, "learning_rate": 0.0001971180806449473, "loss": 2.4563, "step": 87 }, { "epoch": 0.16, "learning_rate": 0.00019696100055344124, "loss": 2.4693, "step": 88 }, { "epoch": 0.17, "learning_rate": 0.0001967998184110713, "loss": 2.4599, "step": 89 }, { "epoch": 0.17, "learning_rate": 0.0001966345410368404, "loss": 2.458, "step": 90 }, { "epoch": 0.17, "learning_rate": 0.00019646517542300514, "loss": 2.4395, "step": 91 }, { "epoch": 0.17, "learning_rate": 0.00019629172873477995, "loss": 2.453, "step": 92 }, { "epoch": 0.17, "learning_rate": 0.000196114208310034, "loss": 2.4638, "step": 93 }, { "epoch": 0.18, "learning_rate": 0.00019593262165898076, "loss": 2.4314, "step": 94 }, { "epoch": 0.18, "learning_rate": 0.00019574697646386027, "loss": 2.4776, "step": 95 }, { "epoch": 0.18, "learning_rate": 0.0001955572805786141, "loss": 2.4625, "step": 96 }, { "epoch": 0.18, "learning_rate": 0.00019536354202855308, "loss": 2.4451, "step": 97 }, { "epoch": 0.18, "learning_rate": 0.0001951657690100178, "loss": 2.4473, "step": 98 }, { "epoch": 0.19, "learning_rate": 0.00019496396989003193, "loss": 2.4928, "step": 99 }, { "epoch": 0.19, "learning_rate": 0.0001947581532059481, "loss": 2.4132, "step": 100 }, { "epoch": 0.19, "learning_rate": 0.0001945483276650868, "loss": 2.4591, "step": 101 }, { "epoch": 0.19, "learning_rate": 0.00019433450214436797, "loss": 2.465, "step": 102 }, { "epoch": 0.19, "learning_rate": 0.0001941166856899355, "loss": 2.3805, "step": 103 }, { "epoch": 0.19, "learning_rate": 0.0001938948875167745, "loss": 2.4012, "step": 104 }, { "epoch": 0.2, "learning_rate": 0.00019366911700832145, "loss": 2.3852, "step": 105 }, { "epoch": 0.2, "learning_rate": 0.00019343938371606712, "loss": 2.3642, "step": 106 }, { "epoch": 0.2, "learning_rate": 0.00019320569735915271, "loss": 2.39, "step": 107 }, { "epoch": 0.2, "learning_rate": 0.0001929680678239585, "loss": 2.4373, "step": 108 }, { "epoch": 0.2, "learning_rate": 0.0001927265051636856, "loss": 2.4666, "step": 109 }, { "epoch": 0.21, "learning_rate": 0.00019248101959793066, "loss": 2.4267, "step": 110 }, { "epoch": 0.21, "learning_rate": 0.0001922316215122536, "loss": 2.4158, "step": 111 }, { "epoch": 0.21, "learning_rate": 0.0001919783214577381, "loss": 2.3634, "step": 112 }, { "epoch": 0.21, "learning_rate": 0.00019172113015054532, "loss": 2.4179, "step": 113 }, { "epoch": 0.21, "learning_rate": 0.0001914600584714605, "loss": 2.364, "step": 114 }, { "epoch": 0.22, "learning_rate": 0.00019119511746543263, "loss": 2.3435, "step": 115 }, { "epoch": 0.22, "learning_rate": 0.00019092631834110723, "loss": 2.2741, "step": 116 }, { "epoch": 0.22, "learning_rate": 0.00019065367247035213, "loss": 2.3201, "step": 117 }, { "epoch": 0.22, "learning_rate": 0.00019037719138777627, "loss": 2.4475, "step": 118 }, { "epoch": 0.22, "learning_rate": 0.0001900968867902419, "loss": 2.3559, "step": 119 }, { "epoch": 0.22, "learning_rate": 0.0001898127705363696, "loss": 2.3547, "step": 120 }, { "epoch": 0.23, "learning_rate": 0.00018952485464603664, "loss": 2.2853, "step": 121 }, { "epoch": 0.23, "learning_rate": 0.00018923315129986835, "loss": 2.4317, "step": 122 }, { "epoch": 0.23, "learning_rate": 0.00018893767283872305, "loss": 2.3741, "step": 123 }, { "epoch": 0.23, "learning_rate": 0.0001886384317631697, "loss": 2.2959, "step": 124 }, { "epoch": 0.23, "learning_rate": 0.00018833544073295917, "loss": 2.3672, "step": 125 }, { "epoch": 0.24, "learning_rate": 0.00018802871256648872, "loss": 2.3465, "step": 126 }, { "epoch": 0.24, "learning_rate": 0.00018771826024025946, "loss": 2.2856, "step": 127 }, { "epoch": 0.24, "learning_rate": 0.00018740409688832764, "loss": 2.28, "step": 128 }, { "epoch": 0.24, "learning_rate": 0.00018708623580174889, "loss": 2.2592, "step": 129 }, { "epoch": 0.24, "learning_rate": 0.00018676469042801588, "loss": 2.3545, "step": 130 }, { "epoch": 0.25, "learning_rate": 0.00018643947437048944, "loss": 2.3505, "step": 131 }, { "epoch": 0.25, "learning_rate": 0.00018611060138782305, "loss": 2.3164, "step": 132 }, { "epoch": 0.25, "learning_rate": 0.00018577808539338087, "loss": 2.3547, "step": 133 }, { "epoch": 0.25, "learning_rate": 0.00018544194045464886, "loss": 2.3772, "step": 134 }, { "epoch": 0.25, "eval_loss": 2.3696775436401367, "eval_runtime": 55.6786, "eval_samples_per_second": 24.21, "eval_steps_per_second": 1.024, "step": 134 }, { "epoch": 0.25, "learning_rate": 0.00018510218079263995, "loss": 2.4287, "step": 135 }, { "epoch": 0.25, "learning_rate": 0.00018475882078129212, "loss": 2.3653, "step": 136 }, { "epoch": 0.26, "learning_rate": 0.00018441187494686053, "loss": 2.3392, "step": 137 }, { "epoch": 0.26, "learning_rate": 0.00018406135796730287, "loss": 2.3177, "step": 138 }, { "epoch": 0.26, "learning_rate": 0.00018370728467165828, "loss": 2.3777, "step": 139 }, { "epoch": 0.26, "learning_rate": 0.0001833496700394202, "loss": 2.3158, "step": 140 }, { "epoch": 0.26, "learning_rate": 0.00018298852919990252, "loss": 2.3367, "step": 141 }, { "epoch": 0.27, "learning_rate": 0.0001826238774315995, "loss": 2.231, "step": 142 }, { "epoch": 0.27, "learning_rate": 0.00018225573016153945, "loss": 2.2859, "step": 143 }, { "epoch": 0.27, "learning_rate": 0.0001818841029646321, "loss": 2.2594, "step": 144 }, { "epoch": 0.27, "learning_rate": 0.00018150901156300956, "loss": 2.2945, "step": 145 }, { "epoch": 0.27, "learning_rate": 0.00018113047182536127, "loss": 2.2277, "step": 146 }, { "epoch": 0.28, "learning_rate": 0.00018074849976626274, "loss": 2.285, "step": 147 }, { "epoch": 0.28, "learning_rate": 0.00018036311154549784, "loss": 2.287, "step": 148 }, { "epoch": 0.28, "learning_rate": 0.00017997432346737524, "loss": 2.2308, "step": 149 }, { "epoch": 0.28, "learning_rate": 0.00017958215198003865, "loss": 2.3425, "step": 150 }, { "epoch": 0.28, "learning_rate": 0.00017918661367477098, "loss": 2.1763, "step": 151 }, { "epoch": 0.28, "learning_rate": 0.00017878772528529232, "loss": 2.3044, "step": 152 }, { "epoch": 0.29, "learning_rate": 0.00017838550368705217, "loss": 2.1648, "step": 153 }, { "epoch": 0.29, "learning_rate": 0.0001779799658965153, "loss": 2.3134, "step": 154 }, { "epoch": 0.29, "learning_rate": 0.000177571129070442, "loss": 2.2027, "step": 155 }, { "epoch": 0.29, "learning_rate": 0.0001771590105051622, "loss": 2.2242, "step": 156 }, { "epoch": 0.29, "learning_rate": 0.00017674362763584375, "loss": 2.2897, "step": 157 }, { "epoch": 0.3, "learning_rate": 0.00017632499803575474, "loss": 2.1541, "step": 158 }, { "epoch": 0.3, "learning_rate": 0.00017590313941552002, "loss": 2.173, "step": 159 }, { "epoch": 0.3, "learning_rate": 0.0001754780696223722, "loss": 2.2662, "step": 160 }, { "epoch": 0.3, "learning_rate": 0.00017504980663939613, "loss": 2.2121, "step": 161 }, { "epoch": 0.3, "learning_rate": 0.00017461836858476856, "loss": 2.2703, "step": 162 }, { "epoch": 0.31, "learning_rate": 0.00017418377371099136, "loss": 2.2875, "step": 163 }, { "epoch": 0.31, "learning_rate": 0.00017374604040411935, "loss": 2.2159, "step": 164 }, { "epoch": 0.31, "learning_rate": 0.00017330518718298264, "loss": 2.2517, "step": 165 }, { "epoch": 0.31, "learning_rate": 0.00017286123269840293, "loss": 2.2079, "step": 166 }, { "epoch": 0.31, "learning_rate": 0.00017241419573240462, "loss": 2.1622, "step": 167 }, { "epoch": 0.31, "learning_rate": 0.0001719640951974202, "loss": 2.2229, "step": 168 }, { "epoch": 0.32, "learning_rate": 0.00017151095013548994, "loss": 2.2151, "step": 169 }, { "epoch": 0.32, "learning_rate": 0.00017105477971745666, "loss": 2.232, "step": 170 }, { "epoch": 0.32, "learning_rate": 0.00017059560324215436, "loss": 2.2971, "step": 171 }, { "epoch": 0.32, "learning_rate": 0.00017013344013559197, "loss": 2.1772, "step": 172 }, { "epoch": 0.32, "learning_rate": 0.00016966830995013133, "loss": 2.2384, "step": 173 }, { "epoch": 0.33, "learning_rate": 0.00016920023236366002, "loss": 2.1376, "step": 174 }, { "epoch": 0.33, "learning_rate": 0.00016872922717875923, "loss": 2.1521, "step": 175 }, { "epoch": 0.33, "learning_rate": 0.00016825531432186543, "loss": 2.2222, "step": 176 }, { "epoch": 0.33, "learning_rate": 0.00016777851384242766, "loss": 2.1104, "step": 177 }, { "epoch": 0.33, "learning_rate": 0.0001672988459120594, "loss": 2.2398, "step": 178 }, { "epoch": 0.34, "learning_rate": 0.00016681633082368498, "loss": 2.1505, "step": 179 }, { "epoch": 0.34, "learning_rate": 0.0001663309889906811, "loss": 2.1549, "step": 180 }, { "epoch": 0.34, "learning_rate": 0.0001658428409460134, "loss": 2.2301, "step": 181 }, { "epoch": 0.34, "learning_rate": 0.0001653519073413675, "loss": 2.1479, "step": 182 }, { "epoch": 0.34, "learning_rate": 0.0001648582089462756, "loss": 2.1095, "step": 183 }, { "epoch": 0.34, "learning_rate": 0.0001643617666472376, "loss": 2.1487, "step": 184 }, { "epoch": 0.35, "learning_rate": 0.00016386260144683745, "loss": 2.1334, "step": 185 }, { "epoch": 0.35, "learning_rate": 0.00016336073446285485, "loss": 2.0953, "step": 186 }, { "epoch": 0.35, "learning_rate": 0.00016285618692737157, "loss": 2.1386, "step": 187 }, { "epoch": 0.35, "learning_rate": 0.00016234898018587337, "loss": 2.1415, "step": 188 }, { "epoch": 0.35, "learning_rate": 0.0001618391356963468, "loss": 2.0492, "step": 189 }, { "epoch": 0.36, "learning_rate": 0.00016132667502837165, "loss": 2.056, "step": 190 }, { "epoch": 0.36, "learning_rate": 0.00016081161986220807, "loss": 2.2249, "step": 191 }, { "epoch": 0.36, "learning_rate": 0.00016029399198787974, "loss": 2.1115, "step": 192 }, { "epoch": 0.36, "learning_rate": 0.00015977381330425163, "loss": 2.1275, "step": 193 }, { "epoch": 0.36, "learning_rate": 0.00015925110581810394, "loss": 1.9615, "step": 194 }, { "epoch": 0.37, "learning_rate": 0.00015872589164320078, "loss": 2.1591, "step": 195 }, { "epoch": 0.37, "learning_rate": 0.0001581981929993547, "loss": 2.1233, "step": 196 }, { "epoch": 0.37, "learning_rate": 0.00015766803221148673, "loss": 2.2147, "step": 197 }, { "epoch": 0.37, "learning_rate": 0.0001571354317086818, "loss": 2.1625, "step": 198 }, { "epoch": 0.37, "learning_rate": 0.0001566004140232399, "loss": 2.0565, "step": 199 }, { "epoch": 0.37, "learning_rate": 0.00015606300178972287, "loss": 2.1389, "step": 200 }, { "epoch": 0.38, "learning_rate": 0.00015552321774399666, "loss": 2.126, "step": 201 }, { "epoch": 0.38, "learning_rate": 0.00015498108472226964, "loss": 2.1251, "step": 202 }, { "epoch": 0.38, "learning_rate": 0.00015443662566012645, "loss": 2.0343, "step": 203 }, { "epoch": 0.38, "learning_rate": 0.00015388986359155758, "loss": 2.076, "step": 204 }, { "epoch": 0.38, "learning_rate": 0.00015334082164798489, "loss": 1.9876, "step": 205 }, { "epoch": 0.39, "learning_rate": 0.00015278952305728324, "loss": 2.1316, "step": 206 }, { "epoch": 0.39, "learning_rate": 0.00015223599114279755, "loss": 2.06, "step": 207 }, { "epoch": 0.39, "learning_rate": 0.00015168024932235617, "loss": 2.0479, "step": 208 }, { "epoch": 0.39, "learning_rate": 0.00015112232110728015, "loss": 1.9749, "step": 209 }, { "epoch": 0.39, "learning_rate": 0.00015056223010138857, "loss": 2.1165, "step": 210 }, { "epoch": 0.4, "learning_rate": 0.00015000000000000001, "loss": 2.0656, "step": 211 }, { "epoch": 0.4, "learning_rate": 0.00014943565458893, "loss": 2.0241, "step": 212 }, { "epoch": 0.4, "learning_rate": 0.00014886921774348472, "loss": 2.0393, "step": 213 }, { "epoch": 0.4, "learning_rate": 0.00014830071342745112, "loss": 2.0376, "step": 214 }, { "epoch": 0.4, "learning_rate": 0.00014773016569208283, "loss": 2.0739, "step": 215 }, { "epoch": 0.4, "learning_rate": 0.0001471575986750828, "loss": 2.0254, "step": 216 }, { "epoch": 0.41, "learning_rate": 0.0001465830365995821, "loss": 2.0989, "step": 217 }, { "epoch": 0.41, "learning_rate": 0.00014600650377311522, "loss": 2.1056, "step": 218 }, { "epoch": 0.41, "learning_rate": 0.00014542802458659152, "loss": 2.0197, "step": 219 }, { "epoch": 0.41, "learning_rate": 0.00014484762351326343, "loss": 2.055, "step": 220 }, { "epoch": 0.41, "learning_rate": 0.0001442653251076912, "loss": 2.058, "step": 221 }, { "epoch": 0.42, "learning_rate": 0.00014368115400470392, "loss": 1.9918, "step": 222 }, { "epoch": 0.42, "learning_rate": 0.00014309513491835734, "loss": 2.0102, "step": 223 }, { "epoch": 0.42, "learning_rate": 0.00014250729264088843, "loss": 2.0356, "step": 224 }, { "epoch": 0.42, "learning_rate": 0.00014191765204166643, "loss": 2.0166, "step": 225 }, { "epoch": 0.42, "learning_rate": 0.00014132623806614063, "loss": 1.9309, "step": 226 }, { "epoch": 0.43, "learning_rate": 0.00014073307573478526, "loss": 2.0886, "step": 227 }, { "epoch": 0.43, "learning_rate": 0.00014013819014204075, "loss": 1.973, "step": 228 }, { "epoch": 0.43, "learning_rate": 0.00013954160645525217, "loss": 1.9996, "step": 229 }, { "epoch": 0.43, "learning_rate": 0.00013894334991360448, "loss": 1.9444, "step": 230 }, { "epoch": 0.43, "learning_rate": 0.00013834344582705474, "loss": 2.0583, "step": 231 }, { "epoch": 0.43, "learning_rate": 0.00013774191957526143, "loss": 1.9805, "step": 232 }, { "epoch": 0.44, "learning_rate": 0.00013713879660651068, "loss": 2.0694, "step": 233 }, { "epoch": 0.44, "learning_rate": 0.00013653410243663952, "loss": 2.0294, "step": 234 }, { "epoch": 0.44, "learning_rate": 0.00013592786264795658, "loss": 1.966, "step": 235 }, { "epoch": 0.44, "learning_rate": 0.0001353201028881598, "loss": 1.8961, "step": 236 }, { "epoch": 0.44, "learning_rate": 0.00013471084886925122, "loss": 2.0144, "step": 237 }, { "epoch": 0.45, "learning_rate": 0.00013410012636644935, "loss": 1.9803, "step": 238 }, { "epoch": 0.45, "learning_rate": 0.00013348796121709862, "loss": 1.984, "step": 239 }, { "epoch": 0.45, "learning_rate": 0.0001328743793195764, "loss": 2.011, "step": 240 }, { "epoch": 0.45, "learning_rate": 0.00013225940663219726, "loss": 1.9768, "step": 241 }, { "epoch": 0.45, "learning_rate": 0.00013164306917211476, "loss": 1.9221, "step": 242 }, { "epoch": 0.46, "learning_rate": 0.00013102539301422086, "loss": 1.9793, "step": 243 }, { "epoch": 0.46, "learning_rate": 0.00013040640429004267, "loss": 1.944, "step": 244 }, { "epoch": 0.46, "learning_rate": 0.000129786129186637, "loss": 1.8944, "step": 245 }, { "epoch": 0.46, "learning_rate": 0.0001291645939454825, "loss": 2.0456, "step": 246 }, { "epoch": 0.46, "learning_rate": 0.00012854182486136942, "loss": 1.992, "step": 247 }, { "epoch": 0.46, "learning_rate": 0.00012791784828128724, "loss": 1.9942, "step": 248 }, { "epoch": 0.47, "learning_rate": 0.00012729269060330999, "loss": 1.9879, "step": 249 }, { "epoch": 0.47, "learning_rate": 0.00012666637827547932, "loss": 1.9231, "step": 250 }, { "epoch": 0.47, "learning_rate": 0.00012603893779468604, "loss": 1.9368, "step": 251 }, { "epoch": 0.47, "learning_rate": 0.0001254103957055485, "loss": 1.856, "step": 252 }, { "epoch": 0.47, "learning_rate": 0.00012478077859929, "loss": 1.9358, "step": 253 }, { "epoch": 0.48, "learning_rate": 0.0001241501131126138, "loss": 1.995, "step": 254 }, { "epoch": 0.48, "learning_rate": 0.00012351842592657613, "loss": 1.8455, "step": 255 }, { "epoch": 0.48, "learning_rate": 0.00012288574376545733, "loss": 1.8912, "step": 256 }, { "epoch": 0.48, "learning_rate": 0.00012225209339563145, "loss": 1.9491, "step": 257 }, { "epoch": 0.48, "learning_rate": 0.00012161750162443371, "loss": 1.8341, "step": 258 }, { "epoch": 0.49, "learning_rate": 0.00012098199529902648, "loss": 1.8941, "step": 259 }, { "epoch": 0.49, "learning_rate": 0.0001203456013052634, "loss": 1.8808, "step": 260 }, { "epoch": 0.49, "learning_rate": 0.00011970834656655199, "loss": 1.8794, "step": 261 }, { "epoch": 0.49, "learning_rate": 0.00011907025804271461, "loss": 1.8828, "step": 262 }, { "epoch": 0.49, "learning_rate": 0.00011843136272884794, "loss": 1.818, "step": 263 }, { "epoch": 0.49, "learning_rate": 0.00011779168765418079, "loss": 1.847, "step": 264 }, { "epoch": 0.5, "learning_rate": 0.00011715125988093074, "loss": 1.8181, "step": 265 }, { "epoch": 0.5, "learning_rate": 0.00011651010650315923, "loss": 1.8297, "step": 266 }, { "epoch": 0.5, "learning_rate": 0.00011586825464562514, "loss": 1.9047, "step": 267 }, { "epoch": 0.5, "learning_rate": 0.00011522573146263744, "loss": 1.8538, "step": 268 }, { "epoch": 0.5, "eval_loss": 1.8599034547805786, "eval_runtime": 55.6724, "eval_samples_per_second": 24.213, "eval_steps_per_second": 1.024, "step": 268 }, { "epoch": 0.5, "learning_rate": 0.00011458256413690633, "loss": 2.0143, "step": 269 }, { "epoch": 0.51, "learning_rate": 0.00011393877987839327, "loss": 1.7532, "step": 270 }, { "epoch": 0.51, "learning_rate": 0.00011329440592315971, "loss": 1.825, "step": 271 }, { "epoch": 0.51, "learning_rate": 0.00011264946953221496, "loss": 1.7548, "step": 272 }, { "epoch": 0.51, "learning_rate": 0.00011200399799036288, "loss": 1.8505, "step": 273 }, { "epoch": 0.51, "learning_rate": 0.00011135801860504749, "loss": 1.8608, "step": 274 }, { "epoch": 0.52, "learning_rate": 0.00011071155870519777, "loss": 1.8352, "step": 275 }, { "epoch": 0.52, "learning_rate": 0.00011006464564007138, "loss": 1.9602, "step": 276 }, { "epoch": 0.52, "learning_rate": 0.00010941730677809772, "loss": 1.7833, "step": 277 }, { "epoch": 0.52, "learning_rate": 0.00010876956950572006, "loss": 1.8567, "step": 278 }, { "epoch": 0.52, "learning_rate": 0.00010812146122623683, "loss": 1.8606, "step": 279 }, { "epoch": 0.52, "learning_rate": 0.00010747300935864243, "loss": 1.7611, "step": 280 }, { "epoch": 0.53, "learning_rate": 0.0001068242413364671, "loss": 1.8188, "step": 281 }, { "epoch": 0.53, "learning_rate": 0.00010617518460661644, "loss": 1.7576, "step": 282 }, { "epoch": 0.53, "learning_rate": 0.00010552586662821009, "loss": 1.8525, "step": 283 }, { "epoch": 0.53, "learning_rate": 0.00010487631487142017, "loss": 1.7134, "step": 284 }, { "epoch": 0.53, "learning_rate": 0.00010422655681630917, "loss": 1.8414, "step": 285 }, { "epoch": 0.54, "learning_rate": 0.00010357661995166705, "loss": 1.815, "step": 286 }, { "epoch": 0.54, "learning_rate": 0.00010292653177384876, "loss": 1.7789, "step": 287 }, { "epoch": 0.54, "learning_rate": 0.00010227631978561056, "loss": 1.82, "step": 288 }, { "epoch": 0.54, "learning_rate": 0.00010162601149494676, "loss": 1.8095, "step": 289 }, { "epoch": 0.54, "learning_rate": 0.00010097563441392581, "loss": 1.7666, "step": 290 }, { "epoch": 0.55, "learning_rate": 0.00010032521605752646, "loss": 1.8287, "step": 291 }, { "epoch": 0.55, "learning_rate": 9.967478394247357e-05, "loss": 1.8818, "step": 292 }, { "epoch": 0.55, "learning_rate": 9.90243655860742e-05, "loss": 1.7085, "step": 293 }, { "epoch": 0.55, "learning_rate": 9.837398850505324e-05, "loss": 1.7111, "step": 294 }, { "epoch": 0.55, "learning_rate": 9.772368021438943e-05, "loss": 1.7282, "step": 295 }, { "epoch": 0.55, "learning_rate": 9.707346822615128e-05, "loss": 1.8105, "step": 296 }, { "epoch": 0.56, "learning_rate": 9.642338004833295e-05, "loss": 1.7592, "step": 297 }, { "epoch": 0.56, "learning_rate": 9.577344318369084e-05, "loss": 1.6769, "step": 298 }, { "epoch": 0.56, "learning_rate": 9.512368512857984e-05, "loss": 1.721, "step": 299 }, { "epoch": 0.56, "learning_rate": 9.447413337178995e-05, "loss": 1.6353, "step": 300 }, { "epoch": 0.56, "learning_rate": 9.38248153933836e-05, "loss": 1.654, "step": 301 }, { "epoch": 0.57, "learning_rate": 9.317575866353292e-05, "loss": 1.7222, "step": 302 }, { "epoch": 0.57, "learning_rate": 9.252699064135758e-05, "loss": 1.7727, "step": 303 }, { "epoch": 0.57, "learning_rate": 9.187853877376318e-05, "loss": 1.8161, "step": 304 }, { "epoch": 0.57, "learning_rate": 9.123043049427995e-05, "loss": 1.7088, "step": 305 }, { "epoch": 0.57, "learning_rate": 9.058269322190228e-05, "loss": 1.7274, "step": 306 }, { "epoch": 0.58, "learning_rate": 8.993535435992867e-05, "loss": 1.6949, "step": 307 }, { "epoch": 0.58, "learning_rate": 8.928844129480227e-05, "loss": 1.7429, "step": 308 }, { "epoch": 0.58, "learning_rate": 8.86419813949525e-05, "loss": 1.7443, "step": 309 }, { "epoch": 0.58, "learning_rate": 8.799600200963716e-05, "loss": 1.5959, "step": 310 }, { "epoch": 0.58, "learning_rate": 8.735053046778506e-05, "loss": 1.6557, "step": 311 }, { "epoch": 0.58, "learning_rate": 8.670559407684034e-05, "loss": 1.6743, "step": 312 }, { "epoch": 0.59, "learning_rate": 8.606122012160676e-05, "loss": 1.7672, "step": 313 }, { "epoch": 0.59, "learning_rate": 8.541743586309365e-05, "loss": 1.6774, "step": 314 }, { "epoch": 0.59, "learning_rate": 8.477426853736257e-05, "loss": 1.6597, "step": 315 }, { "epoch": 0.59, "learning_rate": 8.413174535437487e-05, "loss": 1.7112, "step": 316 }, { "epoch": 0.59, "learning_rate": 8.348989349684076e-05, "loss": 1.6499, "step": 317 }, { "epoch": 0.6, "learning_rate": 8.284874011906927e-05, "loss": 1.625, "step": 318 }, { "epoch": 0.6, "learning_rate": 8.220831234581922e-05, "loss": 1.6922, "step": 319 }, { "epoch": 0.6, "learning_rate": 8.156863727115211e-05, "loss": 1.6797, "step": 320 }, { "epoch": 0.6, "learning_rate": 8.092974195728542e-05, "loss": 1.6772, "step": 321 }, { "epoch": 0.6, "learning_rate": 8.029165343344805e-05, "loss": 1.6439, "step": 322 }, { "epoch": 0.61, "learning_rate": 7.965439869473664e-05, "loss": 1.6358, "step": 323 }, { "epoch": 0.61, "learning_rate": 7.901800470097355e-05, "loss": 1.6808, "step": 324 }, { "epoch": 0.61, "learning_rate": 7.838249837556629e-05, "loss": 1.6013, "step": 325 }, { "epoch": 0.61, "learning_rate": 7.774790660436858e-05, "loss": 1.6607, "step": 326 }, { "epoch": 0.61, "learning_rate": 7.711425623454267e-05, "loss": 1.7424, "step": 327 }, { "epoch": 0.61, "learning_rate": 7.648157407342386e-05, "loss": 1.6395, "step": 328 }, { "epoch": 0.62, "learning_rate": 7.584988688738622e-05, "loss": 1.7159, "step": 329 }, { "epoch": 0.62, "learning_rate": 7.521922140071002e-05, "loss": 1.57, "step": 330 }, { "epoch": 0.62, "learning_rate": 7.458960429445157e-05, "loss": 1.6616, "step": 331 }, { "epoch": 0.62, "learning_rate": 7.396106220531398e-05, "loss": 1.6996, "step": 332 }, { "epoch": 0.62, "learning_rate": 7.333362172452065e-05, "loss": 1.6087, "step": 333 }, { "epoch": 0.63, "learning_rate": 7.270730939669006e-05, "loss": 1.6516, "step": 334 }, { "epoch": 0.63, "learning_rate": 7.208215171871277e-05, "loss": 1.6155, "step": 335 }, { "epoch": 0.63, "learning_rate": 7.145817513863057e-05, "loss": 1.7189, "step": 336 }, { "epoch": 0.63, "learning_rate": 7.08354060545175e-05, "loss": 1.6217, "step": 337 }, { "epoch": 0.63, "learning_rate": 7.021387081336301e-05, "loss": 1.6072, "step": 338 }, { "epoch": 0.64, "learning_rate": 6.959359570995738e-05, "loss": 1.6163, "step": 339 }, { "epoch": 0.64, "learning_rate": 6.897460698577918e-05, "loss": 1.5596, "step": 340 }, { "epoch": 0.64, "learning_rate": 6.835693082788525e-05, "loss": 1.5035, "step": 341 }, { "epoch": 0.64, "learning_rate": 6.774059336780277e-05, "loss": 1.5525, "step": 342 }, { "epoch": 0.64, "learning_rate": 6.712562068042361e-05, "loss": 1.578, "step": 343 }, { "epoch": 0.64, "learning_rate": 6.651203878290139e-05, "loss": 1.5724, "step": 344 }, { "epoch": 0.65, "learning_rate": 6.589987363355068e-05, "loss": 1.6454, "step": 345 }, { "epoch": 0.65, "learning_rate": 6.528915113074879e-05, "loss": 1.5945, "step": 346 }, { "epoch": 0.65, "learning_rate": 6.46798971118402e-05, "loss": 1.5424, "step": 347 }, { "epoch": 0.65, "learning_rate": 6.407213735204343e-05, "loss": 1.6592, "step": 348 }, { "epoch": 0.65, "learning_rate": 6.34658975633605e-05, "loss": 1.6077, "step": 349 }, { "epoch": 0.66, "learning_rate": 6.286120339348935e-05, "loss": 1.5977, "step": 350 }, { "epoch": 0.66, "learning_rate": 6.225808042473858e-05, "loss": 1.6713, "step": 351 }, { "epoch": 0.66, "learning_rate": 6.165655417294527e-05, "loss": 1.4955, "step": 352 }, { "epoch": 0.66, "learning_rate": 6.105665008639557e-05, "loss": 1.5353, "step": 353 }, { "epoch": 0.66, "learning_rate": 6.045839354474786e-05, "loss": 1.4601, "step": 354 }, { "epoch": 0.67, "learning_rate": 5.986180985795926e-05, "loss": 1.5212, "step": 355 }, { "epoch": 0.67, "learning_rate": 5.926692426521474e-05, "loss": 1.5534, "step": 356 }, { "epoch": 0.67, "learning_rate": 5.867376193385936e-05, "loss": 1.5822, "step": 357 }, { "epoch": 0.67, "learning_rate": 5.8082347958333625e-05, "loss": 1.583, "step": 358 }, { "epoch": 0.67, "learning_rate": 5.749270735911158e-05, "loss": 1.484, "step": 359 }, { "epoch": 0.67, "learning_rate": 5.6904865081642676e-05, "loss": 1.404, "step": 360 }, { "epoch": 0.68, "learning_rate": 5.631884599529611e-05, "loss": 1.4449, "step": 361 }, { "epoch": 0.68, "learning_rate": 5.573467489230879e-05, "loss": 1.553, "step": 362 }, { "epoch": 0.68, "learning_rate": 5.515237648673656e-05, "loss": 1.4247, "step": 363 }, { "epoch": 0.68, "learning_rate": 5.457197541340853e-05, "loss": 1.4967, "step": 364 }, { "epoch": 0.68, "learning_rate": 5.399349622688479e-05, "loss": 1.5317, "step": 365 }, { "epoch": 0.69, "learning_rate": 5.3416963400417905e-05, "loss": 1.5611, "step": 366 }, { "epoch": 0.69, "learning_rate": 5.284240132491727e-05, "loss": 1.588, "step": 367 }, { "epoch": 0.69, "learning_rate": 5.226983430791722e-05, "loss": 1.5095, "step": 368 }, { "epoch": 0.69, "learning_rate": 5.16992865725489e-05, "loss": 1.4771, "step": 369 }, { "epoch": 0.69, "learning_rate": 5.113078225651529e-05, "loss": 1.4928, "step": 370 }, { "epoch": 0.7, "learning_rate": 5.0564345411070025e-05, "loss": 1.5018, "step": 371 }, { "epoch": 0.7, "learning_rate": 5.000000000000002e-05, "loss": 1.4768, "step": 372 }, { "epoch": 0.7, "learning_rate": 4.943776989861145e-05, "loss": 1.5867, "step": 373 }, { "epoch": 0.7, "learning_rate": 4.8877678892719866e-05, "loss": 1.4886, "step": 374 }, { "epoch": 0.7, "learning_rate": 4.831975067764387e-05, "loss": 1.5394, "step": 375 }, { "epoch": 0.7, "learning_rate": 4.7764008857202425e-05, "loss": 1.4141, "step": 376 }, { "epoch": 0.71, "learning_rate": 4.721047694271676e-05, "loss": 1.4668, "step": 377 }, { "epoch": 0.71, "learning_rate": 4.665917835201512e-05, "loss": 1.507, "step": 378 }, { "epoch": 0.71, "learning_rate": 4.611013640844245e-05, "loss": 1.4728, "step": 379 }, { "epoch": 0.71, "learning_rate": 4.556337433987359e-05, "loss": 1.4566, "step": 380 }, { "epoch": 0.71, "learning_rate": 4.501891527773038e-05, "loss": 1.4617, "step": 381 }, { "epoch": 0.72, "learning_rate": 4.447678225600337e-05, "loss": 1.4384, "step": 382 }, { "epoch": 0.72, "learning_rate": 4.393699821027716e-05, "loss": 1.5093, "step": 383 }, { "epoch": 0.72, "learning_rate": 4.3399585976760105e-05, "loss": 1.4602, "step": 384 }, { "epoch": 0.72, "learning_rate": 4.286456829131821e-05, "loss": 1.3591, "step": 385 }, { "epoch": 0.72, "learning_rate": 4.2331967788513295e-05, "loss": 1.4804, "step": 386 }, { "epoch": 0.73, "learning_rate": 4.180180700064531e-05, "loss": 1.4427, "step": 387 }, { "epoch": 0.73, "learning_rate": 4.127410835679926e-05, "loss": 1.3901, "step": 388 }, { "epoch": 0.73, "learning_rate": 4.074889418189608e-05, "loss": 1.4564, "step": 389 }, { "epoch": 0.73, "learning_rate": 4.022618669574839e-05, "loss": 1.4548, "step": 390 }, { "epoch": 0.73, "learning_rate": 3.97060080121203e-05, "loss": 1.4485, "step": 391 }, { "epoch": 0.73, "learning_rate": 3.9188380137791936e-05, "loss": 1.418, "step": 392 }, { "epoch": 0.74, "learning_rate": 3.8673324971628357e-05, "loss": 1.4664, "step": 393 }, { "epoch": 0.74, "learning_rate": 3.816086430365321e-05, "loss": 1.4593, "step": 394 }, { "epoch": 0.74, "learning_rate": 3.7651019814126654e-05, "loss": 1.4556, "step": 395 }, { "epoch": 0.74, "learning_rate": 3.7143813072628465e-05, "loss": 1.3472, "step": 396 }, { "epoch": 0.74, "learning_rate": 3.663926553714518e-05, "loss": 1.2962, "step": 397 }, { "epoch": 0.75, "learning_rate": 3.613739855316257e-05, "loss": 1.4875, "step": 398 }, { "epoch": 0.75, "learning_rate": 3.563823335276244e-05, "loss": 1.382, "step": 399 }, { "epoch": 0.75, "learning_rate": 3.5141791053724405e-05, "loss": 1.3698, "step": 400 }, { "epoch": 0.75, "learning_rate": 3.46480926586325e-05, "loss": 1.4564, "step": 401 }, { "epoch": 0.75, "learning_rate": 3.415715905398664e-05, "loss": 1.5117, "step": 402 }, { "epoch": 0.75, "eval_loss": 1.422906517982483, "eval_runtime": 55.7451, "eval_samples_per_second": 24.182, "eval_steps_per_second": 1.023, "step": 402 }, { "epoch": 0.76, "learning_rate": 3.366901100931892e-05, "loss": 1.407, "step": 403 }, { "epoch": 0.76, "learning_rate": 3.3183669176315045e-05, "loss": 1.3833, "step": 404 }, { "epoch": 0.76, "learning_rate": 3.27011540879406e-05, "loss": 1.4071, "step": 405 }, { "epoch": 0.76, "learning_rate": 3.2221486157572324e-05, "loss": 1.439, "step": 406 }, { "epoch": 0.76, "learning_rate": 3.174468567813461e-05, "loss": 1.4009, "step": 407 }, { "epoch": 0.76, "learning_rate": 3.1270772821240776e-05, "loss": 1.417, "step": 408 }, { "epoch": 0.77, "learning_rate": 3.079976763633996e-05, "loss": 1.2888, "step": 409 }, { "epoch": 0.77, "learning_rate": 3.033169004986873e-05, "loss": 1.475, "step": 410 }, { "epoch": 0.77, "learning_rate": 2.986655986440805e-05, "loss": 1.3512, "step": 411 }, { "epoch": 0.77, "learning_rate": 2.940439675784563e-05, "loss": 1.2825, "step": 412 }, { "epoch": 0.77, "learning_rate": 2.894522028254334e-05, "loss": 1.3424, "step": 413 }, { "epoch": 0.78, "learning_rate": 2.8489049864510054e-05, "loss": 1.391, "step": 414 }, { "epoch": 0.78, "learning_rate": 2.803590480257985e-05, "loss": 1.2999, "step": 415 }, { "epoch": 0.78, "learning_rate": 2.7585804267595384e-05, "loss": 1.39, "step": 416 }, { "epoch": 0.78, "learning_rate": 2.7138767301597067e-05, "loss": 1.3686, "step": 417 }, { "epoch": 0.78, "learning_rate": 2.669481281701739e-05, "loss": 1.3871, "step": 418 }, { "epoch": 0.79, "learning_rate": 2.6253959595880673e-05, "loss": 1.3755, "step": 419 }, { "epoch": 0.79, "learning_rate": 2.581622628900868e-05, "loss": 1.4435, "step": 420 }, { "epoch": 0.79, "learning_rate": 2.5381631415231454e-05, "loss": 1.3808, "step": 421 }, { "epoch": 0.79, "learning_rate": 2.495019336060387e-05, "loss": 1.4653, "step": 422 }, { "epoch": 0.79, "learning_rate": 2.4521930377627812e-05, "loss": 1.3653, "step": 423 }, { "epoch": 0.79, "learning_rate": 2.4096860584479974e-05, "loss": 1.4224, "step": 424 }, { "epoch": 0.8, "learning_rate": 2.367500196424529e-05, "loss": 1.3416, "step": 425 }, { "epoch": 0.8, "learning_rate": 2.3256372364156286e-05, "loss": 1.3177, "step": 426 }, { "epoch": 0.8, "learning_rate": 2.2840989494837793e-05, "loss": 1.403, "step": 427 }, { "epoch": 0.8, "learning_rate": 2.242887092955801e-05, "loss": 1.4582, "step": 428 }, { "epoch": 0.8, "learning_rate": 2.202003410348473e-05, "loss": 1.4202, "step": 429 }, { "epoch": 0.81, "learning_rate": 2.1614496312947852e-05, "loss": 1.3492, "step": 430 }, { "epoch": 0.81, "learning_rate": 2.121227471470768e-05, "loss": 1.201, "step": 431 }, { "epoch": 0.81, "learning_rate": 2.0813386325229055e-05, "loss": 1.3131, "step": 432 }, { "epoch": 0.81, "learning_rate": 2.0417848019961372e-05, "loss": 1.3579, "step": 433 }, { "epoch": 0.81, "learning_rate": 2.002567653262479e-05, "loss": 1.3943, "step": 434 }, { "epoch": 0.82, "learning_rate": 1.9636888454502178e-05, "loss": 1.4777, "step": 435 }, { "epoch": 0.82, "learning_rate": 1.925150023373726e-05, "loss": 1.4449, "step": 436 }, { "epoch": 0.82, "learning_rate": 1.8869528174638752e-05, "loss": 1.3711, "step": 437 }, { "epoch": 0.82, "learning_rate": 1.8490988436990488e-05, "loss": 1.2473, "step": 438 }, { "epoch": 0.82, "learning_rate": 1.8115897035367934e-05, "loss": 1.4266, "step": 439 }, { "epoch": 0.82, "learning_rate": 1.774426983846058e-05, "loss": 1.3646, "step": 440 }, { "epoch": 0.83, "learning_rate": 1.7376122568400532e-05, "loss": 1.2857, "step": 441 }, { "epoch": 0.83, "learning_rate": 1.7011470800097496e-05, "loss": 1.3349, "step": 442 }, { "epoch": 0.83, "learning_rate": 1.6650329960579792e-05, "loss": 1.3196, "step": 443 }, { "epoch": 0.83, "learning_rate": 1.6292715328341712e-05, "loss": 1.3533, "step": 444 }, { "epoch": 0.83, "learning_rate": 1.593864203269716e-05, "loss": 1.3276, "step": 445 }, { "epoch": 0.84, "learning_rate": 1.5588125053139468e-05, "loss": 1.434, "step": 446 }, { "epoch": 0.84, "learning_rate": 1.5241179218707891e-05, "loss": 1.4045, "step": 447 }, { "epoch": 0.84, "learning_rate": 1.4897819207360098e-05, "loss": 1.4106, "step": 448 }, { "epoch": 0.84, "learning_rate": 1.4558059545351143e-05, "loss": 1.3125, "step": 449 }, { "epoch": 0.84, "learning_rate": 1.4221914606619135e-05, "loss": 1.2878, "step": 450 }, { "epoch": 0.85, "learning_rate": 1.3889398612176941e-05, "loss": 1.3981, "step": 451 }, { "epoch": 0.85, "learning_rate": 1.3560525629510568e-05, "loss": 1.2914, "step": 452 }, { "epoch": 0.85, "learning_rate": 1.3235309571984156e-05, "loss": 1.2669, "step": 453 }, { "epoch": 0.85, "learning_rate": 1.2913764198251132e-05, "loss": 1.405, "step": 454 }, { "epoch": 0.85, "learning_rate": 1.259590311167238e-05, "loss": 1.3743, "step": 455 }, { "epoch": 0.85, "learning_rate": 1.2281739759740574e-05, "loss": 1.2633, "step": 456 }, { "epoch": 0.86, "learning_rate": 1.1971287433511313e-05, "loss": 1.2473, "step": 457 }, { "epoch": 0.86, "learning_rate": 1.166455926704082e-05, "loss": 1.366, "step": 458 }, { "epoch": 0.86, "learning_rate": 1.1361568236830323e-05, "loss": 1.286, "step": 459 }, { "epoch": 0.86, "learning_rate": 1.1062327161276963e-05, "loss": 1.3363, "step": 460 }, { "epoch": 0.86, "learning_rate": 1.0766848700131648e-05, "loss": 1.3854, "step": 461 }, { "epoch": 0.87, "learning_rate": 1.0475145353963389e-05, "loss": 1.4039, "step": 462 }, { "epoch": 0.87, "learning_rate": 1.01872294636304e-05, "loss": 1.446, "step": 463 }, { "epoch": 0.87, "learning_rate": 9.903113209758096e-06, "loss": 1.3371, "step": 464 }, { "epoch": 0.87, "learning_rate": 9.62280861222372e-06, "loss": 1.3936, "step": 465 }, { "epoch": 0.87, "learning_rate": 9.346327529647868e-06, "loss": 1.2979, "step": 466 }, { "epoch": 0.88, "learning_rate": 9.073681658892775e-06, "loss": 1.3019, "step": 467 }, { "epoch": 0.88, "learning_rate": 8.804882534567382e-06, "loss": 1.3869, "step": 468 }, { "epoch": 0.88, "learning_rate": 8.53994152853952e-06, "loss": 1.3959, "step": 469 }, { "epoch": 0.88, "learning_rate": 8.278869849454718e-06, "loss": 1.2561, "step": 470 }, { "epoch": 0.88, "learning_rate": 8.021678542261924e-06, "loss": 1.3396, "step": 471 }, { "epoch": 0.88, "learning_rate": 7.76837848774642e-06, "loss": 1.3443, "step": 472 }, { "epoch": 0.89, "learning_rate": 7.5189804020693536e-06, "loss": 1.3167, "step": 473 }, { "epoch": 0.89, "learning_rate": 7.2734948363144206e-06, "loss": 1.2964, "step": 474 }, { "epoch": 0.89, "learning_rate": 7.031932176041523e-06, "loss": 1.3125, "step": 475 }, { "epoch": 0.89, "learning_rate": 6.794302640847294e-06, "loss": 1.2952, "step": 476 }, { "epoch": 0.89, "learning_rate": 6.560616283932897e-06, "loss": 1.2655, "step": 477 }, { "epoch": 0.9, "learning_rate": 6.330882991678577e-06, "loss": 1.3877, "step": 478 }, { "epoch": 0.9, "learning_rate": 6.1051124832254944e-06, "loss": 1.33, "step": 479 }, { "epoch": 0.9, "learning_rate": 5.883314310064492e-06, "loss": 1.3948, "step": 480 }, { "epoch": 0.9, "learning_rate": 5.6654978556320405e-06, "loss": 1.3569, "step": 481 }, { "epoch": 0.9, "learning_rate": 5.451672334913216e-06, "loss": 1.2655, "step": 482 }, { "epoch": 0.91, "learning_rate": 5.2418467940519185e-06, "loss": 1.3639, "step": 483 }, { "epoch": 0.91, "learning_rate": 5.036030109968082e-06, "loss": 1.427, "step": 484 }, { "epoch": 0.91, "learning_rate": 4.834230989982213e-06, "loss": 1.3306, "step": 485 }, { "epoch": 0.91, "learning_rate": 4.63645797144695e-06, "loss": 1.3196, "step": 486 }, { "epoch": 0.91, "learning_rate": 4.442719421385922e-06, "loss": 1.2518, "step": 487 }, { "epoch": 0.91, "learning_rate": 4.253023536139733e-06, "loss": 1.2918, "step": 488 }, { "epoch": 0.92, "learning_rate": 4.067378341019257e-06, "loss": 1.3128, "step": 489 }, { "epoch": 0.92, "learning_rate": 3.885791689966023e-06, "loss": 1.3452, "step": 490 }, { "epoch": 0.92, "learning_rate": 3.7082712652200867e-06, "loss": 1.4055, "step": 491 }, { "epoch": 0.92, "learning_rate": 3.5348245769948773e-06, "loss": 1.2194, "step": 492 }, { "epoch": 0.92, "learning_rate": 3.3654589631595955e-06, "loss": 1.338, "step": 493 }, { "epoch": 0.93, "learning_rate": 3.2001815889286856e-06, "loss": 1.158, "step": 494 }, { "epoch": 0.93, "learning_rate": 3.0389994465587545e-06, "loss": 1.3595, "step": 495 }, { "epoch": 0.93, "learning_rate": 2.881919355052709e-06, "loss": 1.2421, "step": 496 }, { "epoch": 0.93, "learning_rate": 2.728947959871353e-06, "loss": 1.3172, "step": 497 }, { "epoch": 0.93, "learning_rate": 2.580091732652101e-06, "loss": 1.2558, "step": 498 }, { "epoch": 0.94, "learning_rate": 2.435356970935354e-06, "loss": 1.2086, "step": 499 }, { "epoch": 0.94, "learning_rate": 2.294749797897955e-06, "loss": 1.3303, "step": 500 }, { "epoch": 0.94, "learning_rate": 2.158276162094197e-06, "loss": 1.3262, "step": 501 }, { "epoch": 0.94, "learning_rate": 2.0259418372041485e-06, "loss": 1.3066, "step": 502 }, { "epoch": 0.94, "learning_rate": 1.8977524217893783e-06, "loss": 1.3071, "step": 503 }, { "epoch": 0.94, "learning_rate": 1.7737133390561046e-06, "loss": 1.338, "step": 504 }, { "epoch": 0.95, "learning_rate": 1.6538298366257976e-06, "loss": 1.3407, "step": 505 }, { "epoch": 0.95, "learning_rate": 1.5381069863131037e-06, "loss": 1.4355, "step": 506 }, { "epoch": 0.95, "learning_rate": 1.426549683911349e-06, "loss": 1.341, "step": 507 }, { "epoch": 0.95, "learning_rate": 1.3191626489853615e-06, "loss": 1.3546, "step": 508 }, { "epoch": 0.95, "learning_rate": 1.2159504246718522e-06, "loss": 1.3322, "step": 509 }, { "epoch": 0.96, "learning_rate": 1.1169173774871478e-06, "loss": 1.3374, "step": 510 }, { "epoch": 0.96, "learning_rate": 1.0220676971425257e-06, "loss": 1.3706, "step": 511 }, { "epoch": 0.96, "learning_rate": 9.314053963669245e-07, "loss": 1.3249, "step": 512 }, { "epoch": 0.96, "learning_rate": 8.44934310737211e-07, "loss": 1.4235, "step": 513 }, { "epoch": 0.96, "learning_rate": 7.62658098515856e-07, "loss": 1.2759, "step": 514 }, { "epoch": 0.97, "learning_rate": 6.845802404962243e-07, "loss": 1.342, "step": 515 }, { "epoch": 0.97, "learning_rate": 6.107040398553144e-07, "loss": 1.4004, "step": 516 }, { "epoch": 0.97, "learning_rate": 5.410326220139705e-07, "loss": 1.3671, "step": 517 }, { "epoch": 0.97, "learning_rate": 4.7556893450466653e-07, "loss": 1.2746, "step": 518 }, { "epoch": 0.97, "learning_rate": 4.143157468468717e-07, "loss": 1.3585, "step": 519 }, { "epoch": 0.97, "learning_rate": 3.5727565042978915e-07, "loss": 1.3258, "step": 520 }, { "epoch": 0.98, "learning_rate": 3.044510584027771e-07, "loss": 1.2909, "step": 521 }, { "epoch": 0.98, "learning_rate": 2.558442055732524e-07, "loss": 1.2929, "step": 522 }, { "epoch": 0.98, "learning_rate": 2.1145714831216634e-07, "loss": 1.389, "step": 523 }, { "epoch": 0.98, "learning_rate": 1.7129176446692984e-07, "loss": 1.4132, "step": 524 }, { "epoch": 0.98, "learning_rate": 1.3534975328205467e-07, "loss": 1.2848, "step": 525 }, { "epoch": 0.99, "learning_rate": 1.0363263532724432e-07, "loss": 1.3483, "step": 526 }, { "epoch": 0.99, "learning_rate": 7.614175243301213e-08, "loss": 1.3588, "step": 527 }, { "epoch": 0.99, "learning_rate": 5.287826763398229e-08, "loss": 1.3012, "step": 528 }, { "epoch": 0.99, "learning_rate": 3.384316511964025e-08, "loss": 1.3524, "step": 529 }, { "epoch": 0.99, "learning_rate": 1.9037250192732726e-08, "loss": 1.3232, "step": 530 }, { "epoch": 1.0, "learning_rate": 8.46114923513941e-09, "loss": 1.3092, "step": 531 }, { "epoch": 1.0, "learning_rate": 2.1153096814607865e-09, "loss": 1.2231, "step": 532 }, { "epoch": 1.0, "learning_rate": 0.0, "loss": 1.4531, "step": 533 } ], "logging_steps": 1, "max_steps": 533, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 4.023661144257331e+17, "train_batch_size": 24, "trial_name": null, "trial_params": null }