{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 371840, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "learning_rate": 3.125e-05, "loss": 6.2298, "step": 1000 }, { "epoch": 0.11, "learning_rate": 6.25e-05, "loss": 5.0094, "step": 2000 }, { "epoch": 0.16, "learning_rate": 9.375e-05, "loss": 4.6802, "step": 3000 }, { "epoch": 0.22, "learning_rate": 0.000125, "loss": 4.4542, "step": 4000 }, { "epoch": 0.27, "learning_rate": 0.00015625, "loss": 4.2925, "step": 5000 }, { "epoch": 0.32, "learning_rate": 0.0001875, "loss": 4.1748, "step": 6000 }, { "epoch": 0.38, "learning_rate": 0.00021875, "loss": 4.0701, "step": 7000 }, { "epoch": 0.43, "learning_rate": 0.00025, "loss": 3.9851, "step": 8000 }, { "epoch": 0.48, "learning_rate": 0.00028121875, "loss": 3.913, "step": 9000 }, { "epoch": 0.54, "learning_rate": 0.00031246875000000003, "loss": 3.8497, "step": 10000 }, { "epoch": 0.59, "learning_rate": 0.00034368749999999997, "loss": 3.8077, "step": 11000 }, { "epoch": 0.65, "learning_rate": 0.0003749375, "loss": 3.7627, "step": 12000 }, { "epoch": 0.7, "learning_rate": 0.00040615625, "loss": 3.7397, "step": 13000 }, { "epoch": 0.75, "learning_rate": 0.00043737500000000005, "loss": 3.701, "step": 14000 }, { "epoch": 0.81, "learning_rate": 0.000468625, "loss": 3.6693, "step": 15000 }, { "epoch": 0.86, "learning_rate": 0.00049984375, "loss": 3.6554, "step": 16000 }, { "epoch": 0.91, "learning_rate": 0.00053109375, "loss": 3.6301, "step": 17000 }, { "epoch": 0.97, "learning_rate": 0.00056234375, "loss": 3.6072, "step": 18000 }, { "epoch": 1.0, "eval_accuracy": 0.3577459544837306, "eval_loss": 3.797168016433716, "eval_runtime": 152.3829, "eval_samples_per_second": 380.089, "eval_steps_per_second": 5.939, "step": 18592 }, { "epoch": 1.02, "learning_rate": 0.0005935625, "loss": 3.5799, "step": 19000 }, { "epoch": 1.08, "learning_rate": 0.0006248125, "loss": 3.5485, "step": 20000 }, { "epoch": 1.13, "learning_rate": 0.0006560625, "loss": 3.5406, "step": 21000 }, { "epoch": 1.18, "learning_rate": 0.00068728125, "loss": 3.5264, "step": 22000 }, { "epoch": 1.24, "learning_rate": 0.00071853125, "loss": 3.518, "step": 23000 }, { "epoch": 1.29, "learning_rate": 0.00074978125, "loss": 3.5133, "step": 24000 }, { "epoch": 1.34, "learning_rate": 0.0007810312499999999, "loss": 3.4955, "step": 25000 }, { "epoch": 1.4, "learning_rate": 0.00081225, "loss": 3.4875, "step": 26000 }, { "epoch": 1.45, "learning_rate": 0.0008435000000000001, "loss": 3.4788, "step": 27000 }, { "epoch": 1.51, "learning_rate": 0.00087471875, "loss": 3.4699, "step": 28000 }, { "epoch": 1.56, "learning_rate": 0.00090596875, "loss": 3.4636, "step": 29000 }, { "epoch": 1.61, "learning_rate": 0.00093721875, "loss": 3.4553, "step": 30000 }, { "epoch": 1.67, "learning_rate": 0.0009684375, "loss": 3.4444, "step": 31000 }, { "epoch": 1.72, "learning_rate": 0.0009996875, "loss": 3.437, "step": 32000 }, { "epoch": 1.77, "learning_rate": 0.0009970868644067796, "loss": 3.4315, "step": 33000 }, { "epoch": 1.83, "learning_rate": 0.0009941443032015066, "loss": 3.4193, "step": 34000 }, { "epoch": 1.88, "learning_rate": 0.0009912017419962335, "loss": 3.4093, "step": 35000 }, { "epoch": 1.94, "learning_rate": 0.0009882621233521657, "loss": 3.3989, "step": 36000 }, { "epoch": 1.99, "learning_rate": 0.0009853225047080979, "loss": 3.3863, "step": 37000 }, { "epoch": 2.0, "eval_accuracy": 0.38044785878682175, "eval_loss": 3.5804407596588135, "eval_runtime": 153.7328, "eval_samples_per_second": 376.751, "eval_steps_per_second": 5.887, "step": 37184 }, { "epoch": 2.04, "learning_rate": 0.000982379943502825, "loss": 3.3396, "step": 38000 }, { "epoch": 2.1, "learning_rate": 0.0009794373822975518, "loss": 3.328, "step": 39000 }, { "epoch": 2.15, "learning_rate": 0.000976497763653484, "loss": 3.3247, "step": 40000 }, { "epoch": 2.21, "learning_rate": 0.0009735552024482109, "loss": 3.3252, "step": 41000 }, { "epoch": 2.26, "learning_rate": 0.0009706155838041431, "loss": 3.3166, "step": 42000 }, { "epoch": 2.31, "learning_rate": 0.00096767302259887, "loss": 3.3188, "step": 43000 }, { "epoch": 2.37, "learning_rate": 0.0009647334039548023, "loss": 3.3129, "step": 44000 }, { "epoch": 2.42, "learning_rate": 0.0009617908427495293, "loss": 3.304, "step": 45000 }, { "epoch": 2.47, "learning_rate": 0.0009588482815442562, "loss": 3.2975, "step": 46000 }, { "epoch": 2.53, "learning_rate": 0.0009559086629001883, "loss": 3.2928, "step": 47000 }, { "epoch": 2.58, "learning_rate": 0.0009529661016949153, "loss": 3.2871, "step": 48000 }, { "epoch": 2.64, "learning_rate": 0.0009500235404896422, "loss": 3.2821, "step": 49000 }, { "epoch": 2.69, "learning_rate": 0.0009470809792843691, "loss": 3.2765, "step": 50000 }, { "epoch": 2.74, "learning_rate": 0.0009441413606403013, "loss": 3.2783, "step": 51000 }, { "epoch": 2.8, "learning_rate": 0.0009411987994350282, "loss": 3.2737, "step": 52000 }, { "epoch": 2.85, "learning_rate": 0.0009382591807909605, "loss": 3.2713, "step": 53000 }, { "epoch": 2.9, "learning_rate": 0.0009353166195856874, "loss": 3.2633, "step": 54000 }, { "epoch": 2.96, "learning_rate": 0.0009323770009416196, "loss": 3.2556, "step": 55000 }, { "epoch": 3.0, "eval_accuracy": 0.3910322096812649, "eval_loss": 3.472982168197632, "eval_runtime": 154.4664, "eval_samples_per_second": 374.962, "eval_steps_per_second": 5.859, "step": 55776 }, { "epoch": 3.01, "learning_rate": 0.0009294344397363465, "loss": 3.2424, "step": 56000 }, { "epoch": 3.07, "learning_rate": 0.0009264918785310734, "loss": 3.1944, "step": 57000 }, { "epoch": 3.12, "learning_rate": 0.0009235493173258004, "loss": 3.1947, "step": 58000 }, { "epoch": 3.17, "learning_rate": 0.0009206096986817327, "loss": 3.1979, "step": 59000 }, { "epoch": 3.23, "learning_rate": 0.0009176671374764596, "loss": 3.1974, "step": 60000 }, { "epoch": 3.28, "learning_rate": 0.0009147245762711865, "loss": 3.1978, "step": 61000 }, { "epoch": 3.33, "learning_rate": 0.0009117849576271187, "loss": 3.196, "step": 62000 }, { "epoch": 3.39, "learning_rate": 0.0009088423964218456, "loss": 3.1975, "step": 63000 }, { "epoch": 3.44, "learning_rate": 0.0009059057203389832, "loss": 3.1989, "step": 64000 }, { "epoch": 3.5, "learning_rate": 0.00090296315913371, "loss": 3.2003, "step": 65000 }, { "epoch": 3.55, "learning_rate": 0.000900020597928437, "loss": 3.1914, "step": 66000 }, { "epoch": 3.6, "learning_rate": 0.0008970809792843692, "loss": 3.1914, "step": 67000 }, { "epoch": 3.66, "learning_rate": 0.0008941413606403014, "loss": 3.1919, "step": 68000 }, { "epoch": 3.71, "learning_rate": 0.0008911987994350282, "loss": 3.1876, "step": 69000 }, { "epoch": 3.77, "learning_rate": 0.0008882562382297552, "loss": 3.1891, "step": 70000 }, { "epoch": 3.82, "learning_rate": 0.0008853136770244821, "loss": 3.1853, "step": 71000 }, { "epoch": 3.87, "learning_rate": 0.000882371115819209, "loss": 3.184, "step": 72000 }, { "epoch": 3.93, "learning_rate": 0.000879428554613936, "loss": 3.1845, "step": 73000 }, { "epoch": 3.98, "learning_rate": 0.0008764889359698683, "loss": 3.1829, "step": 74000 }, { "epoch": 4.0, "eval_accuracy": 0.3992117530567142, "eval_loss": 3.4019179344177246, "eval_runtime": 153.2776, "eval_samples_per_second": 377.87, "eval_steps_per_second": 5.904, "step": 74368 }, { "epoch": 4.03, "learning_rate": 0.0008735463747645951, "loss": 3.1352, "step": 75000 }, { "epoch": 4.09, "learning_rate": 0.0008706067561205273, "loss": 3.1132, "step": 76000 }, { "epoch": 4.14, "learning_rate": 0.0008676641949152543, "loss": 3.1218, "step": 77000 }, { "epoch": 4.2, "learning_rate": 0.0008647216337099811, "loss": 3.1223, "step": 78000 }, { "epoch": 4.25, "learning_rate": 0.0008617790725047081, "loss": 3.1286, "step": 79000 }, { "epoch": 4.3, "learning_rate": 0.0008588394538606403, "loss": 3.1235, "step": 80000 }, { "epoch": 4.36, "learning_rate": 0.0008558998352165726, "loss": 3.1243, "step": 81000 }, { "epoch": 4.41, "learning_rate": 0.0008529572740112994, "loss": 3.13, "step": 82000 }, { "epoch": 4.46, "learning_rate": 0.0008500147128060264, "loss": 3.1321, "step": 83000 }, { "epoch": 4.52, "learning_rate": 0.0008470750941619586, "loss": 3.1309, "step": 84000 }, { "epoch": 4.57, "learning_rate": 0.0008441325329566854, "loss": 3.132, "step": 85000 }, { "epoch": 4.63, "learning_rate": 0.0008411899717514124, "loss": 3.1335, "step": 86000 }, { "epoch": 4.68, "learning_rate": 0.0008382474105461394, "loss": 3.1249, "step": 87000 }, { "epoch": 4.73, "learning_rate": 0.0008353077919020717, "loss": 3.125, "step": 88000 }, { "epoch": 4.79, "learning_rate": 0.0008323652306967985, "loss": 3.13, "step": 89000 }, { "epoch": 4.84, "learning_rate": 0.0008294226694915255, "loss": 3.1299, "step": 90000 }, { "epoch": 4.89, "learning_rate": 0.0008264801082862525, "loss": 3.1265, "step": 91000 }, { "epoch": 4.95, "learning_rate": 0.0008235404896421845, "loss": 3.1264, "step": 92000 }, { "epoch": 5.0, "eval_accuracy": 0.4020474588723565, "eval_loss": 3.382812976837158, "eval_runtime": 153.1227, "eval_samples_per_second": 378.252, "eval_steps_per_second": 5.91, "step": 92960 }, { "epoch": 5.0, "learning_rate": 0.0008205979284369115, "loss": 3.1204, "step": 93000 }, { "epoch": 5.06, "learning_rate": 0.0008176553672316385, "loss": 3.0543, "step": 94000 }, { "epoch": 5.11, "learning_rate": 0.0008147157485875707, "loss": 3.0628, "step": 95000 }, { "epoch": 5.16, "learning_rate": 0.0008117761299435028, "loss": 3.067, "step": 96000 }, { "epoch": 5.22, "learning_rate": 0.0008088335687382298, "loss": 3.0676, "step": 97000 }, { "epoch": 5.27, "learning_rate": 0.0008058910075329568, "loss": 3.0752, "step": 98000 }, { "epoch": 5.32, "learning_rate": 0.0008029484463276836, "loss": 3.0826, "step": 99000 }, { "epoch": 5.38, "learning_rate": 0.0008000088276836158, "loss": 3.0768, "step": 100000 }, { "epoch": 5.43, "learning_rate": 0.0007970662664783428, "loss": 3.0752, "step": 101000 }, { "epoch": 5.49, "learning_rate": 0.000794126647834275, "loss": 3.0835, "step": 102000 }, { "epoch": 5.54, "learning_rate": 0.0007911840866290018, "loss": 3.0767, "step": 103000 }, { "epoch": 5.59, "learning_rate": 0.0007882415254237288, "loss": 3.0816, "step": 104000 }, { "epoch": 5.65, "learning_rate": 0.0007852989642184559, "loss": 3.0783, "step": 105000 }, { "epoch": 5.7, "learning_rate": 0.0007823564030131827, "loss": 3.0838, "step": 106000 }, { "epoch": 5.76, "learning_rate": 0.0007794138418079097, "loss": 3.0834, "step": 107000 }, { "epoch": 5.81, "learning_rate": 0.0007764712806026365, "loss": 3.0844, "step": 108000 }, { "epoch": 5.86, "learning_rate": 0.0007735316619585687, "loss": 3.0811, "step": 109000 }, { "epoch": 5.92, "learning_rate": 0.0007705891007532957, "loss": 3.0826, "step": 110000 }, { "epoch": 5.97, "learning_rate": 0.0007676465395480227, "loss": 3.0827, "step": 111000 }, { "epoch": 6.0, "eval_accuracy": 0.40310327390667616, "eval_loss": 3.3848841190338135, "eval_runtime": 153.1292, "eval_samples_per_second": 378.236, "eval_steps_per_second": 5.91, "step": 111552 }, { "epoch": 6.02, "learning_rate": 0.0007647098634651602, "loss": 3.0519, "step": 112000 }, { "epoch": 6.08, "learning_rate": 0.000761767302259887, "loss": 3.0168, "step": 113000 }, { "epoch": 6.13, "learning_rate": 0.000758824741054614, "loss": 3.0242, "step": 114000 }, { "epoch": 6.19, "learning_rate": 0.000755882179849341, "loss": 3.0258, "step": 115000 }, { "epoch": 6.24, "learning_rate": 0.0007529396186440678, "loss": 3.0341, "step": 116000 }, { "epoch": 6.29, "learning_rate": 0.00075, "loss": 3.0319, "step": 117000 }, { "epoch": 6.35, "learning_rate": 0.000747057438794727, "loss": 3.0359, "step": 118000 }, { "epoch": 6.4, "learning_rate": 0.0007441178201506591, "loss": 3.0377, "step": 119000 }, { "epoch": 6.45, "learning_rate": 0.000741175258945386, "loss": 3.0378, "step": 120000 }, { "epoch": 6.51, "learning_rate": 0.000738232697740113, "loss": 3.0402, "step": 121000 }, { "epoch": 6.56, "learning_rate": 0.0007352901365348398, "loss": 3.046, "step": 122000 }, { "epoch": 6.62, "learning_rate": 0.0007323505178907721, "loss": 3.0444, "step": 123000 }, { "epoch": 6.67, "learning_rate": 0.0007294079566854991, "loss": 3.0424, "step": 124000 }, { "epoch": 6.72, "learning_rate": 0.0007264712806026365, "loss": 3.0428, "step": 125000 }, { "epoch": 6.78, "learning_rate": 0.0007235316619585688, "loss": 3.0425, "step": 126000 }, { "epoch": 6.83, "learning_rate": 0.0007205891007532957, "loss": 3.0448, "step": 127000 }, { "epoch": 6.88, "learning_rate": 0.0007176465395480226, "loss": 3.049, "step": 128000 }, { "epoch": 6.94, "learning_rate": 0.0007147039783427496, "loss": 3.0425, "step": 129000 }, { "epoch": 6.99, "learning_rate": 0.0007117614171374765, "loss": 3.0461, "step": 130000 }, { "epoch": 7.0, "eval_accuracy": 0.4050249835692312, "eval_loss": 3.3727521896362305, "eval_runtime": 153.1303, "eval_samples_per_second": 378.233, "eval_steps_per_second": 5.91, "step": 130144 }, { "epoch": 7.05, "learning_rate": 0.0007088188559322034, "loss": 2.9834, "step": 131000 }, { "epoch": 7.1, "learning_rate": 0.0007058792372881356, "loss": 2.9854, "step": 132000 }, { "epoch": 7.15, "learning_rate": 0.0007029366760828625, "loss": 2.9855, "step": 133000 }, { "epoch": 7.21, "learning_rate": 0.0006999970574387948, "loss": 2.9948, "step": 134000 }, { "epoch": 7.26, "learning_rate": 0.0006970544962335217, "loss": 2.9986, "step": 135000 }, { "epoch": 7.31, "learning_rate": 0.0006941148775894539, "loss": 3.0007, "step": 136000 }, { "epoch": 7.37, "learning_rate": 0.0006911723163841808, "loss": 3.0032, "step": 137000 }, { "epoch": 7.42, "learning_rate": 0.0006882297551789077, "loss": 3.0065, "step": 138000 }, { "epoch": 7.48, "learning_rate": 0.0006852871939736347, "loss": 3.0083, "step": 139000 }, { "epoch": 7.53, "learning_rate": 0.0006823505178907722, "loss": 3.0086, "step": 140000 }, { "epoch": 7.58, "learning_rate": 0.0006794079566854991, "loss": 3.0111, "step": 141000 }, { "epoch": 7.64, "learning_rate": 0.000676465395480226, "loss": 3.0091, "step": 142000 }, { "epoch": 7.69, "learning_rate": 0.000673522834274953, "loss": 3.0138, "step": 143000 }, { "epoch": 7.75, "learning_rate": 0.0006705802730696799, "loss": 3.0099, "step": 144000 }, { "epoch": 7.8, "learning_rate": 0.0006676406544256121, "loss": 3.0149, "step": 145000 }, { "epoch": 7.85, "learning_rate": 0.0006647010357815442, "loss": 3.0104, "step": 146000 }, { "epoch": 7.91, "learning_rate": 0.0006617584745762712, "loss": 3.0183, "step": 147000 }, { "epoch": 7.96, "learning_rate": 0.0006588159133709981, "loss": 3.0111, "step": 148000 }, { "epoch": 8.0, "eval_accuracy": 0.40690403335988545, "eval_loss": 3.3608620166778564, "eval_runtime": 153.1275, "eval_samples_per_second": 378.24, "eval_steps_per_second": 5.91, "step": 148736 }, { "epoch": 8.01, "learning_rate": 0.0006558762947269304, "loss": 2.9952, "step": 149000 }, { "epoch": 8.07, "learning_rate": 0.0006529337335216573, "loss": 2.9533, "step": 150000 }, { "epoch": 8.12, "learning_rate": 0.0006499911723163842, "loss": 2.9567, "step": 151000 }, { "epoch": 8.18, "learning_rate": 0.0006470515536723164, "loss": 2.9616, "step": 152000 }, { "epoch": 8.23, "learning_rate": 0.0006441089924670433, "loss": 2.964, "step": 153000 }, { "epoch": 8.28, "learning_rate": 0.0006411664312617702, "loss": 2.9719, "step": 154000 }, { "epoch": 8.34, "learning_rate": 0.0006382238700564972, "loss": 2.97, "step": 155000 }, { "epoch": 8.39, "learning_rate": 0.0006352813088512241, "loss": 2.9762, "step": 156000 }, { "epoch": 8.44, "learning_rate": 0.0006323446327683616, "loss": 2.9736, "step": 157000 }, { "epoch": 8.5, "learning_rate": 0.0006294020715630885, "loss": 2.983, "step": 158000 }, { "epoch": 8.55, "learning_rate": 0.0006264624529190207, "loss": 2.9757, "step": 159000 }, { "epoch": 8.61, "learning_rate": 0.0006235198917137477, "loss": 2.9828, "step": 160000 }, { "epoch": 8.66, "learning_rate": 0.0006205773305084745, "loss": 2.9829, "step": 161000 }, { "epoch": 8.71, "learning_rate": 0.0006176347693032015, "loss": 2.9784, "step": 162000 }, { "epoch": 8.77, "learning_rate": 0.0006146922080979284, "loss": 2.9807, "step": 163000 }, { "epoch": 8.82, "learning_rate": 0.0006117525894538607, "loss": 2.9858, "step": 164000 }, { "epoch": 8.87, "learning_rate": 0.0006088100282485876, "loss": 2.9856, "step": 165000 }, { "epoch": 8.93, "learning_rate": 0.0006058704096045198, "loss": 2.9875, "step": 166000 }, { "epoch": 8.98, "learning_rate": 0.0006029278483992468, "loss": 2.9857, "step": 167000 }, { "epoch": 9.0, "eval_accuracy": 0.4082261534843072, "eval_loss": 3.3496146202087402, "eval_runtime": 153.15, "eval_samples_per_second": 378.185, "eval_steps_per_second": 5.909, "step": 167328 }, { "epoch": 9.04, "learning_rate": 0.0005999852871939736, "loss": 2.9408, "step": 168000 }, { "epoch": 9.09, "learning_rate": 0.0005970427259887006, "loss": 2.9243, "step": 169000 }, { "epoch": 9.14, "learning_rate": 0.0005941031073446328, "loss": 2.9328, "step": 170000 }, { "epoch": 9.2, "learning_rate": 0.0005911634887005651, "loss": 2.9327, "step": 171000 }, { "epoch": 9.25, "learning_rate": 0.0005882209274952919, "loss": 2.9438, "step": 172000 }, { "epoch": 9.31, "learning_rate": 0.0005852783662900189, "loss": 2.9379, "step": 173000 }, { "epoch": 9.36, "learning_rate": 0.0005823358050847458, "loss": 2.9497, "step": 174000 }, { "epoch": 9.41, "learning_rate": 0.0005793961864406779, "loss": 2.9508, "step": 175000 }, { "epoch": 9.47, "learning_rate": 0.0005764536252354049, "loss": 2.9503, "step": 176000 }, { "epoch": 9.52, "learning_rate": 0.0005735110640301319, "loss": 2.9494, "step": 177000 }, { "epoch": 9.57, "learning_rate": 0.0005705685028248587, "loss": 2.954, "step": 178000 }, { "epoch": 9.63, "learning_rate": 0.0005676259416195857, "loss": 2.9555, "step": 179000 }, { "epoch": 9.68, "learning_rate": 0.0005646863229755179, "loss": 2.9602, "step": 180000 }, { "epoch": 9.74, "learning_rate": 0.0005617437617702449, "loss": 2.9581, "step": 181000 }, { "epoch": 9.79, "learning_rate": 0.000558804143126177, "loss": 2.9585, "step": 182000 }, { "epoch": 9.84, "learning_rate": 0.000555861581920904, "loss": 2.96, "step": 183000 }, { "epoch": 9.9, "learning_rate": 0.000552919020715631, "loss": 2.9614, "step": 184000 }, { "epoch": 9.95, "learning_rate": 0.000549979402071563, "loss": 2.9608, "step": 185000 }, { "epoch": 10.0, "eval_accuracy": 0.4075345932932098, "eval_loss": 3.3682754039764404, "eval_runtime": 153.8415, "eval_samples_per_second": 376.485, "eval_steps_per_second": 5.883, "step": 185920 }, { "epoch": 10.0, "learning_rate": 0.0005470397834274953, "loss": 2.9544, "step": 186000 }, { "epoch": 10.06, "learning_rate": 0.0005440972222222223, "loss": 2.8967, "step": 187000 }, { "epoch": 10.11, "learning_rate": 0.0005411576035781545, "loss": 2.9034, "step": 188000 }, { "epoch": 10.17, "learning_rate": 0.0005382150423728813, "loss": 2.9113, "step": 189000 }, { "epoch": 10.22, "learning_rate": 0.0005352724811676083, "loss": 2.9107, "step": 190000 }, { "epoch": 10.27, "learning_rate": 0.0005323328625235405, "loss": 2.9144, "step": 191000 }, { "epoch": 10.33, "learning_rate": 0.0005293903013182673, "loss": 2.9206, "step": 192000 }, { "epoch": 10.38, "learning_rate": 0.0005264477401129943, "loss": 2.9229, "step": 193000 }, { "epoch": 10.43, "learning_rate": 0.0005235051789077213, "loss": 2.9233, "step": 194000 }, { "epoch": 10.49, "learning_rate": 0.0005205626177024481, "loss": 2.9323, "step": 195000 }, { "epoch": 10.54, "learning_rate": 0.0005176229990583804, "loss": 2.9275, "step": 196000 }, { "epoch": 10.6, "learning_rate": 0.0005146804378531074, "loss": 2.9309, "step": 197000 }, { "epoch": 10.65, "learning_rate": 0.0005117408192090396, "loss": 2.9346, "step": 198000 }, { "epoch": 10.7, "learning_rate": 0.0005088012005649718, "loss": 2.9339, "step": 199000 }, { "epoch": 10.76, "learning_rate": 0.0005058586393596986, "loss": 2.9358, "step": 200000 }, { "epoch": 10.81, "learning_rate": 0.0005029160781544256, "loss": 2.9336, "step": 201000 }, { "epoch": 10.86, "learning_rate": 0.0004999735169491526, "loss": 2.938, "step": 202000 }, { "epoch": 10.92, "learning_rate": 0.0004970309557438794, "loss": 2.9378, "step": 203000 }, { "epoch": 10.97, "learning_rate": 0.0004940913370998117, "loss": 2.9402, "step": 204000 }, { "epoch": 11.0, "eval_accuracy": 0.40864939315828364, "eval_loss": 3.372835159301758, "eval_runtime": 153.3961, "eval_samples_per_second": 377.578, "eval_steps_per_second": 5.9, "step": 204512 }, { "epoch": 11.03, "learning_rate": 0.0004911487758945386, "loss": 2.9052, "step": 205000 }, { "epoch": 11.08, "learning_rate": 0.0004882091572504708, "loss": 2.8772, "step": 206000 }, { "epoch": 11.13, "learning_rate": 0.0004852665960451977, "loss": 2.8834, "step": 207000 }, { "epoch": 11.19, "learning_rate": 0.0004823240348399247, "loss": 2.8883, "step": 208000 }, { "epoch": 11.24, "learning_rate": 0.0004793844161958569, "loss": 2.8927, "step": 209000 }, { "epoch": 11.3, "learning_rate": 0.0004764418549905838, "loss": 2.8952, "step": 210000 }, { "epoch": 11.35, "learning_rate": 0.00047349929378531075, "loss": 2.9015, "step": 211000 }, { "epoch": 11.4, "learning_rate": 0.00047055967514124294, "loss": 2.9018, "step": 212000 }, { "epoch": 11.46, "learning_rate": 0.00046762005649717513, "loss": 2.9079, "step": 213000 }, { "epoch": 11.51, "learning_rate": 0.00046467749529190204, "loss": 2.9015, "step": 214000 }, { "epoch": 11.56, "learning_rate": 0.00046173493408662906, "loss": 2.9078, "step": 215000 }, { "epoch": 11.62, "learning_rate": 0.00045879237288135597, "loss": 2.9094, "step": 216000 }, { "epoch": 11.67, "learning_rate": 0.00045585275423728816, "loss": 2.9103, "step": 217000 }, { "epoch": 11.73, "learning_rate": 0.00045291019303201507, "loss": 2.912, "step": 218000 }, { "epoch": 11.78, "learning_rate": 0.00044997057438794726, "loss": 2.9127, "step": 219000 }, { "epoch": 11.83, "learning_rate": 0.0004470309557438795, "loss": 2.9159, "step": 220000 }, { "epoch": 11.89, "learning_rate": 0.0004440883945386064, "loss": 2.9151, "step": 221000 }, { "epoch": 11.94, "learning_rate": 0.0004411458333333334, "loss": 2.9164, "step": 222000 }, { "epoch": 11.99, "learning_rate": 0.0004382062146892655, "loss": 2.9154, "step": 223000 }, { "epoch": 12.0, "eval_accuracy": 0.4082875568274841, "eval_loss": 3.384491205215454, "eval_runtime": 153.6213, "eval_samples_per_second": 377.025, "eval_steps_per_second": 5.891, "step": 223104 }, { "epoch": 12.05, "learning_rate": 0.0004352636534839925, "loss": 2.8564, "step": 224000 }, { "epoch": 12.1, "learning_rate": 0.00043232403483992466, "loss": 2.8614, "step": 225000 }, { "epoch": 12.16, "learning_rate": 0.00042938147363465163, "loss": 2.8663, "step": 226000 }, { "epoch": 12.21, "learning_rate": 0.00042643891242937854, "loss": 2.8715, "step": 227000 }, { "epoch": 12.26, "learning_rate": 0.00042349635122410545, "loss": 2.8779, "step": 228000 }, { "epoch": 12.32, "learning_rate": 0.0004205567325800377, "loss": 2.8745, "step": 229000 }, { "epoch": 12.37, "learning_rate": 0.0004176141713747646, "loss": 2.8812, "step": 230000 }, { "epoch": 12.42, "learning_rate": 0.0004146716101694915, "loss": 2.8815, "step": 231000 }, { "epoch": 12.48, "learning_rate": 0.0004117290489642185, "loss": 2.8818, "step": 232000 }, { "epoch": 12.53, "learning_rate": 0.00040878943032015067, "loss": 2.8834, "step": 233000 }, { "epoch": 12.59, "learning_rate": 0.00040584981167608286, "loss": 2.8884, "step": 234000 }, { "epoch": 12.64, "learning_rate": 0.00040290725047080977, "loss": 2.8882, "step": 235000 }, { "epoch": 12.69, "learning_rate": 0.00039996468926553673, "loss": 2.8902, "step": 236000 }, { "epoch": 12.75, "learning_rate": 0.00039702212806026364, "loss": 2.8924, "step": 237000 }, { "epoch": 12.8, "learning_rate": 0.0003940795668549906, "loss": 2.8938, "step": 238000 }, { "epoch": 12.85, "learning_rate": 0.0003911399482109228, "loss": 2.8921, "step": 239000 }, { "epoch": 12.91, "learning_rate": 0.00038820032956685504, "loss": 2.8936, "step": 240000 }, { "epoch": 12.96, "learning_rate": 0.00038525776836158195, "loss": 2.891, "step": 241000 }, { "epoch": 13.0, "eval_accuracy": 0.4097539815264616, "eval_loss": 3.374075174331665, "eval_runtime": 153.5328, "eval_samples_per_second": 377.242, "eval_steps_per_second": 5.895, "step": 241696 }, { "epoch": 13.02, "learning_rate": 0.00038231520715630886, "loss": 2.8823, "step": 242000 }, { "epoch": 13.07, "learning_rate": 0.00037937264595103577, "loss": 2.8372, "step": 243000 }, { "epoch": 13.12, "learning_rate": 0.00037643596986817324, "loss": 2.8443, "step": 244000 }, { "epoch": 13.18, "learning_rate": 0.0003734934086629002, "loss": 2.8465, "step": 245000 }, { "epoch": 13.23, "learning_rate": 0.0003705508474576271, "loss": 2.8528, "step": 246000 }, { "epoch": 13.29, "learning_rate": 0.000367608286252354, "loss": 2.8543, "step": 247000 }, { "epoch": 13.34, "learning_rate": 0.000364665725047081, "loss": 2.8555, "step": 248000 }, { "epoch": 13.39, "learning_rate": 0.0003617231638418079, "loss": 2.863, "step": 249000 }, { "epoch": 13.45, "learning_rate": 0.00035878060263653486, "loss": 2.8651, "step": 250000 }, { "epoch": 13.5, "learning_rate": 0.00035584686911487755, "loss": 2.866, "step": 251000 }, { "epoch": 13.55, "learning_rate": 0.0003529043079096045, "loss": 2.8672, "step": 252000 }, { "epoch": 13.61, "learning_rate": 0.00034996174670433143, "loss": 2.8659, "step": 253000 }, { "epoch": 13.66, "learning_rate": 0.00034701918549905834, "loss": 2.8692, "step": 254000 }, { "epoch": 13.72, "learning_rate": 0.00034407662429378536, "loss": 2.8718, "step": 255000 }, { "epoch": 13.77, "learning_rate": 0.0003411370056497175, "loss": 2.8712, "step": 256000 }, { "epoch": 13.82, "learning_rate": 0.00033819738700564974, "loss": 2.8723, "step": 257000 }, { "epoch": 13.88, "learning_rate": 0.00033525482580037665, "loss": 2.8748, "step": 258000 }, { "epoch": 13.93, "learning_rate": 0.0003323122645951036, "loss": 2.8718, "step": 259000 }, { "epoch": 13.98, "learning_rate": 0.0003293726459510358, "loss": 2.8754, "step": 260000 }, { "epoch": 14.0, "eval_accuracy": 0.41060670869817323, "eval_loss": 3.367408514022827, "eval_runtime": 152.6297, "eval_samples_per_second": 379.474, "eval_steps_per_second": 5.929, "step": 260288 }, { "epoch": 14.04, "learning_rate": 0.00032643008474576277, "loss": 2.8386, "step": 261000 }, { "epoch": 14.09, "learning_rate": 0.0003234875235404897, "loss": 2.8267, "step": 262000 }, { "epoch": 14.15, "learning_rate": 0.0003205449623352166, "loss": 2.8268, "step": 263000 }, { "epoch": 14.2, "learning_rate": 0.0003176024011299435, "loss": 2.8328, "step": 264000 }, { "epoch": 14.25, "learning_rate": 0.0003146627824858757, "loss": 2.8368, "step": 265000 }, { "epoch": 14.31, "learning_rate": 0.0003117202212806026, "loss": 2.8369, "step": 266000 }, { "epoch": 14.36, "learning_rate": 0.00030878060263653484, "loss": 2.8409, "step": 267000 }, { "epoch": 14.41, "learning_rate": 0.00030583804143126175, "loss": 2.8439, "step": 268000 }, { "epoch": 14.47, "learning_rate": 0.000302898422787194, "loss": 2.8428, "step": 269000 }, { "epoch": 14.52, "learning_rate": 0.0002999588041431262, "loss": 2.8454, "step": 270000 }, { "epoch": 14.58, "learning_rate": 0.00029701624293785315, "loss": 2.8435, "step": 271000 }, { "epoch": 14.63, "learning_rate": 0.00029407368173258006, "loss": 2.8488, "step": 272000 }, { "epoch": 14.68, "learning_rate": 0.00029113112052730697, "loss": 2.8501, "step": 273000 }, { "epoch": 14.74, "learning_rate": 0.00028818855932203393, "loss": 2.8496, "step": 274000 }, { "epoch": 14.79, "learning_rate": 0.00028524894067796607, "loss": 2.8539, "step": 275000 }, { "epoch": 14.85, "learning_rate": 0.00028230637947269303, "loss": 2.8539, "step": 276000 }, { "epoch": 14.9, "learning_rate": 0.0002793667608286252, "loss": 2.8558, "step": 277000 }, { "epoch": 14.95, "learning_rate": 0.00027642714218455746, "loss": 2.8555, "step": 278000 }, { "epoch": 15.0, "eval_accuracy": 0.4095201248113644, "eval_loss": 3.3868112564086914, "eval_runtime": 153.6538, "eval_samples_per_second": 376.945, "eval_steps_per_second": 5.89, "step": 278880 }, { "epoch": 15.01, "learning_rate": 0.0002734845809792844, "loss": 2.8488, "step": 279000 }, { "epoch": 15.06, "learning_rate": 0.00027054201977401134, "loss": 2.8093, "step": 280000 }, { "epoch": 15.11, "learning_rate": 0.00026759945856873825, "loss": 2.8137, "step": 281000 }, { "epoch": 15.17, "learning_rate": 0.00026465983992467044, "loss": 2.8098, "step": 282000 }, { "epoch": 15.22, "learning_rate": 0.0002617172787193974, "loss": 2.815, "step": 283000 }, { "epoch": 15.28, "learning_rate": 0.00025877766007532954, "loss": 2.8226, "step": 284000 }, { "epoch": 15.33, "learning_rate": 0.0002558350988700565, "loss": 2.8227, "step": 285000 }, { "epoch": 15.38, "learning_rate": 0.0002528925376647834, "loss": 2.8204, "step": 286000 }, { "epoch": 15.44, "learning_rate": 0.0002499529190207156, "loss": 2.8239, "step": 287000 }, { "epoch": 15.49, "learning_rate": 0.00024701035781544257, "loss": 2.8234, "step": 288000 }, { "epoch": 15.54, "learning_rate": 0.0002440677966101695, "loss": 2.8268, "step": 289000 }, { "epoch": 15.6, "learning_rate": 0.00024113112052730697, "loss": 2.8285, "step": 290000 }, { "epoch": 15.65, "learning_rate": 0.0002381885593220339, "loss": 2.8301, "step": 291000 }, { "epoch": 15.71, "learning_rate": 0.00023524599811676085, "loss": 2.831, "step": 292000 }, { "epoch": 15.76, "learning_rate": 0.00023230343691148776, "loss": 2.8319, "step": 293000 }, { "epoch": 15.81, "learning_rate": 0.00022936381826741997, "loss": 2.8331, "step": 294000 }, { "epoch": 15.87, "learning_rate": 0.00022642125706214688, "loss": 2.837, "step": 295000 }, { "epoch": 15.92, "learning_rate": 0.0002234816384180791, "loss": 2.835, "step": 296000 }, { "epoch": 15.97, "learning_rate": 0.00022053907721280604, "loss": 2.8368, "step": 297000 }, { "epoch": 16.0, "eval_accuracy": 0.40982626817554074, "eval_loss": 3.3892040252685547, "eval_runtime": 153.9496, "eval_samples_per_second": 376.22, "eval_steps_per_second": 5.879, "step": 297472 }, { "epoch": 16.03, "learning_rate": 0.00021759651600753298, "loss": 2.8135, "step": 298000 }, { "epoch": 16.08, "learning_rate": 0.00021465689736346516, "loss": 2.7892, "step": 299000 }, { "epoch": 16.14, "learning_rate": 0.0002117143361581921, "loss": 2.7966, "step": 300000 }, { "epoch": 16.19, "learning_rate": 0.0002087747175141243, "loss": 2.7981, "step": 301000 }, { "epoch": 16.24, "learning_rate": 0.0002058350988700565, "loss": 2.7999, "step": 302000 }, { "epoch": 16.3, "learning_rate": 0.00020289253766478344, "loss": 2.8005, "step": 303000 }, { "epoch": 16.35, "learning_rate": 0.00019995291902071563, "loss": 2.8064, "step": 304000 }, { "epoch": 16.4, "learning_rate": 0.00019701035781544257, "loss": 2.8087, "step": 305000 }, { "epoch": 16.46, "learning_rate": 0.00019406779661016948, "loss": 2.8069, "step": 306000 }, { "epoch": 16.51, "learning_rate": 0.00019112523540489642, "loss": 2.8089, "step": 307000 }, { "epoch": 16.57, "learning_rate": 0.00018818561676082864, "loss": 2.8103, "step": 308000 }, { "epoch": 16.62, "learning_rate": 0.00018524305555555557, "loss": 2.8099, "step": 309000 }, { "epoch": 16.67, "learning_rate": 0.00018230343691148776, "loss": 2.8134, "step": 310000 }, { "epoch": 16.73, "learning_rate": 0.0001793608757062147, "loss": 2.8105, "step": 311000 }, { "epoch": 16.78, "learning_rate": 0.0001764212570621469, "loss": 2.8145, "step": 312000 }, { "epoch": 16.84, "learning_rate": 0.00017347869585687383, "loss": 2.817, "step": 313000 }, { "epoch": 16.89, "learning_rate": 0.00017053613465160076, "loss": 2.8195, "step": 314000 }, { "epoch": 16.94, "learning_rate": 0.00016759651600753295, "loss": 2.8218, "step": 315000 }, { "epoch": 17.0, "learning_rate": 0.0001646539548022599, "loss": 2.8185, "step": 316000 }, { "epoch": 17.0, "eval_accuracy": 0.4105727151624538, "eval_loss": 3.3865416049957275, "eval_runtime": 155.0516, "eval_samples_per_second": 373.547, "eval_steps_per_second": 5.837, "step": 316064 }, { "epoch": 17.05, "learning_rate": 0.00016171139359698683, "loss": 2.7805, "step": 317000 }, { "epoch": 17.1, "learning_rate": 0.00015877177495291902, "loss": 2.7868, "step": 318000 }, { "epoch": 17.16, "learning_rate": 0.00015582921374764595, "loss": 2.7826, "step": 319000 }, { "epoch": 17.21, "learning_rate": 0.0001528866525423729, "loss": 2.783, "step": 320000 }, { "epoch": 17.27, "learning_rate": 0.00014994703389830508, "loss": 2.7881, "step": 321000 }, { "epoch": 17.32, "learning_rate": 0.00014700447269303202, "loss": 2.7889, "step": 322000 }, { "epoch": 17.37, "learning_rate": 0.0001440648540489642, "loss": 2.7868, "step": 323000 }, { "epoch": 17.43, "learning_rate": 0.00014112229284369115, "loss": 2.794, "step": 324000 }, { "epoch": 17.48, "learning_rate": 0.00013818267419962336, "loss": 2.7907, "step": 325000 }, { "epoch": 17.53, "learning_rate": 0.0001352401129943503, "loss": 2.7941, "step": 326000 }, { "epoch": 17.59, "learning_rate": 0.0001322975517890772, "loss": 2.7927, "step": 327000 }, { "epoch": 17.64, "learning_rate": 0.00012935499058380415, "loss": 2.7957, "step": 328000 }, { "epoch": 17.7, "learning_rate": 0.00012641537193973634, "loss": 2.7919, "step": 329000 }, { "epoch": 17.75, "learning_rate": 0.00012347281073446327, "loss": 2.7963, "step": 330000 }, { "epoch": 17.8, "learning_rate": 0.00012053024952919021, "loss": 2.798, "step": 331000 }, { "epoch": 17.86, "learning_rate": 0.00011759063088512241, "loss": 2.7969, "step": 332000 }, { "epoch": 17.91, "learning_rate": 0.00011464806967984934, "loss": 2.7977, "step": 333000 }, { "epoch": 17.96, "learning_rate": 0.00011170845103578154, "loss": 2.7969, "step": 334000 }, { "epoch": 18.0, "eval_accuracy": 0.4098990250909243, "eval_loss": 3.400566577911377, "eval_runtime": 153.6726, "eval_samples_per_second": 376.899, "eval_steps_per_second": 5.889, "step": 334656 }, { "epoch": 18.02, "learning_rate": 0.00010876588983050848, "loss": 2.7861, "step": 335000 }, { "epoch": 18.07, "learning_rate": 0.0001058233286252354, "loss": 2.7663, "step": 336000 }, { "epoch": 18.13, "learning_rate": 0.00010288076741996234, "loss": 2.769, "step": 337000 }, { "epoch": 18.18, "learning_rate": 9.994114877589454e-05, "loss": 2.7697, "step": 338000 }, { "epoch": 18.23, "learning_rate": 9.699858757062147e-05, "loss": 2.7771, "step": 339000 }, { "epoch": 18.29, "learning_rate": 9.40560263653484e-05, "loss": 2.7719, "step": 340000 }, { "epoch": 18.34, "learning_rate": 9.111346516007534e-05, "loss": 2.7793, "step": 341000 }, { "epoch": 18.4, "learning_rate": 8.817384651600753e-05, "loss": 2.7736, "step": 342000 }, { "epoch": 18.45, "learning_rate": 8.523422787193975e-05, "loss": 2.777, "step": 343000 }, { "epoch": 18.5, "learning_rate": 8.229166666666667e-05, "loss": 2.7778, "step": 344000 }, { "epoch": 18.56, "learning_rate": 7.935204802259887e-05, "loss": 2.7766, "step": 345000 }, { "epoch": 18.61, "learning_rate": 7.640948681732581e-05, "loss": 2.7771, "step": 346000 }, { "epoch": 18.66, "learning_rate": 7.3469868173258e-05, "loss": 2.7785, "step": 347000 }, { "epoch": 18.72, "learning_rate": 7.052730696798494e-05, "loss": 2.7756, "step": 348000 }, { "epoch": 18.77, "learning_rate": 6.758474576271187e-05, "loss": 2.7762, "step": 349000 }, { "epoch": 18.83, "learning_rate": 6.464512711864406e-05, "loss": 2.7811, "step": 350000 }, { "epoch": 18.88, "learning_rate": 6.1702565913371e-05, "loss": 2.7773, "step": 351000 }, { "epoch": 18.93, "learning_rate": 5.8760004708097925e-05, "loss": 2.7786, "step": 352000 }, { "epoch": 18.99, "learning_rate": 5.5820386064030134e-05, "loss": 2.7805, "step": 353000 }, { "epoch": 19.0, "eval_accuracy": 0.41040066487593735, "eval_loss": 3.399707794189453, "eval_runtime": 153.9752, "eval_samples_per_second": 376.158, "eval_steps_per_second": 5.878, "step": 353248 }, { "epoch": 19.04, "learning_rate": 5.2877824858757065e-05, "loss": 2.7638, "step": 354000 }, { "epoch": 19.09, "learning_rate": 4.993526365348399e-05, "loss": 2.7559, "step": 355000 }, { "epoch": 19.15, "learning_rate": 4.699270244821093e-05, "loss": 2.7604, "step": 356000 }, { "epoch": 19.2, "learning_rate": 4.405308380414313e-05, "loss": 2.762, "step": 357000 }, { "epoch": 19.26, "learning_rate": 4.111346516007533e-05, "loss": 2.7598, "step": 358000 }, { "epoch": 19.31, "learning_rate": 3.8170903954802256e-05, "loss": 2.76, "step": 359000 }, { "epoch": 19.36, "learning_rate": 3.523128531073446e-05, "loss": 2.7655, "step": 360000 }, { "epoch": 19.42, "learning_rate": 3.2288724105461396e-05, "loss": 2.7589, "step": 361000 }, { "epoch": 19.47, "learning_rate": 2.9346162900188327e-05, "loss": 2.7604, "step": 362000 }, { "epoch": 19.52, "learning_rate": 2.6403601694915254e-05, "loss": 2.7634, "step": 363000 }, { "epoch": 19.58, "learning_rate": 2.3461040489642185e-05, "loss": 2.7618, "step": 364000 }, { "epoch": 19.63, "learning_rate": 2.0518479284369116e-05, "loss": 2.764, "step": 365000 }, { "epoch": 19.69, "learning_rate": 1.7575918079096047e-05, "loss": 2.7651, "step": 366000 }, { "epoch": 19.74, "learning_rate": 1.4636299435028249e-05, "loss": 2.7575, "step": 367000 }, { "epoch": 19.79, "learning_rate": 1.169373822975518e-05, "loss": 2.7625, "step": 368000 }, { "epoch": 19.85, "learning_rate": 8.754119585687382e-06, "loss": 2.7612, "step": 369000 }, { "epoch": 19.9, "learning_rate": 5.814500941619586e-06, "loss": 2.7639, "step": 370000 }, { "epoch": 19.95, "learning_rate": 2.871939736346516e-06, "loss": 2.7623, "step": 371000 }, { "epoch": 20.0, "eval_accuracy": 0.40988659662430754, "eval_loss": 3.4119651317596436, "eval_runtime": 154.9649, "eval_samples_per_second": 373.755, "eval_steps_per_second": 5.84, "step": 371840 }, { "epoch": 20.0, "step": 371840, "total_flos": 1.56667295384064e+18, "train_loss": 3.030422134530729, "train_runtime": 80926.4318, "train_samples_per_second": 147.032, "train_steps_per_second": 4.595 } ], "logging_steps": 1000, "max_steps": 371840, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 5000, "total_flos": 1.56667295384064e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }