{ "best_metric": 0.733613908290863, "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-belle-cot13b/checkpoint-15200", "epoch": 2.9124353324391645, "global_step": 15200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.9999999999999995e-05, "loss": 1.7259, "step": 20 }, { "epoch": 0.01, "learning_rate": 0.00011999999999999999, "loss": 1.4365, "step": 40 }, { "epoch": 0.01, "learning_rate": 0.00017999999999999998, "loss": 1.0988, "step": 60 }, { "epoch": 0.02, "learning_rate": 0.00023999999999999998, "loss": 1.0373, "step": 80 }, { "epoch": 0.02, "learning_rate": 0.0003, "loss": 0.9935, "step": 100 }, { "epoch": 0.02, "learning_rate": 0.00029961432152728675, "loss": 0.9734, "step": 120 }, { "epoch": 0.03, "learning_rate": 0.0002992286430545735, "loss": 0.9538, "step": 140 }, { "epoch": 0.03, "learning_rate": 0.00029884296458186025, "loss": 0.9304, "step": 160 }, { "epoch": 0.03, "learning_rate": 0.00029845728610914697, "loss": 0.9159, "step": 180 }, { "epoch": 0.04, "learning_rate": 0.00029807160763643375, "loss": 0.9056, "step": 200 }, { "epoch": 0.04, "eval_loss": 0.9217711091041565, "eval_runtime": 25.3617, "eval_samples_per_second": 78.859, "eval_steps_per_second": 1.262, "step": 200 }, { "epoch": 0.04, "learning_rate": 0.00029768592916372047, "loss": 0.9028, "step": 220 }, { "epoch": 0.05, "learning_rate": 0.00029730025069100725, "loss": 0.8939, "step": 240 }, { "epoch": 0.05, "learning_rate": 0.00029691457221829397, "loss": 0.8839, "step": 260 }, { "epoch": 0.05, "learning_rate": 0.00029652889374558075, "loss": 0.8929, "step": 280 }, { "epoch": 0.06, "learning_rate": 0.00029614321527286747, "loss": 0.8708, "step": 300 }, { "epoch": 0.06, "learning_rate": 0.00029575753680015425, "loss": 0.8824, "step": 320 }, { "epoch": 0.07, "learning_rate": 0.000295371858327441, "loss": 0.8705, "step": 340 }, { "epoch": 0.07, "learning_rate": 0.00029498617985472775, "loss": 0.8678, "step": 360 }, { "epoch": 0.07, "learning_rate": 0.0002946005013820145, "loss": 0.8687, "step": 380 }, { "epoch": 0.08, "learning_rate": 0.00029421482290930125, "loss": 0.8609, "step": 400 }, { "epoch": 0.08, "eval_loss": 0.8822715878486633, "eval_runtime": 25.3188, "eval_samples_per_second": 78.993, "eval_steps_per_second": 1.264, "step": 400 }, { "epoch": 0.08, "learning_rate": 0.000293829144436588, "loss": 0.8603, "step": 420 }, { "epoch": 0.08, "learning_rate": 0.00029344346596387475, "loss": 0.8662, "step": 440 }, { "epoch": 0.09, "learning_rate": 0.0002930577874911615, "loss": 0.8591, "step": 460 }, { "epoch": 0.09, "learning_rate": 0.00029267210901844825, "loss": 0.8442, "step": 480 }, { "epoch": 0.1, "learning_rate": 0.000292286430545735, "loss": 0.8482, "step": 500 }, { "epoch": 0.1, "learning_rate": 0.00029190075207302175, "loss": 0.8458, "step": 520 }, { "epoch": 0.1, "learning_rate": 0.0002915150736003085, "loss": 0.8377, "step": 540 }, { "epoch": 0.11, "learning_rate": 0.00029112939512759525, "loss": 0.8372, "step": 560 }, { "epoch": 0.11, "learning_rate": 0.000290743716654882, "loss": 0.8444, "step": 580 }, { "epoch": 0.11, "learning_rate": 0.0002903580381821688, "loss": 0.84, "step": 600 }, { "epoch": 0.11, "eval_loss": 0.8602503538131714, "eval_runtime": 25.3547, "eval_samples_per_second": 78.881, "eval_steps_per_second": 1.262, "step": 600 }, { "epoch": 0.12, "learning_rate": 0.0002899723597094555, "loss": 0.8428, "step": 620 }, { "epoch": 0.12, "learning_rate": 0.0002895866812367423, "loss": 0.8366, "step": 640 }, { "epoch": 0.13, "learning_rate": 0.000289201002764029, "loss": 0.8408, "step": 660 }, { "epoch": 0.13, "learning_rate": 0.0002888153242913158, "loss": 0.8445, "step": 680 }, { "epoch": 0.13, "learning_rate": 0.0002884296458186025, "loss": 0.8335, "step": 700 }, { "epoch": 0.14, "learning_rate": 0.0002880439673458893, "loss": 0.8316, "step": 720 }, { "epoch": 0.14, "learning_rate": 0.000287658288873176, "loss": 0.8449, "step": 740 }, { "epoch": 0.15, "learning_rate": 0.0002872726104004628, "loss": 0.836, "step": 760 }, { "epoch": 0.15, "learning_rate": 0.0002868869319277495, "loss": 0.8257, "step": 780 }, { "epoch": 0.15, "learning_rate": 0.0002865012534550363, "loss": 0.8252, "step": 800 }, { "epoch": 0.15, "eval_loss": 0.8450831174850464, "eval_runtime": 25.4039, "eval_samples_per_second": 78.728, "eval_steps_per_second": 1.26, "step": 800 }, { "epoch": 0.16, "learning_rate": 0.0002861155749823231, "loss": 0.8227, "step": 820 }, { "epoch": 0.16, "learning_rate": 0.0002857298965096098, "loss": 0.8274, "step": 840 }, { "epoch": 0.16, "learning_rate": 0.00028534421803689657, "loss": 0.8197, "step": 860 }, { "epoch": 0.17, "learning_rate": 0.0002849585395641833, "loss": 0.823, "step": 880 }, { "epoch": 0.17, "learning_rate": 0.00028457286109147007, "loss": 0.8176, "step": 900 }, { "epoch": 0.18, "learning_rate": 0.0002841871826187568, "loss": 0.8092, "step": 920 }, { "epoch": 0.18, "learning_rate": 0.00028380150414604357, "loss": 0.8171, "step": 940 }, { "epoch": 0.18, "learning_rate": 0.0002834158256733303, "loss": 0.816, "step": 960 }, { "epoch": 0.19, "learning_rate": 0.00028303014720061707, "loss": 0.816, "step": 980 }, { "epoch": 0.19, "learning_rate": 0.0002826444687279038, "loss": 0.8066, "step": 1000 }, { "epoch": 0.19, "eval_loss": 0.8338332176208496, "eval_runtime": 25.3851, "eval_samples_per_second": 78.786, "eval_steps_per_second": 1.261, "step": 1000 }, { "epoch": 0.2, "learning_rate": 0.00028225879025519057, "loss": 0.82, "step": 1020 }, { "epoch": 0.2, "learning_rate": 0.0002818731117824773, "loss": 0.8116, "step": 1040 }, { "epoch": 0.2, "learning_rate": 0.00028148743330976407, "loss": 0.8156, "step": 1060 }, { "epoch": 0.21, "learning_rate": 0.00028110175483705085, "loss": 0.8135, "step": 1080 }, { "epoch": 0.21, "learning_rate": 0.00028071607636433757, "loss": 0.8055, "step": 1100 }, { "epoch": 0.21, "learning_rate": 0.00028033039789162435, "loss": 0.8062, "step": 1120 }, { "epoch": 0.22, "learning_rate": 0.00027994471941891107, "loss": 0.8082, "step": 1140 }, { "epoch": 0.22, "learning_rate": 0.00027955904094619785, "loss": 0.8144, "step": 1160 }, { "epoch": 0.23, "learning_rate": 0.00027917336247348457, "loss": 0.8067, "step": 1180 }, { "epoch": 0.23, "learning_rate": 0.0002787876840007713, "loss": 0.8042, "step": 1200 }, { "epoch": 0.23, "eval_loss": 0.8253737688064575, "eval_runtime": 25.4089, "eval_samples_per_second": 78.713, "eval_steps_per_second": 1.259, "step": 1200 }, { "epoch": 0.23, "learning_rate": 0.00027840200552805807, "loss": 0.8093, "step": 1220 }, { "epoch": 0.24, "learning_rate": 0.00027801632705534485, "loss": 0.801, "step": 1240 }, { "epoch": 0.24, "learning_rate": 0.00027763064858263157, "loss": 0.8043, "step": 1260 }, { "epoch": 0.25, "learning_rate": 0.00027724497010991834, "loss": 0.8027, "step": 1280 }, { "epoch": 0.25, "learning_rate": 0.0002768592916372051, "loss": 0.7979, "step": 1300 }, { "epoch": 0.25, "learning_rate": 0.00027647361316449184, "loss": 0.7988, "step": 1320 }, { "epoch": 0.26, "learning_rate": 0.0002760879346917786, "loss": 0.8051, "step": 1340 }, { "epoch": 0.26, "learning_rate": 0.00027570225621906534, "loss": 0.7962, "step": 1360 }, { "epoch": 0.26, "learning_rate": 0.0002753165777463521, "loss": 0.8034, "step": 1380 }, { "epoch": 0.27, "learning_rate": 0.00027493089927363884, "loss": 0.7994, "step": 1400 }, { "epoch": 0.27, "eval_loss": 0.8166970014572144, "eval_runtime": 25.3787, "eval_samples_per_second": 78.806, "eval_steps_per_second": 1.261, "step": 1400 }, { "epoch": 0.27, "learning_rate": 0.00027454522080092557, "loss": 0.7949, "step": 1420 }, { "epoch": 0.28, "learning_rate": 0.00027415954232821234, "loss": 0.7919, "step": 1440 }, { "epoch": 0.28, "learning_rate": 0.0002737738638554991, "loss": 0.7983, "step": 1460 }, { "epoch": 0.28, "learning_rate": 0.00027338818538278584, "loss": 0.7828, "step": 1480 }, { "epoch": 0.29, "learning_rate": 0.0002730025069100726, "loss": 0.7926, "step": 1500 }, { "epoch": 0.29, "learning_rate": 0.0002726168284373594, "loss": 0.7837, "step": 1520 }, { "epoch": 0.3, "learning_rate": 0.0002722311499646461, "loss": 0.7922, "step": 1540 }, { "epoch": 0.3, "learning_rate": 0.0002718454714919329, "loss": 0.7852, "step": 1560 }, { "epoch": 0.3, "learning_rate": 0.0002714597930192196, "loss": 0.7846, "step": 1580 }, { "epoch": 0.31, "learning_rate": 0.0002710741145465064, "loss": 0.782, "step": 1600 }, { "epoch": 0.31, "eval_loss": 0.8094187378883362, "eval_runtime": 25.4544, "eval_samples_per_second": 78.572, "eval_steps_per_second": 1.257, "step": 1600 }, { "epoch": 0.31, "learning_rate": 0.0002706884360737931, "loss": 0.7822, "step": 1620 }, { "epoch": 0.31, "learning_rate": 0.00027030275760107984, "loss": 0.7787, "step": 1640 }, { "epoch": 0.32, "learning_rate": 0.0002699170791283666, "loss": 0.7913, "step": 1660 }, { "epoch": 0.32, "learning_rate": 0.0002695314006556534, "loss": 0.79, "step": 1680 }, { "epoch": 0.33, "learning_rate": 0.0002691457221829401, "loss": 0.7934, "step": 1700 }, { "epoch": 0.33, "learning_rate": 0.0002687600437102269, "loss": 0.7816, "step": 1720 }, { "epoch": 0.33, "learning_rate": 0.0002683743652375136, "loss": 0.7825, "step": 1740 }, { "epoch": 0.34, "learning_rate": 0.0002679886867648004, "loss": 0.7903, "step": 1760 }, { "epoch": 0.34, "learning_rate": 0.00026760300829208717, "loss": 0.7906, "step": 1780 }, { "epoch": 0.34, "learning_rate": 0.0002672173298193739, "loss": 0.7778, "step": 1800 }, { "epoch": 0.34, "eval_loss": 0.8045867681503296, "eval_runtime": 25.4351, "eval_samples_per_second": 78.632, "eval_steps_per_second": 1.258, "step": 1800 }, { "epoch": 0.35, "learning_rate": 0.00026683165134666067, "loss": 0.7815, "step": 1820 }, { "epoch": 0.35, "learning_rate": 0.0002664459728739474, "loss": 0.7851, "step": 1840 }, { "epoch": 0.36, "learning_rate": 0.00026606029440123417, "loss": 0.7807, "step": 1860 }, { "epoch": 0.36, "learning_rate": 0.0002656746159285209, "loss": 0.7856, "step": 1880 }, { "epoch": 0.36, "learning_rate": 0.0002652889374558076, "loss": 0.7798, "step": 1900 }, { "epoch": 0.37, "learning_rate": 0.0002649032589830944, "loss": 0.7777, "step": 1920 }, { "epoch": 0.37, "learning_rate": 0.00026451758051038117, "loss": 0.7798, "step": 1940 }, { "epoch": 0.38, "learning_rate": 0.0002641319020376679, "loss": 0.7783, "step": 1960 }, { "epoch": 0.38, "learning_rate": 0.00026374622356495467, "loss": 0.7739, "step": 1980 }, { "epoch": 0.38, "learning_rate": 0.00026336054509224144, "loss": 0.7823, "step": 2000 }, { "epoch": 0.38, "eval_loss": 0.7984708547592163, "eval_runtime": 25.4598, "eval_samples_per_second": 78.555, "eval_steps_per_second": 1.257, "step": 2000 }, { "epoch": 0.39, "learning_rate": 0.00026297486661952817, "loss": 0.7774, "step": 2020 }, { "epoch": 0.39, "learning_rate": 0.00026258918814681494, "loss": 0.7701, "step": 2040 }, { "epoch": 0.39, "learning_rate": 0.00026220350967410167, "loss": 0.7777, "step": 2060 }, { "epoch": 0.4, "learning_rate": 0.00026181783120138844, "loss": 0.781, "step": 2080 }, { "epoch": 0.4, "learning_rate": 0.00026143215272867517, "loss": 0.779, "step": 2100 }, { "epoch": 0.41, "learning_rate": 0.0002610464742559619, "loss": 0.7703, "step": 2120 }, { "epoch": 0.41, "learning_rate": 0.00026066079578324867, "loss": 0.7749, "step": 2140 }, { "epoch": 0.41, "learning_rate": 0.00026027511731053544, "loss": 0.772, "step": 2160 }, { "epoch": 0.42, "learning_rate": 0.00025988943883782216, "loss": 0.771, "step": 2180 }, { "epoch": 0.42, "learning_rate": 0.00025950376036510894, "loss": 0.7757, "step": 2200 }, { "epoch": 0.42, "eval_loss": 0.7949528694152832, "eval_runtime": 25.4504, "eval_samples_per_second": 78.584, "eval_steps_per_second": 1.257, "step": 2200 }, { "epoch": 0.43, "learning_rate": 0.00025911808189239566, "loss": 0.7776, "step": 2220 }, { "epoch": 0.43, "learning_rate": 0.00025873240341968244, "loss": 0.7689, "step": 2240 }, { "epoch": 0.43, "learning_rate": 0.0002583467249469692, "loss": 0.7646, "step": 2260 }, { "epoch": 0.44, "learning_rate": 0.00025796104647425594, "loss": 0.7805, "step": 2280 }, { "epoch": 0.44, "learning_rate": 0.0002575753680015427, "loss": 0.7717, "step": 2300 }, { "epoch": 0.44, "learning_rate": 0.00025718968952882944, "loss": 0.7672, "step": 2320 }, { "epoch": 0.45, "learning_rate": 0.00025680401105611616, "loss": 0.7716, "step": 2340 }, { "epoch": 0.45, "learning_rate": 0.00025641833258340294, "loss": 0.7661, "step": 2360 }, { "epoch": 0.46, "learning_rate": 0.00025603265411068966, "loss": 0.7659, "step": 2380 }, { "epoch": 0.46, "learning_rate": 0.00025564697563797644, "loss": 0.7697, "step": 2400 }, { "epoch": 0.46, "eval_loss": 0.7915205359458923, "eval_runtime": 25.4326, "eval_samples_per_second": 78.639, "eval_steps_per_second": 1.258, "step": 2400 }, { "epoch": 0.46, "learning_rate": 0.0002552612971652632, "loss": 0.7686, "step": 2420 }, { "epoch": 0.47, "learning_rate": 0.00025487561869254994, "loss": 0.7691, "step": 2440 }, { "epoch": 0.47, "learning_rate": 0.0002544899402198367, "loss": 0.768, "step": 2460 }, { "epoch": 0.48, "learning_rate": 0.0002541042617471235, "loss": 0.7663, "step": 2480 }, { "epoch": 0.48, "learning_rate": 0.0002537185832744102, "loss": 0.767, "step": 2500 }, { "epoch": 0.48, "learning_rate": 0.000253332904801697, "loss": 0.769, "step": 2520 }, { "epoch": 0.49, "learning_rate": 0.0002529472263289837, "loss": 0.7686, "step": 2540 }, { "epoch": 0.49, "learning_rate": 0.00025256154785627044, "loss": 0.7722, "step": 2560 }, { "epoch": 0.49, "learning_rate": 0.0002521758693835572, "loss": 0.7691, "step": 2580 }, { "epoch": 0.5, "learning_rate": 0.00025179019091084394, "loss": 0.7742, "step": 2600 }, { "epoch": 0.5, "eval_loss": 0.7875179648399353, "eval_runtime": 25.4595, "eval_samples_per_second": 78.556, "eval_steps_per_second": 1.257, "step": 2600 }, { "epoch": 0.5, "learning_rate": 0.0002514045124381307, "loss": 0.7682, "step": 2620 }, { "epoch": 0.51, "learning_rate": 0.0002510188339654175, "loss": 0.7574, "step": 2640 }, { "epoch": 0.51, "learning_rate": 0.0002506331554927042, "loss": 0.77, "step": 2660 }, { "epoch": 0.51, "learning_rate": 0.000250247477019991, "loss": 0.7638, "step": 2680 }, { "epoch": 0.52, "learning_rate": 0.00024986179854727777, "loss": 0.7517, "step": 2700 }, { "epoch": 0.52, "learning_rate": 0.0002494761200745645, "loss": 0.7596, "step": 2720 }, { "epoch": 0.53, "learning_rate": 0.00024909044160185127, "loss": 0.7608, "step": 2740 }, { "epoch": 0.53, "learning_rate": 0.000248704763129138, "loss": 0.7571, "step": 2760 }, { "epoch": 0.53, "learning_rate": 0.0002483190846564247, "loss": 0.7597, "step": 2780 }, { "epoch": 0.54, "learning_rate": 0.0002479334061837115, "loss": 0.7659, "step": 2800 }, { "epoch": 0.54, "eval_loss": 0.7841727137565613, "eval_runtime": 25.4853, "eval_samples_per_second": 78.477, "eval_steps_per_second": 1.256, "step": 2800 }, { "epoch": 0.54, "learning_rate": 0.0002475477277109982, "loss": 0.7694, "step": 2820 }, { "epoch": 0.54, "learning_rate": 0.000247162049238285, "loss": 0.7722, "step": 2840 }, { "epoch": 0.55, "learning_rate": 0.00024677637076557176, "loss": 0.7513, "step": 2860 }, { "epoch": 0.55, "learning_rate": 0.0002463906922928585, "loss": 0.7553, "step": 2880 }, { "epoch": 0.56, "learning_rate": 0.00024600501382014526, "loss": 0.7611, "step": 2900 }, { "epoch": 0.56, "learning_rate": 0.000245619335347432, "loss": 0.7614, "step": 2920 }, { "epoch": 0.56, "learning_rate": 0.00024523365687471876, "loss": 0.761, "step": 2940 }, { "epoch": 0.57, "learning_rate": 0.00024484797840200554, "loss": 0.7568, "step": 2960 }, { "epoch": 0.57, "learning_rate": 0.00024446229992929226, "loss": 0.7571, "step": 2980 }, { "epoch": 0.57, "learning_rate": 0.000244076621456579, "loss": 0.7514, "step": 3000 }, { "epoch": 0.57, "eval_loss": 0.7828710675239563, "eval_runtime": 25.4475, "eval_samples_per_second": 78.593, "eval_steps_per_second": 1.257, "step": 3000 }, { "epoch": 0.58, "learning_rate": 0.0002436909429838658, "loss": 0.7564, "step": 3020 }, { "epoch": 0.58, "learning_rate": 0.0002433052645111525, "loss": 0.7593, "step": 3040 }, { "epoch": 0.59, "learning_rate": 0.00024291958603843926, "loss": 0.7533, "step": 3060 }, { "epoch": 0.59, "learning_rate": 0.000242533907565726, "loss": 0.7566, "step": 3080 }, { "epoch": 0.59, "learning_rate": 0.00024214822909301276, "loss": 0.7667, "step": 3100 }, { "epoch": 0.6, "learning_rate": 0.00024176255062029954, "loss": 0.7638, "step": 3120 }, { "epoch": 0.6, "learning_rate": 0.00024137687214758626, "loss": 0.7613, "step": 3140 }, { "epoch": 0.61, "learning_rate": 0.00024099119367487304, "loss": 0.755, "step": 3160 }, { "epoch": 0.61, "learning_rate": 0.0002406055152021598, "loss": 0.7547, "step": 3180 }, { "epoch": 0.61, "learning_rate": 0.0002402198367294465, "loss": 0.7611, "step": 3200 }, { "epoch": 0.61, "eval_loss": 0.7789185643196106, "eval_runtime": 25.4744, "eval_samples_per_second": 78.51, "eval_steps_per_second": 1.256, "step": 3200 }, { "epoch": 0.62, "learning_rate": 0.0002398341582567333, "loss": 0.7498, "step": 3220 }, { "epoch": 0.62, "learning_rate": 0.00023944847978402, "loss": 0.757, "step": 3240 }, { "epoch": 0.62, "learning_rate": 0.0002390628013113068, "loss": 0.7472, "step": 3260 }, { "epoch": 0.63, "learning_rate": 0.00023867712283859354, "loss": 0.7557, "step": 3280 }, { "epoch": 0.63, "learning_rate": 0.0002382914443658803, "loss": 0.7602, "step": 3300 }, { "epoch": 0.64, "learning_rate": 0.00023790576589316704, "loss": 0.7573, "step": 3320 }, { "epoch": 0.64, "learning_rate": 0.0002375200874204538, "loss": 0.7565, "step": 3340 }, { "epoch": 0.64, "learning_rate": 0.00023713440894774054, "loss": 0.7517, "step": 3360 }, { "epoch": 0.65, "learning_rate": 0.0002367487304750273, "loss": 0.7521, "step": 3380 }, { "epoch": 0.65, "learning_rate": 0.00023636305200231404, "loss": 0.7575, "step": 3400 }, { "epoch": 0.65, "eval_loss": 0.7771645784378052, "eval_runtime": 25.4832, "eval_samples_per_second": 78.483, "eval_steps_per_second": 1.256, "step": 3400 }, { "epoch": 0.66, "learning_rate": 0.0002359773735296008, "loss": 0.7605, "step": 3420 }, { "epoch": 0.66, "learning_rate": 0.00023559169505688756, "loss": 0.7547, "step": 3440 }, { "epoch": 0.66, "learning_rate": 0.00023520601658417428, "loss": 0.7522, "step": 3460 }, { "epoch": 0.67, "learning_rate": 0.00023482033811146106, "loss": 0.757, "step": 3480 }, { "epoch": 0.67, "learning_rate": 0.0002344346596387478, "loss": 0.7561, "step": 3500 }, { "epoch": 0.67, "learning_rate": 0.00023404898116603456, "loss": 0.7486, "step": 3520 }, { "epoch": 0.68, "learning_rate": 0.0002336633026933213, "loss": 0.7519, "step": 3540 }, { "epoch": 0.68, "learning_rate": 0.00023327762422060806, "loss": 0.7487, "step": 3560 }, { "epoch": 0.69, "learning_rate": 0.0002328919457478948, "loss": 0.747, "step": 3580 }, { "epoch": 0.69, "learning_rate": 0.0002325062672751816, "loss": 0.7523, "step": 3600 }, { "epoch": 0.69, "eval_loss": 0.7746226787567139, "eval_runtime": 25.4795, "eval_samples_per_second": 78.494, "eval_steps_per_second": 1.256, "step": 3600 }, { "epoch": 0.69, "learning_rate": 0.0002321205888024683, "loss": 0.7427, "step": 3620 }, { "epoch": 0.7, "learning_rate": 0.0002317349103297551, "loss": 0.7442, "step": 3640 }, { "epoch": 0.7, "learning_rate": 0.00023134923185704184, "loss": 0.7587, "step": 3660 }, { "epoch": 0.71, "learning_rate": 0.00023096355338432856, "loss": 0.7506, "step": 3680 }, { "epoch": 0.71, "learning_rate": 0.00023057787491161534, "loss": 0.7514, "step": 3700 }, { "epoch": 0.71, "learning_rate": 0.00023019219643890206, "loss": 0.7475, "step": 3720 }, { "epoch": 0.72, "learning_rate": 0.00022980651796618884, "loss": 0.7601, "step": 3740 }, { "epoch": 0.72, "learning_rate": 0.00022942083949347559, "loss": 0.7474, "step": 3760 }, { "epoch": 0.72, "learning_rate": 0.00022903516102076233, "loss": 0.7529, "step": 3780 }, { "epoch": 0.73, "learning_rate": 0.00022864948254804908, "loss": 0.7458, "step": 3800 }, { "epoch": 0.73, "eval_loss": 0.7719505429267883, "eval_runtime": 25.4724, "eval_samples_per_second": 78.516, "eval_steps_per_second": 1.256, "step": 3800 }, { "epoch": 0.73, "learning_rate": 0.00022826380407533586, "loss": 0.7584, "step": 3820 }, { "epoch": 0.74, "learning_rate": 0.00022787812560262258, "loss": 0.7416, "step": 3840 }, { "epoch": 0.74, "learning_rate": 0.00022749244712990936, "loss": 0.7444, "step": 3860 }, { "epoch": 0.74, "learning_rate": 0.0002271067686571961, "loss": 0.7459, "step": 3880 }, { "epoch": 0.75, "learning_rate": 0.00022672109018448283, "loss": 0.7476, "step": 3900 }, { "epoch": 0.75, "learning_rate": 0.0002263354117117696, "loss": 0.7473, "step": 3920 }, { "epoch": 0.75, "learning_rate": 0.00022594973323905633, "loss": 0.7434, "step": 3940 }, { "epoch": 0.76, "learning_rate": 0.0002255640547663431, "loss": 0.7463, "step": 3960 }, { "epoch": 0.76, "learning_rate": 0.00022517837629362986, "loss": 0.7435, "step": 3980 }, { "epoch": 0.77, "learning_rate": 0.0002247926978209166, "loss": 0.7445, "step": 4000 }, { "epoch": 0.77, "eval_loss": 0.7707083821296692, "eval_runtime": 25.4747, "eval_samples_per_second": 78.509, "eval_steps_per_second": 1.256, "step": 4000 }, { "epoch": 0.77, "learning_rate": 0.00022440701934820336, "loss": 0.7321, "step": 4020 }, { "epoch": 0.77, "learning_rate": 0.00022402134087549014, "loss": 0.7525, "step": 4040 }, { "epoch": 0.78, "learning_rate": 0.00022363566240277686, "loss": 0.7494, "step": 4060 }, { "epoch": 0.78, "learning_rate": 0.00022324998393006364, "loss": 0.7533, "step": 4080 }, { "epoch": 0.79, "learning_rate": 0.00022286430545735036, "loss": 0.7442, "step": 4100 }, { "epoch": 0.79, "learning_rate": 0.0002224786269846371, "loss": 0.7423, "step": 4120 }, { "epoch": 0.79, "learning_rate": 0.00022209294851192388, "loss": 0.7443, "step": 4140 }, { "epoch": 0.8, "learning_rate": 0.0002217072700392106, "loss": 0.7388, "step": 4160 }, { "epoch": 0.8, "learning_rate": 0.00022132159156649738, "loss": 0.7425, "step": 4180 }, { "epoch": 0.8, "learning_rate": 0.00022093591309378413, "loss": 0.7507, "step": 4200 }, { "epoch": 0.8, "eval_loss": 0.7691813111305237, "eval_runtime": 25.5011, "eval_samples_per_second": 78.428, "eval_steps_per_second": 1.255, "step": 4200 }, { "epoch": 0.81, "learning_rate": 0.00022055023462107088, "loss": 0.7276, "step": 4220 }, { "epoch": 0.81, "learning_rate": 0.00022016455614835763, "loss": 0.7399, "step": 4240 }, { "epoch": 0.82, "learning_rate": 0.00021977887767564438, "loss": 0.7409, "step": 4260 }, { "epoch": 0.82, "learning_rate": 0.00021939319920293113, "loss": 0.7391, "step": 4280 }, { "epoch": 0.82, "learning_rate": 0.0002190075207302179, "loss": 0.741, "step": 4300 }, { "epoch": 0.83, "learning_rate": 0.00021862184225750463, "loss": 0.7404, "step": 4320 }, { "epoch": 0.83, "learning_rate": 0.00021823616378479138, "loss": 0.7356, "step": 4340 }, { "epoch": 0.84, "learning_rate": 0.00021785048531207816, "loss": 0.7458, "step": 4360 }, { "epoch": 0.84, "learning_rate": 0.00021746480683936488, "loss": 0.7373, "step": 4380 }, { "epoch": 0.84, "learning_rate": 0.00021707912836665166, "loss": 0.7455, "step": 4400 }, { "epoch": 0.84, "eval_loss": 0.7680566310882568, "eval_runtime": 25.4479, "eval_samples_per_second": 78.592, "eval_steps_per_second": 1.257, "step": 4400 }, { "epoch": 0.85, "learning_rate": 0.00021669344989393838, "loss": 0.7376, "step": 4420 }, { "epoch": 0.85, "learning_rate": 0.00021630777142122516, "loss": 0.7396, "step": 4440 }, { "epoch": 0.85, "learning_rate": 0.0002159220929485119, "loss": 0.7367, "step": 4460 }, { "epoch": 0.86, "learning_rate": 0.00021553641447579866, "loss": 0.7354, "step": 4480 }, { "epoch": 0.86, "learning_rate": 0.0002151507360030854, "loss": 0.7337, "step": 4500 }, { "epoch": 0.87, "learning_rate": 0.00021476505753037218, "loss": 0.7384, "step": 4520 }, { "epoch": 0.87, "learning_rate": 0.0002143793790576589, "loss": 0.7334, "step": 4540 }, { "epoch": 0.87, "learning_rate": 0.00021399370058494568, "loss": 0.742, "step": 4560 }, { "epoch": 0.88, "learning_rate": 0.0002136080221122324, "loss": 0.7408, "step": 4580 }, { "epoch": 0.88, "learning_rate": 0.00021322234363951916, "loss": 0.7466, "step": 4600 }, { "epoch": 0.88, "eval_loss": 0.7663780450820923, "eval_runtime": 25.53, "eval_samples_per_second": 78.339, "eval_steps_per_second": 1.253, "step": 4600 }, { "epoch": 0.89, "learning_rate": 0.00021283666516680593, "loss": 0.7399, "step": 4620 }, { "epoch": 0.89, "learning_rate": 0.00021245098669409266, "loss": 0.746, "step": 4640 }, { "epoch": 0.89, "learning_rate": 0.00021206530822137943, "loss": 0.7397, "step": 4660 }, { "epoch": 0.9, "learning_rate": 0.00021167962974866618, "loss": 0.7349, "step": 4680 }, { "epoch": 0.9, "learning_rate": 0.00021129395127595293, "loss": 0.7334, "step": 4700 }, { "epoch": 0.9, "learning_rate": 0.00021090827280323968, "loss": 0.738, "step": 4720 }, { "epoch": 0.91, "learning_rate": 0.0002105225943305264, "loss": 0.7398, "step": 4740 }, { "epoch": 0.91, "learning_rate": 0.00021013691585781318, "loss": 0.7465, "step": 4760 }, { "epoch": 0.92, "learning_rate": 0.00020975123738509996, "loss": 0.7388, "step": 4780 }, { "epoch": 0.92, "learning_rate": 0.00020936555891238668, "loss": 0.7462, "step": 4800 }, { "epoch": 0.92, "eval_loss": 0.7650267481803894, "eval_runtime": 25.5463, "eval_samples_per_second": 78.289, "eval_steps_per_second": 1.253, "step": 4800 }, { "epoch": 0.92, "learning_rate": 0.00020897988043967343, "loss": 0.7463, "step": 4820 }, { "epoch": 0.93, "learning_rate": 0.0002085942019669602, "loss": 0.7389, "step": 4840 }, { "epoch": 0.93, "learning_rate": 0.00020820852349424693, "loss": 0.7316, "step": 4860 }, { "epoch": 0.94, "learning_rate": 0.0002078228450215337, "loss": 0.73, "step": 4880 }, { "epoch": 0.94, "learning_rate": 0.00020743716654882043, "loss": 0.7472, "step": 4900 }, { "epoch": 0.94, "learning_rate": 0.0002070514880761072, "loss": 0.7494, "step": 4920 }, { "epoch": 0.95, "learning_rate": 0.00020666580960339396, "loss": 0.7424, "step": 4940 }, { "epoch": 0.95, "learning_rate": 0.0002062801311306807, "loss": 0.7443, "step": 4960 }, { "epoch": 0.95, "learning_rate": 0.00020589445265796746, "loss": 0.7355, "step": 4980 }, { "epoch": 0.96, "learning_rate": 0.00020550877418525423, "loss": 0.7388, "step": 5000 }, { "epoch": 0.96, "eval_loss": 0.7630622386932373, "eval_runtime": 25.8654, "eval_samples_per_second": 77.323, "eval_steps_per_second": 1.237, "step": 5000 }, { "epoch": 0.96, "learning_rate": 0.00020512309571254096, "loss": 0.7317, "step": 5020 }, { "epoch": 0.97, "learning_rate": 0.0002047374172398277, "loss": 0.7385, "step": 5040 }, { "epoch": 0.97, "learning_rate": 0.00020435173876711445, "loss": 0.7369, "step": 5060 }, { "epoch": 0.97, "learning_rate": 0.0002039660602944012, "loss": 0.7243, "step": 5080 }, { "epoch": 0.98, "learning_rate": 0.00020358038182168798, "loss": 0.7334, "step": 5100 }, { "epoch": 0.98, "learning_rate": 0.0002031947033489747, "loss": 0.7433, "step": 5120 }, { "epoch": 0.98, "learning_rate": 0.00020280902487626148, "loss": 0.7202, "step": 5140 }, { "epoch": 0.99, "learning_rate": 0.00020242334640354823, "loss": 0.7336, "step": 5160 }, { "epoch": 0.99, "learning_rate": 0.00020203766793083498, "loss": 0.7324, "step": 5180 }, { "epoch": 1.0, "learning_rate": 0.00020165198945812173, "loss": 0.7363, "step": 5200 }, { "epoch": 1.0, "eval_loss": 0.7617191076278687, "eval_runtime": 25.4884, "eval_samples_per_second": 78.467, "eval_steps_per_second": 1.255, "step": 5200 }, { "epoch": 1.0, "learning_rate": 0.0002012663109854085, "loss": 0.7359, "step": 5220 }, { "epoch": 1.0, "learning_rate": 0.00020088063251269523, "loss": 0.7347, "step": 5240 }, { "epoch": 1.01, "learning_rate": 0.00020049495403998198, "loss": 0.732, "step": 5260 }, { "epoch": 1.01, "learning_rate": 0.00020010927556726873, "loss": 0.7385, "step": 5280 }, { "epoch": 1.02, "learning_rate": 0.00019972359709455548, "loss": 0.7313, "step": 5300 }, { "epoch": 1.02, "learning_rate": 0.00019933791862184226, "loss": 0.7337, "step": 5320 }, { "epoch": 1.02, "learning_rate": 0.00019895224014912898, "loss": 0.733, "step": 5340 }, { "epoch": 1.03, "learning_rate": 0.00019856656167641576, "loss": 0.7226, "step": 5360 }, { "epoch": 1.03, "learning_rate": 0.0001981808832037025, "loss": 0.7363, "step": 5380 }, { "epoch": 1.03, "learning_rate": 0.00019779520473098925, "loss": 0.7296, "step": 5400 }, { "epoch": 1.03, "eval_loss": 0.7608480453491211, "eval_runtime": 25.5307, "eval_samples_per_second": 78.337, "eval_steps_per_second": 1.253, "step": 5400 }, { "epoch": 1.04, "learning_rate": 0.000197409526258276, "loss": 0.7237, "step": 5420 }, { "epoch": 1.04, "learning_rate": 0.00019702384778556273, "loss": 0.735, "step": 5440 }, { "epoch": 1.05, "learning_rate": 0.0001966381693128495, "loss": 0.7379, "step": 5460 }, { "epoch": 1.05, "learning_rate": 0.00019625249084013628, "loss": 0.7372, "step": 5480 }, { "epoch": 1.05, "learning_rate": 0.000195866812367423, "loss": 0.7332, "step": 5500 }, { "epoch": 1.06, "learning_rate": 0.00019548113389470975, "loss": 0.7375, "step": 5520 }, { "epoch": 1.06, "learning_rate": 0.00019509545542199653, "loss": 0.7352, "step": 5540 }, { "epoch": 1.07, "learning_rate": 0.00019470977694928325, "loss": 0.7336, "step": 5560 }, { "epoch": 1.07, "learning_rate": 0.00019432409847657003, "loss": 0.7266, "step": 5580 }, { "epoch": 1.07, "learning_rate": 0.00019393842000385675, "loss": 0.7325, "step": 5600 }, { "epoch": 1.07, "eval_loss": 0.7595871686935425, "eval_runtime": 25.4845, "eval_samples_per_second": 78.479, "eval_steps_per_second": 1.256, "step": 5600 }, { "epoch": 1.08, "learning_rate": 0.00019355274153114353, "loss": 0.7259, "step": 5620 }, { "epoch": 1.08, "learning_rate": 0.00019316706305843028, "loss": 0.7274, "step": 5640 }, { "epoch": 1.08, "learning_rate": 0.000192781384585717, "loss": 0.7254, "step": 5660 }, { "epoch": 1.09, "learning_rate": 0.00019239570611300378, "loss": 0.7332, "step": 5680 }, { "epoch": 1.09, "learning_rate": 0.00019201002764029056, "loss": 0.73, "step": 5700 }, { "epoch": 1.1, "learning_rate": 0.00019162434916757728, "loss": 0.7365, "step": 5720 }, { "epoch": 1.1, "learning_rate": 0.00019123867069486403, "loss": 0.7261, "step": 5740 }, { "epoch": 1.1, "learning_rate": 0.00019085299222215078, "loss": 0.7331, "step": 5760 }, { "epoch": 1.11, "learning_rate": 0.00019046731374943753, "loss": 0.7272, "step": 5780 }, { "epoch": 1.11, "learning_rate": 0.0001900816352767243, "loss": 0.7325, "step": 5800 }, { "epoch": 1.11, "eval_loss": 0.7583591341972351, "eval_runtime": 25.49, "eval_samples_per_second": 78.462, "eval_steps_per_second": 1.255, "step": 5800 }, { "epoch": 1.12, "learning_rate": 0.00018969595680401103, "loss": 0.7277, "step": 5820 }, { "epoch": 1.12, "learning_rate": 0.0001893102783312978, "loss": 0.7352, "step": 5840 }, { "epoch": 1.12, "learning_rate": 0.00018892459985858455, "loss": 0.7312, "step": 5860 }, { "epoch": 1.13, "learning_rate": 0.00018853892138587128, "loss": 0.7296, "step": 5880 }, { "epoch": 1.13, "learning_rate": 0.00018815324291315805, "loss": 0.7275, "step": 5900 }, { "epoch": 1.13, "learning_rate": 0.00018776756444044478, "loss": 0.7345, "step": 5920 }, { "epoch": 1.14, "learning_rate": 0.00018738188596773155, "loss": 0.7322, "step": 5940 }, { "epoch": 1.14, "learning_rate": 0.0001869962074950183, "loss": 0.737, "step": 5960 }, { "epoch": 1.15, "learning_rate": 0.00018661052902230505, "loss": 0.7243, "step": 5980 }, { "epoch": 1.15, "learning_rate": 0.0001862248505495918, "loss": 0.7303, "step": 6000 }, { "epoch": 1.15, "eval_loss": 0.7572018504142761, "eval_runtime": 25.7126, "eval_samples_per_second": 77.783, "eval_steps_per_second": 1.245, "step": 6000 }, { "epoch": 1.15, "learning_rate": 0.00018583917207687858, "loss": 0.7237, "step": 6020 }, { "epoch": 1.16, "learning_rate": 0.0001854534936041653, "loss": 0.735, "step": 6040 }, { "epoch": 1.16, "learning_rate": 0.00018506781513145208, "loss": 0.727, "step": 6060 }, { "epoch": 1.16, "learning_rate": 0.0001846821366587388, "loss": 0.7226, "step": 6080 }, { "epoch": 1.17, "learning_rate": 0.00018429645818602558, "loss": 0.7213, "step": 6100 }, { "epoch": 1.17, "learning_rate": 0.000183930063636948, "loss": 0.7206, "step": 6120 }, { "epoch": 1.18, "learning_rate": 0.0001835443851642347, "loss": 0.7292, "step": 6140 }, { "epoch": 1.18, "learning_rate": 0.0001831587066915215, "loss": 0.7316, "step": 6160 }, { "epoch": 1.18, "learning_rate": 0.00018277302821880824, "loss": 0.7318, "step": 6180 }, { "epoch": 1.19, "learning_rate": 0.000182387349746095, "loss": 0.7302, "step": 6200 }, { "epoch": 1.19, "eval_loss": 0.755982518196106, "eval_runtime": 25.5055, "eval_samples_per_second": 78.415, "eval_steps_per_second": 1.255, "step": 6200 }, { "epoch": 1.19, "learning_rate": 0.00018200167127338174, "loss": 0.728, "step": 6220 }, { "epoch": 1.2, "learning_rate": 0.00018161599280066851, "loss": 0.7334, "step": 6240 }, { "epoch": 1.2, "learning_rate": 0.00018123031432795524, "loss": 0.7303, "step": 6260 }, { "epoch": 1.2, "learning_rate": 0.000180844635855242, "loss": 0.7274, "step": 6280 }, { "epoch": 1.21, "learning_rate": 0.00018045895738252874, "loss": 0.7368, "step": 6300 }, { "epoch": 1.21, "learning_rate": 0.00018007327890981549, "loss": 0.724, "step": 6320 }, { "epoch": 1.21, "learning_rate": 0.00017968760043710226, "loss": 0.7229, "step": 6340 }, { "epoch": 1.22, "learning_rate": 0.00017930192196438899, "loss": 0.7216, "step": 6360 }, { "epoch": 1.22, "learning_rate": 0.00017891624349167576, "loss": 0.7292, "step": 6380 }, { "epoch": 1.23, "learning_rate": 0.0001785305650189625, "loss": 0.7226, "step": 6400 }, { "epoch": 1.23, "eval_loss": 0.7554095387458801, "eval_runtime": 25.5062, "eval_samples_per_second": 78.412, "eval_steps_per_second": 1.255, "step": 6400 }, { "epoch": 1.23, "learning_rate": 0.00017814488654624926, "loss": 0.7262, "step": 6420 }, { "epoch": 1.23, "learning_rate": 0.000177759208073536, "loss": 0.7274, "step": 6440 }, { "epoch": 1.24, "learning_rate": 0.00017737352960082276, "loss": 0.7271, "step": 6460 }, { "epoch": 1.24, "learning_rate": 0.0001769878511281095, "loss": 0.7299, "step": 6480 }, { "epoch": 1.25, "learning_rate": 0.0001766021726553963, "loss": 0.7264, "step": 6500 }, { "epoch": 1.25, "learning_rate": 0.000176216494182683, "loss": 0.7285, "step": 6520 }, { "epoch": 1.25, "learning_rate": 0.0001758308157099698, "loss": 0.7216, "step": 6540 }, { "epoch": 1.26, "learning_rate": 0.00017544513723725654, "loss": 0.7215, "step": 6560 }, { "epoch": 1.26, "learning_rate": 0.00017505945876454326, "loss": 0.7253, "step": 6580 }, { "epoch": 1.26, "learning_rate": 0.00017467378029183004, "loss": 0.7246, "step": 6600 }, { "epoch": 1.26, "eval_loss": 0.7540405988693237, "eval_runtime": 25.4725, "eval_samples_per_second": 78.516, "eval_steps_per_second": 1.256, "step": 6600 }, { "epoch": 1.27, "learning_rate": 0.00017428810181911676, "loss": 0.7166, "step": 6620 }, { "epoch": 1.27, "learning_rate": 0.00017390242334640354, "loss": 0.7213, "step": 6640 }, { "epoch": 1.28, "learning_rate": 0.00017351674487369029, "loss": 0.7305, "step": 6660 }, { "epoch": 1.28, "learning_rate": 0.00017313106640097704, "loss": 0.7347, "step": 6680 }, { "epoch": 1.28, "learning_rate": 0.00017274538792826379, "loss": 0.7272, "step": 6700 }, { "epoch": 1.29, "learning_rate": 0.00017235970945555056, "loss": 0.7224, "step": 6720 }, { "epoch": 1.29, "learning_rate": 0.00017197403098283728, "loss": 0.7327, "step": 6740 }, { "epoch": 1.3, "learning_rate": 0.00017158835251012406, "loss": 0.7228, "step": 6760 }, { "epoch": 1.3, "learning_rate": 0.00017120267403741078, "loss": 0.7344, "step": 6780 }, { "epoch": 1.3, "learning_rate": 0.00017081699556469753, "loss": 0.7269, "step": 6800 }, { "epoch": 1.3, "eval_loss": 0.7531024813652039, "eval_runtime": 25.6796, "eval_samples_per_second": 77.883, "eval_steps_per_second": 1.246, "step": 6800 }, { "epoch": 1.31, "learning_rate": 0.0001704313170919843, "loss": 0.7362, "step": 6820 }, { "epoch": 1.31, "learning_rate": 0.00017004563861927103, "loss": 0.7293, "step": 6840 }, { "epoch": 1.31, "learning_rate": 0.0001696599601465578, "loss": 0.7286, "step": 6860 }, { "epoch": 1.32, "learning_rate": 0.00016927428167384456, "loss": 0.7148, "step": 6880 }, { "epoch": 1.32, "learning_rate": 0.0001688886032011313, "loss": 0.72, "step": 6900 }, { "epoch": 1.33, "learning_rate": 0.00016850292472841806, "loss": 0.7239, "step": 6920 }, { "epoch": 1.33, "learning_rate": 0.0001681172462557048, "loss": 0.726, "step": 6940 }, { "epoch": 1.33, "learning_rate": 0.00016773156778299156, "loss": 0.7286, "step": 6960 }, { "epoch": 1.34, "learning_rate": 0.00016734588931027834, "loss": 0.7276, "step": 6980 }, { "epoch": 1.34, "learning_rate": 0.00016696021083756506, "loss": 0.7258, "step": 7000 }, { "epoch": 1.34, "eval_loss": 0.7521764636039734, "eval_runtime": 25.5237, "eval_samples_per_second": 78.359, "eval_steps_per_second": 1.254, "step": 7000 }, { "epoch": 1.35, "learning_rate": 0.0001665745323648518, "loss": 0.7326, "step": 7020 }, { "epoch": 1.35, "learning_rate": 0.00016618885389213859, "loss": 0.7311, "step": 7040 }, { "epoch": 1.35, "learning_rate": 0.0001658031754194253, "loss": 0.7295, "step": 7060 }, { "epoch": 1.36, "learning_rate": 0.00016541749694671208, "loss": 0.7279, "step": 7080 }, { "epoch": 1.36, "learning_rate": 0.00016503181847399883, "loss": 0.7293, "step": 7100 }, { "epoch": 1.36, "learning_rate": 0.00016464614000128558, "loss": 0.7256, "step": 7120 }, { "epoch": 1.37, "learning_rate": 0.00016426046152857233, "loss": 0.7204, "step": 7140 }, { "epoch": 1.37, "learning_rate": 0.00016387478305585908, "loss": 0.718, "step": 7160 }, { "epoch": 1.38, "learning_rate": 0.00016348910458314583, "loss": 0.7206, "step": 7180 }, { "epoch": 1.38, "learning_rate": 0.0001631034261104326, "loss": 0.7282, "step": 7200 }, { "epoch": 1.38, "eval_loss": 0.7514960765838623, "eval_runtime": 25.5249, "eval_samples_per_second": 78.355, "eval_steps_per_second": 1.254, "step": 7200 }, { "epoch": 1.38, "learning_rate": 0.00016271774763771933, "loss": 0.7162, "step": 7220 }, { "epoch": 1.39, "learning_rate": 0.00016233206916500608, "loss": 0.7277, "step": 7240 }, { "epoch": 1.39, "learning_rate": 0.00016194639069229286, "loss": 0.7147, "step": 7260 }, { "epoch": 1.39, "learning_rate": 0.00016156071221957958, "loss": 0.7339, "step": 7280 }, { "epoch": 1.4, "learning_rate": 0.00016117503374686636, "loss": 0.7257, "step": 7300 }, { "epoch": 1.4, "learning_rate": 0.00016078935527415308, "loss": 0.728, "step": 7320 }, { "epoch": 1.41, "learning_rate": 0.00016040367680143986, "loss": 0.7139, "step": 7340 }, { "epoch": 1.41, "learning_rate": 0.0001600179983287266, "loss": 0.7202, "step": 7360 }, { "epoch": 1.41, "learning_rate": 0.00015963231985601336, "loss": 0.7323, "step": 7380 }, { "epoch": 1.42, "learning_rate": 0.0001592466413833001, "loss": 0.7198, "step": 7400 }, { "epoch": 1.42, "eval_loss": 0.750492513179779, "eval_runtime": 25.8887, "eval_samples_per_second": 77.254, "eval_steps_per_second": 1.236, "step": 7400 }, { "epoch": 1.42, "learning_rate": 0.00015886096291058688, "loss": 0.7138, "step": 7420 }, { "epoch": 1.43, "learning_rate": 0.0001584752844378736, "loss": 0.7205, "step": 7440 }, { "epoch": 1.43, "learning_rate": 0.00015808960596516038, "loss": 0.7178, "step": 7460 }, { "epoch": 1.43, "learning_rate": 0.0001577039274924471, "loss": 0.7251, "step": 7480 }, { "epoch": 1.44, "learning_rate": 0.00015731824901973386, "loss": 0.7187, "step": 7500 }, { "epoch": 1.44, "learning_rate": 0.00015693257054702063, "loss": 0.7238, "step": 7520 }, { "epoch": 1.44, "learning_rate": 0.00015654689207430736, "loss": 0.7283, "step": 7540 }, { "epoch": 1.45, "learning_rate": 0.00015616121360159413, "loss": 0.7189, "step": 7560 }, { "epoch": 1.45, "learning_rate": 0.00015577553512888088, "loss": 0.7216, "step": 7580 }, { "epoch": 1.46, "learning_rate": 0.00015538985665616763, "loss": 0.7219, "step": 7600 }, { "epoch": 1.46, "eval_loss": 0.7496184706687927, "eval_runtime": 25.4957, "eval_samples_per_second": 78.445, "eval_steps_per_second": 1.255, "step": 7600 }, { "epoch": 1.46, "learning_rate": 0.00015500417818345438, "loss": 0.7233, "step": 7620 }, { "epoch": 1.46, "learning_rate": 0.0001546184997107411, "loss": 0.7241, "step": 7640 }, { "epoch": 1.47, "learning_rate": 0.00015423282123802788, "loss": 0.7194, "step": 7660 }, { "epoch": 1.47, "learning_rate": 0.00015384714276531466, "loss": 0.7229, "step": 7680 }, { "epoch": 1.48, "learning_rate": 0.00015346146429260138, "loss": 0.7219, "step": 7700 }, { "epoch": 1.48, "learning_rate": 0.00015307578581988813, "loss": 0.7027, "step": 7720 }, { "epoch": 1.48, "learning_rate": 0.0001526901073471749, "loss": 0.7171, "step": 7740 }, { "epoch": 1.49, "learning_rate": 0.00015230442887446163, "loss": 0.7193, "step": 7760 }, { "epoch": 1.49, "learning_rate": 0.0001519187504017484, "loss": 0.7269, "step": 7780 }, { "epoch": 1.49, "learning_rate": 0.00015153307192903513, "loss": 0.7171, "step": 7800 }, { "epoch": 1.49, "eval_loss": 0.7494381070137024, "eval_runtime": 25.5318, "eval_samples_per_second": 78.334, "eval_steps_per_second": 1.253, "step": 7800 }, { "epoch": 1.5, "learning_rate": 0.0001511473934563219, "loss": 0.7186, "step": 7820 }, { "epoch": 1.5, "learning_rate": 0.00015076171498360866, "loss": 0.7137, "step": 7840 }, { "epoch": 1.51, "learning_rate": 0.00015037603651089538, "loss": 0.7212, "step": 7860 }, { "epoch": 1.51, "learning_rate": 0.00014999035803818216, "loss": 0.7167, "step": 7880 }, { "epoch": 1.51, "learning_rate": 0.0001496046795654689, "loss": 0.7203, "step": 7900 }, { "epoch": 1.52, "learning_rate": 0.00014921900109275566, "loss": 0.714, "step": 7920 }, { "epoch": 1.52, "learning_rate": 0.0001488333226200424, "loss": 0.7153, "step": 7940 }, { "epoch": 1.53, "learning_rate": 0.00014844764414732916, "loss": 0.7176, "step": 7960 }, { "epoch": 1.53, "learning_rate": 0.0001480619656746159, "loss": 0.7049, "step": 7980 }, { "epoch": 1.53, "learning_rate": 0.00014767628720190265, "loss": 0.7204, "step": 8000 }, { "epoch": 1.53, "eval_loss": 0.7486086487770081, "eval_runtime": 25.5275, "eval_samples_per_second": 78.347, "eval_steps_per_second": 1.254, "step": 8000 }, { "epoch": 1.54, "learning_rate": 0.00014729060872918943, "loss": 0.7167, "step": 8020 }, { "epoch": 1.54, "learning_rate": 0.00014690493025647618, "loss": 0.72, "step": 8040 }, { "epoch": 1.54, "learning_rate": 0.00014651925178376293, "loss": 0.7203, "step": 8060 }, { "epoch": 1.55, "learning_rate": 0.00014613357331104968, "loss": 0.7258, "step": 8080 }, { "epoch": 1.55, "learning_rate": 0.00014574789483833643, "loss": 0.715, "step": 8100 }, { "epoch": 1.56, "learning_rate": 0.00014536221636562318, "loss": 0.7245, "step": 8120 }, { "epoch": 1.56, "learning_rate": 0.00014497653789290993, "loss": 0.7258, "step": 8140 }, { "epoch": 1.56, "learning_rate": 0.00014459085942019668, "loss": 0.7234, "step": 8160 }, { "epoch": 1.57, "learning_rate": 0.00014420518094748343, "loss": 0.7128, "step": 8180 }, { "epoch": 1.57, "learning_rate": 0.00014381950247477018, "loss": 0.7181, "step": 8200 }, { "epoch": 1.57, "eval_loss": 0.7475513219833374, "eval_runtime": 25.5412, "eval_samples_per_second": 78.305, "eval_steps_per_second": 1.253, "step": 8200 }, { "epoch": 1.58, "learning_rate": 0.00014343382400205693, "loss": 0.7236, "step": 8220 }, { "epoch": 1.58, "learning_rate": 0.0001430481455293437, "loss": 0.7125, "step": 8240 }, { "epoch": 1.58, "learning_rate": 0.00014266246705663046, "loss": 0.7186, "step": 8260 }, { "epoch": 1.59, "learning_rate": 0.0001422767885839172, "loss": 0.7203, "step": 8280 }, { "epoch": 1.59, "learning_rate": 0.00014189111011120396, "loss": 0.7156, "step": 8300 }, { "epoch": 1.59, "learning_rate": 0.0001415054316384907, "loss": 0.714, "step": 8320 }, { "epoch": 1.6, "learning_rate": 0.00014111975316577745, "loss": 0.7129, "step": 8340 }, { "epoch": 1.6, "learning_rate": 0.0001407340746930642, "loss": 0.7179, "step": 8360 }, { "epoch": 1.61, "learning_rate": 0.00014036768014398662, "loss": 0.7197, "step": 8380 }, { "epoch": 1.61, "learning_rate": 0.00013998200167127337, "loss": 0.7287, "step": 8400 }, { "epoch": 1.61, "eval_loss": 0.7470650672912598, "eval_runtime": 25.5238, "eval_samples_per_second": 78.358, "eval_steps_per_second": 1.254, "step": 8400 }, { "epoch": 1.61, "learning_rate": 0.00013959632319856011, "loss": 0.718, "step": 8420 }, { "epoch": 1.62, "learning_rate": 0.00013921064472584686, "loss": 0.7166, "step": 8440 }, { "epoch": 1.62, "learning_rate": 0.00013882496625313361, "loss": 0.7218, "step": 8460 }, { "epoch": 1.62, "learning_rate": 0.0001384392877804204, "loss": 0.723, "step": 8480 }, { "epoch": 1.63, "learning_rate": 0.00013805360930770714, "loss": 0.7104, "step": 8500 }, { "epoch": 1.63, "learning_rate": 0.0001376679308349939, "loss": 0.7136, "step": 8520 }, { "epoch": 1.64, "learning_rate": 0.0001372822523622806, "loss": 0.7237, "step": 8540 }, { "epoch": 1.64, "learning_rate": 0.0001368965738895674, "loss": 0.7196, "step": 8560 }, { "epoch": 1.64, "learning_rate": 0.00013651089541685414, "loss": 0.7218, "step": 8580 }, { "epoch": 1.65, "learning_rate": 0.0001361252169441409, "loss": 0.7132, "step": 8600 }, { "epoch": 1.65, "eval_loss": 0.7465201020240784, "eval_runtime": 25.542, "eval_samples_per_second": 78.302, "eval_steps_per_second": 1.253, "step": 8600 }, { "epoch": 1.65, "learning_rate": 0.00013573953847142764, "loss": 0.7139, "step": 8620 }, { "epoch": 1.66, "learning_rate": 0.0001353538599987144, "loss": 0.7093, "step": 8640 }, { "epoch": 1.66, "learning_rate": 0.00013496818152600114, "loss": 0.7243, "step": 8660 }, { "epoch": 1.66, "learning_rate": 0.0001345825030532879, "loss": 0.7127, "step": 8680 }, { "epoch": 1.67, "learning_rate": 0.00013419682458057464, "loss": 0.7148, "step": 8700 }, { "epoch": 1.67, "learning_rate": 0.00013381114610786142, "loss": 0.7236, "step": 8720 }, { "epoch": 1.67, "learning_rate": 0.00013342546763514817, "loss": 0.7103, "step": 8740 }, { "epoch": 1.68, "learning_rate": 0.0001330397891624349, "loss": 0.7133, "step": 8760 }, { "epoch": 1.68, "learning_rate": 0.00013265411068972164, "loss": 0.7182, "step": 8780 }, { "epoch": 1.69, "learning_rate": 0.00013226843221700841, "loss": 0.7198, "step": 8800 }, { "epoch": 1.69, "eval_loss": 0.7450763583183289, "eval_runtime": 25.4725, "eval_samples_per_second": 78.516, "eval_steps_per_second": 1.256, "step": 8800 }, { "epoch": 1.69, "learning_rate": 0.00013188275374429516, "loss": 0.7073, "step": 8820 }, { "epoch": 1.69, "learning_rate": 0.00013149707527158191, "loss": 0.7208, "step": 8840 }, { "epoch": 1.7, "learning_rate": 0.00013111139679886866, "loss": 0.7067, "step": 8860 }, { "epoch": 1.7, "learning_rate": 0.0001307257183261554, "loss": 0.7149, "step": 8880 }, { "epoch": 1.71, "learning_rate": 0.00013034003985344216, "loss": 0.7133, "step": 8900 }, { "epoch": 1.71, "learning_rate": 0.0001299543613807289, "loss": 0.7137, "step": 8920 }, { "epoch": 1.71, "learning_rate": 0.0001295686829080157, "loss": 0.719, "step": 8940 }, { "epoch": 1.72, "learning_rate": 0.00012918300443530244, "loss": 0.7198, "step": 8960 }, { "epoch": 1.72, "learning_rate": 0.0001287973259625892, "loss": 0.7074, "step": 8980 }, { "epoch": 1.72, "learning_rate": 0.0001284116474898759, "loss": 0.7216, "step": 9000 }, { "epoch": 1.72, "eval_loss": 0.7454459071159363, "eval_runtime": 25.5717, "eval_samples_per_second": 78.211, "eval_steps_per_second": 1.251, "step": 9000 }, { "epoch": 1.73, "learning_rate": 0.0001280259690171627, "loss": 0.7203, "step": 9020 }, { "epoch": 1.73, "learning_rate": 0.00012764029054444944, "loss": 0.7133, "step": 9040 }, { "epoch": 1.74, "learning_rate": 0.0001272546120717362, "loss": 0.7081, "step": 9060 }, { "epoch": 1.74, "learning_rate": 0.00012686893359902294, "loss": 0.7153, "step": 9080 }, { "epoch": 1.74, "learning_rate": 0.0001264832551263097, "loss": 0.7108, "step": 9100 }, { "epoch": 1.75, "learning_rate": 0.00012609757665359644, "loss": 0.7106, "step": 9120 }, { "epoch": 1.75, "learning_rate": 0.0001257118981808832, "loss": 0.7117, "step": 9140 }, { "epoch": 1.76, "learning_rate": 0.00012532621970816994, "loss": 0.7171, "step": 9160 }, { "epoch": 1.76, "learning_rate": 0.00012494054123545671, "loss": 0.7148, "step": 9180 }, { "epoch": 1.76, "learning_rate": 0.00012455486276274346, "loss": 0.714, "step": 9200 }, { "epoch": 1.76, "eval_loss": 0.7446411848068237, "eval_runtime": 25.5622, "eval_samples_per_second": 78.24, "eval_steps_per_second": 1.252, "step": 9200 }, { "epoch": 1.77, "learning_rate": 0.00012416918429003019, "loss": 0.7133, "step": 9220 }, { "epoch": 1.77, "learning_rate": 0.00012378350581731694, "loss": 0.7108, "step": 9240 }, { "epoch": 1.77, "learning_rate": 0.0001233978273446037, "loss": 0.7147, "step": 9260 }, { "epoch": 1.78, "learning_rate": 0.00012301214887189046, "loss": 0.715, "step": 9280 }, { "epoch": 1.78, "learning_rate": 0.0001226264703991772, "loss": 0.7255, "step": 9300 }, { "epoch": 1.79, "learning_rate": 0.00012224079192646396, "loss": 0.7168, "step": 9320 }, { "epoch": 1.79, "learning_rate": 0.00012185511345375073, "loss": 0.7155, "step": 9340 }, { "epoch": 1.79, "learning_rate": 0.00012146943498103746, "loss": 0.7064, "step": 9360 }, { "epoch": 1.8, "learning_rate": 0.00012108375650832421, "loss": 0.716, "step": 9380 }, { "epoch": 1.8, "learning_rate": 0.00012069807803561096, "loss": 0.7145, "step": 9400 }, { "epoch": 1.8, "eval_loss": 0.7441000938415527, "eval_runtime": 25.7378, "eval_samples_per_second": 77.707, "eval_steps_per_second": 1.243, "step": 9400 }, { "epoch": 1.8, "learning_rate": 0.00012031239956289772, "loss": 0.7135, "step": 9420 }, { "epoch": 1.81, "learning_rate": 0.00011992672109018447, "loss": 0.7164, "step": 9440 }, { "epoch": 1.81, "learning_rate": 0.00011954104261747122, "loss": 0.714, "step": 9460 }, { "epoch": 1.82, "learning_rate": 0.00011915536414475797, "loss": 0.7173, "step": 9480 }, { "epoch": 1.82, "learning_rate": 0.00011876968567204474, "loss": 0.7102, "step": 9500 }, { "epoch": 1.82, "learning_rate": 0.00011838400719933149, "loss": 0.7122, "step": 9520 }, { "epoch": 1.83, "learning_rate": 0.00011799832872661822, "loss": 0.7197, "step": 9540 }, { "epoch": 1.83, "learning_rate": 0.00011761265025390497, "loss": 0.7132, "step": 9560 }, { "epoch": 1.84, "learning_rate": 0.00011722697178119174, "loss": 0.7255, "step": 9580 }, { "epoch": 1.84, "learning_rate": 0.00011684129330847849, "loss": 0.7175, "step": 9600 }, { "epoch": 1.84, "eval_loss": 0.7432481646537781, "eval_runtime": 25.5076, "eval_samples_per_second": 78.408, "eval_steps_per_second": 1.255, "step": 9600 }, { "epoch": 1.84, "learning_rate": 0.00011645561483576524, "loss": 0.7125, "step": 9620 }, { "epoch": 1.85, "learning_rate": 0.00011606993636305199, "loss": 0.7119, "step": 9640 }, { "epoch": 1.85, "learning_rate": 0.00011568425789033875, "loss": 0.7147, "step": 9660 }, { "epoch": 1.85, "learning_rate": 0.0001152985794176255, "loss": 0.7101, "step": 9680 }, { "epoch": 1.86, "learning_rate": 0.00011491290094491225, "loss": 0.7105, "step": 9700 }, { "epoch": 1.86, "learning_rate": 0.000114527222472199, "loss": 0.7153, "step": 9720 }, { "epoch": 1.87, "learning_rate": 0.00011414154399948576, "loss": 0.7047, "step": 9740 }, { "epoch": 1.87, "learning_rate": 0.00011375586552677251, "loss": 0.6967, "step": 9760 }, { "epoch": 1.87, "learning_rate": 0.00011337018705405925, "loss": 0.7094, "step": 9780 }, { "epoch": 1.88, "learning_rate": 0.000112984508581346, "loss": 0.7195, "step": 9800 }, { "epoch": 1.88, "eval_loss": 0.7431700229644775, "eval_runtime": 25.5752, "eval_samples_per_second": 78.201, "eval_steps_per_second": 1.251, "step": 9800 }, { "epoch": 1.88, "learning_rate": 0.00011259883010863276, "loss": 0.7122, "step": 9820 }, { "epoch": 1.89, "learning_rate": 0.00011221315163591951, "loss": 0.7193, "step": 9840 }, { "epoch": 1.89, "learning_rate": 0.00011182747316320626, "loss": 0.7147, "step": 9860 }, { "epoch": 1.89, "learning_rate": 0.00011144179469049301, "loss": 0.7058, "step": 9880 }, { "epoch": 1.9, "learning_rate": 0.00011105611621777977, "loss": 0.7106, "step": 9900 }, { "epoch": 1.9, "learning_rate": 0.00011067043774506652, "loss": 0.71, "step": 9920 }, { "epoch": 1.9, "learning_rate": 0.00011028475927235327, "loss": 0.7182, "step": 9940 }, { "epoch": 1.91, "learning_rate": 0.00010989908079964002, "loss": 0.7048, "step": 9960 }, { "epoch": 1.91, "learning_rate": 0.00010951340232692679, "loss": 0.7165, "step": 9980 }, { "epoch": 1.92, "learning_rate": 0.00010912772385421352, "loss": 0.7153, "step": 10000 }, { "epoch": 1.92, "eval_loss": 0.7425808310508728, "eval_runtime": 25.6297, "eval_samples_per_second": 78.034, "eval_steps_per_second": 1.249, "step": 10000 }, { "epoch": 1.92, "learning_rate": 0.00010874204538150027, "loss": 0.7127, "step": 10020 }, { "epoch": 1.92, "learning_rate": 0.00010835636690878702, "loss": 0.7062, "step": 10040 }, { "epoch": 1.93, "learning_rate": 0.00010797068843607378, "loss": 0.7125, "step": 10060 }, { "epoch": 1.93, "learning_rate": 0.00010758500996336053, "loss": 0.7114, "step": 10080 }, { "epoch": 1.94, "learning_rate": 0.00010719933149064728, "loss": 0.7096, "step": 10100 }, { "epoch": 1.94, "learning_rate": 0.00010681365301793405, "loss": 0.7119, "step": 10120 }, { "epoch": 1.94, "learning_rate": 0.0001064279745452208, "loss": 0.7034, "step": 10140 }, { "epoch": 1.95, "learning_rate": 0.00010604229607250755, "loss": 0.7049, "step": 10160 }, { "epoch": 1.95, "learning_rate": 0.0001056566175997943, "loss": 0.7156, "step": 10180 }, { "epoch": 1.95, "learning_rate": 0.00010527093912708106, "loss": 0.718, "step": 10200 }, { "epoch": 1.95, "eval_loss": 0.7418650984764099, "eval_runtime": 25.554, "eval_samples_per_second": 78.266, "eval_steps_per_second": 1.252, "step": 10200 }, { "epoch": 1.96, "learning_rate": 0.00010488526065436781, "loss": 0.7141, "step": 10220 }, { "epoch": 1.96, "learning_rate": 0.00010449958218165455, "loss": 0.7073, "step": 10240 }, { "epoch": 1.97, "learning_rate": 0.0001041139037089413, "loss": 0.7129, "step": 10260 }, { "epoch": 1.97, "learning_rate": 0.00010372822523622806, "loss": 0.7174, "step": 10280 }, { "epoch": 1.97, "learning_rate": 0.00010334254676351481, "loss": 0.7112, "step": 10300 }, { "epoch": 1.98, "learning_rate": 0.00010295686829080156, "loss": 0.7073, "step": 10320 }, { "epoch": 1.98, "learning_rate": 0.00010257118981808831, "loss": 0.7164, "step": 10340 }, { "epoch": 1.99, "learning_rate": 0.00010218551134537507, "loss": 0.7057, "step": 10360 }, { "epoch": 1.99, "learning_rate": 0.00010179983287266182, "loss": 0.709, "step": 10380 }, { "epoch": 1.99, "learning_rate": 0.00010141415439994857, "loss": 0.7147, "step": 10400 }, { "epoch": 1.99, "eval_loss": 0.7417293787002563, "eval_runtime": 25.4964, "eval_samples_per_second": 78.443, "eval_steps_per_second": 1.255, "step": 10400 }, { "epoch": 2.0, "learning_rate": 0.00010102847592723531, "loss": 0.713, "step": 10420 }, { "epoch": 2.0, "learning_rate": 0.00010064279745452208, "loss": 0.7128, "step": 10440 }, { "epoch": 2.0, "learning_rate": 0.00010025711898180882, "loss": 0.7094, "step": 10460 }, { "epoch": 2.01, "learning_rate": 9.987144050909557e-05, "loss": 0.7008, "step": 10480 }, { "epoch": 2.01, "learning_rate": 9.948576203638232e-05, "loss": 0.7083, "step": 10500 }, { "epoch": 2.02, "learning_rate": 9.910008356366908e-05, "loss": 0.7049, "step": 10520 }, { "epoch": 2.02, "learning_rate": 9.871440509095583e-05, "loss": 0.7041, "step": 10540 }, { "epoch": 2.02, "learning_rate": 9.834801054187824e-05, "loss": 0.7105, "step": 10560 }, { "epoch": 2.03, "learning_rate": 9.7962332069165e-05, "loss": 0.7041, "step": 10580 }, { "epoch": 2.03, "learning_rate": 9.757665359645176e-05, "loss": 0.7103, "step": 10600 }, { "epoch": 2.03, "eval_loss": 0.7410894632339478, "eval_runtime": 25.5424, "eval_samples_per_second": 78.301, "eval_steps_per_second": 1.253, "step": 10600 }, { "epoch": 2.03, "learning_rate": 9.71909751237385e-05, "loss": 0.7037, "step": 10620 }, { "epoch": 2.04, "learning_rate": 9.680529665102524e-05, "loss": 0.7078, "step": 10640 }, { "epoch": 2.04, "learning_rate": 9.641961817831202e-05, "loss": 0.7116, "step": 10660 }, { "epoch": 2.05, "learning_rate": 9.603393970559876e-05, "loss": 0.7094, "step": 10680 }, { "epoch": 2.05, "learning_rate": 9.56482612328855e-05, "loss": 0.7217, "step": 10700 }, { "epoch": 2.05, "learning_rate": 9.526258276017226e-05, "loss": 0.7038, "step": 10720 }, { "epoch": 2.06, "learning_rate": 9.487690428745902e-05, "loss": 0.7131, "step": 10740 }, { "epoch": 2.06, "learning_rate": 9.449122581474577e-05, "loss": 0.7051, "step": 10760 }, { "epoch": 2.07, "learning_rate": 9.410554734203252e-05, "loss": 0.7058, "step": 10780 }, { "epoch": 2.07, "learning_rate": 9.371986886931927e-05, "loss": 0.7039, "step": 10800 }, { "epoch": 2.07, "eval_loss": 0.7405736446380615, "eval_runtime": 25.7467, "eval_samples_per_second": 77.68, "eval_steps_per_second": 1.243, "step": 10800 }, { "epoch": 2.07, "learning_rate": 9.333419039660603e-05, "loss": 0.7101, "step": 10820 }, { "epoch": 2.08, "learning_rate": 9.294851192389278e-05, "loss": 0.6991, "step": 10840 }, { "epoch": 2.08, "learning_rate": 9.256283345117953e-05, "loss": 0.7069, "step": 10860 }, { "epoch": 2.08, "learning_rate": 9.217715497846627e-05, "loss": 0.7094, "step": 10880 }, { "epoch": 2.09, "learning_rate": 9.179147650575303e-05, "loss": 0.7103, "step": 10900 }, { "epoch": 2.09, "learning_rate": 9.140579803303978e-05, "loss": 0.7015, "step": 10920 }, { "epoch": 2.1, "learning_rate": 9.102011956032653e-05, "loss": 0.712, "step": 10940 }, { "epoch": 2.1, "learning_rate": 9.063444108761328e-05, "loss": 0.707, "step": 10960 }, { "epoch": 2.1, "learning_rate": 9.024876261490004e-05, "loss": 0.7009, "step": 10980 }, { "epoch": 2.11, "learning_rate": 8.986308414218679e-05, "loss": 0.7062, "step": 11000 }, { "epoch": 2.11, "eval_loss": 0.7398320436477661, "eval_runtime": 25.5459, "eval_samples_per_second": 78.29, "eval_steps_per_second": 1.253, "step": 11000 }, { "epoch": 2.11, "learning_rate": 8.947740566947354e-05, "loss": 0.7054, "step": 11020 }, { "epoch": 2.12, "learning_rate": 8.909172719676029e-05, "loss": 0.7094, "step": 11040 }, { "epoch": 2.12, "learning_rate": 8.870604872404706e-05, "loss": 0.7059, "step": 11060 }, { "epoch": 2.12, "learning_rate": 8.83203702513338e-05, "loss": 0.7202, "step": 11080 }, { "epoch": 2.13, "learning_rate": 8.793469177862054e-05, "loss": 0.699, "step": 11100 }, { "epoch": 2.13, "learning_rate": 8.754901330590729e-05, "loss": 0.7137, "step": 11120 }, { "epoch": 2.13, "learning_rate": 8.716333483319405e-05, "loss": 0.7048, "step": 11140 }, { "epoch": 2.14, "learning_rate": 8.67776563604808e-05, "loss": 0.7089, "step": 11160 }, { "epoch": 2.14, "learning_rate": 8.639197788776755e-05, "loss": 0.7057, "step": 11180 }, { "epoch": 2.15, "learning_rate": 8.60062994150543e-05, "loss": 0.709, "step": 11200 }, { "epoch": 2.15, "eval_loss": 0.7393301725387573, "eval_runtime": 25.7257, "eval_samples_per_second": 77.743, "eval_steps_per_second": 1.244, "step": 11200 }, { "epoch": 2.15, "learning_rate": 8.562062094234107e-05, "loss": 0.7027, "step": 11220 }, { "epoch": 2.15, "learning_rate": 8.523494246962782e-05, "loss": 0.7082, "step": 11240 }, { "epoch": 2.16, "learning_rate": 8.484926399691457e-05, "loss": 0.7007, "step": 11260 }, { "epoch": 2.16, "learning_rate": 8.446358552420132e-05, "loss": 0.7011, "step": 11280 }, { "epoch": 2.17, "learning_rate": 8.407790705148808e-05, "loss": 0.7067, "step": 11300 }, { "epoch": 2.17, "learning_rate": 8.369222857877483e-05, "loss": 0.702, "step": 11320 }, { "epoch": 2.17, "learning_rate": 8.330655010606157e-05, "loss": 0.7126, "step": 11340 }, { "epoch": 2.18, "learning_rate": 8.292087163334832e-05, "loss": 0.6947, "step": 11360 }, { "epoch": 2.18, "learning_rate": 8.253519316063508e-05, "loss": 0.7033, "step": 11380 }, { "epoch": 2.18, "learning_rate": 8.214951468792183e-05, "loss": 0.7075, "step": 11400 }, { "epoch": 2.18, "eval_loss": 0.7390503883361816, "eval_runtime": 25.6097, "eval_samples_per_second": 78.095, "eval_steps_per_second": 1.25, "step": 11400 }, { "epoch": 2.19, "learning_rate": 8.176383621520858e-05, "loss": 0.7081, "step": 11420 }, { "epoch": 2.19, "learning_rate": 8.137815774249533e-05, "loss": 0.7114, "step": 11440 }, { "epoch": 2.2, "learning_rate": 8.099247926978209e-05, "loss": 0.7105, "step": 11460 }, { "epoch": 2.2, "learning_rate": 8.060680079706884e-05, "loss": 0.7113, "step": 11480 }, { "epoch": 2.2, "learning_rate": 8.022112232435559e-05, "loss": 0.7109, "step": 11500 }, { "epoch": 2.21, "learning_rate": 7.983544385164233e-05, "loss": 0.7039, "step": 11520 }, { "epoch": 2.21, "learning_rate": 7.94497653789291e-05, "loss": 0.7144, "step": 11540 }, { "epoch": 2.21, "learning_rate": 7.906408690621584e-05, "loss": 0.7003, "step": 11560 }, { "epoch": 2.22, "learning_rate": 7.867840843350259e-05, "loss": 0.7028, "step": 11580 }, { "epoch": 2.22, "learning_rate": 7.829272996078934e-05, "loss": 0.7018, "step": 11600 }, { "epoch": 2.22, "eval_loss": 0.7388148307800293, "eval_runtime": 25.5069, "eval_samples_per_second": 78.41, "eval_steps_per_second": 1.255, "step": 11600 }, { "epoch": 2.23, "learning_rate": 7.79070514880761e-05, "loss": 0.7113, "step": 11620 }, { "epoch": 2.23, "learning_rate": 7.752137301536285e-05, "loss": 0.7136, "step": 11640 }, { "epoch": 2.23, "learning_rate": 7.71356945426496e-05, "loss": 0.7097, "step": 11660 }, { "epoch": 2.24, "learning_rate": 7.675001606993635e-05, "loss": 0.7057, "step": 11680 }, { "epoch": 2.24, "learning_rate": 7.636433759722312e-05, "loss": 0.7028, "step": 11700 }, { "epoch": 2.25, "learning_rate": 7.597865912450986e-05, "loss": 0.708, "step": 11720 }, { "epoch": 2.25, "learning_rate": 7.559298065179661e-05, "loss": 0.7088, "step": 11740 }, { "epoch": 2.25, "learning_rate": 7.520730217908335e-05, "loss": 0.7024, "step": 11760 }, { "epoch": 2.26, "learning_rate": 7.482162370637011e-05, "loss": 0.7016, "step": 11780 }, { "epoch": 2.26, "learning_rate": 7.443594523365686e-05, "loss": 0.7132, "step": 11800 }, { "epoch": 2.26, "eval_loss": 0.7381731271743774, "eval_runtime": 25.4976, "eval_samples_per_second": 78.439, "eval_steps_per_second": 1.255, "step": 11800 }, { "epoch": 2.26, "learning_rate": 7.405026676094361e-05, "loss": 0.6969, "step": 11820 }, { "epoch": 2.27, "learning_rate": 7.366458828823038e-05, "loss": 0.7042, "step": 11840 }, { "epoch": 2.27, "learning_rate": 7.327890981551713e-05, "loss": 0.7088, "step": 11860 }, { "epoch": 2.28, "learning_rate": 7.289323134280388e-05, "loss": 0.7109, "step": 11880 }, { "epoch": 2.28, "learning_rate": 7.250755287009063e-05, "loss": 0.7046, "step": 11900 }, { "epoch": 2.28, "learning_rate": 7.212187439737738e-05, "loss": 0.706, "step": 11920 }, { "epoch": 2.29, "learning_rate": 7.173619592466414e-05, "loss": 0.7045, "step": 11940 }, { "epoch": 2.29, "learning_rate": 7.135051745195089e-05, "loss": 0.7121, "step": 11960 }, { "epoch": 2.3, "learning_rate": 7.096483897923764e-05, "loss": 0.6946, "step": 11980 }, { "epoch": 2.3, "learning_rate": 7.057916050652439e-05, "loss": 0.7003, "step": 12000 }, { "epoch": 2.3, "eval_loss": 0.7378225922584534, "eval_runtime": 25.5221, "eval_samples_per_second": 78.363, "eval_steps_per_second": 1.254, "step": 12000 }, { "epoch": 2.3, "learning_rate": 7.019348203381114e-05, "loss": 0.7147, "step": 12020 }, { "epoch": 2.31, "learning_rate": 6.980780356109789e-05, "loss": 0.7066, "step": 12040 }, { "epoch": 2.31, "learning_rate": 6.942212508838465e-05, "loss": 0.6997, "step": 12060 }, { "epoch": 2.31, "learning_rate": 6.90364466156714e-05, "loss": 0.7083, "step": 12080 }, { "epoch": 2.32, "learning_rate": 6.865076814295815e-05, "loss": 0.6991, "step": 12100 }, { "epoch": 2.32, "learning_rate": 6.82650896702449e-05, "loss": 0.6982, "step": 12120 }, { "epoch": 2.33, "learning_rate": 6.787941119753165e-05, "loss": 0.7028, "step": 12140 }, { "epoch": 2.33, "learning_rate": 6.74937327248184e-05, "loss": 0.704, "step": 12160 }, { "epoch": 2.33, "learning_rate": 6.710805425210516e-05, "loss": 0.7084, "step": 12180 }, { "epoch": 2.34, "learning_rate": 6.672237577939191e-05, "loss": 0.7061, "step": 12200 }, { "epoch": 2.34, "eval_loss": 0.7376002669334412, "eval_runtime": 25.5156, "eval_samples_per_second": 78.383, "eval_steps_per_second": 1.254, "step": 12200 }, { "epoch": 2.34, "learning_rate": 6.633669730667866e-05, "loss": 0.7017, "step": 12220 }, { "epoch": 2.35, "learning_rate": 6.595101883396541e-05, "loss": 0.6949, "step": 12240 }, { "epoch": 2.35, "learning_rate": 6.556534036125216e-05, "loss": 0.6985, "step": 12260 }, { "epoch": 2.35, "learning_rate": 6.517966188853891e-05, "loss": 0.7075, "step": 12280 }, { "epoch": 2.36, "learning_rate": 6.479398341582568e-05, "loss": 0.6997, "step": 12300 }, { "epoch": 2.36, "learning_rate": 6.440830494311241e-05, "loss": 0.7045, "step": 12320 }, { "epoch": 2.36, "learning_rate": 6.402262647039918e-05, "loss": 0.7148, "step": 12340 }, { "epoch": 2.37, "learning_rate": 6.363694799768592e-05, "loss": 0.7085, "step": 12360 }, { "epoch": 2.37, "learning_rate": 6.325126952497267e-05, "loss": 0.7062, "step": 12380 }, { "epoch": 2.38, "learning_rate": 6.286559105225942e-05, "loss": 0.7092, "step": 12400 }, { "epoch": 2.38, "eval_loss": 0.7370800971984863, "eval_runtime": 25.5432, "eval_samples_per_second": 78.299, "eval_steps_per_second": 1.253, "step": 12400 }, { "epoch": 2.38, "learning_rate": 6.247991257954619e-05, "loss": 0.7069, "step": 12420 }, { "epoch": 2.38, "learning_rate": 6.209423410683292e-05, "loss": 0.7083, "step": 12440 }, { "epoch": 2.39, "learning_rate": 6.170855563411969e-05, "loss": 0.7126, "step": 12460 }, { "epoch": 2.39, "learning_rate": 6.132287716140644e-05, "loss": 0.7062, "step": 12480 }, { "epoch": 2.4, "learning_rate": 6.0937198688693187e-05, "loss": 0.7149, "step": 12500 }, { "epoch": 2.4, "learning_rate": 6.0551520215979936e-05, "loss": 0.7111, "step": 12520 }, { "epoch": 2.4, "learning_rate": 6.016584174326669e-05, "loss": 0.7059, "step": 12540 }, { "epoch": 2.41, "learning_rate": 5.978016327055344e-05, "loss": 0.7169, "step": 12560 }, { "epoch": 2.41, "learning_rate": 5.93944847978402e-05, "loss": 0.7052, "step": 12580 }, { "epoch": 2.41, "learning_rate": 5.900880632512694e-05, "loss": 0.7019, "step": 12600 }, { "epoch": 2.41, "eval_loss": 0.7369959354400635, "eval_runtime": 25.5241, "eval_samples_per_second": 78.357, "eval_steps_per_second": 1.254, "step": 12600 }, { "epoch": 2.42, "learning_rate": 5.86231278524137e-05, "loss": 0.7026, "step": 12620 }, { "epoch": 2.42, "learning_rate": 5.823744937970045e-05, "loss": 0.6981, "step": 12640 }, { "epoch": 2.43, "learning_rate": 5.7851770906987205e-05, "loss": 0.7044, "step": 12660 }, { "epoch": 2.43, "learning_rate": 5.7466092434273955e-05, "loss": 0.7087, "step": 12680 }, { "epoch": 2.43, "learning_rate": 5.708041396156071e-05, "loss": 0.7039, "step": 12700 }, { "epoch": 2.44, "learning_rate": 5.6694735488847454e-05, "loss": 0.7015, "step": 12720 }, { "epoch": 2.44, "learning_rate": 5.630905701613421e-05, "loss": 0.7053, "step": 12740 }, { "epoch": 2.44, "learning_rate": 5.592337854342096e-05, "loss": 0.7037, "step": 12760 }, { "epoch": 2.45, "learning_rate": 5.553770007070772e-05, "loss": 0.6938, "step": 12780 }, { "epoch": 2.45, "learning_rate": 5.515202159799447e-05, "loss": 0.7063, "step": 12800 }, { "epoch": 2.45, "eval_loss": 0.7364639639854431, "eval_runtime": 25.4856, "eval_samples_per_second": 78.476, "eval_steps_per_second": 1.256, "step": 12800 }, { "epoch": 2.46, "learning_rate": 5.476634312528122e-05, "loss": 0.7013, "step": 12820 }, { "epoch": 2.46, "learning_rate": 5.4380664652567966e-05, "loss": 0.7012, "step": 12840 }, { "epoch": 2.46, "learning_rate": 5.399498617985472e-05, "loss": 0.7, "step": 12860 }, { "epoch": 2.47, "learning_rate": 5.360930770714147e-05, "loss": 0.7017, "step": 12880 }, { "epoch": 2.47, "learning_rate": 5.322362923442823e-05, "loss": 0.7145, "step": 12900 }, { "epoch": 2.48, "learning_rate": 5.283795076171498e-05, "loss": 0.7156, "step": 12920 }, { "epoch": 2.48, "learning_rate": 5.247155621263739e-05, "loss": 0.6965, "step": 12940 }, { "epoch": 2.48, "learning_rate": 5.2085877739924146e-05, "loss": 0.7001, "step": 12960 }, { "epoch": 2.49, "learning_rate": 5.1700199267210896e-05, "loss": 0.7012, "step": 12980 }, { "epoch": 2.49, "learning_rate": 5.131452079449765e-05, "loss": 0.6939, "step": 13000 }, { "epoch": 2.49, "eval_loss": 0.7364306449890137, "eval_runtime": 25.5093, "eval_samples_per_second": 78.403, "eval_steps_per_second": 1.254, "step": 13000 }, { "epoch": 2.49, "learning_rate": 5.09288423217844e-05, "loss": 0.7084, "step": 13020 }, { "epoch": 2.5, "learning_rate": 5.054316384907115e-05, "loss": 0.6987, "step": 13040 }, { "epoch": 2.5, "learning_rate": 5.01574853763579e-05, "loss": 0.7087, "step": 13060 }, { "epoch": 2.51, "learning_rate": 4.977180690364466e-05, "loss": 0.7028, "step": 13080 }, { "epoch": 2.51, "learning_rate": 4.938612843093141e-05, "loss": 0.7012, "step": 13100 }, { "epoch": 2.51, "learning_rate": 4.9000449958218165e-05, "loss": 0.6959, "step": 13120 }, { "epoch": 2.52, "learning_rate": 4.861477148550491e-05, "loss": 0.7056, "step": 13140 }, { "epoch": 2.52, "learning_rate": 4.8229093012791664e-05, "loss": 0.716, "step": 13160 }, { "epoch": 2.53, "learning_rate": 4.7843414540078414e-05, "loss": 0.7144, "step": 13180 }, { "epoch": 2.53, "learning_rate": 4.745773606736517e-05, "loss": 0.6969, "step": 13200 }, { "epoch": 2.53, "eval_loss": 0.7360122203826904, "eval_runtime": 25.4878, "eval_samples_per_second": 78.469, "eval_steps_per_second": 1.256, "step": 13200 }, { "epoch": 2.53, "learning_rate": 4.707205759465192e-05, "loss": 0.6993, "step": 13220 }, { "epoch": 2.54, "learning_rate": 4.668637912193868e-05, "loss": 0.7013, "step": 13240 }, { "epoch": 2.54, "learning_rate": 4.630070064922542e-05, "loss": 0.7033, "step": 13260 }, { "epoch": 2.54, "learning_rate": 4.5915022176512176e-05, "loss": 0.7067, "step": 13280 }, { "epoch": 2.55, "learning_rate": 4.5529343703798926e-05, "loss": 0.6886, "step": 13300 }, { "epoch": 2.55, "learning_rate": 4.514366523108568e-05, "loss": 0.7061, "step": 13320 }, { "epoch": 2.56, "learning_rate": 4.475798675837243e-05, "loss": 0.7027, "step": 13340 }, { "epoch": 2.56, "learning_rate": 4.437230828565919e-05, "loss": 0.6982, "step": 13360 }, { "epoch": 2.56, "learning_rate": 4.398662981294593e-05, "loss": 0.7042, "step": 13380 }, { "epoch": 2.57, "learning_rate": 4.360095134023269e-05, "loss": 0.6956, "step": 13400 }, { "epoch": 2.57, "eval_loss": 0.7356610298156738, "eval_runtime": 25.5629, "eval_samples_per_second": 78.238, "eval_steps_per_second": 1.252, "step": 13400 }, { "epoch": 2.57, "learning_rate": 4.321527286751944e-05, "loss": 0.7046, "step": 13420 }, { "epoch": 2.58, "learning_rate": 4.2829594394806195e-05, "loss": 0.7053, "step": 13440 }, { "epoch": 2.58, "learning_rate": 4.2443915922092944e-05, "loss": 0.707, "step": 13460 }, { "epoch": 2.58, "learning_rate": 4.20582374493797e-05, "loss": 0.7123, "step": 13480 }, { "epoch": 2.59, "learning_rate": 4.1672558976666444e-05, "loss": 0.7032, "step": 13500 }, { "epoch": 2.59, "learning_rate": 4.12868805039532e-05, "loss": 0.6942, "step": 13520 }, { "epoch": 2.59, "learning_rate": 4.090120203123995e-05, "loss": 0.6981, "step": 13540 }, { "epoch": 2.6, "learning_rate": 4.051552355852671e-05, "loss": 0.7052, "step": 13560 }, { "epoch": 2.6, "learning_rate": 4.012984508581345e-05, "loss": 0.7044, "step": 13580 }, { "epoch": 2.61, "learning_rate": 3.9744166613100206e-05, "loss": 0.6978, "step": 13600 }, { "epoch": 2.61, "eval_loss": 0.7352051734924316, "eval_runtime": 25.5016, "eval_samples_per_second": 78.426, "eval_steps_per_second": 1.255, "step": 13600 }, { "epoch": 2.61, "learning_rate": 3.9358488140386956e-05, "loss": 0.7001, "step": 13620 }, { "epoch": 2.61, "learning_rate": 3.897280966767371e-05, "loss": 0.7065, "step": 13640 }, { "epoch": 2.62, "learning_rate": 3.858713119496046e-05, "loss": 0.6999, "step": 13660 }, { "epoch": 2.62, "learning_rate": 3.820145272224722e-05, "loss": 0.7104, "step": 13680 }, { "epoch": 2.63, "learning_rate": 3.781577424953396e-05, "loss": 0.7079, "step": 13700 }, { "epoch": 2.63, "learning_rate": 3.743009577682072e-05, "loss": 0.7059, "step": 13720 }, { "epoch": 2.63, "learning_rate": 3.7063701227743136e-05, "loss": 0.7088, "step": 13740 }, { "epoch": 2.64, "learning_rate": 3.6678022755029886e-05, "loss": 0.7051, "step": 13760 }, { "epoch": 2.64, "learning_rate": 3.629234428231664e-05, "loss": 0.7004, "step": 13780 }, { "epoch": 2.64, "learning_rate": 3.590666580960339e-05, "loss": 0.7, "step": 13800 }, { "epoch": 2.64, "eval_loss": 0.7350977659225464, "eval_runtime": 25.4618, "eval_samples_per_second": 78.549, "eval_steps_per_second": 1.257, "step": 13800 }, { "epoch": 2.65, "learning_rate": 3.552098733689014e-05, "loss": 0.7044, "step": 13820 }, { "epoch": 2.65, "learning_rate": 3.51353088641769e-05, "loss": 0.6967, "step": 13840 }, { "epoch": 2.66, "learning_rate": 3.474963039146365e-05, "loss": 0.6932, "step": 13860 }, { "epoch": 2.66, "learning_rate": 3.43639519187504e-05, "loss": 0.6982, "step": 13880 }, { "epoch": 2.66, "learning_rate": 3.3978273446037154e-05, "loss": 0.7064, "step": 13900 }, { "epoch": 2.67, "learning_rate": 3.3592594973323904e-05, "loss": 0.7064, "step": 13920 }, { "epoch": 2.67, "learning_rate": 3.3206916500610654e-05, "loss": 0.6975, "step": 13940 }, { "epoch": 2.67, "learning_rate": 3.282123802789741e-05, "loss": 0.7023, "step": 13960 }, { "epoch": 2.68, "learning_rate": 3.243555955518416e-05, "loss": 0.706, "step": 13980 }, { "epoch": 2.68, "learning_rate": 3.204988108247091e-05, "loss": 0.696, "step": 14000 }, { "epoch": 2.68, "eval_loss": 0.7347920536994934, "eval_runtime": 25.5132, "eval_samples_per_second": 78.391, "eval_steps_per_second": 1.254, "step": 14000 }, { "epoch": 2.69, "learning_rate": 3.1664202609757666e-05, "loss": 0.6995, "step": 14020 }, { "epoch": 2.69, "learning_rate": 3.1278524137044416e-05, "loss": 0.7022, "step": 14040 }, { "epoch": 2.69, "learning_rate": 3.0892845664331166e-05, "loss": 0.7086, "step": 14060 }, { "epoch": 2.7, "learning_rate": 3.050716719161792e-05, "loss": 0.7135, "step": 14080 }, { "epoch": 2.7, "learning_rate": 3.0121488718904672e-05, "loss": 0.7036, "step": 14100 }, { "epoch": 2.71, "learning_rate": 2.9735810246191422e-05, "loss": 0.6979, "step": 14120 }, { "epoch": 2.71, "learning_rate": 2.9350131773478175e-05, "loss": 0.7082, "step": 14140 }, { "epoch": 2.71, "learning_rate": 2.8964453300764928e-05, "loss": 0.7008, "step": 14160 }, { "epoch": 2.72, "learning_rate": 2.8578774828051678e-05, "loss": 0.7085, "step": 14180 }, { "epoch": 2.72, "learning_rate": 2.819309635533843e-05, "loss": 0.6983, "step": 14200 }, { "epoch": 2.72, "eval_loss": 0.73465496301651, "eval_runtime": 25.4933, "eval_samples_per_second": 78.452, "eval_steps_per_second": 1.255, "step": 14200 }, { "epoch": 2.72, "learning_rate": 2.7807417882625184e-05, "loss": 0.7123, "step": 14220 }, { "epoch": 2.73, "learning_rate": 2.7421739409911934e-05, "loss": 0.7027, "step": 14240 }, { "epoch": 2.73, "learning_rate": 2.7036060937198687e-05, "loss": 0.7124, "step": 14260 }, { "epoch": 2.74, "learning_rate": 2.6650382464485437e-05, "loss": 0.7102, "step": 14280 }, { "epoch": 2.74, "learning_rate": 2.626470399177219e-05, "loss": 0.7062, "step": 14300 }, { "epoch": 2.74, "learning_rate": 2.5879025519058943e-05, "loss": 0.7094, "step": 14320 }, { "epoch": 2.75, "learning_rate": 2.5493347046345693e-05, "loss": 0.7017, "step": 14340 }, { "epoch": 2.75, "learning_rate": 2.5107668573632446e-05, "loss": 0.7033, "step": 14360 }, { "epoch": 2.76, "learning_rate": 2.47219901009192e-05, "loss": 0.7036, "step": 14380 }, { "epoch": 2.76, "learning_rate": 2.433631162820595e-05, "loss": 0.7041, "step": 14400 }, { "epoch": 2.76, "eval_loss": 0.7345843315124512, "eval_runtime": 25.4933, "eval_samples_per_second": 78.452, "eval_steps_per_second": 1.255, "step": 14400 }, { "epoch": 2.76, "learning_rate": 2.3950633155492702e-05, "loss": 0.6983, "step": 14420 }, { "epoch": 2.77, "learning_rate": 2.3564954682779455e-05, "loss": 0.7006, "step": 14440 }, { "epoch": 2.77, "learning_rate": 2.3179276210066205e-05, "loss": 0.7047, "step": 14460 }, { "epoch": 2.77, "learning_rate": 2.2793597737352958e-05, "loss": 0.7036, "step": 14480 }, { "epoch": 2.78, "learning_rate": 2.2407919264639708e-05, "loss": 0.7025, "step": 14500 }, { "epoch": 2.78, "learning_rate": 2.202224079192646e-05, "loss": 0.699, "step": 14520 }, { "epoch": 2.79, "learning_rate": 2.1636562319213214e-05, "loss": 0.699, "step": 14540 }, { "epoch": 2.79, "learning_rate": 2.1250883846499964e-05, "loss": 0.6968, "step": 14560 }, { "epoch": 2.79, "learning_rate": 2.0865205373786717e-05, "loss": 0.697, "step": 14580 }, { "epoch": 2.8, "learning_rate": 2.047952690107347e-05, "loss": 0.6981, "step": 14600 }, { "epoch": 2.8, "eval_loss": 0.7341080904006958, "eval_runtime": 25.516, "eval_samples_per_second": 78.382, "eval_steps_per_second": 1.254, "step": 14600 }, { "epoch": 2.8, "learning_rate": 2.009384842836022e-05, "loss": 0.706, "step": 14620 }, { "epoch": 2.81, "learning_rate": 1.9708169955646973e-05, "loss": 0.6964, "step": 14640 }, { "epoch": 2.81, "learning_rate": 1.9322491482933726e-05, "loss": 0.7043, "step": 14660 }, { "epoch": 2.81, "learning_rate": 1.8936813010220476e-05, "loss": 0.7044, "step": 14680 }, { "epoch": 2.82, "learning_rate": 1.855113453750723e-05, "loss": 0.7079, "step": 14700 }, { "epoch": 2.82, "learning_rate": 1.8165456064793982e-05, "loss": 0.7096, "step": 14720 }, { "epoch": 2.82, "learning_rate": 1.7779777592080735e-05, "loss": 0.6977, "step": 14740 }, { "epoch": 2.83, "learning_rate": 1.7394099119367485e-05, "loss": 0.6997, "step": 14760 }, { "epoch": 2.83, "learning_rate": 1.7008420646654238e-05, "loss": 0.7033, "step": 14780 }, { "epoch": 2.84, "learning_rate": 1.662274217394099e-05, "loss": 0.7016, "step": 14800 }, { "epoch": 2.84, "eval_loss": 0.7337221503257751, "eval_runtime": 25.5058, "eval_samples_per_second": 78.413, "eval_steps_per_second": 1.255, "step": 14800 }, { "epoch": 2.84, "learning_rate": 1.623706370122774e-05, "loss": 0.6907, "step": 14820 }, { "epoch": 2.84, "learning_rate": 1.5851385228514494e-05, "loss": 0.7043, "step": 14840 }, { "epoch": 2.85, "learning_rate": 1.5465706755801247e-05, "loss": 0.7058, "step": 14860 }, { "epoch": 2.85, "learning_rate": 1.5080028283087997e-05, "loss": 0.6956, "step": 14880 }, { "epoch": 2.85, "learning_rate": 1.469434981037475e-05, "loss": 0.7109, "step": 14900 }, { "epoch": 2.86, "learning_rate": 1.4308671337661502e-05, "loss": 0.7055, "step": 14920 }, { "epoch": 2.86, "learning_rate": 1.3922992864948253e-05, "loss": 0.7011, "step": 14940 }, { "epoch": 2.87, "learning_rate": 1.3537314392235005e-05, "loss": 0.7009, "step": 14960 }, { "epoch": 2.87, "learning_rate": 1.3151635919521758e-05, "loss": 0.7069, "step": 14980 }, { "epoch": 2.87, "learning_rate": 1.276595744680851e-05, "loss": 0.7038, "step": 15000 }, { "epoch": 2.87, "eval_loss": 0.7338148355484009, "eval_runtime": 25.4764, "eval_samples_per_second": 78.504, "eval_steps_per_second": 1.256, "step": 15000 }, { "epoch": 2.88, "learning_rate": 1.238027897409526e-05, "loss": 0.706, "step": 15020 }, { "epoch": 2.88, "learning_rate": 1.1994600501382012e-05, "loss": 0.6918, "step": 15040 }, { "epoch": 2.89, "learning_rate": 1.1608922028668765e-05, "loss": 0.7045, "step": 15060 }, { "epoch": 2.89, "learning_rate": 1.1223243555955517e-05, "loss": 0.6984, "step": 15080 }, { "epoch": 2.89, "learning_rate": 1.0837565083242268e-05, "loss": 0.7126, "step": 15100 }, { "epoch": 2.9, "learning_rate": 1.0451886610529021e-05, "loss": 0.6974, "step": 15120 }, { "epoch": 2.9, "learning_rate": 1.0066208137815773e-05, "loss": 0.7063, "step": 15140 }, { "epoch": 2.9, "learning_rate": 9.680529665102524e-06, "loss": 0.697, "step": 15160 }, { "epoch": 2.91, "learning_rate": 9.294851192389277e-06, "loss": 0.6965, "step": 15180 }, { "epoch": 2.91, "learning_rate": 8.909172719676029e-06, "loss": 0.7001, "step": 15200 }, { "epoch": 2.91, "eval_loss": 0.733613908290863, "eval_runtime": 25.4875, "eval_samples_per_second": 78.47, "eval_steps_per_second": 1.256, "step": 15200 } ], "max_steps": 15657, "num_train_epochs": 3, "total_flos": 7.685422038604841e+19, "trial_name": null, "trial_params": null }