diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10535 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.999956610404825, + "global_step": 17285, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 0.0, + "loss": 8.2501, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 6.936416184971099e-07, + "loss": 7.9965, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 3.0057803468208094e-06, + "loss": 8.0268, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 5.317919075144509e-06, + "loss": 7.9878, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 7.398843930635839e-06, + "loss": 7.6933, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 9.710982658959537e-06, + "loss": 7.4624, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 1.2023121387283238e-05, + "loss": 7.3907, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 1.4335260115606938e-05, + "loss": 7.0312, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 1.6647398843930635e-05, + "loss": 6.6125, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 1.8959537572254336e-05, + "loss": 6.3013, + "step": 90 + }, + { + "epoch": 0.02, + "learning_rate": 2.1271676300578036e-05, + "loss": 6.068, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 2.3583815028901734e-05, + "loss": 5.8308, + "step": 110 + }, + { + "epoch": 0.02, + "learning_rate": 2.5895953757225434e-05, + "loss": 5.7656, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 2.8208092485549138e-05, + "loss": 5.5955, + "step": 130 + }, + { + "epoch": 0.02, + "learning_rate": 3.0520231213872835e-05, + "loss": 5.3842, + "step": 140 + }, + { + "epoch": 0.03, + "learning_rate": 3.283236994219653e-05, + "loss": 5.2866, + "step": 150 + }, + { + "epoch": 0.03, + "learning_rate": 3.514450867052023e-05, + "loss": 5.0532, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 3.7456647398843934e-05, + "loss": 4.9624, + "step": 170 + }, + { + "epoch": 0.03, + "learning_rate": 3.976878612716764e-05, + "loss": 4.8342, + "step": 180 + }, + { + "epoch": 0.03, + "learning_rate": 4.2080924855491335e-05, + "loss": 4.6055, + "step": 190 + }, + { + "epoch": 0.03, + "learning_rate": 4.439306358381503e-05, + "loss": 4.5276, + "step": 200 + }, + { + "epoch": 0.04, + "learning_rate": 4.670520231213873e-05, + "loss": 4.3676, + "step": 210 + }, + { + "epoch": 0.04, + "learning_rate": 4.9017341040462426e-05, + "loss": 4.2029, + "step": 220 + }, + { + "epoch": 0.04, + "learning_rate": 5.1329479768786124e-05, + "loss": 4.0336, + "step": 230 + }, + { + "epoch": 0.04, + "learning_rate": 5.364161849710983e-05, + "loss": 3.8993, + "step": 240 + }, + { + "epoch": 0.04, + "learning_rate": 5.595375722543353e-05, + "loss": 3.834, + "step": 250 + }, + { + "epoch": 0.05, + "learning_rate": 5.8265895953757235e-05, + "loss": 3.7466, + "step": 260 + }, + { + "epoch": 0.05, + "learning_rate": 6.057803468208093e-05, + "loss": 3.6144, + "step": 270 + }, + { + "epoch": 0.05, + "learning_rate": 6.289017341040462e-05, + "loss": 3.4977, + "step": 280 + }, + { + "epoch": 0.05, + "learning_rate": 6.520231213872833e-05, + "loss": 3.4428, + "step": 290 + }, + { + "epoch": 0.05, + "learning_rate": 6.751445086705203e-05, + "loss": 3.2823, + "step": 300 + }, + { + "epoch": 0.05, + "learning_rate": 6.982658959537573e-05, + "loss": 3.2296, + "step": 310 + }, + { + "epoch": 0.06, + "learning_rate": 7.213872832369943e-05, + "loss": 3.1029, + "step": 320 + }, + { + "epoch": 0.06, + "learning_rate": 7.445086705202312e-05, + "loss": 3.1717, + "step": 330 + }, + { + "epoch": 0.06, + "learning_rate": 7.676300578034682e-05, + "loss": 3.0968, + "step": 340 + }, + { + "epoch": 0.06, + "learning_rate": 7.907514450867053e-05, + "loss": 2.9926, + "step": 350 + }, + { + "epoch": 0.06, + "learning_rate": 8.138728323699423e-05, + "loss": 3.0021, + "step": 360 + }, + { + "epoch": 0.06, + "learning_rate": 8.369942196531792e-05, + "loss": 2.891, + "step": 370 + }, + { + "epoch": 0.07, + "learning_rate": 8.601156069364162e-05, + "loss": 2.8498, + "step": 380 + }, + { + "epoch": 0.07, + "learning_rate": 8.832369942196532e-05, + "loss": 2.8172, + "step": 390 + }, + { + "epoch": 0.07, + "learning_rate": 9.063583815028902e-05, + "loss": 2.8302, + "step": 400 + }, + { + "epoch": 0.07, + "learning_rate": 9.294797687861271e-05, + "loss": 2.7333, + "step": 410 + }, + { + "epoch": 0.07, + "learning_rate": 9.526011560693642e-05, + "loss": 2.7135, + "step": 420 + }, + { + "epoch": 0.07, + "learning_rate": 9.757225433526012e-05, + "loss": 2.6811, + "step": 430 + }, + { + "epoch": 0.08, + "learning_rate": 9.988439306358382e-05, + "loss": 2.6537, + "step": 440 + }, + { + "epoch": 0.08, + "learning_rate": 0.00010219653179190752, + "loss": 2.6031, + "step": 450 + }, + { + "epoch": 0.08, + "learning_rate": 0.00010450867052023121, + "loss": 2.6037, + "step": 460 + }, + { + "epoch": 0.08, + "learning_rate": 0.00010682080924855491, + "loss": 2.5387, + "step": 470 + }, + { + "epoch": 0.08, + "learning_rate": 0.00010913294797687861, + "loss": 2.5393, + "step": 480 + }, + { + "epoch": 0.09, + "learning_rate": 0.00011144508670520233, + "loss": 2.5387, + "step": 490 + }, + { + "epoch": 0.09, + "learning_rate": 0.00011375722543352603, + "loss": 2.4848, + "step": 500 + }, + { + "epoch": 0.09, + "learning_rate": 0.00011606936416184973, + "loss": 2.4773, + "step": 510 + }, + { + "epoch": 0.09, + "learning_rate": 0.00011838150289017342, + "loss": 2.4453, + "step": 520 + }, + { + "epoch": 0.09, + "learning_rate": 0.00012069364161849712, + "loss": 2.3941, + "step": 530 + }, + { + "epoch": 0.09, + "learning_rate": 0.00012300578034682083, + "loss": 2.431, + "step": 540 + }, + { + "epoch": 0.1, + "learning_rate": 0.00012531791907514453, + "loss": 2.4208, + "step": 550 + }, + { + "epoch": 0.1, + "learning_rate": 0.00012763005780346823, + "loss": 2.4313, + "step": 560 + }, + { + "epoch": 0.1, + "learning_rate": 0.00012971098265895952, + "loss": 2.3427, + "step": 570 + }, + { + "epoch": 0.1, + "learning_rate": 0.00013202312138728322, + "loss": 2.3415, + "step": 580 + }, + { + "epoch": 0.1, + "learning_rate": 0.00013433526011560694, + "loss": 2.2621, + "step": 590 + }, + { + "epoch": 0.1, + "learning_rate": 0.00013664739884393064, + "loss": 2.3606, + "step": 600 + }, + { + "epoch": 0.11, + "learning_rate": 0.00013895953757225434, + "loss": 2.3175, + "step": 610 + }, + { + "epoch": 0.11, + "learning_rate": 0.00014127167630057804, + "loss": 2.2297, + "step": 620 + }, + { + "epoch": 0.11, + "learning_rate": 0.00014358381502890176, + "loss": 2.1856, + "step": 630 + }, + { + "epoch": 0.11, + "learning_rate": 0.00014589595375722546, + "loss": 2.2633, + "step": 640 + }, + { + "epoch": 0.11, + "learning_rate": 0.00014820809248554915, + "loss": 2.2474, + "step": 650 + }, + { + "epoch": 0.11, + "learning_rate": 0.00015052023121387285, + "loss": 2.2024, + "step": 660 + }, + { + "epoch": 0.12, + "learning_rate": 0.00015260115606936415, + "loss": 2.1947, + "step": 670 + }, + { + "epoch": 0.12, + "learning_rate": 0.00015491329479768785, + "loss": 2.234, + "step": 680 + }, + { + "epoch": 0.12, + "learning_rate": 0.00015722543352601157, + "loss": 2.2061, + "step": 690 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001593063583815029, + "loss": 2.1553, + "step": 700 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001616184971098266, + "loss": 2.2286, + "step": 710 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001639306358381503, + "loss": 2.161, + "step": 720 + }, + { + "epoch": 0.13, + "learning_rate": 0.000166242774566474, + "loss": 2.1628, + "step": 730 + }, + { + "epoch": 0.13, + "learning_rate": 0.00016855491329479768, + "loss": 2.1371, + "step": 740 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017063583815028904, + "loss": 2.2181, + "step": 750 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017271676300578033, + "loss": 2.154, + "step": 760 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017502890173410406, + "loss": 2.1695, + "step": 770 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017734104046242776, + "loss": 2.1685, + "step": 780 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017965317919075145, + "loss": 2.192, + "step": 790 + }, + { + "epoch": 0.14, + "learning_rate": 0.00018196531791907515, + "loss": 2.1567, + "step": 800 + }, + { + "epoch": 0.14, + "learning_rate": 0.00018427745664739887, + "loss": 2.0987, + "step": 810 + }, + { + "epoch": 0.14, + "learning_rate": 0.00018658959537572257, + "loss": 2.1687, + "step": 820 + }, + { + "epoch": 0.14, + "learning_rate": 0.00018890173410404627, + "loss": 2.0736, + "step": 830 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019121387283236997, + "loss": 2.0683, + "step": 840 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019352601156069366, + "loss": 2.0872, + "step": 850 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019583815028901736, + "loss": 2.1029, + "step": 860 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019815028901734106, + "loss": 2.0301, + "step": 870 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019999999267878048, + "loss": 2.0957, + "step": 880 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001999997364362091, + "loss": 2.0484, + "step": 890 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999911413373273, + "loss": 2.0489, + "step": 900 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999812577362934, + "loss": 2.0073, + "step": 910 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001999967713595169, + "loss": 2.0241, + "step": 920 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999505089635347, + "loss": 2.0097, + "step": 930 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001999929643904369, + "loss": 2.0251, + "step": 940 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019999051184940516, + "loss": 1.9893, + "step": 950 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019998769328223598, + "loss": 1.9893, + "step": 960 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019998450869924703, + "loss": 1.9321, + "step": 970 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019998095811209587, + "loss": 2.0008, + "step": 980 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019997704153377978, + "loss": 2.0254, + "step": 990 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001999727589786358, + "loss": 1.873, + "step": 1000 + }, + { + "epoch": 0.17, + "eval_loss": 1.0050371885299683, + "eval_runtime": 62.5449, + "eval_samples_per_second": 8.378, + "eval_steps_per_second": 0.528, + "step": 1000 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019996811046234077, + "loss": 1.9664, + "step": 1010 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019996309600191098, + "loss": 1.9404, + "step": 1020 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019995771561570248, + "loss": 1.969, + "step": 1030 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019995196932341073, + "loss": 1.9545, + "step": 1040 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019994585714607066, + "loss": 1.9141, + "step": 1050 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019993937910605658, + "loss": 1.9299, + "step": 1060 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019993253522708205, + "loss": 1.9305, + "step": 1070 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001999253255341998, + "loss": 1.902, + "step": 1080 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019991775005380173, + "loss": 1.9416, + "step": 1090 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019991061939600934, + "loss": 1.9164, + "step": 1100 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019990234899683635, + "loss": 1.947, + "step": 1110 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019989371289425568, + "loss": 1.9242, + "step": 1120 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019988471111988062, + "loss": 1.9037, + "step": 1130 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019987534370666328, + "loss": 1.915, + "step": 1140 + }, + { + "epoch": 0.2, + "learning_rate": 0.000199865610688894, + "loss": 1.9268, + "step": 1150 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019985551210220158, + "loss": 1.9268, + "step": 1160 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019984611084327463, + "loss": 1.9629, + "step": 1170 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019983531777857815, + "loss": 1.854, + "step": 1180 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019982415925584902, + "loss": 1.9051, + "step": 1190 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019981263531593422, + "loss": 1.8801, + "step": 1200 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019980074600101837, + "loss": 1.8322, + "step": 1210 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019978849135462366, + "loss": 1.8857, + "step": 1220 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019977587142160945, + "loss": 1.8805, + "step": 1230 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019976288624817248, + "loss": 1.8511, + "step": 1240 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019974953588184632, + "loss": 1.8872, + "step": 1250 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019973582037150148, + "loss": 1.8636, + "step": 1260 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019972173976734507, + "loss": 1.8701, + "step": 1270 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019970729412092063, + "loss": 1.8454, + "step": 1280 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019969248348510808, + "loss": 1.8941, + "step": 1290 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019967730791412328, + "loss": 1.8561, + "step": 1300 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019966176746351818, + "loss": 1.8992, + "step": 1310 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019964586219018018, + "loss": 1.8372, + "step": 1320 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001996295921523323, + "loss": 1.8278, + "step": 1330 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019961295740953278, + "loss": 1.8311, + "step": 1340 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019959595802267492, + "loss": 1.8281, + "step": 1350 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001995785940539868, + "loss": 1.8188, + "step": 1360 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019956086556703113, + "loss": 1.8156, + "step": 1370 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019954277262670495, + "loss": 1.7751, + "step": 1380 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019952431529923949, + "loss": 1.832, + "step": 1390 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019950549365219968, + "loss": 1.8475, + "step": 1400 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019948630775448433, + "loss": 1.8329, + "step": 1410 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019946675767632544, + "loss": 1.8352, + "step": 1420 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019944684348928822, + "loss": 1.8325, + "step": 1430 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019942860946808643, + "loss": 1.8484, + "step": 1440 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019940800367611585, + "loss": 1.837, + "step": 1450 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019938703399034234, + "loss": 1.8295, + "step": 1460 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019936570048752775, + "loss": 1.8153, + "step": 1470 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019934400324576564, + "loss": 1.7925, + "step": 1480 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001993219423444811, + "loss": 1.8383, + "step": 1490 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001992995178644305, + "loss": 1.8135, + "step": 1500 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019927672988770105, + "loss": 1.8036, + "step": 1510 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019925357849771066, + "loss": 1.8035, + "step": 1520 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019923243159839809, + "loss": 1.8135, + "step": 1530 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019920858995779232, + "loss": 1.7839, + "step": 1540 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019918438515335927, + "loss": 1.7759, + "step": 1550 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019915981727370316, + "loss": 1.7933, + "step": 1560 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019913488640875744, + "loss": 1.7977, + "step": 1570 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019910959264978422, + "loss": 1.7797, + "step": 1580 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019908393608937406, + "loss": 1.7656, + "step": 1590 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019905791682144557, + "loss": 1.798, + "step": 1600 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019903153494124518, + "loss": 1.7618, + "step": 1610 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019900479054534652, + "loss": 1.7879, + "step": 1620 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019897768373165046, + "loss": 1.7972, + "step": 1630 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019895297781409127, + "loss": 1.7738, + "step": 1640 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019892518268104788, + "loss": 1.7901, + "step": 1650 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019889702542162026, + "loss": 1.7489, + "step": 1660 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019887137435523912, + "loss": 1.8051, + "step": 1670 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019884252934074216, + "loss": 1.7428, + "step": 1680 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001988133225024225, + "loss": 1.7961, + "step": 1690 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019878375394719502, + "loss": 1.7779, + "step": 1700 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019875382378329857, + "loss": 1.8037, + "step": 1710 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001987235321202958, + "loss": 1.7767, + "step": 1720 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019869287906907265, + "loss": 1.8044, + "step": 1730 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001986618647418379, + "loss": 1.7517, + "step": 1740 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001986304892521229, + "loss": 1.8253, + "step": 1750 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019859875271478102, + "loss": 1.7588, + "step": 1760 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019856665524598733, + "loss": 1.7948, + "step": 1770 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019853419696323806, + "loss": 1.8023, + "step": 1780 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019850467611100676, + "loss": 1.7663, + "step": 1790 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019847153261017426, + "loss": 1.7566, + "step": 1800 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019843802864359298, + "loss": 1.7882, + "step": 1810 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019840416433390782, + "loss": 1.782, + "step": 1820 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019836993980508268, + "loss": 1.7849, + "step": 1830 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019833535518240031, + "loss": 1.7793, + "step": 1840 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001983004105924614, + "loss": 1.7761, + "step": 1850 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019826865279520944, + "loss": 1.7676, + "step": 1860 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001982330246209872, + "loss": 1.7274, + "step": 1870 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019819703685410058, + "loss": 1.7513, + "step": 1880 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001981606896262867, + "loss": 1.7478, + "step": 1890 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019812398307059856, + "loss": 1.781, + "step": 1900 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019808691732140448, + "loss": 1.7504, + "step": 1910 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019804949251438767, + "loss": 1.7552, + "step": 1920 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001980117087865457, + "loss": 1.8154, + "step": 1930 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019797356627619, + "loss": 1.7762, + "step": 1940 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019793506512294542, + "loss": 1.7263, + "step": 1950 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019789620546774956, + "loss": 1.7446, + "step": 1960 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019786485971773587, + "loss": 1.694, + "step": 1970 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001978253551183793, + "loss": 1.7198, + "step": 1980 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019778549241867687, + "loss": 1.7423, + "step": 1990 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001977452717645503, + "loss": 1.7434, + "step": 2000 + }, + { + "epoch": 0.35, + "eval_loss": 0.90117347240448, + "eval_runtime": 61.8399, + "eval_samples_per_second": 8.473, + "eval_steps_per_second": 0.534, + "step": 2000 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019770469330323174, + "loss": 1.7791, + "step": 2010 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019766375718326297, + "loss": 1.7459, + "step": 2020 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019762246355449516, + "loss": 1.7342, + "step": 2030 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019758081256808816, + "loss": 1.7564, + "step": 2040 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019753880437650985, + "loss": 1.7394, + "step": 2050 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019749643913353582, + "loss": 1.7663, + "step": 2060 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019745371699424864, + "loss": 1.7222, + "step": 2070 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019741063811503734, + "loss": 1.7046, + "step": 2080 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001973759182648501, + "loss": 1.6904, + "step": 2090 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019733219765204383, + "loss": 1.6956, + "step": 2100 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019728812074414819, + "loss": 1.7511, + "step": 2110 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019724368770251155, + "loss": 1.7262, + "step": 2120 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001971988986897858, + "loss": 1.7114, + "step": 2130 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019715375386992608, + "loss": 1.7182, + "step": 2140 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019710825340818987, + "loss": 1.7034, + "step": 2150 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019706239747113656, + "loss": 1.7282, + "step": 2160 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019701618622662678, + "loss": 1.74, + "step": 2170 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019696961984382182, + "loss": 1.6854, + "step": 2180 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019692269849318303, + "loss": 1.756, + "step": 2190 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019687542234647106, + "loss": 1.7159, + "step": 2200 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019682779157674537, + "loss": 1.7095, + "step": 2210 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019677980635836363, + "loss": 1.7071, + "step": 2220 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019673146686698093, + "loss": 1.7077, + "step": 2230 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019668277327954917, + "loss": 1.7144, + "step": 2240 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019663372577431663, + "loss": 1.6873, + "step": 2250 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001965892805682537, + "loss": 1.7081, + "step": 2260 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019653956111491275, + "loss": 1.6979, + "step": 2270 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019648948826801467, + "loss": 1.698, + "step": 2280 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019644412070578336, + "loss": 1.692, + "step": 2290 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019639337691717884, + "loss": 1.6938, + "step": 2300 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019634228027014033, + "loss": 1.7152, + "step": 2310 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019629083095171264, + "loss": 1.7155, + "step": 2320 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001962390291502316, + "loss": 1.6839, + "step": 2330 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019618687505532334, + "loss": 1.6888, + "step": 2340 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001961343688579036, + "loss": 1.6962, + "step": 2350 + }, + { + "epoch": 0.41, + "learning_rate": 0.000196081510750177, + "loss": 1.6784, + "step": 2360 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019602830092563643, + "loss": 1.672, + "step": 2370 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019597473957906224, + "loss": 1.6769, + "step": 2380 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019592082690652148, + "loss": 1.6975, + "step": 2390 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019586656310536743, + "loss": 1.7687, + "step": 2400 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019581194837423857, + "loss": 1.685, + "step": 2410 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019575698291305813, + "loss": 1.6858, + "step": 2420 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001957016669230331, + "loss": 1.6883, + "step": 2430 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019565158299718013, + "loss": 1.6471, + "step": 2440 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001955956015612708, + "loss": 1.6831, + "step": 2450 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001955392701872709, + "loss": 1.6686, + "step": 2460 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019548258908138753, + "loss": 1.7006, + "step": 2470 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019542555845110805, + "loss": 1.7317, + "step": 2480 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019536817850519927, + "loss": 1.6572, + "step": 2490 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001953104494537067, + "loss": 1.6916, + "step": 2500 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001952523715079538, + "loss": 1.6533, + "step": 2510 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019519394488054127, + "loss": 1.6463, + "step": 2520 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019513516978534608, + "loss": 1.6984, + "step": 2530 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019508197443751353, + "loss": 1.6643, + "step": 2540 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001950284971627635, + "loss": 1.6853, + "step": 2550 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019496874750645754, + "loss": 1.6864, + "step": 2560 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019490865020672837, + "loss": 1.6562, + "step": 2570 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019484820548356873, + "loss": 1.6825, + "step": 2580 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019478741355824313, + "loss": 1.7296, + "step": 2590 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019472627465328692, + "loss": 1.7077, + "step": 2600 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001946647889925058, + "loss": 1.7098, + "step": 2610 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019460915560757066, + "loss": 1.6647, + "step": 2620 + }, + { + "epoch": 0.46, + "learning_rate": 0.000194547011731852, + "loss": 1.6919, + "step": 2630 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019448452175651983, + "loss": 1.6805, + "step": 2640 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001944216859103255, + "loss": 1.7276, + "step": 2650 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019435850442328637, + "loss": 1.6987, + "step": 2660 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019429497752668516, + "loss": 1.6923, + "step": 2670 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019423110545306908, + "loss": 1.6908, + "step": 2680 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019416688843624873, + "loss": 1.6799, + "step": 2690 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019410232671129745, + "loss": 1.7065, + "step": 2700 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001940374205145505, + "loss": 1.682, + "step": 2710 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019397217008360404, + "loss": 1.654, + "step": 2720 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001939065756573144, + "loss": 1.6809, + "step": 2730 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019384063747579706, + "loss": 1.6426, + "step": 2740 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019377435578042592, + "loss": 1.6453, + "step": 2750 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019370773081383235, + "loss": 1.6419, + "step": 2760 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019364076281990427, + "loss": 1.7025, + "step": 2770 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001935734520437853, + "loss": 1.6897, + "step": 2780 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019350579873187384, + "loss": 1.652, + "step": 2790 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001934378031318222, + "loss": 1.6461, + "step": 2800 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019336946549253567, + "loss": 1.6482, + "step": 2810 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019330078606417164, + "loss": 1.684, + "step": 2820 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019323176509813855, + "loss": 1.7247, + "step": 2830 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001931624028470952, + "loss": 1.6417, + "step": 2840 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019309269956494963, + "loss": 1.688, + "step": 2850 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019302967524028727, + "loss": 1.6498, + "step": 2860 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019295932470303454, + "loss": 1.6872, + "step": 2870 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019289571826614754, + "loss": 1.6668, + "step": 2880 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019283183638479643, + "loss": 1.652, + "step": 2890 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019276053369488895, + "loss": 1.6777, + "step": 2900 + }, + { + "epoch": 0.51, + "learning_rate": 0.000192688891444965, + "loss": 1.6466, + "step": 2910 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019261690989727875, + "loss": 1.6432, + "step": 2920 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019254458931532655, + "loss": 1.6499, + "step": 2930 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019247192996384572, + "loss": 1.6599, + "step": 2940 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019239893210881373, + "loss": 1.6458, + "step": 2950 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019232559601744712, + "loss": 1.69, + "step": 2960 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019225192195820067, + "loss": 1.6294, + "step": 2970 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019217791020076627, + "loss": 1.7088, + "step": 2980 + }, + { + "epoch": 0.52, + "learning_rate": 0.000192103561016072, + "loss": 1.6531, + "step": 2990 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019202887467628115, + "loss": 1.6708, + "step": 3000 + }, + { + "epoch": 0.52, + "eval_loss": 0.862983226776123, + "eval_runtime": 61.7517, + "eval_samples_per_second": 8.486, + "eval_steps_per_second": 0.534, + "step": 3000 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019195385145479116, + "loss": 1.6732, + "step": 3010 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001918784916262327, + "loss": 1.6961, + "step": 3020 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001918027954664686, + "loss": 1.6361, + "step": 3030 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019172676325259288, + "loss": 1.708, + "step": 3040 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019165039526292975, + "loss": 1.6377, + "step": 3050 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001915736917770325, + "loss": 1.667, + "step": 3060 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019149665307568263, + "loss": 1.6649, + "step": 3070 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019141927944088863, + "loss": 1.6981, + "step": 3080 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001913415711558851, + "loss": 1.6095, + "step": 3090 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019127916377084718, + "loss": 1.6629, + "step": 3100 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019120085383312737, + "loss": 1.6908, + "step": 3110 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019112221004476872, + "loss": 1.6602, + "step": 3120 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019104323269365537, + "loss": 1.6378, + "step": 3130 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019096392206889248, + "loss": 1.642, + "step": 3140 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019088427846080527, + "loss": 1.6605, + "step": 3150 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019080430216093778, + "loss": 1.6055, + "step": 3160 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019072399346205197, + "loss": 1.6423, + "step": 3170 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019064335265812652, + "loss": 1.6856, + "step": 3180 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019056238004435593, + "loss": 1.6115, + "step": 3190 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001904810759171492, + "loss": 1.6145, + "step": 3200 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019042396593693816, + "loss": 1.6301, + "step": 3210 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019034209892058318, + "loss": 1.6615, + "step": 3220 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019025990119715506, + "loss": 1.6515, + "step": 3230 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019017737306754754, + "loss": 1.7024, + "step": 3240 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019009451483386375, + "loss": 1.6598, + "step": 3250 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001900113267994153, + "loss": 1.6995, + "step": 3260 + }, + { + "epoch": 0.57, + "learning_rate": 0.00018992780926872102, + "loss": 1.684, + "step": 3270 + }, + { + "epoch": 0.57, + "learning_rate": 0.00018984396254750593, + "loss": 1.6553, + "step": 3280 + }, + { + "epoch": 0.57, + "learning_rate": 0.00018975978694270003, + "loss": 1.6515, + "step": 3290 + }, + { + "epoch": 0.57, + "learning_rate": 0.00018967528276243734, + "loss": 1.6754, + "step": 3300 + }, + { + "epoch": 0.57, + "learning_rate": 0.00018959045031605453, + "loss": 1.6483, + "step": 3310 + }, + { + "epoch": 0.58, + "learning_rate": 0.00018950528991409, + "loss": 1.6569, + "step": 3320 + }, + { + "epoch": 0.58, + "learning_rate": 0.00018941980186828263, + "loss": 1.6626, + "step": 3330 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001893339864915708, + "loss": 1.6508, + "step": 3340 + }, + { + "epoch": 0.58, + "learning_rate": 0.00018924784409809093, + "loss": 1.6662, + "step": 3350 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001891613750031767, + "loss": 1.6426, + "step": 3360 + }, + { + "epoch": 0.58, + "learning_rate": 0.00018907457952335754, + "loss": 1.6468, + "step": 3370 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001889874579763578, + "loss": 1.6326, + "step": 3380 + }, + { + "epoch": 0.59, + "learning_rate": 0.00018890001068109534, + "loss": 1.6034, + "step": 3390 + }, + { + "epoch": 0.59, + "learning_rate": 0.00018881223795768053, + "loss": 1.6951, + "step": 3400 + }, + { + "epoch": 0.59, + "learning_rate": 0.00018872414012741494, + "loss": 1.5691, + "step": 3410 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001886445743803333, + "loss": 1.6343, + "step": 3420 + }, + { + "epoch": 0.6, + "learning_rate": 0.00018857362860914253, + "loss": 1.6143, + "step": 3430 + }, + { + "epoch": 0.6, + "learning_rate": 0.00018848465460459042, + "loss": 1.663, + "step": 3440 + }, + { + "epoch": 0.6, + "learning_rate": 0.00018839535669234195, + "loss": 1.617, + "step": 3450 + }, + { + "epoch": 0.6, + "learning_rate": 0.00018830573519928195, + "loss": 1.6374, + "step": 3460 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001882157904534795, + "loss": 1.6472, + "step": 3470 + }, + { + "epoch": 0.6, + "learning_rate": 0.00018812552278418726, + "loss": 1.6326, + "step": 3480 + }, + { + "epoch": 0.61, + "learning_rate": 0.00018803493252183976, + "loss": 1.6444, + "step": 3490 + }, + { + "epoch": 0.61, + "learning_rate": 0.00018794401999805248, + "loss": 1.6167, + "step": 3500 + }, + { + "epoch": 0.61, + "learning_rate": 0.00018785278554562065, + "loss": 1.6498, + "step": 3510 + }, + { + "epoch": 0.61, + "learning_rate": 0.00018776122949851792, + "loss": 1.6605, + "step": 3520 + }, + { + "epoch": 0.61, + "learning_rate": 0.00018766935219189507, + "loss": 1.6455, + "step": 3530 + }, + { + "epoch": 0.61, + "learning_rate": 0.00018757715396207903, + "loss": 1.671, + "step": 3540 + }, + { + "epoch": 0.62, + "learning_rate": 0.00018748463514657146, + "loss": 1.6176, + "step": 3550 + }, + { + "epoch": 0.62, + "learning_rate": 0.00018739179608404747, + "loss": 1.6459, + "step": 3560 + }, + { + "epoch": 0.62, + "learning_rate": 0.00018729863711435457, + "loss": 1.6481, + "step": 3570 + }, + { + "epoch": 0.62, + "learning_rate": 0.00018720515857851132, + "loss": 1.6823, + "step": 3580 + }, + { + "epoch": 0.62, + "learning_rate": 0.00018711136081870605, + "loss": 1.6239, + "step": 3590 + }, + { + "epoch": 0.62, + "learning_rate": 0.00018701724417829565, + "loss": 1.6209, + "step": 3600 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001869228090018043, + "loss": 1.6381, + "step": 3610 + }, + { + "epoch": 0.63, + "learning_rate": 0.00018682805563492225, + "loss": 1.6498, + "step": 3620 + }, + { + "epoch": 0.63, + "learning_rate": 0.00018673298442450448, + "loss": 1.6377, + "step": 3630 + }, + { + "epoch": 0.63, + "learning_rate": 0.00018663759571856952, + "loss": 1.6513, + "step": 3640 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001865514747131347, + "loss": 1.6385, + "step": 3650 + }, + { + "epoch": 0.64, + "learning_rate": 0.00018646509707450926, + "loss": 1.6137, + "step": 3660 + }, + { + "epoch": 0.64, + "learning_rate": 0.00018636882124247248, + "loss": 1.6402, + "step": 3670 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001862819026646694, + "loss": 1.6949, + "step": 3680 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001861850264262445, + "loss": 1.6283, + "step": 3690 + }, + { + "epoch": 0.64, + "learning_rate": 0.00018608783469816221, + "loss": 1.633, + "step": 3700 + }, + { + "epoch": 0.64, + "learning_rate": 0.00018599032783620342, + "loss": 1.6442, + "step": 3710 + }, + { + "epoch": 0.65, + "learning_rate": 0.00018589250619730253, + "loss": 1.629, + "step": 3720 + }, + { + "epoch": 0.65, + "learning_rate": 0.00018580419788394125, + "loss": 1.599, + "step": 3730 + }, + { + "epoch": 0.65, + "learning_rate": 0.00018570577915633075, + "loss": 1.6526, + "step": 3740 + }, + { + "epoch": 0.65, + "learning_rate": 0.00018560704669339962, + "loss": 1.6345, + "step": 3750 + }, + { + "epoch": 0.65, + "learning_rate": 0.00018550800085656875, + "loss": 1.6609, + "step": 3760 + }, + { + "epoch": 0.65, + "learning_rate": 0.00018540864200840615, + "loss": 1.6241, + "step": 3770 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001853089705126257, + "loss": 1.6081, + "step": 3780 + }, + { + "epoch": 0.66, + "learning_rate": 0.00018520898673408576, + "loss": 1.6591, + "step": 3790 + }, + { + "epoch": 0.66, + "learning_rate": 0.00018510869103878781, + "loss": 1.6196, + "step": 3800 + }, + { + "epoch": 0.66, + "learning_rate": 0.00018500808379387515, + "loss": 1.6015, + "step": 3810 + }, + { + "epoch": 0.66, + "learning_rate": 0.00018490716536763153, + "loss": 1.6196, + "step": 3820 + }, + { + "epoch": 0.66, + "learning_rate": 0.00018480593612947978, + "loss": 1.6504, + "step": 3830 + }, + { + "epoch": 0.67, + "learning_rate": 0.00018470439644998062, + "loss": 1.6474, + "step": 3840 + }, + { + "epoch": 0.67, + "learning_rate": 0.00018460254670083103, + "loss": 1.6038, + "step": 3850 + }, + { + "epoch": 0.67, + "learning_rate": 0.00018450038725486306, + "loss": 1.712, + "step": 3860 + }, + { + "epoch": 0.67, + "learning_rate": 0.00018439791848604253, + "loss": 1.6311, + "step": 3870 + }, + { + "epoch": 0.67, + "learning_rate": 0.00018429514076946746, + "loss": 1.626, + "step": 3880 + }, + { + "epoch": 0.68, + "learning_rate": 0.00018419205448136686, + "loss": 1.6255, + "step": 3890 + }, + { + "epoch": 0.68, + "learning_rate": 0.00018408865999909932, + "loss": 1.6269, + "step": 3900 + }, + { + "epoch": 0.68, + "learning_rate": 0.00018398495770115153, + "loss": 1.5649, + "step": 3910 + }, + { + "epoch": 0.68, + "learning_rate": 0.0001838809479671371, + "loss": 1.6243, + "step": 3920 + }, + { + "epoch": 0.68, + "learning_rate": 0.0001837766311777949, + "loss": 1.6366, + "step": 3930 + }, + { + "epoch": 0.68, + "learning_rate": 0.00018367200771498787, + "loss": 1.6387, + "step": 3940 + }, + { + "epoch": 0.69, + "learning_rate": 0.00018356707796170161, + "loss": 1.6256, + "step": 3950 + }, + { + "epoch": 0.69, + "learning_rate": 0.00018346184230204292, + "loss": 1.6158, + "step": 3960 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001833563011212383, + "loss": 1.6103, + "step": 3970 + }, + { + "epoch": 0.69, + "learning_rate": 0.00018325045480563273, + "loss": 1.6038, + "step": 3980 + }, + { + "epoch": 0.69, + "learning_rate": 0.00018314430374268817, + "loss": 1.5909, + "step": 3990 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001830378483209821, + "loss": 1.612, + "step": 4000 + }, + { + "epoch": 0.69, + "eval_loss": 0.8420035243034363, + "eval_runtime": 62.0042, + "eval_samples_per_second": 8.451, + "eval_steps_per_second": 0.532, + "step": 4000 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001829310889302062, + "loss": 1.6545, + "step": 4010 + }, + { + "epoch": 0.7, + "learning_rate": 0.00018284546282243836, + "loss": 1.6391, + "step": 4020 + }, + { + "epoch": 0.7, + "learning_rate": 0.00018273815727291054, + "loss": 1.6237, + "step": 4030 + }, + { + "epoch": 0.7, + "learning_rate": 0.00018263054885136454, + "loss": 1.6281, + "step": 4040 + }, + { + "epoch": 0.7, + "learning_rate": 0.00018252263795171263, + "loss": 1.6102, + "step": 4050 + }, + { + "epoch": 0.7, + "learning_rate": 0.00018241442496897444, + "loss": 1.6246, + "step": 4060 + }, + { + "epoch": 0.71, + "learning_rate": 0.00018230591029927537, + "loss": 1.5991, + "step": 4070 + }, + { + "epoch": 0.71, + "learning_rate": 0.00018219709433984512, + "loss": 1.6252, + "step": 4080 + }, + { + "epoch": 0.71, + "learning_rate": 0.00018208797748901637, + "loss": 1.6047, + "step": 4090 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001819785601462232, + "loss": 1.6173, + "step": 4100 + }, + { + "epoch": 0.71, + "learning_rate": 0.00018186884271199967, + "loss": 1.5678, + "step": 4110 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001817588255879784, + "loss": 1.6143, + "step": 4120 + }, + { + "epoch": 0.72, + "learning_rate": 0.000181648509176889, + "loss": 1.6248, + "step": 4130 + }, + { + "epoch": 0.72, + "learning_rate": 0.00018153789388255677, + "loss": 1.6552, + "step": 4140 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001814269801099009, + "loss": 1.626, + "step": 4150 + }, + { + "epoch": 0.72, + "learning_rate": 0.00018131576826493337, + "loss": 1.6096, + "step": 4160 + }, + { + "epoch": 0.72, + "learning_rate": 0.00018120425875475723, + "loss": 1.6182, + "step": 4170 + }, + { + "epoch": 0.73, + "learning_rate": 0.00018109245198756518, + "loss": 1.6014, + "step": 4180 + }, + { + "epoch": 0.73, + "learning_rate": 0.00018099157208059183, + "loss": 1.5923, + "step": 4190 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001808792016535363, + "loss": 1.5841, + "step": 4200 + }, + { + "epoch": 0.73, + "learning_rate": 0.00018076653515937166, + "loss": 1.5837, + "step": 4210 + }, + { + "epoch": 0.73, + "learning_rate": 0.00018065357301052593, + "loss": 1.6354, + "step": 4220 + }, + { + "epoch": 0.73, + "learning_rate": 0.00018054031562050928, + "loss": 1.6433, + "step": 4230 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001804267634039127, + "loss": 1.6109, + "step": 4240 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001803243146757791, + "loss": 1.6059, + "step": 4250 + }, + { + "epoch": 0.74, + "learning_rate": 0.00018021020343474294, + "loss": 1.6127, + "step": 4260 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001800957985755384, + "loss": 1.6006, + "step": 4270 + }, + { + "epoch": 0.74, + "learning_rate": 0.00017998110051695688, + "loss": 1.5927, + "step": 4280 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001798661096788631, + "loss": 1.6142, + "step": 4290 + }, + { + "epoch": 0.75, + "learning_rate": 0.00017975082648219356, + "loss": 1.6272, + "step": 4300 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001796352513489549, + "loss": 1.6459, + "step": 4310 + }, + { + "epoch": 0.75, + "learning_rate": 0.00017951938470222247, + "loss": 1.6373, + "step": 4320 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001794032269661387, + "loss": 1.6331, + "step": 4330 + }, + { + "epoch": 0.75, + "learning_rate": 0.00017928677856591163, + "loss": 1.6007, + "step": 4340 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001791700399278133, + "loss": 1.6359, + "step": 4350 + }, + { + "epoch": 0.76, + "learning_rate": 0.00017905301147917816, + "loss": 1.5939, + "step": 4360 + }, + { + "epoch": 0.76, + "learning_rate": 0.00017893569364840154, + "loss": 1.5889, + "step": 4370 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001788180868649382, + "loss": 1.6206, + "step": 4380 + }, + { + "epoch": 0.76, + "learning_rate": 0.00017870019155930047, + "loss": 1.5902, + "step": 4390 + }, + { + "epoch": 0.76, + "learning_rate": 0.00017858200816305697, + "loss": 1.6394, + "step": 4400 + }, + { + "epoch": 0.77, + "learning_rate": 0.00017846353710883087, + "loss": 1.6193, + "step": 4410 + }, + { + "epoch": 0.77, + "learning_rate": 0.00017835666757086383, + "loss": 1.6162, + "step": 4420 + }, + { + "epoch": 0.77, + "learning_rate": 0.00017823765116211767, + "loss": 1.6329, + "step": 4430 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001781183483559451, + "loss": 1.6248, + "step": 4440 + }, + { + "epoch": 0.77, + "learning_rate": 0.00017799875958906703, + "loss": 1.6109, + "step": 4450 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001778788852992512, + "loss": 1.5499, + "step": 4460 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001777587259253104, + "loss": 1.6107, + "step": 4470 + }, + { + "epoch": 0.78, + "learning_rate": 0.00017763828190710113, + "loss": 1.5865, + "step": 4480 + }, + { + "epoch": 0.78, + "learning_rate": 0.00017751755368552178, + "loss": 1.6013, + "step": 4490 + }, + { + "epoch": 0.78, + "learning_rate": 0.00017739654170251116, + "loss": 1.5829, + "step": 4500 + }, + { + "epoch": 0.78, + "learning_rate": 0.00017727524640104674, + "loss": 1.6356, + "step": 4510 + }, + { + "epoch": 0.78, + "learning_rate": 0.00017715366822514318, + "loss": 1.6237, + "step": 4520 + }, + { + "epoch": 0.79, + "learning_rate": 0.00017703180761985063, + "loss": 1.5802, + "step": 4530 + }, + { + "epoch": 0.79, + "learning_rate": 0.00017690966503125307, + "loss": 1.5659, + "step": 4540 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001767872409064667, + "loss": 1.61, + "step": 4550 + }, + { + "epoch": 0.79, + "learning_rate": 0.00017666453569363836, + "loss": 1.6184, + "step": 4560 + }, + { + "epoch": 0.79, + "learning_rate": 0.00017654154984194382, + "loss": 1.5797, + "step": 4570 + }, + { + "epoch": 0.79, + "learning_rate": 0.00017641828380158612, + "loss": 1.6256, + "step": 4580 + }, + { + "epoch": 0.8, + "learning_rate": 0.00017629473802379403, + "loss": 1.5783, + "step": 4590 + }, + { + "epoch": 0.8, + "learning_rate": 0.00017617091296082032, + "loss": 1.5988, + "step": 4600 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001760468090659401, + "loss": 1.5904, + "step": 4610 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001759224267934491, + "loss": 1.611, + "step": 4620 + }, + { + "epoch": 0.8, + "learning_rate": 0.00017579776659866218, + "loss": 1.6066, + "step": 4630 + }, + { + "epoch": 0.81, + "learning_rate": 0.00017569783864540068, + "loss": 1.5947, + "step": 4640 + }, + { + "epoch": 0.81, + "learning_rate": 0.00017557267934112085, + "loss": 1.576, + "step": 4650 + }, + { + "epoch": 0.81, + "learning_rate": 0.00017544724339483368, + "loss": 1.6143, + "step": 4660 + }, + { + "epoch": 0.81, + "learning_rate": 0.00017532153126571107, + "loss": 1.5985, + "step": 4670 + }, + { + "epoch": 0.81, + "learning_rate": 0.00017519554341393593, + "loss": 1.5992, + "step": 4680 + }, + { + "epoch": 0.81, + "learning_rate": 0.00017506928030070054, + "loss": 1.5891, + "step": 4690 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017494274238820468, + "loss": 1.5622, + "step": 4700 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017482862369640954, + "loss": 1.5646, + "step": 4710 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017470156494228082, + "loss": 1.6121, + "step": 4720 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001745742327349537, + "loss": 1.5766, + "step": 4730 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017444662754054156, + "loss": 1.557, + "step": 4740 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017431874982615708, + "loss": 1.5716, + "step": 4750 + }, + { + "epoch": 0.83, + "learning_rate": 0.00017419060005991054, + "loss": 1.5992, + "step": 4760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001740621787109081, + "loss": 1.6036, + "step": 4770 + }, + { + "epoch": 0.83, + "learning_rate": 0.00017393348624925004, + "loss": 1.6121, + "step": 4780 + }, + { + "epoch": 0.83, + "learning_rate": 0.00017380452314602916, + "loss": 1.6076, + "step": 4790 + }, + { + "epoch": 0.83, + "learning_rate": 0.00017367528987332885, + "loss": 1.5798, + "step": 4800 + }, + { + "epoch": 0.83, + "learning_rate": 0.00017354578690422157, + "loss": 1.5597, + "step": 4810 + }, + { + "epoch": 0.84, + "learning_rate": 0.00017341601471276708, + "loss": 1.5834, + "step": 4820 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001732859737740105, + "loss": 1.6169, + "step": 4830 + }, + { + "epoch": 0.84, + "learning_rate": 0.00017315566456398086, + "loss": 1.5933, + "step": 4840 + }, + { + "epoch": 0.84, + "learning_rate": 0.00017303815729724509, + "loss": 1.6006, + "step": 4850 + }, + { + "epoch": 0.84, + "learning_rate": 0.00017292043346556449, + "loss": 1.6013, + "step": 4860 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001727893756367969, + "loss": 1.6042, + "step": 4870 + }, + { + "epoch": 0.85, + "learning_rate": 0.00017265805135460778, + "loss": 1.5738, + "step": 4880 + }, + { + "epoch": 0.85, + "learning_rate": 0.00017252646109972383, + "loss": 1.6376, + "step": 4890 + }, + { + "epoch": 0.85, + "learning_rate": 0.00017240780286177955, + "loss": 1.5904, + "step": 4900 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001722757085866635, + "loss": 1.5651, + "step": 4910 + }, + { + "epoch": 0.85, + "learning_rate": 0.00017214334973845988, + "loss": 1.5923, + "step": 4920 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001720107268016827, + "loss": 1.6032, + "step": 4930 + }, + { + "epoch": 0.86, + "learning_rate": 0.00017187784026181265, + "loss": 1.5859, + "step": 4940 + }, + { + "epoch": 0.86, + "learning_rate": 0.00017174469060529527, + "loss": 1.5376, + "step": 4950 + }, + { + "epoch": 0.86, + "learning_rate": 0.00017161127831953946, + "loss": 1.5445, + "step": 4960 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001714776038929153, + "loss": 1.5652, + "step": 4970 + }, + { + "epoch": 0.86, + "learning_rate": 0.00017134366781475262, + "loss": 1.5267, + "step": 4980 + }, + { + "epoch": 0.87, + "learning_rate": 0.00017120947057533897, + "loss": 1.5805, + "step": 4990 + }, + { + "epoch": 0.87, + "learning_rate": 0.00017108847017299018, + "loss": 1.5876, + "step": 5000 + }, + { + "epoch": 0.87, + "eval_loss": 0.826651930809021, + "eval_runtime": 61.8972, + "eval_samples_per_second": 8.466, + "eval_steps_per_second": 0.533, + "step": 5000 + }, + { + "epoch": 0.87, + "learning_rate": 0.00017095377808136445, + "loss": 1.6163, + "step": 5010 + }, + { + "epoch": 0.87, + "learning_rate": 0.00017083233311224484, + "loss": 1.629, + "step": 5020 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001706971479483343, + "loss": 1.5993, + "step": 5030 + }, + { + "epoch": 0.87, + "learning_rate": 0.00017056170398982906, + "loss": 1.6056, + "step": 5040 + }, + { + "epoch": 0.88, + "learning_rate": 0.00017042600173253645, + "loss": 1.5728, + "step": 5050 + }, + { + "epoch": 0.88, + "learning_rate": 0.00017029004167320926, + "loss": 1.6298, + "step": 5060 + }, + { + "epoch": 0.88, + "learning_rate": 0.00017015382430954413, + "loss": 1.5792, + "step": 5070 + }, + { + "epoch": 0.88, + "learning_rate": 0.00017001735014017955, + "loss": 1.5615, + "step": 5080 + }, + { + "epoch": 0.88, + "learning_rate": 0.000169880619664694, + "loss": 1.6449, + "step": 5090 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016974363338360425, + "loss": 1.5903, + "step": 5100 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001696063917983635, + "loss": 1.5682, + "step": 5110 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016946889541135946, + "loss": 1.5754, + "step": 5120 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016933114472591262, + "loss": 1.6168, + "step": 5130 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016919314024627436, + "loss": 1.5977, + "step": 5140 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016906871963807865, + "loss": 1.6037, + "step": 5150 + }, + { + "epoch": 0.9, + "learning_rate": 0.00016894409423469082, + "loss": 1.582, + "step": 5160 + }, + { + "epoch": 0.9, + "learning_rate": 0.00016880538182183466, + "loss": 1.5863, + "step": 5170 + }, + { + "epoch": 0.9, + "learning_rate": 0.00016866641753939926, + "loss": 1.5792, + "step": 5180 + }, + { + "epoch": 0.9, + "learning_rate": 0.00016852720189607857, + "loss": 1.5481, + "step": 5190 + }, + { + "epoch": 0.9, + "learning_rate": 0.00016838773540148655, + "loss": 1.594, + "step": 5200 + }, + { + "epoch": 0.9, + "learning_rate": 0.00016824801856615547, + "loss": 1.5484, + "step": 5210 + }, + { + "epoch": 0.91, + "learning_rate": 0.00016810805190153397, + "loss": 1.551, + "step": 5220 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001679678359199853, + "loss": 1.6115, + "step": 5230 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001678273711347852, + "loss": 1.5713, + "step": 5240 + }, + { + "epoch": 0.91, + "learning_rate": 0.00016770074052593968, + "loss": 1.532, + "step": 5250 + }, + { + "epoch": 0.91, + "learning_rate": 0.00016755980443113736, + "loss": 1.6103, + "step": 5260 + }, + { + "epoch": 0.91, + "learning_rate": 0.00016741862102632728, + "loss": 1.5881, + "step": 5270 + }, + { + "epoch": 0.92, + "learning_rate": 0.00016727719082832666, + "loss": 1.5909, + "step": 5280 + }, + { + "epoch": 0.92, + "learning_rate": 0.00016713551435485608, + "loss": 1.5632, + "step": 5290 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001669935921245377, + "loss": 1.5794, + "step": 5300 + }, + { + "epoch": 0.92, + "learning_rate": 0.00016685142465689326, + "loss": 1.5779, + "step": 5310 + }, + { + "epoch": 0.92, + "learning_rate": 0.00016670901247234224, + "loss": 1.615, + "step": 5320 + }, + { + "epoch": 0.93, + "learning_rate": 0.0001665663560921999, + "loss": 1.6188, + "step": 5330 + }, + { + "epoch": 0.93, + "learning_rate": 0.00016642345603867545, + "loss": 1.5351, + "step": 5340 + }, + { + "epoch": 0.93, + "learning_rate": 0.00016628031283487006, + "loss": 1.6056, + "step": 5350 + }, + { + "epoch": 0.93, + "learning_rate": 0.00016613692700477494, + "loss": 1.6137, + "step": 5360 + }, + { + "epoch": 0.93, + "learning_rate": 0.0001659932990732696, + "loss": 1.5894, + "step": 5370 + }, + { + "epoch": 0.93, + "learning_rate": 0.00016584942956611963, + "loss": 1.6144, + "step": 5380 + }, + { + "epoch": 0.94, + "learning_rate": 0.00016570531900997497, + "loss": 1.5606, + "step": 5390 + }, + { + "epoch": 0.94, + "learning_rate": 0.00016556096793236805, + "loss": 1.5671, + "step": 5400 + }, + { + "epoch": 0.94, + "learning_rate": 0.00016541637686171167, + "loss": 1.5839, + "step": 5410 + }, + { + "epoch": 0.94, + "learning_rate": 0.00016527154632729713, + "loss": 1.5991, + "step": 5420 + }, + { + "epoch": 0.94, + "learning_rate": 0.00016512647685929235, + "loss": 1.5912, + "step": 5430 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001649811689887399, + "loss": 1.578, + "step": 5440 + }, + { + "epoch": 0.95, + "learning_rate": 0.00016483562324755502, + "loss": 1.5861, + "step": 5450 + }, + { + "epoch": 0.95, + "learning_rate": 0.00016468984016852374, + "loss": 1.5587, + "step": 5460 + }, + { + "epoch": 0.95, + "learning_rate": 0.0001645438202853008, + "loss": 1.5391, + "step": 5470 + }, + { + "epoch": 0.95, + "learning_rate": 0.00016439756413240793, + "loss": 1.5762, + "step": 5480 + }, + { + "epoch": 0.95, + "learning_rate": 0.00016425107224523168, + "loss": 1.6125, + "step": 5490 + }, + { + "epoch": 0.95, + "learning_rate": 0.0001641190284371531, + "loss": 1.5776, + "step": 5500 + }, + { + "epoch": 0.96, + "learning_rate": 0.00016397209013291726, + "loss": 1.5469, + "step": 5510 + }, + { + "epoch": 0.96, + "learning_rate": 0.00016382491765189186, + "loss": 1.5636, + "step": 5520 + }, + { + "epoch": 0.96, + "learning_rate": 0.00016367751153281774, + "loss": 1.5732, + "step": 5530 + }, + { + "epoch": 0.96, + "learning_rate": 0.00016352987231529103, + "loss": 1.547, + "step": 5540 + }, + { + "epoch": 0.96, + "learning_rate": 0.00016338200053976108, + "loss": 1.6132, + "step": 5550 + }, + { + "epoch": 0.96, + "learning_rate": 0.00016323389674752868, + "loss": 1.5797, + "step": 5560 + }, + { + "epoch": 0.97, + "learning_rate": 0.00016308556148074378, + "loss": 1.5448, + "step": 5570 + }, + { + "epoch": 0.97, + "learning_rate": 0.00016293699528240386, + "loss": 1.5573, + "step": 5580 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001627881986963515, + "loss": 1.5472, + "step": 5590 + }, + { + "epoch": 0.97, + "learning_rate": 0.00016263917226727286, + "loss": 1.5518, + "step": 5600 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001624899165406954, + "loss": 1.5813, + "step": 5610 + }, + { + "epoch": 0.98, + "learning_rate": 0.00016234043206298586, + "loss": 1.5408, + "step": 5620 + }, + { + "epoch": 0.98, + "learning_rate": 0.00016219071938134845, + "loss": 1.5346, + "step": 5630 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001620557833064539, + "loss": 1.5452, + "step": 5640 + }, + { + "epoch": 0.98, + "learning_rate": 0.00016192066323037722, + "loss": 1.5825, + "step": 5650 + }, + { + "epoch": 0.98, + "learning_rate": 0.00016177031449597098, + "loss": 1.5871, + "step": 5660 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001616197396446142, + "loss": 1.5518, + "step": 5670 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001614689392275025, + "loss": 1.5645, + "step": 5680 + }, + { + "epoch": 0.99, + "learning_rate": 0.00016131791379665717, + "loss": 1.549, + "step": 5690 + }, + { + "epoch": 0.99, + "learning_rate": 0.00016116666390492325, + "loss": 1.5868, + "step": 5700 + }, + { + "epoch": 0.99, + "learning_rate": 0.00016101519010596743, + "loss": 1.5491, + "step": 5710 + }, + { + "epoch": 0.99, + "learning_rate": 0.00016086349295427595, + "loss": 1.5768, + "step": 5720 + }, + { + "epoch": 0.99, + "learning_rate": 0.00016072677501010647, + "loss": 1.5492, + "step": 5730 + }, + { + "epoch": 1.0, + "learning_rate": 0.00016057465501875367, + "loss": 1.5695, + "step": 5740 + }, + { + "epoch": 1.0, + "learning_rate": 0.00016042231328729185, + "loss": 1.5693, + "step": 5750 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001602697503733844, + "loss": 1.5273, + "step": 5760 + }, + { + "epoch": 1.0, + "learning_rate": 0.00016011696683550456, + "loss": 1.4587, + "step": 5770 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015996396323293295, + "loss": 1.4827, + "step": 5780 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015981074012575593, + "loss": 1.4734, + "step": 5790 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001596572980748634, + "loss": 1.4913, + "step": 5800 + }, + { + "epoch": 1.01, + "learning_rate": 0.00015950363764194662, + "loss": 1.4804, + "step": 5810 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001593497593894963, + "loss": 1.4434, + "step": 5820 + }, + { + "epoch": 1.01, + "learning_rate": 0.00015919566388080048, + "loss": 1.4501, + "step": 5830 + }, + { + "epoch": 1.01, + "learning_rate": 0.00015904135167994264, + "loss": 1.4815, + "step": 5840 + }, + { + "epoch": 1.02, + "learning_rate": 0.00015888682335179924, + "loss": 1.4588, + "step": 5850 + }, + { + "epoch": 1.02, + "learning_rate": 0.00015873207946203802, + "loss": 1.4268, + "step": 5860 + }, + { + "epoch": 1.02, + "learning_rate": 0.00015857712057711592, + "loss": 1.4502, + "step": 5870 + }, + { + "epoch": 1.02, + "learning_rate": 0.00015843747422863421, + "loss": 1.4486, + "step": 5880 + }, + { + "epoch": 1.02, + "learning_rate": 0.00015828210841631188, + "loss": 1.4331, + "step": 5890 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001581265292559965, + "loss": 1.4565, + "step": 5900 + }, + { + "epoch": 1.03, + "learning_rate": 0.00015797073731720253, + "loss": 1.453, + "step": 5910 + }, + { + "epoch": 1.03, + "learning_rate": 0.00015781473317022333, + "loss": 1.45, + "step": 5920 + }, + { + "epoch": 1.03, + "learning_rate": 0.00015765851738612895, + "loss": 1.4356, + "step": 5930 + }, + { + "epoch": 1.03, + "learning_rate": 0.00015750209053676432, + "loss": 1.4915, + "step": 5940 + }, + { + "epoch": 1.03, + "learning_rate": 0.00015734545319474693, + "loss": 1.4443, + "step": 5950 + }, + { + "epoch": 1.03, + "learning_rate": 0.00015718860593346473, + "loss": 1.4245, + "step": 5960 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001570315493270742, + "loss": 1.4927, + "step": 5970 + }, + { + "epoch": 1.04, + "learning_rate": 0.00015687428395049814, + "loss": 1.4698, + "step": 5980 + }, + { + "epoch": 1.04, + "learning_rate": 0.00015671681037942355, + "loss": 1.4568, + "step": 5990 + }, + { + "epoch": 1.04, + "learning_rate": 0.00015655912919029953, + "loss": 1.4498, + "step": 6000 + }, + { + "epoch": 1.04, + "eval_loss": 0.8124380707740784, + "eval_runtime": 62.0134, + "eval_samples_per_second": 8.45, + "eval_steps_per_second": 0.532, + "step": 6000 + }, + { + "epoch": 1.04, + "learning_rate": 0.00015640124096033526, + "loss": 1.4347, + "step": 6010 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001562431462674977, + "loss": 1.4754, + "step": 6020 + }, + { + "epoch": 1.05, + "learning_rate": 0.00015608484569050975, + "loss": 1.4716, + "step": 6030 + }, + { + "epoch": 1.05, + "learning_rate": 0.00015592633980884778, + "loss": 1.4523, + "step": 6040 + }, + { + "epoch": 1.05, + "learning_rate": 0.00015578350945939874, + "loss": 1.4578, + "step": 6050 + }, + { + "epoch": 1.05, + "learning_rate": 0.00015562461509800382, + "loss": 1.4311, + "step": 6060 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001554814360610988, + "loss": 1.4419, + "step": 6070 + }, + { + "epoch": 1.06, + "learning_rate": 0.00015532215531972608, + "loss": 1.449, + "step": 6080 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001551786294874456, + "loss": 1.4604, + "step": 6090 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001550189644709561, + "loss": 1.4673, + "step": 6100 + }, + { + "epoch": 1.06, + "learning_rate": 0.00015485909805156665, + "loss": 1.4787, + "step": 6110 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001546990308144857, + "loss": 1.4571, + "step": 6120 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001545387633456568, + "loss": 1.4773, + "step": 6130 + }, + { + "epoch": 1.07, + "learning_rate": 0.00015437829623175637, + "loss": 1.4816, + "step": 6140 + }, + { + "epoch": 1.07, + "learning_rate": 0.00015421763006019177, + "loss": 1.5024, + "step": 6150 + }, + { + "epoch": 1.07, + "learning_rate": 0.00015405676541909897, + "loss": 1.4956, + "step": 6160 + }, + { + "epoch": 1.07, + "learning_rate": 0.00015389570289734046, + "loss": 1.4515, + "step": 6170 + }, + { + "epoch": 1.07, + "learning_rate": 0.00015373444308450313, + "loss": 1.4745, + "step": 6180 + }, + { + "epoch": 1.07, + "learning_rate": 0.00015357298657089606, + "loss": 1.4185, + "step": 6190 + }, + { + "epoch": 1.08, + "learning_rate": 0.00015341133394754838, + "loss": 1.4462, + "step": 6200 + }, + { + "epoch": 1.08, + "learning_rate": 0.00015324948580620703, + "loss": 1.4664, + "step": 6210 + }, + { + "epoch": 1.08, + "learning_rate": 0.00015308744273933477, + "loss": 1.4747, + "step": 6220 + }, + { + "epoch": 1.08, + "learning_rate": 0.00015292520534010784, + "loss": 1.4091, + "step": 6230 + }, + { + "epoch": 1.08, + "learning_rate": 0.00015277902601747382, + "loss": 1.4634, + "step": 6240 + }, + { + "epoch": 1.08, + "learning_rate": 0.0001526164210235197, + "loss": 1.4462, + "step": 6250 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001524536234214371, + "loss": 1.4771, + "step": 6260 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001522906338071643, + "loss": 1.4639, + "step": 6270 + }, + { + "epoch": 1.09, + "learning_rate": 0.00015212745277734259, + "loss": 1.4302, + "step": 6280 + }, + { + "epoch": 1.09, + "learning_rate": 0.00015196408092931383, + "loss": 1.4412, + "step": 6290 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001518005188611184, + "loss": 1.4518, + "step": 6300 + }, + { + "epoch": 1.1, + "learning_rate": 0.00015163676717149308, + "loss": 1.4316, + "step": 6310 + }, + { + "epoch": 1.1, + "learning_rate": 0.00015147282645986866, + "loss": 1.4317, + "step": 6320 + }, + { + "epoch": 1.1, + "learning_rate": 0.00015130869732636804, + "loss": 1.4911, + "step": 6330 + }, + { + "epoch": 1.1, + "learning_rate": 0.00015114438037180364, + "loss": 1.4359, + "step": 6340 + }, + { + "epoch": 1.1, + "learning_rate": 0.00015097987619767556, + "loss": 1.4517, + "step": 6350 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001508316628659255, + "loss": 1.4638, + "step": 6360 + }, + { + "epoch": 1.11, + "learning_rate": 0.000150666804634212, + "loss": 1.4206, + "step": 6370 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001505017609311527, + "loss": 1.4665, + "step": 6380 + }, + { + "epoch": 1.11, + "learning_rate": 0.00015033653236090806, + "loss": 1.4544, + "step": 6390 + }, + { + "epoch": 1.11, + "learning_rate": 0.00015018766908612838, + "loss": 1.4356, + "step": 6400 + }, + { + "epoch": 1.11, + "learning_rate": 0.00015002209093511546, + "loss": 1.4497, + "step": 6410 + }, + { + "epoch": 1.11, + "learning_rate": 0.00014985632967280134, + "loss": 1.4469, + "step": 6420 + }, + { + "epoch": 1.12, + "learning_rate": 0.00014969038590597315, + "loss": 1.4334, + "step": 6430 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001495242602420861, + "loss": 1.4714, + "step": 6440 + }, + { + "epoch": 1.12, + "learning_rate": 0.00014935795328926125, + "loss": 1.4577, + "step": 6450 + }, + { + "epoch": 1.12, + "learning_rate": 0.00014919146565628327, + "loss": 1.4247, + "step": 6460 + }, + { + "epoch": 1.12, + "learning_rate": 0.00014902479795259822, + "loss": 1.4811, + "step": 6470 + }, + { + "epoch": 1.12, + "learning_rate": 0.00014885795078831132, + "loss": 1.4309, + "step": 6480 + }, + { + "epoch": 1.13, + "learning_rate": 0.00014869092477418482, + "loss": 1.4853, + "step": 6490 + }, + { + "epoch": 1.13, + "learning_rate": 0.00014852372052163553, + "loss": 1.4507, + "step": 6500 + }, + { + "epoch": 1.13, + "learning_rate": 0.00014835633864273287, + "loss": 1.4455, + "step": 6510 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001482055435875876, + "loss": 1.4421, + "step": 6520 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001480378259071914, + "loss": 1.4758, + "step": 6530 + }, + { + "epoch": 1.14, + "learning_rate": 0.00014786993237911187, + "loss": 1.4564, + "step": 6540 + }, + { + "epoch": 1.14, + "learning_rate": 0.00014771867836201847, + "loss": 1.4497, + "step": 6550 + }, + { + "epoch": 1.14, + "learning_rate": 0.00014755045241707308, + "loss": 1.4975, + "step": 6560 + }, + { + "epoch": 1.14, + "learning_rate": 0.00014738205240852806, + "loss": 1.4897, + "step": 6570 + }, + { + "epoch": 1.14, + "learning_rate": 0.00014721347895282978, + "loss": 1.4567, + "step": 6580 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001470447326670598, + "loss": 1.4813, + "step": 6590 + }, + { + "epoch": 1.15, + "learning_rate": 0.00014687581416893218, + "loss": 1.4861, + "step": 6600 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001467067240767915, + "loss": 1.4759, + "step": 6610 + }, + { + "epoch": 1.15, + "learning_rate": 0.00014653746300961038, + "loss": 1.4753, + "step": 6620 + }, + { + "epoch": 1.15, + "learning_rate": 0.00014636803158698738, + "loss": 1.464, + "step": 6630 + }, + { + "epoch": 1.15, + "learning_rate": 0.00014619843042914466, + "loss": 1.4849, + "step": 6640 + }, + { + "epoch": 1.15, + "learning_rate": 0.00014602866015692563, + "loss": 1.4503, + "step": 6650 + }, + { + "epoch": 1.16, + "learning_rate": 0.00014587572283276284, + "loss": 1.4429, + "step": 6660 + }, + { + "epoch": 1.16, + "learning_rate": 0.00014572264948280539, + "loss": 1.481, + "step": 6670 + }, + { + "epoch": 1.16, + "learning_rate": 0.00014555240899848083, + "loss": 1.4358, + "step": 6680 + }, + { + "epoch": 1.16, + "learning_rate": 0.00014538200176461162, + "loss": 1.4236, + "step": 6690 + }, + { + "epoch": 1.16, + "learning_rate": 0.00014521142840499203, + "loss": 1.4508, + "step": 6700 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001450406895440244, + "loss": 1.4858, + "step": 6710 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001448697858067168, + "loss": 1.4396, + "step": 6720 + }, + { + "epoch": 1.17, + "learning_rate": 0.00014469871781868098, + "loss": 1.4119, + "step": 6730 + }, + { + "epoch": 1.17, + "learning_rate": 0.00014452748620612992, + "loss": 1.4862, + "step": 6740 + }, + { + "epoch": 1.17, + "learning_rate": 0.00014435609159587555, + "loss": 1.4585, + "step": 6750 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001442016976021512, + "loss": 1.4725, + "step": 6760 + }, + { + "epoch": 1.17, + "learning_rate": 0.00014402999502526254, + "loss": 1.4497, + "step": 6770 + }, + { + "epoch": 1.18, + "learning_rate": 0.00014385813127179106, + "loss": 1.4455, + "step": 6780 + }, + { + "epoch": 1.18, + "learning_rate": 0.00014368610697086277, + "loss": 1.4252, + "step": 6790 + }, + { + "epoch": 1.18, + "learning_rate": 0.00014351392275219134, + "loss": 1.4629, + "step": 6800 + }, + { + "epoch": 1.18, + "learning_rate": 0.00014334157924607578, + "loss": 1.4628, + "step": 6810 + }, + { + "epoch": 1.18, + "learning_rate": 0.00014316907708339822, + "loss": 1.4321, + "step": 6820 + }, + { + "epoch": 1.19, + "learning_rate": 0.00014299641689562156, + "loss": 1.4558, + "step": 6830 + }, + { + "epoch": 1.19, + "learning_rate": 0.0001428235993147873, + "loss": 1.4762, + "step": 6840 + }, + { + "epoch": 1.19, + "learning_rate": 0.00014265062497351285, + "loss": 1.4601, + "step": 6850 + }, + { + "epoch": 1.19, + "learning_rate": 0.00014247749450498962, + "loss": 1.4782, + "step": 6860 + }, + { + "epoch": 1.19, + "learning_rate": 0.00014230420854298054, + "loss": 1.4407, + "step": 6870 + }, + { + "epoch": 1.19, + "learning_rate": 0.00014213076772181767, + "loss": 1.4164, + "step": 6880 + }, + { + "epoch": 1.2, + "learning_rate": 0.00014195717267640004, + "loss": 1.4223, + "step": 6890 + }, + { + "epoch": 1.2, + "learning_rate": 0.00014178342404219118, + "loss": 1.3949, + "step": 6900 + }, + { + "epoch": 1.2, + "learning_rate": 0.00014160952245521682, + "loss": 1.4718, + "step": 6910 + }, + { + "epoch": 1.2, + "learning_rate": 0.00014145288077845185, + "loss": 1.4722, + "step": 6920 + }, + { + "epoch": 1.2, + "learning_rate": 0.00014127869033547745, + "loss": 1.4446, + "step": 6930 + }, + { + "epoch": 1.2, + "learning_rate": 0.00014112178972372757, + "loss": 1.4475, + "step": 6940 + }, + { + "epoch": 1.21, + "learning_rate": 0.00014094731272664267, + "loss": 1.4202, + "step": 6950 + }, + { + "epoch": 1.21, + "learning_rate": 0.00014077268583746858, + "loss": 1.4854, + "step": 6960 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001405979096954461, + "loss": 1.4541, + "step": 6970 + }, + { + "epoch": 1.21, + "learning_rate": 0.00014042298494036228, + "loss": 1.4756, + "step": 6980 + }, + { + "epoch": 1.21, + "learning_rate": 0.00014024791221254815, + "loss": 1.435, + "step": 6990 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001400726921528765, + "loss": 1.3868, + "step": 7000 + }, + { + "epoch": 1.21, + "eval_loss": 0.8074263334274292, + "eval_runtime": 61.8502, + "eval_samples_per_second": 8.472, + "eval_steps_per_second": 0.534, + "step": 7000 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001398973254027594, + "loss": 1.4526, + "step": 7010 + }, + { + "epoch": 1.22, + "learning_rate": 0.00013972181260414585, + "loss": 1.4434, + "step": 7020 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001395461543995196, + "loss": 1.4733, + "step": 7030 + }, + { + "epoch": 1.22, + "learning_rate": 0.00013937035143189657, + "loss": 1.4456, + "step": 7040 + }, + { + "epoch": 1.22, + "learning_rate": 0.00013919440434482266, + "loss": 1.4451, + "step": 7050 + }, + { + "epoch": 1.23, + "learning_rate": 0.00013901831378237124, + "loss": 1.4572, + "step": 7060 + }, + { + "epoch": 1.23, + "learning_rate": 0.000138842080389141, + "loss": 1.4455, + "step": 7070 + }, + { + "epoch": 1.23, + "learning_rate": 0.00013866570481025346, + "loss": 1.438, + "step": 7080 + }, + { + "epoch": 1.23, + "learning_rate": 0.00013848918769135055, + "loss": 1.4261, + "step": 7090 + }, + { + "epoch": 1.23, + "learning_rate": 0.00013831252967859238, + "loss": 1.436, + "step": 7100 + }, + { + "epoch": 1.23, + "learning_rate": 0.00013813573141865484, + "loss": 1.4295, + "step": 7110 + }, + { + "epoch": 1.24, + "learning_rate": 0.00013797649360826399, + "loss": 1.4416, + "step": 7120 + }, + { + "epoch": 1.24, + "learning_rate": 0.00013779943066211437, + "loss": 1.4494, + "step": 7130 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001376222293470401, + "loss": 1.4558, + "step": 7140 + }, + { + "epoch": 1.24, + "learning_rate": 0.00013744489031170578, + "loss": 1.4371, + "step": 7150 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001372851679656103, + "loss": 1.4876, + "step": 7160 + }, + { + "epoch": 1.24, + "learning_rate": 0.00013710756905065686, + "loss": 1.4465, + "step": 7170 + }, + { + "epoch": 1.25, + "learning_rate": 0.00013692983429941337, + "loss": 1.5011, + "step": 7180 + }, + { + "epoch": 1.25, + "learning_rate": 0.00013675196436249725, + "loss": 1.4399, + "step": 7190 + }, + { + "epoch": 1.25, + "learning_rate": 0.00013657395989102067, + "loss": 1.4586, + "step": 7200 + }, + { + "epoch": 1.25, + "learning_rate": 0.00013639582153658842, + "loss": 1.431, + "step": 7210 + }, + { + "epoch": 1.25, + "learning_rate": 0.00013621754995129522, + "loss": 1.4681, + "step": 7220 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001360391457877237, + "loss": 1.47, + "step": 7230 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001358606096989416, + "loss": 1.4658, + "step": 7240 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001356819423384997, + "loss": 1.4789, + "step": 7250 + }, + { + "epoch": 1.26, + "learning_rate": 0.00013550314436042932, + "loss": 1.4218, + "step": 7260 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001353242164192399, + "loss": 1.4351, + "step": 7270 + }, + { + "epoch": 1.26, + "learning_rate": 0.00013514515916991657, + "loss": 1.4711, + "step": 7280 + }, + { + "epoch": 1.27, + "learning_rate": 0.00013496597326791786, + "loss": 1.4263, + "step": 7290 + }, + { + "epoch": 1.27, + "learning_rate": 0.00013478665936917332, + "loss": 1.4692, + "step": 7300 + }, + { + "epoch": 1.27, + "learning_rate": 0.00013460721813008086, + "loss": 1.457, + "step": 7310 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001344276502075047, + "loss": 1.4114, + "step": 7320 + }, + { + "epoch": 1.27, + "learning_rate": 0.00013424795625877276, + "loss": 1.395, + "step": 7330 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001340681369416742, + "loss": 1.4456, + "step": 7340 + }, + { + "epoch": 1.28, + "learning_rate": 0.00013388819291445723, + "loss": 1.4459, + "step": 7350 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001337081248358265, + "loss": 1.4642, + "step": 7360 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001335279333649408, + "loss": 1.4538, + "step": 7370 + }, + { + "epoch": 1.28, + "learning_rate": 0.00013334761916141064, + "loss": 1.4443, + "step": 7380 + }, + { + "epoch": 1.28, + "learning_rate": 0.00013316718288529567, + "loss": 1.4769, + "step": 7390 + }, + { + "epoch": 1.28, + "learning_rate": 0.00013300468641063172, + "loss": 1.4329, + "step": 7400 + }, + { + "epoch": 1.29, + "learning_rate": 0.00013282402001666874, + "loss": 1.4563, + "step": 7410 + }, + { + "epoch": 1.29, + "learning_rate": 0.00013264323346681258, + "loss": 1.487, + "step": 7420 + }, + { + "epoch": 1.29, + "learning_rate": 0.00013246232742285206, + "loss": 1.4135, + "step": 7430 + }, + { + "epoch": 1.29, + "learning_rate": 0.00013228130254701342, + "loss": 1.485, + "step": 7440 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001321182791051834, + "loss": 1.4554, + "step": 7450 + }, + { + "epoch": 1.29, + "learning_rate": 0.00013193703027476557, + "loss": 1.4543, + "step": 7460 + }, + { + "epoch": 1.3, + "learning_rate": 0.00013175566453537692, + "loss": 1.4419, + "step": 7470 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001315741825509265, + "loss": 1.4353, + "step": 7480 + }, + { + "epoch": 1.3, + "learning_rate": 0.00013139258498574873, + "loss": 1.4382, + "step": 7490 + }, + { + "epoch": 1.3, + "learning_rate": 0.00013121087250460132, + "loss": 1.4579, + "step": 7500 + }, + { + "epoch": 1.3, + "learning_rate": 0.00013102904577266255, + "loss": 1.4475, + "step": 7510 + }, + { + "epoch": 1.31, + "learning_rate": 0.00013084710545552893, + "loss": 1.442, + "step": 7520 + }, + { + "epoch": 1.31, + "learning_rate": 0.00013066505221921273, + "loss": 1.4578, + "step": 7530 + }, + { + "epoch": 1.31, + "learning_rate": 0.00013048288673013966, + "loss": 1.4778, + "step": 7540 + }, + { + "epoch": 1.31, + "learning_rate": 0.00013030060965514632, + "loss": 1.4279, + "step": 7550 + }, + { + "epoch": 1.31, + "learning_rate": 0.00013011822166147767, + "loss": 1.4175, + "step": 7560 + }, + { + "epoch": 1.31, + "learning_rate": 0.00012993572341678483, + "loss": 1.4537, + "step": 7570 + }, + { + "epoch": 1.32, + "learning_rate": 0.00012975311558912248, + "loss": 1.473, + "step": 7580 + }, + { + "epoch": 1.32, + "learning_rate": 0.00012957039884694638, + "loss": 1.4041, + "step": 7590 + }, + { + "epoch": 1.32, + "learning_rate": 0.00012938757385911104, + "loss": 1.4453, + "step": 7600 + }, + { + "epoch": 1.32, + "learning_rate": 0.00012920464129486723, + "loss": 1.4795, + "step": 7610 + }, + { + "epoch": 1.32, + "learning_rate": 0.00012903991056267166, + "loss": 1.4592, + "step": 7620 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001288567754484459, + "loss": 1.4767, + "step": 7630 + }, + { + "epoch": 1.33, + "learning_rate": 0.00012867353470085696, + "loss": 1.4769, + "step": 7640 + }, + { + "epoch": 1.33, + "learning_rate": 0.00012849018899067748, + "loss": 1.4212, + "step": 7650 + }, + { + "epoch": 1.33, + "learning_rate": 0.00012830673898906435, + "loss": 1.4932, + "step": 7660 + }, + { + "epoch": 1.33, + "learning_rate": 0.00012812318536755622, + "loss": 1.4644, + "step": 7670 + }, + { + "epoch": 1.33, + "learning_rate": 0.00012795789906852118, + "loss": 1.4359, + "step": 7680 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001277741504206582, + "loss": 1.4672, + "step": 7690 + }, + { + "epoch": 1.34, + "learning_rate": 0.00012759030010249867, + "loss": 1.4161, + "step": 7700 + }, + { + "epoch": 1.34, + "learning_rate": 0.00012740634878704655, + "loss": 1.4479, + "step": 7710 + }, + { + "epoch": 1.34, + "learning_rate": 0.00012722229714767566, + "loss": 1.5016, + "step": 7720 + }, + { + "epoch": 1.34, + "learning_rate": 0.00012703814585812706, + "loss": 1.4459, + "step": 7730 + }, + { + "epoch": 1.34, + "learning_rate": 0.00012685389559250655, + "loss": 1.4491, + "step": 7740 + }, + { + "epoch": 1.35, + "learning_rate": 0.00012666954702528224, + "loss": 1.4229, + "step": 7750 + }, + { + "epoch": 1.35, + "learning_rate": 0.00012648510083128212, + "loss": 1.4286, + "step": 7760 + }, + { + "epoch": 1.35, + "learning_rate": 0.00012631901634382203, + "loss": 1.4043, + "step": 7770 + }, + { + "epoch": 1.35, + "learning_rate": 0.00012613438651937683, + "loss": 1.44, + "step": 7780 + }, + { + "epoch": 1.35, + "learning_rate": 0.00012594966102716905, + "loss": 1.4456, + "step": 7790 + }, + { + "epoch": 1.35, + "learning_rate": 0.00012576484054340636, + "loss": 1.4206, + "step": 7800 + }, + { + "epoch": 1.36, + "learning_rate": 0.00012557992574464428, + "loss": 1.4273, + "step": 7810 + }, + { + "epoch": 1.36, + "learning_rate": 0.00012539491730778355, + "loss": 1.4658, + "step": 7820 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001252098159100676, + "loss": 1.423, + "step": 7830 + }, + { + "epoch": 1.36, + "learning_rate": 0.00012502462222908025, + "loss": 1.4591, + "step": 7840 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001248393369427431, + "loss": 1.3987, + "step": 7850 + }, + { + "epoch": 1.36, + "learning_rate": 0.00012465396072931307, + "loss": 1.4278, + "step": 7860 + }, + { + "epoch": 1.37, + "learning_rate": 0.00012446849426737996, + "loss": 1.4273, + "step": 7870 + }, + { + "epoch": 1.37, + "learning_rate": 0.00012428293823586387, + "loss": 1.4464, + "step": 7880 + }, + { + "epoch": 1.37, + "learning_rate": 0.00012409729331401288, + "loss": 1.4407, + "step": 7890 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001239115601814004, + "loss": 1.4192, + "step": 7900 + }, + { + "epoch": 1.37, + "learning_rate": 0.00012372573951792271, + "loss": 1.4327, + "step": 7910 + }, + { + "epoch": 1.37, + "learning_rate": 0.00012355842664409558, + "loss": 1.4286, + "step": 7920 + }, + { + "epoch": 1.38, + "learning_rate": 0.00012337244154623397, + "loss": 1.4381, + "step": 7930 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001231863708910095, + "loss": 1.4084, + "step": 7940 + }, + { + "epoch": 1.38, + "learning_rate": 0.00012300021535955412, + "loss": 1.4431, + "step": 7950 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001228139756333103, + "loss": 1.4226, + "step": 7960 + }, + { + "epoch": 1.38, + "learning_rate": 0.00012262765239402884, + "loss": 1.3949, + "step": 7970 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001224412463237662, + "loss": 1.4388, + "step": 7980 + }, + { + "epoch": 1.39, + "learning_rate": 0.00012225475810488206, + "loss": 1.4102, + "step": 7990 + }, + { + "epoch": 1.39, + "learning_rate": 0.00012208684903502762, + "loss": 1.4059, + "step": 8000 + }, + { + "epoch": 1.39, + "eval_loss": 0.8009569048881531, + "eval_runtime": 62.0166, + "eval_samples_per_second": 8.449, + "eval_steps_per_second": 0.532, + "step": 8000 + }, + { + "epoch": 1.39, + "learning_rate": 0.00012190020661473858, + "loss": 1.4513, + "step": 8010 + }, + { + "epoch": 1.39, + "learning_rate": 0.00012171348402636268, + "loss": 1.4368, + "step": 8020 + }, + { + "epoch": 1.39, + "learning_rate": 0.00012152668195341832, + "loss": 1.4503, + "step": 8030 + }, + { + "epoch": 1.4, + "learning_rate": 0.00012133980107971474, + "loss": 1.4039, + "step": 8040 + }, + { + "epoch": 1.4, + "learning_rate": 0.00012115284208934969, + "loss": 1.4467, + "step": 8050 + }, + { + "epoch": 1.4, + "learning_rate": 0.00012096580566670692, + "loss": 1.4028, + "step": 8060 + }, + { + "epoch": 1.4, + "learning_rate": 0.00012077869249645357, + "loss": 1.4299, + "step": 8070 + }, + { + "epoch": 1.4, + "learning_rate": 0.00012059150326353772, + "loss": 1.4264, + "step": 8080 + }, + { + "epoch": 1.4, + "learning_rate": 0.00012040423865318591, + "loss": 1.453, + "step": 8090 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001202168993509006, + "loss": 1.4452, + "step": 8100 + }, + { + "epoch": 1.41, + "learning_rate": 0.00012002948604245768, + "loss": 1.4251, + "step": 8110 + }, + { + "epoch": 1.41, + "learning_rate": 0.00011984199941390392, + "loss": 1.4419, + "step": 8120 + }, + { + "epoch": 1.41, + "learning_rate": 0.00011965444015155452, + "loss": 1.4453, + "step": 8130 + }, + { + "epoch": 1.41, + "learning_rate": 0.00011946680894199054, + "loss": 1.4178, + "step": 8140 + }, + { + "epoch": 1.41, + "learning_rate": 0.00011927910647205644, + "loss": 1.4655, + "step": 8150 + }, + { + "epoch": 1.42, + "learning_rate": 0.00011909133342885747, + "loss": 1.4289, + "step": 8160 + }, + { + "epoch": 1.42, + "learning_rate": 0.00011890349049975729, + "loss": 1.4673, + "step": 8170 + }, + { + "epoch": 1.42, + "learning_rate": 0.00011871557837237537, + "loss": 1.4313, + "step": 8180 + }, + { + "epoch": 1.42, + "learning_rate": 0.00011852759773458446, + "loss": 1.4631, + "step": 8190 + }, + { + "epoch": 1.42, + "learning_rate": 0.00011835835715290196, + "loss": 1.4695, + "step": 8200 + }, + { + "epoch": 1.42, + "learning_rate": 0.00011817024824131962, + "loss": 1.4487, + "step": 8210 + }, + { + "epoch": 1.43, + "learning_rate": 0.00011798207281556853, + "loss": 1.4025, + "step": 8220 + }, + { + "epoch": 1.43, + "learning_rate": 0.00011779383156448527, + "loss": 1.4665, + "step": 8230 + }, + { + "epoch": 1.43, + "learning_rate": 0.00011760552517714743, + "loss": 1.4005, + "step": 8240 + }, + { + "epoch": 1.43, + "learning_rate": 0.00011741715434287097, + "loss": 1.4488, + "step": 8250 + }, + { + "epoch": 1.43, + "learning_rate": 0.00011722871975120782, + "loss": 1.4511, + "step": 8260 + }, + { + "epoch": 1.44, + "learning_rate": 0.00011705907467624817, + "loss": 1.4192, + "step": 8270 + }, + { + "epoch": 1.44, + "learning_rate": 0.00011687052084609971, + "loss": 1.4309, + "step": 8280 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001166819052595759, + "loss": 1.4495, + "step": 8290 + }, + { + "epoch": 1.44, + "learning_rate": 0.00011649322860712455, + "loss": 1.4375, + "step": 8300 + }, + { + "epoch": 1.44, + "learning_rate": 0.00011630449157941714, + "loss": 1.4502, + "step": 8310 + }, + { + "epoch": 1.44, + "learning_rate": 0.00011611569486734603, + "loss": 1.4179, + "step": 8320 + }, + { + "epoch": 1.45, + "learning_rate": 0.00011592683916202211, + "loss": 1.4581, + "step": 8330 + }, + { + "epoch": 1.45, + "learning_rate": 0.00011573792515477222, + "loss": 1.4211, + "step": 8340 + }, + { + "epoch": 1.45, + "learning_rate": 0.00011554895353713662, + "loss": 1.4118, + "step": 8350 + }, + { + "epoch": 1.45, + "learning_rate": 0.00011535992500086643, + "loss": 1.4308, + "step": 8360 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001151708402379212, + "loss": 1.429, + "step": 8370 + }, + { + "epoch": 1.45, + "learning_rate": 0.00011498169994046621, + "loss": 1.4262, + "step": 8380 + }, + { + "epoch": 1.46, + "learning_rate": 0.00011479250480087011, + "loss": 1.4375, + "step": 8390 + }, + { + "epoch": 1.46, + "learning_rate": 0.00011462218285760746, + "loss": 1.4508, + "step": 8400 + }, + { + "epoch": 1.46, + "learning_rate": 0.00011443288542613578, + "loss": 1.4318, + "step": 8410 + }, + { + "epoch": 1.46, + "learning_rate": 0.00011424353516151814, + "loss": 1.4416, + "step": 8420 + }, + { + "epoch": 1.46, + "learning_rate": 0.00011405413275689179, + "loss": 1.4296, + "step": 8430 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001138646789055848, + "loss": 1.4192, + "step": 8440 + }, + { + "epoch": 1.47, + "learning_rate": 0.00011367517430111365, + "loss": 1.4411, + "step": 8450 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001134856196371805, + "loss": 1.407, + "step": 8460 + }, + { + "epoch": 1.47, + "learning_rate": 0.00011329601560767078, + "loss": 1.447, + "step": 8470 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001131063629066507, + "loss": 1.4344, + "step": 8480 + }, + { + "epoch": 1.47, + "learning_rate": 0.00011291666222836454, + "loss": 1.4774, + "step": 8490 + }, + { + "epoch": 1.48, + "learning_rate": 0.00011274589117127904, + "loss": 1.4209, + "step": 8500 + }, + { + "epoch": 1.48, + "learning_rate": 0.00011255610124945745, + "loss": 1.3907, + "step": 8510 + }, + { + "epoch": 1.48, + "learning_rate": 0.00011236626536466241, + "loss": 1.4373, + "step": 8520 + }, + { + "epoch": 1.48, + "learning_rate": 0.00011217638421180883, + "loss": 1.4305, + "step": 8530 + }, + { + "epoch": 1.48, + "learning_rate": 0.00011198645848597729, + "loss": 1.4338, + "step": 8540 + }, + { + "epoch": 1.48, + "learning_rate": 0.00011179648888241155, + "loss": 1.4363, + "step": 8550 + }, + { + "epoch": 1.49, + "learning_rate": 0.00011160647609651597, + "loss": 1.4053, + "step": 8560 + }, + { + "epoch": 1.49, + "learning_rate": 0.00011141642082385304, + "loss": 1.4481, + "step": 8570 + }, + { + "epoch": 1.49, + "learning_rate": 0.00011122632376014078, + "loss": 1.3928, + "step": 8580 + }, + { + "epoch": 1.49, + "learning_rate": 0.00011103618560125007, + "loss": 1.3817, + "step": 8590 + }, + { + "epoch": 1.49, + "learning_rate": 0.00011084600704320238, + "loss": 1.4496, + "step": 8600 + }, + { + "epoch": 1.49, + "learning_rate": 0.00011065578878216696, + "loss": 1.4582, + "step": 8610 + }, + { + "epoch": 1.5, + "learning_rate": 0.00011046553151445844, + "loss": 1.451, + "step": 8620 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001102752359365342, + "loss": 1.469, + "step": 8630 + }, + { + "epoch": 1.5, + "learning_rate": 0.00011008490274499193, + "loss": 1.4299, + "step": 8640 + }, + { + "epoch": 1.5, + "learning_rate": 0.00010989453263656697, + "loss": 1.4298, + "step": 8650 + }, + { + "epoch": 1.5, + "learning_rate": 0.00010972316855101048, + "loss": 1.4192, + "step": 8660 + }, + { + "epoch": 1.5, + "learning_rate": 0.00010953273022049615, + "loss": 1.4405, + "step": 8670 + }, + { + "epoch": 1.51, + "learning_rate": 0.00010934225699438665, + "loss": 1.4636, + "step": 8680 + }, + { + "epoch": 1.51, + "learning_rate": 0.00010917080183142705, + "loss": 1.4231, + "step": 8690 + }, + { + "epoch": 1.51, + "learning_rate": 0.00010898026422470837, + "loss": 1.4357, + "step": 8700 + }, + { + "epoch": 1.51, + "learning_rate": 0.00010878969374475633, + "loss": 1.4173, + "step": 8710 + }, + { + "epoch": 1.51, + "learning_rate": 0.00010859909108917496, + "loss": 1.4286, + "step": 8720 + }, + { + "epoch": 1.52, + "learning_rate": 0.00010840845695568593, + "loss": 1.4365, + "step": 8730 + }, + { + "epoch": 1.52, + "learning_rate": 0.00010821779204212623, + "loss": 1.4292, + "step": 8740 + }, + { + "epoch": 1.52, + "learning_rate": 0.00010804616787981517, + "loss": 1.4429, + "step": 8750 + }, + { + "epoch": 1.52, + "learning_rate": 0.00010785544640706349, + "loss": 1.43, + "step": 8760 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001076646961785964, + "loss": 1.4428, + "step": 8770 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001074739178926758, + "loss": 1.4511, + "step": 8780 + }, + { + "epoch": 1.53, + "learning_rate": 0.00010728311224766634, + "loss": 1.4283, + "step": 8790 + }, + { + "epoch": 1.53, + "learning_rate": 0.00010709227994203286, + "loss": 1.4041, + "step": 8800 + }, + { + "epoch": 1.53, + "learning_rate": 0.00010690142167433773, + "loss": 1.4574, + "step": 8810 + }, + { + "epoch": 1.53, + "learning_rate": 0.00010671053814323834, + "loss": 1.4421, + "step": 8820 + }, + { + "epoch": 1.53, + "learning_rate": 0.00010651963004748471, + "loss": 1.453, + "step": 8830 + }, + { + "epoch": 1.53, + "learning_rate": 0.00010632869808591662, + "loss": 1.4239, + "step": 8840 + }, + { + "epoch": 1.54, + "learning_rate": 0.00010613774295746124, + "loss": 1.4069, + "step": 8850 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001059467653611306, + "loss": 1.4335, + "step": 8860 + }, + { + "epoch": 1.54, + "learning_rate": 0.00010575576599601895, + "loss": 1.4341, + "step": 8870 + }, + { + "epoch": 1.54, + "learning_rate": 0.00010556474556130025, + "loss": 1.3864, + "step": 8880 + }, + { + "epoch": 1.54, + "learning_rate": 0.00010537370475622554, + "loss": 1.4389, + "step": 8890 + }, + { + "epoch": 1.54, + "learning_rate": 0.00010518264428012043, + "loss": 1.4895, + "step": 8900 + }, + { + "epoch": 1.55, + "learning_rate": 0.00010499156483238262, + "loss": 1.4252, + "step": 8910 + }, + { + "epoch": 1.55, + "learning_rate": 0.00010480046711247918, + "loss": 1.4495, + "step": 8920 + }, + { + "epoch": 1.55, + "learning_rate": 0.00010460935181994404, + "loss": 1.4292, + "step": 8930 + }, + { + "epoch": 1.55, + "learning_rate": 0.00010441821965437556, + "loss": 1.405, + "step": 8940 + }, + { + "epoch": 1.55, + "learning_rate": 0.00010422707131543377, + "loss": 1.4666, + "step": 8950 + }, + { + "epoch": 1.56, + "learning_rate": 0.00010405502456046876, + "loss": 1.4412, + "step": 8960 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001038638474198912, + "loss": 1.3975, + "step": 8970 + }, + { + "epoch": 1.56, + "learning_rate": 0.00010367265613528012, + "loss": 1.4423, + "step": 8980 + }, + { + "epoch": 1.56, + "learning_rate": 0.00010348145140651204, + "loss": 1.4614, + "step": 8990 + }, + { + "epoch": 1.56, + "learning_rate": 0.00010329023393351272, + "loss": 1.4521, + "step": 9000 + }, + { + "epoch": 1.56, + "eval_loss": 0.7902594208717346, + "eval_runtime": 61.869, + "eval_samples_per_second": 8.47, + "eval_steps_per_second": 0.533, + "step": 9000 + }, + { + "epoch": 1.56, + "learning_rate": 0.00010309900441625435, + "loss": 1.4428, + "step": 9010 + }, + { + "epoch": 1.57, + "learning_rate": 0.0001029077635547535, + "loss": 1.4417, + "step": 9020 + }, + { + "epoch": 1.57, + "learning_rate": 0.00010271651204906811, + "loss": 1.4228, + "step": 9030 + }, + { + "epoch": 1.57, + "learning_rate": 0.0001025252505992951, + "loss": 1.4302, + "step": 9040 + }, + { + "epoch": 1.57, + "learning_rate": 0.00010233397990556775, + "loss": 1.4544, + "step": 9050 + }, + { + "epoch": 1.57, + "learning_rate": 0.0001021618289563197, + "loss": 1.4378, + "step": 9060 + }, + { + "epoch": 1.57, + "learning_rate": 0.00010198967159704729, + "loss": 1.4494, + "step": 9070 + }, + { + "epoch": 1.58, + "learning_rate": 0.00010179837874523537, + "loss": 1.4467, + "step": 9080 + }, + { + "epoch": 1.58, + "learning_rate": 0.00010160707931026259, + "loss": 1.4443, + "step": 9090 + }, + { + "epoch": 1.58, + "learning_rate": 0.00010143490476895921, + "loss": 1.4272, + "step": 9100 + }, + { + "epoch": 1.58, + "learning_rate": 0.000101243594755249, + "loss": 1.4492, + "step": 9110 + }, + { + "epoch": 1.58, + "learning_rate": 0.00010105228018922502, + "loss": 1.4289, + "step": 9120 + }, + { + "epoch": 1.58, + "learning_rate": 0.00010086096177121504, + "loss": 1.3822, + "step": 9130 + }, + { + "epoch": 1.59, + "learning_rate": 0.00010066964020156091, + "loss": 1.413, + "step": 9140 + }, + { + "epoch": 1.59, + "learning_rate": 0.000100478316180616, + "loss": 1.4419, + "step": 9150 + }, + { + "epoch": 1.59, + "learning_rate": 0.00010028699040874277, + "loss": 1.3911, + "step": 9160 + }, + { + "epoch": 1.59, + "learning_rate": 0.00010009566358630991, + "loss": 1.4321, + "step": 9170 + }, + { + "epoch": 1.59, + "learning_rate": 9.990433641369012e-05, + "loss": 1.4358, + "step": 9180 + }, + { + "epoch": 1.6, + "learning_rate": 9.971300959125727e-05, + "loss": 1.4307, + "step": 9190 + }, + { + "epoch": 1.6, + "learning_rate": 9.952168381938401e-05, + "loss": 1.4235, + "step": 9200 + }, + { + "epoch": 1.6, + "learning_rate": 9.933035979843912e-05, + "loss": 1.4123, + "step": 9210 + }, + { + "epoch": 1.6, + "learning_rate": 9.913903822878499e-05, + "loss": 1.4267, + "step": 9220 + }, + { + "epoch": 1.6, + "learning_rate": 9.8947719810775e-05, + "loss": 1.4151, + "step": 9230 + }, + { + "epoch": 1.6, + "learning_rate": 9.875640524475103e-05, + "loss": 1.4215, + "step": 9240 + }, + { + "epoch": 1.61, + "learning_rate": 9.856509523104083e-05, + "loss": 1.3905, + "step": 9250 + }, + { + "epoch": 1.61, + "learning_rate": 9.83737904699555e-05, + "loss": 1.4147, + "step": 9260 + }, + { + "epoch": 1.61, + "learning_rate": 9.820162125476466e-05, + "loss": 1.4225, + "step": 9270 + }, + { + "epoch": 1.61, + "learning_rate": 9.802945737193441e-05, + "loss": 1.4273, + "step": 9280 + }, + { + "epoch": 1.61, + "learning_rate": 9.783817104368033e-05, + "loss": 1.4644, + "step": 9290 + }, + { + "epoch": 1.61, + "learning_rate": 9.764689262903611e-05, + "loss": 1.4026, + "step": 9300 + }, + { + "epoch": 1.62, + "learning_rate": 9.74556228281972e-05, + "loss": 1.4154, + "step": 9310 + }, + { + "epoch": 1.62, + "learning_rate": 9.726436234132755e-05, + "loss": 1.438, + "step": 9320 + }, + { + "epoch": 1.62, + "learning_rate": 9.707311186855684e-05, + "loss": 1.4191, + "step": 9330 + }, + { + "epoch": 1.62, + "learning_rate": 9.68818721099783e-05, + "loss": 1.4027, + "step": 9340 + }, + { + "epoch": 1.62, + "learning_rate": 9.669064376564584e-05, + "loss": 1.4297, + "step": 9350 + }, + { + "epoch": 1.62, + "learning_rate": 9.64994275355716e-05, + "loss": 1.4164, + "step": 9360 + }, + { + "epoch": 1.63, + "learning_rate": 9.630822411972336e-05, + "loss": 1.3995, + "step": 9370 + }, + { + "epoch": 1.63, + "learning_rate": 9.611703421802204e-05, + "loss": 1.4077, + "step": 9380 + }, + { + "epoch": 1.63, + "learning_rate": 9.592585853033905e-05, + "loss": 1.4506, + "step": 9390 + }, + { + "epoch": 1.63, + "learning_rate": 9.573469775649374e-05, + "loss": 1.4236, + "step": 9400 + }, + { + "epoch": 1.63, + "learning_rate": 9.554355259625092e-05, + "loss": 1.3765, + "step": 9410 + }, + { + "epoch": 1.63, + "learning_rate": 9.535242374931823e-05, + "loss": 1.4404, + "step": 9420 + }, + { + "epoch": 1.64, + "learning_rate": 9.516131191534359e-05, + "loss": 1.4332, + "step": 9430 + }, + { + "epoch": 1.64, + "learning_rate": 9.497021779391265e-05, + "loss": 1.4082, + "step": 9440 + }, + { + "epoch": 1.64, + "learning_rate": 9.477914208454618e-05, + "loss": 1.4542, + "step": 9450 + }, + { + "epoch": 1.64, + "learning_rate": 9.458808548669762e-05, + "loss": 1.4153, + "step": 9460 + }, + { + "epoch": 1.64, + "learning_rate": 9.439704869975043e-05, + "loss": 1.4286, + "step": 9470 + }, + { + "epoch": 1.65, + "learning_rate": 9.426333511085766e-05, + "loss": 1.3663, + "step": 9480 + }, + { + "epoch": 1.65, + "learning_rate": 9.407233360732119e-05, + "loss": 1.389, + "step": 9490 + }, + { + "epoch": 1.65, + "learning_rate": 9.388135380265187e-05, + "loss": 1.4096, + "step": 9500 + }, + { + "epoch": 1.65, + "learning_rate": 9.369039639595209e-05, + "loss": 1.4111, + "step": 9510 + }, + { + "epoch": 1.65, + "learning_rate": 9.349946208624212e-05, + "loss": 1.3776, + "step": 9520 + }, + { + "epoch": 1.65, + "learning_rate": 9.330855157245775e-05, + "loss": 1.4375, + "step": 9530 + }, + { + "epoch": 1.66, + "learning_rate": 9.31176655534477e-05, + "loss": 1.4131, + "step": 9540 + }, + { + "epoch": 1.66, + "learning_rate": 9.292680472797101e-05, + "loss": 1.3917, + "step": 9550 + }, + { + "epoch": 1.66, + "learning_rate": 9.273596979469446e-05, + "loss": 1.4346, + "step": 9560 + }, + { + "epoch": 1.66, + "learning_rate": 9.254516145219005e-05, + "loss": 1.4056, + "step": 9570 + }, + { + "epoch": 1.66, + "learning_rate": 9.235438039893248e-05, + "loss": 1.4045, + "step": 9580 + }, + { + "epoch": 1.66, + "learning_rate": 9.216362733329655e-05, + "loss": 1.4248, + "step": 9590 + }, + { + "epoch": 1.67, + "learning_rate": 9.197290295355454e-05, + "loss": 1.4291, + "step": 9600 + }, + { + "epoch": 1.67, + "learning_rate": 9.17822079578738e-05, + "loss": 1.4162, + "step": 9610 + }, + { + "epoch": 1.67, + "learning_rate": 9.159154304431409e-05, + "loss": 1.4189, + "step": 9620 + }, + { + "epoch": 1.67, + "learning_rate": 9.140090891082506e-05, + "loss": 1.4231, + "step": 9630 + }, + { + "epoch": 1.67, + "learning_rate": 9.121030625524365e-05, + "loss": 1.4008, + "step": 9640 + }, + { + "epoch": 1.67, + "learning_rate": 9.103879135550087e-05, + "loss": 1.4321, + "step": 9650 + }, + { + "epoch": 1.68, + "learning_rate": 9.084825043007008e-05, + "loss": 1.4719, + "step": 9660 + }, + { + "epoch": 1.68, + "learning_rate": 9.065774300561337e-05, + "loss": 1.4252, + "step": 9670 + }, + { + "epoch": 1.68, + "learning_rate": 9.04672697795039e-05, + "loss": 1.4116, + "step": 9680 + }, + { + "epoch": 1.68, + "learning_rate": 9.029587369187029e-05, + "loss": 1.421, + "step": 9690 + }, + { + "epoch": 1.68, + "learning_rate": 9.010546736343308e-05, + "loss": 1.4574, + "step": 9700 + }, + { + "epoch": 1.69, + "learning_rate": 8.991509725500809e-05, + "loss": 1.3972, + "step": 9710 + }, + { + "epoch": 1.69, + "learning_rate": 8.972476406346583e-05, + "loss": 1.3886, + "step": 9720 + }, + { + "epoch": 1.69, + "learning_rate": 8.953446848554158e-05, + "loss": 1.4333, + "step": 9730 + }, + { + "epoch": 1.69, + "learning_rate": 8.934421121783305e-05, + "loss": 1.3839, + "step": 9740 + }, + { + "epoch": 1.69, + "learning_rate": 8.915399295679763e-05, + "loss": 1.4411, + "step": 9750 + }, + { + "epoch": 1.69, + "learning_rate": 8.896381439874992e-05, + "loss": 1.4303, + "step": 9760 + }, + { + "epoch": 1.7, + "learning_rate": 8.877367623985927e-05, + "loss": 1.453, + "step": 9770 + }, + { + "epoch": 1.7, + "learning_rate": 8.858357917614699e-05, + "loss": 1.4066, + "step": 9780 + }, + { + "epoch": 1.7, + "learning_rate": 8.839352390348404e-05, + "loss": 1.3924, + "step": 9790 + }, + { + "epoch": 1.7, + "learning_rate": 8.820351111758849e-05, + "loss": 1.3878, + "step": 9800 + }, + { + "epoch": 1.7, + "learning_rate": 8.801354151402274e-05, + "loss": 1.3849, + "step": 9810 + }, + { + "epoch": 1.7, + "learning_rate": 8.782361578819118e-05, + "loss": 1.4135, + "step": 9820 + }, + { + "epoch": 1.71, + "learning_rate": 8.763373463533758e-05, + "loss": 1.4187, + "step": 9830 + }, + { + "epoch": 1.71, + "learning_rate": 8.748186227269857e-05, + "loss": 1.4153, + "step": 9840 + }, + { + "epoch": 1.71, + "learning_rate": 8.729206310269713e-05, + "loss": 1.4083, + "step": 9850 + }, + { + "epoch": 1.71, + "learning_rate": 8.710231045148006e-05, + "loss": 1.4055, + "step": 9860 + }, + { + "epoch": 1.71, + "learning_rate": 8.691260501365754e-05, + "loss": 1.42, + "step": 9870 + }, + { + "epoch": 1.71, + "learning_rate": 8.672294748366692e-05, + "loss": 1.438, + "step": 9880 + }, + { + "epoch": 1.72, + "learning_rate": 8.653333855577024e-05, + "loss": 1.4077, + "step": 9890 + }, + { + "epoch": 1.72, + "learning_rate": 8.634377892405157e-05, + "loss": 1.4448, + "step": 9900 + }, + { + "epoch": 1.72, + "learning_rate": 8.615426928241457e-05, + "loss": 1.3921, + "step": 9910 + }, + { + "epoch": 1.72, + "learning_rate": 8.596481032457986e-05, + "loss": 1.4282, + "step": 9920 + }, + { + "epoch": 1.72, + "learning_rate": 8.577540274408256e-05, + "loss": 1.4084, + "step": 9930 + }, + { + "epoch": 1.73, + "learning_rate": 8.558604723426972e-05, + "loss": 1.4007, + "step": 9940 + }, + { + "epoch": 1.73, + "learning_rate": 8.539674448829775e-05, + "loss": 1.4371, + "step": 9950 + }, + { + "epoch": 1.73, + "learning_rate": 8.520749519912991e-05, + "loss": 1.4304, + "step": 9960 + }, + { + "epoch": 1.73, + "learning_rate": 8.501830005953381e-05, + "loss": 1.3866, + "step": 9970 + }, + { + "epoch": 1.73, + "learning_rate": 8.482915976207883e-05, + "loss": 1.4156, + "step": 9980 + }, + { + "epoch": 1.73, + "learning_rate": 8.464007499913359e-05, + "loss": 1.4735, + "step": 9990 + }, + { + "epoch": 1.74, + "learning_rate": 8.445104646286339e-05, + "loss": 1.3907, + "step": 10000 + }, + { + "epoch": 1.74, + "eval_loss": 0.7795238494873047, + "eval_runtime": 61.8635, + "eval_samples_per_second": 8.47, + "eval_steps_per_second": 0.533, + "step": 10000 + }, + { + "epoch": 1.74, + "learning_rate": 8.428096942593624e-05, + "loss": 1.3785, + "step": 10010 + }, + { + "epoch": 1.74, + "learning_rate": 8.409204962652496e-05, + "loss": 1.4134, + "step": 10020 + }, + { + "epoch": 1.74, + "learning_rate": 8.392207157637791e-05, + "loss": 1.3956, + "step": 10030 + }, + { + "epoch": 1.74, + "learning_rate": 8.373326301036039e-05, + "loss": 1.3801, + "step": 10040 + }, + { + "epoch": 1.74, + "learning_rate": 8.354451399050185e-05, + "loss": 1.3976, + "step": 10050 + }, + { + "epoch": 1.75, + "learning_rate": 8.335582520773848e-05, + "loss": 1.4065, + "step": 10060 + }, + { + "epoch": 1.75, + "learning_rate": 8.316719735278616e-05, + "loss": 1.4058, + "step": 10070 + }, + { + "epoch": 1.75, + "learning_rate": 8.29786311161376e-05, + "loss": 1.3986, + "step": 10080 + }, + { + "epoch": 1.75, + "learning_rate": 8.279012718806004e-05, + "loss": 1.3801, + "step": 10090 + }, + { + "epoch": 1.75, + "learning_rate": 8.260168625859259e-05, + "loss": 1.4051, + "step": 10100 + }, + { + "epoch": 1.75, + "learning_rate": 8.241330901754376e-05, + "loss": 1.4178, + "step": 10110 + }, + { + "epoch": 1.76, + "learning_rate": 8.222499615448894e-05, + "loss": 1.3994, + "step": 10120 + }, + { + "epoch": 1.76, + "learning_rate": 8.203674835876778e-05, + "loss": 1.3948, + "step": 10130 + }, + { + "epoch": 1.76, + "learning_rate": 8.184856631948184e-05, + "loss": 1.412, + "step": 10140 + }, + { + "epoch": 1.76, + "learning_rate": 8.16604507254919e-05, + "loss": 1.4018, + "step": 10150 + }, + { + "epoch": 1.76, + "learning_rate": 8.147240226541555e-05, + "loss": 1.395, + "step": 10160 + }, + { + "epoch": 1.77, + "learning_rate": 8.128442162762465e-05, + "loss": 1.4177, + "step": 10170 + }, + { + "epoch": 1.77, + "learning_rate": 8.109650950024272e-05, + "loss": 1.4684, + "step": 10180 + }, + { + "epoch": 1.77, + "learning_rate": 8.090866657114254e-05, + "loss": 1.3745, + "step": 10190 + }, + { + "epoch": 1.77, + "learning_rate": 8.072089352794361e-05, + "loss": 1.4243, + "step": 10200 + }, + { + "epoch": 1.77, + "learning_rate": 8.055195810962145e-05, + "loss": 1.3984, + "step": 10210 + }, + { + "epoch": 1.77, + "learning_rate": 8.036431974310813e-05, + "loss": 1.4003, + "step": 10220 + }, + { + "epoch": 1.78, + "learning_rate": 8.017675325513676e-05, + "loss": 1.4197, + "step": 10230 + }, + { + "epoch": 1.78, + "learning_rate": 8.000800543960246e-05, + "loss": 1.389, + "step": 10240 + }, + { + "epoch": 1.78, + "learning_rate": 7.9820577412245e-05, + "loss": 1.3928, + "step": 10250 + }, + { + "epoch": 1.78, + "learning_rate": 7.96332232538574e-05, + "loss": 1.4402, + "step": 10260 + }, + { + "epoch": 1.78, + "learning_rate": 7.94459436502699e-05, + "loss": 1.4059, + "step": 10270 + }, + { + "epoch": 1.78, + "learning_rate": 7.925873928703986e-05, + "loss": 1.4186, + "step": 10280 + }, + { + "epoch": 1.79, + "learning_rate": 7.90716108494492e-05, + "loss": 1.4009, + "step": 10290 + }, + { + "epoch": 1.79, + "learning_rate": 7.888455902250194e-05, + "loss": 1.4537, + "step": 10300 + }, + { + "epoch": 1.79, + "learning_rate": 7.869758449092155e-05, + "loss": 1.4237, + "step": 10310 + }, + { + "epoch": 1.79, + "learning_rate": 7.851068793914867e-05, + "loss": 1.4158, + "step": 10320 + }, + { + "epoch": 1.79, + "learning_rate": 7.832387005133845e-05, + "loss": 1.3977, + "step": 10330 + }, + { + "epoch": 1.79, + "learning_rate": 7.813713151135805e-05, + "loss": 1.4253, + "step": 10340 + }, + { + "epoch": 1.8, + "learning_rate": 7.795047300278422e-05, + "loss": 1.3966, + "step": 10350 + }, + { + "epoch": 1.8, + "learning_rate": 7.776389520890071e-05, + "loss": 1.4211, + "step": 10360 + }, + { + "epoch": 1.8, + "learning_rate": 7.757739881269582e-05, + "loss": 1.3777, + "step": 10370 + }, + { + "epoch": 1.8, + "learning_rate": 7.739098449685987e-05, + "loss": 1.3922, + "step": 10380 + }, + { + "epoch": 1.8, + "learning_rate": 7.720465294378272e-05, + "loss": 1.3884, + "step": 10390 + }, + { + "epoch": 1.81, + "learning_rate": 7.70184048355513e-05, + "loss": 1.4122, + "step": 10400 + }, + { + "epoch": 1.81, + "learning_rate": 7.683224085394702e-05, + "loss": 1.403, + "step": 10410 + }, + { + "epoch": 1.81, + "learning_rate": 7.664616168044339e-05, + "loss": 1.433, + "step": 10420 + }, + { + "epoch": 1.81, + "learning_rate": 7.646016799620345e-05, + "loss": 1.3929, + "step": 10430 + }, + { + "epoch": 1.81, + "learning_rate": 7.631143505862324e-05, + "loss": 1.4136, + "step": 10440 + }, + { + "epoch": 1.81, + "learning_rate": 7.614417685784577e-05, + "loss": 1.3835, + "step": 10450 + }, + { + "epoch": 1.82, + "learning_rate": 7.595841739456996e-05, + "loss": 1.3911, + "step": 10460 + }, + { + "epoch": 1.82, + "learning_rate": 7.577274593812058e-05, + "loss": 1.3981, + "step": 10470 + }, + { + "epoch": 1.82, + "learning_rate": 7.558716316816814e-05, + "loss": 1.4022, + "step": 10480 + }, + { + "epoch": 1.82, + "learning_rate": 7.54016697640586e-05, + "loss": 1.4368, + "step": 10490 + }, + { + "epoch": 1.82, + "learning_rate": 7.521626640481061e-05, + "loss": 1.4221, + "step": 10500 + }, + { + "epoch": 1.82, + "learning_rate": 7.503095376911342e-05, + "loss": 1.4189, + "step": 10510 + }, + { + "epoch": 1.83, + "learning_rate": 7.484573253532406e-05, + "loss": 1.4389, + "step": 10520 + }, + { + "epoch": 1.83, + "learning_rate": 7.466060338146498e-05, + "loss": 1.3616, + "step": 10530 + }, + { + "epoch": 1.83, + "learning_rate": 7.447556698522156e-05, + "loss": 1.4295, + "step": 10540 + }, + { + "epoch": 1.83, + "learning_rate": 7.429062402393965e-05, + "loss": 1.3832, + "step": 10550 + }, + { + "epoch": 1.83, + "learning_rate": 7.410577517462307e-05, + "loss": 1.3762, + "step": 10560 + }, + { + "epoch": 1.83, + "learning_rate": 7.392102111393116e-05, + "loss": 1.4565, + "step": 10570 + }, + { + "epoch": 1.84, + "learning_rate": 7.373636251817615e-05, + "loss": 1.4553, + "step": 10580 + }, + { + "epoch": 1.84, + "learning_rate": 7.355180006332097e-05, + "loss": 1.4171, + "step": 10590 + }, + { + "epoch": 1.84, + "learning_rate": 7.336733442497654e-05, + "loss": 1.3761, + "step": 10600 + }, + { + "epoch": 1.84, + "learning_rate": 7.318296627839935e-05, + "loss": 1.3928, + "step": 10610 + }, + { + "epoch": 1.84, + "learning_rate": 7.299869629848908e-05, + "loss": 1.3833, + "step": 10620 + }, + { + "epoch": 1.84, + "learning_rate": 7.281452515978599e-05, + "loss": 1.3991, + "step": 10630 + }, + { + "epoch": 1.85, + "learning_rate": 7.263045353646861e-05, + "loss": 1.4247, + "step": 10640 + }, + { + "epoch": 1.85, + "learning_rate": 7.248326834170777e-05, + "loss": 1.473, + "step": 10650 + }, + { + "epoch": 1.85, + "learning_rate": 7.229937754384992e-05, + "loss": 1.3753, + "step": 10660 + }, + { + "epoch": 1.85, + "learning_rate": 7.211558814713165e-05, + "loss": 1.41, + "step": 10670 + }, + { + "epoch": 1.85, + "learning_rate": 7.195026494412065e-05, + "loss": 1.4259, + "step": 10680 + }, + { + "epoch": 1.86, + "learning_rate": 7.176667006277049e-05, + "loss": 1.4327, + "step": 10690 + }, + { + "epoch": 1.86, + "learning_rate": 7.158317853259342e-05, + "loss": 1.397, + "step": 10700 + }, + { + "epoch": 1.86, + "learning_rate": 7.13997910252802e-05, + "loss": 1.3827, + "step": 10710 + }, + { + "epoch": 1.86, + "learning_rate": 7.121650821214074e-05, + "loss": 1.417, + "step": 10720 + }, + { + "epoch": 1.86, + "learning_rate": 7.103333076410166e-05, + "loss": 1.4142, + "step": 10730 + }, + { + "epoch": 1.86, + "learning_rate": 7.085025935170397e-05, + "loss": 1.4047, + "step": 10740 + }, + { + "epoch": 1.87, + "learning_rate": 7.066729464510045e-05, + "loss": 1.4153, + "step": 10750 + }, + { + "epoch": 1.87, + "learning_rate": 7.04844373140533e-05, + "loss": 1.3896, + "step": 10760 + }, + { + "epoch": 1.87, + "learning_rate": 7.030168802793164e-05, + "loss": 1.4255, + "step": 10770 + }, + { + "epoch": 1.87, + "learning_rate": 7.011904745570912e-05, + "loss": 1.4286, + "step": 10780 + }, + { + "epoch": 1.87, + "learning_rate": 6.993651626596138e-05, + "loss": 1.4337, + "step": 10790 + }, + { + "epoch": 1.87, + "learning_rate": 6.97540951268637e-05, + "loss": 1.3943, + "step": 10800 + }, + { + "epoch": 1.88, + "learning_rate": 6.95717847061885e-05, + "loss": 1.4138, + "step": 10810 + }, + { + "epoch": 1.88, + "learning_rate": 6.938958567130285e-05, + "loss": 1.3604, + "step": 10820 + }, + { + "epoch": 1.88, + "learning_rate": 6.920749868916618e-05, + "loss": 1.4096, + "step": 10830 + }, + { + "epoch": 1.88, + "learning_rate": 6.902552442632765e-05, + "loss": 1.3915, + "step": 10840 + }, + { + "epoch": 1.88, + "learning_rate": 6.88436635489238e-05, + "loss": 1.3946, + "step": 10850 + }, + { + "epoch": 1.88, + "learning_rate": 6.868008625403449e-05, + "loss": 1.3984, + "step": 10860 + }, + { + "epoch": 1.89, + "learning_rate": 6.851660182560898e-05, + "loss": 1.4222, + "step": 10870 + }, + { + "epoch": 1.89, + "learning_rate": 6.833506196772657e-05, + "loss": 1.43, + "step": 10880 + }, + { + "epoch": 1.89, + "learning_rate": 6.815363802279173e-05, + "loss": 1.3911, + "step": 10890 + }, + { + "epoch": 1.89, + "learning_rate": 6.797233065492654e-05, + "loss": 1.4225, + "step": 10900 + }, + { + "epoch": 1.89, + "learning_rate": 6.779114052782636e-05, + "loss": 1.4097, + "step": 10910 + }, + { + "epoch": 1.9, + "learning_rate": 6.761006830475733e-05, + "loss": 1.3827, + "step": 10920 + }, + { + "epoch": 1.9, + "learning_rate": 6.742911464855399e-05, + "loss": 1.4585, + "step": 10930 + }, + { + "epoch": 1.9, + "learning_rate": 6.724828022161692e-05, + "loss": 1.4062, + "step": 10940 + }, + { + "epoch": 1.9, + "learning_rate": 6.706756568591013e-05, + "loss": 1.4324, + "step": 10950 + }, + { + "epoch": 1.9, + "learning_rate": 6.68869717029588e-05, + "loss": 1.3791, + "step": 10960 + }, + { + "epoch": 1.9, + "learning_rate": 6.670649893384692e-05, + "loss": 1.4147, + "step": 10970 + }, + { + "epoch": 1.91, + "learning_rate": 6.652614803921461e-05, + "loss": 1.4273, + "step": 10980 + }, + { + "epoch": 1.91, + "learning_rate": 6.634591967925598e-05, + "loss": 1.3976, + "step": 10990 + }, + { + "epoch": 1.91, + "learning_rate": 6.616581451371651e-05, + "loss": 1.3826, + "step": 11000 + }, + { + "epoch": 1.91, + "eval_loss": 0.7706289887428284, + "eval_runtime": 63.1592, + "eval_samples_per_second": 8.296, + "eval_steps_per_second": 0.522, + "step": 11000 + }, + { + "epoch": 1.91, + "learning_rate": 6.598583320189075e-05, + "loss": 1.4273, + "step": 11010 + }, + { + "epoch": 1.91, + "learning_rate": 6.580597640261978e-05, + "loss": 1.3904, + "step": 11020 + }, + { + "epoch": 1.91, + "learning_rate": 6.562624477428905e-05, + "loss": 1.3824, + "step": 11030 + }, + { + "epoch": 1.92, + "learning_rate": 6.544663897482568e-05, + "loss": 1.3709, + "step": 11040 + }, + { + "epoch": 1.92, + "learning_rate": 6.528510188239592e-05, + "loss": 1.4232, + "step": 11050 + }, + { + "epoch": 1.92, + "learning_rate": 6.510573696871829e-05, + "loss": 1.3842, + "step": 11060 + }, + { + "epoch": 1.92, + "learning_rate": 6.492649978928341e-05, + "loss": 1.3655, + "step": 11070 + }, + { + "epoch": 1.92, + "learning_rate": 6.47473910002085e-05, + "loss": 1.3651, + "step": 11080 + }, + { + "epoch": 1.92, + "learning_rate": 6.456841125714071e-05, + "loss": 1.3925, + "step": 11090 + }, + { + "epoch": 1.93, + "learning_rate": 6.440744036422758e-05, + "loss": 1.3946, + "step": 11100 + }, + { + "epoch": 1.93, + "learning_rate": 6.422870761318759e-05, + "loss": 1.4216, + "step": 11110 + }, + { + "epoch": 1.93, + "learning_rate": 6.405010580685171e-05, + "loss": 1.3654, + "step": 11120 + }, + { + "epoch": 1.93, + "learning_rate": 6.387163559901117e-05, + "loss": 1.3438, + "step": 11130 + }, + { + "epoch": 1.93, + "learning_rate": 6.36932976429756e-05, + "loss": 1.4255, + "step": 11140 + }, + { + "epoch": 1.94, + "learning_rate": 6.35150925915705e-05, + "loss": 1.4268, + "step": 11150 + }, + { + "epoch": 1.94, + "learning_rate": 6.333702109713477e-05, + "loss": 1.3947, + "step": 11160 + }, + { + "epoch": 1.94, + "learning_rate": 6.315908381151857e-05, + "loss": 1.4452, + "step": 11170 + }, + { + "epoch": 1.94, + "learning_rate": 6.298128138608059e-05, + "loss": 1.4187, + "step": 11180 + }, + { + "epoch": 1.94, + "learning_rate": 6.280361447168603e-05, + "loss": 1.3878, + "step": 11190 + }, + { + "epoch": 1.94, + "learning_rate": 6.264383064821323e-05, + "loss": 1.3753, + "step": 11200 + }, + { + "epoch": 1.95, + "learning_rate": 6.246642299615586e-05, + "loss": 1.372, + "step": 11210 + }, + { + "epoch": 1.95, + "learning_rate": 6.230687356416249e-05, + "loss": 1.361, + "step": 11220 + }, + { + "epoch": 1.95, + "learning_rate": 6.212972751884663e-05, + "loss": 1.4421, + "step": 11230 + }, + { + "epoch": 1.95, + "learning_rate": 6.195272010177959e-05, + "loss": 1.4402, + "step": 11240 + }, + { + "epoch": 1.95, + "learning_rate": 6.177585196091631e-05, + "loss": 1.3958, + "step": 11250 + }, + { + "epoch": 1.95, + "learning_rate": 6.159912374370183e-05, + "loss": 1.4365, + "step": 11260 + }, + { + "epoch": 1.96, + "learning_rate": 6.142253609706898e-05, + "loss": 1.4041, + "step": 11270 + }, + { + "epoch": 1.96, + "learning_rate": 6.124608966743606e-05, + "loss": 1.4321, + "step": 11280 + }, + { + "epoch": 1.96, + "learning_rate": 6.106978510070443e-05, + "loss": 1.4154, + "step": 11290 + }, + { + "epoch": 1.96, + "learning_rate": 6.089362304225603e-05, + "loss": 1.4208, + "step": 11300 + }, + { + "epoch": 1.96, + "learning_rate": 6.071760413695131e-05, + "loss": 1.3757, + "step": 11310 + }, + { + "epoch": 1.96, + "learning_rate": 6.054172902912656e-05, + "loss": 1.4266, + "step": 11320 + }, + { + "epoch": 1.97, + "learning_rate": 6.0365998362591744e-05, + "loss": 1.4081, + "step": 11330 + }, + { + "epoch": 1.97, + "learning_rate": 6.019041278062807e-05, + "loss": 1.4108, + "step": 11340 + }, + { + "epoch": 1.97, + "learning_rate": 6.0032510335413086e-05, + "loss": 1.3667, + "step": 11350 + }, + { + "epoch": 1.97, + "learning_rate": 5.985720218447026e-05, + "loss": 1.3987, + "step": 11360 + }, + { + "epoch": 1.97, + "learning_rate": 5.9682040980602316e-05, + "loss": 1.3553, + "step": 11370 + }, + { + "epoch": 1.98, + "learning_rate": 5.9524522066830346e-05, + "loss": 1.3722, + "step": 11380 + }, + { + "epoch": 1.98, + "learning_rate": 5.934964182845485e-05, + "loss": 1.4445, + "step": 11390 + }, + { + "epoch": 1.98, + "learning_rate": 5.917491039513411e-05, + "loss": 1.3968, + "step": 11400 + }, + { + "epoch": 1.98, + "learning_rate": 5.9000328406491425e-05, + "loss": 1.3855, + "step": 11410 + }, + { + "epoch": 1.98, + "learning_rate": 5.882589650160322e-05, + "loss": 1.3988, + "step": 11420 + }, + { + "epoch": 1.98, + "learning_rate": 5.865161531899642e-05, + "loss": 1.3642, + "step": 11430 + }, + { + "epoch": 1.99, + "learning_rate": 5.8477485496646245e-05, + "loss": 1.4189, + "step": 11440 + }, + { + "epoch": 1.99, + "learning_rate": 5.8303507671973864e-05, + "loss": 1.4004, + "step": 11450 + }, + { + "epoch": 1.99, + "learning_rate": 5.812968248184392e-05, + "loss": 1.3928, + "step": 11460 + }, + { + "epoch": 1.99, + "learning_rate": 5.795601056256257e-05, + "loss": 1.4273, + "step": 11470 + }, + { + "epoch": 1.99, + "learning_rate": 5.778249254987461e-05, + "loss": 1.3824, + "step": 11480 + }, + { + "epoch": 1.99, + "learning_rate": 5.7609129078961655e-05, + "loss": 1.4004, + "step": 11490 + }, + { + "epoch": 2.0, + "learning_rate": 5.7435920784439514e-05, + "loss": 1.3801, + "step": 11500 + }, + { + "epoch": 2.0, + "learning_rate": 5.7262868300355975e-05, + "loss": 1.373, + "step": 11510 + }, + { + "epoch": 2.0, + "learning_rate": 5.7089972260188485e-05, + "loss": 1.414, + "step": 11520 + }, + { + "epoch": 2.0, + "learning_rate": 5.6917233296841776e-05, + "loss": 1.3001, + "step": 11530 + }, + { + "epoch": 2.0, + "learning_rate": 5.6744652042645616e-05, + "loss": 1.3162, + "step": 11540 + }, + { + "epoch": 2.0, + "learning_rate": 5.6572229129352474e-05, + "loss": 1.2765, + "step": 11550 + }, + { + "epoch": 2.01, + "learning_rate": 5.6399965188135084e-05, + "loss": 1.2453, + "step": 11560 + }, + { + "epoch": 2.01, + "learning_rate": 5.622786084958437e-05, + "loss": 1.2787, + "step": 11570 + }, + { + "epoch": 2.01, + "learning_rate": 5.6090292716047934e-05, + "loss": 1.2726, + "step": 11580 + }, + { + "epoch": 2.01, + "learning_rate": 5.591847724951989e-05, + "loss": 1.2786, + "step": 11590 + }, + { + "epoch": 2.01, + "learning_rate": 5.574682314819745e-05, + "loss": 1.2788, + "step": 11600 + }, + { + "epoch": 2.02, + "learning_rate": 5.557533104043913e-05, + "loss": 1.2654, + "step": 11610 + }, + { + "epoch": 2.02, + "learning_rate": 5.54040015540104e-05, + "loss": 1.2507, + "step": 11620 + }, + { + "epoch": 2.02, + "learning_rate": 5.5249944575829906e-05, + "loss": 1.3315, + "step": 11630 + }, + { + "epoch": 2.02, + "learning_rate": 5.507892579728751e-05, + "loss": 1.2968, + "step": 11640 + }, + { + "epoch": 2.02, + "learning_rate": 5.490807145722008e-05, + "loss": 1.3051, + "step": 11650 + }, + { + "epoch": 2.02, + "learning_rate": 5.47373821810585e-05, + "loss": 1.306, + "step": 11660 + }, + { + "epoch": 2.03, + "learning_rate": 5.4566858593629454e-05, + "loss": 1.3139, + "step": 11670 + }, + { + "epoch": 2.03, + "learning_rate": 5.439650131915299e-05, + "loss": 1.277, + "step": 11680 + }, + { + "epoch": 2.03, + "learning_rate": 5.4226310981240466e-05, + "loss": 1.2737, + "step": 11690 + }, + { + "epoch": 2.03, + "learning_rate": 5.4056288202892126e-05, + "loss": 1.311, + "step": 11700 + }, + { + "epoch": 2.03, + "learning_rate": 5.3886433606494804e-05, + "loss": 1.2775, + "step": 11710 + }, + { + "epoch": 2.03, + "learning_rate": 5.37167478138197e-05, + "loss": 1.2633, + "step": 11720 + }, + { + "epoch": 2.04, + "learning_rate": 5.354723144602016e-05, + "loss": 1.3022, + "step": 11730 + }, + { + "epoch": 2.04, + "learning_rate": 5.337788512362931e-05, + "loss": 1.2979, + "step": 11740 + }, + { + "epoch": 2.04, + "learning_rate": 5.320870946655765e-05, + "loss": 1.2726, + "step": 11750 + }, + { + "epoch": 2.04, + "learning_rate": 5.303970509409113e-05, + "loss": 1.2303, + "step": 11760 + }, + { + "epoch": 2.04, + "learning_rate": 5.2870872624888615e-05, + "loss": 1.2648, + "step": 11770 + }, + { + "epoch": 2.04, + "learning_rate": 5.2702212676979704e-05, + "loss": 1.2865, + "step": 11780 + }, + { + "epoch": 2.05, + "learning_rate": 5.253372586776248e-05, + "loss": 1.2944, + "step": 11790 + }, + { + "epoch": 2.05, + "learning_rate": 5.236541281400122e-05, + "loss": 1.3188, + "step": 11800 + }, + { + "epoch": 2.05, + "learning_rate": 5.219727413182419e-05, + "loss": 1.3009, + "step": 11810 + }, + { + "epoch": 2.05, + "learning_rate": 5.202931043672124e-05, + "loss": 1.2727, + "step": 11820 + }, + { + "epoch": 2.05, + "learning_rate": 5.186152234354172e-05, + "loss": 1.2515, + "step": 11830 + }, + { + "epoch": 2.05, + "learning_rate": 5.172741871515152e-05, + "loss": 1.3243, + "step": 11840 + }, + { + "epoch": 2.06, + "learning_rate": 5.1559948252801414e-05, + "loss": 1.3009, + "step": 11850 + }, + { + "epoch": 2.06, + "learning_rate": 5.139265511052607e-05, + "loss": 1.3033, + "step": 11860 + }, + { + "epoch": 2.06, + "learning_rate": 5.122553990072023e-05, + "loss": 1.2961, + "step": 11870 + }, + { + "epoch": 2.06, + "learning_rate": 5.10586032351273e-05, + "loss": 1.2361, + "step": 11880 + }, + { + "epoch": 2.06, + "learning_rate": 5.090851339647496e-05, + "loss": 1.2856, + "step": 11890 + }, + { + "epoch": 2.07, + "learning_rate": 5.074191764789694e-05, + "loss": 1.2688, + "step": 11900 + }, + { + "epoch": 2.07, + "learning_rate": 5.0575502213883655e-05, + "loss": 1.2338, + "step": 11910 + }, + { + "epoch": 2.07, + "learning_rate": 5.040926770361687e-05, + "loss": 1.3065, + "step": 11920 + }, + { + "epoch": 2.07, + "learning_rate": 5.0243214725616126e-05, + "loss": 1.2683, + "step": 11930 + }, + { + "epoch": 2.07, + "learning_rate": 5.00773438877363e-05, + "loss": 1.3036, + "step": 11940 + }, + { + "epoch": 2.07, + "learning_rate": 4.99116557971657e-05, + "loss": 1.25, + "step": 11950 + }, + { + "epoch": 2.08, + "learning_rate": 4.9746151060423564e-05, + "loss": 1.2719, + "step": 11960 + }, + { + "epoch": 2.08, + "learning_rate": 4.958083028335794e-05, + "loss": 1.2411, + "step": 11970 + }, + { + "epoch": 2.08, + "learning_rate": 4.9415694071143584e-05, + "loss": 1.286, + "step": 11980 + }, + { + "epoch": 2.08, + "learning_rate": 4.9250743028279486e-05, + "loss": 1.2786, + "step": 11990 + }, + { + "epoch": 2.08, + "learning_rate": 4.9085977758586906e-05, + "loss": 1.2634, + "step": 12000 + }, + { + "epoch": 2.08, + "eval_loss": 0.7820777893066406, + "eval_runtime": 62.0421, + "eval_samples_per_second": 8.446, + "eval_steps_per_second": 0.532, + "step": 12000 + }, + { + "epoch": 2.08, + "learning_rate": 4.8921398865207045e-05, + "loss": 1.3052, + "step": 12010 + }, + { + "epoch": 2.09, + "learning_rate": 4.875700695059875e-05, + "loss": 1.3028, + "step": 12020 + }, + { + "epoch": 2.09, + "learning_rate": 4.859280261653654e-05, + "loss": 1.3132, + "step": 12030 + }, + { + "epoch": 2.09, + "learning_rate": 4.8428786464108225e-05, + "loss": 1.3153, + "step": 12040 + }, + { + "epoch": 2.09, + "learning_rate": 4.826495909371276e-05, + "loss": 1.3391, + "step": 12050 + }, + { + "epoch": 2.09, + "learning_rate": 4.810132110505804e-05, + "loss": 1.2821, + "step": 12060 + }, + { + "epoch": 2.09, + "learning_rate": 4.793787309715871e-05, + "loss": 1.2542, + "step": 12070 + }, + { + "epoch": 2.1, + "learning_rate": 4.779093281812042e-05, + "loss": 1.2344, + "step": 12080 + }, + { + "epoch": 2.1, + "learning_rate": 4.7627847421449165e-05, + "loss": 1.2916, + "step": 12090 + }, + { + "epoch": 2.1, + "learning_rate": 4.746495373873521e-05, + "loss": 1.2703, + "step": 12100 + }, + { + "epoch": 2.1, + "learning_rate": 4.730225236626855e-05, + "loss": 1.3033, + "step": 12110 + }, + { + "epoch": 2.1, + "learning_rate": 4.713974389963527e-05, + "loss": 1.2804, + "step": 12120 + }, + { + "epoch": 2.11, + "learning_rate": 4.697742893371525e-05, + "loss": 1.317, + "step": 12130 + }, + { + "epoch": 2.11, + "learning_rate": 4.6815308062680086e-05, + "loss": 1.3142, + "step": 12140 + }, + { + "epoch": 2.11, + "learning_rate": 4.665338187999084e-05, + "loss": 1.2592, + "step": 12150 + }, + { + "epoch": 2.11, + "learning_rate": 4.649165097839591e-05, + "loss": 1.2689, + "step": 12160 + }, + { + "epoch": 2.11, + "learning_rate": 4.6330115949928876e-05, + "loss": 1.2734, + "step": 12170 + }, + { + "epoch": 2.11, + "learning_rate": 4.618490238457079e-05, + "loss": 1.2713, + "step": 12180 + }, + { + "epoch": 2.12, + "learning_rate": 4.602374114352934e-05, + "loss": 1.3216, + "step": 12190 + }, + { + "epoch": 2.12, + "learning_rate": 4.586277748845055e-05, + "loss": 1.2775, + "step": 12200 + }, + { + "epoch": 2.12, + "learning_rate": 4.570201200855939e-05, + "loss": 1.2749, + "step": 12210 + }, + { + "epoch": 2.12, + "learning_rate": 4.554144529235537e-05, + "loss": 1.2809, + "step": 12220 + }, + { + "epoch": 2.12, + "learning_rate": 4.538107792761041e-05, + "loss": 1.2817, + "step": 12230 + }, + { + "epoch": 2.12, + "learning_rate": 4.522091050136663e-05, + "loss": 1.2324, + "step": 12240 + }, + { + "epoch": 2.13, + "learning_rate": 4.50609435999344e-05, + "loss": 1.2649, + "step": 12250 + }, + { + "epoch": 2.13, + "learning_rate": 4.4901177808889936e-05, + "loss": 1.2493, + "step": 12260 + }, + { + "epoch": 2.13, + "learning_rate": 4.474161371307322e-05, + "loss": 1.2946, + "step": 12270 + }, + { + "epoch": 2.13, + "learning_rate": 4.458225189658598e-05, + "loss": 1.2523, + "step": 12280 + }, + { + "epoch": 2.13, + "learning_rate": 4.44230929427895e-05, + "loss": 1.2769, + "step": 12290 + }, + { + "epoch": 2.13, + "learning_rate": 4.426413743430241e-05, + "loss": 1.2823, + "step": 12300 + }, + { + "epoch": 2.14, + "learning_rate": 4.410538595299864e-05, + "loss": 1.2536, + "step": 12310 + }, + { + "epoch": 2.14, + "learning_rate": 4.3962684543383956e-05, + "loss": 1.2686, + "step": 12320 + }, + { + "epoch": 2.14, + "learning_rate": 4.380432231411452e-05, + "loss": 1.2826, + "step": 12330 + }, + { + "epoch": 2.14, + "learning_rate": 4.364616579523162e-05, + "loss": 1.2906, + "step": 12340 + }, + { + "epoch": 2.14, + "learning_rate": 4.348821556568439e-05, + "loss": 1.2777, + "step": 12350 + }, + { + "epoch": 2.15, + "learning_rate": 4.3346237214366844e-05, + "loss": 1.2865, + "step": 12360 + }, + { + "epoch": 2.15, + "learning_rate": 4.3188680526855985e-05, + "loss": 1.2784, + "step": 12370 + }, + { + "epoch": 2.15, + "learning_rate": 4.303133180335535e-05, + "loss": 1.2866, + "step": 12380 + }, + { + "epoch": 2.15, + "learning_rate": 4.287419161985704e-05, + "loss": 1.2954, + "step": 12390 + }, + { + "epoch": 2.15, + "learning_rate": 4.2717260551589775e-05, + "loss": 1.2677, + "step": 12400 + }, + { + "epoch": 2.15, + "learning_rate": 4.2560539173016813e-05, + "loss": 1.2825, + "step": 12410 + }, + { + "epoch": 2.16, + "learning_rate": 4.240402805783377e-05, + "loss": 1.2749, + "step": 12420 + }, + { + "epoch": 2.16, + "learning_rate": 4.224772777896659e-05, + "loss": 1.2797, + "step": 12430 + }, + { + "epoch": 2.16, + "learning_rate": 4.209163890856951e-05, + "loss": 1.2819, + "step": 12440 + }, + { + "epoch": 2.16, + "learning_rate": 4.193576201802268e-05, + "loss": 1.2607, + "step": 12450 + }, + { + "epoch": 2.16, + "learning_rate": 4.1780097677930485e-05, + "loss": 1.2767, + "step": 12460 + }, + { + "epoch": 2.16, + "learning_rate": 4.162464645811913e-05, + "loss": 1.2333, + "step": 12470 + }, + { + "epoch": 2.17, + "learning_rate": 4.146940892763472e-05, + "loss": 1.2667, + "step": 12480 + }, + { + "epoch": 2.17, + "learning_rate": 4.131438565474112e-05, + "loss": 1.3182, + "step": 12490 + }, + { + "epoch": 2.17, + "learning_rate": 4.11595772069178e-05, + "loss": 1.2889, + "step": 12500 + }, + { + "epoch": 2.17, + "learning_rate": 4.100498415085804e-05, + "loss": 1.3046, + "step": 12510 + }, + { + "epoch": 2.17, + "learning_rate": 4.085060705246642e-05, + "loss": 1.2576, + "step": 12520 + }, + { + "epoch": 2.17, + "learning_rate": 4.069644647685712e-05, + "loss": 1.2588, + "step": 12530 + }, + { + "epoch": 2.18, + "learning_rate": 4.0542502988351686e-05, + "loss": 1.2901, + "step": 12540 + }, + { + "epoch": 2.18, + "learning_rate": 4.038877715047699e-05, + "loss": 1.3228, + "step": 12550 + }, + { + "epoch": 2.18, + "learning_rate": 4.0250610452792004e-05, + "loss": 1.2813, + "step": 12560 + }, + { + "epoch": 2.18, + "learning_rate": 4.011262091761672e-05, + "loss": 1.3074, + "step": 12570 + }, + { + "epoch": 2.18, + "learning_rate": 3.997480895410295e-05, + "loss": 1.2753, + "step": 12580 + }, + { + "epoch": 2.19, + "learning_rate": 3.9821893310242744e-05, + "loss": 1.2519, + "step": 12590 + }, + { + "epoch": 2.19, + "learning_rate": 3.966919795488333e-05, + "loss": 1.331, + "step": 12600 + }, + { + "epoch": 2.19, + "learning_rate": 3.9516723446982664e-05, + "loss": 1.3126, + "step": 12610 + }, + { + "epoch": 2.19, + "learning_rate": 3.936447034469024e-05, + "loss": 1.2616, + "step": 12620 + }, + { + "epoch": 2.19, + "learning_rate": 3.92124392053451e-05, + "loss": 1.2957, + "step": 12630 + }, + { + "epoch": 2.19, + "learning_rate": 3.9060630585473746e-05, + "loss": 1.309, + "step": 12640 + }, + { + "epoch": 2.2, + "learning_rate": 3.890904504078814e-05, + "loss": 1.2873, + "step": 12650 + }, + { + "epoch": 2.2, + "learning_rate": 3.8757683126183654e-05, + "loss": 1.283, + "step": 12660 + }, + { + "epoch": 2.2, + "learning_rate": 3.8606545395737005e-05, + "loss": 1.3069, + "step": 12670 + }, + { + "epoch": 2.2, + "learning_rate": 3.84556324027043e-05, + "loss": 1.2527, + "step": 12680 + }, + { + "epoch": 2.2, + "learning_rate": 3.8304944699518954e-05, + "loss": 1.2922, + "step": 12690 + }, + { + "epoch": 2.2, + "learning_rate": 3.816951884539331e-05, + "loss": 1.2795, + "step": 12700 + }, + { + "epoch": 2.21, + "learning_rate": 3.801926071191671e-05, + "loss": 1.285, + "step": 12710 + }, + { + "epoch": 2.21, + "learning_rate": 3.786922946567352e-05, + "loss": 1.2804, + "step": 12720 + }, + { + "epoch": 2.21, + "learning_rate": 3.771942565586933e-05, + "loss": 1.318, + "step": 12730 + }, + { + "epoch": 2.21, + "learning_rate": 3.7569849830877333e-05, + "loss": 1.3102, + "step": 12740 + }, + { + "epoch": 2.21, + "learning_rate": 3.742050253823604e-05, + "loss": 1.3083, + "step": 12750 + }, + { + "epoch": 2.21, + "learning_rate": 3.7286285821885306e-05, + "loss": 1.2854, + "step": 12760 + }, + { + "epoch": 2.22, + "learning_rate": 3.713737424618142e-05, + "loss": 1.305, + "step": 12770 + }, + { + "epoch": 2.22, + "learning_rate": 3.6988692785952173e-05, + "loss": 1.2948, + "step": 12780 + }, + { + "epoch": 2.22, + "learning_rate": 3.68402419854622e-05, + "loss": 1.2779, + "step": 12790 + }, + { + "epoch": 2.22, + "learning_rate": 3.6692022388131795e-05, + "loss": 1.3145, + "step": 12800 + }, + { + "epoch": 2.22, + "learning_rate": 3.654403453653494e-05, + "loss": 1.2673, + "step": 12810 + }, + { + "epoch": 2.23, + "learning_rate": 3.639627897239718e-05, + "loss": 1.2883, + "step": 12820 + }, + { + "epoch": 2.23, + "learning_rate": 3.6248756236593863e-05, + "loss": 1.2678, + "step": 12830 + }, + { + "epoch": 2.23, + "learning_rate": 3.6101466869147995e-05, + "loss": 1.2771, + "step": 12840 + }, + { + "epoch": 2.23, + "learning_rate": 3.5954411409228294e-05, + "loss": 1.3139, + "step": 12850 + }, + { + "epoch": 2.23, + "learning_rate": 3.580759039514729e-05, + "loss": 1.2914, + "step": 12860 + }, + { + "epoch": 2.23, + "learning_rate": 3.566100436435924e-05, + "loss": 1.2685, + "step": 12870 + }, + { + "epoch": 2.24, + "learning_rate": 3.551465385345826e-05, + "loss": 1.2932, + "step": 12880 + }, + { + "epoch": 2.24, + "learning_rate": 3.5383140205951094e-05, + "loss": 1.2751, + "step": 12890 + }, + { + "epoch": 2.24, + "learning_rate": 3.5237238658062945e-05, + "loss": 1.2775, + "step": 12900 + }, + { + "epoch": 2.24, + "learning_rate": 3.5091574181302256e-05, + "loss": 1.2826, + "step": 12910 + }, + { + "epoch": 2.24, + "learning_rate": 3.494614730888971e-05, + "loss": 1.2661, + "step": 12920 + }, + { + "epoch": 2.24, + "learning_rate": 3.480095857317618e-05, + "loss": 1.27, + "step": 12930 + }, + { + "epoch": 2.25, + "learning_rate": 3.4656008505640814e-05, + "loss": 1.3088, + "step": 12940 + }, + { + "epoch": 2.25, + "learning_rate": 3.4511297636889095e-05, + "loss": 1.263, + "step": 12950 + }, + { + "epoch": 2.25, + "learning_rate": 3.4366826496650886e-05, + "loss": 1.2896, + "step": 12960 + }, + { + "epoch": 2.25, + "learning_rate": 3.422259561377853e-05, + "loss": 1.2919, + "step": 12970 + }, + { + "epoch": 2.25, + "learning_rate": 3.4078605516244785e-05, + "loss": 1.2451, + "step": 12980 + }, + { + "epoch": 2.25, + "learning_rate": 3.396358715789669e-05, + "loss": 1.3136, + "step": 12990 + }, + { + "epoch": 2.26, + "learning_rate": 3.383437644428432e-05, + "loss": 1.2834, + "step": 13000 + }, + { + "epoch": 2.26, + "eval_loss": 0.78007972240448, + "eval_runtime": 61.984, + "eval_samples_per_second": 8.454, + "eval_steps_per_second": 0.532, + "step": 13000 + }, + { + "epoch": 2.26, + "learning_rate": 3.369103909748521e-05, + "loss": 1.2585, + "step": 13010 + }, + { + "epoch": 2.26, + "learning_rate": 3.354794448184514e-05, + "loss": 1.27, + "step": 13020 + }, + { + "epoch": 2.26, + "learning_rate": 3.340509312117752e-05, + "loss": 1.2923, + "step": 13030 + }, + { + "epoch": 2.26, + "learning_rate": 3.32624855384053e-05, + "loss": 1.2711, + "step": 13040 + }, + { + "epoch": 2.26, + "learning_rate": 3.3120122255559e-05, + "loss": 1.2891, + "step": 13050 + }, + { + "epoch": 2.27, + "learning_rate": 3.2978003793774914e-05, + "loss": 1.2635, + "step": 13060 + }, + { + "epoch": 2.27, + "learning_rate": 3.283613067329311e-05, + "loss": 1.2926, + "step": 13070 + }, + { + "epoch": 2.27, + "learning_rate": 3.269450341345558e-05, + "loss": 1.2621, + "step": 13080 + }, + { + "epoch": 2.27, + "learning_rate": 3.2553122532704325e-05, + "loss": 1.291, + "step": 13090 + }, + { + "epoch": 2.27, + "learning_rate": 3.241198854857938e-05, + "loss": 1.2658, + "step": 13100 + }, + { + "epoch": 2.28, + "learning_rate": 3.227110197771703e-05, + "loss": 1.2671, + "step": 13110 + }, + { + "epoch": 2.28, + "learning_rate": 3.213046333584792e-05, + "loss": 1.2686, + "step": 13120 + }, + { + "epoch": 2.28, + "learning_rate": 3.1990073137795066e-05, + "loss": 1.2723, + "step": 13130 + }, + { + "epoch": 2.28, + "learning_rate": 3.186393480377876e-05, + "loss": 1.3037, + "step": 13140 + }, + { + "epoch": 2.28, + "learning_rate": 3.172401806405554e-05, + "loss": 1.284, + "step": 13150 + }, + { + "epoch": 2.28, + "learning_rate": 3.1584351255985664e-05, + "loss": 1.2582, + "step": 13160 + }, + { + "epoch": 2.29, + "learning_rate": 3.144493489083469e-05, + "loss": 1.3076, + "step": 13170 + }, + { + "epoch": 2.29, + "learning_rate": 3.130576947895139e-05, + "loss": 1.3193, + "step": 13180 + }, + { + "epoch": 2.29, + "learning_rate": 3.1166855529765825e-05, + "loss": 1.2829, + "step": 13190 + }, + { + "epoch": 2.29, + "learning_rate": 3.102819355178763e-05, + "loss": 1.3018, + "step": 13200 + }, + { + "epoch": 2.29, + "learning_rate": 3.0889784052604066e-05, + "loss": 1.2842, + "step": 13210 + }, + { + "epoch": 2.29, + "learning_rate": 3.075162753887814e-05, + "loss": 1.2816, + "step": 13220 + }, + { + "epoch": 2.3, + "learning_rate": 3.061372451634678e-05, + "loss": 1.2915, + "step": 13230 + }, + { + "epoch": 2.3, + "learning_rate": 3.0503584951244668e-05, + "loss": 1.2235, + "step": 13240 + }, + { + "epoch": 2.3, + "learning_rate": 3.0366139484357482e-05, + "loss": 1.2766, + "step": 13250 + }, + { + "epoch": 2.3, + "learning_rate": 3.0228948919785782e-05, + "loss": 1.2729, + "step": 13260 + }, + { + "epoch": 2.3, + "learning_rate": 3.0092013759730564e-05, + "loss": 1.2784, + "step": 13270 + }, + { + "epoch": 2.3, + "learning_rate": 2.9955334505457845e-05, + "loss": 1.2827, + "step": 13280 + }, + { + "epoch": 2.31, + "learning_rate": 2.981891165729691e-05, + "loss": 1.279, + "step": 13290 + }, + { + "epoch": 2.31, + "learning_rate": 2.9682745714638417e-05, + "loss": 1.2917, + "step": 13300 + }, + { + "epoch": 2.31, + "learning_rate": 2.9546837175932596e-05, + "loss": 1.2764, + "step": 13310 + }, + { + "epoch": 2.31, + "learning_rate": 2.941118653868744e-05, + "loss": 1.3066, + "step": 13320 + }, + { + "epoch": 2.31, + "learning_rate": 2.9289321881345254e-05, + "loss": 1.2945, + "step": 13330 + }, + { + "epoch": 2.32, + "learning_rate": 2.9154162624127146e-05, + "loss": 1.2654, + "step": 13340 + }, + { + "epoch": 2.32, + "learning_rate": 2.9019262705797567e-05, + "loss": 1.2715, + "step": 13350 + }, + { + "epoch": 2.32, + "learning_rate": 2.888462262017233e-05, + "loss": 1.3311, + "step": 13360 + }, + { + "epoch": 2.32, + "learning_rate": 2.875024286011615e-05, + "loss": 1.306, + "step": 13370 + }, + { + "epoch": 2.32, + "learning_rate": 2.8616123917540673e-05, + "loss": 1.2865, + "step": 13380 + }, + { + "epoch": 2.32, + "learning_rate": 2.848226628340287e-05, + "loss": 1.2676, + "step": 13390 + }, + { + "epoch": 2.33, + "learning_rate": 2.8348670447703218e-05, + "loss": 1.2997, + "step": 13400 + }, + { + "epoch": 2.33, + "learning_rate": 2.8215336899483768e-05, + "loss": 1.2863, + "step": 13410 + }, + { + "epoch": 2.33, + "learning_rate": 2.808226612682646e-05, + "loss": 1.2532, + "step": 13420 + }, + { + "epoch": 2.33, + "learning_rate": 2.7949458616851343e-05, + "loss": 1.2524, + "step": 13430 + }, + { + "epoch": 2.33, + "learning_rate": 2.781691485571475e-05, + "loss": 1.263, + "step": 13440 + }, + { + "epoch": 2.33, + "learning_rate": 2.7684635328607477e-05, + "loss": 1.2607, + "step": 13450 + }, + { + "epoch": 2.34, + "learning_rate": 2.7552620519753137e-05, + "loss": 1.2505, + "step": 13460 + }, + { + "epoch": 2.34, + "learning_rate": 2.742087091240628e-05, + "loss": 1.2687, + "step": 13470 + }, + { + "epoch": 2.34, + "learning_rate": 2.7302523411710645e-05, + "loss": 1.2855, + "step": 13480 + }, + { + "epoch": 2.34, + "learning_rate": 2.7171279015116002e-05, + "loss": 1.2412, + "step": 13490 + }, + { + "epoch": 2.34, + "learning_rate": 2.7040301215970876e-05, + "loss": 1.2575, + "step": 13500 + }, + { + "epoch": 2.34, + "learning_rate": 2.6922649533852228e-05, + "loss": 1.3001, + "step": 13510 + }, + { + "epoch": 2.35, + "learning_rate": 2.6792179589961273e-05, + "loss": 1.2726, + "step": 13520 + }, + { + "epoch": 2.35, + "learning_rate": 2.66619776312545e-05, + "loss": 1.2727, + "step": 13530 + }, + { + "epoch": 2.35, + "learning_rate": 2.6532044134350288e-05, + "loss": 1.2466, + "step": 13540 + }, + { + "epoch": 2.35, + "learning_rate": 2.6402379574884418e-05, + "loss": 1.2975, + "step": 13550 + }, + { + "epoch": 2.35, + "learning_rate": 2.627298442750803e-05, + "loss": 1.2745, + "step": 13560 + }, + { + "epoch": 2.36, + "learning_rate": 2.614385916588613e-05, + "loss": 1.2988, + "step": 13570 + }, + { + "epoch": 2.36, + "learning_rate": 2.6015004262695798e-05, + "loss": 1.2541, + "step": 13580 + }, + { + "epoch": 2.36, + "learning_rate": 2.5886420189624407e-05, + "loss": 1.2596, + "step": 13590 + }, + { + "epoch": 2.36, + "learning_rate": 2.5758107417367915e-05, + "loss": 1.267, + "step": 13600 + }, + { + "epoch": 2.36, + "learning_rate": 2.5630066415629195e-05, + "loss": 1.2716, + "step": 13610 + }, + { + "epoch": 2.36, + "learning_rate": 2.550229765311628e-05, + "loss": 1.2824, + "step": 13620 + }, + { + "epoch": 2.37, + "learning_rate": 2.5400278969684065e-05, + "loss": 1.3018, + "step": 13630 + }, + { + "epoch": 2.37, + "learning_rate": 2.5273001415739562e-05, + "loss": 1.2786, + "step": 13640 + }, + { + "epoch": 2.37, + "learning_rate": 2.5145997408096057e-05, + "loss": 1.2634, + "step": 13650 + }, + { + "epoch": 2.37, + "learning_rate": 2.503192806757474e-05, + "loss": 1.3144, + "step": 13660 + }, + { + "epoch": 2.37, + "learning_rate": 2.4905445077906675e-05, + "loss": 1.2597, + "step": 13670 + }, + { + "epoch": 2.37, + "learning_rate": 2.477923698001955e-05, + "loss": 1.2646, + "step": 13680 + }, + { + "epoch": 2.38, + "learning_rate": 2.4653304235911823e-05, + "loss": 1.2471, + "step": 13690 + }, + { + "epoch": 2.38, + "learning_rate": 2.4527647306573998e-05, + "loss": 1.2835, + "step": 13700 + }, + { + "epoch": 2.38, + "learning_rate": 2.4402266651986927e-05, + "loss": 1.2674, + "step": 13710 + }, + { + "epoch": 2.38, + "learning_rate": 2.4277162731120108e-05, + "loss": 1.2984, + "step": 13720 + }, + { + "epoch": 2.38, + "learning_rate": 2.4152336001930054e-05, + "loss": 1.2879, + "step": 13730 + }, + { + "epoch": 2.38, + "learning_rate": 2.4027786921358607e-05, + "loss": 1.2361, + "step": 13740 + }, + { + "epoch": 2.39, + "learning_rate": 2.3903515945331155e-05, + "loss": 1.3072, + "step": 13750 + }, + { + "epoch": 2.39, + "learning_rate": 2.3779523528755145e-05, + "loss": 1.2665, + "step": 13760 + }, + { + "epoch": 2.39, + "learning_rate": 2.3655810125518284e-05, + "loss": 1.3312, + "step": 13770 + }, + { + "epoch": 2.39, + "learning_rate": 2.3532376188486948e-05, + "loss": 1.3107, + "step": 13780 + }, + { + "epoch": 2.39, + "learning_rate": 2.340922216950443e-05, + "loss": 1.2497, + "step": 13790 + }, + { + "epoch": 2.4, + "learning_rate": 2.328634851938949e-05, + "loss": 1.3204, + "step": 13800 + }, + { + "epoch": 2.4, + "learning_rate": 2.316375568793443e-05, + "loss": 1.2692, + "step": 13810 + }, + { + "epoch": 2.4, + "learning_rate": 2.3041444123903668e-05, + "loss": 1.26, + "step": 13820 + }, + { + "epoch": 2.4, + "learning_rate": 2.2919414275031914e-05, + "loss": 1.2608, + "step": 13830 + }, + { + "epoch": 2.4, + "learning_rate": 2.2797666588022748e-05, + "loss": 1.2862, + "step": 13840 + }, + { + "epoch": 2.4, + "learning_rate": 2.2676201508546792e-05, + "loss": 1.2762, + "step": 13850 + }, + { + "epoch": 2.41, + "learning_rate": 2.2567124933972495e-05, + "loss": 1.3093, + "step": 13860 + }, + { + "epoch": 2.41, + "learning_rate": 2.2470360715755768e-05, + "loss": 1.2785, + "step": 13870 + }, + { + "epoch": 2.41, + "learning_rate": 2.2349660894643332e-05, + "loss": 1.2841, + "step": 13880 + }, + { + "epoch": 2.41, + "learning_rate": 2.222924532103765e-05, + "loss": 1.2676, + "step": 13890 + }, + { + "epoch": 2.41, + "learning_rate": 2.2109114435733026e-05, + "loss": 1.2522, + "step": 13900 + }, + { + "epoch": 2.41, + "learning_rate": 2.19892686784816e-05, + "loss": 1.2573, + "step": 13910 + }, + { + "epoch": 2.42, + "learning_rate": 2.1869708487991812e-05, + "loss": 1.2572, + "step": 13920 + }, + { + "epoch": 2.42, + "learning_rate": 2.1750434301926704e-05, + "loss": 1.23, + "step": 13930 + }, + { + "epoch": 2.42, + "learning_rate": 2.163144655690249e-05, + "loss": 1.2547, + "step": 13940 + }, + { + "epoch": 2.42, + "learning_rate": 2.1512745688486646e-05, + "loss": 1.29, + "step": 13950 + }, + { + "epoch": 2.42, + "learning_rate": 2.139433213119664e-05, + "loss": 1.2863, + "step": 13960 + }, + { + "epoch": 2.42, + "learning_rate": 2.127620631849816e-05, + "loss": 1.2471, + "step": 13970 + }, + { + "epoch": 2.43, + "learning_rate": 2.11583686828036e-05, + "loss": 1.2756, + "step": 13980 + }, + { + "epoch": 2.43, + "learning_rate": 2.104081965547041e-05, + "loss": 1.272, + "step": 13990 + }, + { + "epoch": 2.43, + "learning_rate": 2.092355966679961e-05, + "loss": 1.2714, + "step": 14000 + }, + { + "epoch": 2.43, + "eval_loss": 0.7770761251449585, + "eval_runtime": 62.0435, + "eval_samples_per_second": 8.446, + "eval_steps_per_second": 0.532, + "step": 14000 + }, + { + "epoch": 2.43, + "learning_rate": 2.080658914603415e-05, + "loss": 1.3074, + "step": 14010 + }, + { + "epoch": 2.43, + "learning_rate": 2.068990852135728e-05, + "loss": 1.2862, + "step": 14020 + }, + { + "epoch": 2.44, + "learning_rate": 2.057351821989113e-05, + "loss": 1.285, + "step": 14030 + }, + { + "epoch": 2.44, + "learning_rate": 2.045741866769507e-05, + "loss": 1.2885, + "step": 14040 + }, + { + "epoch": 2.44, + "learning_rate": 2.034161028976408e-05, + "loss": 1.2729, + "step": 14050 + }, + { + "epoch": 2.44, + "learning_rate": 2.0226093510027388e-05, + "loss": 1.2783, + "step": 14060 + }, + { + "epoch": 2.44, + "learning_rate": 2.0110868751346678e-05, + "loss": 1.2502, + "step": 14070 + }, + { + "epoch": 2.44, + "learning_rate": 1.999593643551475e-05, + "loss": 1.2787, + "step": 14080 + }, + { + "epoch": 2.45, + "learning_rate": 1.9881296983253773e-05, + "loss": 1.2514, + "step": 14090 + }, + { + "epoch": 2.45, + "learning_rate": 1.9766950814213946e-05, + "loss": 1.3199, + "step": 14100 + }, + { + "epoch": 2.45, + "learning_rate": 1.966429036520796e-05, + "loss": 1.321, + "step": 14110 + }, + { + "epoch": 2.45, + "learning_rate": 1.9550502586578255e-05, + "loss": 1.2912, + "step": 14120 + }, + { + "epoch": 2.45, + "learning_rate": 1.9437009302078558e-05, + "loss": 1.2402, + "step": 14130 + }, + { + "epoch": 2.45, + "learning_rate": 1.9323810927163365e-05, + "loss": 1.2623, + "step": 14140 + }, + { + "epoch": 2.46, + "learning_rate": 1.921090787620764e-05, + "loss": 1.2941, + "step": 14150 + }, + { + "epoch": 2.46, + "learning_rate": 1.9098300562505266e-05, + "loss": 1.2472, + "step": 14160 + }, + { + "epoch": 2.46, + "learning_rate": 1.8985989398267557e-05, + "loss": 1.2568, + "step": 14170 + }, + { + "epoch": 2.46, + "learning_rate": 1.887397479462174e-05, + "loss": 1.2569, + "step": 14180 + }, + { + "epoch": 2.46, + "learning_rate": 1.8762257161609442e-05, + "loss": 1.2837, + "step": 14190 + }, + { + "epoch": 2.46, + "learning_rate": 1.865083690818521e-05, + "loss": 1.255, + "step": 14200 + }, + { + "epoch": 2.47, + "learning_rate": 1.8550813276774915e-05, + "loss": 1.264, + "step": 14210 + }, + { + "epoch": 2.47, + "learning_rate": 1.845103114979575e-05, + "loss": 1.2722, + "step": 14220 + }, + { + "epoch": 2.47, + "learning_rate": 1.8340445725584443e-05, + "loss": 1.2828, + "step": 14230 + }, + { + "epoch": 2.47, + "learning_rate": 1.8230159225047806e-05, + "loss": 1.2776, + "step": 14240 + }, + { + "epoch": 2.47, + "learning_rate": 1.8120172051901564e-05, + "loss": 1.2505, + "step": 14250 + }, + { + "epoch": 2.47, + "learning_rate": 1.801048460876572e-05, + "loss": 1.2663, + "step": 14260 + }, + { + "epoch": 2.48, + "learning_rate": 1.7901097297163094e-05, + "loss": 1.2305, + "step": 14270 + }, + { + "epoch": 2.48, + "learning_rate": 1.779201051751783e-05, + "loss": 1.2955, + "step": 14280 + }, + { + "epoch": 2.48, + "learning_rate": 1.768322466915392e-05, + "loss": 1.2682, + "step": 14290 + }, + { + "epoch": 2.48, + "learning_rate": 1.7574740150293778e-05, + "loss": 1.2796, + "step": 14300 + }, + { + "epoch": 2.48, + "learning_rate": 1.746655735805681e-05, + "loss": 1.2872, + "step": 14310 + }, + { + "epoch": 2.49, + "learning_rate": 1.7380228633595075e-05, + "loss": 1.2768, + "step": 14320 + }, + { + "epoch": 2.49, + "learning_rate": 1.7272589946494132e-05, + "loss": 1.2804, + "step": 14330 + }, + { + "epoch": 2.49, + "learning_rate": 1.7165254092070015e-05, + "loss": 1.2582, + "step": 14340 + }, + { + "epoch": 2.49, + "learning_rate": 1.7058221463237277e-05, + "loss": 1.3296, + "step": 14350 + }, + { + "epoch": 2.49, + "learning_rate": 1.695149245180051e-05, + "loss": 1.2726, + "step": 14360 + }, + { + "epoch": 2.49, + "learning_rate": 1.685569625731185e-05, + "loss": 1.273, + "step": 14370 + }, + { + "epoch": 2.5, + "learning_rate": 1.6749545194367288e-05, + "loss": 1.3058, + "step": 14380 + }, + { + "epoch": 2.5, + "learning_rate": 1.6643698878761716e-05, + "loss": 1.295, + "step": 14390 + }, + { + "epoch": 2.5, + "learning_rate": 1.6538157697957113e-05, + "loss": 1.3008, + "step": 14400 + }, + { + "epoch": 2.5, + "learning_rate": 1.643292203829839e-05, + "loss": 1.2531, + "step": 14410 + }, + { + "epoch": 2.5, + "learning_rate": 1.632799228501215e-05, + "loss": 1.2844, + "step": 14420 + }, + { + "epoch": 2.5, + "learning_rate": 1.622336882220514e-05, + "loss": 1.2638, + "step": 14430 + }, + { + "epoch": 2.51, + "learning_rate": 1.6119052032862915e-05, + "loss": 1.2261, + "step": 14440 + }, + { + "epoch": 2.51, + "learning_rate": 1.601504229884846e-05, + "loss": 1.2561, + "step": 14450 + }, + { + "epoch": 2.51, + "learning_rate": 1.5911340000900688e-05, + "loss": 1.2693, + "step": 14460 + }, + { + "epoch": 2.51, + "learning_rate": 1.580794551863316e-05, + "loss": 1.267, + "step": 14470 + }, + { + "epoch": 2.51, + "learning_rate": 1.5704859230532563e-05, + "loss": 1.3048, + "step": 14480 + }, + { + "epoch": 2.51, + "learning_rate": 1.560208151395749e-05, + "loss": 1.2803, + "step": 14490 + }, + { + "epoch": 2.52, + "learning_rate": 1.549961274513695e-05, + "loss": 1.2607, + "step": 14500 + }, + { + "epoch": 2.52, + "learning_rate": 1.5407655313570525e-05, + "loss": 1.275, + "step": 14510 + }, + { + "epoch": 2.52, + "learning_rate": 1.5315948706191573e-05, + "loss": 1.2627, + "step": 14520 + }, + { + "epoch": 2.52, + "learning_rate": 1.5214346982990213e-05, + "loss": 1.2514, + "step": 14530 + }, + { + "epoch": 2.52, + "learning_rate": 1.5113055626887762e-05, + "loss": 1.2496, + "step": 14540 + }, + { + "epoch": 2.53, + "learning_rate": 1.5012075008672267e-05, + "loss": 1.3028, + "step": 14550 + }, + { + "epoch": 2.53, + "learning_rate": 1.4911405497994235e-05, + "loss": 1.2599, + "step": 14560 + }, + { + "epoch": 2.53, + "learning_rate": 1.4811047463365357e-05, + "loss": 1.2633, + "step": 14570 + }, + { + "epoch": 2.53, + "learning_rate": 1.4711001272157132e-05, + "loss": 1.2443, + "step": 14580 + }, + { + "epoch": 2.53, + "learning_rate": 1.4611267290599528e-05, + "loss": 1.3036, + "step": 14590 + }, + { + "epoch": 2.53, + "learning_rate": 1.4511845883779607e-05, + "loss": 1.2934, + "step": 14600 + }, + { + "epoch": 2.54, + "learning_rate": 1.4412737415640232e-05, + "loss": 1.2485, + "step": 14610 + }, + { + "epoch": 2.54, + "learning_rate": 1.4313942248978752e-05, + "loss": 1.2625, + "step": 14620 + }, + { + "epoch": 2.54, + "learning_rate": 1.4235131935781309e-05, + "loss": 1.272, + "step": 14630 + }, + { + "epoch": 2.54, + "learning_rate": 1.4136901622367581e-05, + "loss": 1.2825, + "step": 14640 + }, + { + "epoch": 2.54, + "learning_rate": 1.403898562015863e-05, + "loss": 1.2842, + "step": 14650 + }, + { + "epoch": 2.54, + "learning_rate": 1.3941384287586633e-05, + "loss": 1.2833, + "step": 14660 + }, + { + "epoch": 2.55, + "learning_rate": 1.384409798193188e-05, + "loss": 1.2957, + "step": 14670 + }, + { + "epoch": 2.55, + "learning_rate": 1.3747127059321474e-05, + "loss": 1.2412, + "step": 14680 + }, + { + "epoch": 2.55, + "learning_rate": 1.3650471874727967e-05, + "loss": 1.2911, + "step": 14690 + }, + { + "epoch": 2.55, + "learning_rate": 1.3554132781968232e-05, + "loss": 1.3062, + "step": 14700 + }, + { + "epoch": 2.55, + "learning_rate": 1.3458110133701962e-05, + "loss": 1.2822, + "step": 14710 + }, + { + "epoch": 2.55, + "learning_rate": 1.3362404281430497e-05, + "loss": 1.2376, + "step": 14720 + }, + { + "epoch": 2.56, + "learning_rate": 1.3267015575495512e-05, + "loss": 1.2577, + "step": 14730 + }, + { + "epoch": 2.56, + "learning_rate": 1.3171944365077748e-05, + "loss": 1.2595, + "step": 14740 + }, + { + "epoch": 2.56, + "learning_rate": 1.307719099819571e-05, + "loss": 1.2946, + "step": 14750 + }, + { + "epoch": 2.56, + "learning_rate": 1.2982755821704372e-05, + "loss": 1.2915, + "step": 14760 + }, + { + "epoch": 2.56, + "learning_rate": 1.288863918129396e-05, + "loss": 1.2599, + "step": 14770 + }, + { + "epoch": 2.57, + "learning_rate": 1.2794841421488679e-05, + "loss": 1.2552, + "step": 14780 + }, + { + "epoch": 2.57, + "learning_rate": 1.2710696364389941e-05, + "loss": 1.2647, + "step": 14790 + }, + { + "epoch": 2.57, + "learning_rate": 1.2626810128213363e-05, + "loss": 1.2425, + "step": 14800 + }, + { + "epoch": 2.57, + "learning_rate": 1.2533907057030315e-05, + "loss": 1.2571, + "step": 14810 + }, + { + "epoch": 2.57, + "learning_rate": 1.244132416498789e-05, + "loss": 1.297, + "step": 14820 + }, + { + "epoch": 2.57, + "learning_rate": 1.2349061790995841e-05, + "loss": 1.2686, + "step": 14830 + }, + { + "epoch": 2.58, + "learning_rate": 1.225712027279059e-05, + "loss": 1.2944, + "step": 14840 + }, + { + "epoch": 2.58, + "learning_rate": 1.21654999469341e-05, + "loss": 1.2394, + "step": 14850 + }, + { + "epoch": 2.58, + "learning_rate": 1.2074201148812537e-05, + "loss": 1.2908, + "step": 14860 + }, + { + "epoch": 2.58, + "learning_rate": 1.1983224212635024e-05, + "loss": 1.2721, + "step": 14870 + }, + { + "epoch": 2.58, + "learning_rate": 1.1892569471432557e-05, + "loss": 1.2818, + "step": 14880 + }, + { + "epoch": 2.58, + "learning_rate": 1.1802237257056659e-05, + "loss": 1.2811, + "step": 14890 + }, + { + "epoch": 2.59, + "learning_rate": 1.171222790017823e-05, + "loss": 1.2835, + "step": 14900 + }, + { + "epoch": 2.59, + "learning_rate": 1.1622541730286296e-05, + "loss": 1.2731, + "step": 14910 + }, + { + "epoch": 2.59, + "learning_rate": 1.153317907568684e-05, + "loss": 1.2946, + "step": 14920 + }, + { + "epoch": 2.59, + "learning_rate": 1.1444140263501591e-05, + "loss": 1.2726, + "step": 14930 + }, + { + "epoch": 2.59, + "learning_rate": 1.135542561966675e-05, + "loss": 1.2807, + "step": 14940 + }, + { + "epoch": 2.59, + "learning_rate": 1.1275859872585081e-05, + "loss": 1.2817, + "step": 14950 + }, + { + "epoch": 2.6, + "learning_rate": 1.1187762042319471e-05, + "loss": 1.2802, + "step": 14960 + }, + { + "epoch": 2.6, + "learning_rate": 1.1108751952271423e-05, + "loss": 1.2721, + "step": 14970 + }, + { + "epoch": 2.6, + "learning_rate": 1.1021272099769108e-05, + "loss": 1.2398, + "step": 14980 + }, + { + "epoch": 2.6, + "learning_rate": 1.093411796357211e-05, + "loss": 1.2574, + "step": 14990 + }, + { + "epoch": 2.6, + "learning_rate": 1.0847289862717614e-05, + "loss": 1.2228, + "step": 15000 + }, + { + "epoch": 2.6, + "eval_loss": 0.7751675248146057, + "eval_runtime": 62.0277, + "eval_samples_per_second": 8.448, + "eval_steps_per_second": 0.532, + "step": 15000 + }, + { + "epoch": 2.61, + "learning_rate": 1.0760788115049313e-05, + "loss": 1.3108, + "step": 15010 + }, + { + "epoch": 2.61, + "learning_rate": 1.0674613037216263e-05, + "loss": 1.2385, + "step": 15020 + }, + { + "epoch": 2.61, + "learning_rate": 1.0588764944671713e-05, + "loss": 1.2627, + "step": 15030 + }, + { + "epoch": 2.61, + "learning_rate": 1.0503244151671942e-05, + "loss": 1.2532, + "step": 15040 + }, + { + "epoch": 2.61, + "learning_rate": 1.0426555537850258e-05, + "loss": 1.2731, + "step": 15050 + }, + { + "epoch": 2.61, + "learning_rate": 1.034165747546959e-05, + "loss": 1.2618, + "step": 15060 + }, + { + "epoch": 2.62, + "learning_rate": 1.0257087617197447e-05, + "loss": 1.2941, + "step": 15070 + }, + { + "epoch": 2.62, + "learning_rate": 1.017284627261097e-05, + "loss": 1.229, + "step": 15080 + }, + { + "epoch": 2.62, + "learning_rate": 1.008893375008475e-05, + "loss": 1.3288, + "step": 15090 + }, + { + "epoch": 2.62, + "learning_rate": 1.0005350356789733e-05, + "loss": 1.2818, + "step": 15100 + }, + { + "epoch": 2.62, + "learning_rate": 9.922096398692005e-06, + "loss": 1.2817, + "step": 15110 + }, + { + "epoch": 2.62, + "learning_rate": 9.839172180551736e-06, + "loss": 1.2831, + "step": 15120 + }, + { + "epoch": 2.63, + "learning_rate": 9.756578005922001e-06, + "loss": 1.2657, + "step": 15130 + }, + { + "epoch": 2.63, + "learning_rate": 9.674314177147791e-06, + "loss": 1.2788, + "step": 15140 + }, + { + "epoch": 2.63, + "learning_rate": 9.592380995364781e-06, + "loss": 1.2736, + "step": 15150 + }, + { + "epoch": 2.63, + "learning_rate": 9.510778760498273e-06, + "loss": 1.262, + "step": 15160 + }, + { + "epoch": 2.63, + "learning_rate": 9.429507771262148e-06, + "loss": 1.2497, + "step": 15170 + }, + { + "epoch": 2.63, + "learning_rate": 9.348568325157681e-06, + "loss": 1.2698, + "step": 15180 + }, + { + "epoch": 2.64, + "learning_rate": 9.267960718472513e-06, + "loss": 1.2894, + "step": 15190 + }, + { + "epoch": 2.64, + "learning_rate": 9.187685246279565e-06, + "loss": 1.277, + "step": 15200 + }, + { + "epoch": 2.64, + "learning_rate": 9.107742202435876e-06, + "loss": 1.2803, + "step": 15210 + }, + { + "epoch": 2.64, + "learning_rate": 9.028131879581714e-06, + "loss": 1.2451, + "step": 15220 + }, + { + "epoch": 2.64, + "learning_rate": 8.948854569139287e-06, + "loss": 1.241, + "step": 15230 + }, + { + "epoch": 2.65, + "learning_rate": 8.8699105613118e-06, + "loss": 1.2558, + "step": 15240 + }, + { + "epoch": 2.65, + "learning_rate": 8.79914616687264e-06, + "loss": 1.2357, + "step": 15250 + }, + { + "epoch": 2.65, + "learning_rate": 8.720836229152817e-06, + "loss": 1.2819, + "step": 15260 + }, + { + "epoch": 2.65, + "learning_rate": 8.642860428733857e-06, + "loss": 1.288, + "step": 15270 + }, + { + "epoch": 2.65, + "learning_rate": 8.565219051054663e-06, + "loss": 1.283, + "step": 15280 + }, + { + "epoch": 2.65, + "learning_rate": 8.495627977514654e-06, + "loss": 1.2858, + "step": 15290 + }, + { + "epoch": 2.66, + "learning_rate": 8.41862278503991e-06, + "loss": 1.2931, + "step": 15300 + }, + { + "epoch": 2.66, + "learning_rate": 8.341952836151169e-06, + "loss": 1.2803, + "step": 15310 + }, + { + "epoch": 2.66, + "learning_rate": 8.265618411507148e-06, + "loss": 1.2773, + "step": 15320 + }, + { + "epoch": 2.66, + "learning_rate": 8.189619790538295e-06, + "loss": 1.2717, + "step": 15330 + }, + { + "epoch": 2.66, + "learning_rate": 8.113957251445836e-06, + "loss": 1.2474, + "step": 15340 + }, + { + "epoch": 2.66, + "learning_rate": 8.038631071200698e-06, + "loss": 1.2828, + "step": 15350 + }, + { + "epoch": 2.67, + "learning_rate": 7.963641525542564e-06, + "loss": 1.2829, + "step": 15360 + }, + { + "epoch": 2.67, + "learning_rate": 7.888988888978833e-06, + "loss": 1.2845, + "step": 15370 + }, + { + "epoch": 2.67, + "learning_rate": 7.814673434783604e-06, + "loss": 1.2726, + "step": 15380 + }, + { + "epoch": 2.67, + "learning_rate": 7.740695434996626e-06, + "loss": 1.2498, + "step": 15390 + }, + { + "epoch": 2.67, + "learning_rate": 7.667055160422431e-06, + "loss": 1.2746, + "step": 15400 + }, + { + "epoch": 2.67, + "learning_rate": 7.593752880629257e-06, + "loss": 1.271, + "step": 15410 + }, + { + "epoch": 2.68, + "learning_rate": 7.52078886394807e-06, + "loss": 1.256, + "step": 15420 + }, + { + "epoch": 2.68, + "learning_rate": 7.448163377471562e-06, + "loss": 1.2778, + "step": 15430 + }, + { + "epoch": 2.68, + "learning_rate": 7.375876687053251e-06, + "loss": 1.2898, + "step": 15440 + }, + { + "epoch": 2.68, + "learning_rate": 7.303929057306414e-06, + "loss": 1.2512, + "step": 15450 + }, + { + "epoch": 2.68, + "learning_rate": 7.23232075160315e-06, + "loss": 1.2936, + "step": 15460 + }, + { + "epoch": 2.68, + "learning_rate": 7.161052032073445e-06, + "loss": 1.2946, + "step": 15470 + }, + { + "epoch": 2.69, + "learning_rate": 7.097200746323862e-06, + "loss": 1.2764, + "step": 15480 + }, + { + "epoch": 2.69, + "learning_rate": 7.026577958239167e-06, + "loss": 1.2406, + "step": 15490 + }, + { + "epoch": 2.69, + "learning_rate": 6.956295509471921e-06, + "loss": 1.2662, + "step": 15500 + }, + { + "epoch": 2.69, + "learning_rate": 6.88635365729865e-06, + "loss": 1.244, + "step": 15510 + }, + { + "epoch": 2.69, + "learning_rate": 6.8167526577491034e-06, + "loss": 1.2808, + "step": 15520 + }, + { + "epoch": 2.7, + "learning_rate": 6.747492765605312e-06, + "loss": 1.3011, + "step": 15530 + }, + { + "epoch": 2.7, + "learning_rate": 6.678574234400659e-06, + "loss": 1.2447, + "step": 15540 + }, + { + "epoch": 2.7, + "learning_rate": 6.60999731641887e-06, + "loss": 1.2038, + "step": 15550 + }, + { + "epoch": 2.7, + "learning_rate": 6.548570377045693e-06, + "loss": 1.3007, + "step": 15560 + }, + { + "epoch": 2.7, + "learning_rate": 6.480643214749759e-06, + "loss": 1.2823, + "step": 15570 + }, + { + "epoch": 2.7, + "learning_rate": 6.413058390224724e-06, + "loss": 1.2388, + "step": 15580 + }, + { + "epoch": 2.71, + "learning_rate": 6.345816150872197e-06, + "loss": 1.2874, + "step": 15590 + }, + { + "epoch": 2.71, + "learning_rate": 6.278916742839691e-06, + "loss": 1.2493, + "step": 15600 + }, + { + "epoch": 2.71, + "learning_rate": 6.2123604110197686e-06, + "loss": 1.282, + "step": 15610 + }, + { + "epoch": 2.71, + "learning_rate": 6.146147399049107e-06, + "loss": 1.2575, + "step": 15620 + }, + { + "epoch": 2.71, + "learning_rate": 6.0802779493076665e-06, + "loss": 1.286, + "step": 15630 + }, + { + "epoch": 2.71, + "learning_rate": 6.014752302917681e-06, + "loss": 1.281, + "step": 15640 + }, + { + "epoch": 2.72, + "learning_rate": 5.949570699742935e-06, + "loss": 1.2855, + "step": 15650 + }, + { + "epoch": 2.72, + "learning_rate": 5.8847333783877635e-06, + "loss": 1.2316, + "step": 15660 + }, + { + "epoch": 2.72, + "learning_rate": 5.820240576196223e-06, + "loss": 1.2645, + "step": 15670 + }, + { + "epoch": 2.72, + "learning_rate": 5.7560925292512335e-06, + "loss": 1.2897, + "step": 15680 + }, + { + "epoch": 2.72, + "learning_rate": 5.69228947237368e-06, + "loss": 1.2823, + "step": 15690 + }, + { + "epoch": 2.72, + "learning_rate": 5.635161880753381e-06, + "loss": 1.2788, + "step": 15700 + }, + { + "epoch": 2.73, + "learning_rate": 5.572014947411885e-06, + "loss": 1.2814, + "step": 15710 + }, + { + "epoch": 2.73, + "learning_rate": 5.515478243480177e-06, + "loss": 1.2719, + "step": 15720 + }, + { + "epoch": 2.73, + "learning_rate": 5.452988268147996e-06, + "loss": 1.2618, + "step": 15730 + }, + { + "epoch": 2.73, + "learning_rate": 5.390844392429362e-06, + "loss": 1.3436, + "step": 15740 + }, + { + "epoch": 2.73, + "learning_rate": 5.329046843808683e-06, + "loss": 1.2658, + "step": 15750 + }, + { + "epoch": 2.74, + "learning_rate": 5.267595848502604e-06, + "loss": 1.2742, + "step": 15760 + }, + { + "epoch": 2.74, + "learning_rate": 5.2064916314591646e-06, + "loss": 1.2553, + "step": 15770 + }, + { + "epoch": 2.74, + "learning_rate": 5.145734416356996e-06, + "loss": 1.2679, + "step": 15780 + }, + { + "epoch": 2.74, + "learning_rate": 5.085324425604499e-06, + "loss": 1.2254, + "step": 15790 + }, + { + "epoch": 2.74, + "learning_rate": 5.025261880338994e-06, + "loss": 1.2656, + "step": 15800 + }, + { + "epoch": 2.74, + "learning_rate": 4.965547000425985e-06, + "loss": 1.2524, + "step": 15810 + }, + { + "epoch": 2.75, + "learning_rate": 4.9061800044582385e-06, + "loss": 1.2899, + "step": 15820 + }, + { + "epoch": 2.75, + "learning_rate": 4.853047328501259e-06, + "loss": 1.2837, + "step": 15830 + }, + { + "epoch": 2.75, + "learning_rate": 4.794341909691191e-06, + "loss": 1.2689, + "step": 15840 + }, + { + "epoch": 2.75, + "learning_rate": 4.735985001541243e-06, + "loss": 1.2794, + "step": 15850 + }, + { + "epoch": 2.75, + "learning_rate": 4.677976817673235e-06, + "loss": 1.2599, + "step": 15860 + }, + { + "epoch": 2.75, + "learning_rate": 4.62031757043242e-06, + "loss": 1.2905, + "step": 15870 + }, + { + "epoch": 2.76, + "learning_rate": 4.563007470886749e-06, + "loss": 1.2726, + "step": 15880 + }, + { + "epoch": 2.76, + "learning_rate": 4.506046728826075e-06, + "loss": 1.2318, + "step": 15890 + }, + { + "epoch": 2.76, + "learning_rate": 4.449435552761372e-06, + "loss": 1.2712, + "step": 15900 + }, + { + "epoch": 2.76, + "learning_rate": 4.398784544532874e-06, + "loss": 1.3048, + "step": 15910 + }, + { + "epoch": 2.76, + "learning_rate": 4.342838113724712e-06, + "loss": 1.2803, + "step": 15920 + }, + { + "epoch": 2.76, + "learning_rate": 4.2872418463554055e-06, + "loss": 1.3073, + "step": 15930 + }, + { + "epoch": 2.77, + "learning_rate": 4.231995945941125e-06, + "loss": 1.2495, + "step": 15940 + }, + { + "epoch": 2.77, + "learning_rate": 4.1771006147155015e-06, + "loss": 1.2985, + "step": 15950 + }, + { + "epoch": 2.77, + "learning_rate": 4.122556053628868e-06, + "loss": 1.2603, + "step": 15960 + }, + { + "epoch": 2.77, + "learning_rate": 4.068362462347508e-06, + "loss": 1.2751, + "step": 15970 + }, + { + "epoch": 2.77, + "learning_rate": 4.014520039252956e-06, + "loss": 1.2502, + "step": 15980 + }, + { + "epoch": 2.78, + "learning_rate": 3.961028981441251e-06, + "loss": 1.2988, + "step": 15990 + }, + { + "epoch": 2.78, + "learning_rate": 3.907889484722238e-06, + "loss": 1.2901, + "step": 16000 + }, + { + "epoch": 2.78, + "eval_loss": 0.7744143605232239, + "eval_runtime": 61.9699, + "eval_samples_per_second": 8.456, + "eval_steps_per_second": 0.533, + "step": 16000 + }, + { + "epoch": 2.78, + "learning_rate": 3.855101743618806e-06, + "loss": 1.278, + "step": 16010 + }, + { + "epoch": 2.78, + "learning_rate": 3.8026659513662353e-06, + "loss": 1.2782, + "step": 16020 + }, + { + "epoch": 2.78, + "learning_rate": 3.7505822999114206e-06, + "loss": 1.266, + "step": 16030 + }, + { + "epoch": 2.78, + "learning_rate": 3.6988509799122494e-06, + "loss": 1.2606, + "step": 16040 + }, + { + "epoch": 2.79, + "learning_rate": 3.647472180736833e-06, + "loss": 1.2544, + "step": 16050 + }, + { + "epoch": 2.79, + "learning_rate": 3.5964460904628685e-06, + "loss": 1.2632, + "step": 16060 + }, + { + "epoch": 2.79, + "learning_rate": 3.5457728958768642e-06, + "loss": 1.2793, + "step": 16070 + }, + { + "epoch": 2.79, + "learning_rate": 3.495452782473596e-06, + "loss": 1.2691, + "step": 16080 + }, + { + "epoch": 2.79, + "learning_rate": 3.4454859344552835e-06, + "loss": 1.2889, + "step": 16090 + }, + { + "epoch": 2.79, + "learning_rate": 3.4008179643440496e-06, + "loss": 1.2798, + "step": 16100 + }, + { + "epoch": 2.8, + "learning_rate": 3.3515228234023422e-06, + "loss": 1.2751, + "step": 16110 + }, + { + "epoch": 2.8, + "learning_rate": 3.307459683817815e-06, + "loss": 1.2461, + "step": 16120 + }, + { + "epoch": 2.8, + "learning_rate": 3.2588369013774933e-06, + "loss": 1.2488, + "step": 16130 + }, + { + "epoch": 2.8, + "learning_rate": 3.210568250480306e-06, + "loss": 1.2522, + "step": 16140 + }, + { + "epoch": 2.8, + "learning_rate": 3.1626539078188687e-06, + "loss": 1.2958, + "step": 16150 + }, + { + "epoch": 2.8, + "learning_rate": 3.1150940487888804e-06, + "loss": 1.2353, + "step": 16160 + }, + { + "epoch": 2.81, + "learning_rate": 3.0678888474883316e-06, + "loss": 1.2498, + "step": 16170 + }, + { + "epoch": 2.81, + "learning_rate": 3.0210384767169975e-06, + "loss": 1.2708, + "step": 16180 + }, + { + "epoch": 2.81, + "learning_rate": 2.97454310797578e-06, + "loss": 1.2368, + "step": 16190 + }, + { + "epoch": 2.81, + "learning_rate": 2.9284029114660107e-06, + "loss": 1.2822, + "step": 16200 + }, + { + "epoch": 2.81, + "learning_rate": 2.8826180560888927e-06, + "loss": 1.2863, + "step": 16210 + }, + { + "epoch": 2.82, + "learning_rate": 2.837188709444882e-06, + "loss": 1.2477, + "step": 16220 + }, + { + "epoch": 2.82, + "learning_rate": 2.792115037833032e-06, + "loss": 1.2577, + "step": 16230 + }, + { + "epoch": 2.82, + "learning_rate": 2.7473972062503905e-06, + "loss": 1.2445, + "step": 16240 + }, + { + "epoch": 2.82, + "learning_rate": 2.707455536371439e-06, + "loss": 1.2237, + "step": 16250 + }, + { + "epoch": 2.82, + "learning_rate": 2.6634142507455885e-06, + "loss": 1.2587, + "step": 16260 + }, + { + "epoch": 2.82, + "learning_rate": 2.624081735149897e-06, + "loss": 1.2759, + "step": 16270 + }, + { + "epoch": 2.83, + "learning_rate": 2.580717577477021e-06, + "loss": 1.2786, + "step": 16280 + }, + { + "epoch": 2.83, + "learning_rate": 2.5377100336767545e-06, + "loss": 1.272, + "step": 16290 + }, + { + "epoch": 2.83, + "learning_rate": 2.495059261182886e-06, + "loss": 1.2404, + "step": 16300 + }, + { + "epoch": 2.83, + "learning_rate": 2.452765416123215e-06, + "loss": 1.2751, + "step": 16310 + }, + { + "epoch": 2.83, + "learning_rate": 2.4108286533189527e-06, + "loss": 1.2624, + "step": 16320 + }, + { + "epoch": 2.83, + "learning_rate": 2.3692491262841785e-06, + "loss": 1.2965, + "step": 16330 + }, + { + "epoch": 2.84, + "learning_rate": 2.3280269872252847e-06, + "loss": 1.2947, + "step": 16340 + }, + { + "epoch": 2.84, + "learning_rate": 2.287162387040365e-06, + "loss": 1.2839, + "step": 16350 + }, + { + "epoch": 2.84, + "learning_rate": 2.2506900662738086e-06, + "loss": 1.2637, + "step": 16360 + }, + { + "epoch": 2.84, + "learning_rate": 2.210505200985846e-06, + "loss": 1.2615, + "step": 16370 + }, + { + "epoch": 2.84, + "learning_rate": 2.1706783047731326e-06, + "loss": 1.2743, + "step": 16380 + }, + { + "epoch": 2.84, + "learning_rate": 2.1312095234263807e-06, + "loss": 1.2368, + "step": 16390 + }, + { + "epoch": 2.85, + "learning_rate": 2.0920990014253185e-06, + "loss": 1.2542, + "step": 16400 + }, + { + "epoch": 2.85, + "learning_rate": 2.0533468819382893e-06, + "loss": 1.2367, + "step": 16410 + }, + { + "epoch": 2.85, + "learning_rate": 2.014953306821632e-06, + "loss": 1.2418, + "step": 16420 + }, + { + "epoch": 2.85, + "learning_rate": 1.976918416619211e-06, + "loss": 1.2555, + "step": 16430 + }, + { + "epoch": 2.85, + "learning_rate": 1.939242350561854e-06, + "loss": 1.2583, + "step": 16440 + }, + { + "epoch": 2.86, + "learning_rate": 1.9019252465669046e-06, + "loss": 1.2734, + "step": 16450 + }, + { + "epoch": 2.86, + "learning_rate": 1.8649672412376916e-06, + "loss": 1.2125, + "step": 16460 + }, + { + "epoch": 2.86, + "learning_rate": 1.8283684698629843e-06, + "loss": 1.2853, + "step": 16470 + }, + { + "epoch": 2.86, + "learning_rate": 1.7921290664165923e-06, + "loss": 1.2621, + "step": 16480 + }, + { + "epoch": 2.86, + "learning_rate": 1.756249163556778e-06, + "loss": 1.2599, + "step": 16490 + }, + { + "epoch": 2.86, + "learning_rate": 1.7207288926258225e-06, + "loss": 1.2865, + "step": 16500 + }, + { + "epoch": 2.87, + "learning_rate": 1.6855683836495383e-06, + "loss": 1.238, + "step": 16510 + }, + { + "epoch": 2.87, + "learning_rate": 1.6507677653367915e-06, + "loss": 1.2989, + "step": 16520 + }, + { + "epoch": 2.87, + "learning_rate": 1.6163271650790456e-06, + "loss": 1.2784, + "step": 16530 + }, + { + "epoch": 2.87, + "learning_rate": 1.5822467089498304e-06, + "loss": 1.2912, + "step": 16540 + }, + { + "epoch": 2.87, + "learning_rate": 1.5485265217043854e-06, + "loss": 1.2584, + "step": 16550 + }, + { + "epoch": 2.87, + "learning_rate": 1.5184864851265469e-06, + "loss": 1.2535, + "step": 16560 + }, + { + "epoch": 2.88, + "learning_rate": 1.4854511477372047e-06, + "loss": 1.3007, + "step": 16570 + }, + { + "epoch": 2.88, + "learning_rate": 1.456027673515925e-06, + "loss": 1.2791, + "step": 16580 + }, + { + "epoch": 2.88, + "learning_rate": 1.4236776225376336e-06, + "loss": 1.2686, + "step": 16590 + }, + { + "epoch": 2.88, + "learning_rate": 1.3916884209024705e-06, + "loss": 1.2315, + "step": 16600 + }, + { + "epoch": 2.88, + "learning_rate": 1.3600601857104101e-06, + "loss": 1.2747, + "step": 16610 + }, + { + "epoch": 2.88, + "learning_rate": 1.3287930327400167e-06, + "loss": 1.2595, + "step": 16620 + }, + { + "epoch": 2.89, + "learning_rate": 1.2978870764481232e-06, + "loss": 1.2505, + "step": 16630 + }, + { + "epoch": 2.89, + "learning_rate": 1.2673424299693204e-06, + "loss": 1.2814, + "step": 16640 + }, + { + "epoch": 2.89, + "learning_rate": 1.2371592051156345e-06, + "loss": 1.2427, + "step": 16650 + }, + { + "epoch": 2.89, + "learning_rate": 1.2073375123760168e-06, + "loss": 1.2477, + "step": 16660 + }, + { + "epoch": 2.89, + "learning_rate": 1.1778774609160436e-06, + "loss": 1.2516, + "step": 16670 + }, + { + "epoch": 2.89, + "learning_rate": 1.1487791585774176e-06, + "loss": 1.2804, + "step": 16680 + }, + { + "epoch": 2.9, + "learning_rate": 1.1200427118776224e-06, + "loss": 1.2826, + "step": 16690 + }, + { + "epoch": 2.9, + "learning_rate": 1.0916682260095789e-06, + "loss": 1.2703, + "step": 16700 + }, + { + "epoch": 2.9, + "learning_rate": 1.063655804841146e-06, + "loss": 1.2501, + "step": 16710 + }, + { + "epoch": 2.9, + "learning_rate": 1.0360055509148535e-06, + "loss": 1.2323, + "step": 16720 + }, + { + "epoch": 2.9, + "learning_rate": 1.008717565447448e-06, + "loss": 1.2773, + "step": 16730 + }, + { + "epoch": 2.91, + "learning_rate": 9.871480775350161e-07, + "loss": 1.3079, + "step": 16740 + }, + { + "epoch": 2.91, + "learning_rate": 9.605124261266474e-07, + "loss": 1.2767, + "step": 16750 + }, + { + "epoch": 2.91, + "learning_rate": 9.34239319527963e-07, + "loss": 1.2475, + "step": 16760 + }, + { + "epoch": 2.91, + "learning_rate": 9.083288539145196e-07, + "loss": 1.2586, + "step": 16770 + }, + { + "epoch": 2.91, + "learning_rate": 8.827811241344131e-07, + "loss": 1.2465, + "step": 16780 + }, + { + "epoch": 2.91, + "learning_rate": 8.575962237078572e-07, + "loss": 1.2841, + "step": 16790 + }, + { + "epoch": 2.92, + "learning_rate": 8.327742448269394e-07, + "loss": 1.2984, + "step": 16800 + }, + { + "epoch": 2.92, + "learning_rate": 8.083152783552095e-07, + "loss": 1.2587, + "step": 16810 + }, + { + "epoch": 2.92, + "learning_rate": 7.842194138273584e-07, + "loss": 1.2659, + "step": 16820 + }, + { + "epoch": 2.92, + "learning_rate": 7.628436608436595e-07, + "loss": 1.2654, + "step": 16830 + }, + { + "epoch": 2.92, + "learning_rate": 7.3943793191662e-07, + "loss": 1.2415, + "step": 16840 + }, + { + "epoch": 2.92, + "learning_rate": 7.163955570664738e-07, + "loss": 1.2256, + "step": 16850 + }, + { + "epoch": 2.93, + "learning_rate": 6.937166206423485e-07, + "loss": 1.2541, + "step": 16860 + }, + { + "epoch": 2.93, + "learning_rate": 6.714012056629693e-07, + "loss": 1.2869, + "step": 16870 + }, + { + "epoch": 2.93, + "learning_rate": 6.494493938163038e-07, + "loss": 1.2867, + "step": 16880 + }, + { + "epoch": 2.93, + "learning_rate": 6.278612654593729e-07, + "loss": 1.242, + "step": 16890 + }, + { + "epoch": 2.93, + "learning_rate": 6.066368996178517e-07, + "loss": 1.2183, + "step": 16900 + }, + { + "epoch": 2.93, + "learning_rate": 5.85776373985858e-07, + "loss": 1.2517, + "step": 16910 + }, + { + "epoch": 2.94, + "learning_rate": 5.652797649255969e-07, + "loss": 1.2363, + "step": 16920 + }, + { + "epoch": 2.94, + "learning_rate": 5.4514714746714e-07, + "loss": 1.2635, + "step": 16930 + }, + { + "epoch": 2.94, + "learning_rate": 5.253785953081125e-07, + "loss": 1.2782, + "step": 16940 + }, + { + "epoch": 2.94, + "learning_rate": 5.059741808134621e-07, + "loss": 1.3006, + "step": 16950 + }, + { + "epoch": 2.94, + "learning_rate": 4.869339750151469e-07, + "loss": 1.2425, + "step": 16960 + }, + { + "epoch": 2.95, + "learning_rate": 4.682580476119247e-07, + "loss": 1.276, + "step": 16970 + }, + { + "epoch": 2.95, + "learning_rate": 4.499464669690423e-07, + "loss": 1.2827, + "step": 16980 + }, + { + "epoch": 2.95, + "learning_rate": 4.3199930011802446e-07, + "loss": 1.3223, + "step": 16990 + }, + { + "epoch": 2.95, + "learning_rate": 4.1441661275645195e-07, + "loss": 1.2453, + "step": 17000 + }, + { + "epoch": 2.95, + "eval_loss": 0.7738975882530212, + "eval_runtime": 61.8545, + "eval_samples_per_second": 8.471, + "eval_steps_per_second": 0.534, + "step": 17000 + }, + { + "epoch": 2.95, + "learning_rate": 3.971984692476394e-07, + "loss": 1.2683, + "step": 17010 + }, + { + "epoch": 2.95, + "learning_rate": 3.820138772047788e-07, + "loss": 1.2824, + "step": 17020 + }, + { + "epoch": 2.96, + "learning_rate": 3.6548853955771235e-07, + "loss": 1.271, + "step": 17030 + }, + { + "epoch": 2.96, + "learning_rate": 3.493279248699355e-07, + "loss": 1.298, + "step": 17040 + }, + { + "epoch": 2.96, + "learning_rate": 3.3353209229913806e-07, + "loss": 1.2695, + "step": 17050 + }, + { + "epoch": 2.96, + "learning_rate": 3.181010996677003e-07, + "loss": 1.2698, + "step": 17060 + }, + { + "epoch": 2.96, + "learning_rate": 3.030350034624374e-07, + "loss": 1.266, + "step": 17070 + }, + { + "epoch": 2.96, + "learning_rate": 2.88333858834422e-07, + "loss": 1.3003, + "step": 17080 + }, + { + "epoch": 2.97, + "learning_rate": 2.7399771959880637e-07, + "loss": 1.2441, + "step": 17090 + }, + { + "epoch": 2.97, + "learning_rate": 2.600266382345895e-07, + "loss": 1.2973, + "step": 17100 + }, + { + "epoch": 2.97, + "learning_rate": 2.4642066588441705e-07, + "loss": 1.2442, + "step": 17110 + }, + { + "epoch": 2.97, + "learning_rate": 2.3317985235443707e-07, + "loss": 1.2901, + "step": 17120 + }, + { + "epoch": 2.97, + "learning_rate": 2.215753710563373e-07, + "loss": 1.2668, + "step": 17130 + }, + { + "epoch": 2.97, + "learning_rate": 2.0902849171310356e-07, + "loss": 1.2799, + "step": 17140 + }, + { + "epoch": 2.98, + "learning_rate": 1.968469080681823e-07, + "loss": 1.2652, + "step": 17150 + }, + { + "epoch": 2.98, + "learning_rate": 1.8619584749273167e-07, + "loss": 1.2909, + "step": 17160 + }, + { + "epoch": 2.98, + "learning_rate": 1.747084474202576e-07, + "loss": 1.2791, + "step": 17170 + }, + { + "epoch": 2.98, + "learning_rate": 1.6358646867835615e-07, + "loss": 1.269, + "step": 17180 + }, + { + "epoch": 2.98, + "learning_rate": 1.5282995198021565e-07, + "loss": 1.2189, + "step": 17190 + }, + { + "epoch": 2.99, + "learning_rate": 1.424389367012613e-07, + "loss": 1.2991, + "step": 17200 + }, + { + "epoch": 2.99, + "learning_rate": 1.3241346087892182e-07, + "loss": 1.2893, + "step": 17210 + }, + { + "epoch": 2.99, + "learning_rate": 1.2275356121254077e-07, + "loss": 1.2596, + "step": 17220 + }, + { + "epoch": 2.99, + "learning_rate": 1.1345927306323224e-07, + "loss": 1.2393, + "step": 17230 + }, + { + "epoch": 2.99, + "learning_rate": 1.0453063045375855e-07, + "loss": 1.2814, + "step": 17240 + }, + { + "epoch": 2.99, + "learning_rate": 9.596766606836393e-08, + "loss": 1.2632, + "step": 17250 + }, + { + "epoch": 3.0, + "learning_rate": 8.777041125273e-08, + "loss": 1.2705, + "step": 17260 + }, + { + "epoch": 3.0, + "learning_rate": 7.993889601378701e-08, + "loss": 1.2207, + "step": 17270 + }, + { + "epoch": 3.0, + "learning_rate": 7.24731490196584e-08, + "loss": 1.2858, + "step": 17280 + }, + { + "epoch": 3.0, + "step": 17285, + "total_flos": 9.82692376847319e+16, + "train_loss": 1.5514631569575188, + "train_runtime": 557290.3071, + "train_samples_per_second": 3.97, + "train_steps_per_second": 0.031 + } + ], + "max_steps": 17285, + "num_train_epochs": 4, + "total_flos": 9.82692376847319e+16, + "trial_name": null, + "trial_params": null +}